├── Fake News.ipynb
├── Readme.md
└── model.pkl
/Fake News.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Data Import"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import numpy as np\n",
17 | "import pandas as pd\n",
18 | "import matplotlib.pyplot as plt"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 2,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "True_news = pd.read_csv('True.csv')\n",
28 | "Fake_news = pd.read_csv('Fake.csv')"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 3,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "True_news['label'] = 0"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 4,
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "Fake_news['label'] = 1"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 5,
52 | "metadata": {},
53 | "outputs": [
54 | {
55 | "data": {
56 | "text/html": [
57 | "
\n",
58 | "\n",
71 | "
\n",
72 | " \n",
73 | " \n",
74 | " | \n",
75 | " title | \n",
76 | " text | \n",
77 | " subject | \n",
78 | " date | \n",
79 | " label | \n",
80 | "
\n",
81 | " \n",
82 | " \n",
83 | " \n",
84 | " 0 | \n",
85 | " As U.S. budget fight looms, Republicans flip t... | \n",
86 | " WASHINGTON (Reuters) - The head of a conservat... | \n",
87 | " politicsNews | \n",
88 | " December 31, 2017 | \n",
89 | " 0 | \n",
90 | "
\n",
91 | " \n",
92 | " 1 | \n",
93 | " U.S. military to accept transgender recruits o... | \n",
94 | " WASHINGTON (Reuters) - Transgender people will... | \n",
95 | " politicsNews | \n",
96 | " December 29, 2017 | \n",
97 | " 0 | \n",
98 | "
\n",
99 | " \n",
100 | " 2 | \n",
101 | " Senior U.S. Republican senator: 'Let Mr. Muell... | \n",
102 | " WASHINGTON (Reuters) - The special counsel inv... | \n",
103 | " politicsNews | \n",
104 | " December 31, 2017 | \n",
105 | " 0 | \n",
106 | "
\n",
107 | " \n",
108 | " 3 | \n",
109 | " FBI Russia probe helped by Australian diplomat... | \n",
110 | " WASHINGTON (Reuters) - Trump campaign adviser ... | \n",
111 | " politicsNews | \n",
112 | " December 30, 2017 | \n",
113 | " 0 | \n",
114 | "
\n",
115 | " \n",
116 | " 4 | \n",
117 | " Trump wants Postal Service to charge 'much mor... | \n",
118 | " SEATTLE/WASHINGTON (Reuters) - President Donal... | \n",
119 | " politicsNews | \n",
120 | " December 29, 2017 | \n",
121 | " 0 | \n",
122 | "
\n",
123 | " \n",
124 | "
\n",
125 | "
"
126 | ],
127 | "text/plain": [
128 | " title \\\n",
129 | "0 As U.S. budget fight looms, Republicans flip t... \n",
130 | "1 U.S. military to accept transgender recruits o... \n",
131 | "2 Senior U.S. Republican senator: 'Let Mr. Muell... \n",
132 | "3 FBI Russia probe helped by Australian diplomat... \n",
133 | "4 Trump wants Postal Service to charge 'much mor... \n",
134 | "\n",
135 | " text subject \\\n",
136 | "0 WASHINGTON (Reuters) - The head of a conservat... politicsNews \n",
137 | "1 WASHINGTON (Reuters) - Transgender people will... politicsNews \n",
138 | "2 WASHINGTON (Reuters) - The special counsel inv... politicsNews \n",
139 | "3 WASHINGTON (Reuters) - Trump campaign adviser ... politicsNews \n",
140 | "4 SEATTLE/WASHINGTON (Reuters) - President Donal... politicsNews \n",
141 | "\n",
142 | " date label \n",
143 | "0 December 31, 2017 0 \n",
144 | "1 December 29, 2017 0 \n",
145 | "2 December 31, 2017 0 \n",
146 | "3 December 30, 2017 0 \n",
147 | "4 December 29, 2017 0 "
148 | ]
149 | },
150 | "execution_count": 5,
151 | "metadata": {},
152 | "output_type": "execute_result"
153 | }
154 | ],
155 | "source": [
156 | "True_news.head()"
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": 6,
162 | "metadata": {},
163 | "outputs": [
164 | {
165 | "data": {
166 | "text/html": [
167 | "\n",
168 | "\n",
181 | "
\n",
182 | " \n",
183 | " \n",
184 | " | \n",
185 | " title | \n",
186 | " text | \n",
187 | " subject | \n",
188 | " date | \n",
189 | " label | \n",
190 | "
\n",
191 | " \n",
192 | " \n",
193 | " \n",
194 | " 0 | \n",
195 | " Donald Trump Sends Out Embarrassing New Year’... | \n",
196 | " Donald Trump just couldn t wish all Americans ... | \n",
197 | " News | \n",
198 | " December 31, 2017 | \n",
199 | " 1 | \n",
200 | "
\n",
201 | " \n",
202 | " 1 | \n",
203 | " Drunk Bragging Trump Staffer Started Russian ... | \n",
204 | " House Intelligence Committee Chairman Devin Nu... | \n",
205 | " News | \n",
206 | " December 31, 2017 | \n",
207 | " 1 | \n",
208 | "
\n",
209 | " \n",
210 | " 2 | \n",
211 | " Sheriff David Clarke Becomes An Internet Joke... | \n",
212 | " On Friday, it was revealed that former Milwauk... | \n",
213 | " News | \n",
214 | " December 30, 2017 | \n",
215 | " 1 | \n",
216 | "
\n",
217 | " \n",
218 | " 3 | \n",
219 | " Trump Is So Obsessed He Even Has Obama’s Name... | \n",
220 | " On Christmas day, Donald Trump announced that ... | \n",
221 | " News | \n",
222 | " December 29, 2017 | \n",
223 | " 1 | \n",
224 | "
\n",
225 | " \n",
226 | " 4 | \n",
227 | " Pope Francis Just Called Out Donald Trump Dur... | \n",
228 | " Pope Francis used his annual Christmas Day mes... | \n",
229 | " News | \n",
230 | " December 25, 2017 | \n",
231 | " 1 | \n",
232 | "
\n",
233 | " \n",
234 | "
\n",
235 | "
"
236 | ],
237 | "text/plain": [
238 | " title \\\n",
239 | "0 Donald Trump Sends Out Embarrassing New Year’... \n",
240 | "1 Drunk Bragging Trump Staffer Started Russian ... \n",
241 | "2 Sheriff David Clarke Becomes An Internet Joke... \n",
242 | "3 Trump Is So Obsessed He Even Has Obama’s Name... \n",
243 | "4 Pope Francis Just Called Out Donald Trump Dur... \n",
244 | "\n",
245 | " text subject \\\n",
246 | "0 Donald Trump just couldn t wish all Americans ... News \n",
247 | "1 House Intelligence Committee Chairman Devin Nu... News \n",
248 | "2 On Friday, it was revealed that former Milwauk... News \n",
249 | "3 On Christmas day, Donald Trump announced that ... News \n",
250 | "4 Pope Francis used his annual Christmas Day mes... News \n",
251 | "\n",
252 | " date label \n",
253 | "0 December 31, 2017 1 \n",
254 | "1 December 31, 2017 1 \n",
255 | "2 December 30, 2017 1 \n",
256 | "3 December 29, 2017 1 \n",
257 | "4 December 25, 2017 1 "
258 | ]
259 | },
260 | "execution_count": 6,
261 | "metadata": {},
262 | "output_type": "execute_result"
263 | }
264 | ],
265 | "source": [
266 | "Fake_news.head()"
267 | ]
268 | },
269 | {
270 | "cell_type": "code",
271 | "execution_count": 7,
272 | "metadata": {},
273 | "outputs": [],
274 | "source": [
275 | "dataset1 = True_news[['text','label']]\n",
276 | "dataset2 = Fake_news[['text','label']]"
277 | ]
278 | },
279 | {
280 | "cell_type": "code",
281 | "execution_count": 8,
282 | "metadata": {},
283 | "outputs": [],
284 | "source": [
285 | "dataset = pd.concat([dataset1 , dataset2])"
286 | ]
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": 9,
291 | "metadata": {},
292 | "outputs": [
293 | {
294 | "data": {
295 | "text/plain": [
296 | "(44898, 2)"
297 | ]
298 | },
299 | "execution_count": 9,
300 | "metadata": {},
301 | "output_type": "execute_result"
302 | }
303 | ],
304 | "source": [
305 | "dataset.shape"
306 | ]
307 | },
308 | {
309 | "cell_type": "markdown",
310 | "metadata": {},
311 | "source": [
312 | "### Null values"
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": 10,
318 | "metadata": {},
319 | "outputs": [
320 | {
321 | "data": {
322 | "text/plain": [
323 | "text 0\n",
324 | "label 0\n",
325 | "dtype: int64"
326 | ]
327 | },
328 | "execution_count": 10,
329 | "metadata": {},
330 | "output_type": "execute_result"
331 | }
332 | ],
333 | "source": [
334 | "dataset.isnull().sum() # no null values"
335 | ]
336 | },
337 | {
338 | "cell_type": "markdown",
339 | "metadata": {},
340 | "source": [
341 | "### Balanced or Unbalanced dataset"
342 | ]
343 | },
344 | {
345 | "cell_type": "code",
346 | "execution_count": 11,
347 | "metadata": {},
348 | "outputs": [
349 | {
350 | "data": {
351 | "text/plain": [
352 | "1 23481\n",
353 | "0 21417\n",
354 | "Name: label, dtype: int64"
355 | ]
356 | },
357 | "execution_count": 11,
358 | "metadata": {},
359 | "output_type": "execute_result"
360 | }
361 | ],
362 | "source": [
363 | "dataset['label'].value_counts()"
364 | ]
365 | },
366 | {
367 | "cell_type": "code",
368 | "execution_count": 12,
369 | "metadata": {},
370 | "outputs": [
371 | {
372 | "data": {
373 | "text/plain": [
374 | "(21417, 2)"
375 | ]
376 | },
377 | "execution_count": 12,
378 | "metadata": {},
379 | "output_type": "execute_result"
380 | }
381 | ],
382 | "source": [
383 | "dataset1.shape # true news"
384 | ]
385 | },
386 | {
387 | "cell_type": "code",
388 | "execution_count": 13,
389 | "metadata": {},
390 | "outputs": [
391 | {
392 | "data": {
393 | "text/plain": [
394 | "(23481, 2)"
395 | ]
396 | },
397 | "execution_count": 13,
398 | "metadata": {},
399 | "output_type": "execute_result"
400 | }
401 | ],
402 | "source": [
403 | "dataset2.shape # fake news"
404 | ]
405 | },
406 | {
407 | "cell_type": "markdown",
408 | "metadata": {},
409 | "source": [
410 | "### Shuffle or Resample"
411 | ]
412 | },
413 | {
414 | "cell_type": "code",
415 | "execution_count": 14,
416 | "metadata": {},
417 | "outputs": [],
418 | "source": [
419 | "dataset = dataset.sample(frac=1, random_state=0).reset_index(drop=True)  # fixed seed => reproducible shuffle; the fresh 0..n-1 index replaces the duplicate labels left by pd.concat"
420 | ]
421 | },
422 | {
423 | "cell_type": "code",
424 | "execution_count": 15,
425 | "metadata": {},
426 | "outputs": [
427 | {
428 | "data": {
429 | "text/html": [
430 | "\n",
431 | "\n",
444 | "
\n",
445 | " \n",
446 | " \n",
447 | " | \n",
448 | " text | \n",
449 | " label | \n",
450 | "
\n",
451 | " \n",
452 | " \n",
453 | " \n",
454 | " 22161 | \n",
455 | " 21st Century Wire says It was an awkward fit o... | \n",
456 | " 1 | \n",
457 | "
\n",
458 | " \n",
459 | " 17723 | \n",
460 | " MONROVIA (Reuters) - One of Liberia s leading ... | \n",
461 | " 0 | \n",
462 | "
\n",
463 | " \n",
464 | " 13759 | \n",
465 | " MOSCOW (Reuters) - Former Russian economy mini... | \n",
466 | " 0 | \n",
467 | "
\n",
468 | " \n",
469 | " 9387 | \n",
470 | " (Reuters) - Officials from 11 U.S. states sued... | \n",
471 | " 0 | \n",
472 | "
\n",
473 | " \n",
474 | " 18704 | \n",
475 | " COLBERT LANGUAGE WARNING! This late night ho... | \n",
476 | " 1 | \n",
477 | "
\n",
478 | " \n",
479 | " 2476 | \n",
480 | " Kellyanne Conway, who serves as a senior advis... | \n",
481 | " 1 | \n",
482 | "
\n",
483 | " \n",
484 | " 16571 | \n",
485 | " I ll bet you re thinking this is a joke, right... | \n",
486 | " 1 | \n",
487 | "
\n",
488 | " \n",
489 | " 6836 | \n",
490 | " WASHINGTON (Reuters) - U.S. President-elect Do... | \n",
491 | " 0 | \n",
492 | "
\n",
493 | " \n",
494 | " 13742 | \n",
495 | " Don t buy into the media lie that every LEGAL ... | \n",
496 | " 1 | \n",
497 | "
\n",
498 | " \n",
499 | " 468 | \n",
500 | " Donald Trump has the maturity of a toddler, an... | \n",
501 | " 1 | \n",
502 | "
\n",
503 | " \n",
504 | " 13729 | \n",
505 | " TEGUCIGALPA (Reuters) - With 70 percent of bal... | \n",
506 | " 0 | \n",
507 | "
\n",
508 | " \n",
509 | " 7098 | \n",
510 | " Who needs experts, really, when one can rely o... | \n",
511 | " 1 | \n",
512 | "
\n",
513 | " \n",
514 | " 3084 | \n",
515 | " There s a man who s been permanently banned fr... | \n",
516 | " 1 | \n",
517 | "
\n",
518 | " \n",
519 | " 5895 | \n",
520 | " BERLIN (Reuters) - German Chancellor Angela Me... | \n",
521 | " 0 | \n",
522 | "
\n",
523 | " \n",
524 | " 4123 | \n",
525 | " WASHINGTON (Reuters) - U.S. Treasury Secretary... | \n",
526 | " 0 | \n",
527 | "
\n",
528 | " \n",
529 | " 6989 | \n",
530 | " NEW YORK (Reuters) - President-elect Donald Tr... | \n",
531 | " 0 | \n",
532 | "
\n",
533 | " \n",
534 | " 10640 | \n",
535 | " Will Austria s new mandates help to save their... | \n",
536 | " 1 | \n",
537 | "
\n",
538 | " \n",
539 | " 7968 | \n",
540 | " NEW YORK (Reuters) - A majority of Americans ... | \n",
541 | " 0 | \n",
542 | "
\n",
543 | " \n",
544 | " 2934 | \n",
545 | " WASHINGTON (Reuters) - Twenty moderate Republi... | \n",
546 | " 0 | \n",
547 | "
\n",
548 | " \n",
549 | " 5332 | \n",
550 | " NEW YORK (Reuters) - NATO is the “strongest al... | \n",
551 | " 0 | \n",
552 | "
\n",
553 | " \n",
554 | "
\n",
555 | "
"
556 | ],
557 | "text/plain": [
558 | " text label\n",
559 | "22161 21st Century Wire says It was an awkward fit o... 1\n",
560 | "17723 MONROVIA (Reuters) - One of Liberia s leading ... 0\n",
561 | "13759 MOSCOW (Reuters) - Former Russian economy mini... 0\n",
562 | "9387 (Reuters) - Officials from 11 U.S. states sued... 0\n",
563 | "18704 COLBERT LANGUAGE WARNING! This late night ho... 1\n",
564 | "2476 Kellyanne Conway, who serves as a senior advis... 1\n",
565 | "16571 I ll bet you re thinking this is a joke, right... 1\n",
566 | "6836 WASHINGTON (Reuters) - U.S. President-elect Do... 0\n",
567 | "13742 Don t buy into the media lie that every LEGAL ... 1\n",
568 | "468 Donald Trump has the maturity of a toddler, an... 1\n",
569 | "13729 TEGUCIGALPA (Reuters) - With 70 percent of bal... 0\n",
570 | "7098 Who needs experts, really, when one can rely o... 1\n",
571 | "3084 There s a man who s been permanently banned fr... 1\n",
572 | "5895 BERLIN (Reuters) - German Chancellor Angela Me... 0\n",
573 | "4123 WASHINGTON (Reuters) - U.S. Treasury Secretary... 0\n",
574 | "6989 NEW YORK (Reuters) - President-elect Donald Tr... 0\n",
575 | "10640 Will Austria s new mandates help to save their... 1\n",
576 | "7968 NEW YORK (Reuters) - A majority of Americans ... 0\n",
577 | "2934 WASHINGTON (Reuters) - Twenty moderate Republi... 0\n",
578 | "5332 NEW YORK (Reuters) - NATO is the “strongest al... 0"
579 | ]
580 | },
581 | "execution_count": 15,
582 | "metadata": {},
583 | "output_type": "execute_result"
584 | }
585 | ],
586 | "source": [
587 | "dataset.head(20)"
588 | ]
589 | },
590 | {
591 | "cell_type": "code",
592 | "execution_count": 16,
593 | "metadata": {},
594 | "outputs": [],
595 | "source": [
596 | "import nltk"
597 | ]
598 | },
599 | {
600 | "cell_type": "code",
601 | "execution_count": 17,
602 | "metadata": {},
603 | "outputs": [],
604 | "source": [
605 | "import re\n",
606 | "from nltk.corpus import stopwords\n",
607 | "from nltk.stem import WordNetLemmatizer"
608 | ]
609 | },
610 | {
611 | "cell_type": "code",
612 | "execution_count": 18,
613 | "metadata": {},
614 | "outputs": [],
615 | "source": [
616 | "ps = WordNetLemmatizer()"
617 | ]
618 | },
619 | {
620 | "cell_type": "code",
621 | "execution_count": 19,
622 | "metadata": {},
623 | "outputs": [],
624 | "source": [
625 | "stopwords = set(nltk.corpus.stopwords.words('english'))  # reach the corpus through nltk.corpus so re-running this cell still works after the name is rebound; a set makes the membership test in cleaning_data O(1)"
626 | ]
627 | },
628 | {
629 | "cell_type": "code",
630 | "execution_count": 20,
631 | "metadata": {},
632 | "outputs": [
633 | {
634 | "name": "stderr",
635 | "output_type": "stream",
636 | "text": [
637 | "[nltk_data] Downloading package wordnet to /root/nltk_data...\n",
638 | "[nltk_data] Package wordnet is already up-to-date!\n"
639 | ]
640 | },
641 | {
642 | "data": {
643 | "text/plain": [
644 | "True"
645 | ]
646 | },
647 | "execution_count": 20,
648 | "metadata": {},
649 | "output_type": "execute_result"
650 | }
651 | ],
652 | "source": [
653 | "nltk.download('wordnet')"
654 | ]
655 | },
656 | {
657 | "cell_type": "code",
658 | "execution_count": 21,
659 | "metadata": {},
660 | "outputs": [],
661 | "source": [
662 | "def cleaning_data(row):\n",
663 | "    \"\"\"Return `row` lower-cased, reduced to letters, stop-word-filtered and lemmatized.\n",
664 | "\n",
665 | "    Args:\n",
666 | "        row (str): raw article text.\n",
667 | "\n",
668 | "    Returns:\n",
669 | "        str: the remaining lemmas joined by single spaces ('' when nothing remains).\n",
670 | "\n",
671 | "    Relies on the notebook-level `ps` lemmatizer and `stopwords` collection.\n",
672 | "    \"\"\"\n",
673 | "    # Lower-case first, then turn every character that is not an ASCII letter into a space.\n",
674 | "    row = re.sub('[^a-zA-Z]', ' ', row.lower())\n",
675 | "\n",
676 | "    # Whitespace tokenisation; stop words are dropped before lemmatizing the rest.\n",
677 | "    tokens = row.split()\n",
678 | "    lemmas = [ps.lemmatize(word) for word in tokens if word not in stopwords]\n",
679 | "\n",
680 | "    return ' '.join(lemmas)"
681 | ]
682 | },
683 | {
684 | "cell_type": "code",
685 | "execution_count": 22,
686 | "metadata": {},
687 | "outputs": [],
688 | "source": [
689 | "dataset['text'] = dataset['text'].apply(cleaning_data)  # clean every article element-wise"
690 | ]
691 | },
692 | {
693 | "cell_type": "code",
694 | "execution_count": 23,
695 | "metadata": {},
696 | "outputs": [
697 | {
698 | "data": {
699 | "text/plain": [
700 | "text 0\n",
701 | "label 0\n",
702 | "dtype: int64"
703 | ]
704 | },
705 | "execution_count": 23,
706 | "metadata": {},
707 | "output_type": "execute_result"
708 | }
709 | ],
710 | "source": [
711 | "dataset.isnull().sum()"
712 | ]
713 | },
714 | {
715 | "cell_type": "code",
716 | "execution_count": 24,
717 | "metadata": {},
718 | "outputs": [],
719 | "source": [
720 | "from sklearn.feature_extraction.text import TfidfVectorizer"
721 | ]
722 | },
723 | {
724 | "cell_type": "code",
725 | "execution_count": 25,
726 | "metadata": {},
727 | "outputs": [],
728 | "source": [
729 | "vectorizer = TfidfVectorizer(max_features = 50000 , lowercase=False , ngram_range=(1,2))"
730 | ]
731 | },
732 | {
733 | "cell_type": "code",
734 | "execution_count": 26,
735 | "metadata": {},
736 | "outputs": [
737 | {
738 | "data": {
739 | "text/plain": [
740 | "(44898, 2)"
741 | ]
742 | },
743 | "execution_count": 26,
744 | "metadata": {},
745 | "output_type": "execute_result"
746 | }
747 | ],
748 | "source": [
749 | "dataset.shape"
750 | ]
751 | },
752 | {
753 | "cell_type": "code",
754 | "execution_count": 27,
755 | "metadata": {},
756 | "outputs": [],
757 | "source": [
758 | "X = dataset['text'].iloc[:35000]    # NOTE(review): only the first 35,000 of the 44,898 shuffled rows are kept; the rest are not used by any later cell shown here - confirm the cap is intentional\n",
759 | "y = dataset['label'].iloc[:35000]"
760 | ]
761 | },
762 | {
763 | "cell_type": "code",
764 | "execution_count": 28,
765 | "metadata": {},
766 | "outputs": [
767 | {
768 | "data": {
769 | "text/plain": [
770 | "22161 st century wire say awkward fit nerve donald m...\n",
771 | "17723 monrovia reuters one liberia leading political...\n",
772 | "13759 moscow reuters former russian economy minister...\n",
773 | "9387 reuters official u state sued obama administra...\n",
774 | "18704 colbert language warning late night host one a...\n",
775 | "Name: text, dtype: object"
776 | ]
777 | },
778 | "execution_count": 28,
779 | "metadata": {},
780 | "output_type": "execute_result"
781 | }
782 | ],
783 | "source": [
784 | "X.head()"
785 | ]
786 | },
787 | {
788 | "cell_type": "code",
789 | "execution_count": 29,
790 | "metadata": {},
791 | "outputs": [
792 | {
793 | "data": {
794 | "text/plain": [
795 | "22161 1\n",
796 | "17723 0\n",
797 | "13759 0\n",
798 | "9387 0\n",
799 | "18704 1\n",
800 | "Name: label, dtype: int64"
801 | ]
802 | },
803 | "execution_count": 29,
804 | "metadata": {},
805 | "output_type": "execute_result"
806 | }
807 | ],
808 | "source": [
809 | "y.head()"
810 | ]
811 | },
812 | {
813 | "cell_type": "code",
814 | "execution_count": 30,
815 | "metadata": {},
816 | "outputs": [],
817 | "source": [
818 | "from sklearn.model_selection import train_test_split\n",
819 | "train_data , test_data , train_label , test_label = train_test_split(X , y , test_size = 0.2 ,random_state = 0)"
820 | ]
821 | },
822 | {
823 | "cell_type": "code",
824 | "execution_count": 31,
825 | "metadata": {},
826 | "outputs": [],
827 | "source": [
828 | "vec_train_data = vectorizer.fit_transform(train_data)"
829 | ]
830 | },
831 | {
832 | "cell_type": "code",
833 | "execution_count": 32,
834 | "metadata": {},
835 | "outputs": [],
836 | "source": [
837 | "vec_train_data = vec_train_data.toarray()"
838 | ]
839 | },
840 | {
841 | "cell_type": "code",
842 | "execution_count": 33,
843 | "metadata": {},
844 | "outputs": [
845 | {
846 | "data": {
847 | "text/plain": [
848 | "((28000,), (7000,))"
849 | ]
850 | },
851 | "execution_count": 33,
852 | "metadata": {},
853 | "output_type": "execute_result"
854 | }
855 | ],
856 | "source": [
857 | "train_data.shape , test_data.shape"
858 | ]
859 | },
860 | {
861 | "cell_type": "code",
862 | "execution_count": 34,
863 | "metadata": {},
864 | "outputs": [],
865 | "source": [
866 | "vec_test_data = vectorizer.transform(test_data).toarray()"
867 | ]
868 | },
869 | {
870 | "cell_type": "code",
871 | "execution_count": 35,
872 | "metadata": {},
873 | "outputs": [
874 | {
875 | "data": {
876 | "text/plain": [
877 | "((28000, 50000), (7000, 50000))"
878 | ]
879 | },
880 | "execution_count": 35,
881 | "metadata": {},
882 | "output_type": "execute_result"
883 | }
884 | ],
885 | "source": [
886 | "vec_train_data.shape , vec_test_data.shape"
887 | ]
888 | },
889 | {
890 | "cell_type": "code",
891 | "execution_count": 36,
892 | "metadata": {},
893 | "outputs": [
894 | {
895 | "data": {
896 | "text/plain": [
897 | "1 14615\n",
898 | "0 13385\n",
899 | "Name: label, dtype: int64"
900 | ]
901 | },
902 | "execution_count": 36,
903 | "metadata": {},
904 | "output_type": "execute_result"
905 | }
906 | ],
907 | "source": [
908 | "train_label.value_counts() # balanced partition"
909 | ]
910 | },
911 | {
912 | "cell_type": "code",
913 | "execution_count": 37,
914 | "metadata": {},
915 | "outputs": [
916 | {
917 | "data": {
918 | "text/plain": [
919 | "1 3674\n",
920 | "0 3326\n",
921 | "Name: label, dtype: int64"
922 | ]
923 | },
924 | "execution_count": 37,
925 | "metadata": {},
926 | "output_type": "execute_result"
927 | }
928 | ],
929 | "source": [
930 | "test_label.value_counts() # balanced partition"
931 | ]
932 | },
933 | {
934 | "cell_type": "code",
935 | "execution_count": 38,
936 | "metadata": {},
937 | "outputs": [],
938 | "source": [
939 | "training_data = pd.DataFrame(vec_train_data, columns=sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get))  # vocabulary_ maps term -> column index, so this is the column order; get_feature_names() was removed in scikit-learn 1.2\n",
940 | "testing_data = pd.DataFrame(vec_test_data, columns=training_data.columns)"
941 | ]
942 | },
943 | {
944 | "cell_type": "code",
945 | "execution_count": 39,
946 | "metadata": {},
947 | "outputs": [],
948 | "source": [
949 | "from sklearn.naive_bayes import MultinomialNB"
950 | ]
951 | },
952 | {
953 | "cell_type": "code",
954 | "execution_count": 40,
955 | "metadata": {},
956 | "outputs": [],
957 | "source": [
958 | "from sklearn.metrics import accuracy_score,classification_report"
959 | ]
960 | },
961 | {
962 | "cell_type": "code",
963 | "execution_count": 41,
964 | "metadata": {},
965 | "outputs": [],
966 | "source": [
967 | "clf = MultinomialNB()"
968 | ]
969 | },
970 | {
971 | "cell_type": "code",
972 | "execution_count": 42,
973 | "metadata": {},
974 | "outputs": [],
975 | "source": [
976 | "clf.fit(training_data, train_label)\n",
977 | "y_pred = clf.predict(testing_data)"
978 | ]
979 | },
980 | {
981 | "cell_type": "markdown",
982 | "metadata": {},
983 | "source": [
984 | "### MultinomialNB"
985 | ]
986 | },
987 | {
988 | "cell_type": "code",
989 | "execution_count": 43,
990 | "metadata": {},
991 | "outputs": [
992 | {
993 | "data": {
994 | "text/plain": [
995 | "1 3734\n",
996 | "0 3266\n",
997 | "dtype: int64"
998 | ]
999 | },
1000 | "execution_count": 43,
1001 | "metadata": {},
1002 | "output_type": "execute_result"
1003 | }
1004 | ],
1005 | "source": [
1006 | "pd.Series(y_pred).value_counts()"
1007 | ]
1008 | },
1009 | {
1010 | "cell_type": "code",
1011 | "execution_count": 44,
1012 | "metadata": {},
1013 | "outputs": [
1014 | {
1015 | "data": {
1016 | "text/plain": [
1017 | "1 3674\n",
1018 | "0 3326\n",
1019 | "Name: label, dtype: int64"
1020 | ]
1021 | },
1022 | "execution_count": 44,
1023 | "metadata": {},
1024 | "output_type": "execute_result"
1025 | }
1026 | ],
1027 | "source": [
1028 | "test_label.value_counts()"
1029 | ]
1030 | },
1031 | {
1032 | "cell_type": "code",
1033 | "execution_count": 45,
1034 | "metadata": {},
1035 | "outputs": [
1036 | {
1037 | "name": "stdout",
1038 | "output_type": "stream",
1039 | "text": [
1040 | " precision recall f1-score support\n",
1041 | "\n",
1042 | " 0 0.96 0.94 0.95 3326\n",
1043 | " 1 0.95 0.96 0.96 3674\n",
1044 | "\n",
1045 | " accuracy 0.95 7000\n",
1046 | " macro avg 0.95 0.95 0.95 7000\n",
1047 | "weighted avg 0.95 0.95 0.95 7000\n",
1048 | "\n"
1049 | ]
1050 | }
1051 | ],
1052 | "source": [
1053 | "print(classification_report(test_label , y_pred))"
1054 | ]
1055 | },
1056 | {
1057 | "cell_type": "markdown",
1058 | "metadata": {},
1059 | "source": [
1060 | "Now predict on the training set as well, to compare train and test performance"
1061 | ]
1062 | },
1063 | {
1064 | "cell_type": "code",
1065 | "execution_count": 46,
1066 | "metadata": {},
1067 | "outputs": [
1068 | {
1069 | "name": "stdout",
1070 | "output_type": "stream",
1071 | "text": [
1072 | " precision recall f1-score support\n",
1073 | "\n",
1074 | " 0 0.96 0.95 0.96 13385\n",
1075 | " 1 0.96 0.96 0.96 14615\n",
1076 | "\n",
1077 | " accuracy 0.96 28000\n",
1078 | " macro avg 0.96 0.96 0.96 28000\n",
1079 | "weighted avg 0.96 0.96 0.96 28000\n",
1080 | "\n"
1081 | ]
1082 | }
1083 | ],
1084 | "source": [
1085 | "y_pred_train = clf.predict(training_data)\n",
1086 | "print(classification_report(train_label , y_pred_train))"
1087 | ]
1088 | },
1089 | {
1090 | "cell_type": "code",
1091 | "execution_count": 47,
1092 | "metadata": {},
1093 | "outputs": [
1094 | {
1095 | "data": {
1096 | "text/plain": [
1097 | "0.9584642857142858"
1098 | ]
1099 | },
1100 | "execution_count": 47,
1101 | "metadata": {},
1102 | "output_type": "execute_result"
1103 | }
1104 | ],
1105 | "source": [
1106 | "accuracy_score(train_label , y_pred_train)"
1107 | ]
1108 | },
1109 | {
1110 | "cell_type": "code",
1111 | "execution_count": 48,
1112 | "metadata": {},
1113 | "outputs": [
1114 | {
1115 | "data": {
1116 | "text/plain": [
1117 | "0.9531428571428572"
1118 | ]
1119 | },
1120 | "execution_count": 48,
1121 | "metadata": {},
1122 | "output_type": "execute_result"
1123 | }
1124 | ],
1125 | "source": [
1126 | "accuracy_score(test_label , y_pred)"
1127 | ]
1128 | },
1129 | {
1130 | "cell_type": "code",
1131 | "execution_count": 49,
1132 | "metadata": {},
1133 | "outputs": [],
1134 | "source": [
1135 | "news = cleaning_data(str(\"Imposters posing as army personnel on the social media have been called out by the Indian Army as false news and disinformation.\"))"
1136 | ]
1137 | },
1138 | {
1139 | "cell_type": "code",
1140 | "execution_count": 50,
1141 | "metadata": {},
1142 | "outputs": [
1143 | {
1144 | "data": {
1145 | "text/plain": [
1146 | "array([1])"
1147 | ]
1148 | },
1149 | "execution_count": 50,
1150 | "metadata": {},
1151 | "output_type": "execute_result"
1152 | }
1153 | ],
1154 | "source": [
1155 | "single_prediction = clf.predict(vectorizer.transform([news]).toarray())\n",
1156 | "single_prediction"
1157 | ]
1158 | },
1159 | {
1160 | "cell_type": "markdown",
1161 | "metadata": {},
1162 | "source": [
1163 | "### Save the Model"
1164 | ]
1165 | },
1166 | {
1167 | "cell_type": "code",
1168 | "execution_count": 2,
1169 | "metadata": {},
1170 | "outputs": [],
1171 | "source": [
1172 | "import joblib "
1173 | ]
1174 | },
1175 | {
1176 | "cell_type": "code",
1177 | "execution_count": 53,
1178 | "metadata": {},
1179 | "outputs": [
1180 | {
1181 | "data": {
1182 | "text/plain": [
1183 | "['model.pkl']"
1184 | ]
1185 | },
1186 | "execution_count": 53,
1187 | "metadata": {},
1188 | "output_type": "execute_result"
1189 | }
1190 | ],
1191 | "source": [
1192 | "joblib.dump(vectorizer , 'vectorizer.pkl')  # clf was fitted on this vectorizer's TF-IDF columns, so new text can only be scored if the fitted vectorizer is saved too\n",
1193 | "joblib.dump(clf , 'model.pkl')"
1193 | ]
1194 | },
1195 | {
1196 | "cell_type": "code",
1197 | "execution_count": null,
1198 | "metadata": {},
1199 | "outputs": [],
1200 | "source": [
1201 | "model = joblib.load('model.pkl')"
1202 | ]
1203 | }
1204 | ],
1205 | "metadata": {
1206 | "kernelspec": {
1207 | "display_name": "Python 3",
1208 | "language": "python",
1209 | "name": "python3"
1210 | },
1211 | "language_info": {
1212 | "codemirror_mode": {
1213 | "name": "ipython",
1214 | "version": 3
1215 | },
1216 | "file_extension": ".py",
1217 | "mimetype": "text/x-python",
1218 | "name": "python",
1219 | "nbconvert_exporter": "python",
1220 | "pygments_lexer": "ipython3",
1221 | "version": "3.7.4"
1222 | }
1223 | },
1224 | "nbformat": 4,
1225 | "nbformat_minor": 4
1226 | }
1227 |
--------------------------------------------------------------------------------
/Readme.md:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/manthan89-py/Fake_News_detection/840c105c82bf7756569744e647bbc374ea32faa9/model.pkl
--------------------------------------------------------------------------------