├── Projects ├── .gitkeep ├── Spam-Ham Classification │ ├── .gitkeep │ ├── bagofwords_classifier.py │ ├── HashVectorizer.py │ ├── passive_aggresiveClassifier.py │ └── readme.md ├── Covid_tweets Sentiment Analysis │ ├── data.zip │ ├── readme.md │ └── Corona_NLP text classification.ipynb ├── IMDB Movie Reviews │ └── readme.md ├── Tweet Sentiment Extraction │ └── readme.md ├── Fake News Detection │ ├── readme.md │ └── Fake News.ipynb ├── Daily News for Stock Market Prediction │ ├── README.md │ └── Stock Prediction using News Headlines.ipynb ├── Twitter Sentiment Analysis(Beginners) │ ├── readme.md │ └── Twitter Sentiment Analysis (Small Dataset).ipynb ├── Women's E-Commerce Clothing Reviews │ └── readme.md └── Yelp Reviews │ └── readme.md ├── Deep Learning ├── readme.md ├── Loss │ ├── .gitkeep │ └── loss.py ├── Activation Functions │ ├── .gitkeep │ └── activationfunctions.py └── cuda.py ├── Regular Expression └── .gitkeep ├── cuda.py ├── stemming_demo.py ├── tokenize.py ├── stopwords_demo.py ├── Basic Perceptron.py ├── Lemmatization.py ├── README.md ├── speech_tagging.py ├── chinking.py ├── nameEntity_recog.py ├── chunking.py ├── tf-idf.py ├── bagofwords.py └── word2vec.py /Projects/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Deep Learning/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Deep Learning/Loss/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Regular Expression/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Deep Learning/Activation Functions/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Projects/Spam-Ham Classification/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Projects/Covid_tweets Sentiment Analysis/data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bhav09/NLP-Basics/HEAD/Projects/Covid_tweets Sentiment Analysis/data.zip -------------------------------------------------------------------------------- /cuda.py: -------------------------------------------------------------------------------- 1 | #cuda 2 | import torch 3 | print(torch.cuda.is_available) 4 | 5 | device=torch.device("cuda") 6 | x=torch.randn(2,2).to(device) 7 | 8 | #we cannot perform operators with cuda and cpu tensors at once 9 | #both the tensors either have to be CPU or Cuda tensors -------------------------------------------------------------------------------- /Deep Learning/cuda.py: -------------------------------------------------------------------------------- 1 | #cuda 2 | import torch 3 | print(torch.cuda.is_available) 4 | 5 | device=torch.device("cuda") 6 | x=torch.randn(2,2).to(device) 7 | 8 | #we cannot perform operators with cuda and cpu tensors at once 9 | #both the tensors either have to be CPU or Cuda tensors 
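# Suggested correction (not part of the original file): torch.cuda.is_available is a
# function, so the print above shows the function object instead of True/False. Calling
# it, and falling back to the CPU when no GPU is present, keeps the snippet runnable
# everywhere:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
x = torch.randn(2, 2).to(device)   # move the tensor to the chosen device
print(x.device)                    # cuda:0 on a GPU machine, cpu otherwise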
-------------------------------------------------------------------------------- /stemming_demo.py: -------------------------------------------------------------------------------- 1 | '''Stemming is the process of reducing a word to its word stem that affixes to 2 | suffixes and prefixes or to the roots of words known as a lemma 3 | ''' 4 | from nltk.stem import PorterStemmer 5 | from nltk.tokenize import word_tokenize 6 | 7 | text='He played football every Tuesday. He plays football every Tuesday. He is going to play football every Tuesday.' 8 | words=word_tokenize(text) 9 | #print(words) 10 | 11 | ps=PorterStemmer() 12 | 13 | for w in words: 14 | print(ps.stem(w)) 15 | -------------------------------------------------------------------------------- /Projects/IMDB Movie Reviews/readme.md: -------------------------------------------------------------------------------- 1 | ## About 2 | 3 | IMDB dataset having 50K movie reviews for natural language processing or Text analytics. 4 | 5 | This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. 6 | We provide a set of 25,000 highly polar movie reviews for training and 25,000 for testing. So, predict the number of positive and negative reviews using either classification or deep learning algorithms. 7 | 8 | ## Dataset [Link](https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews) 9 | -------------------------------------------------------------------------------- /Projects/Tweet Sentiment Extraction/readme.md: -------------------------------------------------------------------------------- 1 | ## About 2 | 3 | ### Files 4 | 5 | train.csv - the training set 6 | 7 | test.csv - the test set 8 | 9 | sample_submission.csv - a sample submission file in the correct format 10 | 11 | Columns 12 | 13 | textID - unique ID for each piece of text 14 | 15 | text - the text of the tweet 16 | 17 | sentiment - the general sentiment of the tweet 18 | 19 | selected_text - [train only] the text that supports the tweet's sentiment 20 | 21 | ### Source: Kaggle 22 | 23 | ### Dataset [Link](https://www.kaggle.com/c/tweet-sentiment-extraction/data) 24 | -------------------------------------------------------------------------------- /tokenize.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Tokenizing is the process in which huge sentence/ paragraphs are divided into smaller segments called tokens. 3 | Here we will be seeing two tokenizers : word_tokenizer, sent_tokenizer 4 | word_tokenizer=it actually divides a group of sentence where the delimiter is the word 5 | sent_tokenizer= it delimits the para/sentences on sentences. 6 | ''' 7 | 8 | from nltk.tokenize import sent_tokenize, word_tokenize 9 | 10 | 11 | text='Hello Mr. Bhavishya Pandit. How are doing? Hope everything is going smooth.' 
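# Expected output of the two prints below (approximately, assuming NLTK's default 'punkt'
# sentence model, which treats "Mr." as an abbreviation rather than a sentence boundary):
#   Sentence Tokenize: ['Hello Mr. Bhavishya Pandit.', 'How are doing?', 'Hope everything is going smooth.']
#   Word Tokenize: ['Hello', 'Mr.', 'Bhavishya', 'Pandit', '.', 'How', 'are', 'doing', '?', ...]
# Both tokenizers need the punkt data: run nltk.download('punkt') once beforehand.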
12 | print('Sentence Tokenize:',sent_tokenize(text)) 13 | print() 14 | print('Word Tokenize:',word_tokenize(text)) -------------------------------------------------------------------------------- /Projects/Fake News Detection/readme.md: -------------------------------------------------------------------------------- 1 | ## About 2 | 3 | train.csv: A full training dataset with the following attributes: 4 | 5 | id: unique id for a news article 6 | title: the title of a news article 7 | author: author of the news article 8 | text: the text of the article; could be incomplete 9 | label: a label that marks the article as potentially unreliable 10 | 11 | 1: unreliable 12 | 13 | 0: reliable 14 | 15 | test.csv: A testing training dataset with all the same attributes at train.csv without the label. 16 | 17 | submit.csv: A sample submission that you can 18 | 19 | #### Source: Kaggle 20 | 21 | ### Dataset [Link](https://www.kaggle.com/c/fake-news/data) 22 | -------------------------------------------------------------------------------- /stopwords_demo.py: -------------------------------------------------------------------------------- 1 | ''' 2 | stopwords can be understood as : to exclude or stop at a point where a certain word among the list of words occur 3 | in a particular para/ sentence 4 | ''' 5 | from nltk.corpus import stopwords 6 | from nltk.tokenize import word_tokenize 7 | 8 | text='It is an example of showing the stop words filteration.' 9 | stop_words=stopwords.words('english') 10 | #print(stop_words) 11 | 12 | filtered_list=[] 13 | #now filtering our sentence 14 | words=word_tokenize(text) 15 | for w in words: 16 | if w not in stop_words: 17 | filtered_list.append(w) 18 | print(filtered_list) # ['It', 'example', 'showing', 'stop', 'words', 'filteration', '.'] -------------------------------------------------------------------------------- /Projects/Daily News for Stock Market Prediction/README.md: -------------------------------------------------------------------------------- 1 | ## About 2 | 3 | There are two channels of data provided in this dataset: 4 | 5 | News data: I crawled historical news headlines from Reddit WorldNews Channel (/r/worldnews). They are ranked by reddit users' votes, and only the top 25 headlines are considered for a single date. 6 | (Range: 2008-06-08 to 2016-07-01) 7 | 8 | Stock data: Dow Jones Industrial Average (DJIA) is used to "prove the concept". 9 | (Range: 2008-08-08 to 2016-07-01) 10 | 11 | Note: If you'd like to cite this dataset in your publications, please use: 12 | 13 | Sun, J. (2016, August). Daily News for Stock Market Prediction, Version 1. Retrieved [Date You Retrieved This Data] from https://www.kaggle.com/aaron7sun/stocknews. 
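One possible starting point for the prediction task (a sketch only; it assumes the merged Kaggle file `Combined_News_DJIA.csv` with columns `Date`, `Label`, `Top1` ... `Top25`; adjust the file and column names to match what you actually download):

```python
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

df = pd.read_csv('Combined_News_DJIA.csv')                 # assumed file name
headline_cols = [c for c in df.columns if c.startswith('Top')]

# merge the top-25 headlines into one document per trading day
docs = df[headline_cols].fillna('').astype(str).agg(' '.join, axis=1).str.lower()

cv = CountVectorizer(max_features=5000, stop_words='english')
X = cv.fit_transform(docs)                                  # features for any classifier
y = df['Label']                                             # market-direction label provided with the dataset
```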
14 | 15 | ### Source: Kaggle 16 | 17 | ### Dataset [Link](https://www.kaggle.com/aaron7sun/stocknews) 18 | -------------------------------------------------------------------------------- /Deep Learning/Loss/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import matplotlib.pyplot as plt 4 | 5 | def mse(): 6 | mse_loss = nn.MSELoss() 7 | outputs = torch.randn(3, 5, requires_grad=True) 8 | targets = torch.randn(3, 5) 9 | loss = mse_loss(outputs, targets) 10 | print(loss) 11 | 12 | def crossentropy(): 13 | ce_loss = nn.CrossEntropyLoss() 14 | outputs = torch.randn(3, 5, requires_grad=True) 15 | targets = torch.tensor([1, 0, 3], dtype=torch.int64) 16 | loss = ce_loss(outputs, targets) 17 | print(loss) 18 | 19 | def bce(): #binary cross entropy 20 | bce_loss = nn.BCELoss() 21 | sigmoid = nn.Sigmoid() 22 | probabilities = sigmoid(torch.randn(4, 1, requires_grad=True)) 23 | targets = torch.tensor([1, 0, 1, 0], dtype=torch.float32).view(4, 1) 24 | oss = bce_loss(probabilities, targets) 25 | print(probabilities) 26 | print(loss) 27 | -------------------------------------------------------------------------------- /Basic Perceptron.py: -------------------------------------------------------------------------------- 1 | #perceptron with pytroch 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.functional as f 6 | from torch.autograd import Variable 7 | 8 | class Net(nn.Module): 9 | def __init__(self): 10 | super(Net,self).__init__() 11 | self.fc1=nn.Linear(1,1) 12 | 13 | def forward(self,x): 14 | x=self.fc1(x) 15 | return x 16 | 17 | net=Net() 18 | print(net) 19 | '''Net( 20 | (fc1): Linear(in_features=1, out_features=1, bias=True) 21 | )''' 22 | 23 | #to print the parameters of the neural network 24 | print(list(net.parameters())) 25 | '''[Parameter containing: 26 | tensor([[0.4780]], requires_grad=True), Parameter containing: 27 | tensor([0.1686], requires_grad=True)]''' 28 | 29 | input = Variable(torch.randn(1,1,1), requires_grad=True) 30 | print(input) #tensor([[[0.7907]]], requires_grad=True) 31 | 32 | output=net(input) 33 | print(output) #tensor([[[0.5466]]], grad_fn=) 34 | 35 | import torch.optim as optim 36 | def criterion(out, label): 37 | return (label - out)**2 38 | -------------------------------------------------------------------------------- /Deep Learning/Activation Functions/activationfunctions.py: -------------------------------------------------------------------------------- 1 | #activation functions 2 | 3 | import matplotlib.pyplot as plt 4 | import torch 5 | import numpy as np 6 | 7 | fig, ax = plt.subplots(2,2) 8 | fig.suptitle('Activation Functions') 9 | 10 | def sigmoid(): 11 | x=torch.range(-5,5,0.1) 12 | y=torch.sigmoid(x) 13 | ax[0,0].grid() 14 | ax[0,0].plot(x.numpy(), y.numpy()) 15 | ax[0,0].set_title('Sigmoid') 16 | 17 | def tanh(): 18 | x=torch.range(-5,5,0.1) 19 | y=torch.tanh(x) 20 | ax[0,1].grid() 21 | ax[0,1].plot(x.numpy(), y.numpy(),color='orange') 22 | ax[0,1].set_title('Tanh') 23 | 24 | def relu(): 25 | x=torch.range(-5,5,0.1) 26 | y=torch.relu(x) 27 | ax[1,0].grid() 28 | ax[1,0].plot(x.numpy(), y.numpy(),color='g') 29 | ax[1,0].set_title('RelU') 30 | 31 | def prelu(): 32 | prelu = torch.nn.PReLU(num_parameters=1) 33 | x=torch.range(-5,5,0.1) 34 | y=prelu(x) 35 | ax[1,1].grid() 36 | ax[1,1].plot(x.numpy(), y.detach().numpy(),color='r') 37 | ax[1,1].set_title('PRelU') 38 | 39 | sigmoid() 40 | tanh() 41 | relu() 42 | prelu() 
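# Note (not part of the original file): torch.range is deprecated in recent PyTorch
# releases; torch.arange(-5, 5, 0.1) is the drop-in replacement (arange excludes the end
# point, so the curves simply stop at x = 4.9 instead of 5.0). When this script is run
# outside an interactive environment such as Spyder or a notebook, the figure also needs
# an explicit call to be displayed:
plt.show()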
-------------------------------------------------------------------------------- /Lemmatization.py: -------------------------------------------------------------------------------- 1 | '''Lemmatization is the process of grouping together the different inflected forms of a word so they can be analysed as 2 | a single item. Lemmatization is similar to stemming but it brings context to the words. 3 | So it links words with similar meaning to one word. 4 | 5 | Text preprocessing includes both Stemming as well as Lemmatization. 6 | Many times people find these two terms confusing. Some treat these two as same. 7 | Actually, lemmatization is preferred over Stemming because lemmatization does morphological analysis of the words. 8 | The word resulting would have the same meaning but would be a synonym of the actual word 9 | ''' 10 | 11 | #dependency : nltk.download('wordnet') 12 | 13 | from nltk.stem import WordNetLemmatizer 14 | 15 | lemm=WordNetLemmatizer() 16 | print(lemm.lemmatize('dogs')) #prints dog 17 | 18 | print(lemm.lemmatize('mosquitoes')) #prints mosquito 19 | 20 | print(lemm.lemmatize('better',pos="a")) #prints good (a) stands for adjective 21 | #also the default parameter for lemmatizer is noun (n) 22 | 23 | print(lemm.lemmatize('eating',pos="v")) #prints eat 24 | 25 | -------------------------------------------------------------------------------- /Projects/Twitter Sentiment Analysis(Beginners)/readme.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | There two datasets Respectively one Consists of Tweets from Twitter with Sentimental Label and the other from Reddit which Consists of Comments with its Sentimental Label. 4 | 5 | 1.Twitter Dataset 6 | 7 | 2.Reddit Dataset 8 | 9 | All these Tweets and Comments were extracted using there Respective Apis Tweepy and PRAW. 10 | These tweets and Comments Were Made on Narendra Modi and Other Leaders as well as Peoples Opinion Towards the Next Prime Minister of The Nation ( In Context with General Elections Held In India - 2019). 11 | All the Tweets and Comments From twitter and Reddit are Cleaned using Pythons re and also NLP with a Sentimental Label to each ranging from -1 to 1. 12 | 13 | 0 Indicating it is a Neutral Tweet/Comment 14 | 15 | 1 Indicating a Postive Sentiment 16 | 17 | -1 Indicating a Negative Tweet/Comment 18 | 19 | Content 20 | 21 | Twitter.csv Dataset has around 163K Tweets along with Sentiment Labels. 22 | Reddit.csv Dataset has around 37K Comments along with its Sentimental Label 23 | So Generally Each Dataset has two columns, the first column has the cleaned tweets and Comments and the Second one indicates its Sentimental Label 24 | 25 | ## Dataset [Link](https://www.kaggle.com/cosmos98/twitter-and-reddit-sentimental-analysis-dataset?select=Twitter_Data.csv) 26 | 27 | ### Source: Kaggle 28 | -------------------------------------------------------------------------------- /Projects/Covid_tweets Sentiment Analysis/readme.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | It is an NLP based classification problem of Covid tweets sentiment analyis. This is a very small dataset which consists of the tweets from a small location dating from mid march to mid april 2020. 
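A quick way to load the data (a sketch; the CSV names inside `data.zip` are assumed to be `Corona_NLP_train.csv` and `Corona_NLP_test.csv`, so check `namelist()` and adjust if they differ):

```python
import zipfile
import pandas as pd

with zipfile.ZipFile('data.zip') as zf:
    print(zf.namelist())                            # confirm the actual file names first
    with zf.open('Corona_NLP_train.csv') as f:      # assumed name
        train = pd.read_csv(f, encoding='latin-1')  # encoding guess; the tweets are not plain ASCII

print(train.shape)                                  # expected: 41157 training tweets (see below)
print(train['Sentiment'].value_counts())            # the five sentiment classes listed below
```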
4 | 5 | --------------------------------------------------------------------------------------- 6 | 7 | ### About Features 8 | 9 | Username: User's username which is denoted with integers 10 | 11 | ScreenName: Screen name which is denoted with integers 12 | 13 | Location: Location of the user (Country Name/City Name) 14 | 15 | TweetAt: Time of the tweet 16 | 17 | Original Tweet: Text written in the tweet 18 | 19 | Sentiment: It denotes the type of tweet. It is a categorical variable which has been divided into the following categoires 20 | 21 | * Extremely Positive 22 | 23 | * Positive 24 | 25 | * Neutral 26 | 27 | * Negative 28 | 29 | * Extremely Negative 30 | 31 | --------------------------------------------------------------------------------------- 32 | 33 | ### About Dataset 34 | 35 | The data set consists of two files: train and test 36 | 37 | Training set consists of 41157 tweets 38 | 39 | Testing set consists of 3798 tweets 40 | 41 | --------------------------------------------------------------------------------------- 42 | 43 | ### Dataset [Link](https://github.com/bhav09/NLP_basics/blob/master/Projects/Covid_tweets%20Sentiment%20Analysis/data.zip) 44 | -------------------------------------------------------------------------------- /Projects/Women's E-Commerce Clothing Reviews/readme.md: -------------------------------------------------------------------------------- 1 | # Context 2 | 3 | Multi Class Classiification 4 | 5 | Welcome. This is a Women’s Clothing E-Commerce dataset revolving around the reviews written by customers. Its nine supportive features offer a great environment to parse out the text through its multiple dimensions. Because this is real commercial data, it has been anonymized, and references to the company in the review text and body have been replaced with “retailer”. 6 | 7 | # Content 8 | This dataset includes 23486 rows and 10 feature variables. Each row corresponds to a customer review, and includes the variables: 9 | 10 | Clothing ID: Integer Categorical variable that refers to the specific piece being reviewed. 11 | 12 | Age: Positive Integer variable of the reviewers age. 13 | 14 | Title: String variable for the title of the review. 15 | 16 | Review Text: String variable for the review body. 17 | 18 | Rating: Positive Ordinal Integer variable for the product score granted by the customer from 1 Worst, to 5 Best. 19 | 20 | Recommended IND: Binary variable stating where the customer recommends the product where 1 is recommended, 0 is not recommended. 21 | 22 | Positive Feedback Count: Positive Integer documenting the number of other customers who found this review positive. 23 | 24 | Division Name: Categorical name of the product high level division. 25 | 26 | Department Name: Categorical name of the product department name. 27 | 28 | Class Name: Categorical name of the product class name. 29 | 30 | ## Acknowledgements 31 | Anonymous but real source 32 | 33 | ## Source: Kaggle 34 | -------------------------------------------------------------------------------- /Projects/Yelp Reviews/readme.md: -------------------------------------------------------------------------------- 1 | # Context 2 | 3 | This dataset is a subset of Yelp's businesses, reviews, and user data. It was originally put together for the Yelp Dataset Challenge which is a chance for students to conduct research or analysis on Yelp's data and share their discoveries. In the dataset you'll find information about businesses across 11 metropolitan areas in four countries. 
4 | 5 | # Content 6 | 7 | This dataset contains seven CSV files. The original JSON files can be found in yelpacademicdataset.zip. 8 | 9 | 10 | You may find this documentation helpful: 11 | 12 | https://www.yelp.com/dataset/documentation/json 13 | 14 | In total, there are : 15 | 16 | 5,200,000 user reviews 17 | Information on 174,000 businesses 18 | The data spans 11 metropolitan areas 19 | Acknowledgements 20 | The dataset was converted from JSON to CSV format and we thank the team of the Yelp dataset challenge for creating this dataset. 21 | 22 | # Inspiration 23 | 24 | Natural Language Processing & Sentiment Analysis 25 | 26 | What's in a review? Is it positive or negative? Yelp's reviews contain a lot of metadata that can be mined and used to infer meaning, business attributes, and sentiment. 27 | 28 | ## Graph Mining 29 | 30 | We recently launched our Local Graph but can you take the graph further? How do user's relationships define their usage patterns? Where are the trend setters eating before it becomes popular? 31 | 32 | 33 | 34 | ## Original Dataset [Link](https://www.kaggle.com/yelp-dataset/yelp-dataset?select=yelp_academic_dataset_checkin.json) 35 | 36 | ## Review Dataset [Link](https://www.kaggle.com/luisfredgs/yelp-reviews-csv) 37 | 38 | ### Source: Kaggle 39 | -------------------------------------------------------------------------------- /Projects/Spam-Ham Classification/bagofwords_classifier.py: -------------------------------------------------------------------------------- 1 | #to classify whether a sms is spam or ham 2 | 3 | #dependencies 4 | import pandas as pd 5 | import nltk 6 | import numpy as np 7 | from nltk.corpus import stopwords 8 | from nltk.tokenize import sent_tokenize as st 9 | from nltk.stem import WordNetLemmatizer as wordnet 10 | import re 11 | 12 | #reading the file 13 | df=pd.read_csv('spam.csv',encoding = 'ISO-8859-1',usecols=['v1','v2']) 14 | corpus=[] 15 | wordnet=wordnet() 16 | length=len(df['v2']) 17 | for i in range(length): 18 | rev=re.sub('[^a-zA-Z]',' ',df['v2'][i]) 19 | rev=rev.lower() 20 | rev=rev.split() 21 | rev=[wordnet.lemmatize(word) for word in rev if word not in stopwords.words('english')] 22 | rev=' '.join(rev) 23 | corpus.append(rev) 24 | 25 | from sklearn.feature_extraction.text import CountVectorizer 26 | cv=CountVectorizer(max_features=2500) 27 | x=cv.fit_transform(corpus).toarray() 28 | y=df['v1'] #dependent variable 29 | 30 | #y is a categorical variable so will encode it 31 | from sklearn.preprocessing import LabelEncoder 32 | le=LabelEncoder() 33 | y=le.fit_transform(y) 34 | 35 | #now splittin the model into train and test set 36 | 37 | from sklearn.model_selection import train_test_split 38 | x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2) 39 | 40 | #training the model 41 | from sklearn.naive_bayes import MultinomialNB 42 | model=MultinomialNB() 43 | model.fit(x_train,y_train) 44 | 45 | #predicting the values 46 | y_pred=model.predict(x_test) 47 | 48 | #score of the model 49 | model.score(x_test,y_test) 50 | 51 | from sklearn.metrics import confusion_matrix 52 | cm=confusion_matrix(y_test,y_pred) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NLP-Basics 2 | 3 | 4 | This repository consists of all the necessary codes , basic theory and resources for anyone to start from scratch and study ! 
5 | To practice all this , I would recommend spyder IDE for the same because it will notify you via error for any package that you won't be having of nltk. 6 | 7 | 8 | To download all the packages of nltk type this in the command prompt: 9 | 10 | import nltk 11 | 12 | nltk.download() 13 | 14 | 15 | (make sure to have a python environment running on your cmd for the same) 16 | 17 | **************************************************************************************************************************** 18 | 19 | Note: 20 | 21 | The order of learning all the theory and code should be in the order as that of the topics mentioned in the resource section. 22 | 23 | **************************************************************************************************************************** 24 | 25 | Resources: 26 | 27 | 1.More about NLP : https://machinelearningmastery.com/natural-language-processing/ 28 | 29 | Topics: 30 | 31 | 2. Tokenization : https://intellipaat.com/community/9025/tokenization-in-nlp 32 | 33 | Stopwords : https://towardsdatascience.com/treat-negation-stopwords-differently-according-to-your-nlp-task-e5a59ab7c91f 34 | 35 | Stemming : https://searchenterpriseai.techtarget.com/definition/stemming 36 | 37 | Speech Tagging: https://www.geeksforgeeks.org/nlp-part-of-speech-default-tagging/ 38 | 39 | Chunking : https://www.geeksforgeeks.org/nlp-chunking-and-chinking-with-regex/ 40 | 41 | Named Entity Recognition : https://towardsdatascience.com/named-entity-recognition-3fad3f53c91e 42 | 43 | Lemmatization : https://www.datacamp.com/community/tutorials/stemming-lemmatization-python 44 | 45 | 3. Applications of NLP : https://towardsdatascience.com/natural-language-processing-nlp-top-10-applications-to-know-b2c80bd428cb 46 | -------------------------------------------------------------------------------- /speech_tagging.py: -------------------------------------------------------------------------------- 1 | ''' 2 | speech tagging - what is basically is doing , is tagging the words into various articulates of english grammar 3 | makes a tuple which is of the format : (word,tag) 4 | 5 | 6 | POS tag list: 7 | 8 | CC coordinating conjunction 9 | CD cardinal digit 10 | DT determiner 11 | EX existential there (like: "there is" ... think of it like "there exists") 12 | FW foreign word 13 | IN preposition/subordinating conjunction 14 | JJ adjective 'big' 15 | JJR adjective, comparative 'bigger' 16 | JJS adjective, superlative 'biggest' 17 | LS list marker 1) 18 | MD modal could, will 19 | NN noun, singular 'desk' 20 | NNS noun plural 'desks' 21 | NNP proper noun, singular 'Harrison' 22 | NNPS proper noun, plural 'Americans' 23 | PDT predeterminer 'all the kids' 24 | POS possessive ending parent\'s 25 | PRP personal pronoun I, he, she 26 | PRP$ possessive pronoun my, his, hers 27 | RB adverb very, silently, 28 | RBR adverb, comparative better 29 | RBS adverb, superlative best 30 | RP particle give up 31 | TO to go 'to' the store. 32 | UH interjection errrrrrrrm 33 | VB verb, base form take 34 | VBD verb, past tense took 35 | VBG verb, gerund/present participle taking 36 | VBN verb, past participle taken 37 | VBP verb, sing. present, non-3d take 38 | VBZ verb, 3rd person sing. 
present takes 39 | WDT wh-determiner which 40 | WP wh-pronoun who, what 41 | WP$ possessive wh-pronoun whose 42 | WRB wh-abverb where, when 43 | 44 | ''' 45 | 46 | from nltk.corpus import state_union 47 | from nltk.tokenize import PunktSentenceTokenizer 48 | import nltk 49 | 50 | train_text=state_union.raw('2005-GWBush.txt') 51 | test_text=state_union.raw('2006-GWBush.txt') 52 | #print(text) 53 | 54 | custom_tokenizer=PunktSentenceTokenizer(train_text) 55 | test_tokenizer=custom_tokenizer.tokenize(test_text) 56 | 57 | #print(test_tokenizer) 58 | 59 | def our_content(): 60 | try: 61 | for i in test_tokenizer: 62 | words=nltk.word_tokenize(i) 63 | tag=nltk.pos_tag(words) 64 | print(tag) 65 | except Exception as e: 66 | print(str(e)) 67 | our_content() -------------------------------------------------------------------------------- /chinking.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Chinking- the words that we exclude from chunks are called chinks 3 | 4 | 5 | POS tag list: 6 | 7 | CC coordinating conjunction 8 | CD cardinal digit 9 | DT determiner 10 | EX existential there (like: "there is" ... think of it like "there exists") 11 | FW foreign word 12 | IN preposition/subordinating conjunction 13 | JJ adjective 'big' 14 | JJR adjective, comparative 'bigger' 15 | JJS adjective, superlative 'biggest' 16 | LS list marker 1) 17 | MD modal could, will 18 | NN noun, singular 'desk' 19 | NNS noun plural 'desks' 20 | NNP proper noun, singular 'Harrison' 21 | NNPS proper noun, plural 'Americans' 22 | PDT predeterminer 'all the kids' 23 | POS possessive ending parent\'s 24 | PRP personal pronoun I, he, she 25 | PRP$ possessive pronoun my, his, hers 26 | RB adverb very, silently, 27 | RBR adverb, comparative better 28 | RBS adverb, superlative best 29 | RP particle give up 30 | TO to go 'to' the store. 31 | UH interjection errrrrrrrm 32 | VB verb, base form take 33 | VBD verb, past tense took 34 | VBG verb, gerund/present participle taking 35 | VBN verb, past participle taken 36 | VBP verb, sing. present, non-3d take 37 | VBZ verb, 3rd person sing. 
present takes 38 | WDT wh-determiner which 39 | WP wh-pronoun who, what 40 | WP$ possessive wh-pronoun whose 41 | WRB wh-abverb where, when 42 | 43 | ''' 44 | 45 | from nltk.corpus import state_union 46 | from nltk.tokenize import PunktSentenceTokenizer 47 | import nltk 48 | from nltk.chunk import RegexpParser 49 | 50 | train_text=state_union.raw('2005-GWBush.txt') 51 | test_text=state_union.raw('2006-GWBush.txt') 52 | #print(text) 53 | 54 | custom_tokenizer=PunktSentenceTokenizer(train_text) 55 | test_tokenizer=custom_tokenizer.tokenize(test_text) 56 | 57 | #print(test_tokenizer) 58 | 59 | def our_content(): 60 | try: 61 | for i in test_tokenizer: 62 | words=nltk.word_tokenize(i) 63 | tag=nltk.pos_tag(words) 64 | print(tag) 65 | chunkGram=r''' Chunk:{<.*>+}}{''' 66 | chunkParser=nltk.RegexpParser(chunkGram) 67 | chunked=chunkParser.parse(tag) 68 | chunked.draw() 69 | except Exception as e: 70 | print(str(e)) 71 | our_content() -------------------------------------------------------------------------------- /Projects/Spam-Ham Classification/HashVectorizer.py: -------------------------------------------------------------------------------- 1 | #HashingVectorizer 2 | 3 | import pandas as pd 4 | import nltk 5 | import numpy as np 6 | from nltk.corpus import stopwords 7 | from nltk.tokenize import sent_tokenize as st 8 | from nltk.stem import WordNetLemmatizer as wordnet 9 | import re 10 | from sklearn.metrics import classification_report 11 | 12 | #reading the file 13 | df=pd.read_csv('spam.csv',encoding = 'ISO-8859-1',usecols=['v1','v2']) 14 | corpus=[] 15 | wordnet=wordnet() 16 | length=len(df['v2']) 17 | for i in range(length): 18 | rev=re.sub('[^a-zA-Z]',' ',df['v2'][i]) 19 | rev=rev.lower() 20 | rev=rev.split() 21 | rev=[wordnet.lemmatize(word) for word in rev if word not in stopwords.words('english')] 22 | rev=' '.join(rev) 23 | corpus.append(rev) 24 | 25 | 26 | from sklearn.feature_extraction.text import HashingVectorizer as hv 27 | 28 | hv=hv(n_features=5000) 29 | x=hv.fit_transform(corpus).toarray() 30 | y=df['v1'] #dependent variable 31 | 32 | #y is a categorical variable so will encode it 33 | from sklearn.preprocessing import LabelEncoder 34 | le=LabelEncoder() 35 | y=le.fit_transform(y) 36 | 37 | #now splittin the model into train and test set 38 | from sklearn.model_selection import train_test_split 39 | x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2) 40 | #training the model 41 | from sklearn.linear_model import PassiveAggressiveClassifier 42 | model=PassiveAggressiveClassifier() 43 | model.fit(x_train,y_train) 44 | #predicting the values 45 | y_pred=model.predict(x_test) 46 | #score of the model 47 | model.score(x_test,y_test) 48 | from sklearn.metrics import confusion_matrix 49 | cm=confusion_matrix(y_test,y_pred) 50 | print(f"Classification Report : \n\n{classification_report(y_test, y_pred)}") 51 | '''Classification Report : 52 | 53 | precision recall f1-score support 54 | 55 | 0 0.98 0.99 0.99 965 56 | 1 0.96 0.88 0.92 150 57 | 58 | accuracy 0.98 1115 59 | macro avg 0.97 0.94 0.95 1115 60 | weighted avg 0.98 0.98 0.98 1115 61 | ''' -------------------------------------------------------------------------------- /Projects/Spam-Ham Classification/passive_aggresiveClassifier.py: -------------------------------------------------------------------------------- 1 | #passive aggressive classifier 2 | 3 | import pandas as pd 4 | import nltk 5 | import numpy as np 6 | from nltk.corpus import stopwords 7 | from nltk.tokenize import sent_tokenize as st 8 | from 
nltk.stem import WordNetLemmatizer as wordnet 9 | import re 10 | from sklearn.metrics import classification_report 11 | 12 | #reading the file 13 | df=pd.read_csv('spam.csv',encoding = 'ISO-8859-1',usecols=['v1','v2']) 14 | corpus=[] 15 | wordnet=wordnet() 16 | length=len(df['v2']) 17 | for i in range(length): 18 | rev=re.sub('[^a-zA-Z]',' ',df['v2'][i]) 19 | rev=rev.lower() 20 | rev=rev.split() 21 | rev=[wordnet.lemmatize(word) for word in rev if word not in stopwords.words('english')] 22 | rev=' '.join(rev) 23 | corpus.append(rev) 24 | from sklearn.feature_extraction.text import CountVectorizer 25 | cv=CountVectorizer(max_features=2500) 26 | x=cv.fit_transform(corpus).toarray() 27 | y=df['v1'] #dependent variable 28 | 29 | #y is a categorical variable so will encode it 30 | from sklearn.preprocessing import LabelEncoder 31 | le=LabelEncoder() 32 | y=le.fit_transform(y) 33 | 34 | #now splittin the model into train and test set 35 | from sklearn.model_selection import train_test_split 36 | x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2) 37 | #training the model 38 | from sklearn.linear_model import PassiveAggressiveClassifier 39 | model=PassiveAggressiveClassifier() 40 | model.fit(x_train,y_train) 41 | #predicting the values 42 | y_pred=model.predict(x_test) 43 | #score of the model 44 | model.score(x_test,y_test) 45 | from sklearn.metrics import confusion_matrix 46 | cm=confusion_matrix(y_test,y_pred) 47 | print(f"Classification Report : \n\n{classification_report(y_test, y_pred)}") 48 | '''Classification Report : 49 | 50 | precision recall f1-score support 51 | 52 | 0 0.98 0.99 0.99 965 53 | 1 0.96 0.88 0.92 150 54 | 55 | accuracy 0.98 1115 56 | macro avg 0.97 0.94 0.95 1115 57 | weighted avg 0.98 0.98 0.98 1115 58 | ''' -------------------------------------------------------------------------------- /nameEntity_recog.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Named entity recognition (NER) , also known as entity chunking/extraction , 3 | is a popular technique used in information extraction to identify and segment the named 4 | entities and classify or categorize them under various predefined classes. 5 | 6 | 7 | POS tag list: 8 | 9 | CC coordinating conjunction 10 | CD cardinal digit 11 | DT determiner 12 | EX existential there (like: "there is" ... think of it like "there exists") 13 | FW foreign word 14 | IN preposition/subordinating conjunction 15 | JJ adjective 'big' 16 | JJR adjective, comparative 'bigger' 17 | JJS adjective, superlative 'biggest' 18 | LS list marker 1) 19 | MD modal could, will 20 | NN noun, singular 'desk' 21 | NNS noun plural 'desks' 22 | NNP proper noun, singular 'Harrison' 23 | NNPS proper noun, plural 'Americans' 24 | PDT predeterminer 'all the kids' 25 | POS possessive ending parent\'s 26 | PRP personal pronoun I, he, she 27 | PRP$ possessive pronoun my, his, hers 28 | RB adverb very, silently, 29 | RBR adverb, comparative better 30 | RBS adverb, superlative best 31 | RP particle give up 32 | TO to go 'to' the store. 33 | UH interjection errrrrrrrm 34 | VB verb, base form take 35 | VBD verb, past tense took 36 | VBG verb, gerund/present participle taking 37 | VBN verb, past participle taken 38 | VBP verb, sing. present, non-3d take 39 | VBZ verb, 3rd person sing. 
present takes 40 | WDT wh-determiner which 41 | WP wh-pronoun who, what 42 | WP$ possessive wh-pronoun whose 43 | WRB wh-abverb where, when 44 | 45 | ''' 46 | 47 | from nltk.corpus import state_union 48 | from nltk.tokenize import PunktSentenceTokenizer 49 | import nltk 50 | from nltk.chunk import RegexpParser 51 | 52 | train_text=state_union.raw('2005-GWBush.txt') 53 | test_text=state_union.raw('2006-GWBush.txt') 54 | #print(text) 55 | 56 | custom_tokenizer=PunktSentenceTokenizer(train_text) 57 | test_tokenizer=custom_tokenizer.tokenize(test_text) 58 | 59 | #print(test_tokenizer) 60 | 61 | def our_content(): 62 | try: 63 | for i in test_tokenizer: 64 | words=nltk.word_tokenize(i) 65 | tag=nltk.pos_tag(words) 66 | #print(tag) 67 | named_entity=nltk.ne_chunk(tag) 68 | named_entity.draw() 69 | 70 | except Exception as e: 71 | print(str(e)) 72 | our_content() -------------------------------------------------------------------------------- /chunking.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Chunking- also known as partial parsing. It is a process of meaningful extraction of short phrases from the sentences. 3 | chunks are made up of words and the kinds of words and the kinds of words are defined using the parts of speech tag 4 | 5 | 6 | POS tag list: 7 | 8 | CC coordinating conjunction 9 | CD cardinal digit 10 | DT determiner 11 | EX existential there (like: "there is" ... think of it like "there exists") 12 | FW foreign word 13 | IN preposition/subordinating conjunction 14 | JJ adjective 'big' 15 | JJR adjective, comparative 'bigger' 16 | JJS adjective, superlative 'biggest' 17 | LS list marker 1) 18 | MD modal could, will 19 | NN noun, singular 'desk' 20 | NNS noun plural 'desks' 21 | NNP proper noun, singular 'Harrison' 22 | NNPS proper noun, plural 'Americans' 23 | PDT predeterminer 'all the kids' 24 | POS possessive ending parent\'s 25 | PRP personal pronoun I, he, she 26 | PRP$ possessive pronoun my, his, hers 27 | RB adverb very, silently, 28 | RBR adverb, comparative better 29 | RBS adverb, superlative best 30 | RP particle give up 31 | TO to go 'to' the store. 32 | UH interjection errrrrrrrm 33 | VB verb, base form take 34 | VBD verb, past tense took 35 | VBG verb, gerund/present participle taking 36 | VBN verb, past participle taken 37 | VBP verb, sing. present, non-3d take 38 | VBZ verb, 3rd person sing. 
present takes 39 | WDT wh-determiner which 40 | WP wh-pronoun who, what 41 | WP$ possessive wh-pronoun whose 42 | WRB wh-abverb where, when 43 | 44 | ''' 45 | 46 | from nltk.corpus import state_union 47 | from nltk.tokenize import PunktSentenceTokenizer 48 | import nltk 49 | from nltk.chunk import RegexpParser 50 | 51 | train_text=state_union.raw('2005-GWBush.txt') 52 | test_text=state_union.raw('2006-GWBush.txt') 53 | #print(text) 54 | 55 | custom_tokenizer=PunktSentenceTokenizer(train_text) 56 | test_tokenizer=custom_tokenizer.tokenize(test_text) 57 | 58 | #print(test_tokenizer) 59 | 60 | def our_content(): 61 | try: 62 | for i in test_tokenizer: 63 | words=nltk.word_tokenize(i) 64 | tag=nltk.pos_tag(words) 65 | print(tag) 66 | chunkGram=r''' Chunk:{*}''' 67 | chunkParser=nltk.RegexpParser(chunkGram) 68 | chunked=chunkParser.parse(tag) 69 | chunked.draw() 70 | except Exception as e: 71 | print(str(e)) 72 | our_content() -------------------------------------------------------------------------------- /tf-idf.py: -------------------------------------------------------------------------------- 1 | #tf idf 2 | import nltk 3 | 4 | para = '''An atom is the smallest unit of ordinary matter that forms a chemical element. 5 | Every solid, liquid, gas, and plasma is composed of neutral or ionized atoms. 6 | Atoms are extremely small, typically around 100 picometers across. 7 | They are so small that accurately predicting their behavior using classical physics—as 8 | if they were tennis balls, for example—is not possible due to quantum effects. 9 | Every atom is composed of a nucleus and one or more electrons bound to the nucleus. 10 | The nucleus is made of one or more protons and a number of neutrons. 11 | Only the most common variety of hydrogen has no neutrons. 12 | More than 99.94% of an atom's mass is in the nucleus. 13 | The protons have a positive electric charge, the electrons have a negative electric charge, 14 | and the neutrons have no electric charge. If the number of protons and electrons are equal, 15 | then the atom is electrically neutral. If an atom has more or fewer electrons than protons, 16 | then it has an overall negative or positive charge, respectively – such atoms are called ions. 17 | The electrons of an atom are attracted to the protons in an atomic nucleus by the electromagnetic force. 18 | The protons and neutrons in the nucleus are attracted to each other by the nuclear force. 19 | This force is usually stronger than the electromagnetic force that repels the positively 20 | charged protons from one another. Under certain circumstances, the repelling electromagnetic 21 | force becomes stronger than the nuclear force. In this case, the nucleus splits and leaves 22 | behind different elements. 
This is a form of nuclear decay.''' 23 | 24 | #dependencies 25 | import re #regular expression 26 | from nltk.tokenize import sent_tokenize as st, word_tokenize as wt #for tokenization 27 | from nltk.corpus import stopwords #stop words 28 | from nltk.stem import WordNetLemmatizer as wl #for lemmatization 29 | 30 | wordnet=wl() #object creation for lemmatization 31 | corpus=[] #empty list 32 | sentences=st(para) #tokenizing the paragraph to sentences 33 | 34 | for i in range(len(sentences)): 35 | rev=re.sub('[^a-zA-Z]',' ',sentences[i]) #replace all the letters by space except the alphabets 36 | rev=rev.lower() #lower the senteces 37 | rev=rev.split() #each word gets converted to an element of a list 38 | rev=[wordnet.lemmatize(word) for word in rev if word not in stopwords.words('english')] 39 | rev=' '.join(rev) 40 | corpus.append(rev) 41 | 42 | #creating TF-IDF model 43 | from sklearn.feature_extraction.text import TfidfVectorizer as tfidf 44 | cv=tfidf() #object creation 45 | x=cv.fit_transform(corpus).toarray() #transforming -------------------------------------------------------------------------------- /bagofwords.py: -------------------------------------------------------------------------------- 1 | #dependency 2 | import nltk 3 | from nltk.tokenize import sent_tokenize as st 4 | from nltk.corpus import stopwords 5 | from nltk.stem import PorterStemmer as ps, WordNetLemmatizer as wl 6 | 7 | para='''An atom is the smallest unit of ordinary matter that forms a chemical element. 8 | Every solid, liquid, gas, and plasma is composed of neutral or ionized atoms. 9 | Atoms are extremely small, typically around 100 picometers across. 10 | They are so small that accurately predicting their behavior using classical physics—as 11 | if they were tennis balls, for example—is not possible due to quantum effects. 12 | 13 | Every atom is composed of a nucleus and one or more electrons bound to the nucleus. 14 | The nucleus is made of one or more protons and a number of neutrons. 15 | Only the most common variety of hydrogen has no neutrons. 16 | More than 99.94% of an atom's mass is in the nucleus. 17 | The protons have a positive electric charge, the electrons have a negative electric charge, 18 | and the neutrons have no electric charge. If the number of protons and electrons are equal, 19 | then the atom is electrically neutral. If an atom has more or fewer electrons than protons, 20 | then it has an overall negative or positive charge, respectively – such atoms are called ions. 21 | 22 | The electrons of an atom are attracted to the protons in an atomic nucleus by the electromagnetic force. 23 | The protons and neutrons in the nucleus are attracted to each other by the nuclear force. 24 | This force is usually stronger than the electromagnetic force that repels the positively 25 | charged protons from one another. Under certain circumstances, the repelling electromagnetic 26 | force becomes stronger than the nuclear force. In this case, the nucleus splits and leaves 27 | behind different elements. 
This is a form of nuclear decay.''' 28 | 29 | #clearning the texts 30 | import re 31 | 32 | ps=ps() #object creation porter stemmer 33 | wl=wl() #object creation word net lemmatizer 34 | sentences=st(para) #tokenizing to sentences 35 | corpus=[] 36 | 37 | for i in range(len(sentences)): 38 | rev=re.sub('[^a-zA-Z]',' ',sentences[i]) #everything other than alphabets would be replaced by space 39 | rev=rev.lower() #lowers the letters in the sentences 40 | rev=rev.split() #splits them word wise into elements of a list 41 | rev=[wl.lemmatize(word) for word in rev if word not in set(stopwords.words('english'))] 42 | rev=' '.join(rev) 43 | corpus.append(rev) #appending to list 44 | 45 | #bag of words 46 | from sklearn.feature_extraction.text import CountVectorizer #importing countervectorizer 47 | cv=CountVectorizer() 48 | x=cv.fit_transform(corpus).toarray() #transforming it to an array 49 | 50 | -------------------------------------------------------------------------------- /Projects/Spam-Ham Classification/readme.md: -------------------------------------------------------------------------------- 1 | ## About 2 | 3 | Context 4 | The SMS Spam Collection is a set of SMS tagged messages that have been collected for SMS Spam research. It contains one set of SMS messages in English of 5,574 messages, tagged acording being ham (legitimate) or spam. 5 | 6 | Content 7 | The files contain one message per line. Each line is composed by two columns: v1 contains the label (ham or spam) and v2 contains the raw text. 8 | 9 | This corpus has been collected from free or free for research sources at the Internet: 10 | 11 | -> A collection of 425 SMS spam messages was manually extracted from the Grumbletext Web site. This is a UK forum in which cell phone users make public claims about SMS spam messages, most of them without reporting the very spam message received. The identification of the text of spam messages in the claims is a very hard and time-consuming task, and it involved carefully scanning hundreds of web pages. The Grumbletext Web site is: [Web Link]. 12 | -> A subset of 3,375 SMS randomly chosen ham messages of the NUS SMS Corpus (NSC), which is a dataset of about 10,000 legitimate messages collected for research at the Department of Computer Science at the National University of Singapore. The messages largely originate from Singaporeans and mostly from students attending the University. These messages were collected from volunteers who were made aware that their contributions were going to be made publicly available. The NUS SMS Corpus is avalaible at: [Web Link]. 13 | -> A list of 450 SMS ham messages collected from Caroline Tag's PhD Thesis available at [Web Link]. 14 | -> Finally, we have incorporated the SMS Spam Corpus v.0.1 Big. It has 1,002 SMS ham messages and 322 spam messages and it is public available at: [Web Link]. This corpus has been used in the following academic researches: 15 | 16 | Acknowledgements 17 | The original dataset can be found here. The creators would like to note that in case you find the dataset useful, please make a reference to previous paper and the web page: http://www.dt.fee.unicamp.br/~tiago/smsspamcollection/ in your papers, research, etc. 18 | 19 | We offer a comprehensive study of this corpus in the following paper. This work presents a number of statistics, studies and baseline results for several machine learning methods. 20 | 21 | Almeida, T.A., Gómez Hidalgo, J.M., Yamakami, A. 
Contributions to the Study of SMS Spam Filtering: New Collection and Results. Proceedings of the 2011 ACM Symposium on Document Engineering (DOCENG'11), Mountain View, CA, USA, 2011. 22 | 23 | Inspiration 24 | Can you use this dataset to build a prediction model that will accurately classify which texts are spam? 25 | 26 | #### Source: Kaggle 27 | 28 | ## Link to DataSet 29 | 30 | Dataset: [Link](https://www.kaggle.com/uciml/sms-spam-collection-dataset) 31 | -------------------------------------------------------------------------------- /word2vec.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import sent_tokenize,word_tokenize 3 | from nltk.corpus import stopwords 4 | import re 5 | from gensim.models import Word2Vec 6 | 7 | paragraph="""Before you discuss the resolution, let me place before you one or two things, 8 | I want you to understand two things very clearly and to consider them from the same 9 | point of view from which I am placing them before you. I ask you to consider it from 10 | my point of view, because if you approve of it, you will be enjoined to carry out 11 | all I say. It will be a great responsibility. There are people who ask me whether 12 | I am the same man that I was in 1920, or whether there has been any change in me 13 | or you. You are right in asking that question. 14 | 15 | Let me, however, hasten to assure that I am the same Gandhi as I was in 1920. 16 | I have not changed in any fundamental respect. I attach the same importance 17 | to non-violence that I did then. If at all, my emphasis on it has grown stronger. 18 | There is no real contradiction between the present resolution and my previous writings and utterances. 19 | 20 | Occasions like the present do not occur in everybody’s and rarely in anybody’s life. 21 | I want you to know and feel that there is nothing but purest Ahimsa in all that I 22 | am saying and doing today. The draft resolution of the Working Committee is based on 23 | Ahimsa, the contemplated struggle similarly has its roots in Ahimsa. If, therefore, 24 | there is any among you who has lost faith in Ahimsa or is wearied of it, let him not 25 | vote for this resolution. Let me explain my position clearly. God has vouchsafed to 26 | me a priceless gift in the weapon of Ahimsa. I and my Ahimsa are on our trail today. 27 | If in the present crisis, when the earth is being scorched by the flames of Himsa 28 | and crying for deliverance, I failed to make use of the God given talent, God will 29 | not forgive me and I shall be judged unworthy of the great gift. I must act now. 30 | I may not hesitate and merely look on, when Russia and China are threatened.""" 31 | 32 | #para='He is a very good man and everyone loves him!' 
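# Note (not part of the original file): the vocabulary access further down assumes gensim 3.x.
# In gensim 4.x, model.wv.vocab was removed; use model.wv.key_to_index (word -> index mapping)
# or model.wv.index_to_key (list of words) instead. model.wv['assure'] and
# model.wv.most_similar('faith') behave the same way in both versions.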
33 | para=re.sub('[^a-zA-Z.]',' ',paragraph) 34 | para=re.sub('\s{2,10}',' ',para) #removed extra spaces 35 | para=para.lower() 36 | 37 | sentences=sent_tokenize(para) 38 | 39 | for i in range(len(sentences)): 40 | sentences[i]=sentences[i].split() 41 | sentences[i]=[word for word in sentences[i] if word not in stopwords.words('english')] 42 | 43 | model=Word2Vec(sentences, min_count=1) 44 | 45 | words=model.wv.vocab #vocab of the paragraph 46 | 47 | #finding the vectors of the word 48 | vector=model.wv['assure'] #here we see 100 dimensions of the word 49 | 50 | #finding the word which is similar to another word 51 | similar=model.wv.most_similar('faith') 52 | 53 | -------------------------------------------------------------------------------- /Projects/Twitter Sentiment Analysis(Beginners)/Twitter Sentiment Analysis (Small Dataset).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "data=pd.read_csv('D:/Data Sets/Sentiment Analysis/Twitter and Reddit sentiment analysis/Twitter_Data.csv')" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 3, 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "data": { 29 | "text/html": [ 30 | "
\n", 31 | "\n", 44 | "\n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | "
clean_textcategory
0when modi promised “minimum government maximum...-1.0
1talk all the nonsense and continue all the dra...0.0
2what did just say vote for modi welcome bjp t...1.0
3asking his supporters prefix chowkidar their n...1.0
4answer who among these the most powerful world...1.0
\n", 80 | "
" 81 | ], 82 | "text/plain": [ 83 | " clean_text category\n", 84 | "0 when modi promised “minimum government maximum... -1.0\n", 85 | "1 talk all the nonsense and continue all the dra... 0.0\n", 86 | "2 what did just say vote for modi welcome bjp t... 1.0\n", 87 | "3 asking his supporters prefix chowkidar their n... 1.0\n", 88 | "4 answer who among these the most powerful world... 1.0" 89 | ] 90 | }, 91 | "execution_count": 3, 92 | "metadata": {}, 93 | "output_type": "execute_result" 94 | } 95 | ], 96 | "source": [ 97 | "data.head()" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 4, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "data": { 107 | "text/plain": [ 108 | "'talk all the nonsense and continue all the drama will vote for modi '" 109 | ] 110 | }, 111 | "execution_count": 4, 112 | "metadata": {}, 113 | "output_type": "execute_result" 114 | } 115 | ], 116 | "source": [ 117 | "data['clean_text'][1]" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 6, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "data": { 127 | "text/plain": [ 128 | "(162980, 2)" 129 | ] 130 | }, 131 | "execution_count": 6, 132 | "metadata": {}, 133 | "output_type": "execute_result" 134 | } 135 | ], 136 | "source": [ 137 | "data.shape" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "#here we see that the data is cleaned in terms of redundant letters and caps. \n", 147 | "#also the sentiments have been encoded, so we can skip all these steps and can carry on with the model building" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 10, 153 | "metadata": {}, 154 | "outputs": [ 155 | { 156 | "data": { 157 | "text/plain": [ 158 | "clean_text 4\n", 159 | "category 7\n", 160 | "dtype: int64" 161 | ] 162 | }, 163 | "execution_count": 10, 164 | "metadata": {}, 165 | "output_type": "execute_result" 166 | } 167 | ], 168 | "source": [ 169 | "data.isnull().sum()" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 12, 175 | "metadata": {}, 176 | "outputs": [ 177 | { 178 | "name": "stdout", 179 | "output_type": "stream", 180 | "text": [ 181 | "(162969, 2)\n" 182 | ] 183 | } 184 | ], 185 | "source": [ 186 | "data=data.dropna()\n", 187 | "data=data.reset_index(drop=True)\n", 188 | "print(data.shape)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 34, 194 | "metadata": {}, 195 | "outputs": [ 196 | { 197 | "name": "stdout", 198 | "output_type": "stream", 199 | "text": [ 200 | "AxesSubplot(0.125,0.125;0.775x0.755)\n" 201 | ] 202 | }, 203 | { 204 | "data": { 205 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYQAAAECCAYAAAD+VKAWAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy86wFpkAAAACXBIWXMAAAsTAAALEwEAmpwYAAAUaklEQVR4nO3db4xddX7f8fdn7Sylu4XFMLiux9S0OJsapGXDyHW1UtXGbfGKKqYSVmalFity5YiSKqtWak37oOoDV1iVSotUaNyQYGi6xnWywVrKJpbJKqpK7R0IDTGsw3Rh8dTEnoCXsI1ga++3D+5vuncu1zN3/GfueP1+SVfn3O85v+Pv0ZXmM+d37vGkqpAk6RPDbkCStDQYCJIkwECQJDUGgiQJMBAkSY2BIEkCYPmwG7hQN910U61du3bYbUjSFeWll176o6oa6bftig2EtWvXMjExMew2JOmKkuQ759vmlJEkCTAQJEmNgSBJAgwESVJjIEiSAANBktQYCJIkwECQJDVX7INpi23tzueG3cJl9dbD9wy7BUlD5hWCJAkwECRJzbyBkOSzSV7pev1xki8nWZHkUJI32vKGrjEPJZlMcjzJ3V31u5K82rY9miStfk2SZ1r9SJK1l+VsJUnnNW8gVNXxqrqzqu4E7gL+BPgqsBM4XFXrgMPtPUnWA+PA7cBm4LEky9rhHgd2AOvaa3OrbwfOVNVtwCPA7ktydpKkgS10ymgT8L+q6jvAFmBvq+8F7m3rW4B9VfVRVb0JTAIbkqwCrquqF6uqgKd6xswc6wCwaebqQZK0OBYaCOPAV9r6yqp6B6Atb2711cCJrjFTrba6rffWZ42pqrPA+8CNC+xNknQRBg6EJJ8Efhr4L/Pt2qdWc9TnGtPbw44kE0kmpqen52lDkrQQC7lC+CLwclWdau9PtWkg2vJ0q08Ba7rGjQInW320T33WmCTLgeuB93obqKo9VTVWVWMjI33/4I8k6QItJBC+xA+niwAOAtva+jbg2a76ePvm0K10bh4fbdNKHyTZ2O4P3N8zZuZY9wEvtPsMkqRFMtCTykn+NPA3gZ/rKj8M7E+yHXgb2ApQVceS7AdeA84CD1bVuTbmAeBJ4Frg+fYCeAJ4OskknSuD8Ys4J0nSBRgoEKrqT+i5yVtV79L51lG//XcBu/rUJ4A7+tQ/pAWKJGk4fFJZkgQYCJKkxkCQJAEGgiSpMRAkSYCBIElqDARJEmAgSJIaA0GSBBgIkqTGQJAkAQaCJKkxECRJgIEgSWoMBEkSYCBIkhoDQZIEGAiSpMZAkCQBBoIkqRkoEJJ8JsmBJN9K8nqSv5JkRZJDSd5oyxu69n8oyWSS40nu7qrfleTVtu3RJGn1a5I80+pHkqy95GcqSZrToFcI/w74elX9BPA54HVgJ3C4qtYBh9t7kqwHxoHbgc3AY0mWteM8DuwA1rXX5lbfDpypqtuAR4DdF3lekqQFmjcQklwH/FXgCYCq+n5VfRfYAuxtu+0F7m3rW4B9VfVRVb0JTAIbkqwCrquqF6uqgKd6xswc6wCwaebqQZK0OAa5QvgLwDTwK0l+N8kvJfkUsLKq3gFoy5vb/quBE13jp1ptdVvvrc8aU1VngfeBG3sbSbIjyUSSienp6QFPUZI0iEECYTnwk8DjVfV54P/QpofOo99v9jVHfa4xswtVe6pqrKrGRkZG5u5akrQggwTCFDBVVUfa+wN0AuJUmwaiLU937b+ma/wocLLVR/vUZ41Jshy4HnhvoScjSbpw8wZCVf0hcCLJZ1tpE/AacBDY1mrbgGfb+kFgvH1z6FY6N4+PtmmlD5JsbPcH7u8ZM3Os+4AX2n0GSdIiWT7gfv8Q+NUknwS+DfwsnTDZn2Q78DawFaCqjiXZTyc0zgIPVtW5dpwHgCeBa4Hn2ws6N6yfTjJJ58pg/CLPS5K0QAMFQlW9Aoz12bTpPPvvAnb1qU8Ad/Spf0gLFEnScPiksiQJMBAkSY2BIEkCDARJUmMgSJIAA0GS1BgIkiTAQJAkNYM+qSxd0dbufG7YLVw2bz18z7Bb0I8IrxAkSYCBIElqDARJEmAgSJIaA0GSBBgIkqTGQJAkAQaCJKkxECRJgIEgSWoGCoQkbyV5NckrSSZabUWSQ0neaMsbuvZ/KMlkkuNJ7u6q39WOM5nk0SRp9WuSPNPqR5KsvcTnKUmax0KuEP56Vd1ZVWPt/U7gcFWtAw639yRZD4wDtwObgceSLGtjHgd2AOvaa3OrbwfOVNVtwCPA7gs/JUnShbiYKaMtwN62vhe4t6u+r6o+qqo3gUlgQ5JVwHVV9WJVFfBUz5iZYx0ANs1cPUiSFseggVDAbyV5KcmOVltZVe8AtOXNrb4aONE1dqrVVrf13vqsMVV1FngfuHFhpyJJuhiD/vfXX6iqk0luBg4l+dYc+/b7zb7mqM81ZvaBO2G0A+CWW26Zu2NJ0oIMdIVQVSfb8jTwVWADcKpNA9GWp9vuU8CaruGjwMlWH+1TnzUmyXLgeuC9Pn3sqaqxqhobGRkZpHVJ0oDmDYQkn0ryZ2bWgb8F/D5wENjWdtsGPNvWDwLj7ZtDt9K5eXy0TSt9kGRjuz9wf8+YmWPdB7zQ7jNIkhbJIFNGK4Gvtnu8y4H/XFVfT/JNYH+S7cDbwFaAqjqWZD/wGnAWeLCqzrVjPQA8CVwLPN9eAE8ATyeZpHNlMH4Jzk2StADzBkJVfRv4XJ/6u8Cm84zZBezqU58A7uhT/5AWKJKk4fBJZUkSYCBIkhoDQZIEGAiSpMZAkCQBBoIkqTEQJEmAgSBJagwESRJgIEiSGgNBkgQYCJKkxkCQJAEGgiSpMRAkSYCBIElqDARJEmAgSJIaA0GSBBgIkqRm4EBIsizJ7yb5Wnu/IsmhJG+05Q1d+z6UZDLJ8SR3d9XvSvJq2/ZokrT6NUmeafUjSdZewnOUJA1gIVcIvwC83vV+J3C4qtYBh9t7kqwHxoHbgc3AY0mWtTGPAzuAde21udW3A2eq6jbgEWD3BZ2NJOmCDRQISUaBe4Bf6ipvAfa29b3AvV31fVX1UVW9CUwCG5KsAq6rqherqoCnesbMHOsAsGnm6kGStDgGvUL4t8A/AX7QVVtZVe8AtOXNrb4aONG131SrrW7rvfVZY6rqLPA+cOOgJyFJunjzBkKSvw2crqqXBjxmv9/sa476XGN6e9mRZCLJxPT09IDtSJIGMcgVwheAn07yFrAP+Kkk/wk41aaBaMvTbf8pYE3X+FHgZKuP9qnPGpNkOXA98F5vI1W1p6rGqmpsZGRkoBOUJA1m3kCoqoeqarSq1tK5WfxCVf1d4CCwre22DXi2rR8Exts3h26lc/P4aJtW+iDJxnZ/4P6eMTPHuq/9Gx+7QpAkXT7LL2Lsw8D+JNuBt4GtAFV1LMl+4DXgLPBgVZ1rYx4AngSuBZ5vL4AngKeTTNK5Mhi/iL4kSRdgQYFQVd8AvtHW3wU2nWe/XcCuPvUJ4I4+9Q9pgSJJGg6fVJYkAQaCJKkxECRJgIEgSWoMBEkSYCBIkhoDQZIEXNyDaZ
J02a3d+dywW7is3nr4nmG38P95hSBJAgwESVJjIEiSAANBktQYCJIkwECQJDUGgiQJMBAkSY2BIEkCDARJUmMgSJIAA0GS1MwbCEn+VJKjSf5nkmNJ/mWrr0hyKMkbbXlD15iHkkwmOZ7k7q76XUlebdseTZJWvybJM61+JMnay3CukqQ5DHKF8BHwU1X1OeBOYHOSjcBO4HBVrQMOt/ckWQ+MA7cDm4HHkixrx3oc2AGsa6/Nrb4dOFNVtwGPALsv/tQkSQsxbyBUx/fa2x9rrwK2AHtbfS9wb1vfAuyrqo+q6k1gEtiQZBVwXVW9WFUFPNUzZuZYB4BNM1cPkqTFMdA9hCTLkrwCnAYOVdURYGVVvQPQlje33VcDJ7qGT7Xa6rbeW581pqrOAu8DN/bpY0eSiSQT09PTA52gJGkwAwVCVZ2rqjuBUTq/7d8xx+79frOvOepzjentY09VjVXV2MjIyDxdS5IWYkHfMqqq7wLfoDP3f6pNA9GWp9tuU8CarmGjwMlWH+1TnzUmyXLgeuC9hfQmSbo4g3zLaCTJZ9r6tcDfAL4FHAS2td22Ac+29YPAePvm0K10bh4fbdNKHyTZ2O4P3N8zZuZY9wEvtPsMkqRFMsjfVF4F7G3fFPoEsL+qvpbkRWB/ku3A28BWgKo6lmQ/8BpwFniwqs61Yz0APAlcCzzfXgBPAE8nmaRzZTB+KU5OkjS4eQOhqn4P+Hyf+rvApvOM2QXs6lOfAD52/6GqPqQFiiRpOHxSWZIEGAiSpMZAkCQBBoIkqTEQJEmAgSBJagwESRJgIEiSGgNBkgQYCJKkxkCQJAEGgiSpMRAkSYCBIElqDARJEmAgSJIaA0GSBBgIkqTGQJAkAQMEQpI1SX47yetJjiX5hVZfkeRQkjfa8oauMQ8lmUxyPMndXfW7krzatj2aJK1+TZJnWv1IkrWX4VwlSXMY5ArhLPCPq+ovARuBB5OsB3YCh6tqHXC4vadtGwduBzYDjyVZ1o71OLADWNdem1t9O3Cmqm4DHgF2X4JzkyQtwLyBUFXvVNXLbf0D4HVgNbAF2Nt22wvc29a3APuq6qOqehOYBDYkWQVcV1UvVlUBT/WMmTnWAWDTzNWDJGlxLOgeQpvK+TxwBFhZVe9AJzSAm9tuq4ETXcOmWm11W++tzxpTVWeB94EbF9KbJOniDBwIST4N/Brw5ar647l27VOrOepzjentYUeSiSQT09PT87UsSVqAgQIhyY/RCYNfrapfb+VTbRqItjzd6lPAmq7ho8DJVh/tU581Jsly4Hrgvd4+qmpPVY1V1djIyMggrUuSBjTIt4wCPAG8XlX/pmvTQWBbW98GPNtVH2/fHLqVzs3jo21a6YMkG9sx7+8ZM3Os+4AX2n0GSdIiWT7APl8A/h7wapJXWu2fAQ8D+5NsB94GtgJU1bEk+4HX6HxD6cGqOtfGPQA8CVwLPN9e0Amcp5NM0rkyGL+405IkLdS8gVBV/43+c/wAm84zZhewq099ArijT/1DWqBIkobDJ5UlSYCBIElqDARJEmAgSJIaA0GSBBgIkqTGQJAkAQaCJKkxECRJgIEgSWoMBEkSYCBIkhoDQZIEGAiSpMZAkCQBBoIkqTEQJEmAgSBJagwESRJgIEiSmnkDIckvJzmd5Pe7aiuSHEryRlve0LXtoSSTSY4nuburfleSV9u2R5Ok1a9J8kyrH0my9hKfoyRpAINcITwJbO6p7QQOV9U64HB7T5L1wDhwexvzWJJlbczjwA5gXXvNHHM7cKaqbgMeAXZf6MlIki7cvIFQVb8DvNdT3gLsbet7gXu76vuq6qOqehOYBDYkWQVcV1UvVlUBT/WMmTnWAWDTzNWDJGnxXOg9hJVV9Q5AW97c6quBE137TbXa6rbeW581pqrOAu8DN15gX5KkC3Spbyr3+82+5qjPNebjB092JJlIMjE9PX2BLUqS+rnQQDjVpoFoy9OtPgWs6dpvFDjZ6qN96rPGJFkOXM/Hp6gAqKo9VTVWVWMjIyMX2LokqZ8LDYSDwLa2vg14tqs+3r45dCudm8dH27TSB0k2tvsD9/eMmTnWfcAL7T6DJGkRLZ9vhyRfAf4acFOSKeBfAA8D+5NsB94GtgJU1bEk+4HXgLPAg1V1rh3qATrfWLoWeL69AJ4Ank4ySefKYPySnJkkaUHmDYSq+tJ5Nm06z/67gF196hPAHX3qH9ICRZI0PD6pLEkCDARJUmMgSJIAA0GS1BgIkiTAQJAkNQaCJAkwECRJjYEgSQIMBElSYyBIkgADQZLUGAiSJMBAkCQ1BoIkCTAQJEmNgSBJAgwESVJjIEiSAANBktQsmUBIsjnJ8SSTSXYOux9JutosiUBIsgz498AXgfXAl5KsH25XknR1WRKBAGwAJqvq21X1fWAfsGXIPUnSVWX5sBtoVgMnut5PAX+5d6ckO4Ad7e33khxfhN6G5SbgjxbrH8vuxfqXrgp+dle2H/XP78+fb8NSCYT0qdXHClV7gD2Xv53hSzJRVWPD7kML52d3ZbuaP7+lMmU0Bazpej8KnBxSL5J0VVoqgfBNYF2SW5N8EhgHDg65J0m6qiyJKaOqOpvk54HfBJYBv1xVx4bc1rBdFVNjP6L87K5sV+3nl6qPTdVLkq5CS2XKSJI0ZAaCJAkwECRJjYEgSQKWyLeMpCtdkpV0nrgv4GRVnRpySxpAkuXAduDvAH+O9vkBzwJPVNX/HWJ7i85vGS0h/lC58iS5E/gPwPXA/27lUeC7wD+oqpeH05kGkeQrdD6rvXQekIXO57cNWFFVPzOk1obCQFgC/KFy5UryCvBzVXWkp74R+MWq+txQGtNAkhyvqs+eZ9sfVNWPL3ZPw+SU0dLwJOf/ofIrgD9Ulq5P9X5uAFX1P5J8ahgNaUHOJNkK/FpV/QAgySeArcCZoXY2BAbC0uAPlSvX80meA57ih/9j7xrgfuDrQ+tKgxoHdgOPJZkJgM8Av922XVWcMloCkjwK/EX6/1B5s6p+fli9aX5Jvkjn73espvM/904BB6vqvw61MS1Ikhvp/ExctP/6eqkxEJYIf6hIS0uSP1tVfzjsPhaTgSBdJkl2tL/hoStQkueq6p5h97GYfDBtiWt/JU5Xpn5/+ElXiKstDMCbylcCf6gscUl+gh9O98082HSwqn5xqI3poiT5dFV9b9h9LCavEJa+7w+7AZ1fkn8K7KMT3Efp/LGnAF9JsnOYvemivTbsBhab9xCWuCRvV9Utw+5D/SX5A+D23v/ioP3lv2NVtW44nWkQSf7R+TYB/7yqVixmP8PmlNESkOT3zrcJWLmYvWjBfkDn/8D5Tk99Vdumpe1fAf8aONtn21U3g2IgLA0rgbv5+JORAf774rejBfgycDjJG/zwGZJbgNsAnx9Z+l4GfqOqXurdkOTvD6GfoTIQloavAZ+uqld6NyT5xqJ3o4FV1deT/DiwgdnPkHyzqs4NtTkN4meBd7sLXc8fjA2npeHxHoIkdUnyclX95LD7GIarbo5MkuZx1X7V20CQpNn+47AbGBanjCRJgFcIk
qTGQJAkAQaCJKkxECRJgIEgSWr+HykGFTvcUZ7EAAAAAElFTkSuQmCC\n", 206 | "text/plain": [ 207 | "
" 208 | ] 209 | }, 210 | "metadata": { 211 | "needs_background": "light" 212 | }, 213 | "output_type": "display_data" 214 | } 215 | ], 216 | "source": [ 217 | "print(data['category'].value_counts().plot(kind='bar'))" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 17, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "import nltk\n", 227 | "from sklearn.feature_extraction.text import CountVectorizer as cv, TfidfVectorizer as tfidf" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 18, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "from sklearn.metrics import accuracy_score, classification_report" 237 | ] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "metadata": {}, 242 | "source": [ 243 | "### Bag of Words " 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 19, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "cv=cv(max_features=5000) #for bag of words\n", 253 | "x=cv.fit_transform(data['clean_text']).toarray()\n", 254 | "y=data['category'].values" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 20, 260 | "metadata": {}, 261 | "outputs": [ 262 | { 263 | "data": { 264 | "text/plain": [ 265 | "(162969, 5000)" 266 | ] 267 | }, 268 | "execution_count": 20, 269 | "metadata": {}, 270 | "output_type": "execute_result" 271 | } 272 | ], 273 | "source": [ 274 | "x.shape" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 21, 280 | "metadata": {}, 281 | "outputs": [ 282 | { 283 | "data": { 284 | "text/plain": [ 285 | "array([[0, 0, 0, ..., 0, 0, 0],\n", 286 | " [0, 0, 0, ..., 0, 0, 0],\n", 287 | " [0, 0, 0, ..., 0, 0, 0],\n", 288 | " ...,\n", 289 | " [0, 0, 0, ..., 0, 0, 0],\n", 290 | " [0, 0, 0, ..., 0, 0, 0],\n", 291 | " [0, 0, 0, ..., 0, 0, 0]], dtype=int64)" 292 | ] 293 | }, 294 | "execution_count": 21, 295 | "metadata": {}, 296 | "output_type": "execute_result" 297 | } 298 | ], 299 | "source": [ 300 | "x" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 22, 306 | "metadata": {}, 307 | "outputs": [ 308 | { 309 | "data": { 310 | "text/plain": [ 311 | "array([-1., 0., 1., ..., 0., 0., 1.])" 312 | ] 313 | }, 314 | "execution_count": 22, 315 | "metadata": {}, 316 | "output_type": "execute_result" 317 | } 318 | ], 319 | "source": [ 320 | "y" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 23, 326 | "metadata": {}, 327 | "outputs": [ 328 | { 329 | "data": { 330 | "text/plain": [ 331 | "(162969,)" 332 | ] 333 | }, 334 | "execution_count": 23, 335 | "metadata": {}, 336 | "output_type": "execute_result" 337 | } 338 | ], 339 | "source": [ 340 | "y.shape" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 24, 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "from sklearn.model_selection import train_test_split\n", 350 | "x_train_bow,x_test_bow,y_train_bow,y_test_bow=train_test_split(x,y,test_size=0.2)" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 27, 356 | "metadata": {}, 357 | "outputs": [ 358 | { 359 | "data": { 360 | "text/plain": [ 361 | "MultinomialNB()" 362 | ] 363 | }, 364 | "execution_count": 27, 365 | "metadata": {}, 366 | "output_type": "execute_result" 367 | } 368 | ], 369 | "source": [ 370 | "from sklearn.naive_bayes import MultinomialNB as nb\n", 371 | "classifier_nb=nb()\n", 372 | "classifier_nb.fit(x_train_bow,y_train_bow)" 373 | ] 374 | }, 375 | { 376 | 
"cell_type": "code", 377 | "execution_count": 32, 378 | "metadata": {}, 379 | "outputs": [ 380 | { 381 | "name": "stdout", 382 | "output_type": "stream", 383 | "text": [ 384 | "Accuracy is: 81.1008161011229\n", 385 | " precision recall f1-score support\n", 386 | "\n", 387 | " -1.0 0.71 0.75 0.73 7136\n", 388 | " 0.0 0.89 0.79 0.84 11026\n", 389 | " 1.0 0.81 0.86 0.83 14432\n", 390 | "\n", 391 | " accuracy 0.81 32594\n", 392 | " macro avg 0.80 0.80 0.80 32594\n", 393 | "weighted avg 0.82 0.81 0.81 32594\n", 394 | "\n" 395 | ] 396 | } 397 | ], 398 | "source": [ 399 | "#predicting values\n", 400 | "y_pred_bow=classifier_nb.predict(x_test_bow)\n", 401 | "print(\"Accuracy is:\",accuracy_score(y_test_bow,y_pred_bow)*100)\n", 402 | "print(classification_report(y_test_bow,y_pred_bow))" 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": {}, 408 | "source": [ 409 | "## TF-IDF" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": 43, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "tf=tfidf(max_features=100)\n", 419 | "x_tf=tf.fit_transform(data['clean_text']).toarray()\n", 420 | "y_tf=y" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 44, 426 | "metadata": {}, 427 | "outputs": [ 428 | { 429 | "data": { 430 | "text/plain": [ 431 | "MultinomialNB()" 432 | ] 433 | }, 434 | "execution_count": 44, 435 | "metadata": {}, 436 | "output_type": "execute_result" 437 | } 438 | ], 439 | "source": [ 440 | "#splitting the data again\n", 441 | "x_train_tf,x_test_tf,y_train_tf,y_test_tf=train_test_split(x_tf,y,test_size=0.2)\n", 442 | "classifier_tf=nb()\n", 443 | "classifier_tf.fit(x_train_tf,y_train_tf)" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": 45, 449 | "metadata": {}, 450 | "outputs": [ 451 | { 452 | "name": "stdout", 453 | "output_type": "stream", 454 | "text": [ 455 | "Accuracy is: 0.47502607841934097\n", 456 | " precision recall f1-score support\n", 457 | "\n", 458 | " -1.0 0.00 0.00 0.00 7170\n", 459 | " 0.0 0.53 0.25 0.34 11039\n", 460 | " 1.0 0.46 0.89 0.61 14385\n", 461 | "\n", 462 | " accuracy 0.48 32594\n", 463 | " macro avg 0.33 0.38 0.32 32594\n", 464 | "weighted avg 0.38 0.48 0.38 32594\n", 465 | "\n" 466 | ] 467 | }, 468 | { 469 | "name": "stderr", 470 | "output_type": "stream", 471 | "text": [ 472 | "C:\\Users\\91884\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. 
Use `zero_division` parameter to control this behavior.\n", 473 | " _warn_prf(average, modifier, msg_start, len(result))\n" 474 | ] 475 | } 476 | ], 477 | "source": [ 478 | "#predicting the values\n", 479 | "y_pred_tf=classifier_tf.predict(x_test_tf)\n", 480 | "print('Accuracy is:',accuracy_score(y_test_tf,y_pred_tf))\n", 481 | "print(classification_report(y_test_tf,y_pred_tf))" 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": null, 487 | "metadata": {}, 488 | "outputs": [], 489 | "source": [ 490 | "#Note here in TFIDF, the accuracy fell drastically because the system is not alloting any more space\n", 491 | "#so I had to go with 100 features in tfdf in contrast to 5000 features in Bag of words\n", 492 | "#you can surely try it out with different values of features" 493 | ] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "execution_count": null, 498 | "metadata": {}, 499 | "outputs": [], 500 | "source": [] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": null, 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [] 508 | } 509 | ], 510 | "metadata": { 511 | "kernelspec": { 512 | "display_name": "Python 3", 513 | "language": "python", 514 | "name": "python3" 515 | }, 516 | "language_info": { 517 | "codemirror_mode": { 518 | "name": "ipython", 519 | "version": 3 520 | }, 521 | "file_extension": ".py", 522 | "mimetype": "text/x-python", 523 | "name": "python", 524 | "nbconvert_exporter": "python", 525 | "pygments_lexer": "ipython3", 526 | "version": "3.7.4" 527 | } 528 | }, 529 | "nbformat": 4, 530 | "nbformat_minor": 2 531 | } 532 | -------------------------------------------------------------------------------- /Projects/Fake News Detection/Fake News.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 45, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#dependencies \n", 10 | "import nltk \n", 11 | "from nltk.corpus import stopwords\n", 12 | "import pandas as pd\n", 13 | "import numpy as np" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 54, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "df=pd.read_csv('D:/Data Sets/fake-news/train.csv') #reading file csv and converting to a dataframe" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 55, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "data": { 32 | "text/html": [ 33 | "
\n", 34 | "\n", 47 | "\n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | "
idtitleauthortextlabel
00House Dem Aide: We Didn’t Even See Comey’s Let...Darrell LucusHouse Dem Aide: We Didn’t Even See Comey’s Let...1
11FLYNN: Hillary Clinton, Big Woman on Campus - ...Daniel J. FlynnEver get the feeling your life circles the rou...0
22Why the Truth Might Get You FiredConsortiumnews.comWhy the Truth Might Get You Fired October 29, ...1
3315 Civilians Killed In Single US Airstrike Hav...Jessica PurkissVideos 15 Civilians Killed In Single US Airstr...1
44Iranian woman jailed for fictional unpublished...Howard PortnoyPrint \\nAn Iranian woman has been sentenced to...1
\n", 101 | "
" 102 | ], 103 | "text/plain": [ 104 | " id title author \\\n", 105 | "0 0 House Dem Aide: We Didn’t Even See Comey’s Let... Darrell Lucus \n", 106 | "1 1 FLYNN: Hillary Clinton, Big Woman on Campus - ... Daniel J. Flynn \n", 107 | "2 2 Why the Truth Might Get You Fired Consortiumnews.com \n", 108 | "3 3 15 Civilians Killed In Single US Airstrike Hav... Jessica Purkiss \n", 109 | "4 4 Iranian woman jailed for fictional unpublished... Howard Portnoy \n", 110 | "\n", 111 | " text label \n", 112 | "0 House Dem Aide: We Didn’t Even See Comey’s Let... 1 \n", 113 | "1 Ever get the feeling your life circles the rou... 0 \n", 114 | "2 Why the Truth Might Get You Fired October 29, ... 1 \n", 115 | "3 Videos 15 Civilians Killed In Single US Airstr... 1 \n", 116 | "4 Print \\nAn Iranian woman has been sentenced to... 1 " 117 | ] 118 | }, 119 | "execution_count": 55, 120 | "metadata": {}, 121 | "output_type": "execute_result" 122 | } 123 | ], 124 | "source": [ 125 | "#first 5 columns\n", 126 | "df.head()" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 56, 132 | "metadata": {}, 133 | "outputs": [ 134 | { 135 | "data": { 136 | "text/plain": [ 137 | "id 0\n", 138 | "title 558\n", 139 | "author 1957\n", 140 | "text 39\n", 141 | "label 0\n", 142 | "dtype: int64" 143 | ] 144 | }, 145 | "execution_count": 56, 146 | "metadata": {}, 147 | "output_type": "execute_result" 148 | } 149 | ], 150 | "source": [ 151 | "df.isnull().sum()" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 57, 157 | "metadata": {}, 158 | "outputs": [ 159 | { 160 | "name": "stdout", 161 | "output_type": "stream", 162 | "text": [ 163 | "9.408653846153847\n" 164 | ] 165 | } 166 | ], 167 | "source": [ 168 | "print(1957*100/df.shape[0])#9 percent can be removed " 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 58, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "df=df.dropna()" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 59, 183 | "metadata": {}, 184 | "outputs": [ 185 | { 186 | "data": { 187 | "text/plain": [ 188 | "(18285, 5)" 189 | ] 190 | }, 191 | "execution_count": 59, 192 | "metadata": {}, 193 | "output_type": "execute_result" 194 | } 195 | ], 196 | "source": [ 197 | "df.shape" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 60, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "#the indices would have disrupted\n", 207 | "df.reset_index(inplace=True)" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "#label column means the output variable\n", 217 | "#0 represents that the news can be relied on(not fake)\n", 218 | "#1 represents it cannot be relied on(fake)" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 61, 224 | "metadata": {}, 225 | "outputs": [ 226 | { 227 | "data": { 228 | "text/plain": [ 229 | "0 1\n", 230 | "1 0\n", 231 | "2 1\n", 232 | "3 1\n", 233 | "4 1\n", 234 | " ..\n", 235 | "18280 0\n", 236 | "18281 0\n", 237 | "18282 0\n", 238 | "18283 1\n", 239 | "18284 1\n", 240 | "Name: label, Length: 18285, dtype: int64" 241 | ] 242 | }, 243 | "execution_count": 61, 244 | "metadata": {}, 245 | "output_type": "execute_result" 246 | } 247 | ], 248 | "source": [ 249 | "#dependent variable(y)\n", 250 | "y=df['label']\n", 251 | "y" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 62, 257 | "metadata": 
{}, 258 | "outputs": [ 259 | { 260 | "data": { 261 | "text/html": [ 262 | "
\n", 263 | "\n", 276 | "\n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | "
indexidtitleauthortext
000House Dem Aide: We Didn’t Even See Comey’s Let...Darrell LucusHouse Dem Aide: We Didn’t Even See Comey’s Let...
111FLYNN: Hillary Clinton, Big Woman on Campus - ...Daniel J. FlynnEver get the feeling your life circles the rou...
222Why the Truth Might Get You FiredConsortiumnews.comWhy the Truth Might Get You Fired October 29, ...
33315 Civilians Killed In Single US Airstrike Hav...Jessica PurkissVideos 15 Civilians Killed In Single US Airstr...
444Iranian woman jailed for fictional unpublished...Howard PortnoyPrint \\nAn Iranian woman has been sentenced to...
..................
182802079520795Rapper T.I.: Trump a ’Poster Child For White S...Jerome HudsonRapper T. I. unloaded on black celebrities who...
182812079620796N.F.L. Playoffs: Schedule, Matchups and Odds -...Benjamin HoffmanWhen the Green Bay Packers lost to the Washing...
182822079720797Macy’s Is Said to Receive Takeover Approach by...Michael J. de la Merced and Rachel AbramsThe Macy’s of today grew from the union of sev...
182832079820798NATO, Russia To Hold Parallel Exercises In Bal...Alex AnsaryNATO, Russia To Hold Parallel Exercises In Bal...
182842079920799What Keeps the F-35 AliveDavid SwansonDavid Swanson is an author, activist, journa...
\n", 378 | "

18285 rows × 5 columns

\n", 379 | "
" 380 | ], 381 | "text/plain": [ 382 | " index id title \\\n", 383 | "0 0 0 House Dem Aide: We Didn’t Even See Comey’s Let... \n", 384 | "1 1 1 FLYNN: Hillary Clinton, Big Woman on Campus - ... \n", 385 | "2 2 2 Why the Truth Might Get You Fired \n", 386 | "3 3 3 15 Civilians Killed In Single US Airstrike Hav... \n", 387 | "4 4 4 Iranian woman jailed for fictional unpublished... \n", 388 | "... ... ... ... \n", 389 | "18280 20795 20795 Rapper T.I.: Trump a ’Poster Child For White S... \n", 390 | "18281 20796 20796 N.F.L. Playoffs: Schedule, Matchups and Odds -... \n", 391 | "18282 20797 20797 Macy’s Is Said to Receive Takeover Approach by... \n", 392 | "18283 20798 20798 NATO, Russia To Hold Parallel Exercises In Bal... \n", 393 | "18284 20799 20799 What Keeps the F-35 Alive \n", 394 | "\n", 395 | " author \\\n", 396 | "0 Darrell Lucus \n", 397 | "1 Daniel J. Flynn \n", 398 | "2 Consortiumnews.com \n", 399 | "3 Jessica Purkiss \n", 400 | "4 Howard Portnoy \n", 401 | "... ... \n", 402 | "18280 Jerome Hudson \n", 403 | "18281 Benjamin Hoffman \n", 404 | "18282 Michael J. de la Merced and Rachel Abrams \n", 405 | "18283 Alex Ansary \n", 406 | "18284 David Swanson \n", 407 | "\n", 408 | " text \n", 409 | "0 House Dem Aide: We Didn’t Even See Comey’s Let... \n", 410 | "1 Ever get the feeling your life circles the rou... \n", 411 | "2 Why the Truth Might Get You Fired October 29, ... \n", 412 | "3 Videos 15 Civilians Killed In Single US Airstr... \n", 413 | "4 Print \\nAn Iranian woman has been sentenced to... \n", 414 | "... ... \n", 415 | "18280 Rapper T. I. unloaded on black celebrities who... \n", 416 | "18281 When the Green Bay Packers lost to the Washing... \n", 417 | "18282 The Macy’s of today grew from the union of sev... \n", 418 | "18283 NATO, Russia To Hold Parallel Exercises In Bal... \n", 419 | "18284 David Swanson is an author, activist, journa... \n", 420 | "\n", 421 | "[18285 rows x 5 columns]" 422 | ] 423 | }, 424 | "execution_count": 62, 425 | "metadata": {}, 426 | "output_type": "execute_result" 427 | } 428 | ], 429 | "source": [ 430 | "#independent variable\n", 431 | "x=df.iloc[:,:-1]\n", 432 | "x" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": 63, 438 | "metadata": {}, 439 | "outputs": [ 440 | { 441 | "data": { 442 | "text/plain": [ 443 | "'Print \\nAn Iranian woman has been sentenced to six years in prison after Iran’s Revolutionary Guard searched her home and found a notebook that contained a fictional story she’d written about a woman who was stoned to death, according to the Eurasia Review . \\nGolrokh Ebrahimi Iraee, 35, is the wife of political prisoner Arash Sadeghi, 36, who is serving a 19-year prison sentence for being a human rights activist, the publication reported. \\n“When the intelligence unit of the Revolutionary Guards came to arrest her husband, they raided their apartment – without a warrant – and found drafts of stories that Ebrahimi Iraee had written,” the article stated. \\n“One of the confiscated drafts was a story about stoning women to death for adultery – never published, never presented to anyone,” the article stated. 
“The narrative followed the story of a protagonist that watched a movie about stoning of women under Islamic law for adultery.'" 444 | ] 445 | }, 446 | "execution_count": 63, 447 | "metadata": {}, 448 | "output_type": "execute_result" 449 | } 450 | ], 451 | "source": [ 452 | "x['text'][4] # in this we see there are characters like \\n which we will be removing, for that we need re(regular expression) " 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": 69, 458 | "metadata": {}, 459 | "outputs": [], 460 | "source": [ 461 | "import re\n", 462 | "from nltk.stem import PorterStemmer as ps\n", 463 | "from tqdm import tqdm" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": 70, 469 | "metadata": {}, 470 | "outputs": [ 471 | { 472 | "name": "stderr", 473 | "output_type": "stream", 474 | "text": [ 475 | "100%|██████████████████████████████████████████████████████████████████████████| 18285/18285 [1:29:19<00:00, 3.41it/s]\n" 476 | ] 477 | } 478 | ], 479 | "source": [ 480 | "#here the main text is the key feature because it tells us whether the news can be relied or not,\n", 481 | "#alone with headline we cannot determine how reliable the news is\n", 482 | "ps=ps()\n", 483 | "corpus=[]\n", 484 | "l=len(x['text'])\n", 485 | "for i in tqdm(range(l)):\n", 486 | " texts=re.sub('[^a-zA-Z]',' ',x['text'][i])\n", 487 | " texts=texts.lower()\n", 488 | " texts=texts.split()\n", 489 | " texts=[ps.stem(text) for text in texts if text not in stopwords.words('english')]\n", 490 | " texts=' '.join(texts)\n", 491 | " corpus.append(texts)\n", 492 | " " 493 | ] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "execution_count": 72, 498 | "metadata": {}, 499 | "outputs": [], 500 | "source": [ 501 | "#it literally took 1:29:19 to pre process the text!!!!\n", 502 | "from sklearn.feature_extraction.text import TfidfVectorizer as tfidf\n", 503 | "tfidf=tfidf(max_features=6000,ngram_range=(1,3))\n", 504 | "x=tfidf.fit_transform(corpus).toarray()" 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "execution_count": 73, 510 | "metadata": {}, 511 | "outputs": [ 512 | { 513 | "data": { 514 | "text/plain": [ 515 | "array([0., 0., 0., ..., 0., 0., 0.])" 516 | ] 517 | }, 518 | "execution_count": 73, 519 | "metadata": {}, 520 | "output_type": "execute_result" 521 | } 522 | ], 523 | "source": [ 524 | "x[1]" 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": 74, 530 | "metadata": {}, 531 | "outputs": [ 532 | { 533 | "data": { 534 | "text/plain": [ 535 | "(18285, 6000)" 536 | ] 537 | }, 538 | "execution_count": 74, 539 | "metadata": {}, 540 | "output_type": "execute_result" 541 | } 542 | ], 543 | "source": [ 544 | "x.shape" 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": 75, 550 | "metadata": {}, 551 | "outputs": [], 552 | "source": [ 553 | "from sklearn.model_selection import train_test_split\n", 554 | "x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25)" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": 76, 560 | "metadata": {}, 561 | "outputs": [ 562 | { 563 | "data": { 564 | "text/plain": [ 565 | "MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)" 566 | ] 567 | }, 568 | "execution_count": 76, 569 | "metadata": {}, 570 | "output_type": "execute_result" 571 | } 572 | ], 573 | "source": [ 574 | "from sklearn.naive_bayes import MultinomialNB\n", 575 | "classifier=MultinomialNB()\n", 576 | "classifier.fit(x_train,y_train)" 577 | ] 578 | }, 579 | { 580 | "cell_type": 
"code", 581 | "execution_count": 77, 582 | "metadata": {}, 583 | "outputs": [], 584 | "source": [ 585 | "#predicting values\n", 586 | "y_pred=classifier.predict(x_test)" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": 79, 592 | "metadata": {}, 593 | "outputs": [ 594 | { 595 | "name": "stdout", 596 | "output_type": "stream", 597 | "text": [ 598 | "[[2431 120]\n", 599 | " [ 352 1669]]\n" 600 | ] 601 | } 602 | ], 603 | "source": [ 604 | "from sklearn.metrics import confusion_matrix,accuracy_score\n", 605 | "cm=confusion_matrix(y_test,y_pred)\n", 606 | "print(cm)" 607 | ] 608 | }, 609 | { 610 | "cell_type": "code", 611 | "execution_count": 84, 612 | "metadata": {}, 613 | "outputs": [ 614 | { 615 | "name": "stdout", 616 | "output_type": "stream", 617 | "text": [ 618 | "0.8967629046369204\n" 619 | ] 620 | } 621 | ], 622 | "source": [ 623 | "print(accuracy_score(y_test,y_pred))" 624 | ] 625 | }, 626 | { 627 | "cell_type": "code", 628 | "execution_count": null, 629 | "metadata": {}, 630 | "outputs": [], 631 | "source": [] 632 | } 633 | ], 634 | "metadata": { 635 | "kernelspec": { 636 | "display_name": "Python 3", 637 | "language": "python", 638 | "name": "python3" 639 | }, 640 | "language_info": { 641 | "codemirror_mode": { 642 | "name": "ipython", 643 | "version": 3 644 | }, 645 | "file_extension": ".py", 646 | "mimetype": "text/x-python", 647 | "name": "python", 648 | "nbconvert_exporter": "python", 649 | "pygments_lexer": "ipython3", 650 | "version": "3.7.4" 651 | } 652 | }, 653 | "nbformat": 4, 654 | "nbformat_minor": 2 655 | } 656 | -------------------------------------------------------------------------------- /Projects/Covid_tweets Sentiment Analysis/Corona_NLP text classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Covid NLP Tweet Sentiment Analysis" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## We would be using two models\n", 15 | "\n", 16 | "* Bag of Words\n", 17 | " * Multinomial NB\n", 18 | " * PassiveAggressiveClassifer\n", 19 | " " 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 1, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "import pandas as pd\n", 29 | "import numpy as np" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "## Bag of Words Model\n", 37 | "\n", 38 | "----------------------------------------" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "## Train Set" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 2, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "df=pd.read_csv('D:/Data Sets/Corona_NLP text classification/Corona_NLP_train.csv',encoding='latin-1')" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "data": { 64 | "text/html": [ 65 | "
\n", 66 | "\n", 79 | "\n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | "
UserNameScreenNameLocationTweetAtOriginalTweetSentiment
0379948751London16-03-2020@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...Neutral
1380048752UK16-03-2020advice Talk to your neighbours family to excha...Positive
2380148753Vagabonds16-03-2020Coronavirus Australia: Woolworths to give elde...Positive
3380248754NaN16-03-2020My food stock is not the only one which is emp...Positive
4380348755NaN16-03-2020Me, ready to go at supermarket during the #COV...Extremely Negative
\n", 139 | "
" 140 | ], 141 | "text/plain": [ 142 | " UserName ScreenName Location TweetAt \\\n", 143 | "0 3799 48751 London 16-03-2020 \n", 144 | "1 3800 48752 UK 16-03-2020 \n", 145 | "2 3801 48753 Vagabonds 16-03-2020 \n", 146 | "3 3802 48754 NaN 16-03-2020 \n", 147 | "4 3803 48755 NaN 16-03-2020 \n", 148 | "\n", 149 | " OriginalTweet Sentiment \n", 150 | "0 @MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i... Neutral \n", 151 | "1 advice Talk to your neighbours family to excha... Positive \n", 152 | "2 Coronavirus Australia: Woolworths to give elde... Positive \n", 153 | "3 My food stock is not the only one which is emp... Positive \n", 154 | "4 Me, ready to go at supermarket during the #COV... Extremely Negative " 155 | ] 156 | }, 157 | "execution_count": 3, 158 | "metadata": {}, 159 | "output_type": "execute_result" 160 | } 161 | ], 162 | "source": [ 163 | "df.head()" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 4, 169 | "metadata": {}, 170 | "outputs": [ 171 | { 172 | "data": { 173 | "text/plain": [ 174 | "{'Extremely Negative', 'Extremely Positive', 'Negative', 'Neutral', 'Positive'}" 175 | ] 176 | }, 177 | "execution_count": 4, 178 | "metadata": {}, 179 | "output_type": "execute_result" 180 | } 181 | ], 182 | "source": [ 183 | "#to see the different categories/sentiments\n", 184 | "set(df['Sentiment'])" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 5, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "#df.iloc[df.shape[0]-1] " 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 6, 199 | "metadata": {}, 200 | "outputs": [ 201 | { 202 | "data": { 203 | "text/plain": [ 204 | "(41157, 6)" 205 | ] 206 | }, 207 | "execution_count": 6, 208 | "metadata": {}, 209 | "output_type": "execute_result" 210 | } 211 | ], 212 | "source": [ 213 | "df.shape" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 7, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "y=df['Sentiment'].values" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 8, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "#encoding labels\n", 232 | "from sklearn.preprocessing import LabelEncoder as le\n", 233 | "le=le()\n", 234 | "y=le.fit_transform(y)" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 9, 240 | "metadata": {}, 241 | "outputs": [ 242 | { 243 | "data": { 244 | "text/plain": [ 245 | "array([3, 4, 4, ..., 4, 3, 2])" 246 | ] 247 | }, 248 | "execution_count": 9, 249 | "metadata": {}, 250 | "output_type": "execute_result" 251 | } 252 | ], 253 | "source": [ 254 | "y" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 10, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "tweets=list(df['OriginalTweet'])" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 11, 269 | "metadata": {}, 270 | "outputs": [ 271 | { 272 | "data": { 273 | "text/plain": [ 274 | "['@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/iFz9FAn2Pa and https://t.co/xX6ghGFzCC and https://t.co/I2NlzdxNo8',\n", 275 | " 'advice Talk to your neighbours family to exchange phone numbers create contact list with phone numbers of neighbours schools employer chemist GP set up online shopping accounts if poss adequate supplies of regular meds but not over order']" 276 | ] 277 | }, 278 | "execution_count": 11, 279 | "metadata": {}, 280 | "output_type": "execute_result" 281 | } 282 | ], 283 | "source": [ 284 | 
"tweets[:2]" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 13, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "import nltk\n", 294 | "from nltk.corpus import stopwords\n", 295 | "from nltk.stem import PorterStemmer as ps\n", 296 | "from sklearn.feature_extraction.text import CountVectorizer as cv" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 14, 302 | "metadata": {}, 303 | "outputs": [ 304 | { 305 | "name": "stderr", 306 | "output_type": "stream", 307 | "text": [ 308 | "100%|███████████████████████████████████████████████████████████████████████████| 41157/41157 [06:50<00:00, 100.26it/s]\n" 309 | ] 310 | } 311 | ], 312 | "source": [ 313 | "import re\n", 314 | "from tqdm import tqdm\n", 315 | "\n", 316 | "ps=ps()\n", 317 | "corpus=[]\n", 318 | "for i in tqdm(range(len(tweets))):\n", 319 | " tweet=re.sub('[^a-zA-Z]',' ',tweets[i])\n", 320 | " tweet=tweet.lower()\n", 321 | " #print(tweet)\n", 322 | " tweet=tweet.split()\n", 323 | " tweet=[ps.stem(word) for word in tweet if word not in stopwords.words('english')]\n", 324 | " tweet=' '.join(tweet)\n", 325 | " corpus.append(tweet)" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 15, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "cv=cv(max_features=2500)\n", 335 | "x=cv.fit_transform(corpus).toarray()" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 16, 341 | "metadata": {}, 342 | "outputs": [ 343 | { 344 | "data": { 345 | "text/plain": [ 346 | "(41157, 2500)" 347 | ] 348 | }, 349 | "execution_count": 16, 350 | "metadata": {}, 351 | "output_type": "execute_result" 352 | } 353 | ], 354 | "source": [ 355 | "x.shape" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": 17, 361 | "metadata": {}, 362 | "outputs": [ 363 | { 364 | "name": "stderr", 365 | "output_type": "stream", 366 | "text": [ 367 | "100%|██████████████████████████████████████████████████████████████████████████| 41157/41157 [00:35<00:00, 1143.92it/s]" 368 | ] 369 | }, 370 | { 371 | "name": "stdout", 372 | "output_type": "stream", 373 | "text": [ 374 | "629912\n" 375 | ] 376 | }, 377 | { 378 | "name": "stderr", 379 | "output_type": "stream", 380 | "text": [ 381 | "\n" 382 | ] 383 | } 384 | ], 385 | "source": [ 386 | "c=0\n", 387 | "for i in tqdm(x):\n", 388 | " for j in i:\n", 389 | " if j>0:\n", 390 | " c+=1\n", 391 | "print(c)" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": null, 397 | "metadata": {}, 398 | "outputs": [], 399 | "source": [ 400 | "#x,y are our variables. 
This is our train set so we will fit the model on this data \n", 401 | "#and then will read the test file to see how good our model really is" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": 18, 407 | "metadata": {}, 408 | "outputs": [], 409 | "source": [ 410 | "x_train,y_train=x,y" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": null, 416 | "metadata": {}, 417 | "outputs": [], 418 | "source": [ 419 | "#we will also be comparing different models" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": 43, 425 | "metadata": {}, 426 | "outputs": [ 427 | { 428 | "data": { 429 | "text/plain": [ 430 | "MultinomialNB(alpha=1000)" 431 | ] 432 | }, 433 | "execution_count": 43, 434 | "metadata": {}, 435 | "output_type": "execute_result" 436 | } 437 | ], 438 | "source": [ 439 | "from sklearn.naive_bayes import MultinomialNB as nb\n", 440 | "classifier_nb=nb(alpha=1000)\n", 441 | "classifier_nb.fit(x_train,y_train)" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": 32, 447 | "metadata": {}, 448 | "outputs": [ 449 | { 450 | "data": { 451 | "text/plain": [ 452 | "PassiveAggressiveClassifier()" 453 | ] 454 | }, 455 | "execution_count": 32, 456 | "metadata": {}, 457 | "output_type": "execute_result" 458 | } 459 | ], 460 | "source": [ 461 | "from sklearn.linear_model import PassiveAggressiveClassifier as pac\n", 462 | "classifier_pac=pac()\n", 463 | "classifier_pac.fit(x_train,y_train)" 464 | ] 465 | }, 466 | { 467 | "cell_type": "markdown", 468 | "metadata": {}, 469 | "source": [ 470 | "## Test Set" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": 21, 476 | "metadata": {}, 477 | "outputs": [], 478 | "source": [ 479 | "df_test=pd.read_csv('D:/Data Sets/Corona_NLP text classification/Corona_NLP_test.csv',encoding='latin-1')" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": 22, 485 | "metadata": {}, 486 | "outputs": [ 487 | { 488 | "data": { 489 | "text/html": [ 490 | "
\n", 491 | "\n", 504 | "\n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | "
UserNameScreenNameLocationTweetAtOriginalTweetSentiment
0144953NYC02-03-2020TRENDING: New Yorkers encounter empty supermar...Extremely Negative
1244954Seattle, WA02-03-2020When I couldn't find hand sanitizer at Fred Me...Positive
2344955NaN02-03-2020Find out how you can protect yourself and love...Extremely Positive
3444956Chicagoland02-03-2020#Panic buying hits #NewYork City as anxious sh...Negative
4544957Melbourne, Victoria03-03-2020#toiletpaper #dunnypaper #coronavirus #coronav...Neutral
\n", 564 | "
" 565 | ], 566 | "text/plain": [ 567 | " UserName ScreenName Location TweetAt \\\n", 568 | "0 1 44953 NYC 02-03-2020 \n", 569 | "1 2 44954 Seattle, WA 02-03-2020 \n", 570 | "2 3 44955 NaN 02-03-2020 \n", 571 | "3 4 44956 Chicagoland 02-03-2020 \n", 572 | "4 5 44957 Melbourne, Victoria 03-03-2020 \n", 573 | "\n", 574 | " OriginalTweet Sentiment \n", 575 | "0 TRENDING: New Yorkers encounter empty supermar... Extremely Negative \n", 576 | "1 When I couldn't find hand sanitizer at Fred Me... Positive \n", 577 | "2 Find out how you can protect yourself and love... Extremely Positive \n", 578 | "3 #Panic buying hits #NewYork City as anxious sh... Negative \n", 579 | "4 #toiletpaper #dunnypaper #coronavirus #coronav... Neutral " 580 | ] 581 | }, 582 | "execution_count": 22, 583 | "metadata": {}, 584 | "output_type": "execute_result" 585 | } 586 | ], 587 | "source": [ 588 | "df_test.head()" 589 | ] 590 | }, 591 | { 592 | "cell_type": "code", 593 | "execution_count": 23, 594 | "metadata": {}, 595 | "outputs": [], 596 | "source": [ 597 | "tweets_test=df_test['OriginalTweet']\n", 598 | "y_test=df_test['Sentiment'].values" 599 | ] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "execution_count": 24, 604 | "metadata": {}, 605 | "outputs": [], 606 | "source": [ 607 | "#encoding the dependent variable\n", 608 | "y_test=le.fit_transform(y_test)" 609 | ] 610 | }, 611 | { 612 | "cell_type": "code", 613 | "execution_count": 25, 614 | "metadata": {}, 615 | "outputs": [ 616 | { 617 | "data": { 618 | "text/plain": [ 619 | "array([0, 4, 1, ..., 3, 0, 1])" 620 | ] 621 | }, 622 | "execution_count": 25, 623 | "metadata": {}, 624 | "output_type": "execute_result" 625 | } 626 | ], 627 | "source": [ 628 | "y_test" 629 | ] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "execution_count": 26, 634 | "metadata": {}, 635 | "outputs": [ 636 | { 637 | "name": "stderr", 638 | "output_type": "stream", 639 | "text": [ 640 | "100%|██████████████████████████████████████████████████████████████████████████████| 3798/3798 [00:43<00:00, 87.13it/s]\n" 641 | ] 642 | } 643 | ], 644 | "source": [ 645 | "from nltk.stem import PorterStemmer as ps\n", 646 | "\n", 647 | "test_corpus=[]\n", 648 | "ps_test=ps()\n", 649 | "for i in tqdm(range(len(tweets_test))):\n", 650 | " test_tweet=re.sub('[^a-zA-Z]',' ',tweets_test[i])\n", 651 | " test_tweet=test_tweet.lower()\n", 652 | " #print(test_tweet)\n", 653 | " test_tweet=test_tweet.split()\n", 654 | " test_tweet=[ps_test.stem(w) for w in test_tweet if w not in stopwords.words('english')]\n", 655 | " test_tweet=' '.join(test_tweet)\n", 656 | " test_corpus.append(test_tweet)" 657 | ] 658 | }, 659 | { 660 | "cell_type": "code", 661 | "execution_count": 34, 662 | "metadata": {}, 663 | "outputs": [ 664 | { 665 | "name": "stderr", 666 | "output_type": "stream", 667 | "text": [ 668 | "100%|████████████████████████████████████████████████████████████████████████████| 3798/3798 [00:03<00:00, 1222.75it/s]" 669 | ] 670 | }, 671 | { 672 | "name": "stdout", 673 | "output_type": "stream", 674 | "text": [ 675 | "63427\n" 676 | ] 677 | }, 678 | { 679 | "name": "stderr", 680 | "output_type": "stream", 681 | "text": [ 682 | "\n" 683 | ] 684 | } 685 | ], 686 | "source": [ 687 | "from sklearn.feature_extraction.text import CountVectorizer as cv\n", 688 | "\n", 689 | "cv=cv(max_features=2500)\n", 690 | "x_test=cv.fit_transform(test_corpus).toarray()\n", 691 | "\n", 692 | "count=0\n", 693 | "for i in tqdm(x_test):\n", 694 | " for j in i:\n", 695 | " if j>0:\n", 696 | " count+=1\n", 697 | "print(count)" 698 | ] 
699 | }, 700 | { 701 | "cell_type": "code", 702 | "execution_count": 44, 703 | "metadata": {}, 704 | "outputs": [], 705 | "source": [ 706 | "#predicting values from the test set\n", 707 | "y_pred_nb=classifier_nb.predict(x_test)\n", 708 | "y_pred_pac=classifier_pac.predict(x_test)" 709 | ] 710 | }, 711 | { 712 | "cell_type": "code", 713 | "execution_count": 36, 714 | "metadata": {}, 715 | "outputs": [], 716 | "source": [ 717 | "from sklearn.metrics import classification_report,accuracy_score" 718 | ] 719 | }, 720 | { 721 | "cell_type": "code", 722 | "execution_count": 45, 723 | "metadata": {}, 724 | "outputs": [ 725 | { 726 | "name": "stdout", 727 | "output_type": "stream", 728 | "text": [ 729 | "Accuracy is: 0.2632964718272775\n", 730 | " precision recall f1-score support\n", 731 | "\n", 732 | " 0 0.41 0.01 0.02 592\n", 733 | " 1 0.24 0.01 0.02 599\n", 734 | " 2 0.30 0.25 0.28 1041\n", 735 | " 3 0.09 0.02 0.03 619\n", 736 | " 4 0.26 0.75 0.39 947\n", 737 | "\n", 738 | " accuracy 0.26 3798\n", 739 | " macro avg 0.26 0.21 0.15 3798\n", 740 | "weighted avg 0.26 0.26 0.18 3798\n", 741 | "\n" 742 | ] 743 | } 744 | ], 745 | "source": [ 746 | "#Multinomial NaiveBayes\n", 747 | "print('Accuracy is:',accuracy_score(y_test,y_pred_nb))\n", 748 | "print(classification_report(y_test,y_pred_nb))" 749 | ] 750 | }, 751 | { 752 | "cell_type": "code", 753 | "execution_count": 38, 754 | "metadata": {}, 755 | "outputs": [ 756 | { 757 | "name": "stdout", 758 | "output_type": "stream", 759 | "text": [ 760 | "Accuracy is: 0.25223802001053186\n", 761 | " precision recall f1-score support\n", 762 | "\n", 763 | " 0 0.20 0.11 0.14 592\n", 764 | " 1 0.18 0.26 0.22 599\n", 765 | " 2 0.29 0.40 0.34 1041\n", 766 | " 3 0.30 0.22 0.26 619\n", 767 | " 4 0.25 0.19 0.22 947\n", 768 | "\n", 769 | " accuracy 0.25 3798\n", 770 | " macro avg 0.24 0.24 0.23 3798\n", 771 | "weighted avg 0.25 0.25 0.24 3798\n", 772 | "\n" 773 | ] 774 | } 775 | ], 776 | "source": [ 777 | "#PassiveAggressiveClassifier\n", 778 | "print('Accuracy is:',accuracy_score(y_test,y_pred_pac))\n", 779 | "print(classification_report(y_test,y_pred_pac))" 780 | ] 781 | }, 782 | { 783 | "cell_type": "code", 784 | "execution_count": null, 785 | "metadata": {}, 786 | "outputs": [], 787 | "source": [] 788 | }, 789 | { 790 | "cell_type": "code", 791 | "execution_count": null, 792 | "metadata": {}, 793 | "outputs": [], 794 | "source": [] 795 | }, 796 | { 797 | "cell_type": "code", 798 | "execution_count": null, 799 | "metadata": {}, 800 | "outputs": [], 801 | "source": [] 802 | }, 803 | { 804 | "cell_type": "code", 805 | "execution_count": null, 806 | "metadata": {}, 807 | "outputs": [], 808 | "source": [] 809 | }, 810 | { 811 | "cell_type": "code", 812 | "execution_count": null, 813 | "metadata": {}, 814 | "outputs": [], 815 | "source": [] 816 | } 817 | ], 818 | "metadata": { 819 | "kernelspec": { 820 | "display_name": "Python 3", 821 | "language": "python", 822 | "name": "python3" 823 | }, 824 | "language_info": { 825 | "codemirror_mode": { 826 | "name": "ipython", 827 | "version": 3 828 | }, 829 | "file_extension": ".py", 830 | "mimetype": "text/x-python", 831 | "name": "python", 832 | "nbconvert_exporter": "python", 833 | "pygments_lexer": "ipython3", 834 | "version": "3.7.4" 835 | } 836 | }, 837 | "nbformat": 4, 838 | "nbformat_minor": 2 839 | } 840 | -------------------------------------------------------------------------------- /Projects/Daily News for Stock Market Prediction/Stock Prediction using News Headlines.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Stock Sentiment Analysis using News Headlines" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "#dependency\n", 17 | "import pandas as pd" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "df=pd.read_csv('D:/Data Sets/Daily News for Stock Market Prediction/Stock-Sentiment-Analysis/trunk/Data.csv', encoding = \"ISO-8859-1\")" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "data": { 36 | "text/html": [ 37 | "
\n", 38 | "\n", 51 | "\n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | "
DateLabelTop1Top2Top3Top4Top5Top6Top7Top8...Top16Top17Top18Top19Top20Top21Top22Top23Top24Top25
02000-01-030A 'hindrance to operations': extracts from the...ScorecardHughes' instant hit buoys BluesJack gets his skates on at ice-cold AlexChaos as Maracana builds up for UnitedDepleted Leicester prevail as Elliott spoils E...Hungry Spurs sense rich pickingsGunners so wide of an easy target...Flintoff injury piles on woe for EnglandHunters threaten Jospin with new battle of the...Kohl's successor drawn into scandalThe difference between men and womenSara Denver, nurse turned solicitorDiana's landmine crusade put Tories in a panicYeltsin's resignation caught opposition flat-f...Russian rouletteSold outRecovering a title
12000-01-040ScorecardThe best lake sceneLeader: German sleaze inquiryCheerio, boyoThe main recommendationsHas Cubie killed fees?Has Cubie killed fees?Has Cubie killed fees?...On the critical listThe timing of their livesDear doctorIrish court halts IRA man's extradition to Nor...Burundi peace initiative fades after rebels re...PE points the way forward to the ECBCampaigners keep up pressure on Nazi war crime...Jane RatcliffeYet more things you wouldn't know without the ...Millennium bug fails to bite
22000-01-050Coventry caught on counter by FloUnited's rivals on the road to RioThatcher issues defence before trial by videoPolice help Smith lay down the law at EvertonTale of Trautmann bears two more retellingsEngland on the rackPakistan retaliate with call for video of WalshCullinan continues his Cape monopoly...South Melbourne (Australia)Necaxa (Mexico)Real Madrid (Spain)Raja Casablanca (Morocco)Corinthians (Brazil)Tony's pet projectAl Nassr (Saudi Arabia)Ideal Holmes showPinochet leaves hospital after testsUseful links
32000-01-061Pilgrim knows how to progressThatcher facing banMcIlroy calls for Irish fighting spiritLeicester bin stadium blueprintUnited braced for Mexican waveAuntie back in fashion, even if the dress look...Shoaib appeal goes to the topHussain hurt by 'shambles' but lays blame on e......Putin admits Yeltsin quit to give him a head s...BBC worst hit as digital TV begins to biteHow much can you pay for...Christmas glitchesUpending a table, Chopping a line and Scoring ...Scientific evidence 'unreliable', defence claimsFusco wins judicial review in extradition caseRebels thwart Russian advanceBlair orders shake-up of failing NHSLessons of law's hard heart
42000-01-071Hitches and HorlocksBeckham off but United surviveBreast cancer screeningAlan ParkerGuardian readers: are you all whingers?Hollywood BeyondAshes and diamondsWhingers - a formidable minority...Most everywhere: UDIsMost wanted: Chloe lunettesReturn of the cane 'completely off the agenda'From Sleepy Hollow to GreenelandBlunkett outlines vision for over 11sEmbattled Dobson attacks 'play now, pay later'...Doom and the DomeWhat is the north-south divide?Aitken released from jailGone aloft
\n", 201 | "

5 rows × 27 columns

\n", 202 | "
" 203 | ], 204 | "text/plain": [ 205 | " Date Label Top1 \\\n", 206 | "0 2000-01-03 0 A 'hindrance to operations': extracts from the... \n", 207 | "1 2000-01-04 0 Scorecard \n", 208 | "2 2000-01-05 0 Coventry caught on counter by Flo \n", 209 | "3 2000-01-06 1 Pilgrim knows how to progress \n", 210 | "4 2000-01-07 1 Hitches and Horlocks \n", 211 | "\n", 212 | " Top2 \\\n", 213 | "0 Scorecard \n", 214 | "1 The best lake scene \n", 215 | "2 United's rivals on the road to Rio \n", 216 | "3 Thatcher facing ban \n", 217 | "4 Beckham off but United survive \n", 218 | "\n", 219 | " Top3 \\\n", 220 | "0 Hughes' instant hit buoys Blues \n", 221 | "1 Leader: German sleaze inquiry \n", 222 | "2 Thatcher issues defence before trial by video \n", 223 | "3 McIlroy calls for Irish fighting spirit \n", 224 | "4 Breast cancer screening \n", 225 | "\n", 226 | " Top4 \\\n", 227 | "0 Jack gets his skates on at ice-cold Alex \n", 228 | "1 Cheerio, boyo \n", 229 | "2 Police help Smith lay down the law at Everton \n", 230 | "3 Leicester bin stadium blueprint \n", 231 | "4 Alan Parker \n", 232 | "\n", 233 | " Top5 \\\n", 234 | "0 Chaos as Maracana builds up for United \n", 235 | "1 The main recommendations \n", 236 | "2 Tale of Trautmann bears two more retellings \n", 237 | "3 United braced for Mexican wave \n", 238 | "4 Guardian readers: are you all whingers? \n", 239 | "\n", 240 | " Top6 \\\n", 241 | "0 Depleted Leicester prevail as Elliott spoils E... \n", 242 | "1 Has Cubie killed fees? \n", 243 | "2 England on the rack \n", 244 | "3 Auntie back in fashion, even if the dress look... \n", 245 | "4 Hollywood Beyond \n", 246 | "\n", 247 | " Top7 \\\n", 248 | "0 Hungry Spurs sense rich pickings \n", 249 | "1 Has Cubie killed fees? \n", 250 | "2 Pakistan retaliate with call for video of Walsh \n", 251 | "3 Shoaib appeal goes to the top \n", 252 | "4 Ashes and diamonds \n", 253 | "\n", 254 | " Top8 ... \\\n", 255 | "0 Gunners so wide of an easy target ... \n", 256 | "1 Has Cubie killed fees? ... \n", 257 | "2 Cullinan continues his Cape monopoly ... \n", 258 | "3 Hussain hurt by 'shambles' but lays blame on e... ... \n", 259 | "4 Whingers - a formidable minority ... \n", 260 | "\n", 261 | " Top16 \\\n", 262 | "0 Flintoff injury piles on woe for England \n", 263 | "1 On the critical list \n", 264 | "2 South Melbourne (Australia) \n", 265 | "3 Putin admits Yeltsin quit to give him a head s... \n", 266 | "4 Most everywhere: UDIs \n", 267 | "\n", 268 | " Top17 \\\n", 269 | "0 Hunters threaten Jospin with new battle of the... \n", 270 | "1 The timing of their lives \n", 271 | "2 Necaxa (Mexico) \n", 272 | "3 BBC worst hit as digital TV begins to bite \n", 273 | "4 Most wanted: Chloe lunettes \n", 274 | "\n", 275 | " Top18 \\\n", 276 | "0 Kohl's successor drawn into scandal \n", 277 | "1 Dear doctor \n", 278 | "2 Real Madrid (Spain) \n", 279 | "3 How much can you pay for... \n", 280 | "4 Return of the cane 'completely off the agenda' \n", 281 | "\n", 282 | " Top19 \\\n", 283 | "0 The difference between men and women \n", 284 | "1 Irish court halts IRA man's extradition to Nor... \n", 285 | "2 Raja Casablanca (Morocco) \n", 286 | "3 Christmas glitches \n", 287 | "4 From Sleepy Hollow to Greeneland \n", 288 | "\n", 289 | " Top20 \\\n", 290 | "0 Sara Denver, nurse turned solicitor \n", 291 | "1 Burundi peace initiative fades after rebels re... \n", 292 | "2 Corinthians (Brazil) \n", 293 | "3 Upending a table, Chopping a line and Scoring ... 
\n", 294 | "4 Blunkett outlines vision for over 11s \n", 295 | "\n", 296 | " Top21 \\\n", 297 | "0 Diana's landmine crusade put Tories in a panic \n", 298 | "1 PE points the way forward to the ECB \n", 299 | "2 Tony's pet project \n", 300 | "3 Scientific evidence 'unreliable', defence claims \n", 301 | "4 Embattled Dobson attacks 'play now, pay later'... \n", 302 | "\n", 303 | " Top22 \\\n", 304 | "0 Yeltsin's resignation caught opposition flat-f... \n", 305 | "1 Campaigners keep up pressure on Nazi war crime... \n", 306 | "2 Al Nassr (Saudi Arabia) \n", 307 | "3 Fusco wins judicial review in extradition case \n", 308 | "4 Doom and the Dome \n", 309 | "\n", 310 | " Top23 \\\n", 311 | "0 Russian roulette \n", 312 | "1 Jane Ratcliffe \n", 313 | "2 Ideal Holmes show \n", 314 | "3 Rebels thwart Russian advance \n", 315 | "4 What is the north-south divide? \n", 316 | "\n", 317 | " Top24 \\\n", 318 | "0 Sold out \n", 319 | "1 Yet more things you wouldn't know without the ... \n", 320 | "2 Pinochet leaves hospital after tests \n", 321 | "3 Blair orders shake-up of failing NHS \n", 322 | "4 Aitken released from jail \n", 323 | "\n", 324 | " Top25 \n", 325 | "0 Recovering a title \n", 326 | "1 Millennium bug fails to bite \n", 327 | "2 Useful links \n", 328 | "3 Lessons of law's hard heart \n", 329 | "4 Gone aloft \n", 330 | "\n", 331 | "[5 rows x 27 columns]" 332 | ] 333 | }, 334 | "execution_count": 3, 335 | "metadata": {}, 336 | "output_type": "execute_result" 337 | } 338 | ], 339 | "source": [ 340 | "df.head()" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 4, 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "#segregating into train and test on the basis of timestamps\n", 350 | "train = df[df['Date'] < '20150101']\n", 351 | "test = df[df['Date'] > '20141231']" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 5, 357 | "metadata": {}, 358 | "outputs": [ 359 | { 360 | "data": { 361 | "text/html": [ 362 | "
\n", 363 | "\n", 376 | "\n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | "
0123456789...15161718192021222324
0A hindrance to operations extracts from the...ScorecardHughes instant hit buoys BluesJack gets his skates on at ice cold AlexChaos as Maracana builds up for UnitedDepleted Leicester prevail as Elliott spoils E...Hungry Spurs sense rich pickingsGunners so wide of an easy targetDerby raise a glass to Strupar s debut doubleSouthgate strikes Leeds pay the penalty...Flintoff injury piles on woe for EnglandHunters threaten Jospin with new battle of the...Kohl s successor drawn into scandalThe difference between men and womenSara Denver nurse turned solicitorDiana s landmine crusade put Tories in a panicYeltsin s resignation caught opposition flat f...Russian rouletteSold outRecovering a title
1ScorecardThe best lake sceneLeader German sleaze inquiryCheerio boyoThe main recommendationsHas Cubie killed feesHas Cubie killed feesHas Cubie killed feesHopkins furious at Foster s lack of Hannibal...Has Cubie killed fees...On the critical listThe timing of their livesDear doctorIrish court halts IRA man s extradition to Nor...Burundi peace initiative fades after rebels re...PE points the way forward to the ECBCampaigners keep up pressure on Nazi war crime...Jane RatcliffeYet more things you wouldn t know without the ...Millennium bug fails to bite
2Coventry caught on counter by FloUnited s rivals on the road to RioThatcher issues defence before trial by videoPolice help Smith lay down the law at EvertonTale of Trautmann bears two more retellingsEngland on the rackPakistan retaliate with call for video of WalshCullinan continues his Cape monopolyMcGrath puts India out of their miseryBlair Witch bandwagon rolls on...South Melbourne AustraliaNecaxa MexicoReal Madrid SpainRaja Casablanca MoroccoCorinthians BrazilTony s pet projectAl Nassr Saudi ArabiaIdeal Holmes showPinochet leaves hospital after testsUseful links
3Pilgrim knows how to progressThatcher facing banMcIlroy calls for Irish fighting spiritLeicester bin stadium blueprintUnited braced for Mexican waveAuntie back in fashion even if the dress look...Shoaib appeal goes to the topHussain hurt by shambles but lays blame on e...England s decade of disastersRevenge is sweet for jubilant Cronje...Putin admits Yeltsin quit to give him a head s...BBC worst hit as digital TV begins to biteHow much can you pay forChristmas glitchesUpending a table Chopping a line and Scoring ...Scientific evidence unreliable defence claimsFusco wins judicial review in extradition caseRebels thwart Russian advanceBlair orders shake up of failing NHSLessons of law s hard heart
4Hitches and HorlocksBeckham off but United surviveBreast cancer screeningAlan ParkerGuardian readers are you all whingersHollywood BeyondAshes and diamondsWhingers a formidable minorityAlan Parker part twoThuggery Toxins and Ties...Most everywhere UDIsMost wanted Chloe lunettesReturn of the cane completely off the agendaFrom Sleepy Hollow to GreenelandBlunkett outlines vision for over sEmbattled Dobson attacks play now pay later ...Doom and the DomeWhat is the north south divideAitken released from jailGone aloft
\n", 526 | "

5 rows × 25 columns

\n", 527 | "
" 528 | ], 529 | "text/plain": [ 530 | " 0 \\\n", 531 | "0 A hindrance to operations extracts from the... \n", 532 | "1 Scorecard \n", 533 | "2 Coventry caught on counter by Flo \n", 534 | "3 Pilgrim knows how to progress \n", 535 | "4 Hitches and Horlocks \n", 536 | "\n", 537 | " 1 \\\n", 538 | "0 Scorecard \n", 539 | "1 The best lake scene \n", 540 | "2 United s rivals on the road to Rio \n", 541 | "3 Thatcher facing ban \n", 542 | "4 Beckham off but United survive \n", 543 | "\n", 544 | " 2 \\\n", 545 | "0 Hughes instant hit buoys Blues \n", 546 | "1 Leader German sleaze inquiry \n", 547 | "2 Thatcher issues defence before trial by video \n", 548 | "3 McIlroy calls for Irish fighting spirit \n", 549 | "4 Breast cancer screening \n", 550 | "\n", 551 | " 3 \\\n", 552 | "0 Jack gets his skates on at ice cold Alex \n", 553 | "1 Cheerio boyo \n", 554 | "2 Police help Smith lay down the law at Everton \n", 555 | "3 Leicester bin stadium blueprint \n", 556 | "4 Alan Parker \n", 557 | "\n", 558 | " 4 \\\n", 559 | "0 Chaos as Maracana builds up for United \n", 560 | "1 The main recommendations \n", 561 | "2 Tale of Trautmann bears two more retellings \n", 562 | "3 United braced for Mexican wave \n", 563 | "4 Guardian readers are you all whingers \n", 564 | "\n", 565 | " 5 \\\n", 566 | "0 Depleted Leicester prevail as Elliott spoils E... \n", 567 | "1 Has Cubie killed fees \n", 568 | "2 England on the rack \n", 569 | "3 Auntie back in fashion even if the dress look... \n", 570 | "4 Hollywood Beyond \n", 571 | "\n", 572 | " 6 \\\n", 573 | "0 Hungry Spurs sense rich pickings \n", 574 | "1 Has Cubie killed fees \n", 575 | "2 Pakistan retaliate with call for video of Walsh \n", 576 | "3 Shoaib appeal goes to the top \n", 577 | "4 Ashes and diamonds \n", 578 | "\n", 579 | " 7 \\\n", 580 | "0 Gunners so wide of an easy target \n", 581 | "1 Has Cubie killed fees \n", 582 | "2 Cullinan continues his Cape monopoly \n", 583 | "3 Hussain hurt by shambles but lays blame on e... \n", 584 | "4 Whingers a formidable minority \n", 585 | "\n", 586 | " 8 \\\n", 587 | "0 Derby raise a glass to Strupar s debut double \n", 588 | "1 Hopkins furious at Foster s lack of Hannibal... \n", 589 | "2 McGrath puts India out of their misery \n", 590 | "3 England s decade of disasters \n", 591 | "4 Alan Parker part two \n", 592 | "\n", 593 | " 9 ... \\\n", 594 | "0 Southgate strikes Leeds pay the penalty ... \n", 595 | "1 Has Cubie killed fees ... \n", 596 | "2 Blair Witch bandwagon rolls on ... \n", 597 | "3 Revenge is sweet for jubilant Cronje ... \n", 598 | "4 Thuggery Toxins and Ties ... \n", 599 | "\n", 600 | " 15 \\\n", 601 | "0 Flintoff injury piles on woe for England \n", 602 | "1 On the critical list \n", 603 | "2 South Melbourne Australia \n", 604 | "3 Putin admits Yeltsin quit to give him a head s... \n", 605 | "4 Most everywhere UDIs \n", 606 | "\n", 607 | " 16 \\\n", 608 | "0 Hunters threaten Jospin with new battle of the... \n", 609 | "1 The timing of their lives \n", 610 | "2 Necaxa Mexico \n", 611 | "3 BBC worst hit as digital TV begins to bite \n", 612 | "4 Most wanted Chloe lunettes \n", 613 | "\n", 614 | " 17 \\\n", 615 | "0 Kohl s successor drawn into scandal \n", 616 | "1 Dear doctor \n", 617 | "2 Real Madrid Spain \n", 618 | "3 How much can you pay for \n", 619 | "4 Return of the cane completely off the agenda \n", 620 | "\n", 621 | " 18 \\\n", 622 | "0 The difference between men and women \n", 623 | "1 Irish court halts IRA man s extradition to Nor... 
\n", 624 | "2 Raja Casablanca Morocco \n", 625 | "3 Christmas glitches \n", 626 | "4 From Sleepy Hollow to Greeneland \n", 627 | "\n", 628 | " 19 \\\n", 629 | "0 Sara Denver nurse turned solicitor \n", 630 | "1 Burundi peace initiative fades after rebels re... \n", 631 | "2 Corinthians Brazil \n", 632 | "3 Upending a table Chopping a line and Scoring ... \n", 633 | "4 Blunkett outlines vision for over s \n", 634 | "\n", 635 | " 20 \\\n", 636 | "0 Diana s landmine crusade put Tories in a panic \n", 637 | "1 PE points the way forward to the ECB \n", 638 | "2 Tony s pet project \n", 639 | "3 Scientific evidence unreliable defence claims \n", 640 | "4 Embattled Dobson attacks play now pay later ... \n", 641 | "\n", 642 | " 21 \\\n", 643 | "0 Yeltsin s resignation caught opposition flat f... \n", 644 | "1 Campaigners keep up pressure on Nazi war crime... \n", 645 | "2 Al Nassr Saudi Arabia \n", 646 | "3 Fusco wins judicial review in extradition case \n", 647 | "4 Doom and the Dome \n", 648 | "\n", 649 | " 22 \\\n", 650 | "0 Russian roulette \n", 651 | "1 Jane Ratcliffe \n", 652 | "2 Ideal Holmes show \n", 653 | "3 Rebels thwart Russian advance \n", 654 | "4 What is the north south divide \n", 655 | "\n", 656 | " 23 \\\n", 657 | "0 Sold out \n", 658 | "1 Yet more things you wouldn t know without the ... \n", 659 | "2 Pinochet leaves hospital after tests \n", 660 | "3 Blair orders shake up of failing NHS \n", 661 | "4 Aitken released from jail \n", 662 | "\n", 663 | " 24 \n", 664 | "0 Recovering a title \n", 665 | "1 Millennium bug fails to bite \n", 666 | "2 Useful links \n", 667 | "3 Lessons of law s hard heart \n", 668 | "4 Gone aloft \n", 669 | "\n", 670 | "[5 rows x 25 columns]" 671 | ] 672 | }, 673 | "execution_count": 5, 674 | "metadata": {}, 675 | "output_type": "execute_result" 676 | } 677 | ], 678 | "source": [ 679 | "# Removing punctuations\n", 680 | "data=train.iloc[:,2:27]\n", 681 | "data.replace(\"[^a-zA-Z]\",\" \",regex=True, inplace=True)\n", 682 | "\n", 683 | "# Renaming column names for ease of access\n", 684 | "list1= [i for i in range(25)]\n", 685 | "new_Index=[str(i) for i in list1]\n", 686 | "data.columns= new_Index\n", 687 | "data.head(5)\n", 688 | "\n" 689 | ] 690 | }, 691 | { 692 | "cell_type": "code", 693 | "execution_count": 6, 694 | "metadata": {}, 695 | "outputs": [ 696 | { 697 | "data": { 698 | "text/html": [ 699 | "
\n", 700 | "\n", 713 | "\n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | "
0123456789...15161718192021222324
0a hindrance to operations extracts from the...scorecardhughes instant hit buoys bluesjack gets his skates on at ice cold alexchaos as maracana builds up for uniteddepleted leicester prevail as elliott spoils e...hungry spurs sense rich pickingsgunners so wide of an easy targetderby raise a glass to strupar s debut doublesouthgate strikes leeds pay the penalty...flintoff injury piles on woe for englandhunters threaten jospin with new battle of the...kohl s successor drawn into scandalthe difference between men and womensara denver nurse turned solicitordiana s landmine crusade put tories in a panicyeltsin s resignation caught opposition flat f...russian roulettesold outrecovering a title
\n", 767 | "

1 rows × 25 columns

\n", 768 | "
" 769 | ], 770 | "text/plain": [ 771 | " 0 1 \\\n", 772 | "0 a hindrance to operations extracts from the... scorecard \n", 773 | "\n", 774 | " 2 3 \\\n", 775 | "0 hughes instant hit buoys blues jack gets his skates on at ice cold alex \n", 776 | "\n", 777 | " 4 \\\n", 778 | "0 chaos as maracana builds up for united \n", 779 | "\n", 780 | " 5 \\\n", 781 | "0 depleted leicester prevail as elliott spoils e... \n", 782 | "\n", 783 | " 6 7 \\\n", 784 | "0 hungry spurs sense rich pickings gunners so wide of an easy target \n", 785 | "\n", 786 | " 8 \\\n", 787 | "0 derby raise a glass to strupar s debut double \n", 788 | "\n", 789 | " 9 ... \\\n", 790 | "0 southgate strikes leeds pay the penalty ... \n", 791 | "\n", 792 | " 15 \\\n", 793 | "0 flintoff injury piles on woe for england \n", 794 | "\n", 795 | " 16 \\\n", 796 | "0 hunters threaten jospin with new battle of the... \n", 797 | "\n", 798 | " 17 18 \\\n", 799 | "0 kohl s successor drawn into scandal the difference between men and women \n", 800 | "\n", 801 | " 19 \\\n", 802 | "0 sara denver nurse turned solicitor \n", 803 | "\n", 804 | " 20 \\\n", 805 | "0 diana s landmine crusade put tories in a panic \n", 806 | "\n", 807 | " 21 22 \\\n", 808 | "0 yeltsin s resignation caught opposition flat f... russian roulette \n", 809 | "\n", 810 | " 23 24 \n", 811 | "0 sold out recovering a title \n", 812 | "\n", 813 | "[1 rows x 25 columns]" 814 | ] 815 | }, 816 | "execution_count": 6, 817 | "metadata": {}, 818 | "output_type": "execute_result" 819 | } 820 | ], 821 | "source": [ 822 | "# Convertng headlines to lower case\n", 823 | "for index in new_Index:\n", 824 | " data[index]=data[index].str.lower()\n", 825 | "data.head(1)" 826 | ] 827 | }, 828 | { 829 | "cell_type": "code", 830 | "execution_count": 11, 831 | "metadata": {}, 832 | "outputs": [ 833 | { 834 | "data": { 835 | "text/plain": [ 836 | "'scorecard the best lake scene leader german sleaze inquiry cheerio boyo the main recommendations has cubie killed fees has cubie killed fees has cubie killed fees hopkins furious at foster s lack of hannibal appetite has cubie killed fees a tale of two tails i say what i like and i like what i say elbows eyes and nipples task force to assess risk of asteroid collision how i found myself at last on the critical list the timing of their lives dear doctor irish court halts ira man s extradition to northern ireland burundi peace initiative fades after rebels reject mandela as mediator pe points the way forward to the ecb campaigners keep up pressure on nazi war crimes suspect jane ratcliffe yet more things you wouldn t know without the movies millennium bug fails to bite'" 837 | ] 838 | }, 839 | "execution_count": 11, 840 | "metadata": {}, 841 | "output_type": "execute_result" 842 | } 843 | ], 844 | "source": [ 845 | "' '.join(str(x) for x in data.iloc[1,0:25])" 846 | ] 847 | }, 848 | { 849 | "cell_type": "code", 850 | "execution_count": 12, 851 | "metadata": {}, 852 | "outputs": [], 853 | "source": [ 854 | "headlines = []\n", 855 | "for row in range(0,len(data.index)):\n", 856 | " headlines.append(' '.join(str(x) for x in data.iloc[row,0:25]))" 857 | ] 858 | }, 859 | { 860 | "cell_type": "code", 861 | "execution_count": 13, 862 | "metadata": {}, 863 | "outputs": [ 864 | { 865 | "data": { 866 | "text/plain": [ 867 | "'a hindrance to operations extracts from the leaked reports scorecard hughes instant hit buoys blues jack gets his skates on at ice cold alex chaos as maracana builds up for united depleted leicester prevail as elliott spoils everton s party 
hungry spurs sense rich pickings gunners so wide of an easy target derby raise a glass to strupar s debut double southgate strikes leeds pay the penalty hammers hand robson a youthful lesson saints party like it s wear wolves have turned into lambs stump mike catches testy gough s taunt langer escapes to hit flintoff injury piles on woe for england hunters threaten jospin with new battle of the somme kohl s successor drawn into scandal the difference between men and women sara denver nurse turned solicitor diana s landmine crusade put tories in a panic yeltsin s resignation caught opposition flat footed russian roulette sold out recovering a title'" 868 | ] 869 | }, 870 | "execution_count": 13, 871 | "metadata": {}, 872 | "output_type": "execute_result" 873 | } 874 | ], 875 | "source": [ 876 | "headlines[0]" 877 | ] 878 | }, 879 | { 880 | "cell_type": "code", 881 | "execution_count": 14, 882 | "metadata": {}, 883 | "outputs": [], 884 | "source": [ 885 | "from sklearn.feature_extraction.text import CountVectorizer\n", 886 | "from sklearn.ensemble import RandomForestClassifier" 887 | ] 888 | }, 889 | { 890 | "cell_type": "code", 891 | "execution_count": 15, 892 | "metadata": {}, 893 | "outputs": [], 894 | "source": [ 895 | "## implement BAG OF WORDS\n", 896 | "countvector=CountVectorizer(ngram_range=(1,3))\n", 897 | "traindataset=countvector.fit_transform(headlines)" 898 | ] 899 | }, 900 | { 901 | "cell_type": "code", 902 | "execution_count": null, 903 | "metadata": {}, 904 | "outputs": [], 905 | "source": [ 906 | "# implement RandomForest Classifier\n", 907 | "randomclassifier=RandomForestClassifier(n_estimators=200,criterion='entropy')\n", 908 | "randomclassifier.fit(traindataset,train['Label'])" 909 | ] 910 | }, 911 | { 912 | "cell_type": "code", 913 | "execution_count": 35, 914 | "metadata": {}, 915 | "outputs": [], 916 | "source": [ 917 | "## Predict for the Test Dataset\n", 918 | "test_transform= []\n", 919 | "for row in range(0,len(test.index)):\n", 920 | " test_transform.append(' '.join(str(x) for x in test.iloc[row,2:27]))\n", 921 | "test_dataset = countvector.transform(test_transform)\n", 922 | "predictions = randomclassifier.predict(test_dataset)" 923 | ] 924 | }, 925 | { 926 | "cell_type": "code", 927 | "execution_count": 36, 928 | "metadata": {}, 929 | "outputs": [], 930 | "source": [ 931 | "## Import library to check accuracy\n", 932 | "from sklearn.metrics import classification_report,confusion_matrix,accuracy_score" 933 | ] 934 | }, 935 | { 936 | "cell_type": "code", 937 | "execution_count": 37, 938 | "metadata": {}, 939 | "outputs": [ 940 | { 941 | "name": "stdout", 942 | "output_type": "stream", 943 | "text": [ 944 | "[[139 47]\n", 945 | " [ 13 179]]\n", 946 | "0.8412698412698413\n", 947 | " precision recall f1-score support\n", 948 | "\n", 949 | " 0 0.91 0.75 0.82 186\n", 950 | " 1 0.79 0.93 0.86 192\n", 951 | "\n", 952 | " micro avg 0.84 0.84 0.84 378\n", 953 | " macro avg 0.85 0.84 0.84 378\n", 954 | "weighted avg 0.85 0.84 0.84 378\n", 955 | "\n" 956 | ] 957 | } 958 | ], 959 | "source": [ 960 | "matrix=confusion_matrix(test['Label'],predictions)\n", 961 | "print(matrix)\n", 962 | "score=accuracy_score(test['Label'],predictions)\n", 963 | "print(score)\n", 964 | "report=classification_report(test['Label'],predictions)\n", 965 | "print(report)" 966 | ] 967 | }, 968 | { 969 | "cell_type": "code", 970 | "execution_count": null, 971 | "metadata": {}, 972 | "outputs": [], 973 | "source": [] 974 | } 975 | ], 976 | "metadata": { 977 | "kernelspec": { 978 | "display_name": 
"Python 3", 979 | "language": "python", 980 | "name": "python3" 981 | }, 982 | "language_info": { 983 | "codemirror_mode": { 984 | "name": "ipython", 985 | "version": 3 986 | }, 987 | "file_extension": ".py", 988 | "mimetype": "text/x-python", 989 | "name": "python", 990 | "nbconvert_exporter": "python", 991 | "pygments_lexer": "ipython3", 992 | "version": "3.7.4" 993 | } 994 | }, 995 | "nbformat": 4, 996 | "nbformat_minor": 2 997 | } 998 | --------------------------------------------------------------------------------