├── Projects ├── .gitkeep ├── Spam-Ham Classification │ ├── .gitkeep │ ├── bagofwords_classifier.py │ ├── HashVectorizer.py │ ├── passive_aggresiveClassifier.py │ └── readme.md ├── Covid_tweets Sentiment Analysis │ ├── data.zip │ ├── readme.md │ └── Corona_NLP text classification.ipynb ├── IMDB Movie Reviews │ └── readme.md ├── Tweet Sentiment Extraction │ └── readme.md ├── Fake News Detection │ ├── readme.md │ └── Fake News.ipynb ├── Daily News for Stock Market Prediction │ ├── README.md │ └── Stock Prediction using News Headlines.ipynb ├── Twitter Sentiment Analysis(Beginners) │ ├── readme.md │ └── Twitter Sentiment Analysis (Small Dataset).ipynb ├── Women's E-Commerce Clothing Reviews │ └── readme.md └── Yelp Reviews │ └── readme.md ├── Deep Learning ├── readme.md ├── Loss │ ├── .gitkeep │ └── loss.py ├── Activation Functions │ ├── .gitkeep │ └── activationfunctions.py └── cuda.py ├── Regular Expression └── .gitkeep ├── cuda.py ├── stemming_demo.py ├── tokenize.py ├── stopwords_demo.py ├── Basic Perceptron.py ├── Lemmatization.py ├── README.md ├── speech_tagging.py ├── chinking.py ├── nameEntity_recog.py ├── chunking.py ├── tf-idf.py ├── bagofwords.py └── word2vec.py /Projects/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Deep Learning/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Deep Learning/Loss/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Regular Expression/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Deep Learning/Activation Functions/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Projects/Spam-Ham Classification/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Projects/Covid_tweets Sentiment Analysis/data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bhav09/NLP-Basics/HEAD/Projects/Covid_tweets Sentiment Analysis/data.zip -------------------------------------------------------------------------------- /cuda.py: -------------------------------------------------------------------------------- 1 | #cuda 2 | import torch 3 | print(torch.cuda.is_available) 4 | 5 | device=torch.device("cuda") 6 | x=torch.randn(2,2).to(device) 7 | 8 | #we cannot perform operators with cuda and cpu tensors at once 9 | #both the tensors either have to be CPU or Cuda tensors -------------------------------------------------------------------------------- /Deep Learning/cuda.py: -------------------------------------------------------------------------------- 1 | #cuda 2 | import torch 3 | print(torch.cuda.is_available) 4 | 5 | device=torch.device("cuda") 6 | x=torch.randn(2,2).to(device) 7 | 8 | #we cannot perform operators with cuda and cpu tensors at once 9 | #both the tensors either have to be CPU or Cuda tensors 
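# Suggested correction (not part of the original file): torch.cuda.is_available is a
# function, so the print above shows the function object instead of True/False. Calling
# it, and falling back to the CPU when no GPU is present, keeps the snippet runnable
# everywhere:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
x = torch.randn(2, 2).to(device)   # move the tensor to the chosen device
print(x.device)                    # cuda:0 on a GPU machine, cpu otherwise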
-------------------------------------------------------------------------------- /stemming_demo.py: -------------------------------------------------------------------------------- 1 | '''Stemming is the process of reducing a word to its word stem that affixes to 2 | suffixes and prefixes or to the roots of words known as a lemma 3 | ''' 4 | from nltk.stem import PorterStemmer 5 | from nltk.tokenize import word_tokenize 6 | 7 | text='He played football every Tuesday. He plays football every Tuesday. He is going to play football every Tuesday.' 8 | words=word_tokenize(text) 9 | #print(words) 10 | 11 | ps=PorterStemmer() 12 | 13 | for w in words: 14 | print(ps.stem(w)) 15 | -------------------------------------------------------------------------------- /Projects/IMDB Movie Reviews/readme.md: -------------------------------------------------------------------------------- 1 | ## About 2 | 3 | IMDB dataset having 50K movie reviews for natural language processing or Text analytics. 4 | 5 | This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. 6 | We provide a set of 25,000 highly polar movie reviews for training and 25,000 for testing. So, predict the number of positive and negative reviews using either classification or deep learning algorithms. 7 | 8 | ## Dataset [Link](https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews) 9 | -------------------------------------------------------------------------------- /Projects/Tweet Sentiment Extraction/readme.md: -------------------------------------------------------------------------------- 1 | ## About 2 | 3 | ### Files 4 | 5 | train.csv - the training set 6 | 7 | test.csv - the test set 8 | 9 | sample_submission.csv - a sample submission file in the correct format 10 | 11 | Columns 12 | 13 | textID - unique ID for each piece of text 14 | 15 | text - the text of the tweet 16 | 17 | sentiment - the general sentiment of the tweet 18 | 19 | selected_text - [train only] the text that supports the tweet's sentiment 20 | 21 | ### Source: Kaggle 22 | 23 | ### Dataset [Link](https://www.kaggle.com/c/tweet-sentiment-extraction/data) 24 | -------------------------------------------------------------------------------- /tokenize.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Tokenizing is the process in which huge sentence/ paragraphs are divided into smaller segments called tokens. 3 | Here we will be seeing two tokenizers : word_tokenizer, sent_tokenizer 4 | word_tokenizer=it actually divides a group of sentence where the delimiter is the word 5 | sent_tokenizer= it delimits the para/sentences on sentences. 6 | ''' 7 | 8 | from nltk.tokenize import sent_tokenize, word_tokenize 9 | 10 | 11 | text='Hello Mr. Bhavishya Pandit. How are doing? Hope everything is going smooth.' 
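# Expected output of the two prints below (approximately, assuming NLTK's default 'punkt'
# sentence model, which treats "Mr." as an abbreviation rather than a sentence boundary):
#   Sentence Tokenize: ['Hello Mr. Bhavishya Pandit.', 'How are doing?', 'Hope everything is going smooth.']
#   Word Tokenize: ['Hello', 'Mr.', 'Bhavishya', 'Pandit', '.', 'How', 'are', 'doing', '?', ...]
# Both tokenizers need the punkt data: run nltk.download('punkt') once beforehand.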
12 | print('Sentence Tokenize:',sent_tokenize(text)) 13 | print() 14 | print('Word Tokenize:',word_tokenize(text)) -------------------------------------------------------------------------------- /Projects/Fake News Detection/readme.md: -------------------------------------------------------------------------------- 1 | ## About 2 | 3 | train.csv: A full training dataset with the following attributes: 4 | 5 | id: unique id for a news article 6 | title: the title of a news article 7 | author: author of the news article 8 | text: the text of the article; could be incomplete 9 | label: a label that marks the article as potentially unreliable 10 | 11 | 1: unreliable 12 | 13 | 0: reliable 14 | 15 | test.csv: A testing training dataset with all the same attributes at train.csv without the label. 16 | 17 | submit.csv: A sample submission that you can 18 | 19 | #### Source: Kaggle 20 | 21 | ### Dataset [Link](https://www.kaggle.com/c/fake-news/data) 22 | -------------------------------------------------------------------------------- /stopwords_demo.py: -------------------------------------------------------------------------------- 1 | ''' 2 | stopwords can be understood as : to exclude or stop at a point where a certain word among the list of words occur 3 | in a particular para/ sentence 4 | ''' 5 | from nltk.corpus import stopwords 6 | from nltk.tokenize import word_tokenize 7 | 8 | text='It is an example of showing the stop words filteration.' 9 | stop_words=stopwords.words('english') 10 | #print(stop_words) 11 | 12 | filtered_list=[] 13 | #now filtering our sentence 14 | words=word_tokenize(text) 15 | for w in words: 16 | if w not in stop_words: 17 | filtered_list.append(w) 18 | print(filtered_list) # ['It', 'example', 'showing', 'stop', 'words', 'filteration', '.'] -------------------------------------------------------------------------------- /Projects/Daily News for Stock Market Prediction/README.md: -------------------------------------------------------------------------------- 1 | ## About 2 | 3 | There are two channels of data provided in this dataset: 4 | 5 | News data: I crawled historical news headlines from Reddit WorldNews Channel (/r/worldnews). They are ranked by reddit users' votes, and only the top 25 headlines are considered for a single date. 6 | (Range: 2008-06-08 to 2016-07-01) 7 | 8 | Stock data: Dow Jones Industrial Average (DJIA) is used to "prove the concept". 9 | (Range: 2008-08-08 to 2016-07-01) 10 | 11 | Note: If you'd like to cite this dataset in your publications, please use: 12 | 13 | Sun, J. (2016, August). Daily News for Stock Market Prediction, Version 1. Retrieved [Date You Retrieved This Data] from https://www.kaggle.com/aaron7sun/stocknews. 
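One possible starting point for the prediction task (a sketch only; it assumes the merged Kaggle file `Combined_News_DJIA.csv` with columns `Date`, `Label`, `Top1` ... `Top25`; adjust the file and column names to match what you actually download):

```python
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

df = pd.read_csv('Combined_News_DJIA.csv')                 # assumed file name
headline_cols = [c for c in df.columns if c.startswith('Top')]

# merge the top-25 headlines into one document per trading day
docs = df[headline_cols].fillna('').astype(str).agg(' '.join, axis=1).str.lower()

cv = CountVectorizer(max_features=5000, stop_words='english')
X = cv.fit_transform(docs)                                  # features for any classifier
y = df['Label']                                             # market-direction label provided with the dataset
```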
14 | 15 | ### Source: Kaggle 16 | 17 | ### Dataset [Link](https://www.kaggle.com/aaron7sun/stocknews) 18 | -------------------------------------------------------------------------------- /Deep Learning/Loss/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import matplotlib.pyplot as plt 4 | 5 | def mse(): 6 | mse_loss = nn.MSELoss() 7 | outputs = torch.randn(3, 5, requires_grad=True) 8 | targets = torch.randn(3, 5) 9 | loss = mse_loss(outputs, targets) 10 | print(loss) 11 | 12 | def crossentropy(): 13 | ce_loss = nn.CrossEntropyLoss() 14 | outputs = torch.randn(3, 5, requires_grad=True) 15 | targets = torch.tensor([1, 0, 3], dtype=torch.int64) 16 | loss = ce_loss(outputs, targets) 17 | print(loss) 18 | 19 | def bce(): #binary cross entropy 20 | bce_loss = nn.BCELoss() 21 | sigmoid = nn.Sigmoid() 22 | probabilities = sigmoid(torch.randn(4, 1, requires_grad=True)) 23 | targets = torch.tensor([1, 0, 1, 0], dtype=torch.float32).view(4, 1) 24 | oss = bce_loss(probabilities, targets) 25 | print(probabilities) 26 | print(loss) 27 | -------------------------------------------------------------------------------- /Basic Perceptron.py: -------------------------------------------------------------------------------- 1 | #perceptron with pytroch 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.functional as f 6 | from torch.autograd import Variable 7 | 8 | class Net(nn.Module): 9 | def __init__(self): 10 | super(Net,self).__init__() 11 | self.fc1=nn.Linear(1,1) 12 | 13 | def forward(self,x): 14 | x=self.fc1(x) 15 | return x 16 | 17 | net=Net() 18 | print(net) 19 | '''Net( 20 | (fc1): Linear(in_features=1, out_features=1, bias=True) 21 | )''' 22 | 23 | #to print the parameters of the neural network 24 | print(list(net.parameters())) 25 | '''[Parameter containing: 26 | tensor([[0.4780]], requires_grad=True), Parameter containing: 27 | tensor([0.1686], requires_grad=True)]''' 28 | 29 | input = Variable(torch.randn(1,1,1), requires_grad=True) 30 | print(input) #tensor([[[0.7907]]], requires_grad=True) 31 | 32 | output=net(input) 33 | print(output) #tensor([[[0.5466]]], grad_fn=) 34 | 35 | import torch.optim as optim 36 | def criterion(out, label): 37 | return (label - out)**2 38 | -------------------------------------------------------------------------------- /Deep Learning/Activation Functions/activationfunctions.py: -------------------------------------------------------------------------------- 1 | #activation functions 2 | 3 | import matplotlib.pyplot as plt 4 | import torch 5 | import numpy as np 6 | 7 | fig, ax = plt.subplots(2,2) 8 | fig.suptitle('Activation Functions') 9 | 10 | def sigmoid(): 11 | x=torch.range(-5,5,0.1) 12 | y=torch.sigmoid(x) 13 | ax[0,0].grid() 14 | ax[0,0].plot(x.numpy(), y.numpy()) 15 | ax[0,0].set_title('Sigmoid') 16 | 17 | def tanh(): 18 | x=torch.range(-5,5,0.1) 19 | y=torch.tanh(x) 20 | ax[0,1].grid() 21 | ax[0,1].plot(x.numpy(), y.numpy(),color='orange') 22 | ax[0,1].set_title('Tanh') 23 | 24 | def relu(): 25 | x=torch.range(-5,5,0.1) 26 | y=torch.relu(x) 27 | ax[1,0].grid() 28 | ax[1,0].plot(x.numpy(), y.numpy(),color='g') 29 | ax[1,0].set_title('RelU') 30 | 31 | def prelu(): 32 | prelu = torch.nn.PReLU(num_parameters=1) 33 | x=torch.range(-5,5,0.1) 34 | y=prelu(x) 35 | ax[1,1].grid() 36 | ax[1,1].plot(x.numpy(), y.detach().numpy(),color='r') 37 | ax[1,1].set_title('PRelU') 38 | 39 | sigmoid() 40 | tanh() 41 | relu() 42 | prelu() 
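# Note (not part of the original file): torch.range is deprecated in recent PyTorch
# releases; torch.arange(-5, 5, 0.1) is the drop-in replacement (arange excludes the end
# point, so the curves simply stop at x = 4.9 instead of 5.0). When this script is run
# outside an interactive environment such as Spyder or a notebook, the figure also needs
# an explicit call to be displayed:
plt.show()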
-------------------------------------------------------------------------------- /Lemmatization.py: -------------------------------------------------------------------------------- 1 | '''Lemmatization is the process of grouping together the different inflected forms of a word so they can be analysed as 2 | a single item. Lemmatization is similar to stemming but it brings context to the words. 3 | So it links words with similar meaning to one word. 4 | 5 | Text preprocessing includes both Stemming as well as Lemmatization. 6 | Many times people find these two terms confusing. Some treat these two as same. 7 | Actually, lemmatization is preferred over Stemming because lemmatization does morphological analysis of the words. 8 | The word resulting would have the same meaning but would be a synonym of the actual word 9 | ''' 10 | 11 | #dependency : nltk.download('wordnet') 12 | 13 | from nltk.stem import WordNetLemmatizer 14 | 15 | lemm=WordNetLemmatizer() 16 | print(lemm.lemmatize('dogs')) #prints dog 17 | 18 | print(lemm.lemmatize('mosquitoes')) #prints mosquito 19 | 20 | print(lemm.lemmatize('better',pos="a")) #prints good (a) stands for adjective 21 | #also the default parameter for lemmatizer is noun (n) 22 | 23 | print(lemm.lemmatize('eating',pos="v")) #prints eat 24 | 25 | -------------------------------------------------------------------------------- /Projects/Twitter Sentiment Analysis(Beginners)/readme.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | There two datasets Respectively one Consists of Tweets from Twitter with Sentimental Label and the other from Reddit which Consists of Comments with its Sentimental Label. 4 | 5 | 1.Twitter Dataset 6 | 7 | 2.Reddit Dataset 8 | 9 | All these Tweets and Comments were extracted using there Respective Apis Tweepy and PRAW. 10 | These tweets and Comments Were Made on Narendra Modi and Other Leaders as well as Peoples Opinion Towards the Next Prime Minister of The Nation ( In Context with General Elections Held In India - 2019). 11 | All the Tweets and Comments From twitter and Reddit are Cleaned using Pythons re and also NLP with a Sentimental Label to each ranging from -1 to 1. 12 | 13 | 0 Indicating it is a Neutral Tweet/Comment 14 | 15 | 1 Indicating a Postive Sentiment 16 | 17 | -1 Indicating a Negative Tweet/Comment 18 | 19 | Content 20 | 21 | Twitter.csv Dataset has around 163K Tweets along with Sentiment Labels. 22 | Reddit.csv Dataset has around 37K Comments along with its Sentimental Label 23 | So Generally Each Dataset has two columns, the first column has the cleaned tweets and Comments and the Second one indicates its Sentimental Label 24 | 25 | ## Dataset [Link](https://www.kaggle.com/cosmos98/twitter-and-reddit-sentimental-analysis-dataset?select=Twitter_Data.csv) 26 | 27 | ### Source: Kaggle 28 | -------------------------------------------------------------------------------- /Projects/Covid_tweets Sentiment Analysis/readme.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | It is an NLP based classification problem of Covid tweets sentiment analyis. This is a very small dataset which consists of the tweets from a small location dating from mid march to mid april 2020. 
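A quick way to load the data (a sketch; the CSV names inside `data.zip` are assumed to be `Corona_NLP_train.csv` and `Corona_NLP_test.csv`, so check `namelist()` and adjust if they differ):

```python
import zipfile
import pandas as pd

with zipfile.ZipFile('data.zip') as zf:
    print(zf.namelist())                            # confirm the actual file names first
    with zf.open('Corona_NLP_train.csv') as f:      # assumed name
        train = pd.read_csv(f, encoding='latin-1')  # encoding guess; the tweets are not plain ASCII

print(train.shape)                                  # expected: 41157 training tweets (see below)
print(train['Sentiment'].value_counts())            # the five sentiment classes listed below
```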
4 | 5 | --------------------------------------------------------------------------------------- 6 | 7 | ### About Features 8 | 9 | Username: User's username which is denoted with integers 10 | 11 | ScreenName: Screen name which is denoted with integers 12 | 13 | Location: Location of the user (Country Name/City Name) 14 | 15 | TweetAt: Time of the tweet 16 | 17 | Original Tweet: Text written in the tweet 18 | 19 | Sentiment: It denotes the type of tweet. It is a categorical variable which has been divided into the following categoires 20 | 21 | * Extremely Positive 22 | 23 | * Positive 24 | 25 | * Neutral 26 | 27 | * Negative 28 | 29 | * Extremely Negative 30 | 31 | --------------------------------------------------------------------------------------- 32 | 33 | ### About Dataset 34 | 35 | The data set consists of two files: train and test 36 | 37 | Training set consists of 41157 tweets 38 | 39 | Testing set consists of 3798 tweets 40 | 41 | --------------------------------------------------------------------------------------- 42 | 43 | ### Dataset [Link](https://github.com/bhav09/NLP_basics/blob/master/Projects/Covid_tweets%20Sentiment%20Analysis/data.zip) 44 | -------------------------------------------------------------------------------- /Projects/Women's E-Commerce Clothing Reviews/readme.md: -------------------------------------------------------------------------------- 1 | # Context 2 | 3 | Multi Class Classiification 4 | 5 | Welcome. This is a Women’s Clothing E-Commerce dataset revolving around the reviews written by customers. Its nine supportive features offer a great environment to parse out the text through its multiple dimensions. Because this is real commercial data, it has been anonymized, and references to the company in the review text and body have been replaced with “retailer”. 6 | 7 | # Content 8 | This dataset includes 23486 rows and 10 feature variables. Each row corresponds to a customer review, and includes the variables: 9 | 10 | Clothing ID: Integer Categorical variable that refers to the specific piece being reviewed. 11 | 12 | Age: Positive Integer variable of the reviewers age. 13 | 14 | Title: String variable for the title of the review. 15 | 16 | Review Text: String variable for the review body. 17 | 18 | Rating: Positive Ordinal Integer variable for the product score granted by the customer from 1 Worst, to 5 Best. 19 | 20 | Recommended IND: Binary variable stating where the customer recommends the product where 1 is recommended, 0 is not recommended. 21 | 22 | Positive Feedback Count: Positive Integer documenting the number of other customers who found this review positive. 23 | 24 | Division Name: Categorical name of the product high level division. 25 | 26 | Department Name: Categorical name of the product department name. 27 | 28 | Class Name: Categorical name of the product class name. 29 | 30 | ## Acknowledgements 31 | Anonymous but real source 32 | 33 | ## Source: Kaggle 34 | -------------------------------------------------------------------------------- /Projects/Yelp Reviews/readme.md: -------------------------------------------------------------------------------- 1 | # Context 2 | 3 | This dataset is a subset of Yelp's businesses, reviews, and user data. It was originally put together for the Yelp Dataset Challenge which is a chance for students to conduct research or analysis on Yelp's data and share their discoveries. In the dataset you'll find information about businesses across 11 metropolitan areas in four countries. 
4 | 5 | # Content 6 | 7 | This dataset contains seven CSV files. The original JSON files can be found in yelpacademicdataset.zip. 8 | 9 | 10 | You may find this documentation helpful: 11 | 12 | https://www.yelp.com/dataset/documentation/json 13 | 14 | In total, there are : 15 | 16 | 5,200,000 user reviews 17 | Information on 174,000 businesses 18 | The data spans 11 metropolitan areas 19 | Acknowledgements 20 | The dataset was converted from JSON to CSV format and we thank the team of the Yelp dataset challenge for creating this dataset. 21 | 22 | # Inspiration 23 | 24 | Natural Language Processing & Sentiment Analysis 25 | 26 | What's in a review? Is it positive or negative? Yelp's reviews contain a lot of metadata that can be mined and used to infer meaning, business attributes, and sentiment. 27 | 28 | ## Graph Mining 29 | 30 | We recently launched our Local Graph but can you take the graph further? How do user's relationships define their usage patterns? Where are the trend setters eating before it becomes popular? 31 | 32 | 33 | 34 | ## Original Dataset [Link](https://www.kaggle.com/yelp-dataset/yelp-dataset?select=yelp_academic_dataset_checkin.json) 35 | 36 | ## Review Dataset [Link](https://www.kaggle.com/luisfredgs/yelp-reviews-csv) 37 | 38 | ### Source: Kaggle 39 | -------------------------------------------------------------------------------- /Projects/Spam-Ham Classification/bagofwords_classifier.py: -------------------------------------------------------------------------------- 1 | #to classify whether a sms is spam or ham 2 | 3 | #dependencies 4 | import pandas as pd 5 | import nltk 6 | import numpy as np 7 | from nltk.corpus import stopwords 8 | from nltk.tokenize import sent_tokenize as st 9 | from nltk.stem import WordNetLemmatizer as wordnet 10 | import re 11 | 12 | #reading the file 13 | df=pd.read_csv('spam.csv',encoding = 'ISO-8859-1',usecols=['v1','v2']) 14 | corpus=[] 15 | wordnet=wordnet() 16 | length=len(df['v2']) 17 | for i in range(length): 18 | rev=re.sub('[^a-zA-Z]',' ',df['v2'][i]) 19 | rev=rev.lower() 20 | rev=rev.split() 21 | rev=[wordnet.lemmatize(word) for word in rev if word not in stopwords.words('english')] 22 | rev=' '.join(rev) 23 | corpus.append(rev) 24 | 25 | from sklearn.feature_extraction.text import CountVectorizer 26 | cv=CountVectorizer(max_features=2500) 27 | x=cv.fit_transform(corpus).toarray() 28 | y=df['v1'] #dependent variable 29 | 30 | #y is a categorical variable so will encode it 31 | from sklearn.preprocessing import LabelEncoder 32 | le=LabelEncoder() 33 | y=le.fit_transform(y) 34 | 35 | #now splittin the model into train and test set 36 | 37 | from sklearn.model_selection import train_test_split 38 | x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2) 39 | 40 | #training the model 41 | from sklearn.naive_bayes import MultinomialNB 42 | model=MultinomialNB() 43 | model.fit(x_train,y_train) 44 | 45 | #predicting the values 46 | y_pred=model.predict(x_test) 47 | 48 | #score of the model 49 | model.score(x_test,y_test) 50 | 51 | from sklearn.metrics import confusion_matrix 52 | cm=confusion_matrix(y_test,y_pred) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NLP-Basics 2 | 3 | 4 | This repository consists of all the necessary codes , basic theory and resources for anyone to start from scratch and study ! 
5 | To practice all this , I would recommend spyder IDE for the same because it will notify you via error for any package that you won't be having of nltk. 6 | 7 | 8 | To download all the packages of nltk type this in the command prompt: 9 | 10 | import nltk 11 | 12 | nltk.download() 13 | 14 | 15 | (make sure to have a python environment running on your cmd for the same) 16 | 17 | **************************************************************************************************************************** 18 | 19 | Note: 20 | 21 | The order of learning all the theory and code should be in the order as that of the topics mentioned in the resource section. 22 | 23 | **************************************************************************************************************************** 24 | 25 | Resources: 26 | 27 | 1.More about NLP : https://machinelearningmastery.com/natural-language-processing/ 28 | 29 | Topics: 30 | 31 | 2. Tokenization : https://intellipaat.com/community/9025/tokenization-in-nlp 32 | 33 | Stopwords : https://towardsdatascience.com/treat-negation-stopwords-differently-according-to-your-nlp-task-e5a59ab7c91f 34 | 35 | Stemming : https://searchenterpriseai.techtarget.com/definition/stemming 36 | 37 | Speech Tagging: https://www.geeksforgeeks.org/nlp-part-of-speech-default-tagging/ 38 | 39 | Chunking : https://www.geeksforgeeks.org/nlp-chunking-and-chinking-with-regex/ 40 | 41 | Named Entity Recognition : https://towardsdatascience.com/named-entity-recognition-3fad3f53c91e 42 | 43 | Lemmatization : https://www.datacamp.com/community/tutorials/stemming-lemmatization-python 44 | 45 | 3. Applications of NLP : https://towardsdatascience.com/natural-language-processing-nlp-top-10-applications-to-know-b2c80bd428cb 46 | -------------------------------------------------------------------------------- /speech_tagging.py: -------------------------------------------------------------------------------- 1 | ''' 2 | speech tagging - what is basically is doing , is tagging the words into various articulates of english grammar 3 | makes a tuple which is of the format : (word,tag) 4 | 5 | 6 | POS tag list: 7 | 8 | CC coordinating conjunction 9 | CD cardinal digit 10 | DT determiner 11 | EX existential there (like: "there is" ... think of it like "there exists") 12 | FW foreign word 13 | IN preposition/subordinating conjunction 14 | JJ adjective 'big' 15 | JJR adjective, comparative 'bigger' 16 | JJS adjective, superlative 'biggest' 17 | LS list marker 1) 18 | MD modal could, will 19 | NN noun, singular 'desk' 20 | NNS noun plural 'desks' 21 | NNP proper noun, singular 'Harrison' 22 | NNPS proper noun, plural 'Americans' 23 | PDT predeterminer 'all the kids' 24 | POS possessive ending parent\'s 25 | PRP personal pronoun I, he, she 26 | PRP$ possessive pronoun my, his, hers 27 | RB adverb very, silently, 28 | RBR adverb, comparative better 29 | RBS adverb, superlative best 30 | RP particle give up 31 | TO to go 'to' the store. 32 | UH interjection errrrrrrrm 33 | VB verb, base form take 34 | VBD verb, past tense took 35 | VBG verb, gerund/present participle taking 36 | VBN verb, past participle taken 37 | VBP verb, sing. present, non-3d take 38 | VBZ verb, 3rd person sing. 
present takes 39 | WDT wh-determiner which 40 | WP wh-pronoun who, what 41 | WP$ possessive wh-pronoun whose 42 | WRB wh-abverb where, when 43 | 44 | ''' 45 | 46 | from nltk.corpus import state_union 47 | from nltk.tokenize import PunktSentenceTokenizer 48 | import nltk 49 | 50 | train_text=state_union.raw('2005-GWBush.txt') 51 | test_text=state_union.raw('2006-GWBush.txt') 52 | #print(text) 53 | 54 | custom_tokenizer=PunktSentenceTokenizer(train_text) 55 | test_tokenizer=custom_tokenizer.tokenize(test_text) 56 | 57 | #print(test_tokenizer) 58 | 59 | def our_content(): 60 | try: 61 | for i in test_tokenizer: 62 | words=nltk.word_tokenize(i) 63 | tag=nltk.pos_tag(words) 64 | print(tag) 65 | except Exception as e: 66 | print(str(e)) 67 | our_content() -------------------------------------------------------------------------------- /chinking.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Chinking- the words that we exclude from chunks are called chinks 3 | 4 | 5 | POS tag list: 6 | 7 | CC coordinating conjunction 8 | CD cardinal digit 9 | DT determiner 10 | EX existential there (like: "there is" ... think of it like "there exists") 11 | FW foreign word 12 | IN preposition/subordinating conjunction 13 | JJ adjective 'big' 14 | JJR adjective, comparative 'bigger' 15 | JJS adjective, superlative 'biggest' 16 | LS list marker 1) 17 | MD modal could, will 18 | NN noun, singular 'desk' 19 | NNS noun plural 'desks' 20 | NNP proper noun, singular 'Harrison' 21 | NNPS proper noun, plural 'Americans' 22 | PDT predeterminer 'all the kids' 23 | POS possessive ending parent\'s 24 | PRP personal pronoun I, he, she 25 | PRP$ possessive pronoun my, his, hers 26 | RB adverb very, silently, 27 | RBR adverb, comparative better 28 | RBS adverb, superlative best 29 | RP particle give up 30 | TO to go 'to' the store. 31 | UH interjection errrrrrrrm 32 | VB verb, base form take 33 | VBD verb, past tense took 34 | VBG verb, gerund/present participle taking 35 | VBN verb, past participle taken 36 | VBP verb, sing. present, non-3d take 37 | VBZ verb, 3rd person sing. 
present takes 38 | WDT wh-determiner which 39 | WP wh-pronoun who, what 40 | WP$ possessive wh-pronoun whose 41 | WRB wh-abverb where, when 42 | 43 | ''' 44 | 45 | from nltk.corpus import state_union 46 | from nltk.tokenize import PunktSentenceTokenizer 47 | import nltk 48 | from nltk.chunk import RegexpParser 49 | 50 | train_text=state_union.raw('2005-GWBush.txt') 51 | test_text=state_union.raw('2006-GWBush.txt') 52 | #print(text) 53 | 54 | custom_tokenizer=PunktSentenceTokenizer(train_text) 55 | test_tokenizer=custom_tokenizer.tokenize(test_text) 56 | 57 | #print(test_tokenizer) 58 | 59 | def our_content(): 60 | try: 61 | for i in test_tokenizer: 62 | words=nltk.word_tokenize(i) 63 | tag=nltk.pos_tag(words) 64 | print(tag) 65 | chunkGram=r''' Chunk:{<.*>+}}{''' 66 | chunkParser=nltk.RegexpParser(chunkGram) 67 | chunked=chunkParser.parse(tag) 68 | chunked.draw() 69 | except Exception as e: 70 | print(str(e)) 71 | our_content() -------------------------------------------------------------------------------- /Projects/Spam-Ham Classification/HashVectorizer.py: -------------------------------------------------------------------------------- 1 | #HashingVectorizer 2 | 3 | import pandas as pd 4 | import nltk 5 | import numpy as np 6 | from nltk.corpus import stopwords 7 | from nltk.tokenize import sent_tokenize as st 8 | from nltk.stem import WordNetLemmatizer as wordnet 9 | import re 10 | from sklearn.metrics import classification_report 11 | 12 | #reading the file 13 | df=pd.read_csv('spam.csv',encoding = 'ISO-8859-1',usecols=['v1','v2']) 14 | corpus=[] 15 | wordnet=wordnet() 16 | length=len(df['v2']) 17 | for i in range(length): 18 | rev=re.sub('[^a-zA-Z]',' ',df['v2'][i]) 19 | rev=rev.lower() 20 | rev=rev.split() 21 | rev=[wordnet.lemmatize(word) for word in rev if word not in stopwords.words('english')] 22 | rev=' '.join(rev) 23 | corpus.append(rev) 24 | 25 | 26 | from sklearn.feature_extraction.text import HashingVectorizer as hv 27 | 28 | hv=hv(n_features=5000) 29 | x=hv.fit_transform(corpus).toarray() 30 | y=df['v1'] #dependent variable 31 | 32 | #y is a categorical variable so will encode it 33 | from sklearn.preprocessing import LabelEncoder 34 | le=LabelEncoder() 35 | y=le.fit_transform(y) 36 | 37 | #now splittin the model into train and test set 38 | from sklearn.model_selection import train_test_split 39 | x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2) 40 | #training the model 41 | from sklearn.linear_model import PassiveAggressiveClassifier 42 | model=PassiveAggressiveClassifier() 43 | model.fit(x_train,y_train) 44 | #predicting the values 45 | y_pred=model.predict(x_test) 46 | #score of the model 47 | model.score(x_test,y_test) 48 | from sklearn.metrics import confusion_matrix 49 | cm=confusion_matrix(y_test,y_pred) 50 | print(f"Classification Report : \n\n{classification_report(y_test, y_pred)}") 51 | '''Classification Report : 52 | 53 | precision recall f1-score support 54 | 55 | 0 0.98 0.99 0.99 965 56 | 1 0.96 0.88 0.92 150 57 | 58 | accuracy 0.98 1115 59 | macro avg 0.97 0.94 0.95 1115 60 | weighted avg 0.98 0.98 0.98 1115 61 | ''' -------------------------------------------------------------------------------- /Projects/Spam-Ham Classification/passive_aggresiveClassifier.py: -------------------------------------------------------------------------------- 1 | #passive aggressive classifier 2 | 3 | import pandas as pd 4 | import nltk 5 | import numpy as np 6 | from nltk.corpus import stopwords 7 | from nltk.tokenize import sent_tokenize as st 8 | from 
nltk.stem import WordNetLemmatizer as wordnet 9 | import re 10 | from sklearn.metrics import classification_report 11 | 12 | #reading the file 13 | df=pd.read_csv('spam.csv',encoding = 'ISO-8859-1',usecols=['v1','v2']) 14 | corpus=[] 15 | wordnet=wordnet() 16 | length=len(df['v2']) 17 | for i in range(length): 18 | rev=re.sub('[^a-zA-Z]',' ',df['v2'][i]) 19 | rev=rev.lower() 20 | rev=rev.split() 21 | rev=[wordnet.lemmatize(word) for word in rev if word not in stopwords.words('english')] 22 | rev=' '.join(rev) 23 | corpus.append(rev) 24 | from sklearn.feature_extraction.text import CountVectorizer 25 | cv=CountVectorizer(max_features=2500) 26 | x=cv.fit_transform(corpus).toarray() 27 | y=df['v1'] #dependent variable 28 | 29 | #y is a categorical variable so will encode it 30 | from sklearn.preprocessing import LabelEncoder 31 | le=LabelEncoder() 32 | y=le.fit_transform(y) 33 | 34 | #now splittin the model into train and test set 35 | from sklearn.model_selection import train_test_split 36 | x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2) 37 | #training the model 38 | from sklearn.linear_model import PassiveAggressiveClassifier 39 | model=PassiveAggressiveClassifier() 40 | model.fit(x_train,y_train) 41 | #predicting the values 42 | y_pred=model.predict(x_test) 43 | #score of the model 44 | model.score(x_test,y_test) 45 | from sklearn.metrics import confusion_matrix 46 | cm=confusion_matrix(y_test,y_pred) 47 | print(f"Classification Report : \n\n{classification_report(y_test, y_pred)}") 48 | '''Classification Report : 49 | 50 | precision recall f1-score support 51 | 52 | 0 0.98 0.99 0.99 965 53 | 1 0.96 0.88 0.92 150 54 | 55 | accuracy 0.98 1115 56 | macro avg 0.97 0.94 0.95 1115 57 | weighted avg 0.98 0.98 0.98 1115 58 | ''' -------------------------------------------------------------------------------- /nameEntity_recog.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Named entity recognition (NER) , also known as entity chunking/extraction , 3 | is a popular technique used in information extraction to identify and segment the named 4 | entities and classify or categorize them under various predefined classes. 5 | 6 | 7 | POS tag list: 8 | 9 | CC coordinating conjunction 10 | CD cardinal digit 11 | DT determiner 12 | EX existential there (like: "there is" ... think of it like "there exists") 13 | FW foreign word 14 | IN preposition/subordinating conjunction 15 | JJ adjective 'big' 16 | JJR adjective, comparative 'bigger' 17 | JJS adjective, superlative 'biggest' 18 | LS list marker 1) 19 | MD modal could, will 20 | NN noun, singular 'desk' 21 | NNS noun plural 'desks' 22 | NNP proper noun, singular 'Harrison' 23 | NNPS proper noun, plural 'Americans' 24 | PDT predeterminer 'all the kids' 25 | POS possessive ending parent\'s 26 | PRP personal pronoun I, he, she 27 | PRP$ possessive pronoun my, his, hers 28 | RB adverb very, silently, 29 | RBR adverb, comparative better 30 | RBS adverb, superlative best 31 | RP particle give up 32 | TO to go 'to' the store. 33 | UH interjection errrrrrrrm 34 | VB verb, base form take 35 | VBD verb, past tense took 36 | VBG verb, gerund/present participle taking 37 | VBN verb, past participle taken 38 | VBP verb, sing. present, non-3d take 39 | VBZ verb, 3rd person sing. 
present takes 40 | WDT wh-determiner which 41 | WP wh-pronoun who, what 42 | WP$ possessive wh-pronoun whose 43 | WRB wh-abverb where, when 44 | 45 | ''' 46 | 47 | from nltk.corpus import state_union 48 | from nltk.tokenize import PunktSentenceTokenizer 49 | import nltk 50 | from nltk.chunk import RegexpParser 51 | 52 | train_text=state_union.raw('2005-GWBush.txt') 53 | test_text=state_union.raw('2006-GWBush.txt') 54 | #print(text) 55 | 56 | custom_tokenizer=PunktSentenceTokenizer(train_text) 57 | test_tokenizer=custom_tokenizer.tokenize(test_text) 58 | 59 | #print(test_tokenizer) 60 | 61 | def our_content(): 62 | try: 63 | for i in test_tokenizer: 64 | words=nltk.word_tokenize(i) 65 | tag=nltk.pos_tag(words) 66 | #print(tag) 67 | named_entity=nltk.ne_chunk(tag) 68 | named_entity.draw() 69 | 70 | except Exception as e: 71 | print(str(e)) 72 | our_content() -------------------------------------------------------------------------------- /chunking.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Chunking- also known as partial parsing. It is a process of meaningful extraction of short phrases from the sentences. 3 | chunks are made up of words and the kinds of words and the kinds of words are defined using the parts of speech tag 4 | 5 | 6 | POS tag list: 7 | 8 | CC coordinating conjunction 9 | CD cardinal digit 10 | DT determiner 11 | EX existential there (like: "there is" ... think of it like "there exists") 12 | FW foreign word 13 | IN preposition/subordinating conjunction 14 | JJ adjective 'big' 15 | JJR adjective, comparative 'bigger' 16 | JJS adjective, superlative 'biggest' 17 | LS list marker 1) 18 | MD modal could, will 19 | NN noun, singular 'desk' 20 | NNS noun plural 'desks' 21 | NNP proper noun, singular 'Harrison' 22 | NNPS proper noun, plural 'Americans' 23 | PDT predeterminer 'all the kids' 24 | POS possessive ending parent\'s 25 | PRP personal pronoun I, he, she 26 | PRP$ possessive pronoun my, his, hers 27 | RB adverb very, silently, 28 | RBR adverb, comparative better 29 | RBS adverb, superlative best 30 | RP particle give up 31 | TO to go 'to' the store. 32 | UH interjection errrrrrrrm 33 | VB verb, base form take 34 | VBD verb, past tense took 35 | VBG verb, gerund/present participle taking 36 | VBN verb, past participle taken 37 | VBP verb, sing. present, non-3d take 38 | VBZ verb, 3rd person sing. 
present takes 39 | WDT wh-determiner which 40 | WP wh-pronoun who, what 41 | WP$ possessive wh-pronoun whose 42 | WRB wh-abverb where, when 43 | 44 | ''' 45 | 46 | from nltk.corpus import state_union 47 | from nltk.tokenize import PunktSentenceTokenizer 48 | import nltk 49 | from nltk.chunk import RegexpParser 50 | 51 | train_text=state_union.raw('2005-GWBush.txt') 52 | test_text=state_union.raw('2006-GWBush.txt') 53 | #print(text) 54 | 55 | custom_tokenizer=PunktSentenceTokenizer(train_text) 56 | test_tokenizer=custom_tokenizer.tokenize(test_text) 57 | 58 | #print(test_tokenizer) 59 | 60 | def our_content(): 61 | try: 62 | for i in test_tokenizer: 63 | words=nltk.word_tokenize(i) 64 | tag=nltk.pos_tag(words) 65 | print(tag) 66 | chunkGram=r''' Chunk:{*}''' 67 | chunkParser=nltk.RegexpParser(chunkGram) 68 | chunked=chunkParser.parse(tag) 69 | chunked.draw() 70 | except Exception as e: 71 | print(str(e)) 72 | our_content() -------------------------------------------------------------------------------- /tf-idf.py: -------------------------------------------------------------------------------- 1 | #tf idf 2 | import nltk 3 | 4 | para = '''An atom is the smallest unit of ordinary matter that forms a chemical element. 5 | Every solid, liquid, gas, and plasma is composed of neutral or ionized atoms. 6 | Atoms are extremely small, typically around 100 picometers across. 7 | They are so small that accurately predicting their behavior using classical physics—as 8 | if they were tennis balls, for example—is not possible due to quantum effects. 9 | Every atom is composed of a nucleus and one or more electrons bound to the nucleus. 10 | The nucleus is made of one or more protons and a number of neutrons. 11 | Only the most common variety of hydrogen has no neutrons. 12 | More than 99.94% of an atom's mass is in the nucleus. 13 | The protons have a positive electric charge, the electrons have a negative electric charge, 14 | and the neutrons have no electric charge. If the number of protons and electrons are equal, 15 | then the atom is electrically neutral. If an atom has more or fewer electrons than protons, 16 | then it has an overall negative or positive charge, respectively – such atoms are called ions. 17 | The electrons of an atom are attracted to the protons in an atomic nucleus by the electromagnetic force. 18 | The protons and neutrons in the nucleus are attracted to each other by the nuclear force. 19 | This force is usually stronger than the electromagnetic force that repels the positively 20 | charged protons from one another. Under certain circumstances, the repelling electromagnetic 21 | force becomes stronger than the nuclear force. In this case, the nucleus splits and leaves 22 | behind different elements. 
This is a form of nuclear decay.''' 23 | 24 | #dependencies 25 | import re #regular expression 26 | from nltk.tokenize import sent_tokenize as st, word_tokenize as wt #for tokenization 27 | from nltk.corpus import stopwords #stop words 28 | from nltk.stem import WordNetLemmatizer as wl #for lemmatization 29 | 30 | wordnet=wl() #object creation for lemmatization 31 | corpus=[] #empty list 32 | sentences=st(para) #tokenizing the paragraph to sentences 33 | 34 | for i in range(len(sentences)): 35 | rev=re.sub('[^a-zA-Z]',' ',sentences[i]) #replace all the letters by space except the alphabets 36 | rev=rev.lower() #lower the senteces 37 | rev=rev.split() #each word gets converted to an element of a list 38 | rev=[wordnet.lemmatize(word) for word in rev if word not in stopwords.words('english')] 39 | rev=' '.join(rev) 40 | corpus.append(rev) 41 | 42 | #creating TF-IDF model 43 | from sklearn.feature_extraction.text import TfidfVectorizer as tfidf 44 | cv=tfidf() #object creation 45 | x=cv.fit_transform(corpus).toarray() #transforming -------------------------------------------------------------------------------- /bagofwords.py: -------------------------------------------------------------------------------- 1 | #dependency 2 | import nltk 3 | from nltk.tokenize import sent_tokenize as st 4 | from nltk.corpus import stopwords 5 | from nltk.stem import PorterStemmer as ps, WordNetLemmatizer as wl 6 | 7 | para='''An atom is the smallest unit of ordinary matter that forms a chemical element. 8 | Every solid, liquid, gas, and plasma is composed of neutral or ionized atoms. 9 | Atoms are extremely small, typically around 100 picometers across. 10 | They are so small that accurately predicting their behavior using classical physics—as 11 | if they were tennis balls, for example—is not possible due to quantum effects. 12 | 13 | Every atom is composed of a nucleus and one or more electrons bound to the nucleus. 14 | The nucleus is made of one or more protons and a number of neutrons. 15 | Only the most common variety of hydrogen has no neutrons. 16 | More than 99.94% of an atom's mass is in the nucleus. 17 | The protons have a positive electric charge, the electrons have a negative electric charge, 18 | and the neutrons have no electric charge. If the number of protons and electrons are equal, 19 | then the atom is electrically neutral. If an atom has more or fewer electrons than protons, 20 | then it has an overall negative or positive charge, respectively – such atoms are called ions. 21 | 22 | The electrons of an atom are attracted to the protons in an atomic nucleus by the electromagnetic force. 23 | The protons and neutrons in the nucleus are attracted to each other by the nuclear force. 24 | This force is usually stronger than the electromagnetic force that repels the positively 25 | charged protons from one another. Under certain circumstances, the repelling electromagnetic 26 | force becomes stronger than the nuclear force. In this case, the nucleus splits and leaves 27 | behind different elements. 
This is a form of nuclear decay.''' 28 | 29 | #clearning the texts 30 | import re 31 | 32 | ps=ps() #object creation porter stemmer 33 | wl=wl() #object creation word net lemmatizer 34 | sentences=st(para) #tokenizing to sentences 35 | corpus=[] 36 | 37 | for i in range(len(sentences)): 38 | rev=re.sub('[^a-zA-Z]',' ',sentences[i]) #everything other than alphabets would be replaced by space 39 | rev=rev.lower() #lowers the letters in the sentences 40 | rev=rev.split() #splits them word wise into elements of a list 41 | rev=[wl.lemmatize(word) for word in rev if word not in set(stopwords.words('english'))] 42 | rev=' '.join(rev) 43 | corpus.append(rev) #appending to list 44 | 45 | #bag of words 46 | from sklearn.feature_extraction.text import CountVectorizer #importing countervectorizer 47 | cv=CountVectorizer() 48 | x=cv.fit_transform(corpus).toarray() #transforming it to an array 49 | 50 | -------------------------------------------------------------------------------- /Projects/Spam-Ham Classification/readme.md: -------------------------------------------------------------------------------- 1 | ## About 2 | 3 | Context 4 | The SMS Spam Collection is a set of SMS tagged messages that have been collected for SMS Spam research. It contains one set of SMS messages in English of 5,574 messages, tagged acording being ham (legitimate) or spam. 5 | 6 | Content 7 | The files contain one message per line. Each line is composed by two columns: v1 contains the label (ham or spam) and v2 contains the raw text. 8 | 9 | This corpus has been collected from free or free for research sources at the Internet: 10 | 11 | -> A collection of 425 SMS spam messages was manually extracted from the Grumbletext Web site. This is a UK forum in which cell phone users make public claims about SMS spam messages, most of them without reporting the very spam message received. The identification of the text of spam messages in the claims is a very hard and time-consuming task, and it involved carefully scanning hundreds of web pages. The Grumbletext Web site is: [Web Link]. 12 | -> A subset of 3,375 SMS randomly chosen ham messages of the NUS SMS Corpus (NSC), which is a dataset of about 10,000 legitimate messages collected for research at the Department of Computer Science at the National University of Singapore. The messages largely originate from Singaporeans and mostly from students attending the University. These messages were collected from volunteers who were made aware that their contributions were going to be made publicly available. The NUS SMS Corpus is avalaible at: [Web Link]. 13 | -> A list of 450 SMS ham messages collected from Caroline Tag's PhD Thesis available at [Web Link]. 14 | -> Finally, we have incorporated the SMS Spam Corpus v.0.1 Big. It has 1,002 SMS ham messages and 322 spam messages and it is public available at: [Web Link]. This corpus has been used in the following academic researches: 15 | 16 | Acknowledgements 17 | The original dataset can be found here. The creators would like to note that in case you find the dataset useful, please make a reference to previous paper and the web page: http://www.dt.fee.unicamp.br/~tiago/smsspamcollection/ in your papers, research, etc. 18 | 19 | We offer a comprehensive study of this corpus in the following paper. This work presents a number of statistics, studies and baseline results for several machine learning methods. 20 | 21 | Almeida, T.A., Gómez Hidalgo, J.M., Yamakami, A. 
Contributions to the Study of SMS Spam Filtering: New Collection and Results. Proceedings of the 2011 ACM Symposium on Document Engineering (DOCENG'11), Mountain View, CA, USA, 2011. 22 | 23 | Inspiration 24 | Can you use this dataset to build a prediction model that will accurately classify which texts are spam? 25 | 26 | #### Source: Kaggle 27 | 28 | ## Link to DataSet 29 | 30 | Dataset: [Link](https://www.kaggle.com/uciml/sms-spam-collection-dataset) 31 | -------------------------------------------------------------------------------- /word2vec.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.tokenize import sent_tokenize,word_tokenize 3 | from nltk.corpus import stopwords 4 | import re 5 | from gensim.models import Word2Vec 6 | 7 | paragraph="""Before you discuss the resolution, let me place before you one or two things, 8 | I want you to understand two things very clearly and to consider them from the same 9 | point of view from which I am placing them before you. I ask you to consider it from 10 | my point of view, because if you approve of it, you will be enjoined to carry out 11 | all I say. It will be a great responsibility. There are people who ask me whether 12 | I am the same man that I was in 1920, or whether there has been any change in me 13 | or you. You are right in asking that question. 14 | 15 | Let me, however, hasten to assure that I am the same Gandhi as I was in 1920. 16 | I have not changed in any fundamental respect. I attach the same importance 17 | to non-violence that I did then. If at all, my emphasis on it has grown stronger. 18 | There is no real contradiction between the present resolution and my previous writings and utterances. 19 | 20 | Occasions like the present do not occur in everybody’s and rarely in anybody’s life. 21 | I want you to know and feel that there is nothing but purest Ahimsa in all that I 22 | am saying and doing today. The draft resolution of the Working Committee is based on 23 | Ahimsa, the contemplated struggle similarly has its roots in Ahimsa. If, therefore, 24 | there is any among you who has lost faith in Ahimsa or is wearied of it, let him not 25 | vote for this resolution. Let me explain my position clearly. God has vouchsafed to 26 | me a priceless gift in the weapon of Ahimsa. I and my Ahimsa are on our trail today. 27 | If in the present crisis, when the earth is being scorched by the flames of Himsa 28 | and crying for deliverance, I failed to make use of the God given talent, God will 29 | not forgive me and I shall be judged unworthy of the great gift. I must act now. 30 | I may not hesitate and merely look on, when Russia and China are threatened.""" 31 | 32 | #para='He is a very good man and everyone loves him!' 
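# Note (not part of the original file): the vocabulary access further down assumes gensim 3.x.
# In gensim 4.x, model.wv.vocab was removed; use model.wv.key_to_index (word -> index mapping)
# or model.wv.index_to_key (list of words) instead. model.wv['assure'] and
# model.wv.most_similar('faith') behave the same way in both versions.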
33 | para=re.sub('[^a-zA-Z.]',' ',paragraph) 34 | para=re.sub('\s{2,10}',' ',para) #removed extra spaces 35 | para=para.lower() 36 | 37 | sentences=sent_tokenize(para) 38 | 39 | for i in range(len(sentences)): 40 | sentences[i]=sentences[i].split() 41 | sentences[i]=[word for word in sentences[i] if word not in stopwords.words('english')] 42 | 43 | model=Word2Vec(sentences, min_count=1) 44 | 45 | words=model.wv.vocab #vocab of the paragraph 46 | 47 | #finding the vectors of the word 48 | vector=model.wv['assure'] #here we see 100 dimensions of the word 49 | 50 | #finding the word which is similar to another word 51 | similar=model.wv.most_similar('faith') 52 | 53 | -------------------------------------------------------------------------------- /Projects/Twitter Sentiment Analysis(Beginners)/Twitter Sentiment Analysis (Small Dataset).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "data=pd.read_csv('D:/Data Sets/Sentiment Analysis/Twitter and Reddit sentiment analysis/Twitter_Data.csv')" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 3, 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "data": { 29 | "text/html": [ 30 | "
\n", 31 | "\n", 44 | "\n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | "
clean_textcategory
0when modi promised “minimum government maximum...-1.0
1talk all the nonsense and continue all the dra...0.0
2what did just say vote for modi welcome bjp t...1.0
3asking his supporters prefix chowkidar their n...1.0
4answer who among these the most powerful world...1.0
\n", 80 | "
" 81 | ], 82 | "text/plain": [ 83 | " clean_text category\n", 84 | "0 when modi promised “minimum government maximum... -1.0\n", 85 | "1 talk all the nonsense and continue all the dra... 0.0\n", 86 | "2 what did just say vote for modi welcome bjp t... 1.0\n", 87 | "3 asking his supporters prefix chowkidar their n... 1.0\n", 88 | "4 answer who among these the most powerful world... 1.0" 89 | ] 90 | }, 91 | "execution_count": 3, 92 | "metadata": {}, 93 | "output_type": "execute_result" 94 | } 95 | ], 96 | "source": [ 97 | "data.head()" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 4, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "data": { 107 | "text/plain": [ 108 | "'talk all the nonsense and continue all the drama will vote for modi '" 109 | ] 110 | }, 111 | "execution_count": 4, 112 | "metadata": {}, 113 | "output_type": "execute_result" 114 | } 115 | ], 116 | "source": [ 117 | "data['clean_text'][1]" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 6, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "data": { 127 | "text/plain": [ 128 | "(162980, 2)" 129 | ] 130 | }, 131 | "execution_count": 6, 132 | "metadata": {}, 133 | "output_type": "execute_result" 134 | } 135 | ], 136 | "source": [ 137 | "data.shape" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "#here we see that the data is cleaned in terms of redundant letters and caps. \n", 147 | "#also the sentiments have been encoded, so we can skip all these steps and can carry on with the model building" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 10, 153 | "metadata": {}, 154 | "outputs": [ 155 | { 156 | "data": { 157 | "text/plain": [ 158 | "clean_text 4\n", 159 | "category 7\n", 160 | "dtype: int64" 161 | ] 162 | }, 163 | "execution_count": 10, 164 | "metadata": {}, 165 | "output_type": "execute_result" 166 | } 167 | ], 168 | "source": [ 169 | "data.isnull().sum()" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 12, 175 | "metadata": {}, 176 | "outputs": [ 177 | { 178 | "name": "stdout", 179 | "output_type": "stream", 180 | "text": [ 181 | "(162969, 2)\n" 182 | ] 183 | } 184 | ], 185 | "source": [ 186 | "data=data.dropna()\n", 187 | "data=data.reset_index(drop=True)\n", 188 | "print(data.shape)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 34, 194 | "metadata": {}, 195 | "outputs": [ 196 | { 197 | "name": "stdout", 198 | "output_type": "stream", 199 | "text": [ 200 | "AxesSubplot(0.125,0.125;0.775x0.755)\n" 201 | ] 202 | }, 203 | { 204 | "data": { 205 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYQAAAECCAYAAAD+VKAWAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy86wFpkAAAACXBIWXMAAAsTAAALEwEAmpwYAAAUaklEQVR4nO3db4xddX7f8fdn7Sylu4XFMLiux9S0OJsapGXDyHW1UtXGbfGKKqYSVmalFity5YiSKqtWak37oOoDV1iVSotUaNyQYGi6xnWywVrKJpbJKqpK7R0IDTGsw3Rh8dTEnoCXsI1ga++3D+5vuncu1zN3/GfueP1+SVfn3O85v+Pv0ZXmM+d37vGkqpAk6RPDbkCStDQYCJIkwECQJDUGgiQJMBAkSY2BIEkCYPmwG7hQN910U61du3bYbUjSFeWll176o6oa6bftig2EtWvXMjExMew2JOmKkuQ759vmlJEkCTAQJEmNgSBJAgwESVJjIEiSAANBktQYCJIkwECQJDVX7INpi23tzueG3cJl9dbD9wy7BUlD5hWCJAkwECRJzbyBkOSzSV7pev1xki8nWZHkUJI32vKGrjEPJZlMcjzJ3V31u5K82rY9miStfk2SZ1r9SJK1l+VsJUnnNW8gVNXxqrqzqu4E7gL+BPgqsBM4XFXrgMPtPUnWA+PA7cBm4LEky9rhHgd2AOvaa3OrbwfOVNVtwCPA7ktydpKkgS10ymgT8L+q6jvAFmBvq+8F7m3rW4B9VfVRVb0JTAIbkqwCrquqF6uqgKd6xswc6wCwaebqQZK0OBYaCOPAV9r6yqp6B6Atb2711cCJrjFTrba6rffWZ42pqrPA+8CNC+xNknQRBg6EJJ8Efhr4L/Pt2qdWc9TnGtPbw44kE0kmpqen52lDkrQQC7lC+CLwclWdau9PtWkg2vJ0q08Ba7rGjQInW320T33WmCTLgeuB93obqKo9VTVWVWMjI33/4I8k6QItJBC+xA+niwAOAtva+jbg2a76ePvm0K10bh4fbdNKHyTZ2O4P3N8zZuZY9wEvtPsMkqRFMtCTykn+NPA3gZ/rKj8M7E+yHXgb2ApQVceS7AdeA84CD1bVuTbmAeBJ4Frg+fYCeAJ4OskknSuD8Ys4J0nSBRgoEKrqT+i5yVtV79L51lG//XcBu/rUJ4A7+tQ/pAWKJGk4fFJZkgQYCJKkxkCQJAEGgiSpMRAkSYCBIElqDARJEmAgSJIaA0GSBBgIkqTGQJAkAQaCJKkxECRJgIEgSWoMBEkSYCBIkhoDQZIEGAiSpMZAkCQBBoIkqRkoEJJ8JsmBJN9K8nqSv5JkRZJDSd5oyxu69n8oyWSS40nu7qrfleTVtu3RJGn1a5I80+pHkqy95GcqSZrToFcI/w74elX9BPA54HVgJ3C4qtYBh9t7kqwHxoHbgc3AY0mWteM8DuwA1rXX5lbfDpypqtuAR4DdF3lekqQFmjcQklwH/FXgCYCq+n5VfRfYAuxtu+0F7m3rW4B9VfVRVb0JTAIbkqwCrquqF6uqgKd6xswc6wCwaebqQZK0OAa5QvgLwDTwK0l+N8kvJfkUsLKq3gFoy5vb/quBE13jp1ptdVvvrc8aU1VngfeBG3sbSbIjyUSSienp6QFPUZI0iEECYTnwk8DjVfV54P/QpofOo99v9jVHfa4xswtVe6pqrKrGRkZG5u5akrQggwTCFDBVVUfa+wN0AuJUmwaiLU937b+ma/wocLLVR/vUZ41Jshy4HnhvoScjSbpw8wZCVf0hcCLJZ1tpE/AacBDY1mrbgGfb+kFgvH1z6FY6N4+PtmmlD5JsbPcH7u8ZM3Os+4AX2n0GSdIiWT7gfv8Q+NUknwS+DfwsnTDZn2Q78DawFaCqjiXZTyc0zgIPVtW5dpwHgCeBa4Hn2ws6N6yfTjJJ58pg/CLPS5K0QAMFQlW9Aoz12bTpPPvvAnb1qU8Ad/Spf0gLFEnScPiksiQJMBAkSY2BIEkCDARJUmMgSJIAA0GS1BgIkiTAQJAkNYM+qSxd0dbufG7YLVw2bz18z7Bb0I8IrxAkSYCBIElqDARJEmAgSJIaA0GSBBgIkqTGQJAkAQaCJKkxECRJgIEgSWoGCoQkbyV5NckrSSZabUWSQ0neaMsbuvZ/KMlkkuNJ7u6q39WOM5nk0SRp9WuSPNPqR5KsvcTnKUmax0KuEP56Vd1ZVWPt/U7gcFWtAw639yRZD4wDtwObgceSLGtjHgd2AOvaa3OrbwfOVNVtwCPA7gs/JUnShbiYKaMtwN62vhe4t6u+r6o+qqo3gUlgQ5JVwHVV9WJVFfBUz5iZYx0ANs1cPUiSFseggVDAbyV5KcmOVltZVe8AtOXNrb4aONE1dqrVVrf13vqsMVV1FngfuHFhpyJJuhiD/vfXX6iqk0luBg4l+dYc+/b7zb7mqM81ZvaBO2G0A+CWW26Zu2NJ0oIMdIVQVSfb8jTwVWADcKpNA9GWp9vuU8CaruGjwMlWH+1TnzUmyXLgeuC9Pn3sqaqxqhobGRkZpHVJ0oDmDYQkn0ryZ2bWgb8F/D5wENjWdtsGPNvWDwLj7ZtDt9K5eXy0TSt9kGRjuz9wf8+YmWPdB7zQ7jNIkhbJIFNGK4Gvtnu8y4H/XFVfT/JNYH+S7cDbwFaAqjqWZD/wGnAWeLCqzrVjPQA8CVwLPN9eAE8ATyeZpHNlMH4Jzk2StADzBkJVfRv4XJ/6u8Cm84zZBezqU58A7uhT/5AWKJKk4fBJZUkSYCBIkhoDQZIEGAiSpMZAkCQBBoIkqTEQJEmAgSBJagwESRJgIEiSGgNBkgQYCJKkxkCQJAEGgiSpMRAkSYCBIElqDARJEmAgSJIaA0GSBBgIkqRm4EBIsizJ7yb5Wnu/IsmhJG+05Q1d+z6UZDLJ8SR3d9XvSvJq2/ZokrT6NUmeafUjSdZewnOUJA1gIVcIvwC83vV+J3C4qtYBh9t7kqwHxoHbgc3AY0mWtTGPAzuAde21udW3A2eq6jbgEWD3BZ2NJOmCDRQISUaBe4Bf6ipvAfa29b3AvV31fVX1UVW9CUwCG5KsAq6rqherqoCnesbMHOsAsGnm6kGStDgGvUL4t8A/AX7QVVtZVe8AtOXNrb4aONG131SrrW7rvfVZY6rqLPA+cOOgJyFJunjzBkKSvw2crqqXBjxmv9/sa476XGN6e9mRZCLJxPT09IDtSJIGMcgVwheAn07yFrAP+Kkk/wk41aaBaMvTbf8pYE3X+FHgZKuP9qnPGpNkOXA98F5vI1W1p6rGqmpsZGRkoBOUJA1m3kCoqoeqarSq1tK5WfxCVf1d4CCwre22DXi2rR8Exts3h26lc/P4aJtW+iDJxnZ/4P6eMTPHuq/9Gx+7QpAkXT7LL2Lsw8D+JNuBt4GtAFV1LMl+4DXgLPBgVZ1rYx4AngSuBZ5vL4AngKeTTNK5Mhi/iL4kSRdgQYFQVd8AvtHW3wU2nWe/XcCuPvUJ4I4+9Q9pgSJJGg6fVJYkAQaCJKkxECRJgIEgSWoMBEkSYCBIkhoDQZIEXNyDaZ
J02a3d+dywW7is3nr4nmG38P95hSBJAgwESVJjIEiSAANBktQYCJIkwECQJDUGgiQJMBAkSY2BIEkCDARJUmMgSJIAA0GS1MwbCEn+VJKjSf5nkmNJ/mWrr0hyKMkbbXlD15iHkkwmOZ7k7q76XUlebdseTZJWvybJM61+JMnay3CukqQ5DHKF8BHwU1X1OeBOYHOSjcBO4HBVrQMOt/ckWQ+MA7cDm4HHkixrx3oc2AGsa6/Nrb4dOFNVtwGPALsv/tQkSQsxbyBUx/fa2x9rrwK2AHtbfS9wb1vfAuyrqo+q6k1gEtiQZBVwXVW9WFUFPNUzZuZYB4BNM1cPkqTFMdA9hCTLkrwCnAYOVdURYGVVvQPQlje33VcDJ7qGT7Xa6rbeW581pqrOAu8DN/bpY0eSiSQT09PTA52gJGkwAwVCVZ2rqjuBUTq/7d8xx+79frOvOepzjentY09VjVXV2MjIyDxdS5IWYkHfMqqq7wLfoDP3f6pNA9GWp9tuU8CarmGjwMlWH+1TnzUmyXLgeuC9hfQmSbo4g3zLaCTJZ9r6tcDfAL4FHAS2td22Ac+29YPAePvm0K10bh4fbdNKHyTZ2O4P3N8zZuZY9wEvtPsMkqRFMsjfVF4F7G3fFPoEsL+qvpbkRWB/ku3A28BWgKo6lmQ/8BpwFniwqs61Yz0APAlcCzzfXgBPAE8nmaRzZTB+KU5OkjS4eQOhqn4P+Hyf+rvApvOM2QXs6lOfAD52/6GqPqQFiiRpOHxSWZIEGAiSpMZAkCQBBoIkqTEQJEmAgSBJagwESRJgIEiSGgNBkgQYCJKkxkCQJAEGgiSpMRAkSYCBIElqDARJEmAgSJIaA0GSBBgIkqTGQJAkAQMEQpI1SX47yetJjiX5hVZfkeRQkjfa8oauMQ8lmUxyPMndXfW7krzatj2aJK1+TZJnWv1IkrWX4VwlSXMY5ArhLPCPq+ovARuBB5OsB3YCh6tqHXC4vadtGwduBzYDjyVZ1o71OLADWNdem1t9O3Cmqm4DHgF2X4JzkyQtwLyBUFXvVNXLbf0D4HVgNbAF2Nt22wvc29a3APuq6qOqehOYBDYkWQVcV1UvVlUBT/WMmTnWAWDTzNWDJGlxLOgeQpvK+TxwBFhZVe9AJzSAm9tuq4ETXcOmWm11W++tzxpTVWeB94EbF9KbJOniDBwIST4N/Brw5ar647l27VOrOepzjentYUeSiSQT09PT87UsSVqAgQIhyY/RCYNfrapfb+VTbRqItjzd6lPAmq7ho8DJVh/tU581Jsly4Hrgvd4+qmpPVY1V1djIyMggrUuSBjTIt4wCPAG8XlX/pmvTQWBbW98GPNtVH2/fHLqVzs3jo21a6YMkG9sx7+8ZM3Os+4AX2n0GSdIiWT7APl8A/h7wapJXWu2fAQ8D+5NsB94GtgJU1bEk+4HX6HxD6cGqOtfGPQA8CVwLPN9e0Amcp5NM0rkyGL+405IkLdS8gVBV/43+c/wAm84zZhewq099ArijT/1DWqBIkobDJ5UlSYCBIElqDARJEmAgSJIaA0GSBBgIkqTGQJAkAQaCJKkxECRJgIEgSWoMBEkSYCBIkhoDQZIEGAiSpMZAkCQBBoIkqTEQJEmAgSBJagwESRJgIEiSmnkDIckvJzmd5Pe7aiuSHEryRlve0LXtoSSTSY4nuburfleSV9u2R5Ok1a9J8kyrH0my9hKfoyRpAINcITwJbO6p7QQOV9U64HB7T5L1wDhwexvzWJJlbczjwA5gXXvNHHM7cKaqbgMeAXZf6MlIki7cvIFQVb8DvNdT3gLsbet7gXu76vuq6qOqehOYBDYkWQVcV1UvVlUBT/WMmTnWAWDTzNWDJGnxXOg9hJVV9Q5AW97c6quBE137TbXa6rbeW581pqrOAu8DN15gX5KkC3Spbyr3+82+5qjPNebjB092JJlIMjE9PX2BLUqS+rnQQDjVpoFoy9OtPgWs6dpvFDjZ6qN96rPGJFkOXM/Hp6gAqKo9VTVWVWMjIyMX2LokqZ8LDYSDwLa2vg14tqs+3r45dCudm8dH27TSB0k2tvsD9/eMmTnWfcAL7T6DJGkRLZ9vhyRfAf4acFOSKeBfAA8D+5NsB94GtgJU1bEk+4HXgLPAg1V1rh3qATrfWLoWeL69AJ4Ank4ySefKYPySnJkkaUHmDYSq+tJ5Nm06z/67gF196hPAHX3qH9ICRZI0PD6pLEkCDARJUmMgSJIAA0GS1BgIkiTAQJAkNQaCJAkwECRJjYEgSQIMBElSYyBIkgADQZLUGAiSJMBAkCQ1BoIkCTAQJEmNgSBJAgwESVJjIEiSAANBktQsmUBIsjnJ8SSTSXYOux9JutosiUBIsgz498AXgfXAl5KsH25XknR1WRKBAGwAJqvq21X1fWAfsGXIPUnSVWX5sBtoVgMnut5PAX+5d6ckO4Ad7e33khxfhN6G5SbgjxbrH8vuxfqXrgp+dle2H/XP78+fb8NSCYT0qdXHClV7gD2Xv53hSzJRVWPD7kML52d3ZbuaP7+lMmU0Bazpej8KnBxSL5J0VVoqgfBNYF2SW5N8EhgHDg65J0m6qiyJKaOqOpvk54HfBJYBv1xVx4bc1rBdFVNjP6L87K5sV+3nl6qPTdVLkq5CS2XKSJI0ZAaCJAkwECRJjYEgSQKWyLeMpCtdkpV0nrgv4GRVnRpySxpAkuXAduDvAH+O9vkBzwJPVNX/HWJ7i85vGS0h/lC58iS5E/gPwPXA/27lUeC7wD+oqpeH05kGkeQrdD6rvXQekIXO57cNWFFVPzOk1obCQFgC/KFy5UryCvBzVXWkp74R+MWq+txQGtNAkhyvqs+eZ9sfVNWPL3ZPw+SU0dLwJOf/ofIrgD9Ulq5P9X5uAFX1P5J8ahgNaUHOJNkK/FpV/QAgySeArcCZoXY2BAbC0uAPlSvX80meA57ih/9j7xrgfuDrQ+tKgxoHdgOPJZkJgM8Av922XVWcMloCkjwK/EX6/1B5s6p+fli9aX5Jvkjn73espvM/904BB6vqvw61MS1Ikhvp/ExctP/6eqkxEJYIf6hIS0uSP1tVfzjsPhaTgSBdJkl2tL/hoStQkueq6p5h97GYfDBtiWt/JU5Xpn5/+ElXiKstDMCbylcCf6gscUl+gh9O98082HSwqn5xqI3poiT5dFV9b9h9LCavEJa+7w+7AZ1fkn8K7KMT3Efp/LGnAF9JsnOYvemivTbsBhab9xCWuCRvV9Utw+5D/SX5A+D23v/ioP3lv2NVtW44nWkQSf7R+TYB/7yqVixmP8PmlNESkOT3zrcJWLmYvWjBfkDn/8D5Tk99Vdumpe1fAf8aONtn21U3g2IgLA0rgbv5+JORAf774rejBfgycDjJG/zwGZJbgNsAnx9Z+l4GfqOqXurdkOTvD6GfoTIQloavAZ+uqld6NyT5xqJ3o4FV1deT/DiwgdnPkHyzqs4NtTkN4meBd7sLXc8fjA2npeHxHoIkdUnyclX95LD7GIarbo5MkuZx1X7V20CQpNn+47AbGBanjCRJgFcIk
qTGQJAkAQaCJKkxECRJgIEgSWr+HykGFTvcUZ7EAAAAAElFTkSuQmCC\n", 206 | "text/plain": [ 207 | "
" 208 | ] 209 | }, 210 | "metadata": { 211 | "needs_background": "light" 212 | }, 213 | "output_type": "display_data" 214 | } 215 | ], 216 | "source": [ 217 | "print(data['category'].value_counts().plot(kind='bar'))" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 17, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "import nltk\n", 227 | "from sklearn.feature_extraction.text import CountVectorizer as cv, TfidfVectorizer as tfidf" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 18, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "from sklearn.metrics import accuracy_score, classification_report" 237 | ] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "metadata": {}, 242 | "source": [ 243 | "### Bag of Words " 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 19, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "cv=cv(max_features=5000) #for bag of words\n", 253 | "x=cv.fit_transform(data['clean_text']).toarray()\n", 254 | "y=data['category'].values" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 20, 260 | "metadata": {}, 261 | "outputs": [ 262 | { 263 | "data": { 264 | "text/plain": [ 265 | "(162969, 5000)" 266 | ] 267 | }, 268 | "execution_count": 20, 269 | "metadata": {}, 270 | "output_type": "execute_result" 271 | } 272 | ], 273 | "source": [ 274 | "x.shape" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 21, 280 | "metadata": {}, 281 | "outputs": [ 282 | { 283 | "data": { 284 | "text/plain": [ 285 | "array([[0, 0, 0, ..., 0, 0, 0],\n", 286 | " [0, 0, 0, ..., 0, 0, 0],\n", 287 | " [0, 0, 0, ..., 0, 0, 0],\n", 288 | " ...,\n", 289 | " [0, 0, 0, ..., 0, 0, 0],\n", 290 | " [0, 0, 0, ..., 0, 0, 0],\n", 291 | " [0, 0, 0, ..., 0, 0, 0]], dtype=int64)" 292 | ] 293 | }, 294 | "execution_count": 21, 295 | "metadata": {}, 296 | "output_type": "execute_result" 297 | } 298 | ], 299 | "source": [ 300 | "x" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 22, 306 | "metadata": {}, 307 | "outputs": [ 308 | { 309 | "data": { 310 | "text/plain": [ 311 | "array([-1., 0., 1., ..., 0., 0., 1.])" 312 | ] 313 | }, 314 | "execution_count": 22, 315 | "metadata": {}, 316 | "output_type": "execute_result" 317 | } 318 | ], 319 | "source": [ 320 | "y" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 23, 326 | "metadata": {}, 327 | "outputs": [ 328 | { 329 | "data": { 330 | "text/plain": [ 331 | "(162969,)" 332 | ] 333 | }, 334 | "execution_count": 23, 335 | "metadata": {}, 336 | "output_type": "execute_result" 337 | } 338 | ], 339 | "source": [ 340 | "y.shape" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 24, 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "from sklearn.model_selection import train_test_split\n", 350 | "x_train_bow,x_test_bow,y_train_bow,y_test_bow=train_test_split(x,y,test_size=0.2)" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 27, 356 | "metadata": {}, 357 | "outputs": [ 358 | { 359 | "data": { 360 | "text/plain": [ 361 | "MultinomialNB()" 362 | ] 363 | }, 364 | "execution_count": 27, 365 | "metadata": {}, 366 | "output_type": "execute_result" 367 | } 368 | ], 369 | "source": [ 370 | "from sklearn.naive_bayes import MultinomialNB as nb\n", 371 | "classifier_nb=nb()\n", 372 | "classifier_nb.fit(x_train_bow,y_train_bow)" 373 | ] 374 | }, 375 | { 376 | 
"cell_type": "code", 377 | "execution_count": 32, 378 | "metadata": {}, 379 | "outputs": [ 380 | { 381 | "name": "stdout", 382 | "output_type": "stream", 383 | "text": [ 384 | "Accuracy is: 81.1008161011229\n", 385 | " precision recall f1-score support\n", 386 | "\n", 387 | " -1.0 0.71 0.75 0.73 7136\n", 388 | " 0.0 0.89 0.79 0.84 11026\n", 389 | " 1.0 0.81 0.86 0.83 14432\n", 390 | "\n", 391 | " accuracy 0.81 32594\n", 392 | " macro avg 0.80 0.80 0.80 32594\n", 393 | "weighted avg 0.82 0.81 0.81 32594\n", 394 | "\n" 395 | ] 396 | } 397 | ], 398 | "source": [ 399 | "#predicting values\n", 400 | "y_pred_bow=classifier_nb.predict(x_test_bow)\n", 401 | "print(\"Accuracy is:\",accuracy_score(y_test_bow,y_pred_bow)*100)\n", 402 | "print(classification_report(y_test_bow,y_pred_bow))" 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": {}, 408 | "source": [ 409 | "## TF-IDF" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": 43, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "tf=tfidf(max_features=100)\n", 419 | "x_tf=tf.fit_transform(data['clean_text']).toarray()\n", 420 | "y_tf=y" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 44, 426 | "metadata": {}, 427 | "outputs": [ 428 | { 429 | "data": { 430 | "text/plain": [ 431 | "MultinomialNB()" 432 | ] 433 | }, 434 | "execution_count": 44, 435 | "metadata": {}, 436 | "output_type": "execute_result" 437 | } 438 | ], 439 | "source": [ 440 | "#splitting the data again\n", 441 | "x_train_tf,x_test_tf,y_train_tf,y_test_tf=train_test_split(x_tf,y,test_size=0.2)\n", 442 | "classifier_tf=nb()\n", 443 | "classifier_tf.fit(x_train_tf,y_train_tf)" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": 45, 449 | "metadata": {}, 450 | "outputs": [ 451 | { 452 | "name": "stdout", 453 | "output_type": "stream", 454 | "text": [ 455 | "Accuracy is: 0.47502607841934097\n", 456 | " precision recall f1-score support\n", 457 | "\n", 458 | " -1.0 0.00 0.00 0.00 7170\n", 459 | " 0.0 0.53 0.25 0.34 11039\n", 460 | " 1.0 0.46 0.89 0.61 14385\n", 461 | "\n", 462 | " accuracy 0.48 32594\n", 463 | " macro avg 0.33 0.38 0.32 32594\n", 464 | "weighted avg 0.38 0.48 0.38 32594\n", 465 | "\n" 466 | ] 467 | }, 468 | { 469 | "name": "stderr", 470 | "output_type": "stream", 471 | "text": [ 472 | "C:\\Users\\91884\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. 
Use `zero_division` parameter to control this behavior.\n", 473 | " _warn_prf(average, modifier, msg_start, len(result))\n" 474 | ] 475 | } 476 | ], 477 | "source": [ 478 | "#predicting the values\n", 479 | "y_pred_tf=classifier_tf.predict(x_test_tf)\n", 480 | "print('Accuracy is:',accuracy_score(y_test_tf,y_pred_tf))\n", 481 | "print(classification_report(y_test_tf,y_pred_tf))" 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": null, 487 | "metadata": {}, 488 | "outputs": [], 489 | "source": [ 490 | "#Note here in TFIDF, the accuracy fell drastically because the system is not alloting any more space\n", 491 | "#so I had to go with 100 features in tfdf in contrast to 5000 features in Bag of words\n", 492 | "#you can surely try it out with different values of features" 493 | ] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "execution_count": null, 498 | "metadata": {}, 499 | "outputs": [], 500 | "source": [] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": null, 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [] 508 | } 509 | ], 510 | "metadata": { 511 | "kernelspec": { 512 | "display_name": "Python 3", 513 | "language": "python", 514 | "name": "python3" 515 | }, 516 | "language_info": { 517 | "codemirror_mode": { 518 | "name": "ipython", 519 | "version": 3 520 | }, 521 | "file_extension": ".py", 522 | "mimetype": "text/x-python", 523 | "name": "python", 524 | "nbconvert_exporter": "python", 525 | "pygments_lexer": "ipython3", 526 | "version": "3.7.4" 527 | } 528 | }, 529 | "nbformat": 4, 530 | "nbformat_minor": 2 531 | } 532 | -------------------------------------------------------------------------------- /Projects/Fake News Detection/Fake News.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 45, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#dependencies \n", 10 | "import nltk \n", 11 | "from nltk.corpus import stopwords\n", 12 | "import pandas as pd\n", 13 | "import numpy as np" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 54, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "df=pd.read_csv('D:/Data Sets/fake-news/train.csv') #reading file csv and converting to a dataframe" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 55, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "data": { 32 | "text/html": [ 33 | "
\n", 34 | "\n", 47 | "\n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | "
idtitleauthortextlabel
00House Dem Aide: We Didn’t Even See Comey’s Let...Darrell LucusHouse Dem Aide: We Didn’t Even See Comey’s Let...1
11FLYNN: Hillary Clinton, Big Woman on Campus - ...Daniel J. FlynnEver get the feeling your life circles the rou...0
22Why the Truth Might Get You FiredConsortiumnews.comWhy the Truth Might Get You Fired October 29, ...1
3315 Civilians Killed In Single US Airstrike Hav...Jessica PurkissVideos 15 Civilians Killed In Single US Airstr...1
44Iranian woman jailed for fictional unpublished...Howard PortnoyPrint \\nAn Iranian woman has been sentenced to...1
\n", 101 | "
" 102 | ], 103 | "text/plain": [ 104 | " id title author \\\n", 105 | "0 0 House Dem Aide: We Didn’t Even See Comey’s Let... Darrell Lucus \n", 106 | "1 1 FLYNN: Hillary Clinton, Big Woman on Campus - ... Daniel J. Flynn \n", 107 | "2 2 Why the Truth Might Get You Fired Consortiumnews.com \n", 108 | "3 3 15 Civilians Killed In Single US Airstrike Hav... Jessica Purkiss \n", 109 | "4 4 Iranian woman jailed for fictional unpublished... Howard Portnoy \n", 110 | "\n", 111 | " text label \n", 112 | "0 House Dem Aide: We Didn’t Even See Comey’s Let... 1 \n", 113 | "1 Ever get the feeling your life circles the rou... 0 \n", 114 | "2 Why the Truth Might Get You Fired October 29, ... 1 \n", 115 | "3 Videos 15 Civilians Killed In Single US Airstr... 1 \n", 116 | "4 Print \\nAn Iranian woman has been sentenced to... 1 " 117 | ] 118 | }, 119 | "execution_count": 55, 120 | "metadata": {}, 121 | "output_type": "execute_result" 122 | } 123 | ], 124 | "source": [ 125 | "#first 5 columns\n", 126 | "df.head()" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 56, 132 | "metadata": {}, 133 | "outputs": [ 134 | { 135 | "data": { 136 | "text/plain": [ 137 | "id 0\n", 138 | "title 558\n", 139 | "author 1957\n", 140 | "text 39\n", 141 | "label 0\n", 142 | "dtype: int64" 143 | ] 144 | }, 145 | "execution_count": 56, 146 | "metadata": {}, 147 | "output_type": "execute_result" 148 | } 149 | ], 150 | "source": [ 151 | "df.isnull().sum()" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 57, 157 | "metadata": {}, 158 | "outputs": [ 159 | { 160 | "name": "stdout", 161 | "output_type": "stream", 162 | "text": [ 163 | "9.408653846153847\n" 164 | ] 165 | } 166 | ], 167 | "source": [ 168 | "print(1957*100/df.shape[0])#9 percent can be removed " 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 58, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "df=df.dropna()" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 59, 183 | "metadata": {}, 184 | "outputs": [ 185 | { 186 | "data": { 187 | "text/plain": [ 188 | "(18285, 5)" 189 | ] 190 | }, 191 | "execution_count": 59, 192 | "metadata": {}, 193 | "output_type": "execute_result" 194 | } 195 | ], 196 | "source": [ 197 | "df.shape" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 60, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "#the indices would have disrupted\n", 207 | "df.reset_index(inplace=True)" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "#label column means the output variable\n", 217 | "#0 represents that the news can be relied on(not fake)\n", 218 | "#1 represents it cannot be relied on(fake)" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 61, 224 | "metadata": {}, 225 | "outputs": [ 226 | { 227 | "data": { 228 | "text/plain": [ 229 | "0 1\n", 230 | "1 0\n", 231 | "2 1\n", 232 | "3 1\n", 233 | "4 1\n", 234 | " ..\n", 235 | "18280 0\n", 236 | "18281 0\n", 237 | "18282 0\n", 238 | "18283 1\n", 239 | "18284 1\n", 240 | "Name: label, Length: 18285, dtype: int64" 241 | ] 242 | }, 243 | "execution_count": 61, 244 | "metadata": {}, 245 | "output_type": "execute_result" 246 | } 247 | ], 248 | "source": [ 249 | "#dependent variable(y)\n", 250 | "y=df['label']\n", 251 | "y" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 62, 257 | "metadata": 
{}, 258 | "outputs": [ 259 | { 260 | "data": { 261 | "text/html": [ 262 | "
\n", 263 | "\n", 276 | "\n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | "
indexidtitleauthortext
000House Dem Aide: We Didn’t Even See Comey’s Let...Darrell LucusHouse Dem Aide: We Didn’t Even See Comey’s Let...
111FLYNN: Hillary Clinton, Big Woman on Campus - ...Daniel J. FlynnEver get the feeling your life circles the rou...
222Why the Truth Might Get You FiredConsortiumnews.comWhy the Truth Might Get You Fired October 29, ...
33315 Civilians Killed In Single US Airstrike Hav...Jessica PurkissVideos 15 Civilians Killed In Single US Airstr...
444Iranian woman jailed for fictional unpublished...Howard PortnoyPrint \\nAn Iranian woman has been sentenced to...
..................
182802079520795Rapper T.I.: Trump a ’Poster Child For White S...Jerome HudsonRapper T. I. unloaded on black celebrities who...
182812079620796N.F.L. Playoffs: Schedule, Matchups and Odds -...Benjamin HoffmanWhen the Green Bay Packers lost to the Washing...
182822079720797Macy’s Is Said to Receive Takeover Approach by...Michael J. de la Merced and Rachel AbramsThe Macy’s of today grew from the union of sev...
182832079820798NATO, Russia To Hold Parallel Exercises In Bal...Alex AnsaryNATO, Russia To Hold Parallel Exercises In Bal...
182842079920799What Keeps the F-35 AliveDavid SwansonDavid Swanson is an author, activist, journa...
\n", 378 | "

18285 rows × 5 columns

\n", 379 | "
" 380 | ], 381 | "text/plain": [ 382 | " index id title \\\n", 383 | "0 0 0 House Dem Aide: We Didn’t Even See Comey’s Let... \n", 384 | "1 1 1 FLYNN: Hillary Clinton, Big Woman on Campus - ... \n", 385 | "2 2 2 Why the Truth Might Get You Fired \n", 386 | "3 3 3 15 Civilians Killed In Single US Airstrike Hav... \n", 387 | "4 4 4 Iranian woman jailed for fictional unpublished... \n", 388 | "... ... ... ... \n", 389 | "18280 20795 20795 Rapper T.I.: Trump a ’Poster Child For White S... \n", 390 | "18281 20796 20796 N.F.L. Playoffs: Schedule, Matchups and Odds -... \n", 391 | "18282 20797 20797 Macy’s Is Said to Receive Takeover Approach by... \n", 392 | "18283 20798 20798 NATO, Russia To Hold Parallel Exercises In Bal... \n", 393 | "18284 20799 20799 What Keeps the F-35 Alive \n", 394 | "\n", 395 | " author \\\n", 396 | "0 Darrell Lucus \n", 397 | "1 Daniel J. Flynn \n", 398 | "2 Consortiumnews.com \n", 399 | "3 Jessica Purkiss \n", 400 | "4 Howard Portnoy \n", 401 | "... ... \n", 402 | "18280 Jerome Hudson \n", 403 | "18281 Benjamin Hoffman \n", 404 | "18282 Michael J. de la Merced and Rachel Abrams \n", 405 | "18283 Alex Ansary \n", 406 | "18284 David Swanson \n", 407 | "\n", 408 | " text \n", 409 | "0 House Dem Aide: We Didn’t Even See Comey’s Let... \n", 410 | "1 Ever get the feeling your life circles the rou... \n", 411 | "2 Why the Truth Might Get You Fired October 29, ... \n", 412 | "3 Videos 15 Civilians Killed In Single US Airstr... \n", 413 | "4 Print \\nAn Iranian woman has been sentenced to... \n", 414 | "... ... \n", 415 | "18280 Rapper T. I. unloaded on black celebrities who... \n", 416 | "18281 When the Green Bay Packers lost to the Washing... \n", 417 | "18282 The Macy’s of today grew from the union of sev... \n", 418 | "18283 NATO, Russia To Hold Parallel Exercises In Bal... \n", 419 | "18284 David Swanson is an author, activist, journa... \n", 420 | "\n", 421 | "[18285 rows x 5 columns]" 422 | ] 423 | }, 424 | "execution_count": 62, 425 | "metadata": {}, 426 | "output_type": "execute_result" 427 | } 428 | ], 429 | "source": [ 430 | "#independent variable\n", 431 | "x=df.iloc[:,:-1]\n", 432 | "x" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": 63, 438 | "metadata": {}, 439 | "outputs": [ 440 | { 441 | "data": { 442 | "text/plain": [ 443 | "'Print \\nAn Iranian woman has been sentenced to six years in prison after Iran’s Revolutionary Guard searched her home and found a notebook that contained a fictional story she’d written about a woman who was stoned to death, according to the Eurasia Review . \\nGolrokh Ebrahimi Iraee, 35, is the wife of political prisoner Arash Sadeghi, 36, who is serving a 19-year prison sentence for being a human rights activist, the publication reported. \\n“When the intelligence unit of the Revolutionary Guards came to arrest her husband, they raided their apartment – without a warrant – and found drafts of stories that Ebrahimi Iraee had written,” the article stated. \\n“One of the confiscated drafts was a story about stoning women to death for adultery – never published, never presented to anyone,” the article stated. 
“The narrative followed the story of a protagonist that watched a movie about stoning of women under Islamic law for adultery.'" 444 | ] 445 | }, 446 | "execution_count": 63, 447 | "metadata": {}, 448 | "output_type": "execute_result" 449 | } 450 | ], 451 | "source": [ 452 | "x['text'][4] # in this we see there are characters like \\n which we will be removing, for that we need re(regular expression) " 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": 69, 458 | "metadata": {}, 459 | "outputs": [], 460 | "source": [ 461 | "import re\n", 462 | "from nltk.stem import PorterStemmer as ps\n", 463 | "from tqdm import tqdm" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": 70, 469 | "metadata": {}, 470 | "outputs": [ 471 | { 472 | "name": "stderr", 473 | "output_type": "stream", 474 | "text": [ 475 | "100%|██████████████████████████████████████████████████████████████████████████| 18285/18285 [1:29:19<00:00, 3.41it/s]\n" 476 | ] 477 | } 478 | ], 479 | "source": [ 480 | "#here the main text is the key feature because it tells us whether the news can be relied or not,\n", 481 | "#alone with headline we cannot determine how reliable the news is\n", 482 | "ps=ps()\n", 483 | "corpus=[]\n", 484 | "l=len(x['text'])\n", 485 | "for i in tqdm(range(l)):\n", 486 | " texts=re.sub('[^a-zA-Z]',' ',x['text'][i])\n", 487 | " texts=texts.lower()\n", 488 | " texts=texts.split()\n", 489 | " texts=[ps.stem(text) for text in texts if text not in stopwords.words('english')]\n", 490 | " texts=' '.join(texts)\n", 491 | " corpus.append(texts)\n", 492 | " " 493 | ] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "execution_count": 72, 498 | "metadata": {}, 499 | "outputs": [], 500 | "source": [ 501 | "#it literally took 1:29:19 to pre process the text!!!!\n", 502 | "from sklearn.feature_extraction.text import TfidfVectorizer as tfidf\n", 503 | "tfidf=tfidf(max_features=6000,ngram_range=(1,3))\n", 504 | "x=tfidf.fit_transform(corpus).toarray()" 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "execution_count": 73, 510 | "metadata": {}, 511 | "outputs": [ 512 | { 513 | "data": { 514 | "text/plain": [ 515 | "array([0., 0., 0., ..., 0., 0., 0.])" 516 | ] 517 | }, 518 | "execution_count": 73, 519 | "metadata": {}, 520 | "output_type": "execute_result" 521 | } 522 | ], 523 | "source": [ 524 | "x[1]" 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": 74, 530 | "metadata": {}, 531 | "outputs": [ 532 | { 533 | "data": { 534 | "text/plain": [ 535 | "(18285, 6000)" 536 | ] 537 | }, 538 | "execution_count": 74, 539 | "metadata": {}, 540 | "output_type": "execute_result" 541 | } 542 | ], 543 | "source": [ 544 | "x.shape" 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": 75, 550 | "metadata": {}, 551 | "outputs": [], 552 | "source": [ 553 | "from sklearn.model_selection import train_test_split\n", 554 | "x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25)" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": 76, 560 | "metadata": {}, 561 | "outputs": [ 562 | { 563 | "data": { 564 | "text/plain": [ 565 | "MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)" 566 | ] 567 | }, 568 | "execution_count": 76, 569 | "metadata": {}, 570 | "output_type": "execute_result" 571 | } 572 | ], 573 | "source": [ 574 | "from sklearn.naive_bayes import MultinomialNB\n", 575 | "classifier=MultinomialNB()\n", 576 | "classifier.fit(x_train,y_train)" 577 | ] 578 | }, 579 | { 580 | "cell_type": 
"code", 581 | "execution_count": 77, 582 | "metadata": {}, 583 | "outputs": [], 584 | "source": [ 585 | "#predicting values\n", 586 | "y_pred=classifier.predict(x_test)" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": 79, 592 | "metadata": {}, 593 | "outputs": [ 594 | { 595 | "name": "stdout", 596 | "output_type": "stream", 597 | "text": [ 598 | "[[2431 120]\n", 599 | " [ 352 1669]]\n" 600 | ] 601 | } 602 | ], 603 | "source": [ 604 | "from sklearn.metrics import confusion_matrix,accuracy_score\n", 605 | "cm=confusion_matrix(y_test,y_pred)\n", 606 | "print(cm)" 607 | ] 608 | }, 609 | { 610 | "cell_type": "code", 611 | "execution_count": 84, 612 | "metadata": {}, 613 | "outputs": [ 614 | { 615 | "name": "stdout", 616 | "output_type": "stream", 617 | "text": [ 618 | "0.8967629046369204\n" 619 | ] 620 | } 621 | ], 622 | "source": [ 623 | "print(accuracy_score(y_test,y_pred))" 624 | ] 625 | }, 626 | { 627 | "cell_type": "code", 628 | "execution_count": null, 629 | "metadata": {}, 630 | "outputs": [], 631 | "source": [] 632 | } 633 | ], 634 | "metadata": { 635 | "kernelspec": { 636 | "display_name": "Python 3", 637 | "language": "python", 638 | "name": "python3" 639 | }, 640 | "language_info": { 641 | "codemirror_mode": { 642 | "name": "ipython", 643 | "version": 3 644 | }, 645 | "file_extension": ".py", 646 | "mimetype": "text/x-python", 647 | "name": "python", 648 | "nbconvert_exporter": "python", 649 | "pygments_lexer": "ipython3", 650 | "version": "3.7.4" 651 | } 652 | }, 653 | "nbformat": 4, 654 | "nbformat_minor": 2 655 | } 656 | -------------------------------------------------------------------------------- /Projects/Covid_tweets Sentiment Analysis/Corona_NLP text classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Covid NLP Tweet Sentiment Analysis" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## We would be using two models\n", 15 | "\n", 16 | "* Bag of Words\n", 17 | " * Multinomial NB\n", 18 | " * PassiveAggressiveClassifer\n", 19 | " " 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 1, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "import pandas as pd\n", 29 | "import numpy as np" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "## Bag of Words Model\n", 37 | "\n", 38 | "----------------------------------------" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "## Train Set" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 2, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "df=pd.read_csv('D:/Data Sets/Corona_NLP text classification/Corona_NLP_train.csv',encoding='latin-1')" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "data": { 64 | "text/html": [ 65 | "
\n", 66 | "\n", 79 | "\n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | "
UserNameScreenNameLocationTweetAtOriginalTweetSentiment
0379948751London16-03-2020@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...Neutral
1380048752UK16-03-2020advice Talk to your neighbours family to excha...Positive
2380148753Vagabonds16-03-2020Coronavirus Australia: Woolworths to give elde...Positive
3380248754NaN16-03-2020My food stock is not the only one which is emp...Positive
4380348755NaN16-03-2020Me, ready to go at supermarket during the #COV...Extremely Negative
\n", 139 | "
" 140 | ], 141 | "text/plain": [ 142 | " UserName ScreenName Location TweetAt \\\n", 143 | "0 3799 48751 London 16-03-2020 \n", 144 | "1 3800 48752 UK 16-03-2020 \n", 145 | "2 3801 48753 Vagabonds 16-03-2020 \n", 146 | "3 3802 48754 NaN 16-03-2020 \n", 147 | "4 3803 48755 NaN 16-03-2020 \n", 148 | "\n", 149 | " OriginalTweet Sentiment \n", 150 | "0 @MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i... Neutral \n", 151 | "1 advice Talk to your neighbours family to excha... Positive \n", 152 | "2 Coronavirus Australia: Woolworths to give elde... Positive \n", 153 | "3 My food stock is not the only one which is emp... Positive \n", 154 | "4 Me, ready to go at supermarket during the #COV... Extremely Negative " 155 | ] 156 | }, 157 | "execution_count": 3, 158 | "metadata": {}, 159 | "output_type": "execute_result" 160 | } 161 | ], 162 | "source": [ 163 | "df.head()" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 4, 169 | "metadata": {}, 170 | "outputs": [ 171 | { 172 | "data": { 173 | "text/plain": [ 174 | "{'Extremely Negative', 'Extremely Positive', 'Negative', 'Neutral', 'Positive'}" 175 | ] 176 | }, 177 | "execution_count": 4, 178 | "metadata": {}, 179 | "output_type": "execute_result" 180 | } 181 | ], 182 | "source": [ 183 | "#to see the different categories/sentiments\n", 184 | "set(df['Sentiment'])" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 5, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "#df.iloc[df.shape[0]-1] " 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 6, 199 | "metadata": {}, 200 | "outputs": [ 201 | { 202 | "data": { 203 | "text/plain": [ 204 | "(41157, 6)" 205 | ] 206 | }, 207 | "execution_count": 6, 208 | "metadata": {}, 209 | "output_type": "execute_result" 210 | } 211 | ], 212 | "source": [ 213 | "df.shape" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 7, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "y=df['Sentiment'].values" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 8, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "#encoding labels\n", 232 | "from sklearn.preprocessing import LabelEncoder as le\n", 233 | "le=le()\n", 234 | "y=le.fit_transform(y)" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 9, 240 | "metadata": {}, 241 | "outputs": [ 242 | { 243 | "data": { 244 | "text/plain": [ 245 | "array([3, 4, 4, ..., 4, 3, 2])" 246 | ] 247 | }, 248 | "execution_count": 9, 249 | "metadata": {}, 250 | "output_type": "execute_result" 251 | } 252 | ], 253 | "source": [ 254 | "y" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 10, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "tweets=list(df['OriginalTweet'])" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 11, 269 | "metadata": {}, 270 | "outputs": [ 271 | { 272 | "data": { 273 | "text/plain": [ 274 | "['@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/iFz9FAn2Pa and https://t.co/xX6ghGFzCC and https://t.co/I2NlzdxNo8',\n", 275 | " 'advice Talk to your neighbours family to exchange phone numbers create contact list with phone numbers of neighbours schools employer chemist GP set up online shopping accounts if poss adequate supplies of regular meds but not over order']" 276 | ] 277 | }, 278 | "execution_count": 11, 279 | "metadata": {}, 280 | "output_type": "execute_result" 281 | } 282 | ], 283 | "source": [ 284 | 
"tweets[:2]" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 13, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "import nltk\n", 294 | "from nltk.corpus import stopwords\n", 295 | "from nltk.stem import PorterStemmer as ps\n", 296 | "from sklearn.feature_extraction.text import CountVectorizer as cv" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 14, 302 | "metadata": {}, 303 | "outputs": [ 304 | { 305 | "name": "stderr", 306 | "output_type": "stream", 307 | "text": [ 308 | "100%|███████████████████████████████████████████████████████████████████████████| 41157/41157 [06:50<00:00, 100.26it/s]\n" 309 | ] 310 | } 311 | ], 312 | "source": [ 313 | "import re\n", 314 | "from tqdm import tqdm\n", 315 | "\n", 316 | "ps=ps()\n", 317 | "corpus=[]\n", 318 | "for i in tqdm(range(len(tweets))):\n", 319 | " tweet=re.sub('[^a-zA-Z]',' ',tweets[i])\n", 320 | " tweet=tweet.lower()\n", 321 | " #print(tweet)\n", 322 | " tweet=tweet.split()\n", 323 | " tweet=[ps.stem(word) for word in tweet if word not in stopwords.words('english')]\n", 324 | " tweet=' '.join(tweet)\n", 325 | " corpus.append(tweet)" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 15, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "cv=cv(max_features=2500)\n", 335 | "x=cv.fit_transform(corpus).toarray()" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 16, 341 | "metadata": {}, 342 | "outputs": [ 343 | { 344 | "data": { 345 | "text/plain": [ 346 | "(41157, 2500)" 347 | ] 348 | }, 349 | "execution_count": 16, 350 | "metadata": {}, 351 | "output_type": "execute_result" 352 | } 353 | ], 354 | "source": [ 355 | "x.shape" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": 17, 361 | "metadata": {}, 362 | "outputs": [ 363 | { 364 | "name": "stderr", 365 | "output_type": "stream", 366 | "text": [ 367 | "100%|██████████████████████████████████████████████████████████████████████████| 41157/41157 [00:35<00:00, 1143.92it/s]" 368 | ] 369 | }, 370 | { 371 | "name": "stdout", 372 | "output_type": "stream", 373 | "text": [ 374 | "629912\n" 375 | ] 376 | }, 377 | { 378 | "name": "stderr", 379 | "output_type": "stream", 380 | "text": [ 381 | "\n" 382 | ] 383 | } 384 | ], 385 | "source": [ 386 | "c=0\n", 387 | "for i in tqdm(x):\n", 388 | " for j in i:\n", 389 | " if j>0:\n", 390 | " c+=1\n", 391 | "print(c)" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": null, 397 | "metadata": {}, 398 | "outputs": [], 399 | "source": [ 400 | "#x,y are our variables. 
This is our train set so we will fit the model on this data \n", 401 | "#and then will read the test file to see how good our model really is" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": 18, 407 | "metadata": {}, 408 | "outputs": [], 409 | "source": [ 410 | "x_train,y_train=x,y" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": null, 416 | "metadata": {}, 417 | "outputs": [], 418 | "source": [ 419 | "#we will also be comparing different models" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": 43, 425 | "metadata": {}, 426 | "outputs": [ 427 | { 428 | "data": { 429 | "text/plain": [ 430 | "MultinomialNB(alpha=1000)" 431 | ] 432 | }, 433 | "execution_count": 43, 434 | "metadata": {}, 435 | "output_type": "execute_result" 436 | } 437 | ], 438 | "source": [ 439 | "from sklearn.naive_bayes import MultinomialNB as nb\n", 440 | "classifier_nb=nb(alpha=1000)\n", 441 | "classifier_nb.fit(x_train,y_train)" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": 32, 447 | "metadata": {}, 448 | "outputs": [ 449 | { 450 | "data": { 451 | "text/plain": [ 452 | "PassiveAggressiveClassifier()" 453 | ] 454 | }, 455 | "execution_count": 32, 456 | "metadata": {}, 457 | "output_type": "execute_result" 458 | } 459 | ], 460 | "source": [ 461 | "from sklearn.linear_model import PassiveAggressiveClassifier as pac\n", 462 | "classifier_pac=pac()\n", 463 | "classifier_pac.fit(x_train,y_train)" 464 | ] 465 | }, 466 | { 467 | "cell_type": "markdown", 468 | "metadata": {}, 469 | "source": [ 470 | "## Test Set" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": 21, 476 | "metadata": {}, 477 | "outputs": [], 478 | "source": [ 479 | "df_test=pd.read_csv('D:/Data Sets/Corona_NLP text classification/Corona_NLP_test.csv',encoding='latin-1')" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": 22, 485 | "metadata": {}, 486 | "outputs": [ 487 | { 488 | "data": { 489 | "text/html": [ 490 | "
\n", 491 | "\n", 504 | "\n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | "
UserNameScreenNameLocationTweetAtOriginalTweetSentiment
0144953NYC02-03-2020TRENDING: New Yorkers encounter empty supermar...Extremely Negative
1244954Seattle, WA02-03-2020When I couldn't find hand sanitizer at Fred Me...Positive
2344955NaN02-03-2020Find out how you can protect yourself and love...Extremely Positive
3444956Chicagoland02-03-2020#Panic buying hits #NewYork City as anxious sh...Negative
4544957Melbourne, Victoria03-03-2020#toiletpaper #dunnypaper #coronavirus #coronav...Neutral
\n", 564 | "
" 565 | ], 566 | "text/plain": [ 567 | " UserName ScreenName Location TweetAt \\\n", 568 | "0 1 44953 NYC 02-03-2020 \n", 569 | "1 2 44954 Seattle, WA 02-03-2020 \n", 570 | "2 3 44955 NaN 02-03-2020 \n", 571 | "3 4 44956 Chicagoland 02-03-2020 \n", 572 | "4 5 44957 Melbourne, Victoria 03-03-2020 \n", 573 | "\n", 574 | " OriginalTweet Sentiment \n", 575 | "0 TRENDING: New Yorkers encounter empty supermar... Extremely Negative \n", 576 | "1 When I couldn't find hand sanitizer at Fred Me... Positive \n", 577 | "2 Find out how you can protect yourself and love... Extremely Positive \n", 578 | "3 #Panic buying hits #NewYork City as anxious sh... Negative \n", 579 | "4 #toiletpaper #dunnypaper #coronavirus #coronav... Neutral " 580 | ] 581 | }, 582 | "execution_count": 22, 583 | "metadata": {}, 584 | "output_type": "execute_result" 585 | } 586 | ], 587 | "source": [ 588 | "df_test.head()" 589 | ] 590 | }, 591 | { 592 | "cell_type": "code", 593 | "execution_count": 23, 594 | "metadata": {}, 595 | "outputs": [], 596 | "source": [ 597 | "tweets_test=df_test['OriginalTweet']\n", 598 | "y_test=df_test['Sentiment'].values" 599 | ] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "execution_count": 24, 604 | "metadata": {}, 605 | "outputs": [], 606 | "source": [ 607 | "#encoding the dependent variable\n", 608 | "y_test=le.fit_transform(y_test)" 609 | ] 610 | }, 611 | { 612 | "cell_type": "code", 613 | "execution_count": 25, 614 | "metadata": {}, 615 | "outputs": [ 616 | { 617 | "data": { 618 | "text/plain": [ 619 | "array([0, 4, 1, ..., 3, 0, 1])" 620 | ] 621 | }, 622 | "execution_count": 25, 623 | "metadata": {}, 624 | "output_type": "execute_result" 625 | } 626 | ], 627 | "source": [ 628 | "y_test" 629 | ] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "execution_count": 26, 634 | "metadata": {}, 635 | "outputs": [ 636 | { 637 | "name": "stderr", 638 | "output_type": "stream", 639 | "text": [ 640 | "100%|██████████████████████████████████████████████████████████████████████████████| 3798/3798 [00:43<00:00, 87.13it/s]\n" 641 | ] 642 | } 643 | ], 644 | "source": [ 645 | "from nltk.stem import PorterStemmer as ps\n", 646 | "\n", 647 | "test_corpus=[]\n", 648 | "ps_test=ps()\n", 649 | "for i in tqdm(range(len(tweets_test))):\n", 650 | " test_tweet=re.sub('[^a-zA-Z]',' ',tweets_test[i])\n", 651 | " test_tweet=test_tweet.lower()\n", 652 | " #print(test_tweet)\n", 653 | " test_tweet=test_tweet.split()\n", 654 | " test_tweet=[ps_test.stem(w) for w in test_tweet if w not in stopwords.words('english')]\n", 655 | " test_tweet=' '.join(test_tweet)\n", 656 | " test_corpus.append(test_tweet)" 657 | ] 658 | }, 659 | { 660 | "cell_type": "code", 661 | "execution_count": 34, 662 | "metadata": {}, 663 | "outputs": [ 664 | { 665 | "name": "stderr", 666 | "output_type": "stream", 667 | "text": [ 668 | "100%|████████████████████████████████████████████████████████████████████████████| 3798/3798 [00:03<00:00, 1222.75it/s]" 669 | ] 670 | }, 671 | { 672 | "name": "stdout", 673 | "output_type": "stream", 674 | "text": [ 675 | "63427\n" 676 | ] 677 | }, 678 | { 679 | "name": "stderr", 680 | "output_type": "stream", 681 | "text": [ 682 | "\n" 683 | ] 684 | } 685 | ], 686 | "source": [ 687 | "from sklearn.feature_extraction.text import CountVectorizer as cv\n", 688 | "\n", 689 | "cv=cv(max_features=2500)\n", 690 | "x_test=cv.fit_transform(test_corpus).toarray()\n", 691 | "\n", 692 | "count=0\n", 693 | "for i in tqdm(x_test):\n", 694 | " for j in i:\n", 695 | " if j>0:\n", 696 | " count+=1\n", 697 | "print(count)" 698 | ] 
699 | }, 700 | { 701 | "cell_type": "code", 702 | "execution_count": 44, 703 | "metadata": {}, 704 | "outputs": [], 705 | "source": [ 706 | "#predicting values from the test set\n", 707 | "y_pred_nb=classifier_nb.predict(x_test)\n", 708 | "y_pred_pac=classifier_pac.predict(x_test)" 709 | ] 710 | }, 711 | { 712 | "cell_type": "code", 713 | "execution_count": 36, 714 | "metadata": {}, 715 | "outputs": [], 716 | "source": [ 717 | "from sklearn.metrics import classification_report,accuracy_score" 718 | ] 719 | }, 720 | { 721 | "cell_type": "code", 722 | "execution_count": 45, 723 | "metadata": {}, 724 | "outputs": [ 725 | { 726 | "name": "stdout", 727 | "output_type": "stream", 728 | "text": [ 729 | "Accuracy is: 0.2632964718272775\n", 730 | " precision recall f1-score support\n", 731 | "\n", 732 | " 0 0.41 0.01 0.02 592\n", 733 | " 1 0.24 0.01 0.02 599\n", 734 | " 2 0.30 0.25 0.28 1041\n", 735 | " 3 0.09 0.02 0.03 619\n", 736 | " 4 0.26 0.75 0.39 947\n", 737 | "\n", 738 | " accuracy 0.26 3798\n", 739 | " macro avg 0.26 0.21 0.15 3798\n", 740 | "weighted avg 0.26 0.26 0.18 3798\n", 741 | "\n" 742 | ] 743 | } 744 | ], 745 | "source": [ 746 | "#Multinomial NaiveBayes\n", 747 | "print('Accuracy is:',accuracy_score(y_test,y_pred_nb))\n", 748 | "print(classification_report(y_test,y_pred_nb))" 749 | ] 750 | }, 751 | { 752 | "cell_type": "code", 753 | "execution_count": 38, 754 | "metadata": {}, 755 | "outputs": [ 756 | { 757 | "name": "stdout", 758 | "output_type": "stream", 759 | "text": [ 760 | "Accuracy is: 0.25223802001053186\n", 761 | " precision recall f1-score support\n", 762 | "\n", 763 | " 0 0.20 0.11 0.14 592\n", 764 | " 1 0.18 0.26 0.22 599\n", 765 | " 2 0.29 0.40 0.34 1041\n", 766 | " 3 0.30 0.22 0.26 619\n", 767 | " 4 0.25 0.19 0.22 947\n", 768 | "\n", 769 | " accuracy 0.25 3798\n", 770 | " macro avg 0.24 0.24 0.23 3798\n", 771 | "weighted avg 0.25 0.25 0.24 3798\n", 772 | "\n" 773 | ] 774 | } 775 | ], 776 | "source": [ 777 | "#PassiveAggressiveClassifier\n", 778 | "print('Accuracy is:',accuracy_score(y_test,y_pred_pac))\n", 779 | "print(classification_report(y_test,y_pred_pac))" 780 | ] 781 | }, 782 | { 783 | "cell_type": "code", 784 | "execution_count": null, 785 | "metadata": {}, 786 | "outputs": [], 787 | "source": [] 788 | }, 789 | { 790 | "cell_type": "code", 791 | "execution_count": null, 792 | "metadata": {}, 793 | "outputs": [], 794 | "source": [] 795 | }, 796 | { 797 | "cell_type": "code", 798 | "execution_count": null, 799 | "metadata": {}, 800 | "outputs": [], 801 | "source": [] 802 | }, 803 | { 804 | "cell_type": "code", 805 | "execution_count": null, 806 | "metadata": {}, 807 | "outputs": [], 808 | "source": [] 809 | }, 810 | { 811 | "cell_type": "code", 812 | "execution_count": null, 813 | "metadata": {}, 814 | "outputs": [], 815 | "source": [] 816 | } 817 | ], 818 | "metadata": { 819 | "kernelspec": { 820 | "display_name": "Python 3", 821 | "language": "python", 822 | "name": "python3" 823 | }, 824 | "language_info": { 825 | "codemirror_mode": { 826 | "name": "ipython", 827 | "version": 3 828 | }, 829 | "file_extension": ".py", 830 | "mimetype": "text/x-python", 831 | "name": "python", 832 | "nbconvert_exporter": "python", 833 | "pygments_lexer": "ipython3", 834 | "version": "3.7.4" 835 | } 836 | }, 837 | "nbformat": 4, 838 | "nbformat_minor": 2 839 | } 840 | -------------------------------------------------------------------------------- /Projects/Daily News for Stock Market Prediction/Stock Prediction using News Headlines.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Stock Sentiment Analysis using News Headlines" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "#dependency\n", 17 | "import pandas as pd" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "df=pd.read_csv('D:/Data Sets/Daily News for Stock Market Prediction/Stock-Sentiment-Analysis/trunk/Data.csv', encoding = \"ISO-8859-1\")" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "data": { 36 | "text/html": [ 37 | "
\n", 38 | "\n", 51 | "\n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | "
DateLabelTop1Top2Top3Top4Top5Top6Top7Top8...Top16Top17Top18Top19Top20Top21Top22Top23Top24Top25
02000-01-030A 'hindrance to operations': extracts from the...ScorecardHughes' instant hit buoys BluesJack gets his skates on at ice-cold AlexChaos as Maracana builds up for UnitedDepleted Leicester prevail as Elliott spoils E...Hungry Spurs sense rich pickingsGunners so wide of an easy target...Flintoff injury piles on woe for EnglandHunters threaten Jospin with new battle of the...Kohl's successor drawn into scandalThe difference between men and womenSara Denver, nurse turned solicitorDiana's landmine crusade put Tories in a panicYeltsin's resignation caught opposition flat-f...Russian rouletteSold outRecovering a title
12000-01-040ScorecardThe best lake sceneLeader: German sleaze inquiryCheerio, boyoThe main recommendationsHas Cubie killed fees?Has Cubie killed fees?Has Cubie killed fees?...On the critical listThe timing of their livesDear doctorIrish court halts IRA man's extradition to Nor...Burundi peace initiative fades after rebels re...PE points the way forward to the ECBCampaigners keep up pressure on Nazi war crime...Jane RatcliffeYet more things you wouldn't know without the ...Millennium bug fails to bite
22000-01-050Coventry caught on counter by FloUnited's rivals on the road to RioThatcher issues defence before trial by videoPolice help Smith lay down the law at EvertonTale of Trautmann bears two more retellingsEngland on the rackPakistan retaliate with call for video of WalshCullinan continues his Cape monopoly...South Melbourne (Australia)Necaxa (Mexico)Real Madrid (Spain)Raja Casablanca (Morocco)Corinthians (Brazil)Tony's pet projectAl Nassr (Saudi Arabia)Ideal Holmes showPinochet leaves hospital after testsUseful links
32000-01-061Pilgrim knows how to progressThatcher facing banMcIlroy calls for Irish fighting spiritLeicester bin stadium blueprintUnited braced for Mexican waveAuntie back in fashion, even if the dress look...Shoaib appeal goes to the topHussain hurt by 'shambles' but lays blame on e......Putin admits Yeltsin quit to give him a head s...BBC worst hit as digital TV begins to biteHow much can you pay for...Christmas glitchesUpending a table, Chopping a line and Scoring ...Scientific evidence 'unreliable', defence claimsFusco wins judicial review in extradition caseRebels thwart Russian advanceBlair orders shake-up of failing NHSLessons of law's hard heart
42000-01-071Hitches and HorlocksBeckham off but United surviveBreast cancer screeningAlan ParkerGuardian readers: are you all whingers?Hollywood BeyondAshes and diamondsWhingers - a formidable minority...Most everywhere: UDIsMost wanted: Chloe lunettesReturn of the cane 'completely off the agenda'From Sleepy Hollow to GreenelandBlunkett outlines vision for over 11sEmbattled Dobson attacks 'play now, pay later'...Doom and the DomeWhat is the north-south divide?Aitken released from jailGone aloft
\n", 201 | "

5 rows × 27 columns

\n", 202 | "
" 203 | ], 204 | "text/plain": [ 205 | " Date Label Top1 \\\n", 206 | "0 2000-01-03 0 A 'hindrance to operations': extracts from the... \n", 207 | "1 2000-01-04 0 Scorecard \n", 208 | "2 2000-01-05 0 Coventry caught on counter by Flo \n", 209 | "3 2000-01-06 1 Pilgrim knows how to progress \n", 210 | "4 2000-01-07 1 Hitches and Horlocks \n", 211 | "\n", 212 | " Top2 \\\n", 213 | "0 Scorecard \n", 214 | "1 The best lake scene \n", 215 | "2 United's rivals on the road to Rio \n", 216 | "3 Thatcher facing ban \n", 217 | "4 Beckham off but United survive \n", 218 | "\n", 219 | " Top3 \\\n", 220 | "0 Hughes' instant hit buoys Blues \n", 221 | "1 Leader: German sleaze inquiry \n", 222 | "2 Thatcher issues defence before trial by video \n", 223 | "3 McIlroy calls for Irish fighting spirit \n", 224 | "4 Breast cancer screening \n", 225 | "\n", 226 | " Top4 \\\n", 227 | "0 Jack gets his skates on at ice-cold Alex \n", 228 | "1 Cheerio, boyo \n", 229 | "2 Police help Smith lay down the law at Everton \n", 230 | "3 Leicester bin stadium blueprint \n", 231 | "4 Alan Parker \n", 232 | "\n", 233 | " Top5 \\\n", 234 | "0 Chaos as Maracana builds up for United \n", 235 | "1 The main recommendations \n", 236 | "2 Tale of Trautmann bears two more retellings \n", 237 | "3 United braced for Mexican wave \n", 238 | "4 Guardian readers: are you all whingers? \n", 239 | "\n", 240 | " Top6 \\\n", 241 | "0 Depleted Leicester prevail as Elliott spoils E... \n", 242 | "1 Has Cubie killed fees? \n", 243 | "2 England on the rack \n", 244 | "3 Auntie back in fashion, even if the dress look... \n", 245 | "4 Hollywood Beyond \n", 246 | "\n", 247 | " Top7 \\\n", 248 | "0 Hungry Spurs sense rich pickings \n", 249 | "1 Has Cubie killed fees? \n", 250 | "2 Pakistan retaliate with call for video of Walsh \n", 251 | "3 Shoaib appeal goes to the top \n", 252 | "4 Ashes and diamonds \n", 253 | "\n", 254 | " Top8 ... \\\n", 255 | "0 Gunners so wide of an easy target ... \n", 256 | "1 Has Cubie killed fees? ... \n", 257 | "2 Cullinan continues his Cape monopoly ... \n", 258 | "3 Hussain hurt by 'shambles' but lays blame on e... ... \n", 259 | "4 Whingers - a formidable minority ... \n", 260 | "\n", 261 | " Top16 \\\n", 262 | "0 Flintoff injury piles on woe for England \n", 263 | "1 On the critical list \n", 264 | "2 South Melbourne (Australia) \n", 265 | "3 Putin admits Yeltsin quit to give him a head s... \n", 266 | "4 Most everywhere: UDIs \n", 267 | "\n", 268 | " Top17 \\\n", 269 | "0 Hunters threaten Jospin with new battle of the... \n", 270 | "1 The timing of their lives \n", 271 | "2 Necaxa (Mexico) \n", 272 | "3 BBC worst hit as digital TV begins to bite \n", 273 | "4 Most wanted: Chloe lunettes \n", 274 | "\n", 275 | " Top18 \\\n", 276 | "0 Kohl's successor drawn into scandal \n", 277 | "1 Dear doctor \n", 278 | "2 Real Madrid (Spain) \n", 279 | "3 How much can you pay for... \n", 280 | "4 Return of the cane 'completely off the agenda' \n", 281 | "\n", 282 | " Top19 \\\n", 283 | "0 The difference between men and women \n", 284 | "1 Irish court halts IRA man's extradition to Nor... \n", 285 | "2 Raja Casablanca (Morocco) \n", 286 | "3 Christmas glitches \n", 287 | "4 From Sleepy Hollow to Greeneland \n", 288 | "\n", 289 | " Top20 \\\n", 290 | "0 Sara Denver, nurse turned solicitor \n", 291 | "1 Burundi peace initiative fades after rebels re... \n", 292 | "2 Corinthians (Brazil) \n", 293 | "3 Upending a table, Chopping a line and Scoring ... 
\n", 294 | "4 Blunkett outlines vision for over 11s \n", 295 | "\n", 296 | " Top21 \\\n", 297 | "0 Diana's landmine crusade put Tories in a panic \n", 298 | "1 PE points the way forward to the ECB \n", 299 | "2 Tony's pet project \n", 300 | "3 Scientific evidence 'unreliable', defence claims \n", 301 | "4 Embattled Dobson attacks 'play now, pay later'... \n", 302 | "\n", 303 | " Top22 \\\n", 304 | "0 Yeltsin's resignation caught opposition flat-f... \n", 305 | "1 Campaigners keep up pressure on Nazi war crime... \n", 306 | "2 Al Nassr (Saudi Arabia) \n", 307 | "3 Fusco wins judicial review in extradition case \n", 308 | "4 Doom and the Dome \n", 309 | "\n", 310 | " Top23 \\\n", 311 | "0 Russian roulette \n", 312 | "1 Jane Ratcliffe \n", 313 | "2 Ideal Holmes show \n", 314 | "3 Rebels thwart Russian advance \n", 315 | "4 What is the north-south divide? \n", 316 | "\n", 317 | " Top24 \\\n", 318 | "0 Sold out \n", 319 | "1 Yet more things you wouldn't know without the ... \n", 320 | "2 Pinochet leaves hospital after tests \n", 321 | "3 Blair orders shake-up of failing NHS \n", 322 | "4 Aitken released from jail \n", 323 | "\n", 324 | " Top25 \n", 325 | "0 Recovering a title \n", 326 | "1 Millennium bug fails to bite \n", 327 | "2 Useful links \n", 328 | "3 Lessons of law's hard heart \n", 329 | "4 Gone aloft \n", 330 | "\n", 331 | "[5 rows x 27 columns]" 332 | ] 333 | }, 334 | "execution_count": 3, 335 | "metadata": {}, 336 | "output_type": "execute_result" 337 | } 338 | ], 339 | "source": [ 340 | "df.head()" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 4, 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "#segregating into train and test on the basis of timestamps\n", 350 | "train = df[df['Date'] < '20150101']\n", 351 | "test = df[df['Date'] > '20141231']" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 5, 357 | "metadata": {}, 358 | "outputs": [ 359 | { 360 | "data": { 361 | "text/html": [ 362 | "
\n", 363 | "\n", 376 | "\n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | "
0123456789...15161718192021222324
0A hindrance to operations extracts from the...ScorecardHughes instant hit buoys BluesJack gets his skates on at ice cold AlexChaos as Maracana builds up for UnitedDepleted Leicester prevail as Elliott spoils E...Hungry Spurs sense rich pickingsGunners so wide of an easy targetDerby raise a glass to Strupar s debut doubleSouthgate strikes Leeds pay the penalty...Flintoff injury piles on woe for EnglandHunters threaten Jospin with new battle of the...Kohl s successor drawn into scandalThe difference between men and womenSara Denver nurse turned solicitorDiana s landmine crusade put Tories in a panicYeltsin s resignation caught opposition flat f...Russian rouletteSold outRecovering a title
1ScorecardThe best lake sceneLeader German sleaze inquiryCheerio boyoThe main recommendationsHas Cubie killed feesHas Cubie killed feesHas Cubie killed feesHopkins furious at Foster s lack of Hannibal...Has Cubie killed fees...On the critical listThe timing of their livesDear doctorIrish court halts IRA man s extradition to Nor...Burundi peace initiative fades after rebels re...PE points the way forward to the ECBCampaigners keep up pressure on Nazi war crime...Jane RatcliffeYet more things you wouldn t know without the ...Millennium bug fails to bite
2Coventry caught on counter by FloUnited s rivals on the road to RioThatcher issues defence before trial by videoPolice help Smith lay down the law at EvertonTale of Trautmann bears two more retellingsEngland on the rackPakistan retaliate with call for video of WalshCullinan continues his Cape monopolyMcGrath puts India out of their miseryBlair Witch bandwagon rolls on...South Melbourne AustraliaNecaxa MexicoReal Madrid SpainRaja Casablanca MoroccoCorinthians BrazilTony s pet projectAl Nassr Saudi ArabiaIdeal Holmes showPinochet leaves hospital after testsUseful links
3Pilgrim knows how to progressThatcher facing banMcIlroy calls for Irish fighting spiritLeicester bin stadium blueprintUnited braced for Mexican waveAuntie back in fashion even if the dress look...Shoaib appeal goes to the topHussain hurt by shambles but lays blame on e...England s decade of disastersRevenge is sweet for jubilant Cronje...Putin admits Yeltsin quit to give him a head s...BBC worst hit as digital TV begins to biteHow much can you pay forChristmas glitchesUpending a table Chopping a line and Scoring ...Scientific evidence unreliable defence claimsFusco wins judicial review in extradition caseRebels thwart Russian advanceBlair orders shake up of failing NHSLessons of law s hard heart
4Hitches and HorlocksBeckham off but United surviveBreast cancer screeningAlan ParkerGuardian readers are you all whingersHollywood BeyondAshes and diamondsWhingers a formidable minorityAlan Parker part twoThuggery Toxins and Ties...Most everywhere UDIsMost wanted Chloe lunettesReturn of the cane completely off the agendaFrom Sleepy Hollow to GreenelandBlunkett outlines vision for over sEmbattled Dobson attacks play now pay later ...Doom and the DomeWhat is the north south divideAitken released from jailGone aloft
\n", 526 | "

5 rows × 25 columns

\n", 527 | "
" 528 | ], 529 | "text/plain": [ 530 | " 0 \\\n", 531 | "0 A hindrance to operations extracts from the... \n", 532 | "1 Scorecard \n", 533 | "2 Coventry caught on counter by Flo \n", 534 | "3 Pilgrim knows how to progress \n", 535 | "4 Hitches and Horlocks \n", 536 | "\n", 537 | " 1 \\\n", 538 | "0 Scorecard \n", 539 | "1 The best lake scene \n", 540 | "2 United s rivals on the road to Rio \n", 541 | "3 Thatcher facing ban \n", 542 | "4 Beckham off but United survive \n", 543 | "\n", 544 | " 2 \\\n", 545 | "0 Hughes instant hit buoys Blues \n", 546 | "1 Leader German sleaze inquiry \n", 547 | "2 Thatcher issues defence before trial by video \n", 548 | "3 McIlroy calls for Irish fighting spirit \n", 549 | "4 Breast cancer screening \n", 550 | "\n", 551 | " 3 \\\n", 552 | "0 Jack gets his skates on at ice cold Alex \n", 553 | "1 Cheerio boyo \n", 554 | "2 Police help Smith lay down the law at Everton \n", 555 | "3 Leicester bin stadium blueprint \n", 556 | "4 Alan Parker \n", 557 | "\n", 558 | " 4 \\\n", 559 | "0 Chaos as Maracana builds up for United \n", 560 | "1 The main recommendations \n", 561 | "2 Tale of Trautmann bears two more retellings \n", 562 | "3 United braced for Mexican wave \n", 563 | "4 Guardian readers are you all whingers \n", 564 | "\n", 565 | " 5 \\\n", 566 | "0 Depleted Leicester prevail as Elliott spoils E... \n", 567 | "1 Has Cubie killed fees \n", 568 | "2 England on the rack \n", 569 | "3 Auntie back in fashion even if the dress look... \n", 570 | "4 Hollywood Beyond \n", 571 | "\n", 572 | " 6 \\\n", 573 | "0 Hungry Spurs sense rich pickings \n", 574 | "1 Has Cubie killed fees \n", 575 | "2 Pakistan retaliate with call for video of Walsh \n", 576 | "3 Shoaib appeal goes to the top \n", 577 | "4 Ashes and diamonds \n", 578 | "\n", 579 | " 7 \\\n", 580 | "0 Gunners so wide of an easy target \n", 581 | "1 Has Cubie killed fees \n", 582 | "2 Cullinan continues his Cape monopoly \n", 583 | "3 Hussain hurt by shambles but lays blame on e... \n", 584 | "4 Whingers a formidable minority \n", 585 | "\n", 586 | " 8 \\\n", 587 | "0 Derby raise a glass to Strupar s debut double \n", 588 | "1 Hopkins furious at Foster s lack of Hannibal... \n", 589 | "2 McGrath puts India out of their misery \n", 590 | "3 England s decade of disasters \n", 591 | "4 Alan Parker part two \n", 592 | "\n", 593 | " 9 ... \\\n", 594 | "0 Southgate strikes Leeds pay the penalty ... \n", 595 | "1 Has Cubie killed fees ... \n", 596 | "2 Blair Witch bandwagon rolls on ... \n", 597 | "3 Revenge is sweet for jubilant Cronje ... \n", 598 | "4 Thuggery Toxins and Ties ... \n", 599 | "\n", 600 | " 15 \\\n", 601 | "0 Flintoff injury piles on woe for England \n", 602 | "1 On the critical list \n", 603 | "2 South Melbourne Australia \n", 604 | "3 Putin admits Yeltsin quit to give him a head s... \n", 605 | "4 Most everywhere UDIs \n", 606 | "\n", 607 | " 16 \\\n", 608 | "0 Hunters threaten Jospin with new battle of the... \n", 609 | "1 The timing of their lives \n", 610 | "2 Necaxa Mexico \n", 611 | "3 BBC worst hit as digital TV begins to bite \n", 612 | "4 Most wanted Chloe lunettes \n", 613 | "\n", 614 | " 17 \\\n", 615 | "0 Kohl s successor drawn into scandal \n", 616 | "1 Dear doctor \n", 617 | "2 Real Madrid Spain \n", 618 | "3 How much can you pay for \n", 619 | "4 Return of the cane completely off the agenda \n", 620 | "\n", 621 | " 18 \\\n", 622 | "0 The difference between men and women \n", 623 | "1 Irish court halts IRA man s extradition to Nor... 
\n", 624 | "2 Raja Casablanca Morocco \n", 625 | "3 Christmas glitches \n", 626 | "4 From Sleepy Hollow to Greeneland \n", 627 | "\n", 628 | " 19 \\\n", 629 | "0 Sara Denver nurse turned solicitor \n", 630 | "1 Burundi peace initiative fades after rebels re... \n", 631 | "2 Corinthians Brazil \n", 632 | "3 Upending a table Chopping a line and Scoring ... \n", 633 | "4 Blunkett outlines vision for over s \n", 634 | "\n", 635 | " 20 \\\n", 636 | "0 Diana s landmine crusade put Tories in a panic \n", 637 | "1 PE points the way forward to the ECB \n", 638 | "2 Tony s pet project \n", 639 | "3 Scientific evidence unreliable defence claims \n", 640 | "4 Embattled Dobson attacks play now pay later ... \n", 641 | "\n", 642 | " 21 \\\n", 643 | "0 Yeltsin s resignation caught opposition flat f... \n", 644 | "1 Campaigners keep up pressure on Nazi war crime... \n", 645 | "2 Al Nassr Saudi Arabia \n", 646 | "3 Fusco wins judicial review in extradition case \n", 647 | "4 Doom and the Dome \n", 648 | "\n", 649 | " 22 \\\n", 650 | "0 Russian roulette \n", 651 | "1 Jane Ratcliffe \n", 652 | "2 Ideal Holmes show \n", 653 | "3 Rebels thwart Russian advance \n", 654 | "4 What is the north south divide \n", 655 | "\n", 656 | " 23 \\\n", 657 | "0 Sold out \n", 658 | "1 Yet more things you wouldn t know without the ... \n", 659 | "2 Pinochet leaves hospital after tests \n", 660 | "3 Blair orders shake up of failing NHS \n", 661 | "4 Aitken released from jail \n", 662 | "\n", 663 | " 24 \n", 664 | "0 Recovering a title \n", 665 | "1 Millennium bug fails to bite \n", 666 | "2 Useful links \n", 667 | "3 Lessons of law s hard heart \n", 668 | "4 Gone aloft \n", 669 | "\n", 670 | "[5 rows x 25 columns]" 671 | ] 672 | }, 673 | "execution_count": 5, 674 | "metadata": {}, 675 | "output_type": "execute_result" 676 | } 677 | ], 678 | "source": [ 679 | "# Removing punctuations\n", 680 | "data=train.iloc[:,2:27]\n", 681 | "data.replace(\"[^a-zA-Z]\",\" \",regex=True, inplace=True)\n", 682 | "\n", 683 | "# Renaming column names for ease of access\n", 684 | "list1= [i for i in range(25)]\n", 685 | "new_Index=[str(i) for i in list1]\n", 686 | "data.columns= new_Index\n", 687 | "data.head(5)\n", 688 | "\n" 689 | ] 690 | }, 691 | { 692 | "cell_type": "code", 693 | "execution_count": 6, 694 | "metadata": {}, 695 | "outputs": [ 696 | { 697 | "data": { 698 | "text/html": [ 699 | "
\n", 700 | "\n", 713 | "\n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | "
0123456789...15161718192021222324
0a hindrance to operations extracts from the...scorecardhughes instant hit buoys bluesjack gets his skates on at ice cold alexchaos as maracana builds up for uniteddepleted leicester prevail as elliott spoils e...hungry spurs sense rich pickingsgunners so wide of an easy targetderby raise a glass to strupar s debut doublesouthgate strikes leeds pay the penalty...flintoff injury piles on woe for englandhunters threaten jospin with new battle of the...kohl s successor drawn into scandalthe difference between men and womensara denver nurse turned solicitordiana s landmine crusade put tories in a panicyeltsin s resignation caught opposition flat f...russian roulettesold outrecovering a title
\n", 767 | "

1 rows × 25 columns

\n", 768 | "
" 769 | ], 770 | "text/plain": [ 771 | " 0 1 \\\n", 772 | "0 a hindrance to operations extracts from the... scorecard \n", 773 | "\n", 774 | " 2 3 \\\n", 775 | "0 hughes instant hit buoys blues jack gets his skates on at ice cold alex \n", 776 | "\n", 777 | " 4 \\\n", 778 | "0 chaos as maracana builds up for united \n", 779 | "\n", 780 | " 5 \\\n", 781 | "0 depleted leicester prevail as elliott spoils e... \n", 782 | "\n", 783 | " 6 7 \\\n", 784 | "0 hungry spurs sense rich pickings gunners so wide of an easy target \n", 785 | "\n", 786 | " 8 \\\n", 787 | "0 derby raise a glass to strupar s debut double \n", 788 | "\n", 789 | " 9 ... \\\n", 790 | "0 southgate strikes leeds pay the penalty ... \n", 791 | "\n", 792 | " 15 \\\n", 793 | "0 flintoff injury piles on woe for england \n", 794 | "\n", 795 | " 16 \\\n", 796 | "0 hunters threaten jospin with new battle of the... \n", 797 | "\n", 798 | " 17 18 \\\n", 799 | "0 kohl s successor drawn into scandal the difference between men and women \n", 800 | "\n", 801 | " 19 \\\n", 802 | "0 sara denver nurse turned solicitor \n", 803 | "\n", 804 | " 20 \\\n", 805 | "0 diana s landmine crusade put tories in a panic \n", 806 | "\n", 807 | " 21 22 \\\n", 808 | "0 yeltsin s resignation caught opposition flat f... russian roulette \n", 809 | "\n", 810 | " 23 24 \n", 811 | "0 sold out recovering a title \n", 812 | "\n", 813 | "[1 rows x 25 columns]" 814 | ] 815 | }, 816 | "execution_count": 6, 817 | "metadata": {}, 818 | "output_type": "execute_result" 819 | } 820 | ], 821 | "source": [ 822 | "# Convertng headlines to lower case\n", 823 | "for index in new_Index:\n", 824 | " data[index]=data[index].str.lower()\n", 825 | "data.head(1)" 826 | ] 827 | }, 828 | { 829 | "cell_type": "code", 830 | "execution_count": 11, 831 | "metadata": {}, 832 | "outputs": [ 833 | { 834 | "data": { 835 | "text/plain": [ 836 | "'scorecard the best lake scene leader german sleaze inquiry cheerio boyo the main recommendations has cubie killed fees has cubie killed fees has cubie killed fees hopkins furious at foster s lack of hannibal appetite has cubie killed fees a tale of two tails i say what i like and i like what i say elbows eyes and nipples task force to assess risk of asteroid collision how i found myself at last on the critical list the timing of their lives dear doctor irish court halts ira man s extradition to northern ireland burundi peace initiative fades after rebels reject mandela as mediator pe points the way forward to the ecb campaigners keep up pressure on nazi war crimes suspect jane ratcliffe yet more things you wouldn t know without the movies millennium bug fails to bite'" 837 | ] 838 | }, 839 | "execution_count": 11, 840 | "metadata": {}, 841 | "output_type": "execute_result" 842 | } 843 | ], 844 | "source": [ 845 | "' '.join(str(x) for x in data.iloc[1,0:25])" 846 | ] 847 | }, 848 | { 849 | "cell_type": "code", 850 | "execution_count": 12, 851 | "metadata": {}, 852 | "outputs": [], 853 | "source": [ 854 | "headlines = []\n", 855 | "for row in range(0,len(data.index)):\n", 856 | " headlines.append(' '.join(str(x) for x in data.iloc[row,0:25]))" 857 | ] 858 | }, 859 | { 860 | "cell_type": "code", 861 | "execution_count": 13, 862 | "metadata": {}, 863 | "outputs": [ 864 | { 865 | "data": { 866 | "text/plain": [ 867 | "'a hindrance to operations extracts from the leaked reports scorecard hughes instant hit buoys blues jack gets his skates on at ice cold alex chaos as maracana builds up for united depleted leicester prevail as elliott spoils everton s party 
hungry spurs sense rich pickings gunners so wide of an easy target derby raise a glass to strupar s debut double southgate strikes leeds pay the penalty hammers hand robson a youthful lesson saints party like it s wear wolves have turned into lambs stump mike catches testy gough s taunt langer escapes to hit flintoff injury piles on woe for england hunters threaten jospin with new battle of the somme kohl s successor drawn into scandal the difference between men and women sara denver nurse turned solicitor diana s landmine crusade put tories in a panic yeltsin s resignation caught opposition flat footed russian roulette sold out recovering a title'" 868 | ] 869 | }, 870 | "execution_count": 13, 871 | "metadata": {}, 872 | "output_type": "execute_result" 873 | } 874 | ], 875 | "source": [ 876 | "headlines[0]" 877 | ] 878 | }, 879 | { 880 | "cell_type": "code", 881 | "execution_count": 14, 882 | "metadata": {}, 883 | "outputs": [], 884 | "source": [ 885 | "from sklearn.feature_extraction.text import CountVectorizer\n", 886 | "from sklearn.ensemble import RandomForestClassifier" 887 | ] 888 | }, 889 | { 890 | "cell_type": "code", 891 | "execution_count": 15, 892 | "metadata": {}, 893 | "outputs": [], 894 | "source": [ 895 | "## implement BAG OF WORDS\n", 896 | "countvector=CountVectorizer(ngram_range=(1,3))\n", 897 | "traindataset=countvector.fit_transform(headlines)" 898 | ] 899 | }, 900 | { 901 | "cell_type": "code", 902 | "execution_count": null, 903 | "metadata": {}, 904 | "outputs": [], 905 | "source": [ 906 | "# implement RandomForest Classifier\n", 907 | "randomclassifier=RandomForestClassifier(n_estimators=200,criterion='entropy')\n", 908 | "randomclassifier.fit(traindataset,train['Label'])" 909 | ] 910 | }, 911 | { 912 | "cell_type": "code", 913 | "execution_count": 35, 914 | "metadata": {}, 915 | "outputs": [], 916 | "source": [ 917 | "## Predict for the Test Dataset\n", 918 | "test_transform= []\n", 919 | "for row in range(0,len(test.index)):\n", 920 | " test_transform.append(' '.join(str(x) for x in test.iloc[row,2:27]))\n", 921 | "test_dataset = countvector.transform(test_transform)\n", 922 | "predictions = randomclassifier.predict(test_dataset)" 923 | ] 924 | }, 925 | { 926 | "cell_type": "code", 927 | "execution_count": 36, 928 | "metadata": {}, 929 | "outputs": [], 930 | "source": [ 931 | "## Import library to check accuracy\n", 932 | "from sklearn.metrics import classification_report,confusion_matrix,accuracy_score" 933 | ] 934 | }, 935 | { 936 | "cell_type": "code", 937 | "execution_count": 37, 938 | "metadata": {}, 939 | "outputs": [ 940 | { 941 | "name": "stdout", 942 | "output_type": "stream", 943 | "text": [ 944 | "[[139 47]\n", 945 | " [ 13 179]]\n", 946 | "0.8412698412698413\n", 947 | " precision recall f1-score support\n", 948 | "\n", 949 | " 0 0.91 0.75 0.82 186\n", 950 | " 1 0.79 0.93 0.86 192\n", 951 | "\n", 952 | " micro avg 0.84 0.84 0.84 378\n", 953 | " macro avg 0.85 0.84 0.84 378\n", 954 | "weighted avg 0.85 0.84 0.84 378\n", 955 | "\n" 956 | ] 957 | } 958 | ], 959 | "source": [ 960 | "matrix=confusion_matrix(test['Label'],predictions)\n", 961 | "print(matrix)\n", 962 | "score=accuracy_score(test['Label'],predictions)\n", 963 | "print(score)\n", 964 | "report=classification_report(test['Label'],predictions)\n", 965 | "print(report)" 966 | ] 967 | }, 968 | { 969 | "cell_type": "code", 970 | "execution_count": null, 971 | "metadata": {}, 972 | "outputs": [], 973 | "source": [] 974 | } 975 | ], 976 | "metadata": { 977 | "kernelspec": { 978 | "display_name": 
"Python 3", 979 | "language": "python", 980 | "name": "python3" 981 | }, 982 | "language_info": { 983 | "codemirror_mode": { 984 | "name": "ipython", 985 | "version": 3 986 | }, 987 | "file_extension": ".py", 988 | "mimetype": "text/x-python", 989 | "name": "python", 990 | "nbconvert_exporter": "python", 991 | "pygments_lexer": "ipython3", 992 | "version": "3.7.4" 993 | } 994 | }, 995 | "nbformat": 4, 996 | "nbformat_minor": 2 997 | } 998 | --------------------------------------------------------------------------------