├── Chinking.py
├── Chunking.py
├── Inestigating_biases.py
├── README.md
├── Save_Classifier_with_pickle.py
├── Senti_analysis.py
├── Senti_analysis_module.py
├── Taining_data.py
├── lemmatizing.py
├── naive_bayes.py
├── named_enti_reco.py
├── nltk_corpora.py
├── nltk_download.py
├── nltk_scikit_learn.py
├── part_speech_tagging.py
├── stemming.py
├── stop_words.py
├── text_classification.py
├── text_word_tokenizer.py
├── wordNet.py
└── word_features.py

/Chinking.py:
--------------------------------------------------------------------------------
import nltk
from nltk.corpus import state_union
# PunktSentenceTokenizer is an unsupervised ML tokenizer. We can train it if we want.
from nltk.tokenize import PunktSentenceTokenizer
from nltk.tokenize import word_tokenize
'''
POS tag list:
Put the words to be tagged here to clean the dataset.
Getting the data tagged this way lets us clean it and do things we otherwise can't.
'''
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            # Chunk everything, then chink (remove) verbs, prepositions,
            # determiners, and "to" back out of the chunks.
            chunkGram = r'''Chunk: {<.*>+}
                                   }<VB.?|IN|DT|TO>+{'''
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            print(chunked)
            chunked.draw()
    except Exception as e:
        print(str(e))

process_content()
--------------------------------------------------------------------------------
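A minimal, self-contained sketch of the same chinking idea, assuming only that nltk is installed (the hand-tagged sentence avoids any corpus or model downloads):

import nltk

tagged = [("The", "DT"), ("quick", "JJ"), ("fox", "NN"),
          ("jumped", "VBD"), ("over", "IN"), ("the", "DT"), ("dog", "NN")]

# {<.*>+} chunks every tag; }<VB.?|IN|DT>+{ chinks the verb, preposition,
# and determiners back out, leaving the noun material inside the chunks.
grammar = r"""Chunk: {<.*>+}
                     }<VB.?|IN|DT>+{"""
tree = nltk.RegexpParser(grammar).parse(tagged)
print(tree)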
/Chunking.py:
--------------------------------------------------------------------------------
import nltk
from nltk.corpus import state_union
# PunktSentenceTokenizer is an unsupervised ML tokenizer. We can train it if we want.
from nltk.tokenize import PunktSentenceTokenizer
from nltk.tokenize import word_tokenize
'''
POS tag list:
Put the words to be tagged here to clean the dataset.
Getting the data tagged this way lets us clean it and do things we otherwise can't.
'''
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
    except Exception as e:
        print(str(e))

process_content()

def process_chunks():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            # Chunk any number of adverbs and verbs, followed by one or more
            # proper nouns and an optional noun.
            chunkGram = r'''Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}'''
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            print(chunked)
            chunked.draw()
    except Exception as e:
        print(str(e))

process_chunks()
--------------------------------------------------------------------------------
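Since chunked.draw() needs a GUI, here is a sketch of pulling the chunk subtrees out of the parse tree programmatically instead; the subtrees() filter on the label is standard nltk.tree API, and the hand-tagged sentence keeps it self-contained:

import nltk

tagged = [("President", "NNP"), ("George", "NNP"), ("Bush", "NNP"),
          ("spoke", "VBD"), ("slowly", "RB"), ("today", "NN")]

grammar = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
chunked = nltk.RegexpParser(grammar).parse(tagged)

# Walk only the Chunk subtrees and join their leaf words back into phrases.
for subtree in chunked.subtrees(filter=lambda t: t.label() == "Chunk"):
    print(" ".join(word for word, tag in subtree.leaves()))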
/Inestigating_biases.py:
--------------------------------------------------------------------------------
import nltk
import random
import pickle
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.corpus import movie_reviews

from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
# SGDClassifier rather than SGDRegressor: these are classification labels.
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

from nltk.classify import ClassifierI
from statistics import mode

class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

all_words = []

for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:3000]

def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

print(find_features(movie_reviews.words('neg/cv000_29416.txt')))

featuresets = [(find_features(rev), category) for (rev, category) in documents]

# Positive data example
training = featuresets[:1900]
testing = featuresets[1900:]

# Negative data example
# training = featuresets[100:]
# testing = featuresets[:100]

classifier = nltk.NaiveBayesClassifier.train(training)

# 1st way: load a previously pickled classifier (this replaces the one trained above)
classifier_f = open("naive_bayes.pickle", "rb")
classifier = pickle.load(classifier_f)
classifier_f.close()

print("Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(classifier, testing))*100)

classifier.show_most_informative_features(20)

# 2nd way: save the trained classifier to disk
'''
save_classifier = open("naive_bayes.pickle", "wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()
'''

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training)
print("Multinomial Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(MNB_classifier, testing))*100)

BRN_classifier = SklearnClassifier(BernoulliNB())
BRN_classifier.train(training)
print("Bernoulli Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(BRN_classifier, testing))*100)

# GaussianNB expects dense arrays, but SklearnClassifier feeds it sparse
# features, so it raises an error here and is left commented out.
# GS_classifier = SklearnClassifier(GaussianNB())
# GS_classifier.train(training)
# print("Gaussian Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(GS_classifier, testing))*100)

logistic_classifier = SklearnClassifier(LogisticRegression())
logistic_classifier.train(training)
print("Logistic Regression Algo Accuracy: ", (nltk.classify.accuracy(logistic_classifier, testing))*100)

#svm_classifier = SklearnClassifier(SVC())
#svm_classifier.train(training)
#print("SVM Algo Accuracy: ", (nltk.classify.accuracy(svm_classifier, testing))*100)

SGD_classifier = SklearnClassifier(SGDClassifier())
SGD_classifier.train(training)
print("SGDClassifier Algo Accuracy: ", (nltk.classify.accuracy(SGD_classifier, testing))*100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training)
print("LinearSVC Algo Accuracy: ", (nltk.classify.accuracy(LinearSVC_classifier, testing))*100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training)
print("NuSVC Algo Accuracy: ", (nltk.classify.accuracy(NuSVC_classifier, testing))*100)


# An odd number of voters avoids ties in mode(); the original list also
# counted the same classifier twice, which skews the vote.
voted_classifier = VoteClassifier(SGD_classifier, LinearSVC_classifier,
                                  NuSVC_classifier, MNB_classifier, classifier,
                                  BRN_classifier, logistic_classifier)
print("Voted Classifier Accuracy: ", (nltk.classify.accuracy(voted_classifier, testing))*100)

print("Classification:", voted_classifier.classify(testing[0][0]), "confidence %: ", voted_classifier.confidence(testing[0][0]))
--------------------------------------------------------------------------------
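The bias being investigated comes from the corpus ordering: movie_reviews lists all negative files before all positive ones, so an unshuffled slice trains on one class and tests on the other. A small sketch of that failure mode with toy labels, using only the standard library:

import random

# Mimic the corpus layout: 1000 negative reviews before 1000 positive ones.
labels = ["neg"] * 1000 + ["pos"] * 1000

# Slicing without shuffling leaves a test set containing only one class.
print(set(labels[1900:]))          # {'pos'}

random.shuffle(labels)
print(labels[1900:].count("pos"))  # roughly 50 - shuffling restores the mix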
/README.md:
--------------------------------------------------------------------------------
# Sentiment-Analysis-for-Trading

Welcome to my sentiment analysis model development.
Currently in progress...
We can't open-source the whole codebase yet because we are currently in talks with companies about collaboration.

Below is a prediction for the NASDAQ using our model. Price is scaled to 1. The model was trained on the past 2 years of data and predicted results for 200 days into the future.
![Predicted Results for NASDAQ](https://i.imgur.com/tWV8CxV.png)
--------------------------------------------------------------------------------
/Save_Classifier_with_pickle.py:
--------------------------------------------------------------------------------
'''
Pickle is the way to save a Python object to disk.

Save your trained algorithm, so that every time you want to use it
you don't need to train it again.
'''

import nltk
import random
from nltk.corpus import movie_reviews
import pickle

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

all_words = []

for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:3000]

def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

print(find_features(movie_reviews.words('neg/cv000_29416.txt')))

featuresets = [(find_features(rev), category) for (rev, category) in documents]

training = featuresets[:1900]
testing = featuresets[1900:]

classifier = nltk.NaiveBayesClassifier.train(training)

# 1st way: load a previously pickled classifier (this replaces the one trained above)
classifier_f = open("naive_bayes.pickle", "rb")
classifier = pickle.load(classifier_f)
classifier_f.close()

print("Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(classifier, testing))*100)

classifier.show_most_informative_features(20)

# 2nd way: save the trained classifier to disk
'''
save_classifier = open("naive_bayes.pickle", "wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()
'''
--------------------------------------------------------------------------------
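The same save/load round trip reads a little safer with context managers, which close the file even if pickling fails. A small sketch, assuming a trained classifier object like the one above (the function names are illustrative):

import pickle

def save_model(clf, path="naive_bayes.pickle"):
    # "wb": pickle writes binary data.
    with open(path, "wb") as f:
        pickle.dump(clf, f)

def load_model(path="naive_bayes.pickle"):
    with open(path, "rb") as f:
        return pickle.load(f)

# save_model(classifier)
# classifier = load_model()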
"neg")) 51 | words = word_tokenize(p) 52 | pos = nltk.pos_tag(words) 53 | for w in pos: 54 | if w[1][0] in allowed_words_types: 55 | all_words.append(w[0].lower()) 56 | 57 | save_documents = open("pickled_algos/document.pickle","wb") 58 | pickle.dump(documents, save_classifier) 59 | save_documents.close() 60 | 61 | all_words = [] 62 | short_pos_words = word_tokenize(short_pos) 63 | short_neg_words = word_tokenize(short_neg) 64 | 65 | for w in short_pos_words: 66 | all_words.append(w.lower()) 67 | 68 | for w in short_neg_words: 69 | all_words.append(w.lower()) 70 | 71 | all_words = nltk.FreqDist(all_words) 72 | 73 | word_features = list(all_words.keys())[:5000] 74 | 75 | def find_features(document): 76 | words = word_tokenize(document) 77 | features = {} 78 | for w in word_features: 79 | features[w] = (w in words) 80 | 81 | return features 82 | 83 | print ((find_features(movie_reviews.words('neg/cv000_29416.txt')))) 84 | 85 | featuresets = [(find_features(rev) , category) for (rev , category) in documents] 86 | 87 | 88 | random.shuffle(featuresets) 89 | #Positive Data example 90 | training = featuresets[:10000] 91 | testing = featuresets[10000:] 92 | 93 | # Negative Data example 94 | # training = featuresets[100:] 95 | # testing = featuresets[:100] 96 | 97 | classifier = nltk.NaiveBayesClassifier.train(training) 98 | 99 | # 1st Way 100 | #classifier_f = open("naive_bayes.pickle" , "rb") 101 | #classifier = pickle.load(classifier_f) 102 | #classifier_f.close() 103 | 104 | print ("Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(classifier, testing))*100) 105 | 106 | classifier.show_most_informative_features(20) 107 | 108 | # 2nd Way 109 | ''' 110 | save_classifier = open("naive_bayes.pickle" , "wb") 111 | pickle.dump(classifier, save_classifier) 112 | save_classifier.close() 113 | ''' 114 | 115 | MNB_classifier = SklearnClassifier(MultinomialNB()) 116 | MNB_classifier.train(training) 117 | print ("Multinomial Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(MNB_classifier, testing))*100) 118 | 119 | BRN_classifier = SklearnClassifier(BernoulliNB()) 120 | BRN_classifier.train(training) 121 | print ("Bernoulli Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(BRN_classifier, testing))*100) 122 | 123 | GS_classifier = SklearnClassifier(GaussianNB()) 124 | GS_classifier.train(training) 125 | print ("Gaussian Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(GS_classifier, testing))*100) 126 | 127 | logistic_classifier = SklearnClassifier(LogisticRegression()) 128 | logistic_classifier.train(training) 129 | print ("Logistic Regression Algo Accuracy: ", (nltk.classify.accuracy(logistic_classifier, testing))*100) 130 | 131 | #svm_classifier = SklearnClassifier(SVC()) 132 | #svm_classifier.train(training) 133 | #print ("SVM Algo Accuracy: ", (nltk.classify.accuracy(GS_classifier, testing))*100) 134 | 135 | SGDRegressor_classifier = SklearnClassifier(SGDRegressor()) 136 | SGDRegressor_classifier.train(training) 137 | print ("SGDRegressor Algo Accuracy: ", (nltk.classify.accuracy(SGDRegressor_classifier, testing))*100) 138 | 139 | LinearSVC_classifier = SklearnClassifier(LinearSVC()) 140 | LinearSVC_classifier.train(training) 141 | print ("LinearSVC Algo Accuracy: ", (nltk.classify.accuracy(LinearSVC_classifier, testing))*100) 142 | 143 | NuSVC_classifier = SklearnClassifier(NuSVC()) 144 | NuSVC_classifier.train(training) 145 | print ("NuSVC Algo Accuracy: ", (nltk.classify.accuracy(NuSVC_classifier, testing))*100) 146 | 147 | 148 | voted_classifier = 
/Taining_data.py:
--------------------------------------------------------------------------------
import nltk
import random
import pickle
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.tokenize import word_tokenize
from nltk.corpus import movie_reviews


from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

from nltk.classify import ClassifierI
from statistics import mode

class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

short_pos = open("short_reviews/positive.txt", "r").read()
short_neg = open("short_reviews/negative.txt", "r").read()

documents = []

for r in short_pos.split('\n'):
    documents.append((r, "pos"))

for r in short_neg.split('\n'):
    documents.append((r, "neg"))

all_words = []
short_pos_words = word_tokenize(short_pos)
short_neg_words = word_tokenize(short_neg)

for w in short_pos_words:
    all_words.append(w.lower())

for w in short_neg_words:
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:5000]

def find_features(document):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

print(find_features(documents[0][0]))

featuresets = [(find_features(rev), category) for (rev, category) in documents]


random.shuffle(featuresets)
# Positive data example
training = featuresets[:10000]
testing = featuresets[10000:]

# Negative data example
# training = featuresets[100:]
# testing = featuresets[:100]

classifier = nltk.NaiveBayesClassifier.train(training)

# 1st way: load a previously pickled classifier
#classifier_f = open("naive_bayes.pickle", "rb")
#classifier = pickle.load(classifier_f)
#classifier_f.close()

print("Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(classifier, testing))*100)

classifier.show_most_informative_features(20)

# 2nd way: save the trained classifier to disk
'''
save_classifier = open("naive_bayes.pickle", "wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()
'''

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training)
print("Multinomial Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(MNB_classifier, testing))*100)

BRN_classifier = SklearnClassifier(BernoulliNB())
BRN_classifier.train(training)
print("Bernoulli Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(BRN_classifier, testing))*100)

# GaussianNB expects dense arrays, but SklearnClassifier feeds it sparse
# features, so it raises an error here and is left commented out.
# GS_classifier = SklearnClassifier(GaussianNB())
# GS_classifier.train(training)
# print("Gaussian Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(GS_classifier, testing))*100)

logistic_classifier = SklearnClassifier(LogisticRegression())
logistic_classifier.train(training)
print("Logistic Regression Algo Accuracy: ", (nltk.classify.accuracy(logistic_classifier, testing))*100)

#svm_classifier = SklearnClassifier(SVC())
#svm_classifier.train(training)
#print("SVM Algo Accuracy: ", (nltk.classify.accuracy(svm_classifier, testing))*100)

SGD_classifier = SklearnClassifier(SGDClassifier())
SGD_classifier.train(training)
print("SGDClassifier Algo Accuracy: ", (nltk.classify.accuracy(SGD_classifier, testing))*100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training)
print("LinearSVC Algo Accuracy: ", (nltk.classify.accuracy(LinearSVC_classifier, testing))*100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training)
print("NuSVC Algo Accuracy: ", (nltk.classify.accuracy(NuSVC_classifier, testing))*100)


# An odd number of voters avoids ties in mode().
voted_classifier = VoteClassifier(SGD_classifier, LinearSVC_classifier,
                                  NuSVC_classifier, MNB_classifier, classifier,
                                  BRN_classifier, logistic_classifier)
print("Voted Classifier Accuracy: ", (nltk.classify.accuracy(voted_classifier, testing))*100)

print("Classification:", voted_classifier.classify(testing[0][0]), "confidence %: ", voted_classifier.confidence(testing[0][0]))
--------------------------------------------------------------------------------
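The eight near-identical train-and-print blocks can collapse into one loop. A sketch, assuming the same training/testing featuresets as above; swapping algorithms in or out then only touches the list:

import nltk
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC, NuSVC

def train_all(training, testing):
    # Wrap, train, and score each sklearn algorithm in turn.
    wrapped = {}
    for name, algo in [("MultinomialNB", MultinomialNB()),
                       ("BernoulliNB", BernoulliNB()),
                       ("LogisticRegression", LogisticRegression()),
                       ("SGDClassifier", SGDClassifier()),
                       ("LinearSVC", LinearSVC()),
                       ("NuSVC", NuSVC())]:
        clf = SklearnClassifier(algo)
        clf.train(training)
        print(name, "Accuracy:", nltk.classify.accuracy(clf, testing) * 100)
        wrapped[name] = clf
    return wrapped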
/lemmatizing.py:
--------------------------------------------------------------------------------
'''
Similar operation to stemming, only the end results will
be real words. The outputs are root words, synonyms of the
original words.
'''
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

print(lemmatizer.lemmatize("cats"))
print(lemmatizer.lemmatize("cacti"))
print(lemmatizer.lemmatize("geese"))
print(lemmatizer.lemmatize("cows"))
print(lemmatizer.lemmatize("dogy"))

# The default part of speech is noun; pass pos="a" to lemmatize as an adjective.
print(lemmatizer.lemmatize("better"))
print(lemmatizer.lemmatize("better", pos="a"))
--------------------------------------------------------------------------------
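A quick side-by-side makes the difference with stemming concrete. A sketch, assuming the wordnet corpus has been downloaded via nltk.download():

from nltk.stem import PorterStemmer, WordNetLemmatizer

ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

for word in ["geese", "running", "better", "cacti"]:
    # The stemmer chops suffixes; the lemmatizer maps to a dictionary form.
    print(word, "->", ps.stem(word), "/", lemmatizer.lemmatize(word))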
pos="a")) 17 | -------------------------------------------------------------------------------- /naive_bayes.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import random 3 | from nltk.corpus import movie_reviews 4 | 5 | 6 | documents = [(list( movie_reviews.words(fileid), category) 7 | for category in movie_reviews.categories() 8 | for fileid in movie_reviews.fileids(category)] 9 | 10 | random.shuffle(documents) 11 | 12 | all_words = [] 13 | 14 | for w in movie_reviews.words(): 15 | all_words.append(w.lower()) 16 | 17 | all_words = nltk.FreqDist(all_words) 18 | 19 | word_features = list(all_words.keys())[:3000] 20 | 21 | def find_features(document): 22 | words = set(document) 23 | features = {} 24 | for w in word_features: 25 | features[w] = (w in words) 26 | 27 | return features 28 | 29 | print ((find_features(movie_reviews.words('neg/cv000_29416.txt')))) 30 | 31 | featuresets = [(find_features(rev) , category) for (rev , category) in documents] 32 | 33 | training = featuresets[:1900] 34 | testing = featuresets[1900:] 35 | 36 | classifier = nltk.NaiveBayesClassifier.train(training) 37 | 38 | print ("Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(classifier, testing))*100) 39 | 40 | classifier.show_most_informative_features(20) 41 | -------------------------------------------------------------------------------- /named_enti_reco.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import state_union 3 | from nltk.tokenize import PunktSentenceTokenizer 4 | 5 | train_text = state_union.raw("2005-GWBush.txt") 6 | test_text= state_union.raw("2006-GWBush.txt") 7 | 8 | custom_sent_tokenizer = PunktSentenceTokenizer(train_text) 9 | tokenized = custom_sent_tokenizer(test_text) 10 | 11 | def process_content(): 12 | try: 13 | for i in tokenized[5:]: 14 | words = nltk.word_tokenize(i) 15 | tagged = nltk.pos_tag(words) 16 | 17 | namedEnt = nltk.ne_chunk(tagged, binary=True) 18 | 19 | namedEnt.draw() 20 | except Exception as e: 21 | print(str(e)) 22 | 23 | process_content() 24 | 25 | ''' 26 | NE Type Examples 27 | 28 | 29 | ''' 30 | -------------------------------------------------------------------------------- /nltk_corpora.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import gutenberg 3 | from nltk.tokenize import sent_tokenize, word_tokenize 4 | from nltk.corpus import stopwords 5 | 6 | print(nltk.__file__) 7 | 8 | stop_words = set(stopwords.words("english")) 9 | 10 | sample = gutenberg.raw("bible-kjv.txt") 11 | tok = sent_tokenize(sample, language='english') 12 | 13 | words = word_tokenize(sample, language='english') 14 | 15 | filtered_words = [w for w in words if not w in stop_words] 16 | print(tok[5:15], sep=' ', end='n', file=sys.stdout, flush=False) 17 | -------------------------------------------------------------------------------- /nltk_download.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | nltk.download() 3 | -------------------------------------------------------------------------------- /nltk_scikit_learn.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import random 3 | import pickle 4 | from nltk.classify.scikitlearn import SklearnClassifier 5 | from nltk.corpus import movie_reviews 6 | 7 | from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB 8 | 
/nltk_corpora.py:
--------------------------------------------------------------------------------
import sys

import nltk
from nltk.corpus import gutenberg
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

print(nltk.__file__)

stop_words = set(stopwords.words("english"))

sample = gutenberg.raw("bible-kjv.txt")
tok = sent_tokenize(sample, language='english')

words = word_tokenize(sample, language='english')

filtered_words = [w for w in words if w not in stop_words]
print(tok[5:15], sep=' ', end='\n', file=sys.stdout, flush=False)
--------------------------------------------------------------------------------
/nltk_download.py:
--------------------------------------------------------------------------------
import nltk
nltk.download()
--------------------------------------------------------------------------------
/nltk_scikit_learn.py:
--------------------------------------------------------------------------------
import nltk
import random
import pickle
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.corpus import movie_reviews

from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
# SGDClassifier rather than SGDRegressor: these are classification labels.
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

all_words = []

for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:3000]

def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

print(find_features(movie_reviews.words('neg/cv000_29416.txt')))

featuresets = [(find_features(rev), category) for (rev, category) in documents]

training = featuresets[:1900]
testing = featuresets[1900:]

classifier = nltk.NaiveBayesClassifier.train(training)

# 1st way: load a previously pickled classifier (this replaces the one trained above)
classifier_f = open("naive_bayes.pickle", "rb")
classifier = pickle.load(classifier_f)
classifier_f.close()

print("Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(classifier, testing))*100)

classifier.show_most_informative_features(20)

# 2nd way: save the trained classifier to disk
'''
save_classifier = open("naive_bayes.pickle", "wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()
'''

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training)
print("Multinomial Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(MNB_classifier, testing))*100)

BRN_classifier = SklearnClassifier(BernoulliNB())
BRN_classifier.train(training)
print("Bernoulli Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(BRN_classifier, testing))*100)

# GaussianNB expects dense arrays, but SklearnClassifier feeds it sparse
# features, so it raises an error here and is left commented out.
# GS_classifier = SklearnClassifier(GaussianNB())
# GS_classifier.train(training)
# print("Gaussian Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(GS_classifier, testing))*100)

logistic_classifier = SklearnClassifier(LogisticRegression())
logistic_classifier.train(training)
print("Logistic Regression Algo Accuracy: ", (nltk.classify.accuracy(logistic_classifier, testing))*100)

svm_classifier = SklearnClassifier(SVC())
svm_classifier.train(training)
print("SVM Algo Accuracy: ", (nltk.classify.accuracy(svm_classifier, testing))*100)

SGD_classifier = SklearnClassifier(SGDClassifier())
SGD_classifier.train(training)
print("SGDClassifier Algo Accuracy: ", (nltk.classify.accuracy(SGD_classifier, testing))*100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training)
print("LinearSVC Algo Accuracy: ", (nltk.classify.accuracy(LinearSVC_classifier, testing))*100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training)
print("NuSVC Algo Accuracy: ", (nltk.classify.accuracy(NuSVC_classifier, testing))*100)
--------------------------------------------------------------------------------
/part_speech_tagging.py:
--------------------------------------------------------------------------------
import nltk
from nltk.corpus import state_union
# PunktSentenceTokenizer is an unsupervised ML tokenizer. We can train it if we want.
from nltk.tokenize import PunktSentenceTokenizer
from nltk.tokenize import word_tokenize
'''
Put the words to be tagged here to clean the dataset.
Getting the data tagged this way lets us clean it and do things we otherwise can't.
'''
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
    except Exception as e:
        print(str(e))

process_content()
--------------------------------------------------------------------------------
/stemming.py:
--------------------------------------------------------------------------------
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer()

example_words = ["python", "pythoner", "pythoning", "pythoned", "pythonly"]

example_text = "I was riding in the car and going too fast. Suddenly it hit the stone and the car flipped over"

for w in example_words:
    print(ps.stem(w))

words = word_tokenize(example_text)

for w in words:
    print(ps.stem(w))
--------------------------------------------------------------------------------
/stop_words.py:
--------------------------------------------------------------------------------
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Print stop words
stop_words = set(stopwords.words("english"))
print(stop_words)

example_text = "This is a general sentence to just clarify if stop words are working or not. I have some awesome projects coming up"

words = word_tokenize(example_text)

filtered_sentence = []
for w in words:
    if w not in stop_words:
        filtered_sentence.append(w)

# Print the filtered sentence
print(filtered_sentence)

# The same filter in one line
filtered_sentence1 = [w for w in words if w not in stop_words]

# Print the filtered sentence
print(filtered_sentence1)
--------------------------------------------------------------------------------
/text_classification.py:
--------------------------------------------------------------------------------
import nltk
from nltk.corpus import movie_reviews
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import random

'''
Text classification can be used for stocks, politics, and other subjects.
'''

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

'''
The same thing as a plain loop:

documents = []

for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((list(movie_reviews.words(fileid)), category))
'''

random.shuffle(documents)

all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

print(all_words.most_common(15))

print(all_words["stupid"])


'''
Use the ML Naive Bayes algorithm to identify positive & negative words.
'''
--------------------------------------------------------------------------------
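Combining the two ideas above, stop word filtering and FreqDist: a sketch of counting only the content words, assuming the stopwords and movie_reviews corpora are downloaded. Without the filter, most_common(15) is dominated by punctuation and function words:

import nltk
from nltk.corpus import movie_reviews, stopwords

stop_words = set(stopwords.words("english"))

# Keep alphabetic non-stopwords so punctuation doesn't dominate the counts.
content_words = [w.lower() for w in movie_reviews.words()
                 if w.isalpha() and w.lower() not in stop_words]

freq = nltk.FreqDist(content_words)
print(freq.most_common(15))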
/text_word_tokenizer.py:
--------------------------------------------------------------------------------
from nltk.tokenize import sent_tokenize, word_tokenize


example_text = "How are you Mr. PyPatel. How is your day? Krishna is watching TV. Cool breeze is going on outside. Having fun learning python and nltk."

# Print separated sentences
print(sent_tokenize(example_text))

# Print separated words
print(word_tokenize(example_text))

# Count the number of words
count = 0
for i in word_tokenize(example_text):
    count += 1

print(count)
--------------------------------------------------------------------------------
/wordNet.py:
--------------------------------------------------------------------------------
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

syns = wordnet.synsets("program")

# Synset
print(syns[0].name())

# Just the word
print(syns[0].lemmas()[0].name())

# Definition
print(syns[0].definition())

# Examples
print(syns[0].examples())

synonyms = []
antonyms = []

for syn in wordnet.synsets("good"):
    for l in syn.lemmas():
        #print("l:", l)
        synonyms.append(l.name())
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())

print(set(synonyms))
print(set(antonyms))


# Word similarity
w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("boat.n.01")
w3 = wordnet.synset("car.n.01")
w4 = wordnet.synset("cat.n.01")

print(w1.wup_similarity(w2))
print(w1.wup_similarity(w3))
print(w1.wup_similarity(w4))


'''
Could create a bot that runs a news channel by scraping news from other sites and
swapping in synonyms to create its own articles. The same trick could be applied to
assignment submissions to fool plagiarism bots.

Google is currently applying this in reverse to find bot websites that are operated
entirely by bots.
'''
--------------------------------------------------------------------------------
/word_features.py:
--------------------------------------------------------------------------------
import nltk
import random
from nltk.corpus import movie_reviews


documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

all_words = []

for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:3000]

def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

print(find_features(movie_reviews.words('neg/cv000_29416.txt')))

featuresets = [(find_features(rev), category) for (rev, category) in documents]
--------------------------------------------------------------------------------
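A toy sketch of the synonym-swapping idea from the wordNet.py comment: replace each word with its first differing WordNet synonym when one exists. Purely illustrative, assuming the wordnet and punkt resources are downloaded:

from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

def swap_synonyms(text):
    out = []
    for word in word_tokenize(text):
        syns = wordnet.synsets(word)
        # The first lemma is often the word itself, so look for a different one.
        names = [l.name() for s in syns for l in s.lemmas()
                 if l.name().lower() != word.lower()]
        out.append(names[0].replace("_", " ") if names else word)
    return " ".join(out)

print(swap_synonyms("The good program runs fast"))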