├── Chinking.py
├── Chunking.py
├── Inestigating_biases.py
├── README.md
├── Save_Classifier_with_pickle.py
├── Senti_analysis.py
├── Senti_analysis_module.py
├── Taining_data.py
├── lemmatizing.py
├── naive_bayes.py
├── named_enti_reco.py
├── nltk_corpora.py
├── nltk_download.py
├── nltk_scikit_learn.py
├── part_speech_tagging.py
├── stemming.py
├── stop_words.py
├── text_classification.py
├── text_word_tokenizer.py
├── wordNet.py
└── word_features.py

/Chinking.py:
--------------------------------------------------------------------------------
import nltk
from nltk.corpus import state_union
# PunktSentenceTokenizer is an unsupervised ML tokenizer. We can train it if we want.
from nltk.tokenize import PunktSentenceTokenizer
from nltk.tokenize import word_tokenize
'''
POS tag list:
Put the words to be tagged here to clean the dataset.
Getting the data tagged this way lets us clean it and do things we otherwise can't.
'''
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            # Chunk everything, then chink (remove) verbs, prepositions,
            # determiners, and "to" back out of the chunks.
            chunkGram = r'''Chunk: {<.*>+}
                                   }<VB.?|IN|DT|TO>+{'''
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            print(chunked)
            chunked.draw()
    except Exception as e:
        print(str(e))

process_content()
--------------------------------------------------------------------------------
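A minimal, self-contained sketch of the same chinking idea, assuming only that nltk is installed (the hand-tagged sentence avoids any corpus or model downloads):

import nltk

tagged = [("The", "DT"), ("quick", "JJ"), ("fox", "NN"),
          ("jumped", "VBD"), ("over", "IN"), ("the", "DT"), ("dog", "NN")]

# {<.*>+} chunks every tag; }<VB.?|IN|DT>+{ chinks the verb, preposition,
# and determiners back out, leaving the noun material inside the chunks.
grammar = r"""Chunk: {<.*>+}
                     }<VB.?|IN|DT>+{"""
tree = nltk.RegexpParser(grammar).parse(tagged)
print(tree)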
/Chunking.py:
--------------------------------------------------------------------------------
import nltk
from nltk.corpus import state_union
# PunktSentenceTokenizer is an unsupervised ML tokenizer. We can train it if we want.
from nltk.tokenize import PunktSentenceTokenizer
from nltk.tokenize import word_tokenize
'''
POS tag list:
Put the words to be tagged here to clean the dataset.
Getting the data tagged this way lets us clean it and do things we otherwise can't.
'''
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
    except Exception as e:
        print(str(e))

process_content()

def process_chunks():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            # Chunk any number of adverbs and verbs, followed by one or more
            # proper nouns and an optional noun.
            chunkGram = r'''Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}'''
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            print(chunked)
            chunked.draw()
    except Exception as e:
        print(str(e))

process_chunks()
--------------------------------------------------------------------------------
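Since chunked.draw() needs a GUI, here is a sketch of pulling the chunk subtrees out of the parse tree programmatically instead; the subtrees() filter on the label is standard nltk.tree API, and the hand-tagged sentence keeps it self-contained:

import nltk

tagged = [("President", "NNP"), ("George", "NNP"), ("Bush", "NNP"),
          ("spoke", "VBD"), ("slowly", "RB"), ("today", "NN")]

grammar = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
chunked = nltk.RegexpParser(grammar).parse(tagged)

# Walk only the Chunk subtrees and join their leaf words back into phrases.
for subtree in chunked.subtrees(filter=lambda t: t.label() == "Chunk"):
    print(" ".join(word for word, tag in subtree.leaves()))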
/Inestigating_biases.py:
--------------------------------------------------------------------------------
import nltk
import random
import pickle
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.corpus import movie_reviews

from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
# SGDClassifier rather than SGDRegressor: these are classification labels.
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

from nltk.classify import ClassifierI
from statistics import mode

class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

all_words = []

for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:3000]

def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

print(find_features(movie_reviews.words('neg/cv000_29416.txt')))

featuresets = [(find_features(rev), category) for (rev, category) in documents]

# Positive data example
training = featuresets[:1900]
testing = featuresets[1900:]

# Negative data example
# training = featuresets[100:]
# testing = featuresets[:100]

classifier = nltk.NaiveBayesClassifier.train(training)

# 1st way: load a previously pickled classifier (this replaces the one trained above)
classifier_f = open("naive_bayes.pickle", "rb")
classifier = pickle.load(classifier_f)
classifier_f.close()

print("Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(classifier, testing))*100)

classifier.show_most_informative_features(20)

# 2nd way: save the trained classifier to disk
'''
save_classifier = open("naive_bayes.pickle", "wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()
'''

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training)
print("Multinomial Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(MNB_classifier, testing))*100)

BRN_classifier = SklearnClassifier(BernoulliNB())
BRN_classifier.train(training)
print("Bernoulli Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(BRN_classifier, testing))*100)

# GaussianNB expects dense arrays, but SklearnClassifier feeds it sparse
# features, so it raises an error here and is left commented out.
# GS_classifier = SklearnClassifier(GaussianNB())
# GS_classifier.train(training)
# print("Gaussian Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(GS_classifier, testing))*100)

logistic_classifier = SklearnClassifier(LogisticRegression())
logistic_classifier.train(training)
print("Logistic Regression Algo Accuracy: ", (nltk.classify.accuracy(logistic_classifier, testing))*100)

#svm_classifier = SklearnClassifier(SVC())
#svm_classifier.train(training)
#print("SVM Algo Accuracy: ", (nltk.classify.accuracy(svm_classifier, testing))*100)

SGD_classifier = SklearnClassifier(SGDClassifier())
SGD_classifier.train(training)
print("SGDClassifier Algo Accuracy: ", (nltk.classify.accuracy(SGD_classifier, testing))*100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training)
print("LinearSVC Algo Accuracy: ", (nltk.classify.accuracy(LinearSVC_classifier, testing))*100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training)
print("NuSVC Algo Accuracy: ", (nltk.classify.accuracy(NuSVC_classifier, testing))*100)


# An odd number of voters avoids ties in mode(); the original list also
# counted the same classifier twice, which skews the vote.
voted_classifier = VoteClassifier(SGD_classifier, LinearSVC_classifier,
                                  NuSVC_classifier, MNB_classifier, classifier,
                                  BRN_classifier, logistic_classifier)
print("Voted Classifier Accuracy: ", (nltk.classify.accuracy(voted_classifier, testing))*100)

print("Classification:", voted_classifier.classify(testing[0][0]), "confidence %: ", voted_classifier.confidence(testing[0][0]))
--------------------------------------------------------------------------------
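The bias being investigated comes from the corpus ordering: movie_reviews lists all negative files before all positive ones, so an unshuffled slice trains on one class and tests on the other. A small sketch of that failure mode with toy labels, using only the standard library:

import random

# Mimic the corpus layout: 1000 negative reviews before 1000 positive ones.
labels = ["neg"] * 1000 + ["pos"] * 1000

# Slicing without shuffling leaves a test set containing only one class.
print(set(labels[1900:]))          # {'pos'}

random.shuffle(labels)
print(labels[1900:].count("pos"))  # roughly 50 - shuffling restores the mix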
/README.md:
--------------------------------------------------------------------------------
# Sentiment-Analysis-for-Trading

Welcome to my sentiment analysis model development.
Currently in progress...
We can't open-source the whole codebase yet because we are currently in talks with companies about collaboration.

Below is a prediction for the NASDAQ using our model. Price is scaled to 1. The model was trained on the past 2 years of data and predicted results for 200 days into the future.
![Predicted Results for NASDAQ](https://i.imgur.com/tWV8CxV.png)
--------------------------------------------------------------------------------
/Save_Classifier_with_pickle.py:
--------------------------------------------------------------------------------
'''
Pickle is the way to save a Python object to disk.

Save your trained algorithm, so that every time you want to use it
you don't need to train it again.
'''

import nltk
import random
from nltk.corpus import movie_reviews
import pickle

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

all_words = []

for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:3000]

def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

print(find_features(movie_reviews.words('neg/cv000_29416.txt')))

featuresets = [(find_features(rev), category) for (rev, category) in documents]

training = featuresets[:1900]
testing = featuresets[1900:]

classifier = nltk.NaiveBayesClassifier.train(training)

# 1st way: load a previously pickled classifier (this replaces the one trained above)
classifier_f = open("naive_bayes.pickle", "rb")
classifier = pickle.load(classifier_f)
classifier_f.close()

print("Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(classifier, testing))*100)

classifier.show_most_informative_features(20)

# 2nd way: save the trained classifier to disk
'''
save_classifier = open("naive_bayes.pickle", "wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()
'''
--------------------------------------------------------------------------------
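The same save/load round trip reads a little safer with context managers, which close the file even if pickling fails. A small sketch, assuming a trained classifier object like the one above (the function names are illustrative):

import pickle

def save_model(clf, path="naive_bayes.pickle"):
    # "wb": pickle writes binary data.
    with open(path, "wb") as f:
        pickle.dump(clf, f)

def load_model(path="naive_bayes.pickle"):
    with open(path, "rb") as f:
        return pickle.load(f)

# save_model(classifier)
# classifier = load_model()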
"neg")) 51 | words = word_tokenize(p) 52 | pos = nltk.pos_tag(words) 53 | for w in pos: 54 | if w[1][0] in allowed_words_types: 55 | all_words.append(w[0].lower()) 56 | 57 | save_documents = open("pickled_algos/document.pickle","wb") 58 | pickle.dump(documents, save_classifier) 59 | save_documents.close() 60 | 61 | all_words = [] 62 | short_pos_words = word_tokenize(short_pos) 63 | short_neg_words = word_tokenize(short_neg) 64 | 65 | for w in short_pos_words: 66 | all_words.append(w.lower()) 67 | 68 | for w in short_neg_words: 69 | all_words.append(w.lower()) 70 | 71 | all_words = nltk.FreqDist(all_words) 72 | 73 | word_features = list(all_words.keys())[:5000] 74 | 75 | def find_features(document): 76 | words = word_tokenize(document) 77 | features = {} 78 | for w in word_features: 79 | features[w] = (w in words) 80 | 81 | return features 82 | 83 | print ((find_features(movie_reviews.words('neg/cv000_29416.txt')))) 84 | 85 | featuresets = [(find_features(rev) , category) for (rev , category) in documents] 86 | 87 | 88 | random.shuffle(featuresets) 89 | #Positive Data example 90 | training = featuresets[:10000] 91 | testing = featuresets[10000:] 92 | 93 | # Negative Data example 94 | # training = featuresets[100:] 95 | # testing = featuresets[:100] 96 | 97 | classifier = nltk.NaiveBayesClassifier.train(training) 98 | 99 | # 1st Way 100 | #classifier_f = open("naive_bayes.pickle" , "rb") 101 | #classifier = pickle.load(classifier_f) 102 | #classifier_f.close() 103 | 104 | print ("Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(classifier, testing))*100) 105 | 106 | classifier.show_most_informative_features(20) 107 | 108 | # 2nd Way 109 | ''' 110 | save_classifier = open("naive_bayes.pickle" , "wb") 111 | pickle.dump(classifier, save_classifier) 112 | save_classifier.close() 113 | ''' 114 | 115 | MNB_classifier = SklearnClassifier(MultinomialNB()) 116 | MNB_classifier.train(training) 117 | print ("Multinomial Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(MNB_classifier, testing))*100) 118 | 119 | BRN_classifier = SklearnClassifier(BernoulliNB()) 120 | BRN_classifier.train(training) 121 | print ("Bernoulli Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(BRN_classifier, testing))*100) 122 | 123 | GS_classifier = SklearnClassifier(GaussianNB()) 124 | GS_classifier.train(training) 125 | print ("Gaussian Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(GS_classifier, testing))*100) 126 | 127 | logistic_classifier = SklearnClassifier(LogisticRegression()) 128 | logistic_classifier.train(training) 129 | print ("Logistic Regression Algo Accuracy: ", (nltk.classify.accuracy(logistic_classifier, testing))*100) 130 | 131 | #svm_classifier = SklearnClassifier(SVC()) 132 | #svm_classifier.train(training) 133 | #print ("SVM Algo Accuracy: ", (nltk.classify.accuracy(GS_classifier, testing))*100) 134 | 135 | SGDRegressor_classifier = SklearnClassifier(SGDRegressor()) 136 | SGDRegressor_classifier.train(training) 137 | print ("SGDRegressor Algo Accuracy: ", (nltk.classify.accuracy(SGDRegressor_classifier, testing))*100) 138 | 139 | LinearSVC_classifier = SklearnClassifier(LinearSVC()) 140 | LinearSVC_classifier.train(training) 141 | print ("LinearSVC Algo Accuracy: ", (nltk.classify.accuracy(LinearSVC_classifier, testing))*100) 142 | 143 | NuSVC_classifier = SklearnClassifier(NuSVC()) 144 | NuSVC_classifier.train(training) 145 | print ("NuSVC Algo Accuracy: ", (nltk.classify.accuracy(NuSVC_classifier, testing))*100) 146 | 147 | 148 | voted_classifier = 
/Taining_data.py:
--------------------------------------------------------------------------------
import nltk
import random
import pickle
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.tokenize import word_tokenize
from nltk.corpus import movie_reviews


from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

from nltk.classify import ClassifierI
from statistics import mode

class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

short_pos = open("short_reviews/positive.txt", "r").read()
short_neg = open("short_reviews/negative.txt", "r").read()

documents = []

for r in short_pos.split('\n'):
    documents.append((r, "pos"))

for r in short_neg.split('\n'):
    documents.append((r, "neg"))

all_words = []
short_pos_words = word_tokenize(short_pos)
short_neg_words = word_tokenize(short_neg)

for w in short_pos_words:
    all_words.append(w.lower())

for w in short_neg_words:
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:5000]

def find_features(document):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

print(find_features(documents[0][0]))

featuresets = [(find_features(rev), category) for (rev, category) in documents]


random.shuffle(featuresets)
# Positive data example
training = featuresets[:10000]
testing = featuresets[10000:]

# Negative data example
# training = featuresets[100:]
# testing = featuresets[:100]

classifier = nltk.NaiveBayesClassifier.train(training)

# 1st way: load a previously pickled classifier
#classifier_f = open("naive_bayes.pickle", "rb")
#classifier = pickle.load(classifier_f)
#classifier_f.close()

print("Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(classifier, testing))*100)

classifier.show_most_informative_features(20)

# 2nd way: save the trained classifier to disk
'''
save_classifier = open("naive_bayes.pickle", "wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()
'''

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training)
print("Multinomial Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(MNB_classifier, testing))*100)

BRN_classifier = SklearnClassifier(BernoulliNB())
BRN_classifier.train(training)
print("Bernoulli Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(BRN_classifier, testing))*100)

# GaussianNB expects dense arrays, but SklearnClassifier feeds it sparse
# features, so it raises an error here and is left commented out.
# GS_classifier = SklearnClassifier(GaussianNB())
# GS_classifier.train(training)
# print("Gaussian Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(GS_classifier, testing))*100)

logistic_classifier = SklearnClassifier(LogisticRegression())
logistic_classifier.train(training)
print("Logistic Regression Algo Accuracy: ", (nltk.classify.accuracy(logistic_classifier, testing))*100)

#svm_classifier = SklearnClassifier(SVC())
#svm_classifier.train(training)
#print("SVM Algo Accuracy: ", (nltk.classify.accuracy(svm_classifier, testing))*100)

SGD_classifier = SklearnClassifier(SGDClassifier())
SGD_classifier.train(training)
print("SGDClassifier Algo Accuracy: ", (nltk.classify.accuracy(SGD_classifier, testing))*100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training)
print("LinearSVC Algo Accuracy: ", (nltk.classify.accuracy(LinearSVC_classifier, testing))*100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training)
print("NuSVC Algo Accuracy: ", (nltk.classify.accuracy(NuSVC_classifier, testing))*100)


# An odd number of voters avoids ties in mode().
voted_classifier = VoteClassifier(SGD_classifier, LinearSVC_classifier,
                                  NuSVC_classifier, MNB_classifier, classifier,
                                  BRN_classifier, logistic_classifier)
print("Voted Classifier Accuracy: ", (nltk.classify.accuracy(voted_classifier, testing))*100)

print("Classification:", voted_classifier.classify(testing[0][0]), "confidence %: ", voted_classifier.confidence(testing[0][0]))
--------------------------------------------------------------------------------
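The eight near-identical train-and-print blocks can collapse into one loop. A sketch, assuming the same training/testing featuresets as above; swapping algorithms in or out then only touches the list:

import nltk
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC, NuSVC

def train_all(training, testing):
    # Wrap, train, and score each sklearn algorithm in turn.
    wrapped = {}
    for name, algo in [("MultinomialNB", MultinomialNB()),
                       ("BernoulliNB", BernoulliNB()),
                       ("LogisticRegression", LogisticRegression()),
                       ("SGDClassifier", SGDClassifier()),
                       ("LinearSVC", LinearSVC()),
                       ("NuSVC", NuSVC())]:
        clf = SklearnClassifier(algo)
        clf.train(training)
        print(name, "Accuracy:", nltk.classify.accuracy(clf, testing) * 100)
        wrapped[name] = clf
    return wrapped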
/lemmatizing.py:
--------------------------------------------------------------------------------
'''
Similar operation to stemming, only the end results will
be real words. The outputs are root words, synonyms of the
original words.
'''
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

print(lemmatizer.lemmatize("cats"))
print(lemmatizer.lemmatize("cacti"))
print(lemmatizer.lemmatize("geese"))
print(lemmatizer.lemmatize("cows"))
print(lemmatizer.lemmatize("dogy"))

# The default part of speech is noun; pass pos="a" to lemmatize as an adjective.
print(lemmatizer.lemmatize("better"))
print(lemmatizer.lemmatize("better", pos="a"))
--------------------------------------------------------------------------------
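A quick side-by-side makes the difference with stemming concrete. A sketch, assuming the wordnet corpus has been downloaded via nltk.download():

from nltk.stem import PorterStemmer, WordNetLemmatizer

ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

for word in ["geese", "running", "better", "cacti"]:
    # The stemmer chops suffixes; the lemmatizer maps to a dictionary form.
    print(word, "->", ps.stem(word), "/", lemmatizer.lemmatize(word))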
pos="a")) 17 | -------------------------------------------------------------------------------- /naive_bayes.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import random 3 | from nltk.corpus import movie_reviews 4 | 5 | 6 | documents = [(list( movie_reviews.words(fileid), category) 7 | for category in movie_reviews.categories() 8 | for fileid in movie_reviews.fileids(category)] 9 | 10 | random.shuffle(documents) 11 | 12 | all_words = [] 13 | 14 | for w in movie_reviews.words(): 15 | all_words.append(w.lower()) 16 | 17 | all_words = nltk.FreqDist(all_words) 18 | 19 | word_features = list(all_words.keys())[:3000] 20 | 21 | def find_features(document): 22 | words = set(document) 23 | features = {} 24 | for w in word_features: 25 | features[w] = (w in words) 26 | 27 | return features 28 | 29 | print ((find_features(movie_reviews.words('neg/cv000_29416.txt')))) 30 | 31 | featuresets = [(find_features(rev) , category) for (rev , category) in documents] 32 | 33 | training = featuresets[:1900] 34 | testing = featuresets[1900:] 35 | 36 | classifier = nltk.NaiveBayesClassifier.train(training) 37 | 38 | print ("Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(classifier, testing))*100) 39 | 40 | classifier.show_most_informative_features(20) 41 | -------------------------------------------------------------------------------- /named_enti_reco.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import state_union 3 | from nltk.tokenize import PunktSentenceTokenizer 4 | 5 | train_text = state_union.raw("2005-GWBush.txt") 6 | test_text= state_union.raw("2006-GWBush.txt") 7 | 8 | custom_sent_tokenizer = PunktSentenceTokenizer(train_text) 9 | tokenized = custom_sent_tokenizer(test_text) 10 | 11 | def process_content(): 12 | try: 13 | for i in tokenized[5:]: 14 | words = nltk.word_tokenize(i) 15 | tagged = nltk.pos_tag(words) 16 | 17 | namedEnt = nltk.ne_chunk(tagged, binary=True) 18 | 19 | namedEnt.draw() 20 | except Exception as e: 21 | print(str(e)) 22 | 23 | process_content() 24 | 25 | ''' 26 | NE Type Examples 27 | 28 | 29 | ''' 30 | -------------------------------------------------------------------------------- /nltk_corpora.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import gutenberg 3 | from nltk.tokenize import sent_tokenize, word_tokenize 4 | from nltk.corpus import stopwords 5 | 6 | print(nltk.__file__) 7 | 8 | stop_words = set(stopwords.words("english")) 9 | 10 | sample = gutenberg.raw("bible-kjv.txt") 11 | tok = sent_tokenize(sample, language='english') 12 | 13 | words = word_tokenize(sample, language='english') 14 | 15 | filtered_words = [w for w in words if not w in stop_words] 16 | print(tok[5:15], sep=' ', end='n', file=sys.stdout, flush=False) 17 | -------------------------------------------------------------------------------- /nltk_download.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | nltk.download() 3 | -------------------------------------------------------------------------------- /nltk_scikit_learn.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import random 3 | import pickle 4 | from nltk.classify.scikitlearn import SklearnClassifier 5 | from nltk.corpus import movie_reviews 6 | 7 | from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB 8 | 
/nltk_corpora.py:
--------------------------------------------------------------------------------
import sys

import nltk
from nltk.corpus import gutenberg
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

print(nltk.__file__)

stop_words = set(stopwords.words("english"))

sample = gutenberg.raw("bible-kjv.txt")
tok = sent_tokenize(sample, language='english')

words = word_tokenize(sample, language='english')

filtered_words = [w for w in words if w not in stop_words]
print(tok[5:15], sep=' ', end='\n', file=sys.stdout, flush=False)
--------------------------------------------------------------------------------
/nltk_download.py:
--------------------------------------------------------------------------------
import nltk
nltk.download()
--------------------------------------------------------------------------------
/nltk_scikit_learn.py:
--------------------------------------------------------------------------------
import nltk
import random
import pickle
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.corpus import movie_reviews

from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
# SGDClassifier rather than SGDRegressor: these are classification labels.
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

all_words = []

for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:3000]

def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

print(find_features(movie_reviews.words('neg/cv000_29416.txt')))

featuresets = [(find_features(rev), category) for (rev, category) in documents]

training = featuresets[:1900]
testing = featuresets[1900:]

classifier = nltk.NaiveBayesClassifier.train(training)

# 1st way: load a previously pickled classifier (this replaces the one trained above)
classifier_f = open("naive_bayes.pickle", "rb")
classifier = pickle.load(classifier_f)
classifier_f.close()

print("Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(classifier, testing))*100)

classifier.show_most_informative_features(20)

# 2nd way: save the trained classifier to disk
'''
save_classifier = open("naive_bayes.pickle", "wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()
'''

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training)
print("Multinomial Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(MNB_classifier, testing))*100)

BRN_classifier = SklearnClassifier(BernoulliNB())
BRN_classifier.train(training)
print("Bernoulli Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(BRN_classifier, testing))*100)

# GaussianNB expects dense arrays, but SklearnClassifier feeds it sparse
# features, so it raises an error here and is left commented out.
# GS_classifier = SklearnClassifier(GaussianNB())
# GS_classifier.train(training)
# print("Gaussian Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(GS_classifier, testing))*100)

logistic_classifier = SklearnClassifier(LogisticRegression())
logistic_classifier.train(training)
print("Logistic Regression Algo Accuracy: ", (nltk.classify.accuracy(logistic_classifier, testing))*100)

svm_classifier = SklearnClassifier(SVC())
svm_classifier.train(training)
print("SVM Algo Accuracy: ", (nltk.classify.accuracy(svm_classifier, testing))*100)

SGD_classifier = SklearnClassifier(SGDClassifier())
SGD_classifier.train(training)
print("SGDClassifier Algo Accuracy: ", (nltk.classify.accuracy(SGD_classifier, testing))*100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training)
print("LinearSVC Algo Accuracy: ", (nltk.classify.accuracy(LinearSVC_classifier, testing))*100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training)
print("NuSVC Algo Accuracy: ", (nltk.classify.accuracy(NuSVC_classifier, testing))*100)
--------------------------------------------------------------------------------
/part_speech_tagging.py:
--------------------------------------------------------------------------------
import nltk
from nltk.corpus import state_union
# PunktSentenceTokenizer is an unsupervised ML tokenizer. We can train it if we want.
from nltk.tokenize import PunktSentenceTokenizer
from nltk.tokenize import word_tokenize
'''
Put the words to be tagged here to clean the dataset.
Getting the data tagged this way lets us clean it and do things we otherwise can't.
'''
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
    except Exception as e:
        print(str(e))

process_content()
--------------------------------------------------------------------------------
/stemming.py:
--------------------------------------------------------------------------------
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer()

example_words = ["python", "pythoner", "pythoning", "pythoned", "pythonly"]

example_text = "I was riding in the car and going too fast. Suddenly it hit the stone and the car flipped over"

for w in example_words:
    print(ps.stem(w))

words = word_tokenize(example_text)

for w in words:
    print(ps.stem(w))
--------------------------------------------------------------------------------
/stop_words.py:
--------------------------------------------------------------------------------
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Print stop words
stop_words = set(stopwords.words("english"))
print(stop_words)

example_text = "This is a general sentence to just clarify if stop words are working or not. I have some awesome projects coming up"

words = word_tokenize(example_text)

filtered_sentence = []
for w in words:
    if w not in stop_words:
        filtered_sentence.append(w)

# Print the filtered sentence
print(filtered_sentence)

# The same filter in one line
filtered_sentence1 = [w for w in words if w not in stop_words]

# Print the filtered sentence
print(filtered_sentence1)
--------------------------------------------------------------------------------
/text_classification.py:
--------------------------------------------------------------------------------
import nltk
from nltk.corpus import movie_reviews
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import random

'''
Text classification can be used for stocks, politics, and other subjects.
'''

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

'''
The same thing as a plain loop:

documents = []

for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((list(movie_reviews.words(fileid)), category))
'''

random.shuffle(documents)

all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

print(all_words.most_common(15))

print(all_words["stupid"])


'''
Use the ML Naive Bayes algorithm to identify positive & negative words.
'''
--------------------------------------------------------------------------------
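Combining the two ideas above, stop word filtering and FreqDist: a sketch of counting only the content words, assuming the stopwords and movie_reviews corpora are downloaded. Without the filter, most_common(15) is dominated by punctuation and function words:

import nltk
from nltk.corpus import movie_reviews, stopwords

stop_words = set(stopwords.words("english"))

# Keep alphabetic non-stopwords so punctuation doesn't dominate the counts.
content_words = [w.lower() for w in movie_reviews.words()
                 if w.isalpha() and w.lower() not in stop_words]

freq = nltk.FreqDist(content_words)
print(freq.most_common(15))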
/text_word_tokenizer.py:
--------------------------------------------------------------------------------
from nltk.tokenize import sent_tokenize, word_tokenize


example_text = "How are you Mr. PyPatel. How is your day? Krishna is watching TV. Cool breeze is going on outside. Having fun learning python and nltk."

# Print separated sentences
print(sent_tokenize(example_text))

# Print separated words
print(word_tokenize(example_text))

# Count the number of words
count = 0
for i in word_tokenize(example_text):
    count += 1

print(count)
--------------------------------------------------------------------------------
/wordNet.py:
--------------------------------------------------------------------------------
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

syns = wordnet.synsets("program")

# Synset
print(syns[0].name())

# Just the word
print(syns[0].lemmas()[0].name())

# Definition
print(syns[0].definition())

# Examples
print(syns[0].examples())

synonyms = []
antonyms = []

for syn in wordnet.synsets("good"):
    for l in syn.lemmas():
        #print("l:", l)
        synonyms.append(l.name())
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())

print(set(synonyms))
print(set(antonyms))


# Word similarity
w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("boat.n.01")
w3 = wordnet.synset("car.n.01")
w4 = wordnet.synset("cat.n.01")

print(w1.wup_similarity(w2))
print(w1.wup_similarity(w3))
print(w1.wup_similarity(w4))


'''
Could create a bot that runs a news channel by scraping news from other sites and
swapping in synonyms to create its own articles. The same trick could be applied to
assignment submissions to fool plagiarism bots.

Google is currently applying this in reverse to find bot websites that are operated
entirely by bots.
'''
--------------------------------------------------------------------------------
/word_features.py:
--------------------------------------------------------------------------------
import nltk
import random
from nltk.corpus import movie_reviews


documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

all_words = []

for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:3000]

def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

print(find_features(movie_reviews.words('neg/cv000_29416.txt')))

featuresets = [(find_features(rev), category) for (rev, category) in documents]
--------------------------------------------------------------------------------
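A toy sketch of the synonym-swapping idea from the wordNet.py comment: replace each word with its first differing WordNet synonym when one exists. Purely illustrative, assuming the wordnet and punkt resources are downloaded:

from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

def swap_synonyms(text):
    out = []
    for word in word_tokenize(text):
        syns = wordnet.synsets(word)
        # The first lemma is often the word itself, so look for a different one.
        names = [l.name() for s in syns for l in s.lemmas()
                 if l.name().lower() != word.lower()]
        out.append(names[0].replace("_", " ") if names else word)
    return " ".join(out)

print(swap_synonyms("The good program runs fast"))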