├── .gitattributes ├── .gitignore ├── BernoulliNB_classifier5k.pickle ├── LinearSVC_classifier5k.pickle ├── LogisticRegression_classifier5k.pickle ├── MNB_classifier5k.pickle ├── README.md ├── SGDC_classifier5k.pickle ├── documents.pickle ├── naivebayes.pickle ├── nltkvid1.py ├── nltkvid10.py ├── nltkvid11.py ├── nltkvid12.py ├── nltkvid13.py ├── nltkvid14.py ├── nltkvid15.py ├── nltkvid16.py ├── nltkvid17.py ├── nltkvid18.py ├── nltkvid19.1.py ├── nltkvid19.2.py ├── nltkvid19.3.py ├── nltkvid2.py ├── nltkvid20.py ├── nltkvid21.py ├── nltkvid3.py ├── nltkvid4.py ├── nltkvid5.py ├── nltkvid6.py ├── nltkvid7.py ├── nltkvid8.py ├── nltkvid9.py ├── originalnaivebayes5k.pickle ├── sentiment_mod.pickle ├── voted_classifier.pickle ├── voted_classifier5k.pickle ├── word_features5k.pickle └── word_features_3000.pickle /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear on external disk 35 | .Spotlight-V100 36 | .Trashes 37 | 38 | # Directories potentially created on remote AFP share 39 | .AppleDB 40 | .AppleDesktop 41 | Network Trash Folder 42 | Temporary Items 43 | .apdisk 44 | -------------------------------------------------------------------------------- /BernoulliNB_classifier5k.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PythonProgramming/NLTK-3----Natural-Language-Processing-with-Python-series/e6dddc9329ef2c52f1e93ff4fa1315876da55b43/BernoulliNB_classifier5k.pickle -------------------------------------------------------------------------------- /LinearSVC_classifier5k.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PythonProgramming/NLTK-3----Natural-Language-Processing-with-Python-series/e6dddc9329ef2c52f1e93ff4fa1315876da55b43/LinearSVC_classifier5k.pickle -------------------------------------------------------------------------------- /LogisticRegression_classifier5k.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PythonProgramming/NLTK-3----Natural-Language-Processing-with-Python-series/e6dddc9329ef2c52f1e93ff4fa1315876da55b43/LogisticRegression_classifier5k.pickle -------------------------------------------------------------------------------- 
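The *.pickle files listed in this tree are trained-classifier artifacts saved by the scripts below (nltkvid14.py and nltkvid19.1.py show how they are written). A minimal sketch of loading one back, assuming the file sits in the working directory:

import pickle

# reload a previously trained classifier from disk;
# nltkvid14.py below does the same thing with explicit open()/close() calls
with open("naivebayes.pickle", "rb") as classifier_f:
    classifier = pickle.load(classifier_f)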
/MNB_classifier5k.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PythonProgramming/NLTK-3----Natural-Language-Processing-with-Python-series/e6dddc9329ef2c52f1e93ff4fa1315876da55b43/MNB_classifier5k.pickle --------------------------------------------------------------------------------
/README.md: -------------------------------------------------------------------------------- 1 | # NLTK-3----Natural-Language-Processing-with-Python-series 2 | Natural Language Processing with Python 3 and NLTK 3 series 3 | http://pythonprogramming.net/tokenizing-words-sentences-nltk-tutorial/ 4 | --------------------------------------------------------------------------------
/SGDC_classifier5k.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PythonProgramming/NLTK-3----Natural-Language-Processing-with-Python-series/e6dddc9329ef2c52f1e93ff4fa1315876da55b43/SGDC_classifier5k.pickle --------------------------------------------------------------------------------
/documents.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PythonProgramming/NLTK-3----Natural-Language-Processing-with-Python-series/e6dddc9329ef2c52f1e93ff4fa1315876da55b43/documents.pickle --------------------------------------------------------------------------------
/naivebayes.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PythonProgramming/NLTK-3----Natural-Language-Processing-with-Python-series/e6dddc9329ef2c52f1e93ff4fa1315876da55b43/naivebayes.pickle --------------------------------------------------------------------------------
/nltkvid1.py: -------------------------------------------------------------------------------- 1 | from nltk.tokenize import sent_tokenize, word_tokenize 2 | 3 | # tokenizing - word tokenizers.... sentence tokenizers 4 | # lexicon and corpora 5 | # corpora - bodies of text. ex: medical journals, presidential speeches, English language 6 | # lexicon - words and their meanings 7 | 8 | # investor-speak.... regular english-speak 9 | 10 | # investor-speak 'bull' = someone who is positive about the market 11 | # english-speak 'bull' = scary animal you don't want running at you 12 | 13 | example_text = "Hello Mr. Smith, how are you doing today? The weather is great and Python is awesome. The sky is pinkish-blue. You should not eat cardboard."
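# A rough sketch of what the two tokenizers return for example_text above
# (illustrative only; exact output depends on the installed punkt models):
# sent_tokenize(example_text)[0]  -> 'Hello Mr. Smith, how are you doing today?'
# word_tokenize(example_text)[:5] -> ['Hello', 'Mr.', 'Smith', ',', 'how']
# Note that the punkt sentence tokenizer does not split on the period in "Mr."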
14 | 15 | ##print(sent_tokenize(example_text)) 16 | ## 17 | ##print(word_tokenize(example_text)) 18 | 19 | 20 | for i in word_tokenize(example_text): 21 | print(i) 22 | -------------------------------------------------------------------------------- /nltkvid10.py: -------------------------------------------------------------------------------- 1 | from nltk.corpus import wordnet 2 | 3 | syns = wordnet.synsets("program") 4 | 5 | #synset 6 | print(syns[0].name()) 7 | 8 | # just the word 9 | print(syns[0].lemmas()[0].name()) 10 | 11 | # definition 12 | print(syns[0].definition()) 13 | 14 | # examples 15 | print(syns[0].examples()) 16 | 17 | 18 | synonyms = [] 19 | antonyms = [] 20 | 21 | for syn in wordnet.synsets("good"): 22 | for l in syn.lemmas(): 23 | synonyms.append(l.name()) 24 | if l.antonyms(): 25 | antonyms.append(l.antonyms()[0].name()) 26 | 27 | print(set(synonyms)) 28 | print(set(antonyms)) 29 | 30 | 31 | w1 = wordnet.synset("ship.n.01") 32 | w2 = wordnet.synset("boat.n.01") 33 | print(w1.wup_similarity(w2)) 34 | 35 | 36 | w1 = wordnet.synset("ship.n.01") 37 | w2 = wordnet.synset("car.n.01") 38 | print(w1.wup_similarity(w2)) 39 | 40 | 41 | w1 = wordnet.synset("ship.n.01") 42 | w2 = wordnet.synset("cactus.n.01") 43 | print(w1.wup_similarity(w2)) 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /nltkvid11.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import random 3 | from nltk.corpus import movie_reviews 4 | 5 | documents = [(list(movie_reviews.words(fileid)), category) 6 | for category in movie_reviews.categories() 7 | for fileid in movie_reviews.fileids(category)] 8 | 9 | random.shuffle(documents) 10 | 11 | print(documents[1]) 12 | 13 | all_words = [] 14 | for w in movie_reviews.words(): 15 | all_words.append(w.lower()) 16 | 17 | all_words = nltk.FreqDist(all_words) 18 | print(all_words.most_common(15)) 19 | print(all_words["stupid"]) 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /nltkvid12.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import random 3 | from nltk.corpus import movie_reviews 4 | 5 | documents = [(list(movie_reviews.words(fileid)), category) 6 | for category in movie_reviews.categories() 7 | for fileid in movie_reviews.fileids(category)] 8 | 9 | random.shuffle(documents) 10 | 11 | all_words = [] 12 | 13 | for w in movie_reviews.words(): 14 | all_words.append(w.lower()) 15 | 16 | all_words = nltk.FreqDist(all_words) 17 | 18 | word_features = list(all_words.keys())[:3000] 19 | 20 | def find_features(document): 21 | words = set(document) 22 | features = {} 23 | for w in word_features: 24 | features[w] = (w in words) 25 | 26 | return features 27 | 28 | print((find_features(movie_reviews.words('neg/cv000_29416.txt')))) 29 | 30 | featuresets = [(find_features(rev), category) for (rev, category) in documents] 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /nltkvid13.py: -------------------------------------------------------------------------------- 1 | 
import nltk 2 | import random 3 | from nltk.corpus import movie_reviews 4 | 5 | documents = [(list(movie_reviews.words(fileid)), category) 6 | for category in movie_reviews.categories() 7 | for fileid in movie_reviews.fileids(category)] 8 | 9 | random.shuffle(documents) 10 | 11 | all_words = [] 12 | 13 | for w in movie_reviews.words(): 14 | all_words.append(w.lower()) 15 | 16 | all_words = nltk.FreqDist(all_words) 17 | 18 | word_features = list(all_words.keys())[:3000] 19 | 20 | def find_features(document): 21 | words = set(document) 22 | features = {} 23 | for w in word_features: 24 | features[w] = (w in words) 25 | 26 | return features 27 | 28 | #print((find_features(movie_reviews.words('neg/cv000_29416.txt')))) 29 | 30 | featuresets = [(find_features(rev), category) for (rev, category) in documents] 31 | 32 | training_set = featuresets[:1900] 33 | testing_set = featuresets[1900:] 34 | 35 | classifier = nltk.NaiveBayesClassifier.train(training_set) 36 | print("Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100) 37 | classifier.show_most_informative_features(15) 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /nltkvid14.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import random 3 | from nltk.corpus import movie_reviews 4 | import pickle 5 | 6 | documents = [(list(movie_reviews.words(fileid)), category) 7 | for category in movie_reviews.categories() 8 | for fileid in movie_reviews.fileids(category)] 9 | 10 | random.shuffle(documents) 11 | 12 | all_words = [] 13 | 14 | for w in movie_reviews.words(): 15 | all_words.append(w.lower()) 16 | 17 | all_words = nltk.FreqDist(all_words) 18 | 19 | word_features = list(all_words.keys())[:3000] 20 | 21 | def find_features(document): 22 | words = set(document) 23 | features = {} 24 | for w in word_features: 25 | features[w] = (w in words) 26 | 27 | return features 28 | 29 | #print((find_features(movie_reviews.words('neg/cv000_29416.txt')))) 30 | 31 | featuresets = [(find_features(rev), category) for (rev, category) in documents] 32 | 33 | training_set = featuresets[:1900] 34 | testing_set = featuresets[1900:] 35 | 36 | #classifier = nltk.NaiveBayesClassifier.train(training_set) 37 | 38 | classifier_f = open("naivebayes.pickle","rb") 39 | classifier = pickle.load(classifier_f) 40 | classifier_f.close() 41 | 42 | 43 | 44 | 45 | print("Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100) 46 | classifier.show_most_informative_features(15) 47 | 48 | ##save_classifier = open("naivebayes.pickle","wb") 49 | ##pickle.dump(classifier, save_classifier) 50 | ##save_classifier.close() 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | -------------------------------------------------------------------------------- /nltkvid15.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import random 3 | from nltk.corpus import movie_reviews 4 | from nltk.classify.scikitlearn import SklearnClassifier 5 | import pickle 6 | 7 | from sklearn.naive_bayes import MultinomialNB, BernoulliNB 8 | from sklearn.linear_model import LogisticRegression, SGDClassifier 9 | from sklearn.svm import SVC, 
LinearSVC, NuSVC 10 | 11 | 12 | documents = [(list(movie_reviews.words(fileid)), category) 13 | for category in movie_reviews.categories() 14 | for fileid in movie_reviews.fileids(category)] 15 | 16 | random.shuffle(documents) 17 | 18 | all_words = [] 19 | 20 | for w in movie_reviews.words(): 21 | all_words.append(w.lower()) 22 | 23 | all_words = nltk.FreqDist(all_words) 24 | 25 | word_features = list(all_words.keys())[:3000] 26 | 27 | def find_features(document): 28 | words = set(document) 29 | features = {} 30 | for w in word_features: 31 | features[w] = (w in words) 32 | 33 | return features 34 | 35 | #print((find_features(movie_reviews.words('neg/cv000_29416.txt')))) 36 | 37 | featuresets = [(find_features(rev), category) for (rev, category) in documents] 38 | 39 | training_set = featuresets[:1900] 40 | testing_set = featuresets[1900:] 41 | 42 | #classifier = nltk.NaiveBayesClassifier.train(training_set) 43 | 44 | classifier_f = open("naivebayes.pickle","rb") 45 | classifier = pickle.load(classifier_f) 46 | classifier_f.close() 47 | 48 | 49 | 50 | 51 | print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100) 52 | classifier.show_most_informative_features(15) 53 | 54 | MNB_classifier = SklearnClassifier(MultinomialNB()) 55 | MNB_classifier.train(training_set) 56 | print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100) 57 | 58 | BernoulliNB_classifier = SklearnClassifier(BernoulliNB()) 59 | BernoulliNB_classifier.train(training_set) 60 | print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100) 61 | 62 | LogisticRegression_classifier = SklearnClassifier(LogisticRegression()) 63 | LogisticRegression_classifier.train(training_set) 64 | print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100) 65 | 66 | SGDClassifier_classifier = SklearnClassifier(SGDClassifier()) 67 | SGDClassifier_classifier.train(training_set) 68 | print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100) 69 | 70 | SVC_classifier = SklearnClassifier(SVC()) 71 | SVC_classifier.train(training_set) 72 | print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100) 73 | 74 | LinearSVC_classifier = SklearnClassifier(LinearSVC()) 75 | LinearSVC_classifier.train(training_set) 76 | print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100) 77 | 78 | NuSVC_classifier = SklearnClassifier(NuSVC()) 79 | NuSVC_classifier.train(training_set) 80 | print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100) 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | -------------------------------------------------------------------------------- /nltkvid16.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import random 3 | from nltk.corpus import movie_reviews 4 | from nltk.classify.scikitlearn import SklearnClassifier 5 | import pickle 6 | 7 | from sklearn.naive_bayes import MultinomialNB, BernoulliNB 8 | from sklearn.linear_model import LogisticRegression, SGDClassifier 9 | from sklearn.svm import SVC, LinearSVC, NuSVC 10 | 11 | from nltk.classify import 
ClassifierI 12 | from statistics import mode 13 | 14 | 15 | class VoteClassifier(ClassifierI): 16 | def __init__(self, *classifiers): 17 | self._classifiers = classifiers 18 | 19 | def classify(self, features): 20 | votes = [] 21 | for c in self._classifiers: 22 | v = c.classify(features) 23 | votes.append(v) 24 | return mode(votes) 25 | 26 | def confidence(self, features): 27 | votes = [] 28 | for c in self._classifiers: 29 | v = c.classify(features) 30 | votes.append(v) 31 | 32 | choice_votes = votes.count(mode(votes)) 33 | conf = choice_votes / len(votes) 34 | return conf 35 | 36 | documents = [(list(movie_reviews.words(fileid)), category) 37 | for category in movie_reviews.categories() 38 | for fileid in movie_reviews.fileids(category)] 39 | 40 | random.shuffle(documents) 41 | 42 | all_words = [] 43 | 44 | for w in movie_reviews.words(): 45 | all_words.append(w.lower()) 46 | 47 | all_words = nltk.FreqDist(all_words) 48 | 49 | word_features = list(all_words.keys())[:3000] 50 | 51 | def find_features(document): 52 | words = set(document) 53 | features = {} 54 | for w in word_features: 55 | features[w] = (w in words) 56 | 57 | return features 58 | 59 | #print((find_features(movie_reviews.words('neg/cv000_29416.txt')))) 60 | 61 | featuresets = [(find_features(rev), category) for (rev, category) in documents] 62 | 63 | training_set = featuresets[:1900] 64 | testing_set = featuresets[1900:] 65 | 66 | #classifier = nltk.NaiveBayesClassifier.train(training_set) 67 | 68 | classifier_f = open("naivebayes.pickle","rb") 69 | classifier = pickle.load(classifier_f) 70 | classifier_f.close() 71 | 72 | 73 | 74 | 75 | print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100) 76 | classifier.show_most_informative_features(15) 77 | 78 | MNB_classifier = SklearnClassifier(MultinomialNB()) 79 | MNB_classifier.train(training_set) 80 | print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100) 81 | 82 | BernoulliNB_classifier = SklearnClassifier(BernoulliNB()) 83 | BernoulliNB_classifier.train(training_set) 84 | print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100) 85 | 86 | LogisticRegression_classifier = SklearnClassifier(LogisticRegression()) 87 | LogisticRegression_classifier.train(training_set) 88 | print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100) 89 | 90 | SGDClassifier_classifier = SklearnClassifier(SGDClassifier()) 91 | SGDClassifier_classifier.train(training_set) 92 | print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100) 93 | 94 | ##SVC_classifier = SklearnClassifier(SVC()) 95 | ##SVC_classifier.train(training_set) 96 | ##print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100) 97 | 98 | LinearSVC_classifier = SklearnClassifier(LinearSVC()) 99 | LinearSVC_classifier.train(training_set) 100 | print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100) 101 | 102 | NuSVC_classifier = SklearnClassifier(NuSVC()) 103 | NuSVC_classifier.train(training_set) 104 | print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100) 105 | 106 | 107 | voted_classifier = VoteClassifier(classifier, 108 | NuSVC_classifier, 109 | LinearSVC_classifier, 110 | SGDClassifier_classifier, 
111 | MNB_classifier, 112 | BernoulliNB_classifier, 113 | LogisticRegression_classifier) 114 | 115 | print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100) 116 | 117 | print("Classification:", voted_classifier.classify(testing_set[0][0]), "Confidence %:",voted_classifier.confidence(testing_set[0][0])*100) 118 | print("Classification:", voted_classifier.classify(testing_set[1][0]), "Confidence %:",voted_classifier.confidence(testing_set[1][0])*100) 119 | print("Classification:", voted_classifier.classify(testing_set[2][0]), "Confidence %:",voted_classifier.confidence(testing_set[2][0])*100) 120 | print("Classification:", voted_classifier.classify(testing_set[3][0]), "Confidence %:",voted_classifier.confidence(testing_set[3][0])*100) 121 | print("Classification:", voted_classifier.classify(testing_set[4][0]), "Confidence %:",voted_classifier.confidence(testing_set[4][0])*100) 122 | print("Classification:", voted_classifier.classify(testing_set[5][0]), "Confidence %:",voted_classifier.confidence(testing_set[5][0])*100) 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | -------------------------------------------------------------------------------- /nltkvid17.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import random 3 | from nltk.corpus import movie_reviews 4 | from nltk.classify.scikitlearn import SklearnClassifier 5 | import pickle 6 | 7 | from sklearn.naive_bayes import MultinomialNB, BernoulliNB 8 | from sklearn.linear_model import LogisticRegression, SGDClassifier 9 | from sklearn.svm import SVC, LinearSVC, NuSVC 10 | 11 | from nltk.classify import ClassifierI 12 | from statistics import mode 13 | 14 | 15 | class VoteClassifier(ClassifierI): 16 | def __init__(self, *classifiers): 17 | self._classifiers = classifiers 18 | 19 | def classify(self, features): 20 | votes = [] 21 | for c in self._classifiers: 22 | v = c.classify(features) 23 | votes.append(v) 24 | return mode(votes) 25 | 26 | def confidence(self, features): 27 | votes = [] 28 | for c in self._classifiers: 29 | v = c.classify(features) 30 | votes.append(v) 31 | 32 | choice_votes = votes.count(mode(votes)) 33 | conf = choice_votes / len(votes) 34 | return conf 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | documents = [(list(movie_reviews.words(fileid)), category) 44 | for category in movie_reviews.categories() 45 | for fileid in movie_reviews.fileids(category)] 46 | 47 | #random.shuffle(documents) 48 | 49 | all_words = [] 50 | 51 | for w in movie_reviews.words(): 52 | all_words.append(w.lower()) 53 | 54 | all_words = nltk.FreqDist(all_words) 55 | 56 | word_features = list(all_words.keys())[:3000] 57 | 58 | def find_features(document): 59 | words = set(document) 60 | features = {} 61 | for w in word_features: 62 | features[w] = (w in words) 63 | 64 | return features 65 | 66 | #print((find_features(movie_reviews.words('neg/cv000_29416.txt')))) 67 | 68 | featuresets = [(find_features(rev), category) for (rev, category) in documents] 69 | 70 | # positive data example: 71 | training_set = featuresets[:1900] 72 | testing_set = featuresets[1900:] 73 | 74 | 75 | # negative data example: 76 | training_set = featuresets[100:] 77 | testing_set = featuresets[:100] 78 | 79 | 80 | #classifier = nltk.NaiveBayesClassifier.train(training_set) 81 | 82 | classifier_f = open("naivebayes.pickle","rb") 83 | classifier = 
pickle.load(classifier_f) 84 | classifier_f.close() 85 | 86 | 87 | 88 | 89 | print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100) 90 | classifier.show_most_informative_features(15) 91 | 92 | MNB_classifier = SklearnClassifier(MultinomialNB()) 93 | MNB_classifier.train(training_set) 94 | print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100) 95 | 96 | BernoulliNB_classifier = SklearnClassifier(BernoulliNB()) 97 | BernoulliNB_classifier.train(training_set) 98 | print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100) 99 | 100 | LogisticRegression_classifier = SklearnClassifier(LogisticRegression()) 101 | LogisticRegression_classifier.train(training_set) 102 | print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100) 103 | 104 | SGDClassifier_classifier = SklearnClassifier(SGDClassifier()) 105 | SGDClassifier_classifier.train(training_set) 106 | print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100) 107 | 108 | ##SVC_classifier = SklearnClassifier(SVC()) 109 | ##SVC_classifier.train(training_set) 110 | ##print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100) 111 | 112 | LinearSVC_classifier = SklearnClassifier(LinearSVC()) 113 | LinearSVC_classifier.train(training_set) 114 | print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100) 115 | 116 | NuSVC_classifier = SklearnClassifier(NuSVC()) 117 | NuSVC_classifier.train(training_set) 118 | print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100) 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | voted_classifier = VoteClassifier( 131 | NuSVC_classifier, 132 | LinearSVC_classifier, 133 | MNB_classifier, 134 | BernoulliNB_classifier, 135 | LogisticRegression_classifier) 136 | 137 | print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100) 138 | 139 | ##print("Classification:", voted_classifier.classify(testing_set[0][0]), "Confidence %:",voted_classifier.confidence(testing_set[0][0])*100) 140 | ##print("Classification:", voted_classifier.classify(testing_set[1][0]), "Confidence %:",voted_classifier.confidence(testing_set[1][0])*100) 141 | ##print("Classification:", voted_classifier.classify(testing_set[2][0]), "Confidence %:",voted_classifier.confidence(testing_set[2][0])*100) 142 | ##print("Classification:", voted_classifier.classify(testing_set[3][0]), "Confidence %:",voted_classifier.confidence(testing_set[3][0])*100) 143 | ##print("Classification:", voted_classifier.classify(testing_set[4][0]), "Confidence %:",voted_classifier.confidence(testing_set[4][0])*100) 144 | ##print("Classification:", voted_classifier.classify(testing_set[5][0]), "Confidence %:",voted_classifier.confidence(testing_set[5][0])*100) 145 | ## 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | -------------------------------------------------------------------------------- /nltkvid18.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import random 3 | from nltk.corpus import movie_reviews 4 | from 
nltk.classify.scikitlearn import SklearnClassifier 5 | import pickle 6 | 7 | from sklearn.naive_bayes import MultinomialNB, BernoulliNB 8 | from sklearn.linear_model import LogisticRegression, SGDClassifier 9 | from sklearn.svm import SVC, LinearSVC, NuSVC 10 | 11 | from nltk.classify import ClassifierI 12 | from statistics import mode 13 | 14 | from nltk.tokenize import word_tokenize 15 | 16 | 17 | class VoteClassifier(ClassifierI): 18 | def __init__(self, *classifiers): 19 | self._classifiers = classifiers 20 | 21 | def classify(self, features): 22 | votes = [] 23 | for c in self._classifiers: 24 | v = c.classify(features) 25 | votes.append(v) 26 | return mode(votes) 27 | 28 | def confidence(self, features): 29 | votes = [] 30 | for c in self._classifiers: 31 | v = c.classify(features) 32 | votes.append(v) 33 | 34 | choice_votes = votes.count(mode(votes)) 35 | conf = choice_votes / len(votes) 36 | return conf 37 | 38 | short_pos = open("short_reviews/positive.txt","r").read() 39 | short_neg = open("short_reviews/negative.txt","r").read() 40 | 41 | documents = [] 42 | 43 | for r in short_pos.split('\n'): 44 | documents.append( (r, "pos") ) 45 | 46 | for r in short_neg.split('\n'): 47 | documents.append( (r, "neg") ) 48 | 49 | 50 | all_words = [] 51 | 52 | short_pos_words = word_tokenize(short_pos) 53 | short_neg_words = word_tokenize(short_neg) 54 | 55 | for w in short_pos_words: 56 | all_words.append(w.lower()) 57 | 58 | for w in short_neg_words: 59 | all_words.append(w.lower()) 60 | 61 | all_words = nltk.FreqDist(all_words) 62 | 63 | word_features = list(all_words.keys())[:5000] 64 | 65 | def find_features(document): 66 | words = word_tokenize(document) 67 | features = {} 68 | for w in word_features: 69 | features[w] = (w in words) 70 | 71 | return features 72 | 73 | #print((find_features(movie_reviews.words('neg/cv000_29416.txt')))) 74 | 75 | featuresets = [(find_features(rev), category) for (rev, category) in documents] 76 | 77 | random.shuffle(featuresets) 78 | 79 | # positive data example: 80 | training_set = featuresets[:10000] 81 | testing_set = featuresets[10000:] 82 | 83 | ## 84 | ### negative data example: 85 | ##training_set = featuresets[100:] 86 | ##testing_set = featuresets[:100] 87 | 88 | 89 | classifier = nltk.NaiveBayesClassifier.train(training_set) 90 | print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100) 91 | classifier.show_most_informative_features(15) 92 | 93 | MNB_classifier = SklearnClassifier(MultinomialNB()) 94 | MNB_classifier.train(training_set) 95 | print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100) 96 | 97 | BernoulliNB_classifier = SklearnClassifier(BernoulliNB()) 98 | BernoulliNB_classifier.train(training_set) 99 | print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100) 100 | 101 | LogisticRegression_classifier = SklearnClassifier(LogisticRegression()) 102 | LogisticRegression_classifier.train(training_set) 103 | print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100) 104 | 105 | SGDClassifier_classifier = SklearnClassifier(SGDClassifier()) 106 | SGDClassifier_classifier.train(training_set) 107 | print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100) 108 | 109 | ##SVC_classifier = SklearnClassifier(SVC()) 110 | ##SVC_classifier.train(training_set) 111 | 
##print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100) 112 | 113 | LinearSVC_classifier = SklearnClassifier(LinearSVC()) 114 | LinearSVC_classifier.train(training_set) 115 | print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100) 116 | 117 | NuSVC_classifier = SklearnClassifier(NuSVC()) 118 | NuSVC_classifier.train(training_set) 119 | print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100) 120 | 121 | 122 | voted_classifier = VoteClassifier( 123 | NuSVC_classifier, 124 | LinearSVC_classifier, 125 | MNB_classifier, 126 | BernoulliNB_classifier, 127 | LogisticRegression_classifier) 128 | 129 | print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100) 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | -------------------------------------------------------------------------------- /nltkvid19.1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import random 3 | #from nltk.corpus import movie_reviews 4 | from nltk.classify.scikitlearn import SklearnClassifier 5 | import pickle 6 | from sklearn.naive_bayes import MultinomialNB, BernoulliNB 7 | from sklearn.linear_model import LogisticRegression, SGDClassifier 8 | from sklearn.svm import SVC, LinearSVC, NuSVC 9 | from nltk.classify import ClassifierI 10 | from statistics import mode 11 | from nltk.tokenize import word_tokenize 12 | 13 | 14 | 15 | class VoteClassifier(ClassifierI): 16 | def __init__(self, *classifiers): 17 | self._classifiers = classifiers 18 | 19 | def classify(self, features): 20 | votes = [] 21 | for c in self._classifiers: 22 | v = c.classify(features) 23 | votes.append(v) 24 | return mode(votes) 25 | 26 | def confidence(self, features): 27 | votes = [] 28 | for c in self._classifiers: 29 | v = c.classify(features) 30 | votes.append(v) 31 | 32 | choice_votes = votes.count(mode(votes)) 33 | conf = choice_votes / len(votes) 34 | return conf 35 | 36 | short_pos = open("short_reviews/positive.txt","r").read() 37 | short_neg = open("short_reviews/negative.txt","r").read() 38 | 39 | # move this up here 40 | all_words = [] 41 | documents = [] 42 | 43 | 44 | # j is adject, r is adverb, and v is verb 45 | #allowed_word_types = ["J","R","V"] 46 | allowed_word_types = ["J"] 47 | 48 | for p in short_pos.split('\n'): 49 | documents.append( (p, "pos") ) 50 | words = word_tokenize(p) 51 | pos = nltk.pos_tag(words) 52 | for w in pos: 53 | if w[1][0] in allowed_word_types: 54 | all_words.append(w[0].lower()) 55 | 56 | 57 | for p in short_neg.split('\n'): 58 | documents.append( (p, "neg") ) 59 | words = word_tokenize(p) 60 | pos = nltk.pos_tag(words) 61 | for w in pos: 62 | if w[1][0] in allowed_word_types: 63 | all_words.append(w[0].lower()) 64 | 65 | 66 | 67 | save_documents = open("pickled_algos/documents.pickle","wb") 68 | pickle.dump(documents, save_documents) 69 | save_documents.close() 70 | 71 | 72 | all_words = nltk.FreqDist(all_words) 73 | 74 | 75 | word_features = list(all_words.keys())[:5000] 76 | 77 | 78 | save_word_features = open("pickled_algos/word_features5k.pickle","wb") 79 | pickle.dump(word_features, save_word_features) 80 | save_word_features.close() 81 | 82 | 83 | def find_features(document): 84 | words = word_tokenize(document) 85 | features = {} 86 | for w in word_features: 87 | 
features[w] = (w in words) 88 | 89 | return features 90 | 91 | featuresets = [(find_features(rev), category) for (rev, category) in documents] 92 | 93 | random.shuffle(featuresets) 94 | print(len(featuresets)) 95 | 96 | testing_set = featuresets[10000:] 97 | training_set = featuresets[:10000] 98 | 99 | 100 | classifier = nltk.NaiveBayesClassifier.train(training_set) 101 | print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100) 102 | classifier.show_most_informative_features(15) 103 | 104 | ############### 105 | save_classifier = open("pickled_algos/originalnaivebayes5k.pickle","wb") 106 | pickle.dump(classifier, save_classifier) 107 | save_classifier.close() 108 | 109 | MNB_classifier = SklearnClassifier(MultinomialNB()) 110 | MNB_classifier.train(training_set) 111 | print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100) 112 | 113 | save_classifier = open("pickled_algos/MNB_classifier5k.pickle","wb") 114 | pickle.dump(MNB_classifier, save_classifier) 115 | save_classifier.close() 116 | 117 | BernoulliNB_classifier = SklearnClassifier(BernoulliNB()) 118 | BernoulliNB_classifier.train(training_set) 119 | print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100) 120 | 121 | save_classifier = open("pickled_algos/BernoulliNB_classifier5k.pickle","wb") 122 | pickle.dump(BernoulliNB_classifier, save_classifier) 123 | save_classifier.close() 124 | 125 | LogisticRegression_classifier = SklearnClassifier(LogisticRegression()) 126 | LogisticRegression_classifier.train(training_set) 127 | print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100) 128 | 129 | save_classifier = open("pickled_algos/LogisticRegression_classifier5k.pickle","wb") 130 | pickle.dump(LogisticRegression_classifier, save_classifier) 131 | save_classifier.close() 132 | 133 | 134 | LinearSVC_classifier = SklearnClassifier(LinearSVC()) 135 | LinearSVC_classifier.train(training_set) 136 | print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100) 137 | 138 | save_classifier = open("pickled_algos/LinearSVC_classifier5k.pickle","wb") 139 | pickle.dump(LinearSVC_classifier, save_classifier) 140 | save_classifier.close() 141 | 142 | 143 | ##NuSVC_classifier = SklearnClassifier(NuSVC()) 144 | ##NuSVC_classifier.train(training_set) 145 | ##print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100) 146 | 147 | 148 | SGDC_classifier = SklearnClassifier(SGDClassifier()) 149 | SGDC_classifier.train(training_set) 150 | print("SGDClassifier accuracy percent:",nltk.classify.accuracy(SGDC_classifier, testing_set)*100) 151 | 152 | save_classifier = open("pickled_algos/SGDC_classifier5k.pickle","wb") 153 | pickle.dump(SGDC_classifier, save_classifier) 154 | save_classifier.close() 155 | 156 | 157 | voted_classifier = VoteClassifier( 158 | classifier, 159 | LinearSVC_classifier, 160 | MNB_classifier, 161 | BernoulliNB_classifier, 162 | LogisticRegression_classifier) 163 | 164 | print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100) 165 | 166 | 167 | 168 | def sentiment(text): 169 | feats = find_features(text) 170 | 171 | return voted_classifier.classify(feats) 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 
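nltkvid19.2.py below loads pickled_algos/featuresets.pickle, but nltkvid19.1.py above never writes that file. A minimal sketch of the missing save step, mirroring the other pickle dumps in nltkvid19.1.py (it assumes featuresets has already been built):

# persist the feature sets so nltkvid19.2.py can load them later
save_featuresets = open("pickled_algos/featuresets.pickle","wb")
pickle.dump(featuresets, save_featuresets)
save_featuresets.close()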
-------------------------------------------------------------------------------- /nltkvid19.2.py: -------------------------------------------------------------------------------- 1 | #File: sentiment_mod.py 2 | 3 | import nltk 4 | import random 5 | #from nltk.corpus import movie_reviews 6 | from nltk.classify.scikitlearn import SklearnClassifier 7 | import pickle 8 | from sklearn.naive_bayes import MultinomialNB, BernoulliNB 9 | from sklearn.linear_model import LogisticRegression, SGDClassifier 10 | from sklearn.svm import SVC, LinearSVC, NuSVC 11 | from nltk.classify import ClassifierI 12 | from statistics import mode 13 | from nltk.tokenize import word_tokenize 14 | 15 | 16 | 17 | class VoteClassifier(ClassifierI): 18 | def __init__(self, *classifiers): 19 | self._classifiers = classifiers 20 | 21 | def classify(self, features): 22 | votes = [] 23 | for c in self._classifiers: 24 | v = c.classify(features) 25 | votes.append(v) 26 | return mode(votes) 27 | 28 | def confidence(self, features): 29 | votes = [] 30 | for c in self._classifiers: 31 | v = c.classify(features) 32 | votes.append(v) 33 | 34 | choice_votes = votes.count(mode(votes)) 35 | conf = choice_votes / len(votes) 36 | return conf 37 | 38 | 39 | documents_f = open("pickled_algos/documents.pickle", "rb") 40 | documents = pickle.load(documents_f) 41 | documents_f.close() 42 | 43 | 44 | 45 | 46 | word_features5k_f = open("pickled_algos/word_features5k.pickle", "rb") 47 | word_features = pickle.load(word_features5k_f) 48 | word_features5k_f.close() 49 | 50 | 51 | def find_features(document): 52 | words = word_tokenize(document) 53 | features = {} 54 | for w in word_features: 55 | features[w] = (w in words) 56 | 57 | return features 58 | 59 | 60 | 61 | featuresets_f = open("pickled_algos/featuresets.pickle", "rb") 62 | featuresets = pickle.load(featuresets_f) 63 | featuresets_f.close() 64 | 65 | random.shuffle(featuresets) 66 | print(len(featuresets)) 67 | 68 | testing_set = featuresets[10000:] 69 | training_set = featuresets[:10000] 70 | 71 | 72 | 73 | open_file = open("pickled_algos/originalnaivebayes5k.pickle", "rb") 74 | classifier = pickle.load(open_file) 75 | open_file.close() 76 | 77 | 78 | open_file = open("pickled_algos/MNB_classifier5k.pickle", "rb") 79 | MNB_classifier = pickle.load(open_file) 80 | open_file.close() 81 | 82 | 83 | 84 | open_file = open("pickled_algos/BernoulliNB_classifier5k.pickle", "rb") 85 | BernoulliNB_classifier = pickle.load(open_file) 86 | open_file.close() 87 | 88 | 89 | open_file = open("pickled_algos/LogisticRegression_classifier5k.pickle", "rb") 90 | LogisticRegression_classifier = pickle.load(open_file) 91 | open_file.close() 92 | 93 | 94 | open_file = open("pickled_algos/LinearSVC_classifier5k.pickle", "rb") 95 | LinearSVC_classifier = pickle.load(open_file) 96 | open_file.close() 97 | 98 | 99 | open_file = open("pickled_algos/SGDC_classifier5k.pickle", "rb") 100 | SGDC_classifier = pickle.load(open_file) 101 | open_file.close() 102 | 103 | 104 | 105 | 106 | voted_classifier = VoteClassifier( 107 | classifier, 108 | LinearSVC_classifier, 109 | MNB_classifier, 110 | BernoulliNB_classifier, 111 | LogisticRegression_classifier) 112 | 113 | 114 | 115 | 116 | def sentiment(text): 117 | feats = find_features(text) 118 | 119 | return voted_classifier.classify(feats),voted_classifier.confidence(feats) 120 | 121 | 122 | 123 | # SAVE ME AS sentiment_mod.py 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 
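# A note on the ensemble above: with five classifiers voting on two labels,
# the winning label always gets 3, 4, or 5 of the votes, so confidence()
# (the second value returned by sentiment()) can only be 0.6, 0.8, or 1.0.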
148 | 149 | 150 | 151 | 152 | 153 | -------------------------------------------------------------------------------- /nltkvid19.3.py: -------------------------------------------------------------------------------- 1 | import sentiment_mod as s 2 | 3 | print(s.sentiment("This movie was awesome! The acting was great, plot was wonderful, and there were pythons...so yea!")) 4 | print(s.sentiment("This movie was utter junk. There were absolutely 0 pythons. I don't see what the point was at all. Horrible movie, 0/10")) 5 | -------------------------------------------------------------------------------- /nltkvid2.py: -------------------------------------------------------------------------------- 1 | from nltk.corpus import stopwords 2 | from nltk.tokenize import word_tokenize 3 | 4 | example_sentence = "This is an example showing off stop word filtration." 5 | stop_words = set(stopwords.words("english")) 6 | 7 | words = word_tokenize(example_sentence) 8 | ## 9 | ##filtered_sentence = [] 10 | ## 11 | ##for w in words: 12 | ## if w not in stop_words: 13 | ## filtered_sentence.append(w) 14 | 15 | filtered_sentence = [w for w in words if not w in stop_words] 16 | 17 | print(filtered_sentence) 18 | -------------------------------------------------------------------------------- /nltkvid20.py: -------------------------------------------------------------------------------- 1 | from tweepy import Stream 2 | from tweepy import OAuthHandler 3 | from tweepy.streaming import StreamListener 4 | import json 5 | import sentiment_mod as s 6 | 7 | 8 | 9 | #consumer key, consumer secret, access token, access secret. 10 | ckey="asdfsafsafsaf" 11 | csecret="asdfasdfsadfsa" 12 | atoken="asdfsadfsafsaf-asdfsaf" 13 | asecret="asdfsadfsadfsadfsadfsad" 14 | 15 | from twitterapistuff import * 16 | 17 | class listener(StreamListener): 18 | 19 | def on_data(self, data): 20 | try: 21 | all_data = json.loads(data) 22 | 23 | tweet = all_data["text"] 24 | sentiment_value, confidence = s.sentiment(tweet) 25 | print(tweet, sentiment_value, confidence) 26 | 27 | if confidence*100 >= 80: 28 | output = open("twitter-out.txt","a") 29 | output.write(sentiment_value) 30 | output.write('\n') 31 | output.close() 32 | 33 | return True 34 | except: 35 | return True 36 | 37 | def on_error(self, status): 38 | print(status) 39 | 40 | auth = OAuthHandler(ckey, csecret) 41 | auth.set_access_token(atoken, asecret) 42 | 43 | twitterStream = Stream(auth, listener()) 44 | twitterStream.filter(track=["happy"]) 45 | -------------------------------------------------------------------------------- /nltkvid21.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import matplotlib.animation as animation 3 | from matplotlib import style 4 | import time 5 | 6 | style.use("ggplot") 7 | 8 | fig = plt.figure() 9 | ax1 = fig.add_subplot(1,1,1) 10 | 11 | def animate(i): 12 | pullData = open("twitter-out.txt","r").read() 13 | lines = pullData.split('\n') 14 | 15 | xar = [] 16 | yar = [] 17 | 18 | x = 0 19 | y = 0 20 | 21 | for l in lines[-200:]: 22 | x += 1 23 | if "pos" in l: 24 | y += 1 25 | elif "neg" in l: 26 | y -= 1 27 | 28 | xar.append(x) 29 | yar.append(y) 30 | 31 | ax1.clear() 32 | ax1.plot(xar,yar) 33 | ani = animation.FuncAnimation(fig, animate, interval=1000) 34 | plt.show() 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /nltkvid3.py: 
-------------------------------------------------------------------------------- 1 | from nltk.stem import PorterStemmer 2 | from nltk.tokenize import word_tokenize 3 | 4 | ps = PorterStemmer() 5 | 6 | example_words = ["python","pythoner","pythoning","pythoned","pythonly"] 7 | 8 | ##for w in example_words: 9 | ## print(ps.stem(w)) 10 | 11 | 12 | new_text = "It is very important to be pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once." 13 | 14 | 15 | words = word_tokenize(new_text) 16 | 17 | for w in words: 18 | print(ps.stem(w)) 19 | --------------------------------------------------------------------------------
/nltkvid4.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import state_union 3 | from nltk.tokenize import PunktSentenceTokenizer 4 | 5 | 6 | 7 | 8 | ''' 9 | POS tag list: 10 | 11 | CC coordinating conjunction 12 | CD cardinal digit 13 | DT determiner 14 | EX existential there (like: "there is" ... think of it like "there exists") 15 | FW foreign word 16 | IN preposition/subordinating conjunction 17 | JJ adjective 'big' 18 | JJR adjective, comparative 'bigger' 19 | JJS adjective, superlative 'biggest' 20 | LS list marker 1) 21 | MD modal could, will 22 | NN noun, singular 'desk' 23 | NNS noun plural 'desks' 24 | NNP proper noun, singular 'Harrison' 25 | NNPS proper noun, plural 'Americans' 26 | PDT predeterminer 'all the kids' 27 | POS possessive ending parent's 28 | PRP personal pronoun I, he, she 29 | PRP$ possessive pronoun my, his, hers 30 | RB adverb very, silently, 31 | RBR adverb, comparative better 32 | RBS adverb, superlative best 33 | RP particle give up 34 | TO to go 'to' the store. 35 | UH interjection errrrrrrrm 36 | VB verb, base form take 37 | VBD verb, past tense took 38 | VBG verb, gerund/present participle taking 39 | VBN verb, past participle taken 40 | VBP verb, sing. present, non-3rd person take 41 | VBZ verb, 3rd person sing. present takes 42 | WDT wh-determiner which 43 | WP wh-pronoun who, what 44 | WP$ possessive wh-pronoun whose 45 | WRB wh-adverb where, when 46 | 47 | ''' 48 | 49 | 50 | 51 | 52 | 53 | train_text = state_union.raw("2005-GWBush.txt") 54 | sample_text = state_union.raw("2006-GWBush.txt") 55 | 56 | custom_sent_tokenizer = PunktSentenceTokenizer(train_text) 57 | tokenized = custom_sent_tokenizer.tokenize(sample_text) 58 | 59 | def process_content(): 60 | try: 61 | for i in tokenized[:5]: 62 | words = nltk.word_tokenize(i) 63 | tagged = nltk.pos_tag(words) 64 | print(tagged) 65 | 66 | except Exception as e: 67 | print(str(e)) 68 | 69 | 70 | process_content() 71 | --------------------------------------------------------------------------------
/nltkvid5.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import state_union 3 | from nltk.tokenize import PunktSentenceTokenizer 4 | 5 | 6 | ''' 7 | POS tag list: 8 | 9 | CC coordinating conjunction 10 | CD cardinal digit 11 | DT determiner 12 | EX existential there (like: "there is" ... think of it like "there exists") 13 | FW foreign word 14 | IN preposition/subordinating conjunction 15 | JJ adjective 'big' 16 | JJR adjective, comparative 'bigger' 17 | JJS adjective, superlative 'biggest' 18 | LS list marker 1) 19 | MD modal could, will 20 | NN noun, singular 'desk' 21 | NNS noun plural 'desks' 22 | NNP proper noun, singular 'Harrison' 23 | NNPS proper noun, plural 'Americans' 24 | PDT predeterminer 'all the kids' 25 | POS possessive ending parent's 26 | PRP personal pronoun I, he, she 27 | PRP$ possessive pronoun my, his, hers 28 | RB adverb very, silently, 29 | RBR adverb, comparative better 30 | RBS adverb, superlative best 31 | RP particle give up 32 | TO to go 'to' the store. 33 | UH interjection errrrrrrrm 34 | VB verb, base form take 35 | VBD verb, past tense took 36 | VBG verb, gerund/present participle taking 37 | VBN verb, past participle taken 38 | VBP verb, sing. present, non-3rd person take 39 | VBZ verb, 3rd person sing. present takes 40 | WDT wh-determiner which 41 | WP wh-pronoun who, what 42 | WP$ possessive wh-pronoun whose 43 | WRB wh-adverb where, when 44 | 45 | ''' 46 | 47 | 48 | 49 | 50 | 51 | train_text = state_union.raw("2005-GWBush.txt") 52 | sample_text = state_union.raw("2006-GWBush.txt") 53 | 54 | custom_sent_tokenizer = PunktSentenceTokenizer(train_text) 55 | 56 | tokenized = custom_sent_tokenizer.tokenize(sample_text) 57 | 58 | def process_content(): 59 | try: 60 | for i in tokenized: 61 | words = nltk.word_tokenize(i) 62 | tagged = nltk.pos_tag(words) 63 | 64 | chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}""" 65 | 66 | chunkParser = nltk.RegexpParser(chunkGram) 67 | chunked = chunkParser.parse(tagged) 68 | 69 | chunked.draw() 70 | 71 | except Exception as e: 72 | print(str(e)) 73 | 74 | 75 | process_content() 76 | --------------------------------------------------------------------------------
/nltkvid6.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import state_union 3 | from nltk.tokenize import PunktSentenceTokenizer 4 | 5 | 6 | ''' 7 | POS tag list: 8 | 9 | CC coordinating conjunction 10 | CD cardinal digit 11 | DT determiner 12 | EX existential there (like: "there is" ... think of it like "there exists") 13 | FW foreign word 14 | IN preposition/subordinating conjunction 15 | JJ adjective 'big' 16 | JJR adjective, comparative 'bigger' 17 | JJS adjective, superlative 'biggest' 18 | LS list marker 1) 19 | MD modal could, will 20 | NN noun, singular 'desk' 21 | NNS noun plural 'desks' 22 | NNP proper noun, singular 'Harrison' 23 | NNPS proper noun, plural 'Americans' 24 | PDT predeterminer 'all the kids' 25 | POS possessive ending parent's 26 | PRP personal pronoun I, he, she 27 | PRP$ possessive pronoun my, his, hers 28 | RB adverb very, silently, 29 | RBR adverb, comparative better 30 | RBS adverb, superlative best 31 | RP particle give up 32 | TO to go 'to' the store. 33 | UH interjection errrrrrrrm 34 | VB verb, base form take 35 | VBD verb, past tense took 36 | VBG verb, gerund/present participle taking 37 | VBN verb, past participle taken 38 | VBP verb, sing. present, non-3rd person take 39 | VBZ verb, 3rd person sing. present takes 40 | WDT wh-determiner which 41 | WP wh-pronoun who, what 42 | WP$ possessive wh-pronoun whose 43 | WRB wh-adverb where, when 44 | 45 | ''' 46 | 47 | 48 | 49 | 50 | 51 | train_text = state_union.raw("2005-GWBush.txt") 52 | sample_text = state_union.raw("2006-GWBush.txt") 53 | 54 | custom_sent_tokenizer = PunktSentenceTokenizer(train_text) 55 | 56 | tokenized = custom_sent_tokenizer.tokenize(sample_text) 57 | 58 | def process_content(): 59 | try: 60 | for i in tokenized[5:]: 61 | words = nltk.word_tokenize(i) 62 | tagged = nltk.pos_tag(words) 63 | 64 | chunkGram = r"""Chunk: {<.*>+} 65 | }<VB.?|IN|DT|TO>+{""" 66 | 67 | chunkParser = nltk.RegexpParser(chunkGram) 68 | chunked = chunkParser.parse(tagged) 69 | 70 | chunked.draw() 71 | 72 | 73 | except Exception as e: 74 | print(str(e)) 75 | 76 | 77 | process_content() 78 | --------------------------------------------------------------------------------
/nltkvid7.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import state_union 3 | from nltk.tokenize import PunktSentenceTokenizer 4 | 5 | 6 | ''' 7 | POS tag list: 8 | 9 | CC coordinating conjunction 10 | CD cardinal digit 11 | DT determiner 12 | EX existential there (like: "there is" ... think of it like "there exists") 13 | FW foreign word 14 | IN preposition/subordinating conjunction 15 | JJ adjective 'big' 16 | JJR adjective, comparative 'bigger' 17 | JJS adjective, superlative 'biggest' 18 | LS list marker 1) 19 | MD modal could, will 20 | NN noun, singular 'desk' 21 | NNS noun plural 'desks' 22 | NNP proper noun, singular 'Harrison' 23 | NNPS proper noun, plural 'Americans' 24 | PDT predeterminer 'all the kids' 25 | POS possessive ending parent's 26 | PRP personal pronoun I, he, she 27 | PRP$ possessive pronoun my, his, hers 28 | RB adverb very, silently, 29 | RBR adverb, comparative better 30 | RBS adverb, superlative best 31 | RP particle give up 32 | TO to go 'to' the store. 33 | UH interjection errrrrrrrm 34 | VB verb, base form take 35 | VBD verb, past tense took 36 | VBG verb, gerund/present participle taking 37 | VBN verb, past participle taken 38 | VBP verb, sing. present, non-3rd person take 39 | VBZ verb, 3rd person sing. present takes 40 | WDT wh-determiner which 41 | WP wh-pronoun who, what 42 | WP$ possessive wh-pronoun whose 43 | WRB wh-adverb where, when 44 | 45 | ''' 46 | 47 | 48 | 49 | 50 | 51 | train_text = state_union.raw("2005-GWBush.txt") 52 | sample_text = state_union.raw("2006-GWBush.txt") 53 | 54 | custom_sent_tokenizer = PunktSentenceTokenizer(train_text) 55 | 56 | tokenized = custom_sent_tokenizer.tokenize(sample_text) 57 | 58 | def process_content(): 59 | try: 60 | for i in tokenized[5:]: 61 | words = nltk.word_tokenize(i) 62 | tagged = nltk.pos_tag(words) 63 | namedEnt = nltk.ne_chunk(tagged, binary=False) 64 | namedEnt.draw() 65 | except Exception as e: 66 | print(str(e)) 67 | 68 | 69 | process_content() 70 | --------------------------------------------------------------------------------
/nltkvid8.py: -------------------------------------------------------------------------------- 1 | 2 | from nltk.stem import WordNetLemmatizer 3 | 4 | lemmatizer = WordNetLemmatizer() 5 | 6 | print(lemmatizer.lemmatize("cats")) 7 | print(lemmatizer.lemmatize("cacti")) 8 | print(lemmatizer.lemmatize("geese")) 9 | print(lemmatizer.lemmatize("rocks")) 10 | print(lemmatizer.lemmatize("python")) 11 | print(lemmatizer.lemmatize("better", pos="a")) 12 | print(lemmatizer.lemmatize("best", pos="a")) 13 | print(lemmatizer.lemmatize("run")) 14 | print(lemmatizer.lemmatize("run",'v')) 15 | 16 | 17 | 18 | 19 | 20 | 21 | --------------------------------------------------------------------------------
/nltkvid9.py: -------------------------------------------------------------------------------- 1 | from nltk.corpus import gutenberg 2 | from nltk.tokenize import sent_tokenize 3 | 4 | sample = gutenberg.raw("bible-kjv.txt") 5 | 6 | tok = sent_tokenize(sample) 7 | 8 | print(tok[5:15]) 9 | --------------------------------------------------------------------------------
/originalnaivebayes5k.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PythonProgramming/NLTK-3----Natural-Language-Processing-with-Python-series/e6dddc9329ef2c52f1e93ff4fa1315876da55b43/originalnaivebayes5k.pickle --------------------------------------------------------------------------------
/sentiment_mod.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PythonProgramming/NLTK-3----Natural-Language-Processing-with-Python-series/e6dddc9329ef2c52f1e93ff4fa1315876da55b43/sentiment_mod.pickle --------------------------------------------------------------------------------
/voted_classifier.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PythonProgramming/NLTK-3----Natural-Language-Processing-with-Python-series/e6dddc9329ef2c52f1e93ff4fa1315876da55b43/voted_classifier.pickle --------------------------------------------------------------------------------
/voted_classifier5k.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PythonProgramming/NLTK-3----Natural-Language-Processing-with-Python-series/e6dddc9329ef2c52f1e93ff4fa1315876da55b43/voted_classifier5k.pickle --------------------------------------------------------------------------------
/word_features5k.pickle: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/PythonProgramming/NLTK-3----Natural-Language-Processing-with-Python-series/e6dddc9329ef2c52f1e93ff4fa1315876da55b43/word_features5k.pickle -------------------------------------------------------------------------------- /word_features_3000.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PythonProgramming/NLTK-3----Natural-Language-Processing-with-Python-series/e6dddc9329ef2c52f1e93ff4fa1315876da55b43/word_features_3000.pickle --------------------------------------------------------------------------------
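nltkvid21.py plots the last 200 sentiment labels that nltkvid20.py appends to twitter-out.txt, one "pos" or "neg" per line. A minimal sketch for exercising that live graph without Twitter credentials, writing fake labels in the same format (the random 50/50 mix is an assumption for testing only):

import random

# seed twitter-out.txt with fake labels so nltkvid21.py has data to animate
with open("twitter-out.txt", "w") as output:
    for _ in range(200):
        output.write(random.choice(["pos", "neg"]) + "\n")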