├── .gitattributes ├── .gitignore ├── BernoulliNB_classifier5k.pickle ├── LinearSVC_classifier5k.pickle ├── LogisticRegression_classifier5k.pickle ├── MNB_classifier5k.pickle ├── README.md ├── SGDC_classifier5k.pickle ├── documents.pickle ├── naivebayes.pickle ├── nltkvid1.py ├── nltkvid10.py ├── nltkvid11.py ├── nltkvid12.py ├── nltkvid13.py ├── nltkvid14.py ├── nltkvid15.py ├── nltkvid16.py ├── nltkvid17.py ├── nltkvid18.py ├── nltkvid19.1.py ├── nltkvid19.2.py ├── nltkvid19.3.py ├── nltkvid2.py ├── nltkvid20.py ├── nltkvid21.py ├── nltkvid3.py ├── nltkvid4.py ├── nltkvid5.py ├── nltkvid6.py ├── nltkvid7.py ├── nltkvid8.py ├── nltkvid9.py ├── originalnaivebayes5k.pickle ├── sentiment_mod.pickle ├── voted_classifier.pickle ├── voted_classifier5k.pickle ├── word_features5k.pickle └── word_features_3000.pickle /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear on external disk 35 | .Spotlight-V100 36 | .Trashes 37 | 38 | # Directories potentially created on remote AFP share 39 | .AppleDB 40 | .AppleDesktop 41 | Network Trash Folder 42 | Temporary Items 43 | .apdisk 44 | -------------------------------------------------------------------------------- /BernoulliNB_classifier5k.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PythonProgramming/NLTK-3----Natural-Language-Processing-with-Python-series/e6dddc9329ef2c52f1e93ff4fa1315876da55b43/BernoulliNB_classifier5k.pickle -------------------------------------------------------------------------------- /LinearSVC_classifier5k.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PythonProgramming/NLTK-3----Natural-Language-Processing-with-Python-series/e6dddc9329ef2c52f1e93ff4fa1315876da55b43/LinearSVC_classifier5k.pickle -------------------------------------------------------------------------------- /LogisticRegression_classifier5k.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PythonProgramming/NLTK-3----Natural-Language-Processing-with-Python-series/e6dddc9329ef2c52f1e93ff4fa1315876da55b43/LogisticRegression_classifier5k.pickle -------------------------------------------------------------------------------- 
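The *.pickle files listed in this tree are trained-classifier artifacts saved by the scripts below (nltkvid14.py and nltkvid19.1.py show how they are written). A minimal sketch of loading one back, assuming the file sits in the working directory:

import pickle

# reload a previously trained classifier from disk;
# nltkvid14.py below does the same thing with explicit open()/close() calls
with open("naivebayes.pickle", "rb") as classifier_f:
    classifier = pickle.load(classifier_f)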
/MNB_classifier5k.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PythonProgramming/NLTK-3----Natural-Language-Processing-with-Python-series/e6dddc9329ef2c52f1e93ff4fa1315876da55b43/MNB_classifier5k.pickle --------------------------------------------------------------------------------
/README.md: -------------------------------------------------------------------------------- 1 | # NLTK-3----Natural-Language-Processing-with-Python-series 2 | Natural Language Processing with Python 3 and NLTK 3 series 3 | http://pythonprogramming.net/tokenizing-words-sentences-nltk-tutorial/ 4 | --------------------------------------------------------------------------------
/SGDC_classifier5k.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PythonProgramming/NLTK-3----Natural-Language-Processing-with-Python-series/e6dddc9329ef2c52f1e93ff4fa1315876da55b43/SGDC_classifier5k.pickle --------------------------------------------------------------------------------
/documents.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PythonProgramming/NLTK-3----Natural-Language-Processing-with-Python-series/e6dddc9329ef2c52f1e93ff4fa1315876da55b43/documents.pickle --------------------------------------------------------------------------------
/naivebayes.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PythonProgramming/NLTK-3----Natural-Language-Processing-with-Python-series/e6dddc9329ef2c52f1e93ff4fa1315876da55b43/naivebayes.pickle --------------------------------------------------------------------------------
/nltkvid1.py: -------------------------------------------------------------------------------- 1 | from nltk.tokenize import sent_tokenize, word_tokenize 2 | 3 | # tokenizing - word tokenizers.... sentence tokenizers 4 | # lexicon and corpora 5 | # corpora - bodies of text. ex: medical journals, presidential speeches, English language 6 | # lexicon - words and their meanings 7 | 8 | # investor-speak.... regular english-speak 9 | 10 | # investor-speak 'bull' = someone who is positive about the market 11 | # english-speak 'bull' = scary animal you don't want running at you 12 | 13 | example_text = "Hello Mr. Smith, how are you doing today? The weather is great and Python is awesome. The sky is pinkish-blue. You should not eat cardboard."
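# A rough sketch of what the two tokenizers return for example_text above
# (illustrative only; exact output depends on the installed punkt models):
# sent_tokenize(example_text)[0]  -> 'Hello Mr. Smith, how are you doing today?'
# word_tokenize(example_text)[:5] -> ['Hello', 'Mr.', 'Smith', ',', 'how']
# Note that the punkt sentence tokenizer does not split on the period in "Mr."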
14 | 15 | ##print(sent_tokenize(example_text)) 16 | ## 17 | ##print(word_tokenize(example_text)) 18 | 19 | 20 | for i in word_tokenize(example_text): 21 | print(i) 22 | -------------------------------------------------------------------------------- /nltkvid10.py: -------------------------------------------------------------------------------- 1 | from nltk.corpus import wordnet 2 | 3 | syns = wordnet.synsets("program") 4 | 5 | #synset 6 | print(syns[0].name()) 7 | 8 | # just the word 9 | print(syns[0].lemmas()[0].name()) 10 | 11 | # definition 12 | print(syns[0].definition()) 13 | 14 | # examples 15 | print(syns[0].examples()) 16 | 17 | 18 | synonyms = [] 19 | antonyms = [] 20 | 21 | for syn in wordnet.synsets("good"): 22 | for l in syn.lemmas(): 23 | synonyms.append(l.name()) 24 | if l.antonyms(): 25 | antonyms.append(l.antonyms()[0].name()) 26 | 27 | print(set(synonyms)) 28 | print(set(antonyms)) 29 | 30 | 31 | w1 = wordnet.synset("ship.n.01") 32 | w2 = wordnet.synset("boat.n.01") 33 | print(w1.wup_similarity(w2)) 34 | 35 | 36 | w1 = wordnet.synset("ship.n.01") 37 | w2 = wordnet.synset("car.n.01") 38 | print(w1.wup_similarity(w2)) 39 | 40 | 41 | w1 = wordnet.synset("ship.n.01") 42 | w2 = wordnet.synset("cactus.n.01") 43 | print(w1.wup_similarity(w2)) 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /nltkvid11.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import random 3 | from nltk.corpus import movie_reviews 4 | 5 | documents = [(list(movie_reviews.words(fileid)), category) 6 | for category in movie_reviews.categories() 7 | for fileid in movie_reviews.fileids(category)] 8 | 9 | random.shuffle(documents) 10 | 11 | print(documents[1]) 12 | 13 | all_words = [] 14 | for w in movie_reviews.words(): 15 | all_words.append(w.lower()) 16 | 17 | all_words = nltk.FreqDist(all_words) 18 | print(all_words.most_common(15)) 19 | print(all_words["stupid"]) 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /nltkvid12.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import random 3 | from nltk.corpus import movie_reviews 4 | 5 | documents = [(list(movie_reviews.words(fileid)), category) 6 | for category in movie_reviews.categories() 7 | for fileid in movie_reviews.fileids(category)] 8 | 9 | random.shuffle(documents) 10 | 11 | all_words = [] 12 | 13 | for w in movie_reviews.words(): 14 | all_words.append(w.lower()) 15 | 16 | all_words = nltk.FreqDist(all_words) 17 | 18 | word_features = list(all_words.keys())[:3000] 19 | 20 | def find_features(document): 21 | words = set(document) 22 | features = {} 23 | for w in word_features: 24 | features[w] = (w in words) 25 | 26 | return features 27 | 28 | print((find_features(movie_reviews.words('neg/cv000_29416.txt')))) 29 | 30 | featuresets = [(find_features(rev), category) for (rev, category) in documents] 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /nltkvid13.py: -------------------------------------------------------------------------------- 1 | 
import nltk 2 | import random 3 | from nltk.corpus import movie_reviews 4 | 5 | documents = [(list(movie_reviews.words(fileid)), category) 6 | for category in movie_reviews.categories() 7 | for fileid in movie_reviews.fileids(category)] 8 | 9 | random.shuffle(documents) 10 | 11 | all_words = [] 12 | 13 | for w in movie_reviews.words(): 14 | all_words.append(w.lower()) 15 | 16 | all_words = nltk.FreqDist(all_words) 17 | 18 | word_features = list(all_words.keys())[:3000] 19 | 20 | def find_features(document): 21 | words = set(document) 22 | features = {} 23 | for w in word_features: 24 | features[w] = (w in words) 25 | 26 | return features 27 | 28 | #print((find_features(movie_reviews.words('neg/cv000_29416.txt')))) 29 | 30 | featuresets = [(find_features(rev), category) for (rev, category) in documents] 31 | 32 | training_set = featuresets[:1900] 33 | testing_set = featuresets[1900:] 34 | 35 | classifier = nltk.NaiveBayesClassifier.train(training_set) 36 | print("Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100) 37 | classifier.show_most_informative_features(15) 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /nltkvid14.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import random 3 | from nltk.corpus import movie_reviews 4 | import pickle 5 | 6 | documents = [(list(movie_reviews.words(fileid)), category) 7 | for category in movie_reviews.categories() 8 | for fileid in movie_reviews.fileids(category)] 9 | 10 | random.shuffle(documents) 11 | 12 | all_words = [] 13 | 14 | for w in movie_reviews.words(): 15 | all_words.append(w.lower()) 16 | 17 | all_words = nltk.FreqDist(all_words) 18 | 19 | word_features = list(all_words.keys())[:3000] 20 | 21 | def find_features(document): 22 | words = set(document) 23 | features = {} 24 | for w in word_features: 25 | features[w] = (w in words) 26 | 27 | return features 28 | 29 | #print((find_features(movie_reviews.words('neg/cv000_29416.txt')))) 30 | 31 | featuresets = [(find_features(rev), category) for (rev, category) in documents] 32 | 33 | training_set = featuresets[:1900] 34 | testing_set = featuresets[1900:] 35 | 36 | #classifier = nltk.NaiveBayesClassifier.train(training_set) 37 | 38 | classifier_f = open("naivebayes.pickle","rb") 39 | classifier = pickle.load(classifier_f) 40 | classifier_f.close() 41 | 42 | 43 | 44 | 45 | print("Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100) 46 | classifier.show_most_informative_features(15) 47 | 48 | ##save_classifier = open("naivebayes.pickle","wb") 49 | ##pickle.dump(classifier, save_classifier) 50 | ##save_classifier.close() 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | -------------------------------------------------------------------------------- /nltkvid15.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import random 3 | from nltk.corpus import movie_reviews 4 | from nltk.classify.scikitlearn import SklearnClassifier 5 | import pickle 6 | 7 | from sklearn.naive_bayes import MultinomialNB, BernoulliNB 8 | from sklearn.linear_model import LogisticRegression, SGDClassifier 9 | from sklearn.svm import SVC, 
LinearSVC, NuSVC 10 | 11 | 12 | documents = [(list(movie_reviews.words(fileid)), category) 13 | for category in movie_reviews.categories() 14 | for fileid in movie_reviews.fileids(category)] 15 | 16 | random.shuffle(documents) 17 | 18 | all_words = [] 19 | 20 | for w in movie_reviews.words(): 21 | all_words.append(w.lower()) 22 | 23 | all_words = nltk.FreqDist(all_words) 24 | 25 | word_features = list(all_words.keys())[:3000] 26 | 27 | def find_features(document): 28 | words = set(document) 29 | features = {} 30 | for w in word_features: 31 | features[w] = (w in words) 32 | 33 | return features 34 | 35 | #print((find_features(movie_reviews.words('neg/cv000_29416.txt')))) 36 | 37 | featuresets = [(find_features(rev), category) for (rev, category) in documents] 38 | 39 | training_set = featuresets[:1900] 40 | testing_set = featuresets[1900:] 41 | 42 | #classifier = nltk.NaiveBayesClassifier.train(training_set) 43 | 44 | classifier_f = open("naivebayes.pickle","rb") 45 | classifier = pickle.load(classifier_f) 46 | classifier_f.close() 47 | 48 | 49 | 50 | 51 | print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100) 52 | classifier.show_most_informative_features(15) 53 | 54 | MNB_classifier = SklearnClassifier(MultinomialNB()) 55 | MNB_classifier.train(training_set) 56 | print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100) 57 | 58 | BernoulliNB_classifier = SklearnClassifier(BernoulliNB()) 59 | BernoulliNB_classifier.train(training_set) 60 | print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100) 61 | 62 | LogisticRegression_classifier = SklearnClassifier(LogisticRegression()) 63 | LogisticRegression_classifier.train(training_set) 64 | print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100) 65 | 66 | SGDClassifier_classifier = SklearnClassifier(SGDClassifier()) 67 | SGDClassifier_classifier.train(training_set) 68 | print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100) 69 | 70 | SVC_classifier = SklearnClassifier(SVC()) 71 | SVC_classifier.train(training_set) 72 | print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100) 73 | 74 | LinearSVC_classifier = SklearnClassifier(LinearSVC()) 75 | LinearSVC_classifier.train(training_set) 76 | print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100) 77 | 78 | NuSVC_classifier = SklearnClassifier(NuSVC()) 79 | NuSVC_classifier.train(training_set) 80 | print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100) 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | -------------------------------------------------------------------------------- /nltkvid16.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import random 3 | from nltk.corpus import movie_reviews 4 | from nltk.classify.scikitlearn import SklearnClassifier 5 | import pickle 6 | 7 | from sklearn.naive_bayes import MultinomialNB, BernoulliNB 8 | from sklearn.linear_model import LogisticRegression, SGDClassifier 9 | from sklearn.svm import SVC, LinearSVC, NuSVC 10 | 11 | from nltk.classify import 
ClassifierI 12 | from statistics import mode 13 | 14 | 15 | class VoteClassifier(ClassifierI): 16 | def __init__(self, *classifiers): 17 | self._classifiers = classifiers 18 | 19 | def classify(self, features): 20 | votes = [] 21 | for c in self._classifiers: 22 | v = c.classify(features) 23 | votes.append(v) 24 | return mode(votes) 25 | 26 | def confidence(self, features): 27 | votes = [] 28 | for c in self._classifiers: 29 | v = c.classify(features) 30 | votes.append(v) 31 | 32 | choice_votes = votes.count(mode(votes)) 33 | conf = choice_votes / len(votes) 34 | return conf 35 | 36 | documents = [(list(movie_reviews.words(fileid)), category) 37 | for category in movie_reviews.categories() 38 | for fileid in movie_reviews.fileids(category)] 39 | 40 | random.shuffle(documents) 41 | 42 | all_words = [] 43 | 44 | for w in movie_reviews.words(): 45 | all_words.append(w.lower()) 46 | 47 | all_words = nltk.FreqDist(all_words) 48 | 49 | word_features = list(all_words.keys())[:3000] 50 | 51 | def find_features(document): 52 | words = set(document) 53 | features = {} 54 | for w in word_features: 55 | features[w] = (w in words) 56 | 57 | return features 58 | 59 | #print((find_features(movie_reviews.words('neg/cv000_29416.txt')))) 60 | 61 | featuresets = [(find_features(rev), category) for (rev, category) in documents] 62 | 63 | training_set = featuresets[:1900] 64 | testing_set = featuresets[1900:] 65 | 66 | #classifier = nltk.NaiveBayesClassifier.train(training_set) 67 | 68 | classifier_f = open("naivebayes.pickle","rb") 69 | classifier = pickle.load(classifier_f) 70 | classifier_f.close() 71 | 72 | 73 | 74 | 75 | print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100) 76 | classifier.show_most_informative_features(15) 77 | 78 | MNB_classifier = SklearnClassifier(MultinomialNB()) 79 | MNB_classifier.train(training_set) 80 | print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100) 81 | 82 | BernoulliNB_classifier = SklearnClassifier(BernoulliNB()) 83 | BernoulliNB_classifier.train(training_set) 84 | print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100) 85 | 86 | LogisticRegression_classifier = SklearnClassifier(LogisticRegression()) 87 | LogisticRegression_classifier.train(training_set) 88 | print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100) 89 | 90 | SGDClassifier_classifier = SklearnClassifier(SGDClassifier()) 91 | SGDClassifier_classifier.train(training_set) 92 | print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100) 93 | 94 | ##SVC_classifier = SklearnClassifier(SVC()) 95 | ##SVC_classifier.train(training_set) 96 | ##print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100) 97 | 98 | LinearSVC_classifier = SklearnClassifier(LinearSVC()) 99 | LinearSVC_classifier.train(training_set) 100 | print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100) 101 | 102 | NuSVC_classifier = SklearnClassifier(NuSVC()) 103 | NuSVC_classifier.train(training_set) 104 | print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100) 105 | 106 | 107 | voted_classifier = VoteClassifier(classifier, 108 | NuSVC_classifier, 109 | LinearSVC_classifier, 110 | SGDClassifier_classifier, 
111 | MNB_classifier, 112 | BernoulliNB_classifier, 113 | LogisticRegression_classifier) 114 | 115 | print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100) 116 | 117 | print("Classification:", voted_classifier.classify(testing_set[0][0]), "Confidence %:",voted_classifier.confidence(testing_set[0][0])*100) 118 | print("Classification:", voted_classifier.classify(testing_set[1][0]), "Confidence %:",voted_classifier.confidence(testing_set[1][0])*100) 119 | print("Classification:", voted_classifier.classify(testing_set[2][0]), "Confidence %:",voted_classifier.confidence(testing_set[2][0])*100) 120 | print("Classification:", voted_classifier.classify(testing_set[3][0]), "Confidence %:",voted_classifier.confidence(testing_set[3][0])*100) 121 | print("Classification:", voted_classifier.classify(testing_set[4][0]), "Confidence %:",voted_classifier.confidence(testing_set[4][0])*100) 122 | print("Classification:", voted_classifier.classify(testing_set[5][0]), "Confidence %:",voted_classifier.confidence(testing_set[5][0])*100) 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | -------------------------------------------------------------------------------- /nltkvid17.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import random 3 | from nltk.corpus import movie_reviews 4 | from nltk.classify.scikitlearn import SklearnClassifier 5 | import pickle 6 | 7 | from sklearn.naive_bayes import MultinomialNB, BernoulliNB 8 | from sklearn.linear_model import LogisticRegression, SGDClassifier 9 | from sklearn.svm import SVC, LinearSVC, NuSVC 10 | 11 | from nltk.classify import ClassifierI 12 | from statistics import mode 13 | 14 | 15 | class VoteClassifier(ClassifierI): 16 | def __init__(self, *classifiers): 17 | self._classifiers = classifiers 18 | 19 | def classify(self, features): 20 | votes = [] 21 | for c in self._classifiers: 22 | v = c.classify(features) 23 | votes.append(v) 24 | return mode(votes) 25 | 26 | def confidence(self, features): 27 | votes = [] 28 | for c in self._classifiers: 29 | v = c.classify(features) 30 | votes.append(v) 31 | 32 | choice_votes = votes.count(mode(votes)) 33 | conf = choice_votes / len(votes) 34 | return conf 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | documents = [(list(movie_reviews.words(fileid)), category) 44 | for category in movie_reviews.categories() 45 | for fileid in movie_reviews.fileids(category)] 46 | 47 | #random.shuffle(documents) 48 | 49 | all_words = [] 50 | 51 | for w in movie_reviews.words(): 52 | all_words.append(w.lower()) 53 | 54 | all_words = nltk.FreqDist(all_words) 55 | 56 | word_features = list(all_words.keys())[:3000] 57 | 58 | def find_features(document): 59 | words = set(document) 60 | features = {} 61 | for w in word_features: 62 | features[w] = (w in words) 63 | 64 | return features 65 | 66 | #print((find_features(movie_reviews.words('neg/cv000_29416.txt')))) 67 | 68 | featuresets = [(find_features(rev), category) for (rev, category) in documents] 69 | 70 | # positive data example: 71 | training_set = featuresets[:1900] 72 | testing_set = featuresets[1900:] 73 | 74 | 75 | # negative data example: 76 | training_set = featuresets[100:] 77 | testing_set = featuresets[:100] 78 | 79 | 80 | #classifier = nltk.NaiveBayesClassifier.train(training_set) 81 | 82 | classifier_f = open("naivebayes.pickle","rb") 83 | classifier = 
pickle.load(classifier_f) 84 | classifier_f.close() 85 | 86 | 87 | 88 | 89 | print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100) 90 | classifier.show_most_informative_features(15) 91 | 92 | MNB_classifier = SklearnClassifier(MultinomialNB()) 93 | MNB_classifier.train(training_set) 94 | print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100) 95 | 96 | BernoulliNB_classifier = SklearnClassifier(BernoulliNB()) 97 | BernoulliNB_classifier.train(training_set) 98 | print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100) 99 | 100 | LogisticRegression_classifier = SklearnClassifier(LogisticRegression()) 101 | LogisticRegression_classifier.train(training_set) 102 | print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100) 103 | 104 | SGDClassifier_classifier = SklearnClassifier(SGDClassifier()) 105 | SGDClassifier_classifier.train(training_set) 106 | print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100) 107 | 108 | ##SVC_classifier = SklearnClassifier(SVC()) 109 | ##SVC_classifier.train(training_set) 110 | ##print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100) 111 | 112 | LinearSVC_classifier = SklearnClassifier(LinearSVC()) 113 | LinearSVC_classifier.train(training_set) 114 | print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100) 115 | 116 | NuSVC_classifier = SklearnClassifier(NuSVC()) 117 | NuSVC_classifier.train(training_set) 118 | print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100) 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | voted_classifier = VoteClassifier( 131 | NuSVC_classifier, 132 | LinearSVC_classifier, 133 | MNB_classifier, 134 | BernoulliNB_classifier, 135 | LogisticRegression_classifier) 136 | 137 | print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100) 138 | 139 | ##print("Classification:", voted_classifier.classify(testing_set[0][0]), "Confidence %:",voted_classifier.confidence(testing_set[0][0])*100) 140 | ##print("Classification:", voted_classifier.classify(testing_set[1][0]), "Confidence %:",voted_classifier.confidence(testing_set[1][0])*100) 141 | ##print("Classification:", voted_classifier.classify(testing_set[2][0]), "Confidence %:",voted_classifier.confidence(testing_set[2][0])*100) 142 | ##print("Classification:", voted_classifier.classify(testing_set[3][0]), "Confidence %:",voted_classifier.confidence(testing_set[3][0])*100) 143 | ##print("Classification:", voted_classifier.classify(testing_set[4][0]), "Confidence %:",voted_classifier.confidence(testing_set[4][0])*100) 144 | ##print("Classification:", voted_classifier.classify(testing_set[5][0]), "Confidence %:",voted_classifier.confidence(testing_set[5][0])*100) 145 | ## 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | -------------------------------------------------------------------------------- /nltkvid18.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import random 3 | from nltk.corpus import movie_reviews 4 | from 
nltk.classify.scikitlearn import SklearnClassifier 5 | import pickle 6 | 7 | from sklearn.naive_bayes import MultinomialNB, BernoulliNB 8 | from sklearn.linear_model import LogisticRegression, SGDClassifier 9 | from sklearn.svm import SVC, LinearSVC, NuSVC 10 | 11 | from nltk.classify import ClassifierI 12 | from statistics import mode 13 | 14 | from nltk.tokenize import word_tokenize 15 | 16 | 17 | class VoteClassifier(ClassifierI): 18 | def __init__(self, *classifiers): 19 | self._classifiers = classifiers 20 | 21 | def classify(self, features): 22 | votes = [] 23 | for c in self._classifiers: 24 | v = c.classify(features) 25 | votes.append(v) 26 | return mode(votes) 27 | 28 | def confidence(self, features): 29 | votes = [] 30 | for c in self._classifiers: 31 | v = c.classify(features) 32 | votes.append(v) 33 | 34 | choice_votes = votes.count(mode(votes)) 35 | conf = choice_votes / len(votes) 36 | return conf 37 | 38 | short_pos = open("short_reviews/positive.txt","r").read() 39 | short_neg = open("short_reviews/negative.txt","r").read() 40 | 41 | documents = [] 42 | 43 | for r in short_pos.split('\n'): 44 | documents.append( (r, "pos") ) 45 | 46 | for r in short_neg.split('\n'): 47 | documents.append( (r, "neg") ) 48 | 49 | 50 | all_words = [] 51 | 52 | short_pos_words = word_tokenize(short_pos) 53 | short_neg_words = word_tokenize(short_neg) 54 | 55 | for w in short_pos_words: 56 | all_words.append(w.lower()) 57 | 58 | for w in short_neg_words: 59 | all_words.append(w.lower()) 60 | 61 | all_words = nltk.FreqDist(all_words) 62 | 63 | word_features = list(all_words.keys())[:5000] 64 | 65 | def find_features(document): 66 | words = word_tokenize(document) 67 | features = {} 68 | for w in word_features: 69 | features[w] = (w in words) 70 | 71 | return features 72 | 73 | #print((find_features(movie_reviews.words('neg/cv000_29416.txt')))) 74 | 75 | featuresets = [(find_features(rev), category) for (rev, category) in documents] 76 | 77 | random.shuffle(featuresets) 78 | 79 | # positive data example: 80 | training_set = featuresets[:10000] 81 | testing_set = featuresets[10000:] 82 | 83 | ## 84 | ### negative data example: 85 | ##training_set = featuresets[100:] 86 | ##testing_set = featuresets[:100] 87 | 88 | 89 | classifier = nltk.NaiveBayesClassifier.train(training_set) 90 | print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100) 91 | classifier.show_most_informative_features(15) 92 | 93 | MNB_classifier = SklearnClassifier(MultinomialNB()) 94 | MNB_classifier.train(training_set) 95 | print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100) 96 | 97 | BernoulliNB_classifier = SklearnClassifier(BernoulliNB()) 98 | BernoulliNB_classifier.train(training_set) 99 | print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100) 100 | 101 | LogisticRegression_classifier = SklearnClassifier(LogisticRegression()) 102 | LogisticRegression_classifier.train(training_set) 103 | print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100) 104 | 105 | SGDClassifier_classifier = SklearnClassifier(SGDClassifier()) 106 | SGDClassifier_classifier.train(training_set) 107 | print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100) 108 | 109 | ##SVC_classifier = SklearnClassifier(SVC()) 110 | ##SVC_classifier.train(training_set) 111 | 
##print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100) 112 | 113 | LinearSVC_classifier = SklearnClassifier(LinearSVC()) 114 | LinearSVC_classifier.train(training_set) 115 | print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100) 116 | 117 | NuSVC_classifier = SklearnClassifier(NuSVC()) 118 | NuSVC_classifier.train(training_set) 119 | print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100) 120 | 121 | 122 | voted_classifier = VoteClassifier( 123 | NuSVC_classifier, 124 | LinearSVC_classifier, 125 | MNB_classifier, 126 | BernoulliNB_classifier, 127 | LogisticRegression_classifier) 128 | 129 | print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100) 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | -------------------------------------------------------------------------------- /nltkvid19.1.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import random 3 | #from nltk.corpus import movie_reviews 4 | from nltk.classify.scikitlearn import SklearnClassifier 5 | import pickle 6 | from sklearn.naive_bayes import MultinomialNB, BernoulliNB 7 | from sklearn.linear_model import LogisticRegression, SGDClassifier 8 | from sklearn.svm import SVC, LinearSVC, NuSVC 9 | from nltk.classify import ClassifierI 10 | from statistics import mode 11 | from nltk.tokenize import word_tokenize 12 | 13 | 14 | 15 | class VoteClassifier(ClassifierI): 16 | def __init__(self, *classifiers): 17 | self._classifiers = classifiers 18 | 19 | def classify(self, features): 20 | votes = [] 21 | for c in self._classifiers: 22 | v = c.classify(features) 23 | votes.append(v) 24 | return mode(votes) 25 | 26 | def confidence(self, features): 27 | votes = [] 28 | for c in self._classifiers: 29 | v = c.classify(features) 30 | votes.append(v) 31 | 32 | choice_votes = votes.count(mode(votes)) 33 | conf = choice_votes / len(votes) 34 | return conf 35 | 36 | short_pos = open("short_reviews/positive.txt","r").read() 37 | short_neg = open("short_reviews/negative.txt","r").read() 38 | 39 | # move this up here 40 | all_words = [] 41 | documents = [] 42 | 43 | 44 | # j is adject, r is adverb, and v is verb 45 | #allowed_word_types = ["J","R","V"] 46 | allowed_word_types = ["J"] 47 | 48 | for p in short_pos.split('\n'): 49 | documents.append( (p, "pos") ) 50 | words = word_tokenize(p) 51 | pos = nltk.pos_tag(words) 52 | for w in pos: 53 | if w[1][0] in allowed_word_types: 54 | all_words.append(w[0].lower()) 55 | 56 | 57 | for p in short_neg.split('\n'): 58 | documents.append( (p, "neg") ) 59 | words = word_tokenize(p) 60 | pos = nltk.pos_tag(words) 61 | for w in pos: 62 | if w[1][0] in allowed_word_types: 63 | all_words.append(w[0].lower()) 64 | 65 | 66 | 67 | save_documents = open("pickled_algos/documents.pickle","wb") 68 | pickle.dump(documents, save_documents) 69 | save_documents.close() 70 | 71 | 72 | all_words = nltk.FreqDist(all_words) 73 | 74 | 75 | word_features = list(all_words.keys())[:5000] 76 | 77 | 78 | save_word_features = open("pickled_algos/word_features5k.pickle","wb") 79 | pickle.dump(word_features, save_word_features) 80 | save_word_features.close() 81 | 82 | 83 | def find_features(document): 84 | words = word_tokenize(document) 85 | features = {} 86 | for w in word_features: 87 | 
features[w] = (w in words) 88 | 89 | return features 90 | 91 | featuresets = [(find_features(rev), category) for (rev, category) in documents] 92 | 93 | random.shuffle(featuresets) 94 | print(len(featuresets)) 95 | 96 | testing_set = featuresets[10000:] 97 | training_set = featuresets[:10000] 98 | 99 | 100 | classifier = nltk.NaiveBayesClassifier.train(training_set) 101 | print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100) 102 | classifier.show_most_informative_features(15) 103 | 104 | ############### 105 | save_classifier = open("pickled_algos/originalnaivebayes5k.pickle","wb") 106 | pickle.dump(classifier, save_classifier) 107 | save_classifier.close() 108 | 109 | MNB_classifier = SklearnClassifier(MultinomialNB()) 110 | MNB_classifier.train(training_set) 111 | print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100) 112 | 113 | save_classifier = open("pickled_algos/MNB_classifier5k.pickle","wb") 114 | pickle.dump(MNB_classifier, save_classifier) 115 | save_classifier.close() 116 | 117 | BernoulliNB_classifier = SklearnClassifier(BernoulliNB()) 118 | BernoulliNB_classifier.train(training_set) 119 | print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100) 120 | 121 | save_classifier = open("pickled_algos/BernoulliNB_classifier5k.pickle","wb") 122 | pickle.dump(BernoulliNB_classifier, save_classifier) 123 | save_classifier.close() 124 | 125 | LogisticRegression_classifier = SklearnClassifier(LogisticRegression()) 126 | LogisticRegression_classifier.train(training_set) 127 | print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100) 128 | 129 | save_classifier = open("pickled_algos/LogisticRegression_classifier5k.pickle","wb") 130 | pickle.dump(LogisticRegression_classifier, save_classifier) 131 | save_classifier.close() 132 | 133 | 134 | LinearSVC_classifier = SklearnClassifier(LinearSVC()) 135 | LinearSVC_classifier.train(training_set) 136 | print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100) 137 | 138 | save_classifier = open("pickled_algos/LinearSVC_classifier5k.pickle","wb") 139 | pickle.dump(LinearSVC_classifier, save_classifier) 140 | save_classifier.close() 141 | 142 | 143 | ##NuSVC_classifier = SklearnClassifier(NuSVC()) 144 | ##NuSVC_classifier.train(training_set) 145 | ##print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100) 146 | 147 | 148 | SGDC_classifier = SklearnClassifier(SGDClassifier()) 149 | SGDC_classifier.train(training_set) 150 | print("SGDClassifier accuracy percent:",nltk.classify.accuracy(SGDC_classifier, testing_set)*100) 151 | 152 | save_classifier = open("pickled_algos/SGDC_classifier5k.pickle","wb") 153 | pickle.dump(SGDC_classifier, save_classifier) 154 | save_classifier.close() 155 | 156 | 157 | voted_classifier = VoteClassifier( 158 | classifier, 159 | LinearSVC_classifier, 160 | MNB_classifier, 161 | BernoulliNB_classifier, 162 | LogisticRegression_classifier) 163 | 164 | print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100) 165 | 166 | 167 | 168 | def sentiment(text): 169 | feats = find_features(text) 170 | 171 | return voted_classifier.classify(feats) 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 
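nltkvid19.2.py below loads pickled_algos/featuresets.pickle, but nltkvid19.1.py above never writes that file. A minimal sketch of the missing save step, mirroring the other pickle dumps in nltkvid19.1.py (it assumes featuresets has already been built):

# persist the feature sets so nltkvid19.2.py can load them later
save_featuresets = open("pickled_algos/featuresets.pickle","wb")
pickle.dump(featuresets, save_featuresets)
save_featuresets.close()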
-------------------------------------------------------------------------------- /nltkvid19.2.py: -------------------------------------------------------------------------------- 1 | #File: sentiment_mod.py 2 | 3 | import nltk 4 | import random 5 | #from nltk.corpus import movie_reviews 6 | from nltk.classify.scikitlearn import SklearnClassifier 7 | import pickle 8 | from sklearn.naive_bayes import MultinomialNB, BernoulliNB 9 | from sklearn.linear_model import LogisticRegression, SGDClassifier 10 | from sklearn.svm import SVC, LinearSVC, NuSVC 11 | from nltk.classify import ClassifierI 12 | from statistics import mode 13 | from nltk.tokenize import word_tokenize 14 | 15 | 16 | 17 | class VoteClassifier(ClassifierI): 18 | def __init__(self, *classifiers): 19 | self._classifiers = classifiers 20 | 21 | def classify(self, features): 22 | votes = [] 23 | for c in self._classifiers: 24 | v = c.classify(features) 25 | votes.append(v) 26 | return mode(votes) 27 | 28 | def confidence(self, features): 29 | votes = [] 30 | for c in self._classifiers: 31 | v = c.classify(features) 32 | votes.append(v) 33 | 34 | choice_votes = votes.count(mode(votes)) 35 | conf = choice_votes / len(votes) 36 | return conf 37 | 38 | 39 | documents_f = open("pickled_algos/documents.pickle", "rb") 40 | documents = pickle.load(documents_f) 41 | documents_f.close() 42 | 43 | 44 | 45 | 46 | word_features5k_f = open("pickled_algos/word_features5k.pickle", "rb") 47 | word_features = pickle.load(word_features5k_f) 48 | word_features5k_f.close() 49 | 50 | 51 | def find_features(document): 52 | words = word_tokenize(document) 53 | features = {} 54 | for w in word_features: 55 | features[w] = (w in words) 56 | 57 | return features 58 | 59 | 60 | 61 | featuresets_f = open("pickled_algos/featuresets.pickle", "rb") 62 | featuresets = pickle.load(featuresets_f) 63 | featuresets_f.close() 64 | 65 | random.shuffle(featuresets) 66 | print(len(featuresets)) 67 | 68 | testing_set = featuresets[10000:] 69 | training_set = featuresets[:10000] 70 | 71 | 72 | 73 | open_file = open("pickled_algos/originalnaivebayes5k.pickle", "rb") 74 | classifier = pickle.load(open_file) 75 | open_file.close() 76 | 77 | 78 | open_file = open("pickled_algos/MNB_classifier5k.pickle", "rb") 79 | MNB_classifier = pickle.load(open_file) 80 | open_file.close() 81 | 82 | 83 | 84 | open_file = open("pickled_algos/BernoulliNB_classifier5k.pickle", "rb") 85 | BernoulliNB_classifier = pickle.load(open_file) 86 | open_file.close() 87 | 88 | 89 | open_file = open("pickled_algos/LogisticRegression_classifier5k.pickle", "rb") 90 | LogisticRegression_classifier = pickle.load(open_file) 91 | open_file.close() 92 | 93 | 94 | open_file = open("pickled_algos/LinearSVC_classifier5k.pickle", "rb") 95 | LinearSVC_classifier = pickle.load(open_file) 96 | open_file.close() 97 | 98 | 99 | open_file = open("pickled_algos/SGDC_classifier5k.pickle", "rb") 100 | SGDC_classifier = pickle.load(open_file) 101 | open_file.close() 102 | 103 | 104 | 105 | 106 | voted_classifier = VoteClassifier( 107 | classifier, 108 | LinearSVC_classifier, 109 | MNB_classifier, 110 | BernoulliNB_classifier, 111 | LogisticRegression_classifier) 112 | 113 | 114 | 115 | 116 | def sentiment(text): 117 | feats = find_features(text) 118 | 119 | return voted_classifier.classify(feats),voted_classifier.confidence(feats) 120 | 121 | 122 | 123 | # SAVE ME AS sentiment_mod.py 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 
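# A note on the ensemble above: with five classifiers voting on two labels,
# the winning label always gets 3, 4, or 5 of the votes, so confidence()
# (the second value returned by sentiment()) can only be 0.6, 0.8, or 1.0.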
148 | 149 | 150 | 151 | 152 | 153 | -------------------------------------------------------------------------------- /nltkvid19.3.py: -------------------------------------------------------------------------------- 1 | import sentiment_mod as s 2 | 3 | print(s.sentiment("This movie was awesome! The acting was great, plot was wonderful, and there were pythons...so yea!")) 4 | print(s.sentiment("This movie was utter junk. There were absolutely 0 pythons. I don't see what the point was at all. Horrible movie, 0/10")) 5 | -------------------------------------------------------------------------------- /nltkvid2.py: -------------------------------------------------------------------------------- 1 | from nltk.corpus import stopwords 2 | from nltk.tokenize import word_tokenize 3 | 4 | example_sentence = "This is an example showing off stop word filtration." 5 | stop_words = set(stopwords.words("english")) 6 | 7 | words = word_tokenize(example_sentence) 8 | ## 9 | ##filtered_sentence = [] 10 | ## 11 | ##for w in words: 12 | ## if w not in stop_words: 13 | ## filtered_sentence.append(w) 14 | 15 | filtered_sentence = [w for w in words if not w in stop_words] 16 | 17 | print(filtered_sentence) 18 | -------------------------------------------------------------------------------- /nltkvid20.py: -------------------------------------------------------------------------------- 1 | from tweepy import Stream 2 | from tweepy import OAuthHandler 3 | from tweepy.streaming import StreamListener 4 | import json 5 | import sentiment_mod as s 6 | 7 | 8 | 9 | #consumer key, consumer secret, access token, access secret. 10 | ckey="asdfsafsafsaf" 11 | csecret="asdfasdfsadfsa" 12 | atoken="asdfsadfsafsaf-asdfsaf" 13 | asecret="asdfsadfsadfsadfsadfsad" 14 | 15 | from twitterapistuff import * 16 | 17 | class listener(StreamListener): 18 | 19 | def on_data(self, data): 20 | try: 21 | all_data = json.loads(data) 22 | 23 | tweet = all_data["text"] 24 | sentiment_value, confidence = s.sentiment(tweet) 25 | print(tweet, sentiment_value, confidence) 26 | 27 | if confidence*100 >= 80: 28 | output = open("twitter-out.txt","a") 29 | output.write(sentiment_value) 30 | output.write('\n') 31 | output.close() 32 | 33 | return True 34 | except: 35 | return True 36 | 37 | def on_error(self, status): 38 | print(status) 39 | 40 | auth = OAuthHandler(ckey, csecret) 41 | auth.set_access_token(atoken, asecret) 42 | 43 | twitterStream = Stream(auth, listener()) 44 | twitterStream.filter(track=["happy"]) 45 | -------------------------------------------------------------------------------- /nltkvid21.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import matplotlib.animation as animation 3 | from matplotlib import style 4 | import time 5 | 6 | style.use("ggplot") 7 | 8 | fig = plt.figure() 9 | ax1 = fig.add_subplot(1,1,1) 10 | 11 | def animate(i): 12 | pullData = open("twitter-out.txt","r").read() 13 | lines = pullData.split('\n') 14 | 15 | xar = [] 16 | yar = [] 17 | 18 | x = 0 19 | y = 0 20 | 21 | for l in lines[-200:]: 22 | x += 1 23 | if "pos" in l: 24 | y += 1 25 | elif "neg" in l: 26 | y -= 1 27 | 28 | xar.append(x) 29 | yar.append(y) 30 | 31 | ax1.clear() 32 | ax1.plot(xar,yar) 33 | ani = animation.FuncAnimation(fig, animate, interval=1000) 34 | plt.show() 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /nltkvid3.py: 
-------------------------------------------------------------------------------- 1 | from nltk.stem import PorterStemmer 2 | from nltk.tokenize import word_tokenize 3 | 4 | ps = PorterStemmer() 5 | 6 | example_words = ["python","pythoner","pythoning","pythoned","pythonly"] 7 | 8 | ##for w in example_words: 9 | ## print(ps.stem(w)) 10 | 11 | 12 | new_text = "It is very important to be pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once." 13 | 14 | 15 | words = word_tokenize(new_text) 16 | 17 | for w in words: 18 | print(ps.stem(w)) 19 | --------------------------------------------------------------------------------
/nltkvid4.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import state_union 3 | from nltk.tokenize import PunktSentenceTokenizer 4 | 5 | 6 | 7 | 8 | ''' 9 | POS tag list: 10 | 11 | CC coordinating conjunction 12 | CD cardinal digit 13 | DT determiner 14 | EX existential there (like: "there is" ... think of it like "there exists") 15 | FW foreign word 16 | IN preposition/subordinating conjunction 17 | JJ adjective 'big' 18 | JJR adjective, comparative 'bigger' 19 | JJS adjective, superlative 'biggest' 20 | LS list marker 1) 21 | MD modal could, will 22 | NN noun, singular 'desk' 23 | NNS noun plural 'desks' 24 | NNP proper noun, singular 'Harrison' 25 | NNPS proper noun, plural 'Americans' 26 | PDT predeterminer 'all the kids' 27 | POS possessive ending parent's 28 | PRP personal pronoun I, he, she 29 | PRP$ possessive pronoun my, his, hers 30 | RB adverb very, silently, 31 | RBR adverb, comparative better 32 | RBS adverb, superlative best 33 | RP particle give up 34 | TO to go 'to' the store. 35 | UH interjection errrrrrrrm 36 | VB verb, base form take 37 | VBD verb, past tense took 38 | VBG verb, gerund/present participle taking 39 | VBN verb, past participle taken 40 | VBP verb, sing. present, non-3rd person take 41 | VBZ verb, 3rd person sing. present takes 42 | WDT wh-determiner which 43 | WP wh-pronoun who, what 44 | WP$ possessive wh-pronoun whose 45 | WRB wh-adverb where, when 46 | 47 | ''' 48 | 49 | 50 | 51 | 52 | 53 | train_text = state_union.raw("2005-GWBush.txt") 54 | sample_text = state_union.raw("2006-GWBush.txt") 55 | 56 | custom_sent_tokenizer = PunktSentenceTokenizer(train_text) 57 | tokenized = custom_sent_tokenizer.tokenize(sample_text) 58 | 59 | def process_content(): 60 | try: 61 | for i in tokenized[:5]: 62 | words = nltk.word_tokenize(i) 63 | tagged = nltk.pos_tag(words) 64 | print(tagged) 65 | 66 | except Exception as e: 67 | print(str(e)) 68 | 69 | 70 | process_content() 71 | --------------------------------------------------------------------------------
/nltkvid5.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import state_union 3 | from nltk.tokenize import PunktSentenceTokenizer 4 | 5 | 6 | ''' 7 | POS tag list: 8 | 9 | CC coordinating conjunction 10 | CD cardinal digit 11 | DT determiner 12 | EX existential there (like: "there is" ... think of it like "there exists") 13 | FW foreign word 14 | IN preposition/subordinating conjunction 15 | JJ adjective 'big' 16 | JJR adjective, comparative 'bigger' 17 | JJS adjective, superlative 'biggest' 18 | LS list marker 1) 19 | MD modal could, will 20 | NN noun, singular 'desk' 21 | NNS noun plural 'desks' 22 | NNP proper noun, singular 'Harrison' 23 | NNPS proper noun, plural 'Americans' 24 | PDT predeterminer 'all the kids' 25 | POS possessive ending parent's 26 | PRP personal pronoun I, he, she 27 | PRP$ possessive pronoun my, his, hers 28 | RB adverb very, silently, 29 | RBR adverb, comparative better 30 | RBS adverb, superlative best 31 | RP particle give up 32 | TO to go 'to' the store. 33 | UH interjection errrrrrrrm 34 | VB verb, base form take 35 | VBD verb, past tense took 36 | VBG verb, gerund/present participle taking 37 | VBN verb, past participle taken 38 | VBP verb, sing. present, non-3rd person take 39 | VBZ verb, 3rd person sing. present takes 40 | WDT wh-determiner which 41 | WP wh-pronoun who, what 42 | WP$ possessive wh-pronoun whose 43 | WRB wh-adverb where, when 44 | 45 | ''' 46 | 47 | 48 | 49 | 50 | 51 | train_text = state_union.raw("2005-GWBush.txt") 52 | sample_text = state_union.raw("2006-GWBush.txt") 53 | 54 | custom_sent_tokenizer = PunktSentenceTokenizer(train_text) 55 | 56 | tokenized = custom_sent_tokenizer.tokenize(sample_text) 57 | 58 | def process_content(): 59 | try: 60 | for i in tokenized: 61 | words = nltk.word_tokenize(i) 62 | tagged = nltk.pos_tag(words) 63 | 64 | chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}""" 65 | 66 | chunkParser = nltk.RegexpParser(chunkGram) 67 | chunked = chunkParser.parse(tagged) 68 | 69 | chunked.draw() 70 | 71 | except Exception as e: 72 | print(str(e)) 73 | 74 | 75 | process_content() 76 | --------------------------------------------------------------------------------
/nltkvid6.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import state_union 3 | from nltk.tokenize import PunktSentenceTokenizer 4 | 5 | 6 | ''' 7 | POS tag list: 8 | 9 | CC coordinating conjunction 10 | CD cardinal digit 11 | DT determiner 12 | EX existential there (like: "there is" ... think of it like "there exists") 13 | FW foreign word 14 | IN preposition/subordinating conjunction 15 | JJ adjective 'big' 16 | JJR adjective, comparative 'bigger' 17 | JJS adjective, superlative 'biggest' 18 | LS list marker 1) 19 | MD modal could, will 20 | NN noun, singular 'desk' 21 | NNS noun plural 'desks' 22 | NNP proper noun, singular 'Harrison' 23 | NNPS proper noun, plural 'Americans' 24 | PDT predeterminer 'all the kids' 25 | POS possessive ending parent's 26 | PRP personal pronoun I, he, she 27 | PRP$ possessive pronoun my, his, hers 28 | RB adverb very, silently, 29 | RBR adverb, comparative better 30 | RBS adverb, superlative best 31 | RP particle give up 32 | TO to go 'to' the store. 33 | UH interjection errrrrrrrm 34 | VB verb, base form take 35 | VBD verb, past tense took 36 | VBG verb, gerund/present participle taking 37 | VBN verb, past participle taken 38 | VBP verb, sing. present, non-3rd person take 39 | VBZ verb, 3rd person sing. present takes 40 | WDT wh-determiner which 41 | WP wh-pronoun who, what 42 | WP$ possessive wh-pronoun whose 43 | WRB wh-adverb where, when 44 | 45 | ''' 46 | 47 | 48 | 49 | 50 | 51 | train_text = state_union.raw("2005-GWBush.txt") 52 | sample_text = state_union.raw("2006-GWBush.txt") 53 | 54 | custom_sent_tokenizer = PunktSentenceTokenizer(train_text) 55 | 56 | tokenized = custom_sent_tokenizer.tokenize(sample_text) 57 | 58 | def process_content(): 59 | try: 60 | for i in tokenized[5:]: 61 | words = nltk.word_tokenize(i) 62 | tagged = nltk.pos_tag(words) 63 | 64 | chunkGram = r"""Chunk: {<.*>+} 65 | }<VB.?|IN|DT|TO>+{""" 66 | 67 | chunkParser = nltk.RegexpParser(chunkGram) 68 | chunked = chunkParser.parse(tagged) 69 | 70 | chunked.draw() 71 | 72 | 73 | except Exception as e: 74 | print(str(e)) 75 | 76 | 77 | process_content() 78 | --------------------------------------------------------------------------------
/nltkvid7.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.corpus import state_union 3 | from nltk.tokenize import PunktSentenceTokenizer 4 | 5 | 6 | ''' 7 | POS tag list: 8 | 9 | CC coordinating conjunction 10 | CD cardinal digit 11 | DT determiner 12 | EX existential there (like: "there is" ... think of it like "there exists") 13 | FW foreign word 14 | IN preposition/subordinating conjunction 15 | JJ adjective 'big' 16 | JJR adjective, comparative 'bigger' 17 | JJS adjective, superlative 'biggest' 18 | LS list marker 1) 19 | MD modal could, will 20 | NN noun, singular 'desk' 21 | NNS noun plural 'desks' 22 | NNP proper noun, singular 'Harrison' 23 | NNPS proper noun, plural 'Americans' 24 | PDT predeterminer 'all the kids' 25 | POS possessive ending parent's 26 | PRP personal pronoun I, he, she 27 | PRP$ possessive pronoun my, his, hers 28 | RB adverb very, silently, 29 | RBR adverb, comparative better 30 | RBS adverb, superlative best 31 | RP particle give up 32 | TO to go 'to' the store. 33 | UH interjection errrrrrrrm 34 | VB verb, base form take 35 | VBD verb, past tense took 36 | VBG verb, gerund/present participle taking 37 | VBN verb, past participle taken 38 | VBP verb, sing. present, non-3rd person take 39 | VBZ verb, 3rd person sing. present takes 40 | WDT wh-determiner which 41 | WP wh-pronoun who, what 42 | WP$ possessive wh-pronoun whose 43 | WRB wh-adverb where, when 44 | 45 | ''' 46 | 47 | 48 | 49 | 50 | 51 | train_text = state_union.raw("2005-GWBush.txt") 52 | sample_text = state_union.raw("2006-GWBush.txt") 53 | 54 | custom_sent_tokenizer = PunktSentenceTokenizer(train_text) 55 | 56 | tokenized = custom_sent_tokenizer.tokenize(sample_text) 57 | 58 | def process_content(): 59 | try: 60 | for i in tokenized[5:]: 61 | words = nltk.word_tokenize(i) 62 | tagged = nltk.pos_tag(words) 63 | namedEnt = nltk.ne_chunk(tagged, binary=False) 64 | namedEnt.draw() 65 | except Exception as e: 66 | print(str(e)) 67 | 68 | 69 | process_content() 70 | --------------------------------------------------------------------------------
/nltkvid8.py: -------------------------------------------------------------------------------- 1 | 2 | from nltk.stem import WordNetLemmatizer 3 | 4 | lemmatizer = WordNetLemmatizer() 5 | 6 | print(lemmatizer.lemmatize("cats")) 7 | print(lemmatizer.lemmatize("cacti")) 8 | print(lemmatizer.lemmatize("geese")) 9 | print(lemmatizer.lemmatize("rocks")) 10 | print(lemmatizer.lemmatize("python")) 11 | print(lemmatizer.lemmatize("better", pos="a")) 12 | print(lemmatizer.lemmatize("best", pos="a")) 13 | print(lemmatizer.lemmatize("run")) 14 | print(lemmatizer.lemmatize("run",'v')) 15 | 16 | 17 | 18 | 19 | 20 | 21 | --------------------------------------------------------------------------------
/nltkvid9.py: -------------------------------------------------------------------------------- 1 | from nltk.corpus import gutenberg 2 | from nltk.tokenize import sent_tokenize 3 | 4 | sample = gutenberg.raw("bible-kjv.txt") 5 | 6 | tok = sent_tokenize(sample) 7 | 8 | print(tok[5:15]) 9 | --------------------------------------------------------------------------------
/originalnaivebayes5k.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PythonProgramming/NLTK-3----Natural-Language-Processing-with-Python-series/e6dddc9329ef2c52f1e93ff4fa1315876da55b43/originalnaivebayes5k.pickle --------------------------------------------------------------------------------
/sentiment_mod.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PythonProgramming/NLTK-3----Natural-Language-Processing-with-Python-series/e6dddc9329ef2c52f1e93ff4fa1315876da55b43/sentiment_mod.pickle --------------------------------------------------------------------------------
/voted_classifier.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PythonProgramming/NLTK-3----Natural-Language-Processing-with-Python-series/e6dddc9329ef2c52f1e93ff4fa1315876da55b43/voted_classifier.pickle --------------------------------------------------------------------------------
/voted_classifier5k.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PythonProgramming/NLTK-3----Natural-Language-Processing-with-Python-series/e6dddc9329ef2c52f1e93ff4fa1315876da55b43/voted_classifier5k.pickle --------------------------------------------------------------------------------
/word_features5k.pickle: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/PythonProgramming/NLTK-3----Natural-Language-Processing-with-Python-series/e6dddc9329ef2c52f1e93ff4fa1315876da55b43/word_features5k.pickle -------------------------------------------------------------------------------- /word_features_3000.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PythonProgramming/NLTK-3----Natural-Language-Processing-with-Python-series/e6dddc9329ef2c52f1e93ff4fa1315876da55b43/word_features_3000.pickle --------------------------------------------------------------------------------
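nltkvid21.py plots the last 200 sentiment labels that nltkvid20.py appends to twitter-out.txt, one "pos" or "neg" per line. A minimal sketch for exercising that live graph without Twitter credentials, writing fake labels in the same format (the random 50/50 mix is an assumption for testing only):

import random

# seed twitter-out.txt with fake labels so nltkvid21.py has data to animate
with open("twitter-out.txt", "w") as output:
    for _ in range(200):
        output.write(random.choice(["pos", "neg"]) + "\n")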