├── .classpath
├── .gitignore
├── .project
├── .settings
└── org.eclipse.jdt.core.prefs
├── Data
├── Output.txt
├── Test.data
├── Train.data
├── Vocab.data
└── stopwords.txt
├── LICENSE
├── README.md
└── src
└── NaiveBayesClassifier.java
/.classpath:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Compiled class file
2 | *.class
3 |
4 | # Log file
5 | *.log
6 |
7 | # BlueJ files
8 | *.ctxt
9 |
10 | # Mobile Tools for Java (J2ME)
11 | .mtj.tmp/
12 |
13 | # Package Files #
14 | *.jar
15 | *.war
16 | *.ear
17 | *.zip
18 | *.tar.gz
19 | *.rar
20 |
21 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
22 | hs_err_pid*
23 | /bin/
24 |
--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
1 |
2 |
3 | Naive Bayes Classifier
4 |
5 |
6 |
7 |
8 |
9 | org.eclipse.jdt.core.javabuilder
10 |
11 |
12 |
13 |
14 |
15 | org.eclipse.jdt.core.javanature
16 |
17 |
18 |
--------------------------------------------------------------------------------
/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
5 | org.eclipse.jdt.core.compiler.compliance=1.8
6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate
7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate
8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate
9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
11 | org.eclipse.jdt.core.compiler.source=1.8
12 |
--------------------------------------------------------------------------------
/Data/Output.txt:
--------------------------------------------------------------------------------
1 | Without removing stopwords
2 | Accuracy=0.8136
3 | Precision=0.75032 Recall=0.8590401172375893 F-Score=0.8010077717994706
4 | After removing stopwords
5 | Accuracy=0.82628
6 | Precision=0.77248 Recall=0.8656207978484984 F-Score=0.8164024519129148
7 |
8 | Binary Naive Bayes Classification:
9 | Without removing stopwords
10 | Accuracy=0.82992
11 | Precision=0.77304 Recall=0.8722693627008485 F-Score=0.8196623971498854
12 | After removing stopwords
13 | Accuracy=0.83796
14 | Precision=0.79184 Recall=0.8723010487353485 F-Score=0.8301253826477124
15 |
--------------------------------------------------------------------------------
/Data/stopwords.txt:
--------------------------------------------------------------------------------
1 | a
2 | about
3 | above
4 | after
5 | again
6 | against
7 | all
8 | am
9 | an
10 | and
11 | any
12 | are
13 | aren't
14 | as
15 | at
16 | be
17 | because
18 | been
19 | before
20 | being
21 | below
22 | between
23 | both
24 | but
25 | by
26 | can't
27 | cannot
28 | could
29 | couldn't
30 | did
31 | didn't
32 | do
33 | does
34 | doesn't
35 | doing
36 | don't
37 | down
38 | during
39 | each
40 | few
41 | for
42 | from
43 | further
44 | had
45 | hadn't
46 | has
47 | hasn't
48 | have
49 | haven't
50 | having
51 | he
52 | he'd
53 | he'll
54 | he's
55 | her
56 | here
57 | here's
58 | hers
59 | herself
60 | him
61 | himself
62 | his
63 | how
64 | how's
65 | i
66 | i'd
67 | i'll
68 | i'm
69 | i've
70 | if
71 | in
72 | into
73 | is
74 | isn't
75 | it
76 | it's
77 | its
78 | itself
79 | let's
80 | me
81 | more
82 | most
83 | mustn't
84 | my
85 | myself
86 | no
87 | nor
88 | not
89 | of
90 | off
91 | on
92 | once
93 | only
94 | or
95 | other
96 | ought
97 | our
98 | ours
ourselves
99 | out
100 | over
101 | own
102 | same
103 | shan't
104 | she
105 | she'd
106 | she'll
107 | she's
108 | should
109 | shouldn't
110 | so
111 | some
112 | such
113 | than
114 | that
115 | that's
116 | the
117 | their
118 | theirs
119 | them
120 | themselves
121 | then
122 | there
123 | there's
124 | these
125 | they
126 | they'd
127 | they'll
128 | they're
129 | they've
130 | this
131 | those
132 | through
133 | to
134 | too
135 | under
136 | until
137 | up
138 | very
139 | was
140 | wasn't
141 | we
142 | we'd
143 | we'll
144 | we're
145 | we've
146 | were
147 | weren't
148 | what
149 | what's
150 | when
151 | when's
152 | where
153 | where's
154 | which
155 | while
156 | who
157 | who's
158 | whom
159 | why
160 | why's
161 | with
162 | won't
163 | would
164 | wouldn't
165 | you
166 | you'd
167 | you'll
168 | you're
169 | you've
170 | your
171 | yours
172 | yourself
173 | yourselves
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 Keval Morabia
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Naive Bayes Classifier in Java
2 |
3 | Naive Bayes methods are a set of supervised learning algorithms based on applying Bayes’ theorem with the “naive” assumption of independence between every pair of features. Naive Bayes learners and classifiers can be extremely fast compared to more sophisticated methods. In spite of their apparently over-simplified assumptions, naive Bayes classifiers have worked quite well in many real-world situations, famously document classification and spam filtering.
4 |
Sentiment Analysis is the process of determining whether a piece of writing is positive, negative or neutral. It’s also known as opinion mining, deriving the opinion or attitude of a speaker. A common use case for this technology is to discover how people feel about a particular topic.
5 |
Your task is to classify whether a given review has a positive or negative tone using naive Bayes classifier.
6 |
Dataset used from - http://ai.stanford.edu/~amaas/data/sentiment/
7 |
Dataset has 12,500 positive and 12,500 negative reviews of movies for training and testing separately
8 |
9 |
Reference: Chapter6 - Machine Learning by Tom M. Mitchell
10 | 
11 |
12 |
13 |
14 | - **Steps to find the sentiment of any text:**
15 |
16 | The code I wrote only gives the sentiment as positive(1) or negative(0)
17 |
You can't tell how strongly positive or negative the text is
18 |
19 | To get a sentiment, write each text on a line with first integer as the actual sentiment
20 | (for testing the accuracy but you can keep it anything if you want to predict)
21 | after that, each word is given in the form wordIndex:frequency, separated by spaces
22 |
23 | For example
24 | 10 0:2 1:1 3:1 4:1
25 | Meaning the sentiment is 10 (positive) and contains
26 |
_The_ 2 times
27 |
_And_ 1 time
28 |
_Of_ 1 time
29 |
_To_ 1 time
30 |
(See the Vocab.data file)
31 | Put this content in the Test.data file
32 | Again, the value 10 is for test so if you know a sentiment you can verify that the classification should be 1 (pos)
33 |
34 | To get the predicted value, you can add a print statement after line 80 in the code and comment out the code from line 81 to 95
35 | which is just for finding the accuracy of prediction on test data
36 |
--------------------------------------------------------------------------------
/src/NaiveBayesClassifier.java:
--------------------------------------------------------------------------------
1 | import java.io.BufferedReader;
2 | import java.io.FileReader;
3 | import java.io.IOException;
4 | import java.util.HashSet;
5 | import java.util.StringTokenizer;
6 |
/**
 * Multinomial / binary naive Bayes sentiment classifier for bag-of-words review data.
 * Each review is one line of the form "rating wordIndex:frequency wordIndex:frequency ...";
 * a rating greater than 5 is treated as a positive review.
 */
public class NaiveBayesClassifier {

    /**
     * Trains a naive Bayes model on {@code trainFile} and prints accuracy, precision,
     * recall and F-score measured on {@code testFile}.
     *
     * @param binaryNB        if true, per-review word frequencies are clamped to 1 (binary NB)
     * @param trainFile       training data, one review per line
     * @param testFile        test data, same format as the training data
     * @param vocabFile       vocabulary, one word per line; the line index is the word index
     * @param stopwordFile    stopword list, one word per line
     * @param removeStopwords if true, words appearing in {@code stopwordFile} are ignored
     * @throws IOException if any of the files cannot be read
     */
    public static void NBClassifier(boolean binaryNB, String trainFile, String testFile, String vocabFile, String stopwordFile, boolean removeStopwords) throws IOException{
        long time = System.currentTimeMillis();
        String s;

        // Stopwords as raw strings; only loaded when stopword removal is requested.
        HashSet<String> stopwordStrings = new HashSet<>();
        if (removeStopwords) {
            try (BufferedReader br = new BufferedReader(new FileReader(stopwordFile))) {
                while ((s = br.readLine()) != null) stopwordStrings.add(s);
            }
        }

        // Reviews reference words by vocabulary index, so translate the stopword
        // strings into the set of vocabulary indices to skip.
        HashSet<Integer> stopwords = new HashSet<>();
        int distinctWords = 0;
        try (BufferedReader br = new BufferedReader(new FileReader(vocabFile))) {
            while ((s = br.readLine()) != null) {
                if (stopwordStrings.contains(s)) stopwords.add(distinctWords);
                distinctWords++;
            }
        }

        // countPos[i] = occurrences of vocab word i across positive training reviews.
        int[] countPos = new int[distinctWords];
        int[] countNeg = new int[distinctWords];
        int posReviews = 0, negReviews = 0;
        int totalWordsInPosReviews = 0, totalWordsInNegReviews = 0;

        // Training pass: accumulate per-class word counts and class sizes.
        try (BufferedReader br = new BufferedReader(new FileReader(trainFile))) {
            while ((s = br.readLine()) != null) {
                StringTokenizer st = new StringTokenizer(s, " :");
                if (st.countTokens() == 0) continue; // skip blank lines
                boolean positive = Integer.parseInt(st.nextToken()) > 5;
                if (positive) posReviews++; else negReviews++;
                int[] counts = positive ? countPos : countNeg;
                while (st.hasMoreTokens()) {
                    int word = Integer.parseInt(st.nextToken());
                    int freq = Integer.parseInt(st.nextToken());
                    if (binaryNB) freq = 1; // binary NB ignores within-review repetition
                    if (stopwords.contains(word)) continue;
                    counts[word] += freq;
                    if (positive) totalWordsInPosReviews += freq;
                    else totalWordsInNegReviews += freq;
                }
            }
        }

        // Classification pass over the test data.
        int truePositive = 0, falsePositive = 0, falseNegative = 0;
        int correctClassification = 0, incorrectClassification = 0;
        try (BufferedReader br = new BufferedReader(new FileReader(testFile))) {
            while ((s = br.readLine()) != null) {
                StringTokenizer st = new StringTokenizer(s, " :");
                int rating = Integer.parseInt(st.nextToken());
                int actual = rating > 5 ? 1 : 0; // 1 --> positive, 0 --> negative

                // Class priors in log space (log sums avoid floating-point underflow).
                double probOfPos = Math.log(posReviews / (posReviews + negReviews + 0.0));
                double probOfNeg = Math.log(negReviews / (posReviews + negReviews + 0.0));

                while (st.hasMoreTokens()) {
                    int word = Integer.parseInt(st.nextToken());
                    int freq = Integer.parseInt(st.nextToken());
                    if (binaryNB) freq = 1;
                    if (stopwords.contains(word)) continue;
                    // Laplace (add-one) smoothing so words unseen in a class
                    // do not drive that class's likelihood to -infinity.
                    probOfPos += freq * Math.log((countPos[word] + 1) / (totalWordsInPosReviews + distinctWords + 0.0));
                    probOfNeg += freq * Math.log((countNeg[word] + 1) / (totalWordsInNegReviews + distinctWords + 0.0));
                }

                int predicted = probOfPos > probOfNeg ? 1 : 0;
                if (predicted == actual) correctClassification++;
                else incorrectClassification++;

                // BUG FIX: the original code swapped these two counters
                // (predicted positive with actual negative is a FALSE POSITIVE),
                // which exchanged the reported precision and recall values.
                if (predicted == 1 && actual == 1) truePositive++;
                else if (predicted == 1 && actual == 0) falsePositive++;
                else if (predicted == 0 && actual == 1) falseNegative++;
            }
        }

        double accuracy = correctClassification / (correctClassification + incorrectClassification + 0.0);
        double precision = truePositive / (truePositive + falsePositive + 0.0);
        double recall = truePositive / (truePositive + falseNegative + 0.0);
        double fscore = 2 * precision * recall / (precision + recall);
        System.out.println("Accuracy=" + accuracy + "\nPrecision=" + precision + " Recall=" + recall + " F-Score=" + fscore);
        time = System.currentTimeMillis() - time;
        System.out.println("Time:" + time / 1000d + "s");
    }

    /** Runs plain and binary NB, each with and without stopword removal. */
    public static void main(String[] args) throws IOException {
        String trainFile = "Data/Train.data", testFile = "Data/Test.data", vocabFile = "Data/Vocab.data", stopwordFile = "Data/stopwords.txt";
        System.out.println("Without removing stopwords");
        NBClassifier(false, trainFile, testFile, vocabFile, stopwordFile, false);
        System.out.println("After removing stopwords");
        NBClassifier(false, trainFile, testFile, vocabFile, stopwordFile, true);

        System.out.println("\nBinary Naive Bayes Classification:");
        System.out.println("Without removing stopwords");
        NBClassifier(true, trainFile, testFile, vocabFile, stopwordFile, false);
        System.out.println("After removing stopwords");
        NBClassifier(true, trainFile, testFile, vocabFile, stopwordFile, true);
    }
}
--------------------------------------------------------------------------------