├── .classpath
├── .gitignore
├── .project
├── .settings
└── org.eclipse.jdt.core.prefs
├── Data
├── Output.txt
├── Test.data
├── Train.data
├── Vocab.data
└── stopwords.txt
├── LICENSE
├── README.md
└── src
└── NaiveBayesClassifier.java
/.classpath:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Compiled class file
2 | *.class
3 |
4 | # Log file
5 | *.log
6 |
7 | # BlueJ files
8 | *.ctxt
9 |
10 | # Mobile Tools for Java (J2ME)
11 | .mtj.tmp/
12 |
13 | # Package Files #
14 | *.jar
15 | *.war
16 | *.ear
17 | *.zip
18 | *.tar.gz
19 | *.rar
20 |
21 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
22 | hs_err_pid*
23 | /bin/
24 |
--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
1 |
2 |
3 | Naive Bayes Classifier
4 |
5 |
6 |
7 |
8 |
9 | org.eclipse.jdt.core.javabuilder
10 |
11 |
12 |
13 |
14 |
15 | org.eclipse.jdt.core.javanature
16 |
17 |
18 |
--------------------------------------------------------------------------------
/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
5 | org.eclipse.jdt.core.compiler.compliance=1.8
6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate
7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate
8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate
9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
11 | org.eclipse.jdt.core.compiler.source=1.8
12 |
--------------------------------------------------------------------------------
/Data/Output.txt:
--------------------------------------------------------------------------------
1 | Without removing stopwords
2 | Accuracy=0.8136
3 | Precision=0.75032 Recall=0.8590401172375893 F-Score=0.8010077717994706
4 | After removing stopwords
5 | Accuracy=0.82628
6 | Precision=0.77248 Recall=0.8656207978484984 F-Score=0.8164024519129148
7 |
8 | Binary Naive Bayes Classification:
9 | Without removing stopwords
10 | Accuracy=0.82992
11 | Precision=0.77304 Recall=0.8722693627008485 F-Score=0.8196623971498854
12 | After removing stopwords
13 | Accuracy=0.83796
14 | Precision=0.79184 Recall=0.8723010487353485 F-Score=0.8301253826477124
15 |
--------------------------------------------------------------------------------
/Data/stopwords.txt:
--------------------------------------------------------------------------------
1 | a
2 | about
3 | above
4 | after
5 | again
6 | against
7 | all
8 | am
9 | an
10 | and
11 | any
12 | are
13 | aren't
14 | as
15 | at
16 | be
17 | because
18 | been
19 | before
20 | being
21 | below
22 | between
23 | both
24 | but
25 | by
26 | can't
27 | cannot
28 | could
29 | couldn't
30 | did
31 | didn't
32 | do
33 | does
34 | doesn't
35 | doing
36 | don't
37 | down
38 | during
39 | each
40 | few
41 | for
42 | from
43 | further
44 | had
45 | hadn't
46 | has
47 | hasn't
48 | have
49 | haven't
50 | having
51 | he
52 | he'd
53 | he'll
54 | he's
55 | her
56 | here
57 | here's
58 | hers
59 | herself
60 | him
61 | himself
62 | his
63 | how
64 | how's
65 | i
66 | i'd
67 | i'll
68 | i'm
69 | i've
70 | if
71 | in
72 | into
73 | is
74 | isn't
75 | it
76 | it's
77 | its
78 | itself
79 | let's
80 | me
81 | more
82 | most
83 | mustn't
84 | my
85 | myself
86 | no
87 | nor
88 | not
89 | of
90 | off
91 | on
92 | once
93 | only
94 | or
95 | other
96 | ought
97 | our
98 | ours
ourselves
99 | out
100 | over
101 | own
102 | same
103 | shan't
104 | she
105 | she'd
106 | she'll
107 | she's
108 | should
109 | shouldn't
110 | so
111 | some
112 | such
113 | than
114 | that
115 | that's
116 | the
117 | their
118 | theirs
119 | them
120 | themselves
121 | then
122 | there
123 | there's
124 | these
125 | they
126 | they'd
127 | they'll
128 | they're
129 | they've
130 | this
131 | those
132 | through
133 | to
134 | too
135 | under
136 | until
137 | up
138 | very
139 | was
140 | wasn't
141 | we
142 | we'd
143 | we'll
144 | we're
145 | we've
146 | were
147 | weren't
148 | what
149 | what's
150 | when
151 | when's
152 | where
153 | where's
154 | which
155 | while
156 | who
157 | who's
158 | whom
159 | why
160 | why's
161 | with
162 | won't
163 | would
164 | wouldn't
165 | you
166 | you'd
167 | you'll
168 | you're
169 | you've
170 | your
171 | yours
172 | yourself
173 | yourselves
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 Keval Morabia
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Naive Bayes Classifier in Java
2 |
3 | Naive Bayes methods are a set of supervised learning algorithms based on applying Bayes’ theorem with the “naive” assumption of independence between every pair of features. Naive Bayes learners and classifiers can be extremely fast compared to more sophisticated methods. In spite of their apparently over-simplified assumptions, naive Bayes classifiers have worked quite well in many real-world situations, famously document classification and spam filtering.
4 |
Sentiment Analysis is the process of determining whether a piece of writing is positive, negative or neutral. It’s also known as opinion mining, deriving the opinion or attitude of a speaker. A common use case for this technology is to discover how people feel about a particular topic.
5 |
Your task is to classify whether a given review has a positive or negative tone using naive Bayes classifier.
6 |
Dataset used from - http://ai.stanford.edu/~amaas/data/sentiment/
7 |
Dataset has 12,500 positive and 12,500 negative reviews of movies for training and testing separately
8 |
9 |
Reference: Chapter6 - Machine Learning by Tom M. Mitchell
10 | 
11 |
12 |
13 |
14 | - **Steps to find the sentiment of any text:**
15 |
16 | The code I wrote only gives the sentiment as positive(1) or negative(0)
17 |
You can't tell how strongly positive or negative the text is
18 |
19 | To get a sentiment, write each text on a line with first integer as the actual sentiment
20 | (for testing the accuracy but you can keep it anything if you want to predict)
21 | after that, each word is given in the form wordIndex:frequency, separated by spaces
22 |
23 | For example
24 | 10 0:2 1:1 3:1 4:1
25 | Meaning the sentiment is 10 (positive) and contains
26 |
_The_ 2 times
27 |
_And_ 1 time
28 |
_Of_ 1 time
29 |
_To_ 1 time
30 |
(See the Vocab.data file)
31 | Put this content in the Test.data file
32 | Again, the value 10 is for test so if you know a sentiment you can verify that the classification should be 1 (pos)
33 |
34 | To get the predicted value, you can add a print statement after line 80 in the code and comment out the code from line 81 to 95
35 | which is just for finding the accuracy of prediction on test data
36 |
--------------------------------------------------------------------------------
/src/NaiveBayesClassifier.java:
--------------------------------------------------------------------------------
1 | import java.io.BufferedReader;
2 | import java.io.FileReader;
3 | import java.io.IOException;
4 | import java.util.HashSet;
5 | import java.util.StringTokenizer;
6 |
/**
 * Multinomial / binary naive Bayes sentiment classifier for bag-of-words review data.
 * Each review is one line of the form "rating wordIndex:frequency wordIndex:frequency ...";
 * a rating greater than 5 is treated as a positive review.
 */
public class NaiveBayesClassifier {

    /**
     * Trains a naive Bayes model on {@code trainFile} and prints accuracy, precision,
     * recall and F-score measured on {@code testFile}.
     *
     * @param binaryNB        if true, per-review word frequencies are clamped to 1 (binary NB)
     * @param trainFile       training data, one review per line
     * @param testFile        test data, same format as the training data
     * @param vocabFile       vocabulary, one word per line; the line index is the word index
     * @param stopwordFile    stopword list, one word per line
     * @param removeStopwords if true, words appearing in {@code stopwordFile} are ignored
     * @throws IOException if any of the files cannot be read
     */
    public static void NBClassifier(boolean binaryNB, String trainFile, String testFile, String vocabFile, String stopwordFile, boolean removeStopwords) throws IOException{
        long time = System.currentTimeMillis();
        String s;

        // Stopwords as raw strings; only loaded when stopword removal is requested.
        HashSet<String> stopwordStrings = new HashSet<>();
        if (removeStopwords) {
            try (BufferedReader br = new BufferedReader(new FileReader(stopwordFile))) {
                while ((s = br.readLine()) != null) stopwordStrings.add(s);
            }
        }

        // Reviews reference words by vocabulary index, so translate the stopword
        // strings into the set of vocabulary indices to skip.
        HashSet<Integer> stopwords = new HashSet<>();
        int distinctWords = 0;
        try (BufferedReader br = new BufferedReader(new FileReader(vocabFile))) {
            while ((s = br.readLine()) != null) {
                if (stopwordStrings.contains(s)) stopwords.add(distinctWords);
                distinctWords++;
            }
        }

        // countPos[i] = occurrences of vocab word i across positive training reviews.
        int[] countPos = new int[distinctWords];
        int[] countNeg = new int[distinctWords];
        int posReviews = 0, negReviews = 0;
        int totalWordsInPosReviews = 0, totalWordsInNegReviews = 0;

        // Training pass: accumulate per-class word counts and class sizes.
        try (BufferedReader br = new BufferedReader(new FileReader(trainFile))) {
            while ((s = br.readLine()) != null) {
                StringTokenizer st = new StringTokenizer(s, " :");
                if (st.countTokens() == 0) continue; // skip blank lines
                boolean positive = Integer.parseInt(st.nextToken()) > 5;
                if (positive) posReviews++; else negReviews++;
                int[] counts = positive ? countPos : countNeg;
                while (st.hasMoreTokens()) {
                    int word = Integer.parseInt(st.nextToken());
                    int freq = Integer.parseInt(st.nextToken());
                    if (binaryNB) freq = 1; // binary NB ignores within-review repetition
                    if (stopwords.contains(word)) continue;
                    counts[word] += freq;
                    if (positive) totalWordsInPosReviews += freq;
                    else totalWordsInNegReviews += freq;
                }
            }
        }

        // Classification pass over the test data.
        int truePositive = 0, falsePositive = 0, falseNegative = 0;
        int correctClassification = 0, incorrectClassification = 0;
        try (BufferedReader br = new BufferedReader(new FileReader(testFile))) {
            while ((s = br.readLine()) != null) {
                StringTokenizer st = new StringTokenizer(s, " :");
                int rating = Integer.parseInt(st.nextToken());
                int actual = rating > 5 ? 1 : 0; // 1 --> positive, 0 --> negative

                // Class priors in log space (log sums avoid floating-point underflow).
                double probOfPos = Math.log(posReviews / (posReviews + negReviews + 0.0));
                double probOfNeg = Math.log(negReviews / (posReviews + negReviews + 0.0));

                while (st.hasMoreTokens()) {
                    int word = Integer.parseInt(st.nextToken());
                    int freq = Integer.parseInt(st.nextToken());
                    if (binaryNB) freq = 1;
                    if (stopwords.contains(word)) continue;
                    // Laplace (add-one) smoothing so words unseen in a class
                    // do not drive that class's likelihood to -infinity.
                    probOfPos += freq * Math.log((countPos[word] + 1) / (totalWordsInPosReviews + distinctWords + 0.0));
                    probOfNeg += freq * Math.log((countNeg[word] + 1) / (totalWordsInNegReviews + distinctWords + 0.0));
                }

                int predicted = probOfPos > probOfNeg ? 1 : 0;
                if (predicted == actual) correctClassification++;
                else incorrectClassification++;

                // BUG FIX: the original code swapped these two counters
                // (predicted positive with actual negative is a FALSE POSITIVE),
                // which exchanged the reported precision and recall values.
                if (predicted == 1 && actual == 1) truePositive++;
                else if (predicted == 1 && actual == 0) falsePositive++;
                else if (predicted == 0 && actual == 1) falseNegative++;
            }
        }

        double accuracy = correctClassification / (correctClassification + incorrectClassification + 0.0);
        double precision = truePositive / (truePositive + falsePositive + 0.0);
        double recall = truePositive / (truePositive + falseNegative + 0.0);
        double fscore = 2 * precision * recall / (precision + recall);
        System.out.println("Accuracy=" + accuracy + "\nPrecision=" + precision + " Recall=" + recall + " F-Score=" + fscore);
        time = System.currentTimeMillis() - time;
        System.out.println("Time:" + time / 1000d + "s");
    }

    /** Runs plain and binary NB, each with and without stopword removal. */
    public static void main(String[] args) throws IOException {
        String trainFile = "Data/Train.data", testFile = "Data/Test.data", vocabFile = "Data/Vocab.data", stopwordFile = "Data/stopwords.txt";
        System.out.println("Without removing stopwords");
        NBClassifier(false, trainFile, testFile, vocabFile, stopwordFile, false);
        System.out.println("After removing stopwords");
        NBClassifier(false, trainFile, testFile, vocabFile, stopwordFile, true);

        System.out.println("\nBinary Naive Bayes Classification:");
        System.out.println("Without removing stopwords");
        NBClassifier(true, trainFile, testFile, vocabFile, stopwordFile, false);
        System.out.println("After removing stopwords");
        NBClassifier(true, trainFile, testFile, vocabFile, stopwordFile, true);
    }
}
--------------------------------------------------------------------------------