├── .classpath ├── .gitignore ├── .project ├── .settings └── org.eclipse.jdt.core.prefs ├── Data ├── Output.txt ├── Test.data ├── Train.data ├── Vocab.data └── stopwords.txt ├── LICENSE ├── README.md └── src └── NaiveBayesClassifier.java /.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled class file 2 | *.class 3 | 4 | # Log file 5 | *.log 6 | 7 | # BlueJ files 8 | *.ctxt 9 | 10 | # Mobile Tools for Java (J2ME) 11 | .mtj.tmp/ 12 | 13 | # Package Files # 14 | *.jar 15 | *.war 16 | *.ear 17 | *.zip 18 | *.tar.gz 19 | *.rar 20 | 21 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 22 | hs_err_pid* 23 | /bin/ 24 | -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | Naive Bayes Classifier 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | 15 | org.eclipse.jdt.core.javanature 16 | 17 | 18 | -------------------------------------------------------------------------------- /.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8 4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve 5 | org.eclipse.jdt.core.compiler.compliance=1.8 6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate 7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate 8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate 9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 10 | 
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 11 | org.eclipse.jdt.core.compiler.source=1.8 12 | -------------------------------------------------------------------------------- /Data/Output.txt: -------------------------------------------------------------------------------- 1 | Without removing stopwords 2 | Accuracy=0.8136 3 | Precision=0.75032 Recall=0.8590401172375893 F-Score=0.8010077717994706 4 | After removing stopwords 5 | Accuracy=0.82628 6 | Precision=0.77248 Recall=0.8656207978484984 F-Score=0.8164024519129148 7 | 8 | Binary Naive Bayes Classification: 9 | Without removing stopwords 10 | Accuracy=0.82992 11 | Precision=0.77304 Recall=0.8722693627008485 F-Score=0.8196623971498854 12 | After removing stopwords 13 | Accuracy=0.83796 14 | Precision=0.79184 Recall=0.8723010487353485 F-Score=0.8301253826477124 15 | -------------------------------------------------------------------------------- /Data/stopwords.txt: -------------------------------------------------------------------------------- 1 | a 2 | about 3 | above 4 | after 5 | again 6 | against 7 | all 8 | am 9 | an 10 | and 11 | any 12 | are 13 | aren't 14 | as 15 | at 16 | be 17 | because 18 | been 19 | before 20 | being 21 | below 22 | between 23 | both 24 | but 25 | by 26 | can't 27 | cannot 28 | could 29 | couldn't 30 | did 31 | didn't 32 | do 33 | does 34 | doesn't 35 | doing 36 | don't 37 | down 38 | during 39 | each 40 | few 41 | for 42 | from 43 | further 44 | had 45 | hadn't 46 | has 47 | hasn't 48 | have 49 | haven't 50 | having 51 | he 52 | he'd 53 | he'll 54 | he's 55 | her 56 | here 57 | here's 58 | hers 59 | herself 60 | him 61 | himself 62 | his 63 | how 64 | how's 65 | i 66 | i'd 67 | i'll 68 | i'm 69 | i've 70 | if 71 | in 72 | into 73 | is 74 | isn't 75 | it 76 | it's 77 | its 78 | itself 79 | let's 80 | me 81 | more 82 | most 83 | mustn't 84 | my 85 | myself 86 | no 87 | nor 88 | not 89 | of 90 | off 91 | on 92 | once 93 | only 94 | or 95 | other 96 | ought 97 | our 98 
| ours ourselves 99 | out 100 | over 101 | own 102 | same 103 | shan't 104 | she 105 | she'd 106 | she'll 107 | she's 108 | should 109 | shouldn't 110 | so 111 | some 112 | such 113 | than 114 | that 115 | that's 116 | the 117 | their 118 | theirs 119 | them 120 | themselves 121 | then 122 | there 123 | there's 124 | these 125 | they 126 | they'd 127 | they'll 128 | they're 129 | they've 130 | this 131 | those 132 | through 133 | to 134 | too 135 | under 136 | until 137 | up 138 | very 139 | was 140 | wasn't 141 | we 142 | we'd 143 | we'll 144 | we're 145 | we've 146 | were 147 | weren't 148 | what 149 | what's 150 | when 151 | when's 152 | where 153 | where's 154 | which 155 | while 156 | who 157 | who's 158 | whom 159 | why 160 | why's 161 | with 162 | won't 163 | would 164 | wouldn't 165 | you 166 | you'd 167 | you'll 168 | you're 169 | you've 170 | your 171 | yours 172 | yourself 173 | yourselves -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Keval Morabia 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Naive Bayes Classifier in Java 2 | 3 | Naive Bayes methods are a set of supervised learning algorithms based on applying Bayes’ theorem with the “naive” assumption of independence between every pair of features. Naive Bayes learners and classifiers can be extremely fast compared to more sophisticated methods. In spite of their apparently over-simplified assumptions, naive Bayes classifiers have worked quite well in many real-world situations, famously document classification and spam filtering. 4 |
Sentiment Analysis is the process of determining whether a piece of writing is positive, negative or neutral. It’s also known as opinion mining, deriving the opinion or attitude of a speaker. A common use case for this technology is to discover how people feel about a particular topic. 5 |
Your task is to classify whether a given review has a positive or negative tone using naive Bayes classifier. 6 |

Dataset used from - http://ai.stanford.edu/~amaas/data/sentiment/ 7 |
Dataset has 12,500 positive and 12,500 negative reviews of movies for training and testing separately 8 | 9 |
Reference: Chapter6 - Machine Learning by Tom M. Mitchell 10 | ![Image](https://mrcheerful.000webhostapp.com/GitHub/naive-bayes.PNG) 11 | 12 |
13 | 14 | - **Steps to find the sentiment of any text:** 15 | 16 | The code I wrote only gives the sentiment as positive(1) or negative(0) 17 |
You can't know how positive or negative it is 18 | 19 | To get a sentiment, write each text on a line with first integer as the actual sentiment 20 | (for testing the accuracy but you can keep it anything if you want to predict) 21 | after that each word is of the form `word_index:frequency`, separated by spaces 22 | 23 | For example 24 | 10 0:2 1:1 3:1 4:1 25 | Meaning the sentiment is 10 (positive) and contains 26 |
_The_ 2 times 27 |
_And_ 1 time 28 |
_Of_ 1 time 29 |
_To_ 1 time 30 |
/*
 * (README tail, preserved from the dump)
 * (See Vocab.txt file)
 * Put this content in test.data file
 * Again, the value 10 is for test so if you know a sentiment you can verify that the classification should be 1 (pos)
 *
 * To get the predicted value, you can add a print statement after the "int predicted = ..." line and comment out
 * the accuracy-computation code below it, which is just for finding the accuracy of prediction on test data.
 *
 * src/NaiveBayesClassifier.java
 */
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.StringTokenizer;

/**
 * Multinomial (optionally binarized) Naive Bayes sentiment classifier for the
 * Stanford movie-review dataset.
 *
 * Data format: each line is "rating idx:freq idx:freq ...". A rating &gt; 5 is
 * treated as a positive review, anything else as negative. idx is the 0-based
 * line number of the word in the vocabulary file and freq its count in the review.
 */
public class NaiveBayesClassifier {

	/**
	 * Trains on trainFile, classifies every review in testFile, and prints
	 * Accuracy / Precision / Recall / F-Score plus the elapsed time.
	 *
	 * @param binaryNB        if true, clamp every word frequency to 1 (binary NB)
	 * @param trainFile       path to the training data
	 * @param testFile        path to the test data
	 * @param vocabFile       path to the vocabulary (one word per line; line number = word index)
	 * @param stopwordFile    path to the stopword list (one word per line)
	 * @param removeStopwords if true, words from stopwordFile are ignored in both training and testing
	 * @throws IOException if any of the files cannot be read
	 */
	public static void NBClassifier(boolean binaryNB, String trainFile, String testFile, String vocabFile, String stopwordFile, boolean removeStopwords) throws IOException {
		long time = System.currentTimeMillis();
		String s;

		// Stopwords as raw strings; only loaded when stopword removal is requested.
		HashSet<String> stopwordsStr = new HashSet<>();
		if (removeStopwords) {
			try (BufferedReader br = new BufferedReader(new FileReader(stopwordFile))) {
				while ((s = br.readLine()) != null) stopwordsStr.add(s);
			}
		}

		// Translate stopwords to vocabulary indices while counting the vocabulary size.
		HashSet<Integer> stopwords = new HashSet<>();
		int distinctWords = 0;
		try (BufferedReader br = new BufferedReader(new FileReader(vocabFile))) {
			while ((s = br.readLine()) != null) {
				if (stopwordsStr.contains(s)) stopwords.add(distinctWords);
				distinctWords++;
			}
		}

		int[] countPos = new int[distinctWords]; // countPos[i] = Count(word=vocab[i] && Review=positive)
		int[] countNeg = new int[distinctWords];
		int posReviews = 0, negReviews = 0, totalWordsInPosReviews = 0, totalWordsInNegReviews = 0;

		// Training pass: accumulate per-class word counts and class priors.
		try (BufferedReader br = new BufferedReader(new FileReader(trainFile))) {
			while ((s = br.readLine()) != null) {
				StringTokenizer st = new StringTokenizer(s, " :");
				if (st.countTokens() == 0) continue; // skip blank lines
				int rating = Integer.parseInt(st.nextToken());
				boolean positive = rating > 5;
				if (positive) posReviews++;
				else negReviews++;
				while (st.hasMoreTokens()) {
					int word = Integer.parseInt(st.nextToken());
					int freq = Integer.parseInt(st.nextToken());
					freq = binaryNB ? 1 : freq; // binary NB: presence only
					if (stopwords.contains(word)) continue;
					if (positive) {
						countPos[word] += freq;
						totalWordsInPosReviews += freq;
					} else {
						countNeg[word] += freq;
						totalWordsInNegReviews += freq;
					}
				}
			}
		}

		// Test pass: classify by comparing class log-probabilities.
		// Laplace add-one smoothing avoids log(0) for words unseen in a class.
		int truePositive = 0, falsePositive = 0, falseNegative = 0, correctClassification = 0, incorrectClassification = 0;
		try (BufferedReader br = new BufferedReader(new FileReader(testFile))) {
			while ((s = br.readLine()) != null) {
				StringTokenizer st = new StringTokenizer(s, " :");
				if (st.countTokens() == 0) continue; // skip blank lines (train loop already guarded; test loop did not)
				int rating = Integer.parseInt(st.nextToken());
				int actual = rating > 5 ? 1 : 0; // 1-->positive, 0-->negative
				double probOfPos = Math.log(posReviews / (posReviews + negReviews + 0.0));
				double probOfNeg = Math.log(negReviews / (posReviews + negReviews + 0.0));

				while (st.hasMoreTokens()) {
					int word = Integer.parseInt(st.nextToken());
					int freq = Integer.parseInt(st.nextToken());
					freq = binaryNB ? 1 : freq;
					if (stopwords.contains(word)) continue;
					probOfPos += freq * Math.log((countPos[word] + 1) / (totalWordsInPosReviews + distinctWords + 0.0));
					probOfNeg += freq * Math.log((countNeg[word] + 1) / (totalWordsInNegReviews + distinctWords + 0.0));
				}

				int predicted = probOfPos > probOfNeg ? 1 : 0;
				if (predicted == actual) correctClassification++;
				else incorrectClassification++;

				if (predicted == 1 && actual == 1) truePositive++;
				else if (predicted == 1 && actual == 0) falsePositive++; // BUGFIX: was incorrectly counted as falseNegative
				else if (predicted == 0 && actual == 1) falseNegative++; // BUGFIX: was incorrectly counted as falsePositive
			}
		}

		double accuracy = correctClassification / (correctClassification + incorrectClassification + 0.0);
		double precision = truePositive / (truePositive + falsePositive + 0.0);
		double recall = truePositive / (truePositive + falseNegative + 0.0);
		double fscore = 2 * precision * recall / (precision + recall);
		System.out.println("Accuracy=" + accuracy + "\nPrecision=" + precision + " Recall=" + recall + " F-Score=" + fscore);
		time = System.currentTimeMillis() - time;
		System.out.println("Time:" + time / 1000d + "s");
	}

	/**
	 * Runs the four experiment configurations: multinomial and binary NB,
	 * each with and without stopword removal.
	 */
	public static void main(String[] args) throws IOException {
		String trainFile = "Data/Train.data", testFile = "Data/Test.data", vocabFile = "Data/Vocab.data", stopwordFile = "Data/stopwords.txt";
		System.out.println("Without removing stopwords");
		NBClassifier(false, trainFile, testFile, vocabFile, stopwordFile, false);
		System.out.println("After removing stopwords");
		NBClassifier(false, trainFile, testFile, vocabFile, stopwordFile, true);

		System.out.println("\nBinary Naive Bayes Classification:");
		System.out.println("Without removing stopwords");
		NBClassifier(true, trainFile, testFile, vocabFile, stopwordFile, false);
		System.out.println("After removing stopwords");
		NBClassifier(true, trainFile, testFile, vocabFile, stopwordFile, true);
	}
}