├── Init.java ├── README.md └── bayes └── NaiveBayesClassifier.java /Init.java: -------------------------------------------------------------------------------- 1 | import java.util.Scanner; 2 | 3 | import bayes.NaiveBayesClassifier; 4 | 5 | public class Init { 6 | 7 | public static Scanner s = new Scanner(System.in); 8 | public static void main(String[] args) { 9 | // String for System.In 10 | String input = ""; 11 | System.out.println( 12 | "======================================================================================\n\n\n" 13 | + " NAIVE BAYES TEXT CLASSIFIER \n" 14 | + " Patrick Roderman \n\n\n" 15 | + "======================================================================================\n" 16 | ); 17 | // Features,Training Data file paths, isEvidential (Evidential learning) 18 | NaiveBayesClassifier NBC = new NaiveBayesClassifier("E:\\Spring2016\\ML\\NaiveBayesClassifierTXT\\keywords.txt", 19 | "E:\\Spring2016\\ML\\NaiveBayesClassifierTXT\\trainingData.txt", 20 | true 21 | ); 22 | 23 | // input loop 24 | boolean isRunning = true; 25 | while(isRunning){ 26 | System.out.println("" 27 | + "======================================================================================\n" 28 | + "Enter Review (txt) file path to Classify: " 29 | + ""); 30 | while (!s.hasNext()) s.next(); 31 | input = s.next(); 32 | System.out.println("======================================================================================\n"); 33 | //Predict Text Class 34 | NBC.classify(input); 35 | 36 | } 37 | } 38 | } 39 | 40 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Naive Bayes Classifier (Java) 2 | Generalized Naive Bayes Classifier for text written in Java. 
3 | 4 | ## Code Example 5 | ```Java 6 | //create new object 7 | NaiveBayesClassifier NBC = new NaiveBayesClassifier("\\keywords.txt", "\\trainingData.txt", true); 8 | 9 | //classify input (path to a .txt file containing one entry terminated by ";") 10 | String input = "\\yourEntry.txt"; 11 | NBC.classify(input); 12 | ``` 13 | Configure the file location for training keywords and training entries in the NaiveBayesClassifier constructor. 14 | The third constructor parameter enables evidential learning. 15 | You can set the threshold for evidential learning inside the NaiveBayesClassifier if you wish (no threshold by default). 16 | 17 | ## Set Up 18 | Training Entries 19 | *** 20 | In a training data text file, include entries with their classification delimited by " : " (a colon with a space on each side), with each individual entry delimited by ";". 21 | 22 | e.g. Your training entry : ClassificationName; 23 | 24 | Key Words 25 | *** 26 | In a key words text file, include your initial associations. Key words which belong to the same classification should be delimited by "," and the group associated to its classification by ":". The classified key words should be delimited by ";". 27 | 28 | e.g. keyWord1, keyWord2, keyWord3 : ClassificationName; 29 | -------------------------------------------------------------------------------- /bayes/NaiveBayesClassifier.java: -------------------------------------------------------------------------------- 1 | /** 2 | * NaiveBayesClassifier 3 | * A generalized implementation of a text based Naive Bayes 4 | * Classifier. 
5 | * 6 | * @author PatrickRoderman 7 | * @version 1.0 8 | * @since 2016-02-21 9 | */ 10 | 11 | package bayes; 12 | 13 | import java.io.BufferedReader; 14 | import java.io.FileNotFoundException; 15 | import java.io.FileReader; 16 | import java.io.IOException; 17 | import java.util.HashMap; 18 | import java.util.LinkedHashSet; 19 | import java.util.List; 20 | import java.util.Set; 21 | import java.util.Vector; 22 | 23 | public class NaiveBayesClassifier { 24 | // boolean settings 25 | // add text and its predicted classification into the model 26 | public boolean isEvidential = false; 27 | 28 | // (feature names , (class, count)) 29 | public HashMap> features = new HashMap>(); 30 | 31 | // count for classes (class, count) 32 | public HashMap classCounts = new HashMap(); 33 | 34 | // maintains vector of keywords/features 35 | public Vector featureNames = new Vector(); 36 | 37 | public NaiveBayesClassifier(String featureFile, String trainingFile, boolean isEvidential) { 38 | this.isEvidential = isEvidential; 39 | // get features from file 40 | System.out.println("Attempting to read features file..."); 41 | Vector keyWordLines = fileReader(featureFile); 42 | System.out.println("Features successfully read."); 43 | // add features and classes to globals 44 | addFeatures(keyWordLines); 45 | 46 | // read training data and add to counters 47 | System.out.println("Attempting to read training file..."); 48 | Vector trainingLines = fileReader(trainingFile); 49 | System.out.println("Training entries successfully read.\n"); 50 | 51 | //set counters for each feature and class occurrence 52 | setCounters(trainingLines); 53 | } 54 | 55 | /** 56 | * Sets the global data structures (increments features and classes per instance of occurrence) 57 | * @param (Vector) trainingLines - training text stored in individual elements 58 | */ 59 | public void setCounters(Vector trainingLines) { 60 | Vector lines = trainingLines; 61 | 62 | //initializes features maps and class counts 63 | for 
(String line : lines) { 64 | 65 | //parses features and class 66 | String[] sample = line.split(" : "); 67 | String sampleClass = ""; 68 | // Check file format 69 | try{ 70 | sampleClass = sample[1]; 71 | }catch(Exception NullPointerException){ 72 | System.out.println("Invalid file formating - reconfigure file"); 73 | sampleClass = ""; 74 | } 75 | String[] allWords = sample[0].split(" "); 76 | Vector lineFeatures = getFeatures(allWords); 77 | 78 | //sets default values in HashMap to handle null 79 | for (String feat : lineFeatures) { 80 | //features.get(feat).probs.put(sampleClass, 0); 81 | features.get(feat).put(sampleClass, 0); 82 | } 83 | } 84 | 85 | //sets counts 86 | for (String line : lines) { 87 | String[] sample = line.split(" : "); 88 | String sampleClass = sample[1]; 89 | String[] allWords = sample[0].split(" "); 90 | 91 | // update count for features 92 | Vector lineFeatures = getFeatures(allWords); 93 | // update class count 94 | classCounts.put(sampleClass, (classCounts.get(sampleClass) + 1)); 95 | for (String feat : lineFeatures) { 96 | //features.get(feat).probs.put(sampleClass,(features.get(feat).probs.get(sampleClass)+1)); 97 | features.get(feat).put(sampleClass,(features.get(feat).get(sampleClass)+1)); 98 | } 99 | } 100 | } 101 | 102 | /** 103 | * Reads file and returns Vector of elements delimited by new lines 104 | * @param (String) filePath - global path to file 105 | * @return (Vector) - each element contains a line from file 106 | */ 107 | public Vector fileReader(String filePath) { 108 | 109 | Vector extracted = new Vector(); 110 | Vector allLines = new Vector(); 111 | String fileName = filePath; 112 | String line = null; 113 | 114 | try { 115 | FileReader fileReader = new FileReader(fileName); 116 | BufferedReader bufferedReader = new BufferedReader(fileReader); 117 | while ((line = bufferedReader.readLine()) != null) { 118 | // add line 119 | allLines.add(line); 120 | } 121 | bufferedReader.close(); 122 | 123 | 124 | } catch 
(FileNotFoundException ex) { 125 | System.out.println("Unable to open file '" + fileName + "'"); 126 | return null; 127 | } catch (IOException ex) { 128 | System.out.println("Error reading '" + fileName + "'"); 129 | return null; 130 | } finally { 131 | 132 | for(String s : allLines){ 133 | //check if empty 134 | String check = s.replaceAll(" ", ""); 135 | 136 | String entry = ""; 137 | if(s.contains(";")){ 138 | String append = s.replace(";", ""); 139 | entry = entry + append; 140 | extracted.add(entry); 141 | }else if(check.equals("")){ 142 | 143 | 144 | }else{ 145 | entry = entry + s; 146 | } 147 | } 148 | 149 | } 150 | return extracted; 151 | } 152 | 153 | /** 154 | * Returns features from specified class 155 | * @param (String[]) alWords - each word from text stored in different elements 156 | * @param (String) featureClass - the name of the class you want to get features from 157 | * @return (Vector) filtered - features from class name : String featureClass 158 | */ 159 | public Vector getFeatures(String[] allWords, String featureClass) { 160 | Vector filtered = new Vector(); 161 | // only add keywords from V into filtered 162 | for (int i = 0; i < allWords.length; i++) { 163 | for (String keyword : featureNames) { 164 | if ((allWords[i].toLowerCase()).contains(keyword.toLowerCase())) { 165 | //filtered.add(allWords[i].toLowerCase()); 166 | filtered.add(keyword.toLowerCase()); 167 | } 168 | } 169 | } 170 | // return all non-duplicate values from filtered Vector 171 | return (Vector) removeDups(filtered); 172 | } 173 | 174 | /** 175 | * Returns all features 176 | * @param (String[]) allWords - each word from text stored in different element 177 | * @return (Vector) filtered - contains all features from String[] allWords 178 | */ 179 | public Vector getFeatures(String[] allWords) { 180 | Vector filtered = new Vector(); 181 | 182 | for (int i = 0; i < allWords.length; i++) { 183 | for (String keyword : featureNames) { 184 | if 
((allWords[i].toLowerCase()).contains((keyword).toLowerCase())) { 185 | //filtered.add(allWords[i].toLowerCase()); 186 | filtered.add(keyword.toLowerCase()); 187 | } 188 | } 189 | } 190 | // return all non-duplicate values from filtered Vector 191 | return (Vector) removeDups(filtered); 192 | } 193 | 194 | /** 195 | * Parses line for features. 196 | * Individual features should be delimited by "," and all features delimited by ":" from their class 197 | * Both features and their class should be delimited by a new line 198 | * @param (Vector lines - each element should contain the entire line/review/text 199 | */ 200 | public void addFeatures(Vector lines) { 201 | for (String sample : lines) { 202 | // remove spaces 203 | sample = sample.replaceAll("\\s+", ""); 204 | 205 | // separate into features, and class 206 | String[] associate = sample.split(":"); 207 | 208 | // separate features and class 209 | String[] allFeatures = associate[0].split(","); 210 | String featuresClass = associate[1]; 211 | Vector featuresVector = new Vector(); 212 | 213 | // copy features into list to remove duplicates 214 | for (int i = 0; i < allFeatures.length; i++) { 215 | featuresVector.add(allFeatures[i]); 216 | } 217 | Vector featuresNoDups = (Vector) removeDups(featuresVector); 218 | 219 | // add feature Class to vector to start 220 | classCounts.put(featuresClass, 0); 221 | for (String feature : featuresNoDups) { 222 | // add feature and its details to feature (HashMap) 223 | features.put(feature, new HashMap()); 224 | features.put(feature, new HashMap()); 225 | 226 | // add features to feature vector 227 | featureNames.add(feature); 228 | } 229 | } 230 | } 231 | 232 | /** 233 | * Removes duplicate keywords from lists 234 | * @param (List list - list for duplicate removal 235 | * @return (Vector) - Vector without any duplicates 236 | */ 237 | private static List removeDups(List list) { 238 | return new Vector(new LinkedHashSet(list)); 239 | } 240 | 241 | /** 242 | * Calculates 
conditional probability P(Features | Class) 243 | * @param (Vector) sample - text for feature extraction 244 | * @param (String) featureClass - name of Class 245 | * @return (double) P(Features | Class) 246 | */ 247 | public double conditionalProb(Vector sample, String featureClass){ 248 | double prob = 1; 249 | String file = sample.get(0); 250 | String[] allWords = file.split(" "); 251 | Vector featuresList = getFeatures(allWords); 252 | 253 | //get all classes 254 | Set keys = classCounts.keySet(); 255 | int combinedClassCount = 0; 256 | 257 | for(String key : keys){ 258 | 259 | combinedClassCount += classCounts.get(key); 260 | } 261 | 262 | //counter for no prior data 263 | int offset = 0; 264 | for (String s : featuresList) { 265 | if(features.get(s).get(featureClass) == null){ 266 | features.get(s).put(featureClass, 0); 267 | } 268 | System.out.println( "P(" + s + " | " + featureClass +") = " + features.get(s).get(featureClass) + "/" + classCounts.get(featureClass)); 269 | double featureProb = features.get(s).get(featureClass); 270 | 271 | if(featureProb == 0){ 272 | offset++; 273 | }else{ 274 | prob += featureProb; 275 | } 276 | } 277 | System.out.println("\n" + prob + "/" + ((combinedClassCount) - offset)); 278 | return prob/(combinedClassCount - offset); 279 | } 280 | 281 | /** 282 | * Displays conditional probabilities and conclusion - P(Features | Class) 283 | * @param (String) filePath - global file path to text you want classified (.txt) 284 | * @return (String) - class name with highest probability 285 | */ 286 | public String classify(String filePath){ 287 | 288 | //parse input to lines 289 | Vector sample = fileReader(filePath); 290 | 291 | //Get all Class Names 292 | Set allClasses = classCounts.keySet(); 293 | 294 | 295 | Vector featureClasses = new Vector(); 296 | 297 | // Store all class probabilities 298 | double[] classProb = new double[classCounts.size()]; 299 | 300 | // Checks for reviews 301 | if(sample.size() == 1){ 302 | int counter = 0; 
303 | 304 | for(String featureClass : allClasses){ 305 | System.out.println("---------------------------------"); 306 | featureClasses.add(featureClass); 307 | classProb[counter] = conditionalProb(sample, featureClass); 308 | counter++; 309 | } 310 | System.out.println("---------------------------------"); 311 | 312 | //sort array 313 | double highestValue = 0; 314 | String highestValueClass = ""; 315 | 316 | for(int i = 0; i