├── Init.java
├── README.md
└── bayes
    └── NaiveBayesClassifier.java


/Init.java:
--------------------------------------------------------------------------------
 1 | import java.util.Scanner;
 2 | 
 3 | import bayes.NaiveBayesClassifier;
 4 | 
 5 | public class Init {
 6 | 
 7 | 	public static Scanner s = new Scanner(System.in);
 8 | 	public static void main(String[] args) {
 9 | 		// String for System.In
10 | 		String input = "";
11 | 		System.out.println(
12 | 				"======================================================================================\n\n\n"
13 | 				+ "                            NAIVE BAYES TEXT CLASSIFIER                               \n"
14 | 				+ "                                  Patrick Roderman                                    \n\n\n"
15 | 				+ "======================================================================================\n"
16 | 				);
17 | 		// Features,Training Data file paths, isEvidential (Evidential learning)
18 | 		NaiveBayesClassifier NBC = new NaiveBayesClassifier("E:\\Spring2016\\ML\\NaiveBayesClassifierTXT\\keywords.txt", 
19 | 															"E:\\Spring2016\\ML\\NaiveBayesClassifierTXT\\trainingData.txt", 
20 | 															true
21 | 															);
22 | 	
23 | 		// input loop
24 | 		boolean isRunning = true;
25 | 		while(isRunning){
26 | 			System.out.println(""
27 | 					+ "======================================================================================\n"
28 | 					+ "Enter Review (txt) file path to Classify: "
29 | 					+ "");
30 | 			while (!s.hasNext()) s.next();
31 | 			input = s.next();
32 | 			System.out.println("======================================================================================\n");
33 | 			//Predict Text Class
34 | 			NBC.classify(input);
35 | 			
36 | 		}
37 | 	}
38 | }
39 | 
40 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Naive Bayes Classifier (Java)
 2 | A generalized Naive Bayes classifier for text, written in Java.
 3 | 
 4 | ## Code Example
 5 | ```Java
 6 | //create new object
 7 | NaiveBayesClassifier NBC = new NaiveBayesClassifier("\\keywords.txt", "\\trainingData.txt", true);
 8 | 					
 9 | //classify input: classify() takes the path of a text file holding a single entry terminated by ";"
10 | String input = "\\review.txt";
11 | NBC.classify(input);
12 | ```
13 | Configure the file location for training keywords and training entries in the NaiveBayesClassifier constructor.
14 | The third constructor parameter enables evidential learning.
15 | You can set the threshold for evidential learning inside the NaiveBayesClassifier if you wish (no threshold by default).
16 | 
17 | ## Set Up
18 | Training Entries
19 | ***
20 | In a training data text file, include entries with their classification delimited by ":", with each individual entry delimited by ";".
21 | 
22 | e.g.   Your training entry : ClassificationName;
23 | 
24 | Key Words
25 | ***
26 | In a key words text file, include your initial associations. Key words that belong to the same classification should be separated by ",", and the group should be separated from its classification name by ":". Each group of classified key words should be terminated by ";".
27 | 
28 | e.g.  keyWord1, keyWord2, keyWord3 : ClassificationName; 
29 | 


--------------------------------------------------------------------------------
/bayes/NaiveBayesClassifier.java:
--------------------------------------------------------------------------------
  1 | /** 
  2 |  * 	NaiveBayesClassifier
  3 |  * 	A generalized implementation of a text based Naive Bayes
  4 |  * 	Classifier.
  5 |  * 
  6 |  * 	@author PatrickRoderman
  7 |  * 	@version 1.0
  8 |  * 	@since   2016-02-21 
  9 |  */
 10 | 
 11 | package bayes;
 12 | 
 13 | import java.io.BufferedReader;
 14 | import java.io.FileNotFoundException;
 15 | import java.io.FileReader;
 16 | import java.io.IOException;
 17 | import java.util.HashMap;
 18 | import java.util.LinkedHashSet;
 19 | import java.util.List;
 20 | import java.util.Set;
 21 | import java.util.Vector;
 22 | 
 23 | public class NaiveBayesClassifier {
 24 | 	//	boolean settings
 25 | 	//	add text and its predicted classification into the model
 26 | 	public boolean isEvidential = false;
 27 | 	
 28 | 	// (feature names , (class, count))
 29 | 	public HashMap<String, HashMap<String, Integer>> features = new HashMap<String, HashMap<String, Integer>>();
 30 | 
 31 | 	// count for classes  (class, count)
 32 | 	public HashMap<String, Integer> classCounts = new HashMap<String, Integer>();
 33 | 
 34 | 	// maintains vector of keywords/features
 35 | 	public Vector<String> featureNames = new Vector<String>();
 36 | 
 37 | 	public NaiveBayesClassifier(String featureFile, String trainingFile, boolean isEvidential) {
 38 | 		this.isEvidential = isEvidential;
 39 | 		// get features from file
 40 | 		System.out.println("Attempting to read features file...");
 41 | 		Vector<String> keyWordLines = fileReader(featureFile);
 42 | 		System.out.println("Features successfully read.");
 43 | 		// add features and classes to globals
 44 | 		addFeatures(keyWordLines);
 45 | 		
 46 | 		// read training data and add to counters
 47 | 		System.out.println("Attempting to read training file...");
 48 | 		Vector<String> trainingLines = fileReader(trainingFile);
 49 | 		System.out.println("Training entries successfully read.\n");
 50 | 		
 51 | 		//set counters for each feature and class occurrence
 52 | 		setCounters(trainingLines);
 53 | 	}
 54 | 	
 55 | 	/** 
 56 | 	 * 	Sets the global data structures (increments features and classes per instance of occurrence)
 57 | 	 * 	@param (Vector<String>) trainingLines - training text stored in individual elements
 58 | 	 */
 59 | 	public void setCounters(Vector<String> trainingLines) {
 60 | 		Vector<String> lines = trainingLines;
 61 | 		
 62 | 		//initializes features maps and class counts
 63 | 		for (String line : lines) {
 64 | 			
 65 | 			//parses features and class
 66 | 			String[] sample = line.split(" : ");
 67 | 			String sampleClass = "";
 68 | 			// Check file format
 69 | 			try{ 
 70 | 				sampleClass = sample[1];
 71 | 			}catch(Exception NullPointerException){
 72 | 				System.out.println("Invalid file formating - reconfigure file");
 73 | 				sampleClass = "";
 74 | 			}
 75 | 			String[] allWords = sample[0].split(" ");
 76 | 			Vector<String> lineFeatures = getFeatures(allWords);
 77 | 			
 78 | 			//sets default values in HashMap to handle null
 79 | 			for (String feat : lineFeatures) {
 80 | 				//features.get(feat).probs.put(sampleClass, 0);
 81 | 				features.get(feat).put(sampleClass, 0);
 82 | 			}
 83 | 		}
 84 | 		
 85 | 		//sets counts
 86 | 		for (String line : lines) {
 87 | 			String[] sample = line.split(" : ");
 88 | 			String sampleClass = sample[1];
 89 | 			String[] allWords = sample[0].split(" ");
 90 | 
 91 | 			// update count for features
 92 | 			Vector<String> lineFeatures = getFeatures(allWords);
 93 | 			// update class count
 94 | 			classCounts.put(sampleClass, (classCounts.get(sampleClass) + 1));
 95 | 			for (String feat : lineFeatures) {
 96 | 				//features.get(feat).probs.put(sampleClass,(features.get(feat).probs.get(sampleClass)+1));
 97 | 				features.get(feat).put(sampleClass,(features.get(feat).get(sampleClass)+1));
 98 | 			}	
 99 | 		}
100 | 	}
101 | 	
102 | 	/** 
103 | 	 *	Reads file and returns Vector of elements delimited by new lines
104 | 	 *	@param (String) filePath - global path to file
105 | 	 *	@return (Vector<String>) - each element contains a line from file
106 | 	 */
107 | 	public Vector<String> fileReader(String filePath) {
108 | 		
109 | 		Vector<String> extracted = new Vector<String>();
110 | 		Vector<String> allLines = new Vector<String>();
111 | 		String fileName = filePath;
112 | 		String line = null;
113 | 		
114 | 		try {
115 | 			FileReader fileReader = new FileReader(fileName);
116 | 			BufferedReader bufferedReader = new BufferedReader(fileReader);
117 | 			while ((line = bufferedReader.readLine()) != null) {
118 | 				// add line
119 | 				allLines.add(line);
120 | 			}
121 | 			bufferedReader.close();
122 | 			
123 | 
124 | 		} catch (FileNotFoundException ex) {
125 | 			System.out.println("Unable to open file '" + fileName + "'");
126 | 			return null;
127 | 		} catch (IOException ex) {
128 | 			System.out.println("Error reading '" + fileName + "'");
129 | 			return null;
130 | 		} finally {
131 | 			
132 | 			for(String s : allLines){
133 | 				//check if empty
134 | 				String check = s.replaceAll(" ", "");
135 | 				
136 | 				String entry = "";
137 | 				if(s.contains(";")){
138 | 					String append = s.replace(";", "");
139 | 					entry = entry + append;
140 | 					extracted.add(entry);
141 | 				}else if(check.equals("")){
142 | 					
143 | 					
144 | 				}else{
145 | 					entry = entry + s;
146 | 				}
147 | 			}
148 | 			
149 | 		}
150 | 		return extracted;
151 | 	}
152 | 	
153 | 	/** 
154 | 	 * 	Returns features from specified class
155 | 	 * 	@param (String[]) alWords - each word from text stored in different elements
156 | 	 * 	@param (String) featureClass - the name of the class you want to get features from
157 | 	 *  @return (Vector<String>) filtered - features from class name : String featureClass
158 | 	 */
159 | 	public Vector<String> getFeatures(String[] allWords, String featureClass) {
160 | 		Vector<String> filtered = new Vector<String>();
161 | 		// only add keywords from V into filtered
162 | 		for (int i = 0; i < allWords.length; i++) {
163 | 			for (String keyword : featureNames) {
164 | 				if ((allWords[i].toLowerCase()).contains(keyword.toLowerCase())) {
165 | 					//filtered.add(allWords[i].toLowerCase());
166 | 					filtered.add(keyword.toLowerCase());
167 | 				}
168 | 			}
169 | 		}
170 | 		// return all non-duplicate values from filtered Vector
171 | 		return (Vector<String>) removeDups(filtered);
172 | 	}
173 | 	
174 | 	/** 
175 | 	 * 	Returns all features
176 | 	 * 	@param (String[]) allWords - each word from text stored in different element
177 | 	 *  @return (Vector<String>) filtered - contains all features from String[] allWords
178 | 	 */	
179 | 	public Vector<String> getFeatures(String[] allWords) {
180 | 		Vector<String> filtered = new Vector<String>();
181 | 
182 | 		for (int i = 0; i < allWords.length; i++) {
183 | 			for (String keyword : featureNames) {
184 | 				if ((allWords[i].toLowerCase()).contains((keyword).toLowerCase())) {
185 | 					//filtered.add(allWords[i].toLowerCase());
186 | 					filtered.add(keyword.toLowerCase());
187 | 				}
188 | 			}
189 | 		}
190 | 		// return all non-duplicate values from filtered Vector
191 | 		return (Vector<String>) removeDups(filtered);
192 | 	}
193 | 
194 | 	/** 
195 | 	 * 	Parses line for features.
196 | 	 * 	Individual features should be delimited by "," and all features delimited by ":" from their class
197 | 	 *	Both features and their class should be delimited by a new line
198 | 	 *	@param (Vector<String> lines - each element should contain the entire line/review/text
199 | 	 */
200 | 	public void addFeatures(Vector<String> lines) {
201 | 		for (String sample : lines) {
202 | 			// remove spaces
203 | 			sample = sample.replaceAll("\\s+", "");
204 | 			
205 | 			// separate into features, and class
206 | 			String[] associate = sample.split(":");
207 | 			
208 | 			// separate features and class
209 | 			String[] allFeatures = associate[0].split(",");
210 | 			String featuresClass = associate[1];
211 | 			Vector<String> featuresVector = new Vector<String>();
212 | 
213 | 			// copy features into list to remove duplicates
214 | 			for (int i = 0; i < allFeatures.length; i++) {
215 | 				featuresVector.add(allFeatures[i]);
216 | 			}
217 | 			Vector<String> featuresNoDups = (Vector<String>) removeDups(featuresVector);
218 | 			
219 | 			// add feature Class to vector to start
220 | 			classCounts.put(featuresClass, 0);
221 | 			for (String feature : featuresNoDups) {
222 | 				// add feature and its details to feature (HashMap)
223 | 				features.put(feature, new HashMap<String, Integer>());
224 | 				features.put(feature, new HashMap<String, Integer>());
225 | 				
226 | 				// add features to feature vector
227 | 				featureNames.add(feature);
228 | 			}
229 | 		}
230 | 	}
231 | 
232 | 	/** 
233 | 	 *	Removes duplicate keywords from lists
234 | 	 *	@param (List<T> list - list for duplicate removal
235 | 	 *	@return (Vector<T>) - Vector without any duplicates
236 | 	 */
237 | 	private static <T> List<T> removeDups(List<T> list) {
238 | 		return new Vector<T>(new LinkedHashSet<T>(list));
239 | 	}
240 | 
241 | 	/** 
242 | 	 * 	Calculates conditional probability P(Features | Class)
243 | 	 *	@param (Vector<String>) sample - text for feature extraction
244 | 	 *  @param (String) featureClass - name of Class
245 | 	 *	@return (double) P(Features | Class)
246 | 	 */
247 | 	public double conditionalProb(Vector<String> sample, String featureClass){
248 | 		double prob = 1;
249 | 		String file = sample.get(0);
250 | 		String[] allWords = file.split(" ");
251 | 		Vector<String> featuresList = getFeatures(allWords);
252 | 		
253 | 		//get all classes
254 | 		Set<String> keys = classCounts.keySet();
255 | 		int combinedClassCount = 0;
256 | 		
257 | 		for(String key : keys){
258 | 			
259 | 			combinedClassCount += classCounts.get(key);
260 | 		}
261 | 				
262 | 		//counter for no prior data
263 | 		int offset = 0;
264 | 		for (String s : featuresList) {
265 | 			if(features.get(s).get(featureClass) == null){
266 | 				features.get(s).put(featureClass, 0);
267 | 			}
268 | 			System.out.println( "P(" + s + " | " + featureClass +") = " + features.get(s).get(featureClass) + "/" + classCounts.get(featureClass));
269 | 			double featureProb = features.get(s).get(featureClass);
270 | 			
271 | 			if(featureProb == 0){
272 | 				offset++;
273 | 			}else{
274 | 				prob += featureProb; 
275 | 			}
276 | 		}
277 | 		System.out.println("\n" + prob + "/" + ((combinedClassCount) - offset));
278 | 		return prob/(combinedClassCount - offset);
279 | 	}
280 | 	
281 | 	/** 
282 | 	 * 	Displays conditional probabilities and conclusion - P(Features | Class)
283 | 	 *	@param (String) filePath - global file path to text you want classified (.txt)
284 | 	 *	@return (String) - class name with highest probability
285 | 	 */
286 | 	public String classify(String filePath){
287 | 		
288 | 		//parse input to lines
289 | 		Vector<String> sample = fileReader(filePath);
290 | 		
291 | 		//Get all Class Names
292 | 		Set<String> allClasses = classCounts.keySet();
293 | 		
294 | 		
295 | 		Vector<String> featureClasses = new Vector<String>();
296 | 		
297 | 		// Store all class probabilities
298 | 		double[] classProb = new double[classCounts.size()];
299 | 		
300 | 		// Checks for reviews
301 | 		if(sample.size() == 1){
302 | 			int counter = 0;
303 | 		
304 | 			for(String featureClass : allClasses){
305 | 				System.out.println("---------------------------------");
306 | 				featureClasses.add(featureClass);
307 | 				classProb[counter] = conditionalProb(sample, featureClass);
308 | 				counter++;
309 | 			}
310 | 			System.out.println("---------------------------------");
311 | 			
312 | 			//sort array
313 | 			double highestValue = 0;
314 | 			String highestValueClass = "";
315 | 		
316 | 			for(int i = 0; i<classProb.length; i++){
317 | 				System.out.println("P("+ featureClasses.get(i) + ") = " + classProb[i]);
318 | 				//set highest value
319 | 				if(highestValue < classProb[i]){
320 | 					highestValue = classProb[i];
321 | 					highestValueClass = featureClasses.get(i);
322 | 				}
323 | 			}
324 | 			
325 | 			System.out.println("---------------------------------");
326 | 			
327 | 			System.out.println("\n\nCONCLUSION:\n----------------\nThe review is most likely to be classified as : " + highestValueClass + ".\n");
328 | 			
329 | 			// Evidential learning / set counters
330 | 			
331 | 			if(isEvidential){
332 | 				// append predicted Classification class to review
333 | 				sample.set(0, sample.get(0)+ " : " + highestValueClass);
334 | 				setCounters(sample);
335 | 			}
336 | 			
337 | 			return highestValueClass;
338 | 		}else{
339 | 			//If supplied file contains more or less than one line
340 | 			System.out.println("The file you loaded was not properly formated.");
341 | 			return null;
342 | 		}
343 | 	}
344 | }


--------------------------------------------------------------------------------