├── README.mkd └── Classifier.java /README.mkd: -------------------------------------------------------------------------------- 1 | J2ME-Bayes is a general-purpose Naive Bayes classifier for the Java Micro Edition platform. It consists of a single class that is easy to use for many purposes, including: 2 | 3 | - Classify m-spam; 4 | - Classify SMS by author, category, etc; 5 | - Detect the language of a text; 6 | - Classify any kind of document. 7 | 8 | 9 | Simple example 10 | -------------- 11 | 12 |
13 | Classifier classifier = new Classifier();
14 | 
15 | classifier.train("the and is if on in at a i you he she it we they", "English");
16 | classifier.train("le la les dans cette du un une de en je vous il elle et", "French");
17 | classifier.train("ein eine ich er sie der die das der wir und dieser", "German");
18 | classifier.train("a e o ou se no na eu um uma eu ele ela voce isso isto", "Portuguese");
19 | classifier.train("que ella y el las de la uno una en esta", "Spanish");
20 | 
21 | /* Classify a Portuguese phrase which quotes an English sentence */
22 | classifier.classify("Se eu disser - 'The book is on the table.' - eu estarei falando uma frase muito comum em Inglês.");
23 | // returns "Portuguese"
24 | 
/** J2ME-Bayes
 *
 * Rafael Alencar rafjaa at gmail.com
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 *
 * J2ME-Bayes is a general-purpose Naive Bayes classifier for the Java
 * Micro Edition platform. It consists of a single class that is easy
 * to use for many purposes, including:
 * - Classify SMS spam;
 * - Classify SMS by author, category, etc;
 * - Detect the language of a text;
 * - Classify any kind of document.
 *
 * This algorithm is inspired by the book "Programming Collective Intelligence",
 * by Toby Segaran.
 *
 * Only APIs available on CLDC 1.1 are used (raw Hashtable/Vector,
 * StringBuffer, no Math.log), so the class still builds for J2ME.
 *
 * @author Rafael Alencar
 * @version 0.1 November/2010
 */

import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Vector;

public class Classifier {

    /** Exclusive upper bound of a normalized score mantissa: 2^256. */
    private static final double MANTISSA_HIGH = powerOfTwo(256);

    /** Inclusive lower bound of a normalized score mantissa: 2^-256. */
    private static final double MANTISSA_LOW = 1.0 / MANTISSA_HIGH;

    /** Factor applied on each rescaling: 2^512 (a power of two, so rescaling is exact). */
    private static final double SCALE_STEP = MANTISSA_HIGH * MANTISSA_HIGH;

    /** Reciprocal of {@link #SCALE_STEP}: 2^-512. */
    private static final double INVERSE_SCALE_STEP = MANTISSA_LOW * MANTISSA_LOW;

    /** Maps feature (String) to a Hashtable of category (String) to Integer count. */
    private final Hashtable featureCounter;

    /** Maps category (String) to the Integer number of items trained for it. */
    private final Hashtable categoryCounter;

    public Classifier() {
        featureCounter = new Hashtable();
        categoryCounter = new Hashtable();
    }

    /**
     * Implement this method if you need to load a previously trained
     * classifier into the hashtables above.
     *
     * @throws RuntimeException always, until implemented.
     */
    public void load() {
        throw new RuntimeException("Not implemented yet.");
    }

    /**
     * Implement this method if you need to save the classifier training.
     *
     * @throws RuntimeException always, until implemented.
     */
    public void save() {
        throw new RuntimeException("Not implemented yet.");
    }

    /**
     * Computes 2^exponent by repeated doubling (CLDC has no Math.pow).
     */
    private static double powerOfTwo(int exponent) {
        double result = 1.0;
        for (int i = 0; i < exponent; i += 1) {
            result *= 2.0;
        }
        return result;
    }

    /**
     * A product of probabilities kept as mantissa * 2^(512 * exponent).
     *
     * Multiplying hundreds of values below 1 underflows a plain double to
     * 0.0, which makes every category look equally (im)probable. CLDC has
     * no Math.log, so instead of summing logarithms the mantissa is kept
     * inside [2^-256, 2^256) and the number of exact power-of-two
     * rescalings is tracked separately.
     */
    private static final class Score {
        double mantissa = 1.0;
        int exponent = 0;

        /** Multiplies this score by factor and renormalizes the mantissa. */
        void multiply(double factor) {
            mantissa *= factor;
            while (mantissa > 0.0 && mantissa < MANTISSA_LOW) {
                mantissa *= SCALE_STEP;
                exponent -= 1;
            }
            // The MAX_VALUE guard stops an infinite mantissa from looping forever.
            while (mantissa >= MANTISSA_HIGH && mantissa <= Double.MAX_VALUE) {
                mantissa *= INVERSE_SCALE_STEP;
                exponent += 1;
            }
        }

        /** Returns a positive, zero or negative value when this score is greater than, equal to or less than other. */
        int compareTo(Score other) {
            boolean bothPositive = mantissa > 0.0 && other.mantissa > 0.0;
            // A zero mantissa is never rescaled, so its exponent carries no information.
            if (bothPositive && exponent != other.exponent) {
                return (exponent > other.exponent) ? 1 : -1;
            }
            if (mantissa > other.mantissa) {
                return 1;
            }
            if (mantissa < other.mantissa) {
                return -1;
            }
            return 0;
        }
    }

    /** A very simple string tokenizer.
     *
     * The J2ME platform does not have:
     *     The StringTokenizer class;
     *     The String split method;
     *     A RegExp engine.
     *
     * ASCII letters are lower-cased; ASCII letters and digits form tokens;
     * space, tab, CR and LF separate tokens; every other character is
     * dropped without splitting (so "don't" becomes "dont"). Empty tokens
     * are never produced, so blank input yields an empty Vector.
     *
     * @param string The string which will be parsed.
     * @return A Vector with the String tokens.
     */
    private Vector simpleTokenizer(String string) {
        Vector tokens = new Vector();
        StringBuffer token = new StringBuffer();
        int length = string.length();
        for (int i = 0; i < length; i += 1) {
            char c = string.charAt(i);
            if (c >= 'A' && c <= 'Z') {
                // Manual ASCII lower-casing: independent of the default locale.
                c = (char) (c + ('a' - 'A'));
            }
            if ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) {
                token.append(c);
            } else if (c == ' ' || c == '\t' || c == '\n' || c == '\r') {
                flushToken(token, tokens);
            }
        }
        flushToken(token, tokens);
        return tokens;
    }

    /** Appends the pending token, if any, to tokens and resets the buffer. */
    private static void flushToken(StringBuffer token, Vector tokens) {
        if (token.length() > 0) {
            tokens.addElement(token.toString());
            token.setLength(0);
        }
    }

    /** Adds one occurrence of feature under category. */
    private void incrementFeature(String feature, String category) {
        Hashtable featureData = (Hashtable) featureCounter.get(feature);
        if (featureData == null) {
            featureData = new Hashtable();
            featureCounter.put(feature, featureData);
        }
        Integer count = (Integer) featureData.get(category);
        int value = (count == null) ? 1 : count.intValue() + 1;
        featureData.put(category, new Integer(value));
    }

    /** Adds one trained item to category. */
    private void incrementCategory(String category) {
        Integer count = (Integer) categoryCounter.get(category);
        int value = (count == null) ? 1 : count.intValue() + 1;
        categoryCounter.put(category, new Integer(value));
    }

    /** Returns how many times feature was seen in category (0 if never). */
    private int featureCount(String feature, String category) {
        Hashtable featureData = (Hashtable) featureCounter.get(feature);
        if (featureData == null) {
            return 0;
        }
        Integer count = (Integer) featureData.get(category);
        return (count != null) ? count.intValue() : 0;
    }

    /** Returns how many items were trained for category (0 if unknown). */
    private int categoryCount(String category) {
        Integer count = (Integer) categoryCounter.get(category);
        return (count != null) ? count.intValue() : 0;
    }

    /** Returns the total number of trained items over all categories. */
    private int totalCount() {
        int total = 0;
        Enumeration counts = categoryCounter.elements();
        while (counts.hasMoreElements()) {
            total += ((Integer) counts.nextElement()).intValue();
        }
        return total;
    }

    /**
     * Records every feature under category and counts one more trained item.
     * The features Vector is only read, never modified.
     */
    private void trainer(Vector features, String category) {
        int size = features.size();
        for (int i = 0; i < size; i += 1) {
            incrementFeature((String) features.elementAt(i), category);
        }
        incrementCategory(category);
    }

    /**
     * Train the classifier parsing the item with the simpleTokenizer.
     *
     * @param item The text to learn from.
     * @param category The category the text belongs to.
     */
    public void train(String item, String category) {
        Vector features = simpleTokenizer(item);
        trainer(features, category);
    }

    /**
     * Train the classifier using an external parser to get the features.
     *
     * @param features A Vector of String features; it is left untouched.
     * @param category The category the features belong to.
     */
    public void train(Vector features, String category) {
        trainer(features, category);
    }

    /** Returns how many times feature was seen across all categories. */
    private int featureOccurrence(String feature) {
        Hashtable featureData = (Hashtable) featureCounter.get(feature);
        if (featureData == null) {
            return 0;
        }
        int count = 0;
        Enumeration counts = featureData.elements();
        while (counts.hasMoreElements()) {
            count += ((Integer) counts.nextElement()).intValue();
        }
        return count;
    }

    /** Returns featureCount / categoryCount, or 0.0 for an untrained category. */
    private double featureProbability(String feature, String category) {
        int count = categoryCount(category);
        return (count == 0) ? 0.0 : (double) featureCount(feature, category) / count;
    }

    /**
     * Blends the observed feature probability with an assumed probability
     * of 0.5 (weight 1.0), so features never seen in a category do not
     * force the whole product to zero.
     */
    private double weightedProbability(String feature, String category) {
        final double ASSUMED_PROBABILITY = 0.5;
        final double WEIGHT = 1.0;
        double basicProbability = featureProbability(feature, category);
        int occurrence = featureOccurrence(feature);
        return (WEIGHT * ASSUMED_PROBABILITY + occurrence * basicProbability) / (WEIGHT + occurrence);
    }

    /**
     * Computes prior(category) * product of the weighted feature
     * probabilities, as an underflow-safe Score.
     *
     * @param features The tokens of the item being classified.
     * @param category A trained category.
     * @param totalDocuments The total number of trained items (greater than 0).
     */
    private Score categoryScore(Vector features, String category, int totalDocuments) {
        Score score = new Score();
        // Prior: share of the trained items that belong to this category.
        score.multiply((double) categoryCount(category) / totalDocuments);
        int size = features.size();
        for (int i = 0; i < size; i += 1) {
            score.multiply(weightedProbability((String) features.elementAt(i), category));
        }
        return score;
    }

    /**
     * Returns the most probable category for item.
     *
     * @param item The text to classify.
     * @return The winning category, or an empty string when nothing was
     *         trained or when two or more categories tie for the best score.
     */
    public String classify(String item) {
        final String DEFAULT = "";
        Vector features = simpleTokenizer(item); // tokenize once, not once per category
        int totalDocuments = totalCount();
        String best = DEFAULT;
        Score bestScore = null;
        boolean tied = false;

        Enumeration categories = categoryCounter.keys();
        while (categories.hasMoreElements()) {
            String category = (String) categories.nextElement();
            Score score = categoryScore(features, category, totalDocuments);
            int order = (bestScore == null) ? 1 : score.compareTo(bestScore);
            if (order > 0) {
                best = category;
                bestScore = score;
                tied = false;
            } else if (order == 0) {
                tied = true;
            }
        }
        return tied ? DEFAULT : best;
    }
}