├── README.mkd └── Classifier.java /README.mkd: -------------------------------------------------------------------------------- 1 | J2ME-Bayes is a general-purpose Naive Bayes classifier for the Java Micro Edition platform. It consists of a single class that is easy to use for many purposes, including: 2 | 3 | - Classify m-spam; 4 | - Classify SMS by author, category, etc; 5 | - Detect the language of a text; 6 | - Classify any kind of document. 7 | 8 | 9 | Simple example 10 | -------------- 11 | 12 |
13 | Classifier classifier = new Classifier();
14 |
15 | classifier.train("the and is if on in at a i you he she it we they", "English");
16 | classifier.train("le la les dans cette du un une de en je vous il elle et", "French");
17 | classifier.train("ein eine ich er sie der die das der wir und dieser", "German");
18 | classifier.train("a e o ou se no na eu um uma eu ele ela voce isso isto", "Portuguese");
19 | classifier.train("que ella y el las de la uno una en esta", "Spanish");
20 |
21 | /* Classify a Portuguese phrase which quotes an English sentence */
22 | classifier.classify("Se eu disser - 'The book is on the table.' - eu estarei falando uma frase muito comum em Inglês.");
23 | // returns "Portuguese"
24 |
25 |
--------------------------------------------------------------------------------
/Classifier.java:
--------------------------------------------------------------------------------
1 | /** J2ME-Bayes
2 | *
3 | * Rafael Alencar rafjaa at gmail.com
4 | *
5 | * This program is free software: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License as published by
7 | * the Free Software Foundation, either version 3 of the License, or
8 | * (at your option) any later version.
9 | *
10 | * This program is distributed in the hope that it will be useful,
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | * GNU General Public License for more details.
14 | *
15 | *
16 | * J2ME-Bayes is a general-purpose Naive Bayes classifier for the Java
17 | * Micro Edition platform. It consists of a single class that is easy
18 | * to use for many purposes, including:
19 | * - Classify SMS spam;
20 | * - Classify SMS by author, category, etc;
21 | * - Detect the language of a text;
22 | * - Classify any kind of document.
23 | *
24 | * This algorithm is inspired by the book "Programming Collective Intelligence",
25 | * by Toby Segaran.
26 | *
27 | * @author Rafael Alencar
28 | * @version 0.1 November/2010
29 | */
30 |
31 | import java.util.Enumeration;
32 | import java.util.Hashtable;
33 | import java.util.Vector;
34 |
/**
 * A small Naive Bayes text classifier.
 *
 * Training data is kept in two tables:
 *   featureCounter:  feature (String) -> Hashtable of category (String) -> Integer count
 *   categoryCounter: category (String) -> Integer number of trained documents
 *
 * The code deliberately sticks to the pre-generics collection API
 * (Hashtable, Vector, Enumeration, StringBuffer) already used by this file.
 */
public class Classifier {

    /** Returned by classify() when no single category is the clear winner. */
    private static final String DEFAULT_CATEGORY = "";

    /** Probability assumed for a feature before any evidence has been seen. */
    private static final double ASSUMED_PROBABILITY = 0.5;

    /** Weight of the assumed probability, expressed in feature occurrences. */
    private static final double WEIGHT = 1.0;

    private Hashtable featureCounter,
                      categoryCounter;

    public Classifier() {
        featureCounter = new Hashtable();
        categoryCounter = new Hashtable();
    }

    /**
     * Implement this method if you need to load a previously trained
     * classifier into the hashtables above.
     */
    public void load(){
        throw new RuntimeException("Not implemented yet.");
    }

    /**
     * Implement this method if you need to save the classifier training.
     */
    public void save(){
        throw new RuntimeException("Not implemented yet.");
    }

    /** A very simple string tokenizer.
     *
     * The input is trimmed and lower-cased; only the characters a-z and 0-9
     * are kept. A space ends the current token. Every other character is
     * dropped without ending the token (so "don't" becomes "dont").
     * Empty tokens are never produced, so an input without any usable
     * character yields an empty Vector.
     *
     * @param string The string which will be parsed.
     * @return A Vector with the String tokens, in order of appearance.
     */
    private Vector simpleTokenizer(String string){
        Vector tokens = new Vector();
        StringBuffer token = new StringBuffer();
        String normalized = string.trim().toLowerCase();

        for(int i = 0; i < normalized.length(); i += 1){
            char c = normalized.charAt(i);
            if((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z')){
                token.append(c);
            }
            else if(c == ' ' && token.length() > 0){
                // A space closes the token; runs of spaces add nothing.
                tokens.addElement(token.toString());
                token.setLength(0);
            }
        }
        if(token.length() > 0){
            tokens.addElement(token.toString());
        }
        return tokens;
    }

    /** Adds one occurrence of the feature to the given category. */
    private void incrementFeature(String feature, String category){
        Hashtable perCategory = (Hashtable)featureCounter.get(feature);
        if(perCategory == null){
            perCategory = new Hashtable();
            featureCounter.put(feature, perCategory);
        }
        Integer count = (Integer)perCategory.get(category);
        int value = (count == null) ? 1 : count.intValue() + 1;
        perCategory.put(category, new Integer(value));
    }

    /** Adds one trained document to the given category. */
    private void incrementCategory(String category){
        Integer count = (Integer)categoryCounter.get(category);
        int value = (count == null) ? 1 : count.intValue() + 1;
        categoryCounter.put(category, new Integer(value));
    }

    /** @return how many times the feature was seen in the category (0 if never). */
    private int featureCount(String feature, String category){
        Hashtable perCategory = (Hashtable)featureCounter.get(feature);
        if(perCategory == null)
            return 0;
        Integer count = (Integer)perCategory.get(category);
        return (count != null) ? count.intValue() : 0;
    }

    /** @return how many documents were trained for the category (0 if none). */
    private int categoryCount(String category){
        Integer count = (Integer)categoryCounter.get(category);
        return (count != null) ? count.intValue() : 0;
    }

    /** @return the number of documents trained over all categories. */
    private int totalDocuments(){
        int total = 0;
        Enumeration counts = categoryCounter.elements();
        while(counts.hasMoreElements()){
            total += ((Integer)counts.nextElement()).intValue();
        }
        return total;
    }

    /**
     * Registers every feature under the category and counts one document.
     * The Vector is only read; it is left untouched for the caller.
     */
    private void trainer(Vector features, String category){
        for(int i = 0; i < features.size(); i += 1){
            incrementFeature((String)features.elementAt(i), category);
        }
        incrementCategory(category);
    }

    /**
     * Train the classifier parsing the item with the simpleTokenizer.
     *
     * @param item The text of the document.
     * @param category The category the document belongs to.
     */
    public void train(String item, String category){
        trainer(simpleTokenizer(item), category);
    }

    /**
     * Train the classifier using an external parser to get the features.
     *
     * @param features A Vector of String features; it is not modified.
     * @param category The category the document belongs to.
     */
    public void train(Vector features, String category){
        trainer(features, category);
    }

    /** @return how many times the feature was seen, summed over all categories. */
    private int featureOccurrence(String feature){
        Hashtable perCategory = (Hashtable)featureCounter.get(feature);
        if(perCategory == null)
            return 0;
        int count = 0;
        Enumeration counts = perCategory.elements();
        while(counts.hasMoreElements()){
            count += ((Integer)counts.nextElement()).intValue();
        }
        return count;
    }

    /**
     * @return occurrences of the feature in the category divided by the
     *         number of documents trained for that category (0.0 if none).
     */
    private double featureProbability(String feature, String category){
        int count = categoryCount(category);
        return (count == 0) ? 0.0 : (double)featureCount(feature, category) / count;
    }

    /**
     * Blends featureProbability with ASSUMED_PROBABILITY so that a feature
     * never seen before scores 0.5 instead of 0.
     */
    private double weightedProbability(String feature, String category){
        double basicProbability = featureProbability(feature, category);
        int occurrence = featureOccurrence(feature);
        return (WEIGHT * ASSUMED_PROBABILITY + occurrence * basicProbability) / (WEIGHT + occurrence);
    }

    /**
     * @return the product of the weighted probabilities of all features;
     *         1 for an empty feature list. For very long documents the
     *         product can underflow to 0.0.
     */
    private double documentProbability(Vector features, String category){
        double probability = 1;
        for(int i = 0; i < features.size(); i += 1){
            probability *= weightedProbability((String)features.elementAt(i), category);
        }
        return probability;
    }

    /**
     * @param total The number of documents trained over all categories.
     * @return the category prior (its share of all trained documents)
     *         multiplied by the document probability.
     */
    private double categoryProbability(Vector features, String category, int total){
        double prior = (double)categoryCount(category) / total;
        return prior * documentProbability(features, category);
    }

    /**
     * Picks the most probable category for the item.
     *
     * @param item The text to classify.
     * @return The winning category, or "" when the classifier has not been
     *         trained, when two or more categories tie for the best score,
     *         or when every score is 0.0 (numeric underflow).
     */
    public String classify(String item){
        if(categoryCounter.isEmpty())
            return DEFAULT_CATEGORY;

        // Tokenize and total once; both are the same for every category.
        Vector features = simpleTokenizer(item);
        int total = totalDocuments();

        String best = null;
        double max = 0.0;
        boolean tied = false;

        Enumeration categories = categoryCounter.keys();
        while(categories.hasMoreElements()){
            String category = (String)categories.nextElement();
            double probability = categoryProbability(features, category, total);
            if(probability > max){
                max = probability;
                best = category;
                tied = false;
            }
            else if(probability == max){
                tied = true;
            }
        }

        // best stays null when no score rose above 0.0.
        if(best == null || tied)
            return DEFAULT_CATEGORY;
        return best;
    }
}
230 |
--------------------------------------------------------------------------------