├── jar └── jLDADMM.jar ├── lib └── args4j-2.0.6.jar ├── src ├── utility │ ├── MTRandom.java │ ├── CmdArgs.java │ ├── FuncUtils.java │ └── MersenneTwister.java ├── jLDADMM.java ├── eval │ └── ClusteringEval.java └── models │ ├── GibbsSamplingLDA_Inf.java │ ├── GibbsSamplingLDA.java │ ├── GibbsSamplingDMM_Inf.java │ └── GibbsSamplingDMM.java ├── License.txt ├── test ├── corpus.LABEL ├── unseenTest.txt └── corpus.txt └── README.md /jar/jLDADMM.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datquocnguyen/jLDADMM/HEAD/jar/jLDADMM.jar -------------------------------------------------------------------------------- /lib/args4j-2.0.6.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datquocnguyen/jLDADMM/HEAD/lib/args4j-2.0.6.jar -------------------------------------------------------------------------------- /src/utility/MTRandom.java: -------------------------------------------------------------------------------- 1 | package utility; 2 | 3 | public class MTRandom 4 | { 5 | 6 | private static MersenneTwister rand = new MersenneTwister(); 7 | 8 | public static void setSeed(long seed) 9 | { 10 | rand.setSeed(seed); 11 | } 12 | 13 | public static double nextDouble() 14 | { 15 | return rand.nextDouble(); 16 | } 17 | 18 | public static int nextInt(int n) 19 | { 20 | return rand.nextInt(n); 21 | } 22 | 23 | public static boolean nextBoolean() 24 | { 25 | return rand.nextBoolean(); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /License.txt: -------------------------------------------------------------------------------- 1 | jLDADMM: A Java package for the LDA and DMM topic models 2 | 3 | Copyright (C) 2015-2017 by Dat Quoc Nguyen 4 | dat.nguyen@students.mq.edu.au 5 | Department of Computing, Macquarie University, Australia 6 | 7 | jLDADMM's website: http://jldadmm.sourceforge.net/ 8 | 9 | jLDADMM is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 2 of the License, or (at your option) any later version. 10 | 11 | jLDADMM is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 12 | 13 | You should have received a copy of the GNU General Public License along with this program. 
If not, see <http://www.gnu.org/licenses/>. -------------------------------------------------------------------------------- /src/utility/CmdArgs.java: -------------------------------------------------------------------------------- 1 | package utility; 2 | 3 | import org.kohsuke.args4j.Option; 4 | 5 | public class CmdArgs 6 | { 7 | 8 | @Option(name = "-model", usage = "Specify model", required = true) 9 | public String model = ""; 10 | 11 | @Option(name = "-corpus", usage = "Specify path to topic modeling corpus") 12 | public String corpus = ""; 13 | 14 | @Option(name = "-ntopics", usage = "Specify number of topics") 15 | public int ntopics = 20; 16 | 17 | @Option(name = "-alpha", usage = "Specify alpha") 18 | public double alpha = 0.1; 19 | 20 | @Option(name = "-beta", usage = "Specify beta") 21 | public double beta = 0.01; 22 | 23 | @Option(name = "-niters", usage = "Specify number of iterations") 24 | public int niters = 2000; 25 | 26 | @Option(name = "-twords", usage = "Specify number of top topical words") 27 | public int twords = 20; 28 | 29 | @Option(name = "-name", usage = "Specify a name for the topic modeling experiment") 30 | public String expModelName = "model"; 31 | 32 | @Option(name = "-seed", usage = "Specify a random seed for reproducibility") 33 | public int seed = 0; 34 | 35 | @Option(name = "-initFile") 36 | public String initTopicAssgns = ""; 37 | 38 | @Option(name = "-sstep") 39 | public int savestep = 0; 40 | 41 | @Option(name = "-dir") 42 | public String dir = ""; 43 | 44 | @Option(name = "-label") 45 | public String labelFile = ""; 46 | 47 | @Option(name = "-prob") 48 | public String prob = ""; 49 | 50 | @Option(name = "-paras", usage = "Specify path to hyper-parameter file") 51 | public String paras = ""; 52 | 53 | } 54 | -------------------------------------------------------------------------------- /src/utility/FuncUtils.java: -------------------------------------------------------------------------------- 1 | package utility; 2 | 3 | import java.util.Collections; 4 | import java.util.Comparator; 5 | import java.util.LinkedHashMap; 6 | import java.util.LinkedList; 7 | import java.util.List; 8 | import java.util.Map; 9 | 10 | public class FuncUtils 11 | { 12 | public static <K, V extends Comparable<? super V>> Map<K, V> sortByValueDescending(Map<K, V> map) 13 | { 14 | List<Map.Entry<K, V>> list = new LinkedList<Map.Entry<K, V>>(map.entrySet()); 15 | Collections.sort(list, new Comparator<Map.Entry<K, V>>() 16 | { 17 | @Override 18 | public int compare(Map.Entry<K, V> o1, Map.Entry<K, V> o2) 19 | { 20 | int compare = (o1.getValue()).compareTo(o2.getValue()); 21 | return -compare; 22 | } 23 | }); 24 | 25 | Map<K, V> result = new LinkedHashMap<K, V>(); 26 | for (Map.Entry<K, V> entry : list) { 27 | result.put(entry.getKey(), entry.getValue()); 28 | } 29 | return result; 30 | } 31 | 32 | public static <K, V extends Comparable<? super V>> Map<K, V> sortByValueAscending(Map<K, V> map) 33 | { 34 | List<Map.Entry<K, V>> list = new LinkedList<Map.Entry<K, V>>(map.entrySet()); 35 | Collections.sort(list, new Comparator<Map.Entry<K, V>>() 36 | { 37 | @Override 38 | public int compare(Map.Entry<K, V> o1, Map.Entry<K, V> o2) 39 | { 40 | int compare = (o1.getValue()).compareTo(o2.getValue()); 41 | return compare; 42 | } 43 | }); 44 | 45 | Map<K, V> result = new LinkedHashMap<K, V>(); 46 | for (Map.Entry<K, V> entry : list) { 47 | result.put(entry.getKey(), entry.getValue()); 48 | } 49 | return result; 50 | } 51 | 52 | /** 53 | * Sample an index in proportion to an array of unnormalized probabilities 54 | * 55 | * @param probs unnormalized probabilities 56 | * @return the sampled index 57 | */ 58 | public static int nextDiscrete(double[] probs) 59 | { 60 | double sum = 0.0; 61 | for (int i = 0; i < probs.length; i++) 62 | sum += probs[i]; 63 | 64 | double r = MTRandom.nextDouble() * sum; 65 | 66 | sum = 0.0; 67 | for (int i = 0; i <
probs.length; i++) { 68 | sum += probs[i]; 69 | if (sum > r) 70 | return i; 71 | } 72 | return probs.length - 1; 73 | } 74 | 75 | public static double mean(double[] m) 76 | { 77 | double sum = 0; 78 | for (int i = 0; i < m.length; i++) 79 | sum += m[i]; 80 | return sum / m.length; 81 | } 82 | 83 | public static double stddev(double[] m) 84 | { 85 | double mean = mean(m); 86 | double s = 0; 87 | for (int i = 0; i < m.length; i++) 88 | s += (m[i] - mean) * (m[i] - mean); 89 | return Math.sqrt(s / m.length); 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /src/jLDADMM.java: -------------------------------------------------------------------------------- 1 | import models.GibbsSamplingDMM; 2 | import models.GibbsSamplingDMM_Inf; 3 | import models.GibbsSamplingLDA; 4 | import models.GibbsSamplingLDA_Inf; 5 | 6 | import org.kohsuke.args4j.CmdLineException; 7 | import org.kohsuke.args4j.CmdLineParser; 8 | 9 | import utility.MTRandom; 10 | import utility.CmdArgs; 11 | import eval.ClusteringEval; 12 | 13 | /** 14 | * jLDADMM: A Java package for the LDA and DMM topic models 15 | * 16 | * http://jldadmm.sourceforge.net/ 17 | * 18 | * @author: Dat Quoc Nguyen 19 | * 20 | * @version: 1.0.3 21 | * jLDADMM v1.0.3 incorporates the following change: Adding a random-seed option "-seed" for reproducibility 22 | */ 23 | public class jLDADMM 24 | { 25 | public static void main(String[] args) 26 | { 27 | CmdArgs cmdArgs = new CmdArgs(); 28 | CmdLineParser parser = new CmdLineParser(cmdArgs); 29 | try { 30 | 31 | parser.parseArgument(args); 32 | 33 | if (cmdArgs.seed > 0){ 34 | MTRandom.setSeed(cmdArgs.seed); 35 | } 36 | 37 | if (cmdArgs.model.equals("LDA")) { 38 | GibbsSamplingLDA lda = new GibbsSamplingLDA(cmdArgs.corpus, 39 | cmdArgs.ntopics, cmdArgs.alpha, cmdArgs.beta, 40 | cmdArgs.niters, cmdArgs.twords, cmdArgs.expModelName, 41 | cmdArgs.initTopicAssgns, cmdArgs.savestep); 42 | lda.inference(); 43 | } 44 | else if (cmdArgs.model.equals("DMM")) { 45 | GibbsSamplingDMM dmm = new GibbsSamplingDMM(cmdArgs.corpus, 46 | cmdArgs.ntopics, cmdArgs.alpha, cmdArgs.beta, 47 | cmdArgs.niters, cmdArgs.twords, cmdArgs.expModelName, 48 | cmdArgs.initTopicAssgns, cmdArgs.savestep); 49 | dmm.inference(); 50 | } 51 | else if (cmdArgs.model.equals("LDAinf")) { 52 | GibbsSamplingLDA_Inf lda = new GibbsSamplingLDA_Inf( 53 | cmdArgs.paras, cmdArgs.corpus, cmdArgs.niters, 54 | cmdArgs.twords, cmdArgs.expModelName, cmdArgs.savestep); 55 | lda.inference(); 56 | } 57 | else if (cmdArgs.model.equals("DMMinf")) { 58 | GibbsSamplingDMM_Inf dmm = new GibbsSamplingDMM_Inf( 59 | cmdArgs.paras, cmdArgs.corpus, cmdArgs.niters, 60 | cmdArgs.twords, cmdArgs.expModelName, cmdArgs.savestep); 61 | dmm.inference(); 62 | } 63 | else if (cmdArgs.model.equals("Eval")) { 64 | ClusteringEval.evaluate(cmdArgs.labelFile, cmdArgs.dir, 65 | cmdArgs.prob); 66 | } 67 | else { 68 | System.out 69 | .println("Error: Option \"-model\" must get \"LDA\" or \"DMM\" or \"LDAinf\" or \"DMMinf\" or \"Eval\""); 70 | System.out 71 | .println("\tLDA: Specify the Latent Dirichlet Allocation topic model"); 72 | System.out 73 | .println("\tDMM: Specify the one-topic-per-document Dirichlet Multinomial Mixture model"); 74 | System.out 75 | .println("\tLDAinf: Infer topics for unseen corpus using a pre-trained LDA model"); 76 | System.out 77 | .println("\tDMMinf: Infer topics for unseen corpus using a pre-trained DMM model"); 78 | System.out 79 | .println("\tEval: Specify the document clustering evaluation"); 80 | 
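// Unrecognized -model value: fall through to print the usage message below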
help(parser); 81 | return; 82 | } 83 | } 84 | catch (CmdLineException cle) { 85 | System.out.println("Error: " + cle.getMessage()); 86 | help(parser); 87 | return; 88 | } 89 | catch (Exception e) { 90 | System.out.println("Error: " + e.getMessage()); 91 | e.printStackTrace(); 92 | return; 93 | } 94 | } 95 | 96 | public static void help(CmdLineParser parser) 97 | { 98 | System.out 99 | .println("java -jar jLDADMM.jar [options ...] [arguments...]"); 100 | parser.printUsage(System.out); 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /test/corpus.LABEL: -------------------------------------------------------------------------------- 1 | apple 2 | apple 3 | apple 4 | apple 5 | apple 6 | apple 7 | apple 8 | apple 9 | apple 10 | apple 11 | apple 12 | apple 13 | apple 14 | apple 15 | apple 16 | apple 17 | apple 18 | apple 19 | apple 20 | apple 21 | apple 22 | apple 23 | apple 24 | apple 25 | apple 26 | apple 27 | apple 28 | apple 29 | apple 30 | apple 31 | apple 32 | apple 33 | apple 34 | apple 35 | apple 36 | apple 37 | apple 38 | apple 39 | apple 40 | apple 41 | apple 42 | apple 43 | apple 44 | apple 45 | apple 46 | apple 47 | apple 48 | apple 49 | apple 50 | apple 51 | apple 52 | apple 53 | apple 54 | apple 55 | apple 56 | apple 57 | apple 58 | apple 59 | apple 60 | apple 61 | apple 62 | apple 63 | apple 64 | apple 65 | apple 66 | apple 67 | apple 68 | apple 69 | apple 70 | apple 71 | apple 72 | apple 73 | apple 74 | apple 75 | apple 76 | apple 77 | apple 78 | apple 79 | apple 80 | apple 81 | apple 82 | apple 83 | apple 84 | apple 85 | apple 86 | apple 87 | apple 88 | apple 89 | apple 90 | apple 91 | apple 92 | apple 93 | apple 94 | apple 95 | apple 96 | apple 97 | apple 98 | apple 99 | apple 100 | apple 101 | google 102 | google 103 | google 104 | google 105 | google 106 | google 107 | google 108 | google 109 | google 110 | google 111 | google 112 | google 113 | google 114 | google 115 | google 116 | google 117 | google 118 | google 119 | google 120 | google 121 | google 122 | google 123 | google 124 | google 125 | google 126 | google 127 | google 128 | google 129 | google 130 | google 131 | google 132 | google 133 | google 134 | google 135 | google 136 | google 137 | google 138 | google 139 | google 140 | google 141 | google 142 | google 143 | google 144 | google 145 | google 146 | google 147 | google 148 | google 149 | google 150 | google 151 | google 152 | google 153 | google 154 | google 155 | google 156 | google 157 | google 158 | google 159 | google 160 | google 161 | google 162 | google 163 | google 164 | google 165 | google 166 | google 167 | google 168 | google 169 | google 170 | google 171 | google 172 | google 173 | google 174 | google 175 | google 176 | google 177 | google 178 | google 179 | google 180 | google 181 | google 182 | google 183 | google 184 | google 185 | google 186 | google 187 | google 188 | google 189 | google 190 | google 191 | google 192 | google 193 | google 194 | google 195 | google 196 | google 197 | google 198 | google 199 | google 200 | google 201 | microsoft 202 | microsoft 203 | microsoft 204 | microsoft 205 | microsoft 206 | microsoft 207 | microsoft 208 | microsoft 209 | microsoft 210 | microsoft 211 | microsoft 212 | microsoft 213 | microsoft 214 | microsoft 215 | microsoft 216 | microsoft 217 | microsoft 218 | microsoft 219 | microsoft 220 | microsoft 221 | microsoft 222 | microsoft 223 | microsoft 224 | microsoft 225 | microsoft 226 | microsoft 227 | microsoft 228 | microsoft 229 | microsoft 230 | 
microsoft 231 | microsoft 232 | microsoft 233 | microsoft 234 | microsoft 235 | microsoft 236 | microsoft 237 | microsoft 238 | microsoft 239 | microsoft 240 | microsoft 241 | microsoft 242 | microsoft 243 | microsoft 244 | microsoft 245 | microsoft 246 | microsoft 247 | microsoft 248 | microsoft 249 | microsoft 250 | microsoft 251 | microsoft 252 | microsoft 253 | microsoft 254 | microsoft 255 | microsoft 256 | microsoft 257 | microsoft 258 | microsoft 259 | microsoft 260 | microsoft 261 | microsoft 262 | microsoft 263 | microsoft 264 | microsoft 265 | microsoft 266 | microsoft 267 | microsoft 268 | microsoft 269 | microsoft 270 | microsoft 271 | microsoft 272 | microsoft 273 | microsoft 274 | microsoft 275 | microsoft 276 | microsoft 277 | microsoft 278 | microsoft 279 | microsoft 280 | microsoft 281 | microsoft 282 | microsoft 283 | microsoft 284 | microsoft 285 | microsoft 286 | microsoft 287 | microsoft 288 | microsoft 289 | microsoft 290 | microsoft 291 | microsoft 292 | microsoft 293 | microsoft 294 | microsoft 295 | microsoft 296 | microsoft 297 | microsoft 298 | microsoft 299 | microsoft 300 | microsoft 301 | twitter 302 | twitter 303 | twitter 304 | twitter 305 | twitter 306 | twitter 307 | twitter 308 | twitter 309 | twitter 310 | twitter 311 | twitter 312 | twitter 313 | twitter 314 | twitter 315 | twitter 316 | twitter 317 | twitter 318 | twitter 319 | twitter 320 | twitter 321 | twitter 322 | twitter 323 | twitter 324 | twitter 325 | twitter 326 | twitter 327 | twitter 328 | twitter 329 | twitter 330 | twitter 331 | twitter 332 | twitter 333 | twitter 334 | twitter 335 | twitter 336 | twitter 337 | twitter 338 | twitter 339 | twitter 340 | twitter 341 | twitter 342 | twitter 343 | twitter 344 | twitter 345 | twitter 346 | twitter 347 | twitter 348 | twitter 349 | twitter 350 | twitter 351 | twitter 352 | twitter 353 | twitter 354 | twitter 355 | twitter 356 | twitter 357 | twitter 358 | twitter 359 | twitter 360 | twitter 361 | twitter 362 | twitter 363 | twitter 364 | twitter 365 | twitter 366 | twitter 367 | twitter 368 | twitter 369 | twitter 370 | twitter 371 | twitter 372 | twitter 373 | twitter 374 | twitter 375 | twitter 376 | twitter 377 | twitter 378 | twitter 379 | twitter 380 | twitter 381 | twitter 382 | twitter 383 | twitter 384 | twitter 385 | twitter 386 | twitter 387 | twitter 388 | twitter 389 | twitter 390 | twitter 391 | twitter 392 | twitter 393 | twitter 394 | twitter 395 | twitter 396 | twitter 397 | twitter 398 | twitter 399 | twitter 400 | twitter 401 | -------------------------------------------------------------------------------- /test/unseenTest.txt: -------------------------------------------------------------------------------- 1 | making ipad feel ios 2 | nexus good feel bit guess android users android 3 | nice game helps search 4 | nice game helps search facebook 5 | build website website free 6 | android ics pretty good worth 7 | android ice cream sandwich nexus android nexus 8 | exciting day ice cream sandwich day android 9 | wow nexus beautiful totally gonna market share smart phone market 10 | integrated data usage manager brilliant design watching lol 11 | ice cream sandwich android works htc desire 12 | ice cream sandwich sounds android ice cream sandwich 13 | amazing imo android missing 14 | forget phone nice feature android nexus 15 | finally unveiled android ice cream sandwich good 16 | finally searches logged users 17 | rim strategy released hours release ics 18 | man love galaxy nexus samsung android 19 | doubt 20 | share winning war 
21 | dear galaxy nexus send email technology 22 | telegraph reports biggest threat facebook power users 23 | samsung made bad android king 24 | facebook power users telegraph socialmedia 25 | impressed android update good font design 26 | video wallet wow 27 | tweet remember spell straight 28 | android samsung nexus 29 | efficient fun releases infinite digital bookcase 30 | pass social seo facebook 31 | ice cream sandwich stop carriers bullying smartphone users android 32 | agree freaking awesome 33 | icecream great 34 | helps 35 | samsung galaxy nexus iphone 36 | ice cream sandwich delicious iphone launches android aka 37 | loving 38 | samsung push mobile experience forward 39 | finally power volume screenshot ics 40 | nexus press conference slick 41 | high school appreciated 42 | scream scream scream android job major game mobile space 43 | thinking ahead 44 | venturebeat virtual bookcase sharing 45 | android phone keeping iphone 46 | android ice cream sandwich feature closer roboto type face read 47 | work samsung android ics impressive 48 | add profile webgl project add addthis 49 | work company work 50 | invention 51 | wait ice cream sandwich android 52 | stop nexus 53 | phone 54 | android device updated galaxy nexus 55 | android introducing ice cream sandwich delicious version android ics 56 | excited android features android ics 57 | wait nexus play 58 | check video introducing galaxy nexus simple beautiful smart youtube android nexus 59 | cream ice cream phone job 60 | great small businesses platform features thoughts 61 | loves presentations tool docs adding video 62 | brilliant webgl bookcase 63 | searches things 64 | android ice cream introducing galaxy nexus simple beautiful smart 65 | nexus prime android 66 | interesting bookcase venturebeat releases infinite digital bookcase 67 | good finally focus user experience android 68 | ics awesome phone android motorola 69 | iphone ice cream sandwich android 70 | nexus line smart move 71 | android beam alright made team team android 72 | android reply font good start ics 73 | ice cream sandwich face unlock works 74 | ready ice cream sandwich ics nexus android android 75 | ice cream sandwich android 76 | taste ice cream sandwich bite 77 | samsung event live blog gadget haven android 78 | android ice cream sandwich make smartphone operating systems 79 | photo sharing people application ice cream sandwich imo ics 80 | android nexus phone makes iphone cheap store android 81 | sweet ice cream sandwich android ice cream sandwich officially ics 82 | raise hand android powered phone samsung 83 | siri android device replace iphone 84 | nexus page live nexus android 85 | excited android beam face unlock android ics 86 | linkedin tools company page contact 87 | samsung ice cream sandwich samsung 88 | introducing galaxy nexus simple beautiful smart android ics samsung 89 | glad design android shows waiting 90 | thoughts android ics excited play features android 91 | register galaxy nexus android 92 | wow webgl infinite bookcase 93 | ics awesome wait face unlock android 94 | gotta pretty android chrome android 95 | november direct purchase samsung 96 | nexus wanna awesome 97 | event time change android samsung 98 | ios user ics awesome great job 99 | yeah great job ics 100 | literally mind blown samsung 101 | motorola verizon perfect 102 | opens door spanish entrepreneurs project 103 | intel ibm 104 | windows phone mango update process ahead schedule mango 105 | back smartphone rich 106 | word works computer 107 | free gen stores 108 | watch 
codename data explorer ctp coming 109 | lunch today vslive 110 | watch codename data explorer ctp coming month 111 | details search improvements windows start screen 112 | mango shows taste smartphone success mango 113 | awesome moving dev finally local 114 | stores offer free windows phone devices 115 | stores offer free windows phone devices neowin 116 | store spend hard vslive 117 | free west check 118 | hey parents free tools kids online live family 119 | cloud offers students free access improve tcn 120 | awesome bit 121 | details windows search improvements 122 | yeah taking metro yeah good android 123 | love kids tech 124 | explains improvements windows start screen search tech 125 | search idea search great 126 | bing king search search 127 | powerpoint users power create service bye solutions 128 | future information innovators nov info 129 | curate personal history project greenwich month 130 | beam research project 131 | great sql server session 132 | works days 133 | ballmer thinks computer scientist android tech agree 134 | great time 135 | win server works fine vmware 136 | wow tech turns body touchscreen psfk 137 | love love feeling building vslive bringing conference 138 | research shows awesome step closer bit kinect 139 | research shows science science fact cool sound 140 | research shows science science fact 141 | zune music canada music news 142 | kinect makes learning playful education 143 | mango 144 | check change world 145 | good world wait 146 | watching windows pretty impressive finally mac interesting battle store 147 | xbox share 148 | god 149 | blog post cool tool mouse tools 150 | forget siri beating speech commands mango siri 151 | tests proves appsense enterprise capability users personalization database enterprise 152 | software good points sap dynamics 153 | good dev 154 | secure anti 155 | impressed creating images 156 | mac blown marketing 157 | yahoo sale years back bought glad deal year 158 | omnitouch impressive technology 159 | good bing paying 160 | ipads windows tablets study 161 | home day great time 162 | mango shows taste smartphone success 163 | picture services cloud love 164 | windows net dev 165 | nice talk community 166 | omg sharepoint working 167 | innovation sad sad 168 | office love genius 169 | love gates foundation 170 | good 171 | skype family amazing things 172 | absolutely loving mouse 173 | fan cool video turn surface touchscreen 174 | wow android ics lots talk mango launch people public speaking 175 | updated computer windows 176 | ics android kill mango nokia 177 | people names mail week 178 | outlook mac sucks hate 179 | xbox accounts hack reports 180 | update net 181 | windows media center fail 182 | eclipsed 183 | word upgrade doc doc word won open doc suck 184 | u.s. 
antitrust leaving business played dumb 185 | lync crash issue mac fixed 186 | broke played engages racketeering calls respect 187 | nokia chief executive mole 188 | frozen xbox live xbl accounts online games report hacked 189 | gave windows dev preview good waiting beta windows 190 | powerpoint fix powerpoint presentations 191 | eclipsed guardian 192 | kind search 193 | great time family advertising 194 | windows forget past antitrust issues 195 | paying make racketeering 196 | day talking talk tomorrow waiting 197 | reader compares albatross neck agree join 198 | lot word freeze minutes 199 | lol perfect simple hate windows phones 200 | months months lose 201 | -------------------------------------------------------------------------------- /src/eval/ClusteringEval.java: -------------------------------------------------------------------------------- 1 | package eval; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.BufferedWriter; 5 | import java.io.File; 6 | import java.io.FileReader; 7 | import java.io.FileWriter; 8 | import java.util.ArrayList; 9 | import java.util.HashMap; 10 | import java.util.HashSet; 11 | import java.util.List; 12 | import java.util.Set; 13 | 14 | import utility.FuncUtils; 15 | 16 | /** 17 | * jLDADMM: A Java package for the LDA and DMM topic models 18 | * 19 | * Implementation of the Purity and NMI clustering evaluation scores, as 20 | * described in Section 16.3 in: 21 | * 22 | * Christopher D. Manning, Prabhakar Raghavan, and Hinrich Schütze. 2008. 23 | * Introduction to Information Retrieval. Cambridge University Press. 24 | * 25 | * @author: Dat Quoc Nguyen 26 | */ 27 | 28 | public class ClusteringEval 29 | { 30 | String pathDocTopicProsFile; 31 | 32 | String pathGoldenLabelsFile; 33 | 34 | HashMap<String, Set<Integer>> goldenClusters; 35 | HashMap<String, Set<Integer>> outputClusters; 36 | 37 | int numDocs; 38 | 39 | public ClusteringEval(String inPathGoldenLabelsFile, 40 | String inPathDocTopicProsFile) 41 | throws Exception 42 | { 43 | pathDocTopicProsFile = inPathDocTopicProsFile; 44 | pathGoldenLabelsFile = inPathGoldenLabelsFile; 45 | 46 | goldenClusters = new HashMap<String, Set<Integer>>(); 47 | outputClusters = new HashMap<String, Set<Integer>>(); 48 | 49 | readGoldenLabelsFile(); 50 | readDocTopicProsFile(); 51 | } 52 | 53 | public void readGoldenLabelsFile() 54 | throws Exception 55 | { 56 | System.out 57 | .println("Reading golden labels file " + pathGoldenLabelsFile); 58 | 59 | int id = 0; 60 | 61 | BufferedReader br = null; 62 | try { 63 | br = new BufferedReader(new FileReader(pathGoldenLabelsFile)); 64 | for (String label; (label = br.readLine()) != null;) { 65 | label = label.trim(); 66 | Set<Integer> ids = new HashSet<Integer>(); 67 | if (goldenClusters.containsKey(label)) 68 | ids = goldenClusters.get(label); 69 | ids.add(id); 70 | goldenClusters.put(label, ids); 71 | id += 1; 72 | } 73 | } 74 | catch (Exception e) { 75 | e.printStackTrace(); 76 | } 77 | numDocs = id; 78 | } 79 | 80 | public void readDocTopicProsFile() 81 | throws Exception 82 | { 83 | System.out.println("Reading document-to-topic distribution file " 84 | + pathDocTopicProsFile); 85 | 86 | HashMap<Integer, String> docLabelOutput = new HashMap<Integer, String>(); 87 | 88 | int docIndex = 0; 89 | 90 | BufferedReader br = null; 91 | try { 92 | br = new BufferedReader(new FileReader(pathDocTopicProsFile)); 93 | 94 | for (String docTopicProbs; (docTopicProbs = br.readLine()) != null;) { 95 | String[] pros = docTopicProbs.trim().split("\\s+"); 96 | double maxPro = 0.0; 97 | int index = -1; 98 | for (int topicIndex = 0; topicIndex < pros.length; topicIndex++) { 99 | double pro = new Double(pros[topicIndex]);
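// Running argmax: each document is hard-assigned to its most probable topic, and these hard assignments define the output clusters scored by Purity/NMI below.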
100 | if (pro > maxPro) { 101 | maxPro = pro; 102 | index = topicIndex; 103 | } 104 | } 105 | docLabelOutput.put(docIndex, 106 | "Topic_" + new Integer(index).toString()); 107 | docIndex++; 108 | } 109 | } 110 | catch (Exception e) { 111 | e.printStackTrace(); 112 | } 113 | 114 | if (numDocs != docIndex) { 115 | System.out 116 | .println("Error: the number of documents is different from the number of labels!"); 117 | throw new Exception(); 118 | } 119 | 120 | for (Integer id : docLabelOutput.keySet()) { 121 | String label = docLabelOutput.get(id); 122 | Set<Integer> ids = new HashSet<Integer>(); 123 | if (outputClusters.containsKey(label)) 124 | ids = outputClusters.get(label); 125 | ids.add(id); 126 | outputClusters.put(label, ids); 127 | } 128 | 129 | } 130 | 131 | public double computePurity() 132 | { 133 | int count = 0; 134 | for (String label : outputClusters.keySet()) { 135 | Set<Integer> docs = outputClusters.get(label); 136 | int correctAssignedDocNum = 0; 137 | for (String goldenLabel : goldenClusters.keySet()) { 138 | Set<Integer> goldenDocs = goldenClusters.get(goldenLabel); 139 | Set<Integer> outputDocs = new HashSet<Integer>(docs); 140 | outputDocs.retainAll(goldenDocs); 141 | if (outputDocs.size() >= correctAssignedDocNum) 142 | correctAssignedDocNum = outputDocs.size(); 143 | } 144 | count += correctAssignedDocNum; 145 | } 146 | double value = count * 1.0 / numDocs; 147 | System.out.println("\tPurity accuracy: " + value); 148 | return value; 149 | } 150 | 151 | public double computeNMIscore() 152 | { 153 | double MIscore = 0.0; 154 | for (String label : outputClusters.keySet()) { 155 | Set<Integer> docs = outputClusters.get(label); 156 | for (String goldenLabel : goldenClusters.keySet()) { 157 | Set<Integer> goldenDocs = goldenClusters.get(goldenLabel); 158 | Set<Integer> outputDocs = new HashSet<Integer>(docs); 159 | outputDocs.retainAll(goldenDocs); 160 | double numCorrectAssignedDocs = outputDocs.size() * 1.0; 161 | if (numCorrectAssignedDocs == 0.0) 162 | continue; 163 | MIscore += (numCorrectAssignedDocs / numDocs) 164 | * Math.log(numCorrectAssignedDocs * numDocs 165 | / (docs.size() * goldenDocs.size())); 166 | } 167 | 168 | } 169 | double entropy = 0.0; 170 | for (String label : outputClusters.keySet()) { 171 | Set<Integer> docs = outputClusters.get(label); 172 | entropy += (-1.0 * docs.size() / numDocs) 173 | * Math.log(1.0 * docs.size() / numDocs); 174 | } 175 | 176 | for (String label : goldenClusters.keySet()) { 177 | Set<Integer> docs = goldenClusters.get(label); 178 | entropy += (-1.0 * docs.size() / numDocs) 179 | * Math.log(1.0 * docs.size() / numDocs); 180 | } 181 | 182 | double value = 2 * MIscore / entropy; 183 | System.out.println("\tNMI score: " + value); 184 | return value; 185 | } 186 | 187 | public static void evaluate(String pathGoldenLabelsFile, 188 | String pathToFolderOfDocTopicProsFiles, String suffix) 189 | throws Exception 190 | { 191 | BufferedWriter writer = new BufferedWriter(new FileWriter( 192 | pathToFolderOfDocTopicProsFiles + "/" + suffix + ".PurityNMI")); 193 | writer.write("Golden-labels in: " + pathGoldenLabelsFile + "\n\n"); 194 | File[] files = new File(pathToFolderOfDocTopicProsFiles).listFiles(); 195 | 196 | List<Double> purity = new ArrayList<Double>(), nmi = new ArrayList<Double>(); 197 | for (File file : files) { 198 | if (!file.getName().endsWith(suffix)) 199 | continue; 200 | writer.write("Results for: " + file.getAbsolutePath() + "\n"); 201 | ClusteringEval dce = new ClusteringEval(pathGoldenLabelsFile, 202 | file.getAbsolutePath()); 203 | double value = dce.computePurity(); 204 | writer.write("\tPurity: " + value + "\n"); 205 | purity.add(value); 206 | value =
dce.computeNMIscore(); 207 | writer.write("\tNMI: " + value + "\n"); 208 | nmi.add(value); 209 | } 210 | if (purity.size() == 0 || nmi.size() == 0) { 211 | System.out.println("Error: There is no file ending with " + suffix); 212 | throw new Exception(); 213 | } 214 | 215 | double[] purityValues = new double[purity.size()]; 216 | double[] nmiValues = new double[nmi.size()]; 217 | 218 | for (int i = 0; i < purity.size(); i++) 219 | purityValues[i] = purity.get(i).doubleValue(); 220 | for (int i = 0; i < nmi.size(); i++) 221 | nmiValues[i] = nmi.get(i).doubleValue(); 222 | 223 | writer.write("\n---\nMean purity: " + FuncUtils.mean(purityValues) 224 | + ", standard deviation: " + FuncUtils.stddev(purityValues)); 225 | 226 | writer.write("\nMean NMI: " + FuncUtils.mean(nmiValues) 227 | + ", standard deviation: " + FuncUtils.stddev(nmiValues)); 228 | 229 | System.out.println("---\nMean purity: " + FuncUtils.mean(purityValues) 230 | + ", standard deviation: " + FuncUtils.stddev(purityValues)); 231 | 232 | System.out.println("Mean NMI: " + FuncUtils.mean(nmiValues) 233 | + ", standard deviation: " + FuncUtils.stddev(nmiValues)); 234 | 235 | writer.close(); 236 | } 237 | 238 | public static void main(String[] args) 239 | throws Exception 240 | { 241 | ClusteringEval.evaluate("test/corpus.LABEL", "test", "theta"); 242 | } 243 | } 244 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## jLDADMM: A Java package for the LDA and DMM topic models 2 | 3 | **jLDADMM** is released to provide alternatives for topic modeling on normal or short texts. Probabilistic topic models, such as Latent Dirichlet Allocation (LDA) [1] and related models [2], are widely used to discover latent topics in document collections. However, applying topic models for short texts (e.g. Tweets) is more challenging because of data sparsity and the limited contexts in such texts. One approach is to combine short texts into long pseudo-documents before training LDA. Another approach is to assume that there is only one topic per document [3]. 4 | 5 | jLDADMM provides implementations of the LDA topic model [1] and the one-topic-per-document Dirichlet Multinomial Mixture (DMM) model (i.e. mixture of unigrams) [4]. The implementations of LDA and DMM use the collapsed Gibbs sampling algorithms for inference as described in [5] and [6], respectively. Furthermore, jLDADMM supplies a document clustering evaluation to compare topic models, using two common metrics of Purity and normalized mutual information (NMI) [7]. 6 | 7 | Please cite jLDADMM whenever jLDADMM is used to produce published results or incorporated into other software: 8 | 9 | @article{jldadmm, 10 | title={{jLDADMM: A Java package for the LDA and DMM topic models}}, 11 | author={Dat Quoc Nguyen}, 12 | journal={arXiv preprint arXiv:1808.03835}, 13 | year={2018} 14 | } 15 | 16 | Bug reports, comments and suggestions about jLDADMM are highly appreciated. As a free open-source package, jLDADMM is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | 18 | ### Using jLDADMM for topic modeling 19 | 20 | This section describes the usage of jLDADMM in command line or terminal, using a pre-compiled file named `jLDADMM.jar`. Here, it is supposed that Java is already set to run in command line or terminal (e.g. adding Java to the environment variable `path` in Windows OS). 
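As the file-format notes below emphasize, the input corpus should be preprocessed before training. A minimal preprocessing sketch in Java (a hypothetical helper, not part of jLDADMM; the file names `raw.txt` and `corpus.txt` are placeholders):

    import java.io.*;

    // Hypothetical helper, not part of jLDADMM: down-cases each document,
    // keeps only alphabetic characters, and drops words shorter than 3
    // characters. Stop-word removal and frequency filtering would be added
    // in the same loop.
    public class PreprocessCorpus {
        public static void main(String[] args) throws IOException {
            BufferedReader br = new BufferedReader(new FileReader("raw.txt"));
            BufferedWriter bw = new BufferedWriter(new FileWriter("corpus.txt"));
            for (String line; (line = br.readLine()) != null;) {
                StringBuilder doc = new StringBuilder();
                for (String token : line.toLowerCase().split("\\s+")) {
                    String word = token.replaceAll("[^a-z]", "");
                    if (word.length() >= 3)
                        doc.append(word).append(" ");
                }
                bw.write(doc.toString().trim() + "\n"); // one document per line
            }
            br.close();
            bw.close();
        }
    }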
21 | 22 | Users can find the pre-compiled file `jLDADMM.jar` and the source code in the folders `jar` and `src`, respectively. **The users can recompile the source code by simply running `ant` (it is also expected that `ant` is already installed)**. In addition, the users can find input examples in the `test` folder. 23 | 24 | **File format of input corpus:** Similar to the file `corpus.txt` in the `test` folder, jLDADMM assumes that each line in the input corpus represents a document. Here, a document is a sequence of words/tokens separated by whitespace characters. The users should preprocess the input corpus before training the LDA or DMM topic models (see the sketch above), for example: down-casing, removing non-alphabetic characters and stop-words, and removing words shorter than 3 characters or appearing fewer than a certain number of times. 25 | 26 | **Now, we can train LDA or DMM by executing:** 27 | 28 | $ java [-Xmx1G] -jar jar/jLDADMM.jar -model <LDA_or_DMM> -corpus <path_to_corpus_file> [-ntopics <int>] [-alpha <double>] [-beta <double>] [-niters <int>] [-twords <int>] [-name <String>] [-sstep <int>] [-seed <int>] 29 | 30 | where parameters in [ ] are optional. 31 | 32 | `-model`: Specify the topic model, LDA or DMM. 33 | 34 | `-corpus`: Specify the path to the input corpus file. 35 | 36 | `-ntopics <int>`: Specify the number of topics. The default value is 20. 37 | 38 | `-alpha <double>`: Specify the hyper-parameter `alpha`. Following [6, 8], the default `alpha` value is 0.1. 39 | 40 | `-beta <double>`: Specify the hyper-parameter `beta`. The default `beta` value is 0.01, which is a common setting in the literature [5]. Following [6], the users may consider setting `beta` to 0.1 for short texts. 41 | 42 | `-niters <int>`: Specify the number of Gibbs sampling iterations. The default value is 2000. 43 | 44 | `-twords <int>`: Specify the number of the most probable topical words to output. The default value is 20. 45 | 46 | `-name <String>`: Specify a name for the topic modeling experiment. The default value is `model`. 47 | 48 | `-sstep <int>`: Specify a step to save the sampling outputs. The default value is 0 (i.e. only saving the output from the last sample). 49 | 50 | `-seed <int>`: Specify the random _seed_ for the Gibbs sampler. The default value is 0, in which case the seed is taken from the clock. 51 | 52 | **Examples:** 53 | 54 | $ java -jar jar/jLDADMM.jar -model LDA -corpus test/corpus.txt -name testLDA 55 | 56 | The output files are saved in the folder containing the input corpus file, in this case the `test` folder. The output files `testLDA.theta`, `testLDA.phi`, `testLDA.topWords`, `testLDA.topicAssignments` and `testLDA.paras` contain the document-to-topic distributions, topic-to-word distributions, top topical words, topic assignments and model parameters, respectively. Similarly, we perform: 57 | 58 | $ java -jar jar/jLDADMM.jar -model DMM -corpus test/corpus.txt -beta 0.1 -name testDMM 59 | 60 | Output files `testDMM.theta`, `testDMM.phi`, `testDMM.topWords`, `testDMM.topicAssignments` and `testDMM.paras` are also saved in the `test` folder. 61 | 62 | ### Using jLDADMM for document clustering evaluation 63 | 64 | Here, we treat each topic as a cluster, and we assign every document to the topic with the highest probability given the document [8]. To get the Purity and NMI clustering scores, we perform: 66 | 66 | $ java -jar jar/jLDADMM.jar -model Eval -label <path_to_label_file> -dir <path_to_directory_of_distribution_files> -prob <distribution_file_or_suffix> 67 | 68 | `-label`: Specify the path to the ground-truth label file. Each line in this label file contains the golden label of the corresponding document in the input corpus. See the files `corpus.LABEL` and `corpus.txt` in the `test` folder.
69 | 70 | `-dir`: Specify the path to the directory containing document-to-topic distribution files. 71 | 72 | `-prob`: Specify a document-to-topic distribution file OR a group of document-to-topic distribution files in the specified directory. 73 | 74 | **Examples:** 75 | 76 | $ java -jar jar/jLDADMM.jar -model Eval -label test/corpus.LABEL -dir test -prob testLDA.theta 77 | 78 | $ java -jar jar/jLDADMM.jar -model Eval -label test/corpus.LABEL -dir test -prob testDMM.theta 79 | 80 | The above commands produce the clustering scores for the files `testLDA.theta` and `testDMM.theta` in the `test` folder, separately. The following command 81 | 82 | $ java -jar jar/jLDADMM.jar -model Eval -label test/corpus.LABEL -dir test -prob theta 83 | 84 | produces the clustering scores for all document-to-topic distribution files whose names end in `theta`; in this case, they are `testLDA.theta` and `testDMM.theta`. The command also reports the mean and standard deviation of the clustering scores. 85 | 86 | To improve evaluation scores, the users might consider using the [latent feature topic models LF-LDA and LF-DMM](https://github.com/datquocnguyen/LFTM) [3], which extend the LDA and DMM topic models with word embeddings. 87 | 88 | ### Topic inference on new/unseen corpus 89 | 90 | To infer topics for a new/unseen corpus using a pre-trained LDA/DMM topic model, we perform: 91 | 92 | $ java -jar jar/jLDADMM.jar -model <LDAinf_or_DMMinf> -paras <path_to_hyper-parameter_file> -corpus <path_to_unseen_corpus_file> [-niters <int>] [-twords <int>] [-name <String>] [-sstep <int>] [-seed <int>] 93 | 94 | `-paras`: Specify the path to the hyper-parameter file produced by the pre-trained LDA/DMM topic model. 95 | 96 | **Examples:** 97 | 98 | $ java -jar jar/jLDADMM.jar -model LDAinf -paras test/testLDA.paras -corpus test/unseenTest.txt -niters 100 -name testLDAinf 99 | 100 | $ java -jar jar/jLDADMM.jar -model DMMinf -paras test/testDMM.paras -corpus test/unseenTest.txt -niters 100 -name testDMMinf 101 | 102 | ### References 103 | 104 | [1] David M. Blei, Andrew Y. Ng, and Michael I. Jordan. 2003. Latent Dirichlet Allocation. Journal of Machine Learning Research, 3:993–1022. 105 | 106 | [2] David M. Blei. 2012. Probabilistic Topic Models. Communications of the ACM, 55(4):77–84. 107 | 108 | [3] Dat Quoc Nguyen, Richard Billingsley, Lan Du, and Mark Johnson. 2015. [Improving Topic Models with Latent Feature Word Representations](https://tacl2013.cs.columbia.edu/ojs/index.php/tacl/article/view/582/158). Transactions of the Association for Computational Linguistics, vol. 3, pp. 299–313. [[CODE]](https://github.com/datquocnguyen/LFTM) 109 | 110 | [4] Kamal Nigam, Andrew McCallum, Sebastian Thrun, and Tom Mitchell. 2000. Text Classification from Labeled and Unlabeled Documents Using EM. Machine Learning, 39:103–134. 111 | 112 | [5] Thomas L. Griffiths and Mark Steyvers. 2004. Finding scientific topics. Proceedings of the National Academy of Sciences of the United States of America, 101(Suppl 1):5228–5235. 113 | 114 | [6] Jianhua Yin and Jianyong Wang. 2014. A Dirichlet Multinomial Mixture Model-based Approach for Short Text Clustering. In Proceedings of the 20th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pages 233–242. 115 | 116 | [7] Christopher D. Manning, Prabhakar Raghavan, and Hinrich Schütze. 2008. Introduction to Information Retrieval. Cambridge University Press. 117 | 118 | [8] Yue Lu, Qiaozhu Mei, and ChengXiang Zhai. 2011. Investigating task performance of probabilistic topic models: an empirical study of PLSA and LDA. Information Retrieval, 14:178–203.
119 | -------------------------------------------------------------------------------- /test/corpus.txt: -------------------------------------------------------------------------------- 1 | iphone crack iphone 2 | adding support iphone announced 3 | youtube video guy siri pretty love 4 | rim made easy switch iphone yeah 5 | realized ios 6 | current blackberry user bit disappointed move android iphone 7 | things siri sooo glad gave siri sense humor 8 | great personal event tonight store 9 | companies experience customer service 10 | apply job hope call lol 11 | lmao siri find hide body 12 | registered developer appreciated 13 | wow great deals ipad gen offers great deals gen ipads 14 | learning trip hong kong gotta hand iphones apps 15 | dark side hey send free iphone publicly burn blackberry 16 | find mac air 17 | macbook keyboard lunch break today warranty 18 | ipads replace 19 | siri amazing 20 | amazing ios feature 21 | reply featured education apps website today sweet 22 | reply useless days 23 | iphone yesterday awesome amount info 24 | question brother iphone 25 | people iphone phone happy 26 | ceo points ios 27 | bus iphone 28 | umber appstore itunes mobile devices talking desktop application 29 | bring ipad ipad set red red ipad 30 | sells million iphone weekend steve jobs lives iphone 31 | apologize 32 | downloads ios users 33 | lmfao argument siri 34 | incredible million iphone screenshot days iphone iphone 35 | fixed ios battery drain problem replacement iphone working 36 | brand macbook professional macbook years miss time 37 | siri dad mom brother girlfriend 38 | store amazing call waiting music 39 | sweet replaced 40 | bad sells million iphones debut weekend smartphone 41 | loving technology iphone mac air icloud technology 42 | loving ios update 43 | mention store great customer service store 44 | time iphone forward man longer paying texts 45 | girlfriend iphone great 46 | icloud set works cloud 47 | mommy totally email company great service store 48 | loving ios upgrade iphone 49 | ios ipad 50 | making switch android iphone iphone smartphone store 51 | incredible people offering water macbook professional wow 52 | macbook sick 53 | play man loving camera iphone facebook 54 | yeah ios changed life 55 | reader worldwide web 56 | love service case hand case 57 | years jobs iphone iphone iphone 58 | blackberry years lost service moving iphone 59 | sells million iphones days 60 | weekend iphone 61 | macbook professional year time selling android 62 | post card 63 | putting kind glad hear alive 64 | god youtube bad ass system loving 65 | days iphone nice gave 66 | ios email lock screen opening unlocking 67 | word wow iphone weekend sales top million 68 | love ios easter eggs pull middle top bottom pulls awesome feature ios 69 | love ios easter eggs pull middle top bottom pulls awesome feature 70 | run beautiful morning man love ios iphone 71 | simply 72 | made happy text lol text 73 | day great customer service received today phone phone 74 | loving ipod update 75 | upgraded iphone siri worth upgrade forward siri 76 | great world missed 77 | loving iphone ios 78 | love 79 | iphone great genius 80 | cards application card arrived local post office today 81 | iphone siri 82 | meet siri iphone click link 83 | work feel worst 84 | ios upgrade good luck blackberry 85 | loving ios awesome 86 | iphone addicted club 87 | guy playing facetime watching game bar 88 | blackberry boo powered technology work 89 | ipod time iphone good job guys 90 | siri year lead lost 91 | ios sweet 
notifications phone search covers mail wifi sync icloud backup integrated 92 | great james story today times retail success 93 | world due ios guys 94 | impressive service genius bar metro center power replaced free screen replacement free 95 | nice guy store replaced phone showed crack screen 96 | iphone battery longer day happened edge iphone nice job 97 | minutes write blackberry showing 98 | eye phone impressed 99 | iphone space amazing products people things 100 | making ipad feel ios 101 | nexus good feel bit guess android users android 102 | nice game helps search 103 | nice game helps search facebook 104 | build website website free 105 | android ics pretty good worth 106 | android ice cream sandwich nexus android nexus 107 | exciting day ice cream sandwich day android 108 | wow nexus beautiful totally gonna market share smart phone market 109 | integrated data usage manager brilliant design watching lol 110 | ice cream sandwich android works htc desire 111 | ice cream sandwich sounds android ice cream sandwich 112 | amazing imo android missing 113 | forget phone nice feature android nexus 114 | finally unveiled android ice cream sandwich good 115 | finally searches logged users 116 | rim strategy released hours release ics 117 | man love galaxy nexus samsung android 118 | doubt 119 | share winning war 120 | dear galaxy nexus send email technology 121 | telegraph reports biggest threat facebook power users 122 | samsung made bad android king 123 | facebook power users telegraph socialmedia 124 | impressed android update good font design 125 | video wallet wow 126 | tweet remember spell straight 127 | android samsung nexus 128 | efficient fun releases infinite digital bookcase 129 | pass social seo facebook 130 | ice cream sandwich stop carriers bullying smartphone users android 131 | agree freaking awesome 132 | icecream great 133 | helps 134 | samsung galaxy nexus iphone 135 | ice cream sandwich delicious iphone launches android aka 136 | loving 137 | samsung push mobile experience forward 138 | finally power volume screenshot ics 139 | nexus press conference slick 140 | high school appreciated 141 | scream scream scream android job major game mobile space 142 | thinking ahead 143 | venturebeat virtual bookcase sharing 144 | android phone keeping iphone 145 | android ice cream sandwich feature closer roboto type face read 146 | work samsung android ics impressive 147 | add profile webgl project add addthis 148 | work company work 149 | invention 150 | wait ice cream sandwich android 151 | stop nexus 152 | phone 153 | android device updated galaxy nexus 154 | android introducing ice cream sandwich delicious version android ics 155 | excited android features android ics 156 | wait nexus play 157 | check video introducing galaxy nexus simple beautiful smart youtube android nexus 158 | cream ice cream phone job 159 | great small businesses platform features thoughts 160 | loves presentations tool docs adding video 161 | brilliant webgl bookcase 162 | searches things 163 | android ice cream introducing galaxy nexus simple beautiful smart 164 | nexus prime android 165 | interesting bookcase venturebeat releases infinite digital bookcase 166 | good finally focus user experience android 167 | ics awesome phone android motorola 168 | iphone ice cream sandwich android 169 | nexus line smart move 170 | android beam alright made team team android 171 | android reply font good start ics 172 | ice cream sandwich face unlock works 173 | ready ice cream sandwich ics nexus android android 174 | 
ice cream sandwich android 175 | taste ice cream sandwich bite 176 | samsung event live blog gadget haven android 177 | android ice cream sandwich make smartphone operating systems 178 | photo sharing people application ice cream sandwich imo ics 179 | android nexus phone makes iphone cheap store android 180 | sweet ice cream sandwich android ice cream sandwich officially ics 181 | raise hand android powered phone samsung 182 | siri android device replace iphone 183 | nexus page live nexus android 184 | excited android beam face unlock android ics 185 | linkedin tools company page contact 186 | samsung ice cream sandwich samsung 187 | introducing galaxy nexus simple beautiful smart android ics samsung 188 | glad design android shows waiting 189 | thoughts android ics excited play features android 190 | register galaxy nexus android 191 | wow webgl infinite bookcase 192 | ics awesome wait face unlock android 193 | gotta pretty android chrome android 194 | november direct purchase samsung 195 | nexus wanna awesome 196 | event time change android samsung 197 | ios user ics awesome great job 198 | yeah great job ics 199 | literally mind blown samsung 200 | motorola verizon perfect 201 | opens door spanish entrepreneurs project 202 | intel ibm 203 | windows phone mango update process ahead schedule mango 204 | back smartphone rich 205 | word works computer 206 | free gen stores 207 | watch codename data explorer ctp coming 208 | lunch today vslive 209 | watch codename data explorer ctp coming month 210 | details search improvements windows start screen 211 | mango shows taste smartphone success mango 212 | awesome moving dev finally local 213 | stores offer free windows phone devices 214 | stores offer free windows phone devices neowin 215 | store spend hard vslive 216 | free west check 217 | hey parents free tools kids online live family 218 | cloud offers students free access improve tcn 219 | awesome bit 220 | details windows search improvements 221 | yeah taking metro yeah good android 222 | love kids tech 223 | explains improvements windows start screen search tech 224 | search idea search great 225 | bing king search search 226 | powerpoint users power create service bye solutions 227 | future information innovators nov info 228 | curate personal history project greenwich month 229 | beam research project 230 | great sql server session 231 | works days 232 | ballmer thinks computer scientist android tech agree 233 | great time 234 | win server works fine vmware 235 | wow tech turns body touchscreen psfk 236 | love love feeling building vslive bringing conference 237 | research shows awesome step closer bit kinect 238 | research shows science science fact cool sound 239 | research shows science science fact 240 | zune music canada music news 241 | kinect makes learning playful education 242 | mango 243 | check change world 244 | good world wait 245 | watching windows pretty impressive finally mac interesting battle store 246 | xbox share 247 | god 248 | blog post cool tool mouse tools 249 | forget siri beating speech commands mango siri 250 | tests proves appsense enterprise capability users personalization database enterprise 251 | software good points sap dynamics 252 | good dev 253 | secure anti 254 | impressed creating images 255 | mac blown marketing 256 | yahoo sale years back bought glad deal year 257 | omnitouch impressive technology 258 | good bing paying 259 | ipads windows tablets study 260 | home day great time 261 | mango shows taste smartphone success 262 | picture services 
cloud love 263 | windows net dev 264 | nice talk community 265 | omg sharepoint working 266 | innovation sad sad 267 | office love genius 268 | love gates foundation 269 | good 270 | skype family amazing things 271 | absolutely loving mouse 272 | fan cool video turn surface touchscreen 273 | wow android ics lots talk mango launch people public speaking 274 | updated computer windows 275 | ics android kill mango nokia 276 | people names mail week 277 | outlook mac sucks hate 278 | xbox accounts hack reports 279 | update net 280 | windows media center fail 281 | eclipsed 282 | word upgrade doc doc word won open doc suck 283 | u.s. antitrust leaving business played dumb 284 | lync crash issue mac fixed 285 | broke played engages racketeering calls respect 286 | nokia chief executive mole 287 | frozen xbox live xbl accounts online games report hacked 288 | gave windows dev preview good waiting beta windows 289 | powerpoint fix powerpoint presentations 290 | eclipsed guardian 291 | kind search 292 | great time family advertising 293 | windows forget past antitrust issues 294 | paying make racketeering 295 | day talking talk tomorrow waiting 296 | reader compares albatross neck agree join 297 | lot word freeze minutes 298 | lol perfect simple hate windows phones 299 | months months lose 300 | reader compares albatross neck agree join discussion 301 | make sleep plan 302 | feel world put facebook blackberry helps 303 | miss boo 304 | everytime leave back back telling lol 305 | application ass theme 306 | sleep sleep 307 | starting sending hashtags emails taking lives 308 | shit lol hell 309 | today introduced social media love 310 | facebook 311 | yeah shows glad 312 | pretty facebook 313 | gotta love shit round world speed 314 | bed gonna minute bed 315 | dear fucking missed today internet 316 | tweet keeping busy school 317 | good thing people left social 318 | social media 319 | guess addicted university exam questions 320 | good thing people left social side 321 | apples facebook content 322 | bed favorite application facebook 323 | facebook change makes excited privacy 324 | impressive numbers smm socialmedia 325 | fuck facebook bullshit bitch 326 | cool love 327 | fuck facebook follow 328 | haven shit man haven fun 329 | find song end television show watched 330 | literally back facebook text email technology good 331 | isn pretty damn amazing hope year fast 332 | dear missed promise touch 333 | bored sad mad happy true friend 334 | facebook sucks amp 335 | shit funny haven shit day 336 | voice people real life lol 337 | yeah time bug 338 | science hashtags facebook 339 | feeling real world 340 | biggest 341 | facebook messed make add reliable 342 | freaking kidding wth 343 | tomorrow blue ass bird continued 344 | dead 345 | emails telling 346 | sucks follow 347 | people reporting retweets working technical problem 348 | back lol 349 | retweets broken haven tuesday 350 | tomorrow blue ass bird ass 351 | ain showing current mentions tweets 352 | gonna problems fixed asap 353 | retweets 354 | man boring 355 | application show touch tweet 356 | trouble application updating application 357 | messed everytime text message 358 | show fucking retweets bitch 359 | sooo trash 360 | showing retweets shit 361 | mom argument pretty 362 | addicted care 363 | appreciated start working computer 364 | retweets section account working hours problem 365 | good send bloody tweets 366 | feel 367 | make account 368 | fucking late damn 369 | dear fix shit retweets mentions 370 | dead fuck 371 | point 372 | 
giving tweets tweeted past days lol 373 | messed followers numbers 374 | timeline mentions shit 375 | garbage 376 | hell television man 377 | stupid fucking give damn mentions ugh 378 | fucking 379 | facebook television wanna study 380 | show retweets ill back facebook 381 | reply opinions 382 | forget day time haven 383 | blogs tumblr 384 | talk step game 385 | reminder fail 386 | join follow 387 | ways competition 388 | people facebook day life 389 | drop follow show love 390 | telling reply 391 | sleep time 392 | emotions 393 | call night 394 | work break time yeah 395 | tumblr love 396 | age year days hours minutes seconds find 397 | wanna aye shit living 398 | shout favorite people happy girls 399 | follow back 400 | sleep good people trip 401 | -------------------------------------------------------------------------------- /src/models/GibbsSamplingLDA_Inf.java: -------------------------------------------------------------------------------- 1 | package models; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.BufferedWriter; 5 | import java.io.FileReader; 6 | import java.io.FileWriter; 7 | import java.io.IOException; 8 | import java.util.ArrayList; 9 | import java.util.HashMap; 10 | import java.util.List; 11 | import java.util.Map; 12 | import java.util.Set; 13 | import java.util.TreeMap; 14 | 15 | import utility.FuncUtils; 16 | 17 | /** 18 | * jLDADMM: A Java package for the LDA and DMM topic models 19 | * 20 | * http://jldadmm.sourceforge.net/ 21 | * 22 | * @author: Dat Quoc Nguyen 23 | * 24 | */ 25 | 26 | public class GibbsSamplingLDA_Inf 27 | { 28 | public double alpha; // Hyper-parameter alpha 29 | public double beta; // Hyper-parameter beta 30 | public int numTopics; // Number of topics 31 | public int numIterations; // Number of Gibbs sampling iterations 32 | public int topWords; // Number of most probable words for each topic 33 | 34 | public double alphaSum; // alpha * numTopics 35 | public double betaSum; // beta * vocabularySize 36 | 37 | public List<List<Integer>> corpus; // Word ID-based corpus 38 | public List<List<Integer>> topicAssignments; // Topic assignments for words 39 | // in the corpus 40 | public int numDocuments; // Number of documents in the corpus 41 | public int numWordsInCorpus; // Number of words in the corpus 42 | 43 | public HashMap<String, Integer> word2IdVocabulary; // Vocabulary to get ID 44 | // given a word 45 | public HashMap<Integer, String> id2WordVocabulary; // Vocabulary to get word 46 | // given an ID 47 | public int vocabularySize; // The number of word types in the corpus 48 | 49 | // numDocuments * numTopics matrix 50 | // Given a document: number of its words assigned to each topic 51 | public int[][] docTopicCount; 52 | // Number of words in every document 53 | public int[] sumDocTopicCount; 54 | // numTopics * vocabularySize matrix 55 | // Given a topic: number of times a word type assigned to the topic 56 | public int[][] topicWordCount; 57 | // Total number of words assigned to a topic 58 | public int[] sumTopicWordCount; 59 | 60 | // Double array used to sample a topic 61 | public double[] multiPros; 62 | 63 | // Path to the directory containing the corpus 64 | public String folderPath; 65 | // Path to the topic modeling corpus 66 | public String corpusPath; 67 | 68 | public String expName = "LDAinf"; 69 | public String orgExpName = "LDAinf"; 70 | public String tAssignsFilePath = ""; 71 | public int savestep = 0; 72 |
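// The constructor below loads the hyper-parameters saved with the pre-trained LDA model, rebuilds the topic-word counts from the training corpus and its saved topic assignments, and then reads the unseen corpus, skipping words that never occurred in training.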
inSaveStep) 76 | throws Exception 77 | { 78 | HashMap paras = parseTrainingParasFile(pathToTrainingParasFile); 79 | if (!paras.get("-model").equals("LDA")) { 80 | throw new Exception("Wrong pre-trained model!!!"); 81 | } 82 | alpha = new Double(paras.get("-alpha")); 83 | beta = new Double(paras.get("-beta")); 84 | numTopics = new Integer(paras.get("-ntopics")); 85 | 86 | numIterations = inNumIterations; 87 | topWords = inTopWords; 88 | savestep = inSaveStep; 89 | expName = inExpName; 90 | orgExpName = expName; 91 | 92 | String trainingCorpus = paras.get("-corpus"); 93 | String trainingCorpusfolder = trainingCorpus.substring( 94 | 0, 95 | Math.max(trainingCorpus.lastIndexOf("/"), 96 | trainingCorpus.lastIndexOf("\\")) + 1); 97 | String topicAssignment4TrainFile = trainingCorpusfolder 98 | + paras.get("-name") + ".topicAssignments"; 99 | 100 | word2IdVocabulary = new HashMap(); 101 | id2WordVocabulary = new HashMap(); 102 | initializeWordCount(trainingCorpus, topicAssignment4TrainFile); 103 | 104 | corpusPath = pathToUnseenCorpus; 105 | folderPath = pathToUnseenCorpus.substring( 106 | 0, 107 | Math.max(pathToUnseenCorpus.lastIndexOf("/"), 108 | pathToUnseenCorpus.lastIndexOf("\\")) + 1); 109 | System.out.println("Reading unseen corpus: " + pathToUnseenCorpus); 110 | corpus = new ArrayList>(); 111 | numDocuments = 0; 112 | numWordsInCorpus = 0; 113 | 114 | BufferedReader br = null; 115 | try { 116 | br = new BufferedReader(new FileReader(pathToUnseenCorpus)); 117 | for (String doc; (doc = br.readLine()) != null;) { 118 | 119 | if (doc.trim().length() == 0) 120 | continue; 121 | 122 | String[] words = doc.trim().split("\\s+"); 123 | List document = new ArrayList(); 124 | 125 | for (String word : words) { 126 | if (word2IdVocabulary.containsKey(word)) { 127 | document.add(word2IdVocabulary.get(word)); 128 | } 129 | else { 130 | // Skip this unknown-word 131 | } 132 | } 133 | numDocuments++; 134 | numWordsInCorpus += document.size(); 135 | corpus.add(document); 136 | } 137 | } 138 | catch (Exception e) { 139 | e.printStackTrace(); 140 | return; 141 | } 142 | 143 | docTopicCount = new int[numDocuments][numTopics]; 144 | sumDocTopicCount = new int[numDocuments]; 145 | multiPros = new double[numTopics]; 146 | for (int i = 0; i < numTopics; i++) { 147 | multiPros[i] = 1.0 / numTopics; 148 | } 149 | 150 | alphaSum = numTopics * alpha; 151 | betaSum = vocabularySize * beta; 152 | 153 | System.out.println("Corpus size: " + numDocuments + " docs, " 154 | + numWordsInCorpus + " words"); 155 | System.out.println("Vocabuary size: " + vocabularySize); 156 | System.out.println("Number of topics: " + numTopics); 157 | System.out.println("alpha: " + alpha); 158 | System.out.println("beta: " + beta); 159 | System.out.println("Number of sampling iterations: " + numIterations); 160 | System.out.println("Number of top topical words: " + topWords); 161 | 162 | initialize(); 163 | } 164 | 165 | private HashMap parseTrainingParasFile( 166 | String pathToTrainingParasFile) 167 | throws Exception 168 | { 169 | HashMap paras = new HashMap(); 170 | BufferedReader br = null; 171 | try { 172 | br = new BufferedReader(new FileReader(pathToTrainingParasFile)); 173 | for (String line; (line = br.readLine()) != null;) { 174 | 175 | if (line.trim().length() == 0) 176 | continue; 177 | 178 | String[] paraOptions = line.trim().split("\\s+"); 179 | paras.put(paraOptions[0], paraOptions[1]); 180 | } 181 | } 182 | catch (Exception e) { 183 | e.printStackTrace(); 184 | } 185 | return paras; 186 | } 187 | 188 | private void 
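// [Editorial note, not in the original source] parseTrainingParasFile() above reads the
// ".paras" file that writeParameters() further below produces: one "key<TAB>value" pair
// per line, split on whitespace. A sketch of what test/testLDA.paras (the file used by
// main() at the bottom of this class) could contain -- the values here are illustrative:
//
//   -model    LDA
//   -corpus   test/corpus.txt
//   -ntopics  20
//   -alpha    0.1
//   -beta     0.01
//   -niters   2000
//   -twords   20
//   -name     testLDA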
initializeWordCount(String pathToTrainingCorpus, 189 | String pathToTopicAssignmentFile) 190 | { 191 | System.out.println("Loading pre-trained model..."); 192 | List> trainCorpus = new ArrayList>(); 193 | BufferedReader br = null; 194 | try { 195 | int indexWord = -1; 196 | br = new BufferedReader(new FileReader(pathToTrainingCorpus)); 197 | for (String doc; (doc = br.readLine()) != null;) { 198 | 199 | if (doc.trim().length() == 0) 200 | continue; 201 | 202 | String[] words = doc.trim().split("\\s+"); 203 | List document = new ArrayList(); 204 | 205 | for (String word : words) { 206 | if (word2IdVocabulary.containsKey(word)) { 207 | document.add(word2IdVocabulary.get(word)); 208 | } 209 | else { 210 | indexWord += 1; 211 | word2IdVocabulary.put(word, indexWord); 212 | id2WordVocabulary.put(indexWord, word); 213 | document.add(indexWord); 214 | } 215 | } 216 | trainCorpus.add(document); 217 | } 218 | } 219 | catch (Exception e) { 220 | e.printStackTrace(); 221 | } 222 | 223 | vocabularySize = word2IdVocabulary.size(); 224 | topicWordCount = new int[numTopics][vocabularySize]; 225 | sumTopicWordCount = new int[numTopics]; 226 | 227 | try { 228 | br = new BufferedReader(new FileReader(pathToTopicAssignmentFile)); 229 | int docId = 0; 230 | for (String line; (line = br.readLine()) != null;) { 231 | String[] strTopics = line.trim().split("\\s+"); 232 | for (int j = 0; j < strTopics.length; j++) { 233 | int wordId = trainCorpus.get(docId).get(j); 234 | int topic = new Integer(strTopics[j]); 235 | topicWordCount[topic][wordId] += 1; 236 | sumTopicWordCount[topic] += 1; 237 | } 238 | docId++; 239 | } 240 | } 241 | catch (Exception e) { 242 | e.printStackTrace(); 243 | } 244 | } 245 | 246 | /** 247 | * Randomly initialize topic assignments 248 | */ 249 | public void initialize() 250 | throws IOException 251 | { 252 | System.out.println("Randomly initializing topic assignments ..."); 253 | 254 | topicAssignments = new ArrayList>(); 255 | 256 | for (int i = 0; i < numDocuments; i++) { 257 | List topics = new ArrayList(); 258 | int docSize = corpus.get(i).size(); 259 | for (int j = 0; j < docSize; j++) { 260 | int topic = FuncUtils.nextDiscrete(multiPros); // Sample a topic 261 | // Increase counts 262 | docTopicCount[i][topic] += 1; 263 | topicWordCount[topic][corpus.get(i).get(j)] += 1; 264 | sumDocTopicCount[i] += 1; 265 | sumTopicWordCount[topic] += 1; 266 | 267 | topics.add(topic); 268 | } 269 | topicAssignments.add(topics); 270 | } 271 | } 272 | 273 | public void inference() 274 | throws IOException 275 | { 276 | writeParameters(); 277 | writeDictionary(); 278 | 279 | System.out.println("Running Gibbs sampling inference: "); 280 | 281 | for (int iter = 1; iter <= numIterations; iter++) { 282 | 283 | System.out.println("\tSampling iteration: " + (iter)); 284 | // System.out.println("\t\tPerplexity: " + computePerplexity()); 285 | 286 | sampleInSingleIteration(); 287 | 288 | if ((savestep > 0) && (iter % savestep == 0) 289 | && (iter < numIterations)) { 290 | System.out.println("\t\tSaving the output from the " + iter 291 | + "^{th} sample"); 292 | expName = orgExpName + "-" + iter; 293 | write(); 294 | } 295 | } 296 | expName = orgExpName; 297 | 298 | System.out.println("Writing output from the last sample ..."); 299 | write(); 300 | 301 | System.out.println("Sampling completed!"); 302 | 303 | } 304 | 305 | public void sampleInSingleIteration() 306 | { 307 | for (int dIndex = 0; dIndex < numDocuments; dIndex++) { 308 | int docSize = corpus.get(dIndex).size(); 309 | for (int wIndex = 0; wIndex < 
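// [Editorial note, not in the original source] For unseen-corpus inference the count
// matrices are never reset: initializeWordCount() above first fills topicWordCount and
// sumTopicWordCount with the pre-trained model's statistics, and initialize() then adds
// the randomly sampled assignments of the unseen documents on top of them. During
// sampling, only counts belonging to unseen-corpus tokens are ever subtracted, so the
// training counts act as a fixed prior:
//
//   topicWordCount[t][w] == trainedCount(t, w) + unseenCount(t, w)   // loop invariant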
docSize; wIndex++) { 310 | // Get current word and its topic 311 | int topic = topicAssignments.get(dIndex).get(wIndex); 312 | int word = corpus.get(dIndex).get(wIndex); 313 | 314 | // Decrease counts 315 | docTopicCount[dIndex][topic] -= 1; 316 | // docTopicSum[dIndex] -= 1; 317 | topicWordCount[topic][word] -= 1; 318 | sumTopicWordCount[topic] -= 1; 319 | 320 | // Sample a topic 321 | for (int tIndex = 0; tIndex < numTopics; tIndex++) { 322 | multiPros[tIndex] = (docTopicCount[dIndex][tIndex] + alpha) 323 | * ((topicWordCount[tIndex][word] + beta) / (sumTopicWordCount[tIndex] + betaSum)); 324 | // multiPros[tIndex] = ((docTopicCount[dIndex][tIndex] + 325 | // alpha) / 326 | // (docTopicSum[dIndex] + alphaSum)) 327 | // * ((topicWordCount[tIndex][word] + beta) / 328 | // (topicWordSum[tIndex] + betaSum)); 329 | } 330 | topic = FuncUtils.nextDiscrete(multiPros); 331 | 332 | // Increase counts 333 | docTopicCount[dIndex][topic] += 1; 334 | // docTopicSum[dIndex] += 1; 335 | topicWordCount[topic][word] += 1; 336 | sumTopicWordCount[topic] += 1; 337 | 338 | // Update topic assignments 339 | topicAssignments.get(dIndex).set(wIndex, topic); 340 | } 341 | } 342 | } 343 | 344 | public void writeParameters() 345 | throws IOException 346 | { 347 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 348 | + expName + ".paras")); 349 | writer.write("-model" + "\t" + "LDA"); 350 | writer.write("\n-corpus" + "\t" + corpusPath); 351 | writer.write("\n-ntopics" + "\t" + numTopics); 352 | writer.write("\n-alpha" + "\t" + alpha); 353 | writer.write("\n-beta" + "\t" + beta); 354 | writer.write("\n-niters" + "\t" + numIterations); 355 | writer.write("\n-twords" + "\t" + topWords); 356 | writer.write("\n-name" + "\t" + expName); 357 | if (tAssignsFilePath.length() > 0) 358 | writer.write("\n-initFile" + "\t" + tAssignsFilePath); 359 | if (savestep > 0) 360 | writer.write("\n-sstep" + "\t" + savestep); 361 | 362 | writer.close(); 363 | } 364 | 365 | public void writeDictionary() 366 | throws IOException 367 | { 368 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 369 | + expName + ".vocabulary")); 370 | for (int id = 0; id < vocabularySize; id++) 371 | writer.write(id2WordVocabulary.get(id) + " " + id + "\n"); 372 | writer.close(); 373 | } 374 | 375 | public void writeIDbasedCorpus() 376 | throws IOException 377 | { 378 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 379 | + expName + ".IDcorpus")); 380 | for (int dIndex = 0; dIndex < numDocuments; dIndex++) { 381 | int docSize = corpus.get(dIndex).size(); 382 | for (int wIndex = 0; wIndex < docSize; wIndex++) { 383 | writer.write(corpus.get(dIndex).get(wIndex) + " "); 384 | } 385 | writer.write("\n"); 386 | } 387 | writer.close(); 388 | } 389 | 390 | public void writeTopicAssignments() 391 | throws IOException 392 | { 393 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 394 | + expName + ".topicAssignments")); 395 | for (int dIndex = 0; dIndex < numDocuments; dIndex++) { 396 | int docSize = corpus.get(dIndex).size(); 397 | for (int wIndex = 0; wIndex < docSize; wIndex++) { 398 | writer.write(topicAssignments.get(dIndex).get(wIndex) + " "); 399 | } 400 | writer.write("\n"); 401 | } 402 | writer.close(); 403 | } 404 | 405 | public void writeTopTopicalWords() 406 | throws IOException 407 | { 408 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 409 | + expName + ".topWords")); 410 | 411 | for (int tIndex = 0; tIndex < numTopics; tIndex++) { 412 | 
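// [Editorial note, not in the original source] sampleInSingleIteration() above is the
// standard collapsed Gibbs update for LDA (Griffiths & Steyvers, 2004):
//
//   p(z_i = t | z_-i, w) ∝ (docTopicCount[d][t] + alpha)
//                        * (topicWordCount[t][w_i] + beta) / (sumTopicWordCount[t] + betaSum)
//
// The document-side denominator (sumDocTopicCount[d] + alphaSum) is the same for every
// topic given a fixed token, so it cancels -- which is why the fuller formula is left
// commented out in that loop. A self-contained sketch of the per-token update, with
// names mirroring the fields of this class:

static int sampleTopicForToken(int d, int w, int[][] docTopicCount,
        int[][] topicWordCount, int[] sumTopicWordCount,
        double alpha, double beta, double betaSum, double[] multiPros)
{
    for (int t = 0; t < multiPros.length; t++) {
        // unnormalized posterior mass of topic t for word w in document d
        multiPros[t] = (docTopicCount[d][t] + alpha)
                * ((topicWordCount[t][w] + beta) / (sumTopicWordCount[t] + betaSum));
    }
    return FuncUtils.nextDiscrete(multiPros); // draw t proportionally to multiPros[t]
}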
writer.write("Topic" + new Integer(tIndex) + ":"); 413 | 414 | Map wordCount = new TreeMap(); 415 | for (int wIndex = 0; wIndex < vocabularySize; wIndex++) { 416 | wordCount.put(wIndex, topicWordCount[tIndex][wIndex]); 417 | } 418 | wordCount = FuncUtils.sortByValueDescending(wordCount); 419 | 420 | Set mostLikelyWords = wordCount.keySet(); 421 | int count = 0; 422 | for (Integer index : mostLikelyWords) { 423 | if (count < topWords) { 424 | double pro = (topicWordCount[tIndex][index] + beta) 425 | / (sumTopicWordCount[tIndex] + betaSum); 426 | pro = Math.round(pro * 1000000.0) / 1000000.0; 427 | writer.write(" " + id2WordVocabulary.get(index) + "(" + pro 428 | + ")"); 429 | count += 1; 430 | } 431 | else { 432 | writer.write("\n\n"); 433 | break; 434 | } 435 | } 436 | } 437 | writer.close(); 438 | } 439 | 440 | public void writeTopicWordPros() 441 | throws IOException 442 | { 443 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 444 | + expName + ".phi")); 445 | for (int i = 0; i < numTopics; i++) { 446 | for (int j = 0; j < vocabularySize; j++) { 447 | double pro = (topicWordCount[i][j] + beta) 448 | / (sumTopicWordCount[i] + betaSum); 449 | writer.write(pro + " "); 450 | } 451 | writer.write("\n"); 452 | } 453 | writer.close(); 454 | } 455 | 456 | public void writeTopicWordCount() 457 | throws IOException 458 | { 459 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 460 | + expName + ".WTcount")); 461 | for (int i = 0; i < numTopics; i++) { 462 | for (int j = 0; j < vocabularySize; j++) { 463 | writer.write(topicWordCount[i][j] + " "); 464 | } 465 | writer.write("\n"); 466 | } 467 | writer.close(); 468 | 469 | } 470 | 471 | public void writeDocTopicPros() 472 | throws IOException 473 | { 474 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 475 | + expName + ".theta")); 476 | for (int i = 0; i < numDocuments; i++) { 477 | for (int j = 0; j < numTopics; j++) { 478 | double pro = (docTopicCount[i][j] + alpha) 479 | / (sumDocTopicCount[i] + alphaSum); 480 | writer.write(pro + " "); 481 | } 482 | writer.write("\n"); 483 | } 484 | writer.close(); 485 | } 486 | 487 | public void writeDocTopicCount() 488 | throws IOException 489 | { 490 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 491 | + expName + ".DTcount")); 492 | for (int i = 0; i < numDocuments; i++) { 493 | for (int j = 0; j < numTopics; j++) { 494 | writer.write(docTopicCount[i][j] + " "); 495 | } 496 | writer.write("\n"); 497 | } 498 | writer.close(); 499 | } 500 | 501 | public void write() 502 | throws IOException 503 | { 504 | writeTopTopicalWords(); 505 | writeDocTopicPros(); 506 | writeTopicAssignments(); 507 | writeTopicWordPros(); 508 | } 509 | 510 | public static void main(String args[]) 511 | throws Exception 512 | { 513 | GibbsSamplingLDA_Inf lda = new GibbsSamplingLDA_Inf( 514 | "test/testLDA.paras", "test/unseenTest.txt", 100, 20, "testLDAinf", 515 | 0); 516 | lda.inference(); 517 | } 518 | } 519 | -------------------------------------------------------------------------------- /src/models/GibbsSamplingLDA.java: -------------------------------------------------------------------------------- 1 | package models; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.BufferedWriter; 5 | import java.io.FileReader; 6 | import java.io.FileWriter; 7 | import java.io.IOException; 8 | import java.util.ArrayList; 9 | import java.util.HashMap; 10 | import java.util.List; 11 | import java.util.Map; 12 | import java.util.Set; 13 | import 
java.util.TreeMap; 14 | 15 | import utility.FuncUtils; 16 | 17 | /** 18 | * jLDADMM: A Java package for the LDA and DMM topic models 19 | * 20 | * Implementation of the Latent Dirichlet Allocation topic model, using 21 | * collapsed Gibbs sampling, as described in: 22 | * 23 | * Thomas L. Griffiths and Mark Steyvers. 2004. Finding scientific topics. 24 | * Proceedings of the National Academy of Sciences of the United States of 25 | * America, 101(Suppl 1):5228–5235. 26 | * 27 | * @author: Dat Quoc Nguyen 28 | */ 29 | 30 | public class GibbsSamplingLDA 31 | { 32 | public double alpha; // Hyper-parameter alpha 33 | public double beta; // Hyper-parameter alpha 34 | public int numTopics; // Number of topics 35 | public int numIterations; // Number of Gibbs sampling iterations 36 | public int topWords; // Number of most probable words for each topic 37 | 38 | public double alphaSum; // alpha * numTopics 39 | public double betaSum; // beta * vocabularySize 40 | 41 | public List> corpus; // Word ID-based corpus 42 | public List> topicAssignments; // Topics assignments for words 43 | // in the corpus 44 | public int numDocuments; // Number of documents in the corpus 45 | public int numWordsInCorpus; // Number of words in the corpus 46 | 47 | public HashMap word2IdVocabulary; // Vocabulary to get ID 48 | // given a word 49 | public HashMap id2WordVocabulary; // Vocabulary to get word 50 | // given an ID 51 | public int vocabularySize; // The number of word types in the corpus 52 | 53 | // numDocuments * numTopics matrix 54 | // Given a document: number of its words assigned to each topic 55 | public int[][] docTopicCount; 56 | // Number of words in every document 57 | public int[] sumDocTopicCount; 58 | // numTopics * vocabularySize matrix 59 | // Given a topic: number of times a word type assigned to the topic 60 | public int[][] topicWordCount; 61 | // Total number of words assigned to a topic 62 | public int[] sumTopicWordCount; 63 | 64 | // Double array used to sample a topic 65 | public double[] multiPros; 66 | 67 | // Path to the directory containing the corpus 68 | public String folderPath; 69 | // Path to the topic modeling corpus 70 | public String corpusPath; 71 | 72 | public String expName = "LDAmodel"; 73 | public String orgExpName = "LDAmodel"; 74 | public String tAssignsFilePath = ""; 75 | public int savestep = 0; 76 | 77 | public GibbsSamplingLDA(String pathToCorpus, int inNumTopics, 78 | double inAlpha, double inBeta, int inNumIterations, int inTopWords) 79 | throws Exception 80 | { 81 | this(pathToCorpus, inNumTopics, inAlpha, inBeta, inNumIterations, 82 | inTopWords, "LDAmodel"); 83 | } 84 | 85 | public GibbsSamplingLDA(String pathToCorpus, int inNumTopics, 86 | double inAlpha, double inBeta, int inNumIterations, int inTopWords, 87 | String inExpName) 88 | throws Exception 89 | { 90 | this(pathToCorpus, inNumTopics, inAlpha, inBeta, inNumIterations, 91 | inTopWords, inExpName, "", 0); 92 | } 93 | 94 | public GibbsSamplingLDA(String pathToCorpus, int inNumTopics, 95 | double inAlpha, double inBeta, int inNumIterations, int inTopWords, 96 | String inExpName, String pathToTAfile) 97 | throws Exception 98 | { 99 | this(pathToCorpus, inNumTopics, inAlpha, inBeta, inNumIterations, 100 | inTopWords, inExpName, pathToTAfile, 0); 101 | } 102 | 103 | public GibbsSamplingLDA(String pathToCorpus, int inNumTopics, 104 | double inAlpha, double inBeta, int inNumIterations, int inTopWords, 105 | String inExpName, int inSaveStep) 106 | throws Exception 107 | { 108 | this(pathToCorpus, inNumTopics, 
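// [Editorial note, not in the original source] The overloaded constructors above all
// delegate to the full nine-argument constructor that follows, filling in the defaults
// shown (expName "LDAmodel", no initial topic-assignment file, savestep 0), so these
// two calls are equivalent:
//
//   new GibbsSamplingLDA("test/corpus.txt", 20, 0.1, 0.01, 2000, 20);
//   new GibbsSamplingLDA("test/corpus.txt", 20, 0.1, 0.01, 2000, 20, "LDAmodel", "", 0);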
inAlpha, inBeta, inNumIterations, 109 | inTopWords, inExpName, "", inSaveStep); 110 | } 111 | 112 | public GibbsSamplingLDA(String pathToCorpus, int inNumTopics, 113 | double inAlpha, double inBeta, int inNumIterations, int inTopWords, 114 | String inExpName, String pathToTAfile, int inSaveStep) 115 | throws Exception 116 | { 117 | 118 | alpha = inAlpha; 119 | beta = inBeta; 120 | numTopics = inNumTopics; 121 | numIterations = inNumIterations; 122 | topWords = inTopWords; 123 | savestep = inSaveStep; 124 | expName = inExpName; 125 | orgExpName = expName; 126 | corpusPath = pathToCorpus; 127 | folderPath = pathToCorpus.substring( 128 | 0, 129 | Math.max(pathToCorpus.lastIndexOf("/"), 130 | pathToCorpus.lastIndexOf("\\")) + 1); 131 | 132 | System.out.println("Reading topic modeling corpus: " + pathToCorpus); 133 | 134 | word2IdVocabulary = new HashMap(); 135 | id2WordVocabulary = new HashMap(); 136 | corpus = new ArrayList>(); 137 | numDocuments = 0; 138 | numWordsInCorpus = 0; 139 | 140 | BufferedReader br = null; 141 | try { 142 | int indexWord = -1; 143 | br = new BufferedReader(new FileReader(pathToCorpus)); 144 | for (String doc; (doc = br.readLine()) != null;) { 145 | 146 | if (doc.trim().length() == 0) 147 | continue; 148 | 149 | String[] words = doc.trim().split("\\s+"); 150 | List document = new ArrayList(); 151 | 152 | for (String word : words) { 153 | if (word2IdVocabulary.containsKey(word)) { 154 | document.add(word2IdVocabulary.get(word)); 155 | } 156 | else { 157 | indexWord += 1; 158 | word2IdVocabulary.put(word, indexWord); 159 | id2WordVocabulary.put(indexWord, word); 160 | document.add(indexWord); 161 | } 162 | } 163 | 164 | numDocuments++; 165 | numWordsInCorpus += document.size(); 166 | corpus.add(document); 167 | } 168 | } 169 | catch (Exception e) { 170 | e.printStackTrace(); 171 | } 172 | 173 | vocabularySize = word2IdVocabulary.size(); // vocabularySize = indexWord 174 | docTopicCount = new int[numDocuments][numTopics]; 175 | topicWordCount = new int[numTopics][vocabularySize]; 176 | sumDocTopicCount = new int[numDocuments]; 177 | sumTopicWordCount = new int[numTopics]; 178 | 179 | multiPros = new double[numTopics]; 180 | for (int i = 0; i < numTopics; i++) { 181 | multiPros[i] = 1.0 / numTopics; 182 | } 183 | 184 | alphaSum = numTopics * alpha; 185 | betaSum = vocabularySize * beta; 186 | 187 | System.out.println("Corpus size: " + numDocuments + " docs, " 188 | + numWordsInCorpus + " words"); 189 | System.out.println("Vocabuary size: " + vocabularySize); 190 | System.out.println("Number of topics: " + numTopics); 191 | System.out.println("alpha: " + alpha); 192 | System.out.println("beta: " + beta); 193 | System.out.println("Number of sampling iterations: " + numIterations); 194 | System.out.println("Number of top topical words: " + topWords); 195 | 196 | tAssignsFilePath = pathToTAfile; 197 | if (tAssignsFilePath.length() > 0) 198 | initialize(tAssignsFilePath); 199 | else 200 | initialize(); 201 | } 202 | 203 | /** 204 | * Randomly initialize topic assignments 205 | */ 206 | public void initialize() 207 | throws IOException 208 | { 209 | System.out.println("Randomly initializing topic assignments ..."); 210 | 211 | topicAssignments = new ArrayList>(); 212 | 213 | for (int i = 0; i < numDocuments; i++) { 214 | List topics = new ArrayList(); 215 | int docSize = corpus.get(i).size(); 216 | for (int j = 0; j < docSize; j++) { 217 | int topic = FuncUtils.nextDiscrete(multiPros); // Sample a topic 218 | // Increase counts 219 | docTopicCount[i][topic] += 1; 220 | 
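// [Editorial note, not in the original source] word2IdVocabulary and id2WordVocabulary,
// built in the reading loop above, are exact inverses: each word type receives a dense
// integer id in order of first occurrence, so ids run 0..vocabularySize-1 and
//
//   id2WordVocabulary.get(word2IdVocabulary.get(word)).equals(word)   // for any known word
//
// holds throughout. This dense numbering is what lets the count matrices be plain
// int arrays indexed by word id.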
topicWordCount[topic][corpus.get(i).get(j)] += 1; 221 | sumDocTopicCount[i] += 1; 222 | sumTopicWordCount[topic] += 1; 223 | 224 | topics.add(topic); 225 | } 226 | topicAssignments.add(topics); 227 | } 228 | } 229 | 230 | /** 231 | * Initialize topic assignments from a given file 232 | */ 233 | public void initialize(String pathToTopicAssignmentFile) 234 | { 235 | System.out.println("Reading topic-assignment file: " 236 | + pathToTopicAssignmentFile); 237 | 238 | topicAssignments = new ArrayList>(); 239 | 240 | BufferedReader br = null; 241 | try { 242 | br = new BufferedReader(new FileReader(pathToTopicAssignmentFile)); 243 | int docID = 0; 244 | int numWords = 0; 245 | for (String line; (line = br.readLine()) != null;) { 246 | String[] strTopics = line.trim().split("\\s+"); 247 | List topics = new ArrayList(); 248 | for (int j = 0; j < strTopics.length; j++) { 249 | int topic = new Integer(strTopics[j]); 250 | // Increase counts 251 | docTopicCount[docID][topic] += 1; 252 | topicWordCount[topic][corpus.get(docID).get(j)] += 1; 253 | sumDocTopicCount[docID] += 1; 254 | sumTopicWordCount[topic] += 1; 255 | 256 | topics.add(topic); 257 | numWords++; 258 | } 259 | topicAssignments.add(topics); 260 | docID++; 261 | } 262 | 263 | if ((docID != numDocuments) || (numWords != numWordsInCorpus)) { 264 | System.out 265 | .println("The topic modeling corpus and topic assignment file are not consistent!!!"); 266 | throw new Exception(); 267 | } 268 | } 269 | catch (Exception e) { 270 | e.printStackTrace(); 271 | } 272 | } 273 | 274 | public void inference() 275 | throws IOException 276 | { 277 | writeParameters(); 278 | writeDictionary(); 279 | 280 | System.out.println("Running Gibbs sampling inference: "); 281 | 282 | for (int iter = 1; iter <= numIterations; iter++) { 283 | 284 | System.out.println("\tSampling iteration: " + (iter)); 285 | // System.out.println("\t\tPerplexity: " + computePerplexity()); 286 | 287 | sampleInSingleIteration(); 288 | 289 | if ((savestep > 0) && (iter % savestep == 0) 290 | && (iter < numIterations)) { 291 | System.out.println("\t\tSaving the output from the " + iter 292 | + "^{th} sample"); 293 | expName = orgExpName + "-" + iter; 294 | write(); 295 | } 296 | } 297 | expName = orgExpName; 298 | 299 | System.out.println("Writing output from the last sample ..."); 300 | write(); 301 | 302 | System.out.println("Sampling completed!"); 303 | 304 | } 305 | 306 | public void sampleInSingleIteration() 307 | { 308 | for (int dIndex = 0; dIndex < numDocuments; dIndex++) { 309 | int docSize = corpus.get(dIndex).size(); 310 | for (int wIndex = 0; wIndex < docSize; wIndex++) { 311 | // Get current word and its topic 312 | int topic = topicAssignments.get(dIndex).get(wIndex); 313 | int word = corpus.get(dIndex).get(wIndex); 314 | 315 | // Decrease counts 316 | docTopicCount[dIndex][topic] -= 1; 317 | // docTopicSum[dIndex] -= 1; 318 | topicWordCount[topic][word] -= 1; 319 | sumTopicWordCount[topic] -= 1; 320 | 321 | // Sample a topic 322 | for (int tIndex = 0; tIndex < numTopics; tIndex++) { 323 | multiPros[tIndex] = (docTopicCount[dIndex][tIndex] + alpha) 324 | * ((topicWordCount[tIndex][word] + beta) / (sumTopicWordCount[tIndex] + betaSum)); 325 | // multiPros[tIndex] = ((docTopicCount[dIndex][tIndex] + 326 | // alpha) / 327 | // (docTopicSum[dIndex] + alphaSum)) 328 | // * ((topicWordCount[tIndex][word] + beta) / 329 | // (topicWordSum[tIndex] + betaSum)); 330 | } 331 | topic = FuncUtils.nextDiscrete(multiPros); 332 | 333 | // Increase counts 334 | docTopicCount[dIndex][topic] 
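// [Editorial note, not in the original source] With -sstep S > 0, inference() above also
// writes intermediate output after every S-th iteration except the final one: for
// example, -niters 2000 -sstep 500 -name testLDA produces the "testLDA-500",
// "testLDA-1000" and "testLDA-1500" file sets, and the final files are written under
// the plain name "testLDA" once expName has been restored to orgExpName.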
+= 1; 335 | // docTopicSum[dIndex] += 1; 336 | topicWordCount[topic][word] += 1; 337 | sumTopicWordCount[topic] += 1; 338 | 339 | // Update topic assignments 340 | topicAssignments.get(dIndex).set(wIndex, topic); 341 | } 342 | } 343 | } 344 | 345 | public void writeParameters() 346 | throws IOException 347 | { 348 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 349 | + expName + ".paras")); 350 | writer.write("-model" + "\t" + "LDA"); 351 | writer.write("\n-corpus" + "\t" + corpusPath); 352 | writer.write("\n-ntopics" + "\t" + numTopics); 353 | writer.write("\n-alpha" + "\t" + alpha); 354 | writer.write("\n-beta" + "\t" + beta); 355 | writer.write("\n-niters" + "\t" + numIterations); 356 | writer.write("\n-twords" + "\t" + topWords); 357 | writer.write("\n-name" + "\t" + expName); 358 | if (tAssignsFilePath.length() > 0) 359 | writer.write("\n-initFile" + "\t" + tAssignsFilePath); 360 | if (savestep > 0) 361 | writer.write("\n-sstep" + "\t" + savestep); 362 | 363 | writer.close(); 364 | } 365 | 366 | public void writeDictionary() 367 | throws IOException 368 | { 369 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 370 | + expName + ".vocabulary")); 371 | for (int id = 0; id < vocabularySize; id++) 372 | writer.write(id2WordVocabulary.get(id) + " " + id + "\n"); 373 | writer.close(); 374 | } 375 | 376 | public void writeIDbasedCorpus() 377 | throws IOException 378 | { 379 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 380 | + expName + ".IDcorpus")); 381 | for (int dIndex = 0; dIndex < numDocuments; dIndex++) { 382 | int docSize = corpus.get(dIndex).size(); 383 | for (int wIndex = 0; wIndex < docSize; wIndex++) { 384 | writer.write(corpus.get(dIndex).get(wIndex) + " "); 385 | } 386 | writer.write("\n"); 387 | } 388 | writer.close(); 389 | } 390 | 391 | public void writeTopicAssignments() 392 | throws IOException 393 | { 394 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 395 | + expName + ".topicAssignments")); 396 | for (int dIndex = 0; dIndex < numDocuments; dIndex++) { 397 | int docSize = corpus.get(dIndex).size(); 398 | for (int wIndex = 0; wIndex < docSize; wIndex++) { 399 | writer.write(topicAssignments.get(dIndex).get(wIndex) + " "); 400 | } 401 | writer.write("\n"); 402 | } 403 | writer.close(); 404 | } 405 | 406 | public void writeTopTopicalWords() 407 | throws IOException 408 | { 409 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 410 | + expName + ".topWords")); 411 | 412 | for (int tIndex = 0; tIndex < numTopics; tIndex++) { 413 | writer.write("Topic" + new Integer(tIndex) + ":"); 414 | 415 | Map wordCount = new TreeMap(); 416 | for (int wIndex = 0; wIndex < vocabularySize; wIndex++) { 417 | wordCount.put(wIndex, topicWordCount[tIndex][wIndex]); 418 | } 419 | wordCount = FuncUtils.sortByValueDescending(wordCount); 420 | 421 | Set mostLikelyWords = wordCount.keySet(); 422 | int count = 0; 423 | for (Integer index : mostLikelyWords) { 424 | if (count < topWords) { 425 | double pro = (topicWordCount[tIndex][index] + beta) 426 | / (sumTopicWordCount[tIndex] + betaSum); 427 | pro = Math.round(pro * 1000000.0) / 1000000.0; 428 | writer.write(" " + id2WordVocabulary.get(index) + "(" + pro 429 | + ")"); 430 | count += 1; 431 | } 432 | else { 433 | writer.write("\n\n"); 434 | break; 435 | } 436 | } 437 | } 438 | writer.close(); 439 | } 440 | 441 | public void writeTopicWordPros() 442 | throws IOException 443 | { 444 | BufferedWriter writer = new BufferedWriter(new 
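// [Editorial note, not in the original source] A write() call persists four artifacts:
// <name>.topWords, <name>.theta, <name>.topicAssignments and <name>.phi, while .paras
// and .vocabulary are written once when inference() starts. A sketch of loading a saved
// .theta matrix back into memory (the path is illustrative, matching the "testLDA"
// experiment name used in main() below):

List<double[]> theta = new ArrayList<double[]>();
BufferedReader reader = new BufferedReader(new FileReader("test/testLDA.theta"));
for (String line; (line = reader.readLine()) != null;) {
    String[] tokens = line.trim().split("\\s+");
    double[] row = new double[tokens.length]; // one row per document, one entry per topic
    for (int j = 0; j < tokens.length; j++)
        row[j] = Double.parseDouble(tokens[j]);
    theta.add(row);
}
reader.close();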
FileWriter(folderPath 445 | + expName + ".phi")); 446 | for (int i = 0; i < numTopics; i++) { 447 | for (int j = 0; j < vocabularySize; j++) { 448 | double pro = (topicWordCount[i][j] + beta) 449 | / (sumTopicWordCount[i] + betaSum); 450 | writer.write(pro + " "); 451 | } 452 | writer.write("\n"); 453 | } 454 | writer.close(); 455 | } 456 | 457 | public void writeTopicWordCount() 458 | throws IOException 459 | { 460 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 461 | + expName + ".WTcount")); 462 | for (int i = 0; i < numTopics; i++) { 463 | for (int j = 0; j < vocabularySize; j++) { 464 | writer.write(topicWordCount[i][j] + " "); 465 | } 466 | writer.write("\n"); 467 | } 468 | writer.close(); 469 | 470 | } 471 | 472 | public void writeDocTopicPros() 473 | throws IOException 474 | { 475 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 476 | + expName + ".theta")); 477 | for (int i = 0; i < numDocuments; i++) { 478 | for (int j = 0; j < numTopics; j++) { 479 | double pro = (docTopicCount[i][j] + alpha) 480 | / (sumDocTopicCount[i] + alphaSum); 481 | writer.write(pro + " "); 482 | } 483 | writer.write("\n"); 484 | } 485 | writer.close(); 486 | } 487 | 488 | public void writeDocTopicCount() 489 | throws IOException 490 | { 491 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 492 | + expName + ".DTcount")); 493 | for (int i = 0; i < numDocuments; i++) { 494 | for (int j = 0; j < numTopics; j++) { 495 | writer.write(docTopicCount[i][j] + " "); 496 | } 497 | writer.write("\n"); 498 | } 499 | writer.close(); 500 | } 501 | 502 | public void write() 503 | throws IOException 504 | { 505 | writeTopTopicalWords(); 506 | writeDocTopicPros(); 507 | writeTopicAssignments(); 508 | writeTopicWordPros(); 509 | } 510 | 511 | public static void main(String args[]) 512 | throws Exception 513 | { 514 | GibbsSamplingLDA lda = new GibbsSamplingLDA("test/corpus.txt", 7, 0.1, 515 | 0.01, 2000, 20, "testLDA"); 516 | lda.inference(); 517 | } 518 | } 519 | -------------------------------------------------------------------------------- /src/models/GibbsSamplingDMM_Inf.java: -------------------------------------------------------------------------------- 1 | package models; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.BufferedWriter; 5 | import java.io.FileReader; 6 | import java.io.FileWriter; 7 | import java.io.IOException; 8 | import java.util.ArrayList; 9 | import java.util.HashMap; 10 | import java.util.List; 11 | import java.util.Map; 12 | import java.util.Set; 13 | import java.util.TreeMap; 14 | 15 | import utility.FuncUtils; 16 | 17 | /** 18 | * jLDADMM: A Java package for the LDA and DMM topic models 19 | * 20 | * http://jldadmm.sourceforge.net/ 21 | * 22 | * @author: Dat Quoc Nguyen 23 | * 24 | */ 25 | 26 | public class GibbsSamplingDMM_Inf 27 | { 28 | public double alpha; // Hyper-parameter alpha 29 | public double beta; // Hyper-parameter alpha 30 | public int numTopics; // Number of topics 31 | public int numIterations; // Number of Gibbs sampling iterations 32 | public int topWords; // Number of most probable words for each topic 33 | 34 | public double alphaSum; // alpha * numTopics 35 | public double betaSum; // beta * vocabularySize 36 | 37 | public List> corpus; // Word ID-based corpus 38 | public List topicAssignments; // Topics assignments for documents 39 | public int numDocuments; // Number of documents in the corpus 40 | public int numWordsInCorpus; // Number of words in the corpus 41 | 42 | public HashMap 
word2IdVocabulary; // Vocabulary to get ID 43 | // given a word 44 | public HashMap id2WordVocabulary; // Vocabulary to get word 45 | // given an ID 46 | public int vocabularySize; // The number of word types in the corpus 47 | 48 | // Number of documents assigned to a topic 49 | public int[] docTopicCount; 50 | // numTopics * vocabularySize matrix 51 | // Given a topic: number of times a word type assigned to the topic 52 | public int[][] topicWordCount; 53 | // Total number of words assigned to a topic 54 | public int[] sumTopicWordCount; 55 | 56 | // Double array used to sample a topic 57 | public double[] multiPros; 58 | 59 | // Path to the directory containing the corpus 60 | public String folderPath; 61 | // Path to the topic modeling corpus 62 | public String corpusPath; 63 | 64 | // Given a document, number of times its i^{th} word appearing from 65 | // the first index to the i^{th}-index in the document 66 | // Example: given a document of "a a b a b c d c". We have: 1 2 1 3 2 1 1 2 67 | public List> occurenceToIndexCount; 68 | 69 | public String expName = "DMMinf"; 70 | public String orgExpName = "DMMinf"; 71 | public String tAssignsFilePath = ""; 72 | public int savestep = 0; 73 | 74 | public GibbsSamplingDMM_Inf(String pathToTrainingParasFile, 75 | String pathToUnseenCorpus, int inNumIterations, int inTopWords, 76 | String inExpName, int inSaveStep) 77 | throws Exception 78 | { 79 | HashMap paras = parseTrainingParasFile(pathToTrainingParasFile); 80 | if (!paras.get("-model").equals("DMM")) { 81 | throw new Exception("Wrong pre-trained model!!!"); 82 | } 83 | alpha = new Double(paras.get("-alpha")); 84 | beta = new Double(paras.get("-beta")); 85 | numTopics = new Integer(paras.get("-ntopics")); 86 | 87 | numIterations = inNumIterations; 88 | topWords = inTopWords; 89 | savestep = inSaveStep; 90 | expName = inExpName; 91 | orgExpName = expName; 92 | 93 | String trainingCorpus = paras.get("-corpus"); 94 | String trainingCorpusfolder = trainingCorpus.substring( 95 | 0, 96 | Math.max(trainingCorpus.lastIndexOf("/"), 97 | trainingCorpus.lastIndexOf("\\")) + 1); 98 | String topicAssignment4TrainFile = trainingCorpusfolder 99 | + paras.get("-name") + ".topicAssignments"; 100 | 101 | word2IdVocabulary = new HashMap(); 102 | id2WordVocabulary = new HashMap(); 103 | initializeWordCount(trainingCorpus, topicAssignment4TrainFile); 104 | 105 | corpusPath = pathToUnseenCorpus; 106 | folderPath = pathToUnseenCorpus.substring( 107 | 0, 108 | Math.max(pathToUnseenCorpus.lastIndexOf("/"), 109 | pathToUnseenCorpus.lastIndexOf("\\")) + 1); 110 | System.out.println("Reading unseen corpus: " + pathToUnseenCorpus); 111 | corpus = new ArrayList>(); 112 | occurenceToIndexCount = new ArrayList>(); 113 | numDocuments = 0; 114 | numWordsInCorpus = 0; 115 | 116 | BufferedReader br = null; 117 | try { 118 | br = new BufferedReader(new FileReader(pathToUnseenCorpus)); 119 | for (String doc; (doc = br.readLine()) != null;) { 120 | if (doc.trim().length() == 0) 121 | continue; 122 | 123 | String[] words = doc.trim().split("\\s+"); 124 | List document = new ArrayList(); 125 | 126 | List wordOccurenceToIndexInDoc = new ArrayList(); 127 | HashMap wordOccurenceToIndexInDocCount = new HashMap(); 128 | 129 | for (String word : words) { 130 | if (word2IdVocabulary.containsKey(word)) { 131 | document.add(word2IdVocabulary.get(word)); 132 | int times = 0; 133 | if (wordOccurenceToIndexInDocCount.containsKey(word)) { 134 | times = wordOccurenceToIndexInDocCount.get(word); 135 | } 136 | times += 1; 137 | 
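// [Editorial note, not in the original source] wordOccurenceToIndexInDoc, filled in this
// reading loop and stored in occurenceToIndexCount, records for every token position how
// often that word type has appeared in the document up to and including that position
// ("a a b a b c d c" gives 1 2 1 3 2 1 1 2, as the field comment above says). These
// running counts supply the "+ count - 1" numerator terms in sampleInSingleIteration()
// below, so a word occurring k times in one document contributes the rising factorial
// (n + beta)(n + beta + 1)...(n + beta + k - 1) to the collapsed DMM posterior.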
wordOccurenceToIndexInDocCount.put(word, times); 138 | wordOccurenceToIndexInDoc.add(times); 139 | } 140 | else { 141 | // Skip this unknown-word 142 | } 143 | } 144 | numDocuments++; 145 | numWordsInCorpus += document.size(); 146 | corpus.add(document); 147 | occurenceToIndexCount.add(wordOccurenceToIndexInDoc); 148 | } 149 | } 150 | catch (Exception e) { 151 | e.printStackTrace(); 152 | } 153 | 154 | docTopicCount = new int[numTopics]; 155 | multiPros = new double[numTopics]; 156 | for (int i = 0; i < numTopics; i++) { 157 | multiPros[i] = 1.0 / numTopics; 158 | } 159 | 160 | alphaSum = numTopics * alpha; 161 | betaSum = vocabularySize * beta; 162 | 163 | System.out.println("Corpus size: " + numDocuments + " docs, " 164 | + numWordsInCorpus + " words"); 165 | System.out.println("Vocabuary size: " + vocabularySize); 166 | System.out.println("Number of topics: " + numTopics); 167 | System.out.println("alpha: " + alpha); 168 | System.out.println("beta: " + beta); 169 | System.out.println("Number of sampling iterations: " + numIterations); 170 | System.out.println("Number of top topical words: " + topWords); 171 | 172 | initialize(); 173 | } 174 | 175 | private HashMap parseTrainingParasFile( 176 | String pathToTrainingParasFile) 177 | throws Exception 178 | { 179 | HashMap paras = new HashMap(); 180 | BufferedReader br = null; 181 | try { 182 | br = new BufferedReader(new FileReader(pathToTrainingParasFile)); 183 | for (String line; (line = br.readLine()) != null;) { 184 | 185 | if (line.trim().length() == 0) 186 | continue; 187 | 188 | String[] paraOptions = line.trim().split("\\s+"); 189 | paras.put(paraOptions[0], paraOptions[1]); 190 | } 191 | } 192 | catch (Exception e) { 193 | e.printStackTrace(); 194 | } 195 | return paras; 196 | } 197 | 198 | private void initializeWordCount(String pathToTrainingCorpus, 199 | String pathToTopicAssignmentFile) 200 | { 201 | System.out.println("Loading pre-trained model..."); 202 | List> trainCorpus = new ArrayList>(); 203 | BufferedReader br = null; 204 | try { 205 | int indexWord = -1; 206 | br = new BufferedReader(new FileReader(pathToTrainingCorpus)); 207 | for (String doc; (doc = br.readLine()) != null;) { 208 | 209 | if (doc.trim().length() == 0) 210 | continue; 211 | 212 | String[] words = doc.trim().split("\\s+"); 213 | List document = new ArrayList(); 214 | 215 | for (String word : words) { 216 | if (word2IdVocabulary.containsKey(word)) { 217 | document.add(word2IdVocabulary.get(word)); 218 | } 219 | else { 220 | indexWord += 1; 221 | word2IdVocabulary.put(word, indexWord); 222 | id2WordVocabulary.put(indexWord, word); 223 | document.add(indexWord); 224 | } 225 | } 226 | trainCorpus.add(document); 227 | } 228 | } 229 | catch (Exception e) { 230 | e.printStackTrace(); 231 | } 232 | 233 | vocabularySize = word2IdVocabulary.size(); 234 | topicWordCount = new int[numTopics][vocabularySize]; 235 | sumTopicWordCount = new int[numTopics]; 236 | 237 | try { 238 | br = new BufferedReader(new FileReader(pathToTopicAssignmentFile)); 239 | int docId = 0; 240 | for (String line; (line = br.readLine()) != null;) { 241 | String[] strTopics = line.trim().split("\\s+"); 242 | for (int j = 0; j < strTopics.length; j++) { 243 | int wordId = trainCorpus.get(docId).get(j); 244 | int topic = new Integer(strTopics[j]); 245 | topicWordCount[topic][wordId] += 1; 246 | sumTopicWordCount[topic] += 1; 247 | } 248 | docId++; 249 | } 250 | } 251 | catch (Exception e) { 252 | e.printStackTrace(); 253 | } 254 | } 255 | 256 | /** 257 | * Randomly initialize topic assignments 
258 | */ 259 | public void initialize() 260 | throws IOException 261 | { 262 | System.out.println("Randomly initializing topic assignments ..."); 263 | topicAssignments = new ArrayList(); 264 | for (int i = 0; i < numDocuments; i++) { 265 | int topic = FuncUtils.nextDiscrete(multiPros); // Sample a topic 266 | docTopicCount[topic] += 1; 267 | int docSize = corpus.get(i).size(); 268 | for (int j = 0; j < docSize; j++) { 269 | topicWordCount[topic][corpus.get(i).get(j)] += 1; 270 | sumTopicWordCount[topic] += 1; 271 | } 272 | topicAssignments.add(topic); 273 | } 274 | } 275 | 276 | public void inference() 277 | throws IOException 278 | { 279 | writeParameters(); 280 | writeDictionary(); 281 | 282 | System.out.println("Running Gibbs sampling inference: "); 283 | 284 | for (int iter = 1; iter <= numIterations; iter++) { 285 | 286 | System.out.println("\tSampling iteration: " + (iter)); 287 | // System.out.println("\t\tPerplexity: " + computePerplexity()); 288 | 289 | sampleInSingleIteration(); 290 | 291 | if ((savestep > 0) && (iter % savestep == 0) 292 | && (iter < numIterations)) { 293 | System.out.println("\t\tSaving the output from the " + iter 294 | + "^{th} sample"); 295 | expName = orgExpName + "-" + iter; 296 | write(); 297 | } 298 | } 299 | expName = orgExpName; 300 | 301 | System.out.println("Writing output from the last sample ..."); 302 | write(); 303 | 304 | System.out.println("Sampling completed!"); 305 | 306 | } 307 | 308 | public void sampleInSingleIteration() 309 | { 310 | for (int dIndex = 0; dIndex < numDocuments; dIndex++) { 311 | int topic = topicAssignments.get(dIndex); 312 | List document = corpus.get(dIndex); 313 | int docSize = document.size(); 314 | 315 | // Decrease counts 316 | docTopicCount[topic] -= 1; 317 | for (int wIndex = 0; wIndex < docSize; wIndex++) { 318 | int word = document.get(wIndex); 319 | topicWordCount[topic][word] -= 1; 320 | sumTopicWordCount[topic] -= 1; 321 | } 322 | 323 | // Sample a topic 324 | for (int tIndex = 0; tIndex < numTopics; tIndex++) { 325 | multiPros[tIndex] = (docTopicCount[tIndex] + alpha); 326 | for (int wIndex = 0; wIndex < docSize; wIndex++) { 327 | int word = document.get(wIndex); 328 | multiPros[tIndex] *= (topicWordCount[tIndex][word] + beta 329 | + occurenceToIndexCount.get(dIndex).get(wIndex) - 1) 330 | / (sumTopicWordCount[tIndex] + betaSum + wIndex); 331 | } 332 | } 333 | topic = FuncUtils.nextDiscrete(multiPros); 334 | 335 | // Increase counts 336 | docTopicCount[topic] += 1; 337 | for (int wIndex = 0; wIndex < docSize; wIndex++) { 338 | int word = document.get(wIndex); 339 | topicWordCount[topic][word] += 1; 340 | sumTopicWordCount[topic] += 1; 341 | } 342 | // Update topic assignments 343 | topicAssignments.set(dIndex, topic); 344 | } 345 | } 346 | 347 | public void writeParameters() 348 | throws IOException 349 | { 350 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 351 | + expName + ".paras")); 352 | writer.write("-model" + "\t" + "DMM"); 353 | writer.write("\n-corpus" + "\t" + corpusPath); 354 | writer.write("\n-ntopics" + "\t" + numTopics); 355 | writer.write("\n-alpha" + "\t" + alpha); 356 | writer.write("\n-beta" + "\t" + beta); 357 | writer.write("\n-niters" + "\t" + numIterations); 358 | writer.write("\n-twords" + "\t" + topWords); 359 | writer.write("\n-name" + "\t" + expName); 360 | if (tAssignsFilePath.length() > 0) 361 | writer.write("\n-initFile" + "\t" + tAssignsFilePath); 362 | if (savestep > 0) 363 | writer.write("\n-sstep" + "\t" + savestep); 364 | 365 | writer.close(); 366 | }
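// [Editorial note, not in the original source] sampleInSingleIteration() above
// implements the collapsed sampling equation of Yin & Wang (2014): with a single topic
// per document,
//
//   p(z_d = t | z_-d) ∝ (docTopicCount[t] + alpha)
//        * PROD_{i=1..Nd} (topicWordCount[t][w_i] + beta + c_i - 1)
//                       / (sumTopicWordCount[t] + betaSum + i - 1)
//
// where c_i comes from occurenceToIndexCount and the "+ wIndex" denominator offset plays
// the role of i - 1. The scoring step, factored out as a sketch (field names mirror this
// class; not part of the original source):

private void scoreTopicsForDocument(List<Integer> document, List<Integer> occCounts)
{
    for (int tIndex = 0; tIndex < numTopics; tIndex++) {
        multiPros[tIndex] = docTopicCount[tIndex] + alpha;
        for (int wIndex = 0; wIndex < document.size(); wIndex++) {
            int word = document.get(wIndex);
            multiPros[tIndex] *= (topicWordCount[tIndex][word] + beta
                    + occCounts.get(wIndex) - 1)
                    / (sumTopicWordCount[tIndex] + betaSum + wIndex);
        }
    }
    // a topic is then drawn with FuncUtils.nextDiscrete(multiPros)
}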
367 | 368 | public void writeDictionary() 369 | throws IOException 370 | { 371 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 372 | + expName + ".vocabulary")); 373 | for (int id = 0; id < vocabularySize; id++) 374 | writer.write(id2WordVocabulary.get(id) + " " + id + "\n"); 375 | writer.close(); 376 | } 377 | 378 | public void writeIDbasedCorpus() 379 | throws IOException 380 | { 381 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 382 | + expName + ".IDcorpus")); 383 | for (int dIndex = 0; dIndex < numDocuments; dIndex++) { 384 | int docSize = corpus.get(dIndex).size(); 385 | for (int wIndex = 0; wIndex < docSize; wIndex++) { 386 | writer.write(corpus.get(dIndex).get(wIndex) + " "); 387 | } 388 | writer.write("\n"); 389 | } 390 | writer.close(); 391 | } 392 | 393 | public void writeTopicAssignments() 394 | throws IOException 395 | { 396 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 397 | + expName + ".topicAssignments")); 398 | for (int dIndex = 0; dIndex < numDocuments; dIndex++) { 399 | int docSize = corpus.get(dIndex).size(); 400 | int topic = topicAssignments.get(dIndex); 401 | for (int wIndex = 0; wIndex < docSize; wIndex++) { 402 | writer.write(topic + " "); 403 | } 404 | writer.write("\n"); 405 | } 406 | writer.close(); 407 | } 408 | 409 | public void writeTopTopicalWords() 410 | throws IOException 411 | { 412 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 413 | + expName + ".topWords")); 414 | 415 | for (int tIndex = 0; tIndex < numTopics; tIndex++) { 416 | writer.write("Topic" + new Integer(tIndex) + ":"); 417 | 418 | Map wordCount = new TreeMap(); 419 | for (int wIndex = 0; wIndex < vocabularySize; wIndex++) { 420 | wordCount.put(wIndex, topicWordCount[tIndex][wIndex]); 421 | } 422 | wordCount = FuncUtils.sortByValueDescending(wordCount); 423 | 424 | Set mostLikelyWords = wordCount.keySet(); 425 | int count = 0; 426 | for (Integer index : mostLikelyWords) { 427 | if (count < topWords) { 428 | double pro = (topicWordCount[tIndex][index] + beta) 429 | / (sumTopicWordCount[tIndex] + betaSum); 430 | pro = Math.round(pro * 1000000.0) / 1000000.0; 431 | writer.write(" " + id2WordVocabulary.get(index) + "(" + pro 432 | + ")"); 433 | count += 1; 434 | } 435 | else { 436 | writer.write("\n\n"); 437 | break; 438 | } 439 | } 440 | } 441 | writer.close(); 442 | } 443 | 444 | public void writeTopicWordPros() 445 | throws IOException 446 | { 447 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 448 | + expName + ".phi")); 449 | for (int i = 0; i < numTopics; i++) { 450 | for (int j = 0; j < vocabularySize; j++) { 451 | double pro = (topicWordCount[i][j] + beta) 452 | / (sumTopicWordCount[i] + betaSum); 453 | writer.write(pro + " "); 454 | } 455 | writer.write("\n"); 456 | } 457 | writer.close(); 458 | } 459 | 460 | public void writeTopicWordCount() 461 | throws IOException 462 | { 463 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 464 | + expName + ".WTcount")); 465 | for (int i = 0; i < numTopics; i++) { 466 | for (int j = 0; j < vocabularySize; j++) { 467 | writer.write(topicWordCount[i][j] + " "); 468 | } 469 | writer.write("\n"); 470 | } 471 | writer.close(); 472 | 473 | } 474 | 475 | public void writeDocTopicPros() 476 | throws IOException 477 | { 478 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 479 | + expName + ".theta")); 480 | 481 | for (int i = 0; i < numDocuments; i++) { 482 | int docSize = corpus.get(i).size(); 483 | 
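// [Editorial note, not in the original source] Because a DMM document carries a single
// topic indicator, writeDocTopicPros(), which continues below, cannot read theta off a
// count matrix the way the LDA classes do. It instead re-evaluates each topic's
// unnormalized posterior for the document,
//
//   theta[d][t] ∝ (docTopicCount[t] + alpha)
//              * PROD_i (topicWordCount[t][w_i] + beta) / (sumTopicWordCount[t] + betaSum)
//
// and divides by the running sum before writing each row.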
double sum = 0.0; 484 | for (int tIndex = 0; tIndex < numTopics; tIndex++) { 485 | multiPros[tIndex] = (docTopicCount[tIndex] + alpha); 486 | for (int wIndex = 0; wIndex < docSize; wIndex++) { 487 | int word = corpus.get(i).get(wIndex); 488 | multiPros[tIndex] *= (topicWordCount[tIndex][word] + beta) 489 | / (sumTopicWordCount[tIndex] + betaSum); 490 | } 491 | sum += multiPros[tIndex]; 492 | } 493 | for (int tIndex = 0; tIndex < numTopics; tIndex++) { 494 | writer.write((multiPros[tIndex] / sum) + " "); 495 | } 496 | writer.write("\n"); 497 | } 498 | writer.close(); 499 | } 500 | 501 | public void write() 502 | throws IOException 503 | { 504 | writeTopTopicalWords(); 505 | writeDocTopicPros(); 506 | writeTopicAssignments(); 507 | writeTopicWordPros(); 508 | } 509 | 510 | public static void main(String args[]) 511 | throws Exception 512 | { 513 | GibbsSamplingDMM_Inf dmm = new GibbsSamplingDMM_Inf( 514 | "test/testDMM.paras", "test/unseenTest.txt", 100, 20, "testDMMinf", 515 | 0); 516 | dmm.inference(); 517 | } 518 | } 519 | -------------------------------------------------------------------------------- /src/models/GibbsSamplingDMM.java: -------------------------------------------------------------------------------- 1 | package models; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.BufferedWriter; 5 | import java.io.FileReader; 6 | import java.io.FileWriter; 7 | import java.io.IOException; 8 | import java.util.ArrayList; 9 | import java.util.HashMap; 10 | import java.util.List; 11 | import java.util.Map; 12 | import java.util.Set; 13 | import java.util.TreeMap; 14 | 15 | import utility.FuncUtils; 16 | 17 | /** 18 | * jLDADMM: A Java package for the LDA and DMM topic models 19 | * 20 | * Implementation of the one-topic-per-document Dirichlet Multinomial Mixture 21 | * model, using collapsed Gibbs sampling, as described in: 22 | * 23 | * Jianhua Yin and Jianyong Wang. 2014. A Dirichlet Multinomial Mixture 24 | * Model-based Approach for Short Text Clustering. In Proceedings of the 20th 25 | * ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, 26 | * pages 233–242. 
27 | * 28 | * @author: Dat Quoc Nguyen 29 | */ 30 | 31 | public class GibbsSamplingDMM 32 | { 33 | public double alpha; // Hyper-parameter alpha 34 | public double beta; // Hyper-parameter alpha 35 | public int numTopics; // Number of topics 36 | public int numIterations; // Number of Gibbs sampling iterations 37 | public int topWords; // Number of most probable words for each topic 38 | 39 | public double alphaSum; // alpha * numTopics 40 | public double betaSum; // beta * vocabularySize 41 | 42 | public List> corpus; // Word ID-based corpus 43 | public List topicAssignments; // Topics assignments for documents 44 | public int numDocuments; // Number of documents in the corpus 45 | public int numWordsInCorpus; // Number of words in the corpus 46 | 47 | public HashMap word2IdVocabulary; // Vocabulary to get ID 48 | // given a word 49 | public HashMap id2WordVocabulary; // Vocabulary to get word 50 | // given an ID 51 | public int vocabularySize; // The number of word types in the corpus 52 | 53 | // Number of documents assigned to a topic 54 | public int[] docTopicCount; 55 | // numTopics * vocabularySize matrix 56 | // Given a topic: number of times a word type assigned to the topic 57 | public int[][] topicWordCount; 58 | // Total number of words assigned to a topic 59 | public int[] sumTopicWordCount; 60 | 61 | // Double array used to sample a topic 62 | public double[] multiPros; 63 | 64 | // Path to the directory containing the corpus 65 | public String folderPath; 66 | // Path to the topic modeling corpus 67 | public String corpusPath; 68 | 69 | // Given a document, number of times its i^{th} word appearing from 70 | // the first index to the i^{th}-index in the document 71 | // Example: given a document of "a a b a b c d c". We have: 1 2 1 3 2 1 1 2 72 | public List> occurenceToIndexCount; 73 | 74 | public String expName = "DMMmodel"; 75 | public String orgExpName = "DMMmodel"; 76 | public String tAssignsFilePath = ""; 77 | public int savestep = 0; 78 | 79 | public GibbsSamplingDMM(String pathToCorpus, int inNumTopics, 80 | double inAlpha, double inBeta, int inNumIterations, int inTopWords) 81 | throws Exception 82 | { 83 | this(pathToCorpus, inNumTopics, inAlpha, inBeta, inNumIterations, 84 | inTopWords, "DMMmodel"); 85 | } 86 | 87 | public GibbsSamplingDMM(String pathToCorpus, int inNumTopics, 88 | double inAlpha, double inBeta, int inNumIterations, int inTopWords, 89 | String inExpName) 90 | throws Exception 91 | { 92 | this(pathToCorpus, inNumTopics, inAlpha, inBeta, inNumIterations, 93 | inTopWords, inExpName, "", 0); 94 | 95 | } 96 | 97 | public GibbsSamplingDMM(String pathToCorpus, int inNumTopics, 98 | double inAlpha, double inBeta, int inNumIterations, int inTopWords, 99 | String inExpName, String pathToTAfile) 100 | throws Exception 101 | { 102 | this(pathToCorpus, inNumTopics, inAlpha, inBeta, inNumIterations, 103 | inTopWords, inExpName, pathToTAfile, 0); 104 | 105 | } 106 | 107 | public GibbsSamplingDMM(String pathToCorpus, int inNumTopics, 108 | double inAlpha, double inBeta, int inNumIterations, int inTopWords, 109 | String inExpName, int inSaveStep) 110 | throws Exception 111 | { 112 | this(pathToCorpus, inNumTopics, inAlpha, inBeta, inNumIterations, 113 | inTopWords, inExpName, "", inSaveStep); 114 | 115 | } 116 | 117 | public GibbsSamplingDMM(String pathToCorpus, int inNumTopics, 118 | double inAlpha, double inBeta, int inNumIterations, int inTopWords, 119 | String inExpName, String pathToTAfile, int inSaveStep) 120 | throws IOException 121 | { 122 | alpha = 
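// [Editorial note, not in the original source] As in GibbsSamplingLDA, the overloads
// above delegate to the full constructor with defaults filled in. In practice the model
// is usually launched through the bundled jar; a sketch, assuming the jar's entry point
// dispatches on -model (the flag names match those written by writeParameters(), and
// the hyper-parameter values mirror the defaults used elsewhere in this package):
//
//   $ java -jar jar/jLDADMM.jar -model DMM -corpus test/corpus.txt -ntopics 20 \
//         -alpha 0.1 -beta 0.01 -niters 2000 -twords 20 -name testDMM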
inAlpha; 123 | beta = inBeta; 124 | numTopics = inNumTopics; 125 | numIterations = inNumIterations; 126 | topWords = inTopWords; 127 | savestep = inSaveStep; 128 | expName = inExpName; 129 | orgExpName = expName; 130 | corpusPath = pathToCorpus; 131 | folderPath = pathToCorpus.substring( 132 | 0, 133 | Math.max(pathToCorpus.lastIndexOf("/"), 134 | pathToCorpus.lastIndexOf("\\")) + 1); 135 | 136 | System.out.println("Reading topic modeling corpus: " + pathToCorpus); 137 | 138 | word2IdVocabulary = new HashMap(); 139 | id2WordVocabulary = new HashMap(); 140 | corpus = new ArrayList>(); 141 | occurenceToIndexCount = new ArrayList>(); 142 | numDocuments = 0; 143 | numWordsInCorpus = 0; 144 | 145 | BufferedReader br = null; 146 | try { 147 | int indexWord = -1; 148 | br = new BufferedReader(new FileReader(pathToCorpus)); 149 | for (String doc; (doc = br.readLine()) != null;) { 150 | if (doc.trim().length() == 0) 151 | continue; 152 | 153 | String[] words = doc.trim().split("\\s+"); 154 | List document = new ArrayList(); 155 | 156 | List wordOccurenceToIndexInDoc = new ArrayList(); 157 | HashMap wordOccurenceToIndexInDocCount = new HashMap(); 158 | 159 | for (String word : words) { 160 | if (word2IdVocabulary.containsKey(word)) { 161 | document.add(word2IdVocabulary.get(word)); 162 | } 163 | else { 164 | indexWord += 1; 165 | word2IdVocabulary.put(word, indexWord); 166 | id2WordVocabulary.put(indexWord, word); 167 | document.add(indexWord); 168 | } 169 | 170 | int times = 0; 171 | if (wordOccurenceToIndexInDocCount.containsKey(word)) { 172 | times = wordOccurenceToIndexInDocCount.get(word); 173 | } 174 | times += 1; 175 | wordOccurenceToIndexInDocCount.put(word, times); 176 | wordOccurenceToIndexInDoc.add(times); 177 | } 178 | numDocuments++; 179 | numWordsInCorpus += document.size(); 180 | corpus.add(document); 181 | occurenceToIndexCount.add(wordOccurenceToIndexInDoc); 182 | } 183 | } 184 | catch (Exception e) { 185 | e.printStackTrace(); 186 | } 187 | 188 | vocabularySize = word2IdVocabulary.size(); 189 | docTopicCount = new int[numTopics]; 190 | topicWordCount = new int[numTopics][vocabularySize]; 191 | sumTopicWordCount = new int[numTopics]; 192 | 193 | multiPros = new double[numTopics]; 194 | for (int i = 0; i < numTopics; i++) { 195 | multiPros[i] = 1.0 / numTopics; 196 | } 197 | 198 | alphaSum = numTopics * alpha; 199 | betaSum = vocabularySize * beta; 200 | 201 | System.out.println("Corpus size: " + numDocuments + " docs, " 202 | + numWordsInCorpus + " words"); 203 | System.out.println("Vocabuary size: " + vocabularySize); 204 | System.out.println("Number of topics: " + numTopics); 205 | System.out.println("alpha: " + alpha); 206 | System.out.println("beta: " + beta); 207 | System.out.println("Number of sampling iterations: " + numIterations); 208 | System.out.println("Number of top topical words: " + topWords); 209 | 210 | tAssignsFilePath = pathToTAfile; 211 | if (tAssignsFilePath.length() > 0) 212 | initialize(tAssignsFilePath); 213 | else 214 | initialize(); 215 | } 216 | 217 | /** 218 | * Randomly initialize topic assignments 219 | */ 220 | public void initialize() 221 | throws IOException 222 | { 223 | System.out.println("Randomly initialzing topic assignments ..."); 224 | topicAssignments = new ArrayList(); 225 | for (int i = 0; i < numDocuments; i++) { 226 | int topic = FuncUtils.nextDiscrete(multiPros); // Sample a topic 227 | docTopicCount[topic] += 1; 228 | int docSize = corpus.get(i).size(); 229 | for (int j = 0; j < docSize; j++) { 230 | 
topicWordCount[topic][corpus.get(i).get(j)] += 1; 231 | sumTopicWordCount[topic] += 1; 232 | } 233 | topicAssignments.add(topic); 234 | } 235 | } 236 | 237 | /** 238 | * Initialize topic assignments from a given file 239 | */ 240 | public void initialize(String pathToTopicAssignmentFile) 241 | { 242 | System.out.println("Reading topic-assignment file: " 243 | + pathToTopicAssignmentFile); 244 | 245 | topicAssignments = new ArrayList(); 246 | 247 | BufferedReader br = null; 248 | try { 249 | br = new BufferedReader(new FileReader(pathToTopicAssignmentFile)); 250 | int docID = 0; 251 | int numWords = 0; 252 | for (String line; (line = br.readLine()) != null;) { 253 | String[] strTopics = line.trim().split("\\s+"); 254 | int topic = new Integer(strTopics[0]) % numTopics; 255 | docTopicCount[topic] += 1; 256 | for (int j = 0; j < strTopics.length; j++) { 257 | // Increase counts 258 | topicWordCount[topic][corpus.get(docID).get(j)] += 1; 259 | sumTopicWordCount[topic] += 1; 260 | 261 | numWords++; 262 | } 263 | topicAssignments.add(topic); 264 | docID++; 265 | } 266 | 267 | if ((docID != numDocuments) || (numWords != numWordsInCorpus)) { 268 | System.out 269 | .println("The topic modeling corpus and topic assignment file are not consistent!!!"); 270 | throw new Exception(); 271 | } 272 | } 273 | catch (Exception e) { 274 | e.printStackTrace(); 275 | } 276 | } 277 | 278 | public void inference() 279 | throws IOException 280 | { 281 | writeParameters(); 282 | writeDictionary(); 283 | 284 | System.out.println("Running Gibbs sampling inference: "); 285 | 286 | for (int iter = 1; iter <= numIterations; iter++) { 287 | 288 | System.out.println("\tSampling iteration: " + (iter)); 289 | // System.out.println("\t\tPerplexity: " + computePerplexity()); 290 | 291 | sampleInSingleIteration(); 292 | 293 | if ((savestep > 0) && (iter % savestep == 0) 294 | && (iter < numIterations)) { 295 | System.out.println("\t\tSaving the output from the " + iter 296 | + "^{th} sample"); 297 | expName = orgExpName + "-" + iter; 298 | write(); 299 | } 300 | } 301 | expName = orgExpName; 302 | 303 | System.out.println("Writing output from the last sample ..."); 304 | write(); 305 | 306 | System.out.println("Sampling completed!"); 307 | 308 | } 309 | 310 | public void sampleInSingleIteration() 311 | { 312 | for (int dIndex = 0; dIndex < numDocuments; dIndex++) { 313 | int topic = topicAssignments.get(dIndex); 314 | List document = corpus.get(dIndex); 315 | int docSize = document.size(); 316 | 317 | // Decrease counts 318 | docTopicCount[topic] -= 1; 319 | for (int wIndex = 0; wIndex < docSize; wIndex++) { 320 | int word = document.get(wIndex); 321 | topicWordCount[topic][word] -= 1; 322 | sumTopicWordCount[topic] -= 1; 323 | } 324 | 325 | // Sample a topic 326 | for (int tIndex = 0; tIndex < numTopics; tIndex++) { 327 | multiPros[tIndex] = (docTopicCount[tIndex] + alpha); 328 | for (int wIndex = 0; wIndex < docSize; wIndex++) { 329 | int word = document.get(wIndex); 330 | multiPros[tIndex] *= (topicWordCount[tIndex][word] + beta 331 | + occurenceToIndexCount.get(dIndex).get(wIndex) - 1) 332 | / (sumTopicWordCount[tIndex] + betaSum + wIndex); 333 | } 334 | } 335 | topic = FuncUtils.nextDiscrete(multiPros); 336 | 337 | // Increase counts 338 | docTopicCount[topic] += 1; 339 | for (int wIndex = 0; wIndex < docSize; wIndex++) { 340 | int word = document.get(wIndex); 341 | topicWordCount[topic][word] += 1; 342 | sumTopicWordCount[topic] += 1; 343 | } 344 | // Update topic assignments 345 | topicAssignments.set(dIndex, topic); 346
346 |         }
347 |     }
348 |
349 |     public void writeParameters()
350 |         throws IOException
351 |     {
352 |         BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath
353 |             + expName + ".paras"));
354 |         writer.write("-model" + "\t" + "DMM");
355 |         writer.write("\n-corpus" + "\t" + corpusPath);
356 |         writer.write("\n-ntopics" + "\t" + numTopics);
357 |         writer.write("\n-alpha" + "\t" + alpha);
358 |         writer.write("\n-beta" + "\t" + beta);
359 |         writer.write("\n-niters" + "\t" + numIterations);
360 |         writer.write("\n-twords" + "\t" + topWords);
361 |         writer.write("\n-name" + "\t" + expName);
362 |         if (tAssignsFilePath.length() > 0)
363 |             writer.write("\n-initFile" + "\t" + tAssignsFilePath);
364 |         if (savestep > 0)
365 |             writer.write("\n-sstep" + "\t" + savestep);
366 |
367 |         writer.close();
368 |     }
369 |
370 |     public void writeDictionary()
371 |         throws IOException
372 |     {
373 |         BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath
374 |             + expName + ".vocabulary"));
375 |         for (int id = 0; id < vocabularySize; id++)
376 |             writer.write(id2WordVocabulary.get(id) + " " + id + "\n");
377 |         writer.close();
378 |     }
379 |
380 |     public void writeIDbasedCorpus()
381 |         throws IOException
382 |     {
383 |         BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath
384 |             + expName + ".IDcorpus"));
385 |         for (int dIndex = 0; dIndex < numDocuments; dIndex++) {
386 |             int docSize = corpus.get(dIndex).size();
387 |             for (int wIndex = 0; wIndex < docSize; wIndex++) {
388 |                 writer.write(corpus.get(dIndex).get(wIndex) + " ");
389 |             }
390 |             writer.write("\n");
391 |         }
392 |         writer.close();
393 |     }
394 |
395 |     public void writeTopicAssignments()
396 |         throws IOException
397 |     {
398 |         BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath
399 |             + expName + ".topicAssignments"));
400 |         for (int dIndex = 0; dIndex < numDocuments; dIndex++) {
401 |             int docSize = corpus.get(dIndex).size();
402 |             int topic = topicAssignments.get(dIndex);
403 |             for (int wIndex = 0; wIndex < docSize; wIndex++) {
404 |                 writer.write(topic + " ");
405 |             }
406 |             writer.write("\n");
407 |         }
408 |         writer.close();
409 |     }
410 |
411 |     public void writeTopTopicalWords()
412 |         throws IOException
413 |     {
414 |         BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath
415 |             + expName + ".topWords"));
416 |
417 |         for (int tIndex = 0; tIndex < numTopics; tIndex++) {
418 |             writer.write("Topic" + tIndex + ":");
419 |
420 |             Map<Integer, Integer> wordCount = new TreeMap<Integer, Integer>();
421 |             for (int wIndex = 0; wIndex < vocabularySize; wIndex++) {
422 |                 wordCount.put(wIndex, topicWordCount[tIndex][wIndex]);
423 |             }
424 |             wordCount = FuncUtils.sortByValueDescending(wordCount);
425 |
426 |             Set<Integer> mostLikelyWords = wordCount.keySet();
427 |             int count = 0;
428 |             for (Integer index : mostLikelyWords) {
429 |                 if (count < topWords) {
430 |                     double pro = (topicWordCount[tIndex][index] + beta)
431 |                         / (sumTopicWordCount[tIndex] + betaSum);
432 |                     pro = Math.round(pro * 1000000.0) / 1000000.0;
433 |                     writer.write(" " + id2WordVocabulary.get(index) + "(" + pro
434 |                         + ")");
435 |                     count += 1;
436 |                 }
437 |                 else {
438 |                     writer.write("\n\n");
439 |                     break;
440 |                 }
441 |             }
442 |         }
443 |         writer.close();
444 |     }
445 |
446 |     public void writeTopicWordPros()
447 |         throws IOException
448 |     {
449 |         BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath
450 |             + expName + ".phi"));
451 |         for (int i = 0; i < numTopics; i++) {
452 |             for (int j = 0; j < vocabularySize; j++) {
453 |                 double pro = (topicWordCount[i][j] + beta)
454 |                     / (sumTopicWordCount[i] + betaSum);
455 |                 writer.write(pro + " ");
456 |             }
457 |             writer.write("\n");
458 |         }
459 |         writer.close();
460 |     }
461 |
462 |     public void writeTopicWordCount()
463 |         throws IOException
464 |     {
465 |         BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath
466 |             + expName + ".WTcount"));
467 |         for (int i = 0; i < numTopics; i++) {
468 |             for (int j = 0; j < vocabularySize; j++) {
469 |                 writer.write(topicWordCount[i][j] + " ");
470 |             }
471 |             writer.write("\n");
472 |         }
473 |         writer.close();
474 |
475 |     }
476 |
477 |     public void writeDocTopicPros()
478 |         throws IOException
479 |     {
480 |         BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath
481 |             + expName + ".theta"));
482 |
483 |         for (int i = 0; i < numDocuments; i++) {
484 |             int docSize = corpus.get(i).size();
485 |             double sum = 0.0;
486 |             for (int tIndex = 0; tIndex < numTopics; tIndex++) {
487 |                 multiPros[tIndex] = (docTopicCount[tIndex] + alpha);
488 |                 for (int wIndex = 0; wIndex < docSize; wIndex++) {
489 |                     int word = corpus.get(i).get(wIndex);
490 |                     multiPros[tIndex] *= (topicWordCount[tIndex][word] + beta)
491 |                         / (sumTopicWordCount[tIndex] + betaSum);
492 |                 }
493 |                 sum += multiPros[tIndex];
494 |             }
495 |             for (int tIndex = 0; tIndex < numTopics; tIndex++) {
496 |                 writer.write((multiPros[tIndex] / sum) + " ");
497 |             }
498 |             writer.write("\n");
499 |         }
500 |         writer.close();
501 |     }
502 |
503 |     public void write()
504 |         throws IOException
505 |     {
506 |         writeTopTopicalWords();
507 |         writeDocTopicPros();
508 |         writeTopicAssignments();
509 |         writeTopicWordPros();
510 |     }
511 |
512 |     public static void main(String[] args)
513 |         throws Exception
514 |     {
515 |         GibbsSamplingDMM dmm = new GibbsSamplingDMM("test/corpus.txt", 7, 0.1,
516 |             0.1, 2000, 20, "testDMM");
517 |         dmm.inference();
518 |     }
519 | }
520 |
--------------------------------------------------------------------------------
/src/utility/MersenneTwister.java:
--------------------------------------------------------------------------------
1 | package utility;
2 |
3 | import java.io.DataInputStream;
4 | import java.io.DataOutputStream;
5 | import java.io.IOException;
6 | import java.io.ObjectInputStream;
7 | import java.io.ObjectOutputStream;
8 | import java.io.Serializable;
9 |
10 | /**
11 | * MersenneTwister and MersenneTwisterFast
12 | *

13 | * Version 20, based on version MT199937(99/10/29) of the Mersenne Twister algorithm found at 14 | * The Mersenne Twister Home Page, with 15 | * the initialization improved using the new 2002/1/26 initialization algorithm By Sean Luke, 16 | * October 2004. 17 | * 18 | *

19 | * MersenneTwister is a drop-in subclass replacement for java.util.Random. It is properly 20 | * synchronized and can be used in a multithreaded environment. On modern VMs such as HotSpot, it is 21 | * approximately 1/3 slower than java.util.Random. 22 | * 23 | *

24 | * MersenneTwisterFast is not a subclass of java.util.Random. It has the same public methods 25 | * as Random does, however, and it is algorithmically identical to MersenneTwister. 26 | * MersenneTwisterFast has hard-code inlined all of its methods directly, and made all of them final 27 | * (well, the ones of consequence anyway). Further, these methods are not synchronized, so 28 | * the same MersenneTwisterFast instance cannot be shared by multiple threads. But all this helps 29 | * MersenneTwisterFast achieve well over twice the speed of MersenneTwister. java.util.Random is 30 | * about 1/3 slower than MersenneTwisterFast. 31 | * 32 | *
 * About the Mersenne Twister
33 | *

34 | * This is a Java version of the C-program for MT19937: Integer version. The MT19937 algorithm was 35 | * created by Makoto Matsumoto and Takuji Nishimura, who ask: "When you use this, send an email to: 36 | * matumoto@math.keio.ac.jp with an appropriate reference to your work". Indicate that this is a 37 | * translation of their algorithm into Java. 38 | * 39 | *

40 | * Reference. Makoto Matsumoto and Takuji Nishimura, "Mersenne Twister: A 623-Dimensionally 41 | * Equidistributed Uniform Pseudo-Random Number Generator", ACM Transactions on Modeling and 42 | * Computer Simulation, Vol. 8, No. 1, January 1998, pp 3--30. 43 | * 44 | *
 * About this Version
45 | * 46 | *

47 | * Changes since V19: nextFloat(boolean, boolean) now returns float, not double. 48 | * 49 | *

50 | * Changes since V18: Removed old final declarations, which used to potentially speed up the 51 | * code, but no longer. 52 | * 53 | *

54 | * Changes since V17: Removed vestigial references to &= 0xffffffff which stemmed from the 55 | * original C code. The C code could not guarantee that ints were 32 bit, hence the masks. The 56 | * vestigial references in the Java code were likely optimized out anyway. 57 | * 58 | *

59 | * Changes since V16: Added nextDouble(includeZero, includeOne) and nextFloat(includeZero, 60 | * includeOne) to allow for half-open, fully-closed, and fully-open intervals. 61 | * 62 | *

63 | * Changes Since V15: Added serialVersionUID to quiet compiler warnings from Sun's overly 64 | * verbose compilers as of JDK 1.5. 65 | * 66 | *

67 | * Changes Since V14: made strictfp, with StrictMath.log and StrictMath.sqrt in nextGaussian 68 | * instead of Math.log and Math.sqrt. This is largely just to be safe, as it presently makes no 69 | * difference in the speed, correctness, or results of the algorithm. 70 | * 71 | *

72 | * Changes Since V13: clone() method CloneNotSupportedException removed. 73 | * 74 | *

75 | * Changes Since V12: clone() method added. 76 | * 77 | *

78 | * Changes Since V11: stateEquals(...) method added. MersenneTwisterFast is equal to other 79 | * MersenneTwisterFasts with identical state; likewise MersenneTwister is equal to other 80 | * MersenneTwister with identical state. This isn't equals(...) because that requires a contract of 81 | * immutability to compare by value. 82 | * 83 | *

84 | * Changes Since V10: A documentation error suggested that setSeed(int[]) required an int[] 85 | * array 624 long. In fact, the array can be any non-zero length. The new version also checks for 86 | * this fact. 87 | * 88 | *

89 | * Changes Since V9: readState(stream) and writeState(stream) provided. 90 | * 91 | *

92 | * Changes Since V8: setSeed(int) was only using the first 28 bits of the seed; it should 93 | * have been 32 bits. For small-number seeds the behavior is identical. 94 | * 95 | *

96 | * Changes Since V7: A documentation error in MersenneTwisterFast (but not MersenneTwister) 97 | * stated that nextDouble selects uniformly from the full-open interval [0,1]. It does not. 98 | * nextDouble's contract is identical across MersenneTwisterFast, MersenneTwister, and 99 | * java.util.Random, namely, selection in the half-open interval [0,1). That is, 1.0 should not be 100 | * returned. A similar contract exists in nextFloat. 101 | * 102 | *

103 | * Changes Since V6: License has changed from LGPL to BSD. New timing information to compare 104 | * against java.util.Random. Recent versions of HotSpot have helped Random increase in speed to the 105 | * point where it is faster than MersenneTwister but slower than MersenneTwisterFast (which should 106 | * be the case, as it's a less complex algorithm but is synchronized). 107 | * 108 | *

109 | * Changes Since V5: New empty constructor made to work the same as java.util.Random -- 110 | * namely, it seeds based on the current time in milliseconds. 111 | * 112 | *

113 | * Changes Since V4: New initialization algorithms. See 115 | * http://www.math.keio.ac.jp/matumoto/MT2002/emt19937ar.html 116 | * 117 | *

118 | * The MersenneTwister code is based on standard MT19937 C/C++ code by Takuji Nishimura, with 119 | * suggestions from Topher Cooper and Marc Rieffel, July 1997. The code was originally translated 120 | * into Java by Michael Lecuyer, January 1999, and the original code is Copyright (c) 1999 by 121 | * Michael Lecuyer. 122 | * 123 | *
 * Java notes
124 | * 125 | *

126 | * This implementation implements the bug fixes made in Java 1.2's version of Random, which means it 127 | * can be used with earlier versions of Java. See the JDK 1.2 129 | * java.util.Random documentation for further documentation on the random-number generation 130 | * contracts made. Additionally, there's an undocumented bug in the JDK java.util.Random.nextBytes() 131 | * method, which this code fixes. 132 | * 133 | *

134 | * Just like java.util.Random, this generator accepts a long seed but doesn't use all of it. 135 | * java.util.Random uses 48 bits. The Mersenne Twister instead uses 32 bits (int size). So it's best 136 | * if your seed does not exceed the int range. 137 | * 138 | *

139 | * MersenneTwister can be used reliably on JDK version 1.1.5 or above. Earlier Java versions have 140 | * serious bugs in java.util.Random; only MersenneTwisterFast (and not MersenneTwister nor 141 | * java.util.Random) should be used with them. 142 | * 143 | *
 * License
144 | * 145 | * Copyright (c) 2003 by Sean Luke.
146 | * Portions copyright (c) 1993 by Michael Lecuyer.
147 | * All rights reserved.
148 | * 149 | *

150 | * Redistribution and use in source and binary forms, with or without modification, are permitted 151 | * provided that the following conditions are met: 152 | *

    153 | *
  • Redistributions of source code must retain the above copyright notice, this list of 154 | * conditions and the following disclaimer. 155 | *
  • Redistributions in binary form must reproduce the above copyright notice, this list of 156 | * conditions and the following disclaimer in the documentation and/or other materials provided with 157 | * the distribution. 158 | *
  • Neither the name of the copyright owners, their employers, nor the names of its contributors 159 | * may be used to endorse or promote products derived from this software without specific prior 160 | * written permission. 161 | *
162 | *

163 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 164 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 165 | * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNERS OR 166 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 167 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 168 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 169 | * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY 170 | * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 171 | * 172 | * @version 20 173 | */ 174 | 175 | public strictfp class MersenneTwister 176 | extends java.util.Random 177 | implements Serializable, Cloneable 178 | { 179 | // Serialization 180 | private static final long serialVersionUID = -4035832775130174188L; // locked as of Version 15 181 | 182 | // Period parameters 183 | private static final int N = 624; 184 | private static final int M = 397; 185 | private static final int MATRIX_A = 0x9908b0df; // private static final * constant vector a 186 | private static final int UPPER_MASK = 0x80000000; // most significant w-r bits 187 | private static final int LOWER_MASK = 0x7fffffff; // least significant r bits 188 | 189 | // Tempering parameters 190 | private static final int TEMPERING_MASK_B = 0x9d2c5680; 191 | private static final int TEMPERING_MASK_C = 0xefc60000; 192 | 193 | private int mt[]; // the array for the state vector 194 | private int mti; // mti==N+1 means mt[N] is not initialized 195 | private int mag01[]; 196 | 197 | // a good initial seed (of int size, though stored in a long) 198 | // private static final long GOOD_SEED = 4357; 199 | 200 | /* 201 | * implemented here because there's a bug in Random's implementation of the Gaussian code 202 | * (divide by zero, and log(0), ugh!), yet its gaussian variables are private so we can't access 203 | * them here. 
:-( 204 | */ 205 | 206 | private double __nextNextGaussian; 207 | private boolean __haveNextNextGaussian; 208 | 209 | /* We're overriding all internal data, to my knowledge, so this should be okay */ 210 | public Object clone() 211 | { 212 | try { 213 | MersenneTwister f = (MersenneTwister) (super.clone()); 214 | f.mt = (int[]) (mt.clone()); 215 | f.mag01 = (int[]) (mag01.clone()); 216 | return f; 217 | } 218 | catch (CloneNotSupportedException e) { 219 | throw new InternalError(); 220 | } // should never happen 221 | } 222 | 223 | public boolean stateEquals(Object o) 224 | { 225 | if (o == this) 226 | return true; 227 | if (o == null || !(o instanceof MersenneTwister)) 228 | return false; 229 | MersenneTwister other = (MersenneTwister) o; 230 | if (mti != other.mti) 231 | return false; 232 | for (int x = 0; x < mag01.length; x++) 233 | if (mag01[x] != other.mag01[x]) 234 | return false; 235 | for (int x = 0; x < mt.length; x++) 236 | if (mt[x] != other.mt[x]) 237 | return false; 238 | return true; 239 | } 240 | 241 | /** Reads the entire state of the MersenneTwister RNG from the stream */ 242 | public void readState(DataInputStream stream) 243 | throws IOException 244 | { 245 | int len = mt.length; 246 | for (int x = 0; x < len; x++) 247 | mt[x] = stream.readInt(); 248 | 249 | len = mag01.length; 250 | for (int x = 0; x < len; x++) 251 | mag01[x] = stream.readInt(); 252 | 253 | mti = stream.readInt(); 254 | __nextNextGaussian = stream.readDouble(); 255 | __haveNextNextGaussian = stream.readBoolean(); 256 | } 257 | 258 | /** Writes the entire state of the MersenneTwister RNG to the stream */ 259 | public void writeState(DataOutputStream stream) 260 | throws IOException 261 | { 262 | int len = mt.length; 263 | for (int x = 0; x < len; x++) 264 | stream.writeInt(mt[x]); 265 | 266 | len = mag01.length; 267 | for (int x = 0; x < len; x++) 268 | stream.writeInt(mag01[x]); 269 | 270 | stream.writeInt(mti); 271 | stream.writeDouble(__nextNextGaussian); 272 | stream.writeBoolean(__haveNextNextGaussian); 273 | } 274 | 275 | /** 276 | * Constructor using the default seed. 277 | */ 278 | public MersenneTwister() 279 | { 280 | this(System.currentTimeMillis()); 281 | } 282 | 283 | /** 284 | * Constructor using a given seed. Though you pass this seed in as a long, it's best to make 285 | * sure it's actually an integer. 286 | */ 287 | public MersenneTwister(long seed) 288 | { 289 | super(seed); /* just in case */ 290 | setSeed(seed); 291 | } 292 | 293 | /** 294 | * Constructor using an array of integers as seed. Your array must have a non-zero length. Only 295 | * the first 624 integers in the array are used; if the array is shorter than this then integers 296 | * are repeatedly used in a wrap-around fashion. 297 | */ 298 | public MersenneTwister(int[] array) 299 | { 300 | super(System.currentTimeMillis()); /* pick something at random just in case */ 301 | setSeed(array); 302 | } 303 | 304 | /** 305 | * Initalize the pseudo random number generator. Don't pass in a long that's bigger than an int 306 | * (Mersenne Twister only uses the first 32 bits for its seed). 307 | */ 308 | 309 | synchronized public void setSeed(long seed) 310 | { 311 | // it's always good style to call super 312 | super.setSeed(seed); 313 | 314 | // Due to a bug in java.util.Random clear up to 1.2, we're 315 | // doing our own Gaussian variable. 
316 | __haveNextNextGaussian = false; 317 | 318 | mt = new int[N]; 319 | 320 | mag01 = new int[2]; 321 | mag01[0] = 0x0; 322 | mag01[1] = MATRIX_A; 323 | 324 | mt[0] = (int) (seed & 0xffffffff); 325 | mt[0] = (int) seed; 326 | for (mti = 1; mti < N; mti++) { 327 | mt[mti] = (1812433253 * (mt[mti - 1] ^ (mt[mti - 1] >>> 30)) + mti); 328 | /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */ 329 | /* In the previous versions, MSBs of the seed affect */ 330 | /* only MSBs of the array mt[]. */ 331 | /* 2002/01/09 modified by Makoto Matsumoto */ 332 | // mt[mti] &= 0xffffffff; 333 | /* for >32 bit machines */ 334 | } 335 | } 336 | 337 | /** 338 | * Sets the seed of the MersenneTwister using an array of integers. Your array must have a 339 | * non-zero length. Only the first 624 integers in the array are used; if the array is shorter 340 | * than this then integers are repeatedly used in a wrap-around fashion. 341 | */ 342 | 343 | synchronized public void setSeed(int[] array) 344 | { 345 | if (array.length == 0) 346 | throw new IllegalArgumentException("Array length must be greater than zero"); 347 | int i, j, k; 348 | setSeed(19650218); 349 | i = 1; 350 | j = 0; 351 | k = (N > array.length ? N : array.length); 352 | for (; k != 0; k--) { 353 | mt[i] = (mt[i] ^ ((mt[i - 1] ^ (mt[i - 1] >>> 30)) * 1664525)) + array[j] + j; /* 354 | * non 355 | * linear 356 | */ 357 | // mt[i] &= 0xffffffff; /* for WORDSIZE > 32 machines */ 358 | i++; 359 | j++; 360 | if (i >= N) { 361 | mt[0] = mt[N - 1]; 362 | i = 1; 363 | } 364 | if (j >= array.length) 365 | j = 0; 366 | } 367 | for (k = N - 1; k != 0; k--) { 368 | mt[i] = (mt[i] ^ ((mt[i - 1] ^ (mt[i - 1] >>> 30)) * 1566083941)) - i; /* non linear */ 369 | // mt[i] &= 0xffffffff; /* for WORDSIZE > 32 machines */ 370 | i++; 371 | if (i >= N) { 372 | mt[0] = mt[N - 1]; 373 | i = 1; 374 | } 375 | } 376 | mt[0] = 0x80000000; /* MSB is 1; assuring non-zero initial array */ 377 | } 378 | 379 | /** 380 | * Returns an integer with bits bits filled with a random number. 381 | */ 382 | synchronized protected int next(int bits) 383 | { 384 | int y; 385 | 386 | if (mti >= N) // generate N words at one time 387 | { 388 | int kk; 389 | final int[] mt = this.mt; // locals are slightly faster 390 | final int[] mag01 = this.mag01; // locals are slightly faster 391 | 392 | for (kk = 0; kk < N - M; kk++) { 393 | y = (mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK); 394 | mt[kk] = mt[kk + M] ^ (y >>> 1) ^ mag01[y & 0x1]; 395 | } 396 | for (; kk < N - 1; kk++) { 397 | y = (mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK); 398 | mt[kk] = mt[kk + (M - N)] ^ (y >>> 1) ^ mag01[y & 0x1]; 399 | } 400 | y = (mt[N - 1] & UPPER_MASK) | (mt[0] & LOWER_MASK); 401 | mt[N - 1] = mt[M - 1] ^ (y >>> 1) ^ mag01[y & 0x1]; 402 | 403 | mti = 0; 404 | } 405 | 406 | y = mt[mti++]; 407 | y ^= y >>> 11; // TEMPERING_SHIFT_U(y) 408 | y ^= (y << 7) & TEMPERING_MASK_B; // TEMPERING_SHIFT_S(y) 409 | y ^= (y << 15) & TEMPERING_MASK_C; // TEMPERING_SHIFT_T(y) 410 | y ^= (y >>> 18); // TEMPERING_SHIFT_L(y) 411 | 412 | return y >>> (32 - bits); // hope that's right! 413 | } 414 | 415 | /* 416 | * If you've got a truly old version of Java, you can omit these two next methods. 417 | */ 418 | 419 | private synchronized void writeObject(ObjectOutputStream out) 420 | throws IOException 421 | { 422 | // just so we're synchronized. 
423 | out.defaultWriteObject(); 424 | } 425 | 426 | private synchronized void readObject(ObjectInputStream in) 427 | throws IOException, ClassNotFoundException 428 | { 429 | // just so we're synchronized. 430 | in.defaultReadObject(); 431 | } 432 | 433 | /** 434 | * This method is missing from jdk 1.0.x and below. JDK 1.1 includes this for us, but what the 435 | * heck. 436 | */ 437 | public boolean nextBoolean() 438 | { 439 | return next(1) != 0; 440 | } 441 | 442 | /** 443 | * This generates a coin flip with a probability probability of returning true, else 444 | * returning false. probability must be between 0.0 and 1.0, inclusive. Not as precise 445 | * a random real event as nextBoolean(double), but twice as fast. To explicitly use this, 446 | * remember you may need to cast to float first. 447 | */ 448 | 449 | public boolean nextBoolean(float probability) 450 | { 451 | if (probability < 0.0f || probability > 1.0f) 452 | throw new IllegalArgumentException("probability must be between 0.0 and 1.0 inclusive."); 453 | if (probability == 0.0f) 454 | return false; // fix half-open issues 455 | else if (probability == 1.0f) 456 | return true; // fix half-open issues 457 | return nextFloat() < probability; 458 | } 459 | 460 | /** 461 | * This generates a coin flip with a probability probability of returning true, else 462 | * returning false. probability must be between 0.0 and 1.0, inclusive. 463 | */ 464 | 465 | public boolean nextBoolean(double probability) 466 | { 467 | if (probability < 0.0 || probability > 1.0) 468 | throw new IllegalArgumentException("probability must be between 0.0 and 1.0 inclusive."); 469 | if (probability == 0.0) 470 | return false; // fix half-open issues 471 | else if (probability == 1.0) 472 | return true; // fix half-open issues 473 | return nextDouble() < probability; 474 | } 475 | 476 | /** 477 | * This method is missing from JDK 1.1 and below. JDK 1.2 includes this for us, but what the 478 | * heck. 479 | */ 480 | 481 | public int nextInt(int n) 482 | { 483 | if (n <= 0) 484 | throw new IllegalArgumentException("n must be positive, got: " + n); 485 | 486 | if ((n & -n) == n) 487 | return (int) ((n * (long) next(31)) >> 31); 488 | 489 | int bits, val; 490 | do { 491 | bits = next(31); 492 | val = bits % n; 493 | } 494 | while (bits - val + (n - 1) < 0); 495 | return val; 496 | } 497 | 498 | /** 499 | * This method is for completness' sake. Returns a long drawn uniformly from 0 to n-1. Suffice 500 | * it to say, n must be > 0, or an IllegalArgumentException is raised. 501 | */ 502 | 503 | public long nextLong(long n) 504 | { 505 | if (n <= 0) 506 | throw new IllegalArgumentException("n must be positive, got: " + n); 507 | 508 | long bits, val; 509 | do { 510 | bits = (nextLong() >>> 1); 511 | val = bits % n; 512 | } 513 | while (bits - val + (n - 1) < 0); 514 | return val; 515 | } 516 | 517 | /** 518 | * A bug fix for versions of JDK 1.1 and below. JDK 1.2 fixes this for us, but what the heck. 519 | */ 520 | public double nextDouble() 521 | { 522 | return (((long) next(26) << 27) + next(27)) / (double) (1L << 53); 523 | } 524 | 525 | /** 526 | * Returns a double in the range from 0.0 to 1.0, possibly inclusive of 0.0 and 1.0 themselves. 527 | * Thus: 528 | * 529 | *
 * Expression                  Interval
 * nextDouble(false, false)    (0.0, 1.0)
 * nextDouble(true, false)     [0.0, 1.0)
 * nextDouble(false, true)     (0.0, 1.0]
 * nextDouble(true, true)      [0.0, 1.0]
549 | * This version preserves all possible random values in the double range. 550 | */ 551 | public double nextDouble(boolean includeZero, boolean includeOne) 552 | { 553 | double d = 0.0; 554 | do { 555 | d = nextDouble(); // grab a value, initially from half-open [0.0, 1.0) 556 | if (includeOne && nextBoolean()) 557 | d += 1.0; // if includeOne, with 1/2 probability, push to [1.0, 2.0) 558 | } 559 | while ((d > 1.0) || // everything above 1.0 is always invalid 560 | (!includeZero && d == 0.0)); // if we're not including zero, 0.0 is invalid 561 | return d; 562 | } 563 | 564 | /** 565 | * A bug fix for versions of JDK 1.1 and below. JDK 1.2 fixes this for us, but what the heck. 566 | */ 567 | 568 | public float nextFloat() 569 | { 570 | return next(24) / ((float) (1 << 24)); 571 | } 572 | 573 | /** 574 | * Returns a float in the range from 0.0f to 1.0f, possibly inclusive of 0.0f and 1.0f 575 | * themselves. Thus: 576 | * 577 | *
 * Expression                 Interval
 * nextFloat(false, false)    (0.0f, 1.0f)
 * nextFloat(true, false)     [0.0f, 1.0f)
 * nextFloat(false, true)     (0.0f, 1.0f]
 * nextFloat(true, true)      [0.0f, 1.0f]
597 | * This version preserves all possible random values in the float range. 598 | */ 599 | public float nextFloat(boolean includeZero, boolean includeOne) 600 | { 601 | float d = 0.0f; 602 | do { 603 | d = nextFloat(); // grab a value, initially from half-open [0.0f, 1.0f) 604 | if (includeOne && nextBoolean()) 605 | d += 1.0f; // if includeOne, with 1/2 probability, push to [1.0f, 2.0f) 606 | } 607 | while ((d > 1.0f) || // everything above 1.0f is always invalid 608 | (!includeZero && d == 0.0f)); // if we're not including zero, 0.0f is invalid 609 | return d; 610 | } 611 | 612 | /** 613 | * A bug fix for all versions of the JDK. The JDK appears to use all four bytes in an integer as 614 | * independent byte values! Totally wrong. I've submitted a bug report. 615 | */ 616 | 617 | public void nextBytes(byte[] bytes) 618 | { 619 | for (int x = 0; x < bytes.length; x++) 620 | bytes[x] = (byte) next(8); 621 | } 622 | 623 | /** For completeness' sake, though it's not in java.util.Random. */ 624 | 625 | public char nextChar() 626 | { 627 | // chars are 16-bit UniCode values 628 | return (char) (next(16)); 629 | } 630 | 631 | /** For completeness' sake, though it's not in java.util.Random. */ 632 | 633 | public short nextShort() 634 | { 635 | return (short) (next(16)); 636 | } 637 | 638 | /** For completeness' sake, though it's not in java.util.Random. */ 639 | 640 | public byte nextByte() 641 | { 642 | return (byte) (next(8)); 643 | } 644 | 645 | /** 646 | * A bug fix for all JDK code including 1.2. nextGaussian can theoretically ask for the log of 0 647 | * and divide it by 0! See Java bug 649 | * http://developer.java.sun.com/developer/bugParade/bugs/4254501.html 650 | */ 651 | 652 | synchronized public double nextGaussian() 653 | { 654 | if (__haveNextNextGaussian) { 655 | __haveNextNextGaussian = false; 656 | return __nextNextGaussian; 657 | } 658 | else { 659 | double v1, v2, s; 660 | do { 661 | v1 = 2 * nextDouble() - 1; // between -1.0 and 1.0 662 | v2 = 2 * nextDouble() - 1; // between -1.0 and 1.0 663 | s = v1 * v1 + v2 * v2; 664 | } 665 | while (s >= 1 || s == 0); 666 | double multiplier = StrictMath.sqrt(-2 * StrictMath.log(s) / s); 667 | __nextNextGaussian = v2 * multiplier; 668 | __haveNextNextGaussian = true; 669 | return v1 * multiplier; 670 | } 671 | } 672 | 673 | /** 674 | * Tests the code. 
675 | */ 676 | public static void main(String args[]) 677 | { 678 | int j; 679 | 680 | MersenneTwister r; 681 | 682 | // CORRECTNESS TEST 683 | // COMPARE WITH http://www.math.keio.ac.jp/matumoto/CODES/MT2002/mt19937ar.out 684 | 685 | r = new MersenneTwister(new int[] { 0x123, 0x234, 0x345, 0x456 }); 686 | System.out.println("Output of MersenneTwister with new (2002/1/26) seeding mechanism"); 687 | for (j = 0; j < 1000; j++) { 688 | // first, convert the int from signed to "unsigned" 689 | long l = (long) r.nextInt(); 690 | if (l < 0) 691 | l += 4294967296L; // max int value 692 | String s = String.valueOf(l); 693 | while (s.length() < 10) 694 | s = " " + s; // buffer 695 | System.out.print(s + " "); 696 | if (j % 5 == 4) 697 | System.out.println(); 698 | } 699 | 700 | // SPEED TEST 701 | 702 | final long SEED = 4357; 703 | 704 | int xx; 705 | long ms; 706 | System.out.println("\nTime to test grabbing 100000000 ints"); 707 | 708 | r = new MersenneTwister(SEED); 709 | ms = System.currentTimeMillis(); 710 | xx = 0; 711 | for (j = 0; j < 100000000; j++) 712 | xx += r.nextInt(); 713 | System.out.println("Mersenne Twister: " + (System.currentTimeMillis() - ms) 714 | + " Ignore this: " + xx); 715 | 716 | System.out 717 | .println("To compare this with java.util.Random, run this same test on MersenneTwisterFast."); 718 | System.out 719 | .println("The comparison with Random is removed from MersenneTwister because it is a proper"); 720 | System.out 721 | .println("subclass of Random and this unfairly makes some of Random's methods un-inlinable,"); 722 | System.out.println("so it would make Random look worse than it is."); 723 | 724 | // TEST TO COMPARE TYPE CONVERSION BETWEEN 725 | // MersenneTwisterFast.java AND MersenneTwister.java 726 | 727 | System.out.println("\nGrab the first 1000 booleans"); 728 | r = new MersenneTwister(SEED); 729 | for (j = 0; j < 1000; j++) { 730 | System.out.print(r.nextBoolean() + " "); 731 | if (j % 8 == 7) 732 | System.out.println(); 733 | } 734 | if (!(j % 8 == 7)) 735 | System.out.println(); 736 | 737 | System.out 738 | .println("\nGrab 1000 booleans of increasing probability using nextBoolean(double)"); 739 | r = new MersenneTwister(SEED); 740 | for (j = 0; j < 1000; j++) { 741 | System.out.print(r.nextBoolean((double) (j / 999.0)) + " "); 742 | if (j % 8 == 7) 743 | System.out.println(); 744 | } 745 | if (!(j % 8 == 7)) 746 | System.out.println(); 747 | 748 | System.out 749 | .println("\nGrab 1000 booleans of increasing probability using nextBoolean(float)"); 750 | r = new MersenneTwister(SEED); 751 | for (j = 0; j < 1000; j++) { 752 | System.out.print(r.nextBoolean((float) (j / 999.0f)) + " "); 753 | if (j % 8 == 7) 754 | System.out.println(); 755 | } 756 | if (!(j % 8 == 7)) 757 | System.out.println(); 758 | 759 | byte[] bytes = new byte[1000]; 760 | System.out.println("\nGrab the first 1000 bytes using nextBytes"); 761 | r = new MersenneTwister(SEED); 762 | r.nextBytes(bytes); 763 | for (j = 0; j < 1000; j++) { 764 | System.out.print(bytes[j] + " "); 765 | if (j % 16 == 15) 766 | System.out.println(); 767 | } 768 | if (!(j % 16 == 15)) 769 | System.out.println(); 770 | 771 | byte b; 772 | System.out.println("\nGrab the first 1000 bytes -- must be same as nextBytes"); 773 | r = new MersenneTwister(SEED); 774 | for (j = 0; j < 1000; j++) { 775 | System.out.print((b = r.nextByte()) + " "); 776 | if (b != bytes[j]) 777 | System.out.print("BAD "); 778 | if (j % 16 == 15) 779 | System.out.println(); 780 | } 781 | if (!(j % 16 == 15)) 782 | System.out.println(); 783 
| 784 | System.out.println("\nGrab the first 1000 shorts"); 785 | r = new MersenneTwister(SEED); 786 | for (j = 0; j < 1000; j++) { 787 | System.out.print(r.nextShort() + " "); 788 | if (j % 8 == 7) 789 | System.out.println(); 790 | } 791 | if (!(j % 8 == 7)) 792 | System.out.println(); 793 | 794 | System.out.println("\nGrab the first 1000 ints"); 795 | r = new MersenneTwister(SEED); 796 | for (j = 0; j < 1000; j++) { 797 | System.out.print(r.nextInt() + " "); 798 | if (j % 4 == 3) 799 | System.out.println(); 800 | } 801 | if (!(j % 4 == 3)) 802 | System.out.println(); 803 | 804 | System.out.println("\nGrab the first 1000 ints of different sizes"); 805 | r = new MersenneTwister(SEED); 806 | int max = 1; 807 | for (j = 0; j < 1000; j++) { 808 | System.out.print(r.nextInt(max) + " "); 809 | max *= 2; 810 | if (max <= 0) 811 | max = 1; 812 | if (j % 4 == 3) 813 | System.out.println(); 814 | } 815 | if (!(j % 4 == 3)) 816 | System.out.println(); 817 | 818 | System.out.println("\nGrab the first 1000 longs"); 819 | r = new MersenneTwister(SEED); 820 | for (j = 0; j < 1000; j++) { 821 | System.out.print(r.nextLong() + " "); 822 | if (j % 3 == 2) 823 | System.out.println(); 824 | } 825 | if (!(j % 3 == 2)) 826 | System.out.println(); 827 | 828 | System.out.println("\nGrab the first 1000 longs of different sizes"); 829 | r = new MersenneTwister(SEED); 830 | long max2 = 1; 831 | for (j = 0; j < 1000; j++) { 832 | System.out.print(r.nextLong(max2) + " "); 833 | max2 *= 2; 834 | if (max2 <= 0) 835 | max2 = 1; 836 | if (j % 4 == 3) 837 | System.out.println(); 838 | } 839 | if (!(j % 4 == 3)) 840 | System.out.println(); 841 | 842 | System.out.println("\nGrab the first 1000 floats"); 843 | r = new MersenneTwister(SEED); 844 | for (j = 0; j < 1000; j++) { 845 | System.out.print(r.nextFloat() + " "); 846 | if (j % 4 == 3) 847 | System.out.println(); 848 | } 849 | if (!(j % 4 == 3)) 850 | System.out.println(); 851 | 852 | System.out.println("\nGrab the first 1000 doubles"); 853 | r = new MersenneTwister(SEED); 854 | for (j = 0; j < 1000; j++) { 855 | System.out.print(r.nextDouble() + " "); 856 | if (j % 3 == 2) 857 | System.out.println(); 858 | } 859 | if (!(j % 3 == 2)) 860 | System.out.println(); 861 | 862 | System.out.println("\nGrab the first 1000 gaussian doubles"); 863 | r = new MersenneTwister(SEED); 864 | for (j = 0; j < 1000; j++) { 865 | System.out.print(r.nextGaussian() + " "); 866 | if (j % 3 == 2) 867 | System.out.println(); 868 | } 869 | if (!(j % 3 == 2)) 870 | System.out.println(); 871 | 872 | } 873 | 874 | } 875 | --------------------------------------------------------------------------------
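
One practical note to close the dump: the `main` method at the end of GibbsSamplingDMM above is the programmatic entry point for the DMM sampler, and FuncUtils.nextDiscrete draws its random numbers from the shared MTRandom wrapper around this MersenneTwister class. The sketch below is a minimal, hypothetical driver (not a file in this repository) that mirrors that `main` method but seeds the generator first, so that repeated Gibbs runs are reproducible; the corpus path, hyper-parameters, and experiment name simply reuse the test values shown above.

import models.GibbsSamplingDMM;
import utility.MTRandom;

// Hypothetical driver class; not part of the jLDADMM source tree.
public class RunSeededDMM
{
    public static void main(String[] args)
        throws Exception
    {
        // FuncUtils.nextDiscrete draws from the shared MTRandom state during
        // sampling, so fixing the seed before the model is constructed makes
        // the whole sampling trajectory repeatable across runs.
        MTRandom.setSeed(0);

        // Constructor arguments in the same order as GibbsSamplingDMM.main:
        // corpus path, ntopics, alpha, beta, niters, twords, experiment name.
        GibbsSamplingDMM dmm = new GibbsSamplingDMM("test/corpus.txt", 7, 0.1,
            0.1, 2000, 20, "testDMM-seed0");

        // writeParameters() and writeDictionary() run first, then the Gibbs
        // loop; the .topWords, .theta, .topicAssignments and .phi files are
        // written from the final sample.
        dmm.inference();
    }
}

Using a distinct experiment name per seed keeps the runs apart on disk, since every output file the model writes is named after the experiment.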