├── jar └── jLDADMM.jar ├── lib └── args4j-2.0.6.jar ├── src ├── utility │ ├── MTRandom.java │ ├── CmdArgs.java │ ├── FuncUtils.java │ └── MersenneTwister.java ├── jLDADMM.java ├── eval │ └── ClusteringEval.java └── models │ ├── GibbsSamplingLDA_Inf.java │ ├── GibbsSamplingLDA.java │ ├── GibbsSamplingDMM_Inf.java │ └── GibbsSamplingDMM.java ├── License.txt ├── test ├── corpus.LABEL ├── unseenTest.txt └── corpus.txt └── README.md /jar/jLDADMM.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datquocnguyen/jLDADMM/HEAD/jar/jLDADMM.jar -------------------------------------------------------------------------------- /lib/args4j-2.0.6.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datquocnguyen/jLDADMM/HEAD/lib/args4j-2.0.6.jar -------------------------------------------------------------------------------- /src/utility/MTRandom.java: -------------------------------------------------------------------------------- 1 | package utility; 2 | 3 | public class MTRandom 4 | { 5 | 6 | private static MersenneTwister rand = new MersenneTwister(); 7 | 8 | public static void setSeed(long seed) 9 | { 10 | rand.setSeed(seed); 11 | } 12 | 13 | public static double nextDouble() 14 | { 15 | return rand.nextDouble(); 16 | } 17 | 18 | public static int nextInt(int n) 19 | { 20 | return rand.nextInt(n); 21 | } 22 | 23 | public static boolean nextBoolean() 24 | { 25 | return rand.nextBoolean(); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /License.txt: -------------------------------------------------------------------------------- 1 | jLDADMM: A Java package for the LDA and DMM topic models 2 | 3 | Copyright (C) 2015-2017 by Dat Quoc Nguyen 4 | dat.nguyen@students.mq.edu.au 5 | Department of Computing, Macquarie University, Australia 6 | 7 | jLDADMM's website: http://jldadmm.sourceforge.net/ 8 | 9 | jLDADMM is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 2 of the License, or (at your option) any later version. 10 | 11 | jLDADMM is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 12 | 13 | You should have received a copy of the GNU General Public License along with this program. If not, see -------------------------------------------------------------------------------- /src/utility/CmdArgs.java: -------------------------------------------------------------------------------- 1 | package utility; 2 | 3 | import org.kohsuke.args4j.Option; 4 | 5 | public class CmdArgs 6 | { 7 | 8 | @Option(name = "-model", usage = "Specify model", required = true) 9 | public String model = ""; 10 | 11 | @Option(name = "-corpus", usage = "Specify path to topic modeling corpus") 12 | public String corpus = ""; 13 | 14 | @Option(name = "-ntopics", usage = "Specify number of topics") 15 | public int ntopics = 20; 16 | 17 | @Option(name = "-alpha", usage = "Specify alpha") 18 | public double alpha = 0.1; 19 | 20 | @Option(name = "-beta", usage = "Specify beta") 21 | public double beta = 0.01; 22 | 23 | @Option(name = "-niters", usage = "Specify number of iterations") 24 | public int niters = 2000; 25 | 26 | @Option(name = "-twords", usage = "Specify number of top topical words") 27 | public int twords = 20; 28 | 29 | @Option(name = "-name", usage = "Specify a name to topic modeling experiment") 30 | public String expModelName = "model"; 31 | 32 | @Option(name = "-seed", usage = "Specify a random seed for reproducibility") 33 | public int seed = 0; 34 | 35 | @Option(name = "-initFile") 36 | public String initTopicAssgns = ""; 37 | 38 | @Option(name = "-sstep") 39 | public int savestep = 0; 40 | 41 | @Option(name = "-dir") 42 | public String dir = ""; 43 | 44 | @Option(name = "-label") 45 | public String labelFile = ""; 46 | 47 | @Option(name = "-prob") 48 | public String prob = ""; 49 | 50 | @Option(name = "-paras", usage = "Specify path to hyper-parameter file") 51 | public String paras = ""; 52 | 53 | } 54 | -------------------------------------------------------------------------------- /src/utility/FuncUtils.java: -------------------------------------------------------------------------------- 1 | package utility; 2 | 3 | import java.util.Collections; 4 | import java.util.Comparator; 5 | import java.util.LinkedHashMap; 6 | import java.util.LinkedList; 7 | import java.util.List; 8 | import java.util.Map; 9 | 10 | public class FuncUtils 11 | { 12 | public static > Map sortByValueDescending(Map map) 13 | { 14 | List> list = new LinkedList>(map.entrySet()); 15 | Collections.sort(list, new Comparator>() 16 | { 17 | @Override 18 | public int compare(Map.Entry o1, Map.Entry o2) 19 | { 20 | int compare = (o1.getValue()).compareTo(o2.getValue()); 21 | return -compare; 22 | } 23 | }); 24 | 25 | Map result = new LinkedHashMap(); 26 | for (Map.Entry entry : list) { 27 | result.put(entry.getKey(), entry.getValue()); 28 | } 29 | return result; 30 | } 31 | 32 | public static > Map sortByValueAscending(Map map) 33 | { 34 | List> list = new LinkedList>(map.entrySet()); 35 | Collections.sort(list, new Comparator>() 36 | { 37 | @Override 38 | public int compare(Map.Entry o1, Map.Entry o2) 39 | { 40 | int compare = (o1.getValue()).compareTo(o2.getValue()); 41 | return compare; 42 | } 43 | }); 44 | 45 | Map result = new LinkedHashMap(); 46 | for (Map.Entry entry : list) { 47 | result.put(entry.getKey(), entry.getValue()); 48 | } 49 | return result; 50 | } 51 | 52 | /** 53 | * Sample a value from a double array 54 | * 55 | * @param probs 56 | * @return 57 | */ 58 | public static int nextDiscrete(double[] probs) 59 | { 60 | double sum = 0.0; 61 | for (int i = 0; i < probs.length; i++) 62 | sum += probs[i]; 63 | 64 | double r = MTRandom.nextDouble() * sum; 65 | 66 | sum = 0.0; 67 | for (int i = 0; i < probs.length; i++) { 68 | sum += probs[i]; 69 | if (sum > r) 70 | return i; 71 | } 72 | return probs.length - 1; 73 | } 74 | 75 | public static double mean(double[] m) 76 | { 77 | double sum = 0; 78 | for (int i = 0; i < m.length; i++) 79 | sum += m[i]; 80 | return sum / m.length; 81 | } 82 | 83 | public static double stddev(double[] m) 84 | { 85 | double mean = mean(m); 86 | double s = 0; 87 | for (int i = 0; i < m.length; i++) 88 | s += (m[i] - mean) * (m[i] - mean); 89 | return Math.sqrt(s / m.length); 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /src/jLDADMM.java: -------------------------------------------------------------------------------- 1 | import models.GibbsSamplingDMM; 2 | import models.GibbsSamplingDMM_Inf; 3 | import models.GibbsSamplingLDA; 4 | import models.GibbsSamplingLDA_Inf; 5 | 6 | import org.kohsuke.args4j.CmdLineException; 7 | import org.kohsuke.args4j.CmdLineParser; 8 | 9 | import utility.MTRandom; 10 | import utility.CmdArgs; 11 | import eval.ClusteringEval; 12 | 13 | /** 14 | * jLDADMM: A Java package for the LDA and DMM topic models 15 | * 16 | * http://jldadmm.sourceforge.net/ 17 | * 18 | * @author: Dat Quoc Nguyen 19 | * 20 | * @version: 1.0.3 21 | * jLDADMM v1.0.3 incorporates the following change: Adding a random-seed option "-seed" for reproducibility 22 | */ 23 | public class jLDADMM 24 | { 25 | public static void main(String[] args) 26 | { 27 | CmdArgs cmdArgs = new CmdArgs(); 28 | CmdLineParser parser = new CmdLineParser(cmdArgs); 29 | try { 30 | 31 | parser.parseArgument(args); 32 | 33 | if (cmdArgs.seed > 0){ 34 | MTRandom.setSeed(cmdArgs.seed); 35 | } 36 | 37 | if (cmdArgs.model.equals("LDA")) { 38 | GibbsSamplingLDA lda = new GibbsSamplingLDA(cmdArgs.corpus, 39 | cmdArgs.ntopics, cmdArgs.alpha, cmdArgs.beta, 40 | cmdArgs.niters, cmdArgs.twords, cmdArgs.expModelName, 41 | cmdArgs.initTopicAssgns, cmdArgs.savestep); 42 | lda.inference(); 43 | } 44 | else if (cmdArgs.model.equals("DMM")) { 45 | GibbsSamplingDMM dmm = new GibbsSamplingDMM(cmdArgs.corpus, 46 | cmdArgs.ntopics, cmdArgs.alpha, cmdArgs.beta, 47 | cmdArgs.niters, cmdArgs.twords, cmdArgs.expModelName, 48 | cmdArgs.initTopicAssgns, cmdArgs.savestep); 49 | dmm.inference(); 50 | } 51 | else if (cmdArgs.model.equals("LDAinf")) { 52 | GibbsSamplingLDA_Inf lda = new GibbsSamplingLDA_Inf( 53 | cmdArgs.paras, cmdArgs.corpus, cmdArgs.niters, 54 | cmdArgs.twords, cmdArgs.expModelName, cmdArgs.savestep); 55 | lda.inference(); 56 | } 57 | else if (cmdArgs.model.equals("DMMinf")) { 58 | GibbsSamplingDMM_Inf dmm = new GibbsSamplingDMM_Inf( 59 | cmdArgs.paras, cmdArgs.corpus, cmdArgs.niters, 60 | cmdArgs.twords, cmdArgs.expModelName, cmdArgs.savestep); 61 | dmm.inference(); 62 | } 63 | else if (cmdArgs.model.equals("Eval")) { 64 | ClusteringEval.evaluate(cmdArgs.labelFile, cmdArgs.dir, 65 | cmdArgs.prob); 66 | } 67 | else { 68 | System.out 69 | .println("Error: Option \"-model\" must get \"LDA\" or \"DMM\" or \"LDAinf\" or \"DMMinf\" or \"Eval\""); 70 | System.out 71 | .println("\tLDA: Specify the Latent Dirichlet Allocation topic model"); 72 | System.out 73 | .println("\tDMM: Specify the one-topic-per-document Dirichlet Multinomial Mixture model"); 74 | System.out 75 | .println("\tLDAinf: Infer topics for unseen corpus using a pre-trained LDA model"); 76 | System.out 77 | .println("\tDMMinf: Infer topics for unseen corpus using a pre-trained DMM model"); 78 | System.out 79 | .println("\tEval: Specify the document clustering evaluation"); 80 | help(parser); 81 | return; 82 | } 83 | } 84 | catch (CmdLineException cle) { 85 | System.out.println("Error: " + cle.getMessage()); 86 | help(parser); 87 | return; 88 | } 89 | catch (Exception e) { 90 | System.out.println("Error: " + e.getMessage()); 91 | e.printStackTrace(); 92 | return; 93 | } 94 | } 95 | 96 | public static void help(CmdLineParser parser) 97 | { 98 | System.out 99 | .println("java -jar jLDADMM.jar [options ...] [arguments...]"); 100 | parser.printUsage(System.out); 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /test/corpus.LABEL: -------------------------------------------------------------------------------- 1 | apple 2 | apple 3 | apple 4 | apple 5 | apple 6 | apple 7 | apple 8 | apple 9 | apple 10 | apple 11 | apple 12 | apple 13 | apple 14 | apple 15 | apple 16 | apple 17 | apple 18 | apple 19 | apple 20 | apple 21 | apple 22 | apple 23 | apple 24 | apple 25 | apple 26 | apple 27 | apple 28 | apple 29 | apple 30 | apple 31 | apple 32 | apple 33 | apple 34 | apple 35 | apple 36 | apple 37 | apple 38 | apple 39 | apple 40 | apple 41 | apple 42 | apple 43 | apple 44 | apple 45 | apple 46 | apple 47 | apple 48 | apple 49 | apple 50 | apple 51 | apple 52 | apple 53 | apple 54 | apple 55 | apple 56 | apple 57 | apple 58 | apple 59 | apple 60 | apple 61 | apple 62 | apple 63 | apple 64 | apple 65 | apple 66 | apple 67 | apple 68 | apple 69 | apple 70 | apple 71 | apple 72 | apple 73 | apple 74 | apple 75 | apple 76 | apple 77 | apple 78 | apple 79 | apple 80 | apple 81 | apple 82 | apple 83 | apple 84 | apple 85 | apple 86 | apple 87 | apple 88 | apple 89 | apple 90 | apple 91 | apple 92 | apple 93 | apple 94 | apple 95 | apple 96 | apple 97 | apple 98 | apple 99 | apple 100 | apple 101 | google 102 | google 103 | google 104 | google 105 | google 106 | google 107 | google 108 | google 109 | google 110 | google 111 | google 112 | google 113 | google 114 | google 115 | google 116 | google 117 | google 118 | google 119 | google 120 | google 121 | google 122 | google 123 | google 124 | google 125 | google 126 | google 127 | google 128 | google 129 | google 130 | google 131 | google 132 | google 133 | google 134 | google 135 | google 136 | google 137 | google 138 | google 139 | google 140 | google 141 | google 142 | google 143 | google 144 | google 145 | google 146 | google 147 | google 148 | google 149 | google 150 | google 151 | google 152 | google 153 | google 154 | google 155 | google 156 | google 157 | google 158 | google 159 | google 160 | google 161 | google 162 | google 163 | google 164 | google 165 | google 166 | google 167 | google 168 | google 169 | google 170 | google 171 | google 172 | google 173 | google 174 | google 175 | google 176 | google 177 | google 178 | google 179 | google 180 | google 181 | google 182 | google 183 | google 184 | google 185 | google 186 | google 187 | google 188 | google 189 | google 190 | google 191 | google 192 | google 193 | google 194 | google 195 | google 196 | google 197 | google 198 | google 199 | google 200 | google 201 | microsoft 202 | microsoft 203 | microsoft 204 | microsoft 205 | microsoft 206 | microsoft 207 | microsoft 208 | microsoft 209 | microsoft 210 | microsoft 211 | microsoft 212 | microsoft 213 | microsoft 214 | microsoft 215 | microsoft 216 | microsoft 217 | microsoft 218 | microsoft 219 | microsoft 220 | microsoft 221 | microsoft 222 | microsoft 223 | microsoft 224 | microsoft 225 | microsoft 226 | microsoft 227 | microsoft 228 | microsoft 229 | microsoft 230 | microsoft 231 | microsoft 232 | microsoft 233 | microsoft 234 | microsoft 235 | microsoft 236 | microsoft 237 | microsoft 238 | microsoft 239 | microsoft 240 | microsoft 241 | microsoft 242 | microsoft 243 | microsoft 244 | microsoft 245 | microsoft 246 | microsoft 247 | microsoft 248 | microsoft 249 | microsoft 250 | microsoft 251 | microsoft 252 | microsoft 253 | microsoft 254 | microsoft 255 | microsoft 256 | microsoft 257 | microsoft 258 | microsoft 259 | microsoft 260 | microsoft 261 | microsoft 262 | microsoft 263 | microsoft 264 | microsoft 265 | microsoft 266 | microsoft 267 | microsoft 268 | microsoft 269 | microsoft 270 | microsoft 271 | microsoft 272 | microsoft 273 | microsoft 274 | microsoft 275 | microsoft 276 | microsoft 277 | microsoft 278 | microsoft 279 | microsoft 280 | microsoft 281 | microsoft 282 | microsoft 283 | microsoft 284 | microsoft 285 | microsoft 286 | microsoft 287 | microsoft 288 | microsoft 289 | microsoft 290 | microsoft 291 | microsoft 292 | microsoft 293 | microsoft 294 | microsoft 295 | microsoft 296 | microsoft 297 | microsoft 298 | microsoft 299 | microsoft 300 | microsoft 301 | twitter 302 | twitter 303 | twitter 304 | twitter 305 | twitter 306 | twitter 307 | twitter 308 | twitter 309 | twitter 310 | twitter 311 | twitter 312 | twitter 313 | twitter 314 | twitter 315 | twitter 316 | twitter 317 | twitter 318 | twitter 319 | twitter 320 | twitter 321 | twitter 322 | twitter 323 | twitter 324 | twitter 325 | twitter 326 | twitter 327 | twitter 328 | twitter 329 | twitter 330 | twitter 331 | twitter 332 | twitter 333 | twitter 334 | twitter 335 | twitter 336 | twitter 337 | twitter 338 | twitter 339 | twitter 340 | twitter 341 | twitter 342 | twitter 343 | twitter 344 | twitter 345 | twitter 346 | twitter 347 | twitter 348 | twitter 349 | twitter 350 | twitter 351 | twitter 352 | twitter 353 | twitter 354 | twitter 355 | twitter 356 | twitter 357 | twitter 358 | twitter 359 | twitter 360 | twitter 361 | twitter 362 | twitter 363 | twitter 364 | twitter 365 | twitter 366 | twitter 367 | twitter 368 | twitter 369 | twitter 370 | twitter 371 | twitter 372 | twitter 373 | twitter 374 | twitter 375 | twitter 376 | twitter 377 | twitter 378 | twitter 379 | twitter 380 | twitter 381 | twitter 382 | twitter 383 | twitter 384 | twitter 385 | twitter 386 | twitter 387 | twitter 388 | twitter 389 | twitter 390 | twitter 391 | twitter 392 | twitter 393 | twitter 394 | twitter 395 | twitter 396 | twitter 397 | twitter 398 | twitter 399 | twitter 400 | twitter 401 | -------------------------------------------------------------------------------- /test/unseenTest.txt: -------------------------------------------------------------------------------- 1 | making ipad feel ios 2 | nexus good feel bit guess android users android 3 | nice game helps search 4 | nice game helps search facebook 5 | build website website free 6 | android ics pretty good worth 7 | android ice cream sandwich nexus android nexus 8 | exciting day ice cream sandwich day android 9 | wow nexus beautiful totally gonna market share smart phone market 10 | integrated data usage manager brilliant design watching lol 11 | ice cream sandwich android works htc desire 12 | ice cream sandwich sounds android ice cream sandwich 13 | amazing imo android missing 14 | forget phone nice feature android nexus 15 | finally unveiled android ice cream sandwich good 16 | finally searches logged users 17 | rim strategy released hours release ics 18 | man love galaxy nexus samsung android 19 | doubt 20 | share winning war 21 | dear galaxy nexus send email technology 22 | telegraph reports biggest threat facebook power users 23 | samsung made bad android king 24 | facebook power users telegraph socialmedia 25 | impressed android update good font design 26 | video wallet wow 27 | tweet remember spell straight 28 | android samsung nexus 29 | efficient fun releases infinite digital bookcase 30 | pass social seo facebook 31 | ice cream sandwich stop carriers bullying smartphone users android 32 | agree freaking awesome 33 | icecream great 34 | helps 35 | samsung galaxy nexus iphone 36 | ice cream sandwich delicious iphone launches android aka 37 | loving 38 | samsung push mobile experience forward 39 | finally power volume screenshot ics 40 | nexus press conference slick 41 | high school appreciated 42 | scream scream scream android job major game mobile space 43 | thinking ahead 44 | venturebeat virtual bookcase sharing 45 | android phone keeping iphone 46 | android ice cream sandwich feature closer roboto type face read 47 | work samsung android ics impressive 48 | add profile webgl project add addthis 49 | work company work 50 | invention 51 | wait ice cream sandwich android 52 | stop nexus 53 | phone 54 | android device updated galaxy nexus 55 | android introducing ice cream sandwich delicious version android ics 56 | excited android features android ics 57 | wait nexus play 58 | check video introducing galaxy nexus simple beautiful smart youtube android nexus 59 | cream ice cream phone job 60 | great small businesses platform features thoughts 61 | loves presentations tool docs adding video 62 | brilliant webgl bookcase 63 | searches things 64 | android ice cream introducing galaxy nexus simple beautiful smart 65 | nexus prime android 66 | interesting bookcase venturebeat releases infinite digital bookcase 67 | good finally focus user experience android 68 | ics awesome phone android motorola 69 | iphone ice cream sandwich android 70 | nexus line smart move 71 | android beam alright made team team android 72 | android reply font good start ics 73 | ice cream sandwich face unlock works 74 | ready ice cream sandwich ics nexus android android 75 | ice cream sandwich android 76 | taste ice cream sandwich bite 77 | samsung event live blog gadget haven android 78 | android ice cream sandwich make smartphone operating systems 79 | photo sharing people application ice cream sandwich imo ics 80 | android nexus phone makes iphone cheap store android 81 | sweet ice cream sandwich android ice cream sandwich officially ics 82 | raise hand android powered phone samsung 83 | siri android device replace iphone 84 | nexus page live nexus android 85 | excited android beam face unlock android ics 86 | linkedin tools company page contact 87 | samsung ice cream sandwich samsung 88 | introducing galaxy nexus simple beautiful smart android ics samsung 89 | glad design android shows waiting 90 | thoughts android ics excited play features android 91 | register galaxy nexus android 92 | wow webgl infinite bookcase 93 | ics awesome wait face unlock android 94 | gotta pretty android chrome android 95 | november direct purchase samsung 96 | nexus wanna awesome 97 | event time change android samsung 98 | ios user ics awesome great job 99 | yeah great job ics 100 | literally mind blown samsung 101 | motorola verizon perfect 102 | opens door spanish entrepreneurs project 103 | intel ibm 104 | windows phone mango update process ahead schedule mango 105 | back smartphone rich 106 | word works computer 107 | free gen stores 108 | watch codename data explorer ctp coming 109 | lunch today vslive 110 | watch codename data explorer ctp coming month 111 | details search improvements windows start screen 112 | mango shows taste smartphone success mango 113 | awesome moving dev finally local 114 | stores offer free windows phone devices 115 | stores offer free windows phone devices neowin 116 | store spend hard vslive 117 | free west check 118 | hey parents free tools kids online live family 119 | cloud offers students free access improve tcn 120 | awesome bit 121 | details windows search improvements 122 | yeah taking metro yeah good android 123 | love kids tech 124 | explains improvements windows start screen search tech 125 | search idea search great 126 | bing king search search 127 | powerpoint users power create service bye solutions 128 | future information innovators nov info 129 | curate personal history project greenwich month 130 | beam research project 131 | great sql server session 132 | works days 133 | ballmer thinks computer scientist android tech agree 134 | great time 135 | win server works fine vmware 136 | wow tech turns body touchscreen psfk 137 | love love feeling building vslive bringing conference 138 | research shows awesome step closer bit kinect 139 | research shows science science fact cool sound 140 | research shows science science fact 141 | zune music canada music news 142 | kinect makes learning playful education 143 | mango 144 | check change world 145 | good world wait 146 | watching windows pretty impressive finally mac interesting battle store 147 | xbox share 148 | god 149 | blog post cool tool mouse tools 150 | forget siri beating speech commands mango siri 151 | tests proves appsense enterprise capability users personalization database enterprise 152 | software good points sap dynamics 153 | good dev 154 | secure anti 155 | impressed creating images 156 | mac blown marketing 157 | yahoo sale years back bought glad deal year 158 | omnitouch impressive technology 159 | good bing paying 160 | ipads windows tablets study 161 | home day great time 162 | mango shows taste smartphone success 163 | picture services cloud love 164 | windows net dev 165 | nice talk community 166 | omg sharepoint working 167 | innovation sad sad 168 | office love genius 169 | love gates foundation 170 | good 171 | skype family amazing things 172 | absolutely loving mouse 173 | fan cool video turn surface touchscreen 174 | wow android ics lots talk mango launch people public speaking 175 | updated computer windows 176 | ics android kill mango nokia 177 | people names mail week 178 | outlook mac sucks hate 179 | xbox accounts hack reports 180 | update net 181 | windows media center fail 182 | eclipsed 183 | word upgrade doc doc word won open doc suck 184 | u.s. antitrust leaving business played dumb 185 | lync crash issue mac fixed 186 | broke played engages racketeering calls respect 187 | nokia chief executive mole 188 | frozen xbox live xbl accounts online games report hacked 189 | gave windows dev preview good waiting beta windows 190 | powerpoint fix powerpoint presentations 191 | eclipsed guardian 192 | kind search 193 | great time family advertising 194 | windows forget past antitrust issues 195 | paying make racketeering 196 | day talking talk tomorrow waiting 197 | reader compares albatross neck agree join 198 | lot word freeze minutes 199 | lol perfect simple hate windows phones 200 | months months lose 201 | -------------------------------------------------------------------------------- /src/eval/ClusteringEval.java: -------------------------------------------------------------------------------- 1 | package eval; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.BufferedWriter; 5 | import java.io.File; 6 | import java.io.FileReader; 7 | import java.io.FileWriter; 8 | import java.util.ArrayList; 9 | import java.util.HashMap; 10 | import java.util.HashSet; 11 | import java.util.List; 12 | import java.util.Set; 13 | 14 | import utility.FuncUtils; 15 | 16 | /** 17 | * jLDADMM: A Java package for the LDA and DMM topic models 18 | * 19 | * Implementation of the Purity and NMI clustering evaluation scores, as 20 | * described in Section 16.3 in: 21 | * 22 | * Christopher D. Manning, Prabhakar Raghavan, and Hinrich Sch¨utze. 2008. 23 | * Introduction to Information Retrieval. Cambridge University Press. 24 | * 25 | * @author: Dat Quoc Nguyen 26 | */ 27 | 28 | public class ClusteringEval 29 | { 30 | String pathDocTopicProsFile; 31 | 32 | String pathGoldenLabelsFile; 33 | 34 | HashMap> goldenClusers; 35 | HashMap> outputClusers; 36 | 37 | int numDocs; 38 | 39 | public ClusteringEval(String inPathGoldenLabelsFile, 40 | String inPathDocTopicProsFile) 41 | throws Exception 42 | { 43 | pathDocTopicProsFile = inPathDocTopicProsFile; 44 | pathGoldenLabelsFile = inPathGoldenLabelsFile; 45 | 46 | goldenClusers = new HashMap>(); 47 | outputClusers = new HashMap>(); 48 | 49 | readGoldenLabelsFile(); 50 | readDocTopicProsFile(); 51 | } 52 | 53 | public void readGoldenLabelsFile() 54 | throws Exception 55 | { 56 | System.out 57 | .println("Reading golden labels file " + pathGoldenLabelsFile); 58 | 59 | int id = 0; 60 | 61 | BufferedReader br = null; 62 | try { 63 | br = new BufferedReader(new FileReader(pathGoldenLabelsFile)); 64 | for (String label; (label = br.readLine()) != null;) { 65 | label = label.trim(); 66 | Set ids = new HashSet(); 67 | if (goldenClusers.containsKey(label)) 68 | ids = goldenClusers.get(label); 69 | ids.add(id); 70 | goldenClusers.put(label, ids); 71 | id += 1; 72 | } 73 | } 74 | catch (Exception e) { 75 | e.printStackTrace(); 76 | } 77 | numDocs = id; 78 | } 79 | 80 | public void readDocTopicProsFile() 81 | throws Exception 82 | { 83 | System.out.println("Reading document-to-topic distribution file " 84 | + pathDocTopicProsFile); 85 | 86 | HashMap docLabelOutput = new HashMap(); 87 | 88 | int docIndex = 0; 89 | 90 | BufferedReader br = null; 91 | try { 92 | br = new BufferedReader(new FileReader(pathDocTopicProsFile)); 93 | 94 | for (String docTopicProbs; (docTopicProbs = br.readLine()) != null;) { 95 | String[] pros = docTopicProbs.trim().split("\\s+"); 96 | double maxPro = 0.0; 97 | int index = -1; 98 | for (int topicIndex = 0; topicIndex < pros.length; topicIndex++) { 99 | double pro = new Double(pros[topicIndex]); 100 | if (pro > maxPro) { 101 | maxPro = pro; 102 | index = topicIndex; 103 | } 104 | } 105 | docLabelOutput.put(docIndex, 106 | "Topic_" + new Integer(index).toString()); 107 | docIndex++; 108 | } 109 | } 110 | catch (Exception e) { 111 | e.printStackTrace(); 112 | } 113 | 114 | if (numDocs != docIndex) { 115 | System.out 116 | .println("Error: the number of documents is different to the number of labels!"); 117 | throw new Exception(); 118 | } 119 | 120 | for (Integer id : docLabelOutput.keySet()) { 121 | String label = docLabelOutput.get(id); 122 | Set ids = new HashSet(); 123 | if (outputClusers.containsKey(label)) 124 | ids = outputClusers.get(label); 125 | ids.add(id); 126 | outputClusers.put(label, ids); 127 | } 128 | 129 | } 130 | 131 | public double computePurity() 132 | { 133 | int count = 0; 134 | for (String label : outputClusers.keySet()) { 135 | Set docs = outputClusers.get(label); 136 | int correctAssignedDocNum = 0; 137 | for (String goldenLabel : goldenClusers.keySet()) { 138 | Set goldenDocs = goldenClusers.get(goldenLabel); 139 | Set outputDocs = new HashSet(docs); 140 | outputDocs.retainAll(goldenDocs); 141 | if (outputDocs.size() >= correctAssignedDocNum) 142 | correctAssignedDocNum = outputDocs.size(); 143 | } 144 | count += correctAssignedDocNum; 145 | } 146 | double value = count * 1.0 / numDocs; 147 | System.out.println("\tPurity accuracy: " + value); 148 | return value; 149 | } 150 | 151 | public double computeNMIscore() 152 | { 153 | double MIscore = 0.0; 154 | for (String label : outputClusers.keySet()) { 155 | Set docs = outputClusers.get(label); 156 | for (String goldenLabel : goldenClusers.keySet()) { 157 | Set goldenDocs = goldenClusers.get(goldenLabel); 158 | Set outputDocs = new HashSet(docs); 159 | outputDocs.retainAll(goldenDocs); 160 | double numCorrectAssignedDocs = outputDocs.size() * 1.0; 161 | if (numCorrectAssignedDocs == 0.0) 162 | continue; 163 | MIscore += (numCorrectAssignedDocs / numDocs) 164 | * Math.log(numCorrectAssignedDocs * numDocs 165 | / (docs.size() * goldenDocs.size())); 166 | } 167 | 168 | } 169 | double entropy = 0.0; 170 | for (String label : outputClusers.keySet()) { 171 | Set docs = outputClusers.get(label); 172 | entropy += (-1.0 * docs.size() / numDocs) 173 | * Math.log(1.0 * docs.size() / numDocs); 174 | } 175 | 176 | for (String label : goldenClusers.keySet()) { 177 | Set docs = goldenClusers.get(label); 178 | entropy += (-1.0 * docs.size() / numDocs) 179 | * Math.log(1.0 * docs.size() / numDocs); 180 | } 181 | 182 | double value = 2 * MIscore / entropy; 183 | System.out.println("\tNMI score: " + value); 184 | return value; 185 | } 186 | 187 | public static void evaluate(String pathGoldenLabelsFile, 188 | String pathToFolderOfDocTopicProsFiles, String suffix) 189 | throws Exception 190 | { 191 | BufferedWriter writer = new BufferedWriter(new FileWriter( 192 | pathToFolderOfDocTopicProsFiles + "/" + suffix + ".PurityNMI")); 193 | writer.write("Golden-labels in: " + pathGoldenLabelsFile + "\n\n"); 194 | File[] files = new File(pathToFolderOfDocTopicProsFiles).listFiles(); 195 | 196 | List purity = new ArrayList(), nmi = new ArrayList(); 197 | for (File file : files) { 198 | if (!file.getName().endsWith(suffix)) 199 | continue; 200 | writer.write("Results for: " + file.getAbsolutePath() + "\n"); 201 | ClusteringEval dce = new ClusteringEval(pathGoldenLabelsFile, 202 | file.getAbsolutePath()); 203 | double value = dce.computePurity(); 204 | writer.write("\tPurity: " + value + "\n"); 205 | purity.add(value); 206 | value = dce.computeNMIscore(); 207 | writer.write("\tNMI: " + value + "\n"); 208 | nmi.add(value); 209 | } 210 | if (purity.size() == 0 || nmi.size() == 0) { 211 | System.out.println("Error: There is no file ending with " + suffix); 212 | throw new Exception(); 213 | } 214 | 215 | double[] purityValues = new double[purity.size()]; 216 | double[] nmiValues = new double[nmi.size()]; 217 | 218 | for (int i = 0; i < purity.size(); i++) 219 | purityValues[i] = purity.get(i).doubleValue(); 220 | for (int i = 0; i < nmi.size(); i++) 221 | nmiValues[i] = nmi.get(i).doubleValue(); 222 | 223 | writer.write("\n---\nMean purity: " + FuncUtils.mean(purityValues) 224 | + ", standard deviation: " + FuncUtils.stddev(purityValues)); 225 | 226 | writer.write("\nMean NMI: " + FuncUtils.mean(nmiValues) 227 | + ", standard deviation: " + FuncUtils.stddev(nmiValues)); 228 | 229 | System.out.println("---\nMean purity: " + FuncUtils.mean(purityValues) 230 | + ", standard deviation: " + FuncUtils.stddev(purityValues)); 231 | 232 | System.out.println("Mean NMI: " + FuncUtils.mean(nmiValues) 233 | + ", standard deviation: " + FuncUtils.stddev(nmiValues)); 234 | 235 | writer.close(); 236 | } 237 | 238 | public static void main(String[] args) 239 | throws Exception 240 | { 241 | ClusteringEval.evaluate("test/corpus.LABEL", "test", "theta"); 242 | } 243 | } 244 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## jLDADMM: A Java package for the LDA and DMM topic models 2 | 3 | **jLDADMM** is released to provide alternatives for topic modeling on normal or short texts. Probabilistic topic models, such as Latent Dirichlet Allocation (LDA) [1] and related models [2], are widely used to discover latent topics in document collections. However, applying topic models for short texts (e.g. Tweets) is more challenging because of data sparsity and the limited contexts in such texts. One approach is to combine short texts into long pseudo-documents before training LDA. Another approach is to assume that there is only one topic per document [3]. 4 | 5 | jLDADMM provides implementations of the LDA topic model [1] and the one-topic-per-document Dirichlet Multinomial Mixture (DMM) model (i.e. mixture of unigrams) [4]. The implementations of LDA and DMM use the collapsed Gibbs sampling algorithms for inference as described in [5] and [6], respectively. Furthermore, jLDADMM supplies a document clustering evaluation to compare topic models, using two common metrics of Purity and normalized mutual information (NMI) [7]. 6 | 7 | Please cite jLDADMM whenever jLDADMM is used to produce published results or incorporated into other software: 8 | 9 | @article{jldadmm, 10 | title={{jLDADMM: A Java package for the LDA and DMM topic models}}, 11 | author={Dat Quoc Nguyen}, 12 | journal={arXiv preprint arXiv:1808.03835}, 13 | year={2018} 14 | } 15 | 16 | Bug reports, comments and suggestions about jLDADMM are highly appreciated. As a free open-source package, jLDADMM is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | 18 | ### Using jLDADMM for topic modeling 19 | 20 | This section describes the usage of jLDADMM in command line or terminal, using a pre-compiled file named `jLDADMM.jar`. Here, it is supposed that Java is already set to run in command line or terminal (e.g. adding Java to the environment variable `path` in Windows OS). 21 | 22 | Users can find the pre-compiled file `jLDADMM.jar` and source codes in folders `jar` and `src`, respectively. **The users can recompile the source codes by simply running `ant` (it is also expected that `ant` is already installed)**. In addition, the users can find input examples in `test` folder. 23 | 24 | **File format of input corpus:** Similar to file `corpus.txt` in the `test` folder, jLDADMM assumes that each line in the input corpus represents a document. Here, a document is a sequence of words/tokens separated by white space characters. The users should preprocess the input corpus before training the LDA or DMM topic models, for example: down-casing, removing non-alphabetic characters and stop-words, removing words shorter than 3 characters and words appearing less than a certain times. 25 | 26 | **Now, we can train LDA or DMM by executing:** 27 | 28 | $ java [-Xmx1G] -jar jar/jLDADMM.jar –model -corpus [-ntopics ] [-alpha ] [-beta ] [-niters ] [-twords ] [-name ] [-sstep ] [-seed ] 29 | 30 | where parameters in [ ] are optional. 31 | 32 | `-model`: Specify the topic model LDA or DMM 33 | 34 | `-corpus`: Specify the path to the input corpus file. 35 | 36 | `-ntopics `: Specify the number of topics. The default value is 20. 37 | 38 | `-alpha `: Specify the hyper-parameter `alpha`. Following [6, 8], the default `alpha` value is 0.1. 39 | 40 | `-beta `: Specify the hyper-parameter `beta`. The default `beta` value is 0.01 which is a common setting in literature [5]. Following [6], the users may consider to the `beta` value of 0.1 for short texts. 41 | 42 | `-niters `: Specify the number of Gibbs sampling iterations. The default value is 2000. 43 | 44 | `-twords `: Specify the number of the most probable topical words. The default value is 20. 45 | 46 | `-name `: Specify a name to the topic modeling experiment. The default value is `model`. 47 | 48 | `-sstep `: Specify a step to save the sampling outputs. The default value is 0 (i.e. only saving the output from the last sample). 49 | 50 | `-seed `: Specify the random _seed_ for the Gibbs sampler. Default is 0, which will use the clock. 51 | 52 | **Examples:** 53 | 54 | $ java -jar jar/jLDADMM.jar -model LDA -corpus test/corpus.txt -name testLDA 55 | 56 | The output files are saved in the same folder containing the input corpus file, in this case in the `test` folder. We have output files of `testLDA.theta`, `testLDA.phi`, `testLDA.topWords`, `testLDA.topicAssignments` and `testLDA.paras` referring to the document-to-topic distributions, topic-to-word distributions, top topical words, topic assignments and model parameters, respectively. Similarly, we perform: 57 | 58 | $ java -jar jar/jLDADMM.jar -model DMM -corpus test/corpus.txt -beta 0.1 -name testDMM 59 | 60 | Output files `testDMM.theta`, `testDMM.phi`, `testDMM.topWords`, `testDMM.topicAssignments` and `testDMM.paras` are also in the `test` folder. 61 | 62 | ### Using jLDADMM for document clustering evaluation 63 | 64 | Here, we treat each topic as a cluster, and we assign every document the topic with the highest probability given the document [8]. To get the Purity and NMI clustering scores, we perform: 65 | 66 | $ java –jar jar/jLDADMM.jar –model Eval –label -dir -prob 67 | 68 | `–label`: Specify the path to the ground truth label file. Each line in this label file contains the golden label of the corresponding document in the input corpus. See files `corpus.LABEL` and `corpus.txt` in the `test` folder. 69 | 70 | `-dir`: Specify the path to the directory containing document-to-topic distribution files. 71 | 72 | `-prob`: Specify a document-to-topic distribution file OR a group of document-to-topic distribution files in the specified directory. 73 | 74 | **Examples:** 75 | 76 | $ java -jar jar/jLDADMM.jar -model Eval -label test/corpus.LABEL -dir test -prob testLDA.theta 77 | 78 | $ java -jar jar/jLDADMM.jar -model Eval -label test/corpus.LABEL -dir test -prob testDMM.theta 79 | 80 | The above commands will produce the clustering scores for files `testLDA.theta` and `testDMM.theta` in the `test` folder, separately. The following command 81 | 82 | $ java -jar jar/jLDADMM.jar -model Eval -label test/corpus.LABEL -dir test -prob theta 83 | 84 | will produce the clustering scores for all document-to-topic distribution files with their names ending in `theta`. In this case, they are are `testLDA.theta` and `testDMM.theta`. The command also provides the mean and standard deviation of the clustering scores. 85 | 86 | To improve evaluation scores, the users might consider using [latent feature topic models LF-LDA and LF-DMM](https://github.com/datquocnguyen/LFTM) [3], which extend the LDA and DMM topic models with word embeddings. 87 | 88 | ### Topic inference on new/unseen corpus 89 | 90 | To infer topics on a new/unseen corpus using a pre-trained LDA/DMM topic model, we perform: 91 | 92 | $ java -jar jar/jLDADMM.jar -model -paras -corpus [-niters ] [-twords ] [-name ] [-sstep ] [-seed ] 93 | 94 | * `-paras`: Specify the path to the hyper-parameter file produced by the pre-trained LDA/DMM topic model. 95 | 96 | Examples: 97 | 98 | $ java -jar jar/jLDADMM.jar -model LDAinf -paras test/testLDA.paras -corpus test/unseenTest.txt -niters 100 -name testLDAinf 99 | 100 | $ java -jar jar/jLDADMM.jar -model DMMinf -paras test/testDMM.paras -corpus test/unseenTest.txt -niters 100 -name testDMMinf 101 | 102 | ### References 103 | 104 | [1] David M. Blei, Andrew Y. Ng, and Michael I. Jordan. 2003. Latent Dirichlet Allocation. Journal of Machine Learning Research, 3:993–1022. 105 | 106 | [2] David M. Blei. 2012. Probabilistic Topic Models. Communications of the ACM, 55(4):77–84. 107 | 108 | [3] Dat Quoc Nguyen, Richard Billingsley, Lan Du and Mark Johnson. 2015. [Improving Topic Models with Latent Feature Word Representations](https://tacl2013.cs.columbia.edu/ojs/index.php/tacl/article/view/582/158). Transactions of the Association for Computational Linguistics, vol. 3, pp. 299-313. [[CODE]](https://github.com/datquocnguyen/LFTM) 109 | 110 | [4] Kamal Nigam, AK McCallum, S Thrun, and T Mitchell. 2000. Text Classification from Labeled and Unlabeled Documents Using EM. Machine learning, 39:103– 134. 111 | 112 | [5] Thomas L. Griffiths and Mark Steyvers. 2004. Finding scientific topics. Proceedings of the National Academy of Sciences of the United States of America, 101(Suppl 1):5228–5235. 113 | 114 | [6] Jianhua Yin and Jianyong Wang. 2014. A Dirichlet Multinomial Mixture Model-based Approach for Short Text Clustering. In Proceedings of the 20th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pages 233–242. 115 | 116 | [7] Christopher D. Manning, Prabhakar Raghavan, and Hinrich Sch¨utze. 2008. Introduction to Information Retrieval. Cambridge University Press. 117 | 118 | [8] Yue Lu, Qiaozhu Mei, and ChengXiang Zhai. 2011. Investigating task performance of probabilistic topic models: an empirical study of PLSA and LDA. Information Retrieval, 14:178–203. 119 | -------------------------------------------------------------------------------- /test/corpus.txt: -------------------------------------------------------------------------------- 1 | iphone crack iphone 2 | adding support iphone announced 3 | youtube video guy siri pretty love 4 | rim made easy switch iphone yeah 5 | realized ios 6 | current blackberry user bit disappointed move android iphone 7 | things siri sooo glad gave siri sense humor 8 | great personal event tonight store 9 | companies experience customer service 10 | apply job hope call lol 11 | lmao siri find hide body 12 | registered developer appreciated 13 | wow great deals ipad gen offers great deals gen ipads 14 | learning trip hong kong gotta hand iphones apps 15 | dark side hey send free iphone publicly burn blackberry 16 | find mac air 17 | macbook keyboard lunch break today warranty 18 | ipads replace 19 | siri amazing 20 | amazing ios feature 21 | reply featured education apps website today sweet 22 | reply useless days 23 | iphone yesterday awesome amount info 24 | question brother iphone 25 | people iphone phone happy 26 | ceo points ios 27 | bus iphone 28 | umber appstore itunes mobile devices talking desktop application 29 | bring ipad ipad set red red ipad 30 | sells million iphone weekend steve jobs lives iphone 31 | apologize 32 | downloads ios users 33 | lmfao argument siri 34 | incredible million iphone screenshot days iphone iphone 35 | fixed ios battery drain problem replacement iphone working 36 | brand macbook professional macbook years miss time 37 | siri dad mom brother girlfriend 38 | store amazing call waiting music 39 | sweet replaced 40 | bad sells million iphones debut weekend smartphone 41 | loving technology iphone mac air icloud technology 42 | loving ios update 43 | mention store great customer service store 44 | time iphone forward man longer paying texts 45 | girlfriend iphone great 46 | icloud set works cloud 47 | mommy totally email company great service store 48 | loving ios upgrade iphone 49 | ios ipad 50 | making switch android iphone iphone smartphone store 51 | incredible people offering water macbook professional wow 52 | macbook sick 53 | play man loving camera iphone facebook 54 | yeah ios changed life 55 | reader worldwide web 56 | love service case hand case 57 | years jobs iphone iphone iphone 58 | blackberry years lost service moving iphone 59 | sells million iphones days 60 | weekend iphone 61 | macbook professional year time selling android 62 | post card 63 | putting kind glad hear alive 64 | god youtube bad ass system loving 65 | days iphone nice gave 66 | ios email lock screen opening unlocking 67 | word wow iphone weekend sales top million 68 | love ios easter eggs pull middle top bottom pulls awesome feature ios 69 | love ios easter eggs pull middle top bottom pulls awesome feature 70 | run beautiful morning man love ios iphone 71 | simply 72 | made happy text lol text 73 | day great customer service received today phone phone 74 | loving ipod update 75 | upgraded iphone siri worth upgrade forward siri 76 | great world missed 77 | loving iphone ios 78 | love 79 | iphone great genius 80 | cards application card arrived local post office today 81 | iphone siri 82 | meet siri iphone click link 83 | work feel worst 84 | ios upgrade good luck blackberry 85 | loving ios awesome 86 | iphone addicted club 87 | guy playing facetime watching game bar 88 | blackberry boo powered technology work 89 | ipod time iphone good job guys 90 | siri year lead lost 91 | ios sweet notifications phone search covers mail wifi sync icloud backup integrated 92 | great james story today times retail success 93 | world due ios guys 94 | impressive service genius bar metro center power replaced free screen replacement free 95 | nice guy store replaced phone showed crack screen 96 | iphone battery longer day happened edge iphone nice job 97 | minutes write blackberry showing 98 | eye phone impressed 99 | iphone space amazing products people things 100 | making ipad feel ios 101 | nexus good feel bit guess android users android 102 | nice game helps search 103 | nice game helps search facebook 104 | build website website free 105 | android ics pretty good worth 106 | android ice cream sandwich nexus android nexus 107 | exciting day ice cream sandwich day android 108 | wow nexus beautiful totally gonna market share smart phone market 109 | integrated data usage manager brilliant design watching lol 110 | ice cream sandwich android works htc desire 111 | ice cream sandwich sounds android ice cream sandwich 112 | amazing imo android missing 113 | forget phone nice feature android nexus 114 | finally unveiled android ice cream sandwich good 115 | finally searches logged users 116 | rim strategy released hours release ics 117 | man love galaxy nexus samsung android 118 | doubt 119 | share winning war 120 | dear galaxy nexus send email technology 121 | telegraph reports biggest threat facebook power users 122 | samsung made bad android king 123 | facebook power users telegraph socialmedia 124 | impressed android update good font design 125 | video wallet wow 126 | tweet remember spell straight 127 | android samsung nexus 128 | efficient fun releases infinite digital bookcase 129 | pass social seo facebook 130 | ice cream sandwich stop carriers bullying smartphone users android 131 | agree freaking awesome 132 | icecream great 133 | helps 134 | samsung galaxy nexus iphone 135 | ice cream sandwich delicious iphone launches android aka 136 | loving 137 | samsung push mobile experience forward 138 | finally power volume screenshot ics 139 | nexus press conference slick 140 | high school appreciated 141 | scream scream scream android job major game mobile space 142 | thinking ahead 143 | venturebeat virtual bookcase sharing 144 | android phone keeping iphone 145 | android ice cream sandwich feature closer roboto type face read 146 | work samsung android ics impressive 147 | add profile webgl project add addthis 148 | work company work 149 | invention 150 | wait ice cream sandwich android 151 | stop nexus 152 | phone 153 | android device updated galaxy nexus 154 | android introducing ice cream sandwich delicious version android ics 155 | excited android features android ics 156 | wait nexus play 157 | check video introducing galaxy nexus simple beautiful smart youtube android nexus 158 | cream ice cream phone job 159 | great small businesses platform features thoughts 160 | loves presentations tool docs adding video 161 | brilliant webgl bookcase 162 | searches things 163 | android ice cream introducing galaxy nexus simple beautiful smart 164 | nexus prime android 165 | interesting bookcase venturebeat releases infinite digital bookcase 166 | good finally focus user experience android 167 | ics awesome phone android motorola 168 | iphone ice cream sandwich android 169 | nexus line smart move 170 | android beam alright made team team android 171 | android reply font good start ics 172 | ice cream sandwich face unlock works 173 | ready ice cream sandwich ics nexus android android 174 | ice cream sandwich android 175 | taste ice cream sandwich bite 176 | samsung event live blog gadget haven android 177 | android ice cream sandwich make smartphone operating systems 178 | photo sharing people application ice cream sandwich imo ics 179 | android nexus phone makes iphone cheap store android 180 | sweet ice cream sandwich android ice cream sandwich officially ics 181 | raise hand android powered phone samsung 182 | siri android device replace iphone 183 | nexus page live nexus android 184 | excited android beam face unlock android ics 185 | linkedin tools company page contact 186 | samsung ice cream sandwich samsung 187 | introducing galaxy nexus simple beautiful smart android ics samsung 188 | glad design android shows waiting 189 | thoughts android ics excited play features android 190 | register galaxy nexus android 191 | wow webgl infinite bookcase 192 | ics awesome wait face unlock android 193 | gotta pretty android chrome android 194 | november direct purchase samsung 195 | nexus wanna awesome 196 | event time change android samsung 197 | ios user ics awesome great job 198 | yeah great job ics 199 | literally mind blown samsung 200 | motorola verizon perfect 201 | opens door spanish entrepreneurs project 202 | intel ibm 203 | windows phone mango update process ahead schedule mango 204 | back smartphone rich 205 | word works computer 206 | free gen stores 207 | watch codename data explorer ctp coming 208 | lunch today vslive 209 | watch codename data explorer ctp coming month 210 | details search improvements windows start screen 211 | mango shows taste smartphone success mango 212 | awesome moving dev finally local 213 | stores offer free windows phone devices 214 | stores offer free windows phone devices neowin 215 | store spend hard vslive 216 | free west check 217 | hey parents free tools kids online live family 218 | cloud offers students free access improve tcn 219 | awesome bit 220 | details windows search improvements 221 | yeah taking metro yeah good android 222 | love kids tech 223 | explains improvements windows start screen search tech 224 | search idea search great 225 | bing king search search 226 | powerpoint users power create service bye solutions 227 | future information innovators nov info 228 | curate personal history project greenwich month 229 | beam research project 230 | great sql server session 231 | works days 232 | ballmer thinks computer scientist android tech agree 233 | great time 234 | win server works fine vmware 235 | wow tech turns body touchscreen psfk 236 | love love feeling building vslive bringing conference 237 | research shows awesome step closer bit kinect 238 | research shows science science fact cool sound 239 | research shows science science fact 240 | zune music canada music news 241 | kinect makes learning playful education 242 | mango 243 | check change world 244 | good world wait 245 | watching windows pretty impressive finally mac interesting battle store 246 | xbox share 247 | god 248 | blog post cool tool mouse tools 249 | forget siri beating speech commands mango siri 250 | tests proves appsense enterprise capability users personalization database enterprise 251 | software good points sap dynamics 252 | good dev 253 | secure anti 254 | impressed creating images 255 | mac blown marketing 256 | yahoo sale years back bought glad deal year 257 | omnitouch impressive technology 258 | good bing paying 259 | ipads windows tablets study 260 | home day great time 261 | mango shows taste smartphone success 262 | picture services cloud love 263 | windows net dev 264 | nice talk community 265 | omg sharepoint working 266 | innovation sad sad 267 | office love genius 268 | love gates foundation 269 | good 270 | skype family amazing things 271 | absolutely loving mouse 272 | fan cool video turn surface touchscreen 273 | wow android ics lots talk mango launch people public speaking 274 | updated computer windows 275 | ics android kill mango nokia 276 | people names mail week 277 | outlook mac sucks hate 278 | xbox accounts hack reports 279 | update net 280 | windows media center fail 281 | eclipsed 282 | word upgrade doc doc word won open doc suck 283 | u.s. antitrust leaving business played dumb 284 | lync crash issue mac fixed 285 | broke played engages racketeering calls respect 286 | nokia chief executive mole 287 | frozen xbox live xbl accounts online games report hacked 288 | gave windows dev preview good waiting beta windows 289 | powerpoint fix powerpoint presentations 290 | eclipsed guardian 291 | kind search 292 | great time family advertising 293 | windows forget past antitrust issues 294 | paying make racketeering 295 | day talking talk tomorrow waiting 296 | reader compares albatross neck agree join 297 | lot word freeze minutes 298 | lol perfect simple hate windows phones 299 | months months lose 300 | reader compares albatross neck agree join discussion 301 | make sleep plan 302 | feel world put facebook blackberry helps 303 | miss boo 304 | everytime leave back back telling lol 305 | application ass theme 306 | sleep sleep 307 | starting sending hashtags emails taking lives 308 | shit lol hell 309 | today introduced social media love 310 | facebook 311 | yeah shows glad 312 | pretty facebook 313 | gotta love shit round world speed 314 | bed gonna minute bed 315 | dear fucking missed today internet 316 | tweet keeping busy school 317 | good thing people left social 318 | social media 319 | guess addicted university exam questions 320 | good thing people left social side 321 | apples facebook content 322 | bed favorite application facebook 323 | facebook change makes excited privacy 324 | impressive numbers smm socialmedia 325 | fuck facebook bullshit bitch 326 | cool love 327 | fuck facebook follow 328 | haven shit man haven fun 329 | find song end television show watched 330 | literally back facebook text email technology good 331 | isn pretty damn amazing hope year fast 332 | dear missed promise touch 333 | bored sad mad happy true friend 334 | facebook sucks amp 335 | shit funny haven shit day 336 | voice people real life lol 337 | yeah time bug 338 | science hashtags facebook 339 | feeling real world 340 | biggest 341 | facebook messed make add reliable 342 | freaking kidding wth 343 | tomorrow blue ass bird continued 344 | dead 345 | emails telling 346 | sucks follow 347 | people reporting retweets working technical problem 348 | back lol 349 | retweets broken haven tuesday 350 | tomorrow blue ass bird ass 351 | ain showing current mentions tweets 352 | gonna problems fixed asap 353 | retweets 354 | man boring 355 | application show touch tweet 356 | trouble application updating application 357 | messed everytime text message 358 | show fucking retweets bitch 359 | sooo trash 360 | showing retweets shit 361 | mom argument pretty 362 | addicted care 363 | appreciated start working computer 364 | retweets section account working hours problem 365 | good send bloody tweets 366 | feel 367 | make account 368 | fucking late damn 369 | dear fix shit retweets mentions 370 | dead fuck 371 | point 372 | giving tweets tweeted past days lol 373 | messed followers numbers 374 | timeline mentions shit 375 | garbage 376 | hell television man 377 | stupid fucking give damn mentions ugh 378 | fucking 379 | facebook television wanna study 380 | show retweets ill back facebook 381 | reply opinions 382 | forget day time haven 383 | blogs tumblr 384 | talk step game 385 | reminder fail 386 | join follow 387 | ways competition 388 | people facebook day life 389 | drop follow show love 390 | telling reply 391 | sleep time 392 | emotions 393 | call night 394 | work break time yeah 395 | tumblr love 396 | age year days hours minutes seconds find 397 | wanna aye shit living 398 | shout favorite people happy girls 399 | follow back 400 | sleep good people trip 401 | -------------------------------------------------------------------------------- /src/models/GibbsSamplingLDA_Inf.java: -------------------------------------------------------------------------------- 1 | package models; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.BufferedWriter; 5 | import java.io.FileReader; 6 | import java.io.FileWriter; 7 | import java.io.IOException; 8 | import java.util.ArrayList; 9 | import java.util.HashMap; 10 | import java.util.List; 11 | import java.util.Map; 12 | import java.util.Set; 13 | import java.util.TreeMap; 14 | 15 | import utility.FuncUtils; 16 | 17 | /** 18 | * jLDADMM: A Java package for the LDA and DMM topic models 19 | * 20 | * http://jldadmm.sourceforge.net/ 21 | * 22 | * @author: Dat Quoc Nguyen 23 | * 24 | */ 25 | 26 | public class GibbsSamplingLDA_Inf 27 | { 28 | public double alpha; // Hyper-parameter alpha 29 | public double beta; // Hyper-parameter alpha 30 | public int numTopics; // Number of topics 31 | public int numIterations; // Number of Gibbs sampling iterations 32 | public int topWords; // Number of most probable words for each topic 33 | 34 | public double alphaSum; // alpha * numTopics 35 | public double betaSum; // beta * vocabularySize 36 | 37 | public List> corpus; // Word ID-based corpus 38 | public List> topicAssignments; // Topics assignments for words 39 | // in the corpus 40 | public int numDocuments; // Number of documents in the corpus 41 | public int numWordsInCorpus; // Number of words in the corpus 42 | 43 | public HashMap word2IdVocabulary; // Vocabulary to get ID 44 | // given a word 45 | public HashMap id2WordVocabulary; // Vocabulary to get word 46 | // given an ID 47 | public int vocabularySize; // The number of word types in the corpus 48 | 49 | // numDocuments * numTopics matrix 50 | // Given a document: number of its words assigned to each topic 51 | public int[][] docTopicCount; 52 | // Number of words in every document 53 | public int[] sumDocTopicCount; 54 | // numTopics * vocabularySize matrix 55 | // Given a topic: number of times a word type assigned to the topic 56 | public int[][] topicWordCount; 57 | // Total number of words assigned to a topic 58 | public int[] sumTopicWordCount; 59 | 60 | // Double array used to sample a topic 61 | public double[] multiPros; 62 | 63 | // Path to the directory containing the corpus 64 | public String folderPath; 65 | // Path to the topic modeling corpus 66 | public String corpusPath; 67 | 68 | public String expName = "LDAinf"; 69 | public String orgExpName = "LDAinf"; 70 | public String tAssignsFilePath = ""; 71 | public int savestep = 0; 72 | 73 | public GibbsSamplingLDA_Inf(String pathToTrainingParasFile, 74 | String pathToUnseenCorpus, int inNumIterations, int inTopWords, 75 | String inExpName, int inSaveStep) 76 | throws Exception 77 | { 78 | HashMap paras = parseTrainingParasFile(pathToTrainingParasFile); 79 | if (!paras.get("-model").equals("LDA")) { 80 | throw new Exception("Wrong pre-trained model!!!"); 81 | } 82 | alpha = new Double(paras.get("-alpha")); 83 | beta = new Double(paras.get("-beta")); 84 | numTopics = new Integer(paras.get("-ntopics")); 85 | 86 | numIterations = inNumIterations; 87 | topWords = inTopWords; 88 | savestep = inSaveStep; 89 | expName = inExpName; 90 | orgExpName = expName; 91 | 92 | String trainingCorpus = paras.get("-corpus"); 93 | String trainingCorpusfolder = trainingCorpus.substring( 94 | 0, 95 | Math.max(trainingCorpus.lastIndexOf("/"), 96 | trainingCorpus.lastIndexOf("\\")) + 1); 97 | String topicAssignment4TrainFile = trainingCorpusfolder 98 | + paras.get("-name") + ".topicAssignments"; 99 | 100 | word2IdVocabulary = new HashMap(); 101 | id2WordVocabulary = new HashMap(); 102 | initializeWordCount(trainingCorpus, topicAssignment4TrainFile); 103 | 104 | corpusPath = pathToUnseenCorpus; 105 | folderPath = pathToUnseenCorpus.substring( 106 | 0, 107 | Math.max(pathToUnseenCorpus.lastIndexOf("/"), 108 | pathToUnseenCorpus.lastIndexOf("\\")) + 1); 109 | System.out.println("Reading unseen corpus: " + pathToUnseenCorpus); 110 | corpus = new ArrayList>(); 111 | numDocuments = 0; 112 | numWordsInCorpus = 0; 113 | 114 | BufferedReader br = null; 115 | try { 116 | br = new BufferedReader(new FileReader(pathToUnseenCorpus)); 117 | for (String doc; (doc = br.readLine()) != null;) { 118 | 119 | if (doc.trim().length() == 0) 120 | continue; 121 | 122 | String[] words = doc.trim().split("\\s+"); 123 | List document = new ArrayList(); 124 | 125 | for (String word : words) { 126 | if (word2IdVocabulary.containsKey(word)) { 127 | document.add(word2IdVocabulary.get(word)); 128 | } 129 | else { 130 | // Skip this unknown-word 131 | } 132 | } 133 | numDocuments++; 134 | numWordsInCorpus += document.size(); 135 | corpus.add(document); 136 | } 137 | } 138 | catch (Exception e) { 139 | e.printStackTrace(); 140 | return; 141 | } 142 | 143 | docTopicCount = new int[numDocuments][numTopics]; 144 | sumDocTopicCount = new int[numDocuments]; 145 | multiPros = new double[numTopics]; 146 | for (int i = 0; i < numTopics; i++) { 147 | multiPros[i] = 1.0 / numTopics; 148 | } 149 | 150 | alphaSum = numTopics * alpha; 151 | betaSum = vocabularySize * beta; 152 | 153 | System.out.println("Corpus size: " + numDocuments + " docs, " 154 | + numWordsInCorpus + " words"); 155 | System.out.println("Vocabuary size: " + vocabularySize); 156 | System.out.println("Number of topics: " + numTopics); 157 | System.out.println("alpha: " + alpha); 158 | System.out.println("beta: " + beta); 159 | System.out.println("Number of sampling iterations: " + numIterations); 160 | System.out.println("Number of top topical words: " + topWords); 161 | 162 | initialize(); 163 | } 164 | 165 | private HashMap parseTrainingParasFile( 166 | String pathToTrainingParasFile) 167 | throws Exception 168 | { 169 | HashMap paras = new HashMap(); 170 | BufferedReader br = null; 171 | try { 172 | br = new BufferedReader(new FileReader(pathToTrainingParasFile)); 173 | for (String line; (line = br.readLine()) != null;) { 174 | 175 | if (line.trim().length() == 0) 176 | continue; 177 | 178 | String[] paraOptions = line.trim().split("\\s+"); 179 | paras.put(paraOptions[0], paraOptions[1]); 180 | } 181 | } 182 | catch (Exception e) { 183 | e.printStackTrace(); 184 | } 185 | return paras; 186 | } 187 | 188 | private void initializeWordCount(String pathToTrainingCorpus, 189 | String pathToTopicAssignmentFile) 190 | { 191 | System.out.println("Loading pre-trained model..."); 192 | List> trainCorpus = new ArrayList>(); 193 | BufferedReader br = null; 194 | try { 195 | int indexWord = -1; 196 | br = new BufferedReader(new FileReader(pathToTrainingCorpus)); 197 | for (String doc; (doc = br.readLine()) != null;) { 198 | 199 | if (doc.trim().length() == 0) 200 | continue; 201 | 202 | String[] words = doc.trim().split("\\s+"); 203 | List document = new ArrayList(); 204 | 205 | for (String word : words) { 206 | if (word2IdVocabulary.containsKey(word)) { 207 | document.add(word2IdVocabulary.get(word)); 208 | } 209 | else { 210 | indexWord += 1; 211 | word2IdVocabulary.put(word, indexWord); 212 | id2WordVocabulary.put(indexWord, word); 213 | document.add(indexWord); 214 | } 215 | } 216 | trainCorpus.add(document); 217 | } 218 | } 219 | catch (Exception e) { 220 | e.printStackTrace(); 221 | } 222 | 223 | vocabularySize = word2IdVocabulary.size(); 224 | topicWordCount = new int[numTopics][vocabularySize]; 225 | sumTopicWordCount = new int[numTopics]; 226 | 227 | try { 228 | br = new BufferedReader(new FileReader(pathToTopicAssignmentFile)); 229 | int docId = 0; 230 | for (String line; (line = br.readLine()) != null;) { 231 | String[] strTopics = line.trim().split("\\s+"); 232 | for (int j = 0; j < strTopics.length; j++) { 233 | int wordId = trainCorpus.get(docId).get(j); 234 | int topic = new Integer(strTopics[j]); 235 | topicWordCount[topic][wordId] += 1; 236 | sumTopicWordCount[topic] += 1; 237 | } 238 | docId++; 239 | } 240 | } 241 | catch (Exception e) { 242 | e.printStackTrace(); 243 | } 244 | } 245 | 246 | /** 247 | * Randomly initialize topic assignments 248 | */ 249 | public void initialize() 250 | throws IOException 251 | { 252 | System.out.println("Randomly initializing topic assignments ..."); 253 | 254 | topicAssignments = new ArrayList>(); 255 | 256 | for (int i = 0; i < numDocuments; i++) { 257 | List topics = new ArrayList(); 258 | int docSize = corpus.get(i).size(); 259 | for (int j = 0; j < docSize; j++) { 260 | int topic = FuncUtils.nextDiscrete(multiPros); // Sample a topic 261 | // Increase counts 262 | docTopicCount[i][topic] += 1; 263 | topicWordCount[topic][corpus.get(i).get(j)] += 1; 264 | sumDocTopicCount[i] += 1; 265 | sumTopicWordCount[topic] += 1; 266 | 267 | topics.add(topic); 268 | } 269 | topicAssignments.add(topics); 270 | } 271 | } 272 | 273 | public void inference() 274 | throws IOException 275 | { 276 | writeParameters(); 277 | writeDictionary(); 278 | 279 | System.out.println("Running Gibbs sampling inference: "); 280 | 281 | for (int iter = 1; iter <= numIterations; iter++) { 282 | 283 | System.out.println("\tSampling iteration: " + (iter)); 284 | // System.out.println("\t\tPerplexity: " + computePerplexity()); 285 | 286 | sampleInSingleIteration(); 287 | 288 | if ((savestep > 0) && (iter % savestep == 0) 289 | && (iter < numIterations)) { 290 | System.out.println("\t\tSaving the output from the " + iter 291 | + "^{th} sample"); 292 | expName = orgExpName + "-" + iter; 293 | write(); 294 | } 295 | } 296 | expName = orgExpName; 297 | 298 | System.out.println("Writing output from the last sample ..."); 299 | write(); 300 | 301 | System.out.println("Sampling completed!"); 302 | 303 | } 304 | 305 | public void sampleInSingleIteration() 306 | { 307 | for (int dIndex = 0; dIndex < numDocuments; dIndex++) { 308 | int docSize = corpus.get(dIndex).size(); 309 | for (int wIndex = 0; wIndex < docSize; wIndex++) { 310 | // Get current word and its topic 311 | int topic = topicAssignments.get(dIndex).get(wIndex); 312 | int word = corpus.get(dIndex).get(wIndex); 313 | 314 | // Decrease counts 315 | docTopicCount[dIndex][topic] -= 1; 316 | // docTopicSum[dIndex] -= 1; 317 | topicWordCount[topic][word] -= 1; 318 | sumTopicWordCount[topic] -= 1; 319 | 320 | // Sample a topic 321 | for (int tIndex = 0; tIndex < numTopics; tIndex++) { 322 | multiPros[tIndex] = (docTopicCount[dIndex][tIndex] + alpha) 323 | * ((topicWordCount[tIndex][word] + beta) / (sumTopicWordCount[tIndex] + betaSum)); 324 | // multiPros[tIndex] = ((docTopicCount[dIndex][tIndex] + 325 | // alpha) / 326 | // (docTopicSum[dIndex] + alphaSum)) 327 | // * ((topicWordCount[tIndex][word] + beta) / 328 | // (topicWordSum[tIndex] + betaSum)); 329 | } 330 | topic = FuncUtils.nextDiscrete(multiPros); 331 | 332 | // Increase counts 333 | docTopicCount[dIndex][topic] += 1; 334 | // docTopicSum[dIndex] += 1; 335 | topicWordCount[topic][word] += 1; 336 | sumTopicWordCount[topic] += 1; 337 | 338 | // Update topic assignments 339 | topicAssignments.get(dIndex).set(wIndex, topic); 340 | } 341 | } 342 | } 343 | 344 | public void writeParameters() 345 | throws IOException 346 | { 347 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 348 | + expName + ".paras")); 349 | writer.write("-model" + "\t" + "LDA"); 350 | writer.write("\n-corpus" + "\t" + corpusPath); 351 | writer.write("\n-ntopics" + "\t" + numTopics); 352 | writer.write("\n-alpha" + "\t" + alpha); 353 | writer.write("\n-beta" + "\t" + beta); 354 | writer.write("\n-niters" + "\t" + numIterations); 355 | writer.write("\n-twords" + "\t" + topWords); 356 | writer.write("\n-name" + "\t" + expName); 357 | if (tAssignsFilePath.length() > 0) 358 | writer.write("\n-initFile" + "\t" + tAssignsFilePath); 359 | if (savestep > 0) 360 | writer.write("\n-sstep" + "\t" + savestep); 361 | 362 | writer.close(); 363 | } 364 | 365 | public void writeDictionary() 366 | throws IOException 367 | { 368 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 369 | + expName + ".vocabulary")); 370 | for (int id = 0; id < vocabularySize; id++) 371 | writer.write(id2WordVocabulary.get(id) + " " + id + "\n"); 372 | writer.close(); 373 | } 374 | 375 | public void writeIDbasedCorpus() 376 | throws IOException 377 | { 378 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 379 | + expName + ".IDcorpus")); 380 | for (int dIndex = 0; dIndex < numDocuments; dIndex++) { 381 | int docSize = corpus.get(dIndex).size(); 382 | for (int wIndex = 0; wIndex < docSize; wIndex++) { 383 | writer.write(corpus.get(dIndex).get(wIndex) + " "); 384 | } 385 | writer.write("\n"); 386 | } 387 | writer.close(); 388 | } 389 | 390 | public void writeTopicAssignments() 391 | throws IOException 392 | { 393 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 394 | + expName + ".topicAssignments")); 395 | for (int dIndex = 0; dIndex < numDocuments; dIndex++) { 396 | int docSize = corpus.get(dIndex).size(); 397 | for (int wIndex = 0; wIndex < docSize; wIndex++) { 398 | writer.write(topicAssignments.get(dIndex).get(wIndex) + " "); 399 | } 400 | writer.write("\n"); 401 | } 402 | writer.close(); 403 | } 404 | 405 | public void writeTopTopicalWords() 406 | throws IOException 407 | { 408 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 409 | + expName + ".topWords")); 410 | 411 | for (int tIndex = 0; tIndex < numTopics; tIndex++) { 412 | writer.write("Topic" + new Integer(tIndex) + ":"); 413 | 414 | Map wordCount = new TreeMap(); 415 | for (int wIndex = 0; wIndex < vocabularySize; wIndex++) { 416 | wordCount.put(wIndex, topicWordCount[tIndex][wIndex]); 417 | } 418 | wordCount = FuncUtils.sortByValueDescending(wordCount); 419 | 420 | Set mostLikelyWords = wordCount.keySet(); 421 | int count = 0; 422 | for (Integer index : mostLikelyWords) { 423 | if (count < topWords) { 424 | double pro = (topicWordCount[tIndex][index] + beta) 425 | / (sumTopicWordCount[tIndex] + betaSum); 426 | pro = Math.round(pro * 1000000.0) / 1000000.0; 427 | writer.write(" " + id2WordVocabulary.get(index) + "(" + pro 428 | + ")"); 429 | count += 1; 430 | } 431 | else { 432 | writer.write("\n\n"); 433 | break; 434 | } 435 | } 436 | } 437 | writer.close(); 438 | } 439 | 440 | public void writeTopicWordPros() 441 | throws IOException 442 | { 443 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 444 | + expName + ".phi")); 445 | for (int i = 0; i < numTopics; i++) { 446 | for (int j = 0; j < vocabularySize; j++) { 447 | double pro = (topicWordCount[i][j] + beta) 448 | / (sumTopicWordCount[i] + betaSum); 449 | writer.write(pro + " "); 450 | } 451 | writer.write("\n"); 452 | } 453 | writer.close(); 454 | } 455 | 456 | public void writeTopicWordCount() 457 | throws IOException 458 | { 459 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 460 | + expName + ".WTcount")); 461 | for (int i = 0; i < numTopics; i++) { 462 | for (int j = 0; j < vocabularySize; j++) { 463 | writer.write(topicWordCount[i][j] + " "); 464 | } 465 | writer.write("\n"); 466 | } 467 | writer.close(); 468 | 469 | } 470 | 471 | public void writeDocTopicPros() 472 | throws IOException 473 | { 474 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 475 | + expName + ".theta")); 476 | for (int i = 0; i < numDocuments; i++) { 477 | for (int j = 0; j < numTopics; j++) { 478 | double pro = (docTopicCount[i][j] + alpha) 479 | / (sumDocTopicCount[i] + alphaSum); 480 | writer.write(pro + " "); 481 | } 482 | writer.write("\n"); 483 | } 484 | writer.close(); 485 | } 486 | 487 | public void writeDocTopicCount() 488 | throws IOException 489 | { 490 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 491 | + expName + ".DTcount")); 492 | for (int i = 0; i < numDocuments; i++) { 493 | for (int j = 0; j < numTopics; j++) { 494 | writer.write(docTopicCount[i][j] + " "); 495 | } 496 | writer.write("\n"); 497 | } 498 | writer.close(); 499 | } 500 | 501 | public void write() 502 | throws IOException 503 | { 504 | writeTopTopicalWords(); 505 | writeDocTopicPros(); 506 | writeTopicAssignments(); 507 | writeTopicWordPros(); 508 | } 509 | 510 | public static void main(String args[]) 511 | throws Exception 512 | { 513 | GibbsSamplingLDA_Inf lda = new GibbsSamplingLDA_Inf( 514 | "test/testLDA.paras", "test/unseenTest.txt", 100, 20, "testLDAinf", 515 | 0); 516 | lda.inference(); 517 | } 518 | } 519 | -------------------------------------------------------------------------------- /src/models/GibbsSamplingLDA.java: -------------------------------------------------------------------------------- 1 | package models; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.BufferedWriter; 5 | import java.io.FileReader; 6 | import java.io.FileWriter; 7 | import java.io.IOException; 8 | import java.util.ArrayList; 9 | import java.util.HashMap; 10 | import java.util.List; 11 | import java.util.Map; 12 | import java.util.Set; 13 | import java.util.TreeMap; 14 | 15 | import utility.FuncUtils; 16 | 17 | /** 18 | * jLDADMM: A Java package for the LDA and DMM topic models 19 | * 20 | * Implementation of the Latent Dirichlet Allocation topic model, using 21 | * collapsed Gibbs sampling, as described in: 22 | * 23 | * Thomas L. Griffiths and Mark Steyvers. 2004. Finding scientific topics. 24 | * Proceedings of the National Academy of Sciences of the United States of 25 | * America, 101(Suppl 1):5228–5235. 26 | * 27 | * @author: Dat Quoc Nguyen 28 | */ 29 | 30 | public class GibbsSamplingLDA 31 | { 32 | public double alpha; // Hyper-parameter alpha 33 | public double beta; // Hyper-parameter alpha 34 | public int numTopics; // Number of topics 35 | public int numIterations; // Number of Gibbs sampling iterations 36 | public int topWords; // Number of most probable words for each topic 37 | 38 | public double alphaSum; // alpha * numTopics 39 | public double betaSum; // beta * vocabularySize 40 | 41 | public List> corpus; // Word ID-based corpus 42 | public List> topicAssignments; // Topics assignments for words 43 | // in the corpus 44 | public int numDocuments; // Number of documents in the corpus 45 | public int numWordsInCorpus; // Number of words in the corpus 46 | 47 | public HashMap word2IdVocabulary; // Vocabulary to get ID 48 | // given a word 49 | public HashMap id2WordVocabulary; // Vocabulary to get word 50 | // given an ID 51 | public int vocabularySize; // The number of word types in the corpus 52 | 53 | // numDocuments * numTopics matrix 54 | // Given a document: number of its words assigned to each topic 55 | public int[][] docTopicCount; 56 | // Number of words in every document 57 | public int[] sumDocTopicCount; 58 | // numTopics * vocabularySize matrix 59 | // Given a topic: number of times a word type assigned to the topic 60 | public int[][] topicWordCount; 61 | // Total number of words assigned to a topic 62 | public int[] sumTopicWordCount; 63 | 64 | // Double array used to sample a topic 65 | public double[] multiPros; 66 | 67 | // Path to the directory containing the corpus 68 | public String folderPath; 69 | // Path to the topic modeling corpus 70 | public String corpusPath; 71 | 72 | public String expName = "LDAmodel"; 73 | public String orgExpName = "LDAmodel"; 74 | public String tAssignsFilePath = ""; 75 | public int savestep = 0; 76 | 77 | public GibbsSamplingLDA(String pathToCorpus, int inNumTopics, 78 | double inAlpha, double inBeta, int inNumIterations, int inTopWords) 79 | throws Exception 80 | { 81 | this(pathToCorpus, inNumTopics, inAlpha, inBeta, inNumIterations, 82 | inTopWords, "LDAmodel"); 83 | } 84 | 85 | public GibbsSamplingLDA(String pathToCorpus, int inNumTopics, 86 | double inAlpha, double inBeta, int inNumIterations, int inTopWords, 87 | String inExpName) 88 | throws Exception 89 | { 90 | this(pathToCorpus, inNumTopics, inAlpha, inBeta, inNumIterations, 91 | inTopWords, inExpName, "", 0); 92 | } 93 | 94 | public GibbsSamplingLDA(String pathToCorpus, int inNumTopics, 95 | double inAlpha, double inBeta, int inNumIterations, int inTopWords, 96 | String inExpName, String pathToTAfile) 97 | throws Exception 98 | { 99 | this(pathToCorpus, inNumTopics, inAlpha, inBeta, inNumIterations, 100 | inTopWords, inExpName, pathToTAfile, 0); 101 | } 102 | 103 | public GibbsSamplingLDA(String pathToCorpus, int inNumTopics, 104 | double inAlpha, double inBeta, int inNumIterations, int inTopWords, 105 | String inExpName, int inSaveStep) 106 | throws Exception 107 | { 108 | this(pathToCorpus, inNumTopics, inAlpha, inBeta, inNumIterations, 109 | inTopWords, inExpName, "", inSaveStep); 110 | } 111 | 112 | public GibbsSamplingLDA(String pathToCorpus, int inNumTopics, 113 | double inAlpha, double inBeta, int inNumIterations, int inTopWords, 114 | String inExpName, String pathToTAfile, int inSaveStep) 115 | throws Exception 116 | { 117 | 118 | alpha = inAlpha; 119 | beta = inBeta; 120 | numTopics = inNumTopics; 121 | numIterations = inNumIterations; 122 | topWords = inTopWords; 123 | savestep = inSaveStep; 124 | expName = inExpName; 125 | orgExpName = expName; 126 | corpusPath = pathToCorpus; 127 | folderPath = pathToCorpus.substring( 128 | 0, 129 | Math.max(pathToCorpus.lastIndexOf("/"), 130 | pathToCorpus.lastIndexOf("\\")) + 1); 131 | 132 | System.out.println("Reading topic modeling corpus: " + pathToCorpus); 133 | 134 | word2IdVocabulary = new HashMap(); 135 | id2WordVocabulary = new HashMap(); 136 | corpus = new ArrayList>(); 137 | numDocuments = 0; 138 | numWordsInCorpus = 0; 139 | 140 | BufferedReader br = null; 141 | try { 142 | int indexWord = -1; 143 | br = new BufferedReader(new FileReader(pathToCorpus)); 144 | for (String doc; (doc = br.readLine()) != null;) { 145 | 146 | if (doc.trim().length() == 0) 147 | continue; 148 | 149 | String[] words = doc.trim().split("\\s+"); 150 | List document = new ArrayList(); 151 | 152 | for (String word : words) { 153 | if (word2IdVocabulary.containsKey(word)) { 154 | document.add(word2IdVocabulary.get(word)); 155 | } 156 | else { 157 | indexWord += 1; 158 | word2IdVocabulary.put(word, indexWord); 159 | id2WordVocabulary.put(indexWord, word); 160 | document.add(indexWord); 161 | } 162 | } 163 | 164 | numDocuments++; 165 | numWordsInCorpus += document.size(); 166 | corpus.add(document); 167 | } 168 | } 169 | catch (Exception e) { 170 | e.printStackTrace(); 171 | } 172 | 173 | vocabularySize = word2IdVocabulary.size(); // vocabularySize = indexWord 174 | docTopicCount = new int[numDocuments][numTopics]; 175 | topicWordCount = new int[numTopics][vocabularySize]; 176 | sumDocTopicCount = new int[numDocuments]; 177 | sumTopicWordCount = new int[numTopics]; 178 | 179 | multiPros = new double[numTopics]; 180 | for (int i = 0; i < numTopics; i++) { 181 | multiPros[i] = 1.0 / numTopics; 182 | } 183 | 184 | alphaSum = numTopics * alpha; 185 | betaSum = vocabularySize * beta; 186 | 187 | System.out.println("Corpus size: " + numDocuments + " docs, " 188 | + numWordsInCorpus + " words"); 189 | System.out.println("Vocabuary size: " + vocabularySize); 190 | System.out.println("Number of topics: " + numTopics); 191 | System.out.println("alpha: " + alpha); 192 | System.out.println("beta: " + beta); 193 | System.out.println("Number of sampling iterations: " + numIterations); 194 | System.out.println("Number of top topical words: " + topWords); 195 | 196 | tAssignsFilePath = pathToTAfile; 197 | if (tAssignsFilePath.length() > 0) 198 | initialize(tAssignsFilePath); 199 | else 200 | initialize(); 201 | } 202 | 203 | /** 204 | * Randomly initialize topic assignments 205 | */ 206 | public void initialize() 207 | throws IOException 208 | { 209 | System.out.println("Randomly initializing topic assignments ..."); 210 | 211 | topicAssignments = new ArrayList>(); 212 | 213 | for (int i = 0; i < numDocuments; i++) { 214 | List topics = new ArrayList(); 215 | int docSize = corpus.get(i).size(); 216 | for (int j = 0; j < docSize; j++) { 217 | int topic = FuncUtils.nextDiscrete(multiPros); // Sample a topic 218 | // Increase counts 219 | docTopicCount[i][topic] += 1; 220 | topicWordCount[topic][corpus.get(i).get(j)] += 1; 221 | sumDocTopicCount[i] += 1; 222 | sumTopicWordCount[topic] += 1; 223 | 224 | topics.add(topic); 225 | } 226 | topicAssignments.add(topics); 227 | } 228 | } 229 | 230 | /** 231 | * Initialize topic assignments from a given file 232 | */ 233 | public void initialize(String pathToTopicAssignmentFile) 234 | { 235 | System.out.println("Reading topic-assignment file: " 236 | + pathToTopicAssignmentFile); 237 | 238 | topicAssignments = new ArrayList>(); 239 | 240 | BufferedReader br = null; 241 | try { 242 | br = new BufferedReader(new FileReader(pathToTopicAssignmentFile)); 243 | int docID = 0; 244 | int numWords = 0; 245 | for (String line; (line = br.readLine()) != null;) { 246 | String[] strTopics = line.trim().split("\\s+"); 247 | List topics = new ArrayList(); 248 | for (int j = 0; j < strTopics.length; j++) { 249 | int topic = new Integer(strTopics[j]); 250 | // Increase counts 251 | docTopicCount[docID][topic] += 1; 252 | topicWordCount[topic][corpus.get(docID).get(j)] += 1; 253 | sumDocTopicCount[docID] += 1; 254 | sumTopicWordCount[topic] += 1; 255 | 256 | topics.add(topic); 257 | numWords++; 258 | } 259 | topicAssignments.add(topics); 260 | docID++; 261 | } 262 | 263 | if ((docID != numDocuments) || (numWords != numWordsInCorpus)) { 264 | System.out 265 | .println("The topic modeling corpus and topic assignment file are not consistent!!!"); 266 | throw new Exception(); 267 | } 268 | } 269 | catch (Exception e) { 270 | e.printStackTrace(); 271 | } 272 | } 273 | 274 | public void inference() 275 | throws IOException 276 | { 277 | writeParameters(); 278 | writeDictionary(); 279 | 280 | System.out.println("Running Gibbs sampling inference: "); 281 | 282 | for (int iter = 1; iter <= numIterations; iter++) { 283 | 284 | System.out.println("\tSampling iteration: " + (iter)); 285 | // System.out.println("\t\tPerplexity: " + computePerplexity()); 286 | 287 | sampleInSingleIteration(); 288 | 289 | if ((savestep > 0) && (iter % savestep == 0) 290 | && (iter < numIterations)) { 291 | System.out.println("\t\tSaving the output from the " + iter 292 | + "^{th} sample"); 293 | expName = orgExpName + "-" + iter; 294 | write(); 295 | } 296 | } 297 | expName = orgExpName; 298 | 299 | System.out.println("Writing output from the last sample ..."); 300 | write(); 301 | 302 | System.out.println("Sampling completed!"); 303 | 304 | } 305 | 306 | public void sampleInSingleIteration() 307 | { 308 | for (int dIndex = 0; dIndex < numDocuments; dIndex++) { 309 | int docSize = corpus.get(dIndex).size(); 310 | for (int wIndex = 0; wIndex < docSize; wIndex++) { 311 | // Get current word and its topic 312 | int topic = topicAssignments.get(dIndex).get(wIndex); 313 | int word = corpus.get(dIndex).get(wIndex); 314 | 315 | // Decrease counts 316 | docTopicCount[dIndex][topic] -= 1; 317 | // docTopicSum[dIndex] -= 1; 318 | topicWordCount[topic][word] -= 1; 319 | sumTopicWordCount[topic] -= 1; 320 | 321 | // Sample a topic 322 | for (int tIndex = 0; tIndex < numTopics; tIndex++) { 323 | multiPros[tIndex] = (docTopicCount[dIndex][tIndex] + alpha) 324 | * ((topicWordCount[tIndex][word] + beta) / (sumTopicWordCount[tIndex] + betaSum)); 325 | // multiPros[tIndex] = ((docTopicCount[dIndex][tIndex] + 326 | // alpha) / 327 | // (docTopicSum[dIndex] + alphaSum)) 328 | // * ((topicWordCount[tIndex][word] + beta) / 329 | // (topicWordSum[tIndex] + betaSum)); 330 | } 331 | topic = FuncUtils.nextDiscrete(multiPros); 332 | 333 | // Increase counts 334 | docTopicCount[dIndex][topic] += 1; 335 | // docTopicSum[dIndex] += 1; 336 | topicWordCount[topic][word] += 1; 337 | sumTopicWordCount[topic] += 1; 338 | 339 | // Update topic assignments 340 | topicAssignments.get(dIndex).set(wIndex, topic); 341 | } 342 | } 343 | } 344 | 345 | public void writeParameters() 346 | throws IOException 347 | { 348 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 349 | + expName + ".paras")); 350 | writer.write("-model" + "\t" + "LDA"); 351 | writer.write("\n-corpus" + "\t" + corpusPath); 352 | writer.write("\n-ntopics" + "\t" + numTopics); 353 | writer.write("\n-alpha" + "\t" + alpha); 354 | writer.write("\n-beta" + "\t" + beta); 355 | writer.write("\n-niters" + "\t" + numIterations); 356 | writer.write("\n-twords" + "\t" + topWords); 357 | writer.write("\n-name" + "\t" + expName); 358 | if (tAssignsFilePath.length() > 0) 359 | writer.write("\n-initFile" + "\t" + tAssignsFilePath); 360 | if (savestep > 0) 361 | writer.write("\n-sstep" + "\t" + savestep); 362 | 363 | writer.close(); 364 | } 365 | 366 | public void writeDictionary() 367 | throws IOException 368 | { 369 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 370 | + expName + ".vocabulary")); 371 | for (int id = 0; id < vocabularySize; id++) 372 | writer.write(id2WordVocabulary.get(id) + " " + id + "\n"); 373 | writer.close(); 374 | } 375 | 376 | public void writeIDbasedCorpus() 377 | throws IOException 378 | { 379 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 380 | + expName + ".IDcorpus")); 381 | for (int dIndex = 0; dIndex < numDocuments; dIndex++) { 382 | int docSize = corpus.get(dIndex).size(); 383 | for (int wIndex = 0; wIndex < docSize; wIndex++) { 384 | writer.write(corpus.get(dIndex).get(wIndex) + " "); 385 | } 386 | writer.write("\n"); 387 | } 388 | writer.close(); 389 | } 390 | 391 | public void writeTopicAssignments() 392 | throws IOException 393 | { 394 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 395 | + expName + ".topicAssignments")); 396 | for (int dIndex = 0; dIndex < numDocuments; dIndex++) { 397 | int docSize = corpus.get(dIndex).size(); 398 | for (int wIndex = 0; wIndex < docSize; wIndex++) { 399 | writer.write(topicAssignments.get(dIndex).get(wIndex) + " "); 400 | } 401 | writer.write("\n"); 402 | } 403 | writer.close(); 404 | } 405 | 406 | public void writeTopTopicalWords() 407 | throws IOException 408 | { 409 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 410 | + expName + ".topWords")); 411 | 412 | for (int tIndex = 0; tIndex < numTopics; tIndex++) { 413 | writer.write("Topic" + new Integer(tIndex) + ":"); 414 | 415 | Map wordCount = new TreeMap(); 416 | for (int wIndex = 0; wIndex < vocabularySize; wIndex++) { 417 | wordCount.put(wIndex, topicWordCount[tIndex][wIndex]); 418 | } 419 | wordCount = FuncUtils.sortByValueDescending(wordCount); 420 | 421 | Set mostLikelyWords = wordCount.keySet(); 422 | int count = 0; 423 | for (Integer index : mostLikelyWords) { 424 | if (count < topWords) { 425 | double pro = (topicWordCount[tIndex][index] + beta) 426 | / (sumTopicWordCount[tIndex] + betaSum); 427 | pro = Math.round(pro * 1000000.0) / 1000000.0; 428 | writer.write(" " + id2WordVocabulary.get(index) + "(" + pro 429 | + ")"); 430 | count += 1; 431 | } 432 | else { 433 | writer.write("\n\n"); 434 | break; 435 | } 436 | } 437 | } 438 | writer.close(); 439 | } 440 | 441 | public void writeTopicWordPros() 442 | throws IOException 443 | { 444 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 445 | + expName + ".phi")); 446 | for (int i = 0; i < numTopics; i++) { 447 | for (int j = 0; j < vocabularySize; j++) { 448 | double pro = (topicWordCount[i][j] + beta) 449 | / (sumTopicWordCount[i] + betaSum); 450 | writer.write(pro + " "); 451 | } 452 | writer.write("\n"); 453 | } 454 | writer.close(); 455 | } 456 | 457 | public void writeTopicWordCount() 458 | throws IOException 459 | { 460 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 461 | + expName + ".WTcount")); 462 | for (int i = 0; i < numTopics; i++) { 463 | for (int j = 0; j < vocabularySize; j++) { 464 | writer.write(topicWordCount[i][j] + " "); 465 | } 466 | writer.write("\n"); 467 | } 468 | writer.close(); 469 | 470 | } 471 | 472 | public void writeDocTopicPros() 473 | throws IOException 474 | { 475 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 476 | + expName + ".theta")); 477 | for (int i = 0; i < numDocuments; i++) { 478 | for (int j = 0; j < numTopics; j++) { 479 | double pro = (docTopicCount[i][j] + alpha) 480 | / (sumDocTopicCount[i] + alphaSum); 481 | writer.write(pro + " "); 482 | } 483 | writer.write("\n"); 484 | } 485 | writer.close(); 486 | } 487 | 488 | public void writeDocTopicCount() 489 | throws IOException 490 | { 491 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 492 | + expName + ".DTcount")); 493 | for (int i = 0; i < numDocuments; i++) { 494 | for (int j = 0; j < numTopics; j++) { 495 | writer.write(docTopicCount[i][j] + " "); 496 | } 497 | writer.write("\n"); 498 | } 499 | writer.close(); 500 | } 501 | 502 | public void write() 503 | throws IOException 504 | { 505 | writeTopTopicalWords(); 506 | writeDocTopicPros(); 507 | writeTopicAssignments(); 508 | writeTopicWordPros(); 509 | } 510 | 511 | public static void main(String args[]) 512 | throws Exception 513 | { 514 | GibbsSamplingLDA lda = new GibbsSamplingLDA("test/corpus.txt", 7, 0.1, 515 | 0.01, 2000, 20, "testLDA"); 516 | lda.inference(); 517 | } 518 | } 519 | -------------------------------------------------------------------------------- /src/models/GibbsSamplingDMM_Inf.java: -------------------------------------------------------------------------------- 1 | package models; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.BufferedWriter; 5 | import java.io.FileReader; 6 | import java.io.FileWriter; 7 | import java.io.IOException; 8 | import java.util.ArrayList; 9 | import java.util.HashMap; 10 | import java.util.List; 11 | import java.util.Map; 12 | import java.util.Set; 13 | import java.util.TreeMap; 14 | 15 | import utility.FuncUtils; 16 | 17 | /** 18 | * jLDADMM: A Java package for the LDA and DMM topic models 19 | * 20 | * http://jldadmm.sourceforge.net/ 21 | * 22 | * @author: Dat Quoc Nguyen 23 | * 24 | */ 25 | 26 | public class GibbsSamplingDMM_Inf 27 | { 28 | public double alpha; // Hyper-parameter alpha 29 | public double beta; // Hyper-parameter alpha 30 | public int numTopics; // Number of topics 31 | public int numIterations; // Number of Gibbs sampling iterations 32 | public int topWords; // Number of most probable words for each topic 33 | 34 | public double alphaSum; // alpha * numTopics 35 | public double betaSum; // beta * vocabularySize 36 | 37 | public List> corpus; // Word ID-based corpus 38 | public List topicAssignments; // Topics assignments for documents 39 | public int numDocuments; // Number of documents in the corpus 40 | public int numWordsInCorpus; // Number of words in the corpus 41 | 42 | public HashMap word2IdVocabulary; // Vocabulary to get ID 43 | // given a word 44 | public HashMap id2WordVocabulary; // Vocabulary to get word 45 | // given an ID 46 | public int vocabularySize; // The number of word types in the corpus 47 | 48 | // Number of documents assigned to a topic 49 | public int[] docTopicCount; 50 | // numTopics * vocabularySize matrix 51 | // Given a topic: number of times a word type assigned to the topic 52 | public int[][] topicWordCount; 53 | // Total number of words assigned to a topic 54 | public int[] sumTopicWordCount; 55 | 56 | // Double array used to sample a topic 57 | public double[] multiPros; 58 | 59 | // Path to the directory containing the corpus 60 | public String folderPath; 61 | // Path to the topic modeling corpus 62 | public String corpusPath; 63 | 64 | // Given a document, number of times its i^{th} word appearing from 65 | // the first index to the i^{th}-index in the document 66 | // Example: given a document of "a a b a b c d c". We have: 1 2 1 3 2 1 1 2 67 | public List> occurenceToIndexCount; 68 | 69 | public String expName = "DMMinf"; 70 | public String orgExpName = "DMMinf"; 71 | public String tAssignsFilePath = ""; 72 | public int savestep = 0; 73 | 74 | public GibbsSamplingDMM_Inf(String pathToTrainingParasFile, 75 | String pathToUnseenCorpus, int inNumIterations, int inTopWords, 76 | String inExpName, int inSaveStep) 77 | throws Exception 78 | { 79 | HashMap paras = parseTrainingParasFile(pathToTrainingParasFile); 80 | if (!paras.get("-model").equals("DMM")) { 81 | throw new Exception("Wrong pre-trained model!!!"); 82 | } 83 | alpha = new Double(paras.get("-alpha")); 84 | beta = new Double(paras.get("-beta")); 85 | numTopics = new Integer(paras.get("-ntopics")); 86 | 87 | numIterations = inNumIterations; 88 | topWords = inTopWords; 89 | savestep = inSaveStep; 90 | expName = inExpName; 91 | orgExpName = expName; 92 | 93 | String trainingCorpus = paras.get("-corpus"); 94 | String trainingCorpusfolder = trainingCorpus.substring( 95 | 0, 96 | Math.max(trainingCorpus.lastIndexOf("/"), 97 | trainingCorpus.lastIndexOf("\\")) + 1); 98 | String topicAssignment4TrainFile = trainingCorpusfolder 99 | + paras.get("-name") + ".topicAssignments"; 100 | 101 | word2IdVocabulary = new HashMap(); 102 | id2WordVocabulary = new HashMap(); 103 | initializeWordCount(trainingCorpus, topicAssignment4TrainFile); 104 | 105 | corpusPath = pathToUnseenCorpus; 106 | folderPath = pathToUnseenCorpus.substring( 107 | 0, 108 | Math.max(pathToUnseenCorpus.lastIndexOf("/"), 109 | pathToUnseenCorpus.lastIndexOf("\\")) + 1); 110 | System.out.println("Reading unseen corpus: " + pathToUnseenCorpus); 111 | corpus = new ArrayList>(); 112 | occurenceToIndexCount = new ArrayList>(); 113 | numDocuments = 0; 114 | numWordsInCorpus = 0; 115 | 116 | BufferedReader br = null; 117 | try { 118 | br = new BufferedReader(new FileReader(pathToUnseenCorpus)); 119 | for (String doc; (doc = br.readLine()) != null;) { 120 | if (doc.trim().length() == 0) 121 | continue; 122 | 123 | String[] words = doc.trim().split("\\s+"); 124 | List document = new ArrayList(); 125 | 126 | List wordOccurenceToIndexInDoc = new ArrayList(); 127 | HashMap wordOccurenceToIndexInDocCount = new HashMap(); 128 | 129 | for (String word : words) { 130 | if (word2IdVocabulary.containsKey(word)) { 131 | document.add(word2IdVocabulary.get(word)); 132 | int times = 0; 133 | if (wordOccurenceToIndexInDocCount.containsKey(word)) { 134 | times = wordOccurenceToIndexInDocCount.get(word); 135 | } 136 | times += 1; 137 | wordOccurenceToIndexInDocCount.put(word, times); 138 | wordOccurenceToIndexInDoc.add(times); 139 | } 140 | else { 141 | // Skip this unknown-word 142 | } 143 | } 144 | numDocuments++; 145 | numWordsInCorpus += document.size(); 146 | corpus.add(document); 147 | occurenceToIndexCount.add(wordOccurenceToIndexInDoc); 148 | } 149 | } 150 | catch (Exception e) { 151 | e.printStackTrace(); 152 | } 153 | 154 | docTopicCount = new int[numTopics]; 155 | multiPros = new double[numTopics]; 156 | for (int i = 0; i < numTopics; i++) { 157 | multiPros[i] = 1.0 / numTopics; 158 | } 159 | 160 | alphaSum = numTopics * alpha; 161 | betaSum = vocabularySize * beta; 162 | 163 | System.out.println("Corpus size: " + numDocuments + " docs, " 164 | + numWordsInCorpus + " words"); 165 | System.out.println("Vocabuary size: " + vocabularySize); 166 | System.out.println("Number of topics: " + numTopics); 167 | System.out.println("alpha: " + alpha); 168 | System.out.println("beta: " + beta); 169 | System.out.println("Number of sampling iterations: " + numIterations); 170 | System.out.println("Number of top topical words: " + topWords); 171 | 172 | initialize(); 173 | } 174 | 175 | private HashMap parseTrainingParasFile( 176 | String pathToTrainingParasFile) 177 | throws Exception 178 | { 179 | HashMap paras = new HashMap(); 180 | BufferedReader br = null; 181 | try { 182 | br = new BufferedReader(new FileReader(pathToTrainingParasFile)); 183 | for (String line; (line = br.readLine()) != null;) { 184 | 185 | if (line.trim().length() == 0) 186 | continue; 187 | 188 | String[] paraOptions = line.trim().split("\\s+"); 189 | paras.put(paraOptions[0], paraOptions[1]); 190 | } 191 | } 192 | catch (Exception e) { 193 | e.printStackTrace(); 194 | } 195 | return paras; 196 | } 197 | 198 | private void initializeWordCount(String pathToTrainingCorpus, 199 | String pathToTopicAssignmentFile) 200 | { 201 | System.out.println("Loading pre-trained model..."); 202 | List> trainCorpus = new ArrayList>(); 203 | BufferedReader br = null; 204 | try { 205 | int indexWord = -1; 206 | br = new BufferedReader(new FileReader(pathToTrainingCorpus)); 207 | for (String doc; (doc = br.readLine()) != null;) { 208 | 209 | if (doc.trim().length() == 0) 210 | continue; 211 | 212 | String[] words = doc.trim().split("\\s+"); 213 | List document = new ArrayList(); 214 | 215 | for (String word : words) { 216 | if (word2IdVocabulary.containsKey(word)) { 217 | document.add(word2IdVocabulary.get(word)); 218 | } 219 | else { 220 | indexWord += 1; 221 | word2IdVocabulary.put(word, indexWord); 222 | id2WordVocabulary.put(indexWord, word); 223 | document.add(indexWord); 224 | } 225 | } 226 | trainCorpus.add(document); 227 | } 228 | } 229 | catch (Exception e) { 230 | e.printStackTrace(); 231 | } 232 | 233 | vocabularySize = word2IdVocabulary.size(); 234 | topicWordCount = new int[numTopics][vocabularySize]; 235 | sumTopicWordCount = new int[numTopics]; 236 | 237 | try { 238 | br = new BufferedReader(new FileReader(pathToTopicAssignmentFile)); 239 | int docId = 0; 240 | for (String line; (line = br.readLine()) != null;) { 241 | String[] strTopics = line.trim().split("\\s+"); 242 | for (int j = 0; j < strTopics.length; j++) { 243 | int wordId = trainCorpus.get(docId).get(j); 244 | int topic = new Integer(strTopics[j]); 245 | topicWordCount[topic][wordId] += 1; 246 | sumTopicWordCount[topic] += 1; 247 | } 248 | docId++; 249 | } 250 | } 251 | catch (Exception e) { 252 | e.printStackTrace(); 253 | } 254 | } 255 | 256 | /** 257 | * Randomly initialize topic assignments 258 | */ 259 | public void initialize() 260 | throws IOException 261 | { 262 | System.out.println("Randomly initialzing topic assignments ..."); 263 | topicAssignments = new ArrayList(); 264 | for (int i = 0; i < numDocuments; i++) { 265 | int topic = FuncUtils.nextDiscrete(multiPros); // Sample a topic 266 | docTopicCount[topic] += 1; 267 | int docSize = corpus.get(i).size(); 268 | for (int j = 0; j < docSize; j++) { 269 | topicWordCount[topic][corpus.get(i).get(j)] += 1; 270 | sumTopicWordCount[topic] += 1; 271 | } 272 | topicAssignments.add(topic); 273 | } 274 | } 275 | 276 | public void inference() 277 | throws IOException 278 | { 279 | writeParameters(); 280 | writeDictionary(); 281 | 282 | System.out.println("Running Gibbs sampling inference: "); 283 | 284 | for (int iter = 1; iter <= numIterations; iter++) { 285 | 286 | System.out.println("\tSampling iteration: " + (iter)); 287 | // System.out.println("\t\tPerplexity: " + computePerplexity()); 288 | 289 | sampleInSingleIteration(); 290 | 291 | if ((savestep > 0) && (iter % savestep == 0) 292 | && (iter < numIterations)) { 293 | System.out.println("\t\tSaving the output from the " + iter 294 | + "^{th} sample"); 295 | expName = orgExpName + "-" + iter; 296 | write(); 297 | } 298 | } 299 | expName = orgExpName; 300 | 301 | System.out.println("Writing output from the last sample ..."); 302 | write(); 303 | 304 | System.out.println("Sampling completed!"); 305 | 306 | } 307 | 308 | public void sampleInSingleIteration() 309 | { 310 | for (int dIndex = 0; dIndex < numDocuments; dIndex++) { 311 | int topic = topicAssignments.get(dIndex); 312 | List document = corpus.get(dIndex); 313 | int docSize = document.size(); 314 | 315 | // Decrease counts 316 | docTopicCount[topic] -= 1; 317 | for (int wIndex = 0; wIndex < docSize; wIndex++) { 318 | int word = document.get(wIndex); 319 | topicWordCount[topic][word] -= 1; 320 | sumTopicWordCount[topic] -= 1; 321 | } 322 | 323 | // Sample a topic 324 | for (int tIndex = 0; tIndex < numTopics; tIndex++) { 325 | multiPros[tIndex] = (docTopicCount[tIndex] + alpha); 326 | for (int wIndex = 0; wIndex < docSize; wIndex++) { 327 | int word = document.get(wIndex); 328 | multiPros[tIndex] *= (topicWordCount[tIndex][word] + beta 329 | + occurenceToIndexCount.get(dIndex).get(wIndex) - 1) 330 | / (sumTopicWordCount[tIndex] + betaSum + wIndex); 331 | } 332 | } 333 | topic = FuncUtils.nextDiscrete(multiPros); 334 | 335 | // Increase counts 336 | docTopicCount[topic] += 1; 337 | for (int wIndex = 0; wIndex < docSize; wIndex++) { 338 | int word = document.get(wIndex); 339 | topicWordCount[topic][word] += 1; 340 | sumTopicWordCount[topic] += 1; 341 | } 342 | // Update topic assignments 343 | topicAssignments.set(dIndex, topic); 344 | } 345 | } 346 | 347 | public void writeParameters() 348 | throws IOException 349 | { 350 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 351 | + expName + ".paras")); 352 | writer.write("-model" + "\t" + "DMM"); 353 | writer.write("\n-corpus" + "\t" + corpusPath); 354 | writer.write("\n-ntopics" + "\t" + numTopics); 355 | writer.write("\n-alpha" + "\t" + alpha); 356 | writer.write("\n-beta" + "\t" + beta); 357 | writer.write("\n-niters" + "\t" + numIterations); 358 | writer.write("\n-twords" + "\t" + topWords); 359 | writer.write("\n-name" + "\t" + expName); 360 | if (tAssignsFilePath.length() > 0) 361 | writer.write("\n-initFile" + "\t" + tAssignsFilePath); 362 | if (savestep > 0) 363 | writer.write("\n-sstep" + "\t" + savestep); 364 | 365 | writer.close(); 366 | } 367 | 368 | public void writeDictionary() 369 | throws IOException 370 | { 371 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 372 | + expName + ".vocabulary")); 373 | for (int id = 0; id < vocabularySize; id++) 374 | writer.write(id2WordVocabulary.get(id) + " " + id + "\n"); 375 | writer.close(); 376 | } 377 | 378 | public void writeIDbasedCorpus() 379 | throws IOException 380 | { 381 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 382 | + expName + ".IDcorpus")); 383 | for (int dIndex = 0; dIndex < numDocuments; dIndex++) { 384 | int docSize = corpus.get(dIndex).size(); 385 | for (int wIndex = 0; wIndex < docSize; wIndex++) { 386 | writer.write(corpus.get(dIndex).get(wIndex) + " "); 387 | } 388 | writer.write("\n"); 389 | } 390 | writer.close(); 391 | } 392 | 393 | public void writeTopicAssignments() 394 | throws IOException 395 | { 396 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 397 | + expName + ".topicAssignments")); 398 | for (int dIndex = 0; dIndex < numDocuments; dIndex++) { 399 | int docSize = corpus.get(dIndex).size(); 400 | int topic = topicAssignments.get(dIndex); 401 | for (int wIndex = 0; wIndex < docSize; wIndex++) { 402 | writer.write(topic + " "); 403 | } 404 | writer.write("\n"); 405 | } 406 | writer.close(); 407 | } 408 | 409 | public void writeTopTopicalWords() 410 | throws IOException 411 | { 412 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 413 | + expName + ".topWords")); 414 | 415 | for (int tIndex = 0; tIndex < numTopics; tIndex++) { 416 | writer.write("Topic" + new Integer(tIndex) + ":"); 417 | 418 | Map wordCount = new TreeMap(); 419 | for (int wIndex = 0; wIndex < vocabularySize; wIndex++) { 420 | wordCount.put(wIndex, topicWordCount[tIndex][wIndex]); 421 | } 422 | wordCount = FuncUtils.sortByValueDescending(wordCount); 423 | 424 | Set mostLikelyWords = wordCount.keySet(); 425 | int count = 0; 426 | for (Integer index : mostLikelyWords) { 427 | if (count < topWords) { 428 | double pro = (topicWordCount[tIndex][index] + beta) 429 | / (sumTopicWordCount[tIndex] + betaSum); 430 | pro = Math.round(pro * 1000000.0) / 1000000.0; 431 | writer.write(" " + id2WordVocabulary.get(index) + "(" + pro 432 | + ")"); 433 | count += 1; 434 | } 435 | else { 436 | writer.write("\n\n"); 437 | break; 438 | } 439 | } 440 | } 441 | writer.close(); 442 | } 443 | 444 | public void writeTopicWordPros() 445 | throws IOException 446 | { 447 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 448 | + expName + ".phi")); 449 | for (int i = 0; i < numTopics; i++) { 450 | for (int j = 0; j < vocabularySize; j++) { 451 | double pro = (topicWordCount[i][j] + beta) 452 | / (sumTopicWordCount[i] + betaSum); 453 | writer.write(pro + " "); 454 | } 455 | writer.write("\n"); 456 | } 457 | writer.close(); 458 | } 459 | 460 | public void writeTopicWordCount() 461 | throws IOException 462 | { 463 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 464 | + expName + ".WTcount")); 465 | for (int i = 0; i < numTopics; i++) { 466 | for (int j = 0; j < vocabularySize; j++) { 467 | writer.write(topicWordCount[i][j] + " "); 468 | } 469 | writer.write("\n"); 470 | } 471 | writer.close(); 472 | 473 | } 474 | 475 | public void writeDocTopicPros() 476 | throws IOException 477 | { 478 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 479 | + expName + ".theta")); 480 | 481 | for (int i = 0; i < numDocuments; i++) { 482 | int docSize = corpus.get(i).size(); 483 | double sum = 0.0; 484 | for (int tIndex = 0; tIndex < numTopics; tIndex++) { 485 | multiPros[tIndex] = (docTopicCount[tIndex] + alpha); 486 | for (int wIndex = 0; wIndex < docSize; wIndex++) { 487 | int word = corpus.get(i).get(wIndex); 488 | multiPros[tIndex] *= (topicWordCount[tIndex][word] + beta) 489 | / (sumTopicWordCount[tIndex] + betaSum); 490 | } 491 | sum += multiPros[tIndex]; 492 | } 493 | for (int tIndex = 0; tIndex < numTopics; tIndex++) { 494 | writer.write((multiPros[tIndex] / sum) + " "); 495 | } 496 | writer.write("\n"); 497 | } 498 | writer.close(); 499 | } 500 | 501 | public void write() 502 | throws IOException 503 | { 504 | writeTopTopicalWords(); 505 | writeDocTopicPros(); 506 | writeTopicAssignments(); 507 | writeTopicWordPros(); 508 | } 509 | 510 | public static void main(String args[]) 511 | throws Exception 512 | { 513 | GibbsSamplingDMM_Inf dmm = new GibbsSamplingDMM_Inf( 514 | "test/testDMM.paras", "test/unseenTest.txt", 100, 20, "testDMMinf", 515 | 0); 516 | dmm.inference(); 517 | } 518 | } 519 | -------------------------------------------------------------------------------- /src/models/GibbsSamplingDMM.java: -------------------------------------------------------------------------------- 1 | package models; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.BufferedWriter; 5 | import java.io.FileReader; 6 | import java.io.FileWriter; 7 | import java.io.IOException; 8 | import java.util.ArrayList; 9 | import java.util.HashMap; 10 | import java.util.List; 11 | import java.util.Map; 12 | import java.util.Set; 13 | import java.util.TreeMap; 14 | 15 | import utility.FuncUtils; 16 | 17 | /** 18 | * jLDADMM: A Java package for the LDA and DMM topic models 19 | * 20 | * Implementation of the one-topic-per-document Dirichlet Multinomial Mixture 21 | * model, using collapsed Gibbs sampling, as described in: 22 | * 23 | * Jianhua Yin and Jianyong Wang. 2014. A Dirichlet Multinomial Mixture 24 | * Model-based Approach for Short Text Clustering. In Proceedings of the 20th 25 | * ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, 26 | * pages 233–242. 27 | * 28 | * @author: Dat Quoc Nguyen 29 | */ 30 | 31 | public class GibbsSamplingDMM 32 | { 33 | public double alpha; // Hyper-parameter alpha 34 | public double beta; // Hyper-parameter alpha 35 | public int numTopics; // Number of topics 36 | public int numIterations; // Number of Gibbs sampling iterations 37 | public int topWords; // Number of most probable words for each topic 38 | 39 | public double alphaSum; // alpha * numTopics 40 | public double betaSum; // beta * vocabularySize 41 | 42 | public List> corpus; // Word ID-based corpus 43 | public List topicAssignments; // Topics assignments for documents 44 | public int numDocuments; // Number of documents in the corpus 45 | public int numWordsInCorpus; // Number of words in the corpus 46 | 47 | public HashMap word2IdVocabulary; // Vocabulary to get ID 48 | // given a word 49 | public HashMap id2WordVocabulary; // Vocabulary to get word 50 | // given an ID 51 | public int vocabularySize; // The number of word types in the corpus 52 | 53 | // Number of documents assigned to a topic 54 | public int[] docTopicCount; 55 | // numTopics * vocabularySize matrix 56 | // Given a topic: number of times a word type assigned to the topic 57 | public int[][] topicWordCount; 58 | // Total number of words assigned to a topic 59 | public int[] sumTopicWordCount; 60 | 61 | // Double array used to sample a topic 62 | public double[] multiPros; 63 | 64 | // Path to the directory containing the corpus 65 | public String folderPath; 66 | // Path to the topic modeling corpus 67 | public String corpusPath; 68 | 69 | // Given a document, number of times its i^{th} word appearing from 70 | // the first index to the i^{th}-index in the document 71 | // Example: given a document of "a a b a b c d c". We have: 1 2 1 3 2 1 1 2 72 | public List> occurenceToIndexCount; 73 | 74 | public String expName = "DMMmodel"; 75 | public String orgExpName = "DMMmodel"; 76 | public String tAssignsFilePath = ""; 77 | public int savestep = 0; 78 | 79 | public GibbsSamplingDMM(String pathToCorpus, int inNumTopics, 80 | double inAlpha, double inBeta, int inNumIterations, int inTopWords) 81 | throws Exception 82 | { 83 | this(pathToCorpus, inNumTopics, inAlpha, inBeta, inNumIterations, 84 | inTopWords, "DMMmodel"); 85 | } 86 | 87 | public GibbsSamplingDMM(String pathToCorpus, int inNumTopics, 88 | double inAlpha, double inBeta, int inNumIterations, int inTopWords, 89 | String inExpName) 90 | throws Exception 91 | { 92 | this(pathToCorpus, inNumTopics, inAlpha, inBeta, inNumIterations, 93 | inTopWords, inExpName, "", 0); 94 | 95 | } 96 | 97 | public GibbsSamplingDMM(String pathToCorpus, int inNumTopics, 98 | double inAlpha, double inBeta, int inNumIterations, int inTopWords, 99 | String inExpName, String pathToTAfile) 100 | throws Exception 101 | { 102 | this(pathToCorpus, inNumTopics, inAlpha, inBeta, inNumIterations, 103 | inTopWords, inExpName, pathToTAfile, 0); 104 | 105 | } 106 | 107 | public GibbsSamplingDMM(String pathToCorpus, int inNumTopics, 108 | double inAlpha, double inBeta, int inNumIterations, int inTopWords, 109 | String inExpName, int inSaveStep) 110 | throws Exception 111 | { 112 | this(pathToCorpus, inNumTopics, inAlpha, inBeta, inNumIterations, 113 | inTopWords, inExpName, "", inSaveStep); 114 | 115 | } 116 | 117 | public GibbsSamplingDMM(String pathToCorpus, int inNumTopics, 118 | double inAlpha, double inBeta, int inNumIterations, int inTopWords, 119 | String inExpName, String pathToTAfile, int inSaveStep) 120 | throws IOException 121 | { 122 | alpha = inAlpha; 123 | beta = inBeta; 124 | numTopics = inNumTopics; 125 | numIterations = inNumIterations; 126 | topWords = inTopWords; 127 | savestep = inSaveStep; 128 | expName = inExpName; 129 | orgExpName = expName; 130 | corpusPath = pathToCorpus; 131 | folderPath = pathToCorpus.substring( 132 | 0, 133 | Math.max(pathToCorpus.lastIndexOf("/"), 134 | pathToCorpus.lastIndexOf("\\")) + 1); 135 | 136 | System.out.println("Reading topic modeling corpus: " + pathToCorpus); 137 | 138 | word2IdVocabulary = new HashMap(); 139 | id2WordVocabulary = new HashMap(); 140 | corpus = new ArrayList>(); 141 | occurenceToIndexCount = new ArrayList>(); 142 | numDocuments = 0; 143 | numWordsInCorpus = 0; 144 | 145 | BufferedReader br = null; 146 | try { 147 | int indexWord = -1; 148 | br = new BufferedReader(new FileReader(pathToCorpus)); 149 | for (String doc; (doc = br.readLine()) != null;) { 150 | if (doc.trim().length() == 0) 151 | continue; 152 | 153 | String[] words = doc.trim().split("\\s+"); 154 | List document = new ArrayList(); 155 | 156 | List wordOccurenceToIndexInDoc = new ArrayList(); 157 | HashMap wordOccurenceToIndexInDocCount = new HashMap(); 158 | 159 | for (String word : words) { 160 | if (word2IdVocabulary.containsKey(word)) { 161 | document.add(word2IdVocabulary.get(word)); 162 | } 163 | else { 164 | indexWord += 1; 165 | word2IdVocabulary.put(word, indexWord); 166 | id2WordVocabulary.put(indexWord, word); 167 | document.add(indexWord); 168 | } 169 | 170 | int times = 0; 171 | if (wordOccurenceToIndexInDocCount.containsKey(word)) { 172 | times = wordOccurenceToIndexInDocCount.get(word); 173 | } 174 | times += 1; 175 | wordOccurenceToIndexInDocCount.put(word, times); 176 | wordOccurenceToIndexInDoc.add(times); 177 | } 178 | numDocuments++; 179 | numWordsInCorpus += document.size(); 180 | corpus.add(document); 181 | occurenceToIndexCount.add(wordOccurenceToIndexInDoc); 182 | } 183 | } 184 | catch (Exception e) { 185 | e.printStackTrace(); 186 | } 187 | 188 | vocabularySize = word2IdVocabulary.size(); 189 | docTopicCount = new int[numTopics]; 190 | topicWordCount = new int[numTopics][vocabularySize]; 191 | sumTopicWordCount = new int[numTopics]; 192 | 193 | multiPros = new double[numTopics]; 194 | for (int i = 0; i < numTopics; i++) { 195 | multiPros[i] = 1.0 / numTopics; 196 | } 197 | 198 | alphaSum = numTopics * alpha; 199 | betaSum = vocabularySize * beta; 200 | 201 | System.out.println("Corpus size: " + numDocuments + " docs, " 202 | + numWordsInCorpus + " words"); 203 | System.out.println("Vocabuary size: " + vocabularySize); 204 | System.out.println("Number of topics: " + numTopics); 205 | System.out.println("alpha: " + alpha); 206 | System.out.println("beta: " + beta); 207 | System.out.println("Number of sampling iterations: " + numIterations); 208 | System.out.println("Number of top topical words: " + topWords); 209 | 210 | tAssignsFilePath = pathToTAfile; 211 | if (tAssignsFilePath.length() > 0) 212 | initialize(tAssignsFilePath); 213 | else 214 | initialize(); 215 | } 216 | 217 | /** 218 | * Randomly initialize topic assignments 219 | */ 220 | public void initialize() 221 | throws IOException 222 | { 223 | System.out.println("Randomly initialzing topic assignments ..."); 224 | topicAssignments = new ArrayList(); 225 | for (int i = 0; i < numDocuments; i++) { 226 | int topic = FuncUtils.nextDiscrete(multiPros); // Sample a topic 227 | docTopicCount[topic] += 1; 228 | int docSize = corpus.get(i).size(); 229 | for (int j = 0; j < docSize; j++) { 230 | topicWordCount[topic][corpus.get(i).get(j)] += 1; 231 | sumTopicWordCount[topic] += 1; 232 | } 233 | topicAssignments.add(topic); 234 | } 235 | } 236 | 237 | /** 238 | * Initialize topic assignments from a given file 239 | */ 240 | public void initialize(String pathToTopicAssignmentFile) 241 | { 242 | System.out.println("Reading topic-assigment file: " 243 | + pathToTopicAssignmentFile); 244 | 245 | topicAssignments = new ArrayList(); 246 | 247 | BufferedReader br = null; 248 | try { 249 | br = new BufferedReader(new FileReader(pathToTopicAssignmentFile)); 250 | int docID = 0; 251 | int numWords = 0; 252 | for (String line; (line = br.readLine()) != null;) { 253 | String[] strTopics = line.trim().split("\\s+"); 254 | int topic = new Integer(strTopics[0]) % numTopics; 255 | docTopicCount[topic] += 1; 256 | for (int j = 0; j < strTopics.length; j++) { 257 | // Increase counts 258 | topicWordCount[topic][corpus.get(docID).get(j)] += 1; 259 | sumTopicWordCount[topic] += 1; 260 | 261 | numWords++; 262 | } 263 | topicAssignments.add(topic); 264 | docID++; 265 | } 266 | 267 | if ((docID != numDocuments) || (numWords != numWordsInCorpus)) { 268 | System.out 269 | .println("The topic modeling corpus and topic assignment file are not consistent!!!"); 270 | throw new Exception(); 271 | } 272 | } 273 | catch (Exception e) { 274 | e.printStackTrace(); 275 | } 276 | } 277 | 278 | public void inference() 279 | throws IOException 280 | { 281 | writeParameters(); 282 | writeDictionary(); 283 | 284 | System.out.println("Running Gibbs sampling inference: "); 285 | 286 | for (int iter = 1; iter <= numIterations; iter++) { 287 | 288 | System.out.println("\tSampling iteration: " + (iter)); 289 | // System.out.println("\t\tPerplexity: " + computePerplexity()); 290 | 291 | sampleInSingleIteration(); 292 | 293 | if ((savestep > 0) && (iter % savestep == 0) 294 | && (iter < numIterations)) { 295 | System.out.println("\t\tSaving the output from the " + iter 296 | + "^{th} sample"); 297 | expName = orgExpName + "-" + iter; 298 | write(); 299 | } 300 | } 301 | expName = orgExpName; 302 | 303 | System.out.println("Writing output from the last sample ..."); 304 | write(); 305 | 306 | System.out.println("Sampling completed!"); 307 | 308 | } 309 | 310 | public void sampleInSingleIteration() 311 | { 312 | for (int dIndex = 0; dIndex < numDocuments; dIndex++) { 313 | int topic = topicAssignments.get(dIndex); 314 | List document = corpus.get(dIndex); 315 | int docSize = document.size(); 316 | 317 | // Decrease counts 318 | docTopicCount[topic] -= 1; 319 | for (int wIndex = 0; wIndex < docSize; wIndex++) { 320 | int word = document.get(wIndex); 321 | topicWordCount[topic][word] -= 1; 322 | sumTopicWordCount[topic] -= 1; 323 | } 324 | 325 | // Sample a topic 326 | for (int tIndex = 0; tIndex < numTopics; tIndex++) { 327 | multiPros[tIndex] = (docTopicCount[tIndex] + alpha); 328 | for (int wIndex = 0; wIndex < docSize; wIndex++) { 329 | int word = document.get(wIndex); 330 | multiPros[tIndex] *= (topicWordCount[tIndex][word] + beta 331 | + occurenceToIndexCount.get(dIndex).get(wIndex) - 1) 332 | / (sumTopicWordCount[tIndex] + betaSum + wIndex); 333 | } 334 | } 335 | topic = FuncUtils.nextDiscrete(multiPros); 336 | 337 | // Increase counts 338 | docTopicCount[topic] += 1; 339 | for (int wIndex = 0; wIndex < docSize; wIndex++) { 340 | int word = document.get(wIndex); 341 | topicWordCount[topic][word] += 1; 342 | sumTopicWordCount[topic] += 1; 343 | } 344 | // Update topic assignments 345 | topicAssignments.set(dIndex, topic); 346 | } 347 | } 348 | 349 | public void writeParameters() 350 | throws IOException 351 | { 352 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 353 | + expName + ".paras")); 354 | writer.write("-model" + "\t" + "DMM"); 355 | writer.write("\n-corpus" + "\t" + corpusPath); 356 | writer.write("\n-ntopics" + "\t" + numTopics); 357 | writer.write("\n-alpha" + "\t" + alpha); 358 | writer.write("\n-beta" + "\t" + beta); 359 | writer.write("\n-niters" + "\t" + numIterations); 360 | writer.write("\n-twords" + "\t" + topWords); 361 | writer.write("\n-name" + "\t" + expName); 362 | if (tAssignsFilePath.length() > 0) 363 | writer.write("\n-initFile" + "\t" + tAssignsFilePath); 364 | if (savestep > 0) 365 | writer.write("\n-sstep" + "\t" + savestep); 366 | 367 | writer.close(); 368 | } 369 | 370 | public void writeDictionary() 371 | throws IOException 372 | { 373 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 374 | + expName + ".vocabulary")); 375 | for (int id = 0; id < vocabularySize; id++) 376 | writer.write(id2WordVocabulary.get(id) + " " + id + "\n"); 377 | writer.close(); 378 | } 379 | 380 | public void writeIDbasedCorpus() 381 | throws IOException 382 | { 383 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 384 | + expName + ".IDcorpus")); 385 | for (int dIndex = 0; dIndex < numDocuments; dIndex++) { 386 | int docSize = corpus.get(dIndex).size(); 387 | for (int wIndex = 0; wIndex < docSize; wIndex++) { 388 | writer.write(corpus.get(dIndex).get(wIndex) + " "); 389 | } 390 | writer.write("\n"); 391 | } 392 | writer.close(); 393 | } 394 | 395 | public void writeTopicAssignments() 396 | throws IOException 397 | { 398 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 399 | + expName + ".topicAssignments")); 400 | for (int dIndex = 0; dIndex < numDocuments; dIndex++) { 401 | int docSize = corpus.get(dIndex).size(); 402 | int topic = topicAssignments.get(dIndex); 403 | for (int wIndex = 0; wIndex < docSize; wIndex++) { 404 | writer.write(topic + " "); 405 | } 406 | writer.write("\n"); 407 | } 408 | writer.close(); 409 | } 410 | 411 | public void writeTopTopicalWords() 412 | throws IOException 413 | { 414 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 415 | + expName + ".topWords")); 416 | 417 | for (int tIndex = 0; tIndex < numTopics; tIndex++) { 418 | writer.write("Topic" + new Integer(tIndex) + ":"); 419 | 420 | Map wordCount = new TreeMap(); 421 | for (int wIndex = 0; wIndex < vocabularySize; wIndex++) { 422 | wordCount.put(wIndex, topicWordCount[tIndex][wIndex]); 423 | } 424 | wordCount = FuncUtils.sortByValueDescending(wordCount); 425 | 426 | Set mostLikelyWords = wordCount.keySet(); 427 | int count = 0; 428 | for (Integer index : mostLikelyWords) { 429 | if (count < topWords) { 430 | double pro = (topicWordCount[tIndex][index] + beta) 431 | / (sumTopicWordCount[tIndex] + betaSum); 432 | pro = Math.round(pro * 1000000.0) / 1000000.0; 433 | writer.write(" " + id2WordVocabulary.get(index) + "(" + pro 434 | + ")"); 435 | count += 1; 436 | } 437 | else { 438 | writer.write("\n\n"); 439 | break; 440 | } 441 | } 442 | } 443 | writer.close(); 444 | } 445 | 446 | public void writeTopicWordPros() 447 | throws IOException 448 | { 449 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 450 | + expName + ".phi")); 451 | for (int i = 0; i < numTopics; i++) { 452 | for (int j = 0; j < vocabularySize; j++) { 453 | double pro = (topicWordCount[i][j] + beta) 454 | / (sumTopicWordCount[i] + betaSum); 455 | writer.write(pro + " "); 456 | } 457 | writer.write("\n"); 458 | } 459 | writer.close(); 460 | } 461 | 462 | public void writeTopicWordCount() 463 | throws IOException 464 | { 465 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 466 | + expName + ".WTcount")); 467 | for (int i = 0; i < numTopics; i++) { 468 | for (int j = 0; j < vocabularySize; j++) { 469 | writer.write(topicWordCount[i][j] + " "); 470 | } 471 | writer.write("\n"); 472 | } 473 | writer.close(); 474 | 475 | } 476 | 477 | public void writeDocTopicPros() 478 | throws IOException 479 | { 480 | BufferedWriter writer = new BufferedWriter(new FileWriter(folderPath 481 | + expName + ".theta")); 482 | 483 | for (int i = 0; i < numDocuments; i++) { 484 | int docSize = corpus.get(i).size(); 485 | double sum = 0.0; 486 | for (int tIndex = 0; tIndex < numTopics; tIndex++) { 487 | multiPros[tIndex] = (docTopicCount[tIndex] + alpha); 488 | for (int wIndex = 0; wIndex < docSize; wIndex++) { 489 | int word = corpus.get(i).get(wIndex); 490 | multiPros[tIndex] *= (topicWordCount[tIndex][word] + beta) 491 | / (sumTopicWordCount[tIndex] + betaSum); 492 | } 493 | sum += multiPros[tIndex]; 494 | } 495 | for (int tIndex = 0; tIndex < numTopics; tIndex++) { 496 | writer.write((multiPros[tIndex] / sum) + " "); 497 | } 498 | writer.write("\n"); 499 | } 500 | writer.close(); 501 | } 502 | 503 | public void write() 504 | throws IOException 505 | { 506 | writeTopTopicalWords(); 507 | writeDocTopicPros(); 508 | writeTopicAssignments(); 509 | writeTopicWordPros(); 510 | } 511 | 512 | public static void main(String args[]) 513 | throws Exception 514 | { 515 | GibbsSamplingDMM dmm = new GibbsSamplingDMM("test/corpus.txt", 7, 0.1, 516 | 0.1, 2000, 20, "testDMM"); 517 | dmm.inference(); 518 | } 519 | } 520 | -------------------------------------------------------------------------------- /src/utility/MersenneTwister.java: -------------------------------------------------------------------------------- 1 | package utility; 2 | 3 | import java.io.DataInputStream; 4 | import java.io.DataOutputStream; 5 | import java.io.IOException; 6 | import java.io.ObjectInputStream; 7 | import java.io.ObjectOutputStream; 8 | import java.io.Serializable; 9 | 10 | /** 11 | *

MersenneTwister and MersenneTwisterFast

12 | *

13 | * Version 20, based on version MT199937(99/10/29) of the Mersenne Twister algorithm found at 14 | * The Mersenne Twister Home Page, with 15 | * the initialization improved using the new 2002/1/26 initialization algorithm By Sean Luke, 16 | * October 2004. 17 | * 18 | *

19 | * MersenneTwister is a drop-in subclass replacement for java.util.Random. It is properly 20 | * synchronized and can be used in a multithreaded environment. On modern VMs such as HotSpot, it is 21 | * approximately 1/3 slower than java.util.Random. 22 | * 23 | *

24 | * MersenneTwisterFast is not a subclass of java.util.Random. It has the same public methods 25 | * as Random does, however, and it is algorithmically identical to MersenneTwister. 26 | * MersenneTwisterFast has hard-code inlined all of its methods directly, and made all of them final 27 | * (well, the ones of consequence anyway). Further, these methods are not synchronized, so 28 | * the same MersenneTwisterFast instance cannot be shared by multiple threads. But all this helps 29 | * MersenneTwisterFast achieve well over twice the speed of MersenneTwister. java.util.Random is 30 | * about 1/3 slower than MersenneTwisterFast. 31 | * 32 | *

About the Mersenne Twister

33 | *

34 | * This is a Java version of the C-program for MT19937: Integer version. The MT19937 algorithm was 35 | * created by Makoto Matsumoto and Takuji Nishimura, who ask: "When you use this, send an email to: 36 | * matumoto@math.keio.ac.jp with an appropriate reference to your work". Indicate that this is a 37 | * translation of their algorithm into Java. 38 | * 39 | *

40 | * Reference. Makato Matsumoto and Takuji Nishimura, "Mersenne Twister: A 623-Dimensionally 41 | * Equidistributed Uniform Pseudo-Random Number Generator", ACM Transactions on Modeling and. 42 | * Computer Simulation, Vol. 8, No. 1, January 1998, pp 3--30. 43 | * 44 | *

About this Version

45 | * 46 | *

47 | * Changes since V19: nextFloat(boolean, boolean) now returns float, not double. 48 | * 49 | *

50 | * Changes since V18: Removed old final declarations, which used to potentially speed up the 51 | * code, but no longer. 52 | * 53 | *

54 | * Changes since V17: Removed vestigial references to &= 0xffffffff which stemmed from the 55 | * original C code. The C code could not guarantee that ints were 32 bit, hence the masks. The 56 | * vestigial references in the Java code were likely optimized out anyway. 57 | * 58 | *

59 | * Changes since V16: Added nextDouble(includeZero, includeOne) and nextFloat(includeZero, 60 | * includeOne) to allow for half-open, fully-closed, and fully-open intervals. 61 | * 62 | *

63 | * Changes Since V15: Added serialVersionUID to quiet compiler warnings from Sun's overly 64 | * verbose compilers as of JDK 1.5. 65 | * 66 | *

67 | * Changes Since V14: made strictfp, with StrictMath.log and StrictMath.sqrt in nextGaussian 68 | * instead of Math.log and Math.sqrt. This is largely just to be safe, as it presently makes no 69 | * difference in the speed, correctness, or results of the algorithm. 70 | * 71 | *

72 | * Changes Since V13: clone() method CloneNotSupportedException removed. 73 | * 74 | *

75 | * Changes Since V12: clone() method added. 76 | * 77 | *

78 | * Changes Since V11: stateEquals(...) method added. MersenneTwisterFast is equal to other 79 | * MersenneTwisterFasts with identical state; likewise MersenneTwister is equal to other 80 | * MersenneTwister with identical state. This isn't equals(...) because that requires a contract of 81 | * immutability to compare by value. 82 | * 83 | *

84 | * Changes Since V10: A documentation error suggested that setSeed(int[]) required an int[] 85 | * array 624 long. In fact, the array can be any non-zero length. The new version also checks for 86 | * this fact. 87 | * 88 | *

89 | * Changes Since V9: readState(stream) and writeState(stream) provided. 90 | * 91 | *

92 | * Changes Since V8: setSeed(int) was only using the first 28 bits of the seed; it should 93 | * have been 32 bits. For small-number seeds the behavior is identical. 94 | * 95 | *

96 | * Changes Since V7: A documentation error in MersenneTwisterFast (but not MersenneTwister) 97 | * stated that nextDouble selects uniformly from the full-open interval [0,1]. It does not. 98 | * nextDouble's contract is identical across MersenneTwisterFast, MersenneTwister, and 99 | * java.util.Random, namely, selection in the half-open interval [0,1). That is, 1.0 should not be 100 | * returned. A similar contract exists in nextFloat. 101 | * 102 | *

103 | * Changes Since V6: License has changed from LGPL to BSD. New timing information to compare 104 | * against java.util.Random. Recent versions of HotSpot have helped Random increase in speed to the 105 | * point where it is faster than MersenneTwister but slower than MersenneTwisterFast (which should 106 | * be the case, as it's a less complex algorithm but is synchronized). 107 | * 108 | *

109 | * Changes Since V5: New empty constructor made to work the same as java.util.Random -- 110 | * namely, it seeds based on the current time in milliseconds. 111 | * 112 | *

113 | * Changes Since V4: New initialization algorithms. See (see 115 | * http://www.math.keio.ac.jp/matumoto/MT2002/emt19937ar.html) 116 | * 117 | *

118 | * The MersenneTwister code is based on standard MT19937 C/C++ code by Takuji Nishimura, with 119 | * suggestions from Topher Cooper and Marc Rieffel, July 1997. The code was originally translated 120 | * into Java by Michael Lecuyer, January 1999, and the original code is Copyright (c) 1999 by 121 | * Michael Lecuyer. 122 | * 123 | *

Java notes

124 | * 125 | *

126 | * This implementation implements the bug fixes made in Java 1.2's version of Random, which means it 127 | * can be used with earlier versions of Java. See the JDK 1.2 129 | * java.util.Random documentation for further documentation on the random-number generation 130 | * contracts made. Additionally, there's an undocumented bug in the JDK java.util.Random.nextBytes() 131 | * method, which this code fixes. 132 | * 133 | *

134 | * Just like java.util.Random, this generator accepts a long seed but doesn't use all of it. 135 | * java.util.Random uses 48 bits. The Mersenne Twister instead uses 32 bits (int size). So it's best 136 | * if your seed does not exceed the int range. 137 | * 138 | *

139 | * MersenneTwister can be used reliably on JDK version 1.1.5 or above. Earlier Java versions have 140 | * serious bugs in java.util.Random; only MersenneTwisterFast (and not MersenneTwister nor 141 | * java.util.Random) should be used with them. 142 | * 143 | *

License

144 | * 145 | * Copyright (c) 2003 by Sean Luke.
146 | * Portions copyright (c) 1993 by Michael Lecuyer.
147 | * All rights reserved.
148 | * 149 | *

150 | * Redistribution and use in source and binary forms, with or without modification, are permitted 151 | * provided that the following conditions are met: 152 | *

    153 | *
  • Redistributions of source code must retain the above copyright notice, this list of 154 | * conditions and the following disclaimer. 155 | *
  • Redistributions in binary form must reproduce the above copyright notice, this list of 156 | * conditions and the following disclaimer in the documentation and/or other materials provided with 157 | * the distribution. 158 | *
  • Neither the name of the copyright owners, their employers, nor the names of its contributors 159 | * may be used to endorse or promote products derived from this software without specific prior 160 | * written permission. 161 | *
162 | *

163 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 164 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 165 | * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNERS OR 166 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 167 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 168 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 169 | * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY 170 | * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 171 | * 172 | * @version 20 173 | */ 174 | 175 | public strictfp class MersenneTwister 176 | extends java.util.Random 177 | implements Serializable, Cloneable 178 | { 179 | // Serialization 180 | private static final long serialVersionUID = -4035832775130174188L; // locked as of Version 15 181 | 182 | // Period parameters 183 | private static final int N = 624; 184 | private static final int M = 397; 185 | private static final int MATRIX_A = 0x9908b0df; // private static final * constant vector a 186 | private static final int UPPER_MASK = 0x80000000; // most significant w-r bits 187 | private static final int LOWER_MASK = 0x7fffffff; // least significant r bits 188 | 189 | // Tempering parameters 190 | private static final int TEMPERING_MASK_B = 0x9d2c5680; 191 | private static final int TEMPERING_MASK_C = 0xefc60000; 192 | 193 | private int mt[]; // the array for the state vector 194 | private int mti; // mti==N+1 means mt[N] is not initialized 195 | private int mag01[]; 196 | 197 | // a good initial seed (of int size, though stored in a long) 198 | // private static final long GOOD_SEED = 4357; 199 | 200 | /* 201 | * implemented here because there's a bug in Random's implementation of the Gaussian code 202 | * (divide by zero, and log(0), ugh!), yet its gaussian variables are private so we can't access 203 | * them here. :-( 204 | */ 205 | 206 | private double __nextNextGaussian; 207 | private boolean __haveNextNextGaussian; 208 | 209 | /* We're overriding all internal data, to my knowledge, so this should be okay */ 210 | public Object clone() 211 | { 212 | try { 213 | MersenneTwister f = (MersenneTwister) (super.clone()); 214 | f.mt = (int[]) (mt.clone()); 215 | f.mag01 = (int[]) (mag01.clone()); 216 | return f; 217 | } 218 | catch (CloneNotSupportedException e) { 219 | throw new InternalError(); 220 | } // should never happen 221 | } 222 | 223 | public boolean stateEquals(Object o) 224 | { 225 | if (o == this) 226 | return true; 227 | if (o == null || !(o instanceof MersenneTwister)) 228 | return false; 229 | MersenneTwister other = (MersenneTwister) o; 230 | if (mti != other.mti) 231 | return false; 232 | for (int x = 0; x < mag01.length; x++) 233 | if (mag01[x] != other.mag01[x]) 234 | return false; 235 | for (int x = 0; x < mt.length; x++) 236 | if (mt[x] != other.mt[x]) 237 | return false; 238 | return true; 239 | } 240 | 241 | /** Reads the entire state of the MersenneTwister RNG from the stream */ 242 | public void readState(DataInputStream stream) 243 | throws IOException 244 | { 245 | int len = mt.length; 246 | for (int x = 0; x < len; x++) 247 | mt[x] = stream.readInt(); 248 | 249 | len = mag01.length; 250 | for (int x = 0; x < len; x++) 251 | mag01[x] = stream.readInt(); 252 | 253 | mti = stream.readInt(); 254 | __nextNextGaussian = stream.readDouble(); 255 | __haveNextNextGaussian = stream.readBoolean(); 256 | } 257 | 258 | /** Writes the entire state of the MersenneTwister RNG to the stream */ 259 | public void writeState(DataOutputStream stream) 260 | throws IOException 261 | { 262 | int len = mt.length; 263 | for (int x = 0; x < len; x++) 264 | stream.writeInt(mt[x]); 265 | 266 | len = mag01.length; 267 | for (int x = 0; x < len; x++) 268 | stream.writeInt(mag01[x]); 269 | 270 | stream.writeInt(mti); 271 | stream.writeDouble(__nextNextGaussian); 272 | stream.writeBoolean(__haveNextNextGaussian); 273 | } 274 | 275 | /** 276 | * Constructor using the default seed. 277 | */ 278 | public MersenneTwister() 279 | { 280 | this(System.currentTimeMillis()); 281 | } 282 | 283 | /** 284 | * Constructor using a given seed. Though you pass this seed in as a long, it's best to make 285 | * sure it's actually an integer. 286 | */ 287 | public MersenneTwister(long seed) 288 | { 289 | super(seed); /* just in case */ 290 | setSeed(seed); 291 | } 292 | 293 | /** 294 | * Constructor using an array of integers as seed. Your array must have a non-zero length. Only 295 | * the first 624 integers in the array are used; if the array is shorter than this then integers 296 | * are repeatedly used in a wrap-around fashion. 297 | */ 298 | public MersenneTwister(int[] array) 299 | { 300 | super(System.currentTimeMillis()); /* pick something at random just in case */ 301 | setSeed(array); 302 | } 303 | 304 | /** 305 | * Initalize the pseudo random number generator. Don't pass in a long that's bigger than an int 306 | * (Mersenne Twister only uses the first 32 bits for its seed). 307 | */ 308 | 309 | synchronized public void setSeed(long seed) 310 | { 311 | // it's always good style to call super 312 | super.setSeed(seed); 313 | 314 | // Due to a bug in java.util.Random clear up to 1.2, we're 315 | // doing our own Gaussian variable. 316 | __haveNextNextGaussian = false; 317 | 318 | mt = new int[N]; 319 | 320 | mag01 = new int[2]; 321 | mag01[0] = 0x0; 322 | mag01[1] = MATRIX_A; 323 | 324 | mt[0] = (int) (seed & 0xffffffff); 325 | mt[0] = (int) seed; 326 | for (mti = 1; mti < N; mti++) { 327 | mt[mti] = (1812433253 * (mt[mti - 1] ^ (mt[mti - 1] >>> 30)) + mti); 328 | /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */ 329 | /* In the previous versions, MSBs of the seed affect */ 330 | /* only MSBs of the array mt[]. */ 331 | /* 2002/01/09 modified by Makoto Matsumoto */ 332 | // mt[mti] &= 0xffffffff; 333 | /* for >32 bit machines */ 334 | } 335 | } 336 | 337 | /** 338 | * Sets the seed of the MersenneTwister using an array of integers. Your array must have a 339 | * non-zero length. Only the first 624 integers in the array are used; if the array is shorter 340 | * than this then integers are repeatedly used in a wrap-around fashion. 341 | */ 342 | 343 | synchronized public void setSeed(int[] array) 344 | { 345 | if (array.length == 0) 346 | throw new IllegalArgumentException("Array length must be greater than zero"); 347 | int i, j, k; 348 | setSeed(19650218); 349 | i = 1; 350 | j = 0; 351 | k = (N > array.length ? N : array.length); 352 | for (; k != 0; k--) { 353 | mt[i] = (mt[i] ^ ((mt[i - 1] ^ (mt[i - 1] >>> 30)) * 1664525)) + array[j] + j; /* 354 | * non 355 | * linear 356 | */ 357 | // mt[i] &= 0xffffffff; /* for WORDSIZE > 32 machines */ 358 | i++; 359 | j++; 360 | if (i >= N) { 361 | mt[0] = mt[N - 1]; 362 | i = 1; 363 | } 364 | if (j >= array.length) 365 | j = 0; 366 | } 367 | for (k = N - 1; k != 0; k--) { 368 | mt[i] = (mt[i] ^ ((mt[i - 1] ^ (mt[i - 1] >>> 30)) * 1566083941)) - i; /* non linear */ 369 | // mt[i] &= 0xffffffff; /* for WORDSIZE > 32 machines */ 370 | i++; 371 | if (i >= N) { 372 | mt[0] = mt[N - 1]; 373 | i = 1; 374 | } 375 | } 376 | mt[0] = 0x80000000; /* MSB is 1; assuring non-zero initial array */ 377 | } 378 | 379 | /** 380 | * Returns an integer with bits bits filled with a random number. 381 | */ 382 | synchronized protected int next(int bits) 383 | { 384 | int y; 385 | 386 | if (mti >= N) // generate N words at one time 387 | { 388 | int kk; 389 | final int[] mt = this.mt; // locals are slightly faster 390 | final int[] mag01 = this.mag01; // locals are slightly faster 391 | 392 | for (kk = 0; kk < N - M; kk++) { 393 | y = (mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK); 394 | mt[kk] = mt[kk + M] ^ (y >>> 1) ^ mag01[y & 0x1]; 395 | } 396 | for (; kk < N - 1; kk++) { 397 | y = (mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK); 398 | mt[kk] = mt[kk + (M - N)] ^ (y >>> 1) ^ mag01[y & 0x1]; 399 | } 400 | y = (mt[N - 1] & UPPER_MASK) | (mt[0] & LOWER_MASK); 401 | mt[N - 1] = mt[M - 1] ^ (y >>> 1) ^ mag01[y & 0x1]; 402 | 403 | mti = 0; 404 | } 405 | 406 | y = mt[mti++]; 407 | y ^= y >>> 11; // TEMPERING_SHIFT_U(y) 408 | y ^= (y << 7) & TEMPERING_MASK_B; // TEMPERING_SHIFT_S(y) 409 | y ^= (y << 15) & TEMPERING_MASK_C; // TEMPERING_SHIFT_T(y) 410 | y ^= (y >>> 18); // TEMPERING_SHIFT_L(y) 411 | 412 | return y >>> (32 - bits); // hope that's right! 413 | } 414 | 415 | /* 416 | * If you've got a truly old version of Java, you can omit these two next methods. 417 | */ 418 | 419 | private synchronized void writeObject(ObjectOutputStream out) 420 | throws IOException 421 | { 422 | // just so we're synchronized. 423 | out.defaultWriteObject(); 424 | } 425 | 426 | private synchronized void readObject(ObjectInputStream in) 427 | throws IOException, ClassNotFoundException 428 | { 429 | // just so we're synchronized. 430 | in.defaultReadObject(); 431 | } 432 | 433 | /** 434 | * This method is missing from jdk 1.0.x and below. JDK 1.1 includes this for us, but what the 435 | * heck. 436 | */ 437 | public boolean nextBoolean() 438 | { 439 | return next(1) != 0; 440 | } 441 | 442 | /** 443 | * This generates a coin flip with a probability probability of returning true, else 444 | * returning false. probability must be between 0.0 and 1.0, inclusive. Not as precise 445 | * a random real event as nextBoolean(double), but twice as fast. To explicitly use this, 446 | * remember you may need to cast to float first. 447 | */ 448 | 449 | public boolean nextBoolean(float probability) 450 | { 451 | if (probability < 0.0f || probability > 1.0f) 452 | throw new IllegalArgumentException("probability must be between 0.0 and 1.0 inclusive."); 453 | if (probability == 0.0f) 454 | return false; // fix half-open issues 455 | else if (probability == 1.0f) 456 | return true; // fix half-open issues 457 | return nextFloat() < probability; 458 | } 459 | 460 | /** 461 | * This generates a coin flip with a probability probability of returning true, else 462 | * returning false. probability must be between 0.0 and 1.0, inclusive. 463 | */ 464 | 465 | public boolean nextBoolean(double probability) 466 | { 467 | if (probability < 0.0 || probability > 1.0) 468 | throw new IllegalArgumentException("probability must be between 0.0 and 1.0 inclusive."); 469 | if (probability == 0.0) 470 | return false; // fix half-open issues 471 | else if (probability == 1.0) 472 | return true; // fix half-open issues 473 | return nextDouble() < probability; 474 | } 475 | 476 | /** 477 | * This method is missing from JDK 1.1 and below. JDK 1.2 includes this for us, but what the 478 | * heck. 479 | */ 480 | 481 | public int nextInt(int n) 482 | { 483 | if (n <= 0) 484 | throw new IllegalArgumentException("n must be positive, got: " + n); 485 | 486 | if ((n & -n) == n) 487 | return (int) ((n * (long) next(31)) >> 31); 488 | 489 | int bits, val; 490 | do { 491 | bits = next(31); 492 | val = bits % n; 493 | } 494 | while (bits - val + (n - 1) < 0); 495 | return val; 496 | } 497 | 498 | /** 499 | * This method is for completness' sake. Returns a long drawn uniformly from 0 to n-1. Suffice 500 | * it to say, n must be > 0, or an IllegalArgumentException is raised. 501 | */ 502 | 503 | public long nextLong(long n) 504 | { 505 | if (n <= 0) 506 | throw new IllegalArgumentException("n must be positive, got: " + n); 507 | 508 | long bits, val; 509 | do { 510 | bits = (nextLong() >>> 1); 511 | val = bits % n; 512 | } 513 | while (bits - val + (n - 1) < 0); 514 | return val; 515 | } 516 | 517 | /** 518 | * A bug fix for versions of JDK 1.1 and below. JDK 1.2 fixes this for us, but what the heck. 519 | */ 520 | public double nextDouble() 521 | { 522 | return (((long) next(26) << 27) + next(27)) / (double) (1L << 53); 523 | } 524 | 525 | /** 526 | * Returns a double in the range from 0.0 to 1.0, possibly inclusive of 0.0 and 1.0 themselves. 527 | * Thus: 528 | * 529 | *

530 | * 531 | * 535 | * 538 | * 541 | * 544 | *
532 | * Expression 533 | * Interval 534 | *
nextDouble(false, false) 536 | * (0.0, 1.0) 537 | *
nextDouble(true, false) 539 | * [0.0, 1.0) 540 | *
nextDouble(false, true) 542 | * (0.0, 1.0] 543 | *
nextDouble(true, true) 545 | * [0.0, 1.0] 546 | *
547 | * 548 | *

549 | * This version preserves all possible random values in the double range. 550 | */ 551 | public double nextDouble(boolean includeZero, boolean includeOne) 552 | { 553 | double d = 0.0; 554 | do { 555 | d = nextDouble(); // grab a value, initially from half-open [0.0, 1.0) 556 | if (includeOne && nextBoolean()) 557 | d += 1.0; // if includeOne, with 1/2 probability, push to [1.0, 2.0) 558 | } 559 | while ((d > 1.0) || // everything above 1.0 is always invalid 560 | (!includeZero && d == 0.0)); // if we're not including zero, 0.0 is invalid 561 | return d; 562 | } 563 | 564 | /** 565 | * A bug fix for versions of JDK 1.1 and below. JDK 1.2 fixes this for us, but what the heck. 566 | */ 567 | 568 | public float nextFloat() 569 | { 570 | return next(24) / ((float) (1 << 24)); 571 | } 572 | 573 | /** 574 | * Returns a float in the range from 0.0f to 1.0f, possibly inclusive of 0.0f and 1.0f 575 | * themselves. Thus: 576 | * 577 | *

578 | * 579 | * 583 | * 586 | * 589 | * 592 | *
580 | * Expression 581 | * Interval 582 | *
nextFloat(false, false) 584 | * (0.0f, 1.0f) 585 | *
nextFloat(true, false) 587 | * [0.0f, 1.0f) 588 | *
nextFloat(false, true) 590 | * (0.0f, 1.0f] 591 | *
nextFloat(true, true) 593 | * [0.0f, 1.0f] 594 | *
595 | * 596 | *

597 | * This version preserves all possible random values in the float range. 598 | */ 599 | public float nextFloat(boolean includeZero, boolean includeOne) 600 | { 601 | float d = 0.0f; 602 | do { 603 | d = nextFloat(); // grab a value, initially from half-open [0.0f, 1.0f) 604 | if (includeOne && nextBoolean()) 605 | d += 1.0f; // if includeOne, with 1/2 probability, push to [1.0f, 2.0f) 606 | } 607 | while ((d > 1.0f) || // everything above 1.0f is always invalid 608 | (!includeZero && d == 0.0f)); // if we're not including zero, 0.0f is invalid 609 | return d; 610 | } 611 | 612 | /** 613 | * A bug fix for all versions of the JDK. The JDK appears to use all four bytes in an integer as 614 | * independent byte values! Totally wrong. I've submitted a bug report. 615 | */ 616 | 617 | public void nextBytes(byte[] bytes) 618 | { 619 | for (int x = 0; x < bytes.length; x++) 620 | bytes[x] = (byte) next(8); 621 | } 622 | 623 | /** For completeness' sake, though it's not in java.util.Random. */ 624 | 625 | public char nextChar() 626 | { 627 | // chars are 16-bit UniCode values 628 | return (char) (next(16)); 629 | } 630 | 631 | /** For completeness' sake, though it's not in java.util.Random. */ 632 | 633 | public short nextShort() 634 | { 635 | return (short) (next(16)); 636 | } 637 | 638 | /** For completeness' sake, though it's not in java.util.Random. */ 639 | 640 | public byte nextByte() 641 | { 642 | return (byte) (next(8)); 643 | } 644 | 645 | /** 646 | * A bug fix for all JDK code including 1.2. nextGaussian can theoretically ask for the log of 0 647 | * and divide it by 0! See Java bug 649 | * http://developer.java.sun.com/developer/bugParade/bugs/4254501.html 650 | */ 651 | 652 | synchronized public double nextGaussian() 653 | { 654 | if (__haveNextNextGaussian) { 655 | __haveNextNextGaussian = false; 656 | return __nextNextGaussian; 657 | } 658 | else { 659 | double v1, v2, s; 660 | do { 661 | v1 = 2 * nextDouble() - 1; // between -1.0 and 1.0 662 | v2 = 2 * nextDouble() - 1; // between -1.0 and 1.0 663 | s = v1 * v1 + v2 * v2; 664 | } 665 | while (s >= 1 || s == 0); 666 | double multiplier = StrictMath.sqrt(-2 * StrictMath.log(s) / s); 667 | __nextNextGaussian = v2 * multiplier; 668 | __haveNextNextGaussian = true; 669 | return v1 * multiplier; 670 | } 671 | } 672 | 673 | /** 674 | * Tests the code. 675 | */ 676 | public static void main(String args[]) 677 | { 678 | int j; 679 | 680 | MersenneTwister r; 681 | 682 | // CORRECTNESS TEST 683 | // COMPARE WITH http://www.math.keio.ac.jp/matumoto/CODES/MT2002/mt19937ar.out 684 | 685 | r = new MersenneTwister(new int[] { 0x123, 0x234, 0x345, 0x456 }); 686 | System.out.println("Output of MersenneTwister with new (2002/1/26) seeding mechanism"); 687 | for (j = 0; j < 1000; j++) { 688 | // first, convert the int from signed to "unsigned" 689 | long l = (long) r.nextInt(); 690 | if (l < 0) 691 | l += 4294967296L; // max int value 692 | String s = String.valueOf(l); 693 | while (s.length() < 10) 694 | s = " " + s; // buffer 695 | System.out.print(s + " "); 696 | if (j % 5 == 4) 697 | System.out.println(); 698 | } 699 | 700 | // SPEED TEST 701 | 702 | final long SEED = 4357; 703 | 704 | int xx; 705 | long ms; 706 | System.out.println("\nTime to test grabbing 100000000 ints"); 707 | 708 | r = new MersenneTwister(SEED); 709 | ms = System.currentTimeMillis(); 710 | xx = 0; 711 | for (j = 0; j < 100000000; j++) 712 | xx += r.nextInt(); 713 | System.out.println("Mersenne Twister: " + (System.currentTimeMillis() - ms) 714 | + " Ignore this: " + xx); 715 | 716 | System.out 717 | .println("To compare this with java.util.Random, run this same test on MersenneTwisterFast."); 718 | System.out 719 | .println("The comparison with Random is removed from MersenneTwister because it is a proper"); 720 | System.out 721 | .println("subclass of Random and this unfairly makes some of Random's methods un-inlinable,"); 722 | System.out.println("so it would make Random look worse than it is."); 723 | 724 | // TEST TO COMPARE TYPE CONVERSION BETWEEN 725 | // MersenneTwisterFast.java AND MersenneTwister.java 726 | 727 | System.out.println("\nGrab the first 1000 booleans"); 728 | r = new MersenneTwister(SEED); 729 | for (j = 0; j < 1000; j++) { 730 | System.out.print(r.nextBoolean() + " "); 731 | if (j % 8 == 7) 732 | System.out.println(); 733 | } 734 | if (!(j % 8 == 7)) 735 | System.out.println(); 736 | 737 | System.out 738 | .println("\nGrab 1000 booleans of increasing probability using nextBoolean(double)"); 739 | r = new MersenneTwister(SEED); 740 | for (j = 0; j < 1000; j++) { 741 | System.out.print(r.nextBoolean((double) (j / 999.0)) + " "); 742 | if (j % 8 == 7) 743 | System.out.println(); 744 | } 745 | if (!(j % 8 == 7)) 746 | System.out.println(); 747 | 748 | System.out 749 | .println("\nGrab 1000 booleans of increasing probability using nextBoolean(float)"); 750 | r = new MersenneTwister(SEED); 751 | for (j = 0; j < 1000; j++) { 752 | System.out.print(r.nextBoolean((float) (j / 999.0f)) + " "); 753 | if (j % 8 == 7) 754 | System.out.println(); 755 | } 756 | if (!(j % 8 == 7)) 757 | System.out.println(); 758 | 759 | byte[] bytes = new byte[1000]; 760 | System.out.println("\nGrab the first 1000 bytes using nextBytes"); 761 | r = new MersenneTwister(SEED); 762 | r.nextBytes(bytes); 763 | for (j = 0; j < 1000; j++) { 764 | System.out.print(bytes[j] + " "); 765 | if (j % 16 == 15) 766 | System.out.println(); 767 | } 768 | if (!(j % 16 == 15)) 769 | System.out.println(); 770 | 771 | byte b; 772 | System.out.println("\nGrab the first 1000 bytes -- must be same as nextBytes"); 773 | r = new MersenneTwister(SEED); 774 | for (j = 0; j < 1000; j++) { 775 | System.out.print((b = r.nextByte()) + " "); 776 | if (b != bytes[j]) 777 | System.out.print("BAD "); 778 | if (j % 16 == 15) 779 | System.out.println(); 780 | } 781 | if (!(j % 16 == 15)) 782 | System.out.println(); 783 | 784 | System.out.println("\nGrab the first 1000 shorts"); 785 | r = new MersenneTwister(SEED); 786 | for (j = 0; j < 1000; j++) { 787 | System.out.print(r.nextShort() + " "); 788 | if (j % 8 == 7) 789 | System.out.println(); 790 | } 791 | if (!(j % 8 == 7)) 792 | System.out.println(); 793 | 794 | System.out.println("\nGrab the first 1000 ints"); 795 | r = new MersenneTwister(SEED); 796 | for (j = 0; j < 1000; j++) { 797 | System.out.print(r.nextInt() + " "); 798 | if (j % 4 == 3) 799 | System.out.println(); 800 | } 801 | if (!(j % 4 == 3)) 802 | System.out.println(); 803 | 804 | System.out.println("\nGrab the first 1000 ints of different sizes"); 805 | r = new MersenneTwister(SEED); 806 | int max = 1; 807 | for (j = 0; j < 1000; j++) { 808 | System.out.print(r.nextInt(max) + " "); 809 | max *= 2; 810 | if (max <= 0) 811 | max = 1; 812 | if (j % 4 == 3) 813 | System.out.println(); 814 | } 815 | if (!(j % 4 == 3)) 816 | System.out.println(); 817 | 818 | System.out.println("\nGrab the first 1000 longs"); 819 | r = new MersenneTwister(SEED); 820 | for (j = 0; j < 1000; j++) { 821 | System.out.print(r.nextLong() + " "); 822 | if (j % 3 == 2) 823 | System.out.println(); 824 | } 825 | if (!(j % 3 == 2)) 826 | System.out.println(); 827 | 828 | System.out.println("\nGrab the first 1000 longs of different sizes"); 829 | r = new MersenneTwister(SEED); 830 | long max2 = 1; 831 | for (j = 0; j < 1000; j++) { 832 | System.out.print(r.nextLong(max2) + " "); 833 | max2 *= 2; 834 | if (max2 <= 0) 835 | max2 = 1; 836 | if (j % 4 == 3) 837 | System.out.println(); 838 | } 839 | if (!(j % 4 == 3)) 840 | System.out.println(); 841 | 842 | System.out.println("\nGrab the first 1000 floats"); 843 | r = new MersenneTwister(SEED); 844 | for (j = 0; j < 1000; j++) { 845 | System.out.print(r.nextFloat() + " "); 846 | if (j % 4 == 3) 847 | System.out.println(); 848 | } 849 | if (!(j % 4 == 3)) 850 | System.out.println(); 851 | 852 | System.out.println("\nGrab the first 1000 doubles"); 853 | r = new MersenneTwister(SEED); 854 | for (j = 0; j < 1000; j++) { 855 | System.out.print(r.nextDouble() + " "); 856 | if (j % 3 == 2) 857 | System.out.println(); 858 | } 859 | if (!(j % 3 == 2)) 860 | System.out.println(); 861 | 862 | System.out.println("\nGrab the first 1000 gaussian doubles"); 863 | r = new MersenneTwister(SEED); 864 | for (j = 0; j < 1000; j++) { 865 | System.out.print(r.nextGaussian() + " "); 866 | if (j % 3 == 2) 867 | System.out.println(); 868 | } 869 | if (!(j % 3 == 2)) 870 | System.out.println(); 871 | 872 | } 873 | 874 | } 875 | --------------------------------------------------------------------------------