├── .gitignore ├── CHANGELOG.md ├── README.md ├── Rakefile ├── VERSION.yml ├── ext └── lda-ruby │ ├── Makefile │ ├── cokus.c │ ├── cokus.h │ ├── extconf.rb │ ├── lda-alpha.c │ ├── lda-alpha.h │ ├── lda-data.c │ ├── lda-data.h │ ├── lda-inference.c │ ├── lda-inference.h │ ├── lda-model.c │ ├── lda-model.h │ ├── lda.h │ ├── utils.c │ └── utils.h ├── lda-ruby.gemspec ├── lib ├── lda-ruby.rb └── lda-ruby │ ├── config │ └── stopwords.yml │ ├── corpus │ ├── corpus.rb │ ├── data_corpus.rb │ ├── directory_corpus.rb │ └── text_corpus.rb │ ├── document │ ├── data_document.rb │ ├── document.rb │ └── text_document.rb │ └── vocabulary.rb ├── license.txt └── test ├── data ├── .gitignore ├── docs.dat ├── sample.rb └── wiki-test-docs.yml ├── lda_ruby_test.rb ├── simple_test.rb ├── simple_yaml.rb └── test_helper.rb /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.bundle 3 | *.tmproj 4 | pkg 5 | test/blei 6 | lda-ruby-*.gem 7 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | version 0.3.9 2 | ============= 3 | 4 | - merge pull request from @rishabh-tripathi allowing text corpus objects to also be built with an array of strings 5 | - couple minor code refinements 6 | 7 | version 0.3.8 8 | ============= 9 | 10 | - tokenization changes to support German (courtesy of @LeFnord) 11 | - user defined stop word list (also via @LeFnord) 12 | 13 | version 0.3.7 14 | ============= 15 | 16 | - change stop word removal back (optimization) 17 | 18 | version 0.3.6 19 | ============= 20 | 21 | - added stopwords list and included downcasing to improve performance 22 | 23 | version 0.3.5 24 | ============= 25 | 26 | - Bug fix for text documents by Rio Akasaka 27 | 28 | Version 0.3.4 29 | ============= 30 | 31 | - Bug fix by Rio Akasaka, fixes issues with segfaults under Ruby 1.9.2 32 | 33 | Version 0.3.1 34 | 
============= 35 | 36 | - top_words method now returns actual words if they exist in the vocabulary 37 | 38 | Version 0.3.0 39 | ============= 40 | 41 | - Completely broke backwards compatibility 42 | - Reworked many classes to make functionality more reasonable 43 | - Added ability to load documents from text files 44 | 45 | Version 0.2.3 46 | ============= 47 | 48 | - Bug fixes by Todd Foster 49 | 50 | Version 0.2.2 51 | ============= 52 | 53 | - First stable release 54 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Latent Dirichlet Allocation – Ruby Wrapper 2 | 3 | ## What is LDA-Ruby? 4 | 5 | This wrapper is based on C-code by David M. Blei. In a nutshell, it can be used to automatically cluster documents into topics. The number of topics is chosen beforehand and the topics found are usually fairly intuitive. Details of the implementation can be found in the paper by Blei, Ng, and Jordan. 6 | 7 | The original C code relied on files for the input and output. We felt it was necessary to depart from that model and use Ruby objects for these steps instead. The only file necessary will be the data file (in a format similar to that used by [SVMlight][svmlight]). Optionally you may need a vocabulary file to be able to extract the words belonging to topics. 8 | 9 | ### Example usage: 10 | 11 | require 'lda-ruby' 12 | corpus = Lda::DataCorpus.new("data/data_file.dat") 13 | lda = Lda::Lda.new(corpus) # create an Lda object for training 14 | lda.em("random") # run EM algorithm using random starting points 15 | lda.load_vocabulary("data/vocab.txt") 16 | lda.print_topics(20) # print all topics with up to 20 words per topic 17 | 18 | If you have general questions about Latent Dirichlet Allocation, I urge you to use the [topic models mailing list][topic-models], since the people who monitor that are very knowledgeable.
If you encounter bugs specific to lda-ruby, please post an issue on the Github project. 19 | 20 | ## Resources 21 | 22 | + [Blog post about LDA-Ruby][lda-ruby] 23 | + [David Blei's lda-c code][blei] 24 | + [Wikipedia article on LDA][wikipedia] 25 | + [Sample AP data][ap-data] 26 | 27 | ## References 28 | 29 | Blei, David M., Ng, Andrew Y., and Jordan, Michael I. 2003. Latent dirichlet allocation. Journal of Machine Learning Research. 3 (Mar. 2003), 993-1022 [[pdf][pdf]]. 30 | 31 | [svmlight]: http://svmlight.joachims.org 32 | [lda-ruby]: http://web.archive.org/web/20120616115448/http://mendicantbug.com/2008/11/17/lda-in-ruby/ 33 | [blei]: http://web.archive.org/web/20161126004857/http://www.cs.princeton.edu/~blei/lda-c/ 34 | [wikipedia]: http://en.wikipedia.org/wiki/Latent_Dirichlet_allocation 35 | [ap-data]: http://web.archive.org/web/20160507090044/http://www.cs.princeton.edu/~blei/lda-c/ap.tgz 36 | [pdf]: http://www.cs.princeton.edu/picasso/mats/BleiNgJordan2003_blei.pdf 37 | [topic-models]: https://lists.cs.princeton.edu/mailman/listinfo/topic-models 38 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require 'rubygems' 2 | require 'rake' 3 | require 'yaml' 4 | 5 | begin 6 | require 'jeweler' 7 | Jeweler::Tasks.new do |gem| 8 | gem.name = "lda-ruby" 9 | gem.summary = %Q{Ruby port of Latent Dirichlet Allocation by David M. Blei.} 10 | gem.description = %Q{Ruby port of Latent Dirichlet Allocation by David M. Blei. See http://www.cs.princeton.edu/~blei/lda-c/.} 11 | gem.email = "jasonmadams@gmail.com" 12 | gem.homepage = "http://github.com/ealdent/lda-ruby" 13 | gem.authors = ['David Blei', 'Jason Adams', 'Rio Akasaka'] 14 | gem.extensions = ['ext/lda-ruby/extconf.rb'] 15 | gem.files.include 'stopwords.txt' 16 | gem.require_paths = ['lib', 'ext'] 17 | gem.add_dependency 'shoulda' 18 | # gem is a Gem::Specification... 
see http://www.rubygems.org/read/chapter/20 for additional settings 19 | end 20 | 21 | rescue LoadError 22 | puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler" 23 | end 24 | 25 | require 'rake/testtask' 26 | Rake::TestTask.new(:test) do |test| 27 | test.libs << 'lib' << 'test' 28 | test.pattern = 'test/**/*_test.rb' 29 | test.verbose = true 30 | end 31 | 32 | begin 33 | require 'rcov/rcovtask' 34 | Rcov::RcovTask.new do |test| 35 | test.libs << 'test' 36 | test.pattern = 'test/**/*_test.rb' 37 | test.verbose = true 38 | end 39 | rescue LoadError 40 | task :rcov do 41 | abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov" 42 | end 43 | end 44 | 45 | task :default => :test 46 | 47 | require 'rake/rdoctask' 48 | Rake::RDocTask.new do |rdoc| 49 | if File.exist?('VERSION.yml') 50 | config = YAML.load(File.read('VERSION.yml')) 51 | version = "#{config[:major]}.#{config[:minor]}.#{config[:patch]}" 52 | else 53 | version = "" 54 | end 55 | 56 | rdoc.rdoc_dir = 'rdoc' 57 | rdoc.title = "lda-ruby #{version}" 58 | rdoc.rdoc_files.include('README*') 59 | rdoc.rdoc_files.include('lib/**/*.rb') 60 | end 61 | 62 | -------------------------------------------------------------------------------- /VERSION.yml: -------------------------------------------------------------------------------- 1 | --- 2 | :major: 0 3 | :minor: 3 4 | :patch: 9 5 | :build: 6 | -------------------------------------------------------------------------------- /ext/lda-ruby/Makefile: -------------------------------------------------------------------------------- 1 | 2 | SHELL = /bin/sh 3 | 4 | #### Start of system configuration section. #### 5 | 6 | srcdir = . 
7 | topdir = /home/taf2/.local/include/ruby-1.9.1 8 | hdrdir = /home/taf2/.local/include/ruby-1.9.1 9 | arch_hdrdir = /home/taf2/.local/include/ruby-1.9.1/$(arch) 10 | VPATH = $(srcdir):$(arch_hdrdir)/ruby:$(hdrdir)/ruby 11 | prefix = $(DESTDIR)/home/taf2/.local 12 | exec_prefix = $(prefix) 13 | vendorhdrdir = $(rubyhdrdir)/vendor_ruby 14 | sitehdrdir = $(rubyhdrdir)/site_ruby 15 | rubyhdrdir = $(includedir)/$(RUBY_INSTALL_NAME)-$(ruby_version) 16 | vendordir = $(libdir)/$(RUBY_INSTALL_NAME)/vendor_ruby 17 | sitedir = $(libdir)/$(RUBY_INSTALL_NAME)/site_ruby 18 | mandir = $(datarootdir)/man 19 | localedir = $(datarootdir)/locale 20 | libdir = $(exec_prefix)/lib 21 | psdir = $(docdir) 22 | pdfdir = $(docdir) 23 | dvidir = $(docdir) 24 | htmldir = $(docdir) 25 | infodir = $(datarootdir)/info 26 | docdir = $(datarootdir)/doc/$(PACKAGE) 27 | oldincludedir = $(DESTDIR)/usr/include 28 | includedir = $(prefix)/include 29 | localstatedir = $(prefix)/var 30 | sharedstatedir = $(prefix)/com 31 | sysconfdir = $(prefix)/etc 32 | datadir = $(datarootdir) 33 | datarootdir = $(prefix)/share 34 | libexecdir = $(exec_prefix)/libexec 35 | sbindir = $(exec_prefix)/sbin 36 | bindir = $(exec_prefix)/bin 37 | rubylibdir = $(libdir)/$(ruby_install_name)/$(ruby_version) 38 | archdir = $(rubylibdir)/$(arch) 39 | sitelibdir = $(sitedir)/$(ruby_version) 40 | sitearchdir = $(sitelibdir)/$(sitearch) 41 | vendorlibdir = $(vendordir)/$(ruby_version) 42 | vendorarchdir = $(vendorlibdir)/$(sitearch) 43 | 44 | CC = gcc 45 | CXX = g++ 46 | LIBRUBY = $(LIBRUBY_SO) 47 | LIBRUBY_A = lib$(RUBY_SO_NAME)-static.a 48 | LIBRUBYARG_SHARED = -Wl,-R -Wl,$(libdir) -L$(libdir) -l$(RUBY_SO_NAME) 49 | LIBRUBYARG_STATIC = -Wl,-R -Wl,$(libdir) -L$(libdir) -l$(RUBY_SO_NAME)-static 50 | OUTFLAG = -o 51 | COUTFLAG = -o 52 | 53 | RUBY_EXTCONF_H = 54 | cflags = $(optflags) $(debugflags) $(warnflags) 55 | optflags = -O0 56 | debugflags = -g3 -ggdb 57 | warnflags = -Wall -Wno-parentheses 58 | CFLAGS = -fPIC $(cflags) -fPIC 
-Wall -ggdb -O0 59 | INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir) 60 | DEFS = 61 | CPPFLAGS = -D USE_RUBY $(DEFS) $(cppflags) 62 | CXXFLAGS = $(CFLAGS) $(cxxflags) 63 | ldflags = -L. -rdynamic -Wl,-export-dynamic 64 | dldflags = 65 | archflag = 66 | DLDFLAGS = $(ldflags) $(dldflags) $(archflag) 67 | LDSHARED = $(CC) -shared 68 | LDSHAREDXX = $(CXX) -shared 69 | AR = ar 70 | EXEEXT = 71 | 72 | RUBY_INSTALL_NAME = ruby 73 | RUBY_SO_NAME = ruby 74 | arch = x86_64-linux 75 | sitearch = x86_64-linux 76 | ruby_version = 1.9.1 77 | ruby = /home/taf2/.local/bin/ruby 78 | RUBY = $(ruby) 79 | RM = rm -f 80 | RM_RF = $(RUBY) -run -e rm -- -rf 81 | RMDIRS = $(RUBY) -run -e rmdir -- -p 82 | MAKEDIRS = mkdir -p 83 | INSTALL = /usr/bin/install -c 84 | INSTALL_PROG = $(INSTALL) -m 0755 85 | INSTALL_DATA = $(INSTALL) -m 644 86 | COPY = cp 87 | 88 | #### End of system configuration section. #### 89 | 90 | preload = 91 | 92 | libpath = . $(libdir) 93 | LIBPATH = -L. -L$(libdir) -Wl,-R$(libdir) 94 | DEFFILE = 95 | 96 | CLEANFILES = mkmf.log 97 | DISTCLEANFILES = 98 | DISTCLEANDIRS = 99 | 100 | extout = 101 | extout_prefix = 102 | target_prefix = 103 | LOCAL_LIBS = 104 | LIBS = $(LIBRUBYARG_SHARED) -lpthread -lrt -ldl -lcrypt -lm -lc 105 | SRCS = lda-model.c lda-data.c utils.c lda-alpha.c cokus.c lda-inference.c 106 | OBJS = lda-model.o lda-data.o utils.o lda-alpha.o cokus.o lda-inference.o 107 | TARGET = lda_ext 108 | DLLIB = $(TARGET).so 109 | EXTSTATIC = 110 | STATIC_LIB = 111 | 112 | BINDIR = $(bindir) 113 | RUBYCOMMONDIR = $(sitedir)$(target_prefix) 114 | RUBYLIBDIR = $(sitelibdir)$(target_prefix) 115 | RUBYARCHDIR = $(sitearchdir)$(target_prefix) 116 | HDRDIR = $(rubyhdrdir)/ruby$(target_prefix) 117 | ARCHHDRDIR = $(rubyhdrdir)/$(arch)/ruby$(target_prefix) 118 | 119 | TARGET_SO = $(DLLIB) 120 | CLEANLIBS = $(TARGET).so 121 | CLEANOBJS = *.o *.bak 122 | 123 | all: $(DLLIB) 124 | static: $(STATIC_LIB) 125 | 126 | clean-rb-default:: 127 | 
clean-rb:: 128 | clean-so:: 129 | clean: clean-so clean-rb-default clean-rb 130 | @-$(RM) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES) 131 | 132 | distclean-rb-default:: 133 | distclean-rb:: 134 | distclean-so:: 135 | distclean: clean distclean-so distclean-rb-default distclean-rb 136 | @-$(RM) Makefile $(RUBY_EXTCONF_H) conftest.* mkmf.log 137 | @-$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES) 138 | @-$(RMDIRS) $(DISTCLEANDIRS) 139 | 140 | realclean: distclean 141 | install: install-so install-rb 142 | 143 | install-so: $(RUBYARCHDIR) 144 | install-so: $(RUBYARCHDIR)/$(DLLIB) 145 | $(RUBYARCHDIR)/$(DLLIB): $(DLLIB) 146 | $(INSTALL_PROG) $(DLLIB) $(RUBYARCHDIR) 147 | install-rb: pre-install-rb install-rb-default 148 | install-rb-default: pre-install-rb-default 149 | pre-install-rb: Makefile 150 | pre-install-rb-default: Makefile 151 | $(RUBYARCHDIR): 152 | $(MAKEDIRS) $@ 153 | 154 | site-install: site-install-so site-install-rb 155 | site-install-so: install-so 156 | site-install-rb: install-rb 157 | 158 | .SUFFIXES: .c .m .cc .cxx .cpp .C .o 159 | 160 | .cc.o: 161 | $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $< 162 | 163 | .cxx.o: 164 | $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $< 165 | 166 | .cpp.o: 167 | $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $< 168 | 169 | .C.o: 170 | $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $< 171 | 172 | .c.o: 173 | $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $< 174 | 175 | $(DLLIB): $(OBJS) Makefile 176 | @-$(RM) $(@) 177 | $(LDSHARED) -o $@ $(OBJS) $(LIBPATH) $(DLDFLAGS) $(LOCAL_LIBS) $(LIBS) 178 | 179 | 180 | 181 | $(OBJS): $(hdrdir)/ruby.h $(hdrdir)/ruby/defines.h $(arch_hdrdir)/ruby/config.h 182 | -------------------------------------------------------------------------------- /ext/lda-ruby/cokus.c: -------------------------------------------------------------------------------- 1 | // This is the ``Mersenne Twister'' random number generator MT19937, which 
2 | // generates pseudorandom integers uniformly distributed in 0..(2^32 - 1) 3 | // starting from any odd seed in 0..(2^32 - 1). This version is a recode 4 | // by Shawn Cokus (Cokus@math.washington.edu) on March 8, 1998 of a version by 5 | // Takuji Nishimura (who had suggestions from Topher Cooper and Marc Rieffel in 6 | // July-August 1997). 7 | // 8 | // Effectiveness of the recoding (on Goedel2.math.washington.edu, a DEC Alpha 9 | // running OSF/1) using GCC -O3 as a compiler: before recoding: 51.6 sec. to 10 | // generate 300 million random numbers; after recoding: 24.0 sec. for the same 11 | // (i.e., 46.5% of original time), so speed is now about 12.5 million random 12 | // number generations per second on this machine. 13 | // 14 | // According to the URL http://www.math.keio.ac.jp/~matumoto/emt.html 15 | // (and paraphrasing a bit in places), the Mersenne Twister is ``designed 16 | // with consideration of the flaws of various existing generators,'' has 17 | // a period of 2^19937 - 1, gives a sequence that is 623-dimensionally 18 | // equidistributed, and ``has passed many stringent tests, including the 19 | // die-hard test of G. Marsaglia and the load test of P. Hellekalek and 20 | // S. Wegenkittl.'' It is efficient in memory usage (typically using 2506 21 | // to 5012 bytes of static data, depending on data type sizes, and the code 22 | // is quite short as well). It generates random numbers in batches of 624 23 | // at a time, so the caching and pipelining of modern systems is exploited. 24 | // It is also divide- and mod-free. 25 | // 26 | // This library is free software; you can redistribute it and/or modify it 27 | // under the terms of the GNU Library General Public License as published by 28 | // the Free Software Foundation (either version 2 of the License or, at your 29 | // option, any later version). 
This library is distributed in the hope that 30 | // it will be useful, but WITHOUT ANY WARRANTY, without even the implied 31 | // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See 32 | // the GNU Library General Public License for more details. You should have 33 | // received a copy of the GNU Library General Public License along with this 34 | // library; if not, write to the Free Software Foundation, Inc., 59 Temple 35 | // Place, Suite 330, Boston, MA 02111-1307, USA. 36 | // 37 | // The code as Shawn received it included the following notice: 38 | // 39 | // Copyright (C) 1997 Makoto Matsumoto and Takuji Nishimura. When 40 | // you use this, send an e-mail to matumoto@math.keio.ac.jp with 41 | // an appropriate reference to your work. 42 | // 43 | // It would be nice to CC: Cokus@math.washington.edu when you write. 44 | // 45 | 46 | #include "cokus.h" 47 | 48 | static uint32 state[N+1]; // state vector + 1 extra to not violate ANSI C 49 | static uint32 *next; // next random value is computed from here 50 | static int left = -1; // can *next++ this many times before reloading 51 | 52 | void seedMT(uint32 seed) 53 | { 54 | // 55 | // We initialize state[0..(N-1)] via the generator 56 | // 57 | // x_new = (69069 * x_old) mod 2^32 58 | // 59 | // from Line 15 of Table 1, p. 106, Sec. 3.3.4 of Knuth's 60 | // _The Art of Computer Programming_, Volume 2, 3rd ed. 61 | // 62 | // Notes (SJC): I do not know what the initial state requirements 63 | // of the Mersenne Twister are, but it seems this seeding generator 64 | // could be better. It achieves the maximum period for its modulus 65 | // (2^30) iff x_initial is odd (p. 20-21, Sec. 3.2.1.2, Knuth); if 66 | // x_initial can be even, you have sequences like 0, 0, 0, ...; 67 | // 2^31, 2^31, 2^31, ...; 2^30, 2^30, 2^30, ...; 2^29, 2^29 + 2^31, 68 | // 2^29, 2^29 + 2^31, ..., etc. so I force seed to be odd below. 
69 | // 70 | // Even if x_initial is odd, if x_initial is 1 mod 4 then 71 | // 72 | // the lowest bit of x is always 1, 73 | // the next-to-lowest bit of x is always 0, 74 | // the 2nd-from-lowest bit of x alternates ... 0 1 0 1 0 1 0 1 ... , 75 | // the 3rd-from-lowest bit of x 4-cycles ... 0 1 1 0 0 1 1 0 ... , 76 | // the 4th-from-lowest bit of x has the 8-cycle ... 0 0 0 1 1 1 1 0 ... , 77 | // ... 78 | // 79 | // and if x_initial is 3 mod 4 then 80 | // 81 | // the lowest bit of x is always 1, 82 | // the next-to-lowest bit of x is always 1, 83 | // the 2nd-from-lowest bit of x alternates ... 0 1 0 1 0 1 0 1 ... , 84 | // the 3rd-from-lowest bit of x 4-cycles ... 0 0 1 1 0 0 1 1 ... , 85 | // the 4th-from-lowest bit of x has the 8-cycle ... 0 0 1 1 1 1 0 0 ... , 86 | // ... 87 | // 88 | // The generator's potency (min. s>=0 with (69069-1)^s = 0 mod 2^32) is 89 | // 16, which seems to be alright by p. 25, Sec. 3.2.1.3 of Knuth. It 90 | // also does well in the dimension 2..5 spectral tests, but it could be 91 | // better in dimension 6 (Line 15, Table 1, p. 106, Sec. 3.3.4, Knuth). 92 | // 93 | // Note that the random number user does not see the values generated 94 | // here directly since reloadMT() will always munge them first, so maybe 95 | // none of all of this matters. In fact, the seed values made here could 96 | // even be extra-special desirable if the Mersenne Twister theory says 97 | // so-- that's why the only change I made is to restrict to odd seeds. 
98 | // 99 | 100 | register uint32 x = (seed | 1U) & 0xFFFFFFFFU, *s = state; 101 | register int j; 102 | 103 | for(left=0, *s++=x, j=N; --j; 104 | *s++ = (x*=69069U) & 0xFFFFFFFFU); 105 | } 106 | 107 | 108 | uint32 reloadMT(void) 109 | { 110 | register uint32 *p0=state, *p2=state+2, *pM=state+M, s0, s1; 111 | register int j; 112 | 113 | if(left < -1) 114 | seedMT(4357U); 115 | 116 | left=N-1, next=state+1; 117 | 118 | for(s0=state[0], s1=state[1], j=N-M+1; --j; s0=s1, s1=*p2++) 119 | *p0++ = *pM++ ^ (mixBits(s0, s1) >> 1) ^ (loBit(s1) ? K : 0U); 120 | 121 | for(pM=state, j=M; --j; s0=s1, s1=*p2++) 122 | *p0++ = *pM++ ^ (mixBits(s0, s1) >> 1) ^ (loBit(s1) ? K : 0U); 123 | 124 | s1=state[0], *p0 = *pM ^ (mixBits(s0, s1) >> 1) ^ (loBit(s1) ? K : 0U); 125 | s1 ^= (s1 >> 11); 126 | s1 ^= (s1 << 7) & 0x9D2C5680U; 127 | s1 ^= (s1 << 15) & 0xEFC60000U; 128 | return(s1 ^ (s1 >> 18)); 129 | } 130 | 131 | uint32 randomMT(void) 132 | { 133 | uint32 y; 134 | 135 | if(--left < 0) 136 | return(reloadMT()); 137 | 138 | y = *next++; 139 | y ^= (y >> 11); 140 | y ^= (y << 7) & 0x9D2C5680U; 141 | y ^= (y << 15) & 0xEFC60000U; 142 | y ^= (y >> 18); 143 | return(y); 144 | } 145 | 146 | -------------------------------------------------------------------------------- /ext/lda-ruby/cokus.h: -------------------------------------------------------------------------------- 1 | #ifndef COKUS_H 2 | #define COKUS_H 3 | 4 | #include 5 | #include 6 | 7 | // 8 | // uint32 must be an unsigned integer type capable of holding at least 32 9 | // bits; exactly 32 should be fastest, but 64 is better on an Alpha with 10 | // GCC at -O3 optimization so try your options and see what's best for you 11 | // 12 | 13 | typedef unsigned long uint32; 14 | 15 | #define N (624) // length of state vector 16 | #define M (397) // a period parameter 17 | #define K (0x9908B0DFU) // a magic constant 18 | #define hiBit(u) ((u) & 0x80000000U) // mask all but highest bit of u 19 | #define loBit(u) ((u) & 0x00000001U) // 
mask all but lowest bit of u 20 | #define loBits(u) ((u) & 0x7FFFFFFFU) // mask the highest bit of u 21 | #define mixBits(u, v) (hiBit(u)|loBits(v)) // move hi bit of u to hi bit of v 22 | 23 | void seedMT(uint32 seed); 24 | uint32 reloadMT(void); 25 | uint32 randomMT(void); 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /ext/lda-ruby/extconf.rb: -------------------------------------------------------------------------------- 1 | ENV['ARCHFLAGS'] = "-arch #{`uname -p` =~ /powerpc/ ? 'ppc' : 'i386'}" 2 | 3 | require 'mkmf' 4 | 5 | $CFLAGS << ' -Wall -ggdb -O0' 6 | $defs.push( '-D USE_RUBY' ) 7 | 8 | dir_config('lda-ruby/lda') 9 | create_makefile('lda-ruby/lda') 10 | -------------------------------------------------------------------------------- /ext/lda-ruby/lda-alpha.c: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu) 2 | 3 | // This file is part of LDA-C. 4 | 5 | // LDA-C is free software; you can redistribute it and/or modify it under 6 | // the terms of the GNU General Public License as published by the Free 7 | // Software Foundation; either version 2 of the License, or (at your 8 | // option) any later version. 9 | 10 | // LDA-C is distributed in the hope that it will be useful, but WITHOUT 11 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 | // for more details. 
14 | 15 | // You should have received a copy of the GNU General Public License 16 | // along with this program; if not, write to the Free Software 17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 18 | // USA 19 | 20 | #include "lda-alpha.h" 21 | 22 | /* 23 | * objective function and its derivatives 24 | * 25 | */ 26 | 27 | double alhood(double a, double ss, int D, int K) 28 | { return(D * (lgamma(K * a) - K * lgamma(a)) + (a - 1) * ss); } 29 | 30 | double d_alhood(double a, double ss, int D, int K) 31 | { return(D * (K * digamma(K * a) - K * digamma(a)) + ss); } 32 | 33 | double d2_alhood(double a, int D, int K) 34 | { return(D * (K * K * trigamma(K * a) - K * trigamma(a))); } 35 | 36 | 37 | /* 38 | * newtons method 39 | * 40 | */ 41 | 42 | double opt_alpha(double ss, int D, int K) 43 | { 44 | double a, log_a, init_a = 100; 45 | double f, df, d2f; 46 | int iter = 0; 47 | 48 | log_a = log(init_a); 49 | do 50 | { 51 | iter++; 52 | a = exp(log_a); 53 | if (isnan(a)) 54 | { 55 | init_a = init_a * 10; 56 | printf("warning : alpha is nan; new init = %5.5f\n", init_a); 57 | a = init_a; 58 | log_a = log(a); 59 | } 60 | f = alhood(a, ss, D, K); 61 | df = d_alhood(a, ss, D, K); 62 | d2f = d2_alhood(a, D, K); 63 | log_a = log_a - df/(d2f * a + df); 64 | printf("alpha maximization : %5.5f %5.5f\n", f, df); 65 | } 66 | while ((fabs(df) > NEWTON_THRESH) && (iter < MAX_ALPHA_ITER)); 67 | return(exp(log_a)); 68 | } 69 | 70 | double quiet_opt_alpha(double ss, int D, int K) 71 | { 72 | double a, log_a, init_a = 100; 73 | double f, df, d2f; 74 | int iter = 0; 75 | 76 | log_a = log(init_a); 77 | do 78 | { 79 | iter++; 80 | a = exp(log_a); 81 | if (isnan(a)) 82 | { 83 | init_a = init_a * 10; 84 | //printf("warning : alpha is nan; new init = %5.5f\n", init_a); 85 | a = init_a; 86 | log_a = log(a); 87 | } 88 | f = alhood(a, ss, D, K); 89 | df = d_alhood(a, ss, D, K); 90 | d2f = d2_alhood(a, D, K); 91 | log_a = log_a - df/(d2f * a + df); 92 | //printf("alpha 
maximization : %5.5f %5.5f\n", f, df); 93 | } 94 | while ((fabs(df) > NEWTON_THRESH) && (iter < MAX_ALPHA_ITER)); 95 | return(exp(log_a)); 96 | } 97 | -------------------------------------------------------------------------------- /ext/lda-ruby/lda-alpha.h: -------------------------------------------------------------------------------- 1 | #ifndef LDA_ALPHA_H 2 | #define LDA_ALPHA_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "lda.h" 9 | #include "utils.h" 10 | 11 | #define NEWTON_THRESH 1e-5 12 | #define MAX_ALPHA_ITER 1000 13 | 14 | double alhood(double a, double ss, int D, int K); 15 | double d_alhood(double a, double ss, int D, int K); 16 | double d2_alhood(double a, int D, int K); 17 | double opt_alpha(double ss, int D, int K); 18 | double quiet_opt_alpha(double ss, int D, int K); 19 | //void maximize_alpha(double** gamma, lda_model* model, int num_docs); 20 | 21 | #endif 22 | -------------------------------------------------------------------------------- /ext/lda-ruby/lda-data.c: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu) 2 | 3 | // This file is part of LDA-C. 4 | 5 | // LDA-C is free software; you can redistribute it and/or modify it under 6 | // the terms of the GNU General Public License as published by the Free 7 | // Software Foundation; either version 2 of the License, or (at your 8 | // option) any later version. 9 | 10 | // LDA-C is distributed in the hope that it will be useful, but WITHOUT 11 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 | // for more details. 
14 | 15 | // You should have received a copy of the GNU General Public License 16 | // along with this program; if not, write to the Free Software 17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 18 | // USA 19 | 20 | #include "lda-data.h" 21 | 22 | corpus* read_data(char* data_filename) 23 | { 24 | FILE *fileptr; 25 | int length, count, word, n, nd, nw; 26 | corpus* c; 27 | 28 | printf("reading data from %s\n", data_filename); 29 | c = malloc(sizeof(corpus)); 30 | c->docs = 0; 31 | c->num_terms = 0; 32 | c->num_docs = 0; 33 | fileptr = fopen(data_filename, "r"); 34 | nd = 0; nw = 0; 35 | while ((fscanf(fileptr, "%10d", &length) != EOF)) 36 | { 37 | c->docs = (document*) realloc(c->docs, sizeof(document)*(nd+1)); 38 | c->docs[nd].length = length; 39 | c->docs[nd].total = 0; 40 | c->docs[nd].words = malloc(sizeof(int)*length); 41 | c->docs[nd].counts = malloc(sizeof(int)*length); 42 | for (n = 0; n < length; n++) 43 | { 44 | fscanf(fileptr, "%10d:%10d", &word, &count); 45 | word = word - OFFSET; 46 | c->docs[nd].words[n] = word; 47 | c->docs[nd].counts[n] = count; 48 | c->docs[nd].total += count; 49 | if (word >= nw) { nw = word + 1; } 50 | } 51 | nd++; 52 | } 53 | fclose(fileptr); 54 | c->num_docs = nd; 55 | c->num_terms = nw; 56 | printf("number of docs : %d\n", nd); 57 | printf("number of terms : %d\n", nw); 58 | return(c); 59 | } 60 | 61 | int max_corpus_length(corpus* c) 62 | { 63 | int n, max = 0; 64 | for (n = 0; n < c->num_docs; n++) 65 | if (c->docs[n].length > max) max = c->docs[n].length; 66 | return(max); 67 | } 68 | -------------------------------------------------------------------------------- /ext/lda-ruby/lda-data.h: -------------------------------------------------------------------------------- 1 | #ifndef LDA_DATA_H 2 | #define LDA_DATA_H 3 | 4 | #include 5 | #include 6 | 7 | #include "lda.h" 8 | 9 | #define OFFSET 0; // offset for reading data 10 | 11 | corpus* read_data(char* data_filename); 12 | int 
max_corpus_length(corpus* c); 13 | 14 | #endif 15 | -------------------------------------------------------------------------------- /ext/lda-ruby/lda-inference.c: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu) 2 | 3 | // This file is part of LDA-C. 4 | 5 | // LDA-C is free software; you can redistribute it and/or modify it under 6 | // the terms of the GNU General Public License as published by the Free 7 | // Software Foundation; either version 2 of the License, or (at your 8 | // option) any later version. 9 | 10 | // LDA-C is distributed in the hope that it will be useful, but WITHOUT 11 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 | // for more details. 14 | 15 | // You should have received a copy of the GNU General Public License 16 | // along with this program; if not, write to the Free Software 17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 18 | // USA 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | #include "lda.h" 28 | #include "lda-data.h" 29 | #include "lda-inference.h" 30 | #include "lda-model.h" 31 | #include "utils.h" 32 | #include "cokus.h" 33 | 34 | #ifdef USE_RUBY 35 | #include "ruby.h" 36 | 37 | VALUE rb_cLdaModule; 38 | VALUE rb_cLda; 39 | VALUE rb_cLdaCorpus; 40 | VALUE rb_cLdaDocument; 41 | #endif 42 | 43 | 44 | 45 | /* 46 | * variational inference 47 | */ 48 | 49 | double lda_inference(document* doc, lda_model* model, double* var_gamma, double** phi, short* errors) { 50 | double converged = 1; 51 | double phisum = 0, likelihood = 0; 52 | double likelihood_old = 0, oldphi[model->num_topics]; 53 | int k = 0, n = 0, var_iter = 0, index = 0; 54 | double digamma_gam[model->num_topics]; 55 | 56 | /* zero'em out */ 57 | 
memset(digamma_gam,0.0,sizeof(digamma_gam)); 58 | memset(oldphi,0.0,sizeof(oldphi)); 59 | 60 | // compute posterior dirichlet 61 | 62 | for (k = 0; k < model->num_topics; k++) 63 | { 64 | var_gamma[k] = model->alpha + (doc->total/((double) model->num_topics)); 65 | digamma_gam[k] = digamma(var_gamma[k]); 66 | for (n = 0; n < doc->length; n++) 67 | phi[n][k] = 1.0/model->num_topics; 68 | } 69 | var_iter = 0; 70 | 71 | while ((converged > VAR_CONVERGED) && 72 | ((var_iter < VAR_MAX_ITER) || (VAR_MAX_ITER == -1))) 73 | { 74 | var_iter++; 75 | for (n = 0; n < doc->length; n++) 76 | { 77 | phisum = 0; 78 | for (k = 0; k < model->num_topics; k++) 79 | { 80 | oldphi[k] = phi[n][k]; 81 | index = doc->words[n]; 82 | if( index < 0 || index > model->num_terms ) { 83 | printf("phi for term: %d of %d\n", index, model->num_terms); 84 | phi[n][k] = 0.0; 85 | } 86 | else { 87 | phi[n][k] = 88 | digamma_gam[k] + 89 | model->log_prob_w[k][index]; 90 | } 91 | 92 | if (k > 0) 93 | phisum = log_sum(phisum, phi[n][k]); 94 | else 95 | phisum = phi[n][k]; // note, phi is in log space 96 | } 97 | 98 | for (k = 0; k < model->num_topics; k++) 99 | { 100 | phi[n][k] = exp(phi[n][k] - phisum); 101 | var_gamma[k] = 102 | var_gamma[k] + doc->counts[n]*(phi[n][k] - oldphi[k]); 103 | // !!! a lot of extra digamma's here because of how we're computing it 104 | // !!! but its more automatically updated too. 
105 | digamma_gam[k] = digamma(var_gamma[k]); 106 | } 107 | } 108 | 109 | likelihood = compute_likelihood(doc, model, phi, var_gamma); 110 | //assert(!isnan(likelihood)); 111 | if( isnan(likelihood) ) { *errors = 1; } 112 | converged = (likelihood_old - likelihood) / likelihood_old; 113 | likelihood_old = likelihood; 114 | 115 | // printf("[LDA INF] %8.5f %1.3e\n", likelihood, converged); 116 | } 117 | return(likelihood); 118 | } 119 | 120 | 121 | /* 122 | * compute likelihood bound 123 | */ 124 | 125 | double compute_likelihood(document* doc, lda_model* model, double** phi, double* var_gamma) { 126 | double likelihood = 0, digsum = 0, var_gamma_sum = 0, dig[model->num_topics]; 127 | int k = 0, n = 0, index = 0; 128 | memset(dig,0.0,sizeof(dig)); 129 | 130 | for (k = 0; k < model->num_topics; k++) 131 | { 132 | dig[k] = digamma(var_gamma[k]); 133 | var_gamma_sum += var_gamma[k]; 134 | } 135 | digsum = digamma(var_gamma_sum); 136 | 137 | likelihood = lgamma(model->alpha * model->num_topics) - 138 | model->num_topics * 139 | lgamma(model->alpha) - 140 | lgamma(var_gamma_sum); 141 | 142 | for (k = 0; k < model->num_topics; k++) 143 | { 144 | likelihood += (model->alpha - 1)*(dig[k] - digsum) + lgamma(var_gamma[k]) - (var_gamma[k] - 1)*(dig[k] - digsum); 145 | 146 | for (n = 0; n < doc->length; n++) 147 | { 148 | if (phi[n][k] > 0) 149 | { 150 | index = doc->words[n]; 151 | likelihood += doc->counts[n]* 152 | (phi[n][k]*((dig[k] - digsum) - log(phi[n][k]) 153 | + model->log_prob_w[k][index])); 154 | } 155 | } 156 | } 157 | return(likelihood); 158 | } 159 | 160 | 161 | double doc_e_step(document* doc, double* gamma, double** phi, lda_model* model, lda_suffstats* ss) { 162 | double likelihood; 163 | int n, k; 164 | short error = 0; 165 | 166 | // posterior inference 167 | 168 | likelihood = lda_inference(doc, model, gamma, phi, &error); 169 | if (error) { likelihood = 0.0; } 170 | 171 | 172 | // update sufficient statistics 173 | 174 | double gamma_sum = 0; 175 | for (k 
= 0; k < model->num_topics; k++) 176 | { 177 | gamma_sum += gamma[k]; 178 | ss->alpha_suffstats += digamma(gamma[k]); 179 | } 180 | ss->alpha_suffstats -= model->num_topics * digamma(gamma_sum); 181 | 182 | for (n = 0; n < doc->length; n++) 183 | { 184 | for (k = 0; k < model->num_topics; k++) 185 | { 186 | ss->class_word[k][doc->words[n]] += doc->counts[n]*phi[n][k]; 187 | ss->class_total[k] += doc->counts[n]*phi[n][k]; 188 | } 189 | } 190 | 191 | ss->num_docs = ss->num_docs + 1; 192 | 193 | return(likelihood); 194 | } 195 | 196 | 197 | /* 198 | * writes the word assignments line for a document to a file 199 | */ 200 | 201 | void write_word_assignment(FILE* f, document* doc, double** phi, lda_model* model) { 202 | int n; 203 | 204 | fprintf(f, "%03d", doc->length); 205 | for (n = 0; n < doc->length; n++) { 206 | fprintf(f, " %04d:%02d", doc->words[n], argmax(phi[n], model->num_topics)); 207 | } 208 | fprintf(f, "\n"); 209 | fflush(f); 210 | } 211 | 212 | 213 | /* 214 | * saves the gamma parameters of the current dataset 215 | */ 216 | 217 | void save_gamma(char* filename, double** gamma, int num_docs, int num_topics) { 218 | FILE* fileptr; 219 | int d, k; 220 | fileptr = fopen(filename, "w"); 221 | 222 | for (d = 0; d < num_docs; d++) { 223 | fprintf(fileptr, "%5.10f", gamma[d][0]); 224 | for (k = 1; k < num_topics; k++) { 225 | fprintf(fileptr, " %5.10f", gamma[d][k]); 226 | } 227 | fprintf(fileptr, "\n"); 228 | } 229 | fclose(fileptr); 230 | } 231 | 232 | 233 | void run_em(char* start, char* directory, corpus* corpus) { 234 | int d, n; 235 | lda_model *model = NULL; 236 | double **var_gamma, **phi; 237 | 238 | // allocate variational parameters 239 | 240 | 241 | var_gamma = malloc(sizeof(double*)*(corpus->num_docs)); 242 | for (d = 0; d < corpus->num_docs; d++) 243 | var_gamma[d] = malloc(sizeof(double) * NTOPICS); 244 | 245 | int max_length = max_corpus_length(corpus); 246 | phi = malloc(sizeof(double*)*max_length); 247 | for (n = 0; n < max_length; n++) 248 | 
phi[n] = malloc(sizeof(double) * NTOPICS); 249 | 250 | // initialize model 251 | 252 | char filename[100]; 253 | 254 | lda_suffstats* ss = NULL; 255 | if (strcmp(start, "seeded")==0) { 256 | model = new_lda_model(corpus->num_terms, NTOPICS); 257 | ss = new_lda_suffstats(model); 258 | corpus_initialize_ss(ss, model, corpus); 259 | if (VERBOSE) { 260 | lda_mle(model, ss, 0); 261 | } else { 262 | quiet_lda_mle(model, ss, 0); 263 | } 264 | 265 | model->alpha = INITIAL_ALPHA; 266 | } else if (strcmp(start, "random")==0) { 267 | model = new_lda_model(corpus->num_terms, NTOPICS); 268 | ss = new_lda_suffstats(model); 269 | random_initialize_ss(ss, model); 270 | if (VERBOSE) { 271 | lda_mle(model, ss, 0); 272 | } else { 273 | quiet_lda_mle(model, ss, 0); 274 | } 275 | model->alpha = INITIAL_ALPHA; 276 | } else { 277 | model = load_lda_model(start); 278 | ss = new_lda_suffstats(model); 279 | } 280 | 281 | sprintf(filename,"%s/000",directory); 282 | save_lda_model(model, filename); 283 | 284 | // run expectation maximization 285 | 286 | int i = 0; 287 | double likelihood, likelihood_old = 0, converged = 1; 288 | sprintf(filename, "%s/likelihood.dat", directory); 289 | FILE* likelihood_file = fopen(filename, "w"); 290 | 291 | while (((converged < 0) || (converged > EM_CONVERGED) || (i <= 2)) && (i <= EM_MAX_ITER)) { 292 | i++; 293 | if (VERBOSE) 294 | printf("**** em iteration %d ****\n", i); 295 | likelihood = 0; 296 | zero_initialize_ss(ss, model); 297 | 298 | // e-step 299 | printf("e-step\n"); 300 | 301 | for (d = 0; d < corpus->num_docs; d++) { 302 | if ((d % 1000) == 0 && VERBOSE) printf("document %d\n",d); 303 | likelihood += doc_e_step(&(corpus->docs[d]), var_gamma[d], phi, model, ss); 304 | } 305 | printf("m-step\n"); 306 | 307 | // m-step 308 | if (VERBOSE) { 309 | lda_mle(model, ss, ESTIMATE_ALPHA); 310 | } else { 311 | quiet_lda_mle(model, ss, ESTIMATE_ALPHA); 312 | } 313 | 314 | // check for convergence 315 | converged = (likelihood_old - likelihood) / 
(likelihood_old); 316 | if (converged < 0) VAR_MAX_ITER = VAR_MAX_ITER * 2; 317 | likelihood_old = likelihood; 318 | 319 | // output model and likelihood 320 | 321 | fprintf(likelihood_file, "%10.10f\t%5.5e\n", likelihood, converged); 322 | fflush(likelihood_file); 323 | if ((i % LAG) == 0) 324 | { 325 | sprintf(filename,"%s/%03d",directory, i); 326 | save_lda_model(model, filename); 327 | sprintf(filename,"%s/%03d.gamma",directory, i); 328 | save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics); 329 | } 330 | } 331 | 332 | // output the final model 333 | 334 | sprintf(filename,"%s/final",directory); 335 | save_lda_model(model, filename); 336 | sprintf(filename,"%s/final.gamma",directory); 337 | save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics); 338 | 339 | // output the word assignments (for visualization) 340 | 341 | sprintf(filename, "%s/word-assignments.dat", directory); 342 | FILE* w_asgn_file = fopen(filename, "w"); 343 | short error = 0; 344 | double tl = 0.0; 345 | for (d = 0; d < corpus->num_docs; d++) 346 | { 347 | if ((d % 100) == 0 && VERBOSE) printf("final e step document %d\n",d); 348 | error = 0; 349 | tl = lda_inference(&(corpus->docs[d]), model, var_gamma[d], phi,&error); 350 | if( error ) { continue; } 351 | likelihood += tl; 352 | write_word_assignment(w_asgn_file, &(corpus->docs[d]), phi, model); 353 | } 354 | fclose(w_asgn_file); 355 | fclose(likelihood_file); 356 | } 357 | 358 | 359 | /* 360 | * read settings. 
361 | */ 362 | 363 | void read_settings(char* filename) { 364 | FILE* fileptr; 365 | char alpha_action[100]; 366 | fileptr = fopen(filename, "r"); 367 | fscanf(fileptr, "var max iter %d\n", &VAR_MAX_ITER); 368 | fscanf(fileptr, "var convergence %f\n", &VAR_CONVERGED); 369 | fscanf(fileptr, "em max iter %d\n", &EM_MAX_ITER); 370 | fscanf(fileptr, "em convergence %f\n", &EM_CONVERGED); 371 | fscanf(fileptr, "alpha %s", alpha_action); 372 | if (strcmp(alpha_action, "fixed")==0) 373 | { 374 | ESTIMATE_ALPHA = 0; 375 | } 376 | else 377 | { 378 | ESTIMATE_ALPHA = 1; 379 | } 380 | fclose(fileptr); 381 | } 382 | 383 | 384 | 385 | 386 | /* 387 | * inference only 388 | * 389 | */ 390 | 391 | void infer(char* model_root, char* save, corpus* corpus) { 392 | FILE* fileptr; 393 | char filename[100]; 394 | int i, d, n; 395 | lda_model *model; 396 | double **var_gamma, likelihood, **phi; 397 | document* doc; 398 | 399 | model = load_lda_model(model_root); 400 | var_gamma = malloc(sizeof(double*)*(corpus->num_docs)); 401 | for (i = 0; i < corpus->num_docs; i++) 402 | var_gamma[i] = malloc(sizeof(double)*model->num_topics); 403 | sprintf(filename, "%s-lda-lhood.dat", save); 404 | fileptr = fopen(filename, "w"); 405 | for (d = 0; d < corpus->num_docs; d++) { 406 | if (((d % 100) == 0) && (d>0) && VERBOSE) printf("document %d\n",d); 407 | 408 | doc = &(corpus->docs[d]); 409 | phi = (double**) malloc(sizeof(double*) * doc->length); 410 | for (n = 0; n < doc->length; n++) 411 | phi[n] = (double*) malloc(sizeof(double) * model->num_topics); 412 | short error = 0; 413 | likelihood = lda_inference(doc, model, var_gamma[d], phi, &error); 414 | 415 | fprintf(fileptr, "%5.5f\n", likelihood); 416 | } 417 | fclose(fileptr); 418 | sprintf(filename, "%s-gamma.dat", save); 419 | save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics); 420 | } 421 | 422 | 423 | /* 424 | * update sufficient statistics 425 | * 426 | */ 427 | 428 | 429 | 430 | /* 431 | * main 432 | * 433 | */ 434 | 435 | 
int main(int argc, char* argv[]) { 436 | corpus* corpus; 437 | 438 | long t1; 439 | (void) time(&t1); 440 | seedMT(t1); 441 | // seedMT(4357U); 442 | 443 | if (argc > 1) 444 | { 445 | if (strcmp(argv[1], "est")==0) 446 | { 447 | INITIAL_ALPHA = atof(argv[2]); 448 | NTOPICS = atoi(argv[3]); 449 | read_settings(argv[4]); 450 | corpus = read_data(argv[5]); 451 | make_directory(argv[7]); 452 | run_em(argv[6], argv[7], corpus); 453 | } 454 | if (strcmp(argv[1], "inf")==0) 455 | { 456 | read_settings(argv[2]); 457 | corpus = read_data(argv[4]); 458 | infer(argv[3], argv[5], corpus); 459 | } 460 | } 461 | else 462 | { 463 | printf("usage : lda est [initial alpha] [k] [settings] [data] [random/seeded/*] [directory]\n"); 464 | printf(" lda inf [settings] [model] [data] [name]\n"); 465 | } 466 | return(0); 467 | } 468 | 469 | #ifdef USE_RUBY 470 | 471 | /* */ 472 | void run_quiet_em(char* start, corpus* corpus) { 473 | int d = 0, n = 0; 474 | lda_model *model = NULL; 475 | double **var_gamma = NULL, **phi = NULL; 476 | // last_gamma is a double[num_docs][num_topics] 477 | 478 | // allocate variational parameters 479 | 480 | 481 | var_gamma = (double**)malloc(sizeof(double*)*(corpus->num_docs)); 482 | memset(var_gamma, 0.0, corpus->num_docs); 483 | 484 | for (d = 0; d < corpus->num_docs; ++d) { 485 | var_gamma[d] = (double*)malloc(sizeof(double) * NTOPICS); 486 | memset(var_gamma[d], 0.0, sizeof(double)*NTOPICS); 487 | } 488 | 489 | int max_length = max_corpus_length(corpus); 490 | 491 | phi = (double**)malloc(sizeof(double*)*max_length); 492 | memset(phi, 0.0, max_length); 493 | for (n = 0; n < max_length; ++n) { 494 | phi[n] = (double*)malloc(sizeof(double) * NTOPICS); 495 | memset(phi[n], 0.0, sizeof(double)*NTOPICS); 496 | } 497 | 498 | // initialize model 499 | 500 | lda_suffstats* ss = NULL; 501 | if (strncmp(start, "seeded",6)==0) { 502 | model = quiet_new_lda_model(corpus->num_terms, NTOPICS); 503 | model->alpha = INITIAL_ALPHA; 504 | ss = new_lda_suffstats(model); 
505 | if (VERBOSE) { 506 | corpus_initialize_ss(ss, model, corpus); 507 | } else { 508 | quiet_corpus_initialize_ss(ss, model, corpus); 509 | } 510 | if (VERBOSE) { 511 | lda_mle(model, ss, 0); 512 | } else { 513 | quiet_lda_mle(model, ss, 0); 514 | } 515 | } else if (strncmp(start, "fixed",5)==0) { 516 | model = quiet_new_lda_model(corpus->num_terms, NTOPICS); 517 | model->alpha = INITIAL_ALPHA; 518 | ss = new_lda_suffstats(model); 519 | corpus_initialize_fixed_ss(ss, model, corpus); 520 | if (VERBOSE) { 521 | lda_mle(model, ss, 0); 522 | } else { 523 | quiet_lda_mle(model, ss, 0); 524 | } 525 | } else if (strncmp(start, "random",6)==0) { 526 | model = quiet_new_lda_model(corpus->num_terms, NTOPICS); 527 | model->alpha = INITIAL_ALPHA; 528 | ss = new_lda_suffstats(model); 529 | random_initialize_ss(ss, model); 530 | if (VERBOSE) { 531 | lda_mle(model, ss, 0); 532 | } else { 533 | quiet_lda_mle(model, ss, 0); 534 | } 535 | } else { 536 | model = load_lda_model(start); 537 | ss = new_lda_suffstats(model); 538 | } 539 | 540 | // save the model in the last_model global 541 | last_model = model; 542 | model_loaded = TRUE; 543 | 544 | // run expectation maximization 545 | 546 | int i = 0; 547 | double likelihood = 0.0, likelihood_old = 0, converged = 1; 548 | 549 | while (((converged < 0) || (converged > EM_CONVERGED) || (i <= 2)) && (i <= EM_MAX_ITER)) { 550 | i++; 551 | if (VERBOSE) printf("**** em iteration %d ****\n", i); 552 | likelihood = 0; 553 | zero_initialize_ss(ss, model); 554 | 555 | // e-step 556 | 557 | for (d = 0; d < corpus->num_docs; d++) { 558 | if ((d % 1000) == 0 && VERBOSE) printf("document %d\n",d); 559 | likelihood += doc_e_step(&(corpus->docs[d]), var_gamma[d], phi, model, ss); 560 | } 561 | 562 | // m-step 563 | if (VERBOSE) { 564 | lda_mle(model, ss, ESTIMATE_ALPHA); 565 | } else { 566 | quiet_lda_mle(model, ss, ESTIMATE_ALPHA); 567 | } 568 | 569 | // check for convergence 570 | 571 | converged = (likelihood_old - likelihood) / 
(likelihood_old); 572 | if (converged < 0) VAR_MAX_ITER = VAR_MAX_ITER * 2; 573 | likelihood_old = likelihood; 574 | 575 | // store model and likelihood 576 | 577 | last_model = model; 578 | last_gamma = var_gamma; 579 | last_phi = phi; 580 | } 581 | 582 | // output the final model 583 | 584 | last_model = model; 585 | last_gamma = var_gamma; 586 | last_phi = phi; 587 | 588 | free_lda_suffstats(model,ss); 589 | 590 | // output the word assignments (for visualization) 591 | /* 592 | char filename[100]; 593 | sprintf(filename, "%s/word-assignments.dat", directory); 594 | FILE* w_asgn_file = fopen(filename, "w"); 595 | for (d = 0; d < corpus->num_docs; d++) { 596 | if ((d % 100) == 0) 597 | printf("final e step document %d\n",d); 598 | likelihood += lda_inference(&(corpus->docs[d]), model, var_gamma[d], phi); 599 | write_word_assignment(w_asgn_file, &(corpus->docs[d]), phi, model); 600 | } 601 | fclose(w_asgn_file); 602 | */ 603 | } 604 | 605 | 606 | /* 607 | * Set all of the settings in one command: 608 | * 609 | * * init_alpha 610 | * * num_topics 611 | * * max_iter 612 | * * convergence 613 | * * em_max_iter 614 | * * em_convergence 615 | * * est_alpha 616 | */ 617 | static VALUE wrap_set_config(VALUE self, VALUE init_alpha, VALUE num_topics, VALUE max_iter, VALUE convergence, VALUE em_max_iter, VALUE em_convergence, VALUE est_alpha) { 618 | INITIAL_ALPHA = NUM2DBL(init_alpha); 619 | NTOPICS = NUM2INT(num_topics); 620 | if( NTOPICS < 0 ) { rb_raise(rb_eRuntimeError, "NTOPICS must be greater than 0 - %d", NTOPICS); } 621 | VAR_MAX_ITER = NUM2INT(max_iter); 622 | VAR_CONVERGED = (float)NUM2DBL(convergence); 623 | EM_MAX_ITER = NUM2INT(em_max_iter); 624 | EM_CONVERGED = (float)NUM2DBL(em_convergence); 625 | ESTIMATE_ALPHA = NUM2INT(est_alpha); 626 | 627 | return Qtrue; 628 | } 629 | 630 | /* 631 | * Get the maximum iterations. 
632 | */ 633 | static VALUE wrap_get_max_iter(VALUE self) { 634 | return rb_int_new(VAR_MAX_ITER); 635 | } 636 | 637 | /* 638 | * Set the maximum iterations. 639 | */ 640 | static VALUE wrap_set_max_iter(VALUE self, VALUE max_iter) { 641 | VAR_MAX_ITER = NUM2INT(max_iter); 642 | 643 | return max_iter; 644 | } 645 | 646 | /* 647 | * Get the convergence setting. 648 | */ 649 | static VALUE wrap_get_converged(VALUE self) { 650 | return rb_float_new(VAR_CONVERGED); 651 | } 652 | 653 | /* 654 | * Set the convergence setting. 655 | */ 656 | static VALUE wrap_set_converged(VALUE self, VALUE converged) { 657 | VAR_CONVERGED = (float)NUM2DBL(converged); 658 | 659 | return converged; 660 | } 661 | 662 | /* 663 | * Get the max iterations for the EM algorithm. 664 | */ 665 | static VALUE wrap_get_em_max_iter(VALUE self) { 666 | return rb_int_new(EM_MAX_ITER); 667 | } 668 | 669 | /* 670 | * Set the max iterations for the EM algorithm. 671 | */ 672 | static VALUE wrap_set_em_max_iter(VALUE self, VALUE em_max_iter) { 673 | EM_MAX_ITER = NUM2INT(em_max_iter); 674 | 675 | return em_max_iter; 676 | } 677 | 678 | /* 679 | * Get the convergence value for EM. 680 | */ 681 | static VALUE wrap_get_em_converged(VALUE self) { 682 | return rb_float_new(EM_CONVERGED); 683 | } 684 | 685 | /* 686 | * Set the convergence value for EM. 687 | */ 688 | static VALUE wrap_set_em_converged(VALUE self, VALUE em_converged) { 689 | EM_CONVERGED = (float)NUM2DBL(em_converged); 690 | 691 | return em_converged; 692 | } 693 | 694 | /* 695 | * Get the initial alpha value. 696 | */ 697 | static VALUE wrap_get_initial_alpha(VALUE self) { 698 | return rb_float_new(INITIAL_ALPHA); 699 | } 700 | 701 | /* 702 | * Get the number of topics being clustered. 703 | */ 704 | static VALUE wrap_get_num_topics(VALUE self) { 705 | return rb_int_new(NTOPICS); 706 | } 707 | 708 | /* 709 | * Set the initial value of alpha. 
710 | */ 711 | static VALUE wrap_set_initial_alpha(VALUE self, VALUE initial_alpha) { 712 | INITIAL_ALPHA = (float)NUM2DBL(initial_alpha); 713 | 714 | return initial_alpha; 715 | } 716 | 717 | /* 718 | * Set the number of topics to be clustered. 719 | */ 720 | static VALUE wrap_set_num_topics(VALUE self, VALUE ntopics) { 721 | NTOPICS = NUM2INT(ntopics); 722 | 723 | return ntopics; 724 | } 725 | 726 | /* 727 | * Get the estimate alpha value (fixed = 0). 728 | */ 729 | static VALUE wrap_get_estimate_alpha(VALUE self) { 730 | return rb_int_new(ESTIMATE_ALPHA); 731 | } 732 | 733 | /* 734 | * Set the estimate alpha value (fixed = 0). 735 | */ 736 | static VALUE wrap_set_estimate_alpha(VALUE self, VALUE est_alpha) { 737 | ESTIMATE_ALPHA = NUM2INT(est_alpha); 738 | 739 | return est_alpha; 740 | } 741 | 742 | /* 743 | * Get the verbosity setting. 744 | */ 745 | static VALUE wrap_get_verbosity(VALUE self) { 746 | if (VERBOSE) { 747 | return Qtrue; 748 | } else { 749 | return Qfalse; 750 | } 751 | } 752 | 753 | 754 | /* 755 | * Set the verbosity level (true, false). 756 | */ 757 | static VALUE wrap_set_verbosity(VALUE self, VALUE verbosity) { 758 | if (verbosity == Qtrue) { 759 | VERBOSE = TRUE; 760 | } else { 761 | VERBOSE = FALSE; 762 | } 763 | 764 | return verbosity; 765 | } 766 | 767 | 768 | 769 | /* 770 | * Run the EM algorithm with the loaded corpus and using the current 771 | * configuration settings. The +start+ parameter can take the following 772 | * values: 773 | * * random - starting alpha are randomized 774 | * * seeded - loaded based on the corpus values 775 | * * - path to the file containing the model 776 | */ 777 | static VALUE wrap_em(VALUE self, VALUE start) { 778 | if (!corpus_loaded) 779 | return Qnil; 780 | 781 | run_quiet_em(StringValuePtr(start), last_corpus); 782 | 783 | return Qnil; 784 | } 785 | 786 | 787 | /* 788 | * Load settings from the given file. 
789 | */ 790 | static VALUE wrap_load_settings(VALUE self, VALUE settings_file) { 791 | read_settings(StringValuePtr(settings_file)); 792 | 793 | return Qtrue; 794 | } 795 | 796 | /* 797 | * Load the corpus from the given file. This will not create 798 | * a +Corpus+ object that is accessible, but it will load the corpus 799 | * much faster. 800 | */ 801 | static VALUE wrap_load_corpus(VALUE self, VALUE filename) { 802 | if (!corpus_loaded) { 803 | last_corpus = read_data(StringValuePtr(filename)); 804 | corpus_loaded = TRUE; 805 | return Qtrue; 806 | } else { 807 | return Qtrue; 808 | } 809 | } 810 | 811 | /* 812 | * Set the corpus. 813 | */ 814 | static VALUE wrap_ruby_corpus(VALUE self, VALUE rcorpus) { 815 | corpus* c; 816 | int i = 0; 817 | int j = 0; 818 | 819 | c = malloc(sizeof(corpus)); 820 | c->num_terms = NUM2INT(rb_iv_get(rcorpus, "@num_terms")); 821 | c->num_docs = NUM2INT(rb_iv_get(rcorpus, "@num_docs")); 822 | c->docs = (document*) malloc(sizeof(document) * c->num_docs); 823 | VALUE doc_ary = rb_iv_get(rcorpus, "@documents"); 824 | for (i = 0; i < c->num_docs; i++) { 825 | VALUE one_doc = rb_ary_entry(doc_ary, i); 826 | VALUE words = rb_iv_get(one_doc, "@words"); 827 | VALUE counts = rb_iv_get(one_doc, "@counts"); 828 | 829 | c->docs[i].length = NUM2INT(rb_iv_get(one_doc, "@length")); 830 | c->docs[i].total = NUM2INT(rb_iv_get(one_doc, "@total")); 831 | c->docs[i].words = malloc(sizeof(int) * c->docs[i].length); 832 | c->docs[i].counts = malloc(sizeof(int) * c->docs[i].length); 833 | for (j = 0; j < c->docs[i].length; j++) { 834 | int one_word = NUM2INT(rb_ary_entry(words, j)); 835 | int one_count = NUM2INT(rb_ary_entry(counts, j)); 836 | if( one_word > c->num_terms ) { 837 | rb_raise(rb_eRuntimeError, "error term count(%d) less than word index(%d)", c->num_terms, one_word); 838 | } 839 | c->docs[i].words[j] = one_word; 840 | c->docs[i].counts[j] = one_count; 841 | } 842 | } 843 | 844 | last_corpus = c; 845 | corpus_loaded = TRUE; 846 | 847 | 
rb_iv_set(self, "@corpus", rcorpus); 848 | 849 | return Qtrue; 850 | } 851 | 852 | 853 | /* 854 | * Get the gamma values after the model has been run. 855 | */ 856 | static VALUE wrap_get_gamma(VALUE self) { 857 | if (!model_loaded) 858 | return Qnil; 859 | 860 | // last_gamma is a double[num_docs][num_topics] 861 | VALUE arr; 862 | int i = 0, j = 0; 863 | 864 | arr = rb_ary_new2(last_corpus->num_docs); 865 | for (i = 0; i < last_corpus->num_docs; i++) { 866 | VALUE arr2 = rb_ary_new2(last_model->num_topics); 867 | for (j = 0; j < last_model->num_topics; j++) { 868 | rb_ary_store(arr2, j, rb_float_new(last_gamma[i][j])); 869 | } 870 | rb_ary_store(arr, i, arr2); 871 | } 872 | 873 | return arr; 874 | } 875 | 876 | 877 | /* 878 | * Compute the phi values by running inference after the initial EM run has been completed. 879 | * 880 | * Returns a 3D matrix: num_docs x length x num_topics. 881 | */ 882 | static VALUE wrap_get_phi(VALUE self) { 883 | if (!model_loaded) 884 | return Qnil; 885 | 886 | VALUE arr = rb_ary_new2(last_corpus->num_docs); 887 | int i = 0, j = 0, k = 0; 888 | 889 | //int max_length = max_corpus_length(last_corpus); 890 | short error = 0; 891 | 892 | for (i = 0; i < last_corpus->num_docs; i++) { 893 | VALUE arr1 = rb_ary_new2(last_corpus->docs[i].length); 894 | 895 | lda_inference(&(last_corpus->docs[i]), last_model, last_gamma[i], last_phi, &error); 896 | 897 | for (j = 0; j < last_corpus->docs[i].length; j++) { 898 | VALUE arr2 = rb_ary_new2(last_model->num_topics); 899 | 900 | for (k = 0; k < last_model->num_topics; k++) { 901 | rb_ary_store(arr2, k, rb_float_new(last_phi[j][k])); 902 | } 903 | 904 | rb_ary_store(arr1, j, arr2); 905 | } 906 | 907 | rb_ary_store(arr, i, arr1); 908 | } 909 | 910 | return arr; 911 | } 912 | 913 | 914 | 915 | /* 916 | * Get the beta matrix after the model has been run. 
917 | */ 918 | static VALUE wrap_get_model_beta(VALUE self) { 919 | if (!model_loaded) 920 | return Qnil; 921 | 922 | // beta is a double[num_topics][num_terms] 923 | VALUE arr; 924 | int i = 0, j = 0; 925 | 926 | arr = rb_ary_new2(last_model->num_topics); 927 | for (i = 0; i < last_model->num_topics; i++) { 928 | VALUE arr2 = rb_ary_new2(last_model->num_terms); 929 | for (j = 0; j < last_model->num_terms; j++) { 930 | rb_ary_store(arr2, j, rb_float_new(last_model->log_prob_w[i][j])); 931 | } 932 | rb_ary_store(arr, i, arr2); 933 | } 934 | 935 | return arr; 936 | } 937 | 938 | 939 | /* 940 | * Get the settings used for the model. 941 | */ 942 | static VALUE wrap_get_model_settings(VALUE self) { 943 | if (!model_loaded) 944 | return Qnil; 945 | 946 | VALUE arr; 947 | 948 | arr = rb_ary_new(); 949 | rb_ary_push(arr, rb_int_new(last_model->num_topics)); 950 | rb_ary_push(arr, rb_int_new(last_model->num_terms)); 951 | rb_ary_push(arr, rb_float_new(last_model->alpha)); 952 | 953 | return arr; // [num_topics, num_terms, alpha] 954 | } 955 | 956 | 957 | void Init_lda() { 958 | corpus_loaded = FALSE; 959 | model_loaded = FALSE; 960 | VERBOSE = TRUE; 961 | 962 | rb_require("lda-ruby"); 963 | 964 | rb_cLdaModule = rb_define_module("Lda"); 965 | rb_cLda = rb_define_class_under(rb_cLdaModule, "Lda", rb_cObject); 966 | rb_cLdaCorpus = rb_define_class_under(rb_cLdaModule, "Corpus", rb_cObject); 967 | rb_cLdaDocument = rb_define_class_under(rb_cLdaModule, "Document", rb_cObject); 968 | 969 | // method to load the corpus 970 | rb_define_method(rb_cLda, "fast_load_corpus_from_file", wrap_load_corpus, 1); 971 | rb_define_method(rb_cLda, "corpus=", wrap_ruby_corpus, 1); 972 | 973 | // method to run em 974 | rb_define_method(rb_cLda, "em", wrap_em, 1); 975 | 976 | // method to load settings from file 977 | rb_define_method(rb_cLda, "load_settings", wrap_load_settings, 1); 978 | 979 | // method to set all the config options at once 980 | rb_define_method(rb_cLda, "set_config", 
wrap_set_config, 5); 981 | 982 | // accessor stuff for main settings 983 | rb_define_method(rb_cLda, "max_iter", wrap_get_max_iter, 0); 984 | rb_define_method(rb_cLda, "max_iter=", wrap_set_max_iter, 1); 985 | rb_define_method(rb_cLda, "convergence", wrap_get_converged, 0); 986 | rb_define_method(rb_cLda, "convergence=", wrap_set_converged, 1); 987 | rb_define_method(rb_cLda, "em_max_iter", wrap_get_em_max_iter, 0); 988 | rb_define_method(rb_cLda, "em_max_iter=", wrap_set_em_max_iter, 1); 989 | rb_define_method(rb_cLda, "em_convergence", wrap_get_em_converged, 0); 990 | rb_define_method(rb_cLda, "em_convergence=", wrap_set_em_converged, 1); 991 | rb_define_method(rb_cLda, "init_alpha=", wrap_set_initial_alpha, 1); 992 | rb_define_method(rb_cLda, "init_alpha", wrap_get_initial_alpha, 0); 993 | rb_define_method(rb_cLda, "est_alpha=", wrap_set_estimate_alpha, 1); 994 | rb_define_method(rb_cLda, "est_alpha", wrap_get_estimate_alpha, 0); 995 | rb_define_method(rb_cLda, "num_topics", wrap_get_num_topics, 0); 996 | rb_define_method(rb_cLda, "num_topics=", wrap_set_num_topics, 1); 997 | rb_define_method(rb_cLda, "verbose", wrap_get_verbosity, 0); 998 | rb_define_method(rb_cLda, "verbose=", wrap_set_verbosity, 1); 999 | 1000 | // retrieve model and gamma 1001 | rb_define_method(rb_cLda, "beta", wrap_get_model_beta, 0); 1002 | rb_define_method(rb_cLda, "gamma", wrap_get_gamma, 0); 1003 | rb_define_method(rb_cLda, "compute_phi", wrap_get_phi, 0); 1004 | rb_define_method(rb_cLda, "model", wrap_get_model_settings, 0); 1005 | } 1006 | 1007 | #endif 1008 | -------------------------------------------------------------------------------- /ext/lda-ruby/lda-inference.h: -------------------------------------------------------------------------------- 1 | #ifndef LDA_INFERENCE_H 2 | #define LDA_INFERENCE_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "lda.h" 8 | #include "utils.h" 9 | 10 | 11 | 12 | int LAG = 5; 13 | 14 | float EM_CONVERGED; 15 | int EM_MAX_ITER; 16 | int 
ESTIMATE_ALPHA; 17 | double INITIAL_ALPHA; 18 | int NTOPICS; 19 | float VAR_CONVERGED; 20 | int VAR_MAX_ITER; 21 | 22 | #ifdef USE_RUBY 23 | corpus *last_corpus; 24 | lda_model *last_model; 25 | double **last_gamma; 26 | double **last_phi; 27 | 28 | enum BOOL { FALSE, TRUE } corpus_loaded, model_loaded, VERBOSE; 29 | #endif 30 | 31 | 32 | 33 | double lda_inference(document*, lda_model*, double*, double**, short*); 34 | double compute_likelihood(document*, lda_model*, double**, double*); 35 | 36 | 37 | double doc_e_step(document* doc, 38 | double* gamma, 39 | double** phi, 40 | lda_model* model, 41 | lda_suffstats* ss); 42 | 43 | void save_gamma(char* filename, 44 | double** gamma, 45 | int num_docs, 46 | int num_topics); 47 | 48 | void run_em(char* start, 49 | char* directory, 50 | corpus* corpus); 51 | 52 | #ifdef USE_RUBY 53 | void run_quiet_em(char* start, corpus* corpus); 54 | #endif 55 | 56 | void read_settings(char* filename); 57 | 58 | void infer(char* model_root, 59 | char* save, 60 | corpus* corpus); 61 | 62 | 63 | #endif 64 | -------------------------------------------------------------------------------- /ext/lda-ruby/lda-model.c: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu) 2 | 3 | // This file is part of LDA-C. 4 | 5 | // LDA-C is free software; you can redistribute it and/or modify it under 6 | // the terms of the GNU General Public License as published by the Free 7 | // Software Foundation; either version 2 of the License, or (at your 8 | // option) any later version. 9 | 10 | // LDA-C is distributed in the hope that it will be useful, but WITHOUT 11 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 | // for more details. 
14 | 15 | // You should have received a copy of the GNU General Public License 16 | // along with this program; if not, write to the Free Software 17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 18 | // USA 19 | 20 | #include "lda-model.h" 21 | #include 22 | 23 | 24 | /* 25 | * compute MLE lda model from sufficient statistics 26 | * 27 | */ 28 | 29 | void lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha) { 30 | int k; int w; 31 | 32 | for (k = 0; k < model->num_topics; k++) 33 | { 34 | for (w = 0; w < model->num_terms; w++) 35 | { 36 | if (ss->class_word[k][w] > 0) 37 | { 38 | model->log_prob_w[k][w] = 39 | log(ss->class_word[k][w]) - 40 | log(ss->class_total[k]); 41 | } 42 | else 43 | model->log_prob_w[k][w] = -100; 44 | } 45 | } 46 | if (estimate_alpha == 1) 47 | { 48 | model->alpha = opt_alpha(ss->alpha_suffstats, 49 | ss->num_docs, 50 | model->num_topics); 51 | 52 | printf("new alpha = %5.5f\n", model->alpha); 53 | } 54 | } 55 | 56 | void quiet_lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha) { 57 | int k; int w; 58 | 59 | for (k = 0; k < model->num_topics; k++) 60 | { 61 | for (w = 0; w < model->num_terms; w++) 62 | { 63 | if (ss->class_word[k][w] > 0) 64 | { 65 | model->log_prob_w[k][w] = 66 | log(ss->class_word[k][w]) - 67 | log(ss->class_total[k]); 68 | } 69 | else 70 | model->log_prob_w[k][w] = -100; 71 | } 72 | } 73 | if (estimate_alpha == 1) 74 | { 75 | model->alpha = quiet_opt_alpha(ss->alpha_suffstats, 76 | ss->num_docs, 77 | model->num_topics); 78 | } 79 | } 80 | 81 | 82 | 83 | 84 | /* 85 | * allocate sufficient statistics 86 | * 87 | */ 88 | 89 | lda_suffstats* new_lda_suffstats(lda_model* model) { 90 | register int i; 91 | int num_topics = model->num_topics; 92 | int num_terms = model->num_terms; 93 | 94 | lda_suffstats* ss = (lda_suffstats*)malloc(sizeof(lda_suffstats)); 95 | memset(ss,0,sizeof(lda_suffstats)); 96 | ss->class_total = (double*)malloc(sizeof(double)*num_topics); 97 | 
ss->class_word = (double**)malloc(sizeof(double*)*num_topics); 98 | 99 | for (i = 0; i < num_topics; ++i) { 100 | ss->class_total[i] = 0; 101 | ss->class_word[i] = (double*)malloc(sizeof(double)*num_terms); 102 | memset(ss->class_word[i],0.0,sizeof(double)*num_terms); 103 | } 104 | 105 | return(ss); 106 | } 107 | /* 108 | * deallocate new lda suffstats 109 | * 110 | */ 111 | void free_lda_suffstats(lda_model* model, lda_suffstats* ss) { 112 | int i; 113 | int num_topics = model->num_topics; 114 | 115 | free(ss->class_total); 116 | for (i = 0; i < num_topics; ++i) { 117 | free(ss->class_word[i]); 118 | } 119 | free(ss->class_word); 120 | free(ss); 121 | } 122 | 123 | /* 124 | * various intializations for the sufficient statistics 125 | * 126 | */ 127 | 128 | void zero_initialize_ss(lda_suffstats* ss, lda_model* model) { 129 | int k, w; 130 | for (k = 0; k < model->num_topics; k++) 131 | { 132 | ss->class_total[k] = 0; 133 | for (w = 0; w < model->num_terms; w++) 134 | { 135 | ss->class_word[k][w] = 0; 136 | } 137 | } 138 | ss->num_docs = 0; 139 | ss->alpha_suffstats = 0; 140 | } 141 | 142 | 143 | void random_initialize_ss(lda_suffstats* ss, lda_model* model) { 144 | int num_topics = model->num_topics; 145 | int num_terms = model->num_terms; 146 | int k, n; 147 | 148 | for (k = 0; k < num_topics; k++) 149 | { 150 | for (n = 0; n < num_terms; n++) 151 | { 152 | ss->class_word[k][n] += 1.0/num_terms + myrand(); 153 | ss->class_total[k] += ss->class_word[k][n]; 154 | } 155 | } 156 | } 157 | 158 | 159 | void corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c) 160 | { 161 | int num_topics = model->num_topics; 162 | int i, k, d, n; 163 | document* doc; 164 | 165 | for (k = 0; k < num_topics; k++) 166 | { 167 | for (i = 0; i < NUM_INIT; i++) 168 | { 169 | d = floor(myrand() * c->num_docs); 170 | printf("initialized with document %d\n", d); 171 | doc = &(c->docs[d]); 172 | for (n = 0; n < doc->length; n++) 173 | { 174 | ss->class_word[k][doc->words[n]] += 
doc->counts[n]; 175 | } 176 | } 177 | for (n = 0; n < model->num_terms; n++) 178 | { 179 | ss->class_word[k][n] += 1.0; 180 | ss->class_total[k] = ss->class_total[k] + ss->class_word[k][n]; 181 | } 182 | } 183 | } 184 | 185 | void quiet_corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c) 186 | { 187 | int num_topics = model->num_topics; 188 | int i, k, d, n; 189 | document* doc; 190 | 191 | for (k = 0; k < num_topics; k++) 192 | { 193 | for (i = 0; i < NUM_INIT; i++) 194 | { 195 | d = floor(myrand() * c->num_docs); 196 | doc = &(c->docs[d]); 197 | for (n = 0; n < doc->length; n++) 198 | { 199 | ss->class_word[k][doc->words[n]] += doc->counts[n]; 200 | } 201 | } 202 | for (n = 0; n < model->num_terms; n++) 203 | { 204 | ss->class_word[k][n] += 1.0; 205 | ss->class_total[k] = ss->class_total[k] + ss->class_word[k][n]; 206 | } 207 | } 208 | } 209 | 210 | 211 | /* 212 | * Use the first num_topics documents of the corpus as the seeds. If num_topics > num_docs, results might be hairy. 
213 | */ 214 | void corpus_initialize_fixed_ss(lda_suffstats* ss, lda_model* model, corpus* c) { 215 | int num_topics = MIN(model->num_topics, c->num_docs); 216 | int k, n; 217 | document* doc; 218 | 219 | for (k = 0; k < num_topics; k++) { 220 | doc = &(c->docs[k]); 221 | for (n = 0; n < doc->length; n++) { 222 | ss->class_word[k][doc->words[n]] += doc->counts[n]; 223 | } 224 | for (n = 0; n < model->num_terms; n++) { 225 | ss->class_word[k][n] += 1.0; 226 | ss->class_total[k] = ss->class_total[k] + ss->class_word[k][n]; 227 | } 228 | } 229 | } 230 | 231 | /* 232 | * allocate new lda model 233 | * 234 | */ 235 | 236 | lda_model* new_lda_model(int num_terms, int num_topics) { 237 | int i; 238 | lda_model* model; 239 | 240 | model = malloc(sizeof(lda_model)); 241 | model->num_topics = num_topics; 242 | model->num_terms = num_terms; 243 | model->alpha = 1.0; 244 | model->log_prob_w = malloc(sizeof(double*)*num_topics); 245 | printf("new model with: %d topics and %d terms\n", num_topics, num_terms); 246 | for (i = 0; i < num_topics; i++) 247 | { 248 | model->log_prob_w[i] = malloc(sizeof(double)*num_terms); 249 | memset(model->log_prob_w[i],0,sizeof(double)*num_terms); 250 | } 251 | return(model); 252 | } 253 | 254 | lda_model* quiet_new_lda_model(int num_terms, int num_topics) { 255 | int i; 256 | lda_model* model; 257 | 258 | model = malloc(sizeof(lda_model)); 259 | model->num_topics = num_topics; 260 | model->num_terms = num_terms; 261 | model->alpha = 1.0; 262 | model->log_prob_w = malloc(sizeof(double*)*num_topics); 263 | for (i = 0; i < num_topics; i++) 264 | { 265 | model->log_prob_w[i] = malloc(sizeof(double)*num_terms); 266 | memset(model->log_prob_w[i],0,sizeof(double)*num_terms); 267 | } 268 | return(model); 269 | } 270 | 271 | 272 | /* 273 | * deallocate new lda model 274 | * 275 | */ 276 | void free_lda_model(lda_model* model) { 277 | int i; 278 | 279 | for (i = 0; i < model->num_topics; i++) 280 | { 281 | free(model->log_prob_w[i]); 282 | } 283 | 
free(model->log_prob_w); 284 | } 285 | 286 | 287 | /* 288 | * save an lda model 289 | * 290 | */ 291 | void save_lda_model(lda_model* model, char* model_root) { 292 | char filename[100]; 293 | FILE* fileptr; 294 | int i, j; 295 | 296 | sprintf(filename, "%s.beta", model_root); 297 | fileptr = fopen(filename, "w"); 298 | for (i = 0; i < model->num_topics; i++) { 299 | for (j = 0; j < model->num_terms; j++) { 300 | fprintf(fileptr, " %5.10f", model->log_prob_w[i][j]); 301 | } 302 | fprintf(fileptr, "\n"); 303 | } 304 | fclose(fileptr); 305 | 306 | sprintf(filename, "%s.other", model_root); 307 | fileptr = fopen(filename, "w"); 308 | fprintf(fileptr, "num_topics %d\n", model->num_topics); 309 | fprintf(fileptr, "num_terms %d\n", model->num_terms); 310 | fprintf(fileptr, "alpha %5.10f\n", model->alpha); 311 | fclose(fileptr); 312 | } 313 | 314 | 315 | lda_model* load_lda_model(char* model_root) { 316 | char filename[100]; 317 | FILE* fileptr; 318 | int i, j, num_terms, num_topics; 319 | float x, alpha; 320 | 321 | sprintf(filename, "%s.other", model_root); 322 | printf("loading %s\n", filename); 323 | fileptr = fopen(filename, "r"); 324 | fscanf(fileptr, "num_topics %d\n", &num_topics); 325 | fscanf(fileptr, "num_terms %d\n", &num_terms); 326 | fscanf(fileptr, "alpha %f\n", &alpha); 327 | fclose(fileptr); 328 | 329 | lda_model* model = new_lda_model(num_terms, num_topics); 330 | model->alpha = alpha; 331 | 332 | sprintf(filename, "%s.beta", model_root); 333 | printf("loading %s\n", filename); 334 | fileptr = fopen(filename, "r"); 335 | for (i = 0; i < num_topics; i++) 336 | { 337 | for (j = 0; j < num_terms; j++) 338 | { 339 | fscanf(fileptr, "%f", &x); 340 | model->log_prob_w[i][j] = x; 341 | } 342 | } 343 | fclose(fileptr); 344 | return(model); 345 | } 346 | -------------------------------------------------------------------------------- /ext/lda-ruby/lda-model.h: -------------------------------------------------------------------------------- 1 | #ifndef LDA_MODEL_H 
2 | #define LDA_MODEL_H   /* was "#define LDA_MODEL": mismatched token made the include guard ineffective */
3 | 
4 | #include <stdio.h>    /* NOTE(review): <…> targets were stripped in extraction; stdio/stdlib/math are the */
5 | #include <stdlib.h>   /* standard LDA-C headers needed for printf/malloc and floor/randomMT scaling —     */
6 | #include <math.h>     /* confirm against upstream lda-c if the original differs.                          */
7 | #include "lda.h"
8 | #include "lda-alpha.h"
9 | #include "cokus.h"
10 | 
11 | #define myrand() (double) (((unsigned long) randomMT()) / 4294967296.)
12 | #define NUM_INIT 1
13 | #define MIN(A,B) (int)((A > B) ? (B) : (A))
14 | 
15 | void free_lda_model(lda_model*);
16 | void save_lda_model(lda_model*, char*);
17 | lda_model* new_lda_model(int, int);
18 | lda_model* quiet_new_lda_model(int num_terms, int num_topics);
19 | lda_model* new_lda_model(int num_terms, int num_topics);
20 | lda_suffstats* new_lda_suffstats(lda_model* model);
21 | void free_lda_suffstats(lda_model* model, lda_suffstats* ss);
22 | void corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c);
23 | void quiet_corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c);
24 | void corpus_initialize_fixed_ss(lda_suffstats* ss, lda_model* model, corpus* c);
25 | void random_initialize_ss(lda_suffstats* ss, lda_model* model);
26 | void zero_initialize_ss(lda_suffstats* ss, lda_model* model);
27 | void lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha);
28 | void quiet_lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha);
29 | lda_model* load_lda_model(char* model_root);
30 | 
31 | #endif
32 | 
-------------------------------------------------------------------------------- /ext/lda-ruby/lda.h: --------------------------------------------------------------------------------
1 | // (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu)
2 | 
3 | // This file is part of LDA-C.
4 | 
5 | // LDA-C is free software; you can redistribute it and/or modify it under
6 | // the terms of the GNU General Public License as published by the Free
7 | // Software Foundation; either version 2 of the License, or (at your
8 | // option) any later version.
9 | 10 | // LDA-C is distributed in the hope that it will be useful, but WITHOUT 11 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 | // for more details. 14 | 15 | // You should have received a copy of the GNU General Public License 16 | // along with this program; if not, write to the Free Software 17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 18 | // USA 19 | 20 | #ifndef LDA_H 21 | #define LDA_H 22 | 23 | 24 | typedef struct { 25 | int* words; 26 | int* counts; 27 | int length; 28 | int total; 29 | } document; 30 | 31 | 32 | typedef struct { 33 | document* docs; 34 | int num_terms; 35 | int num_docs; 36 | } corpus; 37 | 38 | 39 | typedef struct { 40 | double alpha; 41 | double** log_prob_w; 42 | int num_topics; 43 | int num_terms; 44 | } lda_model; 45 | 46 | 47 | typedef struct { 48 | double** class_word; 49 | double* class_total; 50 | double alpha_suffstats; 51 | int num_docs; 52 | } lda_suffstats; 53 | 54 | #endif -------------------------------------------------------------------------------- /ext/lda-ruby/utils.c: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | 3 | /* 4 | * given log(a) and log(b), return log(a + b) 5 | * 6 | */ 7 | 8 | double log_sum(double log_a, double log_b) 9 | { 10 | double v; 11 | 12 | if (log_a < log_b) 13 | { 14 | v = log_b+log(1 + exp(log_a-log_b)); 15 | } 16 | else 17 | { 18 | v = log_a+log(1 + exp(log_b-log_a)); 19 | } 20 | return(v); 21 | } 22 | 23 | /** 24 | * Proc to calculate the value of the trigamma, the second 25 | * derivative of the loggamma function. Accepts positive matrices. 26 | * From Abromowitz and Stegun. Uses formulas 6.4.11 and 6.4.12 with 27 | * recurrence formula 6.4.6. Each requires workspace at least 5 28 | * times the size of X. 
29 | * 30 | **/ 31 | 32 | double trigamma(double x) 33 | { 34 | double p; 35 | int i; 36 | 37 | x=x+6; 38 | p=1/(x*x); 39 | p=(((((0.075757575757576*p-0.033333333333333)*p+0.0238095238095238) 40 | *p-0.033333333333333)*p+0.166666666666667)*p+1)/x+0.5*p; 41 | for (i=0; i<6 ;i++) 42 | { 43 | x=x-1; 44 | p=1/(x*x)+p; 45 | } 46 | return(p); 47 | } 48 | 49 | 50 | /* 51 | * taylor approximation of first derivative of the log gamma function 52 | * 53 | */ 54 | 55 | double digamma(double x) 56 | { 57 | double p; 58 | x=x+6; 59 | p=1/(x*x); 60 | p=(((0.004166666666667*p-0.003968253986254)*p+ 61 | 0.008333333333333)*p-0.083333333333333)*p; 62 | p=p+log(x)-0.5/x-1/(x-1)-1/(x-2)-1/(x-3)-1/(x-4)-1/(x-5)-1/(x-6); 63 | return p; 64 | } 65 | 66 | 67 | double log_gamma(double x) 68 | { 69 | double z=1/(x*x); 70 | 71 | x=x+6; 72 | z=(((-0.000595238095238*z+0.000793650793651) 73 | *z-0.002777777777778)*z+0.083333333333333)/x; 74 | z=(x-0.5)*log(x)-x+0.918938533204673+z-log(x-1)- 75 | log(x-2)-log(x-3)-log(x-4)-log(x-5)-log(x-6); 76 | return z; 77 | } 78 | 79 | 80 | 81 | /* 82 | * make directory 83 | * 84 | */ 85 | 86 | void make_directory(char* name) 87 | { 88 | mkdir(name, S_IRUSR|S_IWUSR|S_IXUSR); 89 | } 90 | 91 | 92 | /* 93 | * argmax 94 | * 95 | */ 96 | 97 | int argmax(double* x, int n) 98 | { 99 | int i; 100 | double max = x[0]; 101 | int argmax = 0; 102 | for (i = 1; i < n; i++) 103 | { 104 | if (x[i] > max) 105 | { 106 | max = x[i]; 107 | argmax = i; 108 | } 109 | } 110 | return(argmax); 111 | } 112 | -------------------------------------------------------------------------------- /ext/lda-ruby/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_H 2 | #define UTILS_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | double log_sum(double log_a, double log_b); 12 | double trigamma(double x); 13 | double digamma(double x); 14 | double log_gamma(double x); 15 | void 
make_directory(char* name); 16 | int argmax(double* x, int n); 17 | 18 | #endif 19 | -------------------------------------------------------------------------------- /lda-ruby.gemspec: -------------------------------------------------------------------------------- 1 | # Generated by jeweler 2 | # DO NOT EDIT THIS FILE DIRECTLY 3 | # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec' 4 | # -*- encoding: utf-8 -*- 5 | # stub: lda-ruby 0.3.9 ruby libext 6 | # stub: ext/lda-ruby/extconf.rb 7 | 8 | Gem::Specification.new do |s| 9 | s.name = "lda-ruby" 10 | s.version = "0.3.9" 11 | 12 | s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version= 13 | s.require_paths = ["lib", "ext"] 14 | s.authors = ["David Blei", "Jason Adams", "Rio Akasaka"] 15 | s.date = "2015-02-11" 16 | s.description = "Ruby port of Latent Dirichlet Allocation by David M. Blei. See http://www.cs.princeton.edu/~blei/lda-c/." 17 | s.email = "jasonmadams@gmail.com" 18 | s.extensions = ["ext/lda-ruby/extconf.rb"] 19 | s.extra_rdoc_files = [ 20 | "README.md" 21 | ] 22 | s.files = [ 23 | "CHANGELOG.md", 24 | "README.md", 25 | "Rakefile", 26 | "VERSION.yml", 27 | "ext/lda-ruby/Makefile", 28 | "ext/lda-ruby/cokus.c", 29 | "ext/lda-ruby/cokus.h", 30 | "ext/lda-ruby/extconf.rb", 31 | "ext/lda-ruby/lda-alpha.c", 32 | "ext/lda-ruby/lda-alpha.h", 33 | "ext/lda-ruby/lda-data.c", 34 | "ext/lda-ruby/lda-data.h", 35 | "ext/lda-ruby/lda-inference.c", 36 | "ext/lda-ruby/lda-inference.h", 37 | "ext/lda-ruby/lda-model.c", 38 | "ext/lda-ruby/lda-model.h", 39 | "ext/lda-ruby/lda.h", 40 | "ext/lda-ruby/utils.c", 41 | "ext/lda-ruby/utils.h", 42 | "lda-ruby.gemspec", 43 | "lib/lda-ruby.rb", 44 | "lib/lda-ruby/config/stopwords.yml", 45 | "lib/lda-ruby/corpus/corpus.rb", 46 | "lib/lda-ruby/corpus/data_corpus.rb", 47 | "lib/lda-ruby/corpus/directory_corpus.rb", 48 | "lib/lda-ruby/corpus/text_corpus.rb", 49 | "lib/lda-ruby/document/data_document.rb", 50 | 
"lib/lda-ruby/document/document.rb", 51 | "lib/lda-ruby/document/text_document.rb", 52 | "lib/lda-ruby/vocabulary.rb", 53 | "license.txt", 54 | "test/data/.gitignore", 55 | "test/data/docs.dat", 56 | "test/data/sample.rb", 57 | "test/data/wiki-test-docs.yml", 58 | "test/lda_ruby_test.rb", 59 | "test/simple_test.rb", 60 | "test/simple_yaml.rb", 61 | "test/test_helper.rb" 62 | ] 63 | s.homepage = "http://github.com/ealdent/lda-ruby" 64 | s.rubygems_version = "2.4.5" 65 | s.summary = "Ruby port of Latent Dirichlet Allocation by David M. Blei." 66 | 67 | if s.respond_to? :specification_version then 68 | s.specification_version = 4 69 | 70 | if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then 71 | s.add_runtime_dependency(%q, [">= 0"]) 72 | else 73 | s.add_dependency(%q, [">= 0"]) 74 | end 75 | else 76 | s.add_dependency(%q, [">= 0"]) 77 | end 78 | end 79 | 80 | -------------------------------------------------------------------------------- /lib/lda-ruby.rb: -------------------------------------------------------------------------------- 1 | $LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirname(__FILE__)) 2 | 3 | require 'lda-ruby/lda' 4 | require 'lda-ruby/document/document' 5 | require 'lda-ruby/document/data_document' 6 | require 'lda-ruby/document/text_document' 7 | require 'lda-ruby/corpus/corpus' 8 | require 'lda-ruby/corpus/data_corpus' 9 | require 'lda-ruby/corpus/text_corpus' 10 | require 'lda-ruby/corpus/directory_corpus' 11 | require 'lda-ruby/vocabulary' 12 | 13 | module Lda 14 | class Lda 15 | attr_reader :vocab, :corpus 16 | 17 | def initialize(corpus) 18 | load_default_settings 19 | 20 | @vocab = nil 21 | self.corpus = corpus 22 | @vocab = corpus.vocabulary.to_a if corpus.vocabulary 23 | 24 | @phi = nil 25 | end 26 | 27 | def load_default_settings 28 | self.max_iter = 20 29 | self.convergence = 1e-6 30 | self.em_max_iter = 100 31 | self.em_convergence = 1e-4 32 | self.num_topics = 20 33 | self.init_alpha = 0.3 
34 | self.est_alpha = 1 35 | 36 | [20, 1e-6, 100, 1e-4, 20, 0.3, 1] 37 | end 38 | 39 | def load_corpus(filename) 40 | @corpus = Corpus.new 41 | @corpus.load_from_file(filename) 42 | 43 | true 44 | end 45 | 46 | def load_vocabulary(vocab) 47 | if vocab.is_a?(Array) 48 | @vocab = Marshal::load(Marshal::dump(vocab)) # deep clone array 49 | elsif vocab.is_a?(Vocabulary) 50 | @vocab = vocab.to_a 51 | else 52 | @vocab = File.open(vocab, 'r') { |f| f.read.split(/\s+/) } 53 | end 54 | 55 | true 56 | end 57 | 58 | # 59 | # Visualization method for printing out the top +words_per_topic+ words 60 | # for each topic. 61 | # 62 | # See also +top_words+. 63 | # 64 | def print_topics(words_per_topic = 10) 65 | raise 'No vocabulary loaded.' unless @vocab 66 | 67 | beta.each_with_index do |topic, topic_num| 68 | # Sort the topic array and return the sorted indices of the best scores 69 | indices = topic.zip((0...@vocab.size).to_a).sort { |x| x[0] }.map { |_i, j| j }.reverse[0...words_per_topic] 70 | 71 | puts "Topic #{topic_num}" 72 | puts "\t#{indices.map { |i| @vocab[i] }.join("\n\t")}" 73 | puts '' 74 | end 75 | 76 | nil 77 | end 78 | 79 | # 80 | # After the model has been run and a vocabulary has been loaded, return the 81 | # +words_per_topic+ top words chosen by the model for each topic. This is 82 | # returned as a hash mapping the topic number to an array of top words 83 | # (in descending order of importance). 84 | # 85 | # topic_number => [w1, w2, ..., w_n] 86 | # 87 | # See also +print_topics+. 88 | # 89 | def top_word_indices(words_per_topic = 10) 90 | raise 'No vocabulary loaded.' 
unless @vocab 91 | 92 | # find the highest scoring words per topic 93 | topics = {} 94 | 95 | beta.each_with_index do |topic, topic_num| 96 | topics[topic_num] = topic.zip((0...@vocab.size).to_a).sort { |x| x[0] }.map { |_i, j| j }.reverse[0...words_per_topic] 97 | end 98 | 99 | topics 100 | end 101 | 102 | def top_words(words_per_topic = 10) 103 | output = {} 104 | 105 | topics = top_word_indices(words_per_topic) 106 | topics.each_pair do |topic_num, words| 107 | output[topic_num] = words.map { |w| @vocab[w] } 108 | end 109 | 110 | output 111 | end 112 | 113 | # 114 | # Get the phi matrix which can be used to assign probabilities to words 115 | # belonging to a specific topic in each document. The return value is a 116 | # 3D matrix: num_docs x doc_length x num_topics. The value is cached 117 | # after the first call, so if it needs to be recomputed, set the +recompute+ 118 | # value to true. 119 | # 120 | def phi(recompute = false) 121 | if @phi.nil? || recompute 122 | @phi = self.compute_phi 123 | end 124 | 125 | @phi 126 | end 127 | 128 | # 129 | # Compute the average log probability for each topic for each document in the corpus. 130 | # This method returns a matrix: num_docs x num_topics with the average log probability 131 | # for the topic in the document. 132 | # 133 | def compute_topic_document_probability 134 | outp = [] 135 | 136 | @corpus.documents.each_with_index do |doc, idx| 137 | tops = [0.0] * num_topics 138 | ttl = doc.counts.inject(0.0) { |sum, i| sum + i } 139 | phi[idx].each_with_index do |word_dist, word_idx| 140 | word_dist.each_with_index do |top_prob, top_idx| 141 | tops[top_idx] += Math.log(top_prob) * doc.counts[word_idx] 142 | end 143 | end 144 | tops = tops.map { |i| i / ttl } 145 | outp << tops 146 | end 147 | 148 | outp 149 | end 150 | 151 | # 152 | # String representation displaying current settings. 
153 | # 154 | def to_s 155 | outp = ['LDA Settings:'] 156 | outp << ' Initial alpha: %0.6f'.format(init_alpha) 157 | outp << ' # of topics: %d'.format(num_topics) 158 | outp << ' Max iterations: %d'.format(max_iter) 159 | outp << ' Convergence: %0.6f'.format(convergence) 160 | outp << 'EM max iterations: %d'.format(em_max_iter) 161 | outp << ' EM convergence: %0.6f'.format(em_convergence) 162 | outp << ' Estimate alpha: %d'.format(est_alpha) 163 | 164 | outp.join("\n") 165 | end 166 | end 167 | end 168 | -------------------------------------------------------------------------------- /lib/lda-ruby/config/stopwords.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - a 3 | - a's 4 | - able 5 | - about 6 | - above 7 | - according 8 | - accordingly 9 | - across 10 | - actually 11 | - after 12 | - afterwards 13 | - again 14 | - against 15 | - ain't 16 | - all 17 | - allow 18 | - allows 19 | - almost 20 | - alone 21 | - along 22 | - already 23 | - also 24 | - although 25 | - always 26 | - am 27 | - among 28 | - amongst 29 | - an 30 | - and 31 | - another 32 | - any 33 | - anybody 34 | - anyhow 35 | - anyone 36 | - anything 37 | - anyway 38 | - anyways 39 | - anywhere 40 | - apart 41 | - appear 42 | - appreciate 43 | - appropriate 44 | - are 45 | - aren't 46 | - around 47 | - as 48 | - aside 49 | - ask 50 | - asking 51 | - associated 52 | - at 53 | - available 54 | - away 55 | - awfully 56 | - b 57 | - be 58 | - became 59 | - because 60 | - become 61 | - becomes 62 | - becoming 63 | - been 64 | - before 65 | - beforehand 66 | - behind 67 | - being 68 | - believe 69 | - below 70 | - beside 71 | - besides 72 | - best 73 | - better 74 | - between 75 | - beyond 76 | - both 77 | - brief 78 | - but 79 | - by 80 | - c 81 | - c'mon 82 | - c's 83 | - came 84 | - can 85 | - can't 86 | - cannot 87 | - cant 88 | - cause 89 | - causes 90 | - certain 91 | - certainly 92 | - changes 93 | - clearly 94 | - co 95 | - com 96 | - come 97 | - 
comes 98 | - concerning 99 | - consequently 100 | - consider 101 | - considering 102 | - contain 103 | - containing 104 | - contains 105 | - corresponding 106 | - could 107 | - couldn't 108 | - course 109 | - currently 110 | - d 111 | - definitely 112 | - described 113 | - despite 114 | - did 115 | - didn't 116 | - different 117 | - do 118 | - does 119 | - doesn't 120 | - doing 121 | - don't 122 | - done 123 | - down 124 | - downwards 125 | - during 126 | - e 127 | - each 128 | - edu 129 | - eg 130 | - eight 131 | - either 132 | - else 133 | - elsewhere 134 | - enough 135 | - entirely 136 | - especially 137 | - et 138 | - etc 139 | - even 140 | - ever 141 | - every 142 | - everybody 143 | - everyone 144 | - everything 145 | - everywhere 146 | - ex 147 | - exactly 148 | - example 149 | - except 150 | - f 151 | - far 152 | - few 153 | - fifth 154 | - first 155 | - five 156 | - followed 157 | - following 158 | - follows 159 | - for 160 | - former 161 | - formerly 162 | - forth 163 | - four 164 | - from 165 | - further 166 | - furthermore 167 | - g 168 | - get 169 | - gets 170 | - getting 171 | - given 172 | - gives 173 | - go 174 | - goes 175 | - going 176 | - gone 177 | - got 178 | - gotten 179 | - greetings 180 | - h 181 | - had 182 | - hadn't 183 | - happens 184 | - hardly 185 | - has 186 | - hasn't 187 | - have 188 | - haven't 189 | - having 190 | - he 191 | - he's 192 | - hello 193 | - help 194 | - hence 195 | - her 196 | - here 197 | - here's 198 | - hereafter 199 | - hereby 200 | - herein 201 | - hereupon 202 | - hers 203 | - herself 204 | - hi 205 | - him 206 | - himself 207 | - his 208 | - hither 209 | - hopefully 210 | - how 211 | - howbeit 212 | - however 213 | - i 214 | - i'd 215 | - i'll 216 | - i'm 217 | - i've 218 | - ie 219 | - if 220 | - ignored 221 | - immediate 222 | - in 223 | - inasmuch 224 | - inc 225 | - indeed 226 | - indicate 227 | - indicated 228 | - indicates 229 | - inner 230 | - insofar 231 | - instead 232 | - into 233 | - inward 234 | - 
is 235 | - isn't 236 | - it 237 | - it'd 238 | - it'll 239 | - it's 240 | - its 241 | - itself 242 | - j 243 | - just 244 | - k 245 | - keep 246 | - keeps 247 | - kept 248 | - know 249 | - knows 250 | - known 251 | - l 252 | - last 253 | - lately 254 | - later 255 | - latter 256 | - latterly 257 | - least 258 | - less 259 | - lest 260 | - let 261 | - let's 262 | - like 263 | - liked 264 | - likely 265 | - little 266 | - look 267 | - looking 268 | - looks 269 | - ltd 270 | - m 271 | - mainly 272 | - many 273 | - may 274 | - maybe 275 | - me 276 | - mean 277 | - meanwhile 278 | - merely 279 | - might 280 | - more 281 | - moreover 282 | - most 283 | - mostly 284 | - much 285 | - must 286 | - my 287 | - myself 288 | - n 289 | - name 290 | - namely 291 | - nd 292 | - near 293 | - nearly 294 | - necessary 295 | - need 296 | - needs 297 | - neither 298 | - never 299 | - nevertheless 300 | - new 301 | - next 302 | - nine 303 | - "no" 304 | - nobody 305 | - non 306 | - none 307 | - noone 308 | - nor 309 | - normally 310 | - not 311 | - nothing 312 | - novel 313 | - now 314 | - nowhere 315 | - o 316 | - obviously 317 | - of 318 | - "off" 319 | - often 320 | - oh 321 | - ok 322 | - okay 323 | - old 324 | - "on" 325 | - once 326 | - one 327 | - ones 328 | - only 329 | - onto 330 | - or 331 | - other 332 | - others 333 | - otherwise 334 | - ought 335 | - our 336 | - ours 337 | - ourselves 338 | - out 339 | - outside 340 | - over 341 | - overall 342 | - own 343 | - p 344 | - particular 345 | - particularly 346 | - per 347 | - perhaps 348 | - placed 349 | - please 350 | - plus 351 | - possible 352 | - presumably 353 | - probably 354 | - provides 355 | - q 356 | - que 357 | - quite 358 | - qv 359 | - r 360 | - rather 361 | - rd 362 | - re 363 | - really 364 | - reasonably 365 | - regarding 366 | - regardless 367 | - regards 368 | - relatively 369 | - respectively 370 | - right 371 | - s 372 | - said 373 | - same 374 | - saw 375 | - say 376 | - saying 377 | - says 378 | - second 
379 | - secondly 380 | - see 381 | - seeing 382 | - seem 383 | - seemed 384 | - seeming 385 | - seems 386 | - seen 387 | - self 388 | - selves 389 | - sensible 390 | - sent 391 | - serious 392 | - seriously 393 | - seven 394 | - several 395 | - shall 396 | - she 397 | - should 398 | - shouldn't 399 | - since 400 | - six 401 | - so 402 | - some 403 | - somebody 404 | - somehow 405 | - someone 406 | - something 407 | - sometime 408 | - sometimes 409 | - somewhat 410 | - somewhere 411 | - soon 412 | - sorry 413 | - specified 414 | - specify 415 | - specifying 416 | - still 417 | - sub 418 | - such 419 | - sup 420 | - sure 421 | - t 422 | - t's 423 | - take 424 | - taken 425 | - tell 426 | - tends 427 | - th 428 | - than 429 | - thank 430 | - thanks 431 | - thanx 432 | - that 433 | - that's 434 | - thats 435 | - the 436 | - their 437 | - theirs 438 | - them 439 | - themselves 440 | - then 441 | - thence 442 | - there 443 | - there's 444 | - thereafter 445 | - thereby 446 | - therefore 447 | - therein 448 | - theres 449 | - thereupon 450 | - these 451 | - they 452 | - they'd 453 | - they'll 454 | - they're 455 | - they've 456 | - think 457 | - third 458 | - this 459 | - thorough 460 | - thoroughly 461 | - those 462 | - though 463 | - three 464 | - through 465 | - throughout 466 | - thru 467 | - thus 468 | - to 469 | - together 470 | - too 471 | - took 472 | - toward 473 | - towards 474 | - tried 475 | - tries 476 | - truly 477 | - try 478 | - trying 479 | - twice 480 | - two 481 | - u 482 | - un 483 | - under 484 | - unfortunately 485 | - unless 486 | - unlikely 487 | - until 488 | - unto 489 | - up 490 | - upon 491 | - us 492 | - use 493 | - used 494 | - useful 495 | - uses 496 | - using 497 | - usually 498 | - v 499 | - value 500 | - various 501 | - very 502 | - via 503 | - viz 504 | - vs 505 | - w 506 | - want 507 | - wants 508 | - was 509 | - wasn't 510 | - way 511 | - we 512 | - we'd 513 | - we'll 514 | - we're 515 | - we've 516 | - welcome 517 | - well 518 | - 
went 519 | - were 520 | - weren't 521 | - what 522 | - what's 523 | - whatever 524 | - when 525 | - whence 526 | - whenever 527 | - where 528 | - where's 529 | - whereafter 530 | - whereas 531 | - whereby 532 | - wherein 533 | - whereupon 534 | - wherever 535 | - whether 536 | - which 537 | - while 538 | - whither 539 | - who 540 | - who's 541 | - whoever 542 | - whole 543 | - whom 544 | - whose 545 | - why 546 | - will 547 | - willing 548 | - wish 549 | - with 550 | - within 551 | - without 552 | - won't 553 | - wonder 554 | - would 555 | - would 556 | - wouldn't 557 | - x 558 | - y 559 | - "yes" 560 | - yet 561 | - you 562 | - you'd 563 | - you'll 564 | - you're 565 | - you've 566 | - your 567 | - yours 568 | - yourself 569 | - yourselves 570 | - z 571 | - zero 572 | -------------------------------------------------------------------------------- /lib/lda-ruby/corpus/corpus.rb: -------------------------------------------------------------------------------- 1 | require 'set' 2 | 3 | module Lda 4 | class Corpus 5 | attr_reader :documents, :num_docs, :num_terms, :vocabulary, :stopwords 6 | 7 | def initialize(stop_word_list = nil) 8 | @documents = [] 9 | @all_terms = Set.new 10 | @num_terms = @num_docs = 0 11 | @vocabulary = Vocabulary.new 12 | @stopwords = if stop_word_list.nil? 
13 | File.join(File.dirname(__FILE__), '..', 'config', 'stopwords.yml') 14 | else 15 | stop_word_list 16 | end 17 | @stopwords = YAML.load_file(@stopwords) 18 | @stopwords.map!(&:strip) 19 | end 20 | 21 | def add_document(doc) 22 | raise 'Parameter +doc+ must be of type Document' unless doc.is_a?(Document) 23 | 24 | @documents << doc 25 | 26 | @all_terms += doc.words 27 | @num_docs += 1 28 | @num_terms = @all_terms.size 29 | 30 | update_vocabulary(doc) 31 | nil 32 | end 33 | 34 | def remove_word(word) 35 | @vocabulary.words.delete word 36 | end 37 | 38 | protected 39 | 40 | def update_vocabulary(doc) 41 | doc.tokens.each { |w| @vocabulary.check_word(w) } 42 | end 43 | end 44 | end 45 | -------------------------------------------------------------------------------- /lib/lda-ruby/corpus/data_corpus.rb: -------------------------------------------------------------------------------- 1 | module Lda 2 | class DataCorpus < Corpus 3 | attr_reader :filename 4 | 5 | def initialize(filename) 6 | super() 7 | 8 | @filename = filename 9 | load_from_file 10 | end 11 | 12 | protected 13 | 14 | def load_from_file 15 | txt = File.open(@filename, 'r', &:read) 16 | lines = txt.split(/[\r\n]+/) 17 | lines.each do |line| 18 | add_document(DataDocument.new(self, line)) 19 | end 20 | end 21 | end 22 | end 23 | -------------------------------------------------------------------------------- /lib/lda-ruby/corpus/directory_corpus.rb: -------------------------------------------------------------------------------- 1 | module Lda 2 | class DirectoryCorpus < Corpus 3 | attr_reader :path, :extension 4 | 5 | # load documents from a directory 6 | def initialize(path, extension = nil) 7 | super() 8 | 9 | @path = path.dup.freeze 10 | @extension = extension ? extension.dup.freeze : nil 11 | 12 | load_from_directory 13 | end 14 | 15 | protected 16 | 17 | def load_from_directory 18 | dir_glob = File.join(@path, (@extension ? 
"*.#{@extension}" : '*')) 19 | 20 | Dir.glob(dir_glob).each do |filename| 21 | add_document(TextDocument.build_from_file(self, filename)) 22 | end 23 | end 24 | end 25 | end 26 | -------------------------------------------------------------------------------- /lib/lda-ruby/corpus/text_corpus.rb: -------------------------------------------------------------------------------- 1 | module Lda 2 | class TextCorpus < Corpus 3 | attr_reader :filename 4 | 5 | # Loads text documents from a YAML file or an array of strings 6 | def initialize(input_data) 7 | super() 8 | 9 | docs = if input_data.is_a?(String) && File.exist?(input_data) 10 | # yaml file containing an array of strings representing each document 11 | YAML.load_file(input_data) 12 | elsif input_data.is_a?(Array) 13 | # an array of strings representing each document 14 | input_data.dup 15 | elsif input_data.is_a?(String) 16 | # a single string representing one document 17 | [input_data] 18 | else 19 | raise 'Unknown input type: please pass in a valid filename or an array of strings.' 20 | end 21 | 22 | docs.each do |doc| 23 | add_document(TextDocument.new(self, doc)) 24 | end 25 | end 26 | end 27 | end 28 | -------------------------------------------------------------------------------- /lib/lda-ruby/document/data_document.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Create the Document using the svmlight-style text line: 3 | # 4 | # num_words w1:freq1 w2:freq2 ... w_n:freq_n 5 | # 6 | # Ex. 7 | # 5 1:2 3:1 4:2 7:3 12:1 8 | # 9 | # The value for the number of words should equal the number of pairs 10 | # following it, though this isn't at all enforced. Order of word-pair 11 | # indices is not important. 
#

# NOTE(review): in this concatenated listing the Document base class is
# placed before its subclasses so the chunk can be loaded standalone; the
# original repository keeps these classes in separate files under
# lib/lda-ruby/document/ and lib/lda-ruby/vocabulary.rb.

# ----------------------------------------------------------------------------
# /lib/lda-ruby/document/document.rb:
# ----------------------------------------------------------------------------
require 'yaml'

module Lda
  # Base class for all document representations.  A document is a bag of
  # words: parallel arrays of vocabulary indices (+words+) and per-word
  # occurrence counts (+counts+).
  class Document
    attr_reader :corpus, :words, :counts, :length, :total, :tokens

    # corpus - the Corpus object this document belongs to.
    def initialize(corpus)
      @corpus = corpus

      @words  = []  # vocabulary indices (0-based)
      @counts = []  # occurrence count for the matching entry in @words
      @tokens = []  # raw tokens (only populated for text documents)
      @length = 0   # number of distinct words
      @total  = 0   # total number of word occurrences
    end

    #
    # Recompute the cached +total+ and +length+ values after the
    # words/counts arrays have been modified.
    #
    def recompute
      @total  = @counts.inject(0) { |sum, i| sum + i }
      @length = @words.size
    end

    # Subclasses built from raw text override this to return true.
    def text?
      false
    end

    # Hook for subclasses to post-process the token stream; the default
    # implementation returns the tokens unchanged.
    def handle(tokens)
      tokens
    end

    # Split +text+ into lower-cased word tokens, keeping only letters
    # (including German umlauts/eszett), apostrophes and hyphens.
    # Stores the result in @tokens via #handle; returns nil.
    def tokenize(text)
      # remove everything but letters and ' and leave only single spaces
      clean_text = text.gsub(/[^a-zäöüß'-]+/i, ' ').gsub(/\s+/, ' ').downcase
      @tokens = handle(clean_text.split(' '))
      nil
    end
  end
end

# ----------------------------------------------------------------------------
# /lib/lda-ruby/document/data_document.rb:
# ----------------------------------------------------------------------------
module Lda
  # Document constructed from one line of an SVMlight-style data file:
  #
  #   <num_terms> <term_id>:<count> <term_id>:<count> ...
  #
  class DataDocument < Document
    # corpus - the Corpus object this document belongs to.
    # data   - a single data-file line in the format described above.
    def initialize(corpus, data)
      super(corpus)

      items = data.split(/\s+/)
      # items[0] is the term count; every remaining item is an
      # "id:count" pair.
      pairs = items.drop(1).map { |item| item.split(':') }

      pairs.each do |feature_identifier, feature_weight|
        @words << feature_identifier.to_i
        @counts << feature_weight.to_i
      end

      recompute
    end
  end
end

# ----------------------------------------------------------------------------
# /lib/lda-ruby/document/text_document.rb:
# ----------------------------------------------------------------------------
module Lda
  # Document built by tokenizing free text (optionally read from a file).
  class TextDocument < Document
    attr_reader :filename

    # corpus - the Corpus object this document belongs to (must respond to
    #          #stopwords and #vocabulary).
    # text   - the raw text to tokenize.
    def initialize(corpus, text)
      super(corpus)
      @filename = nil

      tokenize(text)
      @corpus.stopwords.each { |w| @tokens.delete(w) }
      build_from_tokens
    end

    def text?
      true
    end

    # Build a TextDocument from the contents of +filename+.
    #
    # BUGFIX: the original assigned @filename inside this *class* method,
    # which set a class-level instance variable and left the constructed
    # document's own @filename nil (so #filename always returned nil).
    # The filename is now recorded on the instance.
    def self.build_from_file(corpus, filename)
      frozen_name = filename.dup.freeze
      doc = new(corpus, File.read(frozen_name))
      doc.instance_variable_set(:@filename, frozen_name)
      doc
    end

    protected

    # Convert @tokens into the words/counts representation, registering
    # each token with the corpus vocabulary.
    def build_from_tokens
      vocab = Hash.new(0)
      @tokens.each { |t| vocab[t] += 1 }

      vocab.each_pair do |word, count|
        # vocabulary indices are 1-based; documents store 0-based indices
        @words << @corpus.vocabulary.check_word(word) - 1
        @counts << count
      end

      recompute
    end
  end
end

# ----------------------------------------------------------------------------
# /lib/lda-ruby/vocabulary.rb:
# ----------------------------------------------------------------------------
module Lda
  # Bidirectional word <-> index mapping.  Indices are 1-based and handed
  # out in insertion order; the bookkeeping key :MAX_VALUE holds the
  # highest index assigned so far and lives inside @words itself (which is
  # why #num_words and #to_a exclude it).
  class Vocabulary
    attr_reader :words, :indexes

    # words - optional enumerable of initial words to register.
    def initialize(words = nil)
      # Default proc assigns the next free index to any unseen word.
      @words = Hash.new do |hash, key|
        if hash.member?(:MAX_VALUE)
          hash[:MAX_VALUE] = hash[:MAX_VALUE] + 1
        else
          hash[:MAX_VALUE] = 1
        end
        hash[key] = hash[:MAX_VALUE]
      end

      words.each { |w| @words[w] } if words
      @indexes = Hash.new

      @words.each_pair do |w, i|
        @indexes[i] = w
      end
    end

    # Return the 1-based index for +word+, registering it first if unseen.
    def check_word(word)
      w = @words[word.dup]
      @indexes[w] = word.dup
      w
    end

    # Register every newline-separated word found in +filename+.
    def load_file(filename)
      txt = File.open(filename, 'r') { |f| f.read }
      txt.split(/[\n\r]+/).each { |word| check_word(word) }
    end

    # Register every word from a YAML array file.
    def load_yaml(filename)
      YAML::load_file(filename).each { |word| check_word(word) }
    end

    # Number of real words (excludes the :MAX_VALUE bookkeeping entry).
    def num_words
      @words.size > 0 ? @words.size - 1 : 0
    end

    # Words sorted by their index, with the bookkeeping key removed.
    def to_a
      @words.sort { |w1, w2| w1[1] <=> w2[1] }.map { |word, idx| word }.reject { |w| w == :MAX_VALUE }
    end
  end
end

# ----------------------------------------------------------------------------
# /license.txt:
# ----------------------------------------------------------------------------
#                   GNU LESSER GENERAL PUBLIC LICENSE
#                        Version 2.1, February 1999
#
#  Copyright (C) 1991, 1999 Free Software Foundation, Inc.
#      59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#  Everyone is permitted to copy and distribute verbatim copies
#  of this license document, but changing it is not allowed.
#
# [This is the first released version of the Lesser GPL.  It also counts
#  as the successor of the GNU Library Public License, version 2, hence
#  the version number 2.1.]
#
#                             Preamble
#
#   The licenses for most software are designed to take away your
# freedom to share and change it.  By contrast, the GNU General Public
# Licenses are intended to guarantee your freedom to share and change
# free software--to make sure the software is free for all its users.
#
#   This license, the Lesser General Public License, applies to some
# specially designated software packages--typically libraries--of the
# Free Software Foundation and other authors who decide to use it.  You
# can use it too, but we suggest you first think carefully about whether
# this license or the ordinary General Public License is the better
# strategy to use in any particular case, based on the explanations below.
#
#   When we speak of free software, we are referring to freedom of use,
# not price.
Our General Public Licenses are designed to make sure that 29 | you have the freedom to distribute copies of free software (and charge 30 | for this service if you wish); that you receive source code or can get 31 | it if you want it; that you can change the software and use pieces of 32 | it in new free programs; and that you are informed that you can do 33 | these things. 34 | 35 | To protect your rights, we need to make restrictions that forbid 36 | distributors to deny you these rights or to ask you to surrender these 37 | rights. These restrictions translate to certain responsibilities for 38 | you if you distribute copies of the library or if you modify it. 39 | 40 | For example, if you distribute copies of the library, whether gratis 41 | or for a fee, you must give the recipients all the rights that we gave 42 | you. You must make sure that they, too, receive or can get the source 43 | code. If you link other code with the library, you must provide 44 | complete object files to the recipients, so that they can relink them 45 | with the library after making changes to the library and recompiling 46 | it. And you must show them these terms so they know their rights. 47 | 48 | We protect your rights with a two-step method: (1) we copyright the 49 | library, and (2) we offer you this license, which gives you legal 50 | permission to copy, distribute and/or modify the library. 51 | 52 | To protect each distributor, we want to make it very clear that 53 | there is no warranty for the free library. Also, if the library is 54 | modified by someone else and passed on, the recipients should know 55 | that what they have is not the original version, so that the original 56 | author's reputation will not be affected by problems that might be 57 | introduced by others. 58 | 59 | Finally, software patents pose a constant threat to the existence of 60 | any free program. 
We wish to make sure that a company cannot 61 | effectively restrict the users of a free program by obtaining a 62 | restrictive license from a patent holder. Therefore, we insist that 63 | any patent license obtained for a version of the library must be 64 | consistent with the full freedom of use specified in this license. 65 | 66 | Most GNU software, including some libraries, is covered by the 67 | ordinary GNU General Public License. This license, the GNU Lesser 68 | General Public License, applies to certain designated libraries, and 69 | is quite different from the ordinary General Public License. We use 70 | this license for certain libraries in order to permit linking those 71 | libraries into non-free programs. 72 | 73 | When a program is linked with a library, whether statically or using 74 | a shared library, the combination of the two is legally speaking a 75 | combined work, a derivative of the original library. The ordinary 76 | General Public License therefore permits such linking only if the 77 | entire combination fits its criteria of freedom. The Lesser General 78 | Public License permits more lax criteria for linking other code with 79 | the library. 80 | 81 | We call this license the "Lesser" General Public License because it 82 | does Less to protect the user's freedom than the ordinary General 83 | Public License. It also provides other free software developers Less 84 | of an advantage over competing non-free programs. These disadvantages 85 | are the reason we use the ordinary General Public License for many 86 | libraries. However, the Lesser license provides advantages in certain 87 | special circumstances. 88 | 89 | For example, on rare occasions, there may be a special need to 90 | encourage the widest possible use of a certain library, so that it becomes 91 | a de-facto standard. To achieve this, non-free programs must be 92 | allowed to use the library. 
A more frequent case is that a free 93 | library does the same job as widely used non-free libraries. In this 94 | case, there is little to gain by limiting the free library to free 95 | software only, so we use the Lesser General Public License. 96 | 97 | In other cases, permission to use a particular library in non-free 98 | programs enables a greater number of people to use a large body of 99 | free software. For example, permission to use the GNU C Library in 100 | non-free programs enables many more people to use the whole GNU 101 | operating system, as well as its variant, the GNU/Linux operating 102 | system. 103 | 104 | Although the Lesser General Public License is Less protective of the 105 | users' freedom, it does ensure that the user of a program that is 106 | linked with the Library has the freedom and the wherewithal to run 107 | that program using a modified version of the Library. 108 | 109 | The precise terms and conditions for copying, distribution and 110 | modification follow. Pay close attention to the difference between a 111 | "work based on the library" and a "work that uses the library". The 112 | former contains code derived from the library, whereas the latter must 113 | be combined with the library in order to run. 114 | 115 | GNU LESSER GENERAL PUBLIC LICENSE 116 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 117 | 118 | 0. This License Agreement applies to any software library or other 119 | program which contains a notice placed by the copyright holder or 120 | other authorized party saying it may be distributed under the terms of 121 | this Lesser General Public License (also called "this License"). 122 | Each licensee is addressed as "you". 123 | 124 | A "library" means a collection of software functions and/or data 125 | prepared so as to be conveniently linked with application programs 126 | (which use some of those functions and data) to form executables. 
127 | 128 | The "Library", below, refers to any such software library or work 129 | which has been distributed under these terms. A "work based on the 130 | Library" means either the Library or any derivative work under 131 | copyright law: that is to say, a work containing the Library or a 132 | portion of it, either verbatim or with modifications and/or translated 133 | straightforwardly into another language. (Hereinafter, translation is 134 | included without limitation in the term "modification".) 135 | 136 | "Source code" for a work means the preferred form of the work for 137 | making modifications to it. For a library, complete source code means 138 | all the source code for all modules it contains, plus any associated 139 | interface definition files, plus the scripts used to control compilation 140 | and installation of the library. 141 | 142 | Activities other than copying, distribution and modification are not 143 | covered by this License; they are outside its scope. The act of 144 | running a program using the Library is not restricted, and output from 145 | such a program is covered only if its contents constitute a work based 146 | on the Library (independent of the use of the Library in a tool for 147 | writing it). Whether that is true depends on what the Library does 148 | and what the program that uses the Library does. 149 | 150 | 1. You may copy and distribute verbatim copies of the Library's 151 | complete source code as you receive it, in any medium, provided that 152 | you conspicuously and appropriately publish on each copy an 153 | appropriate copyright notice and disclaimer of warranty; keep intact 154 | all the notices that refer to this License and to the absence of any 155 | warranty; and distribute a copy of this License along with the 156 | Library. 157 | 158 | You may charge a fee for the physical act of transferring a copy, 159 | and you may at your option offer warranty protection in exchange for a 160 | fee. 161 | 162 | 2. 
You may modify your copy or copies of the Library or any portion 163 | of it, thus forming a work based on the Library, and copy and 164 | distribute such modifications or work under the terms of Section 1 165 | above, provided that you also meet all of these conditions: 166 | 167 | a) The modified work must itself be a software library. 168 | 169 | b) You must cause the files modified to carry prominent notices 170 | stating that you changed the files and the date of any change. 171 | 172 | c) You must cause the whole of the work to be licensed at no 173 | charge to all third parties under the terms of this License. 174 | 175 | d) If a facility in the modified Library refers to a function or a 176 | table of data to be supplied by an application program that uses 177 | the facility, other than as an argument passed when the facility 178 | is invoked, then you must make a good faith effort to ensure that, 179 | in the event an application does not supply such function or 180 | table, the facility still operates, and performs whatever part of 181 | its purpose remains meaningful. 182 | 183 | (For example, a function in a library to compute square roots has 184 | a purpose that is entirely well-defined independent of the 185 | application. Therefore, Subsection 2d requires that any 186 | application-supplied function or table used by this function must 187 | be optional: if the application does not supply it, the square 188 | root function must still compute square roots.) 189 | 190 | These requirements apply to the modified work as a whole. If 191 | identifiable sections of that work are not derived from the Library, 192 | and can be reasonably considered independent and separate works in 193 | themselves, then this License, and its terms, do not apply to those 194 | sections when you distribute them as separate works. 
But when you 195 | distribute the same sections as part of a whole which is a work based 196 | on the Library, the distribution of the whole must be on the terms of 197 | this License, whose permissions for other licensees extend to the 198 | entire whole, and thus to each and every part regardless of who wrote 199 | it. 200 | 201 | Thus, it is not the intent of this section to claim rights or contest 202 | your rights to work written entirely by you; rather, the intent is to 203 | exercise the right to control the distribution of derivative or 204 | collective works based on the Library. 205 | 206 | In addition, mere aggregation of another work not based on the Library 207 | with the Library (or with a work based on the Library) on a volume of 208 | a storage or distribution medium does not bring the other work under 209 | the scope of this License. 210 | 211 | 3. You may opt to apply the terms of the ordinary GNU General Public 212 | License instead of this License to a given copy of the Library. To do 213 | this, you must alter all the notices that refer to this License, so 214 | that they refer to the ordinary GNU General Public License, version 2, 215 | instead of to this License. (If a newer version than version 2 of the 216 | ordinary GNU General Public License has appeared, then you can specify 217 | that version instead if you wish.) Do not make any other change in 218 | these notices. 219 | 220 | Once this change is made in a given copy, it is irreversible for 221 | that copy, so the ordinary GNU General Public License applies to all 222 | subsequent copies and derivative works made from that copy. 223 | 224 | This option is useful when you wish to copy part of the code of 225 | the Library into a program that is not a library. 226 | 227 | 4. 
You may copy and distribute the Library (or a portion or 228 | derivative of it, under Section 2) in object code or executable form 229 | under the terms of Sections 1 and 2 above provided that you accompany 230 | it with the complete corresponding machine-readable source code, which 231 | must be distributed under the terms of Sections 1 and 2 above on a 232 | medium customarily used for software interchange. 233 | 234 | If distribution of object code is made by offering access to copy 235 | from a designated place, then offering equivalent access to copy the 236 | source code from the same place satisfies the requirement to 237 | distribute the source code, even though third parties are not 238 | compelled to copy the source along with the object code. 239 | 240 | 5. A program that contains no derivative of any portion of the 241 | Library, but is designed to work with the Library by being compiled or 242 | linked with it, is called a "work that uses the Library". Such a 243 | work, in isolation, is not a derivative work of the Library, and 244 | therefore falls outside the scope of this License. 245 | 246 | However, linking a "work that uses the Library" with the Library 247 | creates an executable that is a derivative of the Library (because it 248 | contains portions of the Library), rather than a "work that uses the 249 | library". The executable is therefore covered by this License. 250 | Section 6 states terms for distribution of such executables. 251 | 252 | When a "work that uses the Library" uses material from a header file 253 | that is part of the Library, the object code for the work may be a 254 | derivative work of the Library even though the source code is not. 255 | Whether this is true is especially significant if the work can be 256 | linked without the Library, or if the work is itself a library. The 257 | threshold for this to be true is not precisely defined by law. 
258 | 259 | If such an object file uses only numerical parameters, data 260 | structure layouts and accessors, and small macros and small inline 261 | functions (ten lines or less in length), then the use of the object 262 | file is unrestricted, regardless of whether it is legally a derivative 263 | work. (Executables containing this object code plus portions of the 264 | Library will still fall under Section 6.) 265 | 266 | Otherwise, if the work is a derivative of the Library, you may 267 | distribute the object code for the work under the terms of Section 6. 268 | Any executables containing that work also fall under Section 6, 269 | whether or not they are linked directly with the Library itself. 270 | 271 | 6. As an exception to the Sections above, you may also combine or 272 | link a "work that uses the Library" with the Library to produce a 273 | work containing portions of the Library, and distribute that work 274 | under terms of your choice, provided that the terms permit 275 | modification of the work for the customer's own use and reverse 276 | engineering for debugging such modifications. 277 | 278 | You must give prominent notice with each copy of the work that the 279 | Library is used in it and that the Library and its use are covered by 280 | this License. You must supply a copy of this License. If the work 281 | during execution displays copyright notices, you must include the 282 | copyright notice for the Library among them, as well as a reference 283 | directing the user to the copy of this License. 
Also, you must do one 284 | of these things: 285 | 286 | a) Accompany the work with the complete corresponding 287 | machine-readable source code for the Library including whatever 288 | changes were used in the work (which must be distributed under 289 | Sections 1 and 2 above); and, if the work is an executable linked 290 | with the Library, with the complete machine-readable "work that 291 | uses the Library", as object code and/or source code, so that the 292 | user can modify the Library and then relink to produce a modified 293 | executable containing the modified Library. (It is understood 294 | that the user who changes the contents of definitions files in the 295 | Library will not necessarily be able to recompile the application 296 | to use the modified definitions.) 297 | 298 | b) Use a suitable shared library mechanism for linking with the 299 | Library. A suitable mechanism is one that (1) uses at run time a 300 | copy of the library already present on the user's computer system, 301 | rather than copying library functions into the executable, and (2) 302 | will operate properly with a modified version of the library, if 303 | the user installs one, as long as the modified version is 304 | interface-compatible with the version that the work was made with. 305 | 306 | c) Accompany the work with a written offer, valid for at 307 | least three years, to give the same user the materials 308 | specified in Subsection 6a, above, for a charge no more 309 | than the cost of performing this distribution. 310 | 311 | d) If distribution of the work is made by offering access to copy 312 | from a designated place, offer equivalent access to copy the above 313 | specified materials from the same place. 314 | 315 | e) Verify that the user has already received a copy of these 316 | materials or that you have already sent this user a copy. 
317 | 318 | For an executable, the required form of the "work that uses the 319 | Library" must include any data and utility programs needed for 320 | reproducing the executable from it. However, as a special exception, 321 | the materials to be distributed need not include anything that is 322 | normally distributed (in either source or binary form) with the major 323 | components (compiler, kernel, and so on) of the operating system on 324 | which the executable runs, unless that component itself accompanies 325 | the executable. 326 | 327 | It may happen that this requirement contradicts the license 328 | restrictions of other proprietary libraries that do not normally 329 | accompany the operating system. Such a contradiction means you cannot 330 | use both them and the Library together in an executable that you 331 | distribute. 332 | 333 | 7. You may place library facilities that are a work based on the 334 | Library side-by-side in a single library together with other library 335 | facilities not covered by this License, and distribute such a combined 336 | library, provided that the separate distribution of the work based on 337 | the Library and of the other library facilities is otherwise 338 | permitted, and provided that you do these two things: 339 | 340 | a) Accompany the combined library with a copy of the same work 341 | based on the Library, uncombined with any other library 342 | facilities. This must be distributed under the terms of the 343 | Sections above. 344 | 345 | b) Give prominent notice with the combined library of the fact 346 | that part of it is a work based on the Library, and explaining 347 | where to find the accompanying uncombined form of the same work. 348 | 349 | 8. You may not copy, modify, sublicense, link with, or distribute 350 | the Library except as expressly provided under this License. 
Any 351 | attempt otherwise to copy, modify, sublicense, link with, or 352 | distribute the Library is void, and will automatically terminate your 353 | rights under this License. However, parties who have received copies, 354 | or rights, from you under this License will not have their licenses 355 | terminated so long as such parties remain in full compliance. 356 | 357 | 9. You are not required to accept this License, since you have not 358 | signed it. However, nothing else grants you permission to modify or 359 | distribute the Library or its derivative works. These actions are 360 | prohibited by law if you do not accept this License. Therefore, by 361 | modifying or distributing the Library (or any work based on the 362 | Library), you indicate your acceptance of this License to do so, and 363 | all its terms and conditions for copying, distributing or modifying 364 | the Library or works based on it. 365 | 366 | 10. Each time you redistribute the Library (or any work based on the 367 | Library), the recipient automatically receives a license from the 368 | original licensor to copy, distribute, link with or modify the Library 369 | subject to these terms and conditions. You may not impose any further 370 | restrictions on the recipients' exercise of the rights granted herein. 371 | You are not responsible for enforcing compliance by third parties with 372 | this License. 373 | 374 | 11. If, as a consequence of a court judgment or allegation of patent 375 | infringement or for any other reason (not limited to patent issues), 376 | conditions are imposed on you (whether by court order, agreement or 377 | otherwise) that contradict the conditions of this License, they do not 378 | excuse you from the conditions of this License. If you cannot 379 | distribute so as to satisfy simultaneously your obligations under this 380 | License and any other pertinent obligations, then as a consequence you 381 | may not distribute the Library at all. 
For example, if a patent 382 | license would not permit royalty-free redistribution of the Library by 383 | all those who receive copies directly or indirectly through you, then 384 | the only way you could satisfy both it and this License would be to 385 | refrain entirely from distribution of the Library. 386 | 387 | If any portion of this section is held invalid or unenforceable under any 388 | particular circumstance, the balance of the section is intended to apply, 389 | and the section as a whole is intended to apply in other circumstances. 390 | 391 | It is not the purpose of this section to induce you to infringe any 392 | patents or other property right claims or to contest validity of any 393 | such claims; this section has the sole purpose of protecting the 394 | integrity of the free software distribution system which is 395 | implemented by public license practices. Many people have made 396 | generous contributions to the wide range of software distributed 397 | through that system in reliance on consistent application of that 398 | system; it is up to the author/donor to decide if he or she is willing 399 | to distribute software through any other system and a licensee cannot 400 | impose that choice. 401 | 402 | This section is intended to make thoroughly clear what is believed to 403 | be a consequence of the rest of this License. 404 | 405 | 12. If the distribution and/or use of the Library is restricted in 406 | certain countries either by patents or by copyrighted interfaces, the 407 | original copyright holder who places the Library under this License may add 408 | an explicit geographical distribution limitation excluding those countries, 409 | so that distribution is permitted only in or among countries not thus 410 | excluded. In such case, this License incorporates the limitation as if 411 | written in the body of this License. 412 | 413 | 13. 
The Free Software Foundation may publish revised and/or new 414 | versions of the Lesser General Public License from time to time. 415 | Such new versions will be similar in spirit to the present version, 416 | but may differ in detail to address new problems or concerns. 417 | 418 | Each version is given a distinguishing version number. If the Library 419 | specifies a version number of this License which applies to it and 420 | "any later version", you have the option of following the terms and 421 | conditions either of that version or of any later version published by 422 | the Free Software Foundation. If the Library does not specify a 423 | license version number, you may choose any version ever published by 424 | the Free Software Foundation. 425 | 426 | 14. If you wish to incorporate parts of the Library into other free 427 | programs whose distribution conditions are incompatible with these, 428 | write to the author to ask for permission. For software which is 429 | copyrighted by the Free Software Foundation, write to the Free 430 | Software Foundation; we sometimes make exceptions for this. Our 431 | decision will be guided by the two goals of preserving the free status 432 | of all derivatives of our free software and of promoting the sharing 433 | and reuse of software generally. 434 | 435 | NO WARRANTY 436 | 437 | 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO 438 | WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 439 | EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR 440 | OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY 441 | KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE 442 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 443 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE 444 | LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME 445 | THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 
446 | 447 | 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN 448 | WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY 449 | AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU 450 | FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR 451 | CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE 452 | LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING 453 | RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A 454 | FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF 455 | SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH 456 | DAMAGES. 457 | 458 | END OF TERMS AND CONDITIONS 459 | 460 | How to Apply These Terms to Your New Libraries 461 | 462 | If you develop a new library, and you want it to be of the greatest 463 | possible use to the public, we recommend making it free software that 464 | everyone can redistribute and change. You can do so by permitting 465 | redistribution under these terms (or, alternatively, under the terms of the 466 | ordinary General Public License). 467 | 468 | To apply these terms, attach the following notices to the library. It is 469 | safest to attach them to the start of each source file to most effectively 470 | convey the exclusion of warranty; and each file should have at least the 471 | "copyright" line and a pointer to where the full notice is found. 472 | 473 | 474 | Copyright (C) 475 | 476 | This library is free software; you can redistribute it and/or 477 | modify it under the terms of the GNU Lesser General Public 478 | License as published by the Free Software Foundation; either 479 | version 2.1 of the License, or (at your option) any later version. 480 | 481 | This library is distributed in the hope that it will be useful, 482 | but WITHOUT ANY WARRANTY; without even the implied warranty of 483 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU 484 | Lesser General Public License for more details. 485 | 486 | You should have received a copy of the GNU Lesser General Public 487 | License along with this library; if not, write to the Free Software 488 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 489 | 490 | Also add information on how to contact you by electronic and paper mail. 491 | 492 | You should also get your employer (if you work as a programmer) or your 493 | school, if any, to sign a "copyright disclaimer" for the library, if 494 | necessary. Here is a sample; alter the names: 495 | 496 | Yoyodyne, Inc., hereby disclaims all copyright interest in the 497 | library `Frob' (a library for tweaking knobs) written by James Random Hacker. 498 | 499 | , 1 April 1990 500 | Ty Coon, President of Vice 501 | 502 | That's all there is to it! 503 | 504 | 505 | -------------------------------------------------------------------------------- /test/data/.gitignore: -------------------------------------------------------------------------------- 1 | tmp*.txt 2 | tmp*.yml 3 | -------------------------------------------------------------------------------- /test/data/docs.dat: -------------------------------------------------------------------------------- 1 | 170 219:2 256:2 389:1 257:1 292:2 143:1 181:14 107:1 1871:1 241:1 104:14 120:1 183:1 175:1 267:1 274:1 245:2 884:1 996:1 202:1 149:2 1264:1 182:1 275:1 2015:1 212:1 1292:1 367:1 142:1 221:2 1328:2 265:1 152:1 494:1 186:3 168:1 216:1 1184:1 284:2 276:1 151:1 164:1 290:1 249:1 150:1 141:1 125:2 1281:2 1953:1 196:1 112:3 281:1 314:1 934:2 286:2 134:1 148:6 225:1 114:2 211:1 147:1 300:1 303:1 266:1 201:1 191:1 124:8 244:1 209:1 170:2 517:1 105:1 248:1 7:1 155:2 675:1 246:1 571:1 224:1 285:1 289:1 119:3 230:4 262:1 2342:1 159:1 205:1 217:1 195:2 944:1 220:2 106:1 277:1 137:1 135:2 1458:1 118:9 2368:1 199:2 1798:1 242:1 239:2 127:3 264:3 2129:1 2076:1 103:1 154:2 238:1 102:5 108:1 197:1 268:1 184:2 19:2 255:1 430:1 173:1 309:1 
138:2 261:1 272:1 5:2 128:3 1145:7 192:1 129:15 145:1 236:1 121:8 193:4 1356:1 140:1 840:3 171:2 172:2 721:1 935:1 160:2 185:1 207:1 1086:1 126:1 1132:1 130:5 157:1 271:2 100:4 132:1 200:1 188:6 204:1 109:5 153:1 158:1 304:1 208:1 146:3 110:2 218:1 2079:1 868:1 210:1 557:3 227:1 282:2 247:2 165:1 213:1 215:1 2 | 150 219:1 389:1 257:4 181:2 104:7 120:1 427:1 416:1 274:1 1517:1 310:4 373:1 245:4 352:3 479:1 1661:1 400:1 2015:1 401:1 318:1 1887:1 367:1 381:1 413:1 221:1 993:1 186:5 1543:1 358:1 334:1 1184:1 276:1 421:2 340:1 290:1 422:3 368:1 125:1 112:2 314:1 333:1 286:2 2157:1 134:2 424:2 148:3 362:1 225:1 300:1 147:1 382:2 311:4 303:1 361:2 330:1 170:1 124:5 351:4 517:1 7:1 326:1 675:1 343:1 402:1 399:3 571:4 119:2 97:1 411:1 205:1 345:1 360:2 220:1 137:2 1058:1 323:1 372:1 1798:1 375:1 1349:1 2129:1 420:1 154:1 19:2 430:1 398:1 309:1 429:1 434:2 325:1 384:1 426:1 261:1 337:2 412:1 366:1 5:2 316:1 1145:1 357:1 129:24 1727:1 121:3 403:1 433:1 193:3 339:1 356:1 317:1 380:1 1721:2 741:1 484:1 880:1 338:1 324:3 185:1 1086:1 341:2 2051:1 130:5 387:1 423:1 347:2 1610:1 132:1 350:1 109:4 320:4 331:1 406:1 394:1 431:1 393:2 392:1 344:2 304:1 391:1 354:2 335:1 432:1 329:1 2219:1 557:2 385:1 397:1 414:1 379:1 313:1 1980:1 3 | 95 186:1 512:2 242:1 448:5 449:5 127:1 494:1 276:3 151:1 497:1 507:1 205:1 286:1 480:2 460:1 473:1 324:1 489:1 517:1 461:1 1158:1 471:1 2087:1 493:3 437:2 1546:2 271:2 146:1 479:2 1661:2 544:2 466:1 191:2 514:1 1034:1 519:1 129:9 481:2 94:2 499:1 485:1 508:1 442:5 571:3 463:2 320:3 498:2 329:1 1621:2 1481:2 1154:1 290:1 109:3 225:2 515:1 447:2 282:1 439:2 464:1 496:1 130:6 441:1 422:2 505:1 504:1 450:1 495:1 484:1 121:6 41:1 503:1 467:1 132:2 1798:1 500:1 217:3 257:2 1912:1 440:4 1472:1 478:1 476:1 104:12 1302:1 325:1 513:2 453:1 32:2 193:1 743:1 475:2 516:1 1557:2 1078:1 474:1 4 | 249 102:6 1981:1 689:1 718:2 1145:4 745:1 804:1 631:1 507:1 588:1 683:1 1244:1 148:2 34:1 325:2 736:1 585:2 677:1 580:1 685:1 741:1 337:2 551:1 590:1 682:1 574:6 596:1 1734:4 
594:1 598:1 181:2 603:3 1075:2 154:3 552:1 1557:2 744:1 217:1 236:1 621:1 700:2 249:3 1481:2 601:1 282:1 549:1 713:1 281:1 2071:1 537:1 112:1 728:1 264:1 670:2 602:1 274:2 684:1 1278:1 389:1 91:1 512:2 673:1 721:3 1914:1 668:2 124:1 610:1 527:1 121:11 515:1 629:2 699:1 559:1 698:1 560:1 397:1 719:1 324:1 666:1 525:4 1605:3 168:5 411:2 276:13 320:4 625:1 611:1 692:3 185:1 722:3 563:1 657:1 747:1 647:1 261:2 690:2 751:1 679:1 664:2 717:1 681:1 701:2 1546:4 529:1 406:1 526:4 450:1 442:1 88:1 696:2 5:17 662:1 503:1 129:22 1658:1 286:5 553:1 640:1 104:30 615:1 354:6 565:1 544:4 746:1 714:1 1034:1 589:1 1621:2 605:1 612:1 170:4 125:1 523:1 607:3 557:8 540:1 644:1 2001:1 743:2 578:1 191:1 675:1 740:1 697:1 146:2 41:4 522:1 592:1 2148:2 730:1 634:1 547:7 192:3 362:1 2067:1 151:3 731:2 1158:1 632:1 205:2 516:1 723:1 619:1 649:1 1661:1 277:3 137:2 733:1 1302:1 734:6 429:1 2368:4 360:2 604:1 1504:1 193:5 2048:1 329:3 597:1 659:1 109:8 630:1 725:1 708:1 749:1 514:1 593:1 691:1 724:1 568:1 618:3 686:1 958:1 186:9 614:3 581:1 738:1 2223:1 130:20 19:10 934:3 550:1 257:2 528:5 726:1 595:1 687:1 1094:1 616:1 571:8 498:2 655:1 513:2 586:1 660:2 517:3 225:2 573:1 132:2 608:4 1251:1 575:1 290:4 1154:1 32:3 709:1 914:1 530:1 620:1 652:2 555:5 548:1 729:2 188:1 637:1 300:5 583:4 7:2 245:1 171:1 479:4 1239:1 422:4 1592:1 2214:5 645:1 543:1 688:2 626:2 5 | 141 795:1 256:2 773:1 573:1 257:1 753:1 2001:1 292:1 181:7 1278:1 2231:1 104:5 183:1 802:1 1481:2 245:2 453:1 677:2 479:2 1546:3 781:1 1096:1 804:1 821:1 780:1 186:3 782:1 1347:1 168:1 284:1 151:1 276:2 760:1 767:1 290:2 1621:2 812:1 824:1 596:1 801:1 618:2 112:1 574:3 788:1 934:2 286:3 796:1 148:1 544:2 513:4 820:1 300:2 790:1 124:1 754:2 1866:1 777:1 517:2 776:1 7:1 799:1 816:1 571:4 762:1 285:1 752:1 810:2 1034:1 230:1 769:1 809:1 205:1 751:4 217:3 756:1 818:1 220:1 768:1 464:1 498:1 823:1 1158:1 199:1 743:1 1228:1 803:1 2129:1 239:1 154:1 102:3 1981:2 86:1 514:1 19:2 815:1 471:1 450:1 778:1 325:2 775:1 5:5 1145:2 1534:1 129:14 
2253:1 121:7 193:3 797:1 793:1 787:1 317:1 526:1 772:1 172:1 516:1 324:1 512:2 17:1 2214:2 783:1 1239:1 770:1 130:4 515:1 271:1 1575:1 1357:1 766:1 109:3 320:1 188:2 1085:1 817:1 1302:1 329:1 282:1 1094:1 786:1 1557:2 32:1 792:1 6 | 224 102:9 835:1 689:1 850:1 1145:5 890:1 804:5 507:1 1544:2 1244:1 1049:1 874:1 325:1 148:8 241:1 900:1 960:1 989:1 987:1 142:1 866:1 833:1 996:1 219:1 574:7 1734:1 2116:2 909:2 181:18 603:1 1347:2 1557:2 217:4 1895:1 957:1 955:1 845:1 951:1 1481:2 282:3 975:1 919:1 898:1 988:1 281:2 265:1 844:1 220:1 112:2 264:3 829:2 979:2 274:2 1278:1 917:1 980:2 950:1 982:1 952:1 175:2 512:2 920:1 1132:3 515:1 121:11 947:1 345:1 886:1 560:1 888:1 324:1 160:1 853:1 168:1 494:1 990:1 1830:1 276:3 256:3 932:5 519:1 963:1 199:1 970:2 1498:1 426:1 881:1 918:1 165:1 747:1 837:2 896:1 1703:1 880:1 832:1 840:2 701:1 882:2 1546:2 885:1 864:1 846:1 875:1 2229:1 413:1 1185:1 172:1 129:15 286:3 956:2 615:1 897:1 944:1 104:11 344:1 935:1 544:2 910:1 978:1 907:1 924:1 1034:1 843:1 2015:1 903:1 2191:1 901:1 938:1 170:1 1621:4 923:1 484:1 883:1 134:2 379:1 849:3 865:2 1523:1 927:1 743:2 191:3 948:1 697:1 1472:1 941:1 1354:1 933:1 127:2 151:3 210:1 1158:1 1000:1 516:1 831:6 993:1 367:1 1302:1 2368:1 137:1 1281:1 1505:1 1661:2 193:4 830:3 983:1 109:8 962:1 922:1 514:1 994:1 593:2 946:1 999:1 618:1 958:1 868:5 738:1 891:1 186:3 130:13 965:2 934:4 271:1 1014:1 257:1 1873:1 352:1 571:7 2089:2 498:1 513:2 939:1 828:1 517:2 225:3 916:1 132:2 964:1 902:1 926:1 230:3 290:12 969:1 412:1 914:1 530:1 876:2 884:3 859:1 555:2 729:1 221:1 984:1 1078:1 894:2 188:3 1820:1 227:3 245:2 239:1 422:1 1328:2 479:3 937:1 7 | 66 1038:1 818:1 81:2 743:1 453:2 1017:1 80:1 193:3 264:1 866:1 241:1 130:1 129:3 1125:1 1024:1 934:1 175:1 809:2 1033:1 225:1 154:1 1493:1 1007:1 2148:1 741:1 2219:1 1356:1 1027:1 1004:2 1015:1 121:3 325:1 274:1 109:1 104:6 191:1 1016:1 1031:1 955:1 406:1 1020:1 1010:1 430:1 574:3 290:1 1028:1 1029:1 220:1 903:1 188:1 261:1 517:3 1035:1 1026:1 1145:2 1034:1 1021:1 
275:1 102:1 362:1 1022:1 282:1 965:1 2334:1 1014:1 257:1 8 | 56 145:1 19:1 1672:1 1055:1 1050:1 193:2 1059:1 1042:1 130:4 170:1 129:6 1069:2 682:1 1043:2 1062:1 1051:1 1078:1 876:1 225:1 1053:1 1065:1 2219:1 1046:1 1961:1 1074:1 2352:1 1071:1 1068:1 1049:1 121:2 1351:8 1075:1 147:1 2214:1 109:2 1105:1 367:1 1056:1 168:1 1066:1 1077:1 329:1 422:1 1058:1 261:1 5:4 275:1 276:1 1072:1 102:1 958:1 286:2 125:1 1045:1 186:1 257:1 9 | 34 19:2 1083:2 1082:2 112:1 129:1 1043:1 738:1 594:2 1085:1 2150:1 1094:1 32:2 121:3 1351:4 217:3 2214:1 1727:1 104:3 1084:1 1086:3 1895:1 5:1 1328:1 102:1 2132:1 615:1 1091:1 1108:1 286:1 1088:1 722:1 475:1 1090:1 227:1 10 | 50 1118:1 453:1 1110:1 1117:1 271:1 130:1 170:1 129:1 882:2 1100:1 738:1 1111:1 154:1 1007:1 1109:1 956:1 728:1 2219:1 426:1 1101:1 1313:2 1099:1 121:1 1351:2 1105:3 104:4 1097:1 571:1 1096:1 1113:1 2008:1 1098:1 640:1 574:1 290:1 1106:1 165:1 220:1 1058:1 1034:1 1115:1 102:2 1119:1 1108:1 286:1 186:1 909:1 514:1 1103:1 2116:1 11 | 51 1144:1 19:2 1126:1 1122:1 1017:1 581:1 786:1 1142:1 193:1 1141:1 112:1 130:2 129:1 1125:1 142:1 1143:1 2364:1 738:1 1129:2 1130:1 157:1 1127:2 1136:2 809:1 225:1 1472:3 1138:1 119:2 1140:1 121:3 1351:4 209:1 104:2 1123:1 571:2 148:1 1131:1 1098:1 290:1 1133:1 132:1 1146:1 5:4 1145:2 199:1 729:3 1132:1 1147:1 580:1 257:1 1135:1 12 | 121 256:1 257:1 181:5 1154:1 611:2 910:2 104:7 274:1 1481:2 1078:2 453:2 1472:1 1185:2 1265:1 1171:1 1192:1 1169:1 1661:1 479:1 1546:2 1220:1 956:1 1182:1 1136:5 142:1 186:3 168:3 1153:2 1184:1 1210:1 151:1 555:2 249:1 422:2 1207:1 125:1 1621:4 876:2 1159:1 1181:1 574:4 1189:1 286:5 2101:1 513:2 148:3 1141:1 544:2 300:2 191:1 1254:1 1389:1 517:2 1172:1 1201:1 1165:1 571:3 2281:1 2132:2 924:3 217:1 1177:1 1285:1 205:1 277:1 498:1 1158:3 1195:1 743:1 1798:2 1208:1 1219:1 537:1 514:1 19:6 932:1 975:1 429:1 325:2 1164:2 5:6 1082:1 1145:1 1151:1 1083:1 701:1 129:12 734:2 193:4 41:4 121:4 565:2 2071:1 516:2 1197:4 324:1 512:2 962:1 130:8 515:1 1515:2 583:1 594:1 132:2 
109:2 320:1 188:2 344:1 1211:1 1180:1 354:1 1135:1 1302:1 1162:1 329:1 557:5 1218:1 282:1 1557:2 32:3 519:1 13 | 120 1258:1 257:1 1154:1 1241:1 603:1 1246:1 1113:1 104:4 1235:1 1481:2 1078:1 1472:1 1265:1 746:1 1264:1 1661:1 479:1 547:1 1546:2 1236:1 1257:1 1266:1 1249:1 1263:1 2087:2 956:1 1273:1 186:1 1256:1 626:1 168:2 216:2 1251:1 1269:1 151:1 276:3 249:4 1621:2 574:3 286:2 513:2 148:1 544:2 300:2 1262:2 559:1 1254:1 1767:1 495:1 105:1 517:1 571:2 499:1 762:1 810:3 522:1 119:2 540:1 217:1 1105:1 745:1 205:2 277:2 498:1 1158:1 199:2 1226:1 743:2 1228:1 1270:1 500:1 1255:1 514:1 2048:3 19:6 430:1 1245:1 450:2 2034:1 722:2 325:2 5:6 1082:1 1083:1 129:9 1274:1 193:6 41:2 121:5 2008:1 70:1 503:1 516:2 721:1 324:1 512:2 744:1 2214:3 1225:1 962:1 1239:1 130:6 515:1 980:1 583:1 204:1 1238:1 907:1 146:1 1252:1 1302:1 335:1 329:1 557:2 1869:1 282:1 1244:1 1557:2 32:1 519:1 14 | 95 186:1 583:1 512:2 276:2 151:1 507:1 286:1 1292:1 1284:1 804:1 517:1 1158:1 241:1 1283:1 1281:1 1590:1 188:1 1546:2 1649:1 1238:1 762:1 1285:1 271:1 1747:1 581:1 19:3 932:1 479:2 544:2 191:1 514:1 5:4 1291:1 603:2 129:8 1141:2 1423:2 868:1 159:1 1303:1 1280:1 700:1 2001:1 571:1 102:1 1255:1 320:2 2214:1 498:1 329:1 1621:2 1481:2 1154:2 109:1 515:1 282:1 464:1 130:3 809:1 1534:1 119:1 300:1 450:1 1349:1 903:1 1278:2 192:1 1239:2 121:6 1624:2 1279:1 766:1 41:1 1301:1 1300:2 217:3 1286:1 1305:1 557:1 389:1 104:10 962:1 1302:2 2310:1 325:1 1228:1 513:2 32:2 8:1 193:3 743:1 181:3 1296:1 516:1 1557:2 15 | 153 1332:3 1315:4 1317:5 256:3 389:2 1334:1 257:3 181:10 1296:1 910:1 1364:1 104:12 1481:2 175:3 373:1 1354:1 1265:2 1078:1 453:1 1320:1 1472:1 1325:1 479:2 1661:1 1546:2 1292:2 1376:1 1326:1 1378:1 804:2 221:1 1380:1 1328:2 1333:1 1347:4 168:1 939:1 688:1 729:1 151:1 276:3 1344:1 1016:1 290:9 1621:2 1281:1 1313:1 618:1 574:1 1384:1 934:4 286:2 134:2 796:1 148:4 544:2 513:2 225:2 1350:1 300:2 1327:1 170:1 754:2 517:1 1360:2 1051:2 1358:1 1385:5 816:1 571:3 1283:1 285:2 1351:1 522:2 15:1 119:1 345:2 
1105:1 220:1 1340:1 2368:1 498:1 1158:1 199:1 264:1 743:1 1349:2 882:1 2129:1 239:1 154:1 102:10 955:1 514:1 19:6 932:3 450:4 1316:1 325:1 1335:1 261:1 1311:1 412:1 5:8 1534:2 1359:1 866:1 1339:1 1145:7 129:11 121:11 193:11 1346:1 1356:1 507:1 1373:1 741:2 772:1 172:1 850:1 516:1 1331:1 324:1 512:2 1368:1 1068:2 2214:4 1086:1 962:5 130:4 515:1 271:1 1348:1 1357:1 188:1 109:4 344:1 391:1 1302:1 1318:1 588:1 329:1 1094:3 227:1 282:1 894:1 1362:1 1557:2 864:1 519:1 1308:5 560:1 1355:1 16 | 117 256:4 2147:1 181:10 978:1 1392:3 2158:1 104:6 175:1 1481:2 373:1 1265:1 746:1 245:5 182:1 400:2 479:1 1546:3 367:1 804:1 1347:1 168:1 186:2 729:1 151:2 276:2 1422:1 1410:1 249:3 1747:3 598:1 1621:2 530:1 574:3 1406:1 314:1 934:4 286:6 513:2 74:1 148:2 1141:1 544:2 300:1 1387:2 170:1 1411:1 1389:1 517:2 7:1 571:1 1402:2 762:2 119:1 810:4 1390:4 277:1 498:1 1404:1 1158:1 199:1 743:1 1228:2 127:4 154:1 500:1 525:1 102:1 514:1 19:6 430:1 932:2 450:1 722:1 1423:2 325:1 261:1 1388:4 5:5 1082:1 1401:1 1083:1 129:13 1396:1 193:4 926:1 41:1 121:9 1424:1 317:1 516:1 721:1 741:1 512:2 2214:1 962:1 130:7 515:1 980:1 583:1 132:1 1357:1 109:4 320:3 1417:2 8:1 1302:1 1416:1 282:1 329:1 868:2 1244:1 165:1 1557:2 32:1 786:1 1425:1 1355:1 17 | 104 186:1 394:1 1453:1 512:2 818:1 108:3 151:1 662:1 1840:1 1427:1 1438:1 1452:2 286:1 199:1 924:1 220:1 1434:1 324:1 517:2 1244:2 1158:2 160:2 1430:4 241:1 1283:1 865:1 188:1 60:1 1546:3 19:1 479:1 1661:1 544:2 191:1 514:1 5:1 519:1 910:2 129:9 1123:2 1451:1 1433:1 1436:2 2129:1 571:3 574:1 320:1 1441:1 498:1 1444:1 964:1 1621:4 142:1 1481:2 110:1 290:1 109:6 1389:1 515:1 896:1 282:1 464:1 130:3 422:1 148:1 58:1 274:1 119:1 375:1 1145:2 1437:1 121:5 1162:1 227:1 1447:1 1798:1 217:2 840:1 1445:2 184:1 1435:1 557:1 1472:1 145:1 1456:1 104:13 962:1 325:1 1302:1 7:1 594:1 513:2 453:1 1449:1 193:2 1458:1 1446:4 743:1 1159:1 975:1 181:2 516:1 1557:2 1078:1 18 | 89 692:1 1473:1 2347:1 168:6 2191:1 1488:1 512:2 2261:1 1592:1 276:1 151:1 1478:5 286:2 1469:1 1493:1 
1226:1 804:1 2267:1 517:1 1158:1 1495:1 188:1 1546:3 762:1 146:1 19:2 479:1 544:2 514:1 5:3 910:1 129:11 2015:1 11:1 1094:1 499:1 729:1 696:1 1491:1 1498:1 574:3 2008:1 1490:1 571:2 102:1 2214:1 498:1 1621:2 1481:3 125:1 1389:1 1482:1 515:1 956:1 282:1 1480:1 130:4 2371:1 148:1 119:2 300:1 450:2 1475:1 1476:1 767:1 121:1 1279:2 766:3 1470:4 744:1 467:1 1497:1 1468:1 368:1 1472:2 145:2 104:11 1302:1 1113:1 325:1 1228:1 1486:1 513:2 193:5 743:1 1467:1 516:1 1557:2 1484:1 19 | 163 257:2 1672:1 181:1 1241:1 1523:1 241:1 104:14 1541:1 1481:2 1246:1 1517:1 1078:1 1568:1 902:1 1472:1 677:1 1325:1 479:1 1661:1 1567:1 2021:1 1564:1 1546:3 1249:1 1923:1 1540:2 804:1 956:1 381:1 142:1 1505:1 413:1 780:1 186:1 782:1 1503:1 2229:1 1543:1 1333:1 1524:1 168:2 1184:1 939:1 688:2 729:4 151:1 276:9 249:1 1621:2 125:2 1516:1 80:1 618:2 530:1 1544:2 574:1 934:2 286:6 1506:1 544:2 513:2 1566:1 1547:1 1303:1 300:3 681:1 682:1 1527:1 191:1 754:1 946:1 1528:2 170:1 124:1 1532:1 1486:1 1480:1 1509:2 517:1 675:1 571:2 762:4 810:2 1501:1 119:1 11:1 809:1 205:2 751:1 1511:1 818:1 1571:1 1226:1 498:1 1520:1 1158:1 199:2 1798:1 264:2 743:1 1228:4 1549:1 2129:1 127:1 154:1 1533:1 1799:1 102:1 2130:1 514:2 19:3 1538:1 450:1 722:1 1423:1 325:1 337:1 5:8 1529:1 1534:3 1145:3 129:16 1569:1 145:1 121:9 193:3 1510:1 1502:2 1560:1 772:1 850:1 324:1 516:1 1559:1 512:2 17:1 185:1 2214:4 1539:1 130:9 515:1 271:1 1515:1 132:1 109:4 1554:1 1252:1 1504:1 158:1 1482:1 146:2 581:1 1302:1 335:1 329:1 1094:2 1305:1 1869:1 282:2 560:1 1557:3 1507:1 519:1 20 | 112 186:2 1583:2 1632:1 127:2 512:2 1587:4 1589:1 1547:1 1592:1 625:1 1575:1 276:3 151:1 1594:1 507:1 205:2 286:3 1540:4 199:1 170:1 220:1 1055:1 517:2 1595:1 1578:1 1158:1 934:2 626:1 245:1 1590:1 188:1 1546:4 230:1 1598:1 271:1 19:3 479:2 544:2 191:2 514:1 5:3 129:8 1141:1 1423:1 159:1 249:1 701:2 729:2 1581:1 1573:1 2001:1 571:2 1576:1 1544:2 102:4 1601:1 320:3 907:1 2214:3 498:1 1621:3 1481:2 1154:1 109:1 675:2 515:1 956:1 896:1 1584:1 282:1 130:6 933:1 
134:2 809:1 422:1 344:1 1543:1 450:1 555:1 1278:2 1145:3 1239:1 121:8 1279:1 766:1 760:1 1579:1 1300:2 217:5 1254:1 1599:1 1574:1 238:1 104:8 325:1 1302:1 902:1 7:1 513:2 453:1 172:1 1672:1 32:4 183:1 264:2 743:1 618:2 181:4 516:1 1557:2 1593:1 1085:1 21 | 170 1657:1 1315:2 795:1 256:2 389:1 257:2 1643:1 1672:1 1632:2 181:12 1392:4 241:1 104:14 1481:2 175:1 1645:1 1265:3 352:1 1078:1 453:2 1472:1 479:2 1661:2 1546:2 865:1 2033:1 1662:1 318:1 804:2 956:2 1654:1 186:1 168:2 1184:1 284:1 151:1 276:5 1653:1 1055:1 625:1 1619:1 767:2 290:5 1621:4 1470:2 596:1 1614:1 1624:1 574:5 934:1 286:4 134:2 148:1 544:2 513:2 1609:1 1608:1 1141:1 820:1 1497:1 592:1 300:2 664:2 1649:1 191:2 170:4 105:1 1509:1 517:1 1656:1 571:2 762:1 1638:1 285:1 810:1 15:1 119:1 230:3 809:2 159:1 217:1 1105:1 1622:1 220:1 1226:1 1642:1 2368:1 498:1 1605:6 1158:1 743:1 1228:1 1255:1 692:1 154:3 238:1 731:1 1646:1 102:3 1631:1 514:1 19:5 2034:1 173:1 450:1 1423:2 1658:1 325:1 337:2 5:8 738:1 1651:1 1623:1 709:2 1626:1 1145:3 129:15 236:1 121:7 1075:1 193:5 797:1 1640:1 1502:1 840:1 565:1 171:1 1625:1 1488:1 324:1 516:1 1617:1 512:2 728:1 1648:1 1068:1 2214:2 1641:1 1634:1 130:14 515:1 271:1 1610:2 132:1 766:1 109:4 1821:1 1670:1 1616:1 1085:1 344:1 1671:1 916:1 146:1 1644:1 1302:1 1666:1 335:1 2219:1 210:1 227:1 1579:1 282:1 1778:1 1244:2 1557:2 32:1 1612:1 519:1 1308:1 22 | 213 1684:1 1315:2 1317:1 1801:1 256:2 1788:1 1705:3 900:1 1760:1 181:11 1703:2 241:2 1802:1 1814:1 1677:1 1742:1 104:19 1710:1 1714:1 1723:1 274:1 1808:1 1517:2 1769:1 1686:1 1265:2 245:1 1839:1 1681:2 453:1 1718:1 1693:1 1793:1 479:3 1752:1 1787:1 1680:1 182:1 1292:1 1730:1 1818:1 1741:1 367:1 804:1 1840:2 1713:1 381:2 1708:1 221:2 1328:2 1687:1 1756:1 186:4 1822:1 1704:1 168:3 1701:1 1736:1 1184:1 1709:1 284:2 2037:1 1770:1 276:1 1780:1 1786:1 1734:1 290:3 1747:2 1720:1 589:1 1732:1 125:2 876:2 1744:1 1159:1 618:1 112:1 574:3 934:2 286:1 1696:2 148:4 1826:1 225:1 1722:1 1711:1 697:1 1820:1 1917:2 1782:1 191:2 124:8 559:1 1768:1 
1767:1 517:1 1834:2 965:1 1726:1 1051:1 571:4 2349:1 224:1 1774:1 1753:1 522:1 119:1 1755:1 159:1 205:1 1792:1 1737:1 217:2 1717:5 1724:1 1828:1 220:2 199:1 1798:3 127:1 264:3 40:1 1255:1 692:1 154:3 238:3 1799:1 102:3 2130:1 514:2 19:1 1706:1 2048:1 471:1 1800:1 450:1 1423:1 1688:1 1685:1 1832:2 337:1 1781:1 1678:1 192:1 1145:4 1817:1 701:1 129:24 1727:1 236:3 121:9 41:2 193:12 1775:1 1356:1 874:1 507:1 1766:1 840:1 1806:1 503:1 1790:1 1827:1 1721:2 172:1 1091:1 516:1 935:1 1700:1 47:2 160:1 475:2 1783:1 728:1 938:1 1815:1 615:2 962:4 2051:1 1804:1 1810:1 130:12 271:1 1761:1 1772:2 37:4 109:7 320:1 1715:1 1821:1 1825:1 1504:1 344:2 1065:1 1644:2 1679:1 1830:2 110:3 1749:1 329:4 1816:1 1819:1 1305:1 1776:1 1778:1 1813:1 614:1 1743:1 1794:1 1355:1 1702:1 1735:1 23 | 113 1878:1 1828:1 262:1 1875:1 124:5 276:1 1840:2 1842:2 286:4 199:1 1886:1 1854:2 1292:1 46:1 1883:1 220:1 1849:2 1893:1 1895:1 517:2 1244:1 1430:4 934:2 1874:1 241:1 245:1 188:2 1891:1 1885:1 1873:1 1880:1 1897:1 1871:1 412:1 129:16 1847:1 381:1 1075:1 1857:1 47:2 1896:1 1490:1 2008:1 1856:1 102:4 1859:2 320:1 171:1 329:1 142:1 110:2 880:1 125:1 1084:1 109:4 1888:1 1389:1 1130:1 1860:1 956:1 896:1 950:1 884:1 1328:2 130:5 933:1 148:5 1853:1 1016:1 261:1 1887:1 1869:1 335:1 119:2 1876:1 555:1 2130:1 1278:1 121:5 175:1 1358:2 741:1 1848:1 1798:1 1254:1 1844:2 257:1 866:1 104:16 962:2 325:2 2059:1 1834:2 1113:1 154:2 112:1 453:1 1987:2 193:5 598:2 1855:2 1894:2 1446:5 1866:1 1877:1 475:1 1159:1 975:3 474:1 181:1 1839:1 1917:2 1078:1 24 | 44 145:1 19:2 2279:1 1476:1 692:1 193:2 1579:1 2015:1 1478:2 130:2 129:10 1475:1 1226:1 934:1 450:1 300:2 1482:1 1472:1 1917:2 1469:1 7:1 47:2 1389:1 1292:1 1840:2 121:2 110:2 2214:2 1839:1 2130:1 104:4 368:1 571:1 574:2 168:2 1834:2 124:3 5:3 1328:2 102:1 1470:3 1473:2 186:1 257:1 25 | 155 257:1 181:3 1912:1 1278:2 603:1 1978:1 104:19 1113:1 175:3 1961:1 49:1 1265:1 1123:1 1839:1 453:1 1926:1 1979:1 479:1 1661:2 1956:1 1963:1 1984:1 2015:1 1968:1 1292:1 478:1 1923:1 
1840:2 956:1 1932:1 1328:2 1930:1 1931:1 186:3 909:1 1934:1 48:1 1949:2 1936:1 729:3 151:1 276:3 555:1 2067:1 290:2 1734:1 1995:1 249:1 1621:1 422:3 1935:1 1954:1 1989:1 1953:1 1516:1 1958:1 1998:1 934:1 1915:7 708:1 1946:1 225:2 485:1 1913:3 1987:1 1917:7 1942:1 1649:1 191:1 1982:1 124:6 1986:1 1834:5 1928:1 1910:2 517:1 1927:1 1904:1 571:3 1943:1 217:2 2169:1 137:1 1941:1 1642:1 2368:2 1903:3 803:1 2129:1 420:1 154:3 442:1 1967:1 1799:1 102:4 1966:1 2130:1 1972:1 1981:1 450:1 2280:1 337:1 272:1 1993:1 1529:1 1990:1 2116:1 866:3 1145:1 192:1 701:1 129:17 1727:1 145:1 236:1 121:7 433:1 734:1 193:3 2008:1 1996:1 874:1 507:1 1952:1 1940:5 516:2 1914:1 47:6 744:1 1086:1 1909:1 615:1 1905:1 962:3 130:6 132:1 204:1 109:9 320:3 1482:1 1939:1 1969:1 1907:2 110:2 329:2 2219:1 210:1 1101:1 1973:1 1244:1 1970:1 1980:1 1997:1 1933:1 1921:1 26 | 146 2023:1 256:3 257:2 1643:2 2001:1 181:3 1241:1 241:1 1632:3 104:18 175:1 1265:1 2055:1 1839:1 453:3 479:2 1661:2 2021:1 2015:2 1292:1 2033:1 318:1 1662:1 1840:2 1328:3 1654:3 2056:1 186:7 909:1 1184:1 2027:1 2031:1 729:1 2037:1 151:1 276:3 2025:1 767:1 290:3 1995:1 1747:1 1470:3 125:1 1207:1 1614:1 112:1 574:4 934:1 134:2 148:5 225:1 820:1 1917:2 300:2 664:2 2002:1 559:1 170:2 124:3 1834:2 965:1 2041:1 2026:1 1051:1 571:3 849:1 762:1 119:2 725:1 829:2 809:1 159:2 205:1 982:1 2059:1 2047:1 220:1 1390:1 2005:2 2017:2 1642:1 1605:10 199:1 1798:2 1228:1 239:1 1255:1 441:1 154:1 1646:1 102:2 2130:1 2048:1 19:5 430:1 932:2 450:1 2042:1 2034:1 1423:2 138:1 337:2 5:10 2019:1 2116:1 866:2 2063:1 696:1 129:18 2004:1 121:11 193:4 2008:1 1075:1 2032:1 171:1 1640:2 935:1 47:2 1488:1 475:1 744:1 2011:1 2214:1 2051:1 962:2 130:10 2049:1 1641:1 271:1 1515:3 1610:1 132:1 109:4 907:1 2018:1 110:2 218:3 868:1 1101:1 1776:1 2219:1 1244:1 165:1 2038:1 215:1 27 | 147 1315:1 256:3 2310:1 257:2 1130:2 900:1 181:14 104:6 373:1 2124:1 274:1 1265:1 1839:1 51:1 1661:2 2015:1 1292:2 2087:1 367:1 1840:2 2107:1 142:1 221:1 1328:2 186:1 1347:1 939:1 729:1 284:1 
2067:1 290:1 1410:1 249:1 1747:1 2102:1 2114:1 125:1 876:1 618:1 112:1 314:2 934:1 286:1 2122:1 2101:1 148:3 362:1 1917:2 201:1 2083:1 170:1 124:3 1866:1 1834:2 965:1 1254:1 1389:1 517:1 2094:1 2126:1 246:1 2065:6 571:1 224:1 2103:1 2127:1 2069:1 2095:1 829:1 230:2 982:1 217:5 2100:1 195:1 2047:1 220:1 2120:1 464:1 2064:2 2075:1 199:2 2089:1 1798:1 264:1 2129:1 2076:1 127:5 2080:1 238:1 2130:2 2085:1 1860:1 1020:1 430:1 2081:1 975:1 2104:1 429:1 2121:2 261:1 1311:1 2084:1 412:1 2116:1 1145:5 129:13 145:1 2123:1 121:9 2113:1 193:3 317:1 840:1 503:1 2071:1 772:1 484:1 935:1 880:1 47:2 2093:1 2088:1 130:4 271:1 1515:1 132:1 2125:1 188:2 1844:1 109:7 320:2 2082:1 2092:2 2066:1 344:1 2097:1 354:1 1749:1 110:2 218:2 2079:1 868:3 1592:1 557:3 2070:2 2111:1 2118:1 28 | 26 19:2 1255:1 17:1 193:2 130:2 626:1 1658:1 175:1 1094:1 225:1 2134:1 119:1 121:1 1351:2 2214:3 109:1 389:1 104:1 1244:1 5:1 1325:1 221:1 1912:1 2132:1 615:1 362:1 29 | 90 186:2 1396:1 127:4 276:2 151:1 1355:1 430:2 286:5 199:1 1387:2 170:1 804:1 292:1 2152:1 1244:1 934:4 1347:1 2143:1 980:1 245:5 762:1 1747:3 19:2 932:2 314:1 317:1 1425:1 5:1 367:1 129:3 1141:1 1423:2 868:2 1265:1 249:1 1404:1 2231:1 1388:2 1402:2 1406:1 722:1 1417:2 729:1 400:2 373:1 1094:1 810:2 1424:1 574:4 525:1 1416:1 102:3 2214:1 329:2 18:1 182:1 109:3 130:5 165:1 148:2 2368:1 261:2 2158:1 1351:4 2147:1 1392:3 1401:1 121:8 530:1 741:1 1357:1 132:2 1390:1 926:1 2142:1 557:2 217:1 1254:1 500:1 786:2 104:6 1228:1 256:4 8:1 193:4 721:1 978:2 1422:1 181:10 2144:1 30 | 35 2158:1 19:1 224:2 1281:1 2154:1 786:1 17:1 271:1 2015:1 130:1 142:1 2150:1 2148:2 121:1 1351:6 2214:1 2157:1 230:1 104:1 1624:1 1972:2 571:1 148:1 2231:1 1280:1 134:1 1265:1 138:1 5:1 276:1 2149:1 102:1 2152:1 2153:1 668:2 31 | 9 171:1 193:1 1269:1 1654:1 104:1 1207:1 2231:1 1270:1 102:1 32 | 71 186:1 168:1 2191:1 2165:1 276:2 286:3 2179:1 2185:1 2364:1 220:1 1146:1 1244:2 1347:2 437:1 25:1 245:1 230:1 2176:1 1625:2 129:5 916:1 2183:1 1014:1 1123:1 1356:1 1110:1 27:1 
571:1 1734:1 102:2 1775:2 329:1 275:1 2193:1 290:1 1621:1 109:1 142:1 2172:1 2173:1 2194:1 130:2 809:2 1869:2 344:1 555:1 1351:6 2169:1 2186:2 1476:2 2181:1 26:1 1358:1 1122:1 939:1 2171:1 132:2 1101:1 2187:1 104:7 1320:1 112:1 2116:1 2198:1 2178:1 1502:1 193:1 1159:1 354:1 2188:1 975:1 33 | 21 19:1 2129:1 786:1 751:2 2200:1 934:2 762:1 1228:1 121:1 1351:6 754:1 2214:1 104:2 778:1 677:2 245:2 5:2 276:2 181:2 292:2 186:1 34 | 15 19:2 934:1 1094:1 762:1 1228:1 1351:4 2214:1 1544:1 104:2 612:2 5:2 2205:1 1108:1 286:1 257:1 35 | 29 219:1 19:1 824:1 1347:1 1486:3 934:1 300:2 762:1 1228:1 2206:1 121:1 1351:4 109:2 1501:2 104:2 1123:1 1817:1 148:2 329:1 245:1 188:1 261:2 517:1 5:3 2207:1 1159:1 125:1 257:3 2210:1 36 | 24 810:2 19:2 193:1 2015:1 130:1 129:1 2213:1 934:1 956:1 1101:1 121:1 1351:2 2214:1 109:1 104:2 571:1 557:1 574:1 290:1 5:2 729:1 102:1 2362:1 2211:1 37 | 19 19:1 130:2 626:1 10:1 1254:1 762:2 1228:2 1592:1 32:1 1351:6 2214:1 1817:1 2231:1 5:1 1278:1 276:2 2152:1 722:1 437:1 38 | 18 19:1 193:1 225:1 1605:1 2220:1 982:1 2219:1 664:1 479:1 121:1 1351:6 2214:2 1610:1 442:1 2231:2 5:1 276:1 2152:1 39 | 16 19:2 2224:2 673:1 35:1 130:1 626:1 762:1 1228:1 2222:1 1351:6 2214:2 1817:1 799:1 5:1 276:2 2223:2 40 | 62 19:3 1142:1 2245:1 193:1 127:1 2233:1 804:1 130:2 1218:1 594:1 1129:1 2238:1 1136:2 300:1 2239:1 1515:1 2227:1 2229:1 1292:1 2235:2 284:2 555:1 121:1 1351:6 2246:1 274:1 464:1 980:1 104:4 571:1 1291:2 344:1 148:1 1987:1 574:1 290:3 168:1 2230:1 2231:2 329:1 1265:1 2242:1 188:1 2243:1 261:2 2232:1 5:2 1132:2 275:1 102:1 2237:1 965:1 286:1 701:1 1334:1 2152:1 181:2 186:1 2234:2 2248:1 1154:1 1151:1 41 | 36 19:3 429:1 1747:2 8:1 271:1 804:2 1141:1 626:1 192:2 738:1 175:1 1094:1 956:1 32:1 1292:1 1351:4 2214:1 464:1 104:1 191:2 1113:1 2008:1 290:1 188:1 5:1 1225:1 1278:2 276:1 932:1 102:1 2132:1 722:1 181:2 320:1 1154:2 2253:1 42 | 64 810:1 1495:2 19:1 1498:1 1281:1 2259:1 2266:1 193:5 264:1 1478:3 129:2 1480:2 1406:1 859:1 738:1 1486:2 767:1 175:1 300:1 
1172:1 809:2 10:1 1472:4 910:1 1311:1 956:1 1497:1 373:1 121:1 1351:6 1491:2 109:2 412:1 2261:1 104:10 745:1 571:1 2191:1 574:1 11:2 168:6 2281:1 329:1 1133:1 188:2 1467:1 261:2 5:4 1145:3 1328:1 276:1 102:1 1159:1 1853:1 282:2 286:2 1470:4 1473:1 2267:1 2280:1 437:1 257:6 1468:1 2265:1 43 | 59 810:2 19:9 2129:2 1255:1 277:1 193:1 130:3 1799:1 2319:1 15:1 767:1 300:3 809:2 1410:1 1608:1 956:1 982:1 2305:1 1101:1 119:1 1241:1 2301:1 2321:1 555:1 121:1 1351:24 2214:1 109:3 1544:1 412:1 2157:1 632:2 917:1 104:4 571:3 1244:1 2286:2 2287:1 430:1 574:2 165:1 424:1 5:9 1145:1 199:1 1328:1 184:1 932:1 102:4 2322:1 2368:1 286:2 249:1 2310:1 756:2 186:1 2211:2 2324:1 13:2 44 | 53 810:1 19:1 1402:1 193:4 1347:1 271:2 828:1 866:1 112:1 130:2 129:3 2329:4 2337:1 285:1 1486:2 300:2 809:1 2332:1 2345:1 2338:1 2340:1 2344:1 121:1 1351:4 109:1 1544:1 2327:1 230:1 205:1 104:2 2341:1 344:1 796:1 1621:1 148:1 2142:1 168:1 134:1 132:1 261:1 5:2 1145:1 276:2 2330:1 102:2 615:1 2333:1 2334:1 957:1 475:1 2331:1 2328:2 2342:1 45 | 45 810:2 874:1 1896:1 19:1 2356:1 1281:1 2266:1 2355:1 1347:1 690:1 264:1 1478:1 112:1 129:2 1480:3 1486:3 934:2 809:3 1472:1 910:1 2352:1 121:2 1351:6 1501:1 104:5 148:1 1488:1 1098:2 430:1 11:3 2349:1 168:1 1133:2 245:1 2348:1 261:1 5:1 1145:1 1328:2 102:1 1470:1 257:2 2116:1 2265:2 195:1 46 | 62 810:3 874:1 19:2 816:1 1122:1 146:1 2375:1 193:1 264:2 964:1 112:1 1358:1 130:3 129:5 2329:1 626:1 1480:1 1169:1 1520:1 2364:1 1874:1 2238:1 1486:2 934:4 809:5 881:1 910:1 593:1 2206:1 2371:1 1351:2 274:1 109:3 104:4 191:2 2360:1 2365:1 148:1 1488:1 2191:2 574:1 11:1 400:1 182:2 1133:1 159:1 245:2 261:1 5:3 1145:3 1328:1 1912:2 2370:1 276:1 379:1 2368:1 2367:1 2362:2 186:3 1626:1 257:1 2067:2 -------------------------------------------------------------------------------- /test/data/sample.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | require 'rubygems' 4 | require 'lda-ruby' 5 | 6 | # Load the Corpus. 
The AP data from David Blei's website is in the "DataCorpus" format 7 | corpus = Lda::DataCorpus.new("ap/ap.dat") 8 | 9 | # Initialize the Lda instance with the corpus 10 | lda = Lda::Lda.new(corpus) 11 | 12 | # Run the EM algorithm using random starting points. Fixed starting points will use the first n documents 13 | # to initialize the topics, where n is the number of topics. 14 | lda.em("random") # run EM algorithm using random starting points 15 | 16 | # Load the vocabulary file necessary with DataCorpus objects 17 | lda.load_vocabulary("ap/vocab.txt") 18 | 19 | # Print the top 20 words per topic 20 | lda.print_topics(20) 21 | -------------------------------------------------------------------------------- /test/data/wiki-test-docs.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - |- 3 | The Olympic Games are an international event of summer and winter sports, in which thousands of athletes compete in a wide variety of events. The Games are currently held every two years, with Summer and Winter Olympic Games alternating. Originally, the ancient Olympic Games were held in Olympia, Greece, from the 8th century BC to the 5th century AD. In the late 19th century, Baron Pierre de Coubertin was inspired by Olympic festivals to revive the Games. For this purpose, he founded the International Olympic Committee (IOC) in 1894, and two years later, the modern Olympic Games were established in Athens. The IOC has since become the governing body of the Olympic Movement, whose structure and actions are defined by the Olympic Charter. 4 | The evolution of the Olympic Movement during the 20th century forced the IOC to adapt the Games to the world's changing social circumstances. Some of these adjustments included the creation of the Winter Games for ice and snow sports, the Paralympic Games for athletes with physical disabilities, and the Youth Olympic Games for teenage athletes. 
The IOC also had to accommodate the Games to the varying economic, political, and technological realities of the 20th century.
The first major, modern, multi-sport event of international significance was the modern Olympic Games. 9 | Many regional multi-sport events have since been founded, modeled after the Olympics. Most have the same basic structure. Games are held over the course of several days in and around a "host city," which changes for each competition. Countries send national teams to each competition, consisting of individual athletes and teams that compete in a wide variety of sports. Athletes or teams are awarded gold, silver, or bronze medals for first, second, and third place respectively. The games are generally held every four years, though some are annual competitions. 10 | - The West Germanic languages constitute the largest of the three traditional branches of the Germanic family of languages and include languages such as English, Dutch and Afrikaans, German, the Frisian languages, and Yiddish. The other two of these three traditional branches of the Germanic languages are the North and East Germanic languages. 11 | - |- 12 | English is a West Germanic language that originated in Anglo-Saxon England. As a result of the military, economic, scientific, political, and cultural influence of the British Empire during the 18th, 19th, and early 20th centuries and of the United States since the mid 20th century,[7][8][9][10] it has become the lingua franca in many parts of the world.[11][12] It is used extensively as a second language and as an official language in Commonwealth countries and many international organizations. 13 | Historically, English originated from several dialects, now collectively termed Old English, which were brought to Great Britain by Anglo-Saxon settlers beginning in the 5th century. The language was heavily influenced by the Old Norse language of Viking invaders. After the Norman conquest, Old English developed into Middle English, borrowing heavily from the Norman (Anglo-French) vocabulary and spelling conventions. 
Modern English developed from there notably with the Great Vowel Shift that began in 15th-century England, and continues to adopt foreign words from a variety of languages, as well as coining new words. A significant number of English words, especially technical words, have been constructed based on roots from Latin and ancient Greek. 14 | - |- 15 | Contemporary Christian music (or CCM; also by its religious neutral term "inspirational music") is a genre of popular music which is lyrically focused on matters concerned with the Christian faith. The term is typically used to refer to the Nashville, Tennessee-based pop, rock, and worship Christian music industry, currently represented by artists such as Avalon, BarlowGirl, Jeremy Camp, Casting Crowns, Steven Curtis Chapman, David Crowder Band, Amy Grant, Natalie Grant, Jars of Clay, MercyMe, Newsboys, Chris Tomlin, Hillsong, Michael W. Smith, Rebecca St. James, Third Day, tobyMac, and a host of others. The industry is represented in Billboard Magazine's "Top Christian Albums" and "Hot Christian Songs" charts,[1] and by Radio & Records magazine's Christian AC (Adult Contemporary), Christian CHR (Contemporary Hit Radio), Christian Rock, and Inspirational (INSPO) airplay charts,[2] as well as the iTunes Store's "Christian & Gospel" genre. 16 | Not all popular music which lyrically identifies with Christianity is normally considered Contemporary Christian Music.[3] For example, many punk, hardcore, and holy hip-hop groups deal explicitly with issues of faith but are not a part of the Nashville industry. Also, several mainstream artists such as Bob Dylan, The Byrds, Lifehouse, and U2 have dealt with Christian themes in their work but are not considered CCM artists.[3] 17 | - Third Day is a CCM and Christian rock band formed in Marietta, Georgia during the 1990s. The band was founded by lead singer Mac Powell, guitarist Mark Lee and former member Billy Wilkins[1]. 
The other band members are bassist Tai Anderson and drummer David Carr. The band's name is a reference to the biblical account of Jesus rising from the dead on the third day following his Crucifixion. 18 | - "Alien vs. Predator, also known as AVP, is a 2004 American science fiction film, directed by Paul W.S. Anderson for 20th Century Fox. The film adapts the Alien vs. Predator crossover imprint bringing together the titular creatures of the Alien and Predator series, a concept which originated in a 1989 comic book. Anderson, Dan O'Bannon, and Ronald Shusett wrote the story, and Anderson and Shane Salerno adapted the story into a screenplay. Their writing was influenced by Aztec mythology, the comic book series, and the writings of Erich von D\xC3\xA4niken.\n\ 19 | Set in 2004, the film follows a group of paleontologists, archaeologists, and others assembled by billionaire Charles Bishop Weyland (Lance Henriksen) for an expedition near the Antarctic to investigate a mysterious heat signal. Weyland hopes to claim the find for himself, and his group discovers a pyramid below the surface of a whaling station. Hieroglyphs and sculptures reveal that the pyramid is a hunting ground for Predators who kill Aliens as a rite of passage. The humans are caught in the middle of a battle between the two species and attempt to prevent the Aliens from reaching the surface.\n\ 20 | The film was released on August 13, 2004, in North America and received mostly negative reviews from film critics. Some praised the special effects and set designs, while others dismissed the film for its \"wooden dialogue\" and \"cardboard characters\". Nevertheless, Alien vs. Predator became the most commercially successful film in the franchises, grossing $172 million in its theatrical run. The film's success led to a sequel in 2007 titled Aliens vs. Predator: Requiem." 21 | - |- 22 | Predator was John McTiernan's second studio film as director. 
The studio hired screenplay writer Shane Black to not only play a supporting role in the film, but to keep an eye on McTiernan due to the director's inexperience. Jean-Claude Van Damme was originally cast as the film's creature,[1] the idea being that the physical action star would use his martial arts skills to make the creature an agile, ninja-esque hunter. When compared to Arnold Schwarzenegger, Carl Weathers and Jesse Ventura, actors known for their bodybuilding regimes, it became apparent a more physically-imposing man was needed to make the creature appear threatening. Ventura's autobiography also alleges Van Damme intentionally injured a stunt man.[citation needed] Eventually, Van Damme was removed from the film and replaced by the actor and mime artist Kevin Peter Hall. 23 | The predator creature's design is credited to special effects artist Stan Winston. While flying to Japan with Aliens director James Cameron, Winston, who had been hired to design the Predator, was doing concept art on the flight. Cameron saw what he was drawing and said, "I always wanted to see something with mandibles". Winston then included them in his designs. Schwarzenegger recommended Winston after his experience working on The Terminator. 24 | - |- 25 | Norfolk & Western is an indie rock and folk/rock band from Portland, Oregon. An essential part of their stage set-up and sound is a turn-of-the-century Victrola Grammaphone. Norfolk and Western began as the recording project of Adam with friends including M.Ward playing various instruments, and evolved over time to become the fully orchestrated band it is today. In the early days Norfolk and Western's sound was whispery, intimate, elegant folk music laced with creaky old instruments and atmospheric sound collages. Their live shows often features band members switching instruments, sometimes even mid-song, as well as film accompaniment. 
26 | Norfolk and Western toured in support of the release A Gilded Age in Spring and Summer of 2006. 27 | - "Charles Francis Hansom (27 July 1817 \xE2\x80\x93 30 November 1888)[1] was a prominent Roman Catholic Victorian architect who primarily designed in the Gothic Revival style.\n\ 28 | He was born of a Roman Catholic family in York. He was the brother of Joseph Aloysius Hansom, architect and creator of the Hansom cab, and father of the architect Edward Joseph Hansom. He practised in partnership with his brother, Joseph, in London from 1854.\n\ 29 | This partnership was dissolved in 1859 when Charles established an independent practice in Bath with his son Edward (born 22 October 1842) as an articled clerk. He took his son into partnership in 1867, by which time the practice had moved to Bristol, with a large West Country practice of church and collegiate architecture. In Bristol he took on Benjamin Bucknall as an assistant.\n\ 30 | He was commonly known as Francis the Hansom, as he was rather handsome." 31 | - Clifton College is an independent school in Clifton, Bristol, England, founded in 1862. In its early years it was notable (compared with most Public Schools of the time) for emphasising science in the curriculum, and for being less concerned with social elitism, e.g. by admitting day-boys on equal terms and providing a dedicated boarding house for Jewish boys.[1][2][3] Having linked its General Studies classes with Badminton School since 1972, it admitted girls to the Sixth Form in 1987 and is now fully coeducational. The dedicated Jewish boarding house closed in 2005. 32 | - "Badminton School is an independent, boarding and day school for girls aged 4 to 18 years situated in Westbury-on-Trym, Bristol, England. 
The school consistently performs well in the government's league tables, particularly at A level.[1] In 2008 the school was ranked 3rd in the Financial Times top 1000 schools[2]\n\ 33 | According to the Good Schools Guide, \"The secret of the school's success is in its size and a good deal of individual attention.\"[3]\n\ 34 | Ms Miriam Badock established a school for girls in 1858 at Badminton House in Clifton. By 1898 it was known as Miss Bartlett's School for Young Ladies[4]\n\ 35 | Unusually for the time the school developed a broad curriculum and extra curricular activities, including sport, were encouraged. The School grew steadily in size and in 1924 moved to the present site, under the Headship of Miss Beatrice May Baker (1876\xE2\x80\x931973)." 36 | - |- 37 | Westbury-on-Trym is a suburb and council ward in the north of the City of Bristol, near the suburbs of Stoke Bishop, Westbury Park, Henleaze, Southmead and Henbury, in the southwest of England. Westbury-on-Trym has a village atmosphere. The place is partly named after the River Trym that flows through it. 38 | The origins of Westbury on Trym predate those of Bristol itself. At the end of the 8th century, King Offa of Mercia granted land at Westbury to his minister, Aethelmund. Later there was a monastery at Westbury, probably initially a secular one, with married clergy. This changed towards the end of the 10th century when Oswald of Worcester, in whose diocese the monastery lay, sent a party of 12 monks to follow more stringent rules at the Westbury monastery[2]. 39 | The architect Ednoth constructed a new church and other buildings. The monastery became a college with a dean and canons at the end of the 13th century. It was rebuilt in the mid-15th century to resemble a miniature castle with turrets and a gatehouse. The Royalist Prince Rupert of the Rhine used it as his quarters during the English Civil War. 
When he left, in 1643, he ordered it to be set on fire so that the Parliamentarians could not make use of it. It was restored in the 20th century and the grounds were adapted for housing elderly people. 40 | The current Church of the Holy Trinity dates from 1194 (although there has been a place of worship on the site since 717), with an early 13th century nave and aisles, and 15th century chancel, chapels and tower. It is a grade I listed building.[3] 41 | - |- 42 | Fort Lee is a census-designated place (CDP) in Prince George County, Virginia, United States. The population was 7,269 at the 2000 census. 43 | Fort Lee is a United States Army post and headquarters of the U.S. Army Combined Arms Support Command (CASCOM), U.S. Army Quartermaster Center and School (QMCS), the Army Logistics University (ALU) and the U.S. Defense Commissary Agency (DeCA). A U.S. Army Forces Command (FORSCOM) unit, the 49th Quartermaster Group (Petroleum and Water), is stationed here. Fort Lee also hosts two Army museums, the U.S. Army Quartermaster Museum and the U.S. Army Women's Museum. The fort is named for Confederate General Robert E. Lee. 44 | - |- 45 | The Defense Commissary Agency (DeCA) is an agency of the United States Department of Defense that manages 284 grocery stores on U.S. military installations worldwide. The current (2009) director of Defense Commissary Agency is Philip E. Sakowitz Jr. [1]. 46 | These stores, called commissaries, function much the same as a typical civilian supermarket in the United States. Goods are sold at cost, plus a five percent surcharge to the total to pay for building new commissaries, maintenance, and operations equipment. DeCA states that a family of four can save over 30% or nearly $3,000 a year on their food purchases by shopping at the commissary.
DeCA derived this statistic from cost-of-food figures from the USDA Center for Nutrition Policy and Promotion and figures from DeCA's price comparison study, which compares commissary prices with those of local supermarkets, major grocery store chains, and supercenters. Commissaries offer items typically stocked at a civilian supermarket; non-grocery items such as clothing and televisions are instead sold on military installations at a store called an Exchange not under DeCA control. In 2007, DeCA had annual sales of over $5.54 billion[2]. 47 | DeCA was activated in 1991 to consolidate the commissary functions previously performed by the Army, Air Force, Navy, and Marine Corps. The origins of DeCA are traced to the Second World War, when a rise in black market activity prompted the United States War Department to consider a central office through which goods and services could be provided to deployed servicemembers who would otherwise seek such goods through illegal means[citation needed]. The commissary system in the United States dates back to 1867 when Congress first authorized the Army to sell food items at cost[3]. 48 | The Defense Commissary Agency is a civilian agency, but employs some military personnel for liaison functions, mostly from the Quartermaster and Supply Corps branches. DeCA also issues a limited series of awards and decorations[4] including: 49 | DeCA Distinguished Civilian Service Award ribbon 50 | DeCA Distinguished Civilian Service Award 51 | DeCA Meritorious Civilian Service Award 52 | DeCA Superior Civilian Service Award 53 | DeCA Civilian Career Service Award 54 | DeCA Certificate of Appreciation 55 | DeCA Certificate of Appreciation in Equal Opportunity 56 | DeCA Disabled Employee of the Year 57 | DeCA Director's Award for Volunteer Service 58 | DeCA Civilian of the Year Award 59 | Michael W. 
Blackwell Leadership Award (civilian and military eligible) 60 | DeCA employees may also be awarded the Department of Defense Distinguished Civilian Service Award. 61 | - |- 62 | The United States Department of War, also called the War Office, was the cabinet department originally responsible for the operation and maintenance of the US Army. It was also responsible for naval affairs until the establishment of the Navy Department in 1798, and for land-based air forces until the creation of the Department of the Air Force in 1947. 63 | The War Department existed from 1789 until September 18, 1947, when it was renamed as the Department of the Army, and became part of the new, joint National Military Establishment (NME). Shortly after, in 1949, the NME was renamed to the Department of Defense, which the Dept of the Army is part of today. 64 | - |- 65 | The United States Air Force (USAF) is the aerial warfare branch of the U.S. armed forces and one of the American uniformed services. Initially part of the United States Army, the USAF was formed as a separate branch of the military on 18 September 1947.[3] It is the most recent branch of the U.S. military to be formed. 66 | The USAF is the largest and most technologically advanced Air Force in the world, with 5,573 manned aircraft in service (3,990 USAF; 1,213 Air National Guard; and 370 Air Force Reserve);[4] approximately 180 unmanned combat air vehicles, 2,130 air-launched cruise missiles,[5] and 450 intercontinental ballistic missiles. The USAF has 327,452 personnel on active duty, 115,299 in the Selected and Individual Ready Reserves, and 106,700 in the Air National Guard. In addition, the Air Force employs 171,313 civilian personnel including indirect hire of foreign nationals.[6] 67 | The Department of the Air Force is headed by the civilian Secretary of the Air Force who oversees all administrative and policy affairs. 
The Department of the Air Force is a division of the Department of Defense, headed by the Secretary of Defense. The highest ranking military officer in the Department of the Air Force is the Chief of Staff of the Air Force. 68 | - |- 69 | The current United States Department of Defense system for naming and designating aircraft aims to provide a unified system across all services that applies to all military aerial and space craft. There are two basic components to a craft's identity: its designation, and its common name. 70 | A vehicle designation is sometimes referred to as a Mission Design Series (MDS), referring to the three main parts of the designation, that combine to form a unique profile for each vehicle. The first series of letters (up to four) determine the type of craft and designed mission. A series number identifies major types which are of the same type and mission, and finally a series of variant and block identifiers clarify the exact configuration of the vehicle. 71 | The name is a matter of less specific construction, but is aimed at providing an official common name which eases identification and communication regarding the vehicle. The common name is not used in internal publications (an official internal report would refer to the "F-16" and "AIM-9" but not mention the names "Fighting Falcon" or "Sidewinder"). Pilots often have their own nicknames for their aircraft which may bear only coincidental resemblance (if that) to the official common name, although some pilot nicknames are similar or even derived from the official common name (such as "Bug" and "Super Bug" for the F/A-18 Hornet and F/A-18E/F Super Hornet). 72 | The current regulations and procedures relating to employing this system are laid out in DoD and branch documents, including Air Force Joint Instruction 16-401 [1], and are not classified. 
These regulations replaced the previous regulations which were originally introduced in 1962 (See 1962 United States Tri-Service aircraft designation system). 73 | - |- 74 | Pink Floyd's 'North American Tour' was a concert tour by the British progressive rock band Pink Floyd. Often referred to as the Wish You Were Here Tour, the tour was launched before the release of their album Wish you were here in September of that year. The tour was divided in two legs in the United States, West Coast and East Coast, and a gig in the UK at the Knebworth Festival. 75 | On this tour debuted the song Have a Cigar and the Shine on You Crazy Diamond suite was divided in two parts with Have a Cigar between. 76 | The last gig of the tour was as the headliner of 1975 Knebworth Festival, which also featured Steve Miller Band, Captain Beefheart and Roy Harper (who joined Pink Floyd on the stage to sing "Have a Cigar"). Knebworth was the last time the band would perform "Echoes" and the entire Dark Side of the Moon with Roger Waters. 77 | - "\"Shine On You Crazy Diamond\" is a nine-part Pink Floyd composition with lyrics written by Roger Waters in tribute to former band member Syd Barrett and music written by Waters, Richard Wright, and David Gilmour. It was first performed on their 1974 French tour. It was recorded for the 1975 concept album Wish You Were Here. The song was intended to be a side-long composition like \"Atom Heart Mother\" and \"Echoes\", but was ultimately split into two parts and used to bookend the album." 78 | - |- 79 | 21-87 is a notable Canadian abstract film created in 1963 by Arthur Lipsett that lasts nine minutes and 33 seconds. 
80 | The short film, produced by the National Film Board of Canada, is a collage of snippets from discarded footage found by Lipsett in the editing room of the National Film Board (where he was employed as an animator), combined with his own black and white 16mm footage which he shot on the streets of Montreal and New York City, among other locations. 81 | 21-87 has had a profound influence on director George Lucas and sound designer/editor Walter Murch. Lucas's aesthetic and style was strongly influenced by it for the Star Wars films and a number of other works, including American Graffiti and his pure cinema visual tone poems "6-18-67", "1:42.08", "Look At Life", his short film "THX 1138:4EB" and the feature it inspired, THX 1138. Lucas never met Arthur Lipsett, who committed suicide in 1986, but tributes to 21-87 appear throughout Star Wars to the extent that the phrase, "The Force", itself is said to have been inspired by the short film. [1][2] 82 | - "Arthur Lipsett (May 13, 1936 \xE2\x80\x93 May 1, 1986) was a Canadian avant-garde director of short experimental films.\n\ 83 | In the 1960s he was employed as an animator by the National Film Board of Canada. Lipsett's particular passion was sound. He would collect pieces of sound and fit them together to create an interesting auditory sensation. After playing one of these creations to friends, they suggested that Lipsett put images to it. He did what his friends suggested, and the result became the 7 minute long film Very Nice, Very Nice which was nominated for the Academy Award for Best Short Subject, Live Action Subjects in 1962. Despite not winning the Oscar, this film brought Lipsett considerable praise from critics and directors. Stanley Kubrick was one of Lipsett's fans, and asked him to create a trailer for his upcoming movie Dr. Strangelove. Lipsett declined Kubrick's offer. 
Kubrick went on to direct the trailer himself; however, Lipsett's influence on Kubrick is clearly visible when watching the trailer.\n\ 84 | Lipsett's film 21-87 was a profound influence on director George Lucas who included elements from 21-87 in THX 1138, his Star Wars films and also American Graffiti. The film 21-87 has been credited by Lucas as the source of the \"The Force\" in Star Wars.[1] Lucas never met the filmmaker but tributes to 21-87 appear throughout Star Wars. For example, the holding cell of Princess Leia in Star Wars Episode IV: A New Hope on the Death Star is cell No. 2187.\n\ 85 | Lipsett's success allowed him some freedom, but as his films became more bizarre, this freedom quickly disappeared. He suffered from psychological problems. Later in his life he is said to have done strange things like taking a taxi from Toronto to Montreal (costing several hundred dollars). Lipsett committed suicide in 1986, two weeks before his 50th birthday.\n\ 86 | In 2006, a feature-length documentary about Lipsett, Remembering Arthur, was produced by Public Pictures. The film was directed by Lipsett's close friend Martin Lavut." 87 | - |- 88 | THX 1138 is a 1971 science fiction film directed by George Lucas, from a screenplay by Lucas and Walter Murch. It depicts a dystopian future in which a high level of control is exerted upon the populace through omnipresent, faceless, android police officers and mandatory, regulated use of special drugs to suppress emotion, including sexual desire. 89 | It was the first feature-length film directed by Lucas, and a more developed, feature-length version of his student film Electronic Labyrinth: THX 1138 4EB, which he made in 1967 while attending the University of Southern California, based on a one and a quarter page treatment of an idea by Matthew Robbins. The film was produced in a joint venture between Warner Brothers and Francis Ford Coppola's then-new production company, American Zoetrope. 
A novelization by Ben Bova was published in 1971. 90 | - |- 91 | Bova was a technical writer for Project Vanguard and later for Avco Everett in the 1960s when they did research in lasers and fluid dynamics. It was there that he met Arthur R. Kantrowitz later of the Foresight Institute. 92 | In 1971 he became editor of Analog Science Fiction after John W. Campbell's death. After leaving Analog, he went on to edit Omni during 1978-1982. 93 | In 1974 he wrote the screenplay for an episode of the children's science fiction television series Land of the Lost entitled "The Search". 94 | Bova was the science advisor for the failed television series The Starlost, leaving in disgust after the airing of the first episode. His novel The Starcrossed was loosely based on his experiences and featured a thinly veiled characterization of his friend and colleague Harlan Ellison. He dedicated the novel to "Cordwainer Bird", the pen name Harlan Ellison uses when he does not want to be associated with a television or film project. 95 | Bova is the President Emeritus of the National Space Society and a past President of Science-fiction and Fantasy Writers of America (SFWA). 96 | Bova went back to school in the 1980s, earning an M.A. in communications in 1987 and a Ed.D. in 1996. 97 | Bova has drawn on these meetings and experiences to create fact and fiction writings rich with references to spaceflight, lasers, artificial hearts, nanotechnology, environmentalism, fencing and martial arts, photography and artists. 98 | Bova is the author of over one hundred fifteen books, non-fiction as well as science fiction. In 2000, he was the Author Guest of Honor at the 58th World Science Fiction Convention (Chicon 2000). 99 | Recently, Hollywood has taken an interest in Bova's works for his wealth of knowledge about science and what the future may look like. 
In 2007, he was hired as a consultant by both Stuber/Parent Productions to provide insight into what the world is to look like in the near future for their upcoming film "Repossession Mambo" starring Jude Law and Forest Whitaker and by Silver Pictures in which he provided consulting services on the feature adaptation of Richard Morgan's "Altered Carbon". 100 | - Harlan Jay Ellison (born May 27, 1934) is an American writer. He has written in many genres, principally, but not exclusively, that of science fiction. He has also written short stories, novellas, screenplays, teleplays, essays, and a wide range of criticism covering not only literature, but film, television, and print media. His reputation as an editor was cemented with his two ground-breaking science fiction anthologies, Dangerous Visions and Again, Dangerous Visions. 101 | - "Aigars Kalv\xC4\xABtis (born June 27, 1966) is a Latvian politician and the former Prime Minister of Latvia.\n\ 102 | Kalv\xC4\xABtis graduated from Latvian University of Agriculture in 1992 with a degree in economics. From 1992 to 1998, he was a manager at various agriculture-related businesses. Kalv\xC4\xABtis was one of the founders of People's Party of Latvia in 1997 and was first elected to Saeima, the Latvian parliament, in 1998. He served as the minister of agriculture from 1999 to 2000 and the minister of economics from 2000 to 2002. Kalv\xC4\xABtis was reelected to Saeima and became the leader of the parliamentary faction of People's Party in 2002.\n\ 103 | On December 2, 2004, he became the Prime Minister of Latvia. Kalv\xC4\xABtis at first led a coalition government consisting of his own People's Party, the New Era Party, the Union of Greens and Farmers and the Latvia's First Party. 
In April 2006, the New Era Party left the government and Kalv\xC4\xABtis led a minority coalition government consisting of the other three parties.\n\ 104 | His governing coalition retained power in the October 7, 2006 parliamentary election, winning a slight majority of seats and becoming the first government since Latvian independence in 1991 to be re-elected.[1] It now consists of the People's Party, Union of Greens and Farmers, the Latvia First/Latvian Way Party, and the Fatherland and Freedom Party. The Fatherland and Freedom Party was added after the 2006 elections, and strengthens the coalition's majority to 59 of the 100 seats. Meanwhile, the People\xE2\x80\x99s Party became the largest party in Parliament. Kalv\xC4\xABtis became its chairman.\n\ 105 | On November 7, 2007, Kalv\xC4\xABtis announced that he would step down on December 5, after encountering widespread opposition to his dismissal of the head of the anti-corruption bureau, Aleksejs Loskutovs, in the previous month.[2] He accordingly met with President Valdis Zatlers on December 5 and announced his resignation, along with that of his government.[3][4] According to Kalv\xC4\xABtis, speaking on television on the same day, this was necessary to \"cool down hot heads\".[4] Kalv\xC4\xABtis remained in office in a caretaker capacity until the appointment of his successor Ivars Godmanis.[3]" 106 | - "The Prime Minister of Latvia is the most powerful member of the Government of the Republic of Latvia, and presides over the Latvian Cabinet of Ministers. The Prime Minister is nominated by the President of Latvia, but must be able to obtain the support of a majority of Saeima (parliament).\n\ 107 | The tables below display all Latvian Prime Ministers for both the first period when Latvia was independent (1918\xE2\x80\x931940) and since the country regained its independence (1990\xE2\x80\x93present). 
From 1990 to 6 July 1993, the office was known as Chairman of the Council of Ministers, but is generally considered to have been the same role." 108 | - "Uros are a pre-Incan people that live on forty-two self-fashioned floating man-made islets in Lake Titicaca Puno, Peru and Bolivia. They form three main groups: Uru-Chipayas, Uru-Muratos and the Uru-Iruitos. The latter are still located on the Bolivian side of Lake Titicaca and Desaguadero River.\n\ 109 | The Uros use the totora plant to make boats (balsas mats) of bundled dried reeds, and to make the islands themselves.[1]\n\ 110 | Los Uros island\n\ 111 | The Uros islands at 3810 meters above sea level are just five kilometers west from Puno port.[2] Around 2,000 descendants of the Uros were counted in the 1997 census,[3] although only a few hundred still live on and maintain the islands; most have moved to the mainland. The Uros also bury their dead on the mainland in special cemeteries.\n\ 112 | Uro man pulling boat made of reeds\n\ 113 | The Uros descend from a millennial town that according to legends are \"pukinas\" who speak Uro or Pukina and that believe they are the owners of the lake and water. Uros used to say that they have black blood because they did not feel the cold. Also they call themselves \"Lupihaques\" (Sons of The Sun). Nowadays, Uros do not speak the Uro language, nor practice their old beliefs but keep some old customs.[3]\n\ 114 | The purpose of the island settlements was originally defensive, and if a threat arose they could be moved. The largest island retains a watchtower almost entirely constructed of reeds.\n\ 115 | The Uros traded with the Aymara tribe on the mainland, interbreeding with them and eventually abandoning the Uro language for that of the Aymara. About 500 years ago they lost their original language. 
When this pre-Incan civilization was conquered by the Incans, they had to pay taxes to them, and often were made slaves.\n\ 116 | The islets are made of totora reeds, which grow in the lake. The dense roots that the plants develop and interweave form a natural layer called Khili (about one to two meters thick) that support the islands . They are anchored with ropes attached to sticks driven into the bottom of the lake. The reeds at the bottoms of the islands rot away fairly quickly, so new reeds are added to the top constantly, about every three months; this is what it makes exciting for tourists when walking on the island.[3] This is especially important in the rainy season when the reeds rot a lot faster. The islands last about thirty years.\n\ 117 | Uros children before going to school\n\ 118 | Much of the Uros' diet and medicine also revolve around these totora reeds. When a reed is pulled, the white bottom is often eaten for iodine. This prevents goiter. This white part of the reed is called the chullo (Aymara [t\xCA\x83\xCA\xBCu\xCA\x8Eo]). Like the Andean people of Peru rely on the Coca Leaf for relief from a harsh climate and hunger, the Uros rely on the Totora reeds in the same way. When in pain, the reed is wrapped around the place in pain to absorb it. They also make a reed flower tea.\n\ 119 | The larger islands house about ten families, while smaller ones, only about thirty meters wide, house only two or three.[2]\n\ 120 | Local residents fish ispi, carachi and catfish. Two types of fish were recently introduced to the lake: trout was introduced from Canada in 1940, and kingfish was introduced from Argentina. Uros also hunt birds such as seagulls, ducks and flamingos, and graze their cattle on the islets. They also run crafts stalls aimed at the numerous tourists who land on ten of the islands each year. 
They barter totora reeds on the mainland in Puno to get products they need, such as quinoa and other foods.\n\ 121 | Food is cooked with fires placed on piles of stones. To relieve themselves, tiny 'outhouse' islands are near the main islands. The ground root absorbs the waste.\n\ 122 | The Uros do not reject modern technology: some boats have motors, some houses have solar panels to run appliances such as televisions, and the main island is home to an Uros-run FM radio station, which plays music for several hours a day.\n\ 123 | Early schooling is done on several islands, including a traditional school and a school run by a Christian church. Older children and university students attend school on the mainland, often in nearby Puno." 124 | -------------------------------------------------------------------------------- /test/lda_ruby_test.rb: -------------------------------------------------------------------------------- 1 | require 'rubygems' 2 | require 'test/unit' 3 | require 'shoulda' 4 | require 'yaml' 5 | 6 | $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib')) 7 | $LOAD_PATH.unshift(File.dirname(__FILE__)) 8 | require 'lda-ruby' 9 | 10 | class LdaRubyTest < Test::Unit::TestCase 11 | context "A Document instance" do 12 | setup do 13 | @corpus = Lda::Corpus.new 14 | end 15 | 16 | context "A typical Document" do 17 | setup do 18 | @document = Lda::Document.new(@corpus) 19 | end 20 | 21 | should "not have text" do 22 | assert !@document.text? 
23 | end 24 | 25 | should "be empty" do 26 | assert_equal @document.total, 0 27 | assert_equal @document.length, 0 28 | end 29 | 30 | context "after adding words" do 31 | setup do 32 | @document.words << 1 << 2 << 3 << 4 << 5 33 | @document.counts << 2 << 1 << 1 << 1 << 3 34 | @document.recompute 35 | end 36 | 37 | should "have word count equal to what was added" do 38 | assert_equal @document.length, 5 39 | end 40 | 41 | should "have total words equal to the sum of the counts" do 42 | assert_equal @document.total, 8 43 | end 44 | end 45 | end 46 | 47 | context "A typical DataDocument" do 48 | setup do 49 | @data = '5 1:2 2:1 3:1 4:1 5:3' 50 | @document = Lda::DataDocument.new(@corpus, @data) 51 | end 52 | 53 | should "not have text" do 54 | assert !@document.text? 55 | end 56 | 57 | should "have word count equal to what was added" do 58 | assert_equal @document.length, 5 59 | end 60 | 61 | should "have total words equal to the sum of the counts" do 62 | assert_equal @document.total, 8 63 | end 64 | 65 | should "have words equal to the order they were entered" do 66 | assert_equal @document.words, [1, 2, 3, 4, 5] 67 | end 68 | 69 | should "have counts equal to the order they were entered" do 70 | assert_equal @document.counts, [2, 1, 1, 1, 3] 71 | end 72 | end 73 | 74 | context "A typical TextDocument" do 75 | setup do 76 | @text = 'stop words stop stop masterful stoppage buffalo buffalo buffalo' 77 | @document = Lda::TextDocument.new(@corpus, @text) 78 | end 79 | 80 | should "have text" do 81 | assert @document.text? 
82 | end 83 | 84 | should "have word count equal to what was added" do 85 | assert_equal @document.length, 5 86 | end 87 | 88 | should "have total words equal to the sum of the counts" do 89 | assert_equal @document.total, @text.split(/ /).size 90 | end 91 | 92 | should "have tokens in the order they were entered" do 93 | assert_equal @document.tokens, @text.split(/ /) 94 | end 95 | end 96 | end 97 | 98 | context "A Corpus instance" do 99 | context "A typical Lda::Corpus instance" do 100 | setup do 101 | @corpus = Lda::Corpus.new 102 | @document1 = Lda::TextDocument.new(@corpus, 'This is the document that never ends. Oh wait yeah it does.') 103 | @document2 = Lda::TextDocument.new(@corpus, 'A second document that is just as lame as the first.') 104 | end 105 | 106 | should "be able to add new documents" do 107 | assert @corpus.respond_to?(:add_document) 108 | @corpus.add_document(@document1) 109 | assert_equal @corpus.documents.size, 1 110 | end 111 | 112 | should "update vocabulary with words in the document" do 113 | @corpus.add_document(@document2) 114 | assert_equal @corpus.vocabulary.words.member?('lame'), true 115 | end 116 | end 117 | 118 | context "An Lda::DataCorpus instance loaded from a file" do 119 | setup do 120 | @filename = File.join(File.dirname(__FILE__), 'data', 'docs.dat') 121 | @filetext = File.open(@filename, 'r') { |f| f.read } 122 | @corpus = Lda::DataCorpus.new(@filename) 123 | end 124 | 125 | should "contain the number of documents equivalent to the number of lines in the file" do 126 | assert_equal @corpus.num_docs, @filetext.split(/\n/).size 127 | end 128 | 129 | should "not load any words into the vocabulary since none were given" do 130 | assert_equal @corpus.vocabulary.words.size, 0 131 | end 132 | end 133 | 134 | context "An Lda::TextCorpus instance loaded from a file" do 135 | setup do 136 | @filename = File.join(File.dirname(__FILE__), 'data', 'wiki-test-docs.yml') 137 | @filedocs = YAML::load_file(@filename) 138 | @corpus = 
Lda::TextCorpus.new(@filename) 139 | end 140 | 141 | should "contain the number of documents equivalent to the number of lines in the file" do 142 | assert_equal @corpus.num_docs, @filedocs.size 143 | end 144 | 145 | should "update the vocabulary with the words that were loaded" do 146 | assert @corpus.vocabulary.words.size > 0 147 | end 148 | end 149 | 150 | context "An Lda::DirectoryCorpus instance loaded from a directory" do 151 | setup do 152 | @path = File.join(File.dirname(__FILE__), 'data', 'tmp') 153 | @extension = 'txt' 154 | Dir.mkdir(@path) 155 | @original_filename = File.join(File.dirname(__FILE__), 'data', 'wiki-test-docs.yml') 156 | @filedocs = YAML::load_file(@original_filename) 157 | @filedocs.each_with_index do |doc, idx| 158 | File.open(File.join(@path, "doc_#{idx + 1}.txt"), 'w') { |f| f.write(doc) } 159 | end 160 | 161 | @corpus = Lda::DirectoryCorpus.new(@path, @extension) 162 | end 163 | 164 | should "load a document for every file in the directory" do 165 | assert_equal @corpus.num_docs, @filedocs.size 166 | end 167 | 168 | should "update the vocabulary with the words that were loaded" do 169 | assert @corpus.vocabulary.words.size > 0 170 | end 171 | 172 | teardown do 173 | Dir.glob(File.join(@path, "*.#{@extension}")).each { |f| File.unlink(f) } 174 | Dir.rmdir(@path) 175 | end 176 | end 177 | end 178 | 179 | context "A Vocabulary instance" do 180 | setup do 181 | @vocab = Lda::Vocabulary.new 182 | @words = ['word1', 'word2', 'word3', 'word4', 'word5', 'word6'] 183 | @filename1 = File.join(File.dirname(__FILE__), 'data', 'tmp_file.txt') 184 | File.open(@filename1, 'w') do |f| 185 | @words.each { |w| f.write("#{w}\n") } 186 | end 187 | @filename2 = File.join(File.dirname(__FILE__), 'data', 'tmp_file.yml') 188 | File.open(@filename2, 'w') { |f| YAML::dump(@words, f) } 189 | end 190 | 191 | should "load a file containing a list of words, one per line" do 192 | assert @vocab.num_words == 0 193 | @vocab.load_file(@filename1) 194 | assert 
@vocab.words.size > 0 195 | end 196 | 197 | should "load a yaml file containing a list of words" do 198 | assert @vocab.num_words == 0 199 | @vocab.load_yaml(@filename2) 200 | assert @vocab.num_words > 0 201 | end 202 | 203 | should "return indexes for words in the order they were loaded" do 204 | @vocab.load_yaml(@filename2) 205 | @words.each_with_index do |word, idx| 206 | assert_equal @vocab.check_word(word), idx + 1 207 | end 208 | end 209 | 210 | teardown do 211 | File.unlink(@filename1) 212 | File.unlink(@filename2) 213 | end 214 | end 215 | 216 | context "An Lda::Lda instance" do 217 | setup do 218 | @filename = File.join(File.dirname(__FILE__), 'data', 'wiki-test-docs.yml') 219 | @filedocs = YAML::load_file(@filename) 220 | @corpus = Lda::TextCorpus.new(@filename) 221 | 222 | @lda = Lda::Lda.new(@corpus) 223 | end 224 | 225 | should "have loaded the vocabulary from the corpus" do 226 | assert !@lda.vocab.nil? 227 | end 228 | 229 | should "have loaded the same number of words in the vocabulary as are in the original" do 230 | assert_equal @lda.vocab.size, @corpus.vocabulary.num_words 231 | end 232 | 233 | should "have default values for the main settings" do 234 | assert !@lda.max_iter.nil? 235 | assert !@lda.convergence.nil? 236 | assert !@lda.em_max_iter.nil? 237 | assert !@lda.em_convergence.nil? 238 | assert !@lda.num_topics.nil? 239 | assert !@lda.init_alpha.nil? 240 | assert !@lda.est_alpha.nil? 241 | end 242 | 243 | context "after running em" do 244 | setup do 245 | @lda.verbose = false 246 | @lda.num_topics = 8 247 | @lda.em('random') 248 | end 249 | 250 | should "phi should be defined" do 251 | assert !@lda.phi.nil? 
252 | end 253 | 254 | should "return the top 10 list of words for each topic" do 255 | topics = @lda.top_words(10) 256 | assert topics.is_a?(Hash) 257 | assert_equal topics.size, @lda.num_topics 258 | 259 | topics.each_pair do |topic, top_n_words| 260 | assert_equal top_n_words.size, 10 261 | end 262 | end 263 | 264 | context "after computing topic-document probabilities" do 265 | setup do 266 | @topic_doc_probs = @lda.compute_topic_document_probability 267 | end 268 | 269 | should "have a row for each document" do 270 | assert_equal @topic_doc_probs.size, @corpus.num_docs 271 | end 272 | 273 | should "have columns for each topic" do 274 | @topic_doc_probs.each do |doc| 275 | assert_equal doc.size, @lda.num_topics 276 | end 277 | end 278 | end 279 | end 280 | end 281 | end 282 | -------------------------------------------------------------------------------- /test/simple_test.rb: -------------------------------------------------------------------------------- 1 | require 'rubygems' 2 | require 'shoulda' 3 | require 'yaml' 4 | require 'lda-ruby' 5 | 6 | $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib')) 7 | $LOAD_PATH.unshift(File.dirname(__FILE__)) 8 | 9 | class Test::Unit::TestCase 10 | 11 | @corpus = Lda::Corpus.new 12 | @document1 = Lda::TextDocument.new(@corpus, 'Dom Cobb is a skilled thief, the absolute best in the dangerous art of extraction, stealing valuable secrets from deep within the subconscious during the dream state, when the mind is at its most vulnerable. Cobb\'s rare ability has made him a coveted player in this treacherous new world of corporate espionage, but it has also made him an international fugitive and cost him everything he has ever loved. Now Cobb is being offered a chance at redemption. One last job could give him his life back but only if he can accomplish the impossible-inception. 
Instead of the perfect heist, Cobb and his team of specialists have to pull off the reverse: their task is not to steal an idea but to plant one. If they succeed, it could be the perfect crime. But no amount of careful planning or expertise can prepare the team for the dangerous enemy that seems to predict their every move. An enemy that only Cobb could have seen coming.') 13 | @document2 = Lda::TextDocument.new(@corpus, 'When his brother is killed in a robbery, paraplegic Marine Jake Sully decides to take his place in a mission on the distant world of Pandora. There he learns of greedy corporate figurehead Parker Selfridge\'s intentions of driving off the native humanoid "Na\'vi" in order to mine for the precious material scattered throughout their rich woodland. In exchange for the spinal surgery that will fix his legs, Jake gathers intel for the cooperating military unit spearheaded by gung-ho Colonel Quaritch, while simultaneously attempting to infiltrate the Na\'vi people with the use of an "avatar" identity. 
While Jake begins to bond with the native tribe and quickly falls in love with the beautiful alien Neytiri, the restless Colonel moves forward with his ruthless extermination tactics, forcing the soldier to take a stand - and fight back in an epic battle for the fate of Pandora.') 14 | 15 | @corpus.add_document(@document1) 16 | @corpus.add_document(@document2) 17 | @corpus.remove_word("cobb") 18 | @lda = Lda::Lda.new(@corpus) 19 | 20 | @lda.verbose = false 21 | @lda.num_topics = 2 22 | @lda.em('random') 23 | topics = @lda.top_words(5) 24 | puts topics 25 | 26 | end 27 | -------------------------------------------------------------------------------- /test/simple_yaml.rb: -------------------------------------------------------------------------------- 1 | require 'rubygems' 2 | require 'shoulda' 3 | require 'yaml' 4 | require 'lda-ruby' 5 | 6 | $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib')) 7 | $LOAD_PATH.unshift(File.dirname(__FILE__)) 8 | 9 | class Test::Unit::TestCase 10 | 11 | @filename = File.join(File.dirname(__FILE__), 'data', 'wiki-test-docs.yml') 12 | @filedocs = YAML::load_file(@filename) 13 | @corpus = Lda::TextCorpus.new(@filename) 14 | 15 | @lda = Lda::Lda.new(@corpus) 16 | 17 | @lda.verbose = false 18 | @lda.num_topics = 20 19 | @lda.em('random') 20 | @lda.print_topics(20) 21 | 22 | 23 | end 24 | -------------------------------------------------------------------------------- /test/test_helper.rb: -------------------------------------------------------------------------------- 1 | require 'rubygems' 2 | require 'test/unit' 3 | require 'shoulda' 4 | require 'yaml' 5 | 6 | $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib')) 7 | $LOAD_PATH.unshift(File.dirname(__FILE__)) 8 | require 'lda-ruby' 9 | 10 | class Test::Unit::TestCase 11 | end 12 | --------------------------------------------------------------------------------