├── Makefile ├── README.md ├── cokus.c ├── cokus.h ├── example ├── ap-topics.pdf └── ap.tgz ├── inf-settings.txt ├── lda-alpha.c ├── lda-alpha.h ├── lda-data.c ├── lda-data.h ├── lda-estimate.c ├── lda-estimate.h ├── lda-inference.c ├── lda-inference.h ├── lda-model.c ├── lda-model.h ├── lda.h ├── license.txt ├── readme.txt ├── settings.txt ├── todo.txt ├── topics.py ├── utils.c └── utils.h /Makefile: -------------------------------------------------------------------------------- 1 | # (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu) 2 | 3 | # This file is part of LDA-C. 4 | 5 | # LDA-C is free software; you can redistribute it and/or modify it under 6 | # the terms of the GNU General Public License as published by the Free 7 | # Software Foundation; either version 2 of the License, or (at your 8 | # option) any later version. 9 | 10 | # LDA-C is distributed in the hope that it will be useful, but WITHOUT 11 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 | # for more details. 
14 | 15 | # You should have received a copy of the GNU General Public License 16 | # along with this program; if not, write to the Free Software 17 | # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 18 | # USA 19 | 20 | .SUFFIXES: .c .u 21 | CC= gcc 22 | CFLAGS= -O3 -Wall -g 23 | LDFLAGS= -lm 24 | 25 | LOBJECTS= lda-data.o lda-estimate.o lda-model.o lda-inference.o utils.o cokus.o lda-alpha.o 26 | 27 | LSOURCE= lda-data.c lda-estimate.c lda-model.c lda-inference.c utils.c cokus.c lda-alpha.c 28 | 29 | lda: $(LOBJECTS) 30 | $(CC) $(CFLAGS) $(LOBJECTS) -o lda $(LDFLAGS) 31 | 32 | clean: 33 | -rm -f *.o 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Latent Dirichlet allocation 2 | 3 | This is a C implementation of variational EM for latent Dirichlet allocation (LDA), a topic model for text or other discrete data. LDA allows you to analyze a corpus and extract the topics that combine to form its documents. For example, click [here](https://github.com/Blei-Lab/lda-c/blob/master/example/ap-topics.pdf) to see the topics estimated from a small corpus of Associated Press documents. LDA is fully described in [Blei et al. (2003)](http://www.cs.columbia.edu/~blei/papers/BleiNgJordan2003.pdf). 4 | 5 | This code contains: 6 | 7 | * an implementation of variational inference for the per-document topic proportions and per-word topic assignments 8 | * a variational EM procedure for estimating the topics and exchangeable Dirichlet hyperparameter 9 | 10 | ## Readme 11 | 12 | View the [readme.txt](https://github.com/Blei-Lab/lda-c/blob/master/readme.txt) and fork or clone the repository. 13 | 14 | ## Sample data 15 | 16 | 2246 documents from the Associated Press **[download](https://github.com/Blei-Lab/lda-c/blob/master/example/ap.tgz)**. 
17 | 18 | Top 20 words from 100 topics estimated from the AP corpus **[pdf](https://github.com/Blei-Lab/lda-c/blob/master/example/ap-topics.pdf)**. 19 | 20 | ## Bug fixes and updates 21 | 22 | To learn about bug-fixes, updates, and discuss LDA and related techniques, please join the topic-models mailing list, topic-models [at] lists.cs.princeton.edu. 23 | 24 | To join, click [here](https://lists.cs.princeton.edu/mailman/listinfo/topic-models). 25 | 26 | ## Other implementations on the web 27 | 28 | There are several other implementations of LDA on the web: 29 | * [R package 30 | ](http://cran.r-project.org/web/packages/lda/) 31 | * [The Mallet Toolkit from UMass](http://mallet.cs.umass.edu/) 32 | * [Gregor Heinrich's LDA-J](http://www.arbylon.net/projects/) 33 | * [Multinomial PCA](http://cosco.hiit.fi/search/MPCA/) 34 | -------------------------------------------------------------------------------- /cokus.c: -------------------------------------------------------------------------------- 1 | // This is the ``Mersenne Twister'' random number generator MT19937, which 2 | // generates pseudorandom integers uniformly distributed in 0..(2^32 - 1) 3 | // starting from any odd seed in 0..(2^32 - 1). This version is a recode 4 | // by Shawn Cokus (Cokus@math.washington.edu) on March 8, 1998 of a version by 5 | // Takuji Nishimura (who had suggestions from Topher Cooper and Marc Rieffel in 6 | // July-August 1997). 7 | // 8 | // Effectiveness of the recoding (on Goedel2.math.washington.edu, a DEC Alpha 9 | // running OSF/1) using GCC -O3 as a compiler: before recoding: 51.6 sec. to 10 | // generate 300 million random numbers; after recoding: 24.0 sec. for the same 11 | // (i.e., 46.5% of original time), so speed is now about 12.5 million random 12 | // number generations per second on this machine. 
13 | // 14 | // According to the URL http://www.math.keio.ac.jp/~matumoto/emt.html 15 | // (and paraphrasing a bit in places), the Mersenne Twister is ``designed 16 | // with consideration of the flaws of various existing generators,'' has 17 | // a period of 2^19937 - 1, gives a sequence that is 623-dimensionally 18 | // equidistributed, and ``has passed many stringent tests, including the 19 | // die-hard test of G. Marsaglia and the load test of P. Hellekalek and 20 | // S. Wegenkittl.'' It is efficient in memory usage (typically using 2506 21 | // to 5012 bytes of static data, depending on data type sizes, and the code 22 | // is quite short as well). It generates random numbers in batches of 624 23 | // at a time, so the caching and pipelining of modern systems is exploited. 24 | // It is also divide- and mod-free. 25 | // 26 | // This library is free software; you can redistribute it and/or modify it 27 | // under the terms of the GNU Library General Public License as published by 28 | // the Free Software Foundation (either version 2 of the License or, at your 29 | // option, any later version). This library is distributed in the hope that 30 | // it will be useful, but WITHOUT ANY WARRANTY, without even the implied 31 | // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See 32 | // the GNU Library General Public License for more details. You should have 33 | // received a copy of the GNU Library General Public License along with this 34 | // library; if not, write to the Free Software Foundation, Inc., 59 Temple 35 | // Place, Suite 330, Boston, MA 02111-1307, USA. 36 | // 37 | // The code as Shawn received it included the following notice: 38 | // 39 | // Copyright (C) 1997 Makoto Matsumoto and Takuji Nishimura. When 40 | // you use this, send an e-mail to matumoto@math.keio.ac.jp with 41 | // an appropriate reference to your work. 42 | // 43 | // It would be nice to CC: Cokus@math.washington.edu when you write. 
/*
 * Mersenne Twister MT19937 (Shawn Cokus recode): declarations, generator
 * state, seeding, state reload and the public draw routine.
 */

//
// uint32 must be an unsigned integer type capable of holding at least 32
// bits; exactly 32 should be fastest, but 64 is better on an Alpha with
// GCC at -O3 optimization so try your options and see what's best for you
//

typedef unsigned long uint32;

#define N              (624)                 // length of state vector
#define M              (397)                 // a period parameter
#define K              (0x9908B0DFU)         // a magic constant
#define hiBit(u)       ((u) & 0x80000000U)   // mask all but highest   bit of u
#define loBit(u)       ((u) & 0x00000001U)   // mask all but lowest    bit of u
#define loBits(u)      ((u) & 0x7FFFFFFFU)   // mask     the highest   bit of u
#define mixBits(u, v)  (hiBit(u)|loBits(v))  // move hi bit of u to hi bit of v

void   seedMT(uint32 seed);
uint32 reloadMT(void);
uint32 randomMT(void);

static uint32   state[N+1];  // state vector + 1 extra to not violate ANSI C
static uint32  *next;        // next random value is computed from here
static int      left = -1;   // can *next++ this many times before reloading


// Tempering transform applied to a raw state word before it is handed out.
static uint32 temperMT(uint32 y)
{
    y ^= (y >> 11);
    y ^= (y <<  7) & 0x9D2C5680U;
    y ^= (y << 15) & 0xEFC60000U;
    return(y ^ (y >> 18));
}


// Twist term for one state word: the high bit of u joined with the low 31
// bits of v, shifted right once, with K folded in when v is odd.
static uint32 twistMT(uint32 u, uint32 v)
{
    return((mixBits(u, v) >> 1) ^ (loBit(v) ? K : 0U));
}


//
// Fill state[0..(N-1)] from `seed` using the generator
//
//     x_new = (69069 * x_old) mod 2^32
//
// from Line 15 of Table 1, p. 106, Sec. 3.3.4 of Knuth's
// _The Art of Computer Programming_, Volume 2, 3rd ed.
//
// Notes (SJC, condensed): this seeding generator achieves the maximum
// period for its modulus (2^30) iff x_initial is odd (p. 20-21,
// Sec. 3.2.1.2, Knuth); even starting values give degenerate sequences
// such as 0, 0, 0, ... or 2^31, 2^31, 2^31, ..., so the seed is forced to
// be odd below.  Even with an odd start the low-order bits follow short
// cycles (the lowest bit is always 1, the next bit is constant, the
// following bits have periods 2, 4, 8, ...).  The generator's potency is
// 16, which seems to be alright by p. 25, Sec. 3.2.1.3 of Knuth; it does
// well in the dimension 2..5 spectral tests but could be better in
// dimension 6.  The caller never sees these values directly because
// reloadMT() always munges them first, so restricting to odd seeds is the
// only change made.
//
// Also sets `left` to 0 so that the next randomMT() call reloads the state.
//
void seedMT(uint32 seed)
{
    uint32 x = (seed | 1U) & 0xFFFFFFFFU;
    int i;

    left = 0;
    state[0] = x;
    for(i = 1; i < N; i++)
     {
        x *= 69069U;
        state[i] = x & 0xFFFFFFFFU;
     }
}


//
// Regenerate all N state words in place and return the first tempered
// output of the new batch.  If the generator has never been seeded
// (left < -1 on entry) it is seeded with 4357 first.  On return `next`
// points at state[1] and `left` is N-1.
//
uint32 reloadMT(void)
{
    int i;

    if(left < -1)
        seedMT(4357U);

    left = N-1;
    next = state+1;

    // words whose partner state[i+M] is still inside the old vector
    for(i = 0; i < N-M; i++)
        state[i] = state[i+M] ^ twistMT(state[i], state[i+1]);

    // words whose partner has wrapped around to the freshly written part
    for(; i < N-1; i++)
        state[i] = state[i+M-N] ^ twistMT(state[i], state[i+1]);

    // last word pairs with the already regenerated state[0]
    state[N-1] = state[M-1] ^ twistMT(state[N-1], state[0]);

    return(temperMT(state[0]));
}


//
// Return the next pseudorandom value, reloading the state vector when the
// current batch is exhausted.
//
uint32 randomMT(void)
{
    if(--left < 0)
        return(reloadMT());

    return(temperMT(*next++));
}
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/blei-lab/lda-c/0c46575e89092683010db077f713f8aa3b3594a2/example/ap.tgz -------------------------------------------------------------------------------- /inf-settings.txt: -------------------------------------------------------------------------------- 1 | var max iter -1 2 | var convergence 1e-6 3 | em max iter 100 4 | em convergence 1e-4 5 | alpha estimate 6 | -------------------------------------------------------------------------------- /lda-alpha.c: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu) 2 | 3 | // This file is part of LDA-C. 4 | 5 | // LDA-C is free software; you can redistribute it and/or modify it under 6 | // the terms of the GNU General Public License as published by the Free 7 | // Software Foundation; either version 2 of the License, or (at your 8 | // option) any later version. 9 | 10 | // LDA-C is distributed in the hope that it will be useful, but WITHOUT 11 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 | // for more details. 
14 | 15 | // You should have received a copy of the GNU General Public License 16 | // along with this program; if not, write to the Free Software 17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 18 | // USA 19 | 20 | #include "lda-alpha.h" 21 | 22 | /* 23 | * objective function and its derivatives 24 | * 25 | */ 26 | 27 | double alhood(double a, double ss, int D, int K) 28 | { return(D * (lgamma(K * a) - K * lgamma(a)) + (a - 1) * ss); } 29 | 30 | double d_alhood(double a, double ss, int D, int K) 31 | { return(D * (K * digamma(K * a) - K * digamma(a)) + ss); } 32 | 33 | double d2_alhood(double a, int D, int K) 34 | { return(D * (K * K * trigamma(K * a) - K * trigamma(a))); } 35 | 36 | 37 | /* 38 | * newtons method 39 | * 40 | */ 41 | 42 | double opt_alpha(double ss, int D, int K) 43 | { 44 | double a, log_a, init_a = 100; 45 | double f, df, d2f; 46 | int iter = 0; 47 | 48 | log_a = log(init_a); 49 | do 50 | { 51 | iter++; 52 | a = exp(log_a); 53 | if (isnan(a)) 54 | { 55 | init_a = init_a * 10; 56 | printf("warning : alpha is nan; new init = %5.5f\n", init_a); 57 | a = init_a; 58 | log_a = log(a); 59 | } 60 | f = alhood(a, ss, D, K); 61 | df = d_alhood(a, ss, D, K); 62 | d2f = d2_alhood(a, D, K); 63 | log_a = log_a - df/(d2f * a + df); 64 | printf("alpha maximization : %5.5f %5.5f\n", f, df); 65 | } 66 | while ((fabs(df) > NEWTON_THRESH) && (iter < MAX_ALPHA_ITER)); 67 | return(exp(log_a)); 68 | } 69 | -------------------------------------------------------------------------------- /lda-alpha.h: -------------------------------------------------------------------------------- 1 | #ifndef LDA_ALPHA_H 2 | #define LDA_ALPHA_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "lda.h" 9 | #include "utils.h" 10 | 11 | #define NEWTON_THRESH 1e-5 12 | #define MAX_ALPHA_ITER 1000 13 | 14 | double alhood(double a, double ss, int D, int K); 15 | double d_alhood(double a, double ss, int D, int K); 16 | double d2_alhood(double a, int D, int 
K); 17 | double opt_alpha(double ss, int D, int K); 18 | void maximize_alpha(double** gamma, lda_model* model, int num_docs); 19 | 20 | #endif 21 | -------------------------------------------------------------------------------- /lda-data.c: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu) 2 | 3 | // This file is part of LDA-C. 4 | 5 | // LDA-C is free software; you can redistribute it and/or modify it under 6 | // the terms of the GNU General Public License as published by the Free 7 | // Software Foundation; either version 2 of the License, or (at your 8 | // option) any later version. 9 | 10 | // LDA-C is distributed in the hope that it will be useful, but WITHOUT 11 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 | // for more details. 14 | 15 | // You should have received a copy of the GNU General Public License 16 | // along with this program; if not, write to the Free Software 17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 18 | // USA 19 | 20 | #include "lda-data.h" 21 | 22 | corpus* read_data(char* data_filename) 23 | { 24 | FILE *fileptr; 25 | int length, count, word, n, nd, nw; 26 | corpus* c; 27 | 28 | printf("reading data from %s\n", data_filename); 29 | c = malloc(sizeof(corpus)); 30 | c->docs = 0; 31 | c->num_terms = 0; 32 | c->num_docs = 0; 33 | fileptr = fopen(data_filename, "r"); 34 | nd = 0; nw = 0; 35 | while ((fscanf(fileptr, "%10d", &length) != EOF)) 36 | { 37 | c->docs = (document*) realloc(c->docs, sizeof(document)*(nd+1)); 38 | c->docs[nd].length = length; 39 | c->docs[nd].total = 0; 40 | c->docs[nd].words = malloc(sizeof(int)*length); 41 | c->docs[nd].counts = malloc(sizeof(int)*length); 42 | for (n = 0; n < length; n++) 43 | { 44 | fscanf(fileptr, "%10d:%10d", &word, &count); 45 | word = word - 
OFFSET; 46 | c->docs[nd].words[n] = word; 47 | c->docs[nd].counts[n] = count; 48 | c->docs[nd].total += count; 49 | if (word >= nw) { nw = word + 1; } 50 | } 51 | nd++; 52 | } 53 | fclose(fileptr); 54 | c->num_docs = nd; 55 | c->num_terms = nw; 56 | printf("number of docs : %d\n", nd); 57 | printf("number of terms : %d\n", nw); 58 | return(c); 59 | } 60 | 61 | int max_corpus_length(corpus* c) 62 | { 63 | int n, max = 0; 64 | for (n = 0; n < c->num_docs; n++) 65 | if (c->docs[n].length > max) max = c->docs[n].length; 66 | return(max); 67 | } 68 | -------------------------------------------------------------------------------- /lda-data.h: -------------------------------------------------------------------------------- 1 | #ifndef LDA_DATA_H 2 | #define LDA_DATA_H 3 | 4 | #include 5 | #include 6 | 7 | #include "lda.h" 8 | 9 | #define OFFSET 0; // offset for reading data 10 | 11 | corpus* read_data(char* data_filename); 12 | int max_corpus_length(corpus* c); 13 | 14 | #endif 15 | -------------------------------------------------------------------------------- /lda-estimate.c: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu) 2 | 3 | // This file is part of LDA-C. 4 | 5 | // LDA-C is free software; you can redistribute it and/or modify it under 6 | // the terms of the GNU General Public License as published by the Free 7 | // Software Foundation; either version 2 of the License, or (at your 8 | // option) any later version. 9 | 10 | // LDA-C is distributed in the hope that it will be useful, but WITHOUT 11 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 | // for more details. 
14 | 15 | // You should have received a copy of the GNU General Public License 16 | // along with this program; if not, write to the Free Software 17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 18 | // USA 19 | 20 | #include "lda-estimate.h" 21 | 22 | /* 23 | * perform inference on a document and update sufficient statistics 24 | * 25 | */ 26 | 27 | double doc_e_step(document* doc, double* gamma, double** phi, 28 | lda_model* model, lda_suffstats* ss) 29 | { 30 | double likelihood; 31 | int n, k; 32 | 33 | // posterior inference 34 | 35 | likelihood = lda_inference(doc, model, gamma, phi); 36 | 37 | // update sufficient statistics 38 | 39 | double gamma_sum = 0; 40 | for (k = 0; k < model->num_topics; k++) 41 | { 42 | gamma_sum += gamma[k]; 43 | ss->alpha_suffstats += digamma(gamma[k]); 44 | } 45 | ss->alpha_suffstats -= model->num_topics * digamma(gamma_sum); 46 | 47 | for (n = 0; n < doc->length; n++) 48 | { 49 | for (k = 0; k < model->num_topics; k++) 50 | { 51 | ss->class_word[k][doc->words[n]] += doc->counts[n]*phi[n][k]; 52 | ss->class_total[k] += doc->counts[n]*phi[n][k]; 53 | } 54 | } 55 | 56 | ss->num_docs = ss->num_docs + 1; 57 | 58 | return(likelihood); 59 | } 60 | 61 | 62 | /* 63 | * writes the word assignments line for a document to a file 64 | * 65 | */ 66 | 67 | void write_word_assignment(FILE* f, document* doc, double** phi, lda_model* model) 68 | { 69 | int n; 70 | 71 | fprintf(f, "%03d", doc->length); 72 | for (n = 0; n < doc->length; n++) 73 | { 74 | fprintf(f, " %04d:%02d", 75 | doc->words[n], argmax(phi[n], model->num_topics)); 76 | } 77 | fprintf(f, "\n"); 78 | fflush(f); 79 | } 80 | 81 | 82 | /* 83 | * saves the gamma parameters of the current dataset 84 | * 85 | */ 86 | 87 | void save_gamma(char* filename, double** gamma, int num_docs, int num_topics) 88 | { 89 | FILE* fileptr; 90 | int d, k; 91 | fileptr = fopen(filename, "w"); 92 | 93 | for (d = 0; d < num_docs; d++) 94 | { 95 | fprintf(fileptr, "%5.10f", 
gamma[d][0]); 96 | for (k = 1; k < num_topics; k++) 97 | { 98 | fprintf(fileptr, " %5.10f", gamma[d][k]); 99 | } 100 | fprintf(fileptr, "\n"); 101 | } 102 | fclose(fileptr); 103 | } 104 | 105 | 106 | /* 107 | * run_em 108 | * 109 | */ 110 | 111 | void run_em(char* start, char* directory, corpus* corpus) 112 | { 113 | 114 | int d, n; 115 | lda_model *model = NULL; 116 | double **var_gamma, **phi; 117 | 118 | // allocate variational parameters 119 | 120 | var_gamma = malloc(sizeof(double*)*(corpus->num_docs)); 121 | for (d = 0; d < corpus->num_docs; d++) 122 | var_gamma[d] = malloc(sizeof(double) * NTOPICS); 123 | 124 | int max_length = max_corpus_length(corpus); 125 | phi = malloc(sizeof(double*)*max_length); 126 | for (n = 0; n < max_length; n++) 127 | phi[n] = malloc(sizeof(double) * NTOPICS); 128 | 129 | // initialize model 130 | 131 | char filename[100]; 132 | 133 | lda_suffstats* ss = NULL; 134 | if (strcmp(start, "seeded")==0) 135 | { 136 | model = new_lda_model(corpus->num_terms, NTOPICS); 137 | ss = new_lda_suffstats(model); 138 | corpus_initialize_ss(ss, model, corpus); 139 | lda_mle(model, ss, 0); 140 | model->alpha = INITIAL_ALPHA; 141 | } 142 | else if (strcmp(start, "random")==0) 143 | { 144 | model = new_lda_model(corpus->num_terms, NTOPICS); 145 | ss = new_lda_suffstats(model); 146 | random_initialize_ss(ss, model); 147 | lda_mle(model, ss, 0); 148 | model->alpha = INITIAL_ALPHA; 149 | } 150 | else if (strncmp(start, "manual=",7)==0) 151 | { 152 | model = new_lda_model(corpus->num_terms, NTOPICS); 153 | ss = new_lda_suffstats(model); 154 | manual_initialize_ss(start + 7, ss, model, corpus); 155 | lda_mle(model, ss, 0); 156 | model->alpha = INITIAL_ALPHA; 157 | } 158 | else 159 | { 160 | model = load_lda_model(start); 161 | ss = new_lda_suffstats(model); 162 | } 163 | 164 | sprintf(filename,"%s/000",directory); 165 | save_lda_model(model, filename); 166 | 167 | // run expectation maximization 168 | 169 | int i = 0; 170 | double likelihood, 
likelihood_old = 0, converged = 1; 171 | sprintf(filename, "%s/likelihood.dat", directory); 172 | FILE* likelihood_file = fopen(filename, "w"); 173 | 174 | while (((converged < 0) || (converged > EM_CONVERGED) || (i <= 2)) && (i <= EM_MAX_ITER)) 175 | { 176 | i++; printf("**** em iteration %d ****\n", i); 177 | likelihood = 0; 178 | zero_initialize_ss(ss, model); 179 | 180 | // e-step 181 | 182 | for (d = 0; d < corpus->num_docs; d++) 183 | { 184 | if ((d % 1000) == 0) printf("document %d\n",d); 185 | likelihood += doc_e_step(&(corpus->docs[d]), 186 | var_gamma[d], 187 | phi, 188 | model, 189 | ss); 190 | } 191 | 192 | // m-step 193 | 194 | lda_mle(model, ss, ESTIMATE_ALPHA); 195 | 196 | // check for convergence 197 | 198 | converged = (likelihood_old - likelihood) / (likelihood_old); 199 | if (converged < 0) VAR_MAX_ITER = VAR_MAX_ITER * 2; 200 | likelihood_old = likelihood; 201 | 202 | // output model and likelihood 203 | 204 | fprintf(likelihood_file, "%10.10f\t%5.5e\n", likelihood, converged); 205 | fflush(likelihood_file); 206 | if ((i % LAG) == 0) 207 | { 208 | sprintf(filename,"%s/%03d",directory, i); 209 | save_lda_model(model, filename); 210 | sprintf(filename,"%s/%03d.gamma",directory, i); 211 | save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics); 212 | } 213 | } 214 | 215 | // output the final model 216 | 217 | sprintf(filename,"%s/final",directory); 218 | save_lda_model(model, filename); 219 | sprintf(filename,"%s/final.gamma",directory); 220 | save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics); 221 | 222 | // output the word assignments (for visualization) 223 | 224 | sprintf(filename, "%s/word-assignments.dat", directory); 225 | FILE* w_asgn_file = fopen(filename, "w"); 226 | for (d = 0; d < corpus->num_docs; d++) 227 | { 228 | if ((d % 100) == 0) printf("final e step document %d\n",d); 229 | likelihood += lda_inference(&(corpus->docs[d]), model, var_gamma[d], phi); 230 | write_word_assignment(w_asgn_file, 
&(corpus->docs[d]), phi, model); 231 | } 232 | fclose(w_asgn_file); 233 | fclose(likelihood_file); 234 | } 235 | 236 | 237 | /* 238 | * read settings. 239 | * 240 | */ 241 | 242 | void read_settings(char* filename) 243 | { 244 | FILE* fileptr; 245 | char alpha_action[100]; 246 | fileptr = fopen(filename, "r"); 247 | fscanf(fileptr, "var max iter %d\n", &VAR_MAX_ITER); 248 | fscanf(fileptr, "var convergence %f\n", &VAR_CONVERGED); 249 | fscanf(fileptr, "em max iter %d\n", &EM_MAX_ITER); 250 | fscanf(fileptr, "em convergence %f\n", &EM_CONVERGED); 251 | fscanf(fileptr, "alpha %s", alpha_action); 252 | if (strcmp(alpha_action, "fixed")==0) 253 | { 254 | ESTIMATE_ALPHA = 0; 255 | } 256 | else 257 | { 258 | ESTIMATE_ALPHA = 1; 259 | } 260 | fclose(fileptr); 261 | } 262 | 263 | 264 | /* 265 | * inference only 266 | * 267 | */ 268 | 269 | void infer(char* model_root, char* save, corpus* corpus) 270 | { 271 | FILE* fileptr; 272 | char filename[100]; 273 | int i, d, n; 274 | lda_model *model; 275 | double **var_gamma, likelihood, **phi; 276 | document* doc; 277 | 278 | model = load_lda_model(model_root); 279 | var_gamma = malloc(sizeof(double*)*(corpus->num_docs)); 280 | for (i = 0; i < corpus->num_docs; i++) 281 | var_gamma[i] = malloc(sizeof(double)*model->num_topics); 282 | sprintf(filename, "%s-lda-lhood.dat", save); 283 | fileptr = fopen(filename, "w"); 284 | for (d = 0; d < corpus->num_docs; d++) 285 | { 286 | if (((d % 100) == 0) && (d>0)) printf("document %d\n",d); 287 | 288 | doc = &(corpus->docs[d]); 289 | phi = (double**) malloc(sizeof(double*) * doc->length); 290 | for (n = 0; n < doc->length; n++) 291 | phi[n] = (double*) malloc(sizeof(double) * model->num_topics); 292 | likelihood = lda_inference(doc, model, var_gamma[d], phi); 293 | 294 | fprintf(fileptr, "%5.5f\n", likelihood); 295 | } 296 | fclose(fileptr); 297 | sprintf(filename, "%s-gamma.dat", save); 298 | save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics); 299 | } 300 | 301 | 302 | /* 
303 | * update sufficient statistics 304 | * 305 | */ 306 | 307 | 308 | 309 | /* 310 | * main 311 | * 312 | */ 313 | 314 | int main(int argc, char* argv[]) 315 | { 316 | // (est / inf) alpha k settings data (random / seed/ model) (directory / out) 317 | 318 | corpus* corpus; 319 | 320 | long t1; 321 | (void) time(&t1); 322 | seedMT(t1); 323 | // seedMT(4357U); 324 | 325 | if (argc > 1) 326 | { 327 | if (strcmp(argv[1], "est")==0) 328 | { 329 | INITIAL_ALPHA = atof(argv[2]); 330 | NTOPICS = atoi(argv[3]); 331 | read_settings(argv[4]); 332 | corpus = read_data(argv[5]); 333 | make_directory(argv[7]); 334 | run_em(argv[6], argv[7], corpus); 335 | } 336 | if (strcmp(argv[1], "inf")==0) 337 | { 338 | read_settings(argv[2]); 339 | corpus = read_data(argv[4]); 340 | infer(argv[3], argv[5], corpus); 341 | } 342 | } 343 | else 344 | { 345 | printf("usage : lda est [initial alpha] [k] [settings] [data] [random/seeded/manual=filename/*] [directory]\n"); 346 | printf(" lda inf [settings] [model] [data] [name]\n"); 347 | } 348 | return(0); 349 | } 350 | -------------------------------------------------------------------------------- /lda-estimate.h: -------------------------------------------------------------------------------- 1 | #ifndef LDA_ESTIMATE_H 2 | #define LDA_ESTIMATE_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "lda.h" 12 | #include "lda-data.h" 13 | #include "lda-inference.h" 14 | #include "lda-model.h" 15 | #include "lda-alpha.h" 16 | #include "utils.h" 17 | #include "cokus.h" 18 | 19 | int LAG = 5; 20 | 21 | float EM_CONVERGED; 22 | int EM_MAX_ITER; 23 | int ESTIMATE_ALPHA; 24 | double INITIAL_ALPHA; 25 | int NTOPICS; 26 | 27 | double doc_e_step(document* doc, 28 | double* gamma, 29 | double** phi, 30 | lda_model* model, 31 | lda_suffstats* ss); 32 | 33 | void save_gamma(char* filename, 34 | double** gamma, 35 | int num_docs, 36 | int num_topics); 37 | 38 | void run_em(char* start, 39 | char* directory, 
40 | corpus* corpus); 41 | 42 | void read_settings(char* filename); 43 | 44 | void infer(char* model_root, 45 | char* save, 46 | corpus* corpus); 47 | 48 | #endif 49 | 50 | 51 | -------------------------------------------------------------------------------- /lda-inference.c: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu) 2 | 3 | // This file is part of LDA-C. 4 | 5 | // LDA-C is free software; you can redistribute it and/or modify it under 6 | // the terms of the GNU General Public License as published by the Free 7 | // Software Foundation; either version 2 of the License, or (at your 8 | // option) any later version. 9 | 10 | // LDA-C is distributed in the hope that it will be useful, but WITHOUT 11 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 | // for more details. 14 | 15 | // You should have received a copy of the GNU General Public License 16 | // along with this program; if not, write to the Free Software 17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 18 | // USA 19 | 20 | #include "lda-inference.h" 21 | 22 | /* 23 | * variational inference 24 | * 25 | */ 26 | 27 | double lda_inference(document* doc, lda_model* model, double* var_gamma, double** phi) 28 | { 29 | double converged = 1; 30 | double phisum = 0, likelihood = 0; 31 | double likelihood_old = 0, oldphi[model->num_topics]; 32 | int k, n, var_iter; 33 | double digamma_gam[model->num_topics]; 34 | 35 | // compute posterior dirichlet 36 | 37 | for (k = 0; k < model->num_topics; k++) 38 | { 39 | var_gamma[k] = model->alpha + (doc->total/((double) model->num_topics)); 40 | digamma_gam[k] = digamma(var_gamma[k]); 41 | for (n = 0; n < doc->length; n++) 42 | phi[n][k] = 1.0/model->num_topics; 43 | } 44 | var_iter = 0; 45 | 46 | while ((converged > VAR_CONVERGED) 
&& 47 | ((var_iter < VAR_MAX_ITER) || (VAR_MAX_ITER == -1))) 48 | { 49 | var_iter++; 50 | for (n = 0; n < doc->length; n++) 51 | { 52 | phisum = 0; 53 | for (k = 0; k < model->num_topics; k++) 54 | { 55 | oldphi[k] = phi[n][k]; 56 | phi[n][k] = 57 | digamma_gam[k] + 58 | model->log_prob_w[k][doc->words[n]]; 59 | 60 | if (k > 0) 61 | phisum = log_sum(phisum, phi[n][k]); 62 | else 63 | phisum = phi[n][k]; // note, phi is in log space 64 | } 65 | 66 | for (k = 0; k < model->num_topics; k++) 67 | { 68 | phi[n][k] = exp(phi[n][k] - phisum); 69 | var_gamma[k] = 70 | var_gamma[k] + doc->counts[n]*(phi[n][k] - oldphi[k]); 71 | // !!! a lot of extra digamma's here because of how we're computing it 72 | // !!! but its more automatically updated too. 73 | digamma_gam[k] = digamma(var_gamma[k]); 74 | } 75 | } 76 | 77 | likelihood = compute_likelihood(doc, model, phi, var_gamma); 78 | assert(!isnan(likelihood)); 79 | converged = (likelihood_old - likelihood) / likelihood_old; 80 | likelihood_old = likelihood; 81 | 82 | // printf("[LDA INF] %8.5f %1.3e\n", likelihood, converged); 83 | } 84 | return(likelihood); 85 | } 86 | 87 | 88 | /* 89 | * compute likelihood bound 90 | * 91 | */ 92 | 93 | double 94 | compute_likelihood(document* doc, lda_model* model, double** phi, double* var_gamma) 95 | { 96 | double likelihood = 0, digsum = 0, var_gamma_sum = 0, dig[model->num_topics]; 97 | int k, n; 98 | 99 | for (k = 0; k < model->num_topics; k++) 100 | { 101 | dig[k] = digamma(var_gamma[k]); 102 | var_gamma_sum += var_gamma[k]; 103 | } 104 | digsum = digamma(var_gamma_sum); 105 | 106 | likelihood = 107 | lgamma(model->alpha * model -> num_topics) 108 | - model -> num_topics * lgamma(model->alpha) 109 | - (lgamma(var_gamma_sum)); 110 | 111 | for (k = 0; k < model->num_topics; k++) 112 | { 113 | likelihood += 114 | (model->alpha - 1)*(dig[k] - digsum) + lgamma(var_gamma[k]) 115 | - (var_gamma[k] - 1)*(dig[k] - digsum); 116 | 117 | for (n = 0; n < doc->length; n++) 118 | { 119 | if 
(phi[n][k] > 0) 120 | { 121 | likelihood += doc->counts[n]* 122 | (phi[n][k]*((dig[k] - digsum) - log(phi[n][k]) 123 | + model->log_prob_w[k][doc->words[n]])); 124 | } 125 | } 126 | } 127 | return(likelihood); 128 | } 129 | -------------------------------------------------------------------------------- /lda-inference.h: -------------------------------------------------------------------------------- 1 | #ifndef LDA_INFERENCE_H 2 | #define LDA_INFERENCE_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "lda.h" 8 | #include "utils.h" 9 | 10 | float VAR_CONVERGED; 11 | int VAR_MAX_ITER; 12 | 13 | double lda_inference(document*, lda_model*, double*, double**); 14 | double compute_likelihood(document*, lda_model*, double**, double*); 15 | 16 | #endif 17 | -------------------------------------------------------------------------------- /lda-model.c: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu) 2 | 3 | // This file is part of LDA-C. 4 | 5 | // LDA-C is free software; you can redistribute it and/or modify it under 6 | // the terms of the GNU General Public License as published by the Free 7 | // Software Foundation; either version 2 of the License, or (at your 8 | // option) any later version. 9 | 10 | // LDA-C is distributed in the hope that it will be useful, but WITHOUT 11 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 | // for more details. 
14 | 15 | // You should have received a copy of the GNU General Public License 16 | // along with this program; if not, write to the Free Software 17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 18 | // USA 19 | 20 | #include "lda-model.h" 21 | 22 | /* 23 | * compute MLE lda model from sufficient statistics 24 | * 25 | */ 26 | 27 | void lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha) 28 | { 29 | int k; int w; 30 | 31 | for (k = 0; k < model->num_topics; k++) 32 | { 33 | for (w = 0; w < model->num_terms; w++) 34 | { 35 | if (ss->class_word[k][w] > 0) 36 | { 37 | model->log_prob_w[k][w] = 38 | log(ss->class_word[k][w]) - 39 | log(ss->class_total[k]); 40 | } 41 | else 42 | model->log_prob_w[k][w] = -100; 43 | } 44 | } 45 | if (estimate_alpha == 1) 46 | { 47 | model->alpha = opt_alpha(ss->alpha_suffstats, 48 | ss->num_docs, 49 | model->num_topics); 50 | 51 | printf("new alpha = %5.5f\n", model->alpha); 52 | } 53 | } 54 | 55 | /* 56 | * allocate sufficient statistics 57 | * 58 | */ 59 | 60 | lda_suffstats* new_lda_suffstats(lda_model* model) 61 | { 62 | int num_topics = model->num_topics; 63 | int num_terms = model->num_terms; 64 | int i,j; 65 | 66 | lda_suffstats* ss = malloc(sizeof(lda_suffstats)); 67 | ss->class_total = malloc(sizeof(double)*num_topics); 68 | ss->class_word = malloc(sizeof(double*)*num_topics); 69 | for (i = 0; i < num_topics; i++) 70 | { 71 | ss->class_total[i] = 0; 72 | ss->class_word[i] = malloc(sizeof(double)*num_terms); 73 | for (j = 0; j < num_terms; j++) 74 | { 75 | ss->class_word[i][j] = 0; 76 | } 77 | } 78 | return(ss); 79 | } 80 | 81 | 82 | /* 83 | * various intializations for the sufficient statistics 84 | * 85 | */ 86 | 87 | void zero_initialize_ss(lda_suffstats* ss, lda_model* model) 88 | { 89 | int k, w; 90 | for (k = 0; k < model->num_topics; k++) 91 | { 92 | ss->class_total[k] = 0; 93 | for (w = 0; w < model->num_terms; w++) 94 | { 95 | ss->class_word[k][w] = 0; 96 | } 97 | } 98 | 
ss->num_docs = 0; 99 | ss->alpha_suffstats = 0; 100 | } 101 | 102 | 103 | void random_initialize_ss(lda_suffstats* ss, lda_model* model) 104 | { 105 | int num_topics = model->num_topics; 106 | int num_terms = model->num_terms; 107 | int k, n; 108 | for (k = 0; k < num_topics; k++) 109 | { 110 | for (n = 0; n < num_terms; n++) 111 | { 112 | ss->class_word[k][n] += 1.0/num_terms + myrand(); 113 | ss->class_total[k] += ss->class_word[k][n]; 114 | } 115 | } 116 | } 117 | 118 | 119 | void corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c) 120 | { 121 | int num_topics = model->num_topics; 122 | int i, j, k, d, n; 123 | document* doc; 124 | int seen[num_topics][NUM_INIT]; 125 | int already_selected; 126 | 127 | for (k = 0; k < num_topics; k++) 128 | { 129 | for (i = 0; i < NUM_INIT; i++) 130 | { 131 | do 132 | { 133 | d = floor(myrand() * c->num_docs); 134 | 135 | already_selected = 0; 136 | for (j = 0;j < k;j++) 137 | { 138 | if (seen[j][i] == d) 139 | { 140 | already_selected = 1; 141 | printf("skipping duplicate seed document %d\n", d); 142 | } 143 | } 144 | } while (already_selected); 145 | seen[k][i] = d; 146 | 147 | printf("initialized with document %d\n", d); 148 | doc = &(c->docs[d]); 149 | for (n = 0; n < doc->length; n++) 150 | { 151 | ss->class_word[k][doc->words[n]] += doc->counts[n]; 152 | } 153 | } 154 | for (n = 0; n < model->num_terms; n++) 155 | { 156 | ss->class_word[k][n] += 1.0; 157 | ss->class_total[k] = ss->class_total[k] + ss->class_word[k][n]; 158 | } 159 | } 160 | } 161 | 162 | void manual_initialize_ss(char *seedfile, lda_suffstats* ss, lda_model* model, corpus* c) 163 | { 164 | int num_topics = model->num_topics; 165 | int i, k, d, n, err; 166 | document* doc; 167 | 168 | FILE *seeds = fopen(seedfile,"r"); 169 | if (seeds == NULL) { 170 | printf("Couldn't find manual seeds in %s.\n", seedfile); 171 | exit(1); 172 | } 173 | printf("Loading seeds from %s\n", seedfile); 174 | 175 | for (k = 0; k < num_topics; k++) 176 | { 177 | 
for (i = 0; i < NUM_INIT; i++) 178 | { 179 | err = fscanf(seeds, "%d\n", &d); 180 | if (err == EOF) 181 | { 182 | printf("Ran out of seeds (%d/%d)\n", k, num_topics); 183 | exit(2); 184 | } else if (err != 1) 185 | { 186 | printf("Couldn't read a seed from ldaseeds.txt. It should have one number per line.\n"); 187 | exit(3); 188 | } 189 | 190 | printf("initialized with document %d\n", d); 191 | doc = &(c->docs[d]); 192 | for (n = 0; n < doc->length; n++) 193 | { 194 | ss->class_word[k][doc->words[n]] += doc->counts[n]; 195 | } 196 | } 197 | for (n = 0; n < model->num_terms; n++) 198 | { 199 | ss->class_word[k][n] += 1.0; 200 | ss->class_total[k] = ss->class_total[k] + ss->class_word[k][n]; 201 | } 202 | } 203 | } 204 | 205 | 206 | /* 207 | * allocate new lda model 208 | * 209 | */ 210 | 211 | lda_model* new_lda_model(int num_terms, int num_topics) 212 | { 213 | int i,j; 214 | lda_model* model; 215 | 216 | model = malloc(sizeof(lda_model)); 217 | model->num_topics = num_topics; 218 | model->num_terms = num_terms; 219 | model->alpha = 1.0; 220 | model->log_prob_w = malloc(sizeof(double*)*num_topics); 221 | for (i = 0; i < num_topics; i++) 222 | { 223 | model->log_prob_w[i] = malloc(sizeof(double)*num_terms); 224 | for (j = 0; j < num_terms; j++) 225 | model->log_prob_w[i][j] = 0; 226 | } 227 | return(model); 228 | } 229 | 230 | 231 | /* 232 | * deallocate new lda model 233 | * 234 | */ 235 | 236 | void free_lda_model(lda_model* model) 237 | { 238 | int i; 239 | 240 | for (i = 0; i < model->num_topics; i++) 241 | { 242 | free(model->log_prob_w[i]); 243 | } 244 | free(model->log_prob_w); 245 | } 246 | 247 | 248 | /* 249 | * save an lda model 250 | * 251 | */ 252 | 253 | void save_lda_model(lda_model* model, char* model_root) 254 | { 255 | char filename[100]; 256 | FILE* fileptr; 257 | int i, j; 258 | 259 | sprintf(filename, "%s.beta", model_root); 260 | fileptr = fopen(filename, "w"); 261 | for (i = 0; i < model->num_topics; i++) 262 | { 263 | for (j = 0; j < 
model->num_terms; j++) 264 | { 265 | fprintf(fileptr, " %5.10f", model->log_prob_w[i][j]); 266 | } 267 | fprintf(fileptr, "\n"); 268 | } 269 | fclose(fileptr); 270 | 271 | sprintf(filename, "%s.other", model_root); 272 | fileptr = fopen(filename, "w"); 273 | fprintf(fileptr, "num_topics %d\n", model->num_topics); 274 | fprintf(fileptr, "num_terms %d\n", model->num_terms); 275 | fprintf(fileptr, "alpha %5.10f\n", model->alpha); 276 | fclose(fileptr); 277 | } 278 | 279 | 280 | lda_model* load_lda_model(char* model_root) 281 | { 282 | char filename[100]; 283 | FILE* fileptr; 284 | int i, j, num_terms, num_topics; 285 | float x, alpha; 286 | 287 | sprintf(filename, "%s.other", model_root); 288 | printf("loading %s\n", filename); 289 | fileptr = fopen(filename, "r"); 290 | fscanf(fileptr, "num_topics %d\n", &num_topics); 291 | fscanf(fileptr, "num_terms %d\n", &num_terms); 292 | fscanf(fileptr, "alpha %f\n", &alpha); 293 | fclose(fileptr); 294 | 295 | lda_model* model = new_lda_model(num_terms, num_topics); 296 | model->alpha = alpha; 297 | 298 | sprintf(filename, "%s.beta", model_root); 299 | printf("loading %s\n", filename); 300 | fileptr = fopen(filename, "r"); 301 | for (i = 0; i < num_topics; i++) 302 | { 303 | for (j = 0; j < num_terms; j++) 304 | { 305 | fscanf(fileptr, "%f", &x); 306 | model->log_prob_w[i][j] = x; 307 | } 308 | } 309 | fclose(fileptr); 310 | return(model); 311 | } 312 | -------------------------------------------------------------------------------- /lda-model.h: -------------------------------------------------------------------------------- 1 | #ifndef LDA_MODEL_H 2 | #define LDA_MODEL_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "lda.h" 8 | #include "lda-alpha.h" 9 | #include "cokus.h" 10 | 11 | #define myrand() (double) (((unsigned long) randomMT()) / 4294967296.) 
12 | #define NUM_INIT 1 // seed documents accumulated into each topic by corpus_/manual_initialize_ss
13 |
14 | void free_lda_model(lda_model*); // frees log_prob_w rows and the row table, not the struct
15 | void save_lda_model(lda_model*, char*); // writes <root>.beta and <root>.other
16 | lda_model* new_lda_model(int, int); // arguments are (num_terms, num_topics), in that order
17 | lda_suffstats* new_lda_suffstats(lda_model* model); // sized from model->num_topics / num_terms
18 | void corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c); // seeds topics from randomly drawn documents
19 | void manual_initialize_ss(char *seedfile, lda_suffstats* ss, lda_model* model, corpus* c); // seeds topics from document indices listed in seedfile
20 | void random_initialize_ss(lda_suffstats* ss, lda_model* model); // adds 1.0/num_terms + myrand() to every count
21 | void zero_initialize_ss(lda_suffstats* ss, lda_model* model); // resets every counter to 0
22 | void lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha); // refreshes log_prob_w; re-estimates alpha when estimate_alpha == 1
23 | lda_model* load_lda_model(char* model_root); // reads <root>.other and <root>.beta into a new model
24 |
25 | #endif
26 |
--------------------------------------------------------------------------------
/lda.h:
--------------------------------------------------------------------------------
1 | // (C) Copyright 2004, David M. Blei (blei [at] cs [dot] cmu [dot] edu)
2 |
3 | // This file is part of LDA-C.
4 |
5 | // LDA-C is free software; you can redistribute it and/or modify it under
6 | // the terms of the GNU General Public License as published by the Free
7 | // Software Foundation; either version 2 of the License, or (at your
8 | // option) any later version.
9 |
10 | // LDA-C is distributed in the hope that it will be useful, but WITHOUT
11 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 | // for more details.
14 |
15 | // You should have received a copy of the GNU General Public License
16 | // along with this program; if not, write to the Free Software
17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
18 | // USA
19 |
20 | #ifndef LDA_H
21 | #define LDA_H
22 | // one document, stored as parallel arrays of term indices and their counts
23 | typedef struct
24 | {
25 |     int* words;  // words[n] is used as the term index into log_prob_w[k][...]
26 |     int* counts; // counts[n] weights entry n; parallel to words
27 |     int length;  // number of entries in words and counts
28 |     int total;   // presumably the sum of counts - TODO confirm against the corpus reader
29 | } document;
30 |
31 | // a collection of documents
32 | typedef struct
33 | {
34 |     document* docs; // indexed 0 .. num_docs-1
35 |     int num_terms;  // presumably the vocabulary size - TODO confirm against the corpus reader
36 |     int num_docs;   // number of entries in docs
37 | } corpus;
38 |
39 | // model parameters
40 | typedef struct
41 | {
42 |     double alpha;        // single Dirichlet parameter shared by all topics
43 |     double** log_prob_w; // [num_topics][num_terms]; log_prob_w[k][w] is the log weight of term w in topic k
44 |     int num_topics;
45 |     int num_terms;
46 | } lda_model;
47 |
48 | // sufficient statistics that lda_mle turns into model parameters
49 | typedef struct
50 | {
51 |     double** class_word;    // [num_topics][num_terms] accumulated mass per topic and term
52 |     double* class_total;    // [num_topics] accumulated mass per topic
53 |     double alpha_suffstats; // passed to opt_alpha when alpha is re-estimated
54 |     int num_docs;           // passed to opt_alpha together with alpha_suffstats
55 | } lda_suffstats;
56 |
57 | #endif
58 |
--------------------------------------------------------------------------------
/license.txt:
--------------------------------------------------------------------------------
1 | GNU LESSER GENERAL PUBLIC LICENSE
2 | Version 2.1, February 1999
3 |
4 | Copyright (C) 1991, 1999 Free Software Foundation, Inc.
5 | 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
6 | Everyone is permitted to copy and distribute verbatim copies
7 | of this license document, but changing it is not allowed.
8 |
9 | [This is the first released version of the Lesser GPL. It also counts
10 | as the successor of the GNU Library Public License, version 2, hence
11 | the version number 2.1.]
12 |
13 | Preamble
14 |
15 | The licenses for most software are designed to take away your
16 | freedom to share and change it. By contrast, the GNU General Public
17 | Licenses are intended to guarantee your freedom to share and change
18 | free software--to make sure the software is free for all its users.
19 | 20 | This license, the Lesser General Public License, applies to some 21 | specially designated software packages--typically libraries--of the 22 | Free Software Foundation and other authors who decide to use it. You 23 | can use it too, but we suggest you first think carefully about whether 24 | this license or the ordinary General Public License is the better 25 | strategy to use in any particular case, based on the explanations below. 26 | 27 | When we speak of free software, we are referring to freedom of use, 28 | not price. Our General Public Licenses are designed to make sure that 29 | you have the freedom to distribute copies of free software (and charge 30 | for this service if you wish); that you receive source code or can get 31 | it if you want it; that you can change the software and use pieces of 32 | it in new free programs; and that you are informed that you can do 33 | these things. 34 | 35 | To protect your rights, we need to make restrictions that forbid 36 | distributors to deny you these rights or to ask you to surrender these 37 | rights. These restrictions translate to certain responsibilities for 38 | you if you distribute copies of the library or if you modify it. 39 | 40 | For example, if you distribute copies of the library, whether gratis 41 | or for a fee, you must give the recipients all the rights that we gave 42 | you. You must make sure that they, too, receive or can get the source 43 | code. If you link other code with the library, you must provide 44 | complete object files to the recipients, so that they can relink them 45 | with the library after making changes to the library and recompiling 46 | it. And you must show them these terms so they know their rights. 47 | 48 | We protect your rights with a two-step method: (1) we copyright the 49 | library, and (2) we offer you this license, which gives you legal 50 | permission to copy, distribute and/or modify the library. 
51 | 52 | To protect each distributor, we want to make it very clear that 53 | there is no warranty for the free library. Also, if the library is 54 | modified by someone else and passed on, the recipients should know 55 | that what they have is not the original version, so that the original 56 | author's reputation will not be affected by problems that might be 57 | introduced by others. 58 | 59 | Finally, software patents pose a constant threat to the existence of 60 | any free program. We wish to make sure that a company cannot 61 | effectively restrict the users of a free program by obtaining a 62 | restrictive license from a patent holder. Therefore, we insist that 63 | any patent license obtained for a version of the library must be 64 | consistent with the full freedom of use specified in this license. 65 | 66 | Most GNU software, including some libraries, is covered by the 67 | ordinary GNU General Public License. This license, the GNU Lesser 68 | General Public License, applies to certain designated libraries, and 69 | is quite different from the ordinary General Public License. We use 70 | this license for certain libraries in order to permit linking those 71 | libraries into non-free programs. 72 | 73 | When a program is linked with a library, whether statically or using 74 | a shared library, the combination of the two is legally speaking a 75 | combined work, a derivative of the original library. The ordinary 76 | General Public License therefore permits such linking only if the 77 | entire combination fits its criteria of freedom. The Lesser General 78 | Public License permits more lax criteria for linking other code with 79 | the library. 80 | 81 | We call this license the "Lesser" General Public License because it 82 | does Less to protect the user's freedom than the ordinary General 83 | Public License. It also provides other free software developers Less 84 | of an advantage over competing non-free programs. 
These disadvantages 85 | are the reason we use the ordinary General Public License for many 86 | libraries. However, the Lesser license provides advantages in certain 87 | special circumstances. 88 | 89 | For example, on rare occasions, there may be a special need to 90 | encourage the widest possible use of a certain library, so that it becomes 91 | a de-facto standard. To achieve this, non-free programs must be 92 | allowed to use the library. A more frequent case is that a free 93 | library does the same job as widely used non-free libraries. In this 94 | case, there is little to gain by limiting the free library to free 95 | software only, so we use the Lesser General Public License. 96 | 97 | In other cases, permission to use a particular library in non-free 98 | programs enables a greater number of people to use a large body of 99 | free software. For example, permission to use the GNU C Library in 100 | non-free programs enables many more people to use the whole GNU 101 | operating system, as well as its variant, the GNU/Linux operating 102 | system. 103 | 104 | Although the Lesser General Public License is Less protective of the 105 | users' freedom, it does ensure that the user of a program that is 106 | linked with the Library has the freedom and the wherewithal to run 107 | that program using a modified version of the Library. 108 | 109 | The precise terms and conditions for copying, distribution and 110 | modification follow. Pay close attention to the difference between a 111 | "work based on the library" and a "work that uses the library". The 112 | former contains code derived from the library, whereas the latter must 113 | be combined with the library in order to run. 114 | 115 | GNU LESSER GENERAL PUBLIC LICENSE 116 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 117 | 118 | 0. 
This License Agreement applies to any software library or other 119 | program which contains a notice placed by the copyright holder or 120 | other authorized party saying it may be distributed under the terms of 121 | this Lesser General Public License (also called "this License"). 122 | Each licensee is addressed as "you". 123 | 124 | A "library" means a collection of software functions and/or data 125 | prepared so as to be conveniently linked with application programs 126 | (which use some of those functions and data) to form executables. 127 | 128 | The "Library", below, refers to any such software library or work 129 | which has been distributed under these terms. A "work based on the 130 | Library" means either the Library or any derivative work under 131 | copyright law: that is to say, a work containing the Library or a 132 | portion of it, either verbatim or with modifications and/or translated 133 | straightforwardly into another language. (Hereinafter, translation is 134 | included without limitation in the term "modification".) 135 | 136 | "Source code" for a work means the preferred form of the work for 137 | making modifications to it. For a library, complete source code means 138 | all the source code for all modules it contains, plus any associated 139 | interface definition files, plus the scripts used to control compilation 140 | and installation of the library. 141 | 142 | Activities other than copying, distribution and modification are not 143 | covered by this License; they are outside its scope. The act of 144 | running a program using the Library is not restricted, and output from 145 | such a program is covered only if its contents constitute a work based 146 | on the Library (independent of the use of the Library in a tool for 147 | writing it). Whether that is true depends on what the Library does 148 | and what the program that uses the Library does. 149 | 150 | 1. 
You may copy and distribute verbatim copies of the Library's 151 | complete source code as you receive it, in any medium, provided that 152 | you conspicuously and appropriately publish on each copy an 153 | appropriate copyright notice and disclaimer of warranty; keep intact 154 | all the notices that refer to this License and to the absence of any 155 | warranty; and distribute a copy of this License along with the 156 | Library. 157 | 158 | You may charge a fee for the physical act of transferring a copy, 159 | and you may at your option offer warranty protection in exchange for a 160 | fee. 161 | 162 | 2. You may modify your copy or copies of the Library or any portion 163 | of it, thus forming a work based on the Library, and copy and 164 | distribute such modifications or work under the terms of Section 1 165 | above, provided that you also meet all of these conditions: 166 | 167 | a) The modified work must itself be a software library. 168 | 169 | b) You must cause the files modified to carry prominent notices 170 | stating that you changed the files and the date of any change. 171 | 172 | c) You must cause the whole of the work to be licensed at no 173 | charge to all third parties under the terms of this License. 174 | 175 | d) If a facility in the modified Library refers to a function or a 176 | table of data to be supplied by an application program that uses 177 | the facility, other than as an argument passed when the facility 178 | is invoked, then you must make a good faith effort to ensure that, 179 | in the event an application does not supply such function or 180 | table, the facility still operates, and performs whatever part of 181 | its purpose remains meaningful. 182 | 183 | (For example, a function in a library to compute square roots has 184 | a purpose that is entirely well-defined independent of the 185 | application. 
Therefore, Subsection 2d requires that any 186 | application-supplied function or table used by this function must 187 | be optional: if the application does not supply it, the square 188 | root function must still compute square roots.) 189 | 190 | These requirements apply to the modified work as a whole. If 191 | identifiable sections of that work are not derived from the Library, 192 | and can be reasonably considered independent and separate works in 193 | themselves, then this License, and its terms, do not apply to those 194 | sections when you distribute them as separate works. But when you 195 | distribute the same sections as part of a whole which is a work based 196 | on the Library, the distribution of the whole must be on the terms of 197 | this License, whose permissions for other licensees extend to the 198 | entire whole, and thus to each and every part regardless of who wrote 199 | it. 200 | 201 | Thus, it is not the intent of this section to claim rights or contest 202 | your rights to work written entirely by you; rather, the intent is to 203 | exercise the right to control the distribution of derivative or 204 | collective works based on the Library. 205 | 206 | In addition, mere aggregation of another work not based on the Library 207 | with the Library (or with a work based on the Library) on a volume of 208 | a storage or distribution medium does not bring the other work under 209 | the scope of this License. 210 | 211 | 3. You may opt to apply the terms of the ordinary GNU General Public 212 | License instead of this License to a given copy of the Library. To do 213 | this, you must alter all the notices that refer to this License, so 214 | that they refer to the ordinary GNU General Public License, version 2, 215 | instead of to this License. (If a newer version than version 2 of the 216 | ordinary GNU General Public License has appeared, then you can specify 217 | that version instead if you wish.) 
Do not make any other change in 218 | these notices. 219 | 220 | Once this change is made in a given copy, it is irreversible for 221 | that copy, so the ordinary GNU General Public License applies to all 222 | subsequent copies and derivative works made from that copy. 223 | 224 | This option is useful when you wish to copy part of the code of 225 | the Library into a program that is not a library. 226 | 227 | 4. You may copy and distribute the Library (or a portion or 228 | derivative of it, under Section 2) in object code or executable form 229 | under the terms of Sections 1 and 2 above provided that you accompany 230 | it with the complete corresponding machine-readable source code, which 231 | must be distributed under the terms of Sections 1 and 2 above on a 232 | medium customarily used for software interchange. 233 | 234 | If distribution of object code is made by offering access to copy 235 | from a designated place, then offering equivalent access to copy the 236 | source code from the same place satisfies the requirement to 237 | distribute the source code, even though third parties are not 238 | compelled to copy the source along with the object code. 239 | 240 | 5. A program that contains no derivative of any portion of the 241 | Library, but is designed to work with the Library by being compiled or 242 | linked with it, is called a "work that uses the Library". Such a 243 | work, in isolation, is not a derivative work of the Library, and 244 | therefore falls outside the scope of this License. 245 | 246 | However, linking a "work that uses the Library" with the Library 247 | creates an executable that is a derivative of the Library (because it 248 | contains portions of the Library), rather than a "work that uses the 249 | library". The executable is therefore covered by this License. 250 | Section 6 states terms for distribution of such executables. 
251 | 252 | When a "work that uses the Library" uses material from a header file 253 | that is part of the Library, the object code for the work may be a 254 | derivative work of the Library even though the source code is not. 255 | Whether this is true is especially significant if the work can be 256 | linked without the Library, or if the work is itself a library. The 257 | threshold for this to be true is not precisely defined by law. 258 | 259 | If such an object file uses only numerical parameters, data 260 | structure layouts and accessors, and small macros and small inline 261 | functions (ten lines or less in length), then the use of the object 262 | file is unrestricted, regardless of whether it is legally a derivative 263 | work. (Executables containing this object code plus portions of the 264 | Library will still fall under Section 6.) 265 | 266 | Otherwise, if the work is a derivative of the Library, you may 267 | distribute the object code for the work under the terms of Section 6. 268 | Any executables containing that work also fall under Section 6, 269 | whether or not they are linked directly with the Library itself. 270 | 271 | 6. As an exception to the Sections above, you may also combine or 272 | link a "work that uses the Library" with the Library to produce a 273 | work containing portions of the Library, and distribute that work 274 | under terms of your choice, provided that the terms permit 275 | modification of the work for the customer's own use and reverse 276 | engineering for debugging such modifications. 277 | 278 | You must give prominent notice with each copy of the work that the 279 | Library is used in it and that the Library and its use are covered by 280 | this License. You must supply a copy of this License. If the work 281 | during execution displays copyright notices, you must include the 282 | copyright notice for the Library among them, as well as a reference 283 | directing the user to the copy of this License. 
Also, you must do one 284 | of these things: 285 | 286 | a) Accompany the work with the complete corresponding 287 | machine-readable source code for the Library including whatever 288 | changes were used in the work (which must be distributed under 289 | Sections 1 and 2 above); and, if the work is an executable linked 290 | with the Library, with the complete machine-readable "work that 291 | uses the Library", as object code and/or source code, so that the 292 | user can modify the Library and then relink to produce a modified 293 | executable containing the modified Library. (It is understood 294 | that the user who changes the contents of definitions files in the 295 | Library will not necessarily be able to recompile the application 296 | to use the modified definitions.) 297 | 298 | b) Use a suitable shared library mechanism for linking with the 299 | Library. A suitable mechanism is one that (1) uses at run time a 300 | copy of the library already present on the user's computer system, 301 | rather than copying library functions into the executable, and (2) 302 | will operate properly with a modified version of the library, if 303 | the user installs one, as long as the modified version is 304 | interface-compatible with the version that the work was made with. 305 | 306 | c) Accompany the work with a written offer, valid for at 307 | least three years, to give the same user the materials 308 | specified in Subsection 6a, above, for a charge no more 309 | than the cost of performing this distribution. 310 | 311 | d) If distribution of the work is made by offering access to copy 312 | from a designated place, offer equivalent access to copy the above 313 | specified materials from the same place. 314 | 315 | e) Verify that the user has already received a copy of these 316 | materials or that you have already sent this user a copy. 
317 | 318 | For an executable, the required form of the "work that uses the 319 | Library" must include any data and utility programs needed for 320 | reproducing the executable from it. However, as a special exception, 321 | the materials to be distributed need not include anything that is 322 | normally distributed (in either source or binary form) with the major 323 | components (compiler, kernel, and so on) of the operating system on 324 | which the executable runs, unless that component itself accompanies 325 | the executable. 326 | 327 | It may happen that this requirement contradicts the license 328 | restrictions of other proprietary libraries that do not normally 329 | accompany the operating system. Such a contradiction means you cannot 330 | use both them and the Library together in an executable that you 331 | distribute. 332 | 333 | 7. You may place library facilities that are a work based on the 334 | Library side-by-side in a single library together with other library 335 | facilities not covered by this License, and distribute such a combined 336 | library, provided that the separate distribution of the work based on 337 | the Library and of the other library facilities is otherwise 338 | permitted, and provided that you do these two things: 339 | 340 | a) Accompany the combined library with a copy of the same work 341 | based on the Library, uncombined with any other library 342 | facilities. This must be distributed under the terms of the 343 | Sections above. 344 | 345 | b) Give prominent notice with the combined library of the fact 346 | that part of it is a work based on the Library, and explaining 347 | where to find the accompanying uncombined form of the same work. 348 | 349 | 8. You may not copy, modify, sublicense, link with, or distribute 350 | the Library except as expressly provided under this License. 
Any 351 | attempt otherwise to copy, modify, sublicense, link with, or 352 | distribute the Library is void, and will automatically terminate your 353 | rights under this License. However, parties who have received copies, 354 | or rights, from you under this License will not have their licenses 355 | terminated so long as such parties remain in full compliance. 356 | 357 | 9. You are not required to accept this License, since you have not 358 | signed it. However, nothing else grants you permission to modify or 359 | distribute the Library or its derivative works. These actions are 360 | prohibited by law if you do not accept this License. Therefore, by 361 | modifying or distributing the Library (or any work based on the 362 | Library), you indicate your acceptance of this License to do so, and 363 | all its terms and conditions for copying, distributing or modifying 364 | the Library or works based on it. 365 | 366 | 10. Each time you redistribute the Library (or any work based on the 367 | Library), the recipient automatically receives a license from the 368 | original licensor to copy, distribute, link with or modify the Library 369 | subject to these terms and conditions. You may not impose any further 370 | restrictions on the recipients' exercise of the rights granted herein. 371 | You are not responsible for enforcing compliance by third parties with 372 | this License. 373 | 374 | 11. If, as a consequence of a court judgment or allegation of patent 375 | infringement or for any other reason (not limited to patent issues), 376 | conditions are imposed on you (whether by court order, agreement or 377 | otherwise) that contradict the conditions of this License, they do not 378 | excuse you from the conditions of this License. If you cannot 379 | distribute so as to satisfy simultaneously your obligations under this 380 | License and any other pertinent obligations, then as a consequence you 381 | may not distribute the Library at all. 
For example, if a patent 382 | license would not permit royalty-free redistribution of the Library by 383 | all those who receive copies directly or indirectly through you, then 384 | the only way you could satisfy both it and this License would be to 385 | refrain entirely from distribution of the Library. 386 | 387 | If any portion of this section is held invalid or unenforceable under any 388 | particular circumstance, the balance of the section is intended to apply, 389 | and the section as a whole is intended to apply in other circumstances. 390 | 391 | It is not the purpose of this section to induce you to infringe any 392 | patents or other property right claims or to contest validity of any 393 | such claims; this section has the sole purpose of protecting the 394 | integrity of the free software distribution system which is 395 | implemented by public license practices. Many people have made 396 | generous contributions to the wide range of software distributed 397 | through that system in reliance on consistent application of that 398 | system; it is up to the author/donor to decide if he or she is willing 399 | to distribute software through any other system and a licensee cannot 400 | impose that choice. 401 | 402 | This section is intended to make thoroughly clear what is believed to 403 | be a consequence of the rest of this License. 404 | 405 | 12. If the distribution and/or use of the Library is restricted in 406 | certain countries either by patents or by copyrighted interfaces, the 407 | original copyright holder who places the Library under this License may add 408 | an explicit geographical distribution limitation excluding those countries, 409 | so that distribution is permitted only in or among countries not thus 410 | excluded. In such case, this License incorporates the limitation as if 411 | written in the body of this License. 412 | 413 | 13. 
The Free Software Foundation may publish revised and/or new 414 | versions of the Lesser General Public License from time to time. 415 | Such new versions will be similar in spirit to the present version, 416 | but may differ in detail to address new problems or concerns. 417 | 418 | Each version is given a distinguishing version number. If the Library 419 | specifies a version number of this License which applies to it and 420 | "any later version", you have the option of following the terms and 421 | conditions either of that version or of any later version published by 422 | the Free Software Foundation. If the Library does not specify a 423 | license version number, you may choose any version ever published by 424 | the Free Software Foundation. 425 | 426 | 14. If you wish to incorporate parts of the Library into other free 427 | programs whose distribution conditions are incompatible with these, 428 | write to the author to ask for permission. For software which is 429 | copyrighted by the Free Software Foundation, write to the Free 430 | Software Foundation; we sometimes make exceptions for this. Our 431 | decision will be guided by the two goals of preserving the free status 432 | of all derivatives of our free software and of promoting the sharing 433 | and reuse of software generally. 434 | 435 | NO WARRANTY 436 | 437 | 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO 438 | WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 439 | EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR 440 | OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY 441 | KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE 442 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 443 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE 444 | LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME 445 | THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 
446 | 447 | 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN 448 | WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY 449 | AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU 450 | FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR 451 | CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE 452 | LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING 453 | RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A 454 | FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF 455 | SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH 456 | DAMAGES. 457 | 458 | END OF TERMS AND CONDITIONS 459 | 460 | How to Apply These Terms to Your New Libraries 461 | 462 | If you develop a new library, and you want it to be of the greatest 463 | possible use to the public, we recommend making it free software that 464 | everyone can redistribute and change. You can do so by permitting 465 | redistribution under these terms (or, alternatively, under the terms of the 466 | ordinary General Public License). 467 | 468 | To apply these terms, attach the following notices to the library. It is 469 | safest to attach them to the start of each source file to most effectively 470 | convey the exclusion of warranty; and each file should have at least the 471 | "copyright" line and a pointer to where the full notice is found. 472 | 473 | 474 | Copyright (C) 475 | 476 | This library is free software; you can redistribute it and/or 477 | modify it under the terms of the GNU Lesser General Public 478 | License as published by the Free Software Foundation; either 479 | version 2.1 of the License, or (at your option) any later version. 480 | 481 | This library is distributed in the hope that it will be useful, 482 | but WITHOUT ANY WARRANTY; without even the implied warranty of 483 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU 484 | Lesser General Public License for more details. 485 | 486 | You should have received a copy of the GNU Lesser General Public 487 | License along with this library; if not, write to the Free Software 488 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 489 | 490 | Also add information on how to contact you by electronic and paper mail. 491 | 492 | You should also get your employer (if you work as a programmer) or your 493 | school, if any, to sign a "copyright disclaimer" for the library, if 494 | necessary. Here is a sample; alter the names: 495 | 496 | Yoyodyne, Inc., hereby disclaims all copyright interest in the 497 | library `Frob' (a library for tweaking knobs) written by James Random Hacker. 498 | 499 | , 1 April 1990 500 | Ty Coon, President of Vice 501 | 502 | That's all there is to it! 503 | 504 | 505 | -------------------------------------------------------------------------------- /readme.txt: -------------------------------------------------------------------------------- 1 | *************************** 2 | LATENT DIRICHLET ALLOCATION 3 | *************************** 4 | 5 | David M. Blei 6 | blei[at]cs.princeton.edu 7 | 8 | (C) Copyright 2006, David M. Blei (blei [at] cs [dot] princeton [dot] edu) 9 | 10 | This file is part of LDA-C. 11 | 12 | LDA-C is free software; you can redistribute it and/or modify it under 13 | the terms of the GNU General Public License as published by the Free 14 | Software Foundation; either version 2 of the License, or (at your 15 | option) any later version. 16 | 17 | LDA-C is distributed in the hope that it will be useful, but WITHOUT 18 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 19 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 20 | for more details. 
21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program; if not, write to the Free Software 24 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 25 | USA 26 | 27 | ------------------------------------------------------------------------ 28 | 29 | This is a C implementation of latent Dirichlet allocation (LDA), a 30 | model of discrete data which is fully described in Blei et al. (2003) 31 | (http://www.cs.berkeley.edu/~blei/papers/blei03a.pdf). 32 | 33 | LDA is a hierarchical probabilistic model of documents. Let \alpha be 34 | a scalar and \beta_{1:K} be K distributions of words (called "topics"). 35 | As implemented here, a K topic LDA model assumes the following 36 | generative process of an N word document: 37 | 38 | 1. \theta | \alpha ~ Dirichlet(\alpha, ..., \alpha) 39 | 40 | 2. for each word n = {1, ..., N}: 41 | 42 | a. Z_n | \theta ~ Mult(\theta) 43 | 44 | b. W_n | z_n, \beta ~ Mult(\beta_{z_n}) 45 | 46 | This code implements variational inference of \theta and z_{1:N} for a 47 | document, and estimation of the topics \beta_{1:K} and Dirichlet 48 | parameter \alpha. 49 | 50 | ------------------------------------------------------------------------ 51 | 52 | 53 | TABLE OF CONTENTS 54 | 55 | 56 | A. COMPILING 57 | 58 | B. TOPIC ESTIMATION 59 | 60 | 1. SETTINGS FILE 61 | 62 | 2. DATA FILE FORMAT 63 | 64 | C. INFERENCE 65 | 66 | D. PRINTING TOPICS 67 | 68 | E. QUESTIONS, COMMENTS, PROBLEMS, UPDATE ANNOUNCEMENTS 69 | 70 | 71 | ------------------------------------------------------------------------ 72 | 73 | A. COMPILING 74 | 75 | Type "make" in a shell. 76 | 77 | 78 | ------------------------------------------------------------------------ 79 | 80 | B. TOPIC ESTIMATION 81 | 82 | Estimate the model by executing: 83 | 84 | lda est [alpha] [k] [settings] [data] [random/seeded/manual=filename/*] [directory] 85 | 86 | The term [random/seeded/*] > describes how the topics will be 87 | initialized. 
"Random" initializes each topic randomly; "seeded" 88 | initializes each topic to a distribution smoothed from a randomly 89 | chosen document; "manual=filename" will load the document numbers to 90 | use as seeds from the file specified (one per line); or, you can 91 | specify a model name to load a pre-existing model as the initial model 92 | (this is useful to continue EM from where it left off). To change the 93 | number of initial documents used, edit lda-estimate.c. 94 | 95 | The model (i.e., \alpha and \beta_{1:K}) and variational posterior 96 | Dirichlet parameters will be saved in the specified directory every 97 | ten iterations. Additionally, there will be a log file for the 98 | likelihood bound and convergence score at each iteration. The 99 | algorithm runs until that score is less than "em_convergence" (from 100 | the settings file) or "em_max_iter" iterations are reached. (To 101 | change the lag between saved models, edit lda-estimate.c.) 102 | 103 | The saved models are in two files: 104 | 105 | <iteration>.other contains alpha. 106 | 107 | <iteration>.beta contains the log of the topic distributions. 108 | Each line is a topic; in line k, each entry is log p(w | z=k) 109 | 110 | The variational posterior Dirichlets are in: 111 | 112 | <iteration>.gamma 113 | 114 | The settings file and data format are described below. 115 | 116 | 117 | 1. Settings file 118 | 119 | See settings.txt for a sample. See inf-settings.txt for an example of 120 | a settings file for inference. These are placeholder values; they 121 | should be experimented with. 122 | 123 | This is of the following form: 124 | 125 | var max iter [integer e.g., 10 or -1] 126 | var convergence [float e.g., 1e-8] 127 | em max iter [integer e.g., 100] 128 | em convergence [float e.g., 1e-5] 129 | alpha [fixed/estimate] 130 | 131 | where the settings are 132 | 133 | [var max iter] 134 | 135 | The maximum number of iterations of coordinate ascent variational 136 | inference for a single document.
A value of -1 indicates "full" 137 | variational inference, until the variational convergence 138 | criterion is met. 139 | 140 | [var convergence] 141 | 142 | The convergence criterion for variational inference. Stop if 143 | (score_old - score) / abs(score_old) is less than this value (or 144 | after the maximum number of iterations). Note that the score is 145 | the lower bound on the likelihood for a particular document. 146 | 147 | [em max iter] 148 | 149 | The maximum number of iterations of variational EM. 150 | 151 | [em convergence] 152 | 153 | The convergence criterion for variational EM. Stop if (score_old - 154 | score) / abs(score_old) is less than this value (or after the 155 | maximum number of iterations). Note that "score" is the lower 156 | bound on the likelihood for the whole corpus. 157 | 158 | [alpha] 159 | 160 | If set to [fixed] then alpha does not change from iteration to 161 | iteration. If set to [estimate], then alpha is estimated along 162 | with the topic distributions. 163 | 164 | 165 | 2. Data format 166 | 167 | Under LDA, the words of each document are assumed exchangeable. Thus, 168 | each document is succinctly represented as a sparse vector of word 169 | counts. The data is a file where each line is of the form: 170 | 171 | [M] [term_1]:[count] [term_2]:[count] ... [term_N]:[count] 172 | 173 | where [M] is the number of unique terms in the document, and the 174 | [count] associated with each term is how many times that term appeared 175 | in the document. Note that [term_1] is an integer which indexes the 176 | term; it is not a string. 177 | 178 | 179 | ------------------------------------------------------------------------ 180 | 181 | C. INFERENCE 182 | 183 | To perform inference on a different set of data (in the same format as 184 | for estimation), execute: 185 | 186 | lda inf [settings] [model] [data] [name] 187 | 188 | Variational inference is performed on the data using the model in 189 | [model].* (see above).
Two files will be created: [name].gamma holds 190 | the variational Dirichlet parameters for each document; 191 | [name].likelihood is the bound on the likelihood for each document. 192 | 193 | 194 | ------------------------------------------------------------------------ 195 | 196 | D. PRINTING TOPICS 197 | 198 | The Python script topics.py lets you print out the top N 199 | words from each topic in a .beta file. Usage is: 200 | 201 | python topics.py <beta file> <vocab file> <n words> 202 | 203 | 204 | ------------------------------------------------------------------------ 205 | 206 | E. QUESTIONS, COMMENTS, PROBLEMS, AND UPDATE ANNOUNCEMENTS 207 | 208 | Please join the topic-models mailing list, 209 | topic-models@lists.cs.princeton.edu. 210 | 211 | To join, go to http://lists.cs.princeton.edu and click on 212 | "topic-models." 213 | -------------------------------------------------------------------------------- /settings.txt: -------------------------------------------------------------------------------- 1 | var max iter 20 2 | var convergence 1e-6 3 | em max iter 100 4 | em convergence 1e-4 5 | alpha estimate 6 | -------------------------------------------------------------------------------- /todo.txt: -------------------------------------------------------------------------------- 1 | -- figure out why the likelihood is positive in the short ap documents 2 | 3 | -- fix learning alpha 4 | 5 | (a) save alpha 6 | (b) start alpha intelligently, or from the previous value of alpha 7 | (c) fix convergence criteria of the optimizer 8 | 9 | -- deal with settings in a cleaner way 10 | -------------------------------------------------------------------------------- /topics.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | 3 | # usage: python topics.py 4 | # 5 | # is output from the lda-c code 6 | # is a list of words, one per line 7 | # is the number of words to print from each topic 8 | 9 | import sys 10 | 11 | def print_topics(beta_file, vocab_file, nwords = 25): 12 | 13 | # get the vocabulary 14 | 15 | vocab = file(vocab_file, 'r').readlines() 16 | # vocab = map(lambda x: x.split()[0], vocab) 17 | vocab = map(lambda x: x.strip(), vocab) 18 | 19 | # for each line in the beta file 20 | 21 | indices = range(len(vocab)) 22 | topic_no = 0 23 | for topic in file(beta_file, 'r'): 24 | print 'topic %03d' % topic_no 25 | topic = map(float, topic.split()) 26 | indices.sort(lambda x,y: -cmp(topic[x], topic[y])) 27 | for i in range(nwords): 28 | print ' %s' % vocab[indices[i]] 29 | topic_no = topic_no + 1 30 | print '\n' 31 | 32 | if (__name__ == '__main__'): 33 | 34 | if (len(sys.argv) != 4): 35 | print 'usage: python topics.py \n' 36 | sys.exit(1) 37 | 38 | beta_file = sys.argv[1] 39 | vocab_file = sys.argv[2] 40 | nwords = int(sys.argv[3]) 41 | print_topics(beta_file, vocab_file, nwords) 42 | -------------------------------------------------------------------------------- /utils.c: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | 3 | /* 4 | * given log(a) and log(b), return log(a + b) 5 | * 6 | */ 7 | 8 | double log_sum(double log_a, double log_b) 9 | { 10 | double v; 11 | 12 | if (log_a < log_b) 13 | { 14 | v = log_b+log(1 + exp(log_a-log_b)); 15 | } 16 | else 17 | { 18 | v = log_a+log(1 + exp(log_b-log_a)); 19 | } 20 | return(v); 21 | } 22 | 23 | /** 24 | * Proc to calculate the value of the trigamma, the second 25 | * derivative of the loggamma function. Accepts positive matrices. 26 | * From Abromowitz and Stegun. Uses formulas 6.4.11 and 6.4.12 with 27 | * recurrence formula 6.4.6. Each requires workspace at least 5 28 | * times the size of X. 
29 | * 30 | **/ 31 | 32 | double trigamma(double x) 33 | { 34 | double p; 35 | int i; 36 | 37 | x=x+6; 38 | p=1/(x*x); 39 | p=(((((0.075757575757576*p-0.033333333333333)*p+0.0238095238095238) 40 | *p-0.033333333333333)*p+0.166666666666667)*p+1)/x+0.5*p; 41 | for (i=0; i<6 ;i++) 42 | { 43 | x=x-1; 44 | p=1/(x*x)+p; 45 | } 46 | return(p); 47 | } 48 | 49 | 50 | /* 51 | * taylor approximation of first derivative of the log gamma function 52 | * (series in p=1/x^2 at the shifted argument x+6; coefficients 1/240, 1/252, 1/120, 1/12) 53 | */ 54 | 55 | double digamma(double x) 56 | { 57 | double p; 58 | x=x+6; 59 | p=1/(x*x); 60 | p=(((0.004166666666667*p-0.003968253968254)*p+ 61 | 0.008333333333333)*p-0.083333333333333)*p; 62 | p=p+log(x)-0.5/x-1/(x-1)-1/(x-2)-1/(x-3)-1/(x-4)-1/(x-5)-1/(x-6); 63 | return p; 64 | } 65 | 66 | /* log Gamma(x) for x > 0: Stirling series evaluated at x+6, then stepped back to x by recurrence */ 67 | double log_gamma(double x) 68 | { 69 | double z; 70 | x=x+6; /* shift first so the asymptotic series is accurate */ 71 | z=1/(x*x); /* series variable must be built from the shifted x */ 72 | z=(((-0.000595238095238*z+0.000793650793651) 73 | *z-0.002777777777778)*z+0.083333333333333)/x; 74 | z=(x-0.5)*log(x)-x+0.918938533204673+z-log(x-1)- 75 | log(x-2)-log(x-3)-log(x-4)-log(x-5)-log(x-6); 76 | return z; 77 | } 78 | 79 | 80 | 81 | /* 82 | * make directory 83 | * 84 | */ 85 | 86 | void make_directory(char* name) 87 | { 88 | mkdir(name, S_IRUSR|S_IWUSR|S_IXUSR); 89 | } 90 | 91 | 92 | /* 93 | * argmax 94 | * 95 | */ 96 | 97 | int argmax(double* x, int n) 98 | { 99 | int i; 100 | double max = x[0]; 101 | int argmax = 0; 102 | for (i = 1; i < n; i++) 103 | { 104 | if (x[i] > max) 105 | { 106 | max = x[i]; 107 | argmax = i; 108 | } 109 | } 110 | return(argmax); 111 | } 112 | -------------------------------------------------------------------------------- /utils.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_H 2 | #define UTILS_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | double log_sum(double log_a, double log_b); 12 | double trigamma(double x); 13 | double digamma(double x); 14 | double log_gamma(double x); 15 | void make_directory(char* name); 16
| int argmax(double* x, int n); 17 | 18 | #endif 19 | --------------------------------------------------------------------------------