├── Makefile
├── README
├── README.md
├── TAGS
├── corpus.c
├── corpus.h
├── ctm
├── ctm-topics.py
├── ctm.c
├── ctm.h
├── estimate.c
├── gsl-wrappers.c
├── gsl-wrappers.h
├── inf-settings.txt
├── inference.c
├── inference.h
├── lasso-graph.r
├── params.c
├── params.h
├── r-interface.c
├── r-interface.h
└── settings.txt

/Makefile:
--------------------------------------------------------------------------------
 1 | .SUFFIXES: .c .u
 2 | CC= gcc
 3 | 
 4 | CFLAGS = -g -Wall -O3 -DHAVE_INLINE -DGSL_RANGE_CHECK_OFF
 5 | LDFLAGS = -lm -lgsl -latlas -lgslcblas
 6 | # LDFLAGS = -lm -lgsl -latlas -lcblas
 7 | LOBJECTS= inference.o gsl-wrappers.o ctm.o estimate.o corpus.o params.o
 8 | LSOURCE= inference.c gsl-wrappers.c ctm.c estimate.c corpus.c params.c
 9 | 
10 | mac: $(LOBJECTS)
11 | 	$(CC) $(LOBJECTS) -o ctm $(LDFLAGS)
12 | 
13 | linux: $(LOBJECTS)
14 | 	$(CC) $(LOBJECTS) -o ctm $(LDFLAGS)
15 | 
16 | debug: $(LOBJECTS)
17 | 	$(CC) $(LOBJECTS) -o ctm $(LDFLAGS)
18 | 
19 | clean:
20 | 	-rm -f *.o
21 | 
--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
 1 | ---------------------------
 2 | Correlated Topic Model in C
 3 | ---------------------------
 4 | 
 5 | David M. Blei and John D. Lafferty
 6 | blei[at]cs.princeton.edu
 7 | 
 8 | (C) Copyright 2007, David M. Blei and John D. Lafferty
 9 | 
10 | This file is part of CTM-C.
11 | 
12 | CTM-C is free software; you can redistribute it and/or modify it under
13 | the terms of the GNU General Public License as published by the Free
14 | Software Foundation; either version 2 of the License, or (at your
15 | option) any later version.
16 | 
17 | CTM-C is distributed in the hope that it will be useful, but WITHOUT
18 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
20 | for more details.
21 | 
22 | You should have received a copy of the GNU General Public License
23 | along with this program; if not, write to the Free Software
24 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
25 | USA
26 | 
27 | ----
28 | 
29 | This is a C implementation of the correlated topic model (CTM) from
30 | Blei and Lafferty (2007). This code requires the GSL library.
31 | 
32 | Any questions or comments about this code should be sent to the topic
33 | models mailing list, which is a forum for discussing topic models in
34 | general. To join, go to http://lists.cs.princeton.edu and click on
35 | "topic-models." DO NOT EMAIL EITHER OF THE AUTHORS WITH QUESTIONS
36 | ABOUT THIS CODE. ALL QUESTIONS WILL BE ANSWERED ON THE MAILING LIST.
37 | 
38 | ------------------------------------------------------------------------
39 | 
40 | TABLE OF CONTENTS
41 | 
42 | A. COMPILING
43 | 
44 | B. DATA FORMAT
45 | 
46 | C. MODEL ESTIMATION
47 | 
48 | D. MODEL EXAMINATION
49 |    1. output of estimation
50 |    2. viewing the topics with ctm-topics.py
51 |    3. using lasso-graph.r
52 | 
53 | E. POSTERIOR INFERENCE ON NEW DOCUMENTS
54 | 
55 | ------------------------------------------------------------------------
56 | 
57 | A. COMPILING
58 | 
59 | Type "make" in a shell. Note: the Makefile currently points to the
60 | (inefficient) GSL version of the BLAS. You will probably want to
61 | point to the BLAS library on your machine.
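For example, on a machine with ATLAS installed you might change the
LDFLAGS line of the Makefile to link against your system's CBLAS rather
than the GSL BLAS, along the lines of the commented-out alternative
already in the Makefile:

     LDFLAGS = -lm -lgsl -latlas -lcblas

The exact library names depend on how BLAS is installed on your machine.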
62 | 
63 | ------------------------------------------------------------------------
64 | 
65 | B. Data format
66 | 
67 | Under the CTM, the words of each document are assumed exchangeable.
68 | Thus, each document is succinctly represented as a sparse vector of
69 | word counts. The data is a file where each line is of the form:
70 | 
71 |      [M] [term_1]:[count_1] [term_2]:[count_2] ... [term_N]:[count_N]
72 | 
73 | * [M] is the number of unique terms in the document
74 | 
75 | * [term_i] is an integer associated with the i-th term in the
76 |   vocabulary.
77 | 
78 | * [count_i] is how many times the i-th term appeared in the document.
79 | 
80 | ------------------------------------------------------------------------
81 | 
82 | C. Estimating a model
83 | 
84 | The command to estimate a model is:
85 | 
86 |      ./ctm est <dataset> <# topics> <rand/seed/model> <dir> <settings>
87 | 
88 | For example:
89 | 
90 |      ./ctm est my-training-data.dat 10 seed CTM10 settings.txt
91 | 
92 | - <dataset> is the file described above in part B.
93 | 
94 | - <# topics> is the desired number of topics into which to decompose
95 |   the documents
96 | 
97 | - <rand/seed/model> indicates how to initialize EM: randomly, seeded,
98 |   or from a partially fit model. If from a model, type the name of
99 |   the model into the command line, rather than the word "model." For
100 |   example, if your model was in the directory "CTM100" and had the
101 |   prefix "010" then you'd type "CTM100/010" for the starting point of
102 |   EM. (We recommend using "seed" to begin with.)
103 | 
104 | - <dir> is the directory in which to place the files associated with
105 |   this run of variational EM. (See part D below.)
106 | 
107 | - <settings> is a settings file. For example, the settings.txt file
108 |   is good for EM and looks like this:
109 | 
110 |      em max iter 1000
111 |      var max iter 20
112 |      cg max iter -1
113 |      em convergence 1e-3
114 |      var convergence 1e-6
115 |      cg convergence 1e-6
116 |      lag 10
117 |      covariance estimate mle
118 | 
119 | The first item ("em max iter") is the maximum number of EM
120 | iterations.
121 | 
122 | The second item ("var max iter") is the maximum number of variational iterations,
123 | i.e., passes through each variational parameter (-1 indicates to
124 | iterate until the convergence criterion is met).
125 | 
126 | The third item ("cg max iter") is the maximum number of conjugate
127 | gradient iterations in fitting the variational mean and variance per
128 | document.
129 | 
130 | Items 4-6 are convergence criteria for EM, variational inference,
131 | and conjugate gradient, respectively.
132 | 
133 | The 7th item ("lag") is the multiple of iterations of EM after which
134 | to save a version of the model. This is useful, for example, if you
135 | want to monitor how the model changes from iteration to iteration.
136 | 
137 | The 8th item ("covariance estimate") is what technique to estimate
138 | the covariance with. The choices are "mle" or "shrinkage."
139 | Additional R code is provided in this directory to implement L1
140 | regularization of the topic covariance matrix as described in Blei
141 | and Lafferty (2007).
142 | 
143 | ------------------------------------------------------------------------
144 | 
145 | D. MODEL EXAMINATION
146 | 
147 | 1. Once EM has converged, the model directory will be populated with
148 | several files that can be used to examine the resulting model fit, for
149 | example to make topic graph figures or compute similarity between
150 | documents.
151 | 
152 | All the files are stored in row major format. They can be read into R
153 | with the command:
154 | 
155 |      x <- matrix(scan(FILENAME), byrow=T, nrow=NR, ncol=NC),
156 | 
157 | where FILENAME is the file, NR is the number of rows, and NC is the
158 | number of columns.
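For example, if a model was fit with 10 topics, its topic matrix
final-log-beta.dat (described below) could be read with

     beta <- matrix(scan("final-log-beta.dat"), byrow=T, nrow=10, ncol=V)

where V is the size of the vocabulary.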
159 | 
160 | Let K be the number of topics and V be the number of words in the
161 | vocabulary. The files are as follows:
162 | 
163 | final-cov.dat, final-inv-cov.dat, final-log-det-inv-cov.dat: These are
164 | files corresponding to the (K-1) x (K-1) covariance matrix between
165 | topics. Note that this code implements the logistic normal where a
166 | (K-1)-dimensional Gaussian is mapped to the K-1 simplex. (This is
167 | slightly different from the treatment in the paper, where a
168 | K-dimensional Gaussian is mapped to the K-1 simplex.)
169 | 
170 | final-mu.dat: This is the K-1 mean vector of the logistic normal
171 | over topic proportions.
172 | 
173 | final-log-beta.dat: This is a K x V topic matrix. The ith row
174 | contains the log probabilities of the words for the ith topic.
175 | Combined with a vector of words in order, this can be used to
176 | inspect the top N words from each topic.
177 | 
178 | final-lambda.dat: This is a D x K matrix of the variational mean
179 | parameter for each document's topic proportions.
180 | 
181 | final-nu.dat: This is the D x K matrix of the variational variance
182 | parameter for each document in the collection.
183 | 
184 | likelihood.dat: This is a record of the likelihood bound at each
185 | iteration of EM. The columns are: iteration, likelihood bound,
186 | convergence criterion, time in seconds of the iteration, average
187 | number of variational iterations per document, and the percentage of
188 | documents that reached the variational convergence criterion.
189 | 
190 | 2. The script in ctm-topics.py lists the top N words from each topic.
191 | To use, you need the NumPy package installed. Execute
192 | 
193 |      python ctm-topics.py final-log-beta.dat vocab.dat 25
194 | 
195 | where vocab.dat is a file with one word per line ordered according to
196 | the numbering in your data. This will print out the top 25 words from
197 | each topic.
198 | 
199 | 3. Finally, the file lasso-graph.r provides R code to build graphs of
200 | topics using the lasso. Details are in the file.
201 | 
202 | ------------------------------------------------------------------------
203 | 
204 | E. POSTERIOR INFERENCE ON NEW DOCUMENTS
205 | 
206 | To perform posterior inference on a set of documents with the same
207 | vocabulary, run the command
208 | 
209 |      ./ctm inf <dataset> <model-prefix> <results-prefix> <settings>
210 | 
211 | For example:
212 | 
213 |      ./ctm inf holdout.dat CTM10/final CTM10/holdout inf-settings.txt
214 | 
215 | This will result in a number of files with the prefix <results-prefix>.
216 | They are as follows:
217 | 
218 | - inf-lambda.dat, inf-nu.dat: as above.
219 | 
220 | - inf-ctm-lhood.dat: the likelihood bound for each document
221 | 
222 | - inf-phi-sum.dat: A D x K matrix of the sum of the phi variables for
223 |   each document. This gives an idea about how many words are
224 |   associated with each topic.
225 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Correlated topic model
2 | 
3 | This is a C implementation of the correlated topic model (CTM), a topic model for text or other discrete data that models correlation between the occurrence of different topics in a document.
The CTM is fully described in [Blei and Lafferty (2007)](http://www.cs.columbia.edu/~blei/papers/BleiLafferty2007.pdf). (For an implementation of a related topic model, latent Dirichlet allocation, click [here](https://github.com/Blei-Lab/lda-c).) 4 | 5 | An example of the correlated topic model applied to a corpus of the journal Science can be found [here](http://www.cs.cmu.edu/~lemur/science/). 6 | 7 | Note that this code requires the [Gnu Scientific Library](http://www.gnu.org/software/gsl/). 8 | 9 | ## Downloads 10 | 11 | View the [README](https://github.com/Blei-Lab/ctm-c/blob/master/README). 12 | 13 | Fork or clone this repository. 14 | 15 | ## Questions, bug fixes, and updates 16 | 17 | Please join the topic-models mailing list to ask questions about this code, be notified of updates, and to discuss the CTM and related techniques. 18 | 19 | To join, click [here](https://lists.cs.princeton.edu/mailman/listinfo/topic-models). 20 | -------------------------------------------------------------------------------- /TAGS: -------------------------------------------------------------------------------- 1 | 2 | corpus.c,148 3 | corpus* read_data(33,950 4 | void print_doc(78,2102 5 | void write_corpus(95,2355 6 | void init_doc(115,2764 7 | int remove_word(136,3153 8 | void split(158,3495 9 | 10 | corpus.h,160 11 | #define CORPUS_H21,805 12 | #define OFFSET 25,842 13 | typedef struct doc doc32,921 14 | } doc;doc37,1006 15 | typedef struct corpus corpus45,1067 16 | } corpus;corpus49,1137 17 | 18 | ctm.c,282 19 | llna_model* new_llna_model(43,1255 20 | llna_ss * new_llna_ss(60,1688 21 | void del_llna_ss(73,2023 22 | void reset_llna_ss(81,2159 23 | void write_ss(90,2334 24 | llna_model* corpus_init(102,2604 25 | llna_model* random_init(166,4398 26 | llna_model* read_llna_model(208,5392 27 | void write_llna_model(250,6601 28 | 29 | ctm.h,223 30 | #define LLNA_H22,804 31 | #define NUM_INIT 29,915 32 | #define SEED_INIT_SMOOTH 30,934 33 | typedef struct llna_modelllna_model37,993 34 | } llna_model;llna_model45,1156 35 | typedef struct llna_ssllna_ss53,1233 36 | } llna_ss;llna_ss59,1351 37 | 38 | estimate.c,270 39 | void expectation(49,1314 40 | void cov_shrinkage(103,2950 41 | void maximization(176,5012 42 | llna_model* em_initial_model(229,6333 43 | void em(243,6697 44 | void inference(339,9616 45 | void within_doc_split(392,11439 46 | int pod_experiment(416,12140 47 | void count(474,14015 48 | int main(494,14321 49 | 50 | gsl-wrappers.c,559 51 | double safe_log(27,959 52 | double log_sum(34,1047 53 | double vget(52,1287 54 | void vset(58,1367 55 | void vinc(64,1444 56 | double mget(70,1524 57 | void mset(76,1614 58 | void minc(82,1701 59 | void col_sum(88,1794 60 | void vprint(99,2007 61 | void vfprint(108,2143 62 | void mprint(119,2312 63 | void scanf_vector(131,2498 64 | void scanf_matrix(140,2664 65 | void printf_vector(148,2830 66 | void printf_matrix(157,3004 67 | void matrix_inverse(166,3179 68 | double log_det(184,3560 69 | void sym_eigen(205,3951 70 | double sum(220,4321 71 | void center(231,4497 72 | void normalize(241,4663 73 | double norm(251,4829 74 | int argmax(262,4991 75 | 76 | gsl-wrappers.h,30 77 | #define GSL_WRAPPERS_H21,811 78 | 79 | inference.c,810 80 | gsl_vector ** temp;50,1618 81 | int ntemp 51,1638 82 | void init_temp_vectors(53,1654 83 | double expect_mult_norm(67,1852 84 | void lhood_bnd(80,2142 85 | int opt_zeta(134,3737 86 | void opt_phi(151,3994 87 | void fdf_lambda(183,4796 88 | double f_lambda(190,4944 89 | void df_lambda(218,5936 90 | int opt_lambda(252,6928 91 | 
double f_nu_i(328,9142 92 | double df_nu_i(341,9439 93 | double d2f_nu_i(354,9735 94 | void opt_nu(365,9984 95 | double fixed_point_iter_i(375,10165 96 | void opt_nu_i(388,10465 97 | void init_var_unif(423,11347 98 | void init_var(442,11768 99 | llna_var_param * new_llna_var_param(460,12074 100 | void free_llna_var_param(473,12441 101 | double var_inference(483,12627 102 | void update_expected_ss(522,13696 103 | double sample_term(561,14730 104 | double sample_lhood(605,15878 105 | void expected_theta(644,16839 106 | double log_mult_prob(699,18337 107 | void write_word_assignment(721,18809 108 | 109 | inference.h,220 110 | #define LLNA_INFERENCE_H21,813 111 | #define NEWTON_THRESH 23,839 112 | typedef struct llna_var_param llna_var_param34,1028 113 | } llna_var_param;llna_var_param44,1256 114 | typedef struct bundle bundle47,1276 115 | } bundle;bundle52,1389 116 | 117 | params.c,108 118 | llna_params PARAMS;26,869 119 | void read_params(28,890 120 | void print_params(49,1668 121 | void default_params(62,2119 122 | 123 | params.h,148 124 | #define PARAMS_H21,805 125 | #define MLE 23,823 126 | #define SHRINK 24,837 127 | typedef struct llna_paramsllna_params26,855 128 | } llna_params;llna_params36,1065 129 | 130 | r-interface.c,216 131 | llna_model global_mod;29,973 132 | double r_mtx_get(37,1093 133 | gsl_matrix * r_to_gsl_matrix(43,1186 134 | gsl_vector * r_to_gsl_vector(60,1478 135 | void r_set_mod(79,1726 136 | void r_lhood_bound(102,2444 137 | void r_var_inference(132,3170 138 | 139 | r-interface.h,34 140 | #define LLNA_R_INTERFACE_H21,815 141 | -------------------------------------------------------------------------------- /corpus.c: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2007, David M. Blei and John D. Lafferty 2 | 3 | // This file is part of CTM-C. 4 | 5 | // CTM-C is free software; you can redistribute it and/or modify it under 6 | // the terms of the GNU General Public License as published by the Free 7 | // Software Foundation; either version 2 of the License, or (at your 8 | // option) any later version. 9 | 10 | // CTM-C is distributed in the hope that it will be useful, but WITHOUT 11 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 | // for more details. 
14 | 15 | // You should have received a copy of the GNU General Public License 16 | // along with this program; if not, write to the Free Software 17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 18 | // USA 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #include "corpus.h" 27 | 28 | /* 29 | * read corpus from a file 30 | * 31 | */ 32 | 33 | corpus* read_data(const char* data_filename) 34 | { 35 | FILE *fileptr; 36 | int length, count, word, n, nd, nw, corpus_total = 0; 37 | corpus* c; 38 | 39 | printf("reading data from %s\n", data_filename); 40 | c = malloc(sizeof(corpus)); 41 | fileptr = fopen(data_filename, "r"); 42 | nd = 0; nw = 0; 43 | c->docs = malloc(sizeof(doc) * 1); 44 | while ((fscanf(fileptr, "%10d", &length) != EOF)) 45 | { 46 | c->docs = (doc*) realloc(c->docs, sizeof(doc)*(nd+1)); 47 | c->docs[nd].nterms = length; 48 | c->docs[nd].total = 0; 49 | c->docs[nd].word = malloc(sizeof(int)*length); 50 | c->docs[nd].count = malloc(sizeof(int)*length); 51 | for (n = 0; n < length; n++) 52 | { 53 | fscanf(fileptr, "%10d:%10d", &word, &count); 54 | word = word - OFFSET; 55 | c->docs[nd].word[n] = word; 56 | c->docs[nd].count[n] = count; 57 | c->docs[nd].total += count; 58 | if (word >= nw) { nw = word + 1; } 59 | } 60 | corpus_total += c->docs[nd].total; 61 | nd++; 62 | } 63 | fclose(fileptr); 64 | c->ndocs = nd; 65 | c->nterms = nw; 66 | printf("number of docs : %d\n", nd); 67 | printf("number of terms : %d\n", nw); 68 | printf("total : %d\n", corpus_total); 69 | return(c); 70 | } 71 | 72 | 73 | /* 74 | * print document 75 | * 76 | */ 77 | 78 | void print_doc(doc* d) 79 | { 80 | int i; 81 | printf("total : %d\n", d->total); 82 | printf("nterm : %d\n", d->nterms); 83 | for (i = 0; i < d->nterms; i++) 84 | { 85 | printf("%d:%d ", d->word[i], d->count[i]); 86 | } 87 | } 88 | 89 | 90 | /* 91 | * write a corpus to file 92 | * 93 | */ 94 | 95 | void write_corpus(corpus* c, char* filename) 96 | { 97 | int i, j; 98 | FILE * fileptr; 99 | doc * d; 100 | 101 | fileptr = fopen(filename, "w"); 102 | for (i = 0; i < c->ndocs; i++) 103 | { 104 | d = &(c->docs[i]); 105 | fprintf(fileptr, "%d", d->nterms); 106 | for (j = 0; j < d->nterms; j++) 107 | { 108 | fprintf(fileptr, " %d:%d", d->word[j], d->count[j]); 109 | } 110 | fprintf(fileptr, "\n"); 111 | } 112 | } 113 | 114 | 115 | void init_doc(doc* d, int max_nterms) 116 | { 117 | int i; 118 | d->nterms = 0; 119 | d->total = 0; 120 | d->word = malloc(sizeof(int) * max_nterms); 121 | d->count = malloc(sizeof(int) * max_nterms); 122 | for (i = 0; i < max_nterms; i++) 123 | { 124 | d->word[i] = 0; 125 | d->count[i] = 0; 126 | } 127 | } 128 | 129 | 130 | /* 131 | * return the 'n'th word of a document 132 | * (note order has been lost in the representation) 133 | * 134 | */ 135 | 136 | int remove_word(int n, doc* d) 137 | { 138 | int i = -1, word, pos = 0; 139 | do 140 | { 141 | i++; 142 | pos += d->count[i]; 143 | word = d->word[i]; 144 | } 145 | while (pos <= n); 146 | d->total--; 147 | d->count[i]--; 148 | assert(d->count[i] >= 0); 149 | return(word); 150 | } 151 | 152 | 153 | /* 154 | * randomly move some proportion of words from one document to another 155 | * 156 | */ 157 | 158 | void split(doc* orig, doc* dest, double prop) 159 | { 160 | int w, i, nwords; 161 | 162 | gsl_rng * r = gsl_rng_alloc(gsl_rng_taus); 163 | time_t seed; 164 | time(&seed); 165 | gsl_rng_set(r, (long) seed); 166 | 167 | nwords = floor((double) prop * orig->total); 168 | if (nwords == 0) nwords = 1; 169 | 
init_doc(dest, nwords); 170 | for (i = 0; i < nwords; i++) 171 | { 172 | w = remove_word(floor(gsl_rng_uniform(r)*orig->total), orig); 173 | dest->total++; 174 | dest->nterms++; 175 | dest->word[i] = w; 176 | dest->count[i] = 1; 177 | } 178 | } 179 | -------------------------------------------------------------------------------- /corpus.h: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2007, David M. Blei and John D. Lafferty 2 | 3 | // This file is part of CTM-C. 4 | 5 | // CTM-C is free software; you can redistribute it and/or modify it under 6 | // the terms of the GNU General Public License as published by the Free 7 | // Software Foundation; either version 2 of the License, or (at your 8 | // option) any later version. 9 | 10 | // CTM-C is distributed in the hope that it will be useful, but WITHOUT 11 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 | // for more details. 14 | 15 | // You should have received a copy of the GNU General Public License 16 | // along with this program; if not, write to the Free Software 17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 18 | // USA 19 | 20 | #ifndef CORPUS_H 21 | #define CORPUS_H 22 | 23 | #include 24 | 25 | #define OFFSET 0 26 | 27 | /* 28 | * a document is a collection of counts and terms 29 | * 30 | */ 31 | 32 | typedef struct doc { 33 | int total; 34 | int nterms; 35 | int * word; 36 | int * count; 37 | } doc; 38 | 39 | 40 | /* 41 | * a corpus is a collection of documents 42 | * 43 | */ 44 | 45 | typedef struct corpus { 46 | doc* docs; 47 | int nterms; 48 | int ndocs; 49 | } corpus; 50 | 51 | 52 | /* 53 | * functions 54 | * 55 | */ 56 | 57 | corpus* read_data(const char*); 58 | void print_doc(doc* d); 59 | void split(doc* orig, doc* dest, double prop); 60 | void write_corpus(corpus* c, char* filename); 61 | 62 | #endif 63 | -------------------------------------------------------------------------------- /ctm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blei-lab/ctm-c/dfa139c3dac5d10059429f33faf90401a04125ea/ctm -------------------------------------------------------------------------------- /ctm-topics.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | 3 | # usage: python topics.py 4 | 5 | import sys, numpy 6 | 7 | def print_topics(beta_file, vocab_file, 8 | nwords = 25, out = sys.stdout): 9 | 10 | # get the vocabulary 11 | 12 | vocab = file(vocab_file, 'r').readlines() 13 | vocab = map(lambda x: x.split()[0], vocab) 14 | 15 | indices = range(len(vocab)) 16 | topic = numpy.array(map(float, file(beta_file, 'r').readlines())) 17 | 18 | nterms = len(vocab) 19 | ntopics = len(topic)/nterms 20 | topic = numpy.reshape(topic, [ntopics, nterms]) 21 | for i in range(ntopics): 22 | out.write('\ntopic %03d\n' % i) 23 | indices.sort(lambda x,y: -cmp(topic[i,x], topic[i,y])) 24 | for j in range(nwords): 25 | out.write(' '+vocab[indices[j]]+'\n') 26 | 27 | 28 | if (__name__ == '__main__'): 29 | beta_file = sys.argv[1] 30 | vocab_file = sys.argv[2] 31 | nwords = int(sys.argv[3]) 32 | print_topics(beta_file, vocab_file, nwords) 33 | -------------------------------------------------------------------------------- /ctm.c: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2007, David M. 
Blei and John D. Lafferty 2 | 3 | // This file is part of CTM-C. 4 | 5 | // CTM-C is free software; you can redistribute it and/or modify it under 6 | // the terms of the GNU General Public License as published by the Free 7 | // Software Foundation; either version 2 of the License, or (at your 8 | // option) any later version. 9 | 10 | // CTM-C is distributed in the hope that it will be useful, but WITHOUT 11 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 | // for more details. 14 | 15 | // You should have received a copy of the GNU General Public License 16 | // along with this program; if not, write to the Free Software 17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 18 | // USA 19 | 20 | /************************************************************************* 21 | * 22 | * llna.c 23 | * 24 | * reading, writing, and initializing a logistic normal allocation model 25 | * 26 | *************************************************************************/ 27 | 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | 34 | #include "gsl-wrappers.h" 35 | #include "corpus.h" 36 | #include "ctm.h" 37 | 38 | /* 39 | * create a new empty model 40 | * 41 | */ 42 | 43 | llna_model* new_llna_model(int ntopics, int nterms) 44 | { 45 | llna_model* model = malloc(sizeof(llna_model)); 46 | model->k = ntopics; 47 | model->mu = gsl_vector_calloc(ntopics - 1); 48 | model->cov = gsl_matrix_calloc(ntopics-1, ntopics-1); 49 | model->inv_cov = gsl_matrix_calloc(ntopics-1, ntopics-1); 50 | model->log_beta = gsl_matrix_calloc(ntopics, nterms); 51 | return(model); 52 | } 53 | 54 | 55 | /* 56 | * create and delete sufficient statistics 57 | * 58 | */ 59 | 60 | llna_ss * new_llna_ss(llna_model* model) 61 | { 62 | llna_ss * ss; 63 | ss = malloc(sizeof(llna_ss)); 64 | ss->mu_ss = gsl_vector_calloc(model->k-1); 65 | ss->cov_ss = gsl_matrix_calloc(model->k-1, model->k-1); 66 | ss->beta_ss = gsl_matrix_calloc(model->k, model->log_beta->size2); 67 | ss->ndata = 0; 68 | reset_llna_ss(ss); 69 | return(ss); 70 | } 71 | 72 | 73 | void del_llna_ss(llna_ss * ss) 74 | { 75 | gsl_vector_free(ss->mu_ss); 76 | gsl_matrix_free(ss->cov_ss); 77 | gsl_matrix_free(ss->beta_ss); 78 | } 79 | 80 | 81 | void reset_llna_ss(llna_ss * ss) 82 | { 83 | gsl_matrix_set_all(ss->beta_ss, 0); 84 | gsl_matrix_set_all(ss->cov_ss, 0); 85 | gsl_vector_set_all(ss->mu_ss, 0); 86 | ss->ndata = 0; 87 | } 88 | 89 | 90 | void write_ss(llna_ss * ss) 91 | { 92 | printf_matrix("cov_ss", ss->cov_ss); 93 | printf_matrix("beta_ss", ss->beta_ss); 94 | printf_vector("mu_ss", ss->mu_ss); 95 | } 96 | /* 97 | * initialize a model with zero-mean, diagonal covariance gaussian and 98 | * topics seeded from the corpus 99 | * 100 | */ 101 | 102 | llna_model* corpus_init(int ntopics, corpus* corpus) 103 | { 104 | llna_model* model = new_llna_model(ntopics, corpus->nterms); 105 | gsl_rng * r = gsl_rng_alloc(gsl_rng_taus); 106 | doc* doc; 107 | int i, k, n, d; 108 | double sum; 109 | time_t seed; 110 | time(&seed); 111 | printf("SEED = %ld\n", seed); 112 | printf("USING 1115574245\n"); 113 | gsl_rng_set(r, (long) 1115574245); 114 | // gsl_rng_set(r, (long) seed); 115 | // gsl_rng_set(r, (long) 432403824); 116 | 117 | // gaussian 118 | for (i = 0; i < ntopics-1; i++) 119 | { 120 | vset(model->mu, i, 0); 121 | mset(model->cov, i, i, 1.0); 122 | } 123 | matrix_inverse(model->cov, model->inv_cov); 124 | model->log_det_inv_cov = 
log_det(model->inv_cov); 125 | 126 | // topics 127 | for (k = 0; k < ntopics; k++) 128 | { 129 | sum = 0; 130 | // seed 131 | for (i = 0; i < NUM_INIT; i++) 132 | { 133 | d = floor(gsl_rng_uniform(r)*corpus->ndocs); 134 | printf("initialized with document %d\n", d); 135 | doc = &(corpus->docs[d]); 136 | for (n = 0; n < doc->nterms; n++) 137 | { 138 | minc(model->log_beta, k, doc->word[n], (double) doc->count[n]); 139 | } 140 | } 141 | // smooth 142 | for (n = 0; n < model->log_beta->size2; n++) 143 | { 144 | minc(model->log_beta, k, n, SEED_INIT_SMOOTH + gsl_rng_uniform(r)); 145 | // minc(model->log_beta, k, n, SEED_INIT_SMOOTH); 146 | sum += mget(model->log_beta, k, n); 147 | } 148 | sum = safe_log(sum); 149 | // normalize 150 | for (n = 0; n < model->log_beta->size2; n++) 151 | { 152 | mset(model->log_beta, k, n, 153 | safe_log(mget(model->log_beta, k, n)) - sum); 154 | } 155 | } 156 | gsl_rng_free(r); 157 | return(model); 158 | } 159 | 160 | /* 161 | * random initialization means zero-mean, diagonal covariance gaussian 162 | * and randomly generated topics 163 | * 164 | */ 165 | 166 | llna_model* random_init(int ntopics, int nterms) 167 | { 168 | int i, j; 169 | double sum, val; 170 | llna_model* model = new_llna_model(ntopics, nterms); 171 | gsl_rng * r = gsl_rng_alloc(gsl_rng_taus); 172 | long t1; 173 | (void) time(&t1); 174 | // !!! DEBUG 175 | // t1 = gsl_rng_set(r, (long) 1115574245); 176 | printf("RANDOM SEED = %ld\n", t1); 177 | gsl_rng_set(r, t1); 178 | 179 | for (i = 0; i < ntopics-1; i++) 180 | { 181 | vset(model->mu, i, 0); 182 | mset(model->cov, i, i, 1.0); 183 | } 184 | for (i = 0; i < ntopics; i++) 185 | { 186 | sum = 0; 187 | for (j = 0; j < nterms; j++) 188 | { 189 | val = gsl_rng_uniform(r) + 1.0/100; 190 | sum += val; 191 | mset(model->log_beta, i, j, val); 192 | } 193 | for (j = 0; j < nterms; j++) 194 | mset(model->log_beta, i, j, log(mget(model->log_beta, i, j) / sum)); 195 | } 196 | matrix_inverse(model->cov, model->inv_cov); 197 | model->log_det_inv_cov = log_det(model->inv_cov); 198 | 199 | gsl_rng_free(r); 200 | return(model); 201 | } 202 | 203 | /* 204 | * read a model 205 | * 206 | */ 207 | 208 | llna_model* read_llna_model(char * root) 209 | { 210 | char filename[200]; 211 | FILE* fileptr; 212 | llna_model* model; 213 | int ntopics, nterms; 214 | 215 | // read parameters 216 | sprintf(filename, "%s-param.txt", root); 217 | printf("reading params from %s\n", filename); 218 | fileptr = fopen(filename, "r"); 219 | fscanf(fileptr, "num_topics %d\n", &ntopics); 220 | fscanf(fileptr, "num_terms %d\n", &nterms); 221 | fclose(fileptr); 222 | printf("%d topics, %d terms\n", ntopics, nterms); 223 | // allocate model 224 | model = new_llna_model(ntopics, nterms); 225 | // read gaussian 226 | printf("reading gaussian\n"); 227 | sprintf(filename, "%s-mu.dat", root); 228 | scanf_vector(filename, model->mu); 229 | sprintf(filename, "%s-cov.dat", root); 230 | scanf_matrix(filename, model->cov); 231 | sprintf(filename, "%s-inv-cov.dat", root); 232 | scanf_matrix(filename, model->inv_cov); 233 | sprintf(filename, "%s-log-det-inv-cov.dat", root); 234 | fileptr = fopen(filename, "r"); 235 | fscanf(fileptr, "%lf\n", &(model->log_det_inv_cov)); 236 | fclose(fileptr); 237 | // read topic matrix 238 | printf("reading topics\n"); 239 | sprintf(filename, "%s-log-beta.dat", root); 240 | scanf_matrix(filename, model->log_beta); 241 | 242 | return(model); 243 | } 244 | 245 | /* 246 | * write a model 247 | * 248 | */ 249 | 250 | void write_llna_model(llna_model * model, char * root) 251 | 
{ 252 | char filename[200]; 253 | FILE* fileptr; 254 | 255 | // write parameters 256 | printf("writing params\n"); 257 | sprintf(filename, "%s-param.txt", root); 258 | fileptr = fopen(filename, "w"); 259 | fprintf(fileptr, "num_topics %d\n", model->k); 260 | fprintf(fileptr, "num_terms %d\n", (int) model->log_beta->size2); 261 | fclose(fileptr); 262 | // write gaussian 263 | printf("writing gaussian\n"); 264 | sprintf(filename, "%s-mu.dat", root); 265 | printf_vector(filename, model->mu); 266 | sprintf(filename, "%s-cov.dat", root); 267 | printf_matrix(filename, model->cov); 268 | sprintf(filename, "%s-inv-cov.dat", root); 269 | printf_matrix(filename, model->inv_cov); 270 | sprintf(filename, "%s-log-det-inv-cov.dat", root); 271 | fileptr = fopen(filename, "w"); 272 | fprintf(fileptr, "%lf\n", model->log_det_inv_cov); 273 | fclose(fileptr); 274 | // write topic matrix 275 | printf("writing topics\n"); 276 | sprintf(filename, "%s-log-beta.dat", root); 277 | printf_matrix(filename, model->log_beta); 278 | } 279 | -------------------------------------------------------------------------------- /ctm.h: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2007, David M. Blei and John D. Lafferty 2 | 3 | // This file is part of CTM-C. 4 | 5 | // CTM-C is free software; you can redistribute it and/or modify it under 6 | // the terms of the GNU General Public License as published by the Free 7 | // Software Foundation; either version 2 of the License, or (at your 8 | // option) any later version. 9 | 10 | // CTM-C is distributed in the hope that it will be useful, but WITHOUT 11 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 | // for more details. 14 | 15 | // You should have received a copy of the GNU General Public License 16 | // along with this program; if not, write to the Free Software 17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 18 | // USA 19 | 20 | 21 | #ifndef LLNA_H 22 | #define LLNA_H 23 | 24 | #include 25 | #include 26 | #include 27 | #include "corpus.h" 28 | 29 | #define NUM_INIT 1 30 | #define SEED_INIT_SMOOTH 1.0 31 | 32 | /* 33 | * the llna model 34 | * 35 | */ 36 | 37 | typedef struct llna_model 38 | { 39 | int k; 40 | gsl_matrix * log_beta; 41 | gsl_vector * mu; 42 | gsl_matrix * inv_cov; 43 | gsl_matrix * cov; 44 | double log_det_inv_cov; 45 | } llna_model; 46 | 47 | 48 | /* 49 | * sufficient statistics for mle of an llna model 50 | * 51 | */ 52 | 53 | typedef struct llna_ss 54 | { 55 | gsl_matrix * cov_ss; 56 | gsl_vector * mu_ss; 57 | gsl_matrix * beta_ss; 58 | double ndata; 59 | } llna_ss; 60 | 61 | 62 | /* 63 | * function declarations 64 | * 65 | */ 66 | 67 | llna_model* read_llna_model(char*); 68 | void write_llna_model(llna_model*, char*); 69 | llna_model* new_llna_model(int, int); 70 | llna_model* random_init(int, int); 71 | llna_model* corpus_init(int, corpus*); 72 | llna_ss * new_llna_ss(llna_model*); 73 | void del_llna_ss(llna_ss*); 74 | void reset_llna_ss(llna_ss*); 75 | void write_ss(llna_ss*); 76 | 77 | #endif 78 | -------------------------------------------------------------------------------- /estimate.c: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2007, David M. Blei and John D. Lafferty 2 | 3 | // This file is part of CTM-C. 
4 | 5 | // CTM-C is free software; you can redistribute it and/or modify it under 6 | // the terms of the GNU General Public License as published by the Free 7 | // Software Foundation; either version 2 of the License, or (at your 8 | // option) any later version. 9 | 10 | // CTM-C is distributed in the hope that it will be useful, but WITHOUT 11 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 | // for more details. 14 | 15 | // You should have received a copy of the GNU General Public License 16 | // along with this program; if not, write to the Free Software 17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 18 | // USA 19 | 20 | /************************************************************************* 21 | * 22 | * llna.c 23 | * 24 | * estimation of an llna model by variational em 25 | * 26 | *************************************************************************/ 27 | 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | 36 | #include "corpus.h" 37 | #include "ctm.h" 38 | #include "inference.h" 39 | #include "gsl-wrappers.h" 40 | #include "params.h" 41 | 42 | extern llna_params PARAMS; 43 | 44 | /* 45 | * e step 46 | * 47 | */ 48 | 49 | void expectation(corpus* corpus, llna_model* model, llna_ss* ss, 50 | double* avg_niter, double* total_lhood, 51 | gsl_matrix* corpus_lambda, gsl_matrix* corpus_nu, 52 | gsl_matrix* corpus_phi_sum, 53 | short reset_var, double* converged_pct) 54 | { 55 | int i; 56 | llna_var_param* var; 57 | doc doc; 58 | double lhood, total; 59 | gsl_vector lambda, nu; 60 | gsl_vector* phi_sum; 61 | 62 | *avg_niter = 0.0; 63 | *converged_pct = 0; 64 | phi_sum = gsl_vector_alloc(model->k); 65 | total = 0; 66 | for (i = 0; i < corpus->ndocs; i++) 67 | { 68 | printf("doc %5d ", i); 69 | doc = corpus->docs[i]; 70 | var = new_llna_var_param(doc.nterms, model->k); 71 | if (reset_var) 72 | init_var_unif(var, &doc, model); 73 | else 74 | { 75 | lambda = gsl_matrix_row(corpus_lambda, i).vector; 76 | nu= gsl_matrix_row(corpus_nu, i).vector; 77 | init_var(var, &doc, model, &lambda, &nu); 78 | } 79 | lhood = var_inference(var, &doc, model); 80 | update_expected_ss(var, &doc, ss); 81 | total += lhood; 82 | printf("lhood %5.5e niter %5d\n", lhood, var->niter); 83 | *avg_niter += var->niter; 84 | *converged_pct += var->converged; 85 | gsl_matrix_set_row(corpus_lambda, i, var->lambda); 86 | gsl_matrix_set_row(corpus_nu, i, var->nu); 87 | col_sum(var->phi, phi_sum); 88 | gsl_matrix_set_row(corpus_phi_sum, i, phi_sum); 89 | free_llna_var_param(var); 90 | } 91 | gsl_vector_free(phi_sum); 92 | *avg_niter = *avg_niter / corpus->ndocs; 93 | *converged_pct = *converged_pct / corpus->ndocs; 94 | *total_lhood = total; 95 | } 96 | 97 | 98 | /* 99 | * m step 100 | * 101 | */ 102 | 103 | void cov_shrinkage(gsl_matrix* mle, int n, gsl_matrix* result) 104 | { 105 | int p = mle->size1, i; 106 | double temp = 0, alpha = 0, tau = 0, log_lambda_s = 0; 107 | gsl_vector 108 | *lambda_star = gsl_vector_calloc(p), 109 | t, u, 110 | *eigen_vals = gsl_vector_calloc(p), 111 | *s_eigen_vals = gsl_vector_calloc(p); 112 | gsl_matrix 113 | *d = gsl_matrix_calloc(p,p), 114 | *eigen_vects = gsl_matrix_calloc(p,p), 115 | *s_eigen_vects = gsl_matrix_calloc(p,p), 116 | *result1 = gsl_matrix_calloc(p,p); 117 | 118 | // get eigen decomposition 119 | 120 | sym_eigen(mle, eigen_vals, eigen_vects); 121 | for (i = 0; i < p; i++) 122 | { 123 | 
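      // (the shrunken value computed in this loop is lambda*_i = n * l_i / (n + p + 1 - 2i),
      // where l_i is the i-th eigenvalue of the MLE covariance)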
124 | // compute shrunken eigenvalues 125 | 126 | temp = 0; 127 | alpha = 1.0/(n+p+1-2*i); 128 | vset(lambda_star, i, n * alpha * vget(eigen_vals, i)); 129 | } 130 | 131 | // get diagonal mle and eigen decomposition 132 | 133 | t = gsl_matrix_diagonal(d).vector; 134 | u = gsl_matrix_diagonal(mle).vector; 135 | gsl_vector_memcpy(&t, &u); 136 | sym_eigen(d, s_eigen_vals, s_eigen_vects); 137 | 138 | // compute tau^2 139 | 140 | for (i = 0; i < p; i++) 141 | log_lambda_s += log(vget(s_eigen_vals, i)); 142 | log_lambda_s = log_lambda_s/p; 143 | for (i = 0; i < p; i++) 144 | tau += pow(log(vget(lambda_star, i)) - log_lambda_s, 2)/(p + 4) - 2.0 / n; 145 | 146 | // shrink \lambda* towards the structured eigenvalues 147 | 148 | for (i = 0; i < p; i++) 149 | vset(lambda_star, i, 150 | exp((2.0/n)/((2.0/n) + tau) * log_lambda_s + 151 | tau/((2.0/n) + tau) * log(vget(lambda_star, i)))); 152 | 153 | // put the eigenvalues in a diagonal matrix 154 | 155 | t = gsl_matrix_diagonal(d).vector; 156 | gsl_vector_memcpy(&t, lambda_star); 157 | 158 | // reconstruct the covariance matrix 159 | 160 | gsl_blas_dgemm(CblasNoTrans, CblasTrans, 1, d, eigen_vects, 0, result1); 161 | gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1, eigen_vects, result1, 0, result); 162 | 163 | // clean up 164 | 165 | gsl_vector_free(lambda_star); 166 | gsl_vector_free(eigen_vals); 167 | gsl_vector_free(s_eigen_vals); 168 | gsl_matrix_free(d); 169 | gsl_matrix_free(eigen_vects); 170 | gsl_matrix_free(s_eigen_vects); 171 | gsl_matrix_free(result1); 172 | } 173 | 174 | 175 | 176 | void maximization(llna_model* model, llna_ss* ss) 177 | { 178 | int i, j; 179 | double sum; 180 | 181 | // mean maximization 182 | 183 | for (i = 0; i < model->k-1; i++) 184 | vset(model->mu, i, vget(ss->mu_ss, i) / ss->ndata); 185 | 186 | // covariance maximization 187 | 188 | for (i = 0; i < model->k-1; i++) 189 | { 190 | for (j = 0; j < model->k-1; j++) 191 | { 192 | mset(model->cov, i, j, 193 | (1.0 / ss->ndata) * 194 | (mget(ss->cov_ss, i, j) + 195 | ss->ndata * vget(model->mu, i) * vget(model->mu, j) - 196 | vget(ss->mu_ss, i) * vget(model->mu, j) - 197 | vget(ss->mu_ss, j) * vget(model->mu, i))); 198 | } 199 | } 200 | if (PARAMS.cov_estimate == SHRINK) 201 | { 202 | cov_shrinkage(model->cov, ss->ndata, model->cov); 203 | } 204 | matrix_inverse(model->cov, model->inv_cov); 205 | model->log_det_inv_cov = log_det(model->inv_cov); 206 | 207 | // topic maximization 208 | 209 | for (i = 0; i < model->k; i++) 210 | { 211 | sum = 0; 212 | for (j = 0; j < model->log_beta->size2; j++) 213 | sum += mget(ss->beta_ss, i, j); 214 | 215 | if (sum == 0) sum = safe_log(sum) * model->log_beta->size2; 216 | else sum = safe_log(sum); 217 | 218 | for (j = 0; j < model->log_beta->size2; j++) 219 | mset(model->log_beta, i, j, safe_log(mget(ss->beta_ss, i, j)) - sum); 220 | } 221 | } 222 | 223 | 224 | /* 225 | * run em 226 | * 227 | */ 228 | 229 | llna_model* em_initial_model(int k, corpus* corpus, char* start) 230 | { 231 | llna_model* model; 232 | printf("starting from %s\n", start); 233 | if (strcmp(start, "rand")==0) 234 | model = random_init(k, corpus->nterms); 235 | else if (strcmp(start, "seed")==0) 236 | model = corpus_init(k, corpus); 237 | else 238 | model = read_llna_model(start); 239 | return(model); 240 | } 241 | 242 | 243 | void em(char* dataset, int k, char* start, char* dir) 244 | { 245 | FILE* lhood_fptr; 246 | char string[100]; 247 | int iteration; 248 | double convergence = 1, lhood = 0, lhood_old = 0; 249 | corpus* corpus; 250 | llna_model *model; 251 | llna_ss* 
ss; 252 | time_t t1,t2; 253 | double avg_niter, converged_pct, old_conv = 0; 254 | gsl_matrix *corpus_lambda, *corpus_nu, *corpus_phi_sum; 255 | short reset_var = 1; 256 | 257 | // read the data and make the directory 258 | 259 | corpus = read_data(dataset); 260 | mkdir(dir, S_IRUSR|S_IWUSR|S_IXUSR); 261 | 262 | // set up the log likelihood log file 263 | 264 | sprintf(string, "%s/likelihood.dat", dir); 265 | lhood_fptr = fopen(string, "w"); 266 | 267 | // run em 268 | 269 | model = em_initial_model(k, corpus, start); 270 | ss = new_llna_ss(model); 271 | corpus_lambda = gsl_matrix_alloc(corpus->ndocs, model->k); 272 | corpus_nu = gsl_matrix_alloc(corpus->ndocs, model->k); 273 | corpus_phi_sum = gsl_matrix_alloc(corpus->ndocs, model->k); 274 | time(&t1); 275 | init_temp_vectors(model->k-1); // !!! hacky 276 | iteration = 0; 277 | sprintf(string, "%s/%03d", dir, iteration); 278 | write_llna_model(model, string); 279 | do 280 | { 281 | printf("***** EM ITERATION %d *****\n", iteration); 282 | 283 | expectation(corpus, model, ss, &avg_niter, &lhood, 284 | corpus_lambda, corpus_nu, corpus_phi_sum, 285 | reset_var, &converged_pct); 286 | time(&t2); 287 | convergence = (lhood_old - lhood) / lhood_old; 288 | fprintf(lhood_fptr, "%d %5.5e %5.5e %5ld %5.5f %1.5f\n", 289 | iteration, lhood, convergence, (int) t2 - t1, avg_niter, converged_pct); 290 | 291 | if (((iteration % PARAMS.lag)==0) || isnan(lhood)) 292 | { 293 | sprintf(string, "%s/%03d", dir, iteration); 294 | write_llna_model(model, string); 295 | sprintf(string, "%s/%03d-lambda.dat", dir, iteration); 296 | printf_matrix(string, corpus_lambda); 297 | sprintf(string, "%s/%03d-nu.dat", dir, iteration); 298 | printf_matrix(string, corpus_nu); 299 | } 300 | time(&t1); 301 | 302 | if (convergence < 0) 303 | { 304 | reset_var = 0; 305 | if (PARAMS.var_max_iter > 0) 306 | PARAMS.var_max_iter += 10; 307 | else PARAMS.var_convergence /= 10; 308 | } 309 | else 310 | { 311 | maximization(model, ss); 312 | lhood_old = lhood; 313 | reset_var = 1; 314 | iteration++; 315 | } 316 | 317 | fflush(lhood_fptr); 318 | reset_llna_ss(ss); 319 | old_conv = convergence; 320 | } 321 | while ((iteration < PARAMS.em_max_iter) && 322 | ((convergence > PARAMS.em_convergence) || (convergence < 0))); 323 | 324 | sprintf(string, "%s/final", dir); 325 | write_llna_model(model, string); 326 | sprintf(string, "%s/final-lambda.dat", dir); 327 | printf_matrix(string, corpus_lambda); 328 | sprintf(string, "%s/final-nu.dat", dir); 329 | printf_matrix(string, corpus_nu); 330 | fclose(lhood_fptr); 331 | } 332 | 333 | 334 | /* 335 | * load a model, and do approximate inference for each document in a corpus 336 | * 337 | */ 338 | 339 | void inference(char* dataset, char* model_root, char* out) 340 | { 341 | int i; 342 | char fname[100]; 343 | 344 | // read the data and model 345 | corpus * corpus = read_data(dataset); 346 | llna_model * model = read_llna_model(model_root); 347 | gsl_vector * lhood = gsl_vector_alloc(corpus->ndocs); 348 | gsl_matrix * corpus_nu = gsl_matrix_alloc(corpus->ndocs, model->k); 349 | gsl_matrix * corpus_lambda = gsl_matrix_alloc(corpus->ndocs, model->k); 350 | // gsl_matrix * topic_lhoods = gsl_matrix_alloc(corpus->ndocs, model->k); 351 | gsl_matrix * phi_sums = gsl_matrix_alloc(corpus->ndocs, model->k); 352 | 353 | // approximate inference 354 | init_temp_vectors(model->k-1); // !!! 
hacky 355 | sprintf(fname, "%s-word-assgn.dat", out); 356 | FILE* word_assignment_file = fopen(fname, "w"); 357 | for (i = 0; i < corpus->ndocs; i++) 358 | { 359 | doc doc = corpus->docs[i]; 360 | llna_var_param * var = new_llna_var_param(doc.nterms, model->k); 361 | init_var_unif(var, &doc, model); 362 | 363 | vset(lhood, i, var_inference(var, &doc, model)); 364 | gsl_matrix_set_row(corpus_lambda, i, var->lambda); 365 | gsl_matrix_set_row(corpus_nu, i, var->nu); 366 | gsl_vector curr_row = gsl_matrix_row(phi_sums, i).vector; 367 | col_sum(var->phi, &curr_row); 368 | write_word_assignment(word_assignment_file, &doc, var->phi); 369 | 370 | printf("document %05d, niter = %05d\n", i, var->niter); 371 | free_llna_var_param(var); 372 | } 373 | 374 | // output likelihood and some variational parameters 375 | sprintf(fname, "%s-ctm-lhood.dat", out); 376 | printf_vector(fname, lhood); 377 | sprintf(fname, "%s-lambda.dat", out); 378 | printf_matrix(fname, corpus_lambda); 379 | sprintf(fname, "%s-nu.dat", out); 380 | printf_matrix(fname, corpus_nu); 381 | sprintf(fname, "%s-phi-sum.dat", out); 382 | printf_matrix(fname, phi_sums); 383 | 384 | } 385 | 386 | 387 | /* 388 | * split documents into two random parts 389 | * 390 | */ 391 | 392 | void within_doc_split(char* dataset, char* src_data, char* dest_data, double prop) 393 | { 394 | int i; 395 | corpus * corp, * dest_corp; 396 | 397 | corp = read_data(dataset); 398 | dest_corp = malloc(sizeof(corpus)); 399 | printf("splitting %d docs\n", corp->ndocs); 400 | dest_corp->docs = malloc(sizeof(doc) * corp->ndocs); 401 | dest_corp->nterms = corp->nterms; 402 | dest_corp->ndocs = corp->ndocs; 403 | for (i = 0; i < corp->ndocs; i++) 404 | split(&(corp->docs[i]), &(dest_corp->docs[i]), prop); 405 | write_corpus(dest_corp, dest_data); 406 | write_corpus(corp, src_data); 407 | } 408 | 409 | 410 | /* 411 | * for each partially observed document: (a) perform inference on the 412 | * observations (b) take expected theta and compute likelihood 413 | * 414 | */ 415 | 416 | int pod_experiment(char* observed_data, char* heldout_data, 417 | char* model_root, char* out) 418 | { 419 | corpus *obs, *heldout; 420 | llna_model *model; 421 | llna_var_param *var; 422 | int i; 423 | gsl_vector *log_lhood, *e_theta; 424 | doc obs_doc, heldout_doc; 425 | char string[100]; 426 | double total_lhood = 0, total_words = 0, l; 427 | FILE* e_theta_file = fopen("/Users/blei/llna050_e_theta.txt", "w"); 428 | 429 | // load model and data 430 | obs = read_data(observed_data); 431 | heldout = read_data(heldout_data); 432 | assert(obs->ndocs == heldout->ndocs); 433 | model = read_llna_model(model_root); 434 | 435 | // run experiment 436 | init_temp_vectors(model->k-1); // !!! 
hacky 437 | log_lhood = gsl_vector_alloc(obs->ndocs + 1); 438 | e_theta = gsl_vector_alloc(model->k); 439 | for (i = 0; i < obs->ndocs; i++) 440 | { 441 | // get observed and heldout documents 442 | obs_doc = obs->docs[i]; 443 | heldout_doc = heldout->docs[i]; 444 | // compute variational distribution 445 | var = new_llna_var_param(obs_doc.nterms, model->k); 446 | init_var_unif(var, &obs_doc, model); 447 | var_inference(var, &obs_doc, model); 448 | expected_theta(var, &obs_doc, model, e_theta); 449 | 450 | vfprint(e_theta, e_theta_file); 451 | 452 | // approximate inference of held out data 453 | l = log_mult_prob(&heldout_doc, e_theta, model->log_beta); 454 | vset(log_lhood, i, l); 455 | total_words += heldout_doc.total; 456 | total_lhood += l; 457 | printf("hid doc %d log_lhood %5.5f\n", i, vget(log_lhood, i)); 458 | // save results? 459 | free_llna_var_param(var); 460 | } 461 | vset(log_lhood, obs->ndocs, exp(-total_lhood/total_words)); 462 | printf("perplexity : %5.10f", exp(-total_lhood/total_words)); 463 | sprintf(string, "%s-pod-llna.dat", out); 464 | printf_vector(string, log_lhood); 465 | return(0); 466 | } 467 | 468 | 469 | /* 470 | * little function to count the words in each document and spit it out 471 | * 472 | */ 473 | 474 | void count(char* corpus_name, char* output_name) 475 | { 476 | corpus *c; 477 | int i; 478 | FILE *f; 479 | int j; 480 | f = fopen(output_name, "w"); 481 | c = read_data(corpus_name); 482 | for (i = 0; i < c->ndocs; i++) 483 | { 484 | j = c->docs[i].total; 485 | fprintf(f, "%5d\n", j); 486 | } 487 | } 488 | 489 | /* 490 | * main function 491 | * 492 | */ 493 | 494 | int main(int argc, char* argv[]) 495 | { 496 | if (argc > 1) 497 | { 498 | if (strcmp(argv[1], "est")==0) 499 | { 500 | read_params(argv[6]); 501 | print_params(); 502 | em(argv[2], atoi(argv[3]), argv[4], argv[5]); 503 | return(0); 504 | } 505 | if (strcmp(argv[1], "inf")==0) 506 | { 507 | read_params(argv[5]); 508 | print_params(); 509 | inference(argv[2], argv[3], argv[4]); 510 | return(0); 511 | } 512 | } 513 | printf("usage : ctm est <# topics> \n"); 514 | printf(" ctm inf \n"); 515 | return(0); 516 | } 517 | -------------------------------------------------------------------------------- /gsl-wrappers.c: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2007, David M. Blei and John D. Lafferty 2 | 3 | // This file is part of CTM-C. 4 | 5 | // CTM-C is free software; you can redistribute it and/or modify it under 6 | // the terms of the GNU General Public License as published by the Free 7 | // Software Foundation; either version 2 of the License, or (at your 8 | // option) any later version. 9 | 10 | // CTM-C is distributed in the hope that it will be useful, but WITHOUT 11 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 | // for more details. 
14 | 15 | // You should have received a copy of the GNU General Public License 16 | // along with this program; if not, write to the Free Software 17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 18 | // USA 19 | 20 | #include "gsl-wrappers.h" 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | double safe_log(double x) 28 | { 29 | if (x == 0) return(-1000); 30 | else return(log(x)); 31 | } 32 | 33 | 34 | double log_sum(double log_a, double log_b) 35 | { 36 | double v; 37 | 38 | if (log_a == -1) return(log_b); 39 | 40 | if (log_a < log_b) 41 | { 42 | v = log_b+log(1 + exp(log_a-log_b)); 43 | } 44 | else 45 | { 46 | v = log_a+log(1 + exp(log_b-log_a)); 47 | } 48 | return(v); 49 | } 50 | 51 | 52 | double vget(const gsl_vector* v, int i) 53 | { 54 | return(gsl_vector_get(v, i)); 55 | } 56 | 57 | 58 | void vset(gsl_vector* v, int i, double x) 59 | { 60 | gsl_vector_set(v, i, x); 61 | } 62 | 63 | 64 | void vinc(gsl_vector* v, int i, double x) 65 | { 66 | vset(v, i, vget(v, i) + x); 67 | } 68 | 69 | 70 | double mget(const gsl_matrix* m, int i, int j) 71 | { 72 | return(gsl_matrix_get(m, i, j)); 73 | } 74 | 75 | 76 | void mset(gsl_matrix* m, int i, int j, double x) 77 | { 78 | gsl_matrix_set(m, i, j, x); 79 | } 80 | 81 | 82 | void minc(gsl_matrix* m, int i, int j, double x) 83 | { 84 | mset(m, i, j, mget(m, i, j) + x); 85 | } 86 | 87 | 88 | void col_sum(gsl_matrix* m, gsl_vector* val) 89 | { 90 | int i, j; 91 | gsl_vector_set_all(val, 0); 92 | 93 | for (i = 0; i < m->size1; i++) 94 | for (j = 0; j < m->size2; j++) 95 | vinc(val, j, mget(m, i, j)); 96 | } 97 | 98 | 99 | void vprint(const gsl_vector * v) 100 | { 101 | int i; 102 | for (i = 0; i < v->size; i++) 103 | printf("%5.5f ", vget(v, i)); 104 | printf("\n\n"); 105 | } 106 | 107 | 108 | void vfprint(const gsl_vector * v, FILE * f) 109 | { 110 | int i; 111 | for (i = 0; i < v->size; i++) 112 | fprintf(f, "%5.5f ", vget(v, i)); 113 | fprintf(f, "\n"); 114 | fflush(f); 115 | } 116 | 117 | 118 | 119 | void mprint(const gsl_matrix * m) 120 | { 121 | int i, j; 122 | for (i = 0; i < m->size1; i++) 123 | { 124 | for (j = 0; j < m->size2; j++) 125 | printf("%5.5f ", mget(m, i, j)); 126 | printf("\n"); 127 | } 128 | } 129 | 130 | 131 | void scanf_vector(char* filename, gsl_vector* v) 132 | { 133 | FILE* fileptr; 134 | fileptr = fopen(filename, "r"); 135 | gsl_vector_fscanf(fileptr, v); 136 | fclose(fileptr); 137 | } 138 | 139 | 140 | void scanf_matrix(char* filename, gsl_matrix * m) 141 | { 142 | FILE* fileptr; 143 | fileptr = fopen(filename, "r"); 144 | gsl_matrix_fscanf(fileptr, m); 145 | fclose(fileptr); 146 | } 147 | 148 | void printf_vector(char* filename, gsl_vector* v) 149 | { 150 | FILE* fileptr; 151 | fileptr = fopen(filename, "w"); 152 | gsl_vector_fprintf(fileptr, v, "%f"); 153 | fclose(fileptr); 154 | } 155 | 156 | 157 | void printf_matrix(char* filename, gsl_matrix * m) 158 | { 159 | FILE* fileptr; 160 | fileptr = fopen(filename, "w"); 161 | gsl_matrix_fprintf(fileptr, m, "%f"); 162 | fclose(fileptr); 163 | } 164 | 165 | 166 | void matrix_inverse(gsl_matrix* m, gsl_matrix* inverse) 167 | { 168 | gsl_matrix *lu; 169 | gsl_permutation* p; 170 | int signum; 171 | 172 | p = gsl_permutation_alloc(m->size1); 173 | lu = gsl_matrix_alloc(m->size1, m->size2); 174 | 175 | gsl_matrix_memcpy(lu, m); 176 | gsl_linalg_LU_decomp(lu, p, &signum); 177 | gsl_linalg_LU_invert(lu, p, inverse); 178 | 179 | gsl_matrix_free(lu); 180 | gsl_permutation_free(p); 181 | } 182 | 183 | 184 | double 
log_det(gsl_matrix* m) 185 | { 186 | gsl_matrix* lu; 187 | gsl_permutation* p; 188 | double result; 189 | int signum; 190 | 191 | p = gsl_permutation_alloc(m->size1); 192 | lu = gsl_matrix_alloc(m->size1, m->size2); 193 | 194 | gsl_matrix_memcpy(lu, m); 195 | gsl_linalg_LU_decomp(lu, p, &signum); 196 | result = gsl_linalg_LU_lndet(lu); 197 | 198 | gsl_matrix_free(lu); 199 | gsl_permutation_free(p); 200 | 201 | return(result); 202 | } 203 | 204 | 205 | void sym_eigen(gsl_matrix* m, gsl_vector* vals, gsl_matrix* vects) 206 | { 207 | gsl_eigen_symmv_workspace* wk; 208 | gsl_matrix* mcpy; 209 | int r; 210 | 211 | mcpy = gsl_matrix_alloc(m->size1, m->size2); 212 | wk = gsl_eigen_symmv_alloc(m->size1); 213 | gsl_matrix_memcpy(mcpy, m); 214 | r = gsl_eigen_symmv(mcpy, vals, vects, wk); 215 | gsl_eigen_symmv_free(wk); 216 | gsl_matrix_free(mcpy); 217 | } 218 | 219 | 220 | double sum(gsl_vector* v) 221 | { 222 | double *data = v->data, val = 0; 223 | int size = v->size; 224 | int i; 225 | for (i = 0; i < size; i++) 226 | val += data[i]; 227 | return(val); 228 | } 229 | 230 | 231 | void center(gsl_vector* v) 232 | { 233 | int size = v->size; 234 | double mean = sum(v)/size; 235 | int i; 236 | for (i = 0; i < size; i++) 237 | vset(v, i, vget(v,i)-mean); 238 | } 239 | 240 | 241 | void normalize(gsl_vector* v) 242 | { 243 | int size = v->size; 244 | double sum_v = sum(v); 245 | int i; 246 | for (i = 0; i < size; i++) 247 | vset(v, i, vget(v,i)/sum_v); 248 | } 249 | 250 | 251 | double norm(gsl_vector *v) 252 | { 253 | double val = 0; 254 | int i; 255 | 256 | for (i = 0; i < v->size; i++) 257 | val += vget(v, i) * vget(v, i); 258 | return(sqrt(val)); 259 | } 260 | 261 | 262 | int argmax(gsl_vector *v) 263 | { 264 | int argmax = 0; 265 | double max = vget(v, 0); 266 | int i; 267 | for (i = 1; i < v->size; i++) 268 | { 269 | double val = vget(v, i); 270 | if (val > max) 271 | { 272 | argmax = i; 273 | max = val; 274 | } 275 | } 276 | return(argmax); 277 | } 278 | -------------------------------------------------------------------------------- /gsl-wrappers.h: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2007, David M. Blei and John D. Lafferty 2 | 3 | // This file is part of CTM-C. 4 | 5 | // CTM-C is free software; you can redistribute it and/or modify it under 6 | // the terms of the GNU General Public License as published by the Free 7 | // Software Foundation; either version 2 of the License, or (at your 8 | // option) any later version. 9 | 10 | // CTM-C is distributed in the hope that it will be useful, but WITHOUT 11 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 | // for more details. 
14 | 15 | // You should have received a copy of the GNU General Public License 16 | // along with this program; if not, write to the Free Software 17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 18 | // USA 19 | 20 | #ifndef GSL_WRAPPERS_H 21 | #define GSL_WRAPPERS_H 22 | 23 | // #include 24 | #include 25 | #include 26 | #include 27 | 28 | // #define MAXFLOAT 3.40282347e+38F 29 | 30 | double safe_log(double); 31 | double log_sum(double, double); 32 | double vget(const gsl_vector*, int); 33 | void vset(gsl_vector*, int, double); 34 | void vinc(gsl_vector*, int, double); 35 | double mget(const gsl_matrix* m, int, int); 36 | void mset(gsl_matrix*, int, int, double) ; 37 | void minc(gsl_matrix*, int, int, double) ; 38 | void col_sum(gsl_matrix*, gsl_vector*); 39 | void vprint(const gsl_vector*); 40 | void mprint(const gsl_matrix*); 41 | void scanf_vector(char*, gsl_vector*); 42 | void scanf_matrix(char*, gsl_matrix*); 43 | void printf_vector(char*, gsl_vector*); 44 | void printf_matrix(char*, gsl_matrix*); 45 | double log_det(gsl_matrix*); 46 | void matrix_inverse(gsl_matrix*, gsl_matrix*); 47 | void sym_eigen(gsl_matrix*, gsl_vector*, gsl_matrix*); 48 | double sum(gsl_vector*); 49 | double norm(gsl_vector *); 50 | void vfprint(const gsl_vector * v, FILE * f); 51 | int argmax(gsl_vector *v); 52 | void center(gsl_vector* v); 53 | void normalize(gsl_vector* v); 54 | 55 | #endif 56 | -------------------------------------------------------------------------------- /inf-settings.txt: -------------------------------------------------------------------------------- 1 | em max iter 1000 2 | var max iter -1 3 | cg max iter -1 4 | em convergence 1e-5 5 | var convergence 1e-6 6 | cg convergence 1e-6 7 | lag 1 8 | covariance estimate mle 9 | -------------------------------------------------------------------------------- /inference.c: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2007, David M. Blei and John D. Lafferty 2 | 3 | // This file is part of CTM-C. 4 | 5 | // CTM-C is free software; you can redistribute it and/or modify it under 6 | // the terms of the GNU General Public License as published by the Free 7 | // Software Foundation; either version 2 of the License, or (at your 8 | // option) any later version. 9 | 10 | // CTM-C is distributed in the hope that it will be useful, but WITHOUT 11 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 | // for more details. 
14 | 15 | // You should have received a copy of the GNU General Public License 16 | // along with this program; if not, write to the Free Software 17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 18 | // USA 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | #include "gsl-wrappers.h" 30 | #include "corpus.h" 31 | #include "ctm.h" 32 | #include "params.h" 33 | #include "inference.h" 34 | 35 | extern llna_params PARAMS; 36 | 37 | double f_lambda(const gsl_vector * p, void * params); 38 | void df_lambda(const gsl_vector * p, void * params, gsl_vector * df); 39 | void fdf_lambda(const gsl_vector * p, void * params, double * f, gsl_vector * df); 40 | 41 | double f_nu(const gsl_vector * p, void * params); 42 | void df_nu(const gsl_vector * p, void * params, gsl_vector * df); 43 | void fdf_nu(const gsl_vector * p, void * params, double * f, gsl_vector * df); 44 | 45 | /* 46 | * temporary k-1 vectors so we don't have to allocate, deallocate 47 | * 48 | */ 49 | 50 | gsl_vector ** temp; 51 | int ntemp = 4; 52 | 53 | void init_temp_vectors(int size) 54 | { 55 | int i; 56 | temp = malloc(sizeof(gsl_vector *)*ntemp); 57 | for (i = 0; i < 4; i++) 58 | temp[i] = gsl_vector_alloc(size); 59 | } 60 | 61 | 62 | /* 63 | * likelihood bound 64 | * 65 | */ 66 | 67 | double expect_mult_norm(llna_var_param * var) 68 | { 69 | int i; 70 | double sum_exp = 0; 71 | int niter = var->lambda->size; 72 | 73 | for (i = 0; i < niter; i++) 74 | sum_exp += exp(vget(var->lambda, i) + (0.5) * vget(var->nu,i)); 75 | 76 | return((1.0/var->zeta) * sum_exp - 1.0 + log(var->zeta)); 77 | } 78 | 79 | 80 | void lhood_bnd(llna_var_param* var, doc* doc, llna_model* mod) 81 | { 82 | int i = 0, j = 0, k = mod->k; 83 | gsl_vector_set_zero(var->topic_scores); 84 | 85 | // E[log p(\eta | \mu, \Sigma)] + H(q(\eta | \lambda, \nu) 86 | 87 | double lhood = (0.5) * mod->log_det_inv_cov + (0.5) * (mod->k-1); 88 | for (i = 0; i < k-1; i++) 89 | { 90 | double v = - (0.5) * vget(var->nu, i) * mget(mod->inv_cov,i, i); 91 | for (j = 0; j < mod->k-1; j++) 92 | { 93 | v -= (0.5) * 94 | (vget(var->lambda, i) - vget(mod->mu, i)) * 95 | mget(mod->inv_cov, i, j) * 96 | (vget(var->lambda, j) - vget(mod->mu, j)); 97 | } 98 | v += (0.5) * log(vget(var->nu, i)); 99 | lhood += v; 100 | } 101 | 102 | // E[log p(z_n | \eta)] + E[log p(w_n | \beta)] + H(q(z_n | \phi_n)) 103 | 104 | lhood -= expect_mult_norm(var) * doc->total; 105 | for (i = 0; i < doc->nterms; i++) 106 | { 107 | // !!! we can speed this up by turning it into a dot product 108 | // !!! 
profiler says this is where some time is spent 109 | for (j = 0; j < mod->k; j++) 110 | { 111 | double phi_ij = mget(var->phi, i, j); 112 | double log_phi_ij = mget(var->log_phi, i, j); 113 | if (phi_ij > 0) 114 | { 115 | vinc(var->topic_scores, j, phi_ij * doc->count[i]); 116 | lhood += 117 | doc->count[i] * phi_ij * 118 | (vget(var->lambda, j) + 119 | mget(mod->log_beta, j, doc->word[i]) - 120 | log_phi_ij); 121 | } 122 | } 123 | } 124 | var->lhood = lhood; 125 | assert(!isnan(var->lhood)); 126 | } 127 | 128 | 129 | /** 130 | * optimize zeta 131 | * 132 | */ 133 | 134 | int opt_zeta(llna_var_param * var, doc * doc, llna_model * mod) 135 | { 136 | int i; 137 | 138 | var->zeta = 1.0; 139 | for (i = 0; i < mod->k-1; i++) 140 | var->zeta += exp(vget(var->lambda, i) + (0.5) * vget(var->nu, i)); 141 | 142 | return(0); 143 | } 144 | 145 | 146 | /** 147 | * optimize phi 148 | * 149 | */ 150 | 151 | void opt_phi(llna_var_param * var, doc * doc, llna_model * mod) 152 | { 153 | int i, n, K = mod->k; 154 | double log_sum_n = 0; 155 | 156 | // compute phi proportions in log space 157 | 158 | for (n = 0; n < doc->nterms; n++) 159 | { 160 | log_sum_n = 0; 161 | for (i = 0; i < K; i++) 162 | { 163 | mset(var->log_phi, n, i, 164 | vget(var->lambda, i) + mget(mod->log_beta, i, doc->word[n])); 165 | if (i == 0) 166 | log_sum_n = mget(var->log_phi, n, i); 167 | else 168 | log_sum_n = log_sum(log_sum_n, mget(var->log_phi, n, i)); 169 | } 170 | for (i = 0; i < K; i++) 171 | { 172 | mset(var->log_phi, n, i, mget(var->log_phi, n, i) - log_sum_n); 173 | mset(var->phi, n, i, exp(mget(var->log_phi, n, i))); 174 | } 175 | } 176 | } 177 | 178 | /** 179 | * optimize lambda 180 | * 181 | */ 182 | 183 | void fdf_lambda(const gsl_vector * p, void * params, double * f, gsl_vector * df) 184 | { 185 | *f = f_lambda(p, params); 186 | df_lambda(p, params, df); 187 | } 188 | 189 | 190 | double f_lambda(const gsl_vector * p, void * params) 191 | { 192 | double term1, term2, term3; 193 | int i; 194 | llna_var_param * var = ((bundle *) params)->var; 195 | doc * doc = ((bundle *) params)->doc; 196 | llna_model * mod = ((bundle *) params)->mod; 197 | 198 | // compute lambda^T \sum phi 199 | gsl_blas_ddot(p,((bundle *) params)->sum_phi, &term1); 200 | // compute lambda - mu (= temp1) 201 | gsl_blas_dcopy(p, temp[1]); 202 | gsl_blas_daxpy (-1.0, mod->mu, temp[1]); 203 | // compute (lambda - mu)^T Sigma^-1 (lambda - mu) 204 | gsl_blas_dsymv(CblasUpper, 1, mod->inv_cov, temp[1], 0, temp[2]); 205 | // gsl_blas_dgemv(CblasNoTrans, 1, mod->inv_cov, temp[1], 0, temp[2]); 206 | gsl_blas_ddot(temp[2], temp[1], &term2); 207 | term2 = - 0.5 * term2; 208 | // last term 209 | term3 = 0; 210 | for (i = 0; i < mod->k-1; i++) 211 | term3 += exp(vget(p, i) + (0.5) * vget(var->nu,i)); 212 | term3 = -((1.0/var->zeta) * term3 - 1.0 + log(var->zeta)) * doc->total; 213 | // negate for minimization 214 | return(-(term1+term2+term3)); 215 | } 216 | 217 | 218 | void df_lambda(const gsl_vector * p, void * params, gsl_vector * df) 219 | { 220 | // cast bundle {variational parameters, model, document} 221 | 222 | llna_var_param * var = ((bundle *) params)->var; 223 | doc * doc = ((bundle *) params)->doc; 224 | llna_model * mod = ((bundle *) params)->mod; 225 | gsl_vector * sum_phi = ((bundle *) params)->sum_phi; 226 | 227 | // compute \Sigma^{-1} (\mu - \lambda) 228 | 229 | gsl_vector_set_zero(temp[0]); 230 | gsl_blas_dcopy(mod->mu, temp[1]); 231 | gsl_vector_sub(temp[1], p); 232 | gsl_blas_dsymv(CblasLower, 1, mod->inv_cov, temp[1], 0, temp[0]); 233 | 234 | 
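  // note: together with temp[0] = \Sigma^{-1} (\mu - \lambda) computed above and
  // sum_phi = \sum_n c_n \phi_n from the bundle, the term computed next completes
  // the gradient of the bound with respect to \lambda,
  //   \Sigma^{-1} (\mu - \lambda) + \sum_n c_n \phi_n - (N/\zeta) exp(\lambda + \nu/2);
  // df is then set to its negative, matching the negated objective in f_lambda.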
// compute - (N / \zeta) * exp(\lambda + \nu^2 / 2) 235 | 236 | int i; 237 | for (i = 0; i < temp[3]->size; i++) 238 | { 239 | vset(temp[3], i, -(((double) doc->total) / var->zeta) * 240 | exp(vget(p, i) + 0.5 * vget(var->nu, i))); 241 | } 242 | 243 | // set return value (note negating derivative of bound) 244 | 245 | gsl_vector_set_all(df, 0.0); 246 | gsl_vector_sub(df, temp[0]); 247 | gsl_vector_sub(df, sum_phi); 248 | gsl_vector_sub(df, temp[3]); 249 | } 250 | 251 | 252 | int opt_lambda(llna_var_param * var, doc * doc, llna_model * mod) 253 | { 254 | gsl_multimin_function_fdf lambda_obj; 255 | const gsl_multimin_fdfminimizer_type * T; 256 | gsl_multimin_fdfminimizer * s; 257 | bundle b; 258 | int iter = 0, i, j; 259 | int status; 260 | double f_old, converged; 261 | 262 | b.var = var; 263 | b.doc = doc; 264 | b.mod = mod; 265 | 266 | // precompute \sum_n \phi_n and put it in the bundle 267 | 268 | b.sum_phi = gsl_vector_alloc(mod->k-1); 269 | gsl_vector_set_zero(b.sum_phi); 270 | for (i = 0; i < doc->nterms; i++) 271 | { 272 | for (j = 0; j < mod->k-1; j++) 273 | { 274 | vset(b.sum_phi, j, 275 | vget(b.sum_phi, j) + 276 | ((double) doc->count[i]) * mget(var->phi, i, j)); 277 | } 278 | } 279 | 280 | lambda_obj.f = &f_lambda; 281 | lambda_obj.df = &df_lambda; 282 | lambda_obj.fdf = &fdf_lambda; 283 | lambda_obj.n = mod->k-1; 284 | lambda_obj.params = (void *)&b; 285 | 286 | // starting value 287 | // T = gsl_multimin_fdfminimizer_vector_bfgs; 288 | T = gsl_multimin_fdfminimizer_conjugate_fr; 289 | // T = gsl_multimin_fdfminimizer_steepest_descent; 290 | s = gsl_multimin_fdfminimizer_alloc (T, mod->k-1); 291 | 292 | gsl_vector* x = gsl_vector_calloc(mod->k-1); 293 | for (i = 0; i < mod->k-1; i++) vset(x, i, vget(var->lambda, i)); 294 | gsl_multimin_fdfminimizer_set (s, &lambda_obj, x, 0.01, 1e-3); 295 | do 296 | { 297 | iter++; 298 | f_old = s->f; 299 | status = gsl_multimin_fdfminimizer_iterate (s); 300 | converged = fabs((f_old - s->f) / f_old); 301 | // printf("f(lambda) = %5.17e ; conv = %5.17e\n", s->f, converged); 302 | if (status) break; 303 | status = gsl_multimin_test_gradient (s->gradient, PARAMS.cg_convergence); 304 | } 305 | while ((status == GSL_CONTINUE) && 306 | ((PARAMS.cg_max_iter < 0) || (iter < PARAMS.cg_max_iter))); 307 | // while ((converged > PARAMS.cg_convergence) && 308 | // ((PARAMS.cg_max_iter < 0) || (iter < PARAMS.cg_max_iter))); 309 | if (iter == PARAMS.cg_max_iter) 310 | printf("warning: cg didn't converge (lambda) \n"); 311 | 312 | for (i = 0; i < mod->k-1; i++) 313 | vset(var->lambda, i, vget(s->x, i)); 314 | vset(var->lambda, i, 0); 315 | 316 | gsl_multimin_fdfminimizer_free(s); 317 | gsl_vector_free(b.sum_phi); 318 | gsl_vector_free(x); 319 | 320 | return(0); 321 | } 322 | 323 | /** 324 | * optimize nu 325 | * 326 | */ 327 | 328 | double f_nu_i(double nu_i, int i, llna_var_param * var, 329 | llna_model * mod, doc * d) 330 | { 331 | double v; 332 | 333 | v = - (nu_i * mget(mod->inv_cov, i, i) * 0.5) 334 | - (((double) d->total/var->zeta) * exp(vget(var->lambda, i) + nu_i/2)) 335 | + (0.5 * safe_log(nu_i)); 336 | 337 | return(v); 338 | } 339 | 340 | 341 | double df_nu_i(double nu_i, int i, llna_var_param * var, 342 | llna_model * mod, doc * d) 343 | { 344 | double v; 345 | 346 | v = - (mget(mod->inv_cov, i, i) * 0.5) 347 | - (0.5 * ((double) d->total/var->zeta) * exp(vget(var->lambda, i) + nu_i/2)) 348 | + (0.5 * (1.0 / nu_i)); 349 | 350 | return(v); 351 | } 352 | 353 | 354 | double d2f_nu_i(double nu_i, int i, llna_var_param * var, llna_model * mod, doc * 
d) 355 | { 356 | double v; 357 | 358 | v = - (0.25 * ((double) d->total/var->zeta) * exp(vget(var->lambda, i) + nu_i/2)) 359 | - (0.5 * (1.0 / (nu_i * nu_i))); 360 | 361 | return(v); 362 | } 363 | 364 | 365 | void opt_nu(llna_var_param * var, doc * d, llna_model * mod) 366 | { 367 | int i; 368 | 369 | // !!! here i changed to k-1 370 | for (i = 0; i < mod->k-1; i++) 371 | opt_nu_i(i, var, mod, d); 372 | } 373 | 374 | 375 | double fixed_point_iter_i(int i, llna_var_param * var, llna_model * mod, doc * d) 376 | { 377 | double v; 378 | double lambda = vget(var->lambda, i); 379 | double nu = vget(var->nu, i); 380 | double c = ((double) d->total / var->zeta); 381 | 382 | v = mget(mod->inv_cov,i,i) + c * exp(lambda + nu/2); 383 | 384 | return(v); 385 | } 386 | 387 | 388 | void opt_nu_i(int i, llna_var_param * var, llna_model * mod, doc * d) 389 | { 390 | double init_nu = 10; 391 | double nu_i = 0, log_nu_i = 0, df = 0, d2f = 0; 392 | int iter = 0; 393 | 394 | log_nu_i = log(init_nu); 395 | do 396 | { 397 | iter++; 398 | nu_i = exp(log_nu_i); 399 | // assert(!isnan(nu_i)); 400 | if (isnan(nu_i)) 401 | { 402 | init_nu = init_nu*2; 403 | printf("warning : nu is nan; new init = %5.5f\n", init_nu); 404 | log_nu_i = log(init_nu); 405 | nu_i = init_nu; 406 | } 407 | // f = f_nu_i(nu_i, i, var, mod, d); 408 | // printf("%5.5f %5.5f \n", nu_i, f); 409 | df = df_nu_i(nu_i, i, var, mod, d); 410 | d2f = d2f_nu_i(nu_i, i, var, mod, d); 411 | log_nu_i = log_nu_i - (df*nu_i)/(d2f*nu_i*nu_i+df*nu_i); 412 | } 413 | while (fabs(df) > NEWTON_THRESH); 414 | 415 | vset(var->nu, i, exp(log_nu_i)); 416 | } 417 | 418 | /** 419 | * initial variational parameters 420 | * 421 | */ 422 | 423 | void init_var_unif(llna_var_param * var, doc * doc, llna_model * mod) 424 | { 425 | int i; 426 | 427 | gsl_matrix_set_all(var->phi, 1.0/mod->k); 428 | gsl_matrix_set_all(var->log_phi, -log((double) mod->k)); 429 | var->zeta = 10; 430 | for (i = 0; i < mod->k-1; i++) 431 | { 432 | vset(var->nu, i, 10.0); 433 | vset(var->lambda, i, 0); 434 | } 435 | vset(var->nu, i, 0); 436 | vset(var->lambda, i, 0); 437 | var->niter = 0; 438 | var->lhood = 0; 439 | } 440 | 441 | 442 | void init_var(llna_var_param * var, doc * doc, llna_model * mod, gsl_vector *lambda, gsl_vector *nu) 443 | { 444 | gsl_vector_memcpy(var->lambda, lambda); 445 | gsl_vector_memcpy(var->nu, nu); 446 | opt_zeta(var, doc, mod); 447 | opt_phi(var, doc, mod); 448 | var->niter = 0; 449 | } 450 | 451 | 452 | 453 | 454 | /** 455 | * 456 | * variational inference 457 | * 458 | */ 459 | 460 | llna_var_param * new_llna_var_param(int nterms, int k) 461 | { 462 | llna_var_param * ret = malloc(sizeof(llna_var_param)); 463 | ret->lambda = gsl_vector_alloc(k); 464 | ret->nu = gsl_vector_alloc(k); 465 | ret->phi = gsl_matrix_alloc(nterms, k); 466 | ret->log_phi = gsl_matrix_alloc(nterms, k); 467 | ret->zeta = 0; 468 | ret->topic_scores = gsl_vector_alloc(k); 469 | return(ret); 470 | } 471 | 472 | 473 | void free_llna_var_param(llna_var_param * v) 474 | { 475 | gsl_vector_free(v->lambda); 476 | gsl_vector_free(v->nu); 477 | gsl_matrix_free(v->phi); 478 | gsl_matrix_free(v->log_phi); 479 | gsl_vector_free(v->topic_scores); 480 | free(v); 481 | } 482 | 483 | 484 | double var_inference(llna_var_param * var, 485 | doc * doc, 486 | llna_model * mod) 487 | { 488 | double lhood_old = 0; 489 | double convergence; 490 | 491 | lhood_bnd(var, doc, mod); 492 | do 493 | { 494 | var->niter++; 495 | 496 | opt_zeta(var, doc, mod); 497 | opt_lambda(var, doc, mod); 498 | opt_zeta(var, doc, mod); 499 | 
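      // note: opt_zeta is the closed-form update zeta = 1 + sum_i exp(lambda_i + nu_i/2),
      // so it is cheap and is refreshed after each update that changes lambda or nu.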
opt_nu(var, doc, mod); 500 | opt_zeta(var, doc, mod); 501 | opt_phi(var, doc, mod); 502 | 503 | lhood_old = var->lhood; 504 | lhood_bnd(var, doc, mod); 505 | 506 | convergence = fabs((lhood_old - var->lhood) / lhood_old); 507 | // printf("lhood = %8.6f (%7.6f)\n", var->lhood, convergence); 508 | 509 | if ((lhood_old > var->lhood) && (var->niter > 1)) 510 | printf("WARNING: iter %05d %5.5f > %5.5f\n", 511 | var->niter, lhood_old, var->lhood); 512 | } 513 | while ((convergence > PARAMS.var_convergence) && 514 | ((PARAMS.var_max_iter < 0) || (var->niter < PARAMS.var_max_iter))); 515 | 516 | if (convergence > PARAMS.var_convergence) var->converged = 0; 517 | else var->converged = 1; 518 | 519 | return(var->lhood); 520 | } 521 | 522 | 523 | void update_expected_ss(llna_var_param* var, doc* d, llna_ss* ss) 524 | { 525 | int i, j, w, c; 526 | double lilj; 527 | 528 | // covariance and mean suff stats 529 | for (i = 0; i < ss->cov_ss->size1; i++) 530 | { 531 | vinc(ss->mu_ss, i, vget(var->lambda, i)); 532 | for (j = 0; j < ss->cov_ss->size2; j++) 533 | { 534 | lilj = vget(var->lambda, i) * vget(var->lambda, j); 535 | if (i==j) 536 | mset(ss->cov_ss, i, j, 537 | mget(ss->cov_ss, i, j) + vget(var->nu, i) + lilj); 538 | else 539 | mset(ss->cov_ss, i, j, mget(ss->cov_ss, i, j) + lilj); 540 | } 541 | } 542 | // topics suff stats 543 | for (i = 0; i < d->nterms; i++) 544 | { 545 | for (j = 0; j < ss->beta_ss->size1; j++) 546 | { 547 | w = d->word[i]; 548 | c = d->count[i]; 549 | mset(ss->beta_ss, j, w, 550 | mget(ss->beta_ss, j, w) + c * mget(var->phi, i, j)); 551 | } 552 | } 553 | // number of data 554 | ss->ndata++; 555 | } 556 | 557 | /* 558 | * importance sampling the likelihood based on the variational posterior 559 | * 560 | */ 561 | 562 | double sample_term(llna_var_param* var, doc* d, llna_model* mod, double* eta) 563 | { 564 | int i, j, n; 565 | double t1, t2, sum, theta[mod->k]; 566 | double word_term; 567 | 568 | t1 = (0.5) * mod->log_det_inv_cov; 569 | t1 += - (0.5) * (mod->k) * 1.837877; 570 | for (i = 0; i < mod->k; i++) 571 | for (j = 0; j < mod->k ; j++) 572 | t1 -= (0.5) * 573 | (eta[i] - vget(mod->mu, i)) * 574 | mget(mod->inv_cov, i, j) * 575 | (eta[j] - vget(mod->mu, j)); 576 | 577 | // compute theta 578 | sum = 0; 579 | for (i = 0; i < mod->k; i++) 580 | { 581 | theta[i] = exp(eta[i]); 582 | sum += theta[i]; 583 | } 584 | for (i = 0; i < mod->k; i++) 585 | { 586 | theta[i] = theta[i] / sum; 587 | } 588 | 589 | // compute word probabilities 590 | for (n = 0; n < d->nterms; n++) 591 | { 592 | word_term = 0; 593 | for (i = 0; i < mod->k; i++) 594 | word_term += theta[i]*exp(mget(mod->log_beta,i,d->word[n])); 595 | t1 += d->count[n] * safe_log(word_term); 596 | } 597 | 598 | // log(q(\eta | lambda, nu)) 599 | t2 = 0; 600 | for (i = 0; i < mod->k; i++) 601 | t2 += log(gsl_ran_gaussian_pdf(eta[i] - vget(var->lambda,i), sqrt(vget(var->nu,i)))); 602 | return(t1-t2); 603 | } 604 | 605 | 606 | double sample_lhood(llna_var_param* var, doc* d, llna_model* mod) 607 | { 608 | int nsamples, i, n; 609 | double eta[mod->k]; 610 | double log_prob, sum = 0, v; 611 | gsl_rng * r = gsl_rng_alloc(gsl_rng_taus); 612 | 613 | gsl_rng_set(r, (long) 1115574245); 614 | nsamples = 10000; 615 | 616 | // for each sample 617 | for (n = 0; n < nsamples; n++) 618 | { 619 | // sample eta from q(\eta) 620 | for (i = 0; i < mod->k; i++) 621 | { 622 | v = gsl_ran_gaussian_ratio_method(r, sqrt(vget(var->nu,i))); 623 | eta[i] = v + vget(var->lambda, i); 624 | } 625 | // compute p(w | \eta) - q(\eta) 626 | log_prob = 
sample_term(var, d, mod, eta); 627 | // update log sum 628 | if (n == 0) sum = log_prob; 629 | else sum = log_sum(sum, log_prob); 630 | // printf("%5.5f\n", (sum - log(n+1))); 631 | } 632 | sum = sum - log((double) nsamples); 633 | return(sum); 634 | } 635 | 636 | 637 | /* 638 | * expected theta under a variational distribution 639 | * 640 | * (v is assumed allocated to the right length.) 641 | * 642 | */ 643 | 644 | 645 | void expected_theta(llna_var_param *var, doc* d, llna_model *mod, gsl_vector* val) 646 | { 647 | int nsamples, i, n; 648 | double eta[mod->k]; 649 | double theta[mod->k]; 650 | double e_theta[mod->k]; 651 | double sum, w, v; 652 | gsl_rng * r = gsl_rng_alloc(gsl_rng_taus); 653 | 654 | gsl_rng_set(r, (long) 1115574245); 655 | nsamples = 100; 656 | 657 | // initialize e_theta 658 | for (i = 0; i < mod->k; i++) e_theta[i] = -1; 659 | // for each sample 660 | for (n = 0; n < nsamples; n++) 661 | { 662 | // sample eta from q(\eta) 663 | for (i = 0; i < mod->k; i++) 664 | { 665 | v = gsl_ran_gaussian_ratio_method(r, sqrt(vget(var->nu,i))); 666 | eta[i] = v + vget(var->lambda, i); 667 | } 668 | // compute p(w | \eta) - q(\eta) 669 | w = sample_term(var, d, mod, eta); 670 | // compute theta 671 | sum = 0; 672 | for (i = 0; i < mod->k; i++) 673 | { 674 | theta[i] = exp(eta[i]); 675 | sum += theta[i]; 676 | } 677 | for (i = 0; i < mod->k; i++) 678 | theta[i] = theta[i] / sum; 679 | // update e_theta 680 | for (i = 0; i < mod->k; i++) 681 | e_theta[i] = log_sum(e_theta[i], w + safe_log(theta[i])); 682 | } 683 | // normalize e_theta and set return vector 684 | sum = -1; 685 | for (i = 0; i < mod->k; i++) 686 | { 687 | e_theta[i] = e_theta[i] - log(nsamples); 688 | sum = log_sum(sum, e_theta[i]); 689 | } 690 | for (i = 0; i < mod->k; i++) 691 | vset(val, i, exp(e_theta[i] - sum)); 692 | } 693 | 694 | /* 695 | * log probability of the document under proportions theta and topics 696 | * beta 697 | * 698 | */ 699 | 700 | double log_mult_prob(doc* d, gsl_vector* theta, gsl_matrix* log_beta) 701 | { 702 | int i, k; 703 | double ret = 0; 704 | double term_prob; 705 | 706 | for (i = 0; i < d->nterms; i++) 707 | { 708 | term_prob = 0; 709 | for (k = 0; k < log_beta->size1; k++) 710 | term_prob += vget(theta, k) * exp(mget(log_beta, k, d->word[i])); 711 | ret = ret + safe_log(term_prob) * d->count[i]; 712 | } 713 | return(ret); 714 | } 715 | 716 | 717 | /* 718 | * writes the word assignments line for a document to a file 719 | * 720 | */ 721 | 722 | void write_word_assignment(FILE* f, doc* d, gsl_matrix* phi) 723 | { 724 | int n; 725 | 726 | fprintf(f, "%03d", d->nterms); 727 | for (n = 0; n < d->nterms; n++) 728 | { 729 | gsl_vector phi_row = gsl_matrix_row(phi, n).vector; 730 | fprintf(f, " %04d:%02d", d->word[n], argmax(&phi_row)); 731 | } 732 | fprintf(f, "\n"); 733 | fflush(f); 734 | } 735 | -------------------------------------------------------------------------------- /inference.h: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2007, David M. Blei and John D. Lafferty 2 | 3 | // This file is part of CTM-C. 4 | 5 | // CTM-C is free software; you can redistribute it and/or modify it under 6 | // the terms of the GNU General Public License as published by the Free 7 | // Software Foundation; either version 2 of the License, or (at your 8 | // option) any later version. 
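For orientation, here is a minimal sketch of how the routines declared below fit together for posterior inference on a single document. This is a hypothetical helper, not code from the package; it assumes the corpus and model have already been loaded (see corpus.c and ctm.c), that PARAMS has been filled in via read_params() or default_params(), and that init_temp_vectors(mod->k - 1) has been called once.

    #include <gsl/gsl_vector.h>
    #include "corpus.h"
    #include "ctm.h"
    #include "params.h"
    #include "inference.h"

    /* hypothetical: variational inference on one document, returning the
       likelihood bound and filling e_theta (length mod->k) with the
       approximate posterior topic proportions */
    double infer_one_document(doc* d, llna_model* mod, gsl_vector* e_theta)
    {
        double bound;
        llna_var_param* var = new_llna_var_param(d->nterms, mod->k);

        init_var_unif(var, d, mod);            /* uniform phi, lambda = 0, nu = 10 */
        bound = var_inference(var, d, mod);    /* coordinate ascent on the bound */
        expected_theta(var, d, mod, e_theta);  /* importance-sampled E[theta] */

        free_llna_var_param(var);
        return bound;
    }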
9 | 10 | // CTM-C is distributed in the hope that it will be useful, but WITHOUT 11 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 | // for more details. 14 | 15 | // You should have received a copy of the GNU General Public License 16 | // along with this program; if not, write to the Free Software 17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 18 | // USA 19 | 20 | #ifndef LLNA_INFERENCE_H 21 | #define LLNA_INFERENCE_H 22 | 23 | #define NEWTON_THRESH 1e-10 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | 30 | #include "corpus.h" 31 | #include "ctm.h" 32 | #include "gsl-wrappers.h" 33 | 34 | typedef struct llna_var_param { 35 | gsl_vector * nu; 36 | gsl_vector * lambda; 37 | double zeta; 38 | gsl_matrix * phi; 39 | gsl_matrix * log_phi; 40 | int niter; 41 | short converged; 42 | double lhood; 43 | gsl_vector * topic_scores; 44 | } llna_var_param; 45 | 46 | 47 | typedef struct bundle { 48 | llna_var_param * var; 49 | llna_model * mod; 50 | doc * doc; 51 | gsl_vector * sum_phi; 52 | } bundle; 53 | 54 | 55 | /* 56 | * functions 57 | * 58 | */ 59 | 60 | void init_temp_vectors(int size); 61 | int opt_lambda(llna_var_param * var, doc * doc, llna_model * mod); 62 | void opt_phi(llna_var_param * var, doc * doc, llna_model * mod); 63 | void opt_nu(llna_var_param * var, doc * doc, llna_model * mod); 64 | int opt_zeta(llna_var_param * var, doc * doc, llna_model * mod); 65 | void lhood_bnd(llna_var_param *var, doc* doc, llna_model* mod); 66 | double var_inference(llna_var_param * var, doc * doc, llna_model * mod); 67 | llna_var_param* new_llna_var_param(int, int); 68 | void free_llna_var_param(llna_var_param *); 69 | void update_expected_ss(llna_var_param* , doc*, llna_ss*); 70 | void init_var_unif(llna_var_param * var, doc * doc, llna_model * mod); 71 | void init_var(llna_var_param *var, doc *doc, llna_model *mod, gsl_vector *lambda, gsl_vector *nu); 72 | void opt_nu_i(int i, llna_var_param * var, llna_model * mod, doc * d); 73 | double fixed_point_iter_i(int, llna_var_param *, llna_model *, doc *); 74 | double sample_lhood(llna_var_param* var, doc* d, llna_model* mod); 75 | void expected_theta(llna_var_param *var, doc* d, llna_model *mod, gsl_vector* v); 76 | double log_mult_prob(doc* d, gsl_vector* theta, gsl_matrix* log_beta); 77 | void write_word_assignment(FILE* f, doc* d, gsl_matrix* phi); 78 | 79 | #endif 80 | -------------------------------------------------------------------------------- /lasso-graph.r: -------------------------------------------------------------------------------- 1 | # Graphical model selection using the Lasso, as 2 | # proposed by Meinshausen and Buhlmann 3 | 4 | # April, 2007 -- Dave Blei and John Lafferty 5 | # 6 | # To apply this to topic graphs, we take the variational means 7 | # (lambda) for each document, and treat these as data. We then 8 | # regress each variable (topic) onto the others using the lasso, and 9 | # consider the indices of the non-zero entries as estimates of the 10 | # neighbors of the node in the inverse covariance. The graph is then 11 | # formed by including an edge if either/both (OR/AND) of the endpoints 12 | # include it in the corresponding penalized regression. 
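To make the last step concrete: once each topic has been regressed on the others, the selected coefficients define a (generally asymmetric) neighborhood indicator matrix Shat, and the graph is symmetrized with an AND or an OR rule. Below is a hypothetical illustration of just that rule, written in C with a row-major 0/1 matrix as elsewhere in this package; the R function that follows does the same thing with vectorized logical operations on whole columns.

    /* hypothetical sketch of the AND/OR symmetrization on a p x p 0/1 matrix */
    void symmetrize_graph(int p, const int* Shat, int* Ihat, int use_and)
    {
        int i, j;
        for (i = 0; i < p; i++)
            for (j = 0; j < p; j++)
                Ihat[i*p + j] = use_and
                    ? (Shat[i*p + j] && Shat[j*p + i])   /* edge if both select it */
                    : (Shat[i*p + j] || Shat[j*p + i]);  /* edge if either selects it */
    }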
13 | 14 | library(lasso2) 15 | # it's possible to use the lars package as well, with some minor mods 16 | 17 | # Inputs 18 | # file: n x p data matrix -- e.g., the variational means ("final-lambda.dat") 19 | # lambda: relative bound on the l1-norm of the parameters, in [0,1] 20 | # and=T: if and=T/F then the graph is computed by taking the intersction/union of the nbhds 21 | # 22 | # Output 23 | # Ihat: matrix of 0/1, with 1 indicating an edge in the graph 24 | 25 | build.graph = function(x, lambda, and=T) { 26 | x = scale(x) 27 | p = ncol(x) 28 | n = nrow(x) 29 | Shat = matrix(F,p,p) 30 | 31 | cat("n=",n," p=",p, " lambda=",lambda,"\n", sep="") 32 | for (j in 1:p) { 33 | cat(".") 34 | if (j %% 10 == 0) { 35 | cat(j) 36 | } 37 | # The response is the j-th column 38 | y = x[,j] 39 | X = x[,-j] 40 | 41 | # Do the l1-regularized regression 42 | # Note: the bound in l1ce code is the upper bound on the l1 43 | # norm. So, a larger bound is a weaker constraint on the model 44 | data = data.frame(cbind(y,X)) 45 | out = l1ce(y ~ X, data=data, sweep.out = ~1, bound=lambda) 46 | 47 | indices = (1:p)[-j] 48 | beta = coef(out)[2:p] # skipping the intercept 49 | nonzero = indices[beta > 0] 50 | Shat[j,nonzero] = T 51 | Shat[j,j] = T 52 | } 53 | cat("\n") 54 | 55 | # Include an edge if either (and=F) or both (and=T) endpoints are neighbors 56 | Ihat = matrix(F,p,p) 57 | if (and==T) { 58 | for (i in 1:p) { 59 | Ihat[,i] = Shat[,i] & Shat[i,] 60 | } 61 | } 62 | else { 63 | for (i in 1:p) { 64 | Ihat[,i] = Shat[,i] | Shat[i,] 65 | } 66 | } 67 | image(Ihat,col=heat.colors(2),xaxp=c(-1,2,1),yaxp=c(-1,2,1)) 68 | title(main = "Estimated graph") 69 | return(Ihat) 70 | } 71 | 72 | -------------------------------------------------------------------------------- /params.c: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2007, David M. Blei and John D. Lafferty 2 | 3 | // This file is part of CTM-C. 4 | 5 | // CTM-C is free software; you can redistribute it and/or modify it under 6 | // the terms of the GNU General Public License as published by the Free 7 | // Software Foundation; either version 2 of the License, or (at your 8 | // option) any later version. 9 | 10 | // CTM-C is distributed in the hope that it will be useful, but WITHOUT 11 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 | // for more details. 
14 | 15 | // You should have received a copy of the GNU General Public License 16 | // along with this program; if not, write to the Free Software 17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 18 | // USA 19 | 20 | #include 21 | #include 22 | #include 23 | 24 | #include "params.h" 25 | 26 | llna_params PARAMS; 27 | 28 | void read_params(char* filename) 29 | { 30 | FILE* fileptr; 31 | char string[100]; 32 | 33 | fileptr = fopen(filename, "r"); 34 | fscanf(fileptr, "em max iter %d\n", &(PARAMS.em_max_iter)); 35 | fscanf(fileptr, "var max iter %d\n", &(PARAMS.var_max_iter)); 36 | fscanf(fileptr, "cg max iter %d\n", &(PARAMS.cg_max_iter)); 37 | fscanf(fileptr, "em convergence %lf\n", &(PARAMS.em_convergence)); 38 | fscanf(fileptr, "var convergence %lf\n", &(PARAMS.var_convergence)); 39 | fscanf(fileptr, "cg convergence %lf\n", &(PARAMS.cg_convergence)); 40 | fscanf(fileptr, "lag %d\n", &(PARAMS.lag)); 41 | fscanf(fileptr, "covariance estimate %s\n", string); 42 | if (strcmp(string, "shrinkage")==0) 43 | PARAMS.cov_estimate = SHRINK; 44 | if (strcmp(string, "mle")==0) 45 | PARAMS.cov_estimate = MLE; 46 | } 47 | 48 | 49 | void print_params() 50 | { 51 | printf("em max iter %d\n", PARAMS.em_max_iter); 52 | printf("var max iter %d\n", PARAMS.var_max_iter); 53 | printf("cg max iter %d\n", PARAMS.cg_max_iter); 54 | printf("em convergence %lf\n", PARAMS.em_convergence); 55 | printf("var convergence %lf\n", PARAMS.var_convergence); 56 | printf("cg convergence %lf\n", PARAMS.cg_convergence); 57 | printf("lag %d\n", PARAMS.lag); 58 | printf("shrinkage? %d\n", PARAMS.cov_estimate); 59 | } 60 | 61 | 62 | void default_params() 63 | { 64 | PARAMS.em_max_iter = 1000; 65 | PARAMS.var_max_iter = 500; 66 | PARAMS.cg_max_iter = 500; 67 | PARAMS.em_convergence = 1e-3; 68 | PARAMS.var_convergence = 1e-5; 69 | PARAMS.cg_convergence = 1e-5; 70 | PARAMS.cov_estimate = MLE; 71 | PARAMS.lag = 1; 72 | } 73 | -------------------------------------------------------------------------------- /params.h: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2007, David M. Blei and John D. Lafferty 2 | 3 | // This file is part of CTM-C. 4 | 5 | // CTM-C is free software; you can redistribute it and/or modify it under 6 | // the terms of the GNU General Public License as published by the Free 7 | // Software Foundation; either version 2 of the License, or (at your 8 | // option) any later version. 9 | 10 | // CTM-C is distributed in the hope that it will be useful, but WITHOUT 11 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 | // for more details. 
14 | 15 | // You should have received a copy of the GNU General Public License 16 | // along with this program; if not, write to the Free Software 17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 18 | // USA 19 | 20 | #ifndef PARAMS_H 21 | #define PARAMS_H 22 | 23 | #define MLE 0 24 | #define SHRINK 1 25 | 26 | typedef struct llna_params 27 | { 28 | int em_max_iter; 29 | int var_max_iter; 30 | int cg_max_iter; 31 | double em_convergence; 32 | double var_convergence; 33 | double cg_convergence; 34 | int cov_estimate; 35 | int lag; 36 | } llna_params; 37 | 38 | void read_params(char*); 39 | void print_params(); 40 | void default_params(); 41 | 42 | #endif 43 | -------------------------------------------------------------------------------- /r-interface.c: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2007, David M. Blei and John D. Lafferty 2 | 3 | // This file is part of CTM-C. 4 | 5 | // CTM-C is free software; you can redistribute it and/or modify it under 6 | // the terms of the GNU General Public License as published by the Free 7 | // Software Foundation; either version 2 of the License, or (at your 8 | // option) any later version. 9 | 10 | // CTM-C is distributed in the hope that it will be useful, but WITHOUT 11 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 | // for more details. 14 | 15 | // You should have received a copy of the GNU General Public License 16 | // along with this program; if not, write to the Free Software 17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 18 | // USA 19 | 20 | // NOTE: this file is not currently used 21 | 22 | #include 23 | #include 24 | #include "gsl-wrappers.h" 25 | #include "ctm.h" 26 | #include "inference.h" 27 | #include "params.h" 28 | 29 | llna_model global_mod; 30 | extern llna_params PARAMS; 31 | 32 | /* 33 | * for translating r arrays into gsl matrices and vectors 34 | * 35 | */ 36 | 37 | double r_mtx_get(const double* v, int i, int j, int nrow) 38 | { 39 | return(v[nrow * j + i]); 40 | } 41 | 42 | 43 | gsl_matrix * r_to_gsl_matrix(const double * v, int nrow, int ncol) 44 | { 45 | int i, j; 46 | gsl_matrix * ret; 47 | 48 | ret = gsl_matrix_alloc(nrow, ncol); 49 | for (i = 0; i < nrow; i++) 50 | { 51 | for (j = 0; j < ncol; j++) 52 | { 53 | mset(ret, i, j, r_mtx_get(v, i, j, nrow)); 54 | } 55 | } 56 | return(ret); 57 | } 58 | 59 | 60 | gsl_vector * r_to_gsl_vector(const double * v, int size) 61 | { 62 | int i; 63 | gsl_vector * ret; 64 | 65 | ret = gsl_vector_alloc(size); 66 | for (i = 0; i < size; i++) 67 | { 68 | vset(ret, i, v[i]); 69 | } 70 | return(ret); 71 | } 72 | 73 | 74 | /* 75 | * sets the global model 76 | * 77 | */ 78 | 79 | void r_set_mod(int * k, int * nterms, 80 | double * mu, 81 | double * inv_cov, 82 | double * cov, 83 | double * log_det_inv_cov, 84 | double * log_beta) 85 | { 86 | global_mod.k = *k; 87 | global_mod.log_beta = r_to_gsl_matrix(log_beta, global_mod.k, *nterms); 88 | global_mod.mu = r_to_gsl_vector(mu, global_mod.k-1); 89 | global_mod.inv_cov = r_to_gsl_matrix(inv_cov, global_mod.k-1, global_mod.k-1); 90 | global_mod.cov = r_to_gsl_matrix(cov, global_mod.k-1, global_mod.k-1); 91 | global_mod.log_det_inv_cov = *log_det_inv_cov; 92 | init_temp_vectors(global_mod.k-1); 93 | default_params(); 94 | } 95 | 96 | 97 | /* 98 | * compute the likelihood bound for variational parameters and document 99 | * 
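 * (note: as currently written this wrapper leaves the val output argument
 * unset -- the computed bound ends up in var.lhood -- and it does not
 * initialize var.log_phi or var.topic_scores, which lhood_bnd uses; like
 * the rest of this file, it is not currently used.)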
100 | */ 101 | 102 | void r_lhood_bound(double * lambda, double * nu, double * phi, double * zeta, 103 | int * word, int * count, int * total, int * nterms, 104 | double * val) 105 | { 106 | llna_var_param var; 107 | doc doc; 108 | int i; 109 | 110 | var.lambda = r_to_gsl_vector(lambda, global_mod.k-1); 111 | var.nu = r_to_gsl_vector(nu, global_mod.k-1); 112 | var.phi = r_to_gsl_matrix(phi, *nterms, global_mod.k); 113 | var.zeta = *zeta; 114 | 115 | doc.nterms = *nterms; 116 | doc.total = *total; 117 | doc.count = count; 118 | doc.word = word; 119 | for (i = 0; i < doc.nterms; i++) 120 | doc.word[i] = doc.word[i] - 1; 121 | 122 | lhood_bnd(&var, &doc, &global_mod); 123 | } 124 | 125 | 126 | /* 127 | * variational inference given a document and pointers to variational 128 | * parameters 129 | * 130 | */ 131 | 132 | void r_var_inference(double * lambda, double * nu, double * phi, 133 | double * zeta, 134 | int * word, int * count, int * total, int * nterms, 135 | double * lhood) 136 | { 137 | llna_var_param * var; 138 | doc doc; 139 | int i, j, n; 140 | // set up document 141 | doc.nterms = *nterms; 142 | doc.total = *total; 143 | doc.count = count; 144 | doc.word = word; 145 | // !!! note we assume that the words are 1-indexed coming from R 146 | for (i = 0; i < doc.nterms; i++) 147 | doc.word[i] = doc.word[i] - 1; 148 | // allocate variational parameters 149 | var = new_llna_var_param(*nterms, global_mod.k); 150 | // run variational inference 151 | lhood[0] = var_inference(var, &doc, &global_mod); 152 | init_var_unif(var, &doc, &global_mod); 153 | printf("LHOOD BOUND : %5.5f\n", lhood[0]); 154 | // return variational parameters 155 | *zeta = var->zeta; 156 | for (i = 0; i < global_mod.k-1; i++) 157 | { 158 | lambda[i] = vget(var->lambda, i); 159 | nu[i] = vget(var->nu, i); 160 | } 161 | j = 0; 162 | for (i = 0; i < global_mod.k; i++) 163 | { 164 | for (n = 0; n < doc.nterms; n++) 165 | { 166 | phi[j] = mget(var->phi, n, i); 167 | j++; 168 | } 169 | } 170 | // clean up 171 | free_llna_var_param(var); 172 | } 173 | -------------------------------------------------------------------------------- /r-interface.h: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2007, David M. Blei and John D. Lafferty 2 | 3 | // This file is part of CTM-C. 4 | 5 | // CTM-C is free software; you can redistribute it and/or modify it under 6 | // the terms of the GNU General Public License as published by the Free 7 | // Software Foundation; either version 2 of the License, or (at your 8 | // option) any later version. 9 | 10 | // CTM-C is distributed in the hope that it will be useful, but WITHOUT 11 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 | // for more details. 
14 | 15 | // You should have received a copy of the GNU General Public License 16 | // along with this program; if not, write to the Free Software 17 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 18 | // USA 19 | 20 | #ifndef LLNA_R_INTERFACE_H 21 | #define LLNA_R_INTERFACE_H 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /settings.txt: -------------------------------------------------------------------------------- 1 | em max iter 1000 2 | var max iter 20 3 | cg max iter -1 4 | em convergence 1e-3 5 | var convergence 1e-6 6 | cg convergence 1e-6 7 | lag 10 8 | covariance estimate mle 9 | --------------------------------------------------------------------------------
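As a final pointer, the settings files above are parsed by read_params() in params.c with fixed-format fscanf calls, so the key names and line order must match exactly. A minimal, hypothetical driver (not part of the package) showing the intended call sequence; it assumes the named file exists and links against params.o:

    #include "params.h"

    int main(int argc, char* argv[])
    {
        default_params();            /* start from the built-in defaults */
        if (argc > 1)
            read_params(argv[1]);    /* e.g., settings.txt or inf-settings.txt */
        print_params();              /* echo the settings that will be used */
        return 0;
    }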