├── .gitignore ├── bin ├── dtm-linux32 ├── dtm-linux64 ├── dtm-darwin64 ├── dtm-win32.exe └── dtm-win64.exe ├── dtm ├── example │ └── test-seq.dat ├── main.h ├── Makefile ├── params.h ├── util.h ├── build.sh ├── param.h ├── gsl-wrappers.h ├── ss-lm.h ├── lda-seq.h ├── sample.sh ├── params.c ├── lda.h ├── data.h ├── util.c ├── main.c ├── gsl-wrappers.c └── data.c ├── lib └── math │ ├── logspace.h │ ├── gradient_projection.h │ ├── specialfunc.h │ ├── logspace_base.cpp │ ├── logspace_base.h │ ├── optimizer.h │ ├── logspace.cpp │ ├── gradient_projection_test.cpp │ ├── specialfunc.cpp │ ├── gsl_matrix.h │ ├── gradient_projection.cpp │ ├── gsl_vector.h │ └── vectorops.h ├── gslwrap ├── include │ └── gslwrap │ │ ├── permutation.h │ │ ├── matrix_vector_operators.h │ │ ├── random_number_distribution.h │ │ ├── random_generator.h │ │ ├── histogram.h │ │ ├── min_fminimizer.h │ │ ├── multimin_fdfminimizer.h │ │ ├── matrix_double.h │ │ ├── matrix_int.h │ │ └── matrix_float.h └── bin │ └── gslwrap-config ├── doc ├── HOWTO ├── lda.tex └── dtm.tex └── README /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | -------------------------------------------------------------------------------- /bin/dtm-linux32: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/magsilva/dtm-old/HEAD/bin/dtm-linux32 -------------------------------------------------------------------------------- /bin/dtm-linux64: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/magsilva/dtm-old/HEAD/bin/dtm-linux64 -------------------------------------------------------------------------------- /bin/dtm-darwin64: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/magsilva/dtm-old/HEAD/bin/dtm-darwin64 
-------------------------------------------------------------------------------- /bin/dtm-win32.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/magsilva/dtm-old/HEAD/bin/dtm-win32.exe -------------------------------------------------------------------------------- /bin/dtm-win64.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/magsilva/dtm-old/HEAD/bin/dtm-win64.exe -------------------------------------------------------------------------------- /dtm/example/test-seq.dat: -------------------------------------------------------------------------------- 1 | 10 2 | 25 3 | 50 4 | 75 5 | 100 6 | 100 7 | 100 8 | 100 9 | 125 10 | 150 11 | 175 -------------------------------------------------------------------------------- /dtm/main.h: -------------------------------------------------------------------------------- 1 | #ifndef MAINH 2 | #define MAINH 3 | 4 | #include 5 | #include 6 | #include "data.h" 7 | #include "lda-seq.h" 8 | #include "lda.h" 9 | #include 10 | 11 | typedef struct dtm_fit_params 12 | { 13 | char* datafile; 14 | char* outname; 15 | char* heldout; 16 | int start; 17 | int end; 18 | int ntopics; 19 | int lda_max_em_iter; 20 | double top_obs_var; 21 | double top_chain_var; 22 | double alpha; 23 | } dtm_fit_params; 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /dtm/Makefile: -------------------------------------------------------------------------------- 1 | .SUFFIXES: .c .u 2 | 3 | LIB = ../lib 4 | GSLWRAP_LIB = ../gslwrap 5 | OUTPUT_FILE ?= dtm 6 | OPTFLAGS ?= -O2 7 | STRIP ?= strip 8 | CXXFLAGS += -I ${LIB} -I ${LIB}/math -I ${GSLWRAP_LIB}/include -I ${GSLWRAP_LIB}/include/gslwrap 9 | 10 | LDFLAGS += -lgsl -lm -lgslcblas -lgflags 11 | EXTRA_LDFLAGS = 12 | LOBJECTS = ss-lm.o gsl-wrappers.o data.o param.o util.o lda-seq.o lda.o params.o main.o 13 | 14 | all: 
main 15 | 16 | .c.o : 17 | $(CXX) $(CXXFLAGS) -c $< 18 | 19 | main: $(LOBJECTS) 20 | $(CXX) $(OPTFLAGS) $(CXXFLAGS) $(LOBJECTS) -o $(OUTPUT_FILE) $(LDFLAGS) $(EXTRA_LDFLAGS) 21 | $(STRIP) $(OUTPUT_FILE) 22 | 23 | clean: 24 | -rm -f *.o 25 | 26 | distclean: clean 27 | -rm -f $(OUTPUT_FILE) $(OUTPUT_FILE)-* 28 | -------------------------------------------------------------------------------- /dtm/params.h: -------------------------------------------------------------------------------- 1 | // Author: David Blei (blei@cs.princeton.edu) 2 | // 3 | // Copyright 2006 David Blei 4 | // All Rights Reserved. 5 | // 6 | // See the README for this package for details about modifying or 7 | // distributing this software. 8 | 9 | #ifndef PARAMSH 10 | #define PARAMSH 11 | 12 | #define MAX_LINE_LENGTH 100000; 13 | 14 | #include "gsl-wrappers.h" 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | void params_read_string(FILE* f, char* name, char* x); 21 | 22 | void params_read_int(FILE* f, char* name, int* x); 23 | 24 | void params_write_int(FILE *, char *, int); 25 | 26 | void params_read_double(FILE* f, char* name, double* x); 27 | 28 | void params_write_double(FILE *, char *, double); 29 | 30 | void params_read_gsl_vector(FILE* f, char* name, gsl_vector** x); 31 | 32 | void params_write_gsl_vector(FILE *, char* , gsl_vector *); 33 | 34 | void params_write_gsl_vector_multiline(FILE *, char* , gsl_vector *); 35 | 36 | void params_write_gsl_matrix(FILE *, char* , gsl_matrix *); 37 | 38 | void params_write_sparse_gsl_matrix(FILE *, char* , gsl_matrix *); 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /dtm/util.h: -------------------------------------------------------------------------------- 1 | // Author: David Blei (blei@cs.princeton.edu) 2 | // 3 | // Copyright 2006 David Blei 4 | // All Rights Reserved. 
5 | // 6 | // See the README for this package for details about modifying or 7 | // distributing this software. 8 | 9 | #ifndef _UTIL_INCLUDED 10 | #define _UTIL_INCLUDED 1 11 | 12 | #include 13 | 14 | #define EOS '\0' 15 | #define CRLF printf("\n") 16 | #define TRUE 1 17 | #define FALSE 0 18 | 19 | extern const char* quote (const char *s); 20 | extern char* dequote (char *s); 21 | extern void quote_no_matter_what (const char *s, char *t); 22 | extern int verify (char *s, char *t); 23 | extern char* strip (char *s); 24 | extern char* upper (char *s); 25 | extern char* lower (char *s); 26 | extern int qfilef (const char *fname); /* TRUE if file exists */ 27 | extern int free_storage (char *fn); /* returns free storage in file system of fn */ 28 | extern char* util_strdup(char *string); 29 | extern void* util_malloc (int size); 30 | extern void* util_realloc (void *p, int size); 31 | extern void* util_calloc (int num, int size); 32 | extern void util_free (void *p); 33 | extern int util_space_in_use (void); 34 | extern int util_pointers_in_use (void); 35 | extern void error(char *fmt, ...); 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /lib/math/logspace.h: -------------------------------------------------------------------------------- 1 | #ifndef __MATH_LOGSPACE_H__ 2 | #define __MATH_LOGSPACE_H__ 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "logspace_base.h" 9 | #include "specialfunc.h" 10 | 11 | // Given two log vectors, log a_i and log b_i, compute 12 | // log sum (a_i * b_i). 13 | double log_dot_product(const gsl_vector* log_a, const gsl_vector* log_b); 14 | 15 | // Given a log vector, log a_i, compute log sum a_i. Returns the sum. 16 | double log_normalize(gsl_vector* x); 17 | 18 | // Compute the log sum over all elements in the vector 19 | double log_sum(const gsl_vector* x); 20 | 21 | // Given a log matrix, log a_i, compute log sum a_i. Returns the sum. 
22 | double log_normalize_matrix(gsl_matrix* x); 23 | 24 | double log_dirichlet_likelihood(const double sum, 25 | const double prior_sum, 26 | const std::vector& counts, 27 | bool debug = false); 28 | 29 | double log_dirichlet_likelihood(const double sum, 30 | const double prior_scale, 31 | const gsl_vector* prior, 32 | const std::vector& counts); 33 | 34 | #endif // __MATH_LOGSPACE_H__ 35 | -------------------------------------------------------------------------------- /gslwrap/include/gslwrap/permutation.h: -------------------------------------------------------------------------------- 1 | // This matrix class is a C++ wrapper for the GNU Scientific Library 2 | // Copyright (C) ULP-IPB Strasbourg 3 | 4 | // This program is free software; you can redistribute it and/or modify 5 | // it under the terms of the GNU General Public License as published by 6 | // the Free Software Foundation; either version 2 of the License, or 7 | // (at your option) any later version. 8 | 9 | // This program is distributed in the hope that it will be useful, 10 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | // GNU General Public License for more details. 13 | 14 | // You should have received a copy of the GNU General Public License 15 | // along with this program; if not, write to the Free Software 16 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 17 | 18 | #ifndef _permutation_h 19 | #define _permutation_h 20 | 21 | #include 22 | 23 | namespace gsl 24 | { 25 | class permutation 26 | { 27 | friend class matrix; 28 | friend class matrix_float; 29 | friend class matrix_int; 30 | 31 | gsl_permutation *gsldata; 32 | public: 33 | permutation(size_t n,bool clear=true) 34 | { 35 | gsldata=(clear ? 
gsl_permutation_calloc(n) : gsl_permutation_alloc(n)); 36 | } 37 | permutation():gsldata(NULL){;} 38 | void resize(size_t n){gsldata= gsl_permutation_calloc(n);} 39 | }; 40 | } 41 | #endif// _permutation_h 42 | -------------------------------------------------------------------------------- /dtm/build.sh: -------------------------------------------------------------------------------- 1 | make clean 2 | PATH=/usr/x86_64-w64-mingw32/bin:$PATH make main \ 3 | NM=nm \ 4 | RANLIB=ranlib \ 5 | CC=x86_64-w64-mingw32-gcc \ 6 | CCC=x86_64-w64-mingw32-g++ \ 7 | CXX=x86_64-w64-mingw32-g++ \ 8 | AS=as \ 9 | OPTFLAGS="-O2 -static" \ 10 | EXTRA_LDFLAGS="-lshlwapi" \ 11 | STRIP="x86_64-w64-mingw32-strip" \ 12 | OUTPUT_FILE="dtm-win64.exe" 13 | 14 | make clean 15 | PATH=/usr/i686-w64-mingw32/bin:$PATH make main \ 16 | NM=nm \ 17 | RANLIB=ranlib \ 18 | CC=i686-w64-mingw32-gcc \ 19 | CCC=i686-w64-mingw32-g++ \ 20 | CXX=i686-w64-mingw32-g++ \ 21 | AS=as \ 22 | OPTFLAGS="-O2 -static" \ 23 | EXTRA_LDFLAGS="-lshlwapi" \ 24 | STRIP="i686-w64-mingw32-strip" \ 25 | OUTPUT_FILE="dtm-win32.exe" 26 | 27 | make clean 28 | make main \ 29 | NM=nm \ 30 | RANLIB=ranlib \ 31 | CC=gcc \ 32 | CCC=g++ \ 33 | CXX=g++ \ 34 | AS=as \ 35 | CFLAGS="-m32" \ 36 | CXXFLAGS="-m32" \ 37 | OUTPUT_FILE="dtm-linux32" 38 | 39 | make clean 40 | make main \ 41 | NM=nm \ 42 | RANLIB=ranlib \ 43 | CC=gcc \ 44 | CCC=g++ \ 45 | CXX=g++ \ 46 | AS=as \ 47 | OUTPUT_FILE="dtm-linux64" 48 | 49 | # Remember to remove the dynamic libraries from the libdir when compiling 50 | # (otherwise the binary will require dynamic libraries as well) 51 | make clean 52 | PATH=/usr/x86_64-apple-darwin15/bin:$PATH make main \ 53 | NM=x86_64-apple-darwin15-nm \ 54 | RANLIB=x86_64-apple-darwin15-ranlib \ 55 | CC=x86_64-apple-darwin15-clang \ 56 | CCC=x86_64-apple-darwin15-clang++ \ 57 | CXX=x86_64-apple-darwin15-clang++ \ 58 | AS=x86_64-apple-darwin15-as \ 59 | STRIP=x86_64-apple-darwin15-strip \ 60 | OUTPUT_FILE="dtm-darwin64" 61 | 
-------------------------------------------------------------------------------- /lib/math/gradient_projection.h: -------------------------------------------------------------------------------- 1 | #ifndef MATH_GRADIENTPROJECTION_INCLUDED 2 | #define MATH_GRADIENTPROJECTION_INCLUDED 3 | 4 | #define SAFETY_BOX 0.001 5 | #define GRADIENT_DESCENT_SLOWDOWN 1.0 6 | 7 | #include 8 | 9 | #include "gslwrap/vector_double.h" 10 | #include "gslwrap/matrix_double.h" 11 | 12 | namespace GradientProjection { 13 | /* 14 | * Returns true if the sum to less than one constraint is violated, 15 | * fills in with the active constraint matrix. Caller is responsible 16 | * for memory management of newly created matrix. 17 | */ 18 | bool createActiveConstraints(const gsl::vector& x, 19 | gsl::matrix& n, 20 | gsl::vector& g); 21 | 22 | void display(const gsl_vector* v, const char* name); 23 | 24 | void display(const gsl_matrix* m, const char* name); 25 | 26 | void createProjection(const gsl::matrix& activeConstraints, 27 | const gsl::vector& g, 28 | const gsl::vector& grad, 29 | gsl::matrix& projection, 30 | gsl::vector& direction, 31 | gsl::vector& correction); 32 | 33 | double updateState(gsl::vector& x, 34 | const double gamma, 35 | const gsl::vector grad, 36 | const double f); 37 | 38 | double descend(gsl::vector& x, 39 | gsl::vector& s, 40 | const double gamma, 41 | const double obj_value, 42 | const gsl::vector& correction, 43 | const gsl::vector& grad); 44 | 45 | } 46 | 47 | #endif 48 | -------------------------------------------------------------------------------- /lib/math/specialfunc.h: -------------------------------------------------------------------------------- 1 | #ifndef __MATH_SPECIALFUNC_H__ 2 | #define __MATH_SPECIALFUNC_H__ 3 | #include 4 | 5 | #ifndef M_PI 6 | #define M_PI 3.14159265358979323846 7 | #endif 8 | 9 | /** 10 | * Proc to calculate the value of the trigamma, the second 11 | * derivative of the loggamma function. Accepts positive matrices. 
12 | * From Abramowitz and Stegun. Uses formulas 6.4.11 and 6.4.12 with 13 | * recurrence formula 6.4.6. Each requires workspace at least 5 14 | * times the size of X. 15 | * 16 | **/ 17 | 18 | double trigamma(double x); 19 | 20 | 21 | /* 22 | * Taylor approximation of first derivative of the log gamma function 23 | * 24 | */ 25 | double digamma(double x); 26 | double InverseDigamma(double x); 27 | 28 | 29 | // lgamma.cpp -- log gamma function of real argument. 30 | // Algorithms and coefficient values from "Computation of Special 31 | // Functions", Zhang and Jin, John Wiley and Sons, 1996. 32 | // 33 | // (C) 2003, C. Bond. All rights reserved. 34 | // 35 | // Returns log(gamma) of real argument. 36 | // NOTE: Returns 1e308 if argument is 0 or negative. 37 | // 38 | double log_gamma(double x); 39 | 40 | double sigmoid(double x); 41 | 42 | // First derivative of sigmoid function. 43 | double dsigmoid(double x); 44 | 45 | // Second derivative of sigmoid function. 46 | double d2sigmoid(double x); 47 | 48 | // Log of the CDF of a Gaussian. 49 | double LogPGaussian(double x); 50 | 51 | // Log of the PDF of a Gaussian. 52 | double LogDGaussian(double x); 53 | 54 | // Computes the inverse of PGaussian. 55 | double InversePGaussian(double x); 56 | 57 | #endif 58 | -------------------------------------------------------------------------------- /gslwrap/include/gslwrap/matrix_vector_operators.h: -------------------------------------------------------------------------------- 1 | // This matrix class is a C++ wrapper for the GNU Scientific Library 2 | // Copyright (C) ULP-IPB Strasbourg 3 | 4 | // This program is free software; you can redistribute it and/or modify 5 | // it under the terms of the GNU General Public License as published by 6 | // the Free Software Foundation; either version 2 of the License, or 7 | // (at your option) any later version. 
8 | 9 | // This program is distributed in the hope that it will be useful, 10 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | // GNU General Public License for more details. 13 | 14 | // You should have received a copy of the GNU General Public License 15 | // along with this program; if not, write to the Free Software 16 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 17 | 18 | #ifndef __matrix_vector_operators_h 19 | #define __matrix_vector_operators_h 20 | 21 | #include "gsl/gsl_blas.h" 22 | #include 23 | #include 24 | #include 25 | 26 | namespace gsl 27 | { 28 | 29 | inline 30 | vector_float operator*(const matrix_float& m, const vector_float& v) 31 | { 32 | vector_float y(m.get_rows()); 33 | gsl_blas_sgemv(CblasNoTrans, 1.0, m.gslobj(), v.gslobj(), 0.0, y.gslobj()); 34 | return y; 35 | } 36 | 37 | inline 38 | vector operator*(const matrix& m, const vector& v) 39 | { 40 | vector y(m.get_rows()); 41 | gsl_blas_dgemv(CblasNoTrans, 1.0, m.gslobj(), v.gslobj(), 0.0, y.gslobj()); 42 | return y; 43 | } 44 | 45 | } 46 | 47 | #endif //__matrix_vector_operators_h 48 | -------------------------------------------------------------------------------- /doc/HOWTO: -------------------------------------------------------------------------------- 1 | # Para um tópico 2 | data0 = scan("topic-000-var-e-log-prob.dat") 3 | b0 = matrix(data0, ncol=10, byrow=TRUE) 4 | write.table(b0, file="dist-topic0.csv", sep=";") 5 | 6 | 7 | # Processa todos tópicos 8 | # Para cada tópico, gera um arquivo com a probabilidade de cada 9 | # termo para cada ano 10 | # TODO: rodar exp() nos valores 11 | topics = list() 12 | for (i in 0:9) { 13 | filename = paste("topic-00", i, sep = "") 14 | filename = paste(filename, "-var-e-log-prob.dat", sep = "") 15 | data = scan(filename) 16 | topic = matrix(data, ncol=9, byrow=TRUE) 17 | filename = paste("dist-topic", i, sep = "") 18 | filename = 
paste(filename, ".csv", sep = "") 19 | write.table(topic, file=filename, sep=";") 20 | } 21 | 22 | 23 | for (i in 10:49) { 24 | filename = paste("topic-0", i, sep = "") 25 | filename = paste(filename, "-var-e-log-prob.dat", sep = "") 26 | data = scan(filename) 27 | topic = matrix(data, ncol=9, byrow=TRUE) 28 | filename = paste("dist-topic", i, sep = "") 29 | filename = paste(filename, ".csv", sep = "") 30 | write.table(topic, file=filename, sep=";") 31 | } 32 | 33 | # - gam.dat: The gammas associated with each document. Divide these by 34 | # the sum for each document to get expected topic mixtures. 35 | # Proportion of topic 5 in document 3: 36 | # e.theta[3, 5] 37 | a = scan("gam.dat") 38 | b = matrix(a, ncol=50, byrow=TRUE) 39 | rs = rowSums(b) 40 | e.theta = b / rs 41 | write.table(e.theta, file="documents_topics.csv", sep=";") 42 | 43 | #Treinamento dos tópicos 44 | ./main \ 45 | --ntopics=25 \ 46 | --mode=fit \ 47 | --rng_seed=0 \ 48 | --initialize_lda=true \ 49 | --corpus_prefix=example/SBSC \ 50 | --outname=example/model_run \ 51 | --top_chain_var=0.005 \ 52 | --alpha=2.0 \ 53 | --lda_sequence_min_iter=10 \ 54 | --lda_sequence_max_iter=30 \ 55 | --lda_max_em_iter=10 -------------------------------------------------------------------------------- /lib/math/logspace_base.cpp: -------------------------------------------------------------------------------- 1 | #include "logspace_base.h" 2 | 3 | using namespace std; 4 | 5 | double safe_log(double x) { 6 | if (x <= 0) { 7 | return(-1e4); 8 | } else { 9 | return(log(x)); 10 | } 11 | } 12 | 13 | // Given log(a) and log(b), return log(a + b). 
14 | double log_sum(double log_a, double log_b) { 15 | double v; 16 | 17 | if (log_a == -std::numeric_limits::infinity() && 18 | log_b == log_a) { 19 | return -std::numeric_limits::infinity(); 20 | } else if (log_a < log_b) { 21 | v = log_b + log(1 + exp(log_a - log_b)); 22 | } else { 23 | v = log_a + log(1 + exp(log_b - log_a)); 24 | } 25 | return(v); 26 | } 27 | 28 | // Given log(a) and log(b), return log(a - b). 29 | double log_diff(double log_a, double log_b) { 30 | double val; 31 | double dangerous_part = exp(log_b - log_a); 32 | assert(dangerous_part < 1.0); 33 | val = log_a + log(1.0 - dangerous_part); 34 | return val; 35 | } 36 | 37 | /* 38 | * returns the element randomly sampled from the log 39 | * probabilities in array (number is the number of elements) 40 | */ 41 | int log_sample(double* vals, int length) { 42 | double normalizer = safe_log(0.0); 43 | int ii; 44 | for(ii=0; ii= cutoff) 53 | break; 54 | } 55 | assert(ii < length); 56 | return ii; 57 | } 58 | 59 | /* 60 | * A stupid "sampling" function for deterministic testing 61 | */ 62 | int sample_first_nonzero(double* vals, int length) { 63 | int ii; 64 | for(ii=0; ii < length - 1 && exp(vals[ii]) < 0.01; ++ii) { } 65 | return ii; 66 | } 67 | 68 | bool is_nan(double val) { 69 | return val != val; 70 | } 71 | -------------------------------------------------------------------------------- /lib/math/logspace_base.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Author: Jordan Boyd-Graber 3 | * Date: March 2008 4 | * 5 | * This file was spun off from logspace.h in order to create a 6 | * logspace file that wouldn't depend on gsl. 7 | */ 8 | #ifndef __MATH_LOGSPACE_BASE_H__ 9 | #define __MATH_LOGSPACE_BASE_H__ 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | using namespace std; 17 | 18 | #ifndef isnan 19 | # define isnan(x) \ 20 | (sizeof (x) == sizeof (long double) ? isnan_ld (x) \ 21 | : sizeof (x) == sizeof (double) ? 
isnan_d (x) \ 22 | : isnan_f (x)) 23 | static inline int isnan_f (float x) { return x != x; } 24 | static inline int isnan_d (double x) { return x != x; } 25 | static inline int isnan_ld (long double x) { return x != x; } 26 | #endif 27 | 28 | #ifndef isinf 29 | # define isinf(x) \ 30 | (sizeof (x) == sizeof (long double) ? isinf_ld (x) \ 31 | : sizeof (x) == sizeof (double) ? isinf_d (x) \ 32 | : isinf_f (x)) 33 | static inline int isinf_f (float x) { return isnan (x - x); } 34 | static inline int isinf_d (double x) { return isnan (x - x); } 35 | static inline int isinf_ld (long double x) { return isnan (x - x); } 36 | #endif 37 | 38 | double safe_log(double x); 39 | 40 | // Given log(a) and log(b), return log(a + b). 41 | double log_sum(double log_a, double log_b); 42 | 43 | // Given log(a) and log(b), return log(a - b). 44 | double log_diff(double log_a, double log_b); 45 | 46 | /* 47 | * returns the element randomly sampled from the log 48 | * probabilities in array (number is the number of elements) 49 | */ 50 | int log_sample(double* vals, int length); 51 | 52 | /* 53 | * Stupid "sampling" function for deterministic testing (i.e. in unit tests) 54 | */ 55 | int sample_first_nonzero(double* vals, int length); 56 | int sample_max(double* vals); 57 | 58 | bool is_nan(double val); 59 | 60 | #endif // __MATH_LOGSPACE_BASE_H__ 61 | -------------------------------------------------------------------------------- /gslwrap/include/gslwrap/random_number_distribution.h: -------------------------------------------------------------------------------- 1 | // This matrix class is a C++ wrapper for the GNU Scientific Library 2 | // Copyright (C) ULP-IPB Strasbourg 3 | 4 | // This program is free software; you can redistribute it and/or modify 5 | // it under the terms of the GNU General Public License as published by 6 | // the Free Software Foundation; either version 2 of the License, or 7 | // (at your option) any later version. 
8 | 9 | // This program is distributed in the hope that it will be useful, 10 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | // GNU General Public License for more details. 13 | 14 | // You should have received a copy of the GNU General Public License 15 | // along with this program; if not, write to the Free Software 16 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 17 | 18 | #ifndef __random_number_distribution_h 19 | #define __random_number_distribution_h 20 | 21 | #include "gslwrap/random_generator.h" 22 | #include "gsl/gsl_randist.h" 23 | 24 | namespace gsl 25 | { 26 | 27 | class random_number_distribution 28 | { 29 | public: 30 | random_number_distribution(const random_generator& _generator) : generator(_generator){;} 31 | 32 | //Methods: 33 | virtual double get()=0; 34 | virtual double pdf(const double& x)=0; 35 | virtual ~random_number_distribution() 36 | { 37 | ; 38 | } 39 | protected: 40 | random_generator generator; 41 | }; 42 | 43 | class gaussian_random : public random_number_distribution 44 | { 45 | public: 46 | gaussian_random(const random_generator& _generator, const double& _sigma=1.0) : random_number_distribution(_generator), sigma(_sigma){;} 47 | 48 | //methods: 49 | double get(){return gsl_ran_gaussian(generator.gslobj(), sigma);} 50 | double get(double _sigma){return gsl_ran_gaussian(generator.gslobj(), _sigma);} 51 | double pdf(const double& x){return gsl_ran_gaussian_pdf(x, sigma);} 52 | 53 | double ratio_method(){return gsl_ran_gaussian_ratio_method(generator.gslobj(), sigma);} 54 | protected: 55 | double sigma; 56 | }; 57 | 58 | } 59 | 60 | #endif //__random_number_distribution_h 61 | -------------------------------------------------------------------------------- /dtm/param.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include "util.h" 3 | 4 | int param_geti(const char 
*parameter_name, int default_value); 5 | 6 | double param_getf(const char *parameter_name, double default_value); 7 | 8 | char *param_getc(const char *parameter_name, char *default_value); 9 | 10 | char *param_gets(const char *parameter_name); 11 | 12 | int param_getb(const char *parameter_name, int default_value); 13 | /* Returns true if the value of parameter_name is 1, true, or yes, 14 | (case insensitive), false for any other value, and default_value 15 | for no value. */ 16 | 17 | int param_symvarie(const char *parameter_name, int *returned_value); 18 | /* Returns true if a value was found, false otherwise */ 19 | 20 | int param_symvarfe(const char *parameter_name, double *returned_value); 21 | /* Ditto */ 22 | 23 | int param_symvarce(const char *parameter_name, char *returned_value); 24 | /* Ditto. Note that the second argument is a "pointer to a char *", 25 | i.e., approximately a pointer to a string. */ 26 | 27 | void param_set(const char *parameter_name, char *new_value); 28 | /* Changes the value of the ddinf parameter named by parameter_name. This can be 29 | used to communicate with subroutines which expect ddinf 30 | parameters without having to make sure they exist in the ddinf file. 31 | Note, however, that values assigned in the ddinf file are 32 | OVERRIDDEN by a call to param_set. */ 33 | /* One might want to implement a param_add which would allow adding 34 | new ddinf parameters within a program, but which could not 35 | override values from the ddinf file. 
*/ 36 | 37 | /* if the following isn't called, param.c looks for a %restart 38 | binding in the param file */ 39 | void param_set_restart_file(const char *restart_name_p); 40 | 41 | /* The following three calls write values to the restart file: */ 42 | void param_puti(const char *parameter_name, int value); 43 | 44 | void param_putf(const char *parameter_name, double value); 45 | 46 | void param_putc(const char *parameter_name, char *value); 47 | 48 | 49 | int param_checkpointed(void); 50 | /* If there is a restart file, reads it in and returns TRUE. Otherwise 51 | returns false. */ 52 | 53 | void param_checkpoint(void); 54 | /* Commits all of the param_put calls so far, and starts a new 55 | checkpoint. (I.e., subsequent `param_put's supersede earlier ones.) */ 56 | 57 | 58 | void param_dump (FILE *stream); 59 | /* Writes the current ddinf bindings to a stream */ 60 | 61 | void param_push_prefix (const char *hot_prefix); 62 | /* Push the current prefix to be applied to all ddnames */ 63 | 64 | void param_pop_prefix (void); 65 | /* Pop the current prefix */ 66 | 67 | int param_push_file (const char *fn); 68 | /* Use the file fn for all bindings */ 69 | 70 | char *param_pop_file (void); 71 | /* Pop current bindings */ 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /gslwrap/include/gslwrap/random_generator.h: -------------------------------------------------------------------------------- 1 | // This random generator is a C++ wrapper for the GNU Scientific Library 2 | // Copyright (C) 2001 Torbjorn Vik 3 | 4 | // This program is free software; you can redistribute it and/or modify 5 | // it under the terms of the GNU General Public License as published by 6 | // the Free Software Foundation; either version 2 of the License, or 7 | // (at your option) any later version. 
8 | 9 | // This program is distributed in the hope that it will be useful, 10 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | // GNU General Public License for more details. 13 | 14 | // You should have received a copy of the GNU General Public License 15 | // along with this program; if not, write to the Free Software 16 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 17 | #ifndef __random_generator_h 18 | #define __random_generator_h 19 | 20 | #include "gsl/gsl_rng.h" 21 | #include 22 | 23 | namespace gsl 24 | { 25 | #ifndef __HP_aCC 26 | using std::string; 27 | #endif 28 | 29 | //class RandomNumberGenerator 30 | class random_generator 31 | { 32 | private: 33 | gsl_rng* generator; 34 | public: 35 | // Construction and Initializing: 36 | //! Default args reads environment variable GSL_RNG_TYPE and GSL_RNG_SEED to initialize. If these are not set the generator gsl_rng_mt19937 will be used with seed 0. 
37 | random_generator (const random_generator& other) : generator(NULL) {generator = gsl_rng_clone(other.generator);} 38 | random_generator (const gsl_rng_type* type=NULL, unsigned long int seed=0) : generator(NULL) 39 | { 40 | gsl_rng_env_setup(); 41 | if (!type) 42 | { 43 | generator = gsl_rng_alloc (gsl_rng_default); 44 | } 45 | else 46 | { 47 | generator = gsl_rng_alloc (type) ; 48 | if (seed) 49 | gsl_rng_set(generator, seed); 50 | } 51 | } 52 | ~random_generator () {gsl_rng_free(generator);} 53 | random_generator& operator=(const random_generator& other){if (generator) gsl_rng_free(generator); generator = gsl_rng_clone(other.generator);return *this;} 54 | void set(unsigned long int seed){gsl_rng_set(generator, seed);} 55 | 56 | // Sampling: 57 | unsigned long int get(unsigned long int n=0) {if (n) return gsl_rng_uniform_int(generator, n); else return gsl_rng_get(generator);} // returns value in range [min, max] 58 | double uniform() { return gsl_rng_uniform(generator);} // returns value in range [0, 1) 59 | double uniform_positive() { return gsl_rng_uniform_pos(generator);}// returns value in range (0, 1) 60 | unsigned long int uniform_int(unsigned long int n) 61 | { return gsl_rng_uniform_int(generator, n);}// returns value in range [0, n-1] 62 | 63 | // Information: 64 | string name(){return gsl_rng_name(generator);} 65 | unsigned long int max(){return gsl_rng_max(generator);} 66 | unsigned long int min(){return gsl_rng_min(generator);} 67 | 68 | // For calling gsl functions directly 69 | gsl_rng* gslobj() { return generator;} 70 | const gsl_rng* gslobj() const { return generator;} 71 | // static void Test(); 72 | }; 73 | 74 | } 75 | 76 | #endif //__random_generator_h 77 | -------------------------------------------------------------------------------- /dtm/gsl-wrappers.h: -------------------------------------------------------------------------------- 1 | #ifndef GSL_WRAPPERS_H 2 | #define GSL_WRAPPERS_H 3 | 4 | // #include 5 | #include 6 | #include 7 | 
#include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #define outlog(format, args...) \ 22 | fprintf(stderr, format, args); \ 23 | fprintf(stderr, "\n"); 24 | 25 | double safe_log(double); 26 | double log_sum(double, double); 27 | 28 | static inline double vget(const gsl_vector* v, int i) 29 | { return(gsl_vector_get(v, i)); }; 30 | 31 | static inline void vset(gsl_vector* v, int i, double x) 32 | { gsl_vector_set(v, i, x); }; 33 | 34 | // Increment a vector element by a double. 35 | void vinc(gsl_vector*, int, double); 36 | 37 | static inline double mget(const gsl_matrix* m, int i, int j) 38 | { return(gsl_matrix_get(m, i, j)); }; 39 | 40 | static inline void mset(gsl_matrix* m, int i, int j, double x) 41 | { gsl_matrix_set(m, i, j, x); }; 42 | 43 | void msetcol(gsl_matrix* m, int r, const gsl_vector* val); 44 | 45 | // Increment a matrix element by a double. 
46 | void minc(gsl_matrix*, int, int, double); 47 | void msetrow(gsl_matrix*, int, const gsl_vector*); 48 | 49 | void col_sum(gsl_matrix*, gsl_vector*); 50 | 51 | void vct_printf(const gsl_vector* v); 52 | void mtx_printf(const gsl_matrix* m); 53 | void vct_fscanf(const char*, gsl_vector* v); 54 | void mtx_fscanf(const char*, gsl_matrix* m); 55 | void vct_fprintf(const char* filename, gsl_vector* v); 56 | void mtx_fprintf(const char* filename, const gsl_matrix* m); 57 | 58 | double log_det(gsl_matrix*); 59 | 60 | void matrix_inverse(gsl_matrix*, gsl_matrix*); 61 | 62 | void sym_eigen(gsl_matrix*, gsl_vector*, gsl_matrix*); 63 | 64 | double sum(const gsl_vector* v); 65 | 66 | double norm(gsl_vector * v); 67 | 68 | void vct_log(gsl_vector* v); 69 | void vct_exp(gsl_vector* x); 70 | 71 | void choose_k_from_n(int k, int n, int* result); 72 | 73 | void log_normalize(gsl_vector* x); 74 | void normalize(gsl_vector* x); 75 | 76 | void optimize(int dim, 77 | gsl_vector* x, 78 | void* params, 79 | void (*fdf)(const gsl_vector*, void*, double*, gsl_vector*), 80 | void (*df)(const gsl_vector*, void*, gsl_vector*), 81 | double (*f)(const gsl_vector*, void*)); 82 | 83 | void optimize_fdf(int dim, 84 | gsl_vector* x, 85 | void* params, 86 | void (*fdf)(const gsl_vector*, void*, double*, gsl_vector*), 87 | void (*df)(const gsl_vector*, void*, gsl_vector*), 88 | double (*f)(const gsl_vector*, void*), 89 | double* f_val, 90 | double* conv_val, 91 | int* niter); 92 | 93 | void log_write(FILE* f, char* string); 94 | int directory_exist(const char *dname); 95 | void make_directory(char* name); 96 | 97 | gsl_rng* new_random_number_generator(); 98 | 99 | #endif 100 | -------------------------------------------------------------------------------- /lib/math/optimizer.h: -------------------------------------------------------------------------------- 1 | #ifndef __LIB_MATH_OPTIMIZER__ 2 | #define __LIB_MATH_OPTIMIZER__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 
"math/gsl_vector.h" 8 | #include "util/flags.h" 9 | 10 | using std::cout; 11 | using std::endl; 12 | 13 | DEFINE_double(multimin_convergence_threshold, 14 | 1e-5, 15 | "Convergence threshold for conjugate gradient."); 16 | DEFINE_size(max_multimin_iterations, 17 | 40, 18 | "Maximum number of conjugate gradient iterations to perform."); 19 | 20 | class Optimizer { 21 | public: 22 | Optimizer(size_t size) : size_(size) { 23 | } 24 | 25 | void Optimize() { 26 | gsl_multimin_function_fdf my_func; 27 | my_func.n = size_; 28 | my_func.f = &MultiminObjectiveWrapper; 29 | my_func.df = &MultiminGradientWrapper; 30 | my_func.fdf = &MultiminObjectiveGradientWrapper; 31 | my_func.params = this; 32 | 33 | gsl_multimin_fdfminimizer* s = 34 | gsl_multimin_fdfminimizer_alloc(gsl_multimin_fdfminimizer_conjugate_fr, size_); 35 | GslVector initial_guess(size_); 36 | MultiminInitialGuess(initial_guess.mutable_ptr()); 37 | 38 | // step_size, tol 39 | // gsl_multimin_fdfminimizer_set(s, &my_func, initial_guess.ptr(), 0.1, 1.0); 40 | gsl_multimin_fdfminimizer_set(s, &my_func, initial_guess.ptr(), 0.01, 0.01); 41 | 42 | size_t iter = 0; 43 | int status; 44 | 45 | double value = std::numeric_limits::infinity(); 46 | double prev_value; 47 | do { 48 | prev_value = value; 49 | iter++; 50 | status = gsl_multimin_fdfminimizer_iterate(s); 51 | if (status) { 52 | cout << "Error: " << gsl_strerror(status) << endl; 53 | break; 54 | } 55 | status = gsl_multimin_test_gradient(s->gradient, 1e-3); 56 | if (status == GSL_SUCCESS) { 57 | cout << "Minimum found." 
<< endl; 58 | } 59 | value = s->f; 60 | cout << "Iteration: " << iter << " Value: " << 61 | value << " dValue:" << (prev_value - value)/fabs(value) << " " << 62 | gsl_strerror(status) << endl; 63 | } while (status == GSL_CONTINUE && 64 | iter < FLAGS_max_multimin_iterations && 65 | (prev_value - value) / fabs(value) > FLAGS_multimin_convergence_threshold); 66 | MultiminResult(s->x); 67 | gsl_multimin_fdfminimizer_free(s); 68 | } 69 | 70 | virtual void MultiminObjectiveGradient(const gsl_vector* x, 71 | double* objective, 72 | gsl_vector* gradient) = 0; 73 | 74 | virtual void MultiminInitialGuess(gsl_vector* v) = 0; 75 | 76 | virtual void MultiminResult(gsl_vector* x) = 0; 77 | 78 | virtual ~Optimizer() { } 79 | protected: 80 | static double MultiminObjectiveWrapper(const gsl_vector* x, void* params) { 81 | double objective; 82 | reinterpret_cast(params)->MultiminObjectiveGradient(x, &objective, NULL); 83 | return objective; 84 | } 85 | 86 | static void MultiminGradientWrapper(const gsl_vector* x, void* params, gsl_vector* g) { 87 | reinterpret_cast(params)->MultiminObjectiveGradient(x, NULL, g); 88 | } 89 | 90 | static void MultiminObjectiveGradientWrapper(const gsl_vector* x, 91 | void* params, 92 | double* f, 93 | gsl_vector* g) { 94 | reinterpret_cast(params)->MultiminObjectiveGradient(x, f, g); 95 | } 96 | 97 | private: 98 | size_t size_; 99 | }; 100 | #endif // __LIB_MATH_OPTIMIZER__ 101 | -------------------------------------------------------------------------------- /lib/math/logspace.cpp: -------------------------------------------------------------------------------- 1 | #include "logspace.h" 2 | 3 | double log_dirichlet_likelihood(const double sum, 4 | const double prior_sum, 5 | const std::vector& counts, 6 | bool debug) { 7 | double val = 0.0; 8 | int length = counts.size(); 9 | 10 | double prior_value = prior_sum / (double)length; 11 | val += gsl_sf_lngamma(prior_sum); 12 | val -= (double)length * gsl_sf_lngamma(prior_value); 13 | 14 | if(debug) cout 
<< "Likelihood (" << sum << "," << prior_sum << "," << 15 | prior_value << "," << length << ") = " << val << endl; 16 | 17 | for(int ii = 0; ii < length; ++ii) { 18 | 19 | if(debug) cout << "\tGAMMA(" << prior_value << " + " << 20 | (double)counts[ii] << " = " << prior_value + 21 | (double)counts[ii] << ") -> " << val << endl; 22 | val += gsl_sf_lngamma(prior_value + (double)counts[ii]); 23 | } 24 | val -= gsl_sf_lngamma(prior_sum + sum); 25 | 26 | if(debug) cout << endl; 27 | 28 | return val; 29 | } 30 | 31 | double log_dirichlet_likelihood(const double sum, 32 | const double prior_scale, 33 | const gsl_vector* prior, 34 | const std::vector& counts) { 35 | double val = 0.0; 36 | int length = counts.size(); 37 | 38 | val += gsl_sf_lngamma(prior_scale); 39 | for(int ii=0; ii < length; ++ii) { 40 | double prior_value = gsl_vector_get(prior, ii); 41 | val -= gsl_sf_lngamma(prior_value); 42 | val += gsl_sf_lngamma(prior_value + (double)counts[ii]); 43 | } 44 | val -= gsl_sf_lngamma(prior_scale + sum); 45 | 46 | return val; 47 | 48 | } 49 | 50 | double log_dot_product(const gsl_vector* log_a, const gsl_vector* log_b) { 51 | double sum = gsl_vector_get(log_a, 0) + gsl_vector_get(log_b, 0); 52 | assert(log_a->size == log_b->size); 53 | for (unsigned int ii = 1; ii < log_a->size; ++ii) { 54 | sum = log_sum(sum, gsl_vector_get(log_a, ii) + 55 | gsl_vector_get(log_b, ii)); 56 | } 57 | return sum; 58 | } 59 | 60 | double log_sum(const gsl_vector* x) { 61 | double sum = gsl_vector_get(x, 0); 62 | 63 | for (unsigned int ii = 1; ii < x->size; ii++) { 64 | sum = log_sum(sum, gsl_vector_get(x, ii)); 65 | } 66 | return sum; 67 | } 68 | 69 | // Given a log vector, log a_i, compute log sum a_i. Returns the sum. 
70 | double log_normalize(gsl_vector* x) { 71 | double sum = gsl_vector_get(x, 0); 72 | unsigned int i; 73 | 74 | for (i = 1; i < x->size; i++) { 75 | sum = log_sum(sum, gsl_vector_get(x, i)); 76 | } 77 | 78 | for (i = 0; i < x->size; i++) { 79 | double val = gsl_vector_get(x, i); 80 | gsl_vector_set(x, i, val - sum); 81 | } 82 | return sum; 83 | } 84 | 85 | // Given a log matrix, log a_i, compute log sum a_i. Returns the sum. 86 | double log_normalize_matrix(gsl_matrix* x) { 87 | double sum = gsl_matrix_get(x, 0, 0); 88 | 89 | for (size_t ii = 0; ii < x->size1; ++ii) { 90 | for (size_t jj = 0; jj < x->size2; ++jj) { 91 | if (ii == 0 && jj == 0) { 92 | continue; 93 | } 94 | sum = log_sum(sum, gsl_matrix_get(x, ii, jj)); 95 | } 96 | } 97 | 98 | for (size_t ii = 0; ii < x->size1; ++ii) { 99 | for (size_t jj = 0; jj < x->size2; ++jj) { 100 | double val = gsl_matrix_get(x, ii, jj); 101 | gsl_matrix_set(x, ii, jj, val - sum); 102 | } 103 | } 104 | return sum; 105 | } 106 | -------------------------------------------------------------------------------- /dtm/ss-lm.h: -------------------------------------------------------------------------------- 1 | // Authors: David Blei (blei@cs.princeton.edu) 2 | // Sean Gerrish (sgerrish@cs.princeton.edu) 3 | // 4 | // Copyright 2011 Sean Gerrish and David Blei 5 | // All Rights Reserved. 6 | // 7 | // See the README for this package for details about modifying or 8 | // distributing this software. 
9 | 10 | /* 11 | * state space language model variational inference 12 | * 13 | */ 14 | 15 | #ifndef SSLM_H 16 | #define SSLM_H 17 | 18 | #include "gsl-wrappers.h" 19 | #include "params.h" 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | #include "data.h" 26 | 27 | #define SSLM_MAX_ITER 2 // maximum number of optimization iters 28 | #define SSLM_FIT_THRESHOLD 1e-6 // convergence criterion for fitting sslm 29 | #define INIT_MULT 1000 // multiplier to variance for first obs 30 | // #define OBS_NORM_CUTOFF 10 // norm cutoff after which we use all 0 obs 31 | //#define OBS_NORM_CUTOFF 8 // norm cutoff after which we use all 0 obs 32 | #define OBS_NORM_CUTOFF 2 // norm cutoff after which we use all 0 obs 33 | 34 | /* 35 | * functions for variational inference 36 | * 37 | */ 38 | 39 | // allocate new state space language model variational posterior 40 | sslm_var* sslm_var_alloc(int W, int T); 41 | 42 | // allocate extra parameters for inference 43 | void sslm_inference_alloc(sslm_var* var); 44 | 45 | // free extra parameters for inference 46 | void sslm_inference_free(sslm_var* var); 47 | 48 | // initialize with zero observations 49 | void sslm_zero_init(sslm_var* var, 50 | double obs_variance, 51 | double chain_variance); 52 | 53 | // initialize with counts 54 | void sslm_counts_init(sslm_var* var, 55 | double obs_variance, 56 | double chain_variance, 57 | const gsl_vector* counts); 58 | 59 | // initialize from variational observations 60 | void sslm_obs_file_init(sslm_var* var, 61 | double obs_variance, 62 | double chain_variance, 63 | const char* filename); 64 | 65 | 66 | // compute E[\beta_{w,t}] for t = 1:T 67 | void compute_post_mean(int w, sslm_var* var, double chain_variance); 68 | 69 | // compute Var[\beta_{w,t}] for t = 1:T 70 | void compute_post_variance(int w, sslm_var* var, double chain_variance); 71 | 72 | // optimize \hat{beta} 73 | void optimize_var_obs(sslm_var* var); 74 | 75 | // compute dE[\beta_{w,t}]/d\obs_{w,s} for t = 1:T 76 | void 
compute_mean_deriv(int word, int time, sslm_var* var, 77 | gsl_vector* deriv); 78 | 79 | // compute d bound/d obs_{w, t} for t=1:T. 80 | void compute_obs_deriv(int word, gsl_vector* word_counts, 81 | gsl_vector* total_counts, sslm_var* var, 82 | gsl_matrix* mean_deriv_mtx, gsl_vector* deriv); 83 | 84 | // update observations 85 | void update_obs(gsl_matrix* word_counts, gsl_vector* totals, 86 | sslm_var* var); 87 | 88 | // log probability bound 89 | double compute_bound(gsl_matrix* word_counts, gsl_vector* totals, 90 | sslm_var* var); 91 | 92 | 93 | // fit variational distribution 94 | double fit_sslm(sslm_var* var, gsl_matrix* word_counts); 95 | 96 | // read and write variational distribution 97 | void write_sslm_var(sslm_var* var, char* out); 98 | sslm_var* read_sslm_var(char* in); 99 | 100 | void compute_expected_log_prob(sslm_var* var); 101 | // !!! old function (from doc mixture...) 102 | double expected_log_prob(int w, int t, sslm_var* var); 103 | 104 | // update zeta 105 | void update_zeta(sslm_var* var); 106 | 107 | #endif 108 | -------------------------------------------------------------------------------- /dtm/lda-seq.h: -------------------------------------------------------------------------------- 1 | // Authors: David Blei (blei@cs.princeton.edu) 2 | // Sean Gerrish (sgerrish@cs.princeton.edu) 3 | // 4 | // Copyright 2011 Sean Gerrish and David Blei 5 | // All Rights Reserved. 6 | // 7 | // See the README for this package for details about modifying or 8 | // distributing this software. 
9 | 10 | #ifndef LDASEQ_H 11 | #define LDASEQ_H 12 | 13 | #include 14 | #include 15 | 16 | #include "gsl-wrappers.h" 17 | #include "lda.h" 18 | 19 | #define LDA_SEQ_EM_THRESH 1e-4 20 | #define SAVE_LAG 10 21 | 22 | /* 23 | * an lda sequence is a collection of simplex sequences for K topics 24 | * and an alpha vector 25 | * 26 | */ 27 | 28 | #include 29 | #include 30 | #include 31 | #include 32 | 33 | #include "param.h" 34 | #include "ss-lm.h" 35 | #include "data.h" 36 | #include "lda.h" 37 | 38 | #define LDA_SEQ_EM_THRESHOLD 1e-5; 39 | 40 | // lda sequence variational posterior distribution 41 | 42 | 43 | // === allocation and initialization === 44 | 45 | inf_var* inf_var_alloc(int number_topics, 46 | corpus_seq_t* corpus_seq); 47 | void inf_var_free(inf_var* ptr); 48 | 49 | // initialize lda sequence from lda model topics 50 | 51 | void init_lda_seq_from_ss(lda_seq* model, 52 | double topic_chain_variance, 53 | double topic_obs_variance, 54 | double alpha, 55 | gsl_matrix* init_suffstats); 56 | 57 | // === fitting === 58 | 59 | 60 | // infer a corpus with an lda-seq 61 | 62 | double update_inf_var(lda_seq* seq, 63 | const corpus_seq_t* data, 64 | gsl_matrix** phi, 65 | size_t t, 66 | const char* root); 67 | double update_inf_var_multiple(lda_seq* seq, 68 | const corpus_seq_t* data, 69 | gsl_matrix** phi, 70 | size_t t, 71 | const char* root); 72 | void update_inf_reg(lda_seq* seq, 73 | const corpus_seq_t* data, 74 | gsl_matrix** phi, 75 | size_t t, 76 | const char* root); 77 | 78 | double lda_seq_infer(lda_seq* model, 79 | const corpus_seq_t* data, 80 | gsl_matrix** suffstats, 81 | gsl_matrix* gammas, 82 | gsl_matrix* lhoods, 83 | int iter, 84 | const char* file_root); 85 | 86 | // fit lda sequence from sufficient statistics 87 | 88 | double fit_lda_seq(lda_seq* m, 89 | const corpus_seq_t* data, 90 | const corpus_seq_t* heldout, 91 | const char* file_root); 92 | 93 | void update_lda_seq_ss(int time, 94 | const doc_t* doc, 95 | const lda_post* post, 96 | 
gsl_matrix** ss); 97 | 98 | double fit_lda_seq_topics(lda_seq* model, 99 | gsl_matrix** ss); 100 | 101 | 102 | // === reading and writing === 103 | 104 | 105 | // read and write a lda sequence 106 | 107 | void write_lda_seq(const lda_seq* m, const char* root); 108 | 109 | lda_seq* read_lda_seq(const char* root, corpus_seq_t* data); 110 | 111 | // write lda sequence sufficient statistics 112 | 113 | void write_lda_seq_suffstats(lda_seq* m, 114 | gsl_matrix** topic_ss, 115 | const char* root); 116 | 117 | // new lda sequence 118 | 119 | lda_seq* new_lda_seq(corpus_seq_t* data, 120 | int W, 121 | int T, 122 | int K); 123 | 124 | void make_lda_from_seq_slice(lda* lda_m, 125 | lda_seq* lda_seq_m, 126 | int time); 127 | 128 | #endif 129 | -------------------------------------------------------------------------------- /dtm/sample.sh: -------------------------------------------------------------------------------- 1 | This file provides information about running the Dynamic Topic Model 2 | or the Document Influence Model. It gives two command-line examples 3 | for running the software and several example commands in R for reading 4 | output files. 5 | 6 | Dynamic topic models and the influence model have been implemented 7 | here in c / c++. This implementation takes two input files: 8 | 9 | (a) foo-mult.dat, which is one-doc-per-line, each line of the form 10 | 11 | unique_word_count index1:count1 index2:count2 ... indexn:countn 12 | 13 | where each index is an integer corresponding to a unique word. 14 | 15 | (b) foo-seq.dat, which is of the form 16 | 17 | 18 | Number_Timestamps 19 | number_docs_time_1 20 | ... 21 | number_docs_time_i 22 | ... 23 | number_docs_time_NumberTimestamps 24 | 25 | - The docs in foo-mult.dat should be ordered by date, with the first 26 | docs from time1, the next from time2, ..., and the last docs from 27 | timen.
28 | 29 | When working with data like this, I've found it helpful to create 30 | the following files: 31 | - the mult.dat file (described in (a) above) 32 | - the seq.dat file (described in (b) above) 33 | - a file with all of the words in the vocabulary, arranged in 34 | the same order as the word indices 35 | - a file with information on each of the documents, arranged in 36 | the same order as the docs in the mult file. 37 | 38 | The code creates at least the following files: 39 | 40 | - topic-???-var-e-log-prob.dat: the e-betas (word distributions) for 41 | topic ??? for all times. This is in row-major form, i.e.: 42 | 43 | > a = scan("topic-002-var-e-log-prob.dat") 44 | > b = matrix(a, ncol=10, byrow=TRUE) 45 | 46 | # The probability of term 100 in topic 2 at time 3: 47 | exp(b[100, 3]) 48 | 49 | - gam.dat: The gammas associated with each document. Divide these by 50 | the sum for each document to get expected topic mixtures. 51 | 52 | > a = scan("gam.dat") 53 | > b = matrix(a, ncol=10, byrow=TRUE) 54 | > rs = rowSums(b) 55 | > e.theta = b / rs 56 | # Proportion of topic 5 in document 3: 57 | e.theta[3, 5] 58 | 59 | If you are running this software in "dim" mode to find document 60 | influence, it will also create the following files: 61 | 62 | - influence_time-??? : the influence of documents at time ??? for 63 | each topic, where time is based on your -seq.dat file and the 64 | document index is given by the ordering of documents in the mult 65 | file. 66 | 67 | For example, in R: 68 | > a = scan("influence_time-010") 69 | > b = matrix(a, ncol=10, byrow=TRUE) 70 | # The influence of the 2nd document on topic 5: 71 | > b[2, 5] 72 | 73 | # Here are some example commands: 74 | # Run the dynamic topic model.
75 | ./main \ 76 | --ntopics=20 \ 77 | --mode=fit \ 78 | --rng_seed=0 \ 79 | --initialize_lda=true \ 80 | --corpus_prefix=example/test \ 81 | --outname=example/model_run \ 82 | --top_chain_var=0.005 \ 83 | --alpha=0.01 \ 84 | --lda_sequence_min_iter=6 \ 85 | --lda_sequence_max_iter=20 \ 86 | --lda_max_em_iter=10 87 | 88 | # Run the influence model. 89 | ./main \ 90 | --mode=fit \ 91 | --rng_seed=0 \ 92 | --model=fixed \ 93 | --initialize_lda=true \ 94 | --corpus_prefix=example/test \ 95 | --outname=example/output \ 96 | --time_resolution=2 \ 97 | --influence_flat_years=5 \ 98 | --top_obs_var=0.5 \ 99 | --top_chain_var=0.005 \ 100 | --sigma_d=0.0001 \ 101 | --sigma_l=0.0001 \ 102 | --alpha=0.01 \ 103 | --lda_sequence_min_iter=6 \ 104 | --lda_sequence_max_iter=20 \ 105 | --save_time=-1 \ 106 | --ntopics=10 \ 107 | --lda_max_em_iter=10 108 | 109 | -------------------------------------------------------------------------------- /dtm/params.c: -------------------------------------------------------------------------------- 1 | // Author: David Blei (blei@cs.princeton.edu) 2 | // 3 | // Copyright 2006 David Blei 4 | // All Rights Reserved. 5 | // 6 | // See the README for this package for details about modifying or 7 | // distributing this software. 
8 | 9 | #include "params.h" 10 | 11 | /* 12 | * check label 13 | * 14 | */ 15 | 16 | void check_label(FILE* f, char* name) 17 | { 18 | char label[400]; 19 | fscanf(f, "%s", label); 20 | assert(strcmp(label, name) == 0); 21 | } 22 | 23 | 24 | /* 25 | * read and write strings 26 | * 27 | */ 28 | 29 | void params_read_string(FILE* f, char* name, char* x) 30 | { 31 | check_label(f, name); 32 | fscanf(f, "%s", x); 33 | outlog("%-10s READ NAME=%-10s STRING=%s", "[PARAMS]", name, x); 34 | } 35 | 36 | /* 37 | * read and write integers 38 | * 39 | */ 40 | 41 | void params_read_int(FILE* f, char* name, int* x) 42 | { 43 | check_label(f, name); 44 | assert(fscanf(f, "%d", x) > 0); 45 | outlog("%-10s READ NAME=%-10s INT=%d", "[PARAMS]", name, *x); 46 | } 47 | 48 | void params_write_int(FILE* f, char* name, int x) 49 | { 50 | fprintf(f, "%s %d\n", name, x); 51 | } 52 | 53 | 54 | /* 55 | * read and write doubles 56 | * 57 | */ 58 | 59 | void params_read_double(FILE* f, char* name, double* x) 60 | { 61 | check_label(f, name); 62 | assert(fscanf(f, "%lf", x) > 0); 63 | outlog("%-10s READ NAME=%-10s DBL=%1.14e", "[PARAMS]", name, *x); 64 | } 65 | 66 | void params_write_double(FILE* f, char* name, double x) 67 | { 68 | fprintf(f, "%s %17.14f\n", name, x); 69 | } 70 | 71 | 72 | /* 73 | * read and write gsl vectors and matrices. 
74 | * 75 | */ 76 | 77 | void params_read_gsl_vector(FILE* f, char* name, gsl_vector** x) 78 | { 79 | int size, i; 80 | double val; 81 | 82 | check_label(f, name); 83 | assert(fscanf(f, "%d", &size) > 0); 84 | *x = gsl_vector_calloc(size); 85 | for (i = 0; i < size; i++) 86 | { 87 | assert(fscanf(f, "%lf", &val) > 0); 88 | gsl_vector_set(*x, i, val); 89 | } 90 | } 91 | 92 | 93 | void params_write_gsl_vector(FILE* f, char* name, gsl_vector* x) 94 | { 95 | fprintf(f, "%s %d", name, (int) x->size); 96 | int i; 97 | for (i = 0; i < x->size; i++) 98 | fprintf(f, " %17.14f", gsl_vector_get(x, i)); 99 | fprintf(f, "\n"); 100 | } 101 | 102 | 103 | //void params_write_doc_ 104 | 105 | void params_write_gsl_vector_multiline(FILE* f, char* name, gsl_vector* x) 106 | { 107 | fprintf(f, "%s %d\n", name, (int) x->size); 108 | int i; 109 | if (x->size) { 110 | fprintf(f, "%17.14f", gsl_vector_get(x, 0)); 111 | } 112 | for (i = 1; i < x->size; i++) 113 | fprintf(f, ",%17.14f", gsl_vector_get(x, i)); 114 | fprintf(f, "\n"); 115 | } 116 | 117 | 118 | void params_write_gsl_matrix(FILE* f, char* name, gsl_matrix* x) 119 | { 120 | fprintf(f, "%s %ld %ld\n", name, x->size1, x->size2); 121 | int i, j; 122 | if (x->size1 == 0) { 123 | return; 124 | } 125 | for (i = 0; i < x->size1; i++) { 126 | fprintf(f, "%17.14f", gsl_matrix_get(x, i, 0)); 127 | for (j = 1; j < x->size2; j++) { 128 | fprintf(f, ",%17.14f", gsl_matrix_get(x, i, j)); 129 | } 130 | fprintf(f, "\n"); 131 | } 132 | } 133 | 134 | void params_write_sparse_gsl_matrix(FILE* f, char* name, gsl_matrix* x) 135 | { 136 | fprintf(f, "%s %ld %ld\n", name, x->size1, x->size2); 137 | int i, j; 138 | if (x->size1 == 0) { 139 | return; 140 | } 141 | for (i = 0; i < x->size1; i++) { 142 | for (j = 0; j < x->size2; j++) { 143 | // outlog("%d %d %d %d", i, j, x->size1, x->size2); 144 | double value = gsl_matrix_get(x, i, j); 145 | if (fabs(value) > 1e-12) { 146 | fprintf(f, "%d,%d,%17.14f\n", i, j, value); 147 | } 148 | } 149 | } 150 | } 151 
| -------------------------------------------------------------------------------- /gslwrap/include/gslwrap/histogram.h: -------------------------------------------------------------------------------- 1 | // This random generator is a C++ wrapper for the GNU Scientific Library 2 | // Copyright (C) 2001 Torbjorn Vik 3 | 4 | // This program is free software; you can redistribute it and/or modify 5 | // it under the terms of the GNU General Public License as published by 6 | // the Free Software Foundation; either version 2 of the License, or 7 | // (at your option) any later version. 8 | 9 | // This program is distributed in the hope that it will be useful, 10 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | // GNU General Public License for more details. 13 | 14 | // You should have received a copy of the GNU General Public License 15 | // along with this program; if not, write to the Free Software 16 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 17 | #ifndef __histogram_h 18 | #define __histogram_h 19 | 20 | #include 21 | #include 22 | #include 23 | 24 | namespace gsl{ 25 | #ifndef __HP_aCC 26 | using std::string; 27 | using std::runtime_error; 28 | #endif 29 | 30 | //! Encapsulates the histogram object of gsl. Only uniformly spaced bins yet. 
31 | class histogram 32 | { 33 | public: 34 | histogram(int nBins, double xmin, double xmax) 35 | { 36 | h=gsl_histogram_calloc(nBins); 37 | if (!h) 38 | { 39 | throw runtime_error("Couldn't allocate memory for histogram"); 40 | } 41 | gsl_histogram_set_ranges_uniform(h, xmin, xmax); 42 | } 43 | ~histogram(){gsl_histogram_free(h);} 44 | 45 | //@{ Updating and Accessing Methods 46 | int increment(double x){return gsl_histogram_increment(h, x);} 47 | int accumulate(double x, double weight){return gsl_histogram_accumulate(h, x, weight);} 48 | double get(int i) const {return gsl_histogram_get(h, i);} 49 | double& operator[](const uint & i) 50 | { 51 | const uint n = h->n; 52 | 53 | if (i >= n) 54 | { 55 | throw runtime_error("index lies outside valid range of 0 .. n - 1"); 56 | // GSL_ERROR_VAL ("index lies outside valid range of 0 .. n - 1", GSL_EDOM, 0); 57 | } 58 | 59 | return h->bin[i]; 60 | } 61 | const double& operator[](const uint & i) const //{return (*this)[i];/*gsl_histogram_get(h, i);*/} 62 | { 63 | const uint n = h->n; 64 | 65 | if (i >= n) 66 | { 67 | throw runtime_error("index lies outside valid range of 0 .. n - 1"); 68 | // GSL_ERROR_VAL ("index lies outside valid range of 0 .. n - 1", GSL_EDOM, 0); 69 | } 70 | 71 | return h->bin[i]; 72 | } 73 | 74 | void get_range(int i, double& xmin, double& xmax) const {gsl_histogram_get_range(h, i, &xmin, &xmax);} 75 | //@} 76 | 77 | //@{These functions return the maximum upper and minimum lower range limits 78 | // and the number of bins of the histogram h. They provide a way of determining these values without 79 | // accessing the gsl_histogram struct directly. 80 | double max() const {return gsl_histogram_max(h);} 81 | double min() const {return gsl_histogram_min(h);} 82 | int bins()const {return gsl_histogram_bins(h);} 83 | int size()const {return gsl_histogram_bins(h);} 84 | 85 | //@} 86 | 87 | //@{ Histogram statistics 88 | double mean()const {return gsl_histogram_mean(h);} // not in gsl library ? 
89 | double max_val() const {return gsl_histogram_max_val(h);} 90 | int max_bin() const {return gsl_histogram_max_bin(h);} 91 | double min_val() const {return gsl_histogram_min_val(h);} 92 | int min_bin() const {return gsl_histogram_min_bin(h);} 93 | double sum() const {return gsl_histogram_sum(h);} 94 | //@} 95 | 96 | 97 | //@{ Accessor for gsl compatibility 98 | gsl_histogram* gslobj() { return h;} 99 | const gsl_histogram* gslobj() const { return h;} 100 | //@} 101 | 102 | protected: 103 | gsl_histogram * h; 104 | }; 105 | } 106 | 107 | #endif // __histogram_h 108 | -------------------------------------------------------------------------------- /gslwrap/bin/gslwrap-config: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # this is shamelessly stolen from imlib3d-config :-) which is : 4 | # this is shamelessly stolen from gtkmm-config :-) 5 | 6 | prefix=/n/fs/topics/lib/gslwrap 7 | exec_prefix=${prefix} 8 | top_srcdir=. 9 | 10 | libdir=${exec_prefix}/lib 11 | 12 | bindir=${exec_prefix}/bin 13 | pkglibdir=${libdir}/gslwrap 14 | 15 | transform=s,x,x, 16 | 17 | 18 | gslwrap_libs="-L${exec_prefix}/lib -lgslwrap -L/n/fs/topics/lib/gsl/lib -lgsl -lgslcblas -lm" 19 | gslwrap_cflags="-I${prefix}/include -I${prefix}/include/gslwrap -I/n/fs/topics/lib/gsl/include " 20 | 21 | 22 | usage() 23 | { 24 | cat < 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include "param.h" 21 | #include "data.h" 22 | #include "gsl-wrappers.h" 23 | 24 | /* 25 | * functions for posterior inference in the latent dirichlet 26 | * allocation model. 
27 | * 28 | */ 29 | 30 | #define LDA_INFERENCE_CONVERGED 1e-8 31 | #define LDA_SEED_INIT 1 32 | #define LDA_INIT_SMOOTH 1.0 33 | #define LDA_EM_CONVERGED 5e-5 34 | #define LDA_USE_VAR_BAYES 0 35 | #define LDA_TOPIC_DIR_PARAM 0.001 36 | 37 | // lda model 38 | 39 | typedef struct lda { 40 | int ntopics; // number of topics 41 | int nterms; // vocabulary size 42 | gsl_matrix* topics; // each column is a topic (V X K) 43 | gsl_vector* alpha; // dirichlet parameters 44 | } lda; 45 | 46 | // lda posterior 47 | 48 | typedef struct lda_post { 49 | doc_t* doc; // document associated to this posterior 50 | lda* model; // lda model 51 | gsl_matrix* phi; // variational mult parameters (nterms x K) 52 | gsl_matrix* log_phi; // convenient for computation (nterms x K) 53 | gsl_vector* gamma; // variational dirichlet parameters (K) 54 | gsl_vector* lhood; // a K+1 vector, sums to the lhood bound 55 | gsl_vector* doc_weight; // Not owned by this structure. 56 | gsl_vector* renormalized_doc_weight; // Not owned by this structure. 
57 | } lda_post; 58 | 59 | // lda sufficient statistics 60 | 61 | typedef struct lda_suff_stats { 62 | gsl_matrix* topics_ss; 63 | } lda_suff_stats; 64 | 65 | 66 | // new lda model and suff stats 67 | 68 | lda* new_lda_model(int ntopics, int nterms); 69 | void free_lda_model(lda* m); 70 | lda_suff_stats* new_lda_suff_stats(lda* model); 71 | void reset_lda_suff_stats(lda_suff_stats* ss); 72 | lda_post* new_lda_post(int ntopics, int max_length); 73 | void free_lda_post(lda_post* p); 74 | void initialize_lda_ss_from_data(corpus_t* data, lda_suff_stats* ss); 75 | 76 | // posterior inference 77 | 78 | double fit_lda_post(int doc_number, int time, 79 | lda_post* p, lda_seq* var, 80 | gsl_matrix* g, 81 | gsl_matrix* g3, 82 | gsl_matrix* g4, 83 | gsl_matrix* g5); 84 | void init_lda_post(lda_post* p); 85 | void update_gamma(lda_post* p); 86 | void update_phi(int doc_number, int time, 87 | lda_post* p, lda_seq* var, 88 | gsl_matrix* g); 89 | void update_phi_dim(int doc_number, int time, 90 | lda_post* p, lda_seq* var, 91 | gsl_matrix* g); 92 | void update_phi_fixed(int doc_number, int time, 93 | lda_post* p, lda_seq* var, 94 | gsl_matrix* g3_matrix, 95 | gsl_matrix* g4_matrix, 96 | gsl_matrix* g5_matrix); 97 | void update_phi_multiple(int doc_number, int time, 98 | lda_post* p, lda_seq* var, 99 | gsl_matrix* g); 100 | 101 | // compute the likelihood bound 102 | 103 | double compute_lda_lhood(lda_post* p); 104 | 105 | // EM algorithm 106 | 107 | double lda_e_step(lda* model, corpus_t* data, lda_suff_stats* ss); 108 | double lda_m_step(lda* model, lda_suff_stats* ss); 109 | void lda_em(lda* model, 110 | lda_suff_stats* ss, 111 | corpus_t* data, 112 | int max_iter, 113 | char* outname); 114 | 115 | // reading and writing 116 | 117 | lda_suff_stats* read_lda_suff_stats(char* filename, int ntopics, int nterms); 118 | void write_lda(lda* model, char* name); 119 | void write_lda_suff_stats(lda_suff_stats* ss, char* name); 120 | lda* read_lda(int ntopics, int nterms, char* name); 
121 | 122 | 123 | void initialize_lda_ss_from_random(corpus_t* data, lda_suff_stats* ss); 124 | 125 | #endif 126 | -------------------------------------------------------------------------------- /lib/math/gradient_projection_test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "gradient_projection.h" 5 | 6 | #define CLOSE_TOL 0.1 7 | #define LOOSE_TOL 10 8 | 9 | using boost::unit_test_framework::test_suite; 10 | using namespace GradientProjection; 11 | 12 | void test_projection() { 13 | gsl::matrix n(4, 3); 14 | gsl::matrix p(1, 1); 15 | n(0, 0) = 2; n(0, 1) = 1; n(0, 2) = 0; 16 | n(1, 0) = 1; n(1, 1) = 1; n(1, 2) = 0; 17 | n(2, 0) = 1; n(2, 1) = 2; n(2, 2) = 0; 18 | n(3, 0) = 4; n(3, 1) = 1; n(3, 2) = 1; 19 | gsl::vector g(3); 20 | g[0] = 0.0; g[1] = -0.1; g[2] = 0.0; 21 | gsl::vector grad(4); 22 | grad[0] = 2.0; grad[1] = 4.0; grad[2] = 2.0; grad[3] = -3.0; 23 | gsl::vector direction; 24 | gsl::vector correction; 25 | 26 | createProjection(n, g, grad, p, direction, correction); 27 | BOOST_CHECK_CLOSE(p(0, 0), 1.0/11.0, LOOSE_TOL); 28 | BOOST_CHECK_CLOSE(p(0, 1), -3.0/11.0, LOOSE_TOL); 29 | BOOST_CHECK_CLOSE(p(0, 2), 1.0/11.0, LOOSE_TOL); 30 | BOOST_CHECK(abs(p(0, 3)) < 1e-10); 31 | BOOST_CHECK_CLOSE(p(1, 0), -3.0/11.0, LOOSE_TOL); 32 | BOOST_CHECK_CLOSE(p(1, 1), 9.0/11.0, LOOSE_TOL); 33 | BOOST_CHECK_CLOSE(p(1, 2), -3.0/11.0, LOOSE_TOL); 34 | BOOST_CHECK(abs(p(1, 3)) < 1e-10); 35 | BOOST_CHECK_CLOSE(p(2, 0), 1.0/11.0, LOOSE_TOL); 36 | BOOST_CHECK_CLOSE(p(2, 1), -3.0/11.0, LOOSE_TOL); 37 | BOOST_CHECK_CLOSE(p(2, 2), 1.0/11.0, LOOSE_TOL); 38 | BOOST_CHECK(abs(p(2, 3)) < 1e-10); 39 | BOOST_CHECK(abs(p(3, 0)) < 1e-10); 40 | BOOST_CHECK(abs(p(3, 1)) < 1e-10); 41 | BOOST_CHECK(abs(p(3, 2)) < 1e-10); 42 | BOOST_CHECK(abs(p(3, 3)) < 1e-10); 43 | 44 | BOOST_CHECK_CLOSE(correction[0], -4.0/110.0, CLOSE_TOL); 45 | BOOST_CHECK_CLOSE(correction[1], 1.0/110.0, CLOSE_TOL); 46 | 
BOOST_CHECK_CLOSE(correction[2], 7.0/110.0, CLOSE_TOL); 47 | BOOST_CHECK(abs(correction[3]) < 1e-10); 48 | 49 | BOOST_CHECK_CLOSE(direction[0], 8.0/11.0, CLOSE_TOL); 50 | BOOST_CHECK_CLOSE(direction[1], -24.0/11.0, CLOSE_TOL); 51 | BOOST_CHECK_CLOSE(direction[2], 8.0/11.0, CLOSE_TOL); 52 | BOOST_CHECK(abs(direction[3]) < 1e-10); 53 | 54 | gsl::vector x(4); 55 | x[0] = 2; x[1] = 2; x[2] = 1; x[3] = 0; 56 | 57 | descend(x, direction, 0.1, 5, correction, grad); 58 | BOOST_CHECK_CLOSE(x[0], 2.026, CLOSE_TOL); 59 | BOOST_CHECK_CLOSE(x[1], 1.822, CLOSE_TOL); 60 | BOOST_CHECK_CLOSE(x[2], 1.126, CLOSE_TOL); 61 | BOOST_CHECK(abs(x[3]) < 1e-10); 62 | } 63 | 64 | void test_constraint_matrix() { 65 | gsl::matrix n; 66 | gsl::vector g; 67 | gsl::vector x(3); 68 | 69 | 70 | x[0] = 0.15; x[1] = 0.5; x[2] = 0.2; 71 | 72 | BOOST_CHECK_EQUAL(createActiveConstraints(x, n, g), false); 73 | 74 | x[0] = 0.0; 75 | BOOST_CHECK_EQUAL(createActiveConstraints(x, n, g), true); 76 | BOOST_CHECK_EQUAL(n.size1(), x.size()); 77 | // Only one constraint is broken. 78 | BOOST_CHECK_EQUAL(n.size2(), 1); 79 | BOOST_CHECK_EQUAL(n(0, 0), 1); 80 | BOOST_CHECK_EQUAL(n(1, 0), 0); 81 | BOOST_CHECK_EQUAL(n(2, 0), 0); 82 | BOOST_CHECK_EQUAL(g.size(), 1); 83 | BOOST_CHECK_EQUAL(g[0], -SAFETY_BOX); 84 | 85 | x[2] = 0.6; 86 | BOOST_CHECK_EQUAL(createActiveConstraints(x, n, g), true); 87 | BOOST_CHECK_EQUAL(n.size1(), x.size()); 88 | // Two constraints are broken. 
89 | BOOST_CHECK_EQUAL(n.size2(), 2); 90 | BOOST_CHECK_EQUAL(n(0, 0), -1); 91 | BOOST_CHECK_EQUAL(n(1, 0), -1); 92 | BOOST_CHECK_EQUAL(n(2, 0), -1); 93 | BOOST_CHECK_EQUAL(n(0, 1), 1); 94 | BOOST_CHECK_EQUAL(n(1, 1), 0); 95 | BOOST_CHECK_EQUAL(n(2, 1), 0); 96 | BOOST_CHECK_EQUAL(g.size(), 2); 97 | BOOST_CHECK_CLOSE(g[0], -0.2, CLOSE_TOL); 98 | BOOST_CHECK_EQUAL(g[1], -SAFETY_BOX); 99 | 100 | } 101 | 102 | test_suite* init_unit_test_suite(int, char* []) { 103 | test_suite* test= BOOST_TEST_SUITE( "Testing Gradient Projection" ); 104 | test->add( BOOST_TEST_CASE( &test_constraint_matrix ), 0); 105 | test->add( BOOST_TEST_CASE( &test_projection ), 0); 106 | return test; 107 | } 108 | -------------------------------------------------------------------------------- /gslwrap/include/gslwrap/min_fminimizer.h: -------------------------------------------------------------------------------- 1 | // This random generator is a C++ wrapper for the GNU Scientific Library 2 | // Copyright (C) 2001 Torbjorn Vik 3 | 4 | // This program is free software; you can redistribute it and/or modify 5 | // it under the terms of the GNU General Public License as published by 6 | // the Free Software Foundation; either version 2 of the License, or 7 | // (at your option) any later version. 8 | 9 | // This program is distributed in the hope that it will be useful, 10 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | // GNU General Public License for more details. 13 | 14 | // You should have received a copy of the GNU General Public License 15 | // along with this program; if not, write to the Free Software 16 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 17 | #ifndef __min_fminimizer_h 18 | #define __min_fminimizer_h 19 | 20 | #include 21 | #include 22 | 23 | namespace gsl{ 24 | 25 | //! 
Derive this class provide a user defined function for minimisation 26 | struct min_f 27 | { 28 | //! This operator must be overridden 29 | virtual double operator()(const double& x)=0; 30 | 31 | //! This is the function gsl calls to optimize f 32 | static double f(double x, void *p) 33 | { 34 | return (*(min_f *)p)(x); 35 | } 36 | }; 37 | 38 | //! Class for minimizing one dimensional functions. 39 | /*! 40 | Usage: 41 | - Create with optional minimize type 42 | - Set with function object and inital bounds 43 | - Loop the iterate function until convergence or maxIterations (extra facility) 44 | 45 | - Recover minimum and bounds 46 | */ 47 | class min_fminimizer 48 | { 49 | public: 50 | //! choose between gsl_min_fminimizer_goldensection and gsl_min_fminimizer_brent 51 | min_fminimizer(const gsl_min_fminimizer_type* type=gsl_min_fminimizer_brent) : s(NULL), maxIterations(100), isSet(false) 52 | { 53 | s=gsl_min_fminimizer_alloc(type); 54 | nIterations=0; 55 | if (!s) 56 | { 57 | //error 58 | //cout << "ERROR Couldn't allocate memory for minimizer" << endl; 59 | //throw ? 60 | exit(-1); 61 | } 62 | } 63 | ~min_fminimizer(){if (s) gsl_min_fminimizer_free(s);} 64 | //! 
returns GSL_FAILURE if the interval does not contain a minimum 65 | int set(min_f& function, double minimum, double x_lower, double x_upper) 66 | { 67 | isSet=false; 68 | f.function = &function.f; 69 | f.params = &function; 70 | int status= gsl_min_fminimizer_set(s, &f, minimum, x_lower, x_upper); 71 | if (!status) 72 | { 73 | isSet=true; 74 | nIterations=0; 75 | } 76 | return status; 77 | } 78 | int set_with_values(min_f& function, 79 | double minimum, double f_minimum, 80 | double x_lower,double f_lower, 81 | double x_upper, double f_upper) 82 | { 83 | isSet=false; 84 | f.function = &function.f; 85 | f.params = &function; 86 | int status= gsl_min_fminimizer_set_with_values(s, &f, minimum, f_minimum, x_lower, f_lower, x_upper, f_upper); 87 | if (!status) 88 | { 89 | isSet=true; 90 | nIterations=0; 91 | } 92 | return status; 93 | } 94 | int iterate() 95 | { 96 | assert_set(); 97 | int status=gsl_min_fminimizer_iterate(s); 98 | nIterations++; 99 | if (status==GSL_FAILURE) 100 | isConverged=true; 101 | return status; 102 | } 103 | double minimum(){assert_set();return gsl_min_fminimizer_minimum(s);} 104 | double x_upper(){assert_set();return gsl_min_fminimizer_x_upper(s);} 105 | double x_lower(){assert_set();return gsl_min_fminimizer_x_lower(s);} 106 | void SetMaxIterations(int n){maxIterations=n;} 107 | int GetNIterations(){return nIterations;} 108 | bool is_converged(){if (nIterations>=maxIterations) return true; if (isConverged) return true; return false;} 109 | //string name() const; 110 | 111 | private: 112 | void assert_set(){if (!isSet)exit(-1);} // Old problem of error handling: TODO 113 | 114 | bool isSet; 115 | bool isConverged; 116 | int nIterations; 117 | int maxIterations; 118 | gsl_min_fminimizer* s; 119 | gsl_function f; 120 | }; 121 | }; // namespace gsl 122 | 123 | #endif //__min_fminimizer_h 124 | -------------------------------------------------------------------------------- /lib/math/specialfunc.cpp: 
-------------------------------------------------------------------------------- 1 | #include "specialfunc.h" 2 | #include <gsl/gsl_sf_erf.h> 3 | #include <limits> 4 | 5 | double trigamma(double x) 6 | { 7 | double p; 8 | int i; 9 | 10 | x=x+6; 11 | p=1/(x*x); 12 | p=(((((0.075757575757576*p-0.033333333333333)*p+0.0238095238095238) 13 | *p-0.033333333333333)*p+0.166666666666667)*p+1)/x+0.5*p; 14 | for (i=0; i<6 ;i++) 15 | { 16 | x=x-1; 17 | p=1/(x*x)+p; 18 | } 19 | return(p); 20 | } 21 | 22 | 23 | double digamma(double x) { // series evaluated at x + 6, then shifted back down; an input of exactly 0 returns -infinity 24 | if (x == 0.0) { 25 | return -std::numeric_limits<double>::infinity(); 26 | } 27 | 28 | double p; 29 | x=x+6; 30 | p=1/(x*x); 31 | p=(((0.004166666666667*p-0.003968253986254)*p+ 32 | 0.008333333333333)*p-0.083333333333333)*p; 33 | p=p+log(x)-0.5/x-1/(x-1)-1/(x-2)-1/(x-3)-1/(x-4)-1/(x-5)-1/(x-6); 34 | return p; 35 | } 36 | 37 | /* 38 | We invert the digamma by making a reasonable initial guess (typically 39 | this is correct to within a few percent). An iteration of Newton's 40 | method is then used; this yields errors whose worst case are around 41 | .3% and typically around .01%. 42 | 43 | For small x, digamma is approximately -1/x and for large x it is 44 | approximately log(x). Thus we make the initial guesses -1/x and 45 | exp(x) (with some fudge factors) depending on where x lies. 46 | */ 47 | double InverseDigamma(double x) { 48 | double guess = 0.0; 49 | if (x < -2) { 50 | guess = -1/x; 51 | } else { 52 | guess = exp(x) - 1 / (x + 7) + 0.5772157; // Euler-Mascheroni constant. 
53 | } 54 | guess -= (digamma(guess) - x) / trigamma(guess); 55 | return(guess); 56 | } 57 | 58 | 59 | double log_gamma(double x) 60 | { 61 | double x0,x2,xp,gl,gl0; 62 | int n=0,k=0; 63 | static double a[] = { 64 | 8.333333333333333e-02, 65 | -2.777777777777778e-03, 66 | 7.936507936507937e-04, 67 | -5.952380952380952e-04, 68 | 8.417508417508418e-04, 69 | -1.917526917526918e-03, 70 | 6.410256410256410e-03, 71 | -2.955065359477124e-02, 72 | 1.796443723688307e-01, 73 | -1.39243221690590}; 74 | 75 | x0 = x; 76 | if (x <= 0.0) return 1e308; 77 | else if ((x == 1.0) || (x == 2.0)) return 0.0; 78 | else if (x <= 7.0) { 79 | n = (int)(7-x); 80 | x0 = x+n; 81 | } 82 | x2 = 1.0/(x0*x0); 83 | xp = 2.0*M_PI; 84 | gl0 = a[9]; 85 | for (k=8;k>=0;k--) { 86 | gl0 = gl0*x2 + a[k]; 87 | } 88 | gl = gl0/x0+0.5*log(xp)+(x0-0.5)*log(x0)-x0; 89 | if (x <= 7.0) { 90 | for (k=1;k<=n;k++) { 91 | gl -= log(x0-1.0); 92 | x0 -= 1.0; 93 | } 94 | } 95 | return gl; 96 | } 97 | 98 | double sigmoid(double x) { 99 | return 1./(1 + exp(-x)); 100 | } 101 | 102 | // First derivative of sigmoid function: e^-x / (1 + e^-x)^2. It is even in x, so it is evaluated at -|x| to keep exp() from overflowing (the old s * s * exp(-x) gave 0 * 0 * inf = NaN for very negative x). 103 | double dsigmoid(double x) { 104 | double e = exp(-fabs(x)); 105 | return e / ((1 + e) * (1 + e)); 106 | } 107 | 108 | // Second derivative of sigmoid function: s'(x) * (1 - 2 s(x)). 109 | double d2sigmoid(double x) { 110 | double ds = dsigmoid(x); 111 | // 1 - 2 * sigmoid(x) == -tanh(x / 2), which never overflows. 112 | return -ds * tanh(x / 2); 113 | } 114 | 115 | double LogPGaussian(double x) { 116 | // Phi(x) = 0.5 * erfc( - x / sqrt(2)) 117 | // log Phi(x) = log(0.5) + log erfc( -x / sqrt(2)) 118 | return log(0.5) + gsl_sf_log_erfc(-x / sqrt(2)); 119 | } 120 | 121 | // d Phi(x) = 0.5 erfc'( - x / sqrt(2)) * (- 1 / sqrt(2)) 122 | // Note that d erfc = d (1 - erf) = - d erf = - 2 / sqrt(pi) exp(-x^2) 123 | // => d Phi(x) = 0.5 * (-2 / sqrt(pi)) * exp(-x^2/2) * (-1 / sqrt(2)) 124 | // = 1 / sqrt(2 pi) exp(-x^2 / 2) 125 | double LogDGaussian(double x) { 126 | return -x * x / 2 - 0.5 * log(2 * M_PI); 127 | } 128 | 129 | // Computes the inverse of PGaussian. 
We use Newton iteration on the 130 | // *log*. This makes the iteration converge much better for small x. 131 | // I dunno how well it will work for large values of x. 132 | // 5 iterations seems to be enough. It's still pretty slow so don't use 133 | // this in time-critical code. 134 | double InversePGaussian(double x) { 135 | double y = 0; 136 | x = log(x); 137 | for (int ii = 0; ii < 5; ++ii) { 138 | double pgy = LogPGaussian(y); 139 | y -= (pgy - x) * exp(pgy) / exp(LogDGaussian(y)); 140 | } 141 | return y; 142 | } 143 | 144 | 145 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | THIS REPOSITORY HAS BEEN ARCHIVED 2 | 3 | Please use the new repository/project at: 4 | https://github.com/magsilva/dtm 5 | 6 | Reason for archival: this repository was created before Blei created 7 | his official project on GitHub (https://github.com/blei-lab/dtm). Hence 8 | this project is not shown as its fork (which is not appropriate). With 9 | the new project at magsilva/dtm, we now have it as a proper "fork" of 10 | blei-lab/dtm. 11 | 12 | 13 | 14 | 15 | 16 | *************************** 17 | Dynamic Topic Models and the Document Influence Model 18 | *************************** 19 | 20 | This code is the result of work by 21 | 22 | David M. Blei 23 | blei[at]cs.princeton.edu 24 | 25 | and 26 | 27 | Sean M Gerrish 28 | sgerrish[at]cs.princeton.edu. 29 | 30 | (C) Copyright 2006, David M. Blei 31 | (blei [at] cs [dot] princeton [dot] edu) 32 | 33 | (C) Copyright 2011, Sean M. Gerrish 34 | (sgerrish [at] cs [dot] princeton [dot] edu) 35 | 36 | It includes software corresponding to models described in the 37 | following papers: 38 | 39 | [1] D. Blei and J. Lafferty. Dynamic topic models. In 40 | Proceedings of the 23rd International Conference on Machine Learning, 41 | 2006. 42 | [2] S. Gerrish and D. Blei. 
A Language-based Approach to Measuring 43 | Scholarly Impact. In Proceedings of the 27th International Conference 44 | on Machine Learning, 2010. 45 | 46 | These files are part of DIM. 47 | 48 | DIM is free software; you can redistribute it and/or modify it under 49 | the terms of the GNU General Public License as published by the Free 50 | Software Foundation; either version 2 of the License, or (at your 51 | option) any later version. 52 | 53 | DIM is distributed in the hope that it will be useful, but WITHOUT 54 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 55 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 56 | for more details. 57 | 58 | You should have received a copy of the GNU General Public License 59 | along with this program; if not, write to the Free Software 60 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 61 | USA 62 | 63 | ------------------------------------------------------------------------ 64 | 65 | A. COMPILING 66 | 67 | You will need to have several libraries installed to compile this 68 | package: 69 | gsl 70 | gflags 71 | 72 | Depending on your package manager, you may be able to install these 73 | with *one* of the following commands: 74 | 75 | sudo aptitude install libgsl0-dev # Ubuntu 10.04 76 | sudo zypper install gsl-devel # OpenSUSE 11.2 77 | sudo yum install gsl-devel # CentOS 5.5 78 | 79 | You can make the main program by changing your working directory to 80 | dtm/ and typing: 81 | 82 | make 83 | 84 | This software has been compiled on Ubuntu 10.04, OpenSUSE 11.2, and 85 | CentOS 5.5. Depending on your environment, you may need to install 86 | additional libraries. 87 | 88 | B. RUNNING 89 | 90 | Once everything is compiled, you can run this software by typing the 91 | command "./main <flags>", where <flags> is a list of command-line 92 | options. An example command and a description of the input and output 93 | files are given in dtm/sample.sh. 
You can see all command-line options 94 | by typing 95 | 96 | ./main --help 97 | 98 | (although we suggest you start out with the example in dtm/sample.sh). 99 | 100 | You should also replace 'main' with the appropriate executable (depending 101 | on your computer architecture and operating system). We currently 102 | provide binaries for Linux (dtm-linux32 and dtm-linux64), MacOS (dtm-darwin64) 103 | and Windows (dtm-win32.exe and dtm-win64.exe). 104 | 105 | C. SUPPORT and QUESTIONS 106 | 107 | This software is provided as-is, without any warranty or support, 108 | WHATSOEVER. If you have any questions about running this software, 109 | you can post your question to the topic-models mailing list at 110 | topic-models@lists.cs.princeton.edu. You are welcome to submit 111 | modifications or bug-fixes of this software to the authors, although 112 | not all submissions may be posted. 113 | 114 | D. USAGE 115 | 116 | This program takes as input a collection of text documents and creates 117 | as output a list of topics over time, a description of each document 118 | as a mixture of these topics, and (possibly) a measure of how 119 | "influential" each document is, based on its language. 120 | 121 | We have provided an example dataset, instructions for formatting input 122 | data and processing output files, and example command lines for 123 | running this software in the file dtm/sample.sh. 124 | 125 | E. CHANGES 126 | 127 | Changes in this version include: 128 | 129 | - Change the default top_obs_var flag to 0.5 (from -1.0) 130 | - Change to use more iterations and a tighter convergence criterion in each doc's E-step. 131 | - Change to initialize random topics to be a bit more "flat". 
132 | -------------------------------------------------------------------------------- /lib/math/gsl_matrix.h: -------------------------------------------------------------------------------- 1 | #ifndef __MATH_GSL_MATRIX__ 2 | #define __MATH_GSL_MATRIX__ 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | class GslMatrixItem { 9 | public: 10 | GslMatrixItem(gsl_matrix* ptr, size_t index1, size_t index2) : 11 | ptr_(ptr), 12 | index1_(index1), 13 | index2_(index2) { } 14 | 15 | operator const double() { 16 | return gsl_matrix_get(ptr_, index1_, index2_); 17 | } 18 | 19 | double operator =(const double v) { 20 | gsl_matrix_set(ptr_, index1_, index2_, v); 21 | return v; 22 | } 23 | 24 | double operator +=(const double v) { 25 | double old_v = gsl_matrix_get(ptr_, index1_, index2_); 26 | gsl_matrix_set(ptr_, index1_, index2_, v + old_v); 27 | return v + old_v; 28 | } 29 | private: 30 | gsl_matrix* ptr_; 31 | size_t index1_; 32 | size_t index2_; 33 | }; 34 | 35 | class GslMatrixBase { 36 | public: 37 | GslMatrixBase& operator=(const double v) { 38 | if (v == 0.0) { 39 | SetZero(); 40 | } else { 41 | SetAll(v); 42 | } 43 | return *this; 44 | } 45 | 46 | GslMatrixItem operator()(const size_t index1, const size_t index2) const { 47 | assert(ptr_ != NULL); 48 | return GslMatrixItem(ptr_, index1, index2); 49 | } 50 | 51 | void SetZero() { 52 | assert(ptr_ != NULL); 53 | gsl_matrix_set_zero(ptr_); 54 | } 55 | 56 | void SetAll(const double v) { 57 | assert(ptr_ != NULL); 58 | gsl_matrix_set_all(ptr_, v); 59 | } 60 | 61 | void Reset(gsl_matrix* val) { 62 | if(ptr_ != NULL) { 63 | gsl_matrix_free(ptr_); 64 | } 65 | ptr_ = val; 66 | } 67 | 68 | int Fprintf(FILE* stream, const char* format) const { 69 | assert(ptr_ != NULL); 70 | return gsl_matrix_fprintf(stream, ptr_, format); 71 | } 72 | 73 | int Fscanf(FILE* stream) { 74 | assert(ptr_ != NULL); 75 | return gsl_matrix_fscanf(stream, ptr_); 76 | } 77 | 78 | void Set(const int i, const int j, double val) { 79 | 
gsl_matrix_set(ptr_, i, j, val); 80 | } 81 | 82 | /* 83 | double operator()(const int nCol, const int nRow) { 84 | return gsl_matrix_get(ptr_, nCol, nRow); 85 | } 86 | */ 87 | 88 | int size1() const { 89 | return ptr_->size1; 90 | } 91 | 92 | int size2() const { 93 | return ptr_->size2; 94 | } 95 | 96 | double Trace() const { 97 | double val = 0; 98 | assert(ptr_ != NULL); 99 | assert(ptr_->size1 == ptr_->size2); 100 | for (size_t ii = 0; ii < ptr_->size1; ++ii) { 101 | val += gsl_matrix_get(ptr_, ii, ii); 102 | } 103 | return val; 104 | } 105 | 106 | double Sum() const { 107 | double val = 0; 108 | assert(ptr_ != NULL); 109 | for (size_t ii = 0; ii < ptr_->size1; ++ii) { 110 | for (size_t jj = 0; jj < ptr_->size2; ++jj) { 111 | val += gsl_matrix_get(ptr_, ii, jj); 112 | } 113 | } 114 | return val; 115 | } 116 | 117 | /* 118 | * Apply the transpose of this matrix to a vector x and store the result. 119 | 120 | int TransMul(const GslVector& x, GslVector& res, double scale = 0.0) { 121 | return gsl_blas_dgemv(CblasTrans, 1.0, ptr_, x.ptr(), scale, res.ptr()); 122 | } 123 | 124 | int Mul(const GslVector& x, GslVector& res, double scale = 0.0) { 125 | return gsl_blas_dgemv(CblasNoTrans, 1.0, ptr_, x.ptr(), scale, res.ptr()); 126 | } 127 | */ 128 | 129 | const gsl_matrix* ptr() const { return ptr_; } 130 | gsl_matrix* mutable_ptr() { return ptr_; } 131 | 132 | protected: 133 | GslMatrixBase() : ptr_(NULL) { 134 | } 135 | gsl_matrix* ptr_; 136 | 137 | private: 138 | GslMatrixBase(const GslMatrixBase&) { } 139 | }; 140 | 141 | class GslMatrix : public GslMatrixBase { 142 | public: 143 | GslMatrix(const size_t size1, const size_t size2) : GslMatrixBase() { 144 | Allocate(size1, size2); 145 | } 146 | 147 | void Allocate(const size_t size1, const size_t size2) { 148 | assert(ptr_ == NULL); 149 | ptr_ = gsl_matrix_alloc(size1, size2); 150 | } 151 | 152 | GslMatrix() : GslMatrixBase() { 153 | } 154 | 155 | GslMatrix(gsl_matrix* val) : GslMatrixBase() { 156 | ptr_ = val; 157 | 
} 158 | 159 | ~GslMatrix() { 160 | if(ptr_ != NULL) { 161 | gsl_matrix_free(ptr_); 162 | } 163 | } 164 | 165 | GslMatrixBase& operator=(const double v) { 166 | GslMatrixBase::operator=(v); 167 | return *this; 168 | } 169 | private: 170 | GslMatrix(const GslMatrix&) { } 171 | }; 172 | 173 | 174 | class GslSubmatrix : public GslMatrixBase { 175 | public: 176 | GslSubmatrix(GslMatrixBase& matrix, size_t k1, size_t k2, size_t n1, size_t n2) : 177 | view_(gsl_matrix_submatrix(matrix.mutable_ptr(), k1, k2, n1, n2)) { 178 | ptr_ = &view_.matrix; 179 | } 180 | 181 | GslSubmatrix(gsl_matrix* matrix, size_t k1, size_t k2, size_t n1, size_t n2) : 182 | view_(gsl_matrix_submatrix(matrix, k1, k2, n1, n2)) { 183 | ptr_ = &view_.matrix; 184 | } 185 | 186 | GslMatrixBase& operator=(const double v) { 187 | GslMatrixBase::operator=(v); 188 | return *this; 189 | } 190 | private: 191 | gsl_matrix_view view_; 192 | GslSubmatrix(const GslSubmatrix&) { } 193 | }; 194 | 195 | #endif // __MATH_GSL_MATRIX__ 196 | -------------------------------------------------------------------------------- /dtm/data.h: -------------------------------------------------------------------------------- 1 | // Authors: David Blei (blei@cs.princeton.edu) 2 | // Sean Gerrish (sgerrish@cs.princeton.edu) 3 | // 4 | // Copyright 2011 Sean Gerrish and David Blei 5 | // All Rights Reserved. 6 | // 7 | // See the README for this package for details about modifying or 8 | // distributing this software. 9 | 10 | #ifndef DATA_H 11 | #define DATA_H 12 | 13 | #include "gsl-wrappers.h" 14 | #include "param.h" 15 | #include 16 | #include 17 | #include 18 | 19 | #define OFFSET 0 20 | 21 | // Create the scaled beta distribution, which describes how much weight documents have after n years. 22 | const int kScaledInfluenceMax = 200; 23 | 24 | // This mean and variance are relative to the interval [0, 1.0]. 
25 | const double kScaledInfluenceMean = 10.0 / kScaledInfluenceMax; 26 | const double kScaledInfluenceVariance = ((10.0 / kScaledInfluenceMax) * (10.0 / kScaledInfluenceMax)); 27 | 28 | /* 29 | * a document is a collection of counts and terms 30 | * 31 | */ 32 | 33 | typedef struct doc_t { 34 | int total; 35 | int nterms; 36 | int* word; 37 | int* count; 38 | // A parameter for finding phi. 39 | double* lambda; 40 | 41 | // Used for measuring perplexity. 42 | double log_likelihood; 43 | double* log_likelihoods; 44 | } doc_t; 45 | 46 | 47 | /* 48 | * a corpus is a collection of documents 49 | * 50 | */ 51 | 52 | typedef struct corpus_t { 53 | doc_t** doc; 54 | int ndocs; 55 | int nterms; 56 | int max_unique; // maximum number of unique terms in a document 57 | } corpus_t; 58 | 59 | 60 | /* 61 | * a sequence is a sequence of corpora 62 | * 63 | */ 64 | 65 | typedef struct corpus_seq_t { 66 | corpus_t** corpus; 67 | int nterms; 68 | int max_nterms; 69 | int len; 70 | int ndocs; 71 | } corpus_seq_t; 72 | 73 | 74 | typedef struct inf_var { 75 | gsl_matrix** doc_weights; // T matrices of document weights. 76 | // each matrix is d_t x K. 77 | gsl_matrix** renormalized_doc_weights; // T matrices of document weights. 78 | // each matrix is d_t x K. 
79 | int ntime; 80 | } inf_var; 81 | 82 | /* 83 | * variational posterior structure 84 | * 85 | */ 86 | 87 | 88 | typedef struct sslm_var { 89 | // properties 90 | 91 | int W; // vocabulary size 92 | int T; // sequence length 93 | 94 | // variational parameters 95 | 96 | gsl_matrix* obs; // observations, W x T 97 | 98 | // biproducts of the variational parameters 99 | 100 | double obs_variance; // observation variance 101 | double chain_variance; // chain variance 102 | gsl_vector* zeta; // extra variational parameter, T 103 | gsl_matrix* e_log_prob; // E log prob(w | t), W x T 104 | 105 | // convenient quantities for inference 106 | 107 | gsl_matrix* fwd_mean; // forward posterior mean, W x T 108 | gsl_matrix* fwd_variance; // forward posterior variance, W x T 109 | gsl_matrix* mean; // posterior mean, W x T 110 | gsl_matrix* variance; // posterior variance, W x T 111 | 112 | gsl_matrix* mean_t; // W x T 113 | gsl_matrix* variance_t; 114 | 115 | gsl_matrix* influence_sum_lgl; // The sum exp * w_phi_l 116 | 117 | // Recent copy of w_phi_l. 118 | gsl_matrix* w_phi_l; // W x T 119 | gsl_matrix* w_phi_sum; // W x T 120 | gsl_matrix* w_phi_l_sq; // Square term involving various 121 | gsl_matrix* m_update_coeff; // Terms involving squares of 122 | // W, l, and phi. 123 | gsl_matrix* m_update_coeff_g; // \sum_i=0..t phi_l(t) r(i-t) 124 | 125 | // useful temporary vector 126 | gsl_vector* T_vct; 127 | } sslm_var; 128 | 129 | 130 | typedef struct lda_seq { 131 | int ntopics; // number of topics 132 | int nterms; // number of terms 133 | int nseq; // length of sequence 134 | gsl_vector* alpha; // dirichlet parameters 135 | 136 | sslm_var** topic; // topic chains. 137 | 138 | inf_var* influence; // document weights 139 | 140 | gsl_matrix** influence_sum_lgl; // Sum of document weights at time t (see g in the regression formula) 141 | 142 | // gsl_vector** influence_sum_g; // Sum of document weights at time t. 
143 | // gsl_vector** influence_sum_h; // Sum of document weights at time t. 144 | 145 | inf_var* renormalized_influence; // document weights 146 | 147 | // gsl_matrix** w_phi_l; // Product term for the \beta update. 148 | // gsl_matrix** w_phi_l_sq; // Square term involving various 149 | // coefficients for the \beta update. 150 | 151 | std::pair**** top_doc_phis; // T x D_t x n of document phis. 152 | } lda_seq; 153 | 154 | /* 155 | * functions 156 | * 157 | */ 158 | 159 | corpus_t* read_corpus(const char* name); 160 | corpus_seq_t* read_corpus_seq(const char* name); 161 | int compute_max_nterms(const corpus_seq_t* c); 162 | gsl_matrix * compute_total_counts(const corpus_seq_t* c); 163 | corpus_seq_t* make_seq_corpus_subset(corpus_seq_t* all, int start, int end); 164 | void write_corpus(corpus_t* c, char* filename); 165 | void write_corpus_seq(corpus_seq_t* c, char* name); 166 | corpus_seq_t* make_corpus_seq_subset(corpus_seq_t* all, int start, int end); 167 | corpus_t* collapse_corpus_seq(corpus_seq_t* c); 168 | double* NewScaledInfluence(int size); 169 | 170 | #endif 171 | -------------------------------------------------------------------------------- /gslwrap/include/gslwrap/multimin_fdfminimizer.h: -------------------------------------------------------------------------------- 1 | // This random generator is a C++ wrapper for the GNU Scientific Library 2 | // Copyright (C) 2001 Torbjorn Vik 3 | 4 | // This program is free software; you can redistribute it and/or modify 5 | // it under the terms of the GNU General Public License as published by 6 | // the Free Software Foundation; either version 2 of the License, or 7 | // (at your option) any later version. 8 | 9 | // This program is distributed in the hope that it will be useful, 10 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | // GNU General Public License for more details. 
13 | 14 | // You should have received a copy of the GNU General Public License 15 | // along with this program; if not, write to the Free Software 16 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 17 | #ifndef __multimin_fdfminimizer_h 18 | #define __multimin_fdfminimizer_h 19 | 20 | #include 21 | #include 22 | #include 23 | 24 | namespace gsl{ 25 | 26 | //! Create an instance of this class with a user defined function 27 | /*! 28 | A template class with the function operator()(const vector& x) 29 | and derivative(const vector&, vector&), as well as a reference to an object of this class must be fournished 30 | 31 | User is responsible for deleting this reference ! 32 | 33 | */ 34 | template 35 | class multimin_fdf 36 | { 37 | public: 38 | fdf_function* fct; 39 | 40 | //! These operators can be overridden 41 | virtual double operator()(const vector& x) 42 | { 43 | return (*fct)(x); 44 | } 45 | virtual void derivative(const vector& x, vector& g) 46 | { 47 | (*fct).derivative(x, g); 48 | } 49 | 50 | //! This operator can be overridden to gain performance in calculating the value and its derivative in a scoop 51 | virtual double fval_and_derivative(const vector&x, vector& g ) 52 | { 53 | derivative(x, g); 54 | return (*this)(x); 55 | } 56 | 57 | 58 | //! This is the function gsl calls to calculate the value of f at x 59 | static double f(const gsl_vector* x, void *p) 60 | { 61 | vector_view x_view(*x); 62 | return (*(multimin_fdf *)p)(x_view); 63 | } 64 | 65 | //! This is the function gsl calls to calculate the value of g=f' at x 66 | static void df(const gsl_vector* x, void *p, gsl_vector* g) 67 | { 68 | vector_view x_view(*x); 69 | vector_view g_view(*g); 70 | (*(multimin_fdf *)p).derivative(x_view, g_view); 71 | } 72 | 73 | //! 
This is the function gsl calls to calculate the value of g=f' at x 74 | static void fdf(const gsl_vector* x, void *p, double* f, gsl_vector* g) 75 | { 76 | vector_view x_view(*x); 77 | vector_view g_view(*g); 78 | *f=(*(multimin_fdf *)p).fval_and_derivative(x_view, g_view); 79 | } 80 | 81 | //! Constructor (User is responsible for deleting the fdf_function object) 82 | multimin_fdf(fdf_function* _fct):fct(_fct){assert (fct!=NULL);} 83 | }; 84 | 85 | //! Class for multiminimizing one dimensional functions. 86 | /*! 87 | Usage: 88 | - Create with optional multiminimize type 89 | - Set with function object and inital bounds 90 | - Loop the iterate function until convergence or maxIterations (extra facility) 91 | 92 | - Recover multiminimum and bounds 93 | */ 94 | class multimin_fdfminimizer 95 | { 96 | public: 97 | //! 98 | /*! Choose between : 99 | - gsl_multimin_fdfminimizer_conjugate_fr 100 | - gsl_multimin_fdfminimizer_conjugate_pr 101 | - gsl_multimin_fdfminimizer_vector_bfgs 102 | - gsl_multimin_fdfminimizer_steepest_descent 103 | 104 | */ 105 | multimin_fdfminimizer(uint _dim, 106 | const gsl_multimin_fdfminimizer_type* type=gsl_multimin_fdfminimizer_conjugate_fr) : 107 | dim(_dim), isSet(false), maxIterations(100), s(NULL) 108 | { 109 | s=gsl_multimin_fdfminimizer_alloc(type, dim); 110 | nIterations=0; 111 | if (!s) 112 | { 113 | //error 114 | //cout << "ERROR Couldn't allocate memory for multiminimizer" << endl; 115 | //throw ? 116 | exit(-1); 117 | } 118 | } 119 | ~multimin_fdfminimizer(){if (s) gsl_multimin_fdfminimizer_free(s);} 120 | //! 
returns GSL_FAILURE if the interval does not contain a multiminimum 121 | template 122 | int set(multimin_fdf& function, const vector& initial_x, double step_size, double tol) 123 | { 124 | isSet=false; 125 | f.f = &function.f; 126 | f.df = &function.df; 127 | f.fdf = &function.fdf; 128 | f.n = dim; 129 | f.params = &function; 130 | int status= gsl_multimin_fdfminimizer_set(s, &f, initial_x.gslobj(), step_size, tol); 131 | if (!status) 132 | { 133 | isSet=true; 134 | nIterations=0; 135 | } 136 | return status; 137 | } 138 | int iterate() 139 | { 140 | assert_set(); 141 | int status=gsl_multimin_fdfminimizer_iterate(s); 142 | nIterations++; 143 | if (status==GSL_FAILURE) 144 | isConverged=true; 145 | return status; 146 | } 147 | int restart(){return gsl_multimin_fdfminimizer_restart(s);} 148 | double minimum(){assert_set();return gsl_multimin_fdfminimizer_minimum(s);} 149 | vector x_value(){assert_set();return vector_view(*gsl_multimin_fdfminimizer_x(s));} 150 | vector gradient(){assert_set();return vector_view(*gsl_multimin_fdfminimizer_gradient(s));} 151 | 152 | 153 | void SetMaxIterations(int n){maxIterations=n;} 154 | int GetNIterations(){return nIterations;} 155 | bool is_converged(){if (nIterations>=maxIterations) return true; if (isConverged) return true; return false;} 156 | //string name() const; 157 | 158 | private: 159 | void assert_set(){if (!isSet)exit(-1);} // Old problem of error handling: TODO 160 | 161 | uint dim; 162 | bool isSet; 163 | bool isConverged; 164 | int nIterations; 165 | int maxIterations; 166 | gsl_multimin_fdfminimizer* s; 167 | gsl_multimin_function_fdf f; 168 | }; 169 | }; // namespace gsl 170 | 171 | #endif //__multimin_fdfminimizer_h 172 | -------------------------------------------------------------------------------- /lib/math/gradient_projection.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "gradient_projection.h" 3 | 4 | namespace GradientProjection { 5 | 6 | void 
display(const gsl_vector* v, const char* name) { 7 | std::cout << name << " = <"; 8 | for(unsigned int i=0; i < v->size; i++) { 9 | std::cout << gsl_vector_get(v, i); 10 | if (i < v->size - 1) { 11 | std::cout << ", "; 12 | } 13 | } 14 | std::cout << "> (" << v->size << ")" << std::endl; 15 | } 16 | 17 | void display(const gsl_matrix* m, const char* name) { 18 | std::cout << name << "\t = |"; 19 | for(unsigned int i=0; i < m->size1; i++) { 20 | if(i!=0) { 21 | std::cout << "\t |"; 22 | } 23 | for(unsigned int j=0; j < m->size2; j++) { 24 | std::cout << gsl_matrix_get(m, i, j) << "\t"; 25 | } 26 | std::cout << "|" << std::endl; 27 | } 28 | std::cout << " SIZE: " << m->size1 << " x " << m->size2 << std::endl; 29 | } 30 | 31 | void createProjection(const gsl::matrix& activeConstraints, 32 | const gsl::vector& g, 33 | const gsl::vector& grad, 34 | gsl::matrix& projection, 35 | gsl::vector& direction, 36 | gsl::vector& correction) { 37 | int n = activeConstraints.size1(); 38 | int r = activeConstraints.size2(); 39 | 40 | correction.resize(n); 41 | direction.resize(n); 42 | 43 | // This could be done with cholesky or QR decomposition, but I 44 | // couldn't get it to work. 
Given that this happens infrequently 45 | // and the matrices are not *that* big, it's not that bad 46 | gsl::matrix S(r,r); 47 | // S = N^T N 48 | gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, 49 | activeConstraints.gslobj(), activeConstraints.gslobj(), 50 | 0.0, S.gslobj()); 51 | // T = (N^{T} N) ^{-1} 52 | gsl::matrix T = S.LU_invert(); 53 | S.set_dimensions(n, r); 54 | // S = -N(N^{T} N)^{-1} 55 | gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, -1.0, 56 | activeConstraints.gslobj(), T.gslobj(), 0.0, S.gslobj()); 57 | 58 | // Set the correction 59 | gsl_blas_dgemv(CblasNoTrans, 1.0, S.gslobj(), g.gslobj(), 0.0, 60 | correction.gslobj()); 61 | 62 | // Set the direction 63 | // P = -N(N^{T} N)^{-1}N + I 64 | projection.identity(n); 65 | gsl_blas_dgemm(CblasNoTrans, CblasTrans, 1.0, 66 | S.gslobj(), activeConstraints.gslobj(), 67 | 1.0, projection.gslobj()); 68 | gsl_blas_dgemv(CblasNoTrans, -1.0, projection.gslobj(), grad.gslobj(), 0.0, 69 | direction.gslobj()); 70 | } 71 | 72 | bool createActiveConstraints(const gsl::vector& x, 73 | gsl::matrix& n, 74 | gsl::vector& g) { 75 | bool sumConstraintViolated = false; 76 | int dimension = x.size(); 77 | double margin = SAFETY_BOX; 78 | 79 | if(x.sum() >= 1.0 - margin) { 80 | sumConstraintViolated = true; 81 | } 82 | 83 | int nonNegativeConstraintsViolated = 0; 84 | for(int ii = 0; ii < dimension; ++ii) { 85 | if (x[ii] <= margin) { 86 | ++nonNegativeConstraintsViolated; 87 | } 88 | } 89 | 90 | int newSize = nonNegativeConstraintsViolated; 91 | if(sumConstraintViolated) { 92 | newSize += 1; 93 | } 94 | 95 | if(newSize > 0) { 96 | n.set_dimensions(dimension, newSize); 97 | g.resize(newSize); 98 | g.set_all(SAFETY_BOX); 99 | int col = 0; 100 | if(sumConstraintViolated) { 101 | g[0] = -(1.0 - SAFETY_BOX); 102 | for(int ii = 0; ii < dimension; ++ii) { 103 | n(ii, 0) = -1.0; 104 | } 105 | ++col; 106 | } 107 | 108 | for(int ii = 0; ii < dimension; ++ii) { 109 | if(x[ii] <= margin) { 110 | n(ii, col) = 1.0; 111 | ++col; 112 | } 113 
| } 114 | assert(col == newSize); 115 | 116 | gsl_blas_dgemv(CblasTrans, 1.0, n.gslobj(), x.gslobj(), -1.0, g.gslobj()); 117 | 118 | //display(n.gslobj(), "N"); 119 | //display(g.gslobj(), "g"); 120 | 121 | return true; 122 | } else { 123 | return false; 124 | } 125 | } 126 | 127 | double descend(gsl::vector& x, 128 | gsl::vector& s, 129 | const double gamma, 130 | const double obj_value, 131 | const gsl::vector& correction, 132 | const gsl::vector& grad) { 133 | double alpha = 0.0; 134 | 135 | gsl_blas_ddot(s.gslobj(), grad.gslobj(), &alpha); 136 | //std::cout << "dot prod= " << alpha << " "; 137 | if(alpha == 0) { 138 | return alpha; 139 | } 140 | alpha = -(gamma * obj_value) / alpha; 141 | //std::cout << " alpha= " << alpha << " "; 142 | 143 | s *= alpha; 144 | s += correction; 145 | x += s; 146 | 147 | //display(s.gslobj(), "final move"); 148 | 149 | if(alpha < 0) { 150 | alpha = -alpha; 151 | } 152 | return alpha; 153 | } 154 | 155 | double updateState(gsl::vector& x, 156 | const double gamma, 157 | const gsl::vector grad, 158 | const double f) { 159 | /* 160 | * First we see if we're up against constraints 161 | */ 162 | int dim = x.size(); 163 | gsl::matrix n; 164 | gsl::vector g; 165 | gsl::vector s; 166 | gsl::vector correction(dim); 167 | 168 | if(createActiveConstraints(x, n, g)) { 169 | s.resize(dim); 170 | gsl::matrix p; 171 | 172 | createProjection(n, g, grad, p, s, correction); 173 | //std::cout << "Constraints violated." << std::endl; 174 | 175 | //display(p.gslobj(), "p"); 176 | //display(s.gslobj(), "s"); 177 | //display(correction.gslobj(), "correction"); 178 | return descend(x, s, gamma, f, correction, grad); 179 | } else { 180 | //std::cout << "No constraints violated." 
<< std::endl; 181 | s.copy(grad); 182 | s *= -gamma * GRADIENT_DESCENT_SLOWDOWN; 183 | x += s; 184 | double diff; 185 | gsl_blas_ddot(s.gslobj(), s.gslobj(), &diff); 186 | return diff * gamma; 187 | } 188 | 189 | 190 | } 191 | 192 | 193 | } 194 | -------------------------------------------------------------------------------- /doc/lda.tex: -------------------------------------------------------------------------------- 1 | 2 | \section{Latent Dirichlet Allocation (LDA)} 3 | 4 | This is a C implementation of latent Dirichlet allocation (LDA), a 5 | model of discrete data which is fully described in Blei et al. (2003) 6 | (http://www.cs.berkeley.edu/~blei/papers/blei03a.pdf). 7 | 8 | LDA is a hierarchical probabilistic model of documents. Let \alpha be 9 | a scalar and \beta_{1:K} be K distributions of words (called "topics"). 10 | As implemented here, a K topic LDA model assumes the following 11 | generative process of an N word document: 12 | 13 | 1. \theta | \alpha ~ Dirichlet(\alpha, ..., \alpha) 14 | 15 | 2. for each word n = {1, ..., N}: 16 | 17 | a. Z_n | \theta ~ Mult(\theta) 18 | 19 | b. W_n | z_n, \beta ~ Mult(\beta_{z_n}) 20 | 21 | This code implements variational inference of \theta and z_{1:N} for a 22 | document, and estimation of the topics \beta_{1:K} and Dirichlet 23 | parameter \alpha. 24 | 25 | 26 | \subsection{Data format} 27 | 28 | Under LDA, the words of each document are assumed exchangeable. Thus, 29 | each document is succinctly represented as a sparse vector of word 30 | counts. The data is a file where each line is of the form: 31 | 32 | [M] [term_1]:[count] [term_2]:[count] ... [term_N]:[count] 33 | 34 | where [M] is the number of unique terms in the document, and the 35 | [count] associated with each term is how many times that term appeared 36 | in the document. Note that [term_1] is an integer which indexes the 37 | term; it is not a string. 
38 | 39 | 40 | 41 | \subsection{Configuration} 42 | 43 | \begin{description} 44 | \item[var max iter (integer; default: -1)] 45 | The maximum number of iterations of coordinate ascent variational 46 | inference for a single document. A value of -1 indicates "full" 47 | variational inference, until the variational convergence 48 | criterion is met. 49 | 50 | \item[var convergence (float; default: 1e-6)] 51 | The convergence criterion for variational inference. Stop if 52 | (score_old - score) / abs(score_old) is less than this value (or 53 | after the maximum number of iterations). Note that the score is 54 | the lower bound on the likelihood for a particular document. 55 | 56 | \item[em max iter (integer; default: 100)] 57 | The maximum number of iterations of variational EM. 58 | 59 | \item[em convergence (float; default: 1e-4)] 60 | The convergence criterion for variational EM. Stop if (score_old - 61 | score) / abs(score_old) is less than this value (or after the 62 | maximum number of iterations). Note that "score" is the lower 63 | bound on the likelihood for the whole corpus. 64 | 65 | \item[alpha (string: `fixed' or `estimate'; default: estimate)] 66 | If set to [fixed] then alpha does not change from iteration to 67 | iteration. If set to [estimate], then alpha is estimated along 68 | with the topic distributions. 69 | \end{description} 70 | 71 | 72 | \subsection{Running} 73 | 74 | \subsubsection{Topic estimation} 75 | 76 | Estimate the model by executing: 77 | 78 | lda est [alpha] [k] [settings] [data] [random/seeded/*] [directory] 79 | 80 | The term [random/seeded/*] describes how the topics will be 81 | initialized. "Random" initializes each topic randomly; "seeded" 82 | initializes each topic to a distribution smoothed from a randomly 83 | chosen document; or, you can specify a model name to load a 84 | pre-existing model as the initial model (this is useful to continue EM 85 | from where it left off).
To change the number of initial documents 86 | used, edit lda-estimate.c. 87 | 88 | The model (i.e., \alpha and \beta_{1:K}) and variational posterior 89 | Dirichlet parameters will be saved in the specified directory every 90 | ten iterations. Additionally, there will be a log file for the 91 | likelihood bound and convergence score at each iteration. The 92 | algorithm runs until that score is less than "em_convergence" (from 93 | the settings file) or "em_max_iter" iterations are reached. (To 94 | change the lag between saved models, edit lda-estimate.c.) 95 | 96 | The saved models are in two files: 97 | 98 | .other contains alpha. 99 | 100 | .beta contains the log of the topic distributions. 101 | Each line is a topic; in line k, each entry is log p(w | z=k). 102 | 103 | The variational posterior Dirichlets are in: 104 | 105 | .gamma 106 | 107 | The settings file and data format are described above. 108 | 109 | 110 | \subsubsection{Inference} 111 | 112 | To perform inference on a different set of data (in the same format as 113 | for estimation), execute: 114 | 115 | lda inf [settings] [model] [data] [name] 116 | 117 | Variational inference is performed on the data using the model in 118 | [model].* (see above). Two files will be created: [name].gamma contains 119 | the variational Dirichlet parameters for each document; 120 | [name].likelihood contains the bound on the likelihood for each document. 121 | 122 | 123 | 124 | \subsection{Results} 125 | 126 | \subsubsection{Printing topics} 127 | 128 | The Python script topics.py lets you print out the top N 129 | words from each topic in a .beta file. Usage is: 130 | 131 | python topics.py [beta file] [vocab file] [num words] 132 | 133 | 134 | \begin{lstlisting} 135 | #!
/usr/bin/python 136 | 137 | # usage: python topics.py 138 | # 139 | # is output from the lda-c code 140 | # is a list of words, one per line 141 | # is the number of words to print from each topic 142 | 143 | import sys 144 | 145 | def print_topics(beta_file, vocab_file, nwords = 25): 146 | 147 | # get the vocabulary 148 | 149 | vocab = file(vocab_file, 'r').readlines() 150 | # vocab = map(lambda x: x.split()[0], vocab) 151 | vocab = map(lambda x: x.strip(), vocab) 152 | 153 | # for each line in the beta file 154 | 155 | indices = range(len(vocab)) 156 | topic_no = 0 157 | for topic in file(beta_file, 'r'): 158 | print 'topic %03d' % topic_no 159 | topic = map(float, topic.split()) 160 | indices.sort(lambda x,y: -cmp(topic[x], topic[y])) 161 | for i in range(nwords): 162 | print ' %s' % vocab[indices[i]] 163 | topic_no = topic_no + 1 164 | print '\n' 165 | 166 | if (__name__ == '__main__'): 167 | 168 | if (len(sys.argv) != 4): 169 | print 'usage: python topics.py \n' 170 | sys.exit(1) 171 | 172 | beta_file = sys.argv[1] 173 | vocab_file = sys.argv[2] 174 | nwords = int(sys.argv[3]) 175 | print_topics(beta_file, vocab_file, nwords) 176 | \end{lstlisting} 177 | 178 | -------------------------------------------------------------------------------- /dtm/util.c: -------------------------------------------------------------------------------- 1 | // Author: David Blei (blei@cs.princeton.edu) 2 | // 3 | // Copyright 2006 David Blei 4 | // All Rights Reserved. 5 | // 6 | // See the README for this package for details about modifying or 7 | // distributing this software. 
8 | 9 | #define ABNORMAL_RETURN_CODE 1 10 | #define MAX_STRING_LENGTH 65535 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include "util.h" 19 | 20 | #ifdef __alpha__ 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | #else 28 | 29 | /*#include */ 30 | #include 31 | #include 32 | /*#include */ 33 | 34 | #endif 35 | 36 | char buf[1024]; 37 | static int space_in_use=0; 38 | static int pointers_in_use=0; 39 | int display_allocs=FALSE; 40 | 41 | 42 | void error(char *fmt, ...){ 43 | va_list args; 44 | va_start(args, fmt); 45 | vfprintf(stderr, fmt, args); CRLF; 46 | va_end(args); 47 | fprintf(stderr, "\n"); 48 | if (errno > 0) { 49 | perror(buf); 50 | fprintf(stderr, "errno=%d\n", errno); 51 | fprintf(stderr, buf); 52 | fprintf(stderr, "\n"); 53 | } 54 | fflush(stderr); 55 | fflush(stdout); 56 | assert(0); 57 | } 58 | 59 | void bomb(char *fmt, ...) 60 | { 61 | /* just break out, with error code =1 (fail) */ 62 | 63 | va_list args; 64 | va_start(args, fmt); 65 | vfprintf(stderr, fmt, args); CRLF; 66 | va_end(args); 67 | fprintf(stderr, "\n"); 68 | fflush(stderr); 69 | fflush(stdout); 70 | exit(1); 71 | } 72 | 73 | 74 | void bail(char *fmt, ...) 
75 | { 76 | /* just break out, with error code =0 (success) */ 77 | 78 | va_list args; 79 | va_start(args, fmt); 80 | vfprintf(stderr, fmt, args); CRLF; 81 | va_end(args); 82 | fprintf(stderr, "\n"); 83 | fflush(stderr); 84 | fflush(stdout); 85 | exit(0); 86 | } 87 | 88 | 89 | 90 | char *dequote (char *s) { 91 | static char *sbuf=NULL; 92 | char *t; 93 | int i; 94 | if (s[0] != '\'') return s; 95 | else if ((i=strlen(s)) < 2) return s; 96 | else if (s[i-1] != '\'') 97 | error("Illegal string passed to dequote: %s", s); 98 | if (sbuf == NULL) 99 | sbuf = (char *) malloc(MAX_STRING_LENGTH); 100 | t = sbuf; 101 | s++; 102 | while(*s != EOS) { 103 | if (*s == '\'') s++; 104 | *t = *s; 105 | s++; t++; 106 | } 107 | *t = EOS; 108 | return sbuf; 109 | } 110 | 111 | void quote_no_matter_what (const char *s, char *t) { 112 | *t = '\''; 113 | t++; 114 | while((*s != EOS)) { 115 | *t = *s; 116 | if (*s == '\'') { 117 | t++; *t = '\''; 118 | } 119 | s++; t++; 120 | } 121 | *t = '\''; t++; 122 | *t = EOS; 123 | } 124 | 125 | 126 | const char *quote (const char *s) { 127 | static char *sbuf=NULL; 128 | if (sbuf == NULL) 129 | sbuf = (char *) malloc(MAX_STRING_LENGTH); 130 | if ( strchr(s,' ') == NULL && 131 | strstr(s,"/*") == NULL && strstr(s,"*/") == NULL ) return s; 132 | else { 133 | quote_no_matter_what(s, sbuf); 134 | return sbuf; 135 | } 136 | } 137 | 138 | 139 | 140 | 141 | /* returns TRUE iff string only contains chars in valid. 
*/ 142 | int verify(char *string, char *valid) 143 | { 144 | int i; 145 | for(i=0;i0; --i) 163 | if (s[i] != ' ') break; 164 | s[i+1] = '\0'; 165 | return s; 166 | } 167 | 168 | 169 | /* converts s to upper case */ 170 | char * upper(char *s) { 171 | int i; 172 | for (i=0; i 5 | 6 | #include 7 | #include 8 | 9 | #include 10 | #include "math/vectorops.h" 11 | 12 | class GslVectorItem { 13 | public: 14 | GslVectorItem(gsl_vector* ptr, size_t index) : 15 | ptr_(ptr), 16 | index_(index) { } 17 | 18 | operator const double() { 19 | return gsl_vector_get(ptr_, index_); 20 | } 21 | 22 | double operator =(const double v) { 23 | gsl_vector_set(ptr_, index_, v); 24 | return v; 25 | } 26 | 27 | double operator +=(const double v) { 28 | double oldv = gsl_vector_get(ptr_, index_); 29 | gsl_vector_set(ptr_, index_, oldv + v); 30 | return oldv + v; 31 | } 32 | 33 | double operator -=(const double v) { 34 | double oldv = gsl_vector_get(ptr_, index_); 35 | gsl_vector_set(ptr_, index_, oldv - v); 36 | return oldv - v; 37 | } 38 | private: 39 | gsl_vector* ptr_; 40 | size_t index_; 41 | }; 42 | 43 | class GslVectorBase { 44 | public: 45 | /* 46 | double operator[](const size_t index) const { 47 | assert(ptr_ != NULL); 48 | return gsl_vector_get(ptr_, index); 49 | } 50 | */ 51 | 52 | GslVectorItem operator[](const size_t index) const { 53 | assert(ptr_ != NULL); 54 | return GslVectorItem(ptr_, index); 55 | } 56 | 57 | GslVectorBase& operator+=(const gsl_vector* x) { 58 | assert(ptr_ != NULL); 59 | gsl_vector_add(ptr_, x); 60 | return *this; 61 | } 62 | 63 | GslVectorBase& operator-=(const gsl_vector* x) { 64 | assert(ptr_ != NULL); 65 | gsl_vector_sub(ptr_, x); 66 | return *this; 67 | } 68 | 69 | GslVectorBase& operator+=(const GslVectorBase& x) { 70 | assert(ptr_ != NULL); 71 | gsl_vector_add(ptr_, x.ptr()); 72 | return *this; 73 | } 74 | 75 | GslVectorBase& operator-=(const GslVectorBase& x) { 76 | assert(ptr_ != NULL); 77 | gsl_vector_sub(ptr_, x.ptr()); 78 | return *this; 79 | } 
80 | 81 | GslVectorBase& operator*=(const gsl_vector* x) { 82 | assert(ptr_ != NULL); 83 | gsl_vector_mul(ptr_, x); 84 | return *this; 85 | } 86 | 87 | GslVectorBase& operator*=(const GslVectorBase& x) { 88 | assert(ptr_ != NULL); 89 | gsl_vector_mul(ptr_, x.ptr()); 90 | return *this; 91 | } 92 | 93 | GslVectorBase& operator/=(const GslVectorBase& x) { 94 | assert(ptr_ != NULL); 95 | gsl_vector_div(ptr_, x.ptr()); 96 | return *this; 97 | } 98 | 99 | double Sum() const { 100 | return gsl_blas_dsum(ptr_); 101 | } 102 | 103 | double L2Norm() const { 104 | return gsl_blas_dnrm2(ptr_); 105 | } 106 | 107 | void Normalize() const { 108 | assert(ptr_ != NULL); 109 | double s = Sum(); 110 | gsl_vector_scale(ptr_, 1. / s); 111 | } 112 | 113 | size_t size() const { 114 | return ptr_->size; 115 | } 116 | 117 | GslVectorBase& operator*=(double v) { 118 | assert(ptr_ != NULL); 119 | gsl_vector_scale(ptr_, v); 120 | return *this; 121 | } 122 | 123 | GslVectorBase& operator/=(double v) { 124 | assert(ptr_ != NULL); 125 | gsl_vector_scale(ptr_, 1. / v); 126 | return *this; 127 | } 128 | 129 | // Note that the standalone product is a dot product! 
130 | const double operator*(const gsl_vector* x) const { 131 | double result; 132 | assert(ptr_ != NULL); 133 | gsl_blas_ddot(ptr_, x, &result); 134 | return result; 135 | } 136 | 137 | const double operator*(const GslVectorBase& x) const { 138 | double result; 139 | assert(ptr_ != NULL); 140 | gsl_blas_ddot(ptr_, x.ptr(), &result); 141 | return result; 142 | } 143 | 144 | GslVectorBase& operator+=(const double x) { 145 | for (size_t ii = 0; ii < ptr_->size; ++ii) { 146 | gsl_vector_set(ptr_, ii, gsl_vector_get(ptr_, ii) + x); 147 | } 148 | return *this; 149 | } 150 | 151 | GslVectorBase& operator-=(const double x) { 152 | for (size_t ii = 0; ii < ptr_->size; ++ii) { 153 | gsl_vector_set(ptr_, ii, gsl_vector_get(ptr_, ii) - x); 154 | } 155 | return *this; 156 | } 157 | 158 | GslVectorBase& operator=(const gsl_vector* x) { 159 | assert(ptr_ != NULL); 160 | gsl_vector_memcpy(ptr_, x); 161 | 162 | return *this; 163 | } 164 | 165 | GslVectorBase& operator=(const GslVectorBase& x) { 166 | assert(ptr_ != NULL); 167 | return *this = x.ptr(); 168 | } 169 | 170 | GslVectorBase& operator=(const double v) { 171 | if (v == 0.0) { 172 | SetZero(); 173 | } else { 174 | SetAll(v); 175 | } 176 | return *this; 177 | } 178 | 179 | void SetZero() { 180 | assert(ptr_ != NULL); 181 | gsl_vector_set_zero(ptr_); 182 | } 183 | 184 | void SetAll(const double v) { 185 | assert(ptr_ != NULL); 186 | gsl_vector_set_all(ptr_, v); 187 | } 188 | 189 | int Fprintf(FILE* stream, const char* format) const { 190 | assert(ptr_ != NULL); 191 | return gsl_vector_fprintf(stream, ptr_, format); 192 | } 193 | 194 | int Fscanf(FILE* stream) { 195 | assert(ptr_ != NULL); 196 | return gsl_vector_fscanf(stream, ptr_); 197 | } 198 | 199 | const gsl_vector* ptr() const { return ptr_; } 200 | gsl_vector* mutable_ptr() { return ptr_; } 201 | 202 | protected: 203 | GslVectorBase() : ptr_(NULL) { } 204 | gsl_vector* ptr_; 205 | 206 | private: 207 | GslVectorBase(const GslVectorBase&) { } 208 | }; 209 | 210 | class 
GslVector : public GslVectorBase { 211 | public: 212 | GslVector(const size_t size) : GslVectorBase() { 213 | Allocate(size); 214 | } 215 | 216 | GslVector() : GslVectorBase() { 217 | } 218 | 219 | ~GslVector() { 220 | if (ptr_ != NULL) { 221 | gsl_vector_free(ptr_); 222 | } 223 | } 224 | 225 | void Reset(gsl_vector* val) { 226 | if (ptr_ != NULL) { 227 | gsl_vector_free(ptr_); 228 | } 229 | ptr_ = val; 230 | } 231 | 232 | void Allocate(const size_t size) { 233 | assert(ptr_ == NULL); 234 | ptr_ = gsl_vector_alloc(size); 235 | } 236 | 237 | GslVectorBase& operator=(const gsl_vector* x) { 238 | GslVectorBase::operator=(x); 239 | return *this; 240 | } 241 | 242 | GslVectorBase& operator=(const GslVectorBase& x) { 243 | GslVectorBase::operator=(x); 244 | return *this; 245 | } 246 | 247 | GslVectorBase& operator=(const double v) { 248 | GslVectorBase::operator=(v); 249 | return *this; 250 | } 251 | private: 252 | GslVector(const GslVector&) { } 253 | }; 254 | 255 | class GslMatrixRow : public GslVectorBase { 256 | public: 257 | GslMatrixRow(GslMatrix& matrix, const size_t row) : 258 | view_(gsl_matrix_row(matrix.mutable_ptr(), row)) { 259 | ptr_ = &view_.vector; 260 | } 261 | 262 | GslMatrixRow(gsl_matrix* matrix, const size_t row) : 263 | view_(gsl_matrix_row(matrix, row)) { 264 | ptr_ = &view_.vector; 265 | } 266 | 267 | GslVectorBase& operator=(const gsl_vector* x) { 268 | GslVectorBase::operator=(x); 269 | return *this; 270 | } 271 | 272 | GslVectorBase& operator=(const GslVectorBase& x) { 273 | GslVectorBase::operator=(x); 274 | return *this; 275 | } 276 | 277 | GslVectorBase& operator=(const double v) { 278 | GslVectorBase::operator=(v); 279 | return *this; 280 | } 281 | private: 282 | gsl_vector_view view_; 283 | GslMatrixRow(const GslMatrixRow&) { } 284 | }; 285 | 286 | class GslMatrixColumn : public GslVectorBase { 287 | public: 288 | GslMatrixColumn(GslMatrix& matrix, const size_t col) : 289 | view_(gsl_matrix_column(matrix.mutable_ptr(), col)) { 290 | ptr_ = 
&view_.vector; 291 | } 292 | 293 | GslMatrixColumn(gsl_matrix* matrix, const size_t col) : 294 | view_(gsl_matrix_column(matrix, col)) { 295 | ptr_ = &view_.vector; 296 | } 297 | 298 | GslVectorBase& operator=(const gsl_vector* x) { 299 | GslVectorBase::operator=(x); 300 | return *this; 301 | } 302 | 303 | GslVectorBase& operator=(const GslVectorBase& x) { 304 | GslVectorBase::operator=(x); 305 | return *this; 306 | } 307 | 308 | GslVectorBase& operator=(const double v) { 309 | GslVectorBase::operator=(v); 310 | return *this; 311 | } 312 | private: 313 | gsl_vector_view view_; 314 | GslMatrixColumn(const GslMatrixColumn&) { } 315 | }; 316 | 317 | class GslMatrixDiagonal : public GslVectorBase { 318 | public: 319 | GslMatrixDiagonal(GslMatrix& matrix) : 320 | view_(gsl_matrix_diagonal(matrix.mutable_ptr())) { 321 | ptr_ = &view_.vector; 322 | } 323 | 324 | GslMatrixDiagonal(gsl_matrix* matrix) : 325 | view_(gsl_matrix_diagonal(matrix)) { 326 | ptr_ = &view_.vector; 327 | } 328 | 329 | GslVectorBase& operator=(const gsl_vector* x) { 330 | GslVectorBase::operator=(x); 331 | return *this; 332 | } 333 | 334 | GslVectorBase& operator=(const GslVectorBase& x) { 335 | GslVectorBase::operator=(x); 336 | return *this; 337 | } 338 | 339 | GslVectorBase& operator=(const double v) { 340 | GslVectorBase::operator=(v); 341 | return *this; 342 | } 343 | private: 344 | gsl_vector_view view_; 345 | GslMatrixDiagonal(const GslMatrixDiagonal&) { } 346 | }; 347 | 348 | class GslSubvector : public GslVectorBase { 349 | public: 350 | GslSubvector(GslVectorBase& vector, size_t i, size_t n) : 351 | view_(gsl_vector_subvector(vector.mutable_ptr(), i, n)) { 352 | ptr_ = &view_.vector; 353 | } 354 | 355 | GslVectorBase& operator=(const gsl_vector* x) { 356 | GslVectorBase::operator=(x); 357 | return *this; 358 | } 359 | 360 | GslVectorBase& operator=(const GslVectorBase& x) { 361 | GslVectorBase::operator=(x); 362 | return *this; 363 | } 364 | 365 | GslVectorBase& operator=(const double v) { 
366 | GslVectorBase::operator=(v); 367 | return *this; 368 | } 369 | private: 370 | gsl_vector_view view_; 371 | GslSubvector(const GslSubvector&) { } 372 | }; 373 | 374 | #endif // __MATH_GSL_VECTOR__ 375 | 376 | -------------------------------------------------------------------------------- /lib/math/vectorops.h: -------------------------------------------------------------------------------- 1 | #ifndef __MATH_VECTOROPS_INCLUDED 2 | #define __MATH_VECTOROPS_INCLUDED 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | <<<<<<< vectorops.h 9 | #include 10 | #include "math/specialfunc.h" 11 | ======= 12 | #include 13 | #include "specialfunc.h" 14 | >>>>>>> 1.27 15 | 16 | #ifndef M_PI 17 | #define M_PI 3.14159265358979323846 18 | #endif 19 | 20 | #ifndef isnan 21 | # define isnan(x) ((x) != (x)) 22 | #endif 23 | 24 | /* 25 | * take the exponent of a vector 26 | * 27 | * If the exponent is infinite, then we replace the value with a 28 | * suitably large max_val 29 | */ 30 | void vexp(const gsl_vector* v, 31 | gsl_vector* exp_v, 32 | double max_val = std::numeric_limits::infinity()) { 33 | assert(exp_v->size >= v->size); 34 | for (unsigned int ii = 0; ii < v->size; ++ii) { 35 | double val = exp(gsl_vector_get(v, ii)); 36 | if (val == std::numeric_limits::infinity() || val > max_val) { 37 | val = max_val; 38 | } 39 | gsl_vector_set(exp_v, ii, val); 40 | } 41 | } 42 | 43 | /* take the exponent of a matrix */ 44 | void mexp(const gsl_matrix* m, gsl_matrix* exp_m) { 45 | for (unsigned int ii = 0; ii < m->size1; ++ii) { 46 | for (unsigned int jj = 0; jj < m->size2; ++jj) { 47 | double val = exp(gsl_matrix_get(m, ii, jj)); 48 | assert(!isnan(val)); 49 | gsl_matrix_set(exp_m, ii, jj, val); 50 | } 51 | } 52 | } 53 | 54 | /* like vexp except that it also computes sum x log x */ 55 | double vexp_entropy(const gsl_vector* v, gsl_vector* exp_v) { 56 | double entropy = 0.0; 57 | for (unsigned int ii = 0; ii < v->size; ++ii) { 58 | double logval = gsl_vector_get(v, ii); 59 
| double val = exp(logval); 60 | assert(!isnan(val)); 61 | gsl_vector_set(exp_v, ii, val); 62 | if (val != 0) { 63 | entropy -= val * logval; 64 | } 65 | } 66 | return entropy; 67 | } 68 | 69 | double ventropy(const gsl_vector* v) { 70 | double entropy = 0.0; 71 | for (unsigned int ii = 0; ii < v->size; ++ii) { 72 | double val = gsl_vector_get(v, ii); 73 | if (val != 0) { 74 | entropy -= val * log(val); 75 | } 76 | } 77 | return entropy; 78 | } 79 | 80 | double lgamma(double x) { 81 | double x0,x2,xp,gl,gl0; 82 | int n=0,k=0; 83 | static double a[] = { 84 | 8.333333333333333e-02, 85 | -2.777777777777778e-03, 86 | 7.936507936507937e-04, 87 | -5.952380952380952e-04, 88 | 8.417508417508418e-04, 89 | -1.917526917526918e-03, 90 | 6.410256410256410e-03, 91 | -2.955065359477124e-02, 92 | 1.796443723688307e-01, 93 | -1.39243221690590}; 94 | 95 | x0 = x; 96 | if (x <= 0.0) return 1e308; 97 | else if ((x == 1.0) || (x == 2.0)) return 0.0; 98 | else if (x <= 7.0) { 99 | n = (int)(7-x); 100 | x0 = x+n; 101 | } 102 | x2 = 1.0/(x0*x0); 103 | xp = 2.0*M_PI; 104 | gl0 = a[9]; 105 | for (k=8;k>=0;k--) { 106 | gl0 = gl0*x2 + a[k]; 107 | } 108 | gl = gl0/x0+0.5*log(xp)+(x0-0.5)*log(x0)-x0; 109 | if (x <= 7.0) { 110 | for (k=1;k<=n;k++) { 111 | gl -= log(x0-1.0); 112 | x0 -= 1.0; 113 | } 114 | } 115 | return gl; 116 | } 117 | 118 | void mlog(const gsl_matrix* m, gsl_matrix* log_m) { 119 | for (unsigned int ii = 0; ii < m->size1; ++ii) { 120 | for (unsigned int jj = 0; jj < m->size2; ++jj) { 121 | gsl_matrix_set(log_m, ii, jj, log(gsl_matrix_get(m, ii, jj))); 122 | } 123 | } 124 | } 125 | 126 | void vlog(const gsl_vector* v, gsl_vector* log_v) { 127 | for (unsigned int ii = 0; ii < v->size; ++ii) 128 | gsl_vector_set(log_v, ii, log(gsl_vector_get(v, ii))); 129 | } 130 | 131 | void vlogit(const gsl_vector* v, gsl_vector* log_v) { 132 | for (unsigned int ii = 0; ii < v->size; ++ii) { 133 | double p = gsl_vector_get(v, ii); 134 | assert(p >= 0.0); 135 | assert(p <= 1.0); 136 | 
gsl_vector_set(log_v, ii, log(p / (1 - p))); 137 | } 138 | } 139 | 140 | 141 | void vsigmoid(const gsl_vector* v, gsl_vector* sig_v) { 142 | for (unsigned int ii = 0; ii < v->size; ++ii) { 143 | double p = gsl_vector_get(v, ii); 144 | gsl_vector_set(sig_v, ii, 1. / (1. + exp(-p))); 145 | } 146 | } 147 | 148 | 149 | double vlog_entropy(const gsl_vector* v, gsl_vector* log_v) { 150 | double entropy = 0; 151 | for (unsigned int ii = 0; ii < v->size; ++ii) { 152 | double val = gsl_vector_get(v, ii); 153 | entropy -= val * log(val); 154 | gsl_vector_set(log_v, ii, log(val)); 155 | } 156 | return entropy; 157 | } 158 | 159 | double entropy(const gsl_vector* v) { 160 | double entropy = 0; 161 | for (unsigned int ii = 0; ii < v->size; ++ii) { 162 | double val = gsl_vector_get(v, ii); 163 | entropy -= val * log(val); 164 | } 165 | return entropy; 166 | } 167 | 168 | void vdigamma(const gsl_vector* v, gsl_vector* digamma_v) { 169 | for (unsigned int ii = 0; ii < v->size; ++ii) 170 | // gsl_sf_psi throws an error when its argument is 0, whereas digamma returns inf. 
171 | // gsl_vector_set(digamma_v, ii, gsl_sf_psi(gsl_vector_get(v, ii))); 172 | gsl_vector_set(digamma_v, ii, digamma(gsl_vector_get(v, ii))); 173 | } 174 | 175 | void vlgamma(const gsl_vector* v, gsl_vector* lgamma_v) { 176 | for (unsigned int ii = 0; ii < v->size; ++ii) 177 | gsl_vector_set(lgamma_v, ii, lgamma(gsl_vector_get(v, ii))); 178 | } 179 | 180 | double gsl_blas_dsum(const gsl_vector* v) { 181 | double sum = 0; 182 | for (unsigned int ii = 0; ii < v->size; ++ii) { 183 | sum += gsl_vector_get(v, ii); 184 | } 185 | return sum; 186 | } 187 | 188 | double gsl_blas_dsum(const gsl_matrix* v) { 189 | double sum = 0; 190 | for (unsigned int ii = 0; ii < v->size1; ++ii) { 191 | for (unsigned int jj = 0; jj < v->size2; ++jj) { 192 | sum += gsl_matrix_get(v, ii, jj); 193 | } 194 | } 195 | return sum; 196 | } 197 | 198 | double gsl_matrix_rowsum(const gsl_matrix* m, const int row) { 199 | double sum = 0; 200 | for(unsigned int i=0; i < m->size2; i++) { 201 | sum += gsl_matrix_get(m, row, i); 202 | } 203 | return sum; 204 | } 205 | 206 | double dot_product(const gsl_vector* a, const gsl_vector* b) { 207 | assert(a->size == b->size); 208 | double val = 0; 209 | for(unsigned i=0; isize; i++) { 210 | val += gsl_vector_get(a, i) * gsl_vector_get(b, i); 211 | } 212 | return val; 213 | } 214 | 215 | void uniform(gsl_vector* v) { 216 | gsl_vector_set_all(v, 1.0 / (double)v->size); 217 | } 218 | 219 | double normalize(gsl_vector* v) { 220 | double sum = gsl_blas_dsum(v); 221 | gsl_blas_dscal(1 / sum, v); 222 | return sum; 223 | } 224 | 225 | /* 226 | This function takes as input a multinomial parameter vector and 227 | computes the "total" variance, i.e., the sum of the diagonal of the 228 | covariance matrix. 229 | 230 | If the multinomial parameter is unnormalized, then the variance of 231 | the normalized multinomial vector will be computed and then 232 | multiplied by the scale of the vector. 
233 | */ 234 | double MultinomialTotalVariance(const gsl_vector* v) { 235 | double scale = gsl_blas_dsum(v); 236 | double variance = 0.0; 237 | for (size_t ii = 0; ii < v->size; ++ii) { 238 | double val = gsl_vector_get(v, ii) / scale; 239 | variance += val * (1. - val); 240 | } 241 | return variance * scale; 242 | } 243 | 244 | /* 245 | Computes covariance using the renormalization above and adds it to 246 | an existing matrix. 247 | */ 248 | void MultinomialCovariance(double alpha, 249 | const gsl_vector* v, 250 | gsl_matrix* m) { 251 | double scale = gsl_blas_dsum(v); 252 | gsl_blas_dger(-alpha / scale, v, v, m); 253 | gsl_vector_view diag = gsl_matrix_diagonal(m); 254 | gsl_blas_daxpy(alpha, v, &diag.vector); 255 | } 256 | 257 | double MatrixProductSum(const gsl_matrix* m1, 258 | const gsl_matrix* m2) { 259 | double val = 0; 260 | assert(m1->size1 == m2->size1); 261 | assert(m1->size2 == m2->size2); 262 | for (size_t ii = 0; ii < m1->size1; ++ii) { 263 | for (size_t jj = 0; jj < m2->size2; ++jj) { 264 | val += gsl_matrix_get(m1, ii, jj) * 265 | gsl_matrix_get(m2, ii, jj); 266 | } 267 | } 268 | return val; 269 | } 270 | 271 | double MatrixProductProductSum(const gsl_matrix* m1, 272 | const gsl_matrix* m2, 273 | const gsl_matrix* m3) { 274 | double val = 0; 275 | assert(m1->size1 == m2->size1); 276 | assert(m1->size2 == m2->size2); 277 | assert(m1->size1 == m3->size1); 278 | assert(m1->size2 == m3->size2); 279 | for (size_t ii = 0; ii < m1->size1; ++ii) { 280 | for (size_t jj = 0; jj < m2->size2; ++jj) { 281 | for (size_t kk = 0; kk < m3->size2; ++kk) { 282 | val += gsl_matrix_get(m1, ii, jj) * 283 | gsl_matrix_get(m2, ii, jj) * 284 | gsl_matrix_get(m3, ii, jj); 285 | } 286 | } 287 | } 288 | return val; 289 | } 290 | 291 | double SumLGamma(const gsl_vector* v) { 292 | double s = 0.0; 293 | for (size_t ii = 0; ii < v->size; ++ii) { 294 | s += lgamma(gsl_vector_get(v, ii)); 295 | } 296 | return s; 297 | } 298 | 299 | void mtx_fprintf(const char* filename, const 
gsl_matrix * m) 300 | { 301 | FILE* fileptr; 302 | fileptr = fopen(filename, "w"); 303 | gsl_matrix_fprintf(fileptr, m, "%20.17e"); 304 | fclose(fileptr); 305 | } 306 | 307 | 308 | void mtx_fscanf(const char* filename, gsl_matrix * m) 309 | { 310 | FILE* fileptr; 311 | fileptr = fopen(filename, "r"); 312 | gsl_matrix_fscanf(fileptr, m); 313 | fclose(fileptr); 314 | } 315 | 316 | double mtx_accum(const int i, 317 | const int j, 318 | const double contribution, 319 | gsl_matrix* m) { 320 | 321 | double new_val = gsl_matrix_get(m, i, j) + contribution; 322 | gsl_matrix_set(m, i, j, new_val); 323 | return new_val; 324 | } 325 | 326 | void vct_fscanf(const char* filename, gsl_vector* v) 327 | { 328 | FILE* fileptr; 329 | fileptr = fopen(filename, "r"); 330 | gsl_vector_fscanf(fileptr, v); 331 | fclose(fileptr); 332 | } 333 | 334 | void vct_fprintf(const char* filename, gsl_vector* v) 335 | { 336 | FILE* fileptr; 337 | fileptr = fopen(filename, "w"); 338 | gsl_vector_fprintf(fileptr, v, "%20.17e"); 339 | fclose(fileptr); 340 | } 341 | 342 | #endif 343 | -------------------------------------------------------------------------------- /dtm/main.c: -------------------------------------------------------------------------------- 1 | // Authors: David Blei (blei@cs.princeton.edu) 2 | // Sean Gerrish (sgerrish@cs.princeton.edu) 3 | // 4 | // Copyright 2011 Sean Gerrish and David Blei 5 | // All Rights Reserved. 6 | // 7 | // See the README for this package for details about modifying or 8 | // distributing this software. 9 | 10 | #include 11 | 12 | #include "main.h" 13 | 14 | DEFINE_string(mode, 15 | "fit", 16 | "The function to perform. " 17 | "Can be fit, est, or time."); 18 | DEFINE_string(model, 19 | "dtm", 20 | "The function to perform. " 21 | "Can be dtm or dim."); 22 | DEFINE_string(corpus_prefix, 23 | "", 24 | "The function to perform. 
" 25 | "Can be dtm or dim."); 26 | DEFINE_string(lda_model_prefix, 27 | "", 28 | "The name of a fit model to be " 29 | "used for testing likelihood. Appending \"info.dat\" " 30 | "to this should give the name of the file."); 31 | DEFINE_int32(heldout_time, 32 | -1, 33 | "A time up to (but not including) which we wish to train, " 34 | "and at which we wish to test."); 35 | DEFINE_string(output_table, "", ""); 36 | DEFINE_string(params_file, 37 | "settings.txt", 38 | "A file containing parameters for this run."); 39 | DEFINE_bool(initialize_lda, 40 | false, 41 | "If true, initialize the model with lda."); 42 | 43 | DEFINE_string(outname, "", ""); 44 | DEFINE_double(top_obs_var, 0.5, ""); 45 | DEFINE_double(top_chain_var, 0.005, ""); 46 | DEFINE_double(alpha, -10.0, ""); 47 | DEFINE_double(ntopics, -1.0, ""); 48 | DEFINE_int32(lda_max_em_iter, 20, ""); 49 | DEFINE_string(heldout_corpus_prefix, "", ""); 50 | DEFINE_int32(start, -1, ""); 51 | DEFINE_int32(end, -1, ""); 52 | 53 | extern int LDA_INFERENCE_MAX_ITER; 54 | 55 | /* 56 | * read the parameters 57 | * 58 | * !!! use the cleaner functions in params.h 59 | * 60 | */ 61 | 62 | 63 | /* 64 | * fit a model from data 65 | * 66 | */ 67 | 68 | void fit_dtm(int min_time, int max_time) 69 | { 70 | char name[400]; 71 | 72 | // make the directory for this fit 73 | char run_dir[400]; 74 | sprintf(run_dir, "%s/", FLAGS_outname.c_str()); 75 | if (!directory_exist(run_dir)) { 76 | make_directory(run_dir); 77 | } 78 | 79 | // initialize (a few iterations of LDA fitting) 80 | outlog("%s","### INITIALIZING MODEL FROM LDA ###\n"); 81 | 82 | printf("data file: %s\n", FLAGS_corpus_prefix.c_str()); 83 | corpus_t* initial_lda_data = read_corpus(FLAGS_corpus_prefix.c_str()); 84 | 85 | gsl_matrix* topics_ss; 86 | // !!! 
make this an option 87 | if (FLAGS_initialize_lda) { 88 | lda* lda_model = new_lda_model(FLAGS_ntopics, initial_lda_data->nterms); 89 | gsl_vector_set_all(lda_model->alpha, FLAGS_alpha); 90 | 91 | lda_suff_stats* lda_ss = new_lda_suff_stats(lda_model); 92 | // initialize_lda_ss_from_data(initial_lda_data, lda_ss); 93 | initialize_lda_ss_from_random(initial_lda_data, lda_ss); 94 | // sgerrish: Why do we only define the topics once? 95 | lda_m_step(lda_model, lda_ss); 96 | 97 | sprintf(name, "%s/initial-lda", run_dir); 98 | // TODO(sgerrish): Fix this. This was originally hardcoded to 1. 99 | LDA_INFERENCE_MAX_ITER = 25; 100 | lda_em(lda_model, lda_ss, initial_lda_data, FLAGS_lda_max_em_iter, name); 101 | sprintf(name, "%s/initial-lda-ss.dat", run_dir); 102 | 103 | write_lda_suff_stats(lda_ss, name); 104 | topics_ss = lda_ss->topics_ss; 105 | } else { 106 | printf("loading %d terms..\n", initial_lda_data->nterms); 107 | topics_ss = gsl_matrix_calloc(initial_lda_data->nterms, FLAGS_ntopics); 108 | sprintf(name, "%s/initial-lda-ss.dat", FLAGS_outname.c_str()); 109 | mtx_fscanf(name, topics_ss); 110 | } 111 | 112 | printf("fitting.. \n"); 113 | // estimate dynamic topic model 114 | 115 | outlog("\n%s\n","### FITTING DYNAMIC TOPIC MODEL ###"); 116 | 117 | corpus_seq_t* data_full = read_corpus_seq(FLAGS_corpus_prefix.c_str()); 118 | 119 | corpus_seq_t* data_subset; 120 | if (max_time >= 0) { 121 | // We are training on a subset of the data. 122 | assert(max_time > min_time 123 | && min_time >= 0 124 | && max_time < data_full->len); 125 | data_subset = (corpus_seq_t*) malloc(sizeof(corpus_seq_t)); 126 | data_subset->len = max_time - min_time + 1; 127 | data_subset->nterms = data_full->nterms; 128 | data_subset->corpus = (corpus_t**) malloc( 129 | sizeof(corpus_t*) * data_subset->len); 130 | int max_nterms = 0; 131 | int ndocs = 0; 132 | for (int i=min_time; i < max_time; ++i) { 133 | corpus_t* corpus = data_full->corpus[i]; 134 | max_nterms = max_nterms > corpus->nterms ? 
max_nterms : corpus->nterms; 135 | data_subset->corpus[i - min_time] = corpus; 136 | ndocs += corpus->ndocs; 137 | } 138 | data_subset->max_nterms = max_nterms; 139 | data_subset->ndocs = ndocs; 140 | } else { 141 | // Use the entire dataset. 142 | data_subset = data_full; 143 | } 144 | 145 | lda_seq* model_seq = new_lda_seq(data_subset, 146 | data_subset->nterms, 147 | data_subset->len, 148 | FLAGS_ntopics); 149 | init_lda_seq_from_ss(model_seq, 150 | FLAGS_top_chain_var, 151 | FLAGS_top_obs_var, 152 | FLAGS_alpha, 153 | topics_ss); 154 | 155 | fit_lda_seq(model_seq, data_subset, NULL, run_dir); 156 | 157 | if (max_time < 0) { 158 | return; 159 | } 160 | 161 | // Now find the posterior likelihood of the next time slice 162 | // using the most-recently-known time slice. 163 | lda* lda_model = new_lda_model(model_seq->ntopics, model_seq->nterms); 164 | make_lda_from_seq_slice(lda_model, model_seq, max_time - 1); 165 | 166 | lda_post post; 167 | int max_nterms = compute_max_nterms(data_full); 168 | post.phi = gsl_matrix_calloc(max_nterms, model_seq->ntopics); 169 | post.log_phi = gsl_matrix_calloc(max_nterms, model_seq->ntopics); 170 | post.gamma = gsl_vector_calloc(model_seq->ntopics); 171 | post.lhood = gsl_vector_calloc(model_seq->ntopics); 172 | post.model = lda_model; 173 | post.doc_weight = NULL; 174 | 175 | int d; 176 | double* table = (double*) malloc(sizeof(double) * data_full->corpus[max_time]->ndocs); 177 | 178 | for (d = 0; d < data_full->corpus[max_time]->ndocs; d++) 179 | { 180 | post.doc = data_full->corpus[max_time]->doc[d]; 181 | table[d] = fit_lda_post(d, max_time, &post, NULL, NULL, 182 | NULL, NULL, NULL); 183 | } 184 | char tmp_string[400]; 185 | sprintf(tmp_string, "%s-heldout_post_%d.dat", FLAGS_outname.c_str(), 186 | max_time); 187 | FILE* post_file = fopen(tmp_string, "w"); 188 | for (int d = 0; d < data_full->corpus[max_time]->ndocs; ++d) 189 | { 190 | fprintf(post_file, "%f\n", table[d]); 191 | } 192 | } 193 | 194 | /* 195 | * main function 
196 | * 197 | * supports fitting a dynamic topic model 198 | * 199 | */ 200 | 201 | int main(int argc, char* argv[]) 202 | { 203 | // Initialize the flag objects. 204 | // InitFlags(argc, argv); 205 | google::ParseCommandLineFlags(&argc, &argv, 0); 206 | 207 | // usage: main (sums corpus_sequence|fit param|time params) 208 | 209 | // mode for spitting out document sums 210 | if (FLAGS_mode == "sums") 211 | { 212 | corpus_seq_t* c = read_corpus_seq(FLAGS_corpus_prefix.c_str()); 213 | outlog("Tried to read corpus %s", FLAGS_corpus_prefix.c_str()); 214 | int d, t; 215 | for (t = 0; t < c->len; t++) 216 | { 217 | int sum = 0; 218 | for (d = 0; d < c->corpus[t]->ndocs; d++) 219 | { 220 | sum += c->corpus[t]->doc[d]->total; 221 | } 222 | printf("%d\n\n", sum); 223 | } 224 | } 225 | 226 | // mode for fitting a dynamic topic model 227 | 228 | if (FLAGS_mode == "fit") { 229 | fit_dtm(0, FLAGS_heldout_time - 1); 230 | } 231 | 232 | // mode for analyzing documents through time according to a DTM 233 | 234 | if (FLAGS_mode == "time") 235 | { 236 | // read parameters 237 | 238 | // load corpus and model based on information from params 239 | 240 | corpus_seq_t* data = read_corpus_seq(FLAGS_heldout_corpus_prefix.c_str()); 241 | lda_seq* model = read_lda_seq(FLAGS_lda_model_prefix.c_str(), 242 | data); 243 | 244 | // initialize the table (D X OFFSETS) 245 | 246 | int d; 247 | double** table = (double**) malloc(sizeof(double*) * data->len); 248 | 249 | for (int t = 0; t < data->len; t++) 250 | { 251 | table[t] = (double*) malloc(sizeof(double) * data->corpus[t]->ndocs); 252 | for (d = 0; d < data->corpus[t]->ndocs; d++) 253 | { 254 | table[t][d] = -1; // this should be NAN 255 | } 256 | } 257 | 258 | // set up the LDA model to be populated 259 | 260 | lda* lda_model = new_lda_model(model->ntopics, model->nterms); 261 | 262 | lda_post post; 263 | int max_nterms = compute_max_nterms(data); 264 | post.phi = gsl_matrix_calloc(max_nterms, model->ntopics); 265 | post.log_phi = 
gsl_matrix_calloc(max_nterms, model->ntopics); 266 | post.gamma = gsl_vector_calloc(model->ntopics); 267 | post.lhood = gsl_vector_calloc(model->ntopics); 268 | post.model = lda_model; 269 | 270 | // compute likelihoods for each model 271 | 272 | for (int t = 0; t < data->len; t++) { 273 | make_lda_from_seq_slice(lda_model, model, t); 274 | for (d = 0; d < data->corpus[t]->ndocs; d++) { 275 | post.doc = data->corpus[t]->doc[d]; 276 | double likelihood = fit_lda_post(d, t, &post, model, 277 | NULL, 278 | NULL, NULL, NULL); 279 | table[t][d] = post.doc->log_likelihood; 280 | } 281 | } 282 | char tmp_string[400]; 283 | sprintf(tmp_string, "%s-heldout_post.dat", FLAGS_outname.c_str()); 284 | FILE* post_file = fopen(tmp_string, "w"); 285 | for (int t=0; t < data->len; ++t) 286 | { 287 | if (data->corpus[t]->ndocs >= 0) { 288 | fprintf(post_file, "%f", table[t][0]); 289 | } 290 | for (int d = 1; d < data->corpus[t]->ndocs; ++d) 291 | { 292 | fprintf(post_file, ",%f", table[t][d]); 293 | } 294 | fprintf(post_file, "\n"); 295 | } 296 | // !!! write out table 297 | } 298 | 299 | return(0); 300 | } 301 | 302 | 303 | 304 | -------------------------------------------------------------------------------- /gslwrap/include/gslwrap/matrix_double.h: -------------------------------------------------------------------------------- 1 | // matrix.h 2 | 3 | // This matrix class is a C++ wrapper for the GNU Scientific Library 4 | // Copyright (C) 2001 Ramin Nakisa 5 | 6 | // This program is free software; you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation; either version 2 of the License, or 9 | // (at your option) any later version. 10 | 11 | // This program is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the 14 | // GNU General Public License for more details. 15 | 16 | // You should have received a copy of the GNU General Public License 17 | // along with this program; if not, write to the Free Software 18 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 19 | 20 | #if !defined( _matrix_double_h ) 21 | #define _matrix_double_h 22 | 23 | #ifdef __HP_aCC //for aCC B3910B A.01.27 24 | #include 25 | #include 26 | #include 27 | #else //for gcc3 28 | #include 29 | #include 30 | #include 31 | #endif 32 | 33 | #include 34 | #include 35 | #include 36 | /// 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | 43 | #define type_is 44 | #ifdef type_is 45 | #define type_is_double 46 | #endif 47 | 48 | namespace gsl 49 | { 50 | 51 | /// 52 | class matrix 53 | { 54 | #ifdef type_is_double 55 | friend class matrix_float; 56 | friend class matrix_int; 57 | #endif 58 | public: 59 | typedef double value_type; 60 | typedef vector vector_type; 61 | 62 | /// 63 | matrix(); 64 | /// 65 | matrix( size_t new_rows, size_t new_cols, bool clear = true ); 66 | 67 | template 68 | void copy(const oclass &other) 69 | { 70 | if ( static_cast( this ) == static_cast( &other ) ) 71 | return; 72 | 73 | set_dimensions( other.get_rows(), other.get_cols() ); 74 | for ( size_t i = 0; i < get_rows(); i++ ) 75 | { 76 | for ( size_t j = 0; j < get_cols(); j++ ) 77 | { 78 | gsl_matrix_set( m, i, j, (double)other(i,j)); 79 | } 80 | } 81 | } 82 | /* template<> */ 83 | /* void copy(const matrix &other) */ 84 | /* { */ 85 | /* set_dimensions(other.size1(),other.size2()); */ 86 | /* gsl_matrix_memcpy( m, other.m ); */ 87 | /* } */ 88 | // copy constructor for type matrix 89 | matrix( const matrix &other ):m(NULL) {copy(other);} 90 | /// 91 | template 92 | matrix( const oclass &other ):m(NULL) {copy(other);} 93 | 94 | /// 95 | ~matrix(); 96 | /// 97 | // matrix( const char *Filename ); 98 | /// 99 | size_t get_rows() const {return m->size1;} 100 | /// 101 | size_t get_cols() 
const {return m->size2;} 102 | /// 103 | size_t size1() const {return m->size1;} 104 | /// 105 | size_t size2() const {return m->size2;} 106 | 107 | 108 | /// 109 | void dimensions( size_t *num_rows, size_t *num_cols ) const; 110 | /// 111 | double get_element ( size_t row, size_t col ) const {return gsl_matrix_get( m, row, col ) ;} 112 | const double &operator()( size_t row, size_t col ) const {return *gsl_matrix_ptr( m, row, col ) ;} 113 | double &operator()( size_t row, size_t col ) {return *gsl_matrix_ptr( m, row, col ) ;} 114 | /// 115 | void set_element( size_t row, size_t col, const double &v ){ gsl_matrix_set( m, row, col, v );} 116 | /// 117 | void set_elements( const double & new_value ); 118 | void set_all ( const double & new_value ) {gsl_matrix_set_all ( m, new_value );} 119 | void set_zero() {gsl_matrix_set_zero( m );} 120 | /// 121 | void set_dimensions( size_t new_rows, size_t new_cols ); 122 | /// 123 | void load( const char *filename ); 124 | /// 125 | void save( const char *filename ) const; 126 | /// 127 | friend ostream& operator<< ( ostream& os, const matrix& m ); 128 | //This function writes the elements of the matrix m to the stream stream in binary format. The return value is 0 for success and GSL_EFAILED if there was a problem writing to the file. Since the data is written in the native binary format it may not be portable between different architectures. 129 | int fwrite (FILE * stream) const {return gsl_matrix_fwrite (stream, m);} 130 | 131 | //This function reads into the matrix m from the open stream stream in binary format. The matrix m must be preallocated with the correct dimensions since the function uses the size of m to determine how many bytes to read. The return value is 0 for success and GSL_EFAILED if there was a problem reading from the file. The data is assumed to have been written in the native binary format on the same architecture. 
132 | int fread (FILE * stream) {return gsl_matrix_fread (stream, m);} 133 | 134 | /// 135 | void load_binary( const char *filename ); 136 | /// 137 | void save_binary( const char *filename ) const; 138 | /// 139 | bool operator==( const matrix &other ) const; 140 | bool operator!=( const matrix &other ) const {return !((*this)==other);} 141 | 142 | matrix& operator=( const matrix &other ) {copy( other );return *this;} 143 | /// converts from any other matrix type 144 | template 145 | matrix &operator=( const omatrix& other ) 146 | { 147 | copy(other); 148 | return *this; 149 | } 150 | /// 151 | matrix operator+( const matrix &other ) const; 152 | /// 153 | matrix operator+( const double &f ) const; 154 | /// 155 | friend matrix operator+( const double &f, const matrix &other ); 156 | /// 157 | matrix &operator+=( const double &f ); 158 | /// 159 | matrix &operator+=( const matrix &other ); 160 | /// 161 | matrix operator-( const matrix &other ) const; 162 | /// 163 | matrix operator-( const double &f ) const; 164 | /// 165 | friend matrix operator-( const double &f, const matrix &other ); 166 | /// 167 | matrix &operator-=( const double &f ); 168 | /// 169 | matrix &operator-=( const matrix &other ); 170 | /// 171 | matrix operator*( const matrix &other ) const; 172 | /// 173 | matrix operator*( const double &f ) const; 174 | /// 175 | friend matrix operator*( const double &f, const matrix &other ); 176 | /// 177 | matrix &operator*=( const double &f ); 178 | /// 179 | matrix &operator*=( const matrix &other ); 180 | /// 181 | matrix operator/( const double &) const; 182 | /// 183 | matrix &operator/=( const double &); 184 | /// 185 | matrix transpose() const; 186 | /// 187 | matrix LU_decomp(gsl::permutation *perm=NULL,int *psign=NULL) const; 188 | /// 189 | matrix LU_invert() const; 190 | 191 | // return a submatrix of the this from row_min to row_max (not included!) 
192 | matrix submatrix(size_t row_min, size_t row_max, size_t col_min, size_t col_max) const 193 | { 194 | matrix m(row_max - row_min, col_max - col_min); 195 | for (size_t i = row_min ; i < row_max ; i++) 196 | { 197 | for (size_t j = col_min ; j < col_max ; j++) 198 | { 199 | m(i - row_min,j - col_min) = (*this)(i,j); 200 | } 201 | } 202 | return m; 203 | } 204 | private: 205 | /// 206 | void LU_decomp( gsl_matrix **a, 207 | gsl_permutation **permutation, 208 | int *sign ) const; 209 | public: 210 | /** returns sum of all the matrix elements. */ 211 | double sum() const; 212 | /** returns logarithm of the determinant of the matrix. */ 213 | double LU_lndet() const; 214 | 215 | 216 | /** returns a vector_view of a single row of the matrix. */ 217 | vector_view row( size_t rowindex ); 218 | const vector_view row( size_t rowindex ) const ; 219 | /** returns a vector_view of a single column of the matrix. */ 220 | vector_view column( size_t colindex ); 221 | const vector_view column( size_t colindex ) const; 222 | /** returns a vector_view of the diagonal elements of the matrix. */ 223 | vector_view diagonal(); 224 | const vector_view diagonal() const; 225 | 226 | /** returns a column matrix containing a single row of the matrix. */ 227 | matrix get_row( size_t rowindex ) const; 228 | /** returns a column matrix containing a single column of the matrix. */ 229 | matrix get_col( size_t colindex ) const; 230 | /** calculates sum of rows returned as a column matrix. */ 231 | matrix row_sum() const; 232 | /** calculates sum of columns returned as a row matrix. */ 233 | matrix column_sum() const; 234 | /** returns trace (diagonal sum) of a square matrix. */ 235 | double trace() const; 236 | /** calculates cholesky decomposition of the matrix, returning success if matrix is positive definite. */ 237 | int cholesky_decomp( matrix &a ) const; 238 | // /** returns index of nearest row in matrix to vector argument. 
*/ 239 | // int nearest_row_index( const matrix &v ) const; 240 | /** calculates covariance of the matrix columns. */ 241 | matrix covariance() const; 242 | /** returns 1 if matrix is square, 0 otherwise. */ 243 | bool is_square() const; 244 | /** diag operator (sets the diagonal elements of the matrix to the elements of v */ 245 | void diag(const vector& v); 246 | /** set diagonal elements of a square matrix to f. */ 247 | void set_diagonal( double f ); 248 | /** sets matrix to a k dimensional unit matrix. */ 249 | void identity( size_t k ); 250 | /** returns sum of nth power of all elements. */ 251 | double norm( double n ) const; 252 | 253 | /* Function: double gsl_matrix_max (const gsl_matrix * m) */ 254 | /* This function returns the maximum value in the matrix m. */ 255 | double max() const {return gsl_matrix_max(m);} 256 | /* Function: double gsl_matrix_min (const gsl_matrix * m) */ 257 | /* This function returns the minimum value in the matrix m. */ 258 | double min()const{return gsl_matrix_min(m);} 259 | 260 | /** This function returns 1 if all the elements of the matrix m are zero, and 0 otherwise. */ 261 | bool isnull() const { return gsl_matrix_isnull(m);} 262 | /* Function: void gsl_matrix_minmax (const gsl_matrix * m, double * min_out, double * max_out) */ 263 | /* This function returns the minimum and maximum values in the matrix m, storing them in min_out and max_out. */ 264 | 265 | /* Function: void gsl_matrix_max_index (const gsl_matrix * m, size_t * imax, size_t * jmax) */ 266 | /* This function returns the indices of the maximum value in the matrix m, storing them in imax and jmax. When there are several equal maximum elements then the first element found */ 267 | /* is returned. */ 268 | 269 | /* Function: void gsl_matrix_min_index (const gsl_matrix * m, size_t * imax, size_t * jmax) */ 270 | /* This function returns the indices of the minimum value in the matrix m, storing them in imax and jmax. 
When there are several equal minimum elements then the first element found */ 271 | /* is returned. */ 272 | 273 | /* Function: void gsl_matrix_minmax_index (const gsl_matrix * m, size_t * imin, size_t * imax) */ 274 | /* This function returns the indices of the minimum and maximum values in the matrix m, storing them in (imin,jmin) and (imax,jmax). When there are several equal minimum or */ 275 | /* maximum elements then the first elements found are returned. */ 276 | 277 | /** for interfacing with gsl c */ 278 | /* gsl_matrix *gslobj() {if (!m){cout << "matrix::gslobj ERROR, data not initialized!! " << endl; exit(-1);}return m;} */ 279 | /* const gsl_matrix *gslobj() const {if (!m){cout << "matrix::gslobj ERROR, data not initialized!! " << endl; exit(-1);}return m;} */ 280 | gsl_matrix *gslobj() {assert(m);return m;} 281 | const gsl_matrix *gslobj() const {assert(m);return m;} 282 | private: 283 | /// 284 | gsl_matrix *m; 285 | 286 | }; 287 | } 288 | #undef type_is 289 | #undef type_is_double 290 | 291 | #endif // _matrix_double_h 292 | -------------------------------------------------------------------------------- /gslwrap/include/gslwrap/matrix_int.h: -------------------------------------------------------------------------------- 1 | // matrix.h 2 | 3 | // This matrix class is a C++ wrapper for the GNU Scientific Library 4 | // Copyright (C) 2001 Ramin Nakisa 5 | 6 | // This program is free software; you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation; either version 2 of the License, or 9 | // (at your option) any later version. 10 | 11 | // This program is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | // GNU General Public License for more details. 
15 | 16 | // You should have received a copy of the GNU General Public License 17 | // along with this program; if not, write to the Free Software 18 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 19 | 20 | #if !defined( _matrix_int_h ) 21 | #define _matrix_int_h 22 | 23 | #ifdef __HP_aCC //for aCC B3910B A.01.27 24 | #include 25 | #include 26 | #include 27 | #else //for gcc3 28 | #include 29 | #include 30 | #include 31 | #endif 32 | 33 | #include 34 | #include 35 | #include 36 | /// 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | 43 | #define type_is_int 44 | #ifdef type_is 45 | #define type_is_double 46 | #endif 47 | 48 | namespace gsl 49 | { 50 | 51 | /// 52 | class matrix_int 53 | { 54 | #ifdef type_is_double 55 | friend class matrix_float; 56 | friend class matrix_int; 57 | #endif 58 | public: 59 | typedef int value_type; 60 | typedef vector_int vector_type; 61 | 62 | /// 63 | matrix_int(); 64 | /// 65 | matrix_int( size_t new_rows, size_t new_cols, bool clear = true ); 66 | 67 | template 68 | void copy(const oclass &other) 69 | { 70 | if ( static_cast( this ) == static_cast( &other ) ) 71 | return; 72 | 73 | set_dimensions( other.get_rows(), other.get_cols() ); 74 | for ( size_t i = 0; i < get_rows(); i++ ) 75 | { 76 | for ( size_t j = 0; j < get_cols(); j++ ) 77 | { 78 | gsl_matrix_int_set( m, i, j, (int)other(i,j)); 79 | } 80 | } 81 | } 82 | /* template<> */ 83 | /* void copy(const matrix_int &other) */ 84 | /* { */ 85 | /* set_dimensions(other.size1(),other.size2()); */ 86 | /* gsl_matrix_int_memcpy( m, other.m ); */ 87 | /* } */ 88 | // copy constructor for type matrix_int 89 | matrix_int( const matrix_int &other ):m(NULL) {copy(other);} 90 | /// 91 | template 92 | matrix_int( const oclass &other ):m(NULL) {copy(other);} 93 | 94 | /// 95 | ~matrix_int(); 96 | /// 97 | // matrix_int( const char *Filename ); 98 | /// 99 | size_t get_rows() const {return m->size1;} 100 | /// 101 | size_t get_cols() const {return 
m->size2;} 102 | /// 103 | size_t size1() const {return m->size1;} 104 | /// 105 | size_t size2() const {return m->size2;} 106 | 107 | 108 | /// 109 | void dimensions( size_t *num_rows, size_t *num_cols ) const; 110 | /// 111 | int get_element ( size_t row, size_t col ) const {return gsl_matrix_int_get( m, row, col ) ;} 112 | const int &operator()( size_t row, size_t col ) const {return *gsl_matrix_int_ptr( m, row, col ) ;} 113 | int &operator()( size_t row, size_t col ) {return *gsl_matrix_int_ptr( m, row, col ) ;} 114 | /// 115 | void set_element( size_t row, size_t col, const int &v ){ gsl_matrix_int_set( m, row, col, v );} 116 | /// 117 | void set_elements( const int & new_value ); 118 | void set_all ( const int & new_value ) {gsl_matrix_int_set_all ( m, new_value );} 119 | void set_zero() {gsl_matrix_int_set_zero( m );} 120 | /// 121 | void set_dimensions( size_t new_rows, size_t new_cols ); 122 | /// 123 | void load( const char *filename ); 124 | /// 125 | void save( const char *filename ) const; 126 | /// 127 | friend ostream& operator<< ( ostream& os, const matrix_int& m ); 128 | //This function writes the elements of the matrix m to the stream stream in binary format. The return value is 0 for success and GSL_EFAILED if there was a problem writing to the file. Since the data is written in the native binary format it may not be portable between different architectures. 129 | int fwrite (FILE * stream) const {return gsl_matrix_int_fwrite (stream, m);} 130 | 131 | //This function reads into the matrix m from the open stream stream in binary format. The matrix m must be preallocated with the correct dimensions since the function uses the size of m to determine how many bytes to read. The return value is 0 for success and GSL_EFAILED if there was a problem reading from the file. The data is assumed to have been written in the native binary format on the same architecture. 
132 | int fread (FILE * stream) {return gsl_matrix_int_fread (stream, m);} 133 | 134 | /// 135 | void load_binary( const char *filename ); 136 | /// 137 | void save_binary( const char *filename ) const; 138 | /// 139 | bool operator==( const matrix_int &other ) const; 140 | bool operator!=( const matrix_int &other ) const {return !((*this)==other);} 141 | 142 | matrix_int& operator=( const matrix_int &other ) {copy( other );return *this;} 143 | /// converts from any other matrix type 144 | template 145 | matrix_int &operator=( const omatrix& other ) 146 | { 147 | copy(other); 148 | return *this; 149 | } 150 | /// 151 | matrix_int operator+( const matrix_int &other ) const; 152 | /// 153 | matrix_int operator+( const int &f ) const; 154 | /// 155 | friend matrix_int operator+( const int &f, const matrix_int &other ); 156 | /// 157 | matrix_int &operator+=( const int &f ); 158 | /// 159 | matrix_int &operator+=( const matrix_int &other ); 160 | /// 161 | matrix_int operator-( const matrix_int &other ) const; 162 | /// 163 | matrix_int operator-( const int &f ) const; 164 | /// 165 | friend matrix_int operator-( const int &f, const matrix_int &other ); 166 | /// 167 | matrix_int &operator-=( const int &f ); 168 | /// 169 | matrix_int &operator-=( const matrix_int &other ); 170 | /// 171 | matrix_int operator*( const matrix_int &other ) const; 172 | /// 173 | matrix_int operator*( const int &f ) const; 174 | /// 175 | friend matrix_int operator*( const int &f, const matrix_int &other ); 176 | /// 177 | matrix_int &operator*=( const int &f ); 178 | /// 179 | matrix_int &operator*=( const matrix_int &other ); 180 | /// 181 | matrix_int operator/( const int &) const; 182 | /// 183 | matrix_int &operator/=( const int &); 184 | /// 185 | matrix_int transpose() const; 186 | /// 187 | matrix_int LU_decomp(gsl::permutation *perm=NULL,int *psign=NULL) const; 188 | /// 189 | matrix_int LU_invert() const; 190 | 191 | // return a submatrix of the this from row_min to row_max (not 
included!) 192 | matrix_int submatrix(size_t row_min, size_t row_max, size_t col_min, size_t col_max) const 193 | { 194 | matrix_int m(row_max - row_min, col_max - col_min); 195 | for (size_t i = row_min ; i < row_max ; i++) 196 | { 197 | for (size_t j = col_min ; j < col_max ; j++) 198 | { 199 | m(i - row_min,j - col_min) = (*this)(i,j); 200 | } 201 | } 202 | return m; 203 | } 204 | private: 205 | /// 206 | void LU_decomp( gsl_matrix_int **a, 207 | gsl_permutation **permutation, 208 | int *sign ) const; 209 | public: 210 | /** returns sum of all the matrix elements. */ 211 | int sum() const; 212 | /** returns logarithm of the determinant of the matrix. */ 213 | double LU_lndet() const; 214 | 215 | 216 | /** returns a vector_int_view of a single row of the matrix. */ 217 | vector_int_view row( size_t rowindex ); 218 | const vector_int_view row( size_t rowindex ) const ; 219 | /** returns a vector_int_view of a single column of the matrix. */ 220 | vector_int_view column( size_t colindex ); 221 | const vector_int_view column( size_t colindex ) const; 222 | /** returns a vector_int_view of the diagonal elements of the matrix. */ 223 | vector_int_view diagonal(); 224 | const vector_int_view diagonal() const; 225 | 226 | /** returns a column matrix containing a single row of the matrix. */ 227 | matrix_int get_row( size_t rowindex ) const; 228 | /** returns a column matrix containing a single column of the matrix. */ 229 | matrix_int get_col( size_t colindex ) const; 230 | /** calculates sum of rows returned as a column matrix. */ 231 | matrix_int row_sum() const; 232 | /** calculates sum of columns returned as a row matrix. */ 233 | matrix_int column_sum() const; 234 | /** returns trace (diagonal sum) of a square matrix. */ 235 | double trace() const; 236 | /** calculates cholesky decomposition of the matrix, returning success if matrix is positive definite. 
*/ 237 | int cholesky_decomp( matrix_int &a ) const; 238 | // /** returns index of nearest row in matrix to vector argument. */ 239 | // int nearest_row_index( const matrix_int &v ) const; 240 | /** calculates covariance of the matrix columns. */ 241 | matrix_int covariance() const; 242 | /** returns 1 if matrix is square, 0 otherwise. */ 243 | bool is_square() const; 244 | /** diag operator (sets the diagonal elements of the matrix to the elements of v */ 245 | void diag(const vector_int& v); 246 | /** set diagonal elements of a square matrix to f. */ 247 | void set_diagonal( int f ); 248 | /** sets matrix to a k dimensional unit matrix. */ 249 | void identity( size_t k ); 250 | /** returns sum of nth power of all elements. */ 251 | double norm( double n ) const; 252 | 253 | /* Function: double gsl_matrix_max (const gsl_matrix * m) */ 254 | /* This function returns the maximum value in the matrix m. */ 255 | double max() const {return gsl_matrix_int_max(m);} 256 | /* Function: double gsl_matrix_min (const gsl_matrix * m) */ 257 | /* This function returns the minimum value in the matrix m. */ 258 | double min()const{return gsl_matrix_int_min(m);} 259 | 260 | /** This function returns 1 if all the elements of the matrix m are zero, and 0 otherwise. */ 261 | bool isnull() const { return gsl_matrix_int_isnull(m);} 262 | /* Function: void gsl_matrix_minmax (const gsl_matrix * m, double * min_out, double * max_out) */ 263 | /* This function returns the minimum and maximum values in the matrix m, storing them in min_out and max_out. */ 264 | 265 | /* Function: void gsl_matrix_max_index (const gsl_matrix * m, size_t * imax, size_t * jmax) */ 266 | /* This function returns the indices of the maximum value in the matrix m, storing them in imax and jmax. When there are several equal maximum elements then the first element found */ 267 | /* is returned. 
*/ 268 | 269 | /* Function: void gsl_matrix_min_index (const gsl_matrix * m, size_t * imax, size_t * jmax) */ 270 | /* This function returns the indices of the minimum value in the matrix m, storing them in imax and jmax. When there are several equal minimum elements then the first element found */ 271 | /* is returned. */ 272 | 273 | /* Function: void gsl_matrix_minmax_index (const gsl_matrix * m, size_t * imin, size_t * imax) */ 274 | /* This function returns the indices of the minimum and maximum values in the matrix m, storing them in (imin,jmin) and (imax,jmax). When there are several equal minimum or */ 275 | /* maximum elements then the first elements found are returned. */ 276 | 277 | /** for interfacing with gsl c */ 278 | /* gsl_matrix_int *gslobj() {if (!m){cout << "matrix_int::gslobj ERROR, data not initialized!! " << endl; exit(-1);}return m;} */ 279 | /* const gsl_matrix_int *gslobj() const {if (!m){cout << "matrix_int::gslobj ERROR, data not initialized!! " << endl; exit(-1);}return m;} */ 280 | gsl_matrix_int *gslobj() {assert(m);return m;} 281 | const gsl_matrix_int *gslobj() const {assert(m);return m;} 282 | private: 283 | /// 284 | gsl_matrix_int *m; 285 | 286 | }; 287 | } 288 | #undef type_is_int 289 | #undef type_is_double 290 | 291 | #endif // _matrix_int_h 292 | -------------------------------------------------------------------------------- /gslwrap/include/gslwrap/matrix_float.h: -------------------------------------------------------------------------------- 1 | // matrix.h 2 | 3 | // This matrix class is a C++ wrapper for the GNU Scientific Library 4 | // Copyright (C) 2001 Ramin Nakisa 5 | 6 | // This program is free software; you can redistribute it and/or modify 7 | // it under the terms of the GNU General Public License as published by 8 | // the Free Software Foundation; either version 2 of the License, or 9 | // (at your option) any later version. 
10 | 11 | // This program is distributed in the hope that it will be useful, 12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | // GNU General Public License for more details. 15 | 16 | // You should have received a copy of the GNU General Public License 17 | // along with this program; if not, write to the Free Software 18 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 19 | 20 | #if !defined( _matrix_float_h ) 21 | #define _matrix_float_h 22 | 23 | #ifdef __HP_aCC //for aCC B3910B A.01.27 24 | #include 25 | #include 26 | #include 27 | #else //for gcc3 28 | #include 29 | #include 30 | #include 31 | #endif 32 | 33 | #include 34 | #include 35 | #include 36 | /// 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | 43 | #define type_is_float 44 | #ifdef type_is 45 | #define type_is_double 46 | #endif 47 | 48 | namespace gsl 49 | { 50 | 51 | /// 52 | class matrix_float 53 | { 54 | #ifdef type_is_double 55 | friend class matrix_float; 56 | friend class matrix_int; 57 | #endif 58 | public: 59 | typedef float value_type; 60 | typedef vector_float vector_type; 61 | 62 | /// 63 | matrix_float(); 64 | /// 65 | matrix_float( size_t new_rows, size_t new_cols, bool clear = true ); 66 | 67 | template 68 | void copy(const oclass &other) 69 | { 70 | if ( static_cast( this ) == static_cast( &other ) ) 71 | return; 72 | 73 | set_dimensions( other.get_rows(), other.get_cols() ); 74 | for ( size_t i = 0; i < get_rows(); i++ ) 75 | { 76 | for ( size_t j = 0; j < get_cols(); j++ ) 77 | { 78 | gsl_matrix_float_set( m, i, j, (float)other(i,j)); 79 | } 80 | } 81 | } 82 | /* template<> */ 83 | /* void copy(const matrix_float &other) */ 84 | /* { */ 85 | /* set_dimensions(other.size1(),other.size2()); */ 86 | /* gsl_matrix_float_memcpy( m, other.m ); */ 87 | /* } */ 88 | // copy constructor for type matrix_float 89 | matrix_float( const matrix_float &other 
):m(NULL) {copy(other);} 90 | /// 91 | template 92 | matrix_float( const oclass &other ):m(NULL) {copy(other);} 93 | 94 | /// 95 | ~matrix_float(); 96 | /// 97 | // matrix_float( const char *Filename ); 98 | /// 99 | size_t get_rows() const {return m->size1;} 100 | /// 101 | size_t get_cols() const {return m->size2;} 102 | /// 103 | size_t size1() const {return m->size1;} 104 | /// 105 | size_t size2() const {return m->size2;} 106 | 107 | 108 | /// 109 | void dimensions( size_t *num_rows, size_t *num_cols ) const; 110 | /// 111 | float get_element ( size_t row, size_t col ) const {return gsl_matrix_float_get( m, row, col ) ;} 112 | const float &operator()( size_t row, size_t col ) const {return *gsl_matrix_float_ptr( m, row, col ) ;} 113 | float &operator()( size_t row, size_t col ) {return *gsl_matrix_float_ptr( m, row, col ) ;} 114 | /// 115 | void set_element( size_t row, size_t col, const float &v ){ gsl_matrix_float_set( m, row, col, v );} 116 | /// 117 | void set_elements( const float & new_value ); 118 | void set_all ( const float & new_value ) {gsl_matrix_float_set_all ( m, new_value );} 119 | void set_zero() {gsl_matrix_float_set_zero( m );} 120 | /// 121 | void set_dimensions( size_t new_rows, size_t new_cols ); 122 | /// 123 | void load( const char *filename ); 124 | /// 125 | void save( const char *filename ) const; 126 | /// 127 | friend ostream& operator<< ( ostream& os, const matrix_float& m ); 128 | //This function writes the elements of the matrix m to the stream stream in binary format. The return value is 0 for success and GSL_EFAILED if there was a problem writing to the file. Since the data is written in the native binary format it may not be portable between different architectures. 129 | int fwrite (FILE * stream) const {return gsl_matrix_float_fwrite (stream, m);} 130 | 131 | //This function reads into the matrix m from the open stream stream in binary format. 
The matrix m must be preallocated with the correct dimensions since the function uses the size of m to determine how many bytes to read. The return value is 0 for success and GSL_EFAILED if there was a problem reading from the file. The data is assumed to have been written in the native binary format on the same architecture. 132 | int fread (FILE * stream) {return gsl_matrix_float_fread (stream, m);} 133 | 134 | /// 135 | void load_binary( const char *filename ); 136 | /// 137 | void save_binary( const char *filename ) const; 138 | /// 139 | bool operator==( const matrix_float &other ) const; 140 | bool operator!=( const matrix_float &other ) const {return !((*this)==other);} 141 | 142 | matrix_float& operator=( const matrix_float &other ) {copy( other );return *this;} 143 | /// converts from any other matrix type 144 | template 145 | matrix_float &operator=( const omatrix& other ) 146 | { 147 | copy(other); 148 | return *this; 149 | } 150 | /// 151 | matrix_float operator+( const matrix_float &other ) const; 152 | /// 153 | matrix_float operator+( const float &f ) const; 154 | /// 155 | friend matrix_float operator+( const float &f, const matrix_float &other ); 156 | /// 157 | matrix_float &operator+=( const float &f ); 158 | /// 159 | matrix_float &operator+=( const matrix_float &other ); 160 | /// 161 | matrix_float operator-( const matrix_float &other ) const; 162 | /// 163 | matrix_float operator-( const float &f ) const; 164 | /// 165 | friend matrix_float operator-( const float &f, const matrix_float &other ); 166 | /// 167 | matrix_float &operator-=( const float &f ); 168 | /// 169 | matrix_float &operator-=( const matrix_float &other ); 170 | /// 171 | matrix_float operator*( const matrix_float &other ) const; 172 | /// 173 | matrix_float operator*( const float &f ) const; 174 | /// 175 | friend matrix_float operator*( const float &f, const matrix_float &other ); 176 | /// 177 | matrix_float &operator*=( const float &f ); 178 | /// 179 | matrix_float 
&operator*=( const matrix_float &other ); 180 | /// 181 | matrix_float operator/( const float &) const; 182 | /// 183 | matrix_float &operator/=( const float &); 184 | /// 185 | matrix_float transpose() const; 186 | /// 187 | matrix_float LU_decomp(gsl::permutation *perm=NULL,int *psign=NULL) const; 188 | /// 189 | matrix_float LU_invert() const; 190 | 191 | // return a submatrix of the this from row_min to row_max (not included!) 192 | matrix_float submatrix(size_t row_min, size_t row_max, size_t col_min, size_t col_max) const 193 | { 194 | matrix_float m(row_max - row_min, col_max - col_min); 195 | for (size_t i = row_min ; i < row_max ; i++) 196 | { 197 | for (size_t j = col_min ; j < col_max ; j++) 198 | { 199 | m(i - row_min,j - col_min) = (*this)(i,j); 200 | } 201 | } 202 | return m; 203 | } 204 | private: 205 | /// 206 | void LU_decomp( gsl_matrix_float **a, 207 | gsl_permutation **permutation, 208 | int *sign ) const; 209 | public: 210 | /** returns sum of all the matrix elements. */ 211 | float sum() const; 212 | /** returns logarithm of the determinant of the matrix. */ 213 | double LU_lndet() const; 214 | 215 | 216 | /** returns a vector_float_view of a single row of the matrix. */ 217 | vector_float_view row( size_t rowindex ); 218 | const vector_float_view row( size_t rowindex ) const ; 219 | /** returns a vector_float_view of a single column of the matrix. */ 220 | vector_float_view column( size_t colindex ); 221 | const vector_float_view column( size_t colindex ) const; 222 | /** returns a vector_float_view of the diagonal elements of the matrix. */ 223 | vector_float_view diagonal(); 224 | const vector_float_view diagonal() const; 225 | 226 | /** returns a column matrix containing a single row of the matrix. */ 227 | matrix_float get_row( size_t rowindex ) const; 228 | /** returns a column matrix containing a single column of the matrix. 
*/ 229 | matrix_float get_col( size_t colindex ) const; 230 | /** calculates sum of rows returned as a column matrix. */ 231 | matrix_float row_sum() const; 232 | /** calculates sum of columns returned as a row matrix. */ 233 | matrix_float column_sum() const; 234 | /** returns trace (diagonal sum) of a square matrix. */ 235 | double trace() const; 236 | /** calculates cholesky decomposition of the matrix, returning success if matrix is positive definite. */ 237 | int cholesky_decomp( matrix_float &a ) const; 238 | // /** returns index of nearest row in matrix to vector argument. */ 239 | // int nearest_row_index( const matrix_float &v ) const; 240 | /** calculates covariance of the matrix columns. */ 241 | matrix_float covariance() const; 242 | /** returns 1 if matrix is square, 0 otherwise. */ 243 | bool is_square() const; 244 | /** diag operator (sets the diagonal elements of the matrix to the elements of v */ 245 | void diag(const vector_float& v); 246 | /** set diagonal elements of a square matrix to f. */ 247 | void set_diagonal( float f ); 248 | /** sets matrix to a k dimensional unit matrix. */ 249 | void identity( size_t k ); 250 | /** returns sum of nth power of all elements. */ 251 | double norm( double n ) const; 252 | 253 | /* Function: double gsl_matrix_max (const gsl_matrix * m) */ 254 | /* This function returns the maximum value in the matrix m. */ 255 | double max() const {return gsl_matrix_float_max(m);} 256 | /* Function: double gsl_matrix_min (const gsl_matrix * m) */ 257 | /* This function returns the minimum value in the matrix m. */ 258 | double min()const{return gsl_matrix_float_min(m);} 259 | 260 | /** This function returns 1 if all the elements of the matrix m are zero, and 0 otherwise. 
*/ 261 | bool isnull() const { return gsl_matrix_float_isnull(m);} 262 | /* Function: void gsl_matrix_minmax (const gsl_matrix * m, double * min_out, double * max_out) */ 263 | /* This function returns the minimum and maximum values in the matrix m, storing them in min_out and max_out. */ 264 | 265 | /* Function: void gsl_matrix_max_index (const gsl_matrix * m, size_t * imax, size_t * jmax) */ 266 | /* This function returns the indices of the maximum value in the matrix m, storing them in imax and jmax. When there are several equal maximum elements then the first element found */ 267 | /* is returned. */ 268 | 269 | /* Function: void gsl_matrix_min_index (const gsl_matrix * m, size_t * imax, size_t * jmax) */ 270 | /* This function returns the indices of the minimum value in the matrix m, storing them in imax and jmax. When there are several equal minimum elements then the first element found */ 271 | /* is returned. */ 272 | 273 | /* Function: void gsl_matrix_minmax_index (const gsl_matrix * m, size_t * imin, size_t * imax) */ 274 | /* This function returns the indices of the minimum and maximum values in the matrix m, storing them in (imin,jmin) and (imax,jmax). When there are several equal minimum or */ 275 | /* maximum elements then the first elements found are returned. */ 276 | 277 | /** for interfacing with gsl c */ 278 | /* gsl_matrix_float *gslobj() {if (!m){cout << "matrix_float::gslobj ERROR, data not initialized!! " << endl; exit(-1);}return m;} */ 279 | /* const gsl_matrix_float *gslobj() const {if (!m){cout << "matrix_float::gslobj ERROR, data not initialized!! 
" << endl; exit(-1);}return m;} */ 280 | gsl_matrix_float *gslobj() {assert(m);return m;} 281 | const gsl_matrix_float *gslobj() const {assert(m);return m;} 282 | private: 283 | /// 284 | gsl_matrix_float *m; 285 | 286 | }; 287 | } 288 | #undef type_is_float 289 | #undef type_is_double 290 | 291 | #endif // _matrix_float_h 292 | -------------------------------------------------------------------------------- /dtm/gsl-wrappers.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "gsl-wrappers.h" 3 | #include 4 | 5 | static gsl_rng* RANDOM_NUMBER_GENERATOR = NULL; 6 | 7 | DEFINE_int64(rng_seed, 8 | 0, 9 | "Specifies the random seed. If 0, seeds pseudo-randomly."); 10 | 11 | // The maximum number of iterations for each update. 12 | const double MAX_ITER = 15; 13 | 14 | /* 15 | * safe logarithm function 16 | * 17 | */ 18 | 19 | double safe_log(double x) 20 | { 21 | if (x == 0) 22 | { 23 | return(-1000); 24 | } 25 | else 26 | { 27 | return(log(x)); 28 | } 29 | } 30 | 31 | 32 | /* 33 | * given log(a) and log(b), return log(a+b) 34 | * 35 | */ 36 | 37 | double log_sum(double log_a, double log_b) 38 | { 39 | double v; 40 | 41 | if (log_a == -1) return(log_b); 42 | 43 | if (log_a < log_b) 44 | { 45 | v = log_b+log(1 + exp(log_a-log_b)); 46 | } 47 | else 48 | { 49 | v = log_a+log(1 + exp(log_b-log_a)); 50 | } 51 | return(v); 52 | } 53 | 54 | 55 | void vinc(gsl_vector* v, int i, double x) 56 | { 57 | vset(v, i, vget(v, i) + x); 58 | } 59 | 60 | void minc(gsl_matrix* m, int i, int j, double x) 61 | { 62 | mset(m, i, j, mget(m, i, j) + x); 63 | } 64 | 65 | 66 | void msetrow(gsl_matrix* m, int r, const gsl_vector* val) 67 | { 68 | int i; 69 | gsl_vector v = gsl_matrix_row(m, r).vector; 70 | for (i = 0; i < v.size; i++) 71 | vset(&v, i, vget(val, i)); 72 | } 73 | 74 | 75 | void msetcol(gsl_matrix* m, int r, const gsl_vector* val) 76 | { 77 | int i; 78 | gsl_vector v = gsl_matrix_column(m, r).vector; 79 | for (i = 0; i < 
v.size; i++) 80 | vset(&v, i, vget(val, i)); 81 | } 82 | 83 | 84 | /* 85 | * compute the column sums of a matrix 86 | * 87 | */ 88 | 89 | void col_sum(gsl_matrix* m, gsl_vector* val) 90 | { 91 | int i, j; 92 | gsl_vector_set_all(val, 0); 93 | 94 | for (i = 0; i < m->size1; i++) 95 | for (j = 0; j < m->size2; j++) 96 | vinc(val, j, mget(m, i, j)); 97 | } 98 | 99 | 100 | /* 101 | * print a vector to standard out 102 | * 103 | */ 104 | 105 | void vct_printf(const gsl_vector * v) 106 | { 107 | int i; 108 | for (i = 0; i < v->size; i++) 109 | printf("%5.5f ", vget(v, i)); 110 | printf("\n\n"); 111 | } 112 | 113 | 114 | /* 115 | * print a matrix to standard out 116 | * 117 | */ 118 | 119 | void mtx_printf(const gsl_matrix * m) 120 | { 121 | int i, j; 122 | for (i = 0; i < m->size1; i++) 123 | { 124 | for (j = 0; j < m->size2; j++) 125 | printf("%5.5f ", mget(m, i, j)); 126 | printf("\n"); 127 | } 128 | } 129 | 130 | 131 | /* 132 | * read/write a vector/matrix from a file 133 | * 134 | */ 135 | 136 | void vct_fscanf(const char* filename, gsl_vector* v) 137 | { 138 | outlog("reading %ld vector from %s", v->size, filename); 139 | FILE* fileptr; 140 | if (!fileptr) { 141 | outlog("Error opening file %s. Failing.", filename); 142 | exit(1); 143 | } 144 | fileptr = fopen(filename, "r"); 145 | gsl_vector_fscanf(fileptr, v); 146 | fclose(fileptr); 147 | } 148 | 149 | void mtx_fscanf(const char* filename, gsl_matrix * m) 150 | { 151 | FILE* fileptr = fopen(filename, "r"); 152 | 153 | outlog("reading %ld x %ld matrix from %s", 154 | m->size1, m->size2, filename); 155 | if (!fileptr) { 156 | outlog("Error opening file %s. 
Failing.", filename); 157 | exit(1); 158 | } 159 | 160 | gsl_matrix_fscanf(fileptr, m); 161 | fclose(fileptr); 162 | } 163 | 164 | void vct_fprintf(const char* filename, gsl_vector* v) 165 | { 166 | outlog( "writing %ld vector to %s", v->size, filename); 167 | FILE* fileptr; 168 | fileptr = fopen(filename, "w"); 169 | if (!fileptr) { 170 | outlog("Error opening file %s. Failing.", filename); 171 | exit(1); 172 | } 173 | gsl_vector_fprintf(fileptr, v, "%20.17e"); 174 | fclose(fileptr); 175 | } 176 | 177 | 178 | void mtx_fprintf(const char* filename, const gsl_matrix * m) 179 | { 180 | outlog( "writing %ld x %ld matrix to %s", 181 | m->size1, m->size2, filename); 182 | FILE* fileptr; 183 | fileptr = fopen(filename, "w"); 184 | if (!fileptr) { 185 | outlog("Error opening file: %s", filename); 186 | exit(1); 187 | } 188 | gsl_matrix_fprintf(fileptr, m, "%20.17e"); 189 | fclose(fileptr); 190 | } 191 | 192 | 193 | /* 194 | * matrix inversion using blas 195 | * 196 | */ 197 | 198 | void matrix_inverse(gsl_matrix* m, gsl_matrix* inverse) 199 | { 200 | gsl_matrix *lu; 201 | gsl_permutation* p; 202 | int signum; 203 | 204 | p = gsl_permutation_alloc(m->size1); 205 | lu = gsl_matrix_alloc(m->size1, m->size2); 206 | 207 | gsl_matrix_memcpy(lu, m); 208 | gsl_linalg_LU_decomp(lu, p, &signum); 209 | gsl_linalg_LU_invert(lu, p, inverse); 210 | 211 | gsl_matrix_free(lu); 212 | gsl_permutation_free(p); 213 | } 214 | 215 | 216 | /* 217 | * log determinant using blas 218 | * 219 | */ 220 | 221 | double log_det(gsl_matrix* m) 222 | { 223 | gsl_matrix* lu; 224 | gsl_permutation* p; 225 | double result; 226 | int signum; 227 | 228 | p = gsl_permutation_alloc(m->size1); 229 | lu = gsl_matrix_alloc(m->size1, m->size2); 230 | 231 | gsl_matrix_memcpy(lu, m); 232 | gsl_linalg_LU_decomp(lu, p, &signum); 233 | result = gsl_linalg_LU_lndet(lu); 234 | 235 | gsl_matrix_free(lu); 236 | gsl_permutation_free(p); 237 | 238 | return(result); 239 | } 240 | 241 | 242 | /* 243 | * eigenvalues of a 
symmetric matrix using blas 244 | * 245 | */ 246 | 247 | void sym_eigen(gsl_matrix* m, gsl_vector* vals, gsl_matrix* vects) 248 | { 249 | gsl_eigen_symmv_workspace* wk; 250 | gsl_matrix* mcpy; 251 | int r; 252 | 253 | mcpy = gsl_matrix_alloc(m->size1, m->size2); 254 | wk = gsl_eigen_symmv_alloc(m->size1); 255 | gsl_matrix_memcpy(mcpy, m); 256 | r = gsl_eigen_symmv(mcpy, vals, vects, wk); 257 | gsl_eigen_symmv_free(wk); 258 | gsl_matrix_free(mcpy); 259 | } 260 | 261 | 262 | /* 263 | * sum of a vector 264 | * 265 | */ 266 | 267 | double sum(const gsl_vector* v) 268 | { 269 | double val = 0; 270 | int i, size = v->size; 271 | for (i = 0; i < size; i++) 272 | val += vget(v, i); 273 | return(val); 274 | } 275 | 276 | 277 | /* 278 | * take log of each element 279 | * 280 | */ 281 | 282 | void vct_log(gsl_vector* v) 283 | { 284 | int i, size = v->size; 285 | for (i = 0; i < size; i++) 286 | vset(v, i, safe_log(vget(v, i))); 287 | } 288 | 289 | 290 | /* 291 | * l2 norm of a vector 292 | * 293 | */ 294 | 295 | // !!! this can be BLASified 296 | 297 | double norm(gsl_vector *v) 298 | { 299 | double val = 0; 300 | int i; 301 | 302 | for (i = 0; i < v->size; i++) 303 | val += vget(v, i) * vget(v, i); 304 | return(sqrt(val)); 305 | } 306 | 307 | 308 | /* 309 | * draw K random integers from 0..N-1 310 | * 311 | */ 312 | 313 | void choose_k_from_n(int k, int n, int* result) 314 | { 315 | int i, x[n]; 316 | 317 | if (RANDOM_NUMBER_GENERATOR == NULL) 318 | RANDOM_NUMBER_GENERATOR = gsl_rng_alloc(gsl_rng_taus); 319 | for (i = 0; i < n; i++) 320 | x[i] = i; 321 | 322 | gsl_ran_choose (RANDOM_NUMBER_GENERATOR, (void *) result, k, 323 | (void *) x, n, sizeof(int)); 324 | } 325 | 326 | 327 | /* 328 | * normalize a vector in log space 329 | * 330 | * x_i = log(a_i) 331 | * v = log(a_1 + ... 
+ a_k) 332 | * x_i = x_i - v 333 | * 334 | */ 335 | 336 | void log_normalize(gsl_vector* x) 337 | { 338 | double v = vget(x, 0); 339 | int i; 340 | 341 | for (i = 1; i < x->size; i++) 342 | v = log_sum(v, vget(x, i)); 343 | 344 | for (i = 0; i < x->size; i++) 345 | vset(x, i, vget(x,i)-v); 346 | } 347 | 348 | 349 | /* 350 | * normalize a positive vector 351 | * 352 | */ 353 | 354 | void normalize(gsl_vector* x) 355 | { 356 | double v = 0; 357 | int i; 358 | 359 | for (i = 0; i < x->size; i++) 360 | v += vget(x, i); 361 | 362 | for (i = 0; i < x->size; i++) 363 | vset(x, i, vget(x, i) / v); 364 | } 365 | 366 | 367 | /* 368 | * exponentiate a vector 369 | * 370 | */ 371 | 372 | void vct_exp(gsl_vector* x) 373 | { 374 | int i; 375 | 376 | for (i = 0; i < x->size; i++) 377 | vset(x, i, exp(vget(x, i))); 378 | } 379 | 380 | 381 | /* 382 | * maximize a function using its derivative 383 | * 384 | */ 385 | 386 | void optimize_fdf(int dim, 387 | gsl_vector* x, 388 | void* params, 389 | void (*fdf)(const gsl_vector*, void*, double*, gsl_vector*), 390 | void (*df)(const gsl_vector*, void*, gsl_vector*), 391 | double (*f)(const gsl_vector*, void*), 392 | double* f_val, 393 | double* conv_val, 394 | int* niter) 395 | { 396 | gsl_multimin_function_fdf obj; 397 | obj.f = f; 398 | obj.df = df; 399 | obj.fdf = fdf; 400 | obj.n = dim; 401 | obj.params = params; 402 | 403 | // const gsl_multimin_fdfminimizer_type * method = 404 | // gsl_multimin_fdfminimizer_vector_bfgs; 405 | const gsl_multimin_fdfminimizer_type * method = 406 | gsl_multimin_fdfminimizer_conjugate_fr; 407 | 408 | gsl_multimin_fdfminimizer * opt = 409 | gsl_multimin_fdfminimizer_alloc(method, dim); 410 | 411 | gsl_multimin_fdfminimizer_set(opt, &obj, x, 0.01, 1e-3); 412 | 413 | int iter = 0, status; 414 | double converged, f_old = 0; 415 | do 416 | { 417 | iter++; 418 | status = gsl_multimin_fdfminimizer_iterate(opt); 419 | // assert(status==0); 420 | converged = fabs((f_old - opt->f) / (dim * f_old)); 421 | // 
status = gsl_multimin_test_gradient(opt->gradient, 1e-3); 422 | // printf("f = %1.15e; conv = %5.3e; norm = %5.3e; niter = %03d\n", 423 | // opt->f, converged, norm(opt->gradient), iter); 424 | f_old = opt->f; 425 | } 426 | while (converged > 1e-8 && iter < MAX_ITER); 427 | // while (status == GSL_CONTINUE); 428 | *f_val = opt->f; 429 | *conv_val = converged; 430 | *niter = iter; 431 | gsl_multimin_fdfminimizer_free(opt); 432 | } 433 | 434 | 435 | 436 | /* 437 | * maximize a function 438 | * 439 | */ 440 | 441 | void optimize_f(int dim, 442 | gsl_vector* x, 443 | void* params, 444 | double (*f)(const gsl_vector*, void*)) 445 | { 446 | gsl_multimin_function obj; 447 | obj.f = f; 448 | obj.n = dim; 449 | obj.params = params; 450 | 451 | const gsl_multimin_fminimizer_type * method = 452 | gsl_multimin_fminimizer_nmsimplex; 453 | 454 | gsl_multimin_fminimizer * opt = 455 | gsl_multimin_fminimizer_alloc(method, dim); 456 | 457 | gsl_vector * step_size = gsl_vector_alloc(dim); 458 | gsl_vector_set_all(step_size, 1); 459 | gsl_multimin_fminimizer_set(opt, &obj, x, step_size); 460 | 461 | int iter = 0, status; 462 | double converged, f_old; 463 | do 464 | { 465 | iter++; 466 | f_old = opt->fval; 467 | status = gsl_multimin_fminimizer_iterate(opt); 468 | converged = fabs((f_old - opt->fval) / f_old); 469 | printf("f = %1.15e; conv = %5.3e; size = %5.3e; niter = %03d\n", 470 | opt->fval, converged, opt->size, iter); 471 | } 472 | while ((converged > 1e-10) || (iter < 10000)); 473 | // while (status == GSL_CONTINUE); 474 | printf("f = %1.15e; conv = %5.3e; niter = %03d\n", 475 | opt->fval, converged, iter); 476 | 477 | gsl_multimin_fminimizer_free(opt); 478 | gsl_vector_free(step_size); 479 | } 480 | 481 | 482 | /* 483 | * check if a directory exists 484 | * 485 | * !!! 
shouldn't be here 486 | */ 487 | 488 | int directory_exist(const char *dname) 489 | { 490 | struct stat st; 491 | int ret; 492 | 493 | if (stat(dname,&st) != 0) 494 | { 495 | return 0; 496 | } 497 | 498 | ret = S_ISDIR(st.st_mode); 499 | 500 | if(!ret) 501 | { 502 | errno = ENOTDIR; 503 | } 504 | 505 | return ret; 506 | } 507 | 508 | void make_directory(char* name) 509 | { 510 | #if _POSIX_C_SOURCE || __MACH__ 511 | mkdir(name, S_IRUSR|S_IWUSR|S_IXUSR); 512 | #else 513 | mkdir(name); 514 | #endif 515 | } 516 | 517 | gsl_rng* new_random_number_generator() 518 | { 519 | gsl_rng* random_number_generator = gsl_rng_alloc(gsl_rng_taus); 520 | time_t t1; 521 | (void) time(&t1); 522 | 523 | if (FLAGS_rng_seed) { 524 | t1 = FLAGS_rng_seed; 525 | } 526 | 527 | // !!! DEBUG 528 | // t1 = 1147530551; 529 | printf("RANDOM SEED = %ld\n", t1); 530 | gsl_rng_set(random_number_generator, t1); 531 | return(random_number_generator); 532 | } 533 | 534 | -------------------------------------------------------------------------------- /dtm/data.c: -------------------------------------------------------------------------------- 1 | // Authors: David Blei (blei@cs.princeton.edu) 2 | // Sean Gerrish (sgerrish@cs.princeton.edu) 3 | // 4 | // Copyright 2011 Sean Gerrish and David Blei 5 | // All Rights Reserved. 6 | // 7 | // See the README for this package for details about modifying or 8 | // distributing this software. 9 | 10 | #include 11 | 12 | #define PI 3.141592653589793 13 | 14 | #include "data.h" 15 | 16 | DEFINE_double(sigma_l, 17 | 0.05, 18 | "If true, use the new phi calculation."); 19 | DEFINE_double(sigma_d, 20 | 0.05, 21 | "If true, use the new phi calculation."); 22 | DEFINE_double(sigma_c, 23 | 0.05, 24 | "c stdev."); 25 | DEFINE_double(sigma_cv, 26 | 1e-6, 27 | "Variational c stdev."); 28 | DEFINE_double(resolution, 29 | 1, 30 | "The resolution. 
Used to determine how far out the beta mean should be."); 31 | DEFINE_int32(max_number_time_points, 32 | 200, 33 | "Used for the influence window."); 34 | DEFINE_double(time_resolution, 35 | 0.5, 36 | "This is the number of years per time slice."); 37 | DEFINE_double(influence_mean_years, 38 | 20.0, 39 | "How many years is the mean number of citations?"); 40 | DEFINE_double(influence_stdev_years, 41 | 15.0, 42 | "How many years is the stdev number of citations?"); 43 | DEFINE_int32(influence_flat_years, 44 | -1, 45 | "How many years is the influence nonzero?" 46 | "If nonpositive, a lognormal distribution is used."); 47 | 48 | DECLARE_string(normalize_docs); 49 | 50 | /* 51 | * seq corpus range: [start, end) 52 | * 53 | * creates a subset of time slices 54 | * 55 | */ 56 | 57 | corpus_seq_t* make_corpus_seq_subset(corpus_seq_t* all, int start, int end) 58 | { 59 | int n; 60 | corpus_seq_t* subset_corpus = (corpus_seq_t*) malloc(sizeof(corpus_seq_t)); 61 | subset_corpus->nterms = all->nterms; 62 | subset_corpus->len = end - start; 63 | subset_corpus->ndocs = 0; 64 | subset_corpus->corpus = (corpus_t**) malloc(sizeof(corpus_t*) * subset_corpus->len); 65 | for (n = start; n < end; n++) 66 | { 67 | subset_corpus->corpus[n - start] = all->corpus[n]; 68 | subset_corpus->ndocs += all->corpus[n]->ndocs; 69 | } 70 | return(subset_corpus); 71 | } 72 | 73 | 74 | /* 75 | * collapse a sequential corpus to a flat corpus 76 | * 77 | */ 78 | 79 | corpus_t* collapse_corpus_seq(corpus_seq_t* c) 80 | { 81 | corpus_t* collapsed = (corpus_t*) malloc(sizeof(corpus_t)); 82 | collapsed->ndocs = c->ndocs; 83 | collapsed->nterms = c->nterms; 84 | collapsed->doc = (doc_t**) malloc(sizeof(doc_t*) * c->ndocs); 85 | collapsed->max_unique = 0; 86 | int t, n, doc_idx = 0; 87 | for (t = 0; t < c->len; t++) 88 | { 89 | for (n = 0; n < c->corpus[t]->ndocs; n++) 90 | { 91 | collapsed->doc[doc_idx] = c->corpus[t]->doc[n]; 92 | if (collapsed->doc[doc_idx]->nterms > collapsed->max_unique) 93 | 
collapsed->max_unique = collapsed->doc[doc_idx]->nterms; 94 | doc_idx++; 95 | } 96 | } 97 | assert(doc_idx == collapsed->ndocs); 98 | return(collapsed); 99 | } 100 | 101 | /* 102 | * read corpus 103 | * 104 | */ 105 | 106 | corpus_t* read_corpus(const char* name) 107 | { 108 | int length, count, word, n; 109 | corpus_t* c; 110 | char filename[400]; 111 | sprintf(filename, "%s-mult.dat", name); 112 | outlog("reading corpus from %s", filename); 113 | c = (corpus_t*) malloc(sizeof(corpus_t)); 114 | c->max_unique = 0; 115 | FILE* fileptr = fopen(filename, "r"); 116 | if (fileptr == NULL) { 117 | outlog("Error reading corpus prefix %s. Failing.", 118 | filename); 119 | exit(1); 120 | } 121 | c->ndocs = 0; c->nterms = 0; 122 | c->doc = (doc_t**) malloc(sizeof(doc_t*)); 123 | int grand_total = 0; 124 | while ((fscanf(fileptr, "%10d", &length) != EOF)) 125 | { 126 | if (length > c->max_unique) c->max_unique = length; 127 | c->doc = (doc_t**) realloc(c->doc, sizeof(doc_t*)*(c->ndocs+1)); 128 | c->doc[c->ndocs] = (doc_t*) malloc(sizeof(doc_t)); 129 | c->doc[c->ndocs]->nterms = length; 130 | c->doc[c->ndocs]->total = 0; 131 | c->doc[c->ndocs]->log_likelihood = 0.0; 132 | 133 | c->doc[c->ndocs]->word = (int*) malloc(sizeof(int)*length); 134 | c->doc[c->ndocs]->count = (int*) malloc(sizeof(int)*length); 135 | c->doc[c->ndocs]->lambda = (double*) malloc(sizeof(double)*length); 136 | c->doc[c->ndocs]->log_likelihoods = (double*) malloc(sizeof(double)*length); 137 | for (n = 0; n < length; n++) 138 | { 139 | fscanf(fileptr, "%10d:%10d", &word, &count); 140 | word = word - OFFSET; 141 | if (FLAGS_normalize_docs == "occurrence") { 142 | count = 1; 143 | } 144 | c->doc[c->ndocs]->word[n] = word; 145 | c->doc[c->ndocs]->count[n] = count; 146 | c->doc[c->ndocs]->total += count; 147 | // Is there a better value for initializing lambda? 
148 | c->doc[c->ndocs]->lambda[n] = 0.0; 149 | c->doc[c->ndocs]->log_likelihoods[n] = 0.0; 150 | if (word >= c->nterms) { c->nterms = word + 1; } 151 | } 152 | grand_total += c->doc[c->ndocs]->total; 153 | c->ndocs = c->ndocs + 1; 154 | } 155 | fclose(fileptr); 156 | outlog("read corpus (ndocs = %d; nterms = %d; nwords = %d)\n", 157 | c->ndocs, c->nterms, grand_total); 158 | return(c); 159 | } 160 | 161 | /* 162 | * read corpus sequence 163 | * 164 | */ 165 | 166 | corpus_seq_t* read_corpus_seq(const char* name) 167 | { 168 | char filename[400]; 169 | corpus_seq_t* corpus_seq = (corpus_seq_t*) malloc(sizeof(corpus_seq_t)); 170 | 171 | // read corpus 172 | corpus_t* raw_corpus = read_corpus(name); 173 | corpus_seq->nterms = raw_corpus->nterms; 174 | // read sequence information 175 | sprintf(filename, "%s-seq.dat", name); 176 | outlog("Reading corpus sequence %s.", filename); 177 | FILE* fileptr = fopen(filename, "r"); 178 | if (!fileptr) { 179 | outlog("Error opening dtm sequence file %s.\n", 180 | filename); 181 | exit(1); 182 | } 183 | fscanf(fileptr, "%d", &(corpus_seq->len)); 184 | corpus_seq->corpus = (corpus_t**) malloc(sizeof(corpus_t*) * corpus_seq->len); 185 | // allocate corpora 186 | int doc_idx = 0; 187 | int ndocs, i, j; 188 | corpus_seq->ndocs = 0; 189 | for (i = 0; i < corpus_seq->len; ++i) 190 | { 191 | fscanf(fileptr, "%d", &ndocs); 192 | corpus_seq->ndocs += ndocs; 193 | corpus_seq->corpus[i] = (corpus_t*) malloc(sizeof(corpus_t)); 194 | corpus_seq->corpus[i]->ndocs = ndocs; 195 | corpus_seq->corpus[i]->doc = (doc_t**) malloc(sizeof(doc_t*) * ndocs); 196 | for (j = 0; j < ndocs; j++) 197 | { 198 | if (doc_idx >= raw_corpus->ndocs) { 199 | outlog("Error: too few documents listed in dtm sequence file %s.\n" 200 | "Current line: %d %d %d.\n", 201 | filename, 202 | doc_idx, 203 | ndocs, 204 | j); 205 | exit(1); 206 | } 207 | // outlog("%d %d %d %d\n", i, j, doc_idx, raw_corpus->ndocs); 208 | corpus_seq->corpus[i]->doc[j] = raw_corpus->doc[doc_idx]; 
209 | doc_idx++; 210 | } 211 | } 212 | corpus_seq->max_nterms = compute_max_nterms(corpus_seq); 213 | outlog("read corpus of length %d\n", corpus_seq->len); 214 | return(corpus_seq); 215 | } 216 | 217 | 218 | /* 219 | * write sequential corpus 220 | * 221 | */ 222 | 223 | void write_corpus_seq(corpus_seq_t* c, char* name) 224 | { 225 | char tmp_string[400]; 226 | int n; 227 | 228 | outlog("writing %d slices to %s (%d total docs)", c->len, name, c->ndocs); 229 | sprintf(tmp_string, "%s-seq.dat", name); 230 | FILE* seq_file = fopen(tmp_string, "w"); 231 | fprintf(seq_file, "%d", c->len); 232 | for (n = 0; n < c->len; n++) 233 | fprintf(seq_file, " %d", c->corpus[n]->ndocs); 234 | fclose(seq_file); 235 | 236 | corpus_t* flat = collapse_corpus_seq(c); 237 | sprintf(tmp_string, "%s-mult.dat", name); 238 | write_corpus(flat, tmp_string); 239 | } 240 | 241 | /* 242 | * write corpus 243 | * 244 | */ 245 | 246 | void write_corpus(corpus_t* c, char* filename) 247 | { 248 | int i, j; 249 | FILE * fileptr; 250 | doc_t * d; 251 | outlog("writing %d docs to %s\n", c->ndocs, filename); 252 | fileptr = fopen(filename, "w"); 253 | for (i = 0; i < c->ndocs; i++) 254 | { 255 | d = c->doc[i]; 256 | fprintf(fileptr, "%d", d->nterms); 257 | for (j = 0; j < d->nterms; j++) 258 | { 259 | fprintf(fileptr, " %d:%d", d->word[j], d->count[j]); 260 | } 261 | fprintf(fileptr, "\n"); 262 | } 263 | fclose(fileptr); 264 | } 265 | 266 | 267 | /* 268 | * compute the maximum nterms in a corpus sequence 269 | * 270 | */ 271 | 272 | int compute_max_nterms(const corpus_seq_t* c) 273 | { 274 | int i,j; 275 | int max = 0; 276 | for (i = 0; i < c->len; i++) 277 | { 278 | corpus_t* corpus = c->corpus[i]; 279 | for (j = 0; j < corpus->ndocs; j++) 280 | if (corpus->doc[j]->nterms > max) 281 | max = corpus->doc[j]->nterms; 282 | } 283 | return(max); 284 | } 285 | 286 | 287 | /* 288 | * compute the total matrix of counts (W x T) 289 | * 290 | */ 291 | 292 | gsl_matrix* compute_total_counts(const corpus_seq_t* 
c) 293 | { 294 | int t, d, n; 295 | gsl_matrix* ret = gsl_matrix_alloc(c->nterms, c->len); 296 | 297 | for (t = 0; t < c->len; t++) 298 | { 299 | corpus_t* corpus = c->corpus[t]; 300 | for (d = 0; d < corpus->ndocs; d++) 301 | { 302 | doc_t* doc = corpus->doc[d]; 303 | for (n = 0; n < doc->nterms; n++) 304 | { 305 | minc(ret, doc->word[n], t, (double) doc->count[n]); 306 | } 307 | } 308 | } 309 | return(ret); 310 | } 311 | 312 | /** 313 | * Creates a new array of doubles with kScaledBetaMax elements. 314 | */ 315 | double * NewScaledInfluence(int size) 316 | { 317 | double* scaled_influence = new double[size]; 318 | 319 | if (FLAGS_influence_flat_years > 0) { 320 | // Note that we round up, to make sure we have at least one epoch. 321 | int number_epochs = FLAGS_influence_flat_years * FLAGS_time_resolution; 322 | double epoch_weight = 1.0 / number_epochs; 323 | for (int i = 0; i < number_epochs; ++i) { 324 | scaled_influence[i] = epoch_weight; 325 | } 326 | for (int i = number_epochs; i < size; ++i) { 327 | scaled_influence[i] = 0.0; 328 | } 329 | return scaled_influence; 330 | } 331 | 332 | 333 | /* 334 | // Use the simple distribution: 1 at [0], 0 everywhere else. 335 | for (int i=0; i < size; ++i) { 336 | scaled_influence[i] = 0.0; 337 | } 338 | scaled_influence[0] = 1.0; 339 | return scaled_influence; 340 | */ 341 | 342 | /* 343 | // Simulate a beta distribution with specified mean and variance. 344 | double total = 0.0; 345 | double tmp = (scaled_beta_mean * (1 - scaled_beta_mean) / scaled_beta_variance) - 1.0; 346 | double beta_alpha = scaled_beta_mean * tmp; 347 | double beta_beta = (1 - scaled_beta_mean) * tmp; 348 | for (int i = 0; i < scaled_beta_max; ++i) { 349 | // Offset tmp by 0.5 so we get a centered distribution and don't run into degeneracy issues. 
350 | tmp = (i + 0.5) / (scaled_beta_max); 351 | scaled_beta[i] = (pow(tmp, beta_alpha - 1.0) * pow(1 - tmp, beta_beta - 1.0)); 352 | total += scaled_beta[i]; 353 | } 354 | */ 355 | 356 | 357 | // Handle the log-normal distribution. 358 | double total = 0.0; 359 | 360 | // Here, we're interested more in the median. So we treat the variable mean as 361 | // median and note this in our paper. 362 | double scaled_influence_mean = FLAGS_influence_mean_years; 363 | double scaled_influence_variance = (FLAGS_influence_stdev_years * FLAGS_influence_stdev_years); 364 | double tmp = (1.0 + (scaled_influence_variance / (scaled_influence_mean * scaled_influence_mean))); 365 | double lognormal_sigma_squared = log(tmp); 366 | double lognormal_mu = (log(scaled_influence_mean) - 0.5 * lognormal_sigma_squared); 367 | double halfTimeframe = (1.0 / FLAGS_time_resolution) / 2; 368 | printf("Median: %.2f\n", exp(lognormal_mu)); 369 | for (int i = 0; i < size; ++i) { 370 | // Shift right by half a timeframe to avoid corner cases. 371 | double x = (i / FLAGS_time_resolution) + halfTimeframe; 372 | double tmp2 = (log(x) - lognormal_mu); 373 | scaled_influence[i] = (1.0 / (x * sqrt(lognormal_sigma_squared * 2 * PI)) * exp(-tmp2 * tmp2/ (2.0 * lognormal_sigma_squared))); 374 | total += scaled_influence[i]; 375 | } 376 | for (int i = 0; i < kScaledInfluenceMax; ++i) { 377 | scaled_influence[i] /= total; 378 | } 379 | 380 | return scaled_influence; 381 | } 382 | -------------------------------------------------------------------------------- /doc/dtm.tex: -------------------------------------------------------------------------------- 1 | \section{Dynamic Topic Model (DTM)} 2 | 3 | 4 | \subsection{Formato dos dados de entrada} 5 | 6 | A ferramenta \srccode{dtm} requer, no mínimo, dois arquivos de entrada: 7 | um para a descrição de cada documento e seus respectivos termos e outro 8 | para identificar as fatias de tempo a serem analisadas. 
9 | 10 | O primeiro arquivo, geralmente definido com o nome \srccode{???-mult.dat}, 11 | contém M linhas, sendo M a quantidade de documentos a serem analisados. 12 | Os documentos devem ser ordenados pela data, em ordem crescente. 13 | Cada linha descreve um documento, os seus termos e a quantidade de cada 14 | termo no documento, de acordo com o seguinte formato: 15 | 16 | \begin{lstlisting} 17 | unique_word_count index1:count1 index2:count2 ... indexn:countn 18 | \end{lstlisting} 19 | 20 | Não existe um identificador para o documento: o número da linha é utilizado 21 | para esse fim. Os termos que compõem o documento também não são declarados 22 | de forma textual: deve-se adotar um identificador numérico para cada termo. 23 | Esse identificador deve ser único para o mesmo termo em relação a todos os 24 | documentos (veja isto como uma otimização: é mais rápido processar um número 25 | do que uma palavra). A contagem corresponde à frequência absoluta do termo 26 | no documento da linha atual. Finalmente, o primeiro elemento da linha indica 27 | o tamanho do vocabulário necessário para descrever o documento (veja isto 28 | como um facilitador para ler os dados restantes da linha). 29 | 30 | Por exemplo, considere os documentos da \cref{}. A definição deles no formato 31 | DTM é apresentada na \cref{}. Observe que, para o segundo documento, alguns 32 | termos que já tinham sido utilizados para especificar o primeiro documento 33 | apareceram. Logo, o índice que identifica o termo é o mesmo (\eg{}, 3, 9 e 14), 34 | embora a contagem seja particular a cada documento (\eg{}, o termo 3 apareceu 35 | 3 vezes no primeiro documento e apenas 1 vez no segundo documento). 
36 | 37 | \begin{figure} 38 | \begin{itemize} 39 | \item Documento 1~\cite{Kulesza-etal2007}: 40 | \\``The development of collaborative and multimedia systems is a 41 | complex task and one of the key challenges is to promote the reuse and 42 | integration of those two software categories in the same environment.'' 43 | 44 | \item Documento 2~\cite{Bezerra-Wainer2006}: 45 | ``This work shows a model to detect a set of anomalous traces in a log 46 | generated by a business process management system.'' 47 | \end{itemize} 48 | 49 | \begin{lstlisting} 50 | 24 1:4 2:1 3:3 4:1 5:3 6:1 7:1 8:2 9:1 10:1 11:1 12:1 13:1 14:1 15:1 16:1 17:1 18:1 19:1 20:1 21:1 22:1 23:1 24:1 51 | 17 3:1 9:4 14:1 22:1 25:1 26:1 27:1 28:1 29:1 30:1 31:1 32:1 33:1 34:1 35:1 36:1 37:1 52 | \end{lstlisting} 53 | 54 | \begin{tabular} 55 | 1. the 56 | 2. development 57 | 3. of 58 | 4. collaborative 59 | 5. and 60 | 6. multimedia 61 | 7. systems 62 | 8 is 63 | 9 a 64 | 10 complex 65 | 11 task 66 | 12 one 67 | 13 challenges 68 | 14 to 69 | 15 promote 70 | 16 reuse 71 | 17 integration 72 | 18 those 73 | 19 two 74 | 20 software 75 | 21 categories 76 | 22 in 77 | 23 same 78 | 24 environment 79 | 25 this 80 | 26 work 81 | 27 shows 82 | 28 model 83 | 29 detect 84 | 30 set 85 | 31 log 86 | 32 generated 87 | 33 by 88 | 34 business 89 | 35 process 90 | 36 management 91 | 37 system 92 | \end{tabular} 93 | \end{figure} 94 | 95 | Para facilitar a posterior análise dos dados, é necessário guardar a 96 | informação de qual termo corresponde a cada identificador de termo e 97 | de qual documento corresponde a cada identificador de documento. 98 | Para o primeiro caso, deve-se criar um arquivo \srccode{vocab} com o 99 | vocabulário utilizado na coleção analisada. O formato deste arquivo é 100 | bem simples: coloca-se um termo por linha, sendo que o número da linha 101 | corresponde ao identificador do termo. 
Para o segundo caso, deve-se criar 102 | um arquivo \srccode{docs} com o nome de cada documento, organizados na 103 | mesma ordem em que foram especificados no arquivo de entrada (\srccode{-mult.dat}). 104 | Dessa forma, os dados em \cref{} podem ser posteriormente recuperados, 105 | facilitando a análise dos resultados pelos pesquisadores. 106 | 107 | 108 | O segundo arquivo de entrada do DTM define as fatias de tempo que serão 109 | analisadas. Esse arquivo, geralmente nomeado \srccode{???-seq.dat}, adota 110 | o seguinte formato: 111 | 112 | \begin{lstlisting} 113 | Number_Timestamps 114 | number_docs_time_1 115 | ... 116 | number_docs_time_i 117 | ... 118 | number_docs_time_NumberTimestamps 119 | \end{lstlisting} 120 | 121 | A primeira linha determina a quantidade de fatias de tempo a serem analisadas. 122 | As linhas seguintes especificam quantos documentos fazem parte de cada fatia, 123 | em ordem crescente. Esses documentos são obtidos em sequência, do início para 124 | o fim, do arquivo que descreve a coleção de dados a serem analisados. Como 125 | aquele arquivo descreve um documento por linha, serão utilizadas as $M_1$ 126 | primeiras linhas para a primeira fatia de tempo, as $M_2$ linhas seguintes para 127 | a segunda fatia de tempo e assim por diante. 128 | 129 | Por exemplo, observando-se o \cref{}, para a fatia de tempo 1, definida na 130 | linha 2, serão considerados 15 documentos. 131 | 132 | 133 | 134 | 135 | \subsection{Configuração} 136 | 137 | \begin{lstlisting} 138 | Flags from data.c: 139 | -influence_flat_years (How many years is the influence nonzero?If 140 | nonpositive, a lognormal distribution is used.) type: int32 default: -1 141 | -influence_mean_years (How many years is the mean number of citations?) 142 | type: double default: 20 143 | -influence_stdev_years (How many years is the stdev number of citations?) 144 | type: double default: 15 145 | -max_number_time_points (Used for the influence window.) 
type: int32 146 | default: 200 147 | -resolution (The resolution. Used to determine how far out the beta mean 148 | should be.) type: double default: 1 149 | -sigma_c (c stdev.) type: double default: 0.050000000000000003 150 | -sigma_cv (Variational c stdev.) type: double 151 | default: 9.9999999999999995e-07 152 | -sigma_d (If true, use the new phi calculation.) type: double 153 | default: 0.050000000000000003 154 | -sigma_l (If true, use the new phi calculation.) type: double 155 | default: 0.050000000000000003 156 | -time_resolution (This is the number of years per time slice.) type: double 157 | default: 0.5 158 | 159 | Flags from gsl-wrappers.c: 160 | -rng_seed (Specifies the random seed. If 0, seeds pseudo-randomly.) 161 | type: int64 default: 0 162 | 163 | Flags from lda-seq.c: 164 | -fix_topics (Fix a set of this many topics. This amounts to fixing these 165 | topics' variance at 1e-10.) type: int32 default: 0 166 | -forward_window (The forward window for deltas. If negative, we use a beta 167 | with mean 5.) type: int32 default: 1 168 | -lda_sequence_max_iter (The maximum number of iterations.) type: int32 169 | default: 20 170 | -lda_sequence_min_iter (The maximum number of iterations.) type: int32 171 | default: 1 172 | -normalize_docs (Describes how documents's wordcounts are considered for 173 | finding influence. Options are "normalize", "none", "occurrence", "log", 174 | or "log_norm".) type: string default: "normalize" 175 | -save_time (Save a specific time. If -1, save all times.) type: int32 176 | default: 2147483647 177 | 178 | Flags from lda.c: 179 | -lambda_convergence (Specifies the level of convergence required for lambda 180 | in the phi updates.) type: double default: 0.01 181 | 182 | Flags from main.c: 183 | -alpha () type: double default: -10 184 | -corpus_prefix (The function to perform. Can be dtm or dim.) 
type: string 185 | default: "" 186 | -end () type: int32 default: -1 187 | -heldout_corpus_prefix () type: string default: "" 188 | -heldout_time (A time up to (but not including) which we wish to train, and 189 | at which we wish to test.) type: int32 default: -1 190 | -initialize_lda (If true, initialize the model with lda.) type: bool 191 | default: false 192 | -lda_max_em_iter () type: int32 default: 20 193 | -lda_model_prefix (The name of a fit model to be used for testing 194 | likelihood. Appending "info.dat" to this should give the name of the 195 | file.) type: string default: "" 196 | -mode (The function to perform. Can be fit, est, or time.) type: string 197 | default: "fit" 198 | -model (The function to perform. Can be dtm or dim.) type: string 199 | default: "dtm" 200 | -ntopics () type: double default: -1 201 | -outname () type: string default: "" 202 | -output_table () type: string default: "" 203 | -params_file (A file containing parameters for this run.) type: string 204 | default: "settings.txt" 205 | -start () type: int32 default: -1 206 | -top_chain_var () type: double default: 0.0050000000000000001 207 | -top_obs_var () type: double default: 0.5 208 | \end{lstlisting} 209 | 210 | 211 | 212 | 213 | \subsection{Running} 214 | 215 | This program takes as input a collection of text documents and creates 216 | as output a list of topics over time, a description of each document 217 | as a mixture of these topics, and (possibly) a measure of how 218 | "influential" each document is, based on its language. 219 | 220 | We have provided an example dataset, instructions for formatting input 221 | data and processing output files, and example command lines for 222 | running this software in the file dtm/sample.sh. 
223 | 224 | 225 | \subsubsection{Topic estimation} 226 | 227 | ./main \ 228 | --ntopics=20 \ 229 | --mode=fit \ 230 | --rng_seed=0 \ 231 | --initialize_lda=true \ 232 | --corpus_prefix=example/test \ 233 | --outname=example/model_run \ 234 | --top_chain_var=0.005 \ 235 | --alpha=0.01 \ 236 | --lda_sequence_min_iter=6 \ 237 | --lda_sequence_max_iter=20 \ 238 | --lda_max_em_iter=10 239 | 240 | 241 | \subsubsection{Topic inference} 242 | 243 | ./main \ 244 | --mode=fit \ 245 | --rng_seed=0 \ 246 | --model=fixed \ 247 | --initialize_lda=true \ 248 | --corpus_prefix=example/test \ 249 | --outname=example/output \ 250 | --time_resolution=2 \ 251 | --influence_flat_years=5 \ 252 | --top_obs_var=0.5 \ 253 | --top_chain_var=0.005 \ 254 | --sigma_d=0.0001 \ 255 | --sigma_l=0.0001 \ 256 | --alpha=0.01 \ 257 | --lda_sequence_min_iter=6 \ 258 | --lda_sequence_max_iter=20 \ 259 | --save_time=-1 \ 260 | --ntopics=10 \ 261 | --lda_max_em_iter=10 262 | 263 | 264 | 265 | \subsection{Resultado} 266 | 267 | O programa \srccode{dtm} cria os seguintes arquivos: 268 | 269 | \begin{itemize} 270 | \item \srccode{topic-???-var-e-log-prob.dat}: a distribuição das 271 | palavras (e-betas) para o tópico ??? para cada período analisado. 272 | 273 | Os dados contidos no arquivo estão no formato \foreign{row-major}, 274 | ou seja, as linhas da tabela estão armazenadas uma após a outra. 275 | Cada linha, por sua vez, possui uma quantidade de colunas equivalente 276 | à quantidade de frações de tempo analisadas. 
Por exemplo, se foram 277 | analisadas 10 fatias de tempo, o comando para ler os dados relativos 278 | ao tópico 2 seria, em R: 279 | 280 | \begin{lstlisting}[language=R] 281 | a = scan("topic-002-var-e-log-prob.dat") 282 | b = matrix(a, ncol=10, byrow=TRUE) 283 | \end{lstlisting} 284 | 285 | Cada célula da matriz possui o logaritmo natural da probabilidade do 286 | termo M (sendo M o número identificador do termo, tal como definido 287 | no vocabulário e na matriz de entrada), o qual está na linha M da 288 | matriz, em relação à fatia de tempo identificada pelo número N, a qual 289 | corresponde à coluna da matriz. Por exemplo, para obter a 290 | probabilidade do termo 100 para a fatia de tempo 3, o comando seria: 291 | 292 | \begin{lstlisting}[language=R] 293 | exp(b[100, 3]) 294 | \end{lstlisting} 295 | 296 | 297 | \item \srccode{gam.dat}: Armazena os parâmetros do Dirichlet variacional 298 | para cada documento. 299 | 300 | 301 | Divide these by the sum for each document to get expected topic mixtures. 302 | \begin{lstlisting}[language=R] 303 | a = scan("gam.dat") 304 | b = matrix(a, ncol=10, byrow=TRUE) 305 | rs = rowSums(b) 306 | e.theta = b / rs 307 | # Proportion of topic 5 in document 3: 308 | e.theta[3, 5] 309 | \end{lstlisting} 310 | 311 | 312 | \item \srccode{influence_time-???}: Armazena a influência dos documentos na 313 | fatia de tempo ??? para cada tópico. Cada linha do arquivo corresponde ao 314 | documento M, sendo tal identificador M equivalente à linha em que o documento 315 | em questão se encontra na matriz de entrada (-mult). Cada coluna corresponde 316 | a um tópico N. 
Por exemplo, para obter a influência do documento 2 no tópico 317 | 5, os comandos em R seriam: 318 | 319 | \begin{lstlisting}[language=R] 320 | a = scan("influence_time-010") 321 | b = matrix(a, ncol=10, byrow=TRUE) 322 | b[2, 5] 323 | \end{lstlisting} 324 | \end{itemize} 325 | 326 | 327 | A análise de todos esses arquivos pode ser automatizada da seguinte forma: 328 | 329 | \begin{lstlisting}[language=R] 330 | # Para um tópico 331 | data0 = scan("topic-000-var-e-log-prob.dat") 332 | b0 = matrix(data0, ncol=10, byrow=TRUE) 333 | write.table(b0, file="dist-topic0.csv", sep=";") 334 | 335 | 336 | # Processa todos os tópicos 337 | # Para cada tópico, gera um arquivo com a probabilidade de cada 338 | # termo para cada ano 339 | # TODO: rodar exp() nos valores 340 | topics = list() 341 | for (i in 0:9) { 342 | filename = paste("topic-00", i, sep = "") 343 | filename = paste(filename, "-var-e-log-prob.dat", sep = "") 344 | data = scan(filename) 345 | topic = matrix(data, ncol=10, byrow=TRUE) 346 | filename = paste("dist-topic", i, sep = "") 347 | filename = paste(filename, ".csv", sep = "") 348 | write.table(topic, file=filename, sep=";") 349 | } 350 | 351 | 352 | # - gam.dat: The gammas associated with each document. Divide these by 353 | # the sum for each document to get expected topic mixtures. 354 | # Proportion of topic 5 in document 3: 355 | # e.theta[3, 5] 356 | a = scan("gam.dat") 357 | b = matrix(a, ncol=10, byrow=TRUE) 358 | rs = rowSums(b) 359 | e.theta = b / rs 360 | write.table(e.theta, file="documents_topics.csv", sep=";") 361 | \end{lstlisting} 362 | 363 | 364 | --------------------------------------------------------------------------------