├── .gitignore
├── bin
    ├── dtm-linux32
    ├── dtm-linux64
    ├── dtm-darwin64
    ├── dtm-win32.exe
    └── dtm-win64.exe
├── dtm
    ├── example
    │   └── test-seq.dat
    ├── main.h
    ├── Makefile
    ├── params.h
    ├── util.h
    ├── build.sh
    ├── param.h
    ├── gsl-wrappers.h
    ├── ss-lm.h
    ├── lda-seq.h
    ├── sample.sh
    ├── params.c
    ├── lda.h
    ├── data.h
    ├── util.c
    ├── main.c
    ├── gsl-wrappers.c
    └── data.c
├── lib
    └── math
    │   ├── logspace.h
    │   ├── gradient_projection.h
    │   ├── specialfunc.h
    │   ├── logspace_base.cpp
    │   ├── logspace_base.h
    │   ├── optimizer.h
    │   ├── logspace.cpp
    │   ├── gradient_projection_test.cpp
    │   ├── specialfunc.cpp
    │   ├── gsl_matrix.h
    │   ├── gradient_projection.cpp
    │   ├── gsl_vector.h
    │   └── vectorops.h
├── gslwrap
    ├── include
    │   └── gslwrap
    │   │   ├── permutation.h
    │   │   ├── matrix_vector_operators.h
    │   │   ├── random_number_distribution.h
    │   │   ├── random_generator.h
    │   │   ├── histogram.h
    │   │   ├── min_fminimizer.h
    │   │   ├── multimin_fdfminimizer.h
    │   │   ├── matrix_double.h
    │   │   ├── matrix_int.h
    │   │   └── matrix_float.h
    └── bin
    │   └── gslwrap-config
├── doc
    ├── HOWTO
    ├── lda.tex
    └── dtm.tex
└── README


/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | 


--------------------------------------------------------------------------------
/bin/dtm-linux32:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/magsilva/dtm-old/HEAD/bin/dtm-linux32


--------------------------------------------------------------------------------
/bin/dtm-linux64:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/magsilva/dtm-old/HEAD/bin/dtm-linux64


--------------------------------------------------------------------------------
/bin/dtm-darwin64:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/magsilva/dtm-old/HEAD/bin/dtm-darwin64


--------------------------------------------------------------------------------
/bin/dtm-win32.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/magsilva/dtm-old/HEAD/bin/dtm-win32.exe


--------------------------------------------------------------------------------
/bin/dtm-win64.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/magsilva/dtm-old/HEAD/bin/dtm-win64.exe


--------------------------------------------------------------------------------
/dtm/example/test-seq.dat:
--------------------------------------------------------------------------------
 1 | 10
 2 | 25
 3 | 50
 4 | 75
 5 | 100
 6 | 100
 7 | 100
 8 | 100
 9 | 125
10 | 150
11 | 175


--------------------------------------------------------------------------------
/dtm/main.h:
--------------------------------------------------------------------------------
 1 | #ifndef MAINH
 2 | #define MAINH
 3 | 
 4 | #include <stdlib.h>
 5 | #include <string.h>
 6 | #include "data.h"
 7 | #include "lda-seq.h"
 8 | #include "lda.h"
 9 | #include <gsl/gsl_matrix.h>
10 | 
11 | typedef struct dtm_fit_params  /* NOTE(review): presumably the settings for one model-fitting run -- confirm against main.c */
12 | {
13 |     char* datafile;         /* presumably the input corpus file -- TODO confirm */
14 |     char* outname;          /* presumably the output name/prefix (cf. an --outname flag) -- TODO confirm */
15 |     char* heldout;          /* presumably a held-out data file -- TODO confirm */
16 |     int start;              /* NOTE(review): lower bound of some range; units are not visible in this header */
17 |     int end;                /* NOTE(review): upper bound of the same range; inclusive/exclusive unknown */
18 |     int ntopics;            /* presumably the number of topics to fit -- TODO confirm */
19 |     int lda_max_em_iter;    /* presumably a cap on LDA EM iterations -- TODO confirm */
20 |     double top_obs_var;     /* NOTE(review): looks like an observation variance for the topic chain -- confirm */
21 |     double top_chain_var;   /* NOTE(review): looks like the topic-chain (time evolution) variance -- confirm */
22 |     double alpha;           /* presumably the Dirichlet hyperparameter on topic proportions -- TODO confirm */
23 | } dtm_fit_params;
24 | 
25 | #endif
26 | 


--------------------------------------------------------------------------------
/dtm/Makefile:
--------------------------------------------------------------------------------
 1 | # Builds the dtm binary from the C sources in this directory; every object
 2 | # is compiled with $(CXX).
 3 | #
 4 | # Knobs (override on the command line or through the environment):
 5 | #   OUTPUT_FILE    name of the linked binary (default: dtm)
 6 | #   OPTFLAGS       optimisation flags, used for compiling and for linking
 7 | #   STRIP          strip program run on the linked binary
 8 | #   EXTRA_LDFLAGS  extra flags/libraries appended to the link line
 9 | 
10 | # Remove a half-written target when its recipe fails (e.g. strip failing).
11 | .DELETE_ON_ERROR:
12 | # No legacy suffix rules; the pattern rule below is the only compile rule.
13 | .SUFFIXES:
14 | 
15 | LIB = ../lib
16 | GSLWRAP_LIB = ../gslwrap
17 | OUTPUT_FILE ?= dtm
18 | OPTFLAGS ?= -O2
19 | STRIP ?= strip
20 | EXTRA_LDFLAGS =
21 | 
22 | # `override` keeps the mandatory include paths and libraries in place even
23 | # when CXXFLAGS/LDFLAGS are given on the make command line
24 | # (e.g. `make CXXFLAGS=-m32`), which would otherwise replace them entirely.
25 | override CXXFLAGS += -I $(LIB) -I $(LIB)/math -I $(GSLWRAP_LIB)/include -I $(GSLWRAP_LIB)/include/gslwrap
26 | override LDFLAGS += -lgsl -lm -lgslcblas -lgflags
27 | 
28 | LOBJECTS = ss-lm.o gsl-wrappers.o data.o param.o util.o lda-seq.o lda.o params.o main.o
29 | 
30 | .PHONY: all clean distclean
31 | 
32 | all: main
33 | 
34 | # `main` stays available as an alias for the binary.  The alias is skipped
35 | # when the binary itself is called `main`, to avoid a circular rule.
36 | ifneq ($(OUTPUT_FILE),main)
37 | .PHONY: main
38 | main: $(OUTPUT_FILE)
39 | endif
40 | 
41 | # OPTFLAGS is passed here as well, so the objects are actually optimised.
42 | %.o: %.c
43 | 	$(CXX) $(OPTFLAGS) $(CXXFLAGS) -c $< -o $@
44 | 
45 | # The real file target: relinks only when an object is newer than the binary.
46 | $(OUTPUT_FILE): $(LOBJECTS)
47 | 	$(CXX) $(OPTFLAGS) $(CXXFLAGS) $(LOBJECTS) -o $@ $(LDFLAGS) $(EXTRA_LDFLAGS)
48 | 	$(STRIP) $@
49 | 
50 | clean:
51 | 	$(RM) *.o
52 | 
53 | # Also removes the per-platform binaries named $(OUTPUT_FILE)-<platform>.
54 | distclean: clean
55 | 	$(RM) $(OUTPUT_FILE) $(OUTPUT_FILE)-*
56 | 


--------------------------------------------------------------------------------
/dtm/params.h:
--------------------------------------------------------------------------------
 1 | // Author: David Blei (blei@cs.princeton.edu)
 2 | //
 3 | // Copyright 2006 David Blei
 4 | // All Rights Reserved.
 5 | //
 6 | // See the README for this package for details about modifying or
 7 | // distributing this software.
 8 | 
 9 | #ifndef PARAMSH
10 | #define PARAMSH
11 | 
12 | #define MAX_LINE_LENGTH 100000  /* no trailing ';' so the macro can be used inside expressions and array bounds */
13 | 
14 | #include "gsl-wrappers.h"
15 | #include <stdlib.h>
16 | #include <stdio.h>
17 | #include <gsl/gsl_vector.h>
18 | #include <string.h>
19 | 
20 | void params_read_string(FILE* f, char* name, char* x);
21 | 
22 | void params_read_int(FILE* f, char* name, int* x);
23 | 
24 | void params_write_int(FILE *, char *, int);
25 | 
26 | void params_read_double(FILE* f, char* name, double* x);
27 | 
28 | void params_write_double(FILE *, char *, double);
29 | 
30 | void params_read_gsl_vector(FILE* f, char* name, gsl_vector** x);
31 | 
32 | void params_write_gsl_vector(FILE *, char* , gsl_vector *);
33 | 
34 | void params_write_gsl_vector_multiline(FILE *, char* , gsl_vector *);
35 | 
36 | void params_write_gsl_matrix(FILE *, char* , gsl_matrix *);
37 | 
38 | void params_write_sparse_gsl_matrix(FILE *, char* , gsl_matrix *);
39 | 
40 | #endif
41 | 


--------------------------------------------------------------------------------
/dtm/util.h:
--------------------------------------------------------------------------------
 1 | // Author: David Blei (blei@cs.princeton.edu)
 2 | //
 3 | // Copyright 2006 David Blei
 4 | // All Rights Reserved.
 5 | //
 6 | // See the README for this package for details about modifying or
 7 | // distributing this software.
 8 | 
 9 | #ifndef _UTIL_INCLUDED
10 | #define _UTIL_INCLUDED 1
11 | 
12 | #include <stdarg.h>
13 | 
14 | #define EOS  '\0'
15 | #define CRLF  printf("\n")
16 | #define TRUE  1
17 | #define FALSE 0
18 | 
19 | extern const char*  quote (const char *s);
20 | extern char*  dequote (char *s);
21 | extern void   quote_no_matter_what (const char *s, char *t);
22 | extern int    verify (char *s, char *t);
23 | extern char*  strip (char *s);
24 | extern char*  upper (char *s);
25 | extern char*  lower (char *s);
26 | extern int    qfilef (const char *fname); /* TRUE if file exists */
27 | extern int    free_storage (char *fn); /* returns free storage in file system of fn */
28 | extern char*  util_strdup(char *string);
29 | extern void*  util_malloc (int size);
30 | extern void*  util_realloc (void *p, int size);
31 | extern void*  util_calloc (int num, int size);
32 | extern void   util_free (void *p);
33 | extern int    util_space_in_use (void);
34 | extern int    util_pointers_in_use (void);
35 | extern void error(char *fmt, ...);
36 | 
37 | #endif
38 | 


--------------------------------------------------------------------------------
/lib/math/logspace.h:
--------------------------------------------------------------------------------
 1 | #ifndef __MATH_LOGSPACE_H__
 2 | #define __MATH_LOGSPACE_H__
 3 | #include <cmath>
 4 | #include <vector>
 5 | #include <gsl/gsl_vector.h>
 6 | #include <gsl/gsl_matrix.h>
 7 | #include <gsl/gsl_sf_gamma.h>
 8 | #include "logspace_base.h"
 9 | #include "specialfunc.h"
10 | 
11 | // Given two log vectors, log a_i and log b_i, compute 
12 | // log sum (a_i * b_i).
13 | double log_dot_product(const gsl_vector* log_a, const gsl_vector* log_b);
14 | 
15 | // Given a log vector, log a_i, compute log sum a_i.  Returns the sum.
16 | double log_normalize(gsl_vector* x);
17 | 
18 | // Compute the log sum over all elements in the vector
19 | double log_sum(const gsl_vector* x);
20 | 
21 | // Given a log matrix, log a_i, compute log sum a_i.  Returns the sum.
22 | double log_normalize_matrix(gsl_matrix* x);
23 | 
24 | double log_dirichlet_likelihood(const double sum,
25 |                                 const double prior_sum,
26 |                                 const std::vector<int>& counts,
27 |                                 bool debug = false);
28 | 
29 | double log_dirichlet_likelihood(const double sum,
30 |                                 const double prior_scale,
31 |                                 const gsl_vector* prior,
32 |                                 const std::vector<int>& counts);
33 | 
34 | #endif  // __MATH_LOGSPACE_H__
35 | 


--------------------------------------------------------------------------------
/gslwrap/include/gslwrap/permutation.h:
--------------------------------------------------------------------------------
 1 | //  This matrix class is a C++ wrapper for the GNU Scientific Library
 2 | //  Copyright (C)  ULP-IPB Strasbourg
 3 | 
 4 | //  This program is free software; you can redistribute it and/or modify
 5 | //  it under the terms of the GNU General Public License as published by
 6 | //  the Free Software Foundation; either version 2 of the License, or
 7 | //  (at your option) any later version.
 8 | 
 9 | //  This program is distributed in the hope that it will be useful,
10 | //  but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 | //  GNU General Public License for more details.
13 | 
14 | //  You should have received a copy of the GNU General Public License
15 | //  along with this program; if not, write to the Free Software
16 | //  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17 | 
18 | #ifndef _permutation_h
19 | #define _permutation_h
20 | 
21 | #include<gsl/gsl_permutation.h>
22 | 
23 | namespace gsl
24 | {
25 | // Owning wrapper around a gsl_permutation.  The destructor releases the
26 | // underlying storage and copies are deep, so each object owns its own data.
27 | class permutation
28 | {
29 | 	friend class matrix;
30 | 	friend class matrix_float;
31 | 	friend class matrix_int;
32 | 
33 | 	gsl_permutation *gsldata;
34 | 
35 | 	// Returns a newly allocated deep copy of src; NULL stays NULL.
36 | 	static gsl_permutation *clone(const gsl_permutation *src)
37 | 	{
38 | 		if (!src) return NULL;
39 | 		gsl_permutation *dst = gsl_permutation_alloc(src->size);
40 | 		if (dst) gsl_permutation_memcpy(dst, src);
41 | 		return dst;
42 | 	}
43 | public:
44 | 	// Allocates a permutation of n elements.  With clear=true it starts as
45 | 	// the identity (calloc); otherwise the contents are uninitialised (alloc).
46 | 	permutation(size_t n,bool clear=true)
47 | 	{
48 | 		gsldata=(clear ? gsl_permutation_calloc(n) : gsl_permutation_alloc(n));
49 | 	}
50 | 	// Empty permutation; call resize() before use.
51 | 	permutation():gsldata(NULL){;}
52 | 	permutation(const permutation &other):gsldata(clone(other.gsldata)){;}
53 | 	permutation &operator=(const permutation &other)
54 | 	{
55 | 		if (this != &other)
56 | 		{
57 | 			// Copy first so a failed allocation leaves *this untouched.
58 | 			gsl_permutation *copy = clone(other.gsldata);
59 | 			if (gsldata) gsl_permutation_free(gsldata);
60 | 			gsldata = copy;
61 | 		}
62 | 		return *this;
63 | 	}
64 | 	~permutation(){if (gsldata) gsl_permutation_free(gsldata);}
65 | 	// Replaces the contents with a fresh identity permutation of size n,
66 | 	// releasing the previous allocation first so it does not leak.
67 | 	void resize(size_t n)
68 | 	{
69 | 		if (gsldata) gsl_permutation_free(gsldata);
70 | 		gsldata= gsl_permutation_calloc(n);
71 | 	}
72 | };
73 | }
74 | #endif// _permutation_h
75 | 


--------------------------------------------------------------------------------
/dtm/build.sh:
--------------------------------------------------------------------------------
 1 | make clean  # 1/5: Windows 64-bit binary, cross-compiled with the x86_64-w64-mingw32 toolchain
 2 | PATH=/usr/x86_64-w64-mingw32/bin:$PATH make main \
 3 | 	NM=nm \
 4 | 	RANLIB=ranlib \
 5 | 	CC=x86_64-w64-mingw32-gcc \
 6 | 	CCC=x86_64-w64-mingw32-g++ \
 7 | 	CXX=x86_64-w64-mingw32-g++ \
 8 | 	AS=as \
 9 | 	OPTFLAGS="-O2 -static" \
10 | 	EXTRA_LDFLAGS="-lshlwapi" \
11 | 	STRIP="x86_64-w64-mingw32-strip" \
12 | 	OUTPUT_FILE="dtm-win64.exe"
13 | 
14 | make clean  # 2/5: Windows 32-bit binary, cross-compiled with the i686-w64-mingw32 toolchain
15 | PATH=/usr/i686-w64-mingw32/bin:$PATH make main \
16 | 	NM=nm \
17 | 	RANLIB=ranlib \
18 | 	CC=i686-w64-mingw32-gcc \
19 | 	CCC=i686-w64-mingw32-g++ \
20 | 	CXX=i686-w64-mingw32-g++ \
21 | 	AS=as \
22 | 	OPTFLAGS="-O2 -static" \
23 | 	EXTRA_LDFLAGS="-lshlwapi" \
24 | 	STRIP="i686-w64-mingw32-strip" \
25 | 	OUTPUT_FILE="dtm-win32.exe"
26 | 
27 | make clean  # 3/5: Linux 32-bit binary (-m32). NOTE(review): a command-line CXXFLAGS overrides any non-"override" CXXFLAGS assignment in the Makefile -- confirm the include paths survive
28 | make main \
29 | 	NM=nm \
30 | 	RANLIB=ranlib \
31 | 	CC=gcc \
32 | 	CCC=g++ \
33 | 	CXX=g++ \
34 | 	AS=as \
35 | 	CFLAGS="-m32" \
36 | 	CXXFLAGS="-m32" \
37 | 	OUTPUT_FILE="dtm-linux32"
38 | 
39 | make clean  # 4/5: Linux 64-bit binary, built with gcc/g++ and default flags
40 | make main \
41 | 	NM=nm \
42 | 	RANLIB=ranlib \
43 | 	CC=gcc \
44 | 	CCC=g++ \
45 | 	CXX=g++ \
46 | 	AS=as \
47 | 	OUTPUT_FILE="dtm-linux64"
48 | 
49 | # Remember to remove the dynamic libraries from the libdir when compiling
50 | # (otherwise the binary will require dynamic libraries as well)
51 | make clean  # 5/5: macOS 64-bit binary, cross-compiled with the x86_64-apple-darwin15 toolchain
52 | PATH=/usr/x86_64-apple-darwin15/bin:$PATH make main \
53 | 	NM=x86_64-apple-darwin15-nm \
54 | 	RANLIB=x86_64-apple-darwin15-ranlib \
55 | 	CC=x86_64-apple-darwin15-clang \
56 | 	CCC=x86_64-apple-darwin15-clang++ \
57 | 	CXX=x86_64-apple-darwin15-clang++ \
58 | 	AS=x86_64-apple-darwin15-as \
59 | 	STRIP=x86_64-apple-darwin15-strip \
60 | 	OUTPUT_FILE="dtm-darwin64"
61 | 


--------------------------------------------------------------------------------
/lib/math/gradient_projection.h:
--------------------------------------------------------------------------------
 1 | #ifndef MATH_GRADIENTPROJECTION_INCLUDED
 2 | #define MATH_GRADIENTPROJECTION_INCLUDED
 3 | 
 4 | #define SAFETY_BOX 0.001
 5 | #define GRADIENT_DESCENT_SLOWDOWN 1.0
 6 | 
 7 | #include <iostream>
 8 | 
 9 | #include "gslwrap/vector_double.h"
10 | #include "gslwrap/matrix_double.h"
11 | 
12 | namespace GradientProjection {
13 | /*
14 |  * Returns true if the sum-to-less-than-one constraint is violated,
15 |  * fills in with the active constraint matrix.  Caller is responsible
16 |  * for memory management of newly created matrix.
17 |  */
18 | bool createActiveConstraints(const gsl::vector& x, 
19 |                              gsl::matrix& n,
20 |                              gsl::vector& g);
21 | 
22 | void display(const gsl_vector* v, const char* name);
23 | 
24 | void display(const gsl_matrix* m, const char* name);
25 | 
26 | void createProjection(const gsl::matrix& activeConstraints,
27 |                       const gsl::vector& g,
28 |                       const gsl::vector& grad,
29 |                       gsl::matrix& projection,
30 |                       gsl::vector& direction,
31 |                       gsl::vector& correction);
32 | 
33 | double updateState(gsl::vector& x,
34 |                    const double gamma,
35 |                    const gsl::vector grad,
36 |                    const double f);
37 |  
38 | double descend(gsl::vector& x, 
39 |                gsl::vector& s,
40 |                const double gamma, 
41 |                const double obj_value,             
42 |                const gsl::vector& correction,
43 |                const gsl::vector& grad);
44 |  
45 | }
46 | 
47 | #endif
48 | 


--------------------------------------------------------------------------------
/lib/math/specialfunc.h:
--------------------------------------------------------------------------------
 1 | #ifndef __MATH_SPECIALFUNC_H__
 2 | #define __MATH_SPECIALFUNC_H__
 3 | #include <cmath>
 4 | 
 5 | #ifndef M_PI
 6 | #define M_PI        3.14159265358979323846
 7 | #endif
 8 | 
 9 |  /**
10 |    * Proc to calculate the value of the trigamma, the second
11 |    * derivative of the loggamma function. Accepts positive matrices.
12 |    * From Abramowitz and Stegun.  Uses formulas 6.4.11 and 6.4.12 with
13 |    * recurrence formula 6.4.6.  Each requires workspace at least 5
14 |    * times the size of X.
15 |    *
16 |    **/
17 | 
18 | double trigamma(double x);
19 | 
20 | 
21 | /*
22 |  * taylor approximation of first derivative of the log gamma function
23 |  *
24 |  */
25 | double digamma(double x);
26 | double InverseDigamma(double x);
27 | 
28 | 
29 | // lgamma.cpp -- log gamma function of real argument.
30 | //      Algorithms and coefficient values from "Computation of Special
31 | //      Functions", Zhang and Jin, John Wiley and Sons, 1996.
32 | //
33 | //  (C) 2003, C. Bond. All rights reserved.
34 | //
35 | //  Returns log(gamma) of real argument.
36 | //  NOTE: Returns 1e308 if argument is 0 or negative.
37 | //
38 | double log_gamma(double x);
39 | 
40 | double sigmoid(double x);
41 | 
42 | // First derivative of sigmoid function.
43 | double dsigmoid(double x);
44 | 
45 | // Second derivative of sigmoid function.
46 | double d2sigmoid(double x);
47 | 
48 | // Log of the CDF of a Gaussian.  
49 | double LogPGaussian(double x);
50 | 
51 | // Log of the PDF of a Gaussian.  
52 | double LogDGaussian(double x);
53 | 
54 | // Computes the inverse of PGaussian.
55 | double InversePGaussian(double x);
56 | 
57 | #endif
58 | 


--------------------------------------------------------------------------------
/gslwrap/include/gslwrap/matrix_vector_operators.h:
--------------------------------------------------------------------------------
 1 | //  This matrix class is a C++ wrapper for the GNU Scientific Library
 2 | //  Copyright (C)  ULP-IPB Strasbourg
 3 | 
 4 | //  This program is free software; you can redistribute it and/or modify
 5 | //  it under the terms of the GNU General Public License as published by
 6 | //  the Free Software Foundation; either version 2 of the License, or
 7 | //  (at your option) any later version.
 8 | 
 9 | //  This program is distributed in the hope that it will be useful,
10 | //  but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 | //  GNU General Public License for more details.
13 | 
14 | //  You should have received a copy of the GNU General Public License
15 | //  along with this program; if not, write to the Free Software
16 | //  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17 | 
18 | #ifndef __matrix_vector_operators_h
19 | #define __matrix_vector_operators_h
20 | 
21 | #include "gsl/gsl_blas.h"
22 | #include <gslwrap/matrix_double.h>
23 | #include <gslwrap/matrix_float.h>
24 | #include <gslwrap/vector_double.h>
25 | 
26 | namespace gsl
27 | {
28 | 
29 | // Matrix-vector product m * v in single precision (BLAS sgemv, alpha 1, beta 0).
30 | inline vector_float operator*(const matrix_float& m, const vector_float& v)
31 | {
32 | 	vector_float product(m.get_rows());
33 | 	gsl_blas_sgemv(CblasNoTrans, 1.0, m.gslobj(), v.gslobj(), 0.0, product.gslobj());
34 | 	return product;
35 | }
36 | 
37 | // Matrix-vector product m * v in double precision (BLAS dgemv, alpha 1, beta 0).
38 | inline vector operator*(const matrix& m, const vector& v)
39 | {
40 | 	vector product(m.get_rows());
41 | 	gsl_blas_dgemv(CblasNoTrans, 1.0, m.gslobj(), v.gslobj(), 0.0, product.gslobj());
42 | 	return product;
43 | }
44 | 
45 | }
46 | 
47 | #endif //__matrix_vector_operators_h
48 | 


--------------------------------------------------------------------------------
/doc/HOWTO:
--------------------------------------------------------------------------------
 1 | # For a single topic
 2 | data0 = scan("topic-000-var-e-log-prob.dat")
 3 | b0 = matrix(data0, ncol=10, byrow=TRUE)  # NOTE(review): ncol=10 here but ncol=9 in the loops below -- confirm which one matches the data
 4 | write.table(b0, file="dist-topic0.csv", sep=";")
 5 | 
 6 | 
 7 | # Process all topics
 8 | # For each topic, write a file with the probability of each
 9 | # term for each year
10 | # TODO: apply exp() to the values
11 | topics = list()
12 | for (i in 0:9) {
13 | 	filename = paste("topic-00", i, sep = "")
14 | 	filename = paste(filename, "-var-e-log-prob.dat", sep = "")
15 | 	data = scan(filename)
16 | 	topic = matrix(data, ncol=9, byrow=TRUE)
17 | 	filename = paste("dist-topic", i, sep = "")
18 | 	filename = paste(filename, ".csv", sep = "")
19 | 	write.table(topic, file=filename, sep=";")
20 | }
21 | 
22 | 
23 | for (i in 10:49) {
24 | 	filename = paste("topic-0", i, sep = "")
25 | 	filename = paste(filename, "-var-e-log-prob.dat", sep = "")
26 | 	data = scan(filename)
27 | 	topic = matrix(data, ncol=9, byrow=TRUE)
28 | 	filename = paste("dist-topic", i, sep = "")
29 | 	filename = paste(filename, ".csv", sep = "")
30 | 	write.table(topic, file=filename, sep=";")
31 | }
32 | 
33 | # - gam.dat: The gammas associated with each document.  Divide these by
34 | #  the sum for each document to get expected topic mixtures.
35 | # Proportion of topic 5 in document 3:
36 | # e.theta[3, 5]
37 | a = scan("gam.dat")
38 | b = matrix(a, ncol=50, byrow=TRUE)
39 | rs = rowSums(b)
40 | e.theta = b / rs
41 | write.table(e.theta, file="documents_topics.csv", sep=";")
42 | 
43 | # Topic training
44 | ./main \
45 |   --ntopics=25 \
46 |   --mode=fit \
47 |   --rng_seed=0 \
48 |   --initialize_lda=true \
49 |   --corpus_prefix=example/SBSC \
50 |   --outname=example/model_run \
51 |   --top_chain_var=0.005 \
52 |   --alpha=2.0 \
53 |   --lda_sequence_min_iter=10 \
54 |   --lda_sequence_max_iter=30 \
55 |   --lda_max_em_iter=10


--------------------------------------------------------------------------------
/lib/math/logspace_base.cpp:
--------------------------------------------------------------------------------
 1 | #include "logspace_base.h"
 2 | 
 3 | using namespace std;
 4 | 
 5 | // Natural log of x, except that any x <= 0 yields the finite sentinel -1e4
 6 | // instead of the -infinity/NaN that log() itself would produce.
 7 | double safe_log(double x) {
 8 |   const double kFloor = -1e4;
 9 |   const bool non_positive = (x <= 0);
10 |   return non_positive ? kFloor : log(x);
11 | }
12 | 
13 | // Given log(a) and log(b), return log(a + b).
14 | // The larger argument is factored out so exp() cannot overflow, and
15 | // log1p() keeps precision when the smaller term is tiny (log(1 + x)
16 | // rounds to 0 once x drops below machine epsilon; log1p(x) does not).
17 | double log_sum(double log_a, double log_b) {
18 |   const double neg_inf = -std::numeric_limits<double>::infinity();
19 |   const bool a_is_smaller = (log_a < log_b);
20 |   const double hi = a_is_smaller ? log_b : log_a;
21 |   const double lo = a_is_smaller ? log_a : log_b;
22 |   // Adding log(0) changes nothing; returning early also avoids the NaN
23 |   // that (-inf) - (-inf) would produce when both inputs are log(0).
24 |   if (lo == neg_inf) return hi;
25 |   return hi + log1p(exp(lo - hi));
26 | }
27 | 
28 | // Given log(a) and log(b), return log(a - b).  Requires a > b, i.e.
29 | // log_a > log_b; the assert below checks this unless NDEBUG is defined.
30 | double log_diff(double log_a, double log_b) {
31 |   // b / a in linear space; it has to stay below 1 for log(1 - ratio).
32 |   const double ratio = exp(log_b - log_a);
33 |   assert(ratio < 1.0);
34 |   return log_a + log(1.0 - ratio);
35 | }
36 | 
37 | /*
38 |  * Samples an index in [0, length) with probability proportional to
39 |  * exp(vals[i]), where vals holds (unnormalized) log weights.  Draws from
40 |  * rand().  Requires length > 0 and at least one weight above log(0).
41 |  */
42 | int log_sample(double* vals, int length) {
43 |   // Accumulate from log(0) = -inf.  Seeding with safe_log(0) = -1e4 instead
44 |   // would distort inputs whose log weights are near or below -1e4.
45 |   double normalizer = -std::numeric_limits<double>::infinity();
46 |   for (int ii = 0; ii < length; ++ii)
47 |     normalizer = log_sum(normalizer, vals[ii]);
48 |   assert(length > 0 && normalizer > -std::numeric_limits<double>::infinity());
49 | 
50 |   const double cutoff = (double)rand() / ((double)RAND_MAX + 1.0);
51 |   double sum = 0;
52 |   // The last index is the fallback, so rounding that leaves the cumulative
53 |   // sum just under cutoff can never yield an index past the end of vals.
54 |   for (int ii = 0; ii < length - 1; ++ii)
55 |     if ((sum += exp(vals[ii] - normalizer)) >= cutoff) return ii;
56 |   return length - 1;
57 | }
58 | 
59 | // Deterministic stand-in for sampling, for tests: returns the first index
60 | // whose exp(vals[i]) is not below 0.01.  The scan stops at length - 1, which
61 | // is returned unexamined when nothing earlier qualifies (0 if length <= 1).
62 | int sample_first_nonzero(double* vals, int length) {
63 |   int idx = 0;
64 |   while (idx < length - 1 && exp(vals[idx]) < 0.01) ++idx;
65 |   return idx;
66 | }
67 | 
68 | bool is_nan(double val) {
69 |   return !(val == val);  // NaN is the only value that is not equal to itself
70 | }
71 | 


--------------------------------------------------------------------------------
/lib/math/logspace_base.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Author: Jordan Boyd-Graber
 3 |  * Date: March 2008
 4 |  *
 5 |  * This file was spun off from logspace.h in order to create a
 6 |  * logspace file that wouldn't depend on gsl.
 7 |  */
 8 | #ifndef __MATH_LOGSPACE_BASE_H__
 9 | #define __MATH_LOGSPACE_BASE_H__
10 | 
11 | #include <iostream>
12 | #include <limits>
13 | #include <assert.h>
14 | #include <math.h>
15 | 
16 | using namespace std;
17 | 
18 | #ifndef isnan  /* fallback, compiled only when no isnan macro is already defined */
19 | # define isnan(x) \
20 |   (sizeof (x) == sizeof (long double) ? isnan_ld (x) \
21 |   : sizeof (x) == sizeof (double) ? isnan_d (x) \
22 |   : isnan_f (x))
23 | static inline int isnan_f  (float       x) { return x != x; }  /* only NaN compares unequal to itself */
24 | static inline int isnan_d  (double      x) { return x != x; }  /* same test, double width */
25 | static inline int isnan_ld (long double x) { return x != x; }  /* same test, long double width */
26 | #endif  /* isnan */
27 | 
28 | #ifndef isinf  /* fallback: x - x is NaN for +/-infinity and also for NaN, so NaN inputs are excluded explicitly */
29 | # define isinf(x) \
30 |   (sizeof (x) == sizeof (long double) ? isinf_ld (x) \
31 |   : sizeof (x) == sizeof (double) ? isinf_d (x) \
32 |   : isinf_f (x))
33 | static inline int isinf_f  (float       x) { return !isnan (x) && isnan (x - x); }
34 | static inline int isinf_d  (double      x) { return !isnan (x) && isnan (x - x); }
35 | static inline int isinf_ld (long double x) { return !isnan (x) && isnan (x - x); }
36 | #endif  /* isinf */
37 | 
38 | double safe_log(double x);
39 | 
40 | // Given log(a) and log(b), return log(a + b).
41 | double log_sum(double log_a, double log_b);
42 | 
43 | // Given log(a) and log(b), return log(a - b).
44 | double log_diff(double log_a, double log_b);
45 | 
46 | /*
47 |  * returns the element randomly sampled from the log
48 |  * probabilities in array (number is the number of elements)
49 |  */
50 | int log_sample(double* vals, int length);
51 | 
52 | /*
53 |  * Stupid "sampling" function for deterministic testing (i.e. in unit tests)
54 |  */
55 | int sample_first_nonzero(double* vals, int length);
56 | int sample_max(double* vals);
57 | 
58 | bool is_nan(double val);
59 | 
60 | #endif  // __MATH_LOGSPACE_BASE_H__
61 | 


--------------------------------------------------------------------------------
/gslwrap/include/gslwrap/random_number_distribution.h:
--------------------------------------------------------------------------------
 1 | //  This matrix class is a C++ wrapper for the GNU Scientific Library
 2 | //  Copyright (C)  ULP-IPB Strasbourg
 3 | 
 4 | //  This program is free software; you can redistribute it and/or modify
 5 | //  it under the terms of the GNU General Public License as published by
 6 | //  the Free Software Foundation; either version 2 of the License, or
 7 | //  (at your option) any later version.
 8 | 
 9 | //  This program is distributed in the hope that it will be useful,
10 | //  but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 | //  GNU General Public License for more details.
13 | 
14 | //  You should have received a copy of the GNU General Public License
15 | //  along with this program; if not, write to the Free Software
16 | //  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17 | 
18 | #ifndef __random_number_distribution_h
19 | #define __random_number_distribution_h
20 | 
21 | #include "gslwrap/random_generator.h"
22 | #include "gsl/gsl_randist.h"
23 | 
24 | namespace gsl
25 | {
26 | 
27 | // Abstract base class for random number distributions.  Each object holds
28 | // its own copy of the random_generator passed to the constructor.
29 | class random_number_distribution
30 | {
31 |  public:
32 | 	random_number_distribution(const random_generator& _generator)
33 | 		: generator(_generator) {}
34 | 
35 | 	// Interface implemented by concrete distributions.
36 | 	virtual double get() = 0;
37 | 	virtual double pdf(const double& x) = 0;
38 | 	virtual ~random_number_distribution() {}
39 |  protected:
40 | 	random_generator generator;
41 | };
42 | 
43 | // Gaussian distribution with standard deviation sigma (default 1.0), backed by gsl_ran_gaussian*.
44 | class gaussian_random : public random_number_distribution
45 | {
46 |  public:
47 | 	gaussian_random(const random_generator& _generator, const double& _sigma=1.0)
48 | 		: random_number_distribution(_generator), sigma(_sigma) {}
49 | 	// Sampling uses the stored sigma unless the caller supplies one.
50 | 	double get() { return gsl_ran_gaussian(generator.gslobj(), sigma); }
51 | 	double get(double _sigma) { return gsl_ran_gaussian(generator.gslobj(), _sigma); }
52 | 	double pdf(const double& x) { return gsl_ran_gaussian_pdf(x, sigma); }
53 | 	double ratio_method() { return gsl_ran_gaussian_ratio_method(generator.gslobj(), sigma); }
54 |  protected:
55 | 	double sigma;
56 | };
57 | 
58 | }
59 | 
60 | #endif //__random_number_distribution_h
61 | 


--------------------------------------------------------------------------------
/dtm/param.h:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include "util.h"
 3 | 
 4 | int    param_geti(const char *parameter_name, int default_value);
 5 | 
 6 | double param_getf(const char *parameter_name, double default_value);
 7 | 
 8 | char  *param_getc(const char *parameter_name, char *default_value);
 9 | 
10 | char  *param_gets(const char *parameter_name);
11 | 
12 | int param_getb(const char *parameter_name, int default_value);
13 |   /* Returns true if the value of <parameter_name> is 1, true, or yes,
14 |      (case insensitive), false for any other value, and default_value
15 |      for no value. */
16 | 
17 | int param_symvarie(const char *parameter_name, int *returned_value);
18 |   /* Returns true if a value was found, false otherwise */
19 | 
20 | int param_symvarfe(const char *parameter_name, double *returned_value);
21 |   /* Ditto */
22 | 
23 | int param_symvarce(const char *parameter_name, char *returned_value);
24 |   /* Ditto. Note that the second argument is a "pointer to a char *",
25 |      i.e., approximately a pointer to a string. */
26 | 
27 | void param_set(const char *parameter_name, char *new_value);
28 |   /* Changes the value of ddinf parameter <parameter_name>. This can be
29 |      used to communicate with subroutines which expect ddinf
30 |      parameters without having to make sure they exist in the ddinf file.
31 |      Note, however, that values assigned in the ddinf file are 
32 |      OVERRIDDEN by a call to param_set. */
33 |   /* One might want to implement a param_add which would allow adding
34 |      new ddinf parameters within a program, but which could not
35 |      override values from the ddinf file. */
36 | 
37 | /* if the following isn't called, param.c looks for a %restart 
38 | binding in the param file */
39 | void param_set_restart_file(const char *restart_name_p);
40 | 
41 | /* The following three calls write values to the restart file: */
42 | void   param_puti(const char *parameter_name, int value);
43 | 
44 | void   param_putf(const char *parameter_name, double value);
45 | 
46 | void   param_putc(const char *parameter_name, char *value);
47 | 
48 | 
49 | int    param_checkpointed(void);
50 |   /* If there is a restart file, reads it in and returns TRUE. Otherwise
51 |      returns false. */
52 | 
53 | void   param_checkpoint(void);
54 |   /* Commits all of the param_put calls so far, and starts a new
55 |      checkpoint. (I.e., subsequent `param_put's supersede earlier ones.) */
56 | 
57 | 
58 | void  param_dump (FILE *stream);
59 |   /* Writes the current ddinf bindings to a stream */
60 | 
61 | void  param_push_prefix (const char *hot_prefix);
62 |   /* Push the current prefix to be applied to all ddnames */
63 | 
64 | void  param_pop_prefix (void);
65 |   /* Pop the current prefix */
66 | 
67 | int param_push_file (const char *fn);
68 |   /* Use the file for all bindings */
69 | 
70 | char *param_pop_file (void);
71 |   /* Pop current bindings */
72 | 
73 | 
74 | 
75 | 
76 | 
77 | 
78 | 
79 | 


--------------------------------------------------------------------------------
/gslwrap/include/gslwrap/random_generator.h:
--------------------------------------------------------------------------------
 1 | //  This random generator is a C++ wrapper for the GNU Scientific Library
 2 | //  Copyright (C) 2001 Torbjorn Vik
 3 | 
 4 | //  This program is free software; you can redistribute it and/or modify
 5 | //  it under the terms of the GNU General Public License as published by
 6 | //  the Free Software Foundation; either version 2 of the License, or
 7 | //  (at your option) any later version.
 8 | 
 9 | //  This program is distributed in the hope that it will be useful,
10 | //  but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 | //  GNU General Public License for more details.
13 | 
14 | //  You should have received a copy of the GNU General Public License
15 | //  along with this program; if not, write to the Free Software
16 | //  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17 | #ifndef __random_generator_h
18 | #define __random_generator_h
19 | 
20 | #include "gsl/gsl_rng.h"
21 | #include <string>
22 | 
23 | namespace gsl
24 | {
25 | #ifndef __HP_aCC
26 | using std::string;
27 | #endif
28 | 
29 | //class RandomNumberGenerator 
30 | class random_generator  // owning wrapper around a gsl_rng*; copies clone the generator
31 | {
32 | private:
33 | 	gsl_rng* generator;
34 | public:
35 | // Construction and Initializing:
36 | 	random_generator (const random_generator& other) : generator(gsl_rng_clone(other.generator)) {}
37 | 	//! Default args read the environment variables GSL_RNG_TYPE and GSL_RNG_SEED to initialize. If these are not set the generator gsl_rng_mt19937 will be used with seed 0.
38 | 	random_generator (const gsl_rng_type* type=NULL, unsigned long int seed=0) : generator(NULL)
39 | 	{
40 | 		gsl_rng_env_setup();
41 | 		// NOTE(review): seed is applied only together with an explicit type; with type==NULL it is ignored -- confirm intended.
42 | 		generator = gsl_rng_alloc (type ? type : gsl_rng_default);
43 | 		if (type && seed)
44 | 			gsl_rng_set(generator, seed);
45 | 	}
46 | 	~random_generator () {gsl_rng_free(generator);}
47 | 	//! Clones the source before freeing the old generator, so self-assignment (a = a) never clones freed memory.
48 | 	random_generator& operator=(const random_generator& other)
49 | 	{
50 | 		gsl_rng* replacement = gsl_rng_clone(other.generator);
51 | 		if (generator) gsl_rng_free(generator);
52 | 		generator = replacement;
53 | 		return *this;
54 | 	}
55 | 	void set(unsigned long int seed){gsl_rng_set(generator, seed);}
56 | 
57 | // Sampling:
58 | 	unsigned long int get(unsigned long int n=0) {if (n) return gsl_rng_uniform_int(generator, n); else return gsl_rng_get(generator);} // n != 0: value in [0, n-1]; n == 0: raw value in [min, max]
59 | 	double uniform() { return gsl_rng_uniform(generator);} // returns value in range [0, 1)
60 | 	double uniform_positive() { return gsl_rng_uniform_pos(generator);}// returns value in range (0, 1)
61 | 	unsigned long int uniform_int(unsigned long int n) { return gsl_rng_uniform_int(generator, n);}// returns value in range [0, n-1]
62 | 
63 | // Information:
64 | 	string name(){return gsl_rng_name(generator);}
65 | 	unsigned long int max(){return gsl_rng_max(generator);}
66 | 	unsigned long int min(){return gsl_rng_min(generator);}
67 | 
68 | // For calling gsl functions directly
69 | 	gsl_rng*       gslobj()       { return generator;}
70 | 	const gsl_rng* gslobj() const { return generator;}
71 | //	static void Test();
72 | };
73 | 
74 | }
75 | 
76 | #endif //__random_generator_h
77 | 


--------------------------------------------------------------------------------
/dtm/gsl-wrappers.h:
--------------------------------------------------------------------------------
  1 | #ifndef GSL_WRAPPERS_H
  2 | #define GSL_WRAPPERS_H
  3 | 
  4 | // #include <gsl/gsl_check_range.h>
  5 | #include <gsl/gsl_vector.h>
  6 | #include <gsl/gsl_matrix.h>
  7 | #include <gsl/gsl_permutation.h>
  8 | #include <gsl/gsl_linalg.h>
  9 | #include <gsl/gsl_eigen.h>
 10 | #include <gsl/gsl_rng.h>
 11 | #include <gsl/gsl_randist.h>
 12 | #include <gsl/gsl_multimin.h>
 13 | #include <gsl/gsl_math.h>
 14 | #include <gsl/gsl_blas.h>
 15 | #include <math.h>
 16 | #include <assert.h>
 17 | #include <time.h>
 18 | #include <sys/stat.h>
 19 | #include <sys/types.h>
 20 | 
 21 | #define outlog(format, args...) \
 22 |     fprintf(stderr, format, args); \
 23 |     fprintf(stderr, "\n");
 24 | 
 25 | double safe_log(double);
 26 | double log_sum(double, double);
 27 | // Element i of v; thin wrapper over gsl_vector_get().
 28 | static inline double vget(const gsl_vector* v, int i)
 29 | { return gsl_vector_get(v, i); }
 30 | // Store x in element i of v; thin wrapper over gsl_vector_set().
 31 | static inline void vset(gsl_vector* v, int i, double x)
 32 | { gsl_vector_set(v, i, x); }
 33 | 
 34 | // Increment a vector element by a double.
 35 | void vinc(gsl_vector*, int, double);
 36 | // Element (i, j) of m; thin wrapper over gsl_matrix_get().
 37 | static inline double mget(const gsl_matrix* m, int i, int j)
 38 | { return gsl_matrix_get(m, i, j); }
 39 | // Store x in element (i, j) of m; thin wrapper over gsl_matrix_set().
 40 | static inline void mset(gsl_matrix* m, int i, int j, double x)
 41 | { gsl_matrix_set(m, i, j, x); }
 42 | 
 43 | void msetcol(gsl_matrix* m, int r, const gsl_vector* val);
 44 | 
 45 | // Increment a matrix element by a double.
 46 | void minc(gsl_matrix*, int, int, double);
 47 | void msetrow(gsl_matrix*, int, const gsl_vector*);
 48 | 
 49 | void col_sum(gsl_matrix*, gsl_vector*);
 50 | 
 51 | void vct_printf(const gsl_vector* v);
 52 | void mtx_printf(const gsl_matrix* m);
 53 | void vct_fscanf(const char*, gsl_vector* v);
 54 | void mtx_fscanf(const char*, gsl_matrix* m);
 55 | void vct_fprintf(const char* filename, gsl_vector* v);
 56 | void mtx_fprintf(const char* filename, const gsl_matrix* m);
 57 | 
 58 | double log_det(gsl_matrix*);
 59 | 
 60 | void matrix_inverse(gsl_matrix*, gsl_matrix*);
 61 | 
 62 | void sym_eigen(gsl_matrix*, gsl_vector*, gsl_matrix*);
 63 | 
 64 | double sum(const gsl_vector* v);
 65 | 
 66 | double norm(gsl_vector * v);
 67 | 
 68 | void vct_log(gsl_vector* v);
 69 | void vct_exp(gsl_vector* x);
 70 | 
 71 | void choose_k_from_n(int k, int n, int* result);
 72 | 
 73 | void log_normalize(gsl_vector* x);
 74 | void normalize(gsl_vector* x);
 75 | 
 76 | void optimize(int dim,
 77 |               gsl_vector* x,
 78 |               void* params,
 79 |               void (*fdf)(const gsl_vector*, void*, double*, gsl_vector*),
 80 |               void (*df)(const gsl_vector*, void*, gsl_vector*),
 81 |               double (*f)(const gsl_vector*, void*));
 82 | 
 83 | void optimize_fdf(int dim,
 84 |                   gsl_vector* x,
 85 |                   void* params,
 86 |                   void (*fdf)(const gsl_vector*, void*, double*, gsl_vector*),
 87 |                   void (*df)(const gsl_vector*, void*, gsl_vector*),
 88 |                   double (*f)(const gsl_vector*, void*),
 89 |                   double* f_val,
 90 |                   double* conv_val,
 91 |                   int* niter);
 92 | 
 93 | void log_write(FILE* f, char* string);
 94 | int directory_exist(const char *dname);
 95 | void make_directory(char* name);
 96 | 
 97 | gsl_rng* new_random_number_generator();
 98 | 
 99 | #endif
100 | 


--------------------------------------------------------------------------------
/lib/math/optimizer.h:
--------------------------------------------------------------------------------
  1 | #ifndef __LIB_MATH_OPTIMIZER__
  2 | #define __LIB_MATH_OPTIMIZER__
  3 | 
  4 | #include <iostream>
  5 | #include <limits>
  6 | #include <gsl/gsl_multimin.h>
  7 | #include "math/gsl_vector.h"
  8 | #include "util/flags.h"
  9 | 
 10 | using std::cout;
 11 | using std::endl;
 12 | 
 13 | DEFINE_double(multimin_convergence_threshold,
 14 | 	      1e-5,
 15 | 	      "Convergence threshold for conjugate gradient.");
 16 | DEFINE_size(max_multimin_iterations,
 17 | 	    40,
 18 | 	    "Maximum number of conjugate gradient iterations to perform.");
 19 | // Abstract base class: subclasses supply objective, gradient and starting point; Optimize() runs GSL's Fletcher-Reeves conjugate-gradient minimizer.
 20 | class Optimizer {
 21 |  public:
 22 |   Optimizer(size_t size) : size_(size) {  
 23 |   }
 24 |   // Minimizes starting from MultiminInitialGuess() and passes the minimizer's final point to MultiminResult().
 25 |   void Optimize() {
 26 |     gsl_multimin_function_fdf my_func;
 27 |     my_func.n = size_;
 28 |     my_func.f = &MultiminObjectiveWrapper;
 29 |     my_func.df = &MultiminGradientWrapper;
 30 |     my_func.fdf = &MultiminObjectiveGradientWrapper;
 31 |     my_func.params = this;  // handed back to the static wrappers as their void* params
 32 |     
 33 |     gsl_multimin_fdfminimizer* s =  
 34 |       gsl_multimin_fdfminimizer_alloc(gsl_multimin_fdfminimizer_conjugate_fr, size_);
 35 |     GslVector initial_guess(size_);
 36 |     MultiminInitialGuess(initial_guess.mutable_ptr());
 37 | 
 38 |     // step_size, tol
 39 |     //    gsl_multimin_fdfminimizer_set(s, &my_func, initial_guess.ptr(), 0.1, 1.0);
 40 |     gsl_multimin_fdfminimizer_set(s, &my_func, initial_guess.ptr(), 0.01, 0.01);
 41 |     
 42 |     size_t iter = 0;
 43 |     int status;
 44 |     
 45 |     double value = std::numeric_limits<double>::infinity();  // makes the first relative improvement infinite
 46 |     double prev_value;
 47 |     do {
 48 |       prev_value = value;
 49 |       iter++;
 50 |       status = gsl_multimin_fdfminimizer_iterate(s);
 51 |       if (status) {
 52 | 	cout << "Error: " << gsl_strerror(status) << endl;
 53 | 	break;  // MultiminResult() below still runs with the current point
 54 |       }
 55 |       status = gsl_multimin_test_gradient(s->gradient, 1e-3);	// GSL_SUCCESS once the gradient norm is below 1e-3, else GSL_CONTINUE
 56 |       if (status == GSL_SUCCESS) {
 57 | 	cout << "Minimum found." << endl;
 58 |       }
 59 |       value = s->f;
 60 |       cout << "Iteration: " << iter << " Value: " << 
 61 | 	value << " dValue:" << (prev_value - value)/fabs(value) << " " <<
 62 | 	gsl_strerror(status) << endl;
 63 |     } while (status == GSL_CONTINUE &&
 64 | 	       iter < FLAGS_max_multimin_iterations &&
 65 | 	     (prev_value - value) / fabs(value) > FLAGS_multimin_convergence_threshold);    // stop on: gradient test passed, iteration cap, or relative improvement <= threshold
 66 |     MultiminResult(s->x);
 67 |     gsl_multimin_fdfminimizer_free(s);
 68 |   }
 69 |   // Evaluate at x. Either output pointer may be NULL (the wrappers below pass NULL for the part they do not need).
 70 |   virtual void MultiminObjectiveGradient(const gsl_vector* x, 
 71 | 					 double* objective, 
 72 | 					 gsl_vector* gradient) = 0;
 73 |   // Fill v with the starting point; called once before the minimizer is set up.
 74 |   virtual void MultiminInitialGuess(gsl_vector* v) = 0;
 75 |   // Receives the minimizer's final position after the loop ends.
 76 |   virtual void MultiminResult(gsl_vector* x) = 0;
 77 | 
 78 |   virtual ~Optimizer() { }
 79 |  protected:
 80 |   static double MultiminObjectiveWrapper(const gsl_vector* x, void* params) {
 81 |     double objective;
 82 |     reinterpret_cast<Optimizer*>(params)->MultiminObjectiveGradient(x, &objective, NULL);
 83 |     return objective;
 84 |   }
 85 |   // GSL df callback: gradient only, objective pointer is NULL.
 86 |   static void MultiminGradientWrapper(const gsl_vector* x, void* params, gsl_vector* g) {
 87 |     reinterpret_cast<Optimizer*>(params)->MultiminObjectiveGradient(x, NULL, g);
 88 |   }
 89 |   // GSL fdf callback: objective and gradient in one call.
 90 |   static void MultiminObjectiveGradientWrapper(const gsl_vector* x,
 91 | 					       void* params,
 92 | 					       double* f,
 93 | 					       gsl_vector* g) {
 94 |     reinterpret_cast<Optimizer*>(params)->MultiminObjectiveGradient(x, f, g);
 95 |   }
 96 |   
 97 |  private:
 98 |   size_t size_;
 99 | };
100 | #endif  // __LIB_MATH_OPTIMIZER__
101 | 


--------------------------------------------------------------------------------
/lib/math/logspace.cpp:
--------------------------------------------------------------------------------
  1 | #include "logspace.h"
  2 | // Dirichlet-multinomial log likelihood of `counts` under a symmetric prior whose
  3 | // mass prior_sum is split evenly; `sum` is presumably the total of counts (caller-supplied).
  4 | double log_dirichlet_likelihood(const double sum,
  5 |                                 const double prior_sum,
  6 |                                 const std::vector<int>& counts,
  7 |                                 bool debug) {
  8 |   const int dims = counts.size();
  9 |   const double alpha = prior_sum / static_cast<double>(dims);  // per-component prior
 10 |   double acc = 0.0;
 11 |   acc += gsl_sf_lngamma(prior_sum);
 12 |   acc -= static_cast<double>(dims) * gsl_sf_lngamma(alpha);
 13 |   if (debug) {
 14 |     cout << "Likelihood (" << sum << "," << prior_sum << "," << alpha << "," << dims << ") = " << acc << endl;
 15 |   }
 16 |   for (int k = 0; k < dims; ++k) {
 17 |     const double observed = static_cast<double>(counts[k]);
 18 |     if (debug) {
 19 |       cout << "\tGAMMA(" << alpha << " + " << observed << " = " << alpha + observed << ") -> " << acc << endl;
 20 |     }
 21 |     acc += gsl_sf_lngamma(alpha + observed);
 22 |   }
 23 |   acc -= gsl_sf_lngamma(prior_sum + sum);
 24 |   if (debug) {
 25 |     cout << endl;
 26 |   }
 27 |   return acc;
 28 | }
 29 | 
 30 | // Same quantity for an asymmetric prior: component k uses prior[k];
 31 | // prior_scale is presumably the sum of prior (caller-supplied, not checked here).
 32 | double log_dirichlet_likelihood(const double sum,
 33 |                                 const double prior_scale,
 34 |                                 const gsl_vector* prior,
 35 |                                 const std::vector<int>& counts) {
 36 |   const int dims = counts.size();
 37 |   double acc = 0.0;
 38 |   acc += gsl_sf_lngamma(prior_scale);
 39 |   // Per component: subtract lngamma(prior), add lngamma(prior + count).
 40 |   for (int k = 0; k < dims; ++k) {
 41 |     const double alpha_k = gsl_vector_get(prior, k);
 42 |     acc -= gsl_sf_lngamma(alpha_k);
 43 |     acc += gsl_sf_lngamma(alpha_k + static_cast<double>(counts[k]));
 44 |   }
 45 |   acc -= gsl_sf_lngamma(prior_scale + sum);
 46 |   return acc;
 47 | }
 48 | 
 49 | // Folds log_a[i] + log_b[i] over i with log_sum(double, double). Both vectors must be non-empty and equally sized.
 50 | double log_dot_product(const gsl_vector* log_a, const gsl_vector* log_b) {
 51 |   // Validate the sizes before element 0 is touched.
 52 |   assert(log_a->size == log_b->size && log_a->size > 0);
 53 |   double sum = gsl_vector_get(log_a, 0) + gsl_vector_get(log_b, 0);
 54 |   for (size_t ii = 1; ii < log_a->size; ++ii) {
 55 |     sum = log_sum(sum, gsl_vector_get(log_a, ii) + gsl_vector_get(log_b, ii));
 56 |   }
 57 |   return sum;
 58 | }
 59 | // Folds the entries of x left to right with log_sum(double, double), seeded with x[0]; x must be non-empty.
 60 | double log_sum(const gsl_vector* x) {
 61 |   double acc = gsl_vector_get(x, 0);
 62 |   const size_t n = x->size;
 63 |   for (size_t k = 1; k < n; ++k) {
 64 |     acc = log_sum(acc, gsl_vector_get(x, k));
 65 |   }
 66 |   return acc;
 67 | }
 68 | 
 69 | // Normalizes a vector of logs in place: the log-sum of all entries (folded with
 70 | // log_sum) is subtracted from every entry. Returns that log-sum. x must be non-empty.
 71 | double log_normalize(gsl_vector* x) {
 72 |   const size_t n = x->size;
 73 |   double log_total = gsl_vector_get(x, 0);
 74 |   for (size_t k = 1; k < n; ++k) {
 75 |     log_total = log_sum(log_total, gsl_vector_get(x, k));
 76 |   }
 77 |   // Second pass: shift every entry by the accumulated log-sum.
 78 |   for (size_t k = 0; k < n; ++k) {
 79 |     gsl_vector_set(x, k, gsl_vector_get(x, k) - log_total);
 80 |   }
 81 |   return log_total;
 82 | }
 83 | 
 84 | 
 85 | // Normalizes a matrix of logs in place: the log-sum of all cells (folded with
 86 | // log_sum, row by row) is subtracted from every cell. Returns that log-sum.
 87 | double log_normalize_matrix(gsl_matrix* x) {
 88 |   const size_t rows = x->size1;
 89 |   const size_t cols = x->size2;
 90 |   double log_total = gsl_matrix_get(x, 0, 0);
 91 |   for (size_t r = 0; r < rows; ++r) {
 92 |     // Cell (0, 0) already seeded the accumulator, so row 0 starts at column 1.
 93 |     for (size_t c = (r == 0 ? 1 : 0); c < cols; ++c) {
 94 |       log_total = log_sum(log_total, gsl_matrix_get(x, r, c));
 95 |     }
 96 |   }
 97 |   for (size_t r = 0; r < rows; ++r) {
 98 |     for (size_t c = 0; c < cols; ++c) {
 99 |       gsl_matrix_set(x, r, c, gsl_matrix_get(x, r, c) - log_total);
100 |     }
101 |   }
102 |   return log_total;
103 | }
104 | 
105 | 
106 | 


--------------------------------------------------------------------------------
/dtm/ss-lm.h:
--------------------------------------------------------------------------------
  1 | // Authors: David Blei (blei@cs.princeton.edu)
  2 | //          Sean Gerrish (sgerrish@cs.princeton.edu)
  3 | //
  4 | // Copyright 2011 Sean Gerrish and David Blei
  5 | // All Rights Reserved.
  6 | //
  7 | // See the README for this package for details about modifying or
  8 | // distributing this software.
  9 | 
 10 | /*
 11 |  * state space language model variational inference
 12 |  *
 13 |  */
 14 | 
 15 | #ifndef SSLM_H
 16 | #define SSLM_H
 17 | 
 18 | #include "gsl-wrappers.h"
 19 | #include "params.h"
 20 | #include <gsl/gsl_vector.h>
 21 | #include <gsl/gsl_matrix.h>
 22 | #include <assert.h>
 23 | #include <math.h>
 24 | 
 25 | #include "data.h"
 26 | 
 27 | #define SSLM_MAX_ITER 2 // maximum number of optimization iters
 28 | #define SSLM_FIT_THRESHOLD 1e-6 // convergence criterion for fitting sslm
 29 | #define INIT_MULT 1000 // multiplier to variance for first obs
 30 | // #define OBS_NORM_CUTOFF 10 // norm cutoff after which we use all 0 obs
 31 | //#define OBS_NORM_CUTOFF 8 // norm cutoff after which we use all 0 obs
 32 | #define OBS_NORM_CUTOFF 2 // norm cutoff after which we use all 0 obs
 33 | 
 34 | /*
 35 |  * functions for variational inference
 36 |  *
 37 |  */
 38 | 
 39 | // allocate new state space language model variational posterior
 40 | sslm_var* sslm_var_alloc(int W, int T);
 41 | 
 42 | // allocate extra parameters for inference
 43 | void sslm_inference_alloc(sslm_var* var);
 44 | 
 45 | // free extra parameters for inference
 46 | void sslm_inference_free(sslm_var* var);
 47 | 
 48 | // initialize with zero observations
 49 | void sslm_zero_init(sslm_var* var,
 50 |                     double obs_variance,
 51 |                     double chain_variance);
 52 | 
 53 | // initialize with counts
 54 | void sslm_counts_init(sslm_var* var,
 55 |                       double obs_variance,
 56 |                       double chain_variance,
 57 |                       const gsl_vector* counts);
 58 | 
 59 | // initialize from variational observations
 60 | void sslm_obs_file_init(sslm_var* var,
 61 |                         double obs_variance,
 62 |                         double chain_variance,
 63 |                         const char* filename);
 64 | 
 65 | 
 66 | // compute E[\beta_{w,t}] for t = 1:T
 67 | void compute_post_mean(int w, sslm_var* var, double chain_variance);
 68 | 
 69 | // compute Var[\beta_{w,t}] for t = 1:T
 70 | void compute_post_variance(int w, sslm_var* var, double chain_variance);
 71 | 
 72 | // optimize \hat{beta}
 73 | void optimize_var_obs(sslm_var* var);
 74 | 
 75 | // compute dE[\beta_{w,t}]/d\obs_{w,s} for t = 1:T
 76 | void compute_mean_deriv(int word, int time, sslm_var* var,
 77 |                         gsl_vector* deriv);
 78 | 
 79 | // compute d bound/d obs_{w, t} for t=1:T.
 80 | void compute_obs_deriv(int word, gsl_vector* word_counts,
 81 |                        gsl_vector* total_counts, sslm_var* var,
 82 |                        gsl_matrix* mean_deriv_mtx, gsl_vector* deriv);
 83 | 
 84 | // update observations
 85 | void update_obs(gsl_matrix* word_counts, gsl_vector* totals,
 86 |                 sslm_var* var);
 87 | 
 88 | // log probability bound
 89 | double compute_bound(gsl_matrix* word_counts, gsl_vector* totals,
 90 |                      sslm_var* var);
 91 | 
 92 | 
 93 | // fit variational distribution
 94 | double fit_sslm(sslm_var* var, gsl_matrix* word_counts);
 95 | 
 96 | // read and write variational distribution
 97 | void write_sslm_var(sslm_var* var, char* out);
 98 | sslm_var* read_sslm_var(char* in);
 99 | 
100 | void compute_expected_log_prob(sslm_var* var);
101 | // !!! old function (from doc mixture...)
102 | double expected_log_prob(int w, int t, sslm_var* var);
103 | 
104 | // update zeta
105 | void update_zeta(sslm_var* var);
106 | 
107 | #endif
108 | 


--------------------------------------------------------------------------------
/dtm/lda-seq.h:
--------------------------------------------------------------------------------
  1 | // Authors: David Blei (blei@cs.princeton.edu)
  2 | //          Sean Gerrish (sgerrish@cs.princeton.edu)
  3 | //
  4 | // Copyright 2011 Sean Gerrish and David Blei
  5 | // All Rights Reserved.
  6 | //
  7 | // See the README for this package for details about modifying or
  8 | // distributing this software.
  9 | 
 10 | #ifndef LDASEQ_H
 11 | #define LDASEQ_H
 12 | 
 13 | #include <sys/stat.h>
 14 | #include <sys/types.h>
 15 | 
 16 | #include "gsl-wrappers.h"
 17 | #include "lda.h"
 18 | 
 19 | #define LDA_SEQ_EM_THRESH 1e-4
 20 | #define SAVE_LAG 10
 21 | 
 22 | /*
 23 |  * an lda sequence is a collection of simplex sequences for K topics
 24 |  * and an alpha vector
 25 |  *
 26 |  */
 27 | 
 28 | #include <gsl/gsl_vector.h>
 29 | #include <gsl/gsl_matrix.h>
 30 | #include <stdlib.h>
 31 | #include <assert.h>
 32 | 
 33 | #include "param.h"
 34 | #include "ss-lm.h"
 35 | #include "data.h"
 36 | #include "lda.h"
 37 | 
 38 | #define LDA_SEQ_EM_THRESHOLD 1e-5  /* no trailing ';' so the macro can be used inside expressions */
 39 | 
 40 | // lda sequence variational posterior distribution
 41 | 
 42 | 
 43 | // === allocation and initialization ===
 44 | 
 45 | inf_var* inf_var_alloc(int number_topics,
 46 | 		       corpus_seq_t* corpus_seq);
 47 | void inf_var_free(inf_var* ptr);
 48 | 
 49 | // initialize lda sequence from lda model topics
 50 | 
 51 | void init_lda_seq_from_ss(lda_seq* model,
 52 |                           double topic_chain_variance,
 53 |                           double topic_obs_variance,
 54 |                           double alpha,
 55 |                           gsl_matrix* init_suffstats);
 56 | 
 57 | // === fitting ===
 58 | 
 59 | 
 60 | // infer a corpus with an lda-seq
 61 | 
 62 | double update_inf_var(lda_seq* seq,
 63 | 		      const corpus_seq_t* data,
 64 | 		      gsl_matrix** phi,
 65 | 		      size_t t,
 66 | 		      const char* root);
 67 | double update_inf_var_multiple(lda_seq* seq,
 68 | 			       const corpus_seq_t* data,
 69 | 			       gsl_matrix** phi,
 70 | 			       size_t t,
 71 | 			       const char* root);
 72 | void update_inf_reg(lda_seq* seq,
 73 | 		    const corpus_seq_t* data,
 74 | 		    gsl_matrix** phi,
 75 | 		    size_t t,
 76 | 		    const char* root);
 77 | 
 78 | double lda_seq_infer(lda_seq* model,
 79 |                      const corpus_seq_t* data,
 80 |                      gsl_matrix** suffstats,
 81 |                      gsl_matrix* gammas,
 82 |                      gsl_matrix* lhoods,
 83 | 		     int iter,
 84 | 		     const char* file_root);
 85 | 
 86 | // fit lda sequence from sufficient statistics
 87 | 
 88 | double fit_lda_seq(lda_seq* m,
 89 |                    const corpus_seq_t* data,
 90 |                    const corpus_seq_t* heldout,
 91 |                    const char* file_root);
 92 | 
 93 | void update_lda_seq_ss(int time,
 94 |                        const doc_t* doc,
 95 |                        const lda_post* post,
 96 |                        gsl_matrix** ss);
 97 | 
 98 | double fit_lda_seq_topics(lda_seq* model,
 99 |                           gsl_matrix** ss);
100 | 
101 | 
102 | // === reading and writing ===
103 | 
104 | 
105 | // read and write a lda sequence
106 | 
107 | void write_lda_seq(const lda_seq* m, const char* root);
108 | 
109 | lda_seq* read_lda_seq(const char* root, corpus_seq_t* data);
110 | 
111 | // write lda sequence sufficient statistics
112 | 
113 | void write_lda_seq_suffstats(lda_seq* m,
114 |                              gsl_matrix** topic_ss,
115 |                              const char* root);
116 | 
117 | // new lda sequence
118 | 
119 | lda_seq* new_lda_seq(corpus_seq_t* data,
120 | 		     int W,
121 | 		     int T,
122 | 		     int K);
123 | 
124 | void make_lda_from_seq_slice(lda* lda_m,
125 |                              lda_seq* lda_seq_m,
126 |                              int time);
127 | 
128 | #endif
129 | 


--------------------------------------------------------------------------------
/dtm/sample.sh:
--------------------------------------------------------------------------------
  1 | This file provides information about running the Dynamic Topic Model
  2 | or the Document Influence Model.  It gives two command-line examples
  3 | for running the software and several example commands in R for reading
  4 | output files.
  5 | 
  6 | Dynamic topic models and the influence model have been implemented
  7 | here in c / c++.  This implementation takes two input files:
  8 | 
  9 |  (a) foo-mult.dat, which is one-doc-per-line, each line of the form
 10 | 
 11 |    unique_word_count index1:count1 index2:count2 ... indexn:countn
 12 | 
 13 |    where each index is an integer corresponding to a unique word.
 14 | 
 15 |  (b) foo-seq.dat, which is of the form
 16 | 
 17 | 
 18 |    Number_Timestamps
 19 |    number_docs_time_1
 20 |    ...
 21 |    number_docs_time_i
 22 |    ...
 23 |    number_docs_time_NumberTimestamps
 24 | 
 25 |    - The docs in foo-mult.dat should be ordered by date, with the first
 26 |      docs from time1, the next from time2, ..., and the last docs from
 27 |      timen.
 28 | 
 29 | When working with data like this, I've found it helpful to create
 30 | the following files:
 31 |   - the mult.dat file (described in (a) above)
 32 |   - the seq.dat file (described in (b) above)
 33 |   - a file with all of the words in the vocabulary, arranged in
 34 |     the same order as the word indices
 35 |   - a file with information on each of the documents, arranged in
 36 |     the same order as the docs in the mult file.
 37 | 
 38 | The code creates at least the following files:
 39 | 
 40 |  - topic-???-var-e-log-prob.dat: the e-betas (word distributions) for
 41 |    topic ??? for all times.  This is in row-major form, i.e.:
 42 | 
 43 |   > a = scan("topic-002-var-e-log-prob.dat")
 44 |   > b = matrix(a, ncol=10, byrow=TRUE)
 45 | 
 46 |   # The probability of term 100 in topic 2 at time 3:
 47 |   exp(b[100, 3])
 48 | 
 49 |  - gam.dat: The gammas associated with each document.  Divide these by
 50 |   the sum for each document to get expected topic mixtures.
 51 | 
 52 |   > a = scan("gam.dat")
 53 |   > b = matrix(a, ncol=10, byrow=TRUE)
 54 |   > rs = rowSums(b)
 55 |   > e.theta = b / rs
 56 |   # Proportion of topic 5 in document 3:
 57 |   e.theta[3, 5]
 58 | 
 59 | If you are running this software in "dim" mode to find document
 60 | influence, it will also create the following files:
 61 | 
 62 |  - influence_time-??? : the influence of documents at time ??? for
 63 |   each topic, where time is based on your -seq.dat file and the
 64 |   document index is given by the ordering of documents in the mult
 65 |   file.
 66 | 
 67 |   For example, in R:
 68 |   > a = scan("influence_time-010")
 69 |   > b = matrix(a, ncol=10, byrow=TRUE)
 70 |   # The influence of the 2nd document on topic 5:
 71 |   > b[2, 5]
 72 | 
 73 | # Here are some example commands:
 74 | # Run the dynamic topic model.
 75 | ./main \
 76 |   --ntopics=20 \
 77 |   --mode=fit \
 78 |   --rng_seed=0 \
 79 |   --initialize_lda=true \
 80 |   --corpus_prefix=example/test \
 81 |   --outname=example/model_run \
 82 |   --top_chain_var=0.005 \
 83 |   --alpha=0.01 \
 84 |   --lda_sequence_min_iter=6 \
 85 |   --lda_sequence_max_iter=20 \
 86 |   --lda_max_em_iter=10
 87 | 
 88 | # Run the influence model.
 89 | ./main \
 90 |     --mode=fit \
 91 |     --rng_seed=0 \
 92 |     --model=fixed \
 93 |     --initialize_lda=true \
 94 |     --corpus_prefix=example/test \
 95 |     --outname=example/output \
 96 |     --time_resolution=2 \
 97 |     --influence_flat_years=5 \
 98 |     --top_obs_var=0.5 \
 99 |     --top_chain_var=0.005 \
100 |     --sigma_d=0.0001 \
101 |     --sigma_l=0.0001 \
102 |     --alpha=0.01 \
103 |     --lda_sequence_min_iter=6 \
104 |     --lda_sequence_max_iter=20 \
105 |     --save_time=-1 \
106 |     --ntopics=10 \
107 |     --lda_max_em_iter=10
108 | 
109 | 


--------------------------------------------------------------------------------
/dtm/params.c:
--------------------------------------------------------------------------------
  1 | // Author: David Blei (blei@cs.princeton.edu)
  2 | //
  3 | // Copyright 2006 David Blei
  4 | // All Rights Reserved.
  5 | //
  6 | // See the README for this package for details about modifying or
  7 | // distributing this software.
  8 | 
  9 | #include "params.h"
 10 | 
 11 | /*
 12 |  * check label: read the next whitespace-delimited token from f and
 13 |  * assert that it equals name.
 14 |  */
 15 | 
 16 | void check_label(FILE* f, char* name)
 17 | {
 18 |     char label[400];
 19 |     if (fscanf(f, "%399s", label) != 1) label[0] = '\0';  /* width bounds the read; a failed read compares as "" */
 20 |     assert(strcmp(label, name) == 0);
 21 | }
 22 | 
 23 | 
 24 | /* read a string: expects the label `name`, then copies the next
 25 |  * whitespace-delimited token into x.  The %s conversion is unbounded,
 26 |  * so x must be large enough for any token in the file. */
 27 | void params_read_string(FILE* f, char* name, char* x)
 28 | {
 29 |     check_label(f, name);
 30 |     int nread = fscanf(f, "%s", x);  /* result checked like the other readers */
 31 |     assert(nread == 1);
 32 |     (void) nread;  /* unused when NDEBUG removes the assert */
 33 |     outlog("%-10s READ NAME=%-10s STRING=%s", "[PARAMS]", name, x);
 34 | }
 35 | 
 36 | /* read an integer: expects the label `name`, then a decimal int stored in *x.
 37 |  * The fscanf call is kept outside assert() so the value is still read
 38 |  * when NDEBUG compiles assertions away. */
 39 | void params_read_int(FILE* f, char* name, int* x)
 40 | {
 41 |     check_label(f, name);
 42 |     int nread = fscanf(f, "%d", x);
 43 |     assert(nread > 0);
 44 |     (void) nread;  /* unused when NDEBUG removes the assert */
 45 |     outlog("%-10s READ NAME=%-10s INT=%d", "[PARAMS]", name, *x);
 46 | }
 47 | /* write one line to f: name, a space, then x in decimal */
 48 | void params_write_int(FILE* f, char* name, int x)
 49 | {
 50 |     fprintf(f, "%s %d\n", name, x);
 51 | }
 52 | 
 53 | 
 54 | /* read a double: expects the label `name`, then a floating-point value
 55 |  * stored in *x.  The fscanf call is kept outside assert() so the value is
 56 |  * still read when NDEBUG compiles assertions away. */
 57 | void params_read_double(FILE* f, char* name, double* x)
 58 | {
 59 |     check_label(f, name);
 60 |     int nread = fscanf(f, "%lf", x);
 61 |     assert(nread > 0);
 62 |     (void) nread;  /* unused when NDEBUG removes the assert */
 63 |     outlog("%-10s READ NAME=%-10s DBL=%1.14e", "[PARAMS]", name, *x);
 64 | }
 65 | /* write one line to f: name, a space, then x in fixed notation (14 decimals, minimum width 17) */
 66 | void params_write_double(FILE* f, char* name, double x)
 67 | {
 68 |     fprintf(f, "%s %17.14f\n", name, x);
 69 | }
 70 | 
 71 | 
 72 | /*
 73 |  * read a gsl vector: expects the label `name`, an element count, then that
 74 |  * many doubles.  *x receives a vector newly allocated with gsl_vector_calloc.
 75 |  */
 76 | void params_read_gsl_vector(FILE* f, char* name, gsl_vector** x)
 77 | {
 78 |     int size, i, nread;
 79 |     double val;
 80 |     check_label(f, name);
 81 |     nread = fscanf(f, "%d", &size);  /* reads stay outside assert() so NDEBUG builds still perform them */
 82 |     assert(nread > 0 && size >= 0);
 83 |     *x = gsl_vector_calloc(size);
 84 |     for (i = 0; i < size; i++)
 85 |     {
 86 |         nread = fscanf(f, "%lf", &val);
 87 |         assert(nread > 0);
 88 |         gsl_vector_set(*x, i, val);
 89 |     }
 90 | }
 91 | 
 92 | /* write a gsl vector on one line: name, element count, then space-separated values */
 93 | void params_write_gsl_vector(FILE* f, char* name, gsl_vector* x)
 94 | {
 95 |     size_t i;  /* same type as x->size: no signed/unsigned comparison */
 96 |     fprintf(f, "%s %d", name, (int) x->size);
 97 |     for (i = 0; i < x->size; i++)
 98 |         fprintf(f, " %17.14f", gsl_vector_get(x, i));
 99 |     fprintf(f, "\n");
100 | }
101 | 
102 | 
103 | //void params_write_doc_
104 | /* write a gsl vector as two lines: "name count", then the values separated by commas */
105 | void params_write_gsl_vector_multiline(FILE* f, char* name, gsl_vector* x)
106 | {
107 |     size_t i;  /* same type as x->size: no signed/unsigned comparison */
108 |     fprintf(f, "%s %d\n", name, (int) x->size);
109 |     if (x->size > 0)
110 |         fprintf(f, "%17.14f", gsl_vector_get(x, 0));  /* first value has no leading comma */
111 |     for (i = 1; i < x->size; i++)
112 |         fprintf(f, ",%17.14f", gsl_vector_get(x, i));
113 |     fprintf(f, "\n");
114 | }
115 | 
116 | 
117 | /* write a dense matrix: header "name rows cols", then one comma-separated line per row */
118 | void params_write_gsl_matrix(FILE* f, char* name, gsl_matrix* x)
119 | {
120 |     size_t i, j;
121 |     /* size_t is not long on every ABI (e.g. 64-bit Windows); cast to match %ld */
122 |     fprintf(f, "%s %ld %ld\n", name, (long) x->size1, (long) x->size2);
123 |     if (x->size1 == 0 || x->size2 == 0)  /* no cells: column 0 below would be out of range */
124 |       return;
125 |     for (i = 0; i < x->size1; i++) {
126 |       fprintf(f, "%17.14f", gsl_matrix_get(x, i, 0));
127 |       for (j = 1; j < x->size2; j++) {
128 |         fprintf(f, ",%17.14f", gsl_matrix_get(x, i, j));
129 |       }
130 |       fprintf(f, "\n");
131 |     }
132 | }
133 | /* write a matrix sparsely: header "name rows cols", then one "row,col,value" line per entry with |value| > 1e-12 */
134 | void params_write_sparse_gsl_matrix(FILE* f, char* name, gsl_matrix* x)
135 | {
136 |     size_t i, j;
137 |     /* size_t is not long on every ABI (e.g. 64-bit Windows); cast to match %ld */
138 |     fprintf(f, "%s %ld %ld\n", name, (long) x->size1, (long) x->size2);
139 |     if (x->size1 == 0) {
140 |       return;
141 |     }
142 |     for (i = 0; i < x->size1; i++) {
143 |       for (j = 0; j < x->size2; j++) {
144 |         double value = gsl_matrix_get(x, i, j);
145 |         if (fabs(value) > 1e-12) {  /* entries at or below the cutoff are treated as zero and omitted */
146 |           fprintf(f, "%ld,%ld,%17.14f\n", (long) i, (long) j, value);
147 |         }
148 |       }
149 |     }
150 | }
151 | 


--------------------------------------------------------------------------------
/gslwrap/include/gslwrap/histogram.h:
--------------------------------------------------------------------------------
  1 | //  This random generator is a C++ wrapper for the GNU Scientific Library
  2 | //  Copyright (C) 2001 Torbjorn Vik
  3 | 
  4 | //  This program is free software; you can redistribute it and/or modify
  5 | //  it under the terms of the GNU General Public License as published by
  6 | //  the Free Software Foundation; either version 2 of the License, or
  7 | //  (at your option) any later version.
  8 | 
  9 | //  This program is distributed in the hope that it will be useful,
 10 | //  but WITHOUT ANY WARRANTY; without even the implied warranty of
 11 | //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12 | //  GNU General Public License for more details.
 13 | 
 14 | //  You should have received a copy of the GNU General Public License
 15 | //  along with this program; if not, write to the Free Software
 16 | //  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 17 | #ifndef __histogram_h
 18 | #define __histogram_h
 19 | 
 20 | #include <gsl/gsl_histogram.h>
 21 | #include <stdexcept>
 22 | #include <exception>
 23 | 
 24 | namespace gsl{
 25 | #ifndef __HP_aCC
 26 | using std::string;
 27 | using std::runtime_error;
 28 | #endif
 29 | 
 30 | //! Encapsulates the histogram object of gsl. Only uniformly spaced bins yet. Owns its gsl_histogram; copies are deep.
 31 | class histogram
 32 | {
 33 | public:
 34 | 	//! Allocates nBins zeroed bins with uniform ranges from xmin to xmax; throws runtime_error if allocation fails.
 35 | 	histogram(int nBins, double xmin, double xmax)
 36 | 	{
 37 | 		h=gsl_histogram_calloc(nBins);
 38 | 		if (!h)
 39 | 		{
 40 | 			throw runtime_error("Couldn't allocate memory for histogram");
 41 | 		}
 42 | 		gsl_histogram_set_ranges_uniform(h, xmin, xmax);
 43 | 	}
 44 | 	//! Deep copy. The implicit copy would share h between two objects and free it twice.
 45 | 	histogram(const histogram& other) : h(gsl_histogram_clone(other.h))
 46 | 	{
 47 | 		if (!h)
 48 | 		{
 49 | 			throw runtime_error("Couldn't allocate memory for histogram");
 50 | 		}
 51 | 	}
 52 | 	//! Deep assignment through a temporary copy: safe for self-assignment, and *this is untouched if the copy throws.
 53 | 	histogram& operator=(const histogram& other)
 54 | 	{
 55 | 		histogram tmp(other);
 56 | 		gsl_histogram* old = h; h = tmp.h; tmp.h = old; // tmp's destructor releases the previous bins
 57 | 		return *this;
 58 | 	}
 59 | 	~histogram(){gsl_histogram_free(h);}
 60 | 
 61 | //@{ Updating and Accessing Methods
 62 | 	int increment(double x){return gsl_histogram_increment(h, x);}
 63 | 	int accumulate(double x, double weight){return gsl_histogram_accumulate(h, x, weight);}
 64 | 	double get(int i) const {return gsl_histogram_get(h, i);}
 65 | 	//! Bounds-checked access to bin i; throws runtime_error unless 0 <= i < number of bins.
 66 | 	double& operator[](const unsigned int & i)
 67 | 	{
 68 | 		if (i >= h->n) throw runtime_error("index lies outside valid range of 0 .. n - 1");
 69 | 		return h->bin[i];
 70 | 	}
 71 | 	const double& operator[](const unsigned int & i) const
 72 | 	{
 73 | 		if (i >= h->n) throw runtime_error("index lies outside valid range of 0 .. n - 1");
 74 | 		return h->bin[i];
 75 | 	}
 76 | 	void get_range(int i, double& xmin, double& xmax) const {gsl_histogram_get_range(h, i, &xmin, &xmax);}
 77 | //@}
 78 | 
 79 | //@{These functions return the maximum upper and minimum lower range limits
 80 | // and the number of bins of the histogram h. They provide a way of determining these values without
 81 | //    accessing the gsl_histogram struct directly. 
 82 | 	double max() const {return gsl_histogram_max(h);}
 83 | 	double min() const {return gsl_histogram_min(h);}
 84 | 	int bins()const {return gsl_histogram_bins(h);}
 85 | 	int size()const {return gsl_histogram_bins(h);}
 86 | //@}
 87 | 
 88 | //@{ Histogram statistics
 89 | 	double mean()const {return gsl_histogram_mean(h);} // not in gsl library ?
 90 | 	double max_val() const {return gsl_histogram_max_val(h);}
 91 | 	int max_bin() const {return gsl_histogram_max_bin(h);}
 92 | 	double min_val() const {return gsl_histogram_min_val(h);}
 93 | 	int min_bin() const {return gsl_histogram_min_bin(h);}
 94 | 	double sum() const {return gsl_histogram_sum(h);}
 95 | //@}
 96 | 
 97 | //@{ Accessor for gsl compatibility
 98 | 	gsl_histogram*       gslobj()       { return h;}
 99 | 	const gsl_histogram* gslobj() const { return h;}
100 | //@}
101 | 
102 | protected:
103 | 	gsl_histogram * h; // owned; freed in the destructor
104 | };
105 | }
106 | 
107 | #endif // __histogram_h
108 | 


--------------------------------------------------------------------------------
/gslwrap/bin/gslwrap-config:
--------------------------------------------------------------------------------
  1 | #!/bin/sh
  2 | 
  3 | # this is shamelessly stolen from imlib3d-config :-) which is :
  4 | # this is shamelessly stolen from gtkmm-config :-)
  5 | 
  6 | prefix=/n/fs/topics/lib/gslwrap
  7 | exec_prefix=${prefix}
  8 | top_srcdir=.
  9 | 
 10 | libdir=${exec_prefix}/lib
 11 | 
 12 | bindir=${exec_prefix}/bin
 13 | pkglibdir=${libdir}/gslwrap
 14 | 
 15 | transform=s,x,x,
 16 | 
 17 | 
 18 | gslwrap_libs="-L${exec_prefix}/lib -lgslwrap -L/n/fs/topics/lib/gsl/lib -lgsl -lgslcblas -lm"
 19 | gslwrap_cflags="-I${prefix}/include -I${prefix}/include/gslwrap -I/n/fs/topics/lib/gsl/include "
 20 | 
 21 | 
 22 | usage()
 23 | {
 24 |     cat <<EOF
 25 | Usage: gslwrap-config [OPTION]... [LIBRARY]...
 26 | 
 27 | Generic options
 28 |   --version     output gslwrap version information
 29 |   --help        display this help and exit
 30 | 
 31 | Compilation support options
 32 |   --cflags      print compiler flags
 33 |   --libs        print library linking information
 34 |   --libs-only-L only print the -L/-R part of --libs
 35 |   --libs-only-l only print the -l part of --libs
 36 | 
 37 | Known values for LIBRARY are:
 38 | 
 39 |     gslwrap
 40 | 
 41 | EOF
 42 | 
 43 |     exit $1
 44 | }
 45 | 
 46 | if test $# -eq 0; then
 47 |     usage 1
 48 | fi
 49 | 
 50 | cflags=false
 51 | libs_L=false
 52 | libs_l=false
 53 | 
 54 | any=no
 55 | 
 56 | while test $# -gt 0; do
 57 |     case "$1" in
 58 |     -*=*) optarg=`echo "$1" | sed 's/[-_a-zA-Z0-9]*=//'` ;;
 59 |     *) optarg= ;;
 60 |     esac
 61 | 
 62 |     case $1 in
 63 |     --version)
 64 |         echo 0.2
 65 |         exit 0
 66 |         ;;
 67 |     --help)
 68 |         usage 0
 69 |         ;;
 70 |     --prefix=*)
 71 | 	prefix=$optarg
 72 |         ;;
 73 |     --cflags)
 74 |         cflags=true
 75 |         ;;
 76 |     --libs)
 77 |         libs_L=true
 78 |         libs_l=true
 79 |         ;;
 80 |     --libs-only-L)
 81 |         libs_L=true
 82 |         ;;
 83 |     --libs-only-l)
 84 |         libs_l=true
 85 |         ;;
 86 |     gslwrap)
 87 |         the_libs="-L${exec_prefix}/lib -lgslwrap -L/n/fs/topics/lib/gsl/lib -lgsl -lgslcblas -lm"
 88 |         the_flags="-I${prefix}/include -I${prefix}/include/gslwrap -I/n/fs/topics/lib/gsl/include "
 89 |         any=yes
 90 |         ;;
 91 |     *)
 92 |         usage 1
 93 |         ;;
 94 |     esac
 95 |     shift
 96 | done
 97 | 
 98 | # default to gtk-- for backward compatibility
 99 | # (should be removed some time in the future)
100 | if test "x$any" = xno; then
101 |    the_libs="$the_libs $gslwrap_libs"
102 |    the_flags="$the_flags $gslwrap_cflags"
103 | fi
104 | 
105 | if $cflags; then
106 |     all_flags="$the_flags"
107 | fi
108 | 
109 | if $libs_L || $libs_l; then
110 |     all_flags="$all_flags $the_libs"
111 | fi
112 | 
113 | if test -z "$all_flags" || test "x$all_flags" = "x "; then
114 |     exit 1
115 | fi
116 | 
# Strip out any possible duplicates, but be careful to
# get `-lfoo -lbar -lbaz' for `-lfoo -lbaz -lbar -lbaz'
# (i.e. for a repeated -l flag the LAST occurrence is the one that is kept).
#
# First pass: sort every word of $all_flags into one of three buckets.
#   rev_libs    - all -l flags, in reverse order, duplicates still included
#   lib_L_flags - unique -L/-R flags (collected only when $libs_L is true)
#   other_flags - every other flag, unique, in original order
other_flags=
lib_L_flags=
rev_libs=
for i in $all_flags; do
    case "$i" in
    # a library, save it for later, in reverse order
    -l*) rev_libs="$i $rev_libs" ;;
    -L*|-R*)
        if $libs_L; then
            case " $lib_L_flags " in
            *\ $i\ *) ;;                        # already there
            *) lib_L_flags="$lib_L_flags $i" ;; # add it to output
            esac 
        fi;;
    *)
        case " $other_flags " in
        *\ $i\ *) ;;                            # already there
        *) other_flags="$other_flags $i" ;;     # add it to output
        esac ;;
    esac
done

# Second pass: walk the reversed library list and prepend each library not
# seen yet.  Prepending restores the original order, and because the walk
# starts from the end, the last occurrence of a duplicate wins.
# The -l flags are emitted only when $libs_l is true.
ord_libs=
if $libs_l; then
    for i in $rev_libs; do
        case " $ord_libs " in
        *\ $i\ *) ;;                    # already there
        *) ord_libs="$i $ord_libs" ;;   # add it to output in reverse order
        esac
    done
fi

# Unquoted on purpose: word splitting collapses the extra blanks.
echo $other_flags $lib_L_flags $ord_libs

exit 0
154 | 


--------------------------------------------------------------------------------
/dtm/lda.h:
--------------------------------------------------------------------------------
  1 | // Authors: David Blei (blei@cs.princeton.edu)
  2 | //          Sean Gerrish (sgerrish@cs.princeton.edu)
  3 | //
  4 | // Copyright 2011 Sean Gerrish and David Blei
  5 | // All Rights Reserved.
  6 | //
  7 | // See the README for this package for details about modifying or
  8 | // distributing this software.
  9 | 
 10 | #ifndef LDA_H
 11 | #define LDA_H
 12 | 
 13 | #include <gsl/gsl_vector.h>
 14 | #include <gsl/gsl_matrix.h>
 15 | #include <gsl/gsl_sf_gamma.h>
 16 | #include <gsl/gsl_sf_psi.h>
 17 | #include <gsl/gsl_sf_lambert.h>
 18 | #include <gsl/gsl_rng.h>
 19 | 
 20 | #include "param.h"
 21 | #include "data.h"
 22 | #include "gsl-wrappers.h"
 23 | 
 24 | /*
 25 |  * functions for posterior inference in the latent dirichlet
 26 |  * allocation model.
 27 |  *
 28 |  */
 29 | 
 30 | #define LDA_INFERENCE_CONVERGED 1e-8
 31 | #define LDA_SEED_INIT 1
 32 | #define LDA_INIT_SMOOTH 1.0
 33 | #define LDA_EM_CONVERGED 5e-5
 34 | #define LDA_USE_VAR_BAYES 0
 35 | #define LDA_TOPIC_DIR_PARAM 0.001
 36 | 
// lda model
//
// Holds the dimensions of the model together with its topic matrix and
// the dirichlet parameter vector.

typedef struct lda {
    int ntopics;         // number of topics
    int nterms;          // vocabulary size
    gsl_matrix* topics;  // each column is a topic (V X K)
    gsl_vector* alpha;   // dirichlet parameters
} lda;
 45 | 
// lda posterior
//
// Variational posterior state for one document under one lda model.
// The two *doc_weight vectors are borrowed pointers (see the field
// comments).
// NOTE(review): ownership of doc and model is not stated here -- confirm
// against free_lda_post before freeing them elsewhere.

typedef struct lda_post {
    doc_t* doc;          // document associated to this posterior
    lda* model;          // lda model
    gsl_matrix* phi;     // variational mult parameters (nterms x K)
    gsl_matrix* log_phi; // convenient for computation (nterms x K)
    gsl_vector* gamma;   // variational dirichlet parameters (K)
    gsl_vector* lhood;   // a K+1 vector, sums to the lhood bound
    gsl_vector* doc_weight;  // Not owned by this structure.
    gsl_vector* renormalized_doc_weight;  // Not owned by this structure.
} lda_post;
 58 | 
// lda sufficient statistics
//
// Wraps the single matrix of topic sufficient statistics.
// NOTE(review): presumably shaped like lda.topics (nterms x ntopics) --
// confirm against new_lda_suff_stats.

typedef struct lda_suff_stats {
  gsl_matrix* topics_ss;
} lda_suff_stats;
 64 | 
 65 | 
 66 | // new lda model and suff stats
 67 | 
 68 | lda* new_lda_model(int ntopics, int nterms);
 69 | void free_lda_model(lda* m);
 70 | lda_suff_stats* new_lda_suff_stats(lda* model);
 71 | void reset_lda_suff_stats(lda_suff_stats* ss);
 72 | lda_post* new_lda_post(int ntopics, int max_length);
 73 | void free_lda_post(lda_post* p);
 74 | void initialize_lda_ss_from_data(corpus_t* data, lda_suff_stats* ss);
 75 | 
// posterior inference
//
// NOTE(review): the prototypes below use the type lda_seq, which this
// header does not declare itself; presumably it is provided by one of the
// headers included above -- confirm, otherwise this header is not
// self-contained.

double fit_lda_post(int doc_number, int time,
		    lda_post* p, lda_seq* var,
		    gsl_matrix* g,
		    gsl_matrix* g3,
		    gsl_matrix* g4,
		    gsl_matrix* g5);
void init_lda_post(lda_post* p);
void update_gamma(lda_post* p);
void update_phi(int doc_number, int time,
		lda_post* p, lda_seq* var,
		gsl_matrix* g);
void update_phi_dim(int doc_number, int time,
		    lda_post* p, lda_seq* var,
		    gsl_matrix* g);
void update_phi_fixed(int doc_number, int time,
		      lda_post* p, lda_seq* var,
		      gsl_matrix* g3_matrix,
		      gsl_matrix* g4_matrix,
		      gsl_matrix* g5_matrix);
void update_phi_multiple(int doc_number, int time,
			 lda_post* p, lda_seq* var,
			 gsl_matrix* g);
100 | 
101 | // compute the likelihood bound
102 | 
103 | double compute_lda_lhood(lda_post* p);
104 | 
105 | // EM algorithm
106 | 
107 | double lda_e_step(lda* model, corpus_t* data, lda_suff_stats* ss);
108 | double lda_m_step(lda* model, lda_suff_stats* ss);
109 | void lda_em(lda* model,
110 | 	    lda_suff_stats* ss,
111 | 	    corpus_t* data,
112 | 	    int max_iter,
113 | 	    char* outname);
114 | 
115 | // reading and writing
116 | 
117 | lda_suff_stats* read_lda_suff_stats(char* filename, int ntopics, int nterms);
118 | void write_lda(lda* model, char* name);
119 | void write_lda_suff_stats(lda_suff_stats* ss, char* name);
120 | lda* read_lda(int ntopics, int nterms, char* name);
121 | 
122 | 
123 | void initialize_lda_ss_from_random(corpus_t* data, lda_suff_stats* ss);
124 | 
125 | #endif
126 | 


--------------------------------------------------------------------------------
/lib/math/gradient_projection_test.cpp:
--------------------------------------------------------------------------------
#include <cmath>

#include <boost/test/included/unit_test_framework.hpp>
#include <boost/test/unit_test.hpp>
#include <boost/test/floating_point_comparison.hpp>
#include "gradient_projection.h"
  5 | 
  6 | #define CLOSE_TOL 0.1
  7 | #define LOOSE_TOL 10
  8 | 
  9 | using boost::unit_test_framework::test_suite;
 10 | using namespace GradientProjection;
 11 | 
 12 | void test_projection() {
 13 |   gsl::matrix n(4, 3);
 14 |   gsl::matrix p(1, 1);
 15 |   n(0, 0) = 2; n(0, 1) = 1; n(0, 2) = 0;
 16 |   n(1, 0) = 1; n(1, 1) = 1; n(1, 2) = 0;
 17 |   n(2, 0) = 1; n(2, 1) = 2; n(2, 2) = 0;
 18 |   n(3, 0) = 4; n(3, 1) = 1; n(3, 2) = 1;
 19 |   gsl::vector g(3);
 20 |   g[0] = 0.0; g[1] = -0.1; g[2] = 0.0;
 21 |   gsl::vector grad(4);
 22 |   grad[0] = 2.0; grad[1] = 4.0; grad[2] = 2.0; grad[3] = -3.0;
 23 |   gsl::vector direction;
 24 |   gsl::vector correction;
 25 | 
 26 |   createProjection(n, g, grad, p, direction, correction);
 27 |   BOOST_CHECK_CLOSE(p(0, 0),  1.0/11.0, LOOSE_TOL);
 28 |   BOOST_CHECK_CLOSE(p(0, 1), -3.0/11.0, LOOSE_TOL);
 29 |   BOOST_CHECK_CLOSE(p(0, 2),  1.0/11.0, LOOSE_TOL);
 30 |   BOOST_CHECK(abs(p(0, 3)) < 1e-10);
 31 |   BOOST_CHECK_CLOSE(p(1, 0), -3.0/11.0, LOOSE_TOL);
 32 |   BOOST_CHECK_CLOSE(p(1, 1),  9.0/11.0, LOOSE_TOL);
 33 |   BOOST_CHECK_CLOSE(p(1, 2), -3.0/11.0, LOOSE_TOL);
 34 |   BOOST_CHECK(abs(p(1, 3)) < 1e-10);
 35 |   BOOST_CHECK_CLOSE(p(2, 0),  1.0/11.0, LOOSE_TOL);
 36 |   BOOST_CHECK_CLOSE(p(2, 1), -3.0/11.0, LOOSE_TOL);
 37 |   BOOST_CHECK_CLOSE(p(2, 2),  1.0/11.0, LOOSE_TOL);
 38 |   BOOST_CHECK(abs(p(2, 3)) < 1e-10);
 39 |   BOOST_CHECK(abs(p(3, 0)) < 1e-10);
 40 |   BOOST_CHECK(abs(p(3, 1)) < 1e-10);
 41 |   BOOST_CHECK(abs(p(3, 2)) < 1e-10);
 42 |   BOOST_CHECK(abs(p(3, 3)) < 1e-10);
 43 | 
 44 |   BOOST_CHECK_CLOSE(correction[0], -4.0/110.0, CLOSE_TOL);
 45 |   BOOST_CHECK_CLOSE(correction[1],  1.0/110.0, CLOSE_TOL);
 46 |   BOOST_CHECK_CLOSE(correction[2],  7.0/110.0, CLOSE_TOL);
 47 |   BOOST_CHECK(abs(correction[3]) < 1e-10);
 48 | 
 49 |   BOOST_CHECK_CLOSE(direction[0],    8.0/11.0, CLOSE_TOL);
 50 |   BOOST_CHECK_CLOSE(direction[1],  -24.0/11.0, CLOSE_TOL);
 51 |   BOOST_CHECK_CLOSE(direction[2],    8.0/11.0, CLOSE_TOL);
 52 |   BOOST_CHECK(abs(direction[3]) < 1e-10);
 53 | 
 54 |   gsl::vector x(4);
 55 |   x[0] = 2; x[1] = 2; x[2] = 1; x[3] = 0;
 56 | 
 57 |   descend(x, direction, 0.1, 5, correction, grad);
 58 |   BOOST_CHECK_CLOSE(x[0], 2.026, CLOSE_TOL);
 59 |   BOOST_CHECK_CLOSE(x[1], 1.822, CLOSE_TOL);
 60 |   BOOST_CHECK_CLOSE(x[2], 1.126, CLOSE_TOL);
 61 |   BOOST_CHECK(abs(x[3]) < 1e-10);
 62 | }
 63 | 
// Exercises createActiveConstraints() on a 3-element vector x in three
// successive states; x, n and g are reused between the stages, so the order
// of the statements matters.
//   1. x = (0.15, 0.5, 0.2): the call is expected to return false.
//   2. x[0] = 0:             one constraint column (1, 0, 0) with
//                            g[0] == -SAFETY_BOX is expected.
//   3. x[2] = 0.6:           two constraint columns are expected, the first
//                            being (-1, -1, -1) with g[0] close to -0.2.
// NOTE(review): the all -1 column presumably encodes a bound on the sum of
// x -- confirm against createActiveConstraints.
void test_constraint_matrix() {
  gsl::matrix n;
  gsl::vector g;
  gsl::vector x(3);


  x[0] = 0.15; x[1] = 0.5; x[2] = 0.2;

  BOOST_CHECK_EQUAL(createActiveConstraints(x, n, g), false);

  x[0] = 0.0;
  BOOST_CHECK_EQUAL(createActiveConstraints(x, n, g), true);
  BOOST_CHECK_EQUAL(n.size1(), x.size());
  // Only one constraint is broken.
  BOOST_CHECK_EQUAL(n.size2(), 1);
  BOOST_CHECK_EQUAL(n(0, 0), 1);
  BOOST_CHECK_EQUAL(n(1, 0), 0);
  BOOST_CHECK_EQUAL(n(2, 0), 0);
  BOOST_CHECK_EQUAL(g.size(), 1);
  BOOST_CHECK_EQUAL(g[0], -SAFETY_BOX);

  x[2] = 0.6;
  BOOST_CHECK_EQUAL(createActiveConstraints(x, n, g), true);
  BOOST_CHECK_EQUAL(n.size1(), x.size());
  // Two constraints are broken.
  BOOST_CHECK_EQUAL(n.size2(), 2);
  BOOST_CHECK_EQUAL(n(0, 0), -1);
  BOOST_CHECK_EQUAL(n(1, 0), -1);
  BOOST_CHECK_EQUAL(n(2, 0), -1);
  BOOST_CHECK_EQUAL(n(0, 1), 1);
  BOOST_CHECK_EQUAL(n(1, 1), 0);
  BOOST_CHECK_EQUAL(n(2, 1), 0);
  BOOST_CHECK_EQUAL(g.size(), 2);
  BOOST_CHECK_CLOSE(g[0], -0.2, CLOSE_TOL);
  BOOST_CHECK_EQUAL(g[1], -SAFETY_BOX);
  
}
101 | 
102 | test_suite* init_unit_test_suite(int, char* []) {
103 |   test_suite* test= BOOST_TEST_SUITE( "Testing Gradient Projection" );
104 |   test->add( BOOST_TEST_CASE( &test_constraint_matrix ),    0);
105 |   test->add( BOOST_TEST_CASE( &test_projection        ),    0);
106 |   return test;
107 | }
108 | 


--------------------------------------------------------------------------------
/gslwrap/include/gslwrap/min_fminimizer.h:
--------------------------------------------------------------------------------
//  This one-dimensional minimizer is a C++ wrapper for the GNU Scientific Library
  2 | //  Copyright (C) 2001 Torbjorn Vik
  3 | 
  4 | //  This program is free software; you can redistribute it and/or modify
  5 | //  it under the terms of the GNU General Public License as published by
  6 | //  the Free Software Foundation; either version 2 of the License, or
  7 | //  (at your option) any later version.
  8 | 
  9 | //  This program is distributed in the hope that it will be useful,
 10 | //  but WITHOUT ANY WARRANTY; without even the implied warranty of
 11 | //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12 | //  GNU General Public License for more details.
 13 | 
 14 | //  You should have received a copy of the GNU General Public License
 15 | //  along with this program; if not, write to the Free Software
 16 | //  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 17 | #ifndef __min_fminimizer_h
 18 | #define __min_fminimizer_h 
 19 | 
 20 | #include <gsl/gsl_errno.h>
 21 | #include <gsl/gsl_min.h>
 22 | 
 23 | namespace gsl{
 24 | 
 25 | //! Derive this class provide a user defined function for minimisation
 26 | struct min_f
 27 | {
 28 | 	//! This operator must be overridden
 29 | 	virtual double operator()(const double& x)=0;
 30 | 	
 31 | 	//! This is the function gsl calls to optimize f
 32 | 	static double f(double x, void *p)
 33 | 	{
 34 | 		return (*(min_f *)p)(x);
 35 | 	}
 36 | };
 37 | 
 38 | //! Class for minimizing one dimensional functions. 
 39 | /*!
 40 |   Usage: 
 41 |        - Create with optional minimize type
 42 | 	   - Set with function object and inital bounds
 43 | 	   - Loop the  iterate function until convergence or maxIterations (extra facility)
 44 | 
 45 | 	   - Recover minimum and bounds
 46 |  */
 47 | class min_fminimizer 
 48 | {
 49 |  public:
 50 | 	//! choose between gsl_min_fminimizer_goldensection and gsl_min_fminimizer_brent
 51 | 	min_fminimizer(const gsl_min_fminimizer_type* type=gsl_min_fminimizer_brent) : s(NULL), maxIterations(100), isSet(false)
 52 | 	{
 53 | 		s=gsl_min_fminimizer_alloc(type);
 54 | 		nIterations=0;
 55 | 		if (!s)
 56 | 		{
 57 | 			//error
 58 | 			//cout << "ERROR Couldn't allocate memory for minimizer" << endl;
 59 | 			//throw ? 
 60 | 			exit(-1);
 61 | 		}
 62 | 	}
 63 | 	~min_fminimizer(){if (s) gsl_min_fminimizer_free(s);}
 64 | 	//! returns GSL_FAILURE if the interval does not contain a minimum
 65 | 	int set(min_f& function, double minimum, double x_lower, double x_upper)
 66 | 	{
 67 | 		isSet=false;
 68 | 		f.function = &function.f;
 69 | 		f.params = &function;
 70 | 		int status=	gsl_min_fminimizer_set(s, &f, minimum, x_lower, x_upper);
 71 | 		if (!status)
 72 | 		{
 73 | 			isSet=true;
 74 | 			nIterations=0;
 75 | 		}
 76 | 		return status;
 77 | 	}
 78 | 	int set_with_values(min_f& function, 
 79 | 						double minimum, double f_minimum, 
 80 | 						double x_lower,double f_lower, 
 81 | 						double x_upper, double f_upper)
 82 | 	{
 83 | 		isSet=false;
 84 | 		f.function = &function.f;
 85 | 		f.params = &function;
 86 | 		int status=	gsl_min_fminimizer_set_with_values(s, &f, minimum, f_minimum, x_lower, f_lower, x_upper, f_upper);
 87 | 		if (!status)
 88 | 		{
 89 | 			isSet=true;
 90 | 			nIterations=0;
 91 | 		}
 92 | 		return status;
 93 | 	}
 94 | 	int iterate()
 95 | 	{
 96 | 		assert_set();
 97 | 		int status=gsl_min_fminimizer_iterate(s);
 98 | 		nIterations++;
 99 | 		if (status==GSL_FAILURE)
100 | 			isConverged=true;
101 | 		return status;
102 | 	}
103 | 	double minimum(){assert_set();return gsl_min_fminimizer_minimum(s);}
104 | 	double x_upper(){assert_set();return gsl_min_fminimizer_x_upper(s);}
105 | 	double x_lower(){assert_set();return gsl_min_fminimizer_x_lower(s);}
106 | 	void SetMaxIterations(int n){maxIterations=n;}
107 | 	int GetNIterations(){return nIterations;}
108 | 	bool is_converged(){if (nIterations>=maxIterations) return true; if (isConverged) return true; return false;}
109 | 	//string name() const;
110 | 	
111 |  private:
112 | 	void assert_set(){if (!isSet)exit(-1);} // Old problem of error handling: TODO
113 | 	
114 | 	bool isSet;
115 | 	bool isConverged;
116 | 	int nIterations;
117 | 	int maxIterations;
118 | 	gsl_min_fminimizer* s;
119 | 	gsl_function f;
120 | };
121 | };	 // namespace gsl
122 | 
123 | #endif //__min_fminimizer_h
124 | 


--------------------------------------------------------------------------------
/lib/math/specialfunc.cpp:
--------------------------------------------------------------------------------
  1 | #include "specialfunc.h"
  2 | #include <limits>
  3 | #include <gsl/gsl_sf_erf.h>
  4 | 
  5 | double trigamma(double x)
  6 | {
  7 |     double p;
  8 |     int i;
  9 | 
 10 |     x=x+6;
 11 |     p=1/(x*x);
 12 |     p=(((((0.075757575757576*p-0.033333333333333)*p+0.0238095238095238)
 13 |          *p-0.033333333333333)*p+0.166666666666667)*p+1)/x+0.5*p;
 14 |     for (i=0; i<6 ;i++)
 15 |     {
 16 |         x=x-1;
 17 |         p=1/(x*x)+p;
 18 |     }
 19 |     return(p);
 20 | }
 21 | 
 22 | 
 23 | double digamma(double x) {
 24 |   if (x == 0.0) {
 25 |     return -std::numeric_limits<double>::infinity();
 26 |   }
 27 |  
 28 |   double p;
 29 |   x=x+6;
 30 |   p=1/(x*x);
 31 |   p=(((0.004166666666667*p-0.003968253986254)*p+
 32 |       0.008333333333333)*p-0.083333333333333)*p;
 33 |   p=p+log(x)-0.5/x-1/(x-1)-1/(x-2)-1/(x-3)-1/(x-4)-1/(x-5)-1/(x-6);
 34 |   return p;
 35 | }
 36 | 
 37 | /*
 38 |   We invert the gamma by making a reasonable initial guess (typically
 39 |   this is correct to within a few percent).  An iteration of Newton's
 40 |   method is then used; this yields errors whose worst case are around
 41 |   .3% and typically around .01%.
 42 | 
 43 |   For small x, digamma is approximately -1/x and for large x it is
 44 |   approximately log(x).  Thus we make the initial guesses -1/x and
 45 |   exp(x) (with some fudge factors) depending on where x lies.
 46 |  */
 47 | double InverseDigamma(double x) {
 48 |   double guess = 0.0;
 49 |   if (x < -2) {
 50 |     guess = -1/x;
 51 |   } else {
 52 |     guess = exp(x) - 1 / (x + 7) + 0.5772157;  // Euler-Mascheroni constant.
 53 |   }
 54 |   guess -= (digamma(guess) - x) / trigamma(guess);
 55 |   return(guess);
 56 | }
 57 | 
 58 | 
// Approximates the logarithm of the gamma function.
//
// Special cases: returns 1e308 for x <= 0 and exactly 0 for x == 1 or
// x == 2.  For x <= 7 the argument is shifted up by n = (int)(7 - x)
// whole units, the series below is evaluated at the shifted argument x0,
// and the shift is undone afterwards by subtracting log terms.
double log_gamma(double x)
{
  double x0,x2,xp,gl,gl0;
  int n=0,k=0;
  // Coefficients of the series in 1/x0^2 (a[0] multiplies the lowest power).
  static double a[] = {
    8.333333333333333e-02,
    -2.777777777777778e-03,
    7.936507936507937e-04,
    -5.952380952380952e-04,
    8.417508417508418e-04,
    -1.917526917526918e-03,
    6.410256410256410e-03,
    -2.955065359477124e-02,
    1.796443723688307e-01,
    -1.39243221690590};
  
  x0 = x;
  if (x <= 0.0) return 1e308;
  else if ((x == 1.0) || (x == 2.0)) return 0.0;
  else if (x <= 7.0) {
    // Shift small arguments upwards before evaluating the series.
    n = (int)(7-x);
    x0 = x+n;
  }
  x2 = 1.0/(x0*x0);
  xp = 2.0*M_PI;
  // Horner evaluation of the polynomial in x2, highest coefficient first.
  gl0 = a[9];
  for (k=8;k>=0;k--) {
    gl0 = gl0*x2 + a[k];
  }
  gl = gl0/x0+0.5*log(xp)+(x0-0.5)*log(x0)-x0;
  if (x <= 7.0) {
    // Undo the shift: one log(x0 - 1) is subtracted per unit step down.
    for (k=1;k<=n;k++) {
      gl -= log(x0-1.0);
      x0 -= 1.0;
    }
  }
  return gl;
}
 97 | 
 98 | double sigmoid(double x) {
 99 |   return 1./(1 + exp(-x));
100 | }
101 | 
102 | // First derivative of sigmoid function.
103 | double dsigmoid(double x) {
104 |   double s = sigmoid(x);
105 |   return s * s * exp(-x);
106 | }
107 | 
108 | // Second derivative of sigmoid function.
109 | double d2sigmoid(double x) {
110 |   double s = sigmoid(x);
111 |   double ds = dsigmoid(x);
112 |   return ds * (2 * s * exp(-x) - 1);
113 | }
114 | 
115 | double LogPGaussian(double x) {
116 |   // Phi(x) = 0.5 * erfc( - x / sqrt(2))
117 |   // log Phi(x) = log(0.5) + log erfc( -x / sqrt(2))
118 |   return log(0.5) + gsl_sf_log_erfc(-x / sqrt(2));
119 | }
120 | 
// Logarithm of the standard normal density.
//
// Derivation of the density from Phi(x) = 0.5 erfc(-x / sqrt(2)):
//   d Phi(x) = 0.5 erfc'(-x / sqrt(2)) * (-1 / sqrt(2))
//   d erfc = d (1 - erf) = - d erf = - 2 / sqrt(pi) exp(-x^2)
//   => d Phi(x) = 0.5 * (-2 / sqrt(pi)) * exp(-x^2/2) * (-1 / sqrt(2))
//               = 1 / sqrt(2 pi) exp(-x^2 / 2)
double LogDGaussian(double x)  {
  const double quadratic = -x * x / 2;
  const double log_norm = 0.5 * log(2 * M_PI);
  return quadratic - log_norm;
}
128 | 
129 | // Computes the inverse of PGaussian.  We use Newton iteration on the
130 | // *log*.  This makes the iteration converge much better for small x.
131 | // I dunno how well it will work for large values of x.
132 | // 5 iterations seems to be enough.  It's still pretty slow so don't use 
133 | // this in time-critical code.
134 | double InversePGaussian(double x) {
135 |   double y = 0;
136 |   x = log(x);
137 |   for (int ii = 0; ii < 5; ++ii) {
138 |     double pgy = LogPGaussian(y);
139 |     y -= (pgy - x) * exp(pgy) / exp(LogDGaussian(y));
140 |   }
141 |   return y;
142 | }
143 | 
144 | 
145 | 


--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
  1 | THIS REPOSITORY HAS BEEN ARCHIVED
  2 | 
  3 | Please, use the new repository/project at:
  4 | https://github.com/magsilva/dtm
  5 | 
Reason for archival: this repository was created before Blei created
his official project on GitHub (https://github.com/blei-lab/dtm). Hence
this project is not shown as its fork (which is not appropriate). With
the new project at magsilva/dtm, we now have it as a proper "fork" of
blei-lab/dtm.
 11 | 
 12 | 
 13 | 
 14 | 
 15 | 
 16 | ***************************
 17 | Dynamic Topic Models and the Document Influence Model
 18 | ***************************
 19 | 
 20 | This code is the result of work by 
 21 | 
 22 | David M. Blei
 23 | blei[at]cs.princeton.edu
 24 | 
 25 | and
 26 | 
 27 | Sean M Gerrish
 28 | sgerrish[at]cs.princeton.edu.
 29 | 
 30 | (C) Copyright 2006, David M. Blei
 31 |    (blei [at] cs [dot] princeton [dot] edu)
 32 | 
 33 | (C) Copyright 2011, Sean M. Gerrish
 34 |    (sgerrish [at] cs [dot] princeton [dot] edu)
 35 | 
 36 | It includes software corresponding to models described in the
 37 | following papers:
 38 | 
 39 | [1] D. Blei and J. Lafferty. Dynamic topic models. In
 40 |    Proceedings of the 23rd International Conference on Machine Learning,
 41 |    2006.
 42 | [2] S. Gerrish and D. Blei.  A Language-based Approach to Measuring
 43 |    Scholarly Impact.  In Proceedings of the 27th International Conference
 44 |    on Machine Learning, 2010.
 45 | 
 46 | These files are part of DIM.
 47 | 
 48 | DIM is free software; you can redistribute it and/or modify it under
 49 | the terms of the GNU General Public License as published by the Free
 50 | Software Foundation; either version 2 of the License, or (at your
 51 | option) any later version.
 52 | 
 53 | DIM is distributed in the hope that it will be useful, but WITHOUT
 54 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 55 | FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 56 | for more details.
 57 | 
 58 | You should have received a copy of the GNU General Public License
 59 | along with this program; if not, write to the Free Software
 60 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 61 | USA
 62 | 
 63 | ------------------------------------------------------------------------
 64 | 
 65 | A. COMPILING
 66 | 
 67 | You will need to have several libraries installed to compile this
 68 | package:
 69 | 	gsl
 70 | 	gflags
 71 | 
 72 | Depending on your package manager, you may be able to install these
 73 |  with *one* of the following commands:
 74 | 
 75 |   sudo aptitude install libgsl0-dev # Ubuntu 10.04
 76 |   sudo zypper install gsl-devel     # OpenSUSE 11.2
 77 |   sudo yum install gsl-devel        # CentOS 5.5
 78 | 
 79 | You can make the main program by changing your working directory to
 80 | dtm/ and typing:
 81 | 
 82 |   make
 83 | 
 84 | This software has been compiled on Ubuntu 10.04, OpenSUSE 11.2, and
 85 | CentOS 5.5.  Depending on your environment, you may need to install
 86 | additional libraries.
 87 | 
 88 | B. RUNNING
 89 | 
 90 | Once everything is compiled, you can run this software by typing the
 91 | command "./main <flags>", where flags is a list of command-line
 92 | options.  An example command and a description of the input and output
 93 | files is given in dtm/sample.sh.  You can see all command-line options
 94 | by typing
 95 | 
 96 |   ./main --help
 97 | 
 98 | (although we suggest you start out with the example in dtm/sample.sh).
 99 | 
100 | You should also replace 'main' by the appropriate executable (depending
101 | on your computer architecture and operating system). We currently
102 | provide binaries for Linux (dtm-linux32 and dtm-linux64), MacOS (dtm-darwin64)
103 | and Windows (dtm-win32.exe and dtm-win64.exe).
104 | 
105 | C. SUPPORT and QUESTIONS
106 | 
107 | This software is provided as-is, without any warranty or support,
108 | WHATSOEVER.  If you have any questions about running this software,
109 | you can post your question to the topic-models mailing list at
110 | topic-models@lists.cs.princeton.edu.  You are welcome to submit
111 | modifications or bug-fixes of this software to the authors, although
112 | not all submissions may be posted.
113 | 
114 | D. USAGE
115 | 
This program takes as input a collection of text documents and creates
117 | as output a list of topics over time, a description of each document
118 | as a mixture of these topics, and (possibly) a measure of how
119 | "influential" each document is, based on its language.
120 | 
121 | We have provided an example dataset, instructions for formatting input
122 | data and processing output files, and example command lines for
123 | running this software in the file dtm/sample.sh.
124 | 
125 | E. CHANGES
126 | 
127 | Changes in this version include:
128 | 
129 |  - Change the default top_obs_var flag to 0.5 (from -1.0)
130 |  - Change to use more iterations and a tighter convergence criterion in each doc's E-step.
131 |  - Change to initialize random topics to be a bit more "flat".
132 | 


--------------------------------------------------------------------------------
/lib/math/gsl_matrix.h:
--------------------------------------------------------------------------------
  1 | #ifndef __MATH_GSL_MATRIX__
  2 | #define __MATH_GSL_MATRIX__
  3 | 
  4 | #include <gsl/gsl_vector.h>
  5 | #include <gsl/gsl_matrix.h>
  6 | #include <gsl/gsl_cblas.h>
  7 | 
  8 | class GslMatrixItem {
  9 |  public:
 10 |   GslMatrixItem(gsl_matrix* ptr, size_t index1, size_t index2) :
 11 |     ptr_(ptr),
 12 |     index1_(index1),
 13 |     index2_(index2) { }
 14 | 
 15 |   operator const double() {
 16 |     return gsl_matrix_get(ptr_, index1_, index2_);
 17 |   }
 18 | 
 19 |   double operator =(const double v) {
 20 |     gsl_matrix_set(ptr_, index1_, index2_, v);
 21 |     return v;
 22 |   }
 23 | 
 24 |   double operator +=(const double v) {
 25 |     double old_v = gsl_matrix_get(ptr_, index1_, index2_);
 26 |     gsl_matrix_set(ptr_, index1_, index2_, v + old_v);
 27 |     return v + old_v;
 28 |   }
 29 |  private:
 30 |   gsl_matrix* ptr_;
 31 |   size_t index1_;
 32 |   size_t index2_;
 33 | };
 34 | 
 35 | class GslMatrixBase {
 36 |  public:
 37 |   GslMatrixBase& operator=(const double v) {
 38 |     if (v == 0.0) {
 39 |       SetZero();
 40 |     } else {
 41 |       SetAll(v);
 42 |     }
 43 |     return *this;
 44 |   }
 45 | 
 46 |   GslMatrixItem operator()(const size_t index1, const size_t index2) const {
 47 |     assert(ptr_ != NULL);
 48 |     return GslMatrixItem(ptr_, index1, index2);
 49 |   }
 50 | 
 51 |   void SetZero() {
 52 |     assert(ptr_ != NULL);
 53 |     gsl_matrix_set_zero(ptr_);
 54 |   }
 55 | 
 56 |   void SetAll(const double v) {
 57 |     assert(ptr_ != NULL);
 58 |     gsl_matrix_set_all(ptr_, v);
 59 |   }
 60 |   
 61 |   void Reset(gsl_matrix* val) {
 62 |     if(ptr_ != NULL) {
 63 |       gsl_matrix_free(ptr_);
 64 |     }
 65 |     ptr_ = val;
 66 |   }
 67 | 
 68 |   int Fprintf(FILE* stream, const char* format) const {
 69 |     assert(ptr_ != NULL);
 70 |     return gsl_matrix_fprintf(stream, ptr_, format);
 71 |   }
 72 | 
 73 |   int Fscanf(FILE* stream) {
 74 |     assert(ptr_ != NULL);
 75 |     return gsl_matrix_fscanf(stream, ptr_);
 76 |   }
 77 | 
 78 |   void Set(const int i, const int j, double val) {
 79 |     gsl_matrix_set(ptr_, i, j, val);
 80 |   }
 81 | 
 82 |   /*
 83 |     double operator()(const int nCol, const int nRow) {
 84 |     return gsl_matrix_get(ptr_, nCol, nRow);
 85 |     }
 86 |   */
 87 | 
 88 |   int size1() const {
 89 |     return ptr_->size1;
 90 |   }
 91 | 
 92 |   int size2() const {
 93 |     return ptr_->size2;
 94 |   }
 95 | 
 96 |   double Trace() const {
 97 |     double val = 0;
 98 |     assert(ptr_ != NULL);
 99 |     assert(ptr_->size1 == ptr_->size2);
100 |     for (size_t ii = 0; ii < ptr_->size1; ++ii) {
101 |       val += gsl_matrix_get(ptr_, ii, ii);
102 |     }
103 |     return val;
104 |   }
105 | 
106 |   double Sum() const {
107 |     double val = 0;
108 |     assert(ptr_ != NULL);
109 |     for (size_t ii = 0; ii < ptr_->size1; ++ii) {
110 |       for (size_t jj = 0; jj < ptr_->size2; ++jj) {
111 | 	val += gsl_matrix_get(ptr_, ii, jj);
112 |       }
113 |     }
114 |     return val;
115 |   }
116 | 
117 |   /*
118 |    * Apply the transpose of this matrix to a vector x and store the result.
119 | 
120 |    int TransMul(const GslVector& x, GslVector& res, double scale = 0.0) {
121 |    return gsl_blas_dgemv(CblasTrans, 1.0, ptr_, x.ptr(), scale, res.ptr());
122 |    }
123 |    
124 |    int Mul(const GslVector& x, GslVector& res, double scale = 0.0) {
125 |    return gsl_blas_dgemv(CblasNoTrans, 1.0, ptr_, x.ptr(), scale, res.ptr());
126 |    }
127 |   */
128 | 
129 |   const gsl_matrix* ptr() const { return ptr_; }
130 |   gsl_matrix* mutable_ptr() { return ptr_; }
131 | 
132 |  protected:
133 |   GslMatrixBase() : ptr_(NULL) {
134 |   }  
135 |   gsl_matrix* ptr_;
136 | 
137 |  private:
138 |   GslMatrixBase(const GslMatrixBase&) { }
139 | };
140 | 
141 | class GslMatrix : public GslMatrixBase {
142 |  public:
143 |   GslMatrix(const size_t size1, const size_t size2) : GslMatrixBase() {
144 |     Allocate(size1, size2);
145 |   }
146 | 
147 |   void Allocate(const size_t size1, const size_t size2) {
148 |     assert(ptr_ == NULL);
149 |     ptr_ = gsl_matrix_alloc(size1, size2);
150 |   }
151 | 
152 |   GslMatrix() : GslMatrixBase() {
153 |   }
154 | 
155 |   GslMatrix(gsl_matrix* val) : GslMatrixBase() {
156 |     ptr_ = val;
157 |   }
158 | 
159 |   ~GslMatrix() {
160 |     if(ptr_ != NULL) {
161 |       gsl_matrix_free(ptr_);
162 |     }
163 |   }
164 | 
165 |   GslMatrixBase& operator=(const double v) {
166 |     GslMatrixBase::operator=(v);
167 |     return *this;
168 |   }
169 |  private:
170 |   GslMatrix(const GslMatrix&) { }
171 | };
172 | 
173 | 
174 | class GslSubmatrix : public GslMatrixBase {
175 |  public:
176 |  GslSubmatrix(GslMatrixBase& matrix, size_t k1, size_t k2, size_t n1, size_t n2) :
177 |   view_(gsl_matrix_submatrix(matrix.mutable_ptr(), k1, k2, n1, n2)) {    
178 |      ptr_ = &view_.matrix;
179 |   }
180 | 
181 |  GslSubmatrix(gsl_matrix* matrix, size_t k1, size_t k2, size_t n1, size_t n2) :
182 |   view_(gsl_matrix_submatrix(matrix, k1, k2, n1, n2)) {    
183 |      ptr_ = &view_.matrix;
184 |   }
185 | 
186 |   GslMatrixBase& operator=(const double v) {
187 |     GslMatrixBase::operator=(v);
188 |     return *this;
189 |   }
190 |  private:
191 |   gsl_matrix_view view_;
192 |   GslSubmatrix(const GslSubmatrix&) { }
193 | };
194 | 
195 | #endif  // __MATH_GSL_MATRIX__
196 | 


--------------------------------------------------------------------------------
/dtm/data.h:
--------------------------------------------------------------------------------
  1 | // Authors: David Blei (blei@cs.princeton.edu)
  2 | //          Sean Gerrish (sgerrish@cs.princeton.edu)
  3 | //
  4 | // Copyright 2011 Sean Gerrish and David Blei
  5 | // All Rights Reserved.
  6 | //
  7 | // See the README for this package for details about modifying or
  8 | // distributing this software.
  9 | 
 10 | #ifndef DATA_H
 11 | #define DATA_H
 12 | 
 13 | #include "gsl-wrappers.h"
 14 | #include "param.h"
 15 | #include <stdio.h>
 16 | #include <stdlib.h>
 17 | #include <utility>
 18 | 
 19 | #define OFFSET 0
 20 | 
 21 | // Parameters of the scaled beta distribution, which describes how much weight documents have after n years.
 22 | const int kScaledInfluenceMax = 200;  // NOTE(review): presumably the length of the influence window in time steps -- confirm against NewScaledInfluence().
 23 | 
 24 | // This mean and variance are relative to the interval [0, 1.0].
 25 | const double kScaledInfluenceMean = 10.0 / kScaledInfluenceMax;  // 10 / 200 = 0.05
 26 | const double kScaledInfluenceVariance = ((10.0 / kScaledInfluenceMax) * (10.0 / kScaledInfluenceMax));  // 0.05 * 0.05 = 0.0025
 27 | 
 28 | /*
 29 |  * A document is a collection of counts and terms.
 30 |  * NOTE(review): word[] and count[] are presumably parallel arrays of length nterms -- confirm in data.c.
 31 |  */
 32 | 
 33 | typedef struct doc_t {
 34 |     int total;   // NOTE(review): presumably the sum of count[] -- confirm in data.c
 35 |     int nterms;  // NOTE(review): presumably the number of unique terms in this document -- confirm
 36 |     int* word;   // term indices
 37 |     int* count;  // term counts
 38 |     // A parameter for finding phi.
 39 |     double* lambda;
 40 | 
 41 |     // Used for measuring perplexity.
 42 |     double log_likelihood;
 43 |     double* log_likelihoods;  // NOTE(review): length and indexing are not visible in this header -- confirm
 44 | } doc_t;
 45 | 
 46 | 
 47 | /*
 48 |  * A corpus is a collection of documents.
 49 |  * NOTE(review): doc[] presumably holds ndocs entries -- confirm in data.c.
 50 |  */
 51 | 
 52 | typedef struct corpus_t {
 53 |     doc_t** doc;     // array of pointers to documents
 54 |     int ndocs;       // NOTE(review): presumably the number of entries in doc[] -- confirm
 55 |     int nterms;      // NOTE(review): presumably the vocabulary size of this corpus -- confirm
 56 |     int max_unique;  // maximum number of unique terms in a document
 57 | } corpus_t;
 58 | 
 59 | 
 60 | /*
 61 |  * A sequence is a sequence of corpora.
 62 |  * NOTE(review): corpus[] presumably holds len entries, one per time step -- confirm in data.c.
 63 |  */
 64 | 
 65 | typedef struct corpus_seq_t {
 66 |     corpus_t** corpus;  // array of pointers to corpora
 67 |     int nterms;         // NOTE(review): presumably the vocabulary size shared by the sequence -- confirm
 68 |     int max_nterms;     // NOTE(review): presumably the value returned by compute_max_nterms() -- confirm
 69 |     int len;            // NOTE(review): presumably the number of corpora in the sequence -- confirm
 70 |     int ndocs;          // NOTE(review): presumably the total document count over all corpora -- confirm
 71 | } corpus_seq_t;
 72 | 
 73 | 
 74 | typedef struct inf_var {  // Document weights, stored as one matrix per time step.
 75 |   gsl_matrix** doc_weights;   // T matrices of document weights.
 76 |                               // each matrix is d_t x K.
 77 |   gsl_matrix** renormalized_doc_weights;   // T matrices of renormalized document weights.
 78 |                               // each matrix is d_t x K.
 79 |   int ntime;  // NOTE(review): presumably T, the number of matrices in each array above -- confirm
 80 | } inf_var;
 81 | 
 82 | /*
 83 |  * Variational posterior structure; lda_seq keeps these in its "topic" array (one per topic chain).
 84 |  * In the field comments, W is the vocabulary size and T is the sequence length.
 85 |  */
 86 | 
 87 | 
 88 | typedef struct sslm_var {
 89 |     // properties
 90 | 
 91 |     int W; // vocabulary size
 92 |     int T; // sequence length
 93 | 
 94 |     // variational parameters
 95 | 
 96 |     gsl_matrix* obs;             // observations, W x T
 97 | 
 98 |     // byproducts of the variational parameters
 99 | 
100 |     double obs_variance;         // observation variance
101 |     double chain_variance;       // chain variance
102 |     gsl_vector* zeta;            // extra variational parameter, T
103 |     gsl_matrix* e_log_prob;      // E log prob(w | t), W x T
104 | 
105 |     // convenient quantities for inference
106 | 
107 |     gsl_matrix* fwd_mean;       // forward posterior mean, W x T
108 |     gsl_matrix* fwd_variance;   // forward posterior variance, W x T
109 |     gsl_matrix* mean;           // posterior mean, W x T
110 |     gsl_matrix* variance;       // posterior variance, W x T
111 | 
112 |     gsl_matrix* mean_t;         // W x T
113 |     gsl_matrix* variance_t;     // NOTE(review): dimensions not stated; presumably the same as mean_t -- confirm
114 | 
115 |     gsl_matrix* influence_sum_lgl;  // The sum exp * w_phi_l
116 | 
117 |     // Recent copy of w_phi_l.
118 |     gsl_matrix* w_phi_l;         // W x T
119 |     gsl_matrix* w_phi_sum;      // W x T
120 |     gsl_matrix* w_phi_l_sq;      // Square term involving various coefficients (cf. the commented-out field in lda_seq)
121 |     gsl_matrix* m_update_coeff;  // Terms involving squares of
122 |                                  // W, l, and phi.
123 |     gsl_matrix* m_update_coeff_g;  // \sum_i=0..t phi_l(t) r(i-t)
124 | 
125 |     // useful temporary vector
126 |     gsl_vector* T_vct;  // NOTE(review): presumably of length T -- confirm
127 | } sslm_var;
128 | 
129 | 
130 | typedef struct lda_seq {  // Sequential topic model state: sizes, topic chains and document-influence data.
131 |     int ntopics;             // number of topics
132 |     int nterms;              // number of terms
133 |     int nseq;                // length of sequence
134 |     gsl_vector* alpha;       // dirichlet parameters
135 | 
136 |     sslm_var** topic;        // topic chains (NOTE(review): presumably ntopics entries -- confirm).
137 | 
138 |     inf_var* influence;      // document weights
139 | 
140 |     gsl_matrix** influence_sum_lgl;  // Sum of document weights at time t (see g in the regression formula)
141 | 
142 |   //    gsl_vector** influence_sum_g;  // Sum of document weights at time t.
143 |   // gsl_vector** influence_sum_h;  // Sum of document weights at time t.
144 | 
145 |     inf_var* renormalized_influence;      // renormalized document weights
146 | 
147 |   //    gsl_matrix** w_phi_l;        // Product term for the \beta update.
148 |   //  gsl_matrix** w_phi_l_sq;     // Square term involving various
149 |                                 // coefficients for the \beta update.
150 | 
151 |   std::pair<int, float>**** top_doc_phis;        // T x D_t x n of document phis (std::pair makes this header C++-only).
152 | } lda_seq;
153 | 
154 | /*
155 |  * Functions declared here; NOTE(review): the definitions are presumably in data.c -- the
156 |  * per-function notes below are inferred from names and signatures only, so confirm them there.
157 |  */
158 | 
159 | corpus_t* read_corpus(const char* name);          // presumably reads one corpus from the file `name`
160 | corpus_seq_t* read_corpus_seq(const char* name);  // presumably reads a corpus sequence identified by `name`
161 | int compute_max_nterms(const corpus_seq_t* c);    // presumably feeds corpus_seq_t::max_nterms
162 | gsl_matrix * compute_total_counts(const corpus_seq_t* c);  // returns a matrix; its shape is not visible here
163 | corpus_seq_t* make_seq_corpus_subset(corpus_seq_t* all, int start, int end);  // NOTE(review): same signature as make_corpus_seq_subset below; one of the two names is probably stale
164 | void write_corpus(corpus_t* c, char* filename);   // presumably the inverse of read_corpus
165 | void write_corpus_seq(corpus_seq_t* c, char* name);  // presumably the inverse of read_corpus_seq
166 | corpus_seq_t* make_corpus_seq_subset(corpus_seq_t* all, int start, int end);  // whether `end` is inclusive is not visible here
167 | corpus_t* collapse_corpus_seq(corpus_seq_t* c);   // presumably merges all time steps into one corpus
168 | double* NewScaledInfluence(int size);             // returns an array; ownership is not visible here
169 | 
170 | #endif
171 | 


--------------------------------------------------------------------------------
/gslwrap/include/gslwrap/multimin_fdfminimizer.h:
--------------------------------------------------------------------------------
  1 | //  This random generator is a C++ wrapper for the GNU Scientific Library
  2 | //  Copyright (C) 2001 Torbjorn Vik
  3 | 
  4 | //  This program is free software; you can redistribute it and/or modify
  5 | //  it under the terms of the GNU General Public License as published by
  6 | //  the Free Software Foundation; either version 2 of the License, or
  7 | //  (at your option) any later version.
  8 | 
  9 | //  This program is distributed in the hope that it will be useful,
 10 | //  but WITHOUT ANY WARRANTY; without even the implied warranty of
 11 | //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12 | //  GNU General Public License for more details.
 13 | 
 14 | //  You should have received a copy of the GNU General Public License
 15 | //  along with this program; if not, write to the Free Software
 16 | //  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 17 | #ifndef __multimin_fdfminimizer_h
 18 | #define __multimin_fdfminimizer_h 
 19 | 
 20 | #include <gsl/gsl_errno.h>
 21 | #include <gsl/gsl_multimin.h>
 22 | #include <gslwrap/vector_double.h>
 23 | 
 24 | namespace gsl{
 25 | 
 26 | //! Create an instance of this class with a user defined function
 27 | /*!
 28 |   A template class with the function operator()(const vector& x) 
 29 |   and derivative(const vector&, vector&), as well as a reference to an object of this class must be fournished
 30 | 
 31 |   User is responsible for deleting this reference !
 32 | 
 33 |  */
 34 | template <class fdf_function>
 35 | class multimin_fdf
 36 | {
 37 |  public:
 38 | 	fdf_function* fct;
 39 | 
 40 | 	//! These operators can be overridden
 41 | 	virtual double operator()(const vector& x)
 42 | 		{
 43 | 			return (*fct)(x);
 44 | 		}
 45 | 	virtual void derivative(const vector& x, vector& g)
 46 | 		{
 47 | 			(*fct).derivative(x, g);
 48 | 		}
 49 | 	
 50 | 	//! This operator can be overridden to gain performance in calculating the value and its derivative in a scoop
 51 | 	virtual double fval_and_derivative(const vector&x, vector& g )
 52 | 	{
 53 | 		derivative(x, g);
 54 | 		return (*this)(x);
 55 | 	}
 56 | 
 57 | 
 58 | 	//! This is the function gsl calls to calculate the value of f at x
 59 | 	static double f(const gsl_vector* x, void *p)
 60 | 	{
 61 | 		vector_view x_view(*x);
 62 | 		return (*(multimin_fdf *)p)(x_view);
 63 | 	}
 64 | 
 65 | 	//! This is the function gsl calls to calculate the value of g=f' at x
 66 | 	static void df(const gsl_vector* x, void *p, gsl_vector* g)
 67 | 	{
 68 | 		vector_view x_view(*x);
 69 | 		vector_view g_view(*g);
 70 | 		(*(multimin_fdf *)p).derivative(x_view, g_view);
 71 | 	}
 72 | 
 73 | 	//! This is the function gsl calls to calculate the value of g=f' at x
 74 | 	static void fdf(const gsl_vector* x, void *p, double* f, gsl_vector* g)
 75 | 	{
 76 | 		vector_view x_view(*x);
 77 | 		vector_view g_view(*g);
 78 | 		*f=(*(multimin_fdf *)p).fval_and_derivative(x_view, g_view);
 79 | 	}
 80 | 
 81 | 	//! Constructor (User is responsible for deleting the fdf_function object)
 82 | 	multimin_fdf(fdf_function* _fct):fct(_fct){assert (fct!=NULL);}
 83 | };
 84 | 
 85 | //! Class for multiminimizing one dimensional functions. 
 86 | /*!
 87 |   Usage: 
 88 |        - Create with optional multiminimize type
 89 | 	   - Set with function object and inital bounds
 90 | 	   - Loop the  iterate function until convergence or maxIterations (extra facility)
 91 | 
 92 | 	   - Recover multiminimum and bounds
 93 |  */
 94 | class multimin_fdfminimizer 
 95 | {
 96 |  public:
 97 | //! 
 98 | /*! Choose between : 
 99 |   - gsl_multimin_fdfminimizer_conjugate_fr
100 |   - gsl_multimin_fdfminimizer_conjugate_pr
101 |   - gsl_multimin_fdfminimizer_vector_bfgs
102 |   - gsl_multimin_fdfminimizer_steepest_descent
103 |   
104 |  */
105 | 	multimin_fdfminimizer(uint _dim, 
106 | 						  const gsl_multimin_fdfminimizer_type* type=gsl_multimin_fdfminimizer_conjugate_fr) : 
107 | 		dim(_dim), isSet(false), maxIterations(100), s(NULL)
108 | 	{
109 | 		s=gsl_multimin_fdfminimizer_alloc(type, dim);
110 | 		nIterations=0;
111 | 		if (!s)
112 | 		{
113 | 			//error
114 | 			//cout << "ERROR Couldn't allocate memory for multiminimizer" << endl;
115 | 			//throw ? 
116 | 			exit(-1);
117 | 		}
118 | 	}
119 | 	~multimin_fdfminimizer(){if (s) gsl_multimin_fdfminimizer_free(s);}
120 | 	//! returns GSL_FAILURE if the interval does not contain a multiminimum
121 | 	template <class  fdf_function>
122 | 	int set(multimin_fdf<fdf_function>& function, const vector& initial_x, double step_size, double tol)
123 | 	{
124 | 		isSet=false;
125 | 		f.f   = &function.f;
126 | 		f.df  = &function.df;
127 | 		f.fdf = &function.fdf;
128 | 		f.n   = dim;
129 | 		f.params = &function;
130 | 		int status=	gsl_multimin_fdfminimizer_set(s, &f, initial_x.gslobj(), step_size, tol);
131 | 		if (!status)
132 | 		{
133 | 			isSet=true;
134 | 			nIterations=0;
135 | 		}
136 | 		return status;
137 | 	}
138 | 	int iterate()
139 | 	{
140 | 		assert_set();
141 | 		int status=gsl_multimin_fdfminimizer_iterate(s);
142 | 		nIterations++;
143 | 		if (status==GSL_FAILURE)
144 | 			isConverged=true;
145 | 		return status;
146 | 	}
147 | 	int restart(){return gsl_multimin_fdfminimizer_restart(s);}
148 |   	double minimum(){assert_set();return gsl_multimin_fdfminimizer_minimum(s);} 
149 | 	vector x_value(){assert_set();return vector_view(*gsl_multimin_fdfminimizer_x(s));}  
150 | 	vector gradient(){assert_set();return vector_view(*gsl_multimin_fdfminimizer_gradient(s));}  
151 | 
152 | 
153 | 	void SetMaxIterations(int n){maxIterations=n;}
154 | 	int GetNIterations(){return nIterations;}
155 | 	bool is_converged(){if (nIterations>=maxIterations) return true; if (isConverged) return true; return false;}
156 | 	//string name() const;
157 | 	
158 |  private:
159 | 	void assert_set(){if (!isSet)exit(-1);} // Old problem of error handling: TODO
160 | 	
161 | 	uint dim;
162 | 	bool isSet;
163 | 	bool isConverged;
164 | 	int nIterations;
165 | 	int maxIterations;
166 | 	gsl_multimin_fdfminimizer* s;
167 | 	gsl_multimin_function_fdf f;
168 | };
169 | };	 // namespace gsl
170 | 
171 | #endif //__multimin_fdfminimizer_h
172 | 


--------------------------------------------------------------------------------
/lib/math/gradient_projection.cpp:
--------------------------------------------------------------------------------
  1 | 
  2 | #include "gradient_projection.h"
  3 | 
  4 | namespace GradientProjection {
  5 | 
  6 | void display(const gsl_vector* v, const char* name) {
  7 |   std::cout << name << " = <";
  8 |   for(unsigned int i=0; i<v->size; i++) {
  9 |     std::cout << gsl_vector_get(v, i);
 10 |     if (i < v->size - 1) {
 11 |       std::cout << ", ";
 12 |     }
 13 |   }
 14 |   std::cout << ">    (" << v->size << ")" << std::endl;
 15 | }
 16 | 
 17 | void display(const gsl_matrix* m, const char* name) {
 18 |   std::cout << name << "\t = |";
 19 |   for(unsigned int i=0; i<m->size1; i++) {
 20 |     if(i!=0) {
 21 |       std::cout << "\t   |";
 22 |     }
 23 |     for(unsigned int j=0; j<m->size2; j++) {
 24 |       std::cout << gsl_matrix_get(m, i, j) << "\t";
 25 |     }
 26 |     std::cout << "|" << std::endl;
 27 |   }
 28 |   std::cout << "                                SIZE: " << m->size1 << " x " << m->size2 << std::endl;
 29 | }
 30 | 
 31 | void createProjection(const gsl::matrix& activeConstraints,
 32 |                       const gsl::vector& g,
 33 |                       const gsl::vector& grad,
 34 |                       gsl::matrix& projection,
 35 |                       gsl::vector& direction,
 36 |                       gsl::vector& correction) {
 37 |   int n = activeConstraints.size1();
 38 |   int r = activeConstraints.size2();
 39 | 
 40 |   correction.resize(n);
 41 |   direction.resize(n);
 42 | 
 43 |   // This could be done with cholesky or QR decomposition, but I
 44 |   // couldn't get it to work.  Given that this happens infrequently
 45 |   // and the matrices are not *that* big, it's not that bad
 46 |   gsl::matrix S(r,r);
 47 |   // S = N^T N
 48 |   gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, 
 49 |                  activeConstraints.gslobj(), activeConstraints.gslobj(), 
 50 |                  0.0, S.gslobj());
 51 |   // T = (N^{T} N) ^{-1}
 52 |   gsl::matrix T = S.LU_invert();
 53 |   S.set_dimensions(n, r);
 54 |   // S = -N(N^{T} N)^{-1}
 55 |   gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, -1.0, 
 56 |                  activeConstraints.gslobj(), T.gslobj(), 0.0, S.gslobj());
 57 | 
 58 |   // Set the correction
 59 |   gsl_blas_dgemv(CblasNoTrans, 1.0, S.gslobj(), g.gslobj(), 0.0, 
 60 |                  correction.gslobj());
 61 | 
 62 |   // Set the direction
 63 |   // P = -N(N^{T} N)^{-1}N + I 
 64 |   projection.identity(n);
 65 |   gsl_blas_dgemm(CblasNoTrans, CblasTrans, 1.0, 
 66 |                  S.gslobj(), activeConstraints.gslobj(), 
 67 |                  1.0, projection.gslobj());
 68 |   gsl_blas_dgemv(CblasNoTrans, -1.0, projection.gslobj(), grad.gslobj(), 0.0, 
 69 |                  direction.gslobj());
 70 | }
 71 | 
 72 | bool createActiveConstraints(const gsl::vector& x, 
 73 |                              gsl::matrix& n, 
 74 |                              gsl::vector& g) {
 75 |   bool sumConstraintViolated = false;
 76 |   int dimension = x.size();
 77 |   double margin = SAFETY_BOX;
 78 | 
 79 |   if(x.sum() >= 1.0 - margin) {
 80 |     sumConstraintViolated = true;
 81 |   }
 82 | 
 83 |   int nonNegativeConstraintsViolated = 0;
 84 |   for(int ii = 0; ii < dimension; ++ii) {
 85 |     if (x[ii] <= margin) {
 86 |       ++nonNegativeConstraintsViolated;
 87 |     }
 88 |   }
 89 | 
 90 |   int newSize = nonNegativeConstraintsViolated;
 91 |   if(sumConstraintViolated) {
 92 |     newSize += 1;
 93 |   }
 94 | 
 95 |   if(newSize > 0) {
 96 |     n.set_dimensions(dimension, newSize);
 97 |     g.resize(newSize);
 98 |     g.set_all(SAFETY_BOX);
 99 |     int col = 0;
100 |     if(sumConstraintViolated) {
101 |       g[0] = -(1.0 - SAFETY_BOX);
102 |       for(int ii = 0; ii < dimension; ++ii) {
103 |         n(ii, 0) = -1.0;
104 |       }
105 |       ++col;
106 |     }
107 | 
108 |     for(int ii = 0; ii < dimension; ++ii) {
109 |       if(x[ii] <= margin) {
110 |         n(ii, col) = 1.0;
111 |         ++col;
112 |       }
113 |     }
114 |     assert(col == newSize);
115 | 
116 |     gsl_blas_dgemv(CblasTrans, 1.0, n.gslobj(), x.gslobj(), -1.0, g.gslobj());
117 | 
118 |     //display(n.gslobj(), "N");
119 |     //display(g.gslobj(), "g");
120 | 
121 |     return true;
122 |   } else {
123 |     return false;
124 |   }
125 | }
126 | 
127 | double descend(gsl::vector& x, 
128 |                gsl::vector& s,
129 |                const double gamma, 
130 |                const double obj_value,             
131 |                const gsl::vector& correction,
132 |                const gsl::vector& grad) {
133 |   double alpha = 0.0;
134 | 
135 |   gsl_blas_ddot(s.gslobj(), grad.gslobj(), &alpha);
136 |   //std::cout << "dot prod= " << alpha << " ";
137 |   if(alpha == 0) {
138 |     return alpha;
139 |   }
140 |   alpha = -(gamma * obj_value) / alpha;
141 |   //std::cout << " alpha= " << alpha << " ";
142 | 
143 |   s *= alpha;
144 |   s += correction;
145 |   x += s;
146 | 
147 |   //display(s.gslobj(), "final move");
148 | 
149 |   if(alpha < 0) {
150 |     alpha = -alpha;
151 |   }
152 |   return alpha;
153 | }
154 | 
155 | double updateState(gsl::vector& x,
156 |                    const double gamma,
157 |                    const gsl::vector grad,
158 |                    const double f) {
159 |   /*
160 |    * First we see if we're up against constraints
161 |    */
162 |   int dim = x.size();
163 |   gsl::matrix n;
164 |   gsl::vector g;
165 |   gsl::vector s;
166 |   gsl::vector correction(dim);
167 |   
168 |   if(createActiveConstraints(x, n, g)) {
169 |     s.resize(dim);
170 |     gsl::matrix p;
171 |     
172 |     createProjection(n, g, grad, p, s, correction);
173 |     //std::cout << "Constraints violated." << std::endl;
174 | 
175 |     //display(p.gslobj(), "p");
176 |     //display(s.gslobj(), "s");
177 |     //display(correction.gslobj(), "correction");
178 |     return descend(x, s, gamma, f, correction, grad);
179 |   } else {
180 |     //std::cout << "No constraints violated." << std::endl;
181 |     s.copy(grad);
182 |     s *= -gamma * GRADIENT_DESCENT_SLOWDOWN;
183 |     x += s;
184 |     double diff;
185 |     gsl_blas_ddot(s.gslobj(), s.gslobj(), &diff);
186 |     return diff * gamma;
187 |   }
188 | 
189 | 
190 | }
191 | 
192 | 
193 | }
194 | 


--------------------------------------------------------------------------------
/doc/lda.tex:
--------------------------------------------------------------------------------
  1 | 
  2 | \section{Latent Dirichlet Allocation (LDA)}
  3 | 
  4 | This is a C implementation of latent Dirichlet allocation (LDA), a
  5 | model of discrete data which is fully described in Blei et al. (2003)
  6 | (http://www.cs.berkeley.edu/~blei/papers/blei03a.pdf).
  7 | 
  8 | LDA is a hierarchical probabilistic model of documents.  Let $\alpha$ be
  9 | a scalar and $\beta_{1:K}$ be $K$ distributions of words (called ``topics'').
 10 | As implemented here, a $K$-topic LDA model assumes the following
 11 | generative process of an $N$-word document:
 12 | 
 13 |           1. $\theta \mid \alpha \sim \mathrm{Dirichlet}(\alpha, \ldots, \alpha)$
 14 | 
 15 |           2. for each word $n \in \{1, \ldots, N\}$:
 16 | 
 17 |              a. $z_n \mid \theta \sim \mathrm{Mult}(\theta)$
 18 | 
 19 |              b. $w_n \mid z_n, \beta \sim \mathrm{Mult}(\beta_{z_n})$
 20 | 
 21 | This code implements variational inference of $\theta$ and $z_{1:N}$ for a
 22 | document, and estimation of the topics $\beta_{1:K}$ and Dirichlet
 23 | parameter $\alpha$.
 24 | 
 25 | 
 26 | \subsection{Data format}
 27 | 
 28 | Under LDA, the words of each document are assumed exchangeable.  Thus,
 29 | each document is succinctly represented as a sparse vector of word
 30 | counts. The data is a file where each line is of the form:
 31 | 
 32 |      [M] [term_1]:[count] [term_2]:[count] ...  [term_N]:[count]
 33 | 
 34 | where [M] is the number of unique terms in the document, and the
 35 | [count] associated with each term is how many times that term appeared
 36 | in the document.  Note that [term_1] is an integer which indexes the
 37 | term; it is not a string.
 38 | 
 39 | 
 40 | 
 41 | \subsection{Configuration}
 42 | 
 43 | \begin{description}
 44 | 	\item[var max iter (integer; default: -1)]
 45 | 	The maximum number of iterations of coordinate ascent variational
 46 | 	inference for a single document.  A value of -1 indicates "full"
 47 | 	variational inference, until the variational convergence
 48 | 	criterion is met.
 49 | 
 50 | 	\item[var convergence (float; default: 1e-6)]
 51 | 	The convergence criterion for variational inference.  Stop if
 52 | 	(score\_old - score) / abs(score\_old) is less than this value (or
 53 | 	after the maximum number of iterations).  Note that the score is
 54 | 	the lower bound on the likelihood for a particular document.
 55 | 
 56 | 	\item[em max iter (integer; default: 100)]
 57 | 	The maximum number of iterations of variational EM.
 58 | 
 59 | 	\item[em convergence (float; default: 1e-4)]
 60 | 	The convergence criterion for variational EM.  Stop if (score\_old -
 61 | 	score) / abs(score\_old) is less than this value (or after the
 62 | 	maximum number of iterations).  Note that ``score'' is the lower
 63 | 	bound on the likelihood for the whole corpus.
 64 | 
 65 | 	\item[alpha (string: `fixed' or `estimate'; default: estimate)]
 66 | 	If set to [fixed], then alpha does not change from iteration to
 67 | 	iteration.  If set to [estimate], then alpha is estimated along
 68 | 	with the topic distributions.
 69 | \end{description}
 70 | 
 71 | 
 72 | \subsection{Running}
 73 | 
 74 | \subsubsection{Topic estimation}
 75 | 
 76 | Estimate the model by executing:
 77 | 
 78 |      lda est [alpha] [k] [settings] [data] [random/seeded/*] [directory]
 79 | 
 80 | The term [random/seeded/*] describes how the topics will be
 81 | initialized.  "Random" initializes each topic randomly; "seeded"
 82 | initializes each topic to a distribution smoothed from a randomly
 83 | chosen document; or, you can specify a model name to load a
 84 | pre-existing model as the initial model (this is useful to continue EM
 85 | from where it left off).  To change the number of initial documents
 86 | used, edit lda-estimate.c.
 87 | 
 88 | The model (i.e., $\alpha$ and $\beta_{1:K}$) and variational posterior
 89 | Dirichlet parameters will be saved in the specified directory every
 90 | ten iterations.  Additionally, there will be a log file for the
 91 | likelihood bound and convergence score at each iteration.  The
 92 | algorithm runs until that score is less than ``em\_convergence'' (from
 93 | the settings file) or ``em\_max\_iter'' iterations are reached.  (To
 94 | change the lag between saved models, edit lda-estimate.c.)
 95 | 
 96 | The saved models are in two files:
 97 | 
 98 |      <iteration>.other contains alpha.
 99 | 
100 |      <iteration>.beta contains the log of the topic distributions.
101 |      Each line is a topic; in line k, each entry is log p(w | z=k)
102 | 
103 | The variational posterior Dirichlets are in:
104 | 
105 |      <iteration>.gamma
106 | 
107 | The settings file and data format are described above.
108 | 
109 | 
110 | \subsubsection{Inference}
111 | 
112 | To perform inference on a different set of data (in the same format as
113 | for estimation), execute:
114 | 
115 |      lda inf [settings] [model] [data] [name]
116 | 
117 | Variational inference is performed on the data using the model in
118 | [model].* (see above).  Two files will be created : [name].gamma are
119 | the variational Dirichlet parameters for each document;
120 | [name].likelihood is the bound on the likelihood for each document.
121 | 
122 | 
123 | 
124 | \subsection{Results}
125 | 
126 | \subsubsection{Printing topics}
127 | 
128 | The Python script topics.py lets you print out the top N
129 | words from each topic in a .beta file.  Usage is:
130 | 
131 |      python topics.py <beta file> <vocab file> <n words>
132 | 
133 | 
134 | \begin{lstlisting}
135 | #! /usr/bin/python
136 | 
137 | # usage: python topics.py <beta file> <vocab file> <num words>
138 | #
139 | # <beta file> is output from the lda-c code
140 | # <vocab file> is a list of words, one per line
141 | # <num words> is the number of words to print from each topic
142 | 
143 | import sys
144 | 
def print_topics(beta_file, vocab_file, nwords = 25):
    """Print the highest-weighted words of every topic in a .beta file.

    beta_file  -- path to the lda-c output; each line is one topic, given as
                  whitespace-separated per-word scores
    vocab_file -- path to the vocabulary, one word per line, in the same
                  order as the scores
    nwords     -- how many words to print per topic; capped at the size of
                  the vocabulary

    For each topic this prints a 'topic NNN' header, the words in decreasing
    order of score, and a blank separator.  Runs under Python 2.7 and 3.
    """

    # get the vocabulary

    with open(vocab_file, 'r') as vocab_in:
        vocab = [line.strip() for line in vocab_in]

    # The index list is built once and re-sorted for every topic; the sort is
    # stable, so equal scores keep the order left by the previous topic.

    indices = list(range(len(vocab)))
    nshown = max(0, min(nwords, len(indices)))

    # for each line in the beta file

    with open(beta_file, 'r') as beta_in:
        for topic_no, line in enumerate(beta_in):
            print('topic %03d' % topic_no)
            topic = [float(score) for score in line.split()]
            indices.sort(key=lambda i: topic[i], reverse=True)
            for i in indices[:nshown]:
                print('   %s' % vocab[i])
            print('\n')
165 | 
if (__name__ == '__main__'):

    # Exactly three arguments are expected after the script name.  print is
    # written as a function call so the script parses under Python 2 and 3.
    if (len(sys.argv) != 4):
       print('usage: python topics.py <beta-file> <vocab-file> <num words>\n')
       sys.exit(1)

    beta_file = sys.argv[1]
    vocab_file = sys.argv[2]
    nwords = int(sys.argv[3])
    print_topics(beta_file, vocab_file, nwords)
176 | \end{lstlisting}
177 | 
178 | 


--------------------------------------------------------------------------------
/dtm/util.c:
--------------------------------------------------------------------------------
  1 | // Author: David Blei (blei@cs.princeton.edu)
  2 | //
  3 | // Copyright 2006 David Blei
  4 | // All Rights Reserved.
  5 | //
  6 | // See the README for this package for details about modifying or
  7 | // distributing this software.
  8 | 
  9 | #define ABNORMAL_RETURN_CODE 1
 10 | #define MAX_STRING_LENGTH    65535
 11 | 
 12 | #include <stdio.h>
 13 | #include <string.h>
 14 | #include <assert.h>
 15 | #include <sys/stat.h>
 16 | #include <errno.h>
 17 | #include <ctype.h>
 18 | #include "util.h"
 19 | 
 20 | #ifdef __alpha__
 21 | 
 22 | #include <sys/mount.h>
 23 | #include <malloc.h>
 24 | #include <stdlib.h>
 25 | #include <unistd.h>
 26 | 
 27 | #else
 28 | 
 29 | /*#include <malloc.h>*/
 30 | #include <stdlib.h>
 31 | #include <unistd.h>
 32 | /*#include <sys/vfs.h>*/
 33 | 
 34 | #endif
 35 | 
 36 | char   buf[1024];            /* global scratch text; error() passes it to perror() and prints it */
 37 | static int space_in_use=0;     /* NOTE(review): not referenced in the visible part of this file; presumably allocation bookkeeping -- confirm */
 38 | static int pointers_in_use=0;  /* NOTE(review): not referenced in the visible part of this file; presumably allocation bookkeeping -- confirm */
 39 | int    display_allocs=FALSE;   /* NOTE(review): presumably toggles allocation tracing -- confirm where it is read */
 40 | 
 41 | 
 42 | void error(char *fmt, ...){
 43 |     va_list args;
 44 |     va_start(args, fmt);
 45 |     vfprintf(stderr, fmt, args); CRLF;
 46 |     va_end(args);
 47 |     fprintf(stderr, "\n");
 48 |     if (errno > 0) {
 49 |         perror(buf);
 50 |         fprintf(stderr, "errno=%d\n", errno);
 51 |         fprintf(stderr, buf);
 52 |       fprintf(stderr, "\n");
 53 |     }
 54 |     fflush(stderr);
 55 |     fflush(stdout);
 56 |     assert(0);
 57 | }
 58 | 
 59 | void bomb(char *fmt, ...)
 60 | {
 61 |    /* just break out, with error code =1 (fail) */
 62 | 
 63 |    va_list args;
 64 |    va_start(args, fmt);
 65 |    vfprintf(stderr, fmt, args); CRLF;
 66 |    va_end(args);
 67 |    fprintf(stderr, "\n");
 68 |    fflush(stderr);
 69 |    fflush(stdout);
 70 |    exit(1);
 71 | }
 72 | 
 73 | 
 74 | void bail(char *fmt, ...)
 75 | {
 76 |    /* just break out, with error code =0 (success) */
 77 | 
 78 |    va_list args;
 79 |    va_start(args, fmt);
 80 |    vfprintf(stderr, fmt, args); CRLF;
 81 |    va_end(args);
 82 |    fprintf(stderr, "\n");
 83 |    fflush(stderr);
 84 |    fflush(stdout);
 85 |    exit(0);
 86 | }
 87 | 
 88 | 
 89 | 
 90 | char *dequote (char *s) {
 91 |     static char *sbuf=NULL;
 92 |     char *t;
 93 |     int i;
 94 |     if (s[0] != '\'') return s;
 95 |     else if ((i=strlen(s)) < 2) return s;
 96 |     else if (s[i-1] != '\'')
 97 |        error("Illegal string passed to dequote: %s", s);
 98 |     if (sbuf == NULL)
 99 |         sbuf = (char *) malloc(MAX_STRING_LENGTH);
100 |     t = sbuf;
101 |     s++;
102 |     while(*s != EOS) {
103 |        if (*s == '\'') s++;
104 |        *t = *s;
105 |        s++; t++;
106 |     }
107 |     *t = EOS;
108 |     return sbuf;
109 | }
110 | 
111 | void quote_no_matter_what (const char *s, char *t) {
112 |     *t = '\'';
113 |     t++;
114 |     while((*s != EOS)) {
115 |         *t = *s;
116 |         if (*s == '\'') {
117 |            t++; *t = '\'';
118 | 	}
119 |         s++; t++;
120 |     }
121 |     *t = '\''; t++;
122 |     *t = EOS;
123 | }
124 | 
125 | 
126 | const char *quote (const char *s) {
127 |     static char *sbuf=NULL;
128 |     if (sbuf == NULL)
129 |         sbuf = (char *) malloc(MAX_STRING_LENGTH);
130 |     if ( strchr(s,' ')  == NULL  &&
131 |          strstr(s,"/*") == NULL && strstr(s,"*/") == NULL ) return s;
132 |     else {
133 |        quote_no_matter_what(s, sbuf);
134 |        return sbuf;
135 |     }
136 | }
137 | 
138 | 
139 | 
140 | 
141 | /* returns TRUE iff string only contains chars in valid. */
142 | int verify(char *string, char *valid)
143 | {
144 |    int i;
145 |    for(i=0;i<strlen(string);i++)
146 |       if (!strchr(valid, string[i])) return TRUE;
147 |    return FALSE;
148 | }
149 | 
150 | 
/*
 * Strips leading and trailing blanks (' ' only, not tabs or newlines)
 * from s in place and returns s.
 */
char * strip(char *s) {
   char *src = s;
   char *dst = s;
   int last;

   /* skip the leading blanks */
   while (*src == ' ') ++src;

   /* slide the remainder, terminator included, to the front; this is a
      single pass instead of re-measuring the string on every iteration */
   while ((*dst++ = *src++) != '\0')
       ;

   /* back up over trailing blanks; s[0] is never blank at this point
      unless the string is empty, so stopping at index 0 is sufficient */
   for (last = (int) strlen(s) - 1; last > 0; --last)
       if (s[last] != ' ') break;
   s[last+1] = '\0';
   return s;
}
167 | 
168 | 
/* converts s to upper case in place and returns s */
char * upper(char *s) {
   char *p;
   /* toupper() is only defined for values representable as unsigned char
      (or EOF), so a plain char must be converted before the call */
   for (p = s; *p != '\0'; ++p)
      *p = (char) toupper((unsigned char) *p);
   return s;
}
175 | 
/* converts s to lower case in place and returns s */
char * lower(char *s) {
   char *p;
   /* tolower() is only defined for values representable as unsigned char
      (or EOF), so a plain char must be converted before the call */
   for (p = s; *p != '\0'; ++p)
      *p = (char) tolower((unsigned char) *p);
   return s;
}
182 | 
183 | 
184 | /* queries existence of file */
185 | int qfilef(const char *fname) {
186 |    if (fname == FALSE) return FALSE;
187 |    if (access(fname, F_OK)==0) return TRUE;
188 |    else return FALSE;
189 | }
190 | 
191 | 
/*
 * Meant to return the free storage in the file system holding fn.
 * The implementation is disabled: the function unconditionally calls
 * abort() and never returns a value.
 */
int free_storage (char *fn)
{
  /* uses a defunct function call. Also, not ever called */
  abort();
  /*
  Former implementation, kept for reference only:

  struct statfs sfs;
  if (statfs(fn, &sfs) == -1)
    return -1;
  return sfs.f_bsize * sfs.f_bfree;
*/
}
204 | 
/*
 * Return the size in bytes of the file named filename, or -1 when
 * stat() fails.  The size is narrowed to int.
 */
int file_size(char *filename)
{
  struct stat info;

  return (stat(filename, &info) == 0) ? (int) info.st_size : -1;
}
214 | 
/*
 * Return a malloc()-allocated copy of string that the caller must free(),
 * or NULL (after reporting through perror) when allocation fails.
 */
char *util_strdup(char *string)
{
  int len = strlen(string);
  char *copy = (char *)malloc(len+1);

  if (copy != NULL)
    {
      /* len+1 bytes: the characters plus the terminator */
      memcpy(copy, string, len+1);
      return copy;
    }
  perror("malloc");
  return NULL;
}
229 | 
230 | 
231 | void * util_malloc (int size)
232 | {
233 |     char * p = (char *) malloc(size+sizeof(int));
234 |     if (p == NULL) error("UTIL_MALLOC: Ran out of space. Space in use: %d (%d pointers)\n",
235 |                                            space_in_use, pointers_in_use);
236 |     space_in_use += size;
237 |     ++pointers_in_use;
238 |     *((int *) p) = size;
239 |     if (display_allocs)
240 |        fprintf(stderr, "UTIL_MALLOC: Allocated %d bytes, %d bytes total, %d pointers\n",
241 |                                     size, space_in_use, pointers_in_use);
242 |     return (void *) (p+sizeof(int));
243 | }
244 | 
245 | void * util_calloc (int num, int size)
246 | {
247 |     char * p = (char *) calloc(num*size+sizeof(int), 1);
248 |     if (p == NULL) error("UTIL_CALLOC: Ran out of space. Space in use: %d (%d pointers)\n",
249 |                                            space_in_use, pointers_in_use);
250 |     space_in_use += num*size;
251 |     ++pointers_in_use;
252 |     *((int *) p) = num*size;
253 |     if (display_allocs)
254 |        fprintf(stderr, "UTIL_CALLOC: Allocated %d bytes, %d bytes total, %d pointers\n",
255 |                                            num*size, space_in_use, pointers_in_use);
256 |     return (void *) (p+sizeof(int));
257 | }
258 | 
259 | void * util_realloc (void * p, int size)
260 | {
261 |     int oldsize;
262 |     char *realp;
263 |     realp = ((char *)p)-sizeof(int);
264 |     oldsize = *((int *)(realp));
265 |     realp = (char *) realloc(realp, size+sizeof(int));
266 |     if (realp == NULL) error("UTIL_REALLOC: Ran out of space. Space in use: %d (%d pointers)\n",
267 |                                            space_in_use, pointers_in_use);
268 |     *((int *)(realp)) = size;
269 |     space_in_use += (size-oldsize);
270 |     if (display_allocs)
271 |         fprintf(stderr, "UTIL_REALLOC: Allocated %d bytes, %d bytes total, %d pointers\n",
272 |                                         size, space_in_use, pointers_in_use);
273 |     return (realp+sizeof(int));
274 | }
275 | 
276 | void util_free (void * p)
277 | {
278 |     int size;
279 |     size = *((int *) (((char *) p)-sizeof(int)));
280 |     space_in_use -= size;
281 |     --pointers_in_use;
282 |     free(((char *)p)-sizeof(int));
283 |     if (display_allocs)
284 |        fprintf(stderr, "UTIL_FREE: Freed up %d bytes, %d bytes remaining, %d pointers\n",
285 |                                         size, space_in_use, pointers_in_use);
286 | }
287 | 
/* Returns the current value of the space_in_use counter, which the util_*
   allocation wrappers in this file adjust by each block's payload size. */
int util_space_in_use (void)
{
   return space_in_use;
}
292 | 
/* Returns the current value of the pointers_in_use counter, which
   util_malloc / util_calloc increment and util_free decrements. */
int util_pointers_in_use (void)
{
   return pointers_in_use;
}
297 | 


--------------------------------------------------------------------------------
/lib/math/gsl_vector.h:
--------------------------------------------------------------------------------
  1 | #ifndef __MATH_GSL_VECTOR__
  2 | #define __MATH_GSL_VECTOR__
  3 | 
  4 | #include <assert.h>
  5 | 
  6 | #include <gsl/gsl_vector.h>
  7 | #include <gsl/gsl_cblas.h>
  8 | 
  9 | #include <gsl/gsl_matrix.h>
 10 | #include "math/vectorops.h"
 11 | 
 12 | class GslVectorItem {
 13 |  public:
 14 |   GslVectorItem(gsl_vector* ptr, size_t index) :
 15 |     ptr_(ptr),
 16 |     index_(index) { }
 17 | 
 18 |   operator const double() {
 19 |     return gsl_vector_get(ptr_, index_);
 20 |   }
 21 | 
 22 |   double operator =(const double v) {
 23 |     gsl_vector_set(ptr_, index_, v);
 24 |     return v;
 25 |   }
 26 | 
 27 |   double operator +=(const double v) {
 28 |     double oldv = gsl_vector_get(ptr_, index_);
 29 |     gsl_vector_set(ptr_, index_, oldv + v);
 30 |     return oldv + v;
 31 |   }
 32 | 
 33 |   double operator -=(const double v) {
 34 |     double oldv = gsl_vector_get(ptr_, index_);
 35 |     gsl_vector_set(ptr_, index_, oldv - v);
 36 |     return oldv - v;
 37 |   }
 38 |  private:
 39 |   gsl_vector* ptr_;
 40 |   size_t index_;
 41 | };
 42 | 
 43 | class GslVectorBase {
 44 |  public:
 45 |   /*
 46 |   double operator[](const size_t index) const {
 47 |     assert(ptr_ != NULL);
 48 |     return gsl_vector_get(ptr_, index);
 49 |   }
 50 |   */
 51 | 
 52 |   GslVectorItem operator[](const size_t index) const {
 53 |     assert(ptr_ != NULL);
 54 |     return GslVectorItem(ptr_, index);
 55 |   }
 56 | 
 57 |   GslVectorBase& operator+=(const gsl_vector* x) {
 58 |     assert(ptr_ != NULL);
 59 |     gsl_vector_add(ptr_, x);
 60 |     return *this;
 61 |   }
 62 | 
 63 |   GslVectorBase& operator-=(const gsl_vector* x) {
 64 |     assert(ptr_ != NULL);
 65 |     gsl_vector_sub(ptr_, x);
 66 |     return *this;
 67 |   }
 68 | 
 69 |   GslVectorBase& operator+=(const GslVectorBase& x) {
 70 |     assert(ptr_ != NULL);
 71 |     gsl_vector_add(ptr_, x.ptr());
 72 |     return *this;
 73 |   }
 74 | 
 75 |   GslVectorBase& operator-=(const GslVectorBase& x) {
 76 |     assert(ptr_ != NULL);
 77 |     gsl_vector_sub(ptr_, x.ptr());
 78 |     return *this;
 79 |   }
 80 | 
 81 |   GslVectorBase& operator*=(const gsl_vector* x) {
 82 |     assert(ptr_ != NULL);
 83 |     gsl_vector_mul(ptr_, x);
 84 |     return *this;
 85 |   }
 86 | 
 87 |   GslVectorBase& operator*=(const GslVectorBase& x) {
 88 |     assert(ptr_ != NULL);
 89 |     gsl_vector_mul(ptr_, x.ptr());
 90 |     return *this;
 91 |   }
 92 | 
 93 |   GslVectorBase& operator/=(const GslVectorBase& x) {
 94 |     assert(ptr_ != NULL);
 95 |     gsl_vector_div(ptr_, x.ptr());
 96 |     return *this;
 97 |   }
 98 | 
 99 |   double Sum() const {
100 |     return gsl_blas_dsum(ptr_);
101 |   }
102 | 
103 |   double L2Norm() const {
104 |     return gsl_blas_dnrm2(ptr_);
105 |   }
106 | 
107 |   void Normalize() const {
108 |     assert(ptr_ != NULL);
109 |     double s = Sum();
110 |     gsl_vector_scale(ptr_, 1. / s);
111 |   }
112 | 
113 |   size_t size() const {
114 |     return ptr_->size;
115 |   }
116 | 
117 |   GslVectorBase& operator*=(double v) {
118 |     assert(ptr_ != NULL);
119 |     gsl_vector_scale(ptr_, v);
120 |     return *this;
121 |   }
122 | 
123 |   GslVectorBase& operator/=(double v) {
124 |     assert(ptr_ != NULL);
125 |     gsl_vector_scale(ptr_, 1. / v);
126 |     return *this;
127 |   }
128 | 
129 |   // Note that the standalone product is a dot product!
130 |   const double operator*(const gsl_vector* x) const {
131 |     double result;
132 |     assert(ptr_ != NULL);
133 |     gsl_blas_ddot(ptr_, x, &result);
134 |     return result;
135 |   }
136 | 
137 |   const double operator*(const GslVectorBase& x) const {
138 |     double result;
139 |     assert(ptr_ != NULL);
140 |     gsl_blas_ddot(ptr_, x.ptr(), &result);
141 |     return result;
142 |   }
143 | 
144 |   GslVectorBase& operator+=(const double x) {
145 |     for (size_t ii = 0; ii < ptr_->size; ++ii) {
146 |       gsl_vector_set(ptr_, ii, gsl_vector_get(ptr_, ii) + x);
147 |     }
148 |     return *this;
149 |   }
150 | 
151 |   GslVectorBase& operator-=(const double x) {
152 |     for (size_t ii = 0; ii < ptr_->size; ++ii) {
153 |       gsl_vector_set(ptr_, ii, gsl_vector_get(ptr_, ii) - x);
154 |     }
155 |     return *this;
156 |   }
157 | 
158 |   GslVectorBase& operator=(const gsl_vector* x) {
159 |     assert(ptr_ != NULL);
160 |     gsl_vector_memcpy(ptr_, x);
161 | 
162 |     return *this;
163 |   }
164 | 
165 |   GslVectorBase& operator=(const GslVectorBase& x) {
166 |     assert(ptr_ != NULL);
167 |     return *this = x.ptr();
168 |   }
169 | 
170 |   GslVectorBase& operator=(const double v) {
171 |     if (v == 0.0) {
172 |       SetZero();
173 |     } else {
174 |       SetAll(v);
175 |     }
176 |     return *this;
177 |   }
178 | 
179 |   void SetZero() {
180 |     assert(ptr_ != NULL);
181 |     gsl_vector_set_zero(ptr_);
182 |   }
183 | 
184 |   void SetAll(const double v) {
185 |     assert(ptr_ != NULL);
186 |     gsl_vector_set_all(ptr_, v);
187 |   }
188 | 
189 |   int Fprintf(FILE* stream, const char* format) const {
190 |     assert(ptr_ != NULL);
191 |     return gsl_vector_fprintf(stream, ptr_, format);
192 |   }
193 | 
194 |   int Fscanf(FILE* stream) {
195 |     assert(ptr_ != NULL);
196 |     return gsl_vector_fscanf(stream, ptr_);
197 |   }
198 | 
199 |   const gsl_vector* ptr() const { return ptr_; }
200 |   gsl_vector* mutable_ptr() { return ptr_; }
201 | 
202 |  protected:
203 |   GslVectorBase() : ptr_(NULL) { }
204 |   gsl_vector* ptr_;
205 | 
206 |  private:
207 |   GslVectorBase(const GslVectorBase&) { }
208 | };
209 | 
210 | class GslVector : public GslVectorBase {
211 |  public:
212 |   GslVector(const size_t size) : GslVectorBase() {
213 |     Allocate(size);
214 |   }
215 | 
216 |   GslVector() : GslVectorBase() {
217 |   }
218 | 
219 |   ~GslVector() {
220 |     if (ptr_ != NULL) {
221 |       gsl_vector_free(ptr_);
222 |     }
223 |   }
224 | 
225 |   void Reset(gsl_vector* val) {
226 |     if (ptr_ != NULL) {
227 |       gsl_vector_free(ptr_);
228 |     }
229 |     ptr_ = val;
230 |   }
231 | 
232 |   void Allocate(const size_t size) {
233 |     assert(ptr_ == NULL);
234 |     ptr_ = gsl_vector_alloc(size);
235 |   }
236 | 
237 |   GslVectorBase& operator=(const gsl_vector* x) {
238 |     GslVectorBase::operator=(x);
239 |     return *this;
240 |   }
241 | 
242 |   GslVectorBase& operator=(const GslVectorBase& x) {
243 |     GslVectorBase::operator=(x);
244 |     return *this;
245 |   }
246 | 
247 |   GslVectorBase& operator=(const double v) {
248 |     GslVectorBase::operator=(v);
249 |     return *this;
250 |   }
251 |  private:
252 |   GslVector(const GslVector&) { }
253 | };
254 | 
255 | class GslMatrixRow : public GslVectorBase {
256 |  public:
257 |   GslMatrixRow(GslMatrix& matrix, const size_t row) :
258 |    view_(gsl_matrix_row(matrix.mutable_ptr(), row)) {    
259 |      ptr_ = &view_.vector;
260 |   }
261 | 
262 |   GslMatrixRow(gsl_matrix* matrix, const size_t row) :
263 |    view_(gsl_matrix_row(matrix, row)) {    
264 |      ptr_ = &view_.vector;
265 |   }
266 | 
267 |   GslVectorBase& operator=(const gsl_vector* x) {
268 |     GslVectorBase::operator=(x);
269 |     return *this;
270 |   }
271 | 
272 |   GslVectorBase& operator=(const GslVectorBase& x) {
273 |     GslVectorBase::operator=(x);
274 |     return *this;
275 |   }
276 | 
277 |   GslVectorBase& operator=(const double v) {
278 |     GslVectorBase::operator=(v);
279 |     return *this;
280 |   }
281 |  private:
282 |   gsl_vector_view view_;
283 |   GslMatrixRow(const GslMatrixRow&) { }
284 | };
285 | 
286 | class GslMatrixColumn : public GslVectorBase {
287 |  public:
288 |   GslMatrixColumn(GslMatrix& matrix, const size_t col) :
289 |    view_(gsl_matrix_column(matrix.mutable_ptr(), col)) {    
290 |      ptr_ = &view_.vector;
291 |   }
292 | 
293 |   GslMatrixColumn(gsl_matrix* matrix, const size_t col) :
294 |    view_(gsl_matrix_column(matrix, col)) {    
295 |      ptr_ = &view_.vector;
296 |   }
297 | 
298 |   GslVectorBase& operator=(const gsl_vector* x) {
299 |     GslVectorBase::operator=(x);
300 |     return *this;
301 |   }
302 | 
303 |   GslVectorBase& operator=(const GslVectorBase& x) {
304 |     GslVectorBase::operator=(x);
305 |     return *this;
306 |   }
307 | 
308 |   GslVectorBase& operator=(const double v) {
309 |     GslVectorBase::operator=(v);
310 |     return *this;
311 |   }
312 |  private:
313 |   gsl_vector_view view_;
314 |   GslMatrixColumn(const GslMatrixColumn&) { }
315 | };
316 | 
317 | class GslMatrixDiagonal : public GslVectorBase {
318 |  public:
319 |   GslMatrixDiagonal(GslMatrix& matrix) :
320 |    view_(gsl_matrix_diagonal(matrix.mutable_ptr())) {    
321 |      ptr_ = &view_.vector;
322 |   }
323 | 
324 |   GslMatrixDiagonal(gsl_matrix* matrix) :
325 |    view_(gsl_matrix_diagonal(matrix)) {    
326 |      ptr_ = &view_.vector;
327 |   }
328 | 
329 |   GslVectorBase& operator=(const gsl_vector* x) {
330 |     GslVectorBase::operator=(x);
331 |     return *this;
332 |   }
333 | 
334 |   GslVectorBase& operator=(const GslVectorBase& x) {
335 |     GslVectorBase::operator=(x);
336 |     return *this;
337 |   }
338 | 
339 |   GslVectorBase& operator=(const double v) {
340 |     GslVectorBase::operator=(v);
341 |     return *this;
342 |   }
343 |  private:
344 |   gsl_vector_view view_;
345 |   GslMatrixDiagonal(const GslMatrixDiagonal&) { }
346 | };
347 | 
348 | class GslSubvector : public GslVectorBase {
349 |  public:
350 |  GslSubvector(GslVectorBase& vector, size_t i, size_t n) :
351 |   view_(gsl_vector_subvector(vector.mutable_ptr(), i, n)) {    
352 |      ptr_ = &view_.vector;
353 |   }
354 | 
355 |   GslVectorBase& operator=(const gsl_vector* x) {
356 |     GslVectorBase::operator=(x);
357 |     return *this;
358 |   }
359 | 
360 |   GslVectorBase& operator=(const GslVectorBase& x) {
361 |     GslVectorBase::operator=(x);
362 |     return *this;
363 |   }
364 | 
365 |   GslVectorBase& operator=(const double v) {
366 |     GslVectorBase::operator=(v);
367 |     return *this;
368 |   }
369 |  private:
370 |   gsl_vector_view view_;
371 |   GslSubvector(const GslSubvector&) { }
372 | };
373 | 
374 | #endif  // __MATH_GSL_VECTOR__
375 | 
376 | 


--------------------------------------------------------------------------------
/lib/math/vectorops.h:
--------------------------------------------------------------------------------
  1 | #ifndef __MATH_VECTOROPS_INCLUDED
  2 | #define __MATH_VECTOROPS_INCLUDED
  3 | 
#include <cassert>
#include <cmath>
#include <limits>

#include <gsl/gsl_blas.h>
#include <gsl/gsl_cblas.h>
#include <gsl/gsl_errno.h>
#include <gsl/gsl_matrix.h>
#include <gsl/gsl_vector.h>

// Resolved from an unmerged conflict: the sibling header is included by its
// same-directory name, which resolves regardless of the include path.
#include "specialfunc.h"
 15 | 
 16 | #ifndef M_PI
 17 | #define M_PI        3.14159265358979323846
 18 | #endif
 19 | 
 20 | #ifndef isnan
 21 | # define isnan(x) ((x) != (x))
 22 | #endif
 23 | 
 24 | /* 
 25 |  * take the exponent of a vector
 26 |  *
 27 |  * If the exponent is infinite, then we replace the value with a
 28 |  * suitably large max_val
 29 |  */
 30 | void vexp(const gsl_vector* v, 
 31 |           gsl_vector* exp_v, 
 32 |           double max_val = std::numeric_limits<double>::infinity()) {
 33 |   assert(exp_v->size >= v->size);
 34 |   for (unsigned int ii = 0; ii < v->size; ++ii) {
 35 |     double val = exp(gsl_vector_get(v, ii));
 36 |     if (val == std::numeric_limits<double>::infinity() || val > max_val) {
 37 |       val = max_val;
 38 |     }
 39 |     gsl_vector_set(exp_v, ii, val);
 40 |   }
 41 | }
 42 | 
 43 | /* take the exponent of a matrix */
 44 | void mexp(const gsl_matrix* m, gsl_matrix* exp_m) {
 45 |   for (unsigned int ii = 0; ii < m->size1; ++ii) {
 46 |     for (unsigned int jj = 0; jj < m->size2; ++jj) {
 47 |       double val = exp(gsl_matrix_get(m, ii, jj));
 48 |       assert(!isnan(val));
 49 |       gsl_matrix_set(exp_m, ii, jj, val);
 50 |     }
 51 |   }
 52 | }
 53 | 
 54 | /* like vexp except that it also computes sum x log x */
 55 | double vexp_entropy(const gsl_vector* v, gsl_vector* exp_v) {
 56 |   double entropy = 0.0;
 57 |   for (unsigned int ii = 0; ii < v->size; ++ii) {
 58 |     double logval = gsl_vector_get(v, ii);
 59 |     double val = exp(logval);
 60 |     assert(!isnan(val));
 61 |     gsl_vector_set(exp_v, ii, val);
 62 |     if (val != 0) {
 63 |       entropy -= val * logval;
 64 |     }
 65 |   }
 66 |   return entropy;
 67 | }
 68 | 
 69 | double ventropy(const gsl_vector* v) {
 70 |   double entropy = 0.0;
 71 |   for (unsigned int ii = 0; ii < v->size; ++ii) {
 72 |     double val = gsl_vector_get(v, ii);
 73 |     if (val != 0) {
 74 |       entropy -= val * log(val);
 75 |     }
 76 |   }
 77 |   return entropy;
 78 | }
 79 | 
/*
 * Log-gamma function computed from a ten-term asymptotic series.
 *
 * Returns 1e308 for x <= 0 and exactly 0 for x == 1 or x == 2.  For
 * x <= 7 the argument is first shifted up by n = (int)(7 - x), the series
 * is evaluated at the shifted point x0, and the result is then walked back
 * down by subtracting log(x0 - 1) n times.
 */
double lgamma(double x) {
    double x0,x2,xp,gl,gl0;
    int n=0,k=0;
    /* series coefficients (NOTE(review): these look like the Stirling
       series terms 1/12, -1/360, 1/1260, ... -- confirm) */
    static double a[] = {
        8.333333333333333e-02,
       -2.777777777777778e-03,
        7.936507936507937e-04,
       -5.952380952380952e-04,
        8.417508417508418e-04,
       -1.917526917526918e-03,
        6.410256410256410e-03,
       -2.955065359477124e-02,
        1.796443723688307e-01,
       -1.39243221690590};
    
    x0 = x;
    if (x <= 0.0) return 1e308;
    else if ((x == 1.0) || (x == 2.0)) return 0.0;
    else if (x <= 7.0) {
        /* shift small arguments up before evaluating the series */
        n = (int)(7-x);
        x0 = x+n;
    }
    x2 = 1.0/(x0*x0);
    xp = 2.0*M_PI;
    /* Horner evaluation of the polynomial in 1/x0^2 */
    gl0 = a[9];
    for (k=8;k>=0;k--) {
        gl0 = gl0*x2 + a[k];
    }
    gl = gl0/x0+0.5*log(xp)+(x0-0.5)*log(x0)-x0;
    if (x <= 7.0) {
        /* undo the shift: subtract log(x0 - 1) once per unit step */
        for (k=1;k<=n;k++) {
            gl -= log(x0-1.0);
            x0 -= 1.0;
        }
    }
    return gl;
}
117 | 
118 | void mlog(const gsl_matrix* m, gsl_matrix* log_m) {
119 |   for (unsigned int ii = 0; ii < m->size1; ++ii) {
120 |     for (unsigned int jj = 0; jj < m->size2; ++jj) {
121 |       gsl_matrix_set(log_m, ii, jj, log(gsl_matrix_get(m, ii, jj)));
122 |     }
123 |   }
124 | }
125 | 
126 | void vlog(const gsl_vector* v, gsl_vector* log_v) {
127 |   for (unsigned int ii = 0; ii < v->size; ++ii)
128 |     gsl_vector_set(log_v, ii, log(gsl_vector_get(v, ii)));
129 | }
130 | 
131 | void vlogit(const gsl_vector* v, gsl_vector* log_v) {
132 |   for (unsigned int ii = 0; ii < v->size; ++ii) {
133 |     double p = gsl_vector_get(v, ii);
134 |     assert(p >= 0.0);
135 |     assert(p <= 1.0);
136 |     gsl_vector_set(log_v, ii, log(p / (1 - p)));
137 |   }
138 | }
139 | 
140 | 
141 | void vsigmoid(const gsl_vector* v, gsl_vector* sig_v) {
142 |   for (unsigned int ii = 0; ii < v->size; ++ii) {
143 |     double p = gsl_vector_get(v, ii);
144 |     gsl_vector_set(sig_v, ii, 1. / (1. + exp(-p)));
145 |   }
146 | }
147 | 
148 | 
149 | double vlog_entropy(const gsl_vector* v, gsl_vector* log_v) {
150 |   double entropy = 0;
151 |   for (unsigned int ii = 0; ii < v->size; ++ii) {
152 |     double val = gsl_vector_get(v, ii);
153 |     entropy -= val * log(val);
154 |     gsl_vector_set(log_v, ii, log(val));
155 |   }
156 |   return entropy;
157 | }
158 | 
159 | double entropy(const gsl_vector* v) {
160 |   double entropy = 0;
161 |   for (unsigned int ii = 0; ii < v->size; ++ii) {
162 |     double val = gsl_vector_get(v, ii);
163 |     entropy -= val * log(val);
164 |   }
165 |   return entropy;
166 | }
167 | 
168 | void vdigamma(const gsl_vector* v, gsl_vector* digamma_v) {
169 |   for (unsigned int ii = 0; ii < v->size; ++ii)
170 |     // gsl_sf_psi throws an error when its argument is 0, whereas digamma returns inf.
171 |     //    gsl_vector_set(digamma_v, ii, gsl_sf_psi(gsl_vector_get(v, ii)));
172 |     gsl_vector_set(digamma_v, ii, digamma(gsl_vector_get(v, ii)));
173 | }
174 | 
175 | void vlgamma(const gsl_vector* v, gsl_vector* lgamma_v) {
176 |   for (unsigned int ii = 0; ii < v->size; ++ii)
177 |     gsl_vector_set(lgamma_v, ii, lgamma(gsl_vector_get(v, ii)));
178 | }
179 | 
180 | double gsl_blas_dsum(const gsl_vector* v) {
181 |   double sum = 0;
182 |   for (unsigned int ii = 0; ii < v->size; ++ii) {
183 |     sum += gsl_vector_get(v, ii);
184 |   }
185 |   return sum;
186 | }
187 | 
188 | double gsl_blas_dsum(const gsl_matrix* v) {
189 |   double sum = 0;
190 |   for (unsigned int ii = 0; ii < v->size1; ++ii) {
191 |     for (unsigned int jj = 0; jj < v->size2; ++jj) {
192 |       sum += gsl_matrix_get(v, ii, jj);
193 |     }
194 |   }
195 |   return sum;
196 | }
197 | 
198 | double gsl_matrix_rowsum(const gsl_matrix* m, const int row) {
199 |   double sum = 0;
200 |   for(unsigned int i=0; i < m->size2; i++) {
201 |     sum += gsl_matrix_get(m, row, i);
202 |   }
203 |   return sum;
204 | }
205 | 
206 | double dot_product(const gsl_vector* a, const gsl_vector* b) {
207 |   assert(a->size == b->size);
208 |   double val = 0;
209 |   for(unsigned i=0; i<a->size; i++) {
210 |     val += gsl_vector_get(a, i) * gsl_vector_get(b, i);
211 |   }
212 |   return val;
213 | }
214 | 
215 | void uniform(gsl_vector* v) {
216 | 	gsl_vector_set_all(v, 1.0 / (double)v->size);
217 | }
218 | 
219 | double normalize(gsl_vector* v) {
220 |   double sum = gsl_blas_dsum(v);
221 |   gsl_blas_dscal(1 / sum, v);
222 |   return sum;
223 | }
224 | 
225 | /*
226 |   This function takes as input a multinomial parameter vector and
227 |   computes the "total" variance, i.e., the sum of the diagonal of the
228 |   covariance matrix.  
229 | 
230 |   If the multinomial parameter is unnormalized, then the variance of
231 |   the normalized multinomial vector will be computed and then
232 |   multiplied by the scale of the vector.
233 |  */
234 | double MultinomialTotalVariance(const gsl_vector* v) {
235 |   double scale = gsl_blas_dsum(v);
236 |   double variance = 0.0;
237 |   for (size_t ii = 0; ii < v->size; ++ii) {
238 |     double val = gsl_vector_get(v, ii) / scale;
239 |     variance += val * (1. - val);
240 |   }
241 |   return variance * scale;
242 | }
243 | 
244 | /*
245 |   Computes covariance using the renormalization above and adds it to
246 |   an existing matrix.
247 | */
248 | void MultinomialCovariance(double alpha,
249 | 			   const gsl_vector* v, 
250 | 			   gsl_matrix* m) {
251 |   double scale = gsl_blas_dsum(v);
252 |   gsl_blas_dger(-alpha / scale, v, v, m);
253 |   gsl_vector_view diag = gsl_matrix_diagonal(m);
254 |   gsl_blas_daxpy(alpha, v, &diag.vector);
255 | }
256 | 
257 | double MatrixProductSum(const gsl_matrix* m1,
258 | 			const gsl_matrix* m2) {
259 |   double val = 0;
260 |   assert(m1->size1 == m2->size1);
261 |   assert(m1->size2 == m2->size2);
262 |   for (size_t ii = 0; ii < m1->size1; ++ii) {
263 |     for (size_t jj = 0; jj < m2->size2; ++jj) {
264 |       val += gsl_matrix_get(m1, ii, jj) * 
265 | 	gsl_matrix_get(m2, ii, jj);
266 |     }
267 |   }
268 |   return val;
269 | }
270 | 
271 | double MatrixProductProductSum(const gsl_matrix* m1,
272 | 			       const gsl_matrix* m2,
273 | 			       const gsl_matrix* m3) {
274 |   double val = 0;
275 |   assert(m1->size1 == m2->size1);
276 |   assert(m1->size2 == m2->size2);
277 |   assert(m1->size1 == m3->size1);
278 |   assert(m1->size2 == m3->size2);
279 |   for (size_t ii = 0; ii < m1->size1; ++ii) {
280 |     for (size_t jj = 0; jj < m2->size2; ++jj) {
281 |       for (size_t kk = 0; kk < m3->size2; ++kk) {
282 | 	val += gsl_matrix_get(m1, ii, jj) * 
283 | 	  gsl_matrix_get(m2, ii, jj) *
284 | 	  gsl_matrix_get(m3, ii, jj);
285 |       }
286 |     }
287 |   }
288 |   return val;
289 | }
290 | 
291 | double SumLGamma(const gsl_vector* v) {
292 |   double s = 0.0;
293 |   for (size_t ii = 0; ii < v->size; ++ii) {
294 |     s += lgamma(gsl_vector_get(v, ii));
295 |   }
296 |   return s;
297 | }
298 | 
299 | void mtx_fprintf(const char* filename, const gsl_matrix * m)
300 | {
301 |     FILE* fileptr;
302 |     fileptr = fopen(filename, "w");
303 |     gsl_matrix_fprintf(fileptr, m, "%20.17e");
304 |     fclose(fileptr);
305 | }
306 | 
307 | 
308 | void mtx_fscanf(const char* filename, gsl_matrix * m)
309 | {
310 |     FILE* fileptr;
311 |     fileptr = fopen(filename, "r");
312 |     gsl_matrix_fscanf(fileptr, m);
313 |     fclose(fileptr);
314 | }
315 | 
316 | double mtx_accum(const int i,
317 |                  const int j,
318 |                  const double contribution,
319 |                  gsl_matrix* m) {
320 | 
321 |   double new_val = gsl_matrix_get(m, i, j) + contribution;
322 |   gsl_matrix_set(m, i, j, new_val);
323 |   return new_val;
324 | }
325 | 
326 | void vct_fscanf(const char* filename, gsl_vector* v)
327 | {
328 |     FILE* fileptr;
329 |     fileptr = fopen(filename, "r");
330 |     gsl_vector_fscanf(fileptr, v);
331 |     fclose(fileptr);
332 | }
333 | 
334 | void vct_fprintf(const char* filename, gsl_vector* v)
335 | {
336 |     FILE* fileptr;
337 |     fileptr = fopen(filename, "w");
338 |     gsl_vector_fprintf(fileptr, v, "%20.17e");
339 |     fclose(fileptr);
340 | }
341 | 
342 | #endif
343 | 


--------------------------------------------------------------------------------
/dtm/main.c:
--------------------------------------------------------------------------------
  1 | // Authors: David Blei (blei@cs.princeton.edu)
  2 | //          Sean Gerrish (sgerrish@cs.princeton.edu)
  3 | //
  4 | // Copyright 2011 Sean Gerrish and David Blei
  5 | // All Rights Reserved.
  6 | //
  7 | // See the README for this package for details about modifying or
  8 | // distributing this software.
  9 | 
 10 | #include <gflags/gflags.h>
 11 | 
 12 | #include "main.h"
 13 | 
 14 | DEFINE_string(mode,
 15 | 	      "fit",
 16 |               "The function to perform. "
 17 | 	      "Can be fit, est, or time.");
 18 | DEFINE_string(model,
 19 | 	      "dtm",
 20 |               "The function to perform. "
 21 | 	      "Can be dtm or dim.");
 22 | DEFINE_string(corpus_prefix,
 23 | 	      "",
 24 |               "The function to perform. "
 25 | 	      "Can be dtm or dim.");
 26 | DEFINE_string(lda_model_prefix,
 27 | 	      "",
 28 |               "The name of a fit model to be "
 29 | 	      "used for testing likelihood.  Appending \"info.dat\" "
 30 | 	      "to this should give the name of the file.");
 31 | DEFINE_int32(heldout_time,
 32 | 	     -1,
 33 | 	     "A time up to (but not including) which we wish to train, "
 34 | 	     "and at which we wish to test.");
 35 | DEFINE_string(output_table, "", "");
 36 | DEFINE_string(params_file,
 37 | 	      "settings.txt",
 38 |               "A file containing parameters for this run.");
 39 | DEFINE_bool(initialize_lda,
 40 | 	    false,
 41 | 	    "If true, initialize the model with lda.");
 42 | 
 43 | DEFINE_string(outname, "", "");
 44 | DEFINE_double(top_obs_var, 0.5, "");
 45 | DEFINE_double(top_chain_var, 0.005, "");
 46 | DEFINE_double(alpha, -10.0, "");
 47 | DEFINE_double(ntopics, -1.0, "");
 48 | DEFINE_int32(lda_max_em_iter, 20, "");
 49 | DEFINE_string(heldout_corpus_prefix, "", "");
 50 | DEFINE_int32(start, -1, "");
 51 | DEFINE_int32(end, -1, "");
 52 | 
 53 | extern int LDA_INFERENCE_MAX_ITER;
 54 | 
 55 | /*
 56 |  * read the parameters
 57 |  *
 58 |  * !!! use the cleaner functions in params.h
 59 |  *
 60 |  */
 61 | 
 62 | 
 63 | /*
 64 |  * fit a model from data
 65 |  *
 66 |  */
 67 | 
 68 | void fit_dtm(int min_time, int max_time)
 69 | {
 70 |     char name[400];
 71 | 
 72 |     // make the directory for this fit
 73 |     char run_dir[400];
 74 |     sprintf(run_dir, "%s/", FLAGS_outname.c_str());
 75 |     if (!directory_exist(run_dir)) {
 76 |       make_directory(run_dir);
 77 |     }
 78 | 
 79 |     // initialize (a few iterations of LDA fitting)
 80 |     outlog("%s","### INITIALIZING MODEL FROM LDA ###\n");
 81 | 
 82 |     printf("data file: %s\n", FLAGS_corpus_prefix.c_str());
 83 |     corpus_t* initial_lda_data = read_corpus(FLAGS_corpus_prefix.c_str());
 84 | 
 85 |     gsl_matrix* topics_ss;
 86 |     // !!! make this an option
 87 |     if (FLAGS_initialize_lda) {
 88 |       lda* lda_model = new_lda_model(FLAGS_ntopics, initial_lda_data->nterms);
 89 |       gsl_vector_set_all(lda_model->alpha, FLAGS_alpha);
 90 |       
 91 |       lda_suff_stats* lda_ss = new_lda_suff_stats(lda_model);
 92 |       // initialize_lda_ss_from_data(initial_lda_data, lda_ss);
 93 |       initialize_lda_ss_from_random(initial_lda_data, lda_ss);
 94 |       // sgerrish: Why do we only define the topics once?
 95 |       lda_m_step(lda_model, lda_ss);
 96 |       
 97 |       sprintf(name, "%s/initial-lda", run_dir);
 98 |       // TODO(sgerrish): Fix this.  This was originally hardcoded to 1.
 99 |       LDA_INFERENCE_MAX_ITER = 25;
100 |       lda_em(lda_model, lda_ss, initial_lda_data, FLAGS_lda_max_em_iter, name);
101 |       sprintf(name, "%s/initial-lda-ss.dat", run_dir);
102 |       
103 |       write_lda_suff_stats(lda_ss, name);
104 |       topics_ss = lda_ss->topics_ss;
105 |     } else {
106 |       printf("loading %d terms..\n", initial_lda_data->nterms);
107 |       topics_ss = gsl_matrix_calloc(initial_lda_data->nterms, FLAGS_ntopics);
108 |       sprintf(name, "%s/initial-lda-ss.dat", FLAGS_outname.c_str());
109 |       mtx_fscanf(name, topics_ss);
110 |     }
111 | 
112 |     printf("fitting.. \n");
113 |     // estimate dynamic topic model
114 | 
115 |     outlog("\n%s\n","### FITTING DYNAMIC TOPIC MODEL ###");
116 | 
117 |     corpus_seq_t* data_full = read_corpus_seq(FLAGS_corpus_prefix.c_str());
118 | 
119 |     corpus_seq_t* data_subset;
120 |     if (max_time >= 0) {
121 |       // We are training on a subset of the data.
122 |       assert(max_time > min_time
123 | 	     && min_time >= 0
124 | 	     && max_time < data_full->len);
125 |       data_subset = (corpus_seq_t*) malloc(sizeof(corpus_seq_t));
126 |       data_subset->len = max_time - min_time + 1;
127 |       data_subset->nterms = data_full->nterms;
128 |       data_subset->corpus = (corpus_t**) malloc(
129 |         sizeof(corpus_t*) * data_subset->len);
130 |       int max_nterms = 0;
131 |       int ndocs = 0;
132 |       for (int i=min_time; i < max_time; ++i) {
133 | 	corpus_t* corpus = data_full->corpus[i];
134 | 	max_nterms = max_nterms > corpus->nterms ? max_nterms : corpus->nterms;
135 | 	data_subset->corpus[i - min_time] = corpus;
136 | 	ndocs += corpus->ndocs;
137 |       }
138 |       data_subset->max_nterms = max_nterms;
139 |       data_subset->ndocs = ndocs;
140 |     } else {
141 |       // Use the entire dataset.
142 |       data_subset = data_full;
143 |     }
144 |     
145 |     lda_seq* model_seq = new_lda_seq(data_subset,
146 | 				     data_subset->nterms,
147 | 				     data_subset->len,
148 | 				     FLAGS_ntopics);
149 |     init_lda_seq_from_ss(model_seq,
150 |                          FLAGS_top_chain_var,
151 |                          FLAGS_top_obs_var,
152 |                          FLAGS_alpha,
153 |                          topics_ss);
154 | 
155 |     fit_lda_seq(model_seq, data_subset, NULL, run_dir);
156 | 
157 |     if (max_time < 0) {
158 |       return;
159 |     }
160 | 
161 |     // Now find the posterior likelihood of the next time slice
162 |     // using the most-recently-known time slice.
163 |     lda* lda_model = new_lda_model(model_seq->ntopics, model_seq->nterms);
164 |     make_lda_from_seq_slice(lda_model, model_seq, max_time - 1);
165 | 
166 |     lda_post post;
167 |     int max_nterms = compute_max_nterms(data_full);
168 |     post.phi = gsl_matrix_calloc(max_nterms, model_seq->ntopics);
169 |     post.log_phi = gsl_matrix_calloc(max_nterms, model_seq->ntopics);
170 |     post.gamma = gsl_vector_calloc(model_seq->ntopics);
171 |     post.lhood = gsl_vector_calloc(model_seq->ntopics);
172 |     post.model = lda_model;
173 |     post.doc_weight = NULL;
174 | 
175 |     int d;
176 |     double* table = (double*) malloc(sizeof(double) * data_full->corpus[max_time]->ndocs);
177 | 
178 |     for (d = 0; d < data_full->corpus[max_time]->ndocs; d++)
179 |       {
180 | 	post.doc = data_full->corpus[max_time]->doc[d];
181 | 	table[d] = fit_lda_post(d, max_time, &post, NULL, NULL,
182 | 				NULL, NULL, NULL);
183 |       }
184 |     char tmp_string[400];
185 |     sprintf(tmp_string, "%s-heldout_post_%d.dat", FLAGS_outname.c_str(),
186 | 	    max_time);
187 |     FILE* post_file = fopen(tmp_string, "w");
188 |     for (int d = 0; d < data_full->corpus[max_time]->ndocs; ++d)
189 |       {
190 | 	fprintf(post_file, "%f\n", table[d]);
191 |       }
192 | }
193 | 
194 | /*
195 |  * main function
196 |  *
197 |  * Dispatches on FLAGS_mode: "sums" prints the summed document totals of
198 |  * each time slice, "fit" runs fit_dtm(), and "time" writes per-document
199 |  * log likelihoods.  Returns 1 if the "time" output cannot be opened.
200 |  */
201 | int main(int argc, char* argv[])
202 | {
203 |     // Initialize the flag objects (the FLAGS_* variables read below are
204 |     // filled in from the command line by this call).
205 |     google::ParseCommandLineFlags(&argc, &argv, 0);
206 | 
207 |     // usage: main (sums corpus_sequence|fit param|time params)
208 | 
209 |     // mode for spitting out document sums
210 |     if (FLAGS_mode == "sums")
211 |     {
212 |         corpus_seq_t* c = read_corpus_seq(FLAGS_corpus_prefix.c_str());
213 |         outlog("Tried to read corpus %s", FLAGS_corpus_prefix.c_str());
214 |         // print one total per time slice, each followed by a blank line
215 |         for (int t = 0; t < c->len; t++)
216 |         {
217 |             int sum = 0;
218 |             for (int d = 0; d < c->corpus[t]->ndocs; d++)
219 |             {
220 |                 sum += c->corpus[t]->doc[d]->total;
221 |             }
222 |             printf("%d\n\n", sum);
223 |         }
224 |     }
225 | 
226 |     // mode for fitting a dynamic topic model
227 | 
228 |     if (FLAGS_mode == "fit") {
229 |         fit_dtm(0, FLAGS_heldout_time - 1);
230 |     }
231 | 
232 |     // mode for analyzing documents through time according to a DTM
233 | 
234 |     if (FLAGS_mode == "time")
235 |     {
236 |         // load corpus and model based on information from the flags
237 |         corpus_seq_t* data = read_corpus_seq(FLAGS_heldout_corpus_prefix.c_str());
238 |         lda_seq* model = read_lda_seq(FLAGS_lda_model_prefix.c_str(), data);
239 | 
240 |         // initialize the table (TIME SLICES X DOCS)
241 |         double** table = (double**) malloc(sizeof(double*) * data->len);
242 |         for (int t = 0; t < data->len; t++)
243 |         {
244 |             table[t] = (double*) malloc(sizeof(double) * data->corpus[t]->ndocs);
245 |             for (int d = 0; d < data->corpus[t]->ndocs; d++)
246 |             {
247 |                 table[t][d] = -1;  // this should be NAN
248 |             }
249 |         }
250 | 
251 |         // set up the LDA model to be populated
252 |         lda* lda_model = new_lda_model(model->ntopics, model->nterms);
253 |         lda_post post;
254 |         int max_nterms = compute_max_nterms(data);
255 |         post.phi = gsl_matrix_calloc(max_nterms, model->ntopics);
256 |         post.log_phi = gsl_matrix_calloc(max_nterms, model->ntopics);
257 |         post.gamma = gsl_vector_calloc(model->ntopics);
258 |         post.lhood = gsl_vector_calloc(model->ntopics);
259 |         post.model = lda_model;
260 |         // The held-out code earlier in this file clears doc_weight as well;
261 |         // without this assignment the field would be left uninitialized.
262 |         post.doc_weight = NULL;
263 | 
264 |         // compute likelihoods for each model
265 |         for (int t = 0; t < data->len; t++) {
266 |             make_lda_from_seq_slice(lda_model, model, t);
267 |             for (int d = 0; d < data->corpus[t]->ndocs; d++) {
268 |                 post.doc = data->corpus[t]->doc[d];
269 |                 // return value unused: the table records doc->log_likelihood
270 |                 fit_lda_post(d, t, &post, model, NULL, NULL, NULL, NULL);
271 |                 table[t][d] = post.doc->log_likelihood;
272 |             }
273 |         }
274 | 
275 |         // write out table: one comma-separated line per time slice
276 |         char tmp_string[400];
277 |         snprintf(tmp_string, sizeof(tmp_string), "%s-heldout_post.dat",
278 |                  FLAGS_outname.c_str());
279 |         FILE* post_file = fopen(tmp_string, "w");
280 |         if (post_file == NULL) {
281 |             perror(tmp_string);
282 |             return(1);
283 |         }
284 |         for (int t = 0; t < data->len; ++t)
285 |         {
286 |             // An empty slice has no table[t][0]; it yields a bare newline.
287 |             if (data->corpus[t]->ndocs > 0) {
288 |                 fprintf(post_file, "%f", table[t][0]);
289 |             }
290 |             for (int d = 1; d < data->corpus[t]->ndocs; ++d)
291 |             {
292 |                 fprintf(post_file, ",%f", table[t][d]);
293 |             }
294 |             fprintf(post_file, "\n");
295 |         }
296 |         fclose(post_file);
297 |     }
298 | 
299 |     return(0);
300 | }
301 | 
302 | 
303 | 
304 | 


--------------------------------------------------------------------------------
/gslwrap/include/gslwrap/matrix_double.h:
--------------------------------------------------------------------------------
  1 | // matrix.h
  2 | 
  3 | //  This matrix class is a C++ wrapper for the GNU Scientific Library
  4 | //  Copyright (C) 2001 Ramin Nakisa
  5 | 
  6 | //  This program is free software; you can redistribute it and/or modify
  7 | //  it under the terms of the GNU General Public License as published by
  8 | //  the Free Software Foundation; either version 2 of the License, or
  9 | //  (at your option) any later version.
 10 | 
 11 | //  This program is distributed in the hope that it will be useful,
 12 | //  but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 | //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 14 | //  GNU General Public License for more details.
 15 | 
 16 | //  You should have received a copy of the GNU General Public License
 17 | //  along with this program; if not, write to the Free Software
 18 | //  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 19 | 
 20 | #if !defined( _matrix_double_h )
 21 | #define _matrix_double_h
 22 | 
 23 | #ifdef __HP_aCC //for aCC B3910B A.01.27
 24 | #include <iostream.h>
 25 | #include <fstream.h>
 26 | #include <iomanip.h>
 27 | #else //for gcc3
 28 | #include <iostream>
 29 | #include <fstream>
 30 | #include <iomanip>
 31 | #endif
 32 | 
 33 | #include <math.h>
 34 | #include <stdlib.h>
 35 | #include <assert.h>
 36 | ///
 37 | #include <gsl/gsl_math.h>
 38 | #include <gsl/gsl_matrix.h>
 39 | #include <gsl/gsl_linalg.h>
 40 | #include <gslwrap/permutation.h>
 41 | #include <gslwrap/vector_double.h>
 42 | 
 43 | #define type_is
 44 | #ifdef  type_is
 45 | #define type_is_double
 46 | #endif
 47 | 
 48 | namespace gsl
 49 | {
 50 | 
 51 | /// C++ wrapper class around a GSL matrix of doubles; the wrapped gsl_matrix pointer is the private member m.
 52 | class matrix
 53 | {
 54 | #ifdef type_is_double
 55 | 	friend class matrix_float;
 56 | 	friend class matrix_int;
 57 | #endif
 58 | public:
 59 | 	typedef double value_type;
 60 | 	typedef vector vector_type;
 61 | 
 62 | 	/// Default constructor (defined outside this header).
 63 | 	matrix();
 64 | 	/// Constructs a new_rows x new_cols matrix. NOTE(review): clear presumably requests zero-initialised storage -- confirm in the implementation.
 65 | 	matrix( size_t new_rows, size_t new_cols, bool clear = true );
 66 | 	/// Element-wise copy from any type that offers get_rows(), get_cols() and operator()(i,j); each value is cast to double. Copying an object onto itself is a no-op.
 67 | 	template<class oclass>
 68 | 	void copy(const oclass &other)
 69 | 	{
 70 | 		if ( static_cast<const void *>( this ) == static_cast<const void *>( &other ) )
 71 | 			return;
 72 | 		// set_dimensions is defined elsewhere; presumably it (re)allocates m to the requested size -- TODO confirm
 73 | 		set_dimensions( other.get_rows(), other.get_cols() );
 74 | 		for ( size_t i = 0; i < get_rows(); i++ ) 
 75 | 		{
 76 | 			for ( size_t j = 0; j < get_cols(); j++ ) 
 77 | 			{
 78 | 				gsl_matrix_set( m, i, j, (double)other(i,j));
 79 | 			}
 80 | 		}
 81 | 	}
 82 | /*    	template<>  */
 83 | /*    	void copy<matrix>(const matrix &other)  */
 84 | /*    	{  */
 85 | /*    		set_dimensions(other.size1(),other.size2());  */
 86 | /*    		gsl_matrix_memcpy( m, other.m );  */
 87 | /*    	}  */
 88 | 	// copy constructor for type matrix: starts with m == NULL and then delegates to copy()
 89 | 	matrix( const matrix &other ):m(NULL) {copy(other);}
 90 | 	/// Converting constructor from any matrix-like type accepted by copy(); also starts with m == NULL.
 91 | 	template<class oclass>
 92 | 	matrix( const oclass &other ):m(NULL) {copy(other);}
 93 | 
 94 | 	/// Destructor (defined outside this header).
 95 | 	~matrix();
 96 | 	///
 97 | //	matrix( const char *Filename );
 98 | 	/// Number of rows, read from m->size1.
 99 | 	size_t get_rows() const {return m->size1;}
100 | 	/// Number of columns, read from m->size2.
101 | 	size_t get_cols() const {return m->size2;}
102 | 	/// Same value as get_rows(), under the GSL field name.
103 | 	size_t size1() const {return m->size1;}
104 | 	/// Same value as get_cols(), under the GSL field name.
105 | 	size_t size2() const {return m->size2;}
106 |    
107 | 
108 | 	/// Defined outside this header; presumably stores the row and column counts in *num_rows and *num_cols.
109 | 	void dimensions( size_t *num_rows, size_t *num_cols ) const;
110 | 	/// Element access: get_element returns the value via gsl_matrix_get; operator() returns a reference obtained from gsl_matrix_ptr.
111 | double        get_element ( size_t row, size_t col ) const {return  gsl_matrix_get( m, row, col ) ;}
112 | 	const double &operator()( size_t row, size_t col ) const {return *gsl_matrix_ptr( m, row, col ) ;}
113 | double       &operator()( size_t row, size_t col )       {return *gsl_matrix_ptr( m, row, col ) ;}
114 | 	/// Stores v at (row, col) via gsl_matrix_set.
115 | 	void set_element( size_t row, size_t col, const double &v ){ gsl_matrix_set( m, row, col, v );}
116 | 	/// set_elements is defined outside this header; set_all and set_zero forward to gsl_matrix_set_all and gsl_matrix_set_zero.
117 | 	void set_elements( const double & new_value );
118 | 	void set_all ( const double & new_value ) {gsl_matrix_set_all ( m, new_value );}
119 | 	void set_zero() {gsl_matrix_set_zero( m );}
120 | 	/// Defined outside this header; copy() relies on it to give the matrix new_rows x new_cols dimensions.
121 | 	void set_dimensions( size_t new_rows, size_t new_cols );
122 | 	/// Reads the matrix from the named file (defined outside this header; the file format is not visible here).
123 | 	void load( const char *filename );
124 | 	/// Writes the matrix to the named file (defined outside this header; the file format is not visible here).
125 | 	void save( const char *filename ) const;
126 | 	/// Stream output operator (defined outside this header).
127 | 	friend ostream& operator<< ( ostream& os, const matrix& m );
128 | 	//This function writes the elements of the matrix m to the stream stream in binary format. The return value is 0 for success and GSL_EFAILED if there was a problem writing to the file. Since the data is written in the native binary format it may not be portable between different architectures.
129 | 	int fwrite (FILE * stream) const {return gsl_matrix_fwrite (stream, m);}
130 | 
131 | //This function reads into the matrix m from the open stream stream in binary format. The matrix m must be preallocated with the correct dimensions since the function uses the size of m to determine how many bytes to read. The return value is 0 for success and GSL_EFAILED if there was a problem reading from the file. The data is assumed to have been written in the native binary format on the same architecture. 
132 | 	int fread (FILE * stream) {return gsl_matrix_fread (stream, m);}
133 | 
134 |     /// File-name based counterparts of load() / save(); presumably binary, going by the names (defined outside this header).
135 | 	void load_binary( const char *filename );
136 | 	///
137 | 	void save_binary( const char *filename ) const;
138 | 	/// Equality comparison is defined outside this header; operator!= is its logical negation.
139 | 	bool operator==( const matrix &other ) const;
140 | 	bool operator!=( const matrix &other ) const {return !((*this)==other);}
141 | 	
142 | 	matrix& operator=( const matrix &other ) {copy( other );return *this;}
143 | 	/// converts from any other matrix type (delegates to copy())
144 | 	template<class omatrix>
145 | 	matrix &operator=( const omatrix& other )
146 | 	{
147 | 			copy(other);
148 | 			return *this;
149 | 	}
150 |    /// Arithmetic operators follow, all defined outside this header; the friend overloads take the scalar as the left operand.
151 | 	matrix operator+( const matrix &other ) const;
152 | 	///
153 | 	matrix operator+( const double &f ) const;
154 | 	///
155 | 	friend matrix operator+( const double &f, const matrix &other );
156 | 	///
157 | 	matrix &operator+=( const double &f );
158 | 	///
159 | 	matrix &operator+=( const matrix &other );
160 | 	///
161 | 	matrix operator-( const matrix &other ) const;
162 | 	///
163 | 	matrix operator-( const double &f ) const;
164 | 	///
165 | 	friend matrix operator-( const double &f, const matrix &other );
166 | 	///
167 | 	matrix &operator-=( const double &f );
168 | 	///
169 | 	matrix &operator-=( const matrix &other );
170 | 	/// NOTE(review): whether this is the matrix product or an element-wise product is not visible here -- check the implementation.
171 | 	matrix operator*( const matrix &other ) const;
172 | 	///
173 | 	matrix operator*( const double &f ) const;
174 | 	///
175 | 	friend matrix operator*( const double &f, const matrix &other );
176 | 	///
177 | 	matrix &operator*=( const double &f );
178 | 	///
179 | 	matrix &operator*=( const matrix &other );
180 | 	/// Division is offered for a scalar right-hand side only.
181 | 	matrix operator/( const double &) const;
182 | 	///
183 | 	matrix &operator/=( const double &);
184 | 	/// Returns a new matrix; presumably the transpose of this one (defined outside this header).
185 | 	matrix transpose() const;
186 | 	/// Public LU decomposition; perm and psign are optional (default NULL). Defined outside this header.
187 | 	matrix LU_decomp(gsl::permutation *perm=NULL,int *psign=NULL) const;
188 | 	/// Presumably the inverse computed through an LU decomposition (defined outside this header).
189 | 	matrix LU_invert() const;
190 | 
191 | 	// returns a copy of the rows [row_min, row_max) and columns [col_min, col_max) of *this (upper bounds not included!)
192 | 	matrix submatrix(size_t row_min, size_t row_max, size_t col_min, size_t col_max) const 
193 | 		{
194 | 			matrix m(row_max - row_min, col_max - col_min);
195 | 			for (size_t i = row_min ; i < row_max ; i++)
196 | 			{
197 | 				for (size_t j = col_min ; j < col_max ; j++)
198 | 				{
199 | 					m(i - row_min,j - col_min) = (*this)(i,j);
200 | 				}
201 | 			}
202 | 			return m;
203 | 		}
204 | private:
205 | 	/// Internal LU helper working on raw GSL objects passed through pointer parameters (defined outside this header).
206 | 	void LU_decomp( gsl_matrix **a,
207 | 					gsl_permutation **permutation,
208 | 					int *sign ) const;
209 | public:
210 | 	/** returns sum of all the matrix elements. */
211 |     double sum() const;
212 | 	/** returns logarithm of the determinant of the matrix. */
213 | 	double LU_lndet() const;
214 | 
215 | 
216 | 	/** returns a vector_view of a single row of the matrix. */
217 | 	vector_view       row( size_t rowindex );
218 | 	const vector_view row( size_t rowindex ) const ;
219 | 	/** returns a vector_view of a single column of the matrix. */
220 | 	vector_view       column( size_t colindex );
221 | 	const vector_view column( size_t colindex ) const;
222 | 	/** returns a vector_view of the diagonal elements of the matrix. */
223 | 	vector_view       diagonal();
224 | 	const vector_view diagonal() const;
225 | 
226 | 	/** returns a matrix containing a single row of the matrix (described as a "column matrix" originally; NOTE(review): confirm the orientation). */
227 | 	matrix get_row( size_t rowindex ) const;
228 | 	/** returns a column matrix containing a single column of the matrix. */
229 | 	matrix get_col( size_t colindex ) const;
230 | 	/** calculates sum of rows returned as a column matrix. */
231 | 	matrix row_sum() const;
232 | 	/** calculates sum of columns returned as a row matrix. */
233 | 	matrix column_sum() const;
234 | 	/** returns trace (diagonal sum) of a square matrix. */
235 | 	double trace() const;
236 | 	/** calculates cholesky decomposition of the matrix, returning success if matrix is positive definite. */
237 | 	int cholesky_decomp( matrix &a ) const;
238 | //  	/** returns index of nearest row in matrix to vector argument. */
239 | //  	int nearest_row_index( const matrix &v ) const;
240 | 	/** calculates covariance of the matrix columns. */
241 | 	matrix covariance() const;
242 | 	/** returns true if the matrix is square, false otherwise. */
243 | 	bool is_square() const;
244 | 	/** diag operator (sets the diagonal elements of the matrix to the elements of v). */
245 | 	void diag(const vector& v);
246 | 	/** set diagonal elements of a square matrix to f. */
247 | 	void set_diagonal( double f );
248 | 	/** sets matrix to a k dimensional unit matrix. */
249 | 	void identity( size_t k );
250 | 	/** returns sum of nth power of all elements. */
251 | 	double norm( double n ) const;
252 | 
253 | /*  Function: double gsl_matrix_max (const gsl_matrix * m)  */
254 | /*      This function returns the maximum value in the matrix m.  */
255 | 	double max() const {return gsl_matrix_max(m);}
256 | /*  Function: double gsl_matrix_min (const gsl_matrix * m)  */
257 | /*      This function returns the minimum value in the matrix m.  */
258 | 	double min()const{return gsl_matrix_min(m);}
259 | 
260 | 	/** This function returns 1 if all the elements of the matrix m are zero, and 0 otherwise. */
261 | 	bool isnull() const { return gsl_matrix_isnull(m);}
262 | /*  Function: void gsl_matrix_minmax (const gsl_matrix * m, double * min_out, double * max_out)  */
263 | /*      This function returns the minimum and maximum values in the matrix m, storing them in min_out and max_out.  */
264 | 
265 | /*  Function: void gsl_matrix_max_index (const gsl_matrix * m, size_t * imax, size_t * jmax)  */
266 | /*      This function returns the indices of the maximum value in the matrix m, storing them in imax and jmax. When there are several equal maximum elements then the first element found */
267 | /*      is returned.  */
268 | 
269 | /*  Function: void gsl_matrix_min_index (const gsl_matrix * m, size_t * imax, size_t * jmax)  */
270 | /*      This function returns the indices of the minimum value in the matrix m, storing them in imax and jmax. When there are several equal minimum elements then the first element found */
271 | /*      is returned.  */
272 | 
273 | /*  Function: void gsl_matrix_minmax_index (const gsl_matrix * m, size_t * imin, size_t * imax)  */
274 | /*      This function returns the indices of the minimum and maximum values in the matrix m, storing them in (imin,jmin) and (imax,jmax). When there are several equal minimum or */
275 | /*      maximum elements then the first elements found are returned.  */
276 | 
277 | 	/** for interfacing with the GSL C API: returns the wrapped gsl_matrix pointer after asserting that it is non-NULL. */
278 | /*  	gsl_matrix       *gslobj()       {if (!m){cout << "matrix::gslobj ERROR, data not initialized!! " << endl; exit(-1);}return m;} */
279 | /*  	const gsl_matrix *gslobj() const {if (!m){cout << "matrix::gslobj ERROR, data not initialized!! " << endl; exit(-1);}return m;} */
280 | 	gsl_matrix       *gslobj()       {assert(m);return m;}
281 | 	const gsl_matrix *gslobj() const {assert(m);return m;}
282 | private:
283 | 	/// The wrapped GSL matrix; the inline accessors above dereference it without a NULL check (only gslobj() asserts).
284 |    gsl_matrix *m;
285 | 
286 | };
287 | }
288 | #undef type_is
289 | #undef type_is_double
290 | 
291 | #endif // _matrix_double_h
292 | 


--------------------------------------------------------------------------------
/gslwrap/include/gslwrap/matrix_int.h:
--------------------------------------------------------------------------------
  1 | // matrix.h
  2 | 
  3 | //  This matrix class is a C++ wrapper for the GNU Scientific Library
  4 | //  Copyright (C) 2001 Ramin Nakisa
  5 | 
  6 | //  This program is free software; you can redistribute it and/or modify
  7 | //  it under the terms of the GNU General Public License as published by
  8 | //  the Free Software Foundation; either version 2 of the License, or
  9 | //  (at your option) any later version.
 10 | 
 11 | //  This program is distributed in the hope that it will be useful,
 12 | //  but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 | //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 14 | //  GNU General Public License for more details.
 15 | 
 16 | //  You should have received a copy of the GNU General Public License
 17 | //  along with this program; if not, write to the Free Software
 18 | //  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 19 | 
 20 | #if !defined( _matrix_int_h )
 21 | #define _matrix_int_h
 22 | 
 23 | #ifdef __HP_aCC //for aCC B3910B A.01.27
 24 | #include <iostream.h>
 25 | #include <fstream.h>
 26 | #include <iomanip.h>
 27 | #else //for gcc3
 28 | #include <iostream>
 29 | #include <fstream>
 30 | #include <iomanip>
 31 | #endif
 32 | 
 33 | #include <math.h>
 34 | #include <stdlib.h>
 35 | #include <assert.h>
 36 | ///
 37 | #include <gsl/gsl_math.h>
 38 | #include <gsl/gsl_matrix_int.h>
 39 | #include <gsl/gsl_linalg.h>
 40 | #include <gslwrap/permutation.h>
 41 | #include <gslwrap/vector_int.h>
 42 | 
 43 | #define type_is_int
 44 | #ifdef  type_is
 45 | #define type_is_double
 46 | #endif
 47 | 
 48 | namespace gsl
 49 | {
 50 | 
 51 | /// C++ wrapper class around a GSL matrix of ints; the wrapped gsl_matrix_int pointer is the private member m.
 52 | class matrix_int
 53 | {
 54 | #ifdef type_is_double
 55 | 	friend class matrix_float;
 56 | 	friend class matrix_int;
 57 | #endif
 58 | public:
 59 | 	typedef int value_type;
 60 | 	typedef vector_int vector_type;
 61 | 
 62 | 	/// Default constructor (defined outside this header).
 63 | 	matrix_int();
 64 | 	/// Constructs a new_rows x new_cols matrix. NOTE(review): clear presumably requests zero-initialised storage -- confirm in the implementation.
 65 | 	matrix_int( size_t new_rows, size_t new_cols, bool clear = true );
 66 | 	/// Element-wise copy from any type that offers get_rows(), get_cols() and operator()(i,j); each value is cast to int. Copying an object onto itself is a no-op.
 67 | 	template<class oclass>
 68 | 	void copy(const oclass &other)
 69 | 	{
 70 | 		if ( static_cast<const void *>( this ) == static_cast<const void *>( &other ) )
 71 | 			return;
 72 | 		// set_dimensions is defined elsewhere; presumably it (re)allocates m to the requested size -- TODO confirm
 73 | 		set_dimensions( other.get_rows(), other.get_cols() );
 74 | 		for ( size_t i = 0; i < get_rows(); i++ ) 
 75 | 		{
 76 | 			for ( size_t j = 0; j < get_cols(); j++ ) 
 77 | 			{
 78 | 				gsl_matrix_int_set( m, i, j, (int)other(i,j));
 79 | 			}
 80 | 		}
 81 | 	}
 82 | /*    	template<>  */
 83 | /*    	void copy<matrix_int>(const matrix_int &other)  */
 84 | /*    	{  */
 85 | /*    		set_dimensions(other.size1(),other.size2());  */
 86 | /*    		gsl_matrix_int_memcpy( m, other.m );  */
 87 | /*    	}  */
 88 | 	// copy constructor for type matrix_int: starts with m == NULL and then delegates to copy()
 89 | 	matrix_int( const matrix_int &other ):m(NULL) {copy(other);}
 90 | 	/// Converting constructor from any matrix-like type accepted by copy(); also starts with m == NULL.
 91 | 	template<class oclass>
 92 | 	matrix_int( const oclass &other ):m(NULL) {copy(other);}
 93 | 
 94 | 	/// Destructor (defined outside this header).
 95 | 	~matrix_int();
 96 | 	///
 97 | //	matrix_int( const char *Filename );
 98 | 	/// Number of rows, read from m->size1.
 99 | 	size_t get_rows() const {return m->size1;}
100 | 	/// Number of columns, read from m->size2.
101 | 	size_t get_cols() const {return m->size2;}
102 | 	/// Same value as get_rows(), under the GSL field name.
103 | 	size_t size1() const {return m->size1;}
104 | 	/// Same value as get_cols(), under the GSL field name.
105 | 	size_t size2() const {return m->size2;}
106 |    
107 | 
108 | 	/// Defined outside this header; presumably stores the row and column counts in *num_rows and *num_cols.
109 | 	void dimensions( size_t *num_rows, size_t *num_cols ) const;
110 | 	/// Element access: get_element returns the value via gsl_matrix_int_get; operator() returns a reference obtained from gsl_matrix_int_ptr.
111 | int        get_element ( size_t row, size_t col ) const {return  gsl_matrix_int_get( m, row, col ) ;}
112 | 	const int &operator()( size_t row, size_t col ) const {return *gsl_matrix_int_ptr( m, row, col ) ;}
113 | int       &operator()( size_t row, size_t col )       {return *gsl_matrix_int_ptr( m, row, col ) ;}
114 | 	/// Stores v at (row, col) via gsl_matrix_int_set.
115 | 	void set_element( size_t row, size_t col, const int &v ){ gsl_matrix_int_set( m, row, col, v );}
116 | 	/// set_elements is defined outside this header; set_all and set_zero forward to gsl_matrix_int_set_all and gsl_matrix_int_set_zero.
117 | 	void set_elements( const int & new_value );
118 | 	void set_all ( const int & new_value ) {gsl_matrix_int_set_all ( m, new_value );}
119 | 	void set_zero() {gsl_matrix_int_set_zero( m );}
120 | 	/// Defined outside this header; copy() relies on it to give the matrix new_rows x new_cols dimensions.
121 | 	void set_dimensions( size_t new_rows, size_t new_cols );
122 | 	/// Reads the matrix from the named file (defined outside this header; the file format is not visible here).
123 | 	void load( const char *filename );
124 | 	/// Writes the matrix to the named file (defined outside this header; the file format is not visible here).
125 | 	void save( const char *filename ) const;
126 | 	/// Stream output operator (defined outside this header).
127 | 	friend ostream& operator<< ( ostream& os, const matrix_int& m );
128 | 	//This function writes the elements of the matrix m to the stream stream in binary format. The return value is 0 for success and GSL_EFAILED if there was a problem writing to the file. Since the data is written in the native binary format it may not be portable between different architectures.
129 | 	int fwrite (FILE * stream) const {return gsl_matrix_int_fwrite (stream, m);}
130 | 
131 | //This function reads into the matrix m from the open stream stream in binary format. The matrix m must be preallocated with the correct dimensions since the function uses the size of m to determine how many bytes to read. The return value is 0 for success and GSL_EFAILED if there was a problem reading from the file. The data is assumed to have been written in the native binary format on the same architecture. 
132 | 	int fread (FILE * stream) {return gsl_matrix_int_fread (stream, m);}
133 | 
134 |     /// File-name based counterparts of load() / save(); presumably binary, going by the names (defined outside this header).
135 | 	void load_binary( const char *filename );
136 | 	///
137 | 	void save_binary( const char *filename ) const;
138 | 	/// Equality comparison is defined outside this header; operator!= is its logical negation.
139 | 	bool operator==( const matrix_int &other ) const;
140 | 	bool operator!=( const matrix_int &other ) const {return !((*this)==other);}
141 | 	
142 | 	matrix_int& operator=( const matrix_int &other ) {copy( other );return *this;}
143 | 	/// converts from any other matrix type (delegates to copy())
144 | 	template<class omatrix>
145 | 	matrix_int &operator=( const omatrix& other )
146 | 	{
147 | 			copy(other);
148 | 			return *this;
149 | 	}
150 |    /// Arithmetic operators follow, all defined outside this header; the friend overloads take the scalar as the left operand.
151 | 	matrix_int operator+( const matrix_int &other ) const;
152 | 	///
153 | 	matrix_int operator+( const int &f ) const;
154 | 	///
155 | 	friend matrix_int operator+( const int &f, const matrix_int &other );
156 | 	///
157 | 	matrix_int &operator+=( const int &f );
158 | 	///
159 | 	matrix_int &operator+=( const matrix_int &other );
160 | 	///
161 | 	matrix_int operator-( const matrix_int &other ) const;
162 | 	///
163 | 	matrix_int operator-( const int &f ) const;
164 | 	///
165 | 	friend matrix_int operator-( const int &f, const matrix_int &other );
166 | 	///
167 | 	matrix_int &operator-=( const int &f );
168 | 	///
169 | 	matrix_int &operator-=( const matrix_int &other );
170 | 	/// NOTE(review): whether this is the matrix product or an element-wise product is not visible here -- check the implementation.
171 | 	matrix_int operator*( const matrix_int &other ) const;
172 | 	///
173 | 	matrix_int operator*( const int &f ) const;
174 | 	///
175 | 	friend matrix_int operator*( const int &f, const matrix_int &other );
176 | 	///
177 | 	matrix_int &operator*=( const int &f );
178 | 	///
179 | 	matrix_int &operator*=( const matrix_int &other );
180 | 	/// Division is offered for a scalar right-hand side only; the operand type is int.
181 | 	matrix_int operator/( const int &) const;
182 | 	///
183 | 	matrix_int &operator/=( const int &);
184 | 	/// Returns a new matrix; presumably the transpose of this one (defined outside this header).
185 | 	matrix_int transpose() const;
186 | 	/// Public LU decomposition; perm and psign are optional (default NULL). Defined outside this header.
187 | 	matrix_int LU_decomp(gsl::permutation *perm=NULL,int *psign=NULL) const;
188 | 	/// Presumably the inverse computed through an LU decomposition (defined outside this header).
189 | 	matrix_int LU_invert() const;
190 | 
191 | 	// returns a copy of the rows [row_min, row_max) and columns [col_min, col_max) of *this (upper bounds not included!)
192 | 	matrix_int submatrix(size_t row_min, size_t row_max, size_t col_min, size_t col_max) const 
193 | 		{
194 | 			matrix_int m(row_max - row_min, col_max - col_min);
195 | 			for (size_t i = row_min ; i < row_max ; i++)
196 | 			{
197 | 				for (size_t j = col_min ; j < col_max ; j++)
198 | 				{
199 | 					m(i - row_min,j - col_min) = (*this)(i,j);
200 | 				}
201 | 			}
202 | 			return m;
203 | 		}
204 | private:
205 | 	/// Internal LU helper working on raw GSL objects passed through pointer parameters (defined outside this header).
206 | 	void LU_decomp( gsl_matrix_int **a,
207 | 					gsl_permutation **permutation,
208 | 					int *sign ) const;
209 | public:
210 | 	/** returns sum of all the matrix elements. */
211 |     int sum() const;
212 | 	/** returns logarithm of the determinant of the matrix. */
213 | 	double LU_lndet() const;
214 | 
215 | 
216 | 	/** returns a vector_int_view of a single row of the matrix. */
217 | 	vector_int_view       row( size_t rowindex );
218 | 	const vector_int_view row( size_t rowindex ) const ;
219 | 	/** returns a vector_int_view of a single column of the matrix. */
220 | 	vector_int_view       column( size_t colindex );
221 | 	const vector_int_view column( size_t colindex ) const;
222 | 	/** returns a vector_int_view of the diagonal elements of the matrix. */
223 | 	vector_int_view       diagonal();
224 | 	const vector_int_view diagonal() const;
225 | 
226 | 	/** returns a matrix containing a single row of the matrix (described as a "column matrix" originally; NOTE(review): confirm the orientation). */
227 | 	matrix_int get_row( size_t rowindex ) const;
228 | 	/** returns a column matrix containing a single column of the matrix. */
229 | 	matrix_int get_col( size_t colindex ) const;
230 | 	/** calculates sum of rows returned as a column matrix. */
231 | 	matrix_int row_sum() const;
232 | 	/** calculates sum of columns returned as a row matrix. */
233 | 	matrix_int column_sum() const;
234 | 	/** returns trace (diagonal sum) of a square matrix. */
235 | 	double trace() const;
236 | 	/** calculates cholesky decomposition of the matrix, returning success if matrix is positive definite. */
237 | 	int cholesky_decomp( matrix_int &a ) const;
238 | //  	/** returns index of nearest row in matrix to vector argument. */
239 | //  	int nearest_row_index( const matrix_int &v ) const;
240 | 	/** calculates covariance of the matrix columns. */
241 | 	matrix_int covariance() const;
242 | 	/** returns true if the matrix is square, false otherwise. */
243 | 	bool is_square() const;
244 | 	/** diag operator (sets the diagonal elements of the matrix to the elements of v). */
245 | 	void diag(const vector_int& v);
246 | 	/** set diagonal elements of a square matrix to f. */
247 | 	void set_diagonal( int f );
248 | 	/** sets matrix to a k dimensional unit matrix. */
249 | 	void identity( size_t k );
250 | 	/** returns sum of nth power of all elements. */
251 | 	double norm( double n ) const;
252 | 
253 | /*  Wraps gsl_matrix_int_max (const gsl_matrix_int * m); note that the result is returned as double here.  */
254 | /*      This function returns the maximum value in the matrix m.  */
255 | 	double max() const {return gsl_matrix_int_max(m);}
256 | /*  Wraps gsl_matrix_int_min (const gsl_matrix_int * m); note that the result is returned as double here.  */
257 | /*      This function returns the minimum value in the matrix m.  */
258 | 	double min()const{return gsl_matrix_int_min(m);}
259 | 
260 | 	/** This function returns 1 if all the elements of the matrix m are zero, and 0 otherwise. */
261 | 	bool isnull() const { return gsl_matrix_int_isnull(m);}
262 | /*  Function: void gsl_matrix_minmax (const gsl_matrix * m, double * min_out, double * max_out)  */
263 | /*      This function returns the minimum and maximum values in the matrix m, storing them in min_out and max_out.  */
264 | 
265 | /*  Function: void gsl_matrix_max_index (const gsl_matrix * m, size_t * imax, size_t * jmax)  */
266 | /*      This function returns the indices of the maximum value in the matrix m, storing them in imax and jmax. When there are several equal maximum elements then the first element found */
267 | /*      is returned.  */
268 | 
269 | /*  Function: void gsl_matrix_min_index (const gsl_matrix * m, size_t * imax, size_t * jmax)  */
270 | /*      This function returns the indices of the minimum value in the matrix m, storing them in imax and jmax. When there are several equal minimum elements then the first element found */
271 | /*      is returned.  */
272 | 
273 | /*  Function: void gsl_matrix_minmax_index (const gsl_matrix * m, size_t * imin, size_t * imax)  */
274 | /*      This function returns the indices of the minimum and maximum values in the matrix m, storing them in (imin,jmin) and (imax,jmax). When there are several equal minimum or */
275 | /*      maximum elements then the first elements found are returned.  */
276 | 
277 | 	/** for interfacing with the GSL C API: returns the wrapped gsl_matrix_int pointer after asserting that it is non-NULL. */
278 | /*  	gsl_matrix_int       *gslobj()       {if (!m){cout << "matrix_int::gslobj ERROR, data not initialized!! " << endl; exit(-1);}return m;} */
279 | /*  	const gsl_matrix_int *gslobj() const {if (!m){cout << "matrix_int::gslobj ERROR, data not initialized!! " << endl; exit(-1);}return m;} */
280 | 	gsl_matrix_int       *gslobj()       {assert(m);return m;}
281 | 	const gsl_matrix_int *gslobj() const {assert(m);return m;}
282 | private:
283 | 	/// The wrapped GSL matrix; the inline accessors above dereference it without a NULL check (only gslobj() asserts).
284 |    gsl_matrix_int *m;
285 | 
286 | };
287 | }
288 | #undef type_is_int
289 | #undef type_is_double
290 | 
291 | #endif // _matrix_int_h
292 | 


--------------------------------------------------------------------------------
/gslwrap/include/gslwrap/matrix_float.h:
--------------------------------------------------------------------------------
  1 | // matrix.h
  2 | 
  3 | //  This matrix class is a C++ wrapper for the GNU Scientific Library
  4 | //  Copyright (C) 2001 Ramin Nakisa
  5 | 
  6 | //  This program is free software; you can redistribute it and/or modify
  7 | //  it under the terms of the GNU General Public License as published by
  8 | //  the Free Software Foundation; either version 2 of the License, or
  9 | //  (at your option) any later version.
 10 | 
 11 | //  This program is distributed in the hope that it will be useful,
 12 | //  but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 | //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 14 | //  GNU General Public License for more details.
 15 | 
 16 | //  You should have received a copy of the GNU General Public License
 17 | //  along with this program; if not, write to the Free Software
 18 | //  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 19 | 
 20 | #if !defined( _matrix_float_h )
 21 | #define _matrix_float_h
 22 | 
 23 | #ifdef __HP_aCC //for aCC B3910B A.01.27
 24 | #include <iostream.h>
 25 | #include <fstream.h>
 26 | #include <iomanip.h>
 27 | #else //for gcc3
 28 | #include <iostream>
 29 | #include <fstream>
 30 | #include <iomanip>
 31 | #endif
 32 | 
 33 | #include <math.h>
 34 | #include <stdlib.h>
 35 | #include <assert.h>
 36 | ///
 37 | #include <gsl/gsl_math.h>
 38 | #include <gsl/gsl_matrix_float.h>
 39 | #include <gsl/gsl_linalg.h>
 40 | #include <gslwrap/permutation.h>
 41 | #include <gslwrap/vector_float.h>
 42 | 
 43 | #define type_is_float
 44 | #ifdef  type_is
 45 | #define type_is_double
 46 | #endif
 47 | 
 48 | namespace gsl
 49 | {
 50 | 
/// Float-valued matrix class that wraps a gsl_matrix_float pointer (member m).
/// The members defined inline below forward to the gsl_matrix_float_* C
/// functions; all other members are only declared in this header.
class matrix_float
{
#ifdef type_is_double
	friend class matrix_float;
	friend class matrix_int;
#endif
public:
	typedef float value_type;
	typedef vector_float vector_type;

	///
	matrix_float();
	///
	matrix_float( size_t new_rows, size_t new_cols, bool clear = true );

	/// Element-wise copy from any object exposing get_rows(), get_cols() and
	/// operator()(i,j). This matrix is first resized to other's dimensions and
	/// every element is cast to float. Copying an object onto itself is a no-op.
	template<class oclass>
	void copy(const oclass &other)
	{
		// Compare addresses as void* so that unrelated types can be compared.
		if ( static_cast<const void *>( this ) == static_cast<const void *>( &other ) )
			return;

		set_dimensions( other.get_rows(), other.get_cols() );
		for ( size_t i = 0; i < get_rows(); i++ ) 
		{
			for ( size_t j = 0; j < get_cols(); j++ ) 
			{
				gsl_matrix_float_set( m, i, j, (float)other(i,j));
			}
		}
	}
/*    	template<>  */
/*    	void copy<matrix_float>(const matrix_float &other)  */
/*    	{  */
/*    		set_dimensions(other.size1(),other.size2());  */
/*    		gsl_matrix_float_memcpy( m, other.m );  */
/*    	}  */
	// copy constructor for type matrix_float
	matrix_float( const matrix_float &other ):m(NULL) {copy(other);}
	/// converting constructor: builds the matrix from any type accepted by copy()
	template<class oclass>
	matrix_float( const oclass &other ):m(NULL) {copy(other);}

	///
	~matrix_float();
	///
//	matrix_float( const char *Filename );
	/// number of rows (size1 of the wrapped matrix)
	size_t get_rows() const {return m->size1;}
	/// number of columns (size2 of the wrapped matrix)
	size_t get_cols() const {return m->size2;}
	/// same value as get_rows()
	size_t size1() const {return m->size1;}
	/// same value as get_cols()
	size_t size2() const {return m->size2;}
   

	///
	void dimensions( size_t *num_rows, size_t *num_cols ) const;
	/// element access: get_element returns by value, operator() returns a reference obtained from gsl_matrix_float_ptr
float        get_element ( size_t row, size_t col ) const {return  gsl_matrix_float_get( m, row, col ) ;}
	const float &operator()( size_t row, size_t col ) const {return *gsl_matrix_float_ptr( m, row, col ) ;}
float       &operator()( size_t row, size_t col )       {return *gsl_matrix_float_ptr( m, row, col ) ;}
	///
	void set_element( size_t row, size_t col, const float &v ){ gsl_matrix_float_set( m, row, col, v );}
	///
	void set_elements( const float & new_value );
	void set_all ( const float & new_value ) {gsl_matrix_float_set_all ( m, new_value );}
	void set_zero() {gsl_matrix_float_set_zero( m );}
	///
	void set_dimensions( size_t new_rows, size_t new_cols );
	///
	void load( const char *filename );
	///
	void save( const char *filename ) const;
	///
	friend ostream& operator<< ( ostream& os, const matrix_float& m );
	//This function writes the elements of the matrix m to the stream stream in binary format. The return value is 0 for success and GSL_EFAILED if there was a problem writing to the file. Since the data is written in the native binary format it may not be portable between different architectures.
	int fwrite (FILE * stream) const {return gsl_matrix_float_fwrite (stream, m);}

//This function reads into the matrix m from the open stream stream in binary format. The matrix m must be preallocated with the correct dimensions since the function uses the size of m to determine how many bytes to read. The return value is 0 for success and GSL_EFAILED if there was a problem reading from the file. The data is assumed to have been written in the native binary format on the same architecture. 
	int fread (FILE * stream) {return gsl_matrix_float_fread (stream, m);}

    ///
	void load_binary( const char *filename );
	///
	void save_binary( const char *filename ) const;
	///
	bool operator==( const matrix_float &other ) const;
	bool operator!=( const matrix_float &other ) const {return !((*this)==other);}
	
	/// assignment goes through copy(), so self-assignment is a no-op
	matrix_float& operator=( const matrix_float &other ) {copy( other );return *this;}
	/// converts from any other matrix type
	template<class omatrix>
	matrix_float &operator=( const omatrix& other )
	{
			copy(other);
			return *this;
	}
   ///
	matrix_float operator+( const matrix_float &other ) const;
	///
	matrix_float operator+( const float &f ) const;
	///
	friend matrix_float operator+( const float &f, const matrix_float &other );
	///
	matrix_float &operator+=( const float &f );
	///
	matrix_float &operator+=( const matrix_float &other );
	///
	matrix_float operator-( const matrix_float &other ) const;
	///
	matrix_float operator-( const float &f ) const;
	///
	friend matrix_float operator-( const float &f, const matrix_float &other );
	///
	matrix_float &operator-=( const float &f );
	///
	matrix_float &operator-=( const matrix_float &other );
	///
	matrix_float operator*( const matrix_float &other ) const;
	///
	matrix_float operator*( const float &f ) const;
	///
	friend matrix_float operator*( const float &f, const matrix_float &other );
	///
	matrix_float &operator*=( const float &f );
	///
	matrix_float &operator*=( const matrix_float &other );
	///
	matrix_float operator/( const float &) const;
	///
	matrix_float &operator/=( const float &);
	///
	matrix_float transpose() const;
	///
	matrix_float LU_decomp(gsl::permutation *perm=NULL,int *psign=NULL) const;
	///
	matrix_float LU_invert() const;

	// return a submatrix of the this from row_min to row_max (not included!)
	// columns are likewise taken from col_min up to, but not including, col_max
	matrix_float submatrix(size_t row_min, size_t row_max, size_t col_min, size_t col_max) const 
		{
			// the local result matrix is also named m and shadows the member m inside this function
			matrix_float m(row_max - row_min, col_max - col_min);
			for (size_t i = row_min ; i < row_max ; i++)
			{
				for (size_t j = col_min ; j < col_max ; j++)
				{
					m(i - row_min,j - col_min) = (*this)(i,j);
				}
			}
			return m;
		}
private:
	///
	void LU_decomp( gsl_matrix_float **a,
					gsl_permutation **permutation,
					int *sign ) const;
public:
	/** returns sum of all the matrix elements. */
    float sum() const;
	/** returns logarithm of the determinant of the matrix. */
	double LU_lndet() const;


	/** returns a vector_float_view of a single row of the matrix. */
	vector_float_view       row( size_t rowindex );
	const vector_float_view row( size_t rowindex ) const ;
	/** returns a vector_float_view of a single column of the matrix. */
	vector_float_view       column( size_t colindex );
	const vector_float_view column( size_t colindex ) const;
	/** returns a vector_float_view of the diagonal elements of the matrix. */
	vector_float_view       diagonal();
	const vector_float_view diagonal() const;

	/** returns a column matrix containing a single row of the matrix. */
	matrix_float get_row( size_t rowindex ) const;
	/** returns a column matrix containing a single column of the matrix. */
	matrix_float get_col( size_t colindex ) const;
	/** calculates sum of rows returned as a column matrix. */
	matrix_float row_sum() const;
	/** calculates sum of columns returned as a row matrix. */
	matrix_float column_sum() const;
	/** returns trace (diagonal sum) of a square matrix. */
	double trace() const;
	/** calculates cholesky decomposition of the matrix, returning success if matrix is positive definite. */
	int cholesky_decomp( matrix_float &a ) const;
//  	/** returns index of nearest row in matrix to vector argument. */
//  	int nearest_row_index( const matrix_float &v ) const;
	/** calculates covariance of the matrix columns. */
	matrix_float covariance() const;
	/** returns 1 if matrix is square, 0 otherwise. */
	bool is_square() const;
	/** diag operator (sets the diagonal elements of the matrix to the elements of v */
	void diag(const vector_float& v);
	/** set diagonal elements of a square matrix to f. */
	void set_diagonal( float f );
	/** sets matrix to a k dimensional unit matrix. */
	void identity( size_t k );
	/** returns sum of nth power of all elements. */
	double norm( double n ) const;

/*  Function: double gsl_matrix_max (const gsl_matrix * m)  */
/*      This function returns the maximum value in the matrix m.  */
	// the result of gsl_matrix_float_max is returned as a double
	double max() const {return gsl_matrix_float_max(m);}
/*  Function: double gsl_matrix_min (const gsl_matrix * m)  */
/*      This function returns the minimum value in the matrix m.  */
	// the result of gsl_matrix_float_min is returned as a double
	double min()const{return gsl_matrix_float_min(m);}

	/** This function returns 1 if all the elements of the matrix m are zero, and 0 otherwise. */
	bool isnull() const { return gsl_matrix_float_isnull(m);}
/*  Function: void gsl_matrix_minmax (const gsl_matrix * m, double * min_out, double * max_out)  */
/*      This function returns the minimum and maximum values in the matrix m, storing them in min_out and max_out.  */

/*  Function: void gsl_matrix_max_index (const gsl_matrix * m, size_t * imax, size_t * jmax)  */
/*      This function returns the indices of the maximum value in the matrix m, storing them in imax and jmax. When there are several equal maximum elements then the first element found */
/*      is returned.  */

/*  Function: void gsl_matrix_min_index (const gsl_matrix * m, size_t * imax, size_t * jmax)  */
/*      This function returns the indices of the minimum value in the matrix m, storing them in imax and jmax. When there are several equal minimum elements then the first element found */
/*      is returned.  */

/*  Function: void gsl_matrix_minmax_index (const gsl_matrix * m, size_t * imin, size_t * imax)  */
/*      This function returns the indices of the minimum and maximum values in the matrix m, storing them in (imin,jmin) and (imax,jmax). When there are several equal minimum or */
/*      maximum elements then the first elements found are returned.  */

	/** for interfacing with gsl c */
/*  	gsl_matrix_float       *gslobj()       {if (!m){cout << "matrix_float::gslobj ERROR, data not initialized!! " << endl; exit(-1);}return m;} */
/*  	const gsl_matrix_float *gslobj() const {if (!m){cout << "matrix_float::gslobj ERROR, data not initialized!! " << endl; exit(-1);}return m;} */
	// both overloads assert that m is non-null and then hand out the raw GSL pointer
	gsl_matrix_float       *gslobj()       {assert(m);return m;}
	const gsl_matrix_float *gslobj() const {assert(m);return m;}
private:
	/// the wrapped GSL matrix; the inline accessors above dereference it without a null check
   gsl_matrix_float *m;

};
287 | }
288 | #undef type_is_float
289 | #undef type_is_double
290 | 
291 | #endif // _matrix_float_h
292 | 


--------------------------------------------------------------------------------
/dtm/gsl-wrappers.c:
--------------------------------------------------------------------------------
  1 | #include <gflags/gflags.h>
  2 | #include "gsl-wrappers.h"
  3 | #include <sys/stat.h>
  4 | 
// Generator used by choose_k_from_n(); it is allocated there on first use.
static gsl_rng* RANDOM_NUMBER_GENERATOR = NULL;

// Command-line flag read by new_random_number_generator(): a nonzero value
// replaces the time-based seed.
DEFINE_int64(rng_seed,
	     0,
	     "Specifies the random seed.  If 0, seeds pseudo-randomly.");

// The maximum number of iterations for each update.
// Used as the iteration cap in optimize_fdf(); note that it is declared as a
// double although it is compared against an int counter.
const double MAX_ITER = 15;
 13 | 
 14 | /*
 15 |  * safe logarithm function
 16 |  *
 17 |  */
 18 | 
 19 | double safe_log(double x)
 20 | {
 21 |     if (x == 0)
 22 |     {
 23 |         return(-1000);
 24 |     }
 25 |     else
 26 |     {
 27 |         return(log(x));
 28 |     }
 29 | }
 30 | 
 31 | 
 32 | /*
 33 |  * given log(a) and log(b), return log(a+b)
 34 |  *
 35 |  */
 36 | 
 37 | double log_sum(double log_a, double log_b)
 38 | {
 39 |   double v;
 40 | 
 41 |   if (log_a == -1) return(log_b);
 42 | 
 43 |   if (log_a < log_b)
 44 |   {
 45 |       v = log_b+log(1 + exp(log_a-log_b));
 46 |   }
 47 |   else
 48 |   {
 49 |       v = log_a+log(1 + exp(log_b-log_a));
 50 |   }
 51 |   return(v);
 52 | }
 53 | 
 54 | 
 55 | void vinc(gsl_vector* v, int i, double x)
 56 | {
 57 |     vset(v, i, vget(v, i) + x);
 58 | }
 59 | 
 60 | void minc(gsl_matrix* m, int i, int j, double x)
 61 | {
 62 |     mset(m, i, j, mget(m, i, j) + x);
 63 | }
 64 | 
 65 | 
 66 | void msetrow(gsl_matrix* m, int r, const gsl_vector* val)
 67 | {
 68 |     int i;
 69 |     gsl_vector v = gsl_matrix_row(m, r).vector;
 70 |     for (i = 0; i < v.size; i++)
 71 |         vset(&v, i, vget(val, i));
 72 | }
 73 | 
 74 | 
 75 | void msetcol(gsl_matrix* m, int r, const gsl_vector* val)
 76 | {
 77 |     int i;
 78 |     gsl_vector v = gsl_matrix_column(m, r).vector;
 79 |     for (i = 0; i < v.size; i++)
 80 |         vset(&v, i, vget(val, i));
 81 | }
 82 | 
 83 | 
 84 | /*
 85 |  * compute the column sums of a matrix
 86 |  *
 87 |  */
 88 | 
 89 | void col_sum(gsl_matrix* m, gsl_vector* val)
 90 | {
 91 |     int i, j;
 92 |     gsl_vector_set_all(val, 0);
 93 | 
 94 |     for (i = 0; i < m->size1; i++)
 95 |         for (j = 0; j < m->size2; j++)
 96 |             vinc(val, j, mget(m, i, j));
 97 | }
 98 | 
 99 | 
100 | /*
101 |  * print a vector to standard out
102 |  *
103 |  */
104 | 
105 | void vct_printf(const gsl_vector * v)
106 | {
107 |     int i;
108 |     for (i = 0; i < v->size; i++)
109 | 	printf("%5.5f ", vget(v, i));
110 |     printf("\n\n");
111 | }
112 | 
113 | 
114 | /*
115 |  * print a matrix to standard out
116 |  *
117 |  */
118 | 
119 | void mtx_printf(const gsl_matrix * m)
120 | {
121 |     int i, j;
122 |     for (i = 0; i < m->size1; i++)
123 |     {
124 | 	for (j = 0; j < m->size2; j++)
125 | 	    printf("%5.5f ", mget(m, i, j));
126 | 	printf("\n");
127 |     }
128 | }
129 | 
130 | 
131 | /*
132 |  * read/write a vector/matrix from a file
133 |  *
134 |  */
135 | 
136 | void vct_fscanf(const char* filename, gsl_vector* v)
137 | {
138 |     outlog("reading %ld vector from %s", v->size, filename);
139 |     FILE* fileptr;
140 |     if (!fileptr) {
141 |       outlog("Error opening file %s. Failing.", filename);
142 |       exit(1);
143 |     }
144 |     fileptr = fopen(filename, "r");
145 |     gsl_vector_fscanf(fileptr, v);
146 |     fclose(fileptr);
147 | }
148 | 
149 | void mtx_fscanf(const char* filename, gsl_matrix * m)
150 | {
151 |     FILE* fileptr = fopen(filename, "r");
152 | 
153 |     outlog("reading %ld x %ld matrix from %s",
154 |            m->size1, m->size2, filename);
155 |     if (!fileptr) {
156 |       outlog("Error opening file %s. Failing.", filename);
157 |       exit(1);
158 |     }
159 | 
160 |     gsl_matrix_fscanf(fileptr, m);
161 |     fclose(fileptr);
162 | }
163 | 
164 | void vct_fprintf(const char* filename, gsl_vector* v)
165 | {
166 |     outlog( "writing %ld vector to %s", v->size, filename);
167 |     FILE* fileptr;
168 |     fileptr = fopen(filename, "w");
169 |     if (!fileptr) {
170 |       outlog("Error opening file %s. Failing.", filename);
171 |       exit(1);
172 |     }
173 |     gsl_vector_fprintf(fileptr, v, "%20.17e");
174 |     fclose(fileptr);
175 | }
176 | 
177 | 
178 | void mtx_fprintf(const char* filename, const gsl_matrix * m)
179 | {
180 |     outlog( "writing %ld x %ld matrix to %s",
181 |             m->size1, m->size2, filename);
182 |     FILE* fileptr;
183 |     fileptr = fopen(filename, "w");
184 |     if (!fileptr) {
185 |       outlog("Error opening file: %s", filename);
186 |       exit(1);
187 |     }
188 |     gsl_matrix_fprintf(fileptr, m, "%20.17e");
189 |     fclose(fileptr);
190 | }
191 | 
192 | 
193 | /*
194 |  * matrix inversion using blas
195 |  *
196 |  */
197 | 
198 | void matrix_inverse(gsl_matrix* m, gsl_matrix* inverse)
199 | {
200 |     gsl_matrix *lu;
201 |     gsl_permutation* p;
202 |     int signum;
203 | 
204 |     p = gsl_permutation_alloc(m->size1);
205 |     lu = gsl_matrix_alloc(m->size1, m->size2);
206 | 
207 |     gsl_matrix_memcpy(lu, m);
208 |     gsl_linalg_LU_decomp(lu, p, &signum);
209 |     gsl_linalg_LU_invert(lu, p, inverse);
210 | 
211 |     gsl_matrix_free(lu);
212 |     gsl_permutation_free(p);
213 | }
214 | 
215 | 
216 | /*
217 |  * log determinant using blas
218 |  *
219 |  */
220 | 
221 | double log_det(gsl_matrix* m)
222 | {
223 |     gsl_matrix* lu;
224 |     gsl_permutation* p;
225 |     double result;
226 |     int signum;
227 | 
228 |     p = gsl_permutation_alloc(m->size1);
229 |     lu = gsl_matrix_alloc(m->size1, m->size2);
230 | 
231 |     gsl_matrix_memcpy(lu, m);
232 |     gsl_linalg_LU_decomp(lu, p, &signum);
233 |     result = gsl_linalg_LU_lndet(lu);
234 | 
235 |     gsl_matrix_free(lu);
236 |     gsl_permutation_free(p);
237 | 
238 |     return(result);
239 | }
240 | 
241 | 
242 | /*
243 |  * eigenvalues of a symmetric matrix using blas
244 |  *
245 |  */
246 | 
247 | void sym_eigen(gsl_matrix* m, gsl_vector* vals, gsl_matrix* vects)
248 | {
249 |     gsl_eigen_symmv_workspace* wk;
250 |     gsl_matrix* mcpy;
251 |     int r;
252 | 
253 |     mcpy = gsl_matrix_alloc(m->size1, m->size2);
254 |     wk = gsl_eigen_symmv_alloc(m->size1);
255 |     gsl_matrix_memcpy(mcpy, m);
256 |     r = gsl_eigen_symmv(mcpy, vals, vects, wk);
257 |     gsl_eigen_symmv_free(wk);
258 |     gsl_matrix_free(mcpy);
259 | }
260 | 
261 | 
262 | /*
263 |  * sum of a vector
264 |  *
265 |  */
266 | 
267 | double sum(const gsl_vector* v)
268 | {
269 |     double val = 0;
270 |     int i, size = v->size;
271 |     for (i = 0; i < size; i++)
272 |         val += vget(v, i);
273 |     return(val);
274 | }
275 | 
276 | 
277 | /*
278 |  * take log of each element
279 |  *
280 |  */
281 | 
282 | void vct_log(gsl_vector* v)
283 | {
284 |     int i, size = v->size;
285 |     for (i = 0; i < size; i++)
286 |         vset(v, i, safe_log(vget(v, i)));
287 | }
288 | 
289 | 
290 | /*
291 |  * l2 norm of a vector
292 |  *
293 |  */
294 | 
295 | // !!! this can be BLASified
296 | 
297 | double norm(gsl_vector *v)
298 | {
299 |     double val = 0;
300 |     int i;
301 | 
302 |     for (i = 0; i < v->size; i++)
303 |         val += vget(v, i) * vget(v, i);
304 |     return(sqrt(val));
305 | }
306 | 
307 | 
308 | /*
309 |  * draw K random integers from 0..N-1
310 |  *
311 |  */
312 | 
313 | void choose_k_from_n(int k, int n, int* result)
314 | {
315 |     int i, x[n];
316 | 
317 |     if (RANDOM_NUMBER_GENERATOR == NULL)
318 |         RANDOM_NUMBER_GENERATOR = gsl_rng_alloc(gsl_rng_taus);
319 |     for (i = 0; i < n; i++)
320 |         x[i] = i;
321 | 
322 |     gsl_ran_choose (RANDOM_NUMBER_GENERATOR, (void *) result,  k,
323 |                     (void *) x, n, sizeof(int));
324 | }
325 | 
326 | 
327 | /*
328 |  * normalize a vector in log space
329 |  *
330 |  * x_i = log(a_i)
331 |  * v = log(a_1 + ... + a_k)
332 |  * x_i = x_i - v
333 |  *
334 |  */
335 | 
336 | void log_normalize(gsl_vector* x)
337 | {
338 |     double v = vget(x, 0);
339 |     int i;
340 | 
341 |     for (i = 1; i < x->size; i++)
342 |         v = log_sum(v, vget(x, i));
343 | 
344 |     for (i = 0; i < x->size; i++)
345 |         vset(x, i, vget(x,i)-v);
346 | }
347 | 
348 | 
349 | /*
350 |  * normalize a positive vector
351 |  *
352 |  */
353 | 
354 | void normalize(gsl_vector* x)
355 | {
356 |     double v = 0;
357 |     int i;
358 | 
359 |     for (i = 0; i < x->size; i++)
360 |         v += vget(x, i);
361 | 
362 |     for (i = 0; i < x->size; i++)
363 |         vset(x, i, vget(x, i) / v);
364 | }
365 | 
366 | 
367 | /*
368 |  * exponentiate a vector
369 |  *
370 |  */
371 | 
372 | void vct_exp(gsl_vector* x)
373 | {
374 |     int i;
375 | 
376 |     for (i = 0; i < x->size; i++)
377 |         vset(x, i, exp(vget(x, i)));
378 | }
379 | 
380 | 
381 | /*
382 |  * maximize a function using its derivative
383 |  *
384 |  */
385 | 
386 | void optimize_fdf(int dim,
387 |                   gsl_vector* x,
388 |                   void* params,
389 |                   void (*fdf)(const gsl_vector*, void*, double*, gsl_vector*),
390 |                   void (*df)(const gsl_vector*, void*, gsl_vector*),
391 |                   double (*f)(const gsl_vector*, void*),
392 |                   double* f_val,
393 |                   double* conv_val,
394 |                   int* niter)
395 | {
396 |     gsl_multimin_function_fdf obj;
397 |     obj.f = f;
398 |     obj.df = df;
399 |     obj.fdf = fdf;
400 |     obj.n = dim;
401 |     obj.params = params;
402 | 
403 | //    const gsl_multimin_fdfminimizer_type * method =
404 | //        gsl_multimin_fdfminimizer_vector_bfgs;
405 |     const gsl_multimin_fdfminimizer_type * method =
406 |         gsl_multimin_fdfminimizer_conjugate_fr;
407 | 
408 |     gsl_multimin_fdfminimizer * opt =
409 |         gsl_multimin_fdfminimizer_alloc(method, dim);
410 | 
411 |     gsl_multimin_fdfminimizer_set(opt, &obj, x, 0.01, 1e-3);
412 | 
413 |     int iter = 0, status;
414 |     double converged, f_old = 0;
415 |     do
416 |     {
417 |         iter++;
418 |         status = gsl_multimin_fdfminimizer_iterate(opt);
419 |         // assert(status==0);
420 |         converged = fabs((f_old - opt->f) / (dim * f_old));
421 |         // status = gsl_multimin_test_gradient(opt->gradient, 1e-3);
422 |         // printf("f = %1.15e; conv = %5.3e; norm = %5.3e; niter = %03d\n",
423 |         // opt->f, converged, norm(opt->gradient), iter);
424 |         f_old = opt->f;
425 |     }
426 |     while (converged > 1e-8 && iter < MAX_ITER);
427 |     // while (status == GSL_CONTINUE);
428 |     *f_val = opt->f;
429 |     *conv_val = converged;
430 |     *niter = iter;
431 |     gsl_multimin_fdfminimizer_free(opt);
432 | }
433 | 
434 | 
435 | 
436 | /*
437 |  * maximize a function
438 |  *
439 |  */
440 | 
441 | void optimize_f(int dim,
442 |                 gsl_vector* x,
443 |                 void* params,
444 |                 double (*f)(const gsl_vector*, void*))
445 | {
446 |     gsl_multimin_function obj;
447 |     obj.f = f;
448 |     obj.n = dim;
449 |     obj.params = params;
450 | 
451 |     const gsl_multimin_fminimizer_type * method =
452 |         gsl_multimin_fminimizer_nmsimplex;
453 | 
454 |     gsl_multimin_fminimizer * opt =
455 |         gsl_multimin_fminimizer_alloc(method, dim);
456 | 
457 |     gsl_vector * step_size = gsl_vector_alloc(dim);
458 |     gsl_vector_set_all(step_size, 1);
459 |     gsl_multimin_fminimizer_set(opt, &obj, x, step_size);
460 | 
461 |     int iter = 0, status;
462 |     double converged, f_old;
463 |     do
464 |     {
465 |         iter++;
466 |         f_old = opt->fval;
467 |         status = gsl_multimin_fminimizer_iterate(opt);
468 |         converged = fabs((f_old - opt->fval) / f_old);
469 |         printf("f = %1.15e; conv = %5.3e; size = %5.3e; niter = %03d\n",
470 |                opt->fval, converged, opt->size, iter);
471 |     }
472 |     while ((converged > 1e-10) || (iter < 10000));
473 |     // while (status == GSL_CONTINUE);
474 |     printf("f = %1.15e; conv = %5.3e; niter = %03d\n",
475 |            opt->fval, converged, iter);
476 | 
477 |     gsl_multimin_fminimizer_free(opt);
478 |     gsl_vector_free(step_size);
479 | }
480 | 
481 | 
/*
 * Check whether a directory exists.
 *
 * Returns 0 when stat() fails on dname.  Otherwise returns the value of
 * S_ISDIR() for the entry (nonzero for a directory); when the entry is
 * not a directory, errno is set to ENOTDIR and 0 is returned.
 *
 * !!! shouldn't be here
 */

int directory_exist(const char *dname)
{
    struct stat info;

    if (stat(dname, &info) == 0)
    {
        const int is_dir = S_ISDIR(info.st_mode);
        if (is_dir == 0)
        {
            errno = ENOTDIR;
        }
        return is_dir;
    }

    return 0;
}
507 | 
/*
 * Create the directory `name`.
 *
 * On POSIX / Mach builds it is created with read, write and execute
 * permission for the owner only; other builds call the one-argument
 * mkdir().  The result of mkdir() is not inspected.
 */
void make_directory(char* name)
{
#if _POSIX_C_SOURCE || __MACH__
    const mode_t owner_rwx = S_IRUSR | S_IWUSR | S_IXUSR;
    mkdir(name, owner_rwx);
#else
    mkdir(name);
#endif
}
516 | 
517 | gsl_rng* new_random_number_generator()
518 | {
519 |     gsl_rng* random_number_generator = gsl_rng_alloc(gsl_rng_taus);
520 |     time_t t1;
521 |     (void) time(&t1);
522 | 
523 |     if (FLAGS_rng_seed) {
524 |       t1 = FLAGS_rng_seed;
525 |     }
526 | 
527 |     // !!! DEBUG
528 |     // t1 = 1147530551;
529 |     printf("RANDOM SEED = %ld\n", t1);
530 |     gsl_rng_set(random_number_generator, t1);
531 |     return(random_number_generator);
532 | }
533 | 
534 | 


--------------------------------------------------------------------------------
/dtm/data.c:
--------------------------------------------------------------------------------
  1 | // Authors: David Blei (blei@cs.princeton.edu)
  2 | //          Sean Gerrish (sgerrish@cs.princeton.edu)
  3 | //
  4 | // Copyright 2011 Sean Gerrish and David Blei
  5 | // All Rights Reserved.
  6 | //
  7 | // See the README for this package for details about modifying or
  8 | // distributing this software.
  9 | 
 10 | #include <gflags/gflags.h>
 11 | 
 12 | #define PI 3.141592653589793
 13 | 
 14 | #include "data.h"
 15 | 
// Command-line flags defined by this file (gflags).
// NOTE(review): the help strings of sigma_l and sigma_d describe a boolean
// although both flags are doubles defaulting to 0.05 -- the text looks
// copy-pasted; confirm the intended meaning before relying on it.
DEFINE_double(sigma_l,
	      0.05,
	      "If true, use the new phi calculation.");
DEFINE_double(sigma_d,
	      0.05,
	      "If true, use the new phi calculation.");
DEFINE_double(sigma_c,
	      0.05,
	      "c stdev.");
DEFINE_double(sigma_cv,
	      1e-6,
	      "Variational c stdev.");
DEFINE_double(resolution,
	      1,
	      "The resolution.  Used to determine how far out the beta mean should be.");
DEFINE_int32(max_number_time_points,
	     200,
	     "Used for the influence window.");
DEFINE_double(time_resolution,
	      0.5,
	      "This is the number of years per time slice.");
DEFINE_double(influence_mean_years,
	      20.0,
	      "How many years is the mean number of citations?");
DEFINE_double(influence_stdev_years,
	      15.0,
	      "How many years is the stdev number of citations?");
// The two adjacent string literals below are concatenated without a space
// between "nonzero?" and "If".
DEFINE_int32(influence_flat_years,
	      -1,
	     "How many years is the influence nonzero?"
	     "If nonpositive, a lognormal distribution is used.");

// Defined in another translation unit; read_corpus() compares it against
// "occurrence".
DECLARE_string(normalize_docs);
 49 | 
 50 | /*
 51 |  * seq corpus range: [start, end)
 52 |  *
 53 |  * creates a subset of time slices
 54 |  *
 55 |  */
 56 | 
 57 | corpus_seq_t* make_corpus_seq_subset(corpus_seq_t* all, int start, int end)
 58 | {
 59 |     int n;
 60 |     corpus_seq_t* subset_corpus = (corpus_seq_t*) malloc(sizeof(corpus_seq_t));
 61 |     subset_corpus->nterms = all->nterms;
 62 |     subset_corpus->len    = end - start;
 63 |     subset_corpus->ndocs  = 0;
 64 |     subset_corpus->corpus = (corpus_t**) malloc(sizeof(corpus_t*) * subset_corpus->len);
 65 |     for (n = start; n < end; n++)
 66 |     {
 67 |         subset_corpus->corpus[n - start] = all->corpus[n];
 68 |         subset_corpus->ndocs += all->corpus[n]->ndocs;
 69 |     }
 70 |     return(subset_corpus);
 71 | }
 72 | 
 73 | 
 74 | /*
 75 |  * collapse a sequential corpus to a flat corpus
 76 |  *
 77 |  */
 78 | 
 79 | corpus_t* collapse_corpus_seq(corpus_seq_t* c)
 80 | {
 81 |     corpus_t* collapsed = (corpus_t*) malloc(sizeof(corpus_t));
 82 |     collapsed->ndocs  = c->ndocs;
 83 |     collapsed->nterms = c->nterms;
 84 |     collapsed->doc    = (doc_t**) malloc(sizeof(doc_t*) * c->ndocs);
 85 |     collapsed->max_unique = 0;
 86 |     int t, n, doc_idx = 0;
 87 |     for (t = 0; t < c->len; t++)
 88 |     {
 89 |         for (n = 0; n < c->corpus[t]->ndocs; n++)
 90 |         {
 91 |             collapsed->doc[doc_idx] = c->corpus[t]->doc[n];
 92 |             if (collapsed->doc[doc_idx]->nterms > collapsed->max_unique)
 93 |                 collapsed->max_unique = collapsed->doc[doc_idx]->nterms;
 94 |             doc_idx++;
 95 |         }
 96 |     }
 97 |     assert(doc_idx == collapsed->ndocs);
 98 |     return(collapsed);
 99 | }
100 | 
101 | /*
102 |  * read corpus
103 |  *
104 |  */
105 | 
106 | corpus_t* read_corpus(const char* name)
107 | {
108 |     int length, count, word, n;
109 |     corpus_t* c;
110 |     char filename[400];
111 |     sprintf(filename, "%s-mult.dat", name);
112 |     outlog("reading corpus from %s", filename);
113 |     c = (corpus_t*) malloc(sizeof(corpus_t));
114 |     c->max_unique = 0;
115 |     FILE* fileptr = fopen(filename, "r");
116 |     if (fileptr == NULL) {
117 |       outlog("Error reading corpus prefix %s. Failing.",
118 | 	     filename);
119 |       exit(1);
120 |     }
121 |     c->ndocs = 0; c->nterms = 0;
122 |     c->doc = (doc_t**) malloc(sizeof(doc_t*));
123 |     int grand_total = 0;
124 |     while ((fscanf(fileptr, "%10d", &length) != EOF))
125 |     {
126 |         if (length > c->max_unique) c->max_unique = length;
127 |         c->doc = (doc_t**) realloc(c->doc, sizeof(doc_t*)*(c->ndocs+1));
128 |         c->doc[c->ndocs] = (doc_t*) malloc(sizeof(doc_t));
129 |         c->doc[c->ndocs]->nterms = length;
130 |         c->doc[c->ndocs]->total = 0;
131 | 	c->doc[c->ndocs]->log_likelihood = 0.0;
132 | 
133 |         c->doc[c->ndocs]->word = (int*) malloc(sizeof(int)*length);
134 |         c->doc[c->ndocs]->count = (int*) malloc(sizeof(int)*length);
135 |         c->doc[c->ndocs]->lambda = (double*) malloc(sizeof(double)*length);
136 |         c->doc[c->ndocs]->log_likelihoods = (double*) malloc(sizeof(double)*length);
137 |         for (n = 0; n < length; n++)
138 |         {
139 |             fscanf(fileptr, "%10d:%10d", &word, &count);
140 |             word = word - OFFSET;
141 | 	    if (FLAGS_normalize_docs == "occurrence") {
142 | 	      count = 1;
143 | 	    }
144 |             c->doc[c->ndocs]->word[n] = word;
145 |             c->doc[c->ndocs]->count[n] = count;
146 |             c->doc[c->ndocs]->total += count;
147 | 	    // Is there a better value for initializing lambda?
148 | 	    c->doc[c->ndocs]->lambda[n] = 0.0;
149 | 	    c->doc[c->ndocs]->log_likelihoods[n] = 0.0;
150 |             if (word >= c->nterms) { c->nterms = word + 1; }
151 |         }
152 |         grand_total += c->doc[c->ndocs]->total;
153 |         c->ndocs = c->ndocs + 1;
154 |     }
155 |     fclose(fileptr);
156 |     outlog("read corpus (ndocs = %d; nterms = %d; nwords = %d)\n",
157 |            c->ndocs, c->nterms, grand_total);
158 |     return(c);
159 | }
160 | 
161 | /*
162 |  * read corpus sequence
163 |  *
164 |  */
165 | 
166 | corpus_seq_t* read_corpus_seq(const char* name)
167 | {
168 |     char filename[400];
169 |     corpus_seq_t* corpus_seq = (corpus_seq_t*) malloc(sizeof(corpus_seq_t));
170 | 
171 |     // read corpus
172 |     corpus_t* raw_corpus = read_corpus(name);
173 |     corpus_seq->nterms = raw_corpus->nterms;
174 |     // read sequence information
175 |     sprintf(filename, "%s-seq.dat", name);
176 |     outlog("Reading corpus sequence %s.", filename);
177 |     FILE* fileptr = fopen(filename, "r");
178 |     if (!fileptr) {
179 |       outlog("Error opening dtm sequence file %s.\n",
180 | 	     filename);
181 |       exit(1);
182 |     }
183 |     fscanf(fileptr, "%d", &(corpus_seq->len));
184 |     corpus_seq->corpus = (corpus_t**) malloc(sizeof(corpus_t*) * corpus_seq->len);
185 |     // allocate corpora
186 |     int doc_idx = 0;
187 |     int ndocs, i, j;
188 |     corpus_seq->ndocs = 0;
189 |     for (i = 0; i < corpus_seq->len; ++i)
190 |     {
191 |         fscanf(fileptr, "%d", &ndocs);
192 |         corpus_seq->ndocs += ndocs;
193 |         corpus_seq->corpus[i] = (corpus_t*) malloc(sizeof(corpus_t));
194 |         corpus_seq->corpus[i]->ndocs = ndocs;
195 |         corpus_seq->corpus[i]->doc = (doc_t**) malloc(sizeof(doc_t*) * ndocs);
196 |         for (j = 0; j < ndocs; j++)
197 |         {
198 | 	  if (doc_idx >= raw_corpus->ndocs) {
199 | 	    outlog("Error: too few documents listed in dtm sequence file %s.\n"
200 | 		   "Current  line: %d %d %d.\n",
201 | 		   filename,
202 | 		   doc_idx,
203 | 		   ndocs,
204 | 		   j);
205 | 	    exit(1);
206 | 	  }
207 | 	  //	  outlog("%d %d %d %d\n", i, j, doc_idx, raw_corpus->ndocs);
208 | 	  corpus_seq->corpus[i]->doc[j] = raw_corpus->doc[doc_idx];
209 | 	  doc_idx++;
210 |         }
211 |     }
212 |     corpus_seq->max_nterms = compute_max_nterms(corpus_seq);
213 |     outlog("read corpus of length %d\n", corpus_seq->len);
214 |     return(corpus_seq);
215 | }
216 | 
217 | 
218 | /*
219 |  * write sequential corpus
220 |  *
221 |  */
222 | 
223 | void write_corpus_seq(corpus_seq_t* c, char* name)
224 | {
225 |     char tmp_string[400];
226 |     int n;
227 | 
228 |     outlog("writing %d slices to %s (%d total docs)", c->len, name, c->ndocs);
229 |     sprintf(tmp_string, "%s-seq.dat", name);
230 |     FILE* seq_file = fopen(tmp_string, "w");
231 |     fprintf(seq_file, "%d", c->len);
232 |     for (n = 0; n < c->len; n++)
233 |         fprintf(seq_file, " %d", c->corpus[n]->ndocs);
234 |     fclose(seq_file);
235 | 
236 |     corpus_t* flat = collapse_corpus_seq(c);
237 |     sprintf(tmp_string, "%s-mult.dat", name);
238 |     write_corpus(flat, tmp_string);
239 | }
240 | 
241 | /*
242 |  * write corpus
243 |  *
244 |  */
245 | 
246 | void write_corpus(corpus_t* c, char* filename)
247 | {
248 |     int i, j;
249 |     FILE * fileptr;
250 |     doc_t * d;
251 |     outlog("writing %d docs to %s\n", c->ndocs, filename);
252 |     fileptr = fopen(filename, "w");
253 |     for (i = 0; i < c->ndocs; i++)
254 |     {
255 |         d = c->doc[i];
256 |         fprintf(fileptr, "%d", d->nterms);
257 |         for (j = 0; j < d->nterms; j++)
258 |         {
259 |             fprintf(fileptr, " %d:%d", d->word[j], d->count[j]);
260 |         }
261 |         fprintf(fileptr, "\n");
262 |     }
263 |     fclose(fileptr);
264 | }
265 | 
266 | 
267 | /*
268 |  * compute the maximum nterms in a corpus sequence
269 |  *
270 |  */
271 | 
272 | int compute_max_nterms(const corpus_seq_t* c)
273 | {
274 |     int i,j;
275 |     int max = 0;
276 |     for (i = 0; i < c->len; i++)
277 |     {
278 |         corpus_t* corpus = c->corpus[i];
279 |         for (j = 0; j < corpus->ndocs; j++)
280 |             if (corpus->doc[j]->nterms > max)
281 |                 max = corpus->doc[j]->nterms;
282 |     }
283 |     return(max);
284 | }
285 | 
286 | 
287 | /*
288 |  * compute the total matrix of counts (W x T)
289 |  *
290 |  */
291 | 
292 | gsl_matrix* compute_total_counts(const corpus_seq_t* c)
293 | {
294 |     int t, d, n;
295 |     gsl_matrix* ret = gsl_matrix_alloc(c->nterms, c->len);
296 | 
297 |     for (t = 0; t < c->len; t++)
298 |     {
299 |         corpus_t* corpus = c->corpus[t];
300 |         for (d = 0; d < corpus->ndocs; d++)
301 |         {
302 |             doc_t* doc = corpus->doc[d];
303 |             for (n = 0; n < doc->nterms; n++)
304 |             {
305 |                 minc(ret, doc->word[n], t, (double) doc->count[n]);
306 |             }
307 |         }
308 |     }
309 |     return(ret);
310 | }
311 | 
312 | /**
313 |  * Creates a new array of doubles with kScaledBetaMax elements.
314 |  */
315 | double * NewScaledInfluence(int size)
316 | {
317 | 	double* scaled_influence = new double[size];
318 |   
319 | 	if (FLAGS_influence_flat_years > 0) {
320 | 		// Note that we round up, to make sure we have at least one epoch.
321 | 		int number_epochs = FLAGS_influence_flat_years * FLAGS_time_resolution;
322 | 		double epoch_weight = 1.0 / number_epochs;
323 | 		for (int i = 0; i < number_epochs; ++i) {
324 | 			scaled_influence[i] = epoch_weight;
325 | 		}
326 | 		for (int i = number_epochs; i < size; ++i) {
327 | 			scaled_influence[i] = 0.0;
328 | 		}
329 | 		return scaled_influence;
330 | 	}
331 | 
332 | 
333 | 	/*
334 |  	// Use the simple distribution: 1 at [0], 0 everywhere else.
335 | 	for (int i=0; i < size; ++i) {
336 | 		scaled_influence[i] = 0.0;
337 | 	}
338 | 	scaled_influence[0] = 1.0;
339 | 	return scaled_influence;
340 | 	*/
341 | 
342 | 	/*
343 | 	// Simulate a beta distribution with specified mean and variance.
344 | 	double total = 0.0;
345 | 	double tmp = (scaled_beta_mean * (1 - scaled_beta_mean) / scaled_beta_variance) - 1.0;
346 | 	double beta_alpha = scaled_beta_mean * tmp;
347 | 	double beta_beta = (1 - scaled_beta_mean) * tmp;
348 | 	for (int i = 0; i < scaled_beta_max; ++i) {
349 | 		// Offset tmp by 0.5 so we get a centered distribution and don't run into degeneracy issues.
350 | 		tmp = (i + 0.5) / (scaled_beta_max);
351 | 		scaled_beta[i] = (pow(tmp, beta_alpha - 1.0) * pow(1 - tmp, beta_beta - 1.0));
352 | 		total += scaled_beta[i];
353 | 	}
354 | 	*/
355 | 
356 | 
357 | 	// Handle the log-normal distribution.
358 | 	double total = 0.0;
359 | 
360 | 	// Here, we're interested more in the median. So we treat the variable mean as
361 | 	// median and note this in our paper.
362 | 	double scaled_influence_mean = FLAGS_influence_mean_years;
363 | 	double scaled_influence_variance = (FLAGS_influence_stdev_years * FLAGS_influence_stdev_years);
364 | 	double tmp = (1.0 + (scaled_influence_variance / (scaled_influence_mean * scaled_influence_mean)));
365 | 	double lognormal_sigma_squared = log(tmp);
366 | 	double lognormal_mu = (log(scaled_influence_mean) - 0.5 * lognormal_sigma_squared);
367 | 	double halfTimeframe = (1.0 / FLAGS_time_resolution) / 2;
368 | 	printf("Median: %.2f\n", exp(lognormal_mu));
369 | 	for (int i = 0; i < size; ++i) {
370 | 		// Shift right by half a timeframe to avoid corner cases.
371 | 		double x = (i / FLAGS_time_resolution) + halfTimeframe;
372 | 		double tmp2 = (log(x) - lognormal_mu);
373 | 		scaled_influence[i] = (1.0 / (x * sqrt(lognormal_sigma_squared * 2 * PI)) * exp(-tmp2 * tmp2/ (2.0 * lognormal_sigma_squared)));
374 | 		total += scaled_influence[i];
375 | 	}
376 | 	for (int i = 0; i < kScaledInfluenceMax; ++i) {
377 | 		scaled_influence[i] /= total;
378 | 	}
379 | 
380 | 	return scaled_influence;
381 | }
382 | 


--------------------------------------------------------------------------------
/doc/dtm.tex:
--------------------------------------------------------------------------------
  1 | \section{Dynamic Topic Model (DTM)}
  2 | 
  3 | 
  4 | \subsection{Formato dos dados de entrada}
  5 | 
  6 | A ferramenta \srccode{dtm} requer, no mínimo, dois arquivos de entrada:
  7 | um para a descrição de cada documento e seus respectivos termos e outro
  8 | para identificar as fatias de tempo a serem analisadas.
  9 | 
 10 | O primeiro arquivo, geralmente definido com o nome \srccode{???-mult.dat},
 11 | contém M linhas, sendo M a quantidade de documentos a serem analisados.
 12 | Os documentos devem ser ordenados pela data, em ordem crescente.
 13 | Cada linha descreve um documento, os seus termos e a quantidade de cada
 14 | termo no documento, de acordo com o seguinte formato:
 15 | 
 16 | \begin{lstlisting}
 17 | unique_word_count index1:count1 index2:count2 ... indexn:countn
 18 | \end{lstlisting}
 19 | 
 20 | Não existe um identificador para o documento: o número da linha é utilizado
 21 | para esse fim. Os termos que compõem o documento também não são declarados
 22 | de forma textual: deve-se adotar um identificador numérico para cada termo.
 23 | Esse identificador deve ser único para o mesmo termo em relação a todos os
 24 | documentos (veja isto como uma otimização: é mais rápido processar um número
 25 | do que uma palavra). A contagem corresponde à frequência absoluta do termo
 26 | no documento da linha atual. Finalmente, o primeiro elemento da linha indica
 27 | o tamanho do vocabulário necessário para descrever o documento (veja isto
 28 | como um facilitador para ler os dados restantes da linha).
 29 | 
 30 | Por exemplo, considere os documentos da \cref{}. A definição deles no formato
 31 | DTM é apresentada na \cref{}. Observe que, para o segundo documento, alguns
 32 | termos que já tinham sido utilizados para especificar o primeiro documento,
 33 | apareceram. Logo, o índice que identifica o termo é o mesmo (\eg{}, 3, 9 e 14),
 34 | embora a contagem seja particular a cada documento (\eg{}, o termo 3 apareceu
 35 | 3 vezes no primeiro documento e apenas 1 vez no segundo documento).
 36 | 
 37 | \begin{figure}
 38 | \begin{itemize}
 39 | 	\item Documento 1~\cite{Kulesza-etal2007}:
 40 | 	\\``The development of collaborative and multimedia systems is a
 41 | 	complex task and one of the key challenges is to promote the reuse and
 42 | 	integration of those two software categories in the same environment.''
 43 | 
 44 | 	\item Documento 2~\cite{Bezerra-Wainer2006}:
 45 | 	``This work shows a model to detect a set of anomalous traces in a log
 46 | 	generated by a business process management system.''
 47 | \end{itemize}	
 48 | 
 49 | \begin{lstlisting}
 50 | 24 1:4 2:1 3:3 4:1 5:3 6:1 7:1 8:2 9:1 10:1 11:1 12:1 13:1 14:1 15:1 16:1 17:1 18:1 19:1 20:1 21:1 22:1 23:1 24:1
 51 | 17 3:1 9:4 14:1 22:1 25:1 26:1 27:1 28:1 29:1 30:1 31:1 32:1 33:1 34:1 35:1 36:1 37:1
 52 | \end{lstlisting}
 53 | 
 54 | \begin{tabular}{rl}
 55 | 1 & the \\
 56 | 2 & development \\
 57 | 3 & of \\
 58 | 4 & collaborative \\
 59 | 5 & and \\
 60 | 6 & multimedia \\
 61 | 7 & systems \\
 62 | 8 & is \\
 63 | 9 & a \\
 64 | 10 & complex \\
 65 | 11 & task \\
 66 | 12 & one \\
 67 | 13 & challenges \\
 68 | 14 & to \\
 69 | 15 & promote \\
 70 | 16 & reuse \\
 71 | 17 & integration \\
 72 | 18 & those \\
 73 | 19 & two \\
 74 | 20 & software \\
 75 | 21 & categories \\
 76 | 22 & in \\
 77 | 23 & same \\
 78 | 24 & environment \\
 79 | 25 & this \\
 80 | 26 & work \\
 81 | 27 & shows \\
 82 | 28 & model \\
 83 | 29 & detect \\
 84 | 30 & set \\
 85 | 31 & log \\
 86 | 32 & generated \\
 87 | 33 & by \\
 88 | 34 & business \\
 89 | 35 & process \\
 90 | 36 & management \\
 91 | 37 & system \\
 92 | \end{tabular}
 93 | \end{figure}
 94 | 
 95 | Para facilitar a posterior análise dos dados, é necessário guardar a
 96 | informação de qual termo corresponde a cada identificador de termo e 
 97 | de qual documento corresponde a cada identificador de documento.
 98 | Para o primeiro caso, deve-se criar um arquivo \srccode{vocab} com o
 99 | vocabulário utilizado na coleção analisada. O formato deste arquivo é
100 | bem simples: coloca-se um termo por linha, sendo que o número da linha
101 | corresponde ao identificador do termo. Para o segundo caso, deve-se criar
102 | um arquivo \srccode{docs} com o nome de cada documento, organizados na
103 | mesma ordem em que foram especificados no arquivo de entrada (\srccode{-mult.dat}).
104 | Dessa forma, os dados em \cref{} podem ser posteriormente recuperados,
105 | facilitando a análise dos resultados pelos pesquisadores.
106 | 
107 | 
108 | O segundo arquivo de entrada do DTM define as fatias de tempo que serão
109 | analisadas. Esse arquivo, geralmente nomeado \srccode{???-seq.dat}, adota
110 | o seguinte formato:
111 | 
112 | \begin{lstlisting}
113 |    Number_Timestamps
114 |    number_docs_time_1
115 |    ...
116 |    number_docs_time_i
117 |    ...
118 |    number_docs_time_NumberTimestamps
119 | \end{lstlisting}
120 | 
121 | A primeira linha determina a quantidade de fatias de tempo a serem analisadas.
122 | As linhas seguintes especificam quantos documentos fazem parte de cada fatia,
123 | em ordem crescente. Esses documentos são obtidos em sequência, do início para
124 | o fim, do arquivo que descreve a coleção de dados a serem analisados. Como
125 | aquele arquivo descreve um documento por linha, serão utilizadas as $M_1$
126 | primeiras linhas para a primeira fatia de tempo, as $M_2$ linhas seguintes para
127 | a segunda fatia de tempo e assim por diante.
128 | 
129 | Por exemplo, observando-se o \cref{}, para a fatia de tempo 1, definida na
130 | linha 2, serão considerados 15 documentos
131 | 
132 | 
133 | 
134 | 
135 | \subsection{Configuração}
136 | 
137 | \begin{lstlisting}
138 |   Flags from data.c:
139 |     -influence_flat_years (How many years is the influence nonzero?If
140 |       nonpositive, a lognormal distribution is used.) type: int32 default: -1
141 |     -influence_mean_years (How many years is the mean number of citations?)
142 |       type: double default: 20
143 |     -influence_stdev_years (How many years is the stdev number of citations?)
144 |       type: double default: 15
145 |     -max_number_time_points (Used for the influence window.) type: int32
146 |       default: 200
147 |     -resolution (The resolution.  Used to determine how far out the beta mean
148 |       should be.) type: double default: 1
149 |     -sigma_c (c stdev.) type: double default: 0.050000000000000003
150 |     -sigma_cv (Variational c stdev.) type: double
151 |       default: 9.9999999999999995e-07
152 |     -sigma_d (If true, use the new phi calculation.) type: double
153 |       default: 0.050000000000000003
154 |     -sigma_l (If true, use the new phi calculation.) type: double
155 |       default: 0.050000000000000003
156 |     -time_resolution (This is the number of years per time slice.) type: double
157 |       default: 0.5
158 | 
159 |   Flags from gsl-wrappers.c:
160 |     -rng_seed (Specifies the random seed.  If 0, seeds pseudo-randomly.)
161 |       type: int64 default: 0
162 | 
163 |   Flags from lda-seq.c:
164 |     -fix_topics (Fix a set of this many topics. This amounts to fixing these
165 |       topics' variance at 1e-10.) type: int32 default: 0
166 |     -forward_window (The forward window for deltas. If negative, we use a beta
167 |       with mean 5.) type: int32 default: 1
168 |     -lda_sequence_max_iter (The maximum number of iterations.) type: int32
169 |       default: 20
170 |     -lda_sequence_min_iter (The maximum number of iterations.) type: int32
171 |       default: 1
172 |     -normalize_docs (Describes how documents's wordcounts are considered for
173 |       finding influence. Options are "normalize", "none", "occurrence", "log",
174 |       or "log_norm".) type: string default: "normalize"
175 |     -save_time (Save a specific time.  If -1, save all times.) type: int32
176 |       default: 2147483647
177 | 
178 |   Flags from lda.c:
179 |     -lambda_convergence (Specifies the level of convergence required for lambda
180 |       in the phi updates.) type: double default: 0.01
181 | 
182 |   Flags from main.c:
183 |     -alpha () type: double default: -10
184 |     -corpus_prefix (The function to perform. Can be dtm or dim.) type: string
185 |       default: ""
186 |     -end () type: int32 default: -1
187 |     -heldout_corpus_prefix () type: string default: ""
188 |     -heldout_time (A time up to (but not including) which we wish to train, and
189 |       at which we wish to test.) type: int32 default: -1
190 |     -initialize_lda (If true, initialize the model with lda.) type: bool
191 |       default: false
192 |     -lda_max_em_iter () type: int32 default: 20
193 |     -lda_model_prefix (The name of a fit model to be used for testing
194 |       likelihood.  Appending "info.dat" to this should give the name of the
195 |       file.) type: string default: ""
196 |     -mode (The function to perform. Can be fit, est, or time.) type: string
197 |       default: "fit"
198 |     -model (The function to perform. Can be dtm or dim.) type: string
199 |       default: "dtm"
200 |     -ntopics () type: double default: -1
201 |     -outname () type: string default: ""
202 |     -output_table () type: string default: ""
203 |     -params_file (A file containing parameters for this run.) type: string
204 |       default: "settings.txt"
205 |     -start () type: int32 default: -1
206 |     -top_chain_var () type: double default: 0.0050000000000000001
207 |     -top_obs_var () type: double default: 0.5
208 | \end{lstlisting}
208 | 
209 | 
210 | 
211 | 
212 | 
213 | \subsection{Running}
214 | 
215 | This program takes as input a collection of text documents and creates
216 | as output a list of topics over time, a description of each document
217 | as a mixture of these topics, and (possibly) a measure of how
218 | ``influential'' each document is, based on its language.
219 | 
220 | We have provided an example dataset, instructions for formatting input
221 | data and processing output files, and example command lines for
222 | running this software in the file dtm/sample.sh.
223 | 
224 | 
225 | \subsubsection{Topic estimation}
226 | \begin{lstlisting}
227 | ./main \
228 |   --ntopics=20 \
229 |   --mode=fit \
230 |   --rng_seed=0 \
231 |   --initialize_lda=true \
232 |   --corpus_prefix=example/test \
233 |   --outname=example/model_run \
234 |   --top_chain_var=0.005 \
235 |   --alpha=0.01 \
236 |   --lda_sequence_min_iter=6 \
237 |   --lda_sequence_max_iter=20 \
238 |   --lda_max_em_iter=10
239 | \end{lstlisting}
240 | 
241 | \subsubsection{Topic inference}
242 | \begin{lstlisting}
243 | ./main \
244 |     --mode=fit \
245 |     --rng_seed=0 \
246 |     --model=fixed \
247 |     --initialize_lda=true \
248 |     --corpus_prefix=example/test \
249 |     --outname=example/output \
250 |     --time_resolution=2 \
251 |     --influence_flat_years=5 \
252 |     --top_obs_var=0.5 \
253 |     --top_chain_var=0.005 \
254 |     --sigma_d=0.0001 \
255 |     --sigma_l=0.0001 \
256 |     --alpha=0.01 \
257 |     --lda_sequence_min_iter=6 \
258 |     --lda_sequence_max_iter=20 \
259 |     --save_time=-1 \
260 |     --ntopics=10 \
261 |     --lda_max_em_iter=10
262 | \end{lstlisting}
263 | 
264 | 
265 | \subsection{Resultado}
266 | 
267 | O programa \srccode{dtm} cria os seguintes arquivos:
268 | 
269 | \begin{itemize}
270 | 	\item \srccode{topic-???-var-e-log-prob.dat}: a distribuição das
271 | 	palavras (e-betas) para o tópico ??? para cada período analisado.
272 | 
273 | 	Os dados contidos no arquivo estão no formato \foreign{row-major},
274 | 	ou seja, as linhas da tabela estão armazenadas uma após a outra.
275 | 	Cada linha, por sua vez, possui uma quantidade de colunas equivalente
276 | 	à quantidade de frações de tempo analisadas. Por exemplo, se foram
277 | 	analisadas 10 fatias de tempo, o comando para ler os dados relativos
278 | 	ao tópico 2 seria, em R:
279 | 
280 | 	\begin{lstlisting}[language=R]
281 | 	a = scan("topic-002-var-e-log-prob.dat")
282 | 	b = matrix(a, ncol=10, byrow=TRUE)
283 | 	\end{lstlisting}
284 | 
285 | 	Cada célula da matriz possui o logaritmo natural da probabilidade do
286 | 	termo M (sendo M o número identificador do termo, tal como definido
287 | 	no vocabulário e na matriz de entrada), o qual está na linha M da
288 | 	matriz, em relação à fatia de tempo identificada pelo número N, a qual
289 | 	corresponde à coluna da matriz. Por exemplo, para obter a
290 | 	probabilidade do termo 100 para a fatia de tempo 3, o comando seria:
291 | 
292 | 	\begin{lstlisting}[language=R]
293 | 	exp(b[100, 3])
294 | 	\end{lstlisting}
295 | 
296 | 
297 | 	\item \srccode{gam.dat}: Armazena os parâmetros do Dirichlet variacional
298 | 	para cada documento. 
299 | 
300 | 
301 | 	Divide these by the sum for each document to get expected topic mixtures.
302 | 	\begin{lstlisting}[language=R]
303 | 	a = scan("gam.dat")
304 | 	b = matrix(a, ncol=10, byrow=TRUE)
305 | 	rs = rowSums(b)
306 | 	e.theta = b / rs
307 | 	# Proportion of topic 5 in document 3:
308 | 	e.theta[3, 5]
309 | 	\end{lstlisting}
310 | 
311 | 
312 | 	\item[\srccode{influence_time-???}]: Armazena a influência dos documentos na
313 | 	fatia de tempo ??? para cada tópico. Cada linha do arquivo corresponde ao
314 | 	documento M, sendo tal identificador M equivalente à linha em que o documento
315 | 	em questão se encontra na matriz de entrada (-mult). Cada coluna corresponde
316 | 	a um tópico N. Por exemplo, para obter a influência do documento 2 no tópico
317 | 	5, os comandos em R seriam:
318 | 
319 | 	\begin{lstlisting}[language=R]
320 | 	a = scan("influence_time-010")
321 | 	b = matrix(a, ncol=10, byrow=TRUE)
322 | 	b[2, 5]
323 | 	\end{lstlisting}
324 | \end{itemize}
325 | 
326 | 
327 | A análise de todos esses arquivos pode ser automatizada da seguinte forma:
328 | 
329 | \begin{lstlisting}[language=R]
330 | # Para um tópico
331 | data0 = scan("topic-000-var-e-log-prob.dat")
332 | b0 = matrix(data0, ncol=10, byrow=TRUE)
333 | write.table(b0, file="dist-topic0.csv", sep=";")
334 | 
335 | 
336 | # Processa todos tópicos
337 | # Para cada tópico, gera um arquivo com a probabilidade de cada
338 | # termo para cada ano
339 | # TODO: rodar exp() nos valores
340 | topics = list()
341 | for (i in 0:9) {
342 | 	filename = paste("topic-00", i, sep = "")
343 | 	filename = paste(filename, "-var-e-log-prob.dat", sep = "")
344 | 	data = scan(filename)
345 | 	topic = matrix(data, ncol=10, byrow=TRUE)
346 | 	filename = paste("dist-topic", i, sep = "")
347 | 	filename = paste(filename, ".csv", sep = "")
348 | 	write.table(topic, file=filename, sep=";")
349 | }
350 | 
351 | 
352 | # - gam.dat: The gammas associated with each document.  Divide these by
353 | #  the sum for each document to get expected topic mixtures.
354 | # Proportion of topic 5 in document 3:
355 | # e.theta[3, 5]
356 | a = scan("gam.dat")
357 | b = matrix(a, ncol=10, byrow=TRUE)
358 | rs = rowSums(b)
359 | e.theta = b / rs
360 | write.table(e.theta, file="documents_topics.csv", sep=";")
361 | \end{lstlisting}
362 | 
363 | 
364 | 


--------------------------------------------------------------------------------