├── .Rbuildignore
├── DESCRIPTION
├── NAMESPACE
├── R
    ├── count.R
    ├── topics.R
    └── tpx.R
├── README.md
├── inst
    └── CITATION
├── man
    ├── counts.Rd
    ├── predict.topics.Rd
    ├── rdir.Rd
    ├── topicVar.Rd
    └── topics.Rd
└── src
    ├── Makevars
    ├── init.c
    ├── latools.c
    ├── latools.h
    ├── rhelp.c
    ├── rhelp.h
    └── topics.c


/.Rbuildignore:
--------------------------------------------------------------------------------
1 | .git/*
2 | .git
3 | README.md


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: maptpx
 2 | Title:  MAP Estimation of Topic Models
 3 | Version: 1.9-7
 4 | Author: Matt Taddy <mataddy@gmail.com>
 5 | Depends: R (>= 2.10), slam
 6 | Suggests: MASS
 7 | Description:  Maximum a posteriori (MAP) estimation for topic models (i.e., Latent Dirichlet Allocation) in text analysis, as described in Taddy (2012) 'On estimation and selection for topic models'.  Previous versions of this code were included as part of the 'textir' package.  If you want to take advantage of openmp parallelization, uncomment the relevant flags in src/MAKEVARS before compiling.
 8 | Maintainer: Matt Taddy <mataddy@gmail.com>
 9 | License: GPL-3
10 | URL: http://taddylab.com
11 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | useDynLib(maptpx)
 2 | import(slam)
 3 | importFrom("grDevices", "grey")
 4 | importFrom("graphics", "axis", "hist", "image", "par", "points", "text")
 5 | importFrom("stats", "pchisq", "rgamma")
 6 | 
 7 | export(stm_tfidf,
 8 |        normalize,
 9 |        rdir,
10 |        topics,
11 |        predict.topics,
12 |        topicVar,
13 |        expit,
14 |        logit)
15 | 
16 | S3method(plot,topics)
17 | S3method(predict,topics)
18 | S3method(summary,topics)
19 | 


--------------------------------------------------------------------------------
/R/count.R:
--------------------------------------------------------------------------------
 1 | ## Tools for manipulation of text count matrices ##
 2 | 
 3 | ## converting count to frequency matrix
 4 | normalize <- function(x, byrow=TRUE){
 5 |     if(byrow){ s <- row_sums(x)
 6 |                s[s==0] <- 1
 7 |                return( x/s ) }
 8 |     else{
 9 |       s <- col_sums(x)
10 |       s[s==0] <- 1
11 |       return(t(t(x)/s)) }
12 | }
13 | 
14 | ## converting a count/freq matrix to tfidf
15 | stm_tfidf <- function(x){
16 |   idf <- log( nrow(x) ) - log(col_sums(x>0) + 1) 
17 |   t( t(x) * idf )
18 | }
19 |     
20 | ## Dirichlet RNG
21 | rdir <- function(n, alpha)
22 | {
23 |     x <- matrix(rgamma(length(alpha)*n,alpha),nrow=n,byrow=TRUE)
24 |     return(t(x/rowSums(x))) }
25 | 


--------------------------------------------------------------------------------
/R/topics.R:
--------------------------------------------------------------------------------
  1 | ##### Estimation for Topic Models ######
  2 | 
  3 | ## intended main function; provides defaults and selects K via marginal lhd
  4 | topics <- function(counts, K, shape=NULL, initopics=NULL, tol=0.1, 
  5 |                    bf=FALSE, kill=2, ord=TRUE, verb=1, ...)
  6 |   ## tpxselect defaults: tmax=10000, wtol=10^(-4), qn=100, grp=NULL, admix=TRUE, nonzero=FALSE, dcut=-10
  7 | {
  8 |   X <- CheckCounts(counts)
  9 |   p <- ncol(X) 
 10 |   if(verb>0)
 11 |     cat(sprintf("\nEstimating on a %d document collection.\n", nrow(X)))
 12 | 
 13 |   ## check the prior parameters for theta
 14 |   if(prod(shape>0) != 1){ stop("use shape > 0\n") }
 15 |                 
 16 |   ## check the list of candidate K values
 17 |   if(prod(K>1)!=1){ stop(cat("use K values > 1\n")) }
 18 |   K <- sort(K)
 19 |  
 20 |   ## initialize
 21 |   initopics <- tpxinit(X[1:min(ceiling(nrow(X)*.05),100),], initopics, K[1], shape, verb)
 22 |   
 23 |   ## either search for marginal MAP K and return bayes factors, or just fit
 24 |   tpx <- tpxSelect(X, K, bf, initopics, alpha=shape, tol, kill, verb, ...)
 25 |   K <- tpx$K
 26 |   
 27 |   ## clean up and out
 28 |   if(ord){ worder <- order(col_sums(tpx$omega), decreasing=TRUE) } # order by decreasing usage
 29 |   else{ worder <- 1:K }
 30 |   ## Main parameters
 31 |   theta=matrix(tpx$theta[,worder], ncol=K, dimnames=list(phrase=dimnames(X)[[2]], topic=paste(1:K)) )
 32 |   omega=matrix(tpx$omega[,worder], ncol=K, dimnames=list(document=NULL, topic=paste(1:K)) )
 33 |   if(nrow(omega)==nrow(X)){ dimnames(omega)[[1]] <- dimnames(X)[[1]] }
 34 |   
 35 |   ## topic object
 36 |   out <- list(K=K, theta=theta, omega=omega, BF=tpx$BF, D=tpx$D, X=X)
 37 |   class(out) <- "topics"
 38 |   invisible(out) }
 39 | 
 40 |  
 41 | ## S3 method predict function
 42 | predict.topics <- function(object, newcounts, loglhd=FALSE, ...)
 43 |   ## tpxweights optional arguments and defauls are verb=FALSE, nef=TRUE, wtol=10^{-5}, tmax=1000
 44 | {
 45 |   if(is.vector(newcounts)){ newcounts <- matrix(newcounts, nrow=1) }
 46 |   if(class(newcounts)[1] == "TermDocumentMatrix"){ newcounts <- t(newcounts) }
 47 |   X <- as.simple_triplet_matrix(newcounts)
 48 |   
 49 |   if(!(class(object)[1]%in%c("topics","matrix"))){ stop("object class must be `topics' or 'matrix'.") }
 50 | 
 51 |   if(class(object)[1]=="topics"){
 52 |     theta <- object$theta
 53 |     if(nrow(theta) != ncol(X)){ stop("Dimension mismatch: nrow(theta) != ncol(X)") }
 54 |     if(nrow(object$X) != nrow(object$omega)) # simple mixture
 55 |       { Q <- matrix(tpxMixQ(X, omega=object$omega, theta=theta, ...)$lQ, ncol=ncol(theta))
 56 |         return( (1:ncol(theta))[apply(Q,1,which.max)] ) } 
 57 |   }
 58 |   else{ theta <- object }
 59 | 
 60 |   start <- tpxOmegaStart(X=X, theta=theta)
 61 |   
 62 |   ## re-order xvec in doc-blocks, and build indices
 63 |   doc <- c(0,cumsum(as.double(table(factor(X$i, levels=c(1:nrow(X)))))))
 64 |   xvo <- X$v[order(X$i)]
 65 |   wrd <- X$j[order(X$i)]-1
 66 | 
 67 |   W <- tpxweights(n=nrow(X), p=ncol(X), xvo=xvo, wrd=wrd, doc=doc, start=start, theta=theta, ...)
 68 | 
 69 |   if(loglhd){
 70 |     L <- sum( X$v*log(tpxQ(theta=theta, omega=W, doc=X$i, wrd=X$j)) )
 71 |     return(list(W=W, L=L)) }
 72 |   else{ return(W) }
 73 | }
 74 | 
 75 | ## S3 method summary function
 76 | summary.topics <- function(object, nwrd=5, tpk=NULL, verb=TRUE, ...){
 77 | 
 78 |     
 79 |   K <- object$K
 80 |   if(is.null(tpk)){ tpk <- 1:K }
 81 |   else if(prod(tpk %in% 1:K)!=1){ stop("requested tpk's are not in 1:K") }
 82 |  
 83 |   if(nwrd>0){ if(verb){ cat(paste("\nTop", nwrd, "phrases by topic-over-null term lift (and usage %):\n\n")) }
 84 |               Q0 <- col_sums(object$X)/sum(object$X)
 85 |               topwords <- c()
 86 |               toplift <- c()
 87 |               usage <- round( (col_means(object$omega)*100), 1)
 88 |               for(k in tpk){
 89 |                 odds <- (log(object$theta[,k]) - log(Q0))[Q0!=0]
 90 |                 ko <- order(odds, decreasing=TRUE)
 91 |                 topk <- dimnames(object$theta)[[1]][Q0!=0][ko[1:nwrd]]
 92 |                 topwords <- cbind(topwords, topk)
 93 |                 toplift <- cbind(toplift, exp(sort(odds, decreasing=TRUE)[1:nwrd]))
 94 |                 if(verb){ cat(paste("[",k, "] '", sep=""))
 95 |                           cat(topk, sep="', '")
 96 |                           cat(paste("' (",usage[k],") \n",sep="")) }
 97 |               }
 98 |               topwords <- as.matrix(topwords)
 99 |               toplift <- as.matrix(toplift)
100 |               dimnames(topwords)[[2]] <- dimnames(toplift)[[2]] <- paste("topic",tpk,sep="")
101 |               dimnames(toplift)[[1]] <- dimnames(toplift)[[1]] <- 1:nwrd
102 |             }
103 |   else{ topwords <- toplift <- NULL }
104 |               
105 |   if(!is.null(object$BF) && !is.null(object$D) && verb)
106 |     {
107 |       cat("\nLog Bayes factor and estimated dispersion, by number of topics:\n\n")
108 |       D <- rbind(logBF=object$BF,Disp=object$D[1,])
109 |       if(verb>1){ D <- rbind(D,p.val=object$D[2,]) }
110 |       print(round(D,2))
111 |       cat(paste("\nSelected the K =",object$K,"topic model\n\n"))
112 |     }
113 |   else if(verb){
114 |     n <- nrow(object$K)
115 |     rho = pchisq(sum(object$r^2), df=length(object$r), lower.tail=FALSE)
116 |     cat("\nDispersion = ", round(object$D$dispersion,2) ,"\n\n",sep="") }
117 | 
118 |   retobj <- data.frame(topic=rep(tpk, each=nwrd), phrase=c(topwords), lift=c(toplift))
119 |   invisible(retobj)
120 | }
121 | 
122 | 
123 | ## Colors for topic plotting
124 | TOPICOLS <- matrix(nrow=6,
125 |                    c(grey(c(.9,.8,.7,.55,.35,0)), #GREY
126 |                      "#FFCCCC", "#FA8072", "#FF5333", "#EE0000", "#CC1100", "#800000", #RED
127 |                      "#BDFCC9", "#98FB98", "#49E20E", "#009900", "#006400", "#004F00", #GREEN	
128 |                      "#BBFFFF", "#67E6EC", "#00C5CD", "#0198E1", "#0147FA",  "#000080"))  #BLUE
129 | 
130 |               	
131 | 
132 | plot.topics <- function(x, type=c("weight","resid"), group=NULL, labels=NULL, 
133 |                         col=NULL, xlab=NULL, ylab=NULL, main=NULL, tpk=NULL,
134 |                         lgd.K=NULL, cex.lgdc = 1, cex.lgdt = 1, cex.rmar= 1, ...){
135 | 
136 |   if(type[1]=="resid"){
137 | 
138 |     if(is.null(col[1])){col <- 8}
139 |     if(is.null(xlab)){ xlab="abs( adusted residuals )" }
140 |     if(is.null(main)){ main="" }
141 | 
142 |     resids <- tpxResids(x$X, theta=x$theta, omega=x$omega, ...)$r
143 | 
144 |     hist(resids, col=col, border=grey(.9),
145 |          xlab=xlab,
146 |          main="", cex.lab=cex.lgdt, font.lab=3)
147 |     
148 |     return(invisible())
149 |   }
150 |   
151 |   n <- nrow(x$omega)
152 |   mmm <- n==nrow(x$X)
153 |   if(n==1){
154 |     if(is.null(main)){ main="" }
155 |     if(is.null(xlab)){ xlab="topic" }
156 |     if(is.null(ylab)){ ylab="weight" }
157 |     if(is.null(col)){ col = 8 }
158 |     return(plot(c(x$omega), type="h", lwd=10, col=col, main=main, ylab=ylab, xlab=xlab)) }
159 | 
160 |   if(is.null(tpk)){ tpk <- 1:x$K }
161 |   if(is.null(lgd.K)){ lgd.K <- max(.1*length(tpk),.75) }
162 |   
163 |   if(is.null(group) || !mmm){
164 |     ltop = .65*n
165 |     lbot = .35*n
166 |     
167 |     if(is.null(col)){ col<-1 }
168 |     tpcl <- c(0, TOPICOLS[1:6,col[1]%%5])
169 |     W <- x$omega[,tpk]
170 |     if(mmm){
171 |       brks <- seq(0,1,length=8)
172 |       tplg <- c("w=0","w=1") }
173 |     else{
174 |       brks <- seq(min(W),max(W),length=8)
175 |       tplg <- c(round(min(W),3),round(max(W),3))
176 |     }
177 |   } else{ # bunch of extra commands to get shading for two groups
178 |     group <- as.factor(group)
179 |     ltop = .85*n
180 |     lbot = .15*n
181 |     
182 |     if(length(group)!=n){ stop("Your group membership length doesn't match omega.") }
183 |     if(nlevels(group)!=2){ stop("Sorry, grouping must be a two-level factor") }
184 |     if(is.null(col) || length(col)<2){ col <- 1:2 }
185 |     
186 |     tpcl <- c(TOPICOLS[6:1,col[1]%%5],"#FFFFFF",TOPICOLS[,col[2]%%5])
187 |     brks <- c(seq(-1, -0.1,length=7),seq(0.1,1,length=7))
188 |     W <- x$omega[,tpk]*(c(-1,1)[as.logical(group)+1])
189 |     if(is.null(labels)){labels <- c("F","T")}
190 |     tplg=rep("w=1",2)
191 |   }
192 | 
193 |   ## plot parameters
194 |   xlg <- length(tpk)+lgd.K
195 |   old.mar <- par()$mar
196 |   par(xpd=TRUE, mar=c(5.1,4.1,2.1,5*cex.rmar))
197 |   if(is.null(ylab)){
198 |     if(nrow(x$X)==nrow(x$omega)){ ylab="Document" }
199 |     else{ ylab="group" } }
200 |   if(is.null(xlab)){ xlab="Topic" }
201 |   if(is.null(main)){ main="Topic-Loading Weights" }
202 | 
203 |   ## finally plot
204 |   image(y=1:n, x=1:length(tpk), t(W), ylab=ylab, xlab=xlab,
205 |         main=main, col=tpcl, font.lab=3, xaxt="n", yaxt="n", breaks=brks, ...)
206 |   axis(side=1, at=1:length(tpk), labels=tpk, tick=FALSE, line=-.5)
207 | 
208 |   if(!mmm){ axis(side=2, at=1:n) }
209 |   else{axis(2)}
210 |   
211 |   points(rep(xlg,length(tpcl)), seq(lbot,ltop,length=length(tpcl)), col=tpcl, pch=15, cex=3*cex.lgdc)
212 |   text(rep(xlg,2), y=c(lbot-.08*n, ltop+.08*n), tplg, cex=cex.lgdt)
213 |   if(!is.null(labels)){ text(rep(xlg,2), y=c(lbot-.14*n, ltop+.14*n), labels, font=3, cex=cex.lgdt) }
214 |   par(mar=old.mar)
215 | }
216 | 
217 | ## logit and expit, treating first element as null
218 | logit <- function(prob){
219 |   f <- function(p) .C("Rlogit", d=as.integer(d), 
220 |     eta=double(d-1), prob=as.double(p), PACKAGE="maptpx")$eta
221 |   prob[prob < 1e-10] <- 1e-10
222 |   prob[1-prob < 1e-10] <- 1-1e-10
223 |   if(is.matrix(prob) || is.data.frame(prob)){
224 |     d <- ncol(prob)
225 |     eta <- matrix(t(apply(prob,1,f)), ncol=d-1, dimnames=list(row=dimnames(prob)[[1]], col=dimnames(prob)[[2]][-1]))
226 |   }
227 |   else{
228 |     d <- length(prob)
229 |     eta <- f(prob)
230 |   }
231 |   return(eta) }
232 | 
233 | expit <- function(eta){
234 |   f <- function(e) .C("Rexpit", d=as.integer(d), prob=double(d), 
235 |     eta=as.double(e), PACKAGE="maptpx")$prob
236 |   eta[eta==Inf] <- 1e10
237 |   eta[eta==-Inf] <- -1e10
238 |   if(is.matrix(eta) || is.data.frame(eta)){
239 |     d <- ncol(eta)+1
240 |     if(is.null(dimnames(eta)[[2]])) dimnames(eta)[[2]] <- paste("nef",1:ncol(eta),sep='')
241 |     prob <- matrix(t(apply(eta,1,f)), ncol=d, dimnames=list(row=dimnames(eta)[[1]], col=c('nullcat',dimnames(eta)[[2]])))
242 |   }
243 |   else{
244 |     d <- length(eta)+1
245 |     prob <- f(eta)
246 |   }
247 |   return(prob) }
248 | 
249 | 
250 | ### topic weight variance matrix (in NEF parametrization)
251 | topicVar <- function(counts, theta, omega){
252 |   X <- CheckCounts(counts)
253 |   if(nrow(omega) != nrow(X)) stop("omega does not match counts")
254 |   if(ncol(omega) != ncol(theta)) stop("omega does not match theta")
255 | 
256 |   K <- ncol(omega)
257 |   n <- nrow(X)
258 |   p <- nrow(theta)
259 |   
260 |   q <- tpxQ(theta=theta, omega=omega, doc=X$i, wrd=X$j)
261 |   H <- array(.C("RnegHW",
262 |                   n = as.integer(n),
263 |                   p = as.integer(p),
264 |                   K = as.integer(K-1),
265 |                   omeg = as.double(omega[,-1]),
266 |                   thet = as.double(theta[,-1]),
267 |                   doc = as.integer(X$i-1),
268 |                   wrd = as.integer(X$j-1),
269 |                   cnt = as.double(X$v),
270 |                   q = as.double(q),
271 |                   N = as.integer(length(q)),
272 |                   H = double(n*(K-1)^2),
273 |                   PACKAGE="maptpx")$H,
274 |                dim=c(K-1,K-1,n))
275 | 
276 |   S <- array(apply(H, 3, function(h) tryCatch(solve(h), error=function(e) solve(h + diag(.00001,K-1))) ),
277 |              dim=c(K-1,K-1,n), dimnames=list(topics=1:(K-1), topics=1:(K-1), doc=dimnames(X)[[1]]) )
278 |   return(S)
279 | }
280 | 


--------------------------------------------------------------------------------
/R/tpx.R:
--------------------------------------------------------------------------------
  1 | #######  Undocumented "tpx" utility functions #########
  2 | 
  3 | ## ** Only referenced from topics.R
  4 | 
  5 | ## check counts (can be an object from tm, slam, or a simple co-occurance matrix)
  6 | CheckCounts <- function(counts){
  7 |   if(class(counts)[1] == "TermDocumentMatrix"){ counts <- t(counts) }
  8 |   if(is.null(dimnames(counts)[[1]])){ dimnames(counts)[[1]] <- paste("doc",1:nrow(counts)) }
  9 |   if(is.null(dimnames(counts)[[2]])){ dimnames(counts)[[2]] <- paste("wrd",1:ncol(counts)) }
 10 |   empty <- row_sums(counts) == 0
 11 |   if(sum(empty) != 0){
 12 |     counts <- counts[!empty,]
 13 |     cat(paste("Removed", sum(empty), "blank documents.\n")) }
 14 |   return(as.simple_triplet_matrix(counts))
 15 | }
 16 |  
 17 | ## Topic estimation and selection for a list of K values
 18 | tpxSelect <- function(X, K, bf, initheta, alpha, tol, kill, verb,
 19 |                       admix=TRUE, grp=NULL, tmax=10000,
 20 |                       wtol=10^{-4}, qn=100, nonzero=FALSE, dcut=-10){
 21 | 
 22 |   ## check grp if simple mixture
 23 |   if(!admix){
 24 |     if(is.null(grp) || length(grp)!=nrow(X)){  grp <- rep(1,nrow(X)) }
 25 |     else{ grp <- factor(grp) }
 26 |   }
 27 | 
 28 |   ## return fit for single K
 29 |   if(length(K)==1 && bf==FALSE){
 30 |     if(verb){ cat(paste("Fitting the",K,"topic model.\n")) }
 31 |     fit <-  tpxfit(X=X, theta=initheta, alpha=alpha, tol=tol, verb=verb,
 32 |                    admix=admix, grp=grp, tmax=tmax, wtol=wtol, qn=qn)
 33 |     fit$D <- tpxResids(X=X, theta=fit$theta, omega=fit$omega, grp=grp, nonzero=nonzero)$D
 34 |     return(fit)
 35 |   }
 36 | 
 37 |   if(is.matrix(alpha)){ stop("Matrix alpha only works for fixed K") }
 38 |   
 39 |   if(verb){ cat(paste("Fit and Bayes Factor Estimation for K =",K[1]))
 40 |             if(length(K)>1){ cat(paste(" ...", max(K))) }
 41 |             cat("\n") }
 42 | 
 43 |   ## dimensions
 44 |   n <- nrow(X)
 45 |   p <- ncol(X)
 46 |   nK <- length(K)
 47 |     
 48 |   BF <- D <- NULL
 49 |   iter <- 0
 50 |   
 51 |   ## Null model log probability
 52 |   sx <- sum(X)
 53 |   qnull <- col_sums(X)/sx
 54 |   null <- sum( X$v*log(qnull[X$j]) ) - 0.5*(n+p)*(log(sx) - log(2*pi))
 55 |   
 56 |   ## allocate and initialize
 57 |   best <- -Inf
 58 |   bestfit <- NULL  
 59 |   
 60 |   ## loop over topic numbers
 61 |   for(i in 1:nK){
 62 |     
 63 |     ## Solve for map omega in NEF space
 64 |     fit <- tpxfit(X=X, theta=initheta, alpha=alpha, tol=tol, verb=verb,
 65 |                   admix=admix, grp=grp, tmax=tmax, wtol=wtol, qn=qn)
 66 |     
 67 |     BF <- c(BF, tpxML(X=X, theta=fit$theta, omega=fit$omega, alpha=fit$alpha, L=fit$L, dcut=dcut, admix=admix, grp=grp) - null)
 68 |     R <- tpxResids(X=X, theta=fit$theta, omega=fit$omega, grp=grp, nonzero=nonzero)
 69 |     D <- cbind(D, unlist(R$D))
 70 | 
 71 |     if(verb>0) cat(paste("log BF(", K[i], ") =", round(BF[i],2)))
 72 |     if(verb>1) cat(paste(" [ ", fit$iter,"steps, disp =",round(D[1,i],2)," ]\n")) else if(verb >0) cat("\n")
 73 |     
 74 |     if(is.nan(BF[i])){ 
 75 |       cat("NAN for Bayes factor.\n")
 76 |       return(bestfit)
 77 |       break} 
 78 |     
 79 |     if(BF[i] > best){ # check for a new "best" topic
 80 |       best <- BF[i]
 81 |       bestfit <- fit
 82 |     } else if(kill>0 && i>kill){ # break after kill consecutive drops
 83 |       if(prod(BF[i-0:(kill-1)] < BF[i-1:kill])==1) break }
 84 |     
 85 |     if(i<nK){
 86 |       if(!admix){ initheta <- tpxinit(X,2,K[i+1], alpha, 0) }
 87 |       else{ initheta <- tpxThetaStart(X, fit$theta, fit$omega, K[i+1]) }
 88 |     }
 89 |   }
 90 | 
 91 |   names(BF) <- dimnames(D)[[2]] <- paste(K[1:length(BF)]) 
 92 |  
 93 |   return(list(theta=bestfit$theta, omega=bestfit$omega, alpha=bestfit$alpha,
 94 |               BF=BF, D=D, K=K[which.max(BF)])) }
 95 | 
 96 | ## theta initialization
 97 | tpxinit <- function(X, initheta, K1, alpha, verb){
 98 | ## initheta can be matrix, or c(nK, tmax, tol, verb)
 99 |   
100 |   if(is.matrix(initheta)){
101 |     if(ncol(initheta)!=K1){ stop("mis-match between initheta and K.") }
102 |     if(prod(initheta>0) != 1){ stop("use probs > 0 for initheta.") }
103 |     return(normalize(initheta, byrow=FALSE)) }
104 | 
105 |   if(is.matrix(alpha)){
106 |     if(nrow(alpha)!=ncol(X) || ncol(alpha)!=K1){ stop("bad matrix alpha dimensions; check your K") }
107 |     return(normalize(alpha, byrow=FALSE)) }
108 | 
109 |   if(is.null(initheta)){ ilength <- K1-1 }
110 |   else{ ilength <- initheta[1] }
111 |   if(ilength < 1){ ilength <- 1 }
112 | 
113 |   ## set number of initial steps
114 |   if(length(initheta)>1){ tmax <- initheta[2] }
115 |   else{ tmax <- 3 }
116 |   ## set the tolerance
117 |   if(length(initheta)>2){ tol <- initheta[3] }
118 |   else{ tol <- 0.5 }
119 |   ## print option
120 |   if(length(initheta)>3){ verb <- initheta[4] }
121 |   else{ verb <- 0 }
122 |   
123 | 
124 |   if(verb){ cat("Building initial topics") 
125 |             if(verb > 1){ cat(" for K = ") }
126 |             else{ cat("... ") } }
127 |             
128 |   nK <- length( Kseq <-  unique(ceiling(seq(2,K1,length=ilength))) )
129 |   initheta <- tpxThetaStart(X, matrix(col_sums(X)/sum(X), ncol=1), matrix(rep(1,nrow(X))), 2)
130 | 
131 |   if(verb > 0)
132 |     { cat("\n")
133 |       print(list(Kseq=Kseq, tmax=tmax, tol=tol)) }
134 |     
135 |   ## loop over topic numbers
136 |   for(i in 1:nK){
137 | 
138 |     ## Solve for map omega in NEF space
139 |     fit <- tpxfit(X=X, theta=initheta, alpha=alpha, tol=tol, verb=verb,
140 |                   admix=TRUE, grp=NULL, tmax=tmax, wtol=-1, qn=-1)
141 |     if(verb>1){ cat(paste(Kseq[i],",", sep="")) }
142 | 
143 |     if(i<nK){ initheta <- tpxThetaStart(X, fit$theta, fit$omega, Kseq[i+1]) }
144 |     else{ initheta <- fit$theta }
145 |   }
146 |   if(verb){ cat("done.\n") }
147 |   return(initheta)
148 | }
149 |                
150 | ## ** main workhorse function.  Only Called by the above wrappers.
151 | ## topic estimation for a given number of topics (taken as ncol(theta))
152 | tpxfit <- function(X, theta, alpha, tol, verb,
153 |                    admix, grp, tmax, wtol, qn)
154 | {
155 |   ## inputs and dimensions
156 |   if(!inherits(X,"simple_triplet_matrix")){ stop("X needs to be a simple_triplet_matrix") }
157 |   K <- ncol(theta)
158 |   n <- nrow(X)
159 |   p <- ncol(X)
160 |   m <- row_sums(X)
161 |   if(is.null(alpha)){ alpha <- 1/(K*p) }
162 |   if(is.matrix(alpha)){ if(nrow(alpha)!=p || ncol(alpha)!=K){ stop("bad matrix alpha dimensions") }}
163 | 
164 |   ## recycle these in tpcweights to save time
165 |   xvo <- X$v[order(X$i)]
166 |   wrd <- X$j[order(X$i)]-1
167 |   doc <- c(0,cumsum(as.double(table(factor(X$i, levels=c(1:nrow(X)))))))
168 |   
169 |   ## Initialize
170 |   omega <- tpxweights(n=n, p=p, xvo=xvo, wrd=wrd, doc=doc, start=tpxOmegaStart(X,theta), theta=theta)
171 |   if(!admix){ omega <- matrix(apply(omega,2, function(w) tapply(w,grp,mean)), ncol=K) }
172 | 
173 |   ## tracking
174 |   iter <- 0
175 |   dif <- tol+1+qn
176 |   update <- TRUE
177 |   if(verb>0){
178 |     cat("log posterior increase: " )
179 |     digits <- max(1, -floor(log(tol, base=10))) }
180 |   
181 |   Y <- NULL # only used for qn > 0 
182 |   Q0 <- col_sums(X)/sum(X)
183 |   L <- tpxlpost(X=X, theta=theta, omega=omega, alpha=alpha, admix=admix, grp=grp) 
184 |  # if(is.infinite(L)){ L <- sum( (log(Q0)*col_sums(X))[Q0>0] ) }
185 |   
186 |   ## Iterate towards MAP
187 |   while( update  && iter < tmax ){ 
188 | 
189 |     ## sequential quadratic programming for conditional Y solution
190 |     if(admix && wtol > 0){ Wfit <- tpxweights(n=nrow(X), p=ncol(X), xvo=xvo, wrd=wrd, doc=doc,
191 |                                 start=omega, theta=theta,  verb=0, nef=TRUE, wtol=wtol, tmax=20) }
192 |     else{ Wfit <- omega }
193 | 
194 |     ## joint parameter EM update
195 |     move <- tpxEM(X=X, m=m, theta=theta, omega=Wfit, alpha=alpha, admix=admix, grp=grp)
196 |     
197 |     ## quasinewton-newton acceleration
198 |     QNup <- tpxQN(move=move, Y=Y, X=X, alpha=alpha, verb=verb, admix=admix, grp=grp, doqn=qn-dif)
199 |     move <- QNup$move
200 |     Y <- QNup$Y
201 |     
202 |     if(QNup$L < L){  # happens on bad Wfit, so fully reverse
203 |       if(verb > 10){ cat("_reversing a step_") }
204 |       move <- tpxEM(X=X, m=m, theta=theta, omega=omega, alpha=alpha, admix=admix, grp=grp)
205 |       QNup$L <-  tpxlpost(X=X, theta=move$theta, omega=move$omega, alpha=alpha, admix=admix, grp=grp) }
206 | 
207 |     ## calculate dif
208 |     dif <- (QNup$L-L)
209 |    
210 |     L <- QNup$L
211 |     
212 |         
213 |     ## check convergence
214 |     if(abs(dif) < tol){
215 |       if(sum(abs(theta-move$theta)) < tol){ update = FALSE } }
216 | 
217 |     ## print
218 |     if(verb>0 && (iter-1)%%ceiling(10/verb)==0 && iter>0){
219 |       cat( paste( round(dif,digits), #" (", sum(abs(theta-move$theta)),")",
220 |                  ", ", sep="") ) }
221 |     
222 |     ## heartbeat for long jobs
223 |     if(((iter+1)%%1000)==0){ 
224 |           cat(sprintf("p %d iter %d diff %g\n",
225 |                 nrow(theta), iter+1,round(dif))) }
226 | 
227 |     ## iterate
228 |     iter <- iter+1
229 |     theta <- move$theta
230 |     omega <- move$omega
231 |     
232 |   }
233 | 
234 |   ## final log posterior
235 |   L <- tpxlpost(X=X, theta=theta, omega=omega, alpha=alpha, admix=admix, grp=grp) 
236 | 
237 |   ## summary print
238 |   if(verb>0){
239 |     cat("done.")
240 |     if(verb>1) { cat(paste(" (L = ", round(L,digits), ")", sep="")) }
241 |     cat("\n")
242 |   }
243 |   
244 |   out <- list(theta=theta, omega=omega, K=K, alpha=alpha, L=L, iter=iter)
245 |   invisible(out) }
246 | 
247 |  
248 | ## ** called from topics.R (predict) and tpx.R
249 | ## Conditional solution for topic weights given theta
250 | tpxweights <- function(n, p, xvo, wrd, doc, start, theta, verb=FALSE, nef=TRUE, wtol=10^{-5}, tmax=1000)
251 | {
252 |   K <- ncol(theta)
253 |   start[start == 0] <- 0.1/K
254 |   start <- start/rowSums(start) 
255 |   omega <- .C("Romega",
256 |               n = as.integer(n),
257 |               p = as.integer(p),
258 |               K = as.integer(K),
259 |               doc = as.integer(doc),
260 |               wrd = as.integer(wrd),
261 |               X = as.double(xvo),
262 |               theta = as.double(theta),
263 |               W = as.double(t(start)),
264 |               nef = as.integer(nef),
265 |               tol = as.double(wtol),
266 |               tmax = as.integer(tmax),
267 |               verb = as.integer(verb),
268 |               PACKAGE="maptpx")
269 |   return(t(matrix(omega$W, nrow=ncol(theta), ncol=n))) }
270 | 
271 | ## ** Called only in tpx.R
272 | 
273 | ## single EM update. two versions: admix and mix
274 | tpxEM <- function(X, m, theta, omega, alpha, admix, grp)
275 | {
276 |   n <- nrow(X)
277 |   p <- ncol(X)
278 |   K <- ncol(theta)
279 | 
280 |   if(admix){ Xhat <- (X$v/tpxQ(theta=theta, omega=omega, doc=X$i, wrd=X$j))*(omega[X$i,]*theta[X$j,])
281 |              Zhat <- .C("Rzhat", n=as.integer(n), p=as.integer(p), K=as.integer(K), N=as.integer(nrow(Xhat)),
282 |                          Xhat=as.double(Xhat), doc=as.integer(X$i-1), wrd=as.integer(X$j-1),
283 |                         zj = as.double(rep(0,K*p)), zi = as.double(rep(0,K*n)), PACKAGE="maptpx")
284 |              theta <- normalize(matrix(Zhat$zj+alpha, ncol=K), byrow=FALSE)
285 |              omega <- normalize(matrix(Zhat$zi+1/K, ncol=K)) }
286 |   else{
287 |     qhat <- tpxMixQ(X, omega, theta, grp, qhat=TRUE)$qhat
288 |     ## EM update
289 |     theta <- normalize(tcrossprod_simple_triplet_matrix( t(X), t(qhat) ) + alpha, byrow=FALSE)
290 |     omega <- normalize(matrix(apply(qhat*m,2, function(x) tapply(x,grp,sum)), ncol=K)+1/K )  }
291 |     
292 |   return(list(theta=theta, omega=omega)) }
293 | 
294 | ## Quasi Newton update for q>0 
295 | tpxQN <- function(move, Y, X, alpha, verb, admix, grp, doqn)
296 | {
297 |   ## always check likelihood
298 |   L <- tpxlpost(X=X, theta=move$theta, omega=move$omega,
299 |                 alpha=alpha, admix=admix, grp=grp) 
300 | 
301 |   if(doqn < 0){ return(list(move=move, L=L, Y=Y)) }
302 | 
303 |   ## update Y accounting
304 |   Y <- cbind(Y, tpxToNEF(theta=move$theta, omega=move$omega))
305 |   if(ncol(Y) < 3){ return(list(Y=Y, move=move, L=L)) }
306 |   if(ncol(Y) > 3){ warning("mis-specification in quasi-newton update; please report this bug.") }
307 |   
308 |   ## Check quasinewton secant conditions and solve F(x) - x = 0.
309 |   U <- as.matrix(Y[,2]-Y[,1])
310 |   V <- as.matrix(Y[,3]-Y[,2])
311 |   sUU <- sum(U^2)
312 |   sVU <- sum(V*U)
313 |   Ynew <- Y[,3] + V*(sVU/(sUU-sVU)) 
314 |   qnup <- tpxFromNEF(Ynew, n=nrow(move$omega),
315 |                      p=nrow(move$theta), K=ncol(move$theta))
316 | 
317 |   ## check for a likelihood improvement
318 |   Lqnup <- try(tpxlpost(X=X, theta=qnup$theta, omega=qnup$omega,
319 |                         alpha=alpha, admix=admix, grp=grp), silent=TRUE)
320 |   
321 |   if(inherits(Lqnup, "try-error")){
322 |     if(verb>10){ cat("(QN: try error) ") }
323 |     return(list(Y=Y[,-1], move=move, L=L)) }
324 |   
325 |   if(verb>10){ cat(paste("(QN diff ", round(Lqnup-L,3), ")\n", sep="")) }
326 |   
327 |   if(Lqnup < L){
328 |     return(list(Y=Y[,-1], move=move, L=L)) }
329 |   else{
330 |     L <- Lqnup
331 |     Y <- cbind(Y[,2],Ynew)
332 |     return( list(Y=Y, move=qnup, L=L) )
333 |   }
334 | }
335 | 
336 |   
337 | ## unnormalized log posterior (objective function)
338 | tpxlpost <- function(X, theta, omega, alpha, admix=TRUE, grp=NULL)
339 | {
340 |   if(!inherits(X,"simple_triplet_matrix")){ stop("X needs to be a simple_triplet_matrix.") }
341 |   K <- ncol(theta)
342 | 
343 |   if(admix){ L <- sum( X$v*log(tpxQ(theta=theta, omega=omega, doc=X$i, wrd=X$j)) ) }
344 |   else{ L <- sum(tpxMixQ(X, omega, theta, grp)$lqlhd) }
345 |   if(is.null(nrow(alpha))){ if(alpha != 0){ L <- L + sum(alpha*log(theta))  } } # unnormalized prior
346 |   L <- L + sum(log(omega))/K 
347 |   
348 |   return(L) }
349 | 
350 | ## log marginal likelihood
351 | tpxML <- function(X, theta, omega, alpha, L, dcut, admix=TRUE, grp=NULL){
352 |   ## get the indices
353 |   K <- ncol(theta)
354 |   p <- nrow(theta)
355 |   n <- nrow(omega)
356 | 
357 |   ## return BIC for simple finite mixture model
358 |   if(!admix){
359 |     qhat <- tpxMixQ(X, omega, theta, grp, qhat=TRUE)$qhat
360 |     ML <- sum(X$v*log(row_sums(qhat[X$i,]*theta[X$j,])))
361 |     return( ML - 0.5*( K*p + (K-1)*n )*log(sum(X)) ) } 
362 | 
363 |   ML <- L  + lfactorial(K) # lhd multiplied by label switching modes
364 | 
365 |   ## block-diagonal approx to determinant of the negative log hessian matrix
366 |   q <- tpxQ(theta=theta, omega=omega, doc=X$i, wrd=X$j)
367 |   D <- tpxHnegDet(X=X, q=q, theta=theta, omega=omega, alpha=alpha)
368 | 
369 |   D[D < dcut] <- dcut
370 |   ML <- ML - 0.5*sum( D  )   # -1/2 |-H|
371 | 
372 |   ML <- ML + (K*p + sum(omega>0.01))*log(2*pi)/2  # (d/2)log(2pi)
373 |   if(is.null(nrow(alpha))){ # theta prior normalizing constant
374 |     ML <- ML + K*( lgamma(p*(alpha+1)) - p*lgamma(alpha+1) )  }
375 |   else{ ML <- ML + sum(lgamma(col_sums(alpha+1)) - col_sums(lgamma(alpha+1))) } # matrix version
376 |   ## omega(phi) prior normalizing constant number of parameters
377 |   ML <- ML +  sum(D[-(1:p)]>dcut)*( lfactorial(K) - K*lgamma( 1+1/K ) ) #
378 |   
379 |   return(ML) }
380 | 
381 | ## find residuals for X$v
382 | tpxResids <- function(X, theta, omega, grp=NULL, nonzero=TRUE)
383 | {
384 |   if(!inherits(X,"simple_triplet_matrix")){ stop("X needs to be a simple_triplet_matrix.") }
385 | 
386 |   m <- row_sums(X)
387 |   K <- ncol(theta)
388 |   n <- nrow(X)
389 |   phat <- sum(col_sums(X)>0)
390 |   d <- n*(K-1) + K*( phat-1 )
391 | 
392 |   if(nrow(omega) == nrow(X)){
393 |     qhat <- tpxQ(theta=theta, omega=omega, doc=X$i, wrd=X$j)
394 |     xhat <- qhat*m[X$i]
395 |   } else{
396 |     q <- tpxMixQ(X=X, omega=omega, theta=theta, grp=grp, qhat=TRUE)$qhat
397 |     qhat <- row_sums(q[X$i,]*theta[X$j,])
398 |     xhat <- qhat*m[X$i] }
399 | 
400 |   if(nonzero || nrow(omega) < nrow(X)){
401 |     ## Calculations based on nonzero counts
402 |     ## conditional adjusted residuals
403 |     e <- X$v^2 - 2*(X$v*xhat - xhat^2)
404 |     s <- qhat*m[X$i]*(1-qhat)^{1-m[X$i]}
405 |     r <- sqrt(e/s)
406 |     df <- length(r)*(1-d/(n*phat))
407 |     R <- sum(r^2) 
408 |   }
409 |   else{
410 |     ## full table calculations
411 |     e <- (X$v^2 - 2*X$v*m[X$i]*qhat)
412 |     s <- m[X$i]*qhat*(1-qhat)
413 |     fulltable <- .C("RcalcTau",
414 |                  n = as.integer(nrow(omega)),
415 |                  p = as.integer(nrow(theta)),
416 |                  K = as.integer(ncol(theta)),
417 |                  m = as.double(m),
418 |                  omega = as.double(omega),
419 |                  theta = as.double(theta),
420 |                  tau = double(1), size=double(1),
421 |                  PACKAGE="maptpx" )
422 |     tau <- fulltable$tau 
423 |     R <- sum(e/s) + tau
424 |     df <-  fulltable$size - phat  - d
425 |     r <- suppressWarnings(sqrt(e/s + tau))
426 |     r[is.nan(r)] <- 0 ## should not happen, but can theoretically
427 |   }
428 |   
429 |   ## collect and output
430 |   sig2 <- R/df
431 |   rho <- suppressWarnings(pchisq(R, df=df, lower.tail=FALSE))
432 |   D <- list(dispersion=sig2, pvalue=rho, df=df)
433 |   return( list(s=s, e=e, r=r, D=D) ) }
434 | 
435 |   
436 | ## fast initialization functions for theta (after increasing K) and omega (given theta)
437 | tpxThetaStart <- function(X, theta, omega, K)
438 |   {
439 |     R <- tpxResids(X, theta=theta, omega=omega, nonzero=TRUE) 
440 |     X$v <- R$e*(R$r>3) + 1/ncol(X)
441 |     Kpast <- ncol(theta)
442 |     Kdiff <- K-Kpast
443 |     if(Kpast != ncol(omega) || Kpast >= K){ stop("bad K in tpxThetaStart") }
444 |     initheta <- normalize(Kpast*theta+rowMeans(theta), byrow=FALSE)
445 |     n <- nrow(X)
446 |     ki <- matrix(1:(n-n%%Kdiff), ncol=Kdiff)
447 |     for(i in 1:Kdiff){ initheta <- cbind(initheta, (col_sums(X[ki[,i],])+1/ncol(X))/(sum(X[ki[,i],])+1)) }
448 |     return( initheta )
449 |   }
450 | 
451 | tpxOmegaStart <- function(X, theta)
452 |   {
453 |     if(!inherits(X,"simple_triplet_matrix")){ stop("X needs to be a simple_triplet_matrix.") }
454 |     omega <- try(tcrossprod_simple_triplet_matrix(X, solve(t(theta)%*%theta)%*%t(theta)), silent=TRUE )
455 |     if(inherits(omega,"try-error")){ return( matrix( 1/ncol(theta), nrow=nrow(X), ncol=ncol(theta) ) ) }
456 |     omega[omega <= 0] <- .5
457 |     return( normalize(omega, byrow=TRUE) )
458 |   }
459 | 
460 | 
461 | ## fast computation of sparse P(X) for X>0
462 | tpxQ <- function(theta, omega, doc, wrd){
463 | 
464 |   if(length(wrd)!=length(doc)){stop("index mis-match in tpxQ") }
465 |   if(ncol(omega)!=ncol(theta)){stop("theta/omega mis-match in tpxQ") }
466 |   
467 |   out <- .C("RcalcQ",
468 |             n = as.integer(nrow(omega)),
469 |             p = as.integer(nrow(theta)),
470 |             K = as.integer(ncol(theta)),
471 |             doc = as.integer(doc-1),
472 |             wrd = as.integer(wrd-1),
473 |             N = as.integer(length(wrd)),
474 |             omega = as.double(omega),
475 |             theta = as.double(theta),
476 |             q = double(length(wrd)),
477 |             PACKAGE="maptpx" )
478 | 
479 |   return( out$q ) }
480 | 
481 | ## model and component likelihoods for mixture model
482 | tpxMixQ <- function(X, omega, theta, grp=NULL, qhat=FALSE){
483 |   if(is.null(grp)){ grp <- rep(1, nrow(X)) }
484 |   K <- ncol(omega)
485 |   n <- nrow(X)
486 |   mixhat <- .C("RmixQ",
487 |                n = as.integer(nrow(X)),
488 |                p = as.integer(ncol(X)),
489 |                K = as.integer(K),
490 |                N = as.integer(length(X$v)),
491 |                B = as.integer(nrow(omega)),
492 |                cnt = as.double(X$v),
493 |                doc = as.integer(X$i-1),
494 |                wrd = as.integer(X$j-1),
495 |                grp = as.integer(as.numeric(grp)-1),
496 |                omega = as.double(omega),
497 |                theta = as.double(theta),
498 |                Q = double(K*n),
499 |                PACKAGE="maptpx")
500 |   ## model and component likelihoods
501 |   lQ <- matrix(mixhat$Q, ncol=K)
502 |   lqlhd <- log(row_sums(exp(lQ)))
503 |   lqlhd[is.infinite(lqlhd)] <- -600 # remove infs
504 |   if(qhat){
505 |     qhat <- exp(lQ-lqlhd)
506 |     ## deal with numerical overload
507 |     infq <- row_sums(qhat) < .999
508 |     if(sum(infq)>0){
509 |       qhat[infq,] <- 0
510 |       qhat[n*(apply(matrix(lQ[infq,],ncol=K),1,which.max)-1) + (1:n)[infq]] <- 1 }
511 |   }
512 |   return(list(lQ=lQ, lqlhd=lqlhd, qhat=qhat)) }
513 | 
514 | ## negative log hessian block diagonal matrix for theta & omega
515 | tpxHnegDet <- function(X, q, theta, omega, alpha){
516 |   K <- ncol(theta)
517 |   n <- nrow(omega)
518 | 
519 |   ## sparse Xij/Qij^2 
520 |   Xq <- X
521 |   Xq$v <- Xq$v/q^2
522 |   
523 |   ## negative 2nd derivitive matrices for theta
524 |   HT <- tcrossprod_simple_triplet_matrix(t(Xq), apply(omega, 1, function(v) v%o%v ) )
525 |   HT[,K*(0:(K-1))+1:K] <- HT[,K*(0:(K-1))+1:K] + alpha/theta^2 # will break for alpha<=1
526 |   DT <- apply(HT, 1, tpxlogdet)
527 | 
528 |   ## ditto for omega
529 |   HW <- matrix(.C("RnegHW",
530 |                   n = as.integer(nrow(omega)),
531 |                   p = as.integer(nrow(theta)),
532 |                   K = as.integer(K-1),
533 |                   omeg = as.double(omega[,-1]),
534 |                   thet = as.double(theta[,-1]),
535 |                   doc = as.integer(X$i-1),
536 |                   wrd = as.integer(X$j-1),
537 |                   cnt = as.double(X$v),
538 |                   q = as.double(q),
539 |                   N = as.integer(length(q)),
540 |                   H = double(n*(K-1)^2),
541 |                   PACKAGE="maptpx")$H,
542 |                nrow=(K-1)^2, ncol=n)
543 |   DW <- apply(HW, 2, tpxlogdet) 
544 |   return( c(DT,DW) )  }
545 | 
546 | ## functions to move theta/omega to and from NEF.  
547 | tpxToNEF <- function(theta, omega){
548 |   n <- nrow(omega)
549 |   p <- nrow(theta)
550 |   K <- ncol(omega)
551 |   return(.C("RtoNEF",
552 |             n=as.integer(n), p=as.integer(p), K=as.integer(K),
553 |             Y=double((p-1)*K + n*(K-1)),
554 |             theta=as.double(theta), tomega=as.double(t(omega)),
555 |             PACKAGE="maptpx")$Y)
556 | }
557 | 
558 | ## 'From' NEF representation back to probabilities
559 | tpxFromNEF <- function(Y, n, p, K){
560 |   bck <- .C("RfromNEF",
561 |             n=as.integer(n), p=as.integer(p), K=as.integer(K),
562 |             Y=as.double(Y), theta=double(K*p), tomega=double(K*n),
563 |             PACKAGE="maptpx")
564 |   return(list(omega=t( matrix(bck$tomega, nrow=K) ), theta=matrix(bck$theta, ncol=K)))
565 | }
566 | 
567 | ## utility log determinant function for speed/stabilty
568 | tpxlogdet <- function(v){
569 |     v <- matrix(v, ncol=sqrt(length(v)))
570 |     
571 |     if( sum(zeros <- colSums(v)==0)!=0 ){
572 |       cat("warning: boundary values in laplace approx\n")
573 |       v <- v[-zeros,-zeros] }
574 |    
575 |     return(determinant(v, logarithm=TRUE)$modulus) }
576 | 
577 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | maptpx
2 | ======
3 | 
4 | MAP estimation of topic models in R.  It implements estimation via posterior maximization for latent topic models in text analysis, as described in <a href="http://jmlr.csail.mit.edu/proceedings/papers/v22/taddy12/taddy12.pdf">"On Estimation and Selection for Topic Models"</a>.  These routines were previously part of <a href="http://www.cran.r-project.org/web/packages/textir/index.html">textir</a>, but have been spun off since version 2.0 of that package.  This maptpx package is no longer actively maintained; instead, we are focusing on developing faster distributed factor models based on <a href="http://arxiv.org/abs/1311.6139">distributed multinomial regression</a>, as implemented in the <a href="http://cran.r-project.org/web/packages/distrom/index.html">distrom</a> package.
5 | 
6 | If you want to take advantage of openmp parallelization, uncomment the relevant flags in src/MAKEVARS before compiling.
7 | 


--------------------------------------------------------------------------------
/inst/CITATION:
--------------------------------------------------------------------------------
 1 | citHeader("To cite use:")
 2 | 
 3 | citEntry(entry = "Inproceedings",
 4 |   title        = "On Estimation for Topic Models",
 5 |   author       = personList(as.person("Matt Taddy")),
 6 |   year         = "2012",
 7 |   booktitle    = "AISTATS",
 8 |   note          = "http://jmlr.csail.mit.edu/proceedings/papers/v22/taddy12/taddy12.pdf",
 9 | 
10 |   textVersion  =
11 |   paste("Matt Taddy","(AISTATS 2012)",
12 |         "On Estimation of Topic Models,",
13 |         "http://jmlr.csail.mit.edu/proceedings/papers/v22/taddy12/taddy12.pdf")
14 | 
15 | )


--------------------------------------------------------------------------------
/man/counts.Rd:
--------------------------------------------------------------------------------
 1 | \name{counts}
 2 | \alias{stm_tfidf}
 3 | \alias{normalize}
 4 | \title{Utilities for count matrices}
 5 | \description{
 6 | Tools for manipulating (sparse) count matrices.}
 7 | \usage{
 8 | normalize(x,byrow=TRUE)
 9 | stm_tfidf(x)
10 | }
11 | \arguments{
12 |   \item{x}{A \code{simple_triplet_matrix} or \code{matrix} of counts. }
13 |   \item{byrow}{Whether to normalize by row or column totals.}
14 | }
15 | \value{
16 |  \code{normalize} divides the counts by row or column totals, and  \code{stm_tfidf} returns a matrix with entries \eqn{x_{ij} \log[ n/(d_j+1) ]}, where \eqn{x_{ij}} is term-j frequency in document-i, 
17 | and \eqn{d_j} is the number of documents containing term-j.
18 | }
19 | \author{
20 |   Matt Taddy \email{mataddy@gmail.com}
21 | }
22 | \examples{
23 | normalize( matrix(1:9, ncol=3) )
24 | normalize( matrix(1:9, ncol=3), byrow=FALSE )
25 | 
26 | (x <- matrix(rbinom(15,size=2,prob=.25),ncol=3))
27 | stm_tfidf(x)
28 | }
29 | 


--------------------------------------------------------------------------------
/man/predict.topics.Rd:
--------------------------------------------------------------------------------
 1 | \name{predict.topics}
 2 | \alias{predict.topics}
 3 | \title{
 4 | topic predict
 5 | }
 6 | \description{ Predict function for Topic Models }
 7 | \usage{
 8 | \method{predict}{topics}( object, newcounts, loglhd=FALSE, ... )
 9 | }
10 | \arguments{
11 |   \item{object}{An output object from the \code{topics} function, or the corresponding matrix of estimated topics.}
12 |   \item{newcounts}{  An \code{nrow(object$theta)}-column matrix of multinomial phrase/category counts for new documents/observations. 
13 | Can be either a simple \code{matrix} or a \code{simple_triplet_matrix}. }
14 |   \item{loglhd}{ Whether or not to calculate and return \code{sum(x*log(p))}, 
15 | the un-normalized log likelihood. }
16 |   \item{...}{Additional arguments to the undocumented internal \code{tpx*} functions. }
17 | }
18 | \details{ Under the default mixed-membership topic model, this function uses sequential quadratic programming to fit topic weights \eqn{\Omega} for new documents.   
19 | Estimates for each new \eqn{\omega_i} are, conditional on \code{object$theta}, 
20 | MAP in the (K-1)-dimensional logit transformed parameter space.  }
21 | \value{ The output is an \code{nrow(newcounts)} by \code{object$K} matrix of document topic weights, or a list with including these weights as \code{W} and the log likelihood as \code{L}. }
22 | \references{
23 | Taddy (2012), \emph{On Estimation and Selection for Topic Models}.
24 | \url{http://arxiv.org/abs/1109.4518}
25 | }
26 | \author{
27 |   Matt Taddy \email{mataddy@gmail.com}
28 | }
29 | 
30 | \seealso{
31 |   topics, plot.topics, summary.topics, congress109
32 | }
33 | \examples{
34 | 
35 | ## Simulate some data
36 | omega <- t(rdir(500, rep(1/10,10)))
37 | theta <- rdir(10, rep(1/1000,1000))
38 | Q <- omega\%*\%t(theta)
39 | counts <- matrix(ncol=1000, nrow=500)
40 | totals <- rpois(500, 200)
41 | for(i in 1:500){ counts[i,] <- rmultinom(1, size=totals[i], prob=Q[i,]) }
42 | 
43 | ## predict omega given theta
44 | W <- predict.topics( theta, counts )
45 | plot(W, omega, pch=21, bg=8)
46 | 
47 | }
48 | 
49 | 
50 | 


--------------------------------------------------------------------------------
/man/rdir.Rd:
--------------------------------------------------------------------------------
 1 | \name{rdir}
 2 | \alias{rdir}
 3 | \title{Dirichlet RNG}
 4 | \description{
 5 | Generate random draws from a Dirichlet distribution}
 6 | \usage{
 7 | rdir(n, alpha)
 8 | }
 9 | \arguments{
10 |   \item{n}{ The number of observations. }
11 |   \item{alpha}{A \code{vector} of scale parameters, such that \eqn{E[p_j] = \alpha_j/\sum_i\alpha_i}. }
12 | }
13 | \value{
14 |  An \code{n} column matrix containing the observations.
15 | }
16 | \author{
17 |   Matt Taddy \email{mataddy@gmail.com}
18 | }
19 | \examples{
20 | rdir(3,rep(1,6))
21 |  }
22 | 


--------------------------------------------------------------------------------
/man/topicVar.Rd:
--------------------------------------------------------------------------------
 1 | \name{topicVar}
 2 | \alias{topicVar}
 3 | \alias{expit}
 4 | \alias{logit}
 5 | \title{
 6 | topic variance
 7 | }
 8 | \description{ Tools for looking at the variance of document-topic weights. }
 9 | \usage{
10 | topicVar(counts, theta, omega) 
11 | logit(prob)
12 | expit(eta)
13 | }
14 | \arguments{
15 |   \item{counts}{A matrix of multinomial response counts, as inputed to the \code{topics} or \code{predict.topics} functions.}
16 |   \item{theta}{A fitted topic matrix, as ouput from the \code{topics} or \code{predict.topics} functions.}
17 |   \item{omega}{A fitted document topic-weight matrix, as ouput from the \code{topics} or \code{predict.topics} functions.}
18 |   \item{prob}{A probability vector (positive and sums to one) or a matrix with probability vector rows.}
19 |   \item{eta}{A vector of the natural exponential family parameterization for a probability vector (with first category taken as null) or a matrix with each row the NEF parameters for a single observation.}
20 | }
21 | \details{ These function use the natural exponential family (NEF) parametrization of a probability vector \eqn{q_0 ... q_{K-1}} with the first element corresponding to a 'null' category; that is, with 
22 | 	  \eqn{NEF(q) = e_1 ... e_{K-1}} and setting \eqn{e_0 = 0}, the probabilities are
23 | \deqn{q_k = \frac{exp[e_k]}{1 + \sum exp[e_j]}.} 
24 | Refer to Taddy (2012) for details. }
25 | \value{ \code{topicVar} returns an array with dimensions \eqn{(K-1,K-1,n)}, where \code{K=ncol(omega)=ncol(theta)} and \code{n = nrow(counts) = nrow(omega)}, filled with the posterior covariance matrix for the NEF parametrization of each row of \code{omega}.  Utility \code{logit} performs the NEF transformation and \code{expit} reverses it.  }
26 | \references{
27 | Taddy (2012), \emph{On Estimation and Selection for Topic Models}.
28 | \url{http://arxiv.org/abs/1109.4518}
29 | }
30 | \author{
31 |   Matt Taddy \email{mataddy@gmail.com}
32 | }
33 | 
34 | \seealso{
35 |   topics, predict.topics
36 | }
37 | 


--------------------------------------------------------------------------------
/man/topics.Rd:
--------------------------------------------------------------------------------
  1 | \name{topics}
  2 | \alias{topics}
  3 | \title{
  4 | Estimation for Topic Models
  5 | }
  6 | \description{ MAP estimation of Topic models  }
  7 | \usage{
  8 | topics(counts, K, shape=NULL, initopics=NULL, 
  9 |   tol=0.1, bf=FALSE, kill=2, ord=TRUE, verb=1, ...)
 10 | }
 11 | \arguments{
 12 |   \item{counts}{
 13 |     A matrix of multinomial response counts in \code{ncol(counts)} phrases/categories 
 14 |     for \code{nrow(counts)} documents/observations. Can be either a simple \code{matrix} or a \code{simple_triplet_matrix}.}
 15 |   \item{K}{The number of latent topics.  If \code{length(K)>1}, 
 16 |   \code{topics} will find the Bayes factor (vs a null single topic model) for each element and return parameter
 17 |   estimates for the highest probability K.}
 18 |   \item{shape}{ Optional argument to specify the Dirichlet prior concentration parameter as \code{shape} for topic-phrase probabilities.  Defaults to \code{1/(K*ncol(counts))}.  For fixed single \code{K}, this can also be a \code{ncol(counts)} 
 19 |   		by \code{K} matrix of unique shapes for each topic element.}
 20 |   \item{initopics}{ Optional start-location for \eqn{[\theta_1 ... \theta_K]}, the topic-phrase probabilities.  
 21 |   		    Dimensions must accord with the smallest element of \code{K}.  
 22 |   		    If \code{NULL}, the initial estimates are built by incrementally adding topics. }
 23 |   \item{tol}{ Convergence tolerance: optimization stops, conditional on some extra checks, when the \emph{absolute} posterior increase over a full paramater set update is less than \code{tol}.}
 24 |   \item{bf}{An indicator for whether or not to calculate the Bayes factor for univariate \code{K}.  
 25 |   If \code{length(K)>1}, this is ignored and Bayes factors are always calculated. }
 26 |   \item{kill}{For choosing from multiple \code{K} numbers of topics (evaluated in increasing order),
 27 |    the search will stop after \code{kill} consecutive drops in the corresponding Bayes factor.  Specify \code{kill=0}
 28 |    if you want Bayes factors for all elements of \code{K}. }
 29 |  \item{ord}{If \code{TRUE}, the returned topics (columns of \code{theta}) will be ordered by decreasing usage (i.e., 
 30 |  by decreasing \code{colSums(omega)}).}
 31 |   \item{verb}{A switch for controlling printed output.  \code{verb > 0} will print something, with the level of detail
 32 |   increasing with \code{verb}.}
 33 |   \item{...}{Additional arguments to the undocumented internal \code{tpx*} functions. }
 34 |  }
 35 | \details{A latent topic model represents each i'th document's term-count vector \eqn{X_i} 
 36 | (with \eqn{\sum_{j} x_{ij} = m_i} total phrase count)
 37 | as having been drawn from a mixture of \code{K} multinomials, each parameterized by topic-phrase
 38 | probabilities \eqn{\theta_i}, such that 
 39 | \deqn{X_i \sim MN(m_i, \omega_1 \theta_1 + ... + \omega_K\theta_K).}
 40 | We assign a K-dimensional Dirichlet(1/K) prior to each document's topic weights 
 41 | \eqn{[\omega_{i1}...\omega_{iK}]}, and the prior on each \eqn{\theta_k} is Dirichlet with concentration \eqn{\alpha}.
 42 | The \code{topics} function uses quasi-newton accelerated EM, augmented with sequential quadratic programming 
 43 | for conditional \eqn{\Omega | \Theta} updates, to obtain MAP estimates for the topic model parameters. 
 44 | We also provide Bayes factor estimation, from marginal likelihood
 45 | calculations based on a Laplace approximation around the converged MAP parameter estimates.  If  input \code{length(K)>1}, these
 46 | Bayes factors are used for model selection. Full details are in Taddy (2011). }
 47 | \note{ Estimates are actually functions of the MAP (K-1 or p-1)-dimensional 
 48 | logit transformed natural exponential family parameters.  }
 49 | \value{
 50 |  An \code{topics} object list with entries
 51 | \item{K}{The number of latent topics estimated.  If input \code{length(K)>1}, 
 52 | on output this is a single value corresponding to the model with the highest Bayes factor. }
 53 | \item{theta}{The \code{ncol{counts}} by \code{K} matrix of estimated topic-phrase probabilities.}
 54 | \item{omega}{The \code{nrow{counts}} by \code{K} matrix of estimated document-topic weights. }
 55 | \item{BF}{The log Bayes factor for each number of topics in the input \code{K}, against a null single topic model.}
 56 | \item{D}{Residual dispersion: for each element of \code{K}, estimated dispersion parameter
 57 |  (which should be near one for the multinomial), degrees of freedom, and p-value for a test of whether the true dispersion is \eqn{>1}.}
 58 | \item{X}{The input count matrix, in \code{dgTMatrix} format.}
 59 | }
 60 | \references{
 61 | Taddy (2012), \emph{On Estimation and Selection for Topic Models}.
 62 | \url{http://arxiv.org/abs/1109.4518}
 63 | }
 64 | \author{
 65 |   Matt Taddy \email{mataddy@gmail.com}
 66 | }
 67 | \seealso{
 68 |   plot.topics, summary.topics, predict.topics, wsjibm, congress109, we8there
 69 | }
 70 | \examples{
 71 | 
 72 | ## Simulation Parameters
 73 | K <- 10
 74 | n <- 100
 75 | p <- 100
 76 | omega <- t(rdir(n, rep(1/K,K)))
 77 | theta <- rdir(K, rep(1/p,p))
 78 | 
 79 | ## Simulated counts
 80 | Q <- omega\%*\%t(theta)
 81 | counts <- matrix(ncol=p, nrow=n)
 82 | totals <- rpois(n, 100)
 83 | for(i in 1:n){ counts[i,] <- rmultinom(1, size=totals[i], prob=Q[i,]) }
 84 | 
 85 | ## Bayes Factor model selection (should choose K or nearby)
 86 | summary(simselect <- topics(counts, K=K+c(-5:5)), nwrd=0)
 87 | 
 88 | ## MAP fit for given K
 89 | summary( simfit <- topics(counts,  K=K, verb=2), n=0 )
 90 | 
 91 | ## Adjust for label switching and plot the fit (color by topic)
 92 | toplab <- rep(0,K)
 93 | for(k in 1:K){ toplab[k] <- which.min(colSums(abs(simfit$theta-theta[,k]))) }
 94 | par(mfrow=c(1,2))
 95 | tpxcols <- matrix(rainbow(K), ncol=ncol(theta), byrow=TRUE)
 96 | plot(theta,simfit$theta[,toplab], ylab="fitted values", pch=21, bg=tpxcols)
 97 | plot(omega,simfit$omega[,toplab], ylab="fitted values", pch=21, bg=tpxcols)
 98 | title("True vs Fitted Values (color by topic)", outer=TRUE, line=-2)
 99 | 
100 | ## The S3 method plot functions
101 | par(mfrow=c(1,2))
102 | plot(simfit, lgd.K=2)
103 | plot(simfit, type="resid")
104 | 
105 | }
106 | 


--------------------------------------------------------------------------------
/src/Makevars:
--------------------------------------------------------------------------------
1 | ## uncomment the flags below for openmp
2 | PKG_CFLAGS = -DRPRINT #-fopenmp
3 | PKG_LIBS =  ${LAPACK_LIBS} ${BLAS_LIBS} ${FLIBS} #-lgomp
4 | 


--------------------------------------------------------------------------------
/src/init.c:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h> // for NULL
 2 | #include <R_ext/Rdynload.h>
 3 | 
 4 | /* .C calls */
 5 | extern void Romega( void *, void *, void *, void *, void *, void *,
 6 | 		void *, void *, void *, void *, void *, void *);
 7 | extern void RmixQ(void *, void *, void *, void *, void *, void *,
 8 | 		void *, void *, void *, void *, void *, void *);
 9 | extern void RcalcQ(void *, void *, void *, void *, void *, void *,
10 | 		void *, void *, void *);
11 | extern void RcalcTau(void *, void *, void *, void *, void *, void *,
12 | 		void *, void * ); 
13 | extern void RnegHW( void *, void *, void *, void *, void *, void *,
14 | 		void *, void *, void *, void *, void *);
15 | extern void Rlogit( void *, void *, void *);
16 | extern void Rexpit( void *, void *, void *);
17 | extern void RtoNEF( void *, void *, void *, void *, void *, void *);
18 | extern void RfromNEF( void *, void *, void *, void *, void *, void *);
19 | extern void Rzhat( void *, void *, void *, void *, void *, void *,
20 | 		void *, void *, void *);
21 | 
22 | static const R_CMethodDef CEntries[] = {
23 |     {"Romega",         (DL_FUNC) &Romega,        12},
24 |     {"RmixQ",          (DL_FUNC) &RmixQ,         12},
25 |     {"RcalcQ",         (DL_FUNC) &RcalcQ,       9},
26 |     {"RcalcTau",       (DL_FUNC) &RcalcTau,     8},
27 |     {"RnegHW",       (DL_FUNC) &RnegHW,     11},
28 |     {"Rlogit",       (DL_FUNC) &Rlogit,     3},
29 |     {"Rexpit",       (DL_FUNC) &Rexpit,     3},
30 |     {"RtoNEF",       (DL_FUNC) &RtoNEF,     6},
31 |     {"RfromNEF",       (DL_FUNC) &RfromNEF,     6},
32 |     {"Rzhat",       (DL_FUNC) &Rzhat,     9},
33 |     {NULL, NULL, 0}
34 | };
35 | 
36 | void R_init_maptpx(DllInfo *dll)
37 | {
38 |     R_registerRoutines(dll, CEntries, NULL, NULL, NULL);
39 |     R_useDynamicSymbols(dll, FALSE);
40 | }
41 | 


--------------------------------------------------------------------------------
/src/latools.c:
--------------------------------------------------------------------------------
  1 | #include "latools.h"
  2 | #include "rhelp.h"
  3 | #include <assert.h>
  4 | #include <math.h>
  5 | #include <stdio.h>
  6 | #include <stdlib.h>
  7 | #include <stdarg.h>
  8 | 
  9 |  
 10 | /*
 11 |  * la_dgemm:  blas wrapper
 12 |  *
 13 |  * C := alpha*op(A)*op(B) +  beta*C.
 14 |  */
 15 | 
 16 | void la_dgemm(int tA, int tB, int Arow, int Acol, int Brow, int Bcol, int Crow, int Ccol,
 17 | 		  double *A, double *B, double *C, double alpha, double beta)
 18 | {
 19 | 	char opA, opB;
 20 | 	int m, n, k, lda, ldb, ldc;
 21 | 
 22 | 	if(tA){ opA = 'T'; m = Acol; k = Arow; lda = k; }
 23 | 	else{ opA = 'N'; m = Arow; k = Acol; lda = m; }
 24 | 	
 25 | 	if(tB){ opB = 'T'; n = Brow; assert(Bcol==k); ldb = n; }
 26 | 	else{ opB = 'N'; n = Bcol; assert(Brow==k); ldb = k; }
 27 | 
 28 | 	ldc = m; assert(Crow==m); assert(Ccol==n);
 29 | 
 30 | 	dgemm_(&opA,&opB,&m,&n,&k,&alpha,A,&lda,B,&ldb,&beta,C,&ldc);
 31 | }
 32 | 
 33 | 
 34 | /*
 35 |  * la_dsymm:  blas wrapper
 36 |  *
 37 |  * C := alpha*A*B + beta*C or alpha*B*A + beta*C, where A is symmetric
 38 |  */
 39 | 
 40 | 
 41 | void la_dsymm(int Alhs, int Arow, int Acol, int Brow, int Bcol, int Crow, int Ccol,
 42 | 		  double *A, double *B, double *C, double alpha, double beta)
 43 | {
 44 |   int m, n, lda, ldb, ldc;
 45 |   char Atri = 'L';  // reference the lower triangle of column major A. 
 46 |   char Aside;
 47 | 
 48 |   assert(Arow=Acol); 
 49 |   
 50 |   m = Crow; n = Ccol; lda = Arow; ldb = Brow; ldc = Crow;
 51 |   assert(ldb == m);
 52 | 
 53 |   if(Alhs){  Aside='L';  assert(lda == m); }
 54 |   else{  Aside='R';  assert(lda == n); }
 55 |   
 56 |   dsymm_(&Aside,&Atri,&m,&n,&alpha,A,&lda,B,&ldb,&beta,C,&ldc);
 57 | }
 58 | 
 59 | 
 60 | /*
 61 |  * la_dposv:  lapack wrapper
 62 |  *
 63 |  * Solves the system A*X = B, where A is symmetric AND positive definite.
 64 |  * Upon return, B is the solution and A contains its cholesky decomp.
 65 |  */
 66 | 
 67 | int la_dposv(int Arow, int Bcol, double *A, double *B)
 68 | {
 69 |   char cholTri = 'L'; // return cholesky decomp of col-major A in lower triangle
 70 |   int info;
 71 |   dposv_(&cholTri, &Arow, &Bcol, A, &Arow, B, &Arow, &info);
 72 |   return info; // if info = -i, i'th arg is wrong.  if info > A is not pos-def.
 73 | }
 74 | 
 75 | 
 76 | /*
 77 |  * la_dgesv:  lapack wrapper
 78 |  *
 79 |  * Solves the system A*X = B, where A is a general square matrix.
 80 |  * Upon return, B is the solution and A contains L and U from 
 81 |  * the decomposition A = P * L * U (unit diag for L are not returned)
 82 |  */
 83 | 
 84 | int la_dgesv(int Arow, int Bcol, double *A, double *B)
 85 | {
 86 |   int info;
 87 |   int *ip = new_ivec(Arow);  /* pivot indices which define P; 
 88 | 			      row i was interchanged with row ip[i]*/
 89 |   dgesv_(&Arow, &Bcol, A, &Arow, ip, B, &Arow, &info);
 90 |   return info; // if info = -i, i'th arg is wrong.  if info > 0, A is not pos-def.
 91 | }
 92 | 
 93 | 
 94 | /*
 95 |  * la_dsysv:  lapack wrapper
 96 |  *
 97 |  * Solves the system A*X = B, where A is a symmetric matrix.
 98 |  * Upon return, B is block diagonal matrix D and the
 99 |  * multipliers used to obtain the factor U or L from the
100 |  * factorization A = U*D*U**T or A = L*D*L**T as computed by DSYTRF.
101 |  */
102 | 
103 | int la_dsysv(int Arow, int Bcol, double *A, double *B)
104 | {
105 |   int info;
106 |   double wkopt;
107 |   int lwork = -1;
108 |   int *ipiv = new_ivec(Arow);  
109 |   char uplo = 'L';
110 | 
111 |   dsysv_(&uplo, &Arow, &Bcol, A, &Arow, ipiv, B, &Arow, &wkopt, &lwork, &info);
112 | 
113 |   lwork = (int) wkopt;
114 |   double *work = new_dvec(lwork);
115 | 
116 |   dsysv_(&uplo, &Arow, &Bcol, A, &Arow, ipiv, B, &Arow, work, &lwork, &info);
117 | 
118 |   free(work);
119 |   free(ipiv);
120 |   return info;
121 | }
122 | 
123 | 
124 | /*
125 |  * la_dpotrf:  lapack wrapper
126 |  *
127 |  * Replaces col-major A with the lower triangle cholesky L s.t. A = LL'
128 |  * Upper triangle is untouched.
129 |  * 
130 |  */
131 | 
132 | int la_dpotrf(int dim, double *A)
133 | {
134 |   int info;
135 |   char cholTri = 'L'; // return cholesky decomp of col-major A in lower triangle
136 |   dpotrf_(&cholTri,&dim,A,&dim,&info); 
137 |   return info; // if info = -i, i'th arg is wrong.  if info > 0, A is not pos-def.
138 | }
139 | 
140 | 
141 | /*
142 |  * I'll take one brand-new ncxnr array of doubles.
143 |  *
144 |  */
145 | 
146 | double** new_mat(int nr, int nc){
147 |   
148 |   if(nr == 0 || nc == 0) return NULL;
149 |   int j;
150 |   double **mat = (double**) malloc(sizeof(double*) * (unsigned) nc);
151 |   assert(mat);
152 |   mat[0] = (double*) malloc(sizeof(double) *  (unsigned) (nc*nr));
153 |   assert(mat[0]);
154 |   
155 |   for(j=1; j<nc; j++) mat[j] = mat[j-1] +  (unsigned) nr;
156 |   
157 |   return mat;
158 | }
159 | 
160 | 
161 | double** new_mat_fromv(int nr, int nc, double *v){
162 |   int i, j;
163 |   double **mat = new_mat(nr, nc);
164 |   for(j=0; j<nc; j++) for(i=0; i<nr; i++) mat[j][i] = v[j*nr + i];
165 |   
166 |   return mat;
167 | }
168 | 
169 | 
170 | /*
171 |  * and for ints
172 |  *
173 |  */
174 | 
175 | int** new_imat(int nr, int nc){
176 |   
177 |   if(nr == 0 || nc == 0) return NULL;
178 |   int j;
179 |   int **mat = (int**) malloc(sizeof(int*) * (unsigned) nc);
180 |   assert(mat);
181 |   mat[0] = (int*) malloc(sizeof(int) *  (unsigned) (nc*nr));
182 |   assert(mat[0]);
183 |   
184 |   for(j=1; j<nc; j++) mat[j] = mat[j-1] +  (unsigned) nr;
185 |   
186 |   return mat;
187 | }
188 | 
189 | 
190 | int** new_imat_fromv(int nr, int nc, int *v){
191 |   int i, j;
192 |   int **mat = new_imat(nr, nc);
193 |   for(j=0; j<nc; j++) for(i=0; i<nr; i++) mat[j][i] = v[j*nr + i];
194 |   
195 |   return mat;
196 | }
197 | 
198 | /*
199 |  * Gimme a new ncxnr array full of zeros.
200 |  *
201 |  */
202 | 
203 | double** new_zero_mat(int nr, int nc){
204 |   int i, j;
205 |   double **mat = new_mat(nr, nc);
206 |   for(j=0; j<nc; j++) for(i=0; i<nr; i++) mat[j][i] = 0.0;
207 |   return mat;
208 | }
209 | 
210 | 
211 | /*
212 |  * Gimme a new ncxnr array full of val
213 |  *
214 |  */
215 | 
216 | double** new_val_mat(int nr, int nc, double val){
217 |   int i, j;
218 |   double **mat = new_mat(nr, nc);
219 |   for(j=0; j<nc; j++) for(i=0; i<nr; i++) mat[j][i] = val;
220 |   return mat;
221 | }
222 | 
223 | /*
224 |  * How about a new dimxdim ID array?
225 |  *
226 |  */
227 | 
228 | double** new_id_mat(int dim){
229 |   int j;
230 |   double **mat = new_zero_mat(dim, dim);
231 |   for(j=0; j<dim; j++) mat[j][j] = 1.0;
232 |   return mat;
233 | }
234 | 
235 | /*
236 |  * I'll have the same.
237 |  *
238 |  */
239 | 
240 | double** new_dup_mat(int nr, int nc, double **orig){
241 |   int i, j;
242 |   double **mat = new_mat(nr, nc);
243 |   for(j=0; j<nc; j++) for(i=0; i<nr; i++) mat[j][i] = orig[j][i];
244 |   return mat;
245 | }
246 | 
247 | 
248 | /*
249 |  * That 2d array I created above?  I don't need it any more.
250 |  * 
251 |  */
252 | 
253 | 
254 | void delete_mat(double **mat){
255 |  if(mat == NULL) return;
256 |   assert(*mat);
257 |   free(*mat);
258 |   assert(mat);
259 |   free(mat);
260 | }
261 | 
262 | 
263 | void delete_imat(int **mat){
264 |  if(mat == NULL) return;
265 |   assert(*mat);
266 |   free(*mat);
267 |   assert(mat);
268 |   free(mat);
269 | }
270 | 
271 | 
272 | 
273 | void zero_mat(double **mat, int nr, int nc){
274 |   int i, j;
275 |   for(j=0; j<nc; j++) for(i=0; i<nr; i++) mat[j][i] = 0.0;
276 | }
277 | 
278 | 
279 | void id_mat(double **mat, int dim){
280 |   int j;
281 |   zero_mat(mat, dim, dim);
282 |   for(j=0; j<dim; j++) mat[j][j] = 1.0;
283 | }
284 | 
285 | void diag_mat(double **mat, double *v, int dim){
286 |   int j;
287 |   zero_mat(mat, dim, dim);
288 |   for(j=0; j<dim; j++) mat[j][j] = v[j];
289 | }
290 | 
291 | /*
292 |  * copy orig to mat
293 |  */
294 | 
295 | void copy_mat(int nr, int nc, double** mat, double** orig)
296 | {
297 |   int i, j;
298 |   for(j=0; j<nc; j++) for(i=0; i<nr; i++) mat[j][i] = orig[j][i];
299 | }
300 | 
301 | 
302 | /*
303 |  * print an n x col matrix allocated as above out an opened outfile.
304 |  * actually, this routine can print any double** 
305 |  */
306 | 
307 | void print_mat(int nr, int nc, double **mat, FILE *outfile)
308 | {
309 |   int i,j;
310 |   for(i=0; i<nr; i++) {
311 |     for(j=0; j<nc; j++) {
312 |       if(j==nc-1) myprintf(outfile, "%g\n", mat[j][i]);
313 |       else myprintf(outfile, "%g ", mat[j][i]);
314 |     }
315 |   }
316 | }
317 | 
318 | void print_imat(int nr, int nc, int **mat, FILE *outfile)
319 | {
320 |   int i,j;
321 |   for(i=0; i<nr; i++) {
322 |     for(j=0; j<nc; j++) {
323 |       if(j==nc-1) myprintf(outfile, "%d\n", mat[j][i]);
324 |       else myprintf(outfile, "%d ", mat[j][i]);
325 |     }
326 |   }
327 | }
328 | 
329 | 
330 | /************* double vec tools ********/
331 | 
332 | /*
333 |  * new vec of integers of length n
334 |  */
335 | 
336 | double *new_dvec(int n)
337 | {
338 |   double *v;
339 |   if(n == 0) return NULL;
340 |   v = (double*)  malloc(sizeof(double) * (unsigned) n);
341 |   assert(v);
342 |   return v;
343 | }
344 | 
345 | 
346 | /*
347 |  * allocate and return an array containing
348 |  * the double n-seqence [from...to]
349 |  */
350 | 
351 | double* new_dseq(double from, double to, int n)
352 | {
353 |   int i;
354 |   
355 |   assert( from <= to);
356 |   double *v = new_dvec(n);
357 |   v[0] = from;
358 |   double by = (to-from)/((double) n-1);
359 |   for(i=1; i<n; i++) v[i] = v[i-1] + by;
360 |   return v;
361 | }
362 | 
363 | 
364 | /*
365 |  * allocates a new int array of size n
366 |  * and fills it with zeros
367 |  */
368 | 
369 | double* new_dzero(int n)
370 | {
371 |   int i;
372 |   double *v = new_dvec(n);
373 |   for(i=0; i<n; i++) v[i] = 0;
374 |   return v;
375 | }
376 | 
377 | /*
378 |  * allocate a new double vec of length n and copy the integer
379 |  * contents of dv into it
380 |  */
381 | 
382 | double* new_dup_dvec(double *v, int n)
383 | {
384 |   double* dv_new = new_dvec(n);
385 |   copy_dvec(dv_new, v, n);
386 |   return dv_new;
387 | }
388 | 
389 | 
390 | /* 
391 |  * sum:
392 |  *
393 |  * return the sum of the contents of the dvec
394 |  */
395 | 
396 | double sum_dvec(double *v, int n)
397 | {
398 |   int i;
399 |   if(n==0) return 0.0;
400 |   assert(v);
401 |   double s = 0.0;
402 |   for(i=0; i<n; i++) s += v[i];
403 |   return(s);
404 | }
405 | 
406 | 
407 | /*
408 |  * zeros out v
409 |  * 
410 |  */
411 | 
412 | void zero_dvec(double *v, int n)
413 | {
414 |   int i;
415 |   for(i=0; i<n; i++) v[i] = 0;
416 | }
417 | 
418 | 
419 | /*
420 |  * copy orig into v
421 |  */
422 | 
423 | void copy_dvec(double *v, double *orig, int n)
424 | {
425 |   int i;
426 |   if(n > 0) assert(v && orig);
427 |   for(i=0; i<n; i++) v[i] = orig[i];
428 | }
429 | 
430 | 
431 | /*
432 |  * printing an double vector out to outfile
433 |  */
434 | 
435 | void print_dvec(double *v, int n, FILE *outfile)
436 | {
437 |   int i;
438 |   for(i=0; i<n; i++) myprintf(outfile, "%g ", v[i]);
439 |   myprintf(outfile, "\n");
440 | }
441 | 
442 | 
443 | double* drep(double val, int n)
444 | {
445 |   int i;
446 |   double *v = new_dvec(n);
447 |   for(i=0; i<n; i++) v[i] = val;
448 |   return v;
449 | }
450 | 
451 | double normalize(double *v, int n){
452 |   int i;
453 |   double vsum = 0.0;
454 |   for(i=0; i<n; i++) vsum += v[i]; 
455 |   if(vsum == 0.0)
456 |     { for(i=0; i<n; i++) v[i] = 1.0/((double) n);
457 |       return 0.0; }
458 |   for(i=0; i<n; i++)
459 |     v[i] = v[i]/vsum;
460 |   return vsum;
461 | }
462 | /*****integer vector tools******/
463 | 
464 | 
465 | 
466 | 
467 | /*
468 |  * new vec of integers of length n
469 |  */
470 | 
471 | int *new_ivec(int n)
472 | {
473 |   int *iv;
474 |   if(n == 0) return NULL;
475 |   iv = (int*)  malloc(sizeof(int) * (unsigned) n);
476 |   assert(iv);
477 |   return iv;
478 | }
479 | 
480 | 
481 | /*
482 |  * allocate and return an array containing
483 |  * the integer seqence [from...to]
484 |  */
485 | 
486 | int* new_iseq(int from, int to)
487 | {
488 |   int n,i;
489 |   
490 |   assert( from <= to);
491 |   n =  (to - from) + 1;    
492 |   int *v = new_ivec(n);
493 |   v[0] = from;
494 |   for(i=1; i<n; i++) v[i] = v[i-1] + 1;
495 |   return v;
496 | }
497 | 
498 | 
499 | 
500 | 
501 | /*
502 |  * allocates a new int array of size n
503 |  * and fills it with zeros
504 |  */
505 | 
506 | int* new_izero(int n)
507 | {
508 |   int i;
509 |   int *v = new_ivec(n);
510 |   for(i=0; i<n; i++) v[i] = 0;
511 |   return v;
512 | }
513 | 
514 | /*
515 |  * allocate a new integer vec of length n and copy the integer
516 |  * contents of iv into it
517 |  */
518 | 
519 | int* new_dup_ivec(int *v, int n)
520 | {
521 |   int* iv_new = new_ivec(n);
522 |   copy_ivec(iv_new, v, n);
523 |   return iv_new;
524 | }
525 | 
526 | 
527 | /* 
528 |  * sumiv:
529 |  *
530 |  * return the sum of the contents of the ivec
531 |  */
532 | 
533 | int sum_ivec(int *v, int n)
534 | {
535 |   int i;
536 |   if(n==0) return 0;
537 |   assert(v);
538 |   int s = 0;
539 |   for(i=0; i<n; i++) s += v[i];
540 |   return(s);
541 | }
542 | 
543 | 
544 | /*
545 |  * zeros out v
546 |  * 
547 |  */
548 | 
549 | void zero_ivec(int *v, int n)
550 | {
551 |   int i;
552 |   for(i=0; i<n; i++) v[i] = 0;
553 | }
554 | 
555 | 
556 | /*
557 |  * copy orig into v
558 |  */
559 | 
560 | void copy_ivec(int *v, int *orig, int n)
561 | {
562 |   int i;
563 |   if(n > 0) assert(v && orig);
564 |   for(i=0; i<n; i++) v[i] = orig[i];
565 | }
566 | 
567 | 
568 | /*
569 |  * printing an integer vector out to outfile
570 |  */
571 | 
572 | void print_ivec(int *v, int n, FILE *outfile)
573 | {
574 |   int i;
575 |   for(i=0; i<n; i++) myprintf(outfile, "%d ", v[i]);
576 |   myprintf(outfile, "\n");
577 | }
578 | 
579 | 
580 | int* irep(int val, int n)
581 | {
582 |   int i;
583 |   int *v = new_ivec(n);
584 |   for(i=0; i<n; i++) v[i] = val;
585 |   return v;
586 | }
587 | 
588 | 
589 | 


--------------------------------------------------------------------------------
/src/latools.h:
--------------------------------------------------------------------------------
 1 | /*  Linear algebra tools (wrappers for lapack and blas) 
 2 |  *  and some basic array tools.
 3 |  *  Everything is COLUMN MAJOR for communication with fortran.
 4 |  *  
 5 |  */
 6 | 
 7 | #ifndef __LATOOLS_H__
 8 | #define __LATOOLS_H__ 
 9 | 
10 | #include <stdio.h>
11 | #include <stdlib.h>
12 | 
13 | /* These will all be defined if you're compiling with R */
14 | 
15 | // fns from lapack
16 | extern void dpotrf_(char*, int*, double*, int*, int*);
17 | extern void dposv_(char*, int*, int*, double*, int*, double*, int*, int*);
18 | extern void dgesv_(int*,int*,double*,int*,int*,double*,int*,int*);
19 | extern void dsysv_(char*, int*,int *,double *,int*,int*,double*,int*,double*,int*,int*);
20 | 
21 | // fns from blas
22 | extern void dgemm_(char*, char*,  int*, int*, int*, double*,
23 | 		  double*, int*, double*, int*, double*, double*, int*);
24 | extern void dsymm_(char*, char*, int*, int*, double*,
25 | 		  double*, int*, double*, int*, double*, double*, int*);
26 | 
27 | // friendlier versions.  
28 | 
29 | void la_dgemm(int tA, int tB, int Arow, int Acol, int Brow, int Bcol, int Crow, int Ccol,
30 | 	      double *A, double *B, double *C, double alpha, double beta);
31 | void la_dsymm(int Alhs, int Arow, int Acol, int Brow, int Bcol, int Crow, int Ccol,
32 | 	      double *A, double *B, double *C, double alpha, double beta);
33 | 
34 | int la_dposv(int Arow, int Bcol, double *A, double *B);
35 | int la_dgesv(int Arow, int Bcol, double *A, double *B);
36 | int la_dsysv(int Arow, int Bcol, double *A, double *B);
37 | int la_dpotrf(int dim, double *A);
38 | 
39 | // 2D double array tools.
40 | 
41 | double** new_mat(int nr, int nc);
42 | double** new_mat_fromv(int nr, int nc, double *v);
43 | int** new_imat(int nr, int nc);
44 | int** new_imat_fromv(int nr, int nc, int *v);
45 | double** new_zero_mat(int nr, int nc);
46 | double** new_val_mat(int nr, int nc, double val);
47 | double** new_id_mat(int dim);
48 | double** new_dup_mat(int nr, int nc, double **orig);
49 | void delete_mat(double **mat);
50 | void delete_imat(int **mat);
51 | void copy_mat(int nr, int nc, double** mat, double** orig);
52 | void print_mat(int nr, int nc, double **mat, FILE *outfile);
53 | void print_imat(int nr, int nc, int **mat, FILE *outfile);
54 | void zero_mat(double **mat, int nr, int nc);
55 | void id_mat(double **mat, int dim);
56 | void diag_mat(double **mat, double *v, int dim);
57 | 
58 | // 1D double array tools.
59 | 
60 | double* new_dvec(int n);
61 | double* new_dseq(double from, double to, int n);
62 | double* new_dzero(int n);
63 | double* new_dup_dvec(double *v, int n);
64 | double sum_dvec(double *v, int n);
65 | void zero_dvec(double *v, int n);
66 | void copy_dvec(double *v, double *orig, int n);
67 | void print_dvec(double *v, int n, FILE *outfile);
68 | double* drep(double val, int n);
69 | double normalize(double *v, int n);
70 | 
71 | // 1D int array tools.
72 | 
73 | int* new_ivec(int n);
74 | int* new_iseq(int from, int to);
75 | int* new_izero(int n);
76 | int* new_dup_ivec(int *v, int n);
77 | int sum_ivec(int *v, int n);
78 | void zero_ivec(int *v,  int n);
79 | void copy_ivec(int *v, int *orig, int n);
80 | void print_ivec(int *v, int n, FILE *outfile);
81 | int* irep(int val, int n);
82 | 
83 | #endif
84 | 
85 | 


--------------------------------------------------------------------------------
/src/rhelp.c:
--------------------------------------------------------------------------------
  1 | /******************************************************************************** 
  2 |  *
  3 |  * Bayesian Regression and Adaptive Sampling with Gaussian Process Trees
  4 |  * Copyright (C) 2005, University of California
  5 |  * 
  6 |  * This library is free software; you can redistribute it and/or
  7 |  * modify it under the terms of the GNU Lesser General Public
  8 |  * License as published by the Free Software Foundation; either
  9 |  * version 2.1 of the License, or (at your option) any later version.
 10 |  * 
 11 |  * This library is distributed in the hope that it will be useful,
 12 |  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 14 |  * Lesser General Public License for more details.
 15 |  * 
 16 |  * You should have received a copy of the GNU Lesser General Public
 17 |  * License along with this library; if not, write to the Free Software
 18 |  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 19 |  *
 20 |  * Questions? Contact Robert B. Gramacy (rbgramacy@ams.ucsc.edu)
 21 |  *
 22 |  ********************************************************************************/
 23 | 
 24 | #include "rhelp.h"
 25 | #ifdef RPRINT
 26 | #include <R_ext/Print.h>
 27 | #include <R.h>
 28 | FILE *mystdout = (FILE*) 0;
 29 | FILE *mystderr = (FILE*) 1;
 30 | #else
 31 | FILE *mystdout = stdio;
 32 | FILE *mystderr = stderr;
 33 | #endif
 34 | #include <stdarg.h>
 35 | #include <time.h>
 36 | #include <assert.h>
 37 | 
 38 | /* 
 39 |  * myprintf:
 40 |  *
 41 |  * a function many different types of printing-- in particular, using 
 42 |  * the Rprintf if the code happens to be compiled with RPRINT, 
 43 |  * othersie fprintf (takes the same arguments as fprintf)
 44 |  */
 45 | 
 46 | void myprintf(FILE *outfile, const char *str, ...)
 47 | {
 48 |   va_list argp;
 49 |   va_start(argp, str);
 50 |   
 51 |   #ifdef RPRINT
 52 |   if(outfile == mystdout) Rvprintf(str, argp);
 53 |   else if(outfile == mystderr) REvprintf(str, argp);
 54 |   else vfprintf(outfile, str, argp);
 55 |   #else
 56 |   vfprintf(outfile, str, argp);
 57 |   #endif
 58 | 
 59 |   va_end(argp);
 60 | }
 61 | 
 62 | 
 63 | #ifndef RPRINT
 64 | /*
 65 |  * error:
 66 |  *
 67 |  * printf style function that reports errors to stderr
 68 |  */
 69 | 
 70 | void error(const char *str, ...)
 71 | {
 72 |   va_list argp;
 73 |   va_start(argp, str);
 74 |   
 75 |   myprintf(stderr, "ERROR: ");
 76 |   vfprintf(stderr, str, argp);
 77 |   
 78 |   va_end(argp);
 79 |   myflush(stderr);
 80 |   
 81 |   /* add a final newline */
 82 |   myprintf(stderr, "\n");
 83 | 
 84 |   /* kill the code */
 85 |   assert(0);
 86 | }
 87 | 
 88 | 
 89 | /*
 90 |  * warning:
 91 |  *
 92 |  * printf style function that reports warnings to stderr
 93 |  */
 94 | 
 95 | void warning(const char *str, ...)
 96 | {
 97 |   va_list argp;
 98 |   va_start(argp, str);
 99 |   
100 |   myprintf(stderr, "WARNING: ");
101 |   vfprintf(stderr, str, argp);
102 | 
103 |   va_end(argp);
104 |   myflush(stderr);
105 | 
106 |   /* add a final newline */
107 |   myprintf(stderr, "\n");
108 | }
109 | #endif
110 | 
111 | 
112 | /* 
113 |  * myflush:
114 |  *
115 |  * a function for many different types of flushing--  in particular, 
116 |  * using * the R_FlushConsole the code happens to be compiled with 
117 |  * RPRINT, otherwise fflush
118 |  */
119 | 
120 | void myflush(FILE *outfile)
121 | {
122 | #ifdef RPRINT
123 |   R_FlushConsole();
124 | #else
125 |   fflush(outfile);
126 | #endif
127 | }
128 | 
129 | 
130 | /*
131 |  * my_r_process_events:
132 |  *
133 |  * at least every 1 second(s) pass control back to
134 |  * R so that it can check for interrupts and/or 
135 |  * process other R-gui events
136 |  */
137 | 
138 | time_t my_r_process_events(time_t itime)
139 | {
140 | #ifdef RPRINT  
141 |   time_t ntime = time(NULL);
142 | 
143 |   if(ntime - itime > 1) {
144 |     R_FlushConsole();
145 |     R_CheckUserInterrupt();
146 | #if  (defined(HAVE_AQUA) || defined(Win32) || defined(Win64))
147 |     R_ProcessEvents();
148 | #endif
149 |     itime = ntime;
150 |   }
151 | #endif
152 |   return itime;
153 | }
154 | 


--------------------------------------------------------------------------------
/src/rhelp.h:
--------------------------------------------------------------------------------
 1 | #ifndef __RHELP_H__
 2 | #define __RHELP_H__
 3 | 
 4 | #include <stdio.h>
 5 | #include <time.h>
 6 | 
 7 | /* this is now covered by -D RPRINT flags in Makevars */
 8 | /*#define RPRINT*/
 9 | 
10 | extern FILE *mystdout, *mystderr;
11 | 
12 | #ifndef RPRINT
13 | void warning(const char *str, ...);
14 | void error(const char *str, ...);
15 | #else
16 | #include <R_ext/Utils.h>
17 | #include <R.h>
18 | #endif
19 | 
20 | void R_FlushConsole(void); /* R < 2.3 does not have this in R.h (in Rinterface.h) */
21 | void myprintf(FILE *outfile, const char *str, ...);
22 | void myflush(FILE *outfile);
23 | time_t my_r_process_events(time_t itime);
24 | 
25 | #endif
26 | 


--------------------------------------------------------------------------------
/src/topics.c:
--------------------------------------------------------------------------------
  1 | #include <stdlib.h>
  2 | #include <assert.h>
  3 | #include <Rmath.h>
  4 | #include "latools.h"
  5 | #include "rhelp.h"
  6 | # ifdef _OPENMP
  7 | #include <omp.h>
  8 | # endif
  9 | 
 10 | 
 11 | double wllhd(int nwrd, double *x, double *q){
 12 |   double l = 0.0;
 13 |   int j;
 14 |   for(j=0; j<nwrd; j++) l += x[j]*log(q[j]);
 15 |   return l;
 16 | }
 17 | 
 18 | void wrdprob(double *q, int nwrd, int K, int *wrd, double **theta, double *w){ 
 19 |   int j, k;
 20 |   zero_dvec(q,nwrd);
 21 |   for(j=0; j<nwrd; j++)
 22 |     for(k=0; k<K; k++)
 23 |       q[j] += w[k]*theta[k][wrd[j]];
 24 | }
 25 | 
 26 | void wgrad(double *grad, int nwrd, int K, int *wrd, double *x, double *q, double **theta, double *w, int nef){
 27 |   int j, k;
 28 |   zero_dvec(grad,K);
 29 |   for(k=0; k<K; k++)
 30 |     { for(j=0; j<nwrd; j++) grad[k] += theta[k][wrd[j]]*x[j]/q[j];
 31 |       if(nef) grad[k] += 1.0/(w[k]*((double) K)); } 
 32 | }
 33 | 
 34 | 
 35 | void wneghess_lowtri(double *nH, int nwrd, int K, int *wrd, double *x,
 36 | 		     double *q, double **theta, double *w, int nef){
 37 |   int j, h, k;
 38 |   zero_dvec(nH, K*K);
 39 |   for(k=0; k<K; k++){
 40 |     if(nef) nH[k*(K+1)] += 1.0/(w[k]*w[k]*((double) K));
 41 |     for(h=k; h<K; h++)  
 42 | 	for(j=0; j<nwrd; j++)
 43 | 	  nH[k*K+h] += x[j]*theta[k][wrd[j]]*theta[h][wrd[j]]/(q[j]*q[j]); }
 44 | 
 45 | }
 46 | 
 47 | /* find a new active set  */
 48 | int wactivate(int K, double *w, int *active){
 49 |   int h, k, nact;
 50 |   nact = sum_ivec(active, K);
 51 | 
 52 |   for(h=0; h<K; h++)
 53 |     if(!active[h]){
 54 |       if(w[h] <= 0.0)
 55 | 	{ w[h]=0.0; active[h] = 1; nact++; }
 56 |       if(w[h] >= 1.0)
 57 | 	{ zero_dvec(w,K); w[h] = 1.0; 
 58 | 	  for(k=0; k<K; k++) active[k] = 1;
 59 | 	  return 0; } }
 60 | 
 61 |   return K - nact;
 62 | }
 63 | 
 64 | /* update function for a given solution */
 65 | int wupdate(int K, double *w, double *B, int *active){
 66 |     double delmin = 1.0;
 67 |     double delta;
 68 |     int h, d;
 69 | 
 70 |     d = 0;
 71 |     for(h=0; h<K; h++)
 72 |       if(!active[h])
 73 | 	{ assert(w[h]>0);
 74 | 	  delta = 1.0;
 75 | 	  if(B[d] < -w[h]) delta = -w[h]/B[d];
 76 | 	  if(B[d] > 1.0-w[h]) delta = (1.0-w[h])/B[d];
 77 | 	  if(delta < delmin) delmin = delta;
 78 | 	  d++; }
 79 |     
 80 |     d = 0;
 81 |     for(h=0; h<K; h++)
 82 |       if(!active[h]) 
 83 | 	{ w[h] += delmin*B[d]; d++; }
 84 | 
 85 |     return  wactivate(K, w, active);
 86 | }
 87 | 
 88 | /* main program: sequential quadratic programming for topic weights */
 89 | // NB: assumed prior concentration of 1/K if in NEF parametrization
 90 | int sqpw(int p, int nwrd, int K,  int *wrd, double *x, 
 91 | 	 double **theta, double *w, int nef,
 92 | 	 double tol, int tmax, int verb){
 93 | 
 94 |   int h,k,c,d;
 95 | 
 96 |   // single-word solution
 97 |   if(nwrd==1){
 98 |     zero_dvec(w,K);
 99 |     k = 0;
100 |     for(h=1;h<K;h++) 
101 |       if(theta[h][*wrd] > theta[k][*wrd]) k = h;
102 |     w[k] = 1.0;
103 |     return 1; }
104 | 
105 |   double diff = tol+1.0;
106 | 
107 |   double *q = new_dvec(nwrd);
108 | 
109 |   double *A = new_dzero( (K+1)*(K+1) );
110 |   double *B = new_dzero(K+1);
111 |   double *nhess = new_dvec(K*K);
112 |   double *grad = new_dvec(K);
113 |   double *wold = new_dup_dvec(w, K);
114 | 
115 |   int *active = new_izero(K);
116 |   int nw = K;
117 |   int info;
118 |   int dosysv = 1;
119 |   int iter = 1;
120 |    
121 |   while(diff > tol && iter < tmax && nw > 0){
122 | 
123 |     /* update gradient, and negative hessian */
124 |     wrdprob(q, nwrd, K, wrd, theta, w);
125 |     wgrad(grad, nwrd, K, wrd, x, q, theta, w, nef);
126 |     wneghess_lowtri(nhess, nwrd, K, wrd, x, q, theta, w, nef);
127 | 
128 |     /* build linear system for current active set */
129 |     d = 0;
130 |     for(k=0; k<K; k++)
131 |       if(!active[k]){ 
132 | 	B[d] = grad[k];
133 | 	c = d;
134 | 	for(h=k; h<K; h++) 
135 | 	  if(!active[h]){ A[d*(nw+1) + c] = nhess[k*K + h]; c++; }
136 | 	d++;
137 |       }
138 |     assert(d==nw);    
139 |     for(c=0; c<nw; c++) A[c*(nw+1) + nw] = 1.0;
140 |     A[(nw+1)*(nw+1)-1] = 0.0;
141 |     B[nw] = 0.0;
142 | 
143 | 	  
144 |     /* solve it  (sysv can be unstable for small samples; switch to gesv if necessary) */
145 |     if(dosysv){ info = la_dsysv(nw+1, 1, A, B); 
146 |       if(info!=0){ dosysv = 0; continue; } }
147 |     else{
148 |       for(d=1; d<=nw; d++) for(c=0; c<d; c++) A[d*(nw+1) + c] = A[c*(nw+1) +d];
149 |       info = la_dgesv(nw+1, 1, A, B);
150 |     }
151 | 
152 |     /* check and update */
153 |     if(fabs(sum_dvec(B, nw)) > .001 || (info !=0 && dosysv == 0)){
154 |       myprintf(mystdout, "trouble in wsolve\n"); 
155 |       nw = wactivate(K, w, active); 
156 |       return 0;
157 |     }
158 |     else nw = wupdate(K, w, B, active); 
159 | 
160 | 
161 |     /* iterate */
162 |     diff = 0.0;
163 |     for(h=0; h<K; h++) diff += fabs(wold[h]-w[h]);
164 |     copy_dvec(wold, w, K);
165 | 
166 |     if(verb==1){ 
167 |       myprintf(mystdout, 
168 | 	       "Omega Fit: iter = %d, unique words = %d, active=%d, diff = %g \n",
169 | 	       iter, nwrd, K-nw,  diff);
170 |       myprintf(mystdout, "          W = "); print_dvec(w, K, mystdout); }
171 |     
172 |     iter++;
173 |   }
174 | 
175 |   free(wold);
176 |   free(active);
177 |   free(q);
178 |   free(B);
179 |   free(grad);
180 |   free(A);
181 |   free(nhess);
182 | 
183 |   return 1;
184 | }
185 | 
186 | void Romega( int *n, int *p, int *K, 
187 | 	     int *doc, int *wrd, double *X, 
188 | 	     double *theta_vec, double *W, int *nef,
189 | 	     double *tol, int *tmax, int *verb){
190 |  
191 |   int speakup = *verb;
192 | 
193 |  double **theta = new_mat_fromv(*p, *K, theta_vec);
194 |  int i;
195 | 
196 | # ifdef _OPENMP
197 |  speakup=0; // otherwise it'll crash R-GUI trying to print all at once
198 | #pragma omp parallel for private(i) 
199 | # endif
200 |  for(i=0; i<(*n); i++) 
201 |    if(sqpw(*p, doc[i+1]-doc[i], *K, &wrd[doc[i]], &X[doc[i]], theta,  &W[i*(*K)], *nef, *tol, *tmax, speakup) == 0)
202 |      myprintf(mystdout, "Failed to converge for omega at i = %d\n", i+1); 
203 | 
204 |  delete_mat(theta);
205 | }
206 | 
207 | void RmixQ( int *n_in, int *p_in, int *K_in, int *N_in, int *B_in,
208 | 	      double *X, int *doc, int *wrd, int *grp, 
209 | 	     double *omega_vec, double *theta_vec, double *Q){
210 |   
211 |   int l, i, k, n, p, K, N, B;
212 |   
213 |   n = *n_in;
214 |   p = *p_in;
215 |   K = *K_in;
216 |   N = *N_in;
217 |   B = *B_in;
218 |     
219 |   for(l=0; l<N; l++)
220 |     for(k=0; k<K; k++)
221 |       Q[k*n + doc[l]] += X[l]*log(theta_vec[k*p + wrd[l]]);
222 | 
223 |   for(i=0; i<n; i++)
224 |     for(k=0; k<K; k++)
225 |       Q[k*n + i] += log(omega_vec[k*B + grp[i]]); 
226 | 
227 | } 
228 | 
229 | void RcalcQ(int *n_in, int *p_in, int *K_in, int *doc, int *wrd, 
230 | 	    int *N, double *omega, double *theta, double *q ){
231 | 
232 |   int n, p, K, l, h;
233 | 
234 |   n = *n_in;
235 |   p = *p_in;
236 |   K = *K_in;
237 | 
238 |   for(l=0; l<(*N); l++){
239 |     q[l] = 0.0;
240 |     for(h=0; h<K; h++) q[l] += omega[h*n + doc[l]]*theta[h*p + wrd[l]];
241 |   }
242 | }
243 | 
244 | void RcalcTau(int *n_in, int *p_in, int *K_in,  double *m,
245 | 	      double *omega, double *theta, double *tau, double *size ){
246 | 
247 |   int n, p, K, i, j, h;
248 |   double q;
249 | 
250 |   n = *n_in;
251 |   p = *p_in;
252 |   K = *K_in;
253 | 
254 |   *tau = 0.0;
255 |   *size = 0;
256 | 
257 |   for(j=0; j<p; j++){
258 |     for(i=0; i<n; i++){
259 |       q = 0.0;
260 |       for(h=0; h<K; h++) q += omega[h*n + i]*theta[h*p + j];
261 |       *tau += m[i]*q/(1-q);
262 |       if( (q*m[i]) > 0.01 ){  (*size)++; }
263 |     }
264 |   }
265 | }
266 | 
267 | /* negative hessian in full NEF representation */
268 | void RnegHW(int *n_in, int *p_in, int *K_in, double *omeg, double *thet,
269 | 	     int *doc, int *wrd, double *cnt, double *q, int *N, double *H){
270 | 
271 |   int n, p, K, K2, l, h, k;
272 | 
273 |   n = *n_in;
274 |   p = *p_in;
275 |   K = *K_in;
276 |   K2 = K*K;
277 |   double ttbyq2;
278 | 
279 |   zero_dvec(H, n*K2);
280 | 
281 |   for(l=0; l<(*N); l++)
282 |     for(k=0; k<K; k++)
283 |       {
284 | 	for(h=k; h<K; h++)
285 | 	  { ttbyq2 = exp( log(thet[k*p+wrd[l]])+log(thet[h*p+wrd[l]]) - 2*log(q[l]) );
286 | 	    H[doc[l]*K2 + K*k +h] += -cnt[l]*omeg[k*n+doc[l]]*omeg[h*n+doc[l]]*(1.0 - ttbyq2); 
287 | 	    H[doc[l]*K2 + K*k +h] += -omeg[k*n+doc[l]]*omeg[h*n+doc[l]]; }
288 | 	
289 | 	H[doc[l]*K2 + (K+1)*k] += -cnt[l]*omeg[k*n+doc[l]]*( thet[k*p+wrd[l]]-q[l] )/q[l]
290 | 	  + omeg[k*n+doc[l]]; 	
291 |       }
292 | 
293 |   for(l=0; l<n; l++)
294 |     for(k=0; k<K; k++)
295 |       for(h=0; h<k; h++) H[l*K2 + K*k +h] = H[l*K2 + K*h +k]; 
296 | }
297 | 
298 | 
299 | void Rlogit(int *d, double *eta, double *prob){
300 |   int j;
301 |   double lp0 = log(prob[0]);
302 |   for(j=1; j<*d; j++) eta[j-1] = log(prob[j]) - lp0; }
303 | 
304 | void Rexpit(int *d, double *prob, double *eta){
305 |   int j;
306 |   prob[0] = 1; 
307 |   for(j=1; j<*d; j++) prob[j] = exp(eta[j-1]);
308 |   double p0 = sum_dvec(prob, *d);
309 |   for(j=0; j<*d; j++) prob[j] *= 1.0/p0;  }
310 | 
311 | 
312 | void RtoNEF(int *n_in, int *p_in, int *K_in, double *Y, double *theta, double *tomega){
313 |   int l;
314 |   int n = *n_in;
315 |   int p = *p_in;
316 |   int K = *K_in;
317 | 
318 |   for(l=0; l<K; l++) Rlogit(&p, &Y[l*(p-1)], &theta[l*p]);
319 |   for(l=0; l<n; l++) Rlogit(&K, &Y[K*(p-1) + l*(K-1)], &tomega[l*K]); }
320 | 
321 | 
322 | void RfromNEF(int *n_in, int *p_in, int *K_in, double *Y, double *theta, double *tomega){
323 |   int l;
324 |   int n = *n_in;
325 |   int p = *p_in;
326 |   int K = *K_in;
327 | 
328 |   for(l=0; l<K; l++) Rexpit(&p, &theta[l*p], &Y[l*(p-1)]);
329 |   for(l=0; l<n; l++) Rexpit(&K, &tomega[l*K], &Y[K*(p-1) + l*(K-1)]); }
330 | 
331 | 
332 | void Rzhat(int *n_in, int *p_in, int *K_in, int *N_in, double *xhat, 
333 | 	   int *doc, int *wrd, double *zhatj, double *zhati){
334 |   int h,l;
335 |   int n = *n_in;
336 |   int p = *p_in;
337 |   int K = *K_in;
338 |   int N = *N_in;
339 | 
340 |   // zhat's must be pre-zero'd
341 |   for(l=0; l<N; l++)
342 |     for(h=0; h<K; h++)
343 |       { zhatj[h*p + wrd[l]] += xhat[N*h +l];
344 | 	zhati[h*n + doc[l]] += xhat[N*h +l]; }
345 | }
346 | 


--------------------------------------------------------------------------------