├── .Rbuildignore ├── DESCRIPTION ├── NAMESPACE ├── R ├── count.R ├── topics.R └── tpx.R ├── README.md ├── inst └── CITATION ├── man ├── counts.Rd ├── predict.topics.Rd ├── rdir.Rd ├── topicVar.Rd └── topics.Rd └── src ├── Makevars ├── init.c ├── latools.c ├── latools.h ├── rhelp.c ├── rhelp.h └── topics.c /.Rbuildignore: -------------------------------------------------------------------------------- 1 | .git/* 2 | .git 3 | README.md -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: maptpx 2 | Title: MAP Estimation of Topic Models 3 | Version: 1.9-7 4 | Author: Matt Taddy 5 | Depends: R (>= 2.10), slam 6 | Suggests: MASS 7 | Description: Maximum a posteriori (MAP) estimation for topic models (i.e., Latent Dirichlet Allocation) in text analysis, as described in Taddy (2012) 'On estimation and selection for topic models'. Previous versions of this code were included as part of the 'textir' package. If you want to take advantage of openmp parallelization, uncomment the relevant flags in src/MAKEVARS before compiling. 8 | Maintainer: Matt Taddy 9 | License: GPL-3 10 | URL: http://taddylab.com 11 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | useDynLib(maptpx) 2 | import(slam) 3 | importFrom("grDevices", "grey") 4 | importFrom("graphics", "axis", "hist", "image", "par", "points", "text") 5 | importFrom("stats", "pchisq", "rgamma") 6 | 7 | export(stm_tfidf, 8 | normalize, 9 | rdir, 10 | topics, 11 | predict.topics, 12 | topicVar, 13 | expit, 14 | logit) 15 | 16 | S3method(plot,topics) 17 | S3method(predict,topics) 18 | S3method(summary,topics) 19 | -------------------------------------------------------------------------------- /R/count.R: -------------------------------------------------------------------------------- 1 | ## Tools for manipulation of text count matrices ## 2 | 3 | ## converting count to frequency matrix 4 | normalize <- function(x, byrow=TRUE){ 5 | if(byrow){ s <- row_sums(x) 6 | s[s==0] <- 1 7 | return( x/s ) } 8 | else{ 9 | s <- col_sums(x) 10 | s[s==0] <- 1 11 | return(t(t(x)/s)) } 12 | } 13 | 14 | ## converting a count/freq matrix to tfidf 15 | stm_tfidf <- function(x){ 16 | idf <- log( nrow(x) ) - log(col_sums(x>0) + 1) 17 | t( t(x) * idf ) 18 | } 19 | 20 | ## Dirichlet RNG 21 | rdir <- function(n, alpha) 22 | { 23 | x <- matrix(rgamma(length(alpha)*n,alpha),nrow=n,byrow=TRUE) 24 | return(t(x/rowSums(x))) } 25 | -------------------------------------------------------------------------------- /R/topics.R: -------------------------------------------------------------------------------- 1 | ##### Estimation for Topic Models ###### 2 | 3 | ## intended main function; provides defaults and selects K via marginal lhd 4 | topics <- function(counts, K, shape=NULL, initopics=NULL, tol=0.1, 5 | bf=FALSE, kill=2, ord=TRUE, verb=1, ...) 6 | ## tpxselect defaults: tmax=10000, wtol=10^(-4), qn=100, grp=NULL, admix=TRUE, nonzero=FALSE, dcut=-10 7 | { 8 | X <- CheckCounts(counts) 9 | p <- ncol(X) 10 | if(verb>0) 11 | cat(sprintf("\nEstimating on a %d document collection.\n", nrow(X))) 12 | 13 | ## check the prior parameters for theta 14 | if(prod(shape>0) != 1){ stop("use shape > 0\n") } 15 | 16 | ## check the list of candidate K values 17 | if(prod(K>1)!=1){ stop(cat("use K values > 1\n")) } 18 | K <- sort(K) 19 | 20 | ## initialize 21 | initopics <- tpxinit(X[1:min(ceiling(nrow(X)*.05),100),], initopics, K[1], shape, verb) 22 | 23 | ## either search for marginal MAP K and return bayes factors, or just fit 24 | tpx <- tpxSelect(X, K, bf, initopics, alpha=shape, tol, kill, verb, ...) 25 | K <- tpx$K 26 | 27 | ## clean up and out 28 | if(ord){ worder <- order(col_sums(tpx$omega), decreasing=TRUE) } # order by decreasing usage 29 | else{ worder <- 1:K } 30 | ## Main parameters 31 | theta=matrix(tpx$theta[,worder], ncol=K, dimnames=list(phrase=dimnames(X)[[2]], topic=paste(1:K)) ) 32 | omega=matrix(tpx$omega[,worder], ncol=K, dimnames=list(document=NULL, topic=paste(1:K)) ) 33 | if(nrow(omega)==nrow(X)){ dimnames(omega)[[1]] <- dimnames(X)[[1]] } 34 | 35 | ## topic object 36 | out <- list(K=K, theta=theta, omega=omega, BF=tpx$BF, D=tpx$D, X=X) 37 | class(out) <- "topics" 38 | invisible(out) } 39 | 40 | 41 | ## S3 method predict function 42 | predict.topics <- function(object, newcounts, loglhd=FALSE, ...) 43 | ## tpxweights optional arguments and defauls are verb=FALSE, nef=TRUE, wtol=10^{-5}, tmax=1000 44 | { 45 | if(is.vector(newcounts)){ newcounts <- matrix(newcounts, nrow=1) } 46 | if(class(newcounts)[1] == "TermDocumentMatrix"){ newcounts <- t(newcounts) } 47 | X <- as.simple_triplet_matrix(newcounts) 48 | 49 | if(!(class(object)[1]%in%c("topics","matrix"))){ stop("object class must be `topics' or 'matrix'.") } 50 | 51 | if(class(object)[1]=="topics"){ 52 | theta <- object$theta 53 | if(nrow(theta) != ncol(X)){ stop("Dimension mismatch: nrow(theta) != ncol(X)") } 54 | if(nrow(object$X) != nrow(object$omega)) # simple mixture 55 | { Q <- matrix(tpxMixQ(X, omega=object$omega, theta=theta, ...)$lQ, ncol=ncol(theta)) 56 | return( (1:ncol(theta))[apply(Q,1,which.max)] ) } 57 | } 58 | else{ theta <- object } 59 | 60 | start <- tpxOmegaStart(X=X, theta=theta) 61 | 62 | ## re-order xvec in doc-blocks, and build indices 63 | doc <- c(0,cumsum(as.double(table(factor(X$i, levels=c(1:nrow(X))))))) 64 | xvo <- X$v[order(X$i)] 65 | wrd <- X$j[order(X$i)]-1 66 | 67 | W <- tpxweights(n=nrow(X), p=ncol(X), xvo=xvo, wrd=wrd, doc=doc, start=start, theta=theta, ...) 68 | 69 | if(loglhd){ 70 | L <- sum( X$v*log(tpxQ(theta=theta, omega=W, doc=X$i, wrd=X$j)) ) 71 | return(list(W=W, L=L)) } 72 | else{ return(W) } 73 | } 74 | 75 | ## S3 method summary function 76 | summary.topics <- function(object, nwrd=5, tpk=NULL, verb=TRUE, ...){ 77 | 78 | 79 | K <- object$K 80 | if(is.null(tpk)){ tpk <- 1:K } 81 | else if(prod(tpk %in% 1:K)!=1){ stop("requested tpk's are not in 1:K") } 82 | 83 | if(nwrd>0){ if(verb){ cat(paste("\nTop", nwrd, "phrases by topic-over-null term lift (and usage %):\n\n")) } 84 | Q0 <- col_sums(object$X)/sum(object$X) 85 | topwords <- c() 86 | toplift <- c() 87 | usage <- round( (col_means(object$omega)*100), 1) 88 | for(k in tpk){ 89 | odds <- (log(object$theta[,k]) - log(Q0))[Q0!=0] 90 | ko <- order(odds, decreasing=TRUE) 91 | topk <- dimnames(object$theta)[[1]][Q0!=0][ko[1:nwrd]] 92 | topwords <- cbind(topwords, topk) 93 | toplift <- cbind(toplift, exp(sort(odds, decreasing=TRUE)[1:nwrd])) 94 | if(verb){ cat(paste("[",k, "] '", sep="")) 95 | cat(topk, sep="', '") 96 | cat(paste("' (",usage[k],") \n",sep="")) } 97 | } 98 | topwords <- as.matrix(topwords) 99 | toplift <- as.matrix(toplift) 100 | dimnames(topwords)[[2]] <- dimnames(toplift)[[2]] <- paste("topic",tpk,sep="") 101 | dimnames(toplift)[[1]] <- dimnames(toplift)[[1]] <- 1:nwrd 102 | } 103 | else{ topwords <- toplift <- NULL } 104 | 105 | if(!is.null(object$BF) && !is.null(object$D) && verb) 106 | { 107 | cat("\nLog Bayes factor and estimated dispersion, by number of topics:\n\n") 108 | D <- rbind(logBF=object$BF,Disp=object$D[1,]) 109 | if(verb>1){ D <- rbind(D,p.val=object$D[2,]) } 110 | print(round(D,2)) 111 | cat(paste("\nSelected the K =",object$K,"topic model\n\n")) 112 | } 113 | else if(verb){ 114 | n <- nrow(object$K) 115 | rho = pchisq(sum(object$r^2), df=length(object$r), lower.tail=FALSE) 116 | cat("\nDispersion = ", round(object$D$dispersion,2) ,"\n\n",sep="") } 117 | 118 | retobj <- data.frame(topic=rep(tpk, each=nwrd), phrase=c(topwords), lift=c(toplift)) 119 | invisible(retobj) 120 | } 121 | 122 | 123 | ## Colors for topic plotting 124 | TOPICOLS <- matrix(nrow=6, 125 | c(grey(c(.9,.8,.7,.55,.35,0)), #GREY 126 | "#FFCCCC", "#FA8072", "#FF5333", "#EE0000", "#CC1100", "#800000", #RED 127 | "#BDFCC9", "#98FB98", "#49E20E", "#009900", "#006400", "#004F00", #GREEN 128 | "#BBFFFF", "#67E6EC", "#00C5CD", "#0198E1", "#0147FA", "#000080")) #BLUE 129 | 130 | 131 | 132 | plot.topics <- function(x, type=c("weight","resid"), group=NULL, labels=NULL, 133 | col=NULL, xlab=NULL, ylab=NULL, main=NULL, tpk=NULL, 134 | lgd.K=NULL, cex.lgdc = 1, cex.lgdt = 1, cex.rmar= 1, ...){ 135 | 136 | if(type[1]=="resid"){ 137 | 138 | if(is.null(col[1])){col <- 8} 139 | if(is.null(xlab)){ xlab="abs( adusted residuals )" } 140 | if(is.null(main)){ main="" } 141 | 142 | resids <- tpxResids(x$X, theta=x$theta, omega=x$omega, ...)$r 143 | 144 | hist(resids, col=col, border=grey(.9), 145 | xlab=xlab, 146 | main="", cex.lab=cex.lgdt, font.lab=3) 147 | 148 | return(invisible()) 149 | } 150 | 151 | n <- nrow(x$omega) 152 | mmm <- n==nrow(x$X) 153 | if(n==1){ 154 | if(is.null(main)){ main="" } 155 | if(is.null(xlab)){ xlab="topic" } 156 | if(is.null(ylab)){ ylab="weight" } 157 | if(is.null(col)){ col = 8 } 158 | return(plot(c(x$omega), type="h", lwd=10, col=col, main=main, ylab=ylab, xlab=xlab)) } 159 | 160 | if(is.null(tpk)){ tpk <- 1:x$K } 161 | if(is.null(lgd.K)){ lgd.K <- max(.1*length(tpk),.75) } 162 | 163 | if(is.null(group) || !mmm){ 164 | ltop = .65*n 165 | lbot = .35*n 166 | 167 | if(is.null(col)){ col<-1 } 168 | tpcl <- c(0, TOPICOLS[1:6,col[1]%%5]) 169 | W <- x$omega[,tpk] 170 | if(mmm){ 171 | brks <- seq(0,1,length=8) 172 | tplg <- c("w=0","w=1") } 173 | else{ 174 | brks <- seq(min(W),max(W),length=8) 175 | tplg <- c(round(min(W),3),round(max(W),3)) 176 | } 177 | } else{ # bunch of extra commands to get shading for two groups 178 | group <- as.factor(group) 179 | ltop = .85*n 180 | lbot = .15*n 181 | 182 | if(length(group)!=n){ stop("Your group membership length doesn't match omega.") } 183 | if(nlevels(group)!=2){ stop("Sorry, grouping must be a two-level factor") } 184 | if(is.null(col) || length(col)<2){ col <- 1:2 } 185 | 186 | tpcl <- c(TOPICOLS[6:1,col[1]%%5],"#FFFFFF",TOPICOLS[,col[2]%%5]) 187 | brks <- c(seq(-1, -0.1,length=7),seq(0.1,1,length=7)) 188 | W <- x$omega[,tpk]*(c(-1,1)[as.logical(group)+1]) 189 | if(is.null(labels)){labels <- c("F","T")} 190 | tplg=rep("w=1",2) 191 | } 192 | 193 | ## plot parameters 194 | xlg <- length(tpk)+lgd.K 195 | old.mar <- par()$mar 196 | par(xpd=TRUE, mar=c(5.1,4.1,2.1,5*cex.rmar)) 197 | if(is.null(ylab)){ 198 | if(nrow(x$X)==nrow(x$omega)){ ylab="Document" } 199 | else{ ylab="group" } } 200 | if(is.null(xlab)){ xlab="Topic" } 201 | if(is.null(main)){ main="Topic-Loading Weights" } 202 | 203 | ## finally plot 204 | image(y=1:n, x=1:length(tpk), t(W), ylab=ylab, xlab=xlab, 205 | main=main, col=tpcl, font.lab=3, xaxt="n", yaxt="n", breaks=brks, ...) 206 | axis(side=1, at=1:length(tpk), labels=tpk, tick=FALSE, line=-.5) 207 | 208 | if(!mmm){ axis(side=2, at=1:n) } 209 | else{axis(2)} 210 | 211 | points(rep(xlg,length(tpcl)), seq(lbot,ltop,length=length(tpcl)), col=tpcl, pch=15, cex=3*cex.lgdc) 212 | text(rep(xlg,2), y=c(lbot-.08*n, ltop+.08*n), tplg, cex=cex.lgdt) 213 | if(!is.null(labels)){ text(rep(xlg,2), y=c(lbot-.14*n, ltop+.14*n), labels, font=3, cex=cex.lgdt) } 214 | par(mar=old.mar) 215 | } 216 | 217 | ## logit and expit, treating first element as null 218 | logit <- function(prob){ 219 | f <- function(p) .C("Rlogit", d=as.integer(d), 220 | eta=double(d-1), prob=as.double(p), PACKAGE="maptpx")$eta 221 | prob[prob < 1e-10] <- 1e-10 222 | prob[1-prob < 1e-10] <- 1-1e-10 223 | if(is.matrix(prob) || is.data.frame(prob)){ 224 | d <- ncol(prob) 225 | eta <- matrix(t(apply(prob,1,f)), ncol=d-1, dimnames=list(row=dimnames(prob)[[1]], col=dimnames(prob)[[2]][-1])) 226 | } 227 | else{ 228 | d <- length(prob) 229 | eta <- f(prob) 230 | } 231 | return(eta) } 232 | 233 | expit <- function(eta){ 234 | f <- function(e) .C("Rexpit", d=as.integer(d), prob=double(d), 235 | eta=as.double(e), PACKAGE="maptpx")$prob 236 | eta[eta==Inf] <- 1e10 237 | eta[eta==-Inf] <- -1e10 238 | if(is.matrix(eta) || is.data.frame(eta)){ 239 | d <- ncol(eta)+1 240 | if(is.null(dimnames(eta)[[2]])) dimnames(eta)[[2]] <- paste("nef",1:ncol(eta),sep='') 241 | prob <- matrix(t(apply(eta,1,f)), ncol=d, dimnames=list(row=dimnames(eta)[[1]], col=c('nullcat',dimnames(eta)[[2]]))) 242 | } 243 | else{ 244 | d <- length(eta)+1 245 | prob <- f(eta) 246 | } 247 | return(prob) } 248 | 249 | 250 | ### topic weight variance matrix (in NEF parametrization) 251 | topicVar <- function(counts, theta, omega){ 252 | X <- CheckCounts(counts) 253 | if(nrow(omega) != nrow(X)) stop("omega does not match counts") 254 | if(ncol(omega) != ncol(theta)) stop("omega does not match theta") 255 | 256 | K <- ncol(omega) 257 | n <- nrow(X) 258 | p <- nrow(theta) 259 | 260 | q <- tpxQ(theta=theta, omega=omega, doc=X$i, wrd=X$j) 261 | H <- array(.C("RnegHW", 262 | n = as.integer(n), 263 | p = as.integer(p), 264 | K = as.integer(K-1), 265 | omeg = as.double(omega[,-1]), 266 | thet = as.double(theta[,-1]), 267 | doc = as.integer(X$i-1), 268 | wrd = as.integer(X$j-1), 269 | cnt = as.double(X$v), 270 | q = as.double(q), 271 | N = as.integer(length(q)), 272 | H = double(n*(K-1)^2), 273 | PACKAGE="maptpx")$H, 274 | dim=c(K-1,K-1,n)) 275 | 276 | S <- array(apply(H, 3, function(h) tryCatch(solve(h), error=function(e) solve(h + diag(.00001,K-1))) ), 277 | dim=c(K-1,K-1,n), dimnames=list(topics=1:(K-1), topics=1:(K-1), doc=dimnames(X)[[1]]) ) 278 | return(S) 279 | } 280 | -------------------------------------------------------------------------------- /R/tpx.R: -------------------------------------------------------------------------------- 1 | ####### Undocumented "tpx" utility functions ######### 2 | 3 | ## ** Only referenced from topics.R 4 | 5 | ## check counts (can be an object from tm, slam, or a simple co-occurance matrix) 6 | CheckCounts <- function(counts){ 7 | if(class(counts)[1] == "TermDocumentMatrix"){ counts <- t(counts) } 8 | if(is.null(dimnames(counts)[[1]])){ dimnames(counts)[[1]] <- paste("doc",1:nrow(counts)) } 9 | if(is.null(dimnames(counts)[[2]])){ dimnames(counts)[[2]] <- paste("wrd",1:ncol(counts)) } 10 | empty <- row_sums(counts) == 0 11 | if(sum(empty) != 0){ 12 | counts <- counts[!empty,] 13 | cat(paste("Removed", sum(empty), "blank documents.\n")) } 14 | return(as.simple_triplet_matrix(counts)) 15 | } 16 | 17 | ## Topic estimation and selection for a list of K values 18 | tpxSelect <- function(X, K, bf, initheta, alpha, tol, kill, verb, 19 | admix=TRUE, grp=NULL, tmax=10000, 20 | wtol=10^{-4}, qn=100, nonzero=FALSE, dcut=-10){ 21 | 22 | ## check grp if simple mixture 23 | if(!admix){ 24 | if(is.null(grp) || length(grp)!=nrow(X)){ grp <- rep(1,nrow(X)) } 25 | else{ grp <- factor(grp) } 26 | } 27 | 28 | ## return fit for single K 29 | if(length(K)==1 && bf==FALSE){ 30 | if(verb){ cat(paste("Fitting the",K,"topic model.\n")) } 31 | fit <- tpxfit(X=X, theta=initheta, alpha=alpha, tol=tol, verb=verb, 32 | admix=admix, grp=grp, tmax=tmax, wtol=wtol, qn=qn) 33 | fit$D <- tpxResids(X=X, theta=fit$theta, omega=fit$omega, grp=grp, nonzero=nonzero)$D 34 | return(fit) 35 | } 36 | 37 | if(is.matrix(alpha)){ stop("Matrix alpha only works for fixed K") } 38 | 39 | if(verb){ cat(paste("Fit and Bayes Factor Estimation for K =",K[1])) 40 | if(length(K)>1){ cat(paste(" ...", max(K))) } 41 | cat("\n") } 42 | 43 | ## dimensions 44 | n <- nrow(X) 45 | p <- ncol(X) 46 | nK <- length(K) 47 | 48 | BF <- D <- NULL 49 | iter <- 0 50 | 51 | ## Null model log probability 52 | sx <- sum(X) 53 | qnull <- col_sums(X)/sx 54 | null <- sum( X$v*log(qnull[X$j]) ) - 0.5*(n+p)*(log(sx) - log(2*pi)) 55 | 56 | ## allocate and initialize 57 | best <- -Inf 58 | bestfit <- NULL 59 | 60 | ## loop over topic numbers 61 | for(i in 1:nK){ 62 | 63 | ## Solve for map omega in NEF space 64 | fit <- tpxfit(X=X, theta=initheta, alpha=alpha, tol=tol, verb=verb, 65 | admix=admix, grp=grp, tmax=tmax, wtol=wtol, qn=qn) 66 | 67 | BF <- c(BF, tpxML(X=X, theta=fit$theta, omega=fit$omega, alpha=fit$alpha, L=fit$L, dcut=dcut, admix=admix, grp=grp) - null) 68 | R <- tpxResids(X=X, theta=fit$theta, omega=fit$omega, grp=grp, nonzero=nonzero) 69 | D <- cbind(D, unlist(R$D)) 70 | 71 | if(verb>0) cat(paste("log BF(", K[i], ") =", round(BF[i],2))) 72 | if(verb>1) cat(paste(" [ ", fit$iter,"steps, disp =",round(D[1,i],2)," ]\n")) else if(verb >0) cat("\n") 73 | 74 | if(is.nan(BF[i])){ 75 | cat("NAN for Bayes factor.\n") 76 | return(bestfit) 77 | break} 78 | 79 | if(BF[i] > best){ # check for a new "best" topic 80 | best <- BF[i] 81 | bestfit <- fit 82 | } else if(kill>0 && i>kill){ # break after kill consecutive drops 83 | if(prod(BF[i-0:(kill-1)] < BF[i-1:kill])==1) break } 84 | 85 | if(i0) != 1){ stop("use probs > 0 for initheta.") } 103 | return(normalize(initheta, byrow=FALSE)) } 104 | 105 | if(is.matrix(alpha)){ 106 | if(nrow(alpha)!=ncol(X) || ncol(alpha)!=K1){ stop("bad matrix alpha dimensions; check your K") } 107 | return(normalize(alpha, byrow=FALSE)) } 108 | 109 | if(is.null(initheta)){ ilength <- K1-1 } 110 | else{ ilength <- initheta[1] } 111 | if(ilength < 1){ ilength <- 1 } 112 | 113 | ## set number of initial steps 114 | if(length(initheta)>1){ tmax <- initheta[2] } 115 | else{ tmax <- 3 } 116 | ## set the tolerance 117 | if(length(initheta)>2){ tol <- initheta[3] } 118 | else{ tol <- 0.5 } 119 | ## print option 120 | if(length(initheta)>3){ verb <- initheta[4] } 121 | else{ verb <- 0 } 122 | 123 | 124 | if(verb){ cat("Building initial topics") 125 | if(verb > 1){ cat(" for K = ") } 126 | else{ cat("... ") } } 127 | 128 | nK <- length( Kseq <- unique(ceiling(seq(2,K1,length=ilength))) ) 129 | initheta <- tpxThetaStart(X, matrix(col_sums(X)/sum(X), ncol=1), matrix(rep(1,nrow(X))), 2) 130 | 131 | if(verb > 0) 132 | { cat("\n") 133 | print(list(Kseq=Kseq, tmax=tmax, tol=tol)) } 134 | 135 | ## loop over topic numbers 136 | for(i in 1:nK){ 137 | 138 | ## Solve for map omega in NEF space 139 | fit <- tpxfit(X=X, theta=initheta, alpha=alpha, tol=tol, verb=verb, 140 | admix=TRUE, grp=NULL, tmax=tmax, wtol=-1, qn=-1) 141 | if(verb>1){ cat(paste(Kseq[i],",", sep="")) } 142 | 143 | if(i0){ 178 | cat("log posterior increase: " ) 179 | digits <- max(1, -floor(log(tol, base=10))) } 180 | 181 | Y <- NULL # only used for qn > 0 182 | Q0 <- col_sums(X)/sum(X) 183 | L <- tpxlpost(X=X, theta=theta, omega=omega, alpha=alpha, admix=admix, grp=grp) 184 | # if(is.infinite(L)){ L <- sum( (log(Q0)*col_sums(X))[Q0>0] ) } 185 | 186 | ## Iterate towards MAP 187 | while( update && iter < tmax ){ 188 | 189 | ## sequential quadratic programming for conditional Y solution 190 | if(admix && wtol > 0){ Wfit <- tpxweights(n=nrow(X), p=ncol(X), xvo=xvo, wrd=wrd, doc=doc, 191 | start=omega, theta=theta, verb=0, nef=TRUE, wtol=wtol, tmax=20) } 192 | else{ Wfit <- omega } 193 | 194 | ## joint parameter EM update 195 | move <- tpxEM(X=X, m=m, theta=theta, omega=Wfit, alpha=alpha, admix=admix, grp=grp) 196 | 197 | ## quasinewton-newton acceleration 198 | QNup <- tpxQN(move=move, Y=Y, X=X, alpha=alpha, verb=verb, admix=admix, grp=grp, doqn=qn-dif) 199 | move <- QNup$move 200 | Y <- QNup$Y 201 | 202 | if(QNup$L < L){ # happens on bad Wfit, so fully reverse 203 | if(verb > 10){ cat("_reversing a step_") } 204 | move <- tpxEM(X=X, m=m, theta=theta, omega=omega, alpha=alpha, admix=admix, grp=grp) 205 | QNup$L <- tpxlpost(X=X, theta=move$theta, omega=move$omega, alpha=alpha, admix=admix, grp=grp) } 206 | 207 | ## calculate dif 208 | dif <- (QNup$L-L) 209 | 210 | L <- QNup$L 211 | 212 | 213 | ## check convergence 214 | if(abs(dif) < tol){ 215 | if(sum(abs(theta-move$theta)) < tol){ update = FALSE } } 216 | 217 | ## print 218 | if(verb>0 && (iter-1)%%ceiling(10/verb)==0 && iter>0){ 219 | cat( paste( round(dif,digits), #" (", sum(abs(theta-move$theta)),")", 220 | ", ", sep="") ) } 221 | 222 | ## heartbeat for long jobs 223 | if(((iter+1)%%1000)==0){ 224 | cat(sprintf("p %d iter %d diff %g\n", 225 | nrow(theta), iter+1,round(dif))) } 226 | 227 | ## iterate 228 | iter <- iter+1 229 | theta <- move$theta 230 | omega <- move$omega 231 | 232 | } 233 | 234 | ## final log posterior 235 | L <- tpxlpost(X=X, theta=theta, omega=omega, alpha=alpha, admix=admix, grp=grp) 236 | 237 | ## summary print 238 | if(verb>0){ 239 | cat("done.") 240 | if(verb>1) { cat(paste(" (L = ", round(L,digits), ")", sep="")) } 241 | cat("\n") 242 | } 243 | 244 | out <- list(theta=theta, omega=omega, K=K, alpha=alpha, L=L, iter=iter) 245 | invisible(out) } 246 | 247 | 248 | ## ** called from topics.R (predict) and tpx.R 249 | ## Conditional solution for topic weights given theta 250 | tpxweights <- function(n, p, xvo, wrd, doc, start, theta, verb=FALSE, nef=TRUE, wtol=10^{-5}, tmax=1000) 251 | { 252 | K <- ncol(theta) 253 | start[start == 0] <- 0.1/K 254 | start <- start/rowSums(start) 255 | omega <- .C("Romega", 256 | n = as.integer(n), 257 | p = as.integer(p), 258 | K = as.integer(K), 259 | doc = as.integer(doc), 260 | wrd = as.integer(wrd), 261 | X = as.double(xvo), 262 | theta = as.double(theta), 263 | W = as.double(t(start)), 264 | nef = as.integer(nef), 265 | tol = as.double(wtol), 266 | tmax = as.integer(tmax), 267 | verb = as.integer(verb), 268 | PACKAGE="maptpx") 269 | return(t(matrix(omega$W, nrow=ncol(theta), ncol=n))) } 270 | 271 | ## ** Called only in tpx.R 272 | 273 | ## single EM update. two versions: admix and mix 274 | tpxEM <- function(X, m, theta, omega, alpha, admix, grp) 275 | { 276 | n <- nrow(X) 277 | p <- ncol(X) 278 | K <- ncol(theta) 279 | 280 | if(admix){ Xhat <- (X$v/tpxQ(theta=theta, omega=omega, doc=X$i, wrd=X$j))*(omega[X$i,]*theta[X$j,]) 281 | Zhat <- .C("Rzhat", n=as.integer(n), p=as.integer(p), K=as.integer(K), N=as.integer(nrow(Xhat)), 282 | Xhat=as.double(Xhat), doc=as.integer(X$i-1), wrd=as.integer(X$j-1), 283 | zj = as.double(rep(0,K*p)), zi = as.double(rep(0,K*n)), PACKAGE="maptpx") 284 | theta <- normalize(matrix(Zhat$zj+alpha, ncol=K), byrow=FALSE) 285 | omega <- normalize(matrix(Zhat$zi+1/K, ncol=K)) } 286 | else{ 287 | qhat <- tpxMixQ(X, omega, theta, grp, qhat=TRUE)$qhat 288 | ## EM update 289 | theta <- normalize(tcrossprod_simple_triplet_matrix( t(X), t(qhat) ) + alpha, byrow=FALSE) 290 | omega <- normalize(matrix(apply(qhat*m,2, function(x) tapply(x,grp,sum)), ncol=K)+1/K ) } 291 | 292 | return(list(theta=theta, omega=omega)) } 293 | 294 | ## Quasi Newton update for q>0 295 | tpxQN <- function(move, Y, X, alpha, verb, admix, grp, doqn) 296 | { 297 | ## always check likelihood 298 | L <- tpxlpost(X=X, theta=move$theta, omega=move$omega, 299 | alpha=alpha, admix=admix, grp=grp) 300 | 301 | if(doqn < 0){ return(list(move=move, L=L, Y=Y)) } 302 | 303 | ## update Y accounting 304 | Y <- cbind(Y, tpxToNEF(theta=move$theta, omega=move$omega)) 305 | if(ncol(Y) < 3){ return(list(Y=Y, move=move, L=L)) } 306 | if(ncol(Y) > 3){ warning("mis-specification in quasi-newton update; please report this bug.") } 307 | 308 | ## Check quasinewton secant conditions and solve F(x) - x = 0. 309 | U <- as.matrix(Y[,2]-Y[,1]) 310 | V <- as.matrix(Y[,3]-Y[,2]) 311 | sUU <- sum(U^2) 312 | sVU <- sum(V*U) 313 | Ynew <- Y[,3] + V*(sVU/(sUU-sVU)) 314 | qnup <- tpxFromNEF(Ynew, n=nrow(move$omega), 315 | p=nrow(move$theta), K=ncol(move$theta)) 316 | 317 | ## check for a likelihood improvement 318 | Lqnup <- try(tpxlpost(X=X, theta=qnup$theta, omega=qnup$omega, 319 | alpha=alpha, admix=admix, grp=grp), silent=TRUE) 320 | 321 | if(inherits(Lqnup, "try-error")){ 322 | if(verb>10){ cat("(QN: try error) ") } 323 | return(list(Y=Y[,-1], move=move, L=L)) } 324 | 325 | if(verb>10){ cat(paste("(QN diff ", round(Lqnup-L,3), ")\n", sep="")) } 326 | 327 | if(Lqnup < L){ 328 | return(list(Y=Y[,-1], move=move, L=L)) } 329 | else{ 330 | L <- Lqnup 331 | Y <- cbind(Y[,2],Ynew) 332 | return( list(Y=Y, move=qnup, L=L) ) 333 | } 334 | } 335 | 336 | 337 | ## unnormalized log posterior (objective function) 338 | tpxlpost <- function(X, theta, omega, alpha, admix=TRUE, grp=NULL) 339 | { 340 | if(!inherits(X,"simple_triplet_matrix")){ stop("X needs to be a simple_triplet_matrix.") } 341 | K <- ncol(theta) 342 | 343 | if(admix){ L <- sum( X$v*log(tpxQ(theta=theta, omega=omega, doc=X$i, wrd=X$j)) ) } 344 | else{ L <- sum(tpxMixQ(X, omega, theta, grp)$lqlhd) } 345 | if(is.null(nrow(alpha))){ if(alpha != 0){ L <- L + sum(alpha*log(theta)) } } # unnormalized prior 346 | L <- L + sum(log(omega))/K 347 | 348 | return(L) } 349 | 350 | ## log marginal likelihood 351 | tpxML <- function(X, theta, omega, alpha, L, dcut, admix=TRUE, grp=NULL){ 352 | ## get the indices 353 | K <- ncol(theta) 354 | p <- nrow(theta) 355 | n <- nrow(omega) 356 | 357 | ## return BIC for simple finite mixture model 358 | if(!admix){ 359 | qhat <- tpxMixQ(X, omega, theta, grp, qhat=TRUE)$qhat 360 | ML <- sum(X$v*log(row_sums(qhat[X$i,]*theta[X$j,]))) 361 | return( ML - 0.5*( K*p + (K-1)*n )*log(sum(X)) ) } 362 | 363 | ML <- L + lfactorial(K) # lhd multiplied by label switching modes 364 | 365 | ## block-diagonal approx to determinant of the negative log hessian matrix 366 | q <- tpxQ(theta=theta, omega=omega, doc=X$i, wrd=X$j) 367 | D <- tpxHnegDet(X=X, q=q, theta=theta, omega=omega, alpha=alpha) 368 | 369 | D[D < dcut] <- dcut 370 | ML <- ML - 0.5*sum( D ) # -1/2 |-H| 371 | 372 | ML <- ML + (K*p + sum(omega>0.01))*log(2*pi)/2 # (d/2)log(2pi) 373 | if(is.null(nrow(alpha))){ # theta prior normalizing constant 374 | ML <- ML + K*( lgamma(p*(alpha+1)) - p*lgamma(alpha+1) ) } 375 | else{ ML <- ML + sum(lgamma(col_sums(alpha+1)) - col_sums(lgamma(alpha+1))) } # matrix version 376 | ## omega(phi) prior normalizing constant number of parameters 377 | ML <- ML + sum(D[-(1:p)]>dcut)*( lfactorial(K) - K*lgamma( 1+1/K ) ) # 378 | 379 | return(ML) } 380 | 381 | ## find residuals for X$v 382 | tpxResids <- function(X, theta, omega, grp=NULL, nonzero=TRUE) 383 | { 384 | if(!inherits(X,"simple_triplet_matrix")){ stop("X needs to be a simple_triplet_matrix.") } 385 | 386 | m <- row_sums(X) 387 | K <- ncol(theta) 388 | n <- nrow(X) 389 | phat <- sum(col_sums(X)>0) 390 | d <- n*(K-1) + K*( phat-1 ) 391 | 392 | if(nrow(omega) == nrow(X)){ 393 | qhat <- tpxQ(theta=theta, omega=omega, doc=X$i, wrd=X$j) 394 | xhat <- qhat*m[X$i] 395 | } else{ 396 | q <- tpxMixQ(X=X, omega=omega, theta=theta, grp=grp, qhat=TRUE)$qhat 397 | qhat <- row_sums(q[X$i,]*theta[X$j,]) 398 | xhat <- qhat*m[X$i] } 399 | 400 | if(nonzero || nrow(omega) < nrow(X)){ 401 | ## Calculations based on nonzero counts 402 | ## conditional adjusted residuals 403 | e <- X$v^2 - 2*(X$v*xhat - xhat^2) 404 | s <- qhat*m[X$i]*(1-qhat)^{1-m[X$i]} 405 | r <- sqrt(e/s) 406 | df <- length(r)*(1-d/(n*phat)) 407 | R <- sum(r^2) 408 | } 409 | else{ 410 | ## full table calculations 411 | e <- (X$v^2 - 2*X$v*m[X$i]*qhat) 412 | s <- m[X$i]*qhat*(1-qhat) 413 | fulltable <- .C("RcalcTau", 414 | n = as.integer(nrow(omega)), 415 | p = as.integer(nrow(theta)), 416 | K = as.integer(ncol(theta)), 417 | m = as.double(m), 418 | omega = as.double(omega), 419 | theta = as.double(theta), 420 | tau = double(1), size=double(1), 421 | PACKAGE="maptpx" ) 422 | tau <- fulltable$tau 423 | R <- sum(e/s) + tau 424 | df <- fulltable$size - phat - d 425 | r <- suppressWarnings(sqrt(e/s + tau)) 426 | r[is.nan(r)] <- 0 ## should not happen, but can theoretically 427 | } 428 | 429 | ## collect and output 430 | sig2 <- R/df 431 | rho <- suppressWarnings(pchisq(R, df=df, lower.tail=FALSE)) 432 | D <- list(dispersion=sig2, pvalue=rho, df=df) 433 | return( list(s=s, e=e, r=r, D=D) ) } 434 | 435 | 436 | ## fast initialization functions for theta (after increasing K) and omega (given theta) 437 | tpxThetaStart <- function(X, theta, omega, K) 438 | { 439 | R <- tpxResids(X, theta=theta, omega=omega, nonzero=TRUE) 440 | X$v <- R$e*(R$r>3) + 1/ncol(X) 441 | Kpast <- ncol(theta) 442 | Kdiff <- K-Kpast 443 | if(Kpast != ncol(omega) || Kpast >= K){ stop("bad K in tpxThetaStart") } 444 | initheta <- normalize(Kpast*theta+rowMeans(theta), byrow=FALSE) 445 | n <- nrow(X) 446 | ki <- matrix(1:(n-n%%Kdiff), ncol=Kdiff) 447 | for(i in 1:Kdiff){ initheta <- cbind(initheta, (col_sums(X[ki[,i],])+1/ncol(X))/(sum(X[ki[,i],])+1)) } 448 | return( initheta ) 449 | } 450 | 451 | tpxOmegaStart <- function(X, theta) 452 | { 453 | if(!inherits(X,"simple_triplet_matrix")){ stop("X needs to be a simple_triplet_matrix.") } 454 | omega <- try(tcrossprod_simple_triplet_matrix(X, solve(t(theta)%*%theta)%*%t(theta)), silent=TRUE ) 455 | if(inherits(omega,"try-error")){ return( matrix( 1/ncol(theta), nrow=nrow(X), ncol=ncol(theta) ) ) } 456 | omega[omega <= 0] <- .5 457 | return( normalize(omega, byrow=TRUE) ) 458 | } 459 | 460 | 461 | ## fast computation of sparse P(X) for X>0 462 | tpxQ <- function(theta, omega, doc, wrd){ 463 | 464 | if(length(wrd)!=length(doc)){stop("index mis-match in tpxQ") } 465 | if(ncol(omega)!=ncol(theta)){stop("theta/omega mis-match in tpxQ") } 466 | 467 | out <- .C("RcalcQ", 468 | n = as.integer(nrow(omega)), 469 | p = as.integer(nrow(theta)), 470 | K = as.integer(ncol(theta)), 471 | doc = as.integer(doc-1), 472 | wrd = as.integer(wrd-1), 473 | N = as.integer(length(wrd)), 474 | omega = as.double(omega), 475 | theta = as.double(theta), 476 | q = double(length(wrd)), 477 | PACKAGE="maptpx" ) 478 | 479 | return( out$q ) } 480 | 481 | ## model and component likelihoods for mixture model 482 | tpxMixQ <- function(X, omega, theta, grp=NULL, qhat=FALSE){ 483 | if(is.null(grp)){ grp <- rep(1, nrow(X)) } 484 | K <- ncol(omega) 485 | n <- nrow(X) 486 | mixhat <- .C("RmixQ", 487 | n = as.integer(nrow(X)), 488 | p = as.integer(ncol(X)), 489 | K = as.integer(K), 490 | N = as.integer(length(X$v)), 491 | B = as.integer(nrow(omega)), 492 | cnt = as.double(X$v), 493 | doc = as.integer(X$i-1), 494 | wrd = as.integer(X$j-1), 495 | grp = as.integer(as.numeric(grp)-1), 496 | omega = as.double(omega), 497 | theta = as.double(theta), 498 | Q = double(K*n), 499 | PACKAGE="maptpx") 500 | ## model and component likelihoods 501 | lQ <- matrix(mixhat$Q, ncol=K) 502 | lqlhd <- log(row_sums(exp(lQ))) 503 | lqlhd[is.infinite(lqlhd)] <- -600 # remove infs 504 | if(qhat){ 505 | qhat <- exp(lQ-lqlhd) 506 | ## deal with numerical overload 507 | infq <- row_sums(qhat) < .999 508 | if(sum(infq)>0){ 509 | qhat[infq,] <- 0 510 | qhat[n*(apply(matrix(lQ[infq,],ncol=K),1,which.max)-1) + (1:n)[infq]] <- 1 } 511 | } 512 | return(list(lQ=lQ, lqlhd=lqlhd, qhat=qhat)) } 513 | 514 | ## negative log hessian block diagonal matrix for theta & omega 515 | tpxHnegDet <- function(X, q, theta, omega, alpha){ 516 | K <- ncol(theta) 517 | n <- nrow(omega) 518 | 519 | ## sparse Xij/Qij^2 520 | Xq <- X 521 | Xq$v <- Xq$v/q^2 522 | 523 | ## negative 2nd derivitive matrices for theta 524 | HT <- tcrossprod_simple_triplet_matrix(t(Xq), apply(omega, 1, function(v) v%o%v ) ) 525 | HT[,K*(0:(K-1))+1:K] <- HT[,K*(0:(K-1))+1:K] + alpha/theta^2 # will break for alpha<=1 526 | DT <- apply(HT, 1, tpxlogdet) 527 | 528 | ## ditto for omega 529 | HW <- matrix(.C("RnegHW", 530 | n = as.integer(nrow(omega)), 531 | p = as.integer(nrow(theta)), 532 | K = as.integer(K-1), 533 | omeg = as.double(omega[,-1]), 534 | thet = as.double(theta[,-1]), 535 | doc = as.integer(X$i-1), 536 | wrd = as.integer(X$j-1), 537 | cnt = as.double(X$v), 538 | q = as.double(q), 539 | N = as.integer(length(q)), 540 | H = double(n*(K-1)^2), 541 | PACKAGE="maptpx")$H, 542 | nrow=(K-1)^2, ncol=n) 543 | DW <- apply(HW, 2, tpxlogdet) 544 | return( c(DT,DW) ) } 545 | 546 | ## functions to move theta/omega to and from NEF. 547 | tpxToNEF <- function(theta, omega){ 548 | n <- nrow(omega) 549 | p <- nrow(theta) 550 | K <- ncol(omega) 551 | return(.C("RtoNEF", 552 | n=as.integer(n), p=as.integer(p), K=as.integer(K), 553 | Y=double((p-1)*K + n*(K-1)), 554 | theta=as.double(theta), tomega=as.double(t(omega)), 555 | PACKAGE="maptpx")$Y) 556 | } 557 | 558 | ## 'From' NEF representation back to probabilities 559 | tpxFromNEF <- function(Y, n, p, K){ 560 | bck <- .C("RfromNEF", 561 | n=as.integer(n), p=as.integer(p), K=as.integer(K), 562 | Y=as.double(Y), theta=double(K*p), tomega=double(K*n), 563 | PACKAGE="maptpx") 564 | return(list(omega=t( matrix(bck$tomega, nrow=K) ), theta=matrix(bck$theta, ncol=K))) 565 | } 566 | 567 | ## utility log determinant function for speed/stabilty 568 | tpxlogdet <- function(v){ 569 | v <- matrix(v, ncol=sqrt(length(v))) 570 | 571 | if( sum(zeros <- colSums(v)==0)!=0 ){ 572 | cat("warning: boundary values in laplace approx\n") 573 | v <- v[-zeros,-zeros] } 574 | 575 | return(determinant(v, logarithm=TRUE)$modulus) } 576 | 577 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | maptpx 2 | ====== 3 | 4 | MAP estimation of topic models in R. It implements estimation via posterior maximization for latent topic models in text analysis, as described in "On Estimation and Selection for Topic Models". These routines were previously part of textir, but have been spun off since version 2.0 of that package. This maptpx package is no longer actively maintained; instead, we are focusing on developing faster distributed factor models based on distributed multinomial regression, as implemented in the distrom package. 5 | 6 | If you want to take advantage of openmp parallelization, uncomment the relevant flags in src/MAKEVARS before compiling. 7 | -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | citHeader("To cite use:") 2 | 3 | citEntry(entry = "Inproceedings", 4 | title = "On Estimation for Topic Models", 5 | author = personList(as.person("Matt Taddy")), 6 | year = "2012", 7 | booktitle = "AISTATS", 8 | note = "http://jmlr.csail.mit.edu/proceedings/papers/v22/taddy12/taddy12.pdf", 9 | 10 | textVersion = 11 | paste("Matt Taddy","(AISTATS 2012)", 12 | "On Estimation of Topic Models,", 13 | "http://jmlr.csail.mit.edu/proceedings/papers/v22/taddy12/taddy12.pdf") 14 | 15 | ) -------------------------------------------------------------------------------- /man/counts.Rd: -------------------------------------------------------------------------------- 1 | \name{counts} 2 | \alias{stm_tfidf} 3 | \alias{normalize} 4 | \title{Utilities for count matrices} 5 | \description{ 6 | Tools for manipulating (sparse) count matrices.} 7 | \usage{ 8 | normalize(x,byrow=TRUE) 9 | stm_tfidf(x) 10 | } 11 | \arguments{ 12 | \item{x}{A \code{simple_triplet_matrix} or \code{matrix} of counts. } 13 | \item{byrow}{Whether to normalize by row or column totals.} 14 | } 15 | \value{ 16 | \code{normalize} divides the counts by row or column totals, and \code{stm_tfidf} returns a matrix with entries \eqn{x_{ij} \log[ n/(d_j+1) ]}, where \eqn{x_{ij}} is term-j frequency in document-i, 17 | and \eqn{d_j} is the number of documents containing term-j. 18 | } 19 | \author{ 20 | Matt Taddy \email{mataddy@gmail.com} 21 | } 22 | \examples{ 23 | normalize( matrix(1:9, ncol=3) ) 24 | normalize( matrix(1:9, ncol=3), byrow=FALSE ) 25 | 26 | (x <- matrix(rbinom(15,size=2,prob=.25),ncol=3)) 27 | stm_tfidf(x) 28 | } 29 | -------------------------------------------------------------------------------- /man/predict.topics.Rd: -------------------------------------------------------------------------------- 1 | \name{predict.topics} 2 | \alias{predict.topics} 3 | \title{ 4 | topic predict 5 | } 6 | \description{ Predict function for Topic Models } 7 | \usage{ 8 | \method{predict}{topics}( object, newcounts, loglhd=FALSE, ... ) 9 | } 10 | \arguments{ 11 | \item{object}{An output object from the \code{topics} function, or the corresponding matrix of estimated topics.} 12 | \item{newcounts}{ An \code{nrow(object$theta)}-column matrix of multinomial phrase/category counts for new documents/observations. 13 | Can be either a simple \code{matrix} or a \code{simple_triplet_matrix}. } 14 | \item{loglhd}{ Whether or not to calculate and return \code{sum(x*log(p))}, 15 | the un-normalized log likelihood. } 16 | \item{...}{Additional arguments to the undocumented internal \code{tpx*} functions. } 17 | } 18 | \details{ Under the default mixed-membership topic model, this function uses sequential quadratic programming to fit topic weights \eqn{\Omega} for new documents. 19 | Estimates for each new \eqn{\omega_i} are, conditional on \code{object$theta}, 20 | MAP in the (K-1)-dimensional logit transformed parameter space. } 21 | \value{ The output is an \code{nrow(newcounts)} by \code{object$K} matrix of document topic weights, or a list with including these weights as \code{W} and the log likelihood as \code{L}. } 22 | \references{ 23 | Taddy (2012), \emph{On Estimation and Selection for Topic Models}. 24 | \url{http://arxiv.org/abs/1109.4518} 25 | } 26 | \author{ 27 | Matt Taddy \email{mataddy@gmail.com} 28 | } 29 | 30 | \seealso{ 31 | topics, plot.topics, summary.topics, congress109 32 | } 33 | \examples{ 34 | 35 | ## Simulate some data 36 | omega <- t(rdir(500, rep(1/10,10))) 37 | theta <- rdir(10, rep(1/1000,1000)) 38 | Q <- omega\%*\%t(theta) 39 | counts <- matrix(ncol=1000, nrow=500) 40 | totals <- rpois(500, 200) 41 | for(i in 1:500){ counts[i,] <- rmultinom(1, size=totals[i], prob=Q[i,]) } 42 | 43 | ## predict omega given theta 44 | W <- predict.topics( theta, counts ) 45 | plot(W, omega, pch=21, bg=8) 46 | 47 | } 48 | 49 | 50 | -------------------------------------------------------------------------------- /man/rdir.Rd: -------------------------------------------------------------------------------- 1 | \name{rdir} 2 | \alias{rdir} 3 | \title{Dirichlet RNG} 4 | \description{ 5 | Generate random draws from a Dirichlet distribution} 6 | \usage{ 7 | rdir(n, alpha) 8 | } 9 | \arguments{ 10 | \item{n}{ The number of observations. } 11 | \item{alpha}{A \code{vector} of scale parameters, such that \eqn{E[p_j] = \alpha_j/\sum_i\alpha_i}. } 12 | } 13 | \value{ 14 | An \code{n} column matrix containing the observations. 15 | } 16 | \author{ 17 | Matt Taddy \email{mataddy@gmail.com} 18 | } 19 | \examples{ 20 | rdir(3,rep(1,6)) 21 | } 22 | -------------------------------------------------------------------------------- /man/topicVar.Rd: -------------------------------------------------------------------------------- 1 | \name{topicVar} 2 | \alias{topicVar} 3 | \alias{expit} 4 | \alias{logit} 5 | \title{ 6 | topic variance 7 | } 8 | \description{ Tools for looking at the variance of document-topic weights. } 9 | \usage{ 10 | topicVar(counts, theta, omega) 11 | logit(prob) 12 | expit(eta) 13 | } 14 | \arguments{ 15 | \item{counts}{A matrix of multinomial response counts, as inputed to the \code{topics} or \code{predict.topics} functions.} 16 | \item{theta}{A fitted topic matrix, as ouput from the \code{topics} or \code{predict.topics} functions.} 17 | \item{omega}{A fitted document topic-weight matrix, as ouput from the \code{topics} or \code{predict.topics} functions.} 18 | \item{prob}{A probability vector (positive and sums to one) or a matrix with probability vector rows.} 19 | \item{eta}{A vector of the natural exponential family parameterization for a probability vector (with first category taken as null) or a matrix with each row the NEF parameters for a single observation.} 20 | } 21 | \details{ These function use the natural exponential family (NEF) parametrization of a probability vector \eqn{q_0 ... q_{K-1}} with the first element corresponding to a 'null' category; that is, with 22 | \eqn{NEF(q) = e_1 ... e_{K-1}} and setting \eqn{e_0 = 0}, the probabilities are 23 | \deqn{q_k = \frac{exp[e_k]}{1 + \sum exp[e_j]}.} 24 | Refer to Taddy (2012) for details. } 25 | \value{ \code{topicVar} returns an array with dimensions \eqn{(K-1,K-1,n)}, where \code{K=ncol(omega)=ncol(theta)} and \code{n = nrow(counts) = nrow(omega)}, filled with the posterior covariance matrix for the NEF parametrization of each row of \code{omega}. Utility \code{logit} performs the NEF transformation and \code{expit} reverses it. } 26 | \references{ 27 | Taddy (2012), \emph{On Estimation and Selection for Topic Models}. 28 | \url{http://arxiv.org/abs/1109.4518} 29 | } 30 | \author{ 31 | Matt Taddy \email{mataddy@gmail.com} 32 | } 33 | 34 | \seealso{ 35 | topics, predict.topics 36 | } 37 | -------------------------------------------------------------------------------- /man/topics.Rd: -------------------------------------------------------------------------------- 1 | \name{topics} 2 | \alias{topics} 3 | \title{ 4 | Estimation for Topic Models 5 | } 6 | \description{ MAP estimation of Topic models } 7 | \usage{ 8 | topics(counts, K, shape=NULL, initopics=NULL, 9 | tol=0.1, bf=FALSE, kill=2, ord=TRUE, verb=1, ...) 10 | } 11 | \arguments{ 12 | \item{counts}{ 13 | A matrix of multinomial response counts in \code{ncol(counts)} phrases/categories 14 | for \code{nrow(counts)} documents/observations. Can be either a simple \code{matrix} or a \code{simple_triplet_matrix}.} 15 | \item{K}{The number of latent topics. If \code{length(K)>1}, 16 | \code{topics} will find the Bayes factor (vs a null single topic model) for each element and return parameter 17 | estimates for the highest probability K.} 18 | \item{shape}{ Optional argument to specify the Dirichlet prior concentration parameter as \code{shape} for topic-phrase probabilities. Defaults to \code{1/(K*ncol(counts))}. For fixed single \code{K}, this can also be a \code{ncol(counts)} 19 | by \code{K} matrix of unique shapes for each topic element.} 20 | \item{initopics}{ Optional start-location for \eqn{[\theta_1 ... \theta_K]}, the topic-phrase probabilities. 21 | Dimensions must accord with the smallest element of \code{K}. 22 | If \code{NULL}, the initial estimates are built by incrementally adding topics. } 23 | \item{tol}{ Convergence tolerance: optimization stops, conditional on some extra checks, when the \emph{absolute} posterior increase over a full paramater set update is less than \code{tol}.} 24 | \item{bf}{An indicator for whether or not to calculate the Bayes factor for univariate \code{K}. 25 | If \code{length(K)>1}, this is ignored and Bayes factors are always calculated. } 26 | \item{kill}{For choosing from multiple \code{K} numbers of topics (evaluated in increasing order), 27 | the search will stop after \code{kill} consecutive drops in the corresponding Bayes factor. Specify \code{kill=0} 28 | if you want Bayes factors for all elements of \code{K}. } 29 | \item{ord}{If \code{TRUE}, the returned topics (columns of \code{theta}) will be ordered by decreasing usage (i.e., 30 | by decreasing \code{colSums(omega)}).} 31 | \item{verb}{A switch for controlling printed output. \code{verb > 0} will print something, with the level of detail 32 | increasing with \code{verb}.} 33 | \item{...}{Additional arguments to the undocumented internal \code{tpx*} functions. } 34 | } 35 | \details{A latent topic model represents each i'th document's term-count vector \eqn{X_i} 36 | (with \eqn{\sum_{j} x_{ij} = m_i} total phrase count) 37 | as having been drawn from a mixture of \code{K} multinomials, each parameterized by topic-phrase 38 | probabilities \eqn{\theta_i}, such that 39 | \deqn{X_i \sim MN(m_i, \omega_1 \theta_1 + ... + \omega_K\theta_K).} 40 | We assign a K-dimensional Dirichlet(1/K) prior to each document's topic weights 41 | \eqn{[\omega_{i1}...\omega_{iK}]}, and the prior on each \eqn{\theta_k} is Dirichlet with concentration \eqn{\alpha}. 42 | The \code{topics} function uses quasi-newton accelerated EM, augmented with sequential quadratic programming 43 | for conditional \eqn{\Omega | \Theta} updates, to obtain MAP estimates for the topic model parameters. 44 | We also provide Bayes factor estimation, from marginal likelihood 45 | calculations based on a Laplace approximation around the converged MAP parameter estimates. If input \code{length(K)>1}, these 46 | Bayes factors are used for model selection. Full details are in Taddy (2011). } 47 | \note{ Estimates are actually functions of the MAP (K-1 or p-1)-dimensional 48 | logit transformed natural exponential family parameters. } 49 | \value{ 50 | An \code{topics} object list with entries 51 | \item{K}{The number of latent topics estimated. If input \code{length(K)>1}, 52 | on output this is a single value corresponding to the model with the highest Bayes factor. } 53 | \item{theta}{The \code{ncol{counts}} by \code{K} matrix of estimated topic-phrase probabilities.} 54 | \item{omega}{The \code{nrow{counts}} by \code{K} matrix of estimated document-topic weights. } 55 | \item{BF}{The log Bayes factor for each number of topics in the input \code{K}, against a null single topic model.} 56 | \item{D}{Residual dispersion: for each element of \code{K}, estimated dispersion parameter 57 | (which should be near one for the multinomial), degrees of freedom, and p-value for a test of whether the true dispersion is \eqn{>1}.} 58 | \item{X}{The input count matrix, in \code{dgTMatrix} format.} 59 | } 60 | \references{ 61 | Taddy (2012), \emph{On Estimation and Selection for Topic Models}. 62 | \url{http://arxiv.org/abs/1109.4518} 63 | } 64 | \author{ 65 | Matt Taddy \email{mataddy@gmail.com} 66 | } 67 | \seealso{ 68 | plot.topics, summary.topics, predict.topics, wsjibm, congress109, we8there 69 | } 70 | \examples{ 71 | 72 | ## Simulation Parameters 73 | K <- 10 74 | n <- 100 75 | p <- 100 76 | omega <- t(rdir(n, rep(1/K,K))) 77 | theta <- rdir(K, rep(1/p,p)) 78 | 79 | ## Simulated counts 80 | Q <- omega\%*\%t(theta) 81 | counts <- matrix(ncol=p, nrow=n) 82 | totals <- rpois(n, 100) 83 | for(i in 1:n){ counts[i,] <- rmultinom(1, size=totals[i], prob=Q[i,]) } 84 | 85 | ## Bayes Factor model selection (should choose K or nearby) 86 | summary(simselect <- topics(counts, K=K+c(-5:5)), nwrd=0) 87 | 88 | ## MAP fit for given K 89 | summary( simfit <- topics(counts, K=K, verb=2), n=0 ) 90 | 91 | ## Adjust for label switching and plot the fit (color by topic) 92 | toplab <- rep(0,K) 93 | for(k in 1:K){ toplab[k] <- which.min(colSums(abs(simfit$theta-theta[,k]))) } 94 | par(mfrow=c(1,2)) 95 | tpxcols <- matrix(rainbow(K), ncol=ncol(theta), byrow=TRUE) 96 | plot(theta,simfit$theta[,toplab], ylab="fitted values", pch=21, bg=tpxcols) 97 | plot(omega,simfit$omega[,toplab], ylab="fitted values", pch=21, bg=tpxcols) 98 | title("True vs Fitted Values (color by topic)", outer=TRUE, line=-2) 99 | 100 | ## The S3 method plot functions 101 | par(mfrow=c(1,2)) 102 | plot(simfit, lgd.K=2) 103 | plot(simfit, type="resid") 104 | 105 | } 106 | -------------------------------------------------------------------------------- /src/Makevars: -------------------------------------------------------------------------------- 1 | ## uncomment the flags below for openmp 2 | PKG_CFLAGS = -DRPRINT #-fopenmp 3 | PKG_LIBS = ${LAPACK_LIBS} ${BLAS_LIBS} ${FLIBS} #-lgomp 4 | -------------------------------------------------------------------------------- /src/init.c: -------------------------------------------------------------------------------- 1 | #include // for NULL 2 | #include 3 | 4 | /* .C calls */ 5 | extern void Romega( void *, void *, void *, void *, void *, void *, 6 | void *, void *, void *, void *, void *, void *); 7 | extern void RmixQ(void *, void *, void *, void *, void *, void *, 8 | void *, void *, void *, void *, void *, void *); 9 | extern void RcalcQ(void *, void *, void *, void *, void *, void *, 10 | void *, void *, void *); 11 | extern void RcalcTau(void *, void *, void *, void *, void *, void *, 12 | void *, void * ); 13 | extern void RnegHW( void *, void *, void *, void *, void *, void *, 14 | void *, void *, void *, void *, void *); 15 | extern void Rlogit( void *, void *, void *); 16 | extern void Rexpit( void *, void *, void *); 17 | extern void RtoNEF( void *, void *, void *, void *, void *, void *); 18 | extern void RfromNEF( void *, void *, void *, void *, void *, void *); 19 | extern void Rzhat( void *, void *, void *, void *, void *, void *, 20 | void *, void *, void *); 21 | 22 | static const R_CMethodDef CEntries[] = { 23 | {"Romega", (DL_FUNC) &Romega, 12}, 24 | {"RmixQ", (DL_FUNC) &RmixQ, 12}, 25 | {"RcalcQ", (DL_FUNC) &RcalcQ, 9}, 26 | {"RcalcTau", (DL_FUNC) &RcalcTau, 8}, 27 | {"RnegHW", (DL_FUNC) &RnegHW, 11}, 28 | {"Rlogit", (DL_FUNC) &Rlogit, 3}, 29 | {"Rexpit", (DL_FUNC) &Rexpit, 3}, 30 | {"RtoNEF", (DL_FUNC) &RtoNEF, 6}, 31 | {"RfromNEF", (DL_FUNC) &RfromNEF, 6}, 32 | {"Rzhat", (DL_FUNC) &Rzhat, 9}, 33 | {NULL, NULL, 0} 34 | }; 35 | 36 | void R_init_maptpx(DllInfo *dll) 37 | { 38 | R_registerRoutines(dll, CEntries, NULL, NULL, NULL); 39 | R_useDynamicSymbols(dll, FALSE); 40 | } 41 | -------------------------------------------------------------------------------- /src/latools.c: -------------------------------------------------------------------------------- 1 | #include "latools.h" 2 | #include "rhelp.h" 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | 10 | /* 11 | * la_dgemm: blas wrapper 12 | * 13 | * C := alpha*op(A)*op(B) + beta*C. 14 | */ 15 | 16 | void la_dgemm(int tA, int tB, int Arow, int Acol, int Brow, int Bcol, int Crow, int Ccol, 17 | double *A, double *B, double *C, double alpha, double beta) 18 | { 19 | char opA, opB; 20 | int m, n, k, lda, ldb, ldc; 21 | 22 | if(tA){ opA = 'T'; m = Acol; k = Arow; lda = k; } 23 | else{ opA = 'N'; m = Arow; k = Acol; lda = m; } 24 | 25 | if(tB){ opB = 'T'; n = Brow; assert(Bcol==k); ldb = n; } 26 | else{ opB = 'N'; n = Bcol; assert(Brow==k); ldb = k; } 27 | 28 | ldc = m; assert(Crow==m); assert(Ccol==n); 29 | 30 | dgemm_(&opA,&opB,&m,&n,&k,&alpha,A,&lda,B,&ldb,&beta,C,&ldc); 31 | } 32 | 33 | 34 | /* 35 | * la_dsymm: blas wrapper 36 | * 37 | * C := alpha*A*B + beta*C or alpha*B*A + beta*C, where A is symmetric 38 | */ 39 | 40 | 41 | void la_dsymm(int Alhs, int Arow, int Acol, int Brow, int Bcol, int Crow, int Ccol, 42 | double *A, double *B, double *C, double alpha, double beta) 43 | { 44 | int m, n, lda, ldb, ldc; 45 | char Atri = 'L'; // reference the lower triangle of column major A. 46 | char Aside; 47 | 48 | assert(Arow=Acol); 49 | 50 | m = Crow; n = Ccol; lda = Arow; ldb = Brow; ldc = Crow; 51 | assert(ldb == m); 52 | 53 | if(Alhs){ Aside='L'; assert(lda == m); } 54 | else{ Aside='R'; assert(lda == n); } 55 | 56 | dsymm_(&Aside,&Atri,&m,&n,&alpha,A,&lda,B,&ldb,&beta,C,&ldc); 57 | } 58 | 59 | 60 | /* 61 | * la_dposv: lapack wrapper 62 | * 63 | * Solves the system A*X = B, where A is symmetric AND positive definite. 64 | * Upon return, B is the solution and A contains its cholesky decomp. 65 | */ 66 | 67 | int la_dposv(int Arow, int Bcol, double *A, double *B) 68 | { 69 | char cholTri = 'L'; // return cholesky decomp of col-major A in lower triangle 70 | int info; 71 | dposv_(&cholTri, &Arow, &Bcol, A, &Arow, B, &Arow, &info); 72 | return info; // if info = -i, i'th arg is wrong. if info > A is not pos-def. 73 | } 74 | 75 | 76 | /* 77 | * la_dgesv: lapack wrapper 78 | * 79 | * Solves the system A*X = B, where A is a general square matrix. 80 | * Upon return, B is the solution and A contains L and U from 81 | * the decomposition A = P * L * U (unit diag for L are not returned) 82 | */ 83 | 84 | int la_dgesv(int Arow, int Bcol, double *A, double *B) 85 | { 86 | int info; 87 | int *ip = new_ivec(Arow); /* pivot indices which define P; 88 | row i was interchanged with row ip[i]*/ 89 | dgesv_(&Arow, &Bcol, A, &Arow, ip, B, &Arow, &info); 90 | return info; // if info = -i, i'th arg is wrong. if info > 0, A is not pos-def. 91 | } 92 | 93 | 94 | /* 95 | * la_dsysv: lapack wrapper 96 | * 97 | * Solves the system A*X = B, where A is a symmetric matrix. 98 | * Upon return, B is block diagonal matrix D and the 99 | * multipliers used to obtain the factor U or L from the 100 | * factorization A = U*D*U**T or A = L*D*L**T as computed by DSYTRF. 101 | */ 102 | 103 | int la_dsysv(int Arow, int Bcol, double *A, double *B) 104 | { 105 | int info; 106 | double wkopt; 107 | int lwork = -1; 108 | int *ipiv = new_ivec(Arow); 109 | char uplo = 'L'; 110 | 111 | dsysv_(&uplo, &Arow, &Bcol, A, &Arow, ipiv, B, &Arow, &wkopt, &lwork, &info); 112 | 113 | lwork = (int) wkopt; 114 | double *work = new_dvec(lwork); 115 | 116 | dsysv_(&uplo, &Arow, &Bcol, A, &Arow, ipiv, B, &Arow, work, &lwork, &info); 117 | 118 | free(work); 119 | free(ipiv); 120 | return info; 121 | } 122 | 123 | 124 | /* 125 | * la_dpotrf: lapack wrapper 126 | * 127 | * Replaces col-major A with the lower triangle cholesky L s.t. A = LL' 128 | * Upper triangle is untouched. 129 | * 130 | */ 131 | 132 | int la_dpotrf(int dim, double *A) 133 | { 134 | int info; 135 | char cholTri = 'L'; // return cholesky decomp of col-major A in lower triangle 136 | dpotrf_(&cholTri,&dim,A,&dim,&info); 137 | return info; // if info = -i, i'th arg is wrong. if info > 0, A is not pos-def. 138 | } 139 | 140 | 141 | /* 142 | * I'll take one brand-new ncxnr array of doubles. 143 | * 144 | */ 145 | 146 | double** new_mat(int nr, int nc){ 147 | 148 | if(nr == 0 || nc == 0) return NULL; 149 | int j; 150 | double **mat = (double**) malloc(sizeof(double*) * (unsigned) nc); 151 | assert(mat); 152 | mat[0] = (double*) malloc(sizeof(double) * (unsigned) (nc*nr)); 153 | assert(mat[0]); 154 | 155 | for(j=1; j 0) assert(v && orig); 427 | for(i=0; i 0) assert(v && orig); 564 | for(i=0; i 11 | #include 12 | 13 | /* These will all be defined if you're compiling with R */ 14 | 15 | // fns from lapack 16 | extern void dpotrf_(char*, int*, double*, int*, int*); 17 | extern void dposv_(char*, int*, int*, double*, int*, double*, int*, int*); 18 | extern void dgesv_(int*,int*,double*,int*,int*,double*,int*,int*); 19 | extern void dsysv_(char*, int*,int *,double *,int*,int*,double*,int*,double*,int*,int*); 20 | 21 | // fns from blas 22 | extern void dgemm_(char*, char*, int*, int*, int*, double*, 23 | double*, int*, double*, int*, double*, double*, int*); 24 | extern void dsymm_(char*, char*, int*, int*, double*, 25 | double*, int*, double*, int*, double*, double*, int*); 26 | 27 | // friendlier versions. 28 | 29 | void la_dgemm(int tA, int tB, int Arow, int Acol, int Brow, int Bcol, int Crow, int Ccol, 30 | double *A, double *B, double *C, double alpha, double beta); 31 | void la_dsymm(int Alhs, int Arow, int Acol, int Brow, int Bcol, int Crow, int Ccol, 32 | double *A, double *B, double *C, double alpha, double beta); 33 | 34 | int la_dposv(int Arow, int Bcol, double *A, double *B); 35 | int la_dgesv(int Arow, int Bcol, double *A, double *B); 36 | int la_dsysv(int Arow, int Bcol, double *A, double *B); 37 | int la_dpotrf(int dim, double *A); 38 | 39 | // 2D double array tools. 40 | 41 | double** new_mat(int nr, int nc); 42 | double** new_mat_fromv(int nr, int nc, double *v); 43 | int** new_imat(int nr, int nc); 44 | int** new_imat_fromv(int nr, int nc, int *v); 45 | double** new_zero_mat(int nr, int nc); 46 | double** new_val_mat(int nr, int nc, double val); 47 | double** new_id_mat(int dim); 48 | double** new_dup_mat(int nr, int nc, double **orig); 49 | void delete_mat(double **mat); 50 | void delete_imat(int **mat); 51 | void copy_mat(int nr, int nc, double** mat, double** orig); 52 | void print_mat(int nr, int nc, double **mat, FILE *outfile); 53 | void print_imat(int nr, int nc, int **mat, FILE *outfile); 54 | void zero_mat(double **mat, int nr, int nc); 55 | void id_mat(double **mat, int dim); 56 | void diag_mat(double **mat, double *v, int dim); 57 | 58 | // 1D double array tools. 59 | 60 | double* new_dvec(int n); 61 | double* new_dseq(double from, double to, int n); 62 | double* new_dzero(int n); 63 | double* new_dup_dvec(double *v, int n); 64 | double sum_dvec(double *v, int n); 65 | void zero_dvec(double *v, int n); 66 | void copy_dvec(double *v, double *orig, int n); 67 | void print_dvec(double *v, int n, FILE *outfile); 68 | double* drep(double val, int n); 69 | double normalize(double *v, int n); 70 | 71 | // 1D int array tools. 72 | 73 | int* new_ivec(int n); 74 | int* new_iseq(int from, int to); 75 | int* new_izero(int n); 76 | int* new_dup_ivec(int *v, int n); 77 | int sum_ivec(int *v, int n); 78 | void zero_ivec(int *v, int n); 79 | void copy_ivec(int *v, int *orig, int n); 80 | void print_ivec(int *v, int n, FILE *outfile); 81 | int* irep(int val, int n); 82 | 83 | #endif 84 | 85 | -------------------------------------------------------------------------------- /src/rhelp.c: -------------------------------------------------------------------------------- 1 | /******************************************************************************** 2 | * 3 | * Bayesian Regression and Adaptive Sampling with Gaussian Process Trees 4 | * Copyright (C) 2005, University of California 5 | * 6 | * This library is free software; you can redistribute it and/or 7 | * modify it under the terms of the GNU Lesser General Public 8 | * License as published by the Free Software Foundation; either 9 | * version 2.1 of the License, or (at your option) any later version. 10 | * 11 | * This library is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | * Lesser General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU Lesser General Public 17 | * License along with this library; if not, write to the Free Software 18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 19 | * 20 | * Questions? Contact Robert B. Gramacy (rbgramacy@ams.ucsc.edu) 21 | * 22 | ********************************************************************************/ 23 | 24 | #include "rhelp.h" 25 | #ifdef RPRINT 26 | #include 27 | #include 28 | FILE *mystdout = (FILE*) 0; 29 | FILE *mystderr = (FILE*) 1; 30 | #else 31 | FILE *mystdout = stdio; 32 | FILE *mystderr = stderr; 33 | #endif 34 | #include 35 | #include 36 | #include 37 | 38 | /* 39 | * myprintf: 40 | * 41 | * a function many different types of printing-- in particular, using 42 | * the Rprintf if the code happens to be compiled with RPRINT, 43 | * othersie fprintf (takes the same arguments as fprintf) 44 | */ 45 | 46 | void myprintf(FILE *outfile, const char *str, ...) 47 | { 48 | va_list argp; 49 | va_start(argp, str); 50 | 51 | #ifdef RPRINT 52 | if(outfile == mystdout) Rvprintf(str, argp); 53 | else if(outfile == mystderr) REvprintf(str, argp); 54 | else vfprintf(outfile, str, argp); 55 | #else 56 | vfprintf(outfile, str, argp); 57 | #endif 58 | 59 | va_end(argp); 60 | } 61 | 62 | 63 | #ifndef RPRINT 64 | /* 65 | * error: 66 | * 67 | * printf style function that reports errors to stderr 68 | */ 69 | 70 | void error(const char *str, ...) 71 | { 72 | va_list argp; 73 | va_start(argp, str); 74 | 75 | myprintf(stderr, "ERROR: "); 76 | vfprintf(stderr, str, argp); 77 | 78 | va_end(argp); 79 | myflush(stderr); 80 | 81 | /* add a final newline */ 82 | myprintf(stderr, "\n"); 83 | 84 | /* kill the code */ 85 | assert(0); 86 | } 87 | 88 | 89 | /* 90 | * warning: 91 | * 92 | * printf style function that reports warnings to stderr 93 | */ 94 | 95 | void warning(const char *str, ...) 96 | { 97 | va_list argp; 98 | va_start(argp, str); 99 | 100 | myprintf(stderr, "WARNING: "); 101 | vfprintf(stderr, str, argp); 102 | 103 | va_end(argp); 104 | myflush(stderr); 105 | 106 | /* add a final newline */ 107 | myprintf(stderr, "\n"); 108 | } 109 | #endif 110 | 111 | 112 | /* 113 | * myflush: 114 | * 115 | * a function for many different types of flushing-- in particular, 116 | * using * the R_FlushConsole the code happens to be compiled with 117 | * RPRINT, otherwise fflush 118 | */ 119 | 120 | void myflush(FILE *outfile) 121 | { 122 | #ifdef RPRINT 123 | R_FlushConsole(); 124 | #else 125 | fflush(outfile); 126 | #endif 127 | } 128 | 129 | 130 | /* 131 | * my_r_process_events: 132 | * 133 | * at least every 1 second(s) pass control back to 134 | * R so that it can check for interrupts and/or 135 | * process other R-gui events 136 | */ 137 | 138 | time_t my_r_process_events(time_t itime) 139 | { 140 | #ifdef RPRINT 141 | time_t ntime = time(NULL); 142 | 143 | if(ntime - itime > 1) { 144 | R_FlushConsole(); 145 | R_CheckUserInterrupt(); 146 | #if (defined(HAVE_AQUA) || defined(Win32) || defined(Win64)) 147 | R_ProcessEvents(); 148 | #endif 149 | itime = ntime; 150 | } 151 | #endif 152 | return itime; 153 | } 154 | -------------------------------------------------------------------------------- /src/rhelp.h: -------------------------------------------------------------------------------- 1 | #ifndef __RHELP_H__ 2 | #define __RHELP_H__ 3 | 4 | #include 5 | #include 6 | 7 | /* this is now covered by -D RPRINT flags in Makevars */ 8 | /*#define RPRINT*/ 9 | 10 | extern FILE *mystdout, *mystderr; 11 | 12 | #ifndef RPRINT 13 | void warning(const char *str, ...); 14 | void error(const char *str, ...); 15 | #else 16 | #include 17 | #include 18 | #endif 19 | 20 | void R_FlushConsole(void); /* R < 2.3 does not have this in R.h (in Rinterface.h) */ 21 | void myprintf(FILE *outfile, const char *str, ...); 22 | void myflush(FILE *outfile); 23 | time_t my_r_process_events(time_t itime); 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /src/topics.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "latools.h" 5 | #include "rhelp.h" 6 | # ifdef _OPENMP 7 | #include 8 | # endif 9 | 10 | 11 | double wllhd(int nwrd, double *x, double *q){ 12 | double l = 0.0; 13 | int j; 14 | for(j=0; j= 1.0) 57 | { zero_dvec(w,K); w[h] = 1.0; 58 | for(k=0; k0); 74 | delta = 1.0; 75 | if(B[d] < -w[h]) delta = -w[h]/B[d]; 76 | if(B[d] > 1.0-w[h]) delta = (1.0-w[h])/B[d]; 77 | if(delta < delmin) delmin = delta; 78 | d++; } 79 | 80 | d = 0; 81 | for(h=0; h theta[k][*wrd]) k = h; 102 | w[k] = 1.0; 103 | return 1; } 104 | 105 | double diff = tol+1.0; 106 | 107 | double *q = new_dvec(nwrd); 108 | 109 | double *A = new_dzero( (K+1)*(K+1) ); 110 | double *B = new_dzero(K+1); 111 | double *nhess = new_dvec(K*K); 112 | double *grad = new_dvec(K); 113 | double *wold = new_dup_dvec(w, K); 114 | 115 | int *active = new_izero(K); 116 | int nw = K; 117 | int info; 118 | int dosysv = 1; 119 | int iter = 1; 120 | 121 | while(diff > tol && iter < tmax && nw > 0){ 122 | 123 | /* update gradient, and negative hessian */ 124 | wrdprob(q, nwrd, K, wrd, theta, w); 125 | wgrad(grad, nwrd, K, wrd, x, q, theta, w, nef); 126 | wneghess_lowtri(nhess, nwrd, K, wrd, x, q, theta, w, nef); 127 | 128 | /* build linear system for current active set */ 129 | d = 0; 130 | for(k=0; k .001 || (info !=0 && dosysv == 0)){ 154 | myprintf(mystdout, "trouble in wsolve\n"); 155 | nw = wactivate(K, w, active); 156 | return 0; 157 | } 158 | else nw = wupdate(K, w, B, active); 159 | 160 | 161 | /* iterate */ 162 | diff = 0.0; 163 | for(h=0; h 0.01 ){ (*size)++; } 263 | } 264 | } 265 | } 266 | 267 | /* negative hessian in full NEF representation */ 268 | void RnegHW(int *n_in, int *p_in, int *K_in, double *omeg, double *thet, 269 | int *doc, int *wrd, double *cnt, double *q, int *N, double *H){ 270 | 271 | int n, p, K, K2, l, h, k; 272 | 273 | n = *n_in; 274 | p = *p_in; 275 | K = *K_in; 276 | K2 = K*K; 277 | double ttbyq2; 278 | 279 | zero_dvec(H, n*K2); 280 | 281 | for(l=0; l<(*N); l++) 282 | for(k=0; k