├── documentation.pdf ├── .gitignore ├── code ├── Makefile ├── wrapper.R ├── helper_functions.R └── dmbvs.c ├── README.md ├── example_analysis_script.R └── LICENSE /documentation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/duncanwadsworth/dmbvs/HEAD/documentation.pdf -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | *.RData 4 | *.Rproj 5 | .DS_Store 6 | *.txt 7 | *.x 8 | *.pdf 9 | *.html 10 | !documentation.pdf 11 | -------------------------------------------------------------------------------- /code/Makefile: -------------------------------------------------------------------------------- 1 | ## Makefile for dmbvs 2 | 3 | ## file and path names 4 | ## LIBS and INCS may need to be changed 5 | INPUT = dmbvs 6 | LIBS = /usr/local/lib 7 | INCS = /usr/local/include 8 | 9 | compile: 10 | #clang -I$(INCS) -L$(LIBS) -Wall $(INPUT).c -o $(INPUT).x -lgsl -lgslcblas -lm 11 | #/local/gcc-5.2.0/bin/gcc -I/local/gsl-2.1/include -L/local/gsl-2.1/lib -Wall $(INPUT).c -o $(INPUT).x -lgsl -lgslcblas -lm 12 | gcc -I$(INCS) -L$(LIBS) -Wall $(INPUT).c -o $(INPUT).x -lgsl -lgslcblas -lm 13 | 14 | clean: 15 | rm -f $(INPUT).x 16 | if test -a $(INPUT).o ; then rm -f $(INPUT).o ; fi 17 | 18 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | dmbvs 2 | === 3 | 4 | > **Important**\ 5 | > This repo is no longer maintained and its contents have been superseded by the library here: [https://github.com/mkoslovsky/MicroBVS](https://github.com/mkoslovsky/MicroBVS). Anything you can do with `dmbvs` can be done better with `MicroBVS`. 6 | 7 | ### Description 8 | 9 | Bayesian variable selection for Dirichlet-Multinomial regression. For $n$ 10 | samples, let $Y$ be an $n \times q$ matrix with $q$ taxa. Let $X$ be an 11 | $n \times p$ matrix with $p$ covariates. By applying spike-and-slab priors to 12 | the Dirichlet-Multinomial regression coefficients, we obtain concise 13 | taxa/covariate associations. The methodology is further described in the 14 | manuscript: 15 | 16 | > Wadsworth, W. D., Argiento, R., Guindani, M., Galloway-Pena, J., Samuel, S. A., & 17 | > Vannucci, M. (2016). An Integrative Bayesian Dirichlet-Multinomial Regression 18 | > Model for the Analysis of Taxonomic Abundances in Microbiome data. 19 | 20 | The code has been updated on 10/02/17 with an option to perform a stochastic 21 | search algorithm for Bayesian variable selection instead of the Gibbs sampler. 22 | The stochastic search variable selection approach provides gains in computational 23 | speed. 24 | 25 | ### Contents 26 | 27 | The repository contains 28 | 29 | 1. C code implementing an MCMC sampler for Dirichlet-Multinomial Bayesian 30 | Variable Selection (dmbvs) using spike-and-slab priors, 31 | 32 | 2. R code to wrap and run the sampler from within R, 33 | 34 | 3. a function for simulating data, 35 | 36 | 4. and a "start-to-finish" script (`example_analysis_script.R`) demonstrating 37 | usage of the code. This script gives reasonable default settings for the 38 | hyperparameters and MCMC parameters for the example simulated data. Settings may 39 | change for other data. 40 | 41 | ### Usage 42 | 43 | * The R code requires the `dirmult` and `MASS` packages. Please ensure those are 44 | installed first. 45 | 46 | * The C code relies on the [GNU Scientific Library (GSL)](https://www.gnu.org/software/gsl/). 47 | GSL must be installed and modifications may need to be made to the 'library' 48 | and 'include' paths in the Makefile. 49 | 50 | * The main C file (`dmbvs.c`), as well as the Makefile to be used for compilation 51 | of the code, can be found under the directory `code/`. 52 | 53 | * On Linux and Mac the C code may be compiled with the Makefile from R using: 54 | 55 | ```{r} 56 | # must be in package's root directory 57 | setwd("dmbvs") 58 | system("cd code; make") 59 | ``` 60 | 61 | * If compilation has been successful there will be an executable called `dmbvs.x` 62 | the `code/` directory. Data may be simulated and the MCMC code run using: 63 | 64 | ```{r} 65 | source(file.path("code", "wrapper.R")) 66 | source(file.path("code", "helper_functions.R")) 67 | simdata = simulate_dirichlet_multinomial_regression(n_obs = 100, n_vars = 50, 68 | n_taxa = 50, n_relevant_vars = 5, 69 | n_relevant_taxa = 5) 70 | results = dmbvs(XX = simdata$XX[,-1], YY = simdata$YY, 71 | intercept_variance = 10, slab_variance = 10, 72 | bb_alpha = 0.02, bb_beta = 1.98, GG = 1100L, thin = 10L, burn = 100L, 73 | exec = file.path(".", "code", "dmbvs.x"), output_location = ".") 74 | ``` 75 | 76 | * Tested on: 77 | * Mac with R version 3.2.2 and GSL version 2.1 with the Apple LLVM version 78 | 7.0.2 compiler 79 | * Red Hat Linux 6.6 with R version 3.1.2 and GSL version 2.1 and gcc 80 | version 5.2.0 81 | 82 | -------------------------------------------------------------------------------- /example_analysis_script.R: -------------------------------------------------------------------------------- 1 | 2 | ################################################################### 3 | # Example Analysis of simulated data using a Dirichlet-Multinomial 4 | # Bayesian variable selection model 5 | ################################################################### 6 | 7 | # -- README ------------------------------------------------------------------- 8 | # This script gives an example of how to run the code on a simulated 9 | # dataset. It assumes several things: 10 | # 1. The Gnu Scientific Library is installed 11 | # 2. The paths in the Makefile are correct 12 | # 3. The data is available 13 | # ----------------------------------------------------------------------------- 14 | 15 | # -- simulate data ------------------------------------------------------------ 16 | # this simulates a dataset identical in dimension to the ones used in the 17 | # simulation sections of the manuscript 18 | source(file.path("code", "helper_functions.R")) 19 | simdata = simulate_dirichlet_multinomial_regression(n_obs = 100, n_vars = 50, 20 | n_taxa = 50, n_relevant_vars = 5, 21 | n_relevant_taxa = 5) 22 | Ysim = simdata$YY 23 | Xsim = simdata$XX 24 | 25 | # -- run analysis ------------------------------------------------------------- 26 | # preliminaries 27 | source(file.path("code", "wrapper.R")) 28 | system("cd code; make") 29 | executable_location = "code/dmbvs.x" 30 | save_prefix = "simulation" 31 | 32 | # prepare and check data 33 | YY = as.matrix(Ysim) 34 | XX = scale(as.matrix(Xsim[,-1]), center = T, scale = T) 35 | colnames(YY) = paste0("taxa", 1:ncol(YY)) 36 | colnames(XX) = paste0("covariate", 1:ncol(XX)) 37 | dim(YY) 38 | dim(XX) 39 | 40 | # MCMC and hyperparameters 41 | # these values are reasonable for the data simulated here but should be changed 42 | # depending on the characteristics of other datasets 43 | GG = 301L; thin = 2L; burn = 101L; # fast, for testing 44 | #GG = 11001L; thin = 10L; burn = 1001L; # good defaults, in this case 45 | # reasonable default parameters, see further discussion in the manuscript 46 | bb_alpha = 0.02; bb_beta = 2 - bb_alpha 47 | proposal_alpha = 0.5; proposal_beta = 0.5 48 | slab_variance = 10; intercept_variance = 10 49 | 50 | # description 51 | cat("Beta-Binomial mean:", bb_alpha/(bb_alpha + bb_beta), "\n") 52 | cat("Number of kept iterations:", (GG - burn)/thin, "\n") 53 | 54 | # run the algorithm 55 | results = dmbvs(XX = XX, YY = YY, intercept_variance = intercept_variance, 56 | slab_variance = slab_variance, bb_alpha = bb_alpha, 57 | bb_beta = bb_beta, GG = GG, thin = thin, burn = burn, 58 | init_beta = "warmstart", init_alpha = "warmstart", 59 | proposal_alpha = proposal_alpha, proposal_beta = proposal_beta, 60 | exec = executable_location, selection_type = "ss", 61 | output_location = ".") 62 | params = data.frame(GG, burn, thin, intercept_variance, 63 | slab_variance, bb_alpha, bb_beta, 64 | proposal_alpha, proposal_beta) 65 | save(results, params, XX, YY, 66 | file = paste0("results-", save_prefix, "-", Sys.Date(), ".RData")) 67 | 68 | # quick check results 69 | source(file.path("code", "helper_functions.R")) 70 | mppi = colMeans((results$beta != 0) + 0) 71 | (blfdrate = bfdr(mppi, threshold = 0.1)$threshold) 72 | MPPI = data.frame(expand.grid(covariates = colnames(results$hyperparameters$inputdata$XX), 73 | taxa = colnames(results$hyperparameters$inputdata$YY)), 74 | mppi = mppi, 75 | beta = colMeans(results$beta), 76 | truebeta = c(t(simdata$betas[,-1]))) 77 | subset(MPPI, mppi > blfdrate | truebeta != 0) 78 | plot(mppi, type = "h", ylab = "MPPI", 79 | xlab = "beta index", main = "Manhattan plot") 80 | 81 | # active variable traceplot 82 | plot.ts(rowSums((results$beta != 0) + 0), main = "Active variables traceplot", 83 | ylab = "number of betas in the model", xlab = "iteration") 84 | 85 | # some of the selected beta traceplots 86 | selected = which(mppi > 0.5) 87 | fortraces = selected[sample(length(selected), 10)] 88 | plot.ts(results$beta[,fortraces], main = "Some selected beta traceplots", 89 | xlab = "iteration", ylab = "") 90 | 91 | # visualize the associations 92 | association_plot(MPPI[,-5], graph_layout = "bipartite", main = "Sample Results") 93 | 94 | -------------------------------------------------------------------------------- /code/wrapper.R: -------------------------------------------------------------------------------- 1 | #' an R wrapper to C code for spike-and-slab Dirichlet--Multinomial 2 | #' Bayesian variable selection 3 | #' 4 | #' @param XX covariate matrix (without intercept) 5 | #' @param YY count matrix 6 | #' @param intercept_variance a scalar for the prior variance on the intercept 7 | #' of the log-linear predictors 8 | #' @param slab_variance a scalar for the prior variance on the slab of the 9 | #' spike-and-slab 10 | #' @param bb_alpha a scalar for the alpha hyperparameter of the Beta-Bernoulli 11 | #' spike inclusion prior 12 | #' @param bb_beta a scalar for the beta hyperparameter of the Beta-Bernoulli 13 | #' spike inclusion prior 14 | #' @param GG the total number of MCMC iterations 15 | #' @param thin the MCMC thinning interval 16 | #' @param burn the number of MCMC iterations out of GG that will be discarded 17 | #' @param proposal_alpha initial value, either a scalar or a vector of length 18 | #' ncol(YY), if a scalar, that value is used for all proposals on alpha 19 | #' @param proposal_beta initial value, either a scalar or a matrix with 20 | #' ncol(XX) columns and ncol(YY) rows, if a scalar, that value is used for all 21 | #' proposals on beta 22 | #' @param init_alpha either a scalar or a vector of size ncol(YY) 23 | #' @param init_beta either a scalar or a matrix with ncol(YY) rows and 24 | #' ncol(XX) columns, inclusion initialization uses non-zero elements of 25 | #' init_beta 26 | #' @param exec the path to the C executable 27 | #' @param output_location if NULL, output goes to an the directory output/ 28 | #' created in the working directory 29 | #' @param r_seed an integer seed to pass to GSL's random number generator 30 | #' @param selection_type either "ss" (Stochastic Search) or "Gibbs", determines 31 | #' the MCMC mechanism for variable selection 32 | #' 33 | #' @return alpha: a matrix with iterations in the rows and the alphas in the 34 | #' columns 35 | #' @return alpha_accept: the Metropolis-Hastings acceptance ratio for the alphas 36 | #' @return beta: a matrix with iterations in the rows and the per-iteration beta 37 | #' matrix -- flattened by rows -- in the columns 38 | #' @return beta_accept: the Metropolis-Hastings acceptance ratio for the betas 39 | #' @return hyperparameters: a list containing the hyperparameters, the MCMC 40 | #' parameters, and the data from the original function call 41 | #' 42 | #' @export 43 | dmbvs = function(XX, YY, intercept_variance, slab_variance, bb_alpha, bb_beta, 44 | GG, thin, burn, proposal_alpha = 0.5, proposal_beta = 0.5, 45 | init_alpha = 0, init_beta = 0, exec = file.path(".","dmbvs.x"), 46 | output_location = NULL, r_seed = NULL, selection_type = "Gibbs"){ 47 | 48 | # data dimensions 49 | n_cats = ncol(YY) 50 | n_obs = nrow(XX) 51 | n_vars = ncol(XX) 52 | 53 | # argument checking 54 | #if(!is.integer(YY)){warning("YY is not integer typed: is it a count matrix?")} 55 | if(bb_alpha > bb_beta | bb_alpha == bb_beta){warning("Beta-Bernoulli prior is symmetric or left skewed: right skewed enforces sparsity")} 56 | if(all(XX[,1] == 1)){stop("XX appears to have an intercept column")} 57 | if(n_obs != nrow(YY)){stop("dimensions of XX and YY do not match")} 58 | if(burn > GG){stop("burnin greater than the number of iterations")} 59 | if(any(intercept_variance < 0, slab_variance < 0, bb_alpha < 0, bb_beta < 0)){stop("check hyperparameter values: at least one is negative")} 60 | if((length(proposal_beta) != 1) & (nrow(as.matrix(proposal_beta)) != n_cats) & (ncol(as.matrix(proposal_beta)) != n_vars)){stop("bad dimension for proposal_beta")} 61 | if(!(selection_type %in% c("ss", "Gibbs"))){stop("unrecognized variable selection type: choose ss or Gibbs")} 62 | 63 | # set output location 64 | # for extremely long runs it's necessary to leave output in a subdirectory of the 65 | # working directory since temp directories seems to get cleaned out occasionally 66 | if(is.null(output_location)){ 67 | # for parallel simulations need unique output directories 68 | dir.create(paste0(getwd(), "/output-pid-", Sys.getpid())) 69 | out_dir = paste0(getwd(), "/output-pid-", Sys.getpid()) 70 | }else{ 71 | dir.create(paste0(output_location, "/output-pid-", Sys.getpid())) 72 | out_dir = paste0(output_location, "/output-pid-", Sys.getpid()) 73 | } 74 | 75 | # text files for data 76 | utils::write.table(as.matrix(XX), file.path(out_dir, "covariates.txt"), 77 | row.names = F, col.names = F) 78 | utils::write.table(as.matrix(YY), file.path(out_dir, "count_matrix.txt"), 79 | row.names = F, col.names = F) 80 | 81 | # text files for proposals and initialization 82 | # intercept initialization 83 | if(is.null(init_alpha)){ 84 | utils::write.table(rep(0, times = n_cats), file.path(out_dir, "init_alpha.txt"), 85 | row.names = F, col.names = F) 86 | }else if(length(init_alpha) == 1 & is.numeric(init_alpha)){ 87 | utils::write.table(rep(init_alpha, times = n_cats), file.path(out_dir, "init_alpha.txt"), 88 | row.names = F, col.names = F) 89 | }else if(length(init_alpha) == n_cats & is.numeric(init_alpha)){ 90 | utils::write.table(init_alpha, file.path(out_dir, "init_alpha.txt"), 91 | row.names = F, col.names = F) 92 | }else if(init_alpha == "warmstart"){ 93 | utils::write.table(scale(log(colSums(YY))), file.path(out_dir, "init_alpha.txt"), 94 | row.names = F, col.names = F) 95 | }else{ 96 | stop("init_alpha not recognized") 97 | } 98 | # regression parameter initialization 99 | if(is.null(init_beta)){ 100 | # empty initialization 101 | utils::write.table(rep(0, times = n_cats * n_vars), 102 | file.path(out_dir, "init_beta.txt"), row.names = F, 103 | col.names = F) 104 | }else if(length(init_beta) == 1 & is.numeric(init_beta)){ 105 | # scalar initialization 106 | utils::write.table(rep(init_beta, times = n_cats * n_vars), 107 | file.path(out_dir, "init_beta.txt"), row.names = F, 108 | col.names = F) 109 | }else if(!is.null(dim(init_beta)) & all(dim(init_beta) == c(n_cats, n_vars))){ 110 | # matrix initialization 111 | utils::write.table(c(t(as.matrix(init_beta))), 112 | file.path(out_dir, "init_beta.txt"), row.names = F, 113 | col.names = F) 114 | }else if(is.character(init_beta) | init_beta == "warmstart"){ 115 | # false discovery rate on correlation test initialization 116 | cormat = matrix(0, n_cats, n_vars) 117 | pmat = matrix(0, n_cats, n_vars) 118 | yy = YY/rowSums(YY) # compositionalize 119 | for(rr in 1:n_cats){ 120 | for(cc in 1:n_vars){ 121 | pmat[rr, cc] = stats::cor.test(XX[, cc], yy[, rr], method = "spearman", 122 | exact = F)$p.value 123 | cormat[rr, cc] = stats::cor(XX[, cc], yy[, rr], method = "spearman") 124 | } 125 | } 126 | # defaults to 0.2 false discovery rate 127 | pm = matrix((stats::p.adjust(c(pmat), method = "fdr") <= 0.2) + 0, n_cats, 128 | n_vars) 129 | betmat = cormat * pm 130 | utils::write.table(c(t(betmat)), file.path(out_dir, "init_beta.txt"), 131 | row.names = F, col.names = F) 132 | }else{ 133 | stop("init_beta not recognized, if using a matrix, ensure proper dimensions") 134 | } 135 | 136 | # intercept proposal 137 | if(length(proposal_alpha) == 1){ 138 | utils::write.table(rep(proposal_alpha, times = n_cats), 139 | file.path(out_dir, "proposal_alpha.txt"), row.names = F, 140 | col.names = F) 141 | }else if(length(proposal_alpha) == n_cats){ 142 | utils::write.table(proposal_alpha, file.path(out_dir, "proposal_alpha.txt"), 143 | row.names = F, col.names = F) 144 | }else{ 145 | stop("problem with proposal_alpha") 146 | } 147 | # regression proposal 148 | if(length(proposal_beta) == 1){ 149 | utils::write.table(rep(proposal_beta, times = n_cats * n_vars), 150 | file.path(out_dir, "proposal_beta.txt"), row.names = F, 151 | col.names = F) 152 | }else if((nrow(proposal_beta) == n_cats) & (ncol(proposal_beta) == n_vars)){ 153 | utils::write.table(c(t(proposal_beta)), file.path(out_dir, "proposal_beta.txt"), 154 | row.names = F, col.names = F) 155 | }else{ 156 | stop("problem with proposal_beta") 157 | } 158 | 159 | # pass the external seed to GSL which will have its own corresponding seed (see the .out file) 160 | if(is.null(r_seed)){ 161 | a_random_seed = sample(1e6, 1) 162 | }else if(is.integer(r_seed)){ 163 | a_random_seed = r_seed 164 | }else{ 165 | stop("problem with external seed to pass to GSL") 166 | } 167 | # run compiled code 168 | if(selection_type == "ss"){ 169 | command = paste(exec, GG, thin, burn, intercept_variance, slab_variance, 170 | bb_alpha, bb_beta, n_cats, n_obs, n_vars, a_random_seed, 171 | out_dir, 0, ">", paste0("dmbvs-pid-", Sys.getpid(), ".out")) 172 | } else if(selection_type == "Gibbs"){ 173 | command = paste(exec, GG, thin, burn, intercept_variance, slab_variance, 174 | bb_alpha, bb_beta, n_cats, n_obs, n_vars, a_random_seed, 175 | out_dir, 1, ">", paste0("dmbvs-pid-", Sys.getpid(), ".out")) 176 | } 177 | system(command) 178 | # read output 179 | aa = utils::read.table(file.path(out_dir, "alpha.out")) 180 | aaa = utils::read.table(file.path(out_dir, "alpha_acceptance.out")) 181 | # bb is read as a n_vars * n_cats long list with each element have n_iters 182 | # so when it's unlist()ed you get the first variable for n_iters, then the 183 | # second, etc 184 | bb = utils::read.table(file.path(out_dir, "beta.out")) 185 | bba = utils::read.table(file.path(out_dir, "beta_acceptance.out")) 186 | 187 | # variables in the columns, iterations in the rows 188 | return(list(alpha = t(matrix(unlist(aa), nrow = n_cats, byrow = T)), 189 | alpha_accept = aaa[,1], 190 | # after transpose beta has iterations in the rows 191 | beta = t(matrix(unlist(bb), nrow = (n_cats * n_vars), byrow = T)), 192 | beta_accept = bba[,1], 193 | hyperparameters = list(mcmc = data.frame(GG = GG, thin = thin, burn = burn, 194 | proposal_alpha = proposal_alpha, 195 | proposal_beta = proposal_beta, 196 | random_seed = a_random_seed), 197 | priors = data.frame(intercept_variance = intercept_variance, 198 | slab_variance = slab_variance, 199 | bb_alpha = bb_alpha, bb_beta = bb_beta), 200 | inputdata = list(XX = XX, YY = YY)))) 201 | } 202 | 203 | -------------------------------------------------------------------------------- /code/helper_functions.R: -------------------------------------------------------------------------------- 1 | #' calculate the Bayesian False Discovery Rate 2 | #' 3 | #' @param mppi_vector A vector of marginal posterior probabilities of inclusion. 4 | #' @param threshold The expected false discovery rate threshold 5 | #' 6 | #' @return selected: A boolean vector of selected (= T) and rejected (= F) 7 | #' variables 8 | #' @return threshold: The BFDR threshold 9 | #' 10 | #' @references 11 | #' Newton, M. A., Noueiry, A., Sarkar, D., & Ahlquist, P. (2004). Detecting 12 | #' differential gene expression with a semiparametric hierarchical mixture 13 | #' method. Biostatistics, 5(2), 155-76. doi:10.1093/biostatistics/5.2.155 14 | #' 15 | #' @export 16 | 17 | bfdr = function(mppi_vector, threshold = 0.1){ 18 | # arg checking 19 | if(any(mppi_vector > 1 | mppi_vector < 0)){ 20 | stop("Bad input: mppi_vector should contain probabilities") 21 | } 22 | if(threshold > 1 | threshold < 0){ 23 | stop("Bad input: threshold should be a probability") 24 | } 25 | # sorting the ppi's in decresing order 26 | sorted_mppi_vector = sort(mppi_vector, decreasing = TRUE) 27 | # computing the fdr 28 | fdr = cumsum((1 - sorted_mppi_vector))/seq(1:length(mppi_vector)) 29 | # determine index of the largest fdr less than threshold 30 | thecut.index = max(which(fdr < threshold)) 31 | ppi_threshold = sorted_mppi_vector[thecut.index] 32 | selected = mppi_vector > ppi_threshold 33 | return(list(selected = selected, threshold = ppi_threshold)) 34 | } 35 | 36 | #' a bipartite graph showing the association between X and Y 37 | #' 38 | #' @note requires the igraph package 39 | #' 40 | #' @param MPPI a data.frame with n_vars x n_taxa rows and these four columns: 41 | #' 1) covariate: the names of the columns of X 42 | #' 2) taxa: the names of the columns of Y 43 | #' 3) mppi: the marginal posterior probability of inclusion for each taxa 44 | #' by covariate parameter 45 | #' 4) beta: a point estimate of for each taxa by covariate parameter 46 | #' @param mppi_threshold the threshold for inclusion in the plot 47 | #' @param inc_legend boolean to include the legend 48 | #' @param lwdx a scalar multiplier for growing or shrinking the widths of edges 49 | #' @param graph_layout either "circular" for a round layout or "bipartite" for 50 | #' a side-by-side layout 51 | #' @param lab_dist a scalar argument for the distance between node centers and 52 | #' the node labels 53 | #' @param ... passthrough arguments 54 | #' 55 | #' @return a plot 56 | #' 57 | #' @export 58 | association_plot = function(MPPI, mppi_threshold = 0.5, inc_legend = F, 59 | lwdx = 5, graph_layout = "circular", lab_dist = 2, 60 | ...){ 61 | if(any(!(colnames(MPPI) %in% c("covariates", "taxa", "mppi", "beta")))){ 62 | stop("please ensure the column names of MPPI are correct\n*** they must be: covariates, taxa, mppi, beta") 63 | } 64 | if(!require(igraph)){ 65 | stop("please ensure the igraph package is installed") 66 | } 67 | # first munge the data into a format easily read by graph_from_data_frame() 68 | mm = subset(MPPI, mppi > mppi_threshold) 69 | if(nrow(mm) == 0){ # when there are no relevant associations 70 | graphics::plot(1, type = "n", axes = F, xlab = "", ylab = "", 71 | xlim = c(-1, 1), ylim = c(-1, 1), ...) 72 | graphics::text(0, 0, "No Associations\nAbove MPPI\nThreshold") 73 | }else{ # when there are relevant associations 74 | mm$esign = round((sign(mm$beta) + 2)/3) + 1 75 | mm$emag = abs(mm$beta) 76 | mppi = mm[order(mm$covariates, mm$taxa),] 77 | # readin data.frame 78 | hh = igraph::graph_from_data_frame(mppi, directed = F) 79 | # modify graph characteristics 80 | igraph::E(hh)$weight = igraph::E(hh)$emag*lwdx 81 | #igraph::E(hh)$lty = igraph::E(hh)$esign 82 | igraph::E(hh)$lty = 1 83 | igraph::E(hh)$color = ifelse(igraph::E(hh)$esign == 2, 1, 2) 84 | igraph::V(hh)$type = c(rep(T, times = length(unique(mppi$covariates))), 85 | rep(F, times = length(unique(mppi$taxa)))) 86 | # layout as a bipartite graph 87 | if(graph_layout == "bipartite"){ 88 | la = layout.bipartite(hh, hgap = 20) 89 | graphics::plot(hh, layout = la[,c(2,1)], 90 | edge.width = igraph::E(hh)$weight, 91 | edge.lty = igraph::E(hh)$lty, 92 | edge.color = c("red", "blue")[igraph::E(hh)$color], 93 | vertex.color = c("green", "yellow")[igraph::V(hh)$type + 1], 94 | vertex.shape = c("circle", "square")[igraph::V(hh)$type + 1], 95 | vertex.frame.color = "black", vertex.label.cex = 1.2, 96 | vertex.label.color = "black", vertex.label.family = "sans", 97 | vertex.label.font = 2, 98 | vertex.label.degree = c(2*pi, pi)[igraph::V(hh)$type + 1], 99 | vertex.label.dist = lab_dist, ...) 100 | } 101 | # layout as a circular graph 102 | if(graph_layout == "circular"){ 103 | la = igraph::layout.circle(hh) 104 | # from http://stackoverflow.com/questions/23209802/placing-vertex-label-outside-a-circular-layout-in-igraph 105 | radian.rescale = function(x, start = 0, direction = 1) { 106 | c.rotate = function(x) (x + start) %% (2 * pi) * direction 107 | c.rotate(scales::rescale(x, c(0, 2 * pi), range(x))) 108 | } 109 | lab.locs = radian.rescale(x = 1:nrow(la), start = 0, direction = -1) 110 | graphics::plot(hh, layout = la, 111 | edge.width = igraph::E(hh)$weight, 112 | edge.lty = igraph::E(hh)$lty, 113 | edge.color = c("red", "blue")[igraph::E(hh)$color], 114 | vertex.color = c("green", "yellow")[igraph::V(hh)$type + 1], 115 | vertex.shape = c("circle", "square")[igraph::V(hh)$type + 1], 116 | vertex.frame.color = "black", vertex.label.cex = 1.2, 117 | vertex.label.color = "black", vertex.label.family = "sans", 118 | vertex.label.font = 2, vertex.label.dist = lab_dist, 119 | vertex.label.degree = lab.locs) 120 | } 121 | # a black and white color scheme without vertex shapes 122 | # graphics::plot(hh, layout = la, 123 | # edge.width = igraph::E(hh)$weight, 124 | # edge.color = "black", 125 | # vertex.color = c("grey", "white")[V(hh)$type + 1], 126 | # vertex.label.cex = 1.3, 127 | # vertex.label.color = "black", vertex.label.family = "sans", 128 | # vertex.label.font = c(2, 3)[igraph::V(hh)$type + 1], 129 | # vertex.size = 10, vertex.label.dist = 0.5, 130 | # vertex.label.degree = lab.locs) 131 | if(inc_legend){ 132 | #legend("topright", legend = c("positive", "negative"), lty = c(1, 2), col = "grey50", lwd = 3) 133 | graphics::legend("topright", legend = c("positive", "negative"), lwd = c(3, 3), col = c("blue","red")) 134 | } 135 | } 136 | } 137 | 138 | #' abundance table visualization 139 | #' 140 | #' @note requires the ggplot2 package 141 | #' 142 | #' @param count_matrix a matrix of integers 143 | #' @param title an optional title 144 | #' 145 | #' @return a plot 146 | #' 147 | #' @export 148 | abundance_plot = function(count_matrix, title = ""){ 149 | if(!require(ggplot2)){ 150 | stop("please ensure the ggplot2 package is installed") 151 | } 152 | # sort matrix by column means - make pretty 153 | ss = count_matrix[, order(colMeans(count_matrix), decreasing = T)] 154 | # coordinates 155 | xyz = cbind(expand.grid(1:dim(ss)[1], 1:dim(ss)[2]), as.vector(ss), as.vector(ss) > 0) 156 | names(xyz) = c("Sample.ID","Phylotype","Counts","Presence") 157 | print(ggplot2::ggplot(xyz, ggplot2::aes(y = Sample.ID, x = Phylotype, fill = log(Counts))) + 158 | ggplot2::geom_raster() + ggplot2::theme_bw() + ggplot2::ggtitle(title)) 159 | } 160 | 161 | 162 | #' simulate data from a Dirichlet-Multinomial regression model 163 | #' 164 | #' @note Requires the dirmult and MASS packages 165 | #' 166 | #' @param n_obs: the number of samples 167 | #' @param n_vars: number of covariates excluding the intercept 168 | #' @param n_taxa: number of species 169 | #' @param n_relevant_vars: number of relevant nutrients 170 | #' @param n_relevant_taxa: number of relevant species 171 | #' @param beta_min: minimum absolute value of the regression parameters 172 | #' @param beta_max: maximum absolute value of the regression parameters 173 | #' @param signoise: scalar multiplier on the regression parameters 174 | #' @param n_reads_min: lower bound on uniform distribution for number of reads in each sample 175 | #' @param n_reads_max: upper bound on uniform distribution for number of reads in each sample 176 | #' @param theta0: the dispersion parameter 177 | #' @param rho: the correlation between covariates 178 | #' 179 | #' @return XX: (design matrix) with intercept: n_obs * (n_vars + 1) 180 | #' @return YY: (count matrix) rows: n_obs samples, columns: n_taxa species 181 | #' @return alphas: simulated intercept vector 182 | #' @return betas: simulated coefficient matrix n_taxa * (n_vars + 1) 183 | #' @return n_reads_min, n_read_max: row sum parameters 184 | #' @return theta0, phi, rho, signoise: simulation inputs 185 | #' 186 | #' @export 187 | simulate_dirichlet_multinomial_regression = function(n_obs = 100, 188 | n_vars = 100, 189 | n_taxa = 40, 190 | n_relevant_vars = 4, 191 | n_relevant_taxa = 4, 192 | beta_min = 0.5, 193 | beta_max = 1.0, 194 | signoise = 1.0, 195 | n_reads_min = 1000, 196 | n_reads_max = 2000, 197 | theta0 = 0.01, 198 | rho = 0.4){ 199 | 200 | # check for required packages 201 | if(!require(dirmult)){ 202 | stop("dirmult package required") 203 | } 204 | if(!require(MASS)){ 205 | stop("MASS package required") 206 | } 207 | # covariance matrix for predictors 208 | Sigma = matrix(1, n_vars, n_vars) 209 | Sigma = rho^abs(row(Sigma) - col(Sigma)) 210 | # include the intercept 211 | XX = cbind(rep(1, n_obs), 212 | scale(MASS::mvrnorm(n = n_obs, mu = rep(0, n_vars), Sigma = Sigma))) 213 | # empties 214 | YY = matrix(0, n_obs, n_taxa) 215 | betas = matrix(0, n_taxa, n_vars) 216 | phi = matrix(0, n_obs, n_taxa) 217 | # parameters with signs alternating 218 | st = 0 219 | low_side = beta_min 220 | high_side = beta_max 221 | if(n_relevant_taxa != 1){ 222 | # warning if the lengths don't match 223 | coef = suppressWarnings(seq(low_side, high_side, len = n_relevant_taxa) * c(1, -1)) 224 | }else{ 225 | coef = (low_side + high_side) / 2 226 | } 227 | coef_g = rep(1.0, len = n_relevant_vars) 228 | for(ii in 1:n_relevant_vars){ 229 | # overlap species 230 | betas[(st:(st + n_relevant_taxa - 1)) %% n_taxa + 1, 3 * ii - 2] = coef_g[ii] * sample(coef)[((ii - 1):(ii + n_relevant_taxa - 2)) %% n_relevant_taxa + 1] 231 | st = st + 1 232 | } 233 | # -2.3 and 2.3 so that the intercept varies over three orders of magnitude 234 | intercept = runif(n_taxa, -2.3, 2.3) 235 | Beta = cbind(intercept, signoise * betas) 236 | # row totals 237 | ct0 = sample(n_reads_min:n_reads_max, n_obs, rep = T) 238 | for(ii in 1:n_obs){ 239 | thisrow = as.vector(exp(Beta %*% XX[ii, ])) 240 | phi[ii, ] = thisrow/sum(thisrow) 241 | YY[ii, ] = dirmult::simPop(J = 1, n = ct0[ii], pi = phi[ii, ], theta = theta0)$data[1, ] 242 | } 243 | 244 | return(list(XX = XX, YY = YY, alphas = intercept, betas = Beta, 245 | n_reads_min = n_reads_min, n_reads_max = n_reads_max, 246 | theta0 = theta0, phi = phi, rho = rho, signoise = signoise)) 247 | 248 | } 249 | 250 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | -------------------------------------------------------------------------------- /code/dmbvs.c: -------------------------------------------------------------------------------- 1 | // *********************************************************************** // 2 | // MCMC code implementing integrative Bayesian variable selection for the // 3 | // Dirichlet-Multinomial regression model with application to the analysis // 4 | // of taxonomic abundances in microbiome data // 5 | // *********************************************************************** // 6 | // 7 | // Copyright (C) 2016, Raffaele Argiento and W. Duncan Wadsworth with contributions 8 | // from Michele Guindani 9 | // 10 | // This program is free software: you can redistribute it and/or modify 11 | // it under the terms of the GNU General Public License as published by 12 | // the Free Software Foundation, either version 3 of the License, or 13 | // (at your option) any later version. 14 | // 15 | // This program is distributed in the hope that it will be useful, 16 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | // GNU General Public License for more details. 19 | // 20 | // You should have received a copy of the GNU General Public License 21 | // along with this program. If not, see . 22 | // 23 | // *********************************************************************** // 24 | // 25 | // This version has been updated on October 2, 2017 with an implementation 26 | // of a stochastic search algorithm for variable selection in addition to the 27 | // Gibbs sampler 28 | // 29 | // standard includes 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | // GSL includes 36 | #include 37 | #include // has both the lngamma and lnbeta functions 38 | #include 39 | #include 40 | 41 | // define the RNG globally 42 | gsl_rng *rando; 43 | 44 | // data dimensions required in the function prototypes below 45 | static int n_cats; 46 | static int n_obs; 47 | static int n_vars; 48 | 49 | // ----- Function declarations ----- // 50 | // adaptive MH helper functions 51 | double online_mean(int iteration, double last_mean, double curr_obs); 52 | double online_var(int iteration, double last_mean, double last_var, 53 | double curr_mean, double curr_obs); 54 | double adap_prop(double curr_var); 55 | // log Beta-Bernoulli spike-and-slab prior evaluation 56 | double lprior_bbsas(double betajk, int sjk, double sig_bejk, double mu_bejk, 57 | double aa_hp, double bb_hp); 58 | // calculates the linear predictor for each category of the Dirichlet-Multinomial 59 | double calculate_gamma(double **XX, double *alpha, double *beta, int jj, 60 | int ii, int Log); 61 | // MH update for the regression parameters, Gibbs version 62 | void update_beta_jj(double **XX, double **JJ, double **loggamma, 63 | double *beta_temp, int *inclusion_indicator, 64 | double *prop_per_beta, double mu_be[n_cats][n_vars], 65 | double sig_be[n_cats][n_vars], double aa_hp, double bb_hp, 66 | int jj); 67 | // MH update for the regression parameters, stochastic search version 68 | void update_beta_jj_ss(double **XX, double **JJ, double **loggamma, 69 | double *beta_temp, int *inclusion_indicator, 70 | double *prop_per_beta, double mu_be[n_cats][n_vars], 71 | double sig_be[n_cats][n_vars], double aa_hp, double bb_hp, 72 | int jj); 73 | // MH update for the intercept parameters 74 | void update_alpha_jj(double **JJ, double **loggamma, double *alpha, 75 | double *prop_per_alpha, int *accepted_alpha_flag, 76 | double mu_al[n_cats], double sig_al[n_cats], int jj); 77 | // for the Savitsky et al. inclusion proposal step 78 | void between_models_jj(double **XX, double **JJ, double **loggamma, 79 | double *beta_temp, int *accepted_beta_flag, 80 | int *inclusion_indicator, double mu_be[n_cats][n_vars], 81 | double sig_be[n_cats][n_vars], double aa_hp, double bb_hp, 82 | int jj); 83 | // Principal function using Gibbs variable selection 84 | void dmbvs_gibbs(double **XX, int **YY, double *alpha, double *beta, 85 | double mu_al[n_cats], double sig_al[n_cats], 86 | double mu_be[n_cats][n_vars], double sig_be[n_cats][n_vars], 87 | double aa_hp, double bb_hp, double *prop_per_alpha, 88 | double *prop_per_beta, const int GG, const int burn, 89 | const int thin, int Log, char *temp_dir); 90 | // Principal function using stochastic search variable selection 91 | void dmbvs_ss(double **XX, int **YY, double *alpha, double *beta, 92 | double mu_al[n_cats], double sig_al[n_cats], 93 | double mu_be[n_cats][n_vars], double sig_be[n_cats][n_vars], 94 | double aa_hp, double bb_hp, double *prop_per_alpha, 95 | double *prop_per_beta, const int GG, const int burn, 96 | const int thin, int Log, char *temp_dir); 97 | 98 | // -------------------------------- // 99 | int main(int argc, char *argv[]){ 100 | 101 | // command line argument parsing 102 | if(argc != 14){ 103 | fprintf(stderr, "Usage: %s \n \n \n", argv[0]); 104 | exit(EXIT_FAILURE); 105 | } 106 | const int GG = atoi(argv[1]); 107 | const int thin = atoi(argv[2]); 108 | const int burn = atoi(argv[3]); 109 | double alpha_variance = atof(argv[4]); 110 | double slab_variance = atof(argv[5]); 111 | double aa_hp = atof(argv[6]); 112 | double bb_hp = atof(argv[7]); 113 | n_cats = atoi(argv[8]); 114 | n_obs = atoi(argv[9]); 115 | n_vars = atoi(argv[10]); 116 | long int external_seed = atoi(argv[11]); 117 | char *temp_dir = argv[12]; 118 | int selection_type = atoi(argv[13]); 119 | 120 | // hyperparameters and Metropolis-Hastings proposal parameters 121 | const double mean_alpha = 0.0; 122 | const double mean_slab = 0.0; 123 | 124 | printf("***********************************************************\n"); 125 | printf("********************* Starting dmbvs **********************\n"); 126 | // outfile info 127 | printf("***********************************************************\n"); 128 | printf("*************************** MCMC **************************\n"); 129 | printf("Total iterations = %i, Thinning = %i, Burnin = %i\n", GG, thin, burn); 130 | // random number generator initialization 131 | const gsl_rng_type * Tipo; 132 | Tipo = gsl_rng_default; 133 | rando = gsl_rng_alloc(Tipo); 134 | gsl_rng_set(rando, external_seed); 135 | printf("External seed = %lu, GSL seed = %lu, Generator type: %s\n", 136 | external_seed, gsl_rng_get(rando), gsl_rng_name(rando)); 137 | printf("Text output deposited in directory:\n %s\n", temp_dir); 138 | printf("***********************************************************\n"); 139 | printf("*************************** Data **************************\n"); 140 | printf("# of categories = %i, # of observations = %i, # of covariates = %i\n", 141 | n_cats, n_obs, n_vars); 142 | printf("***********************************************************\n"); 143 | printf("********************* Hyperparameters *********************\n"); 144 | printf("Slab variance = %0.1f, Intercept variance = %0.1f\n", slab_variance, 145 | alpha_variance); 146 | printf("Beta-Bernoulli alpha = %0.2f, Beta-Bernoulli beta = %0.2f\n", aa_hp, 147 | bb_hp); 148 | printf("***********************************************************\n"); 149 | printf("***********************************************************\n"); 150 | 151 | /////////////////////////////////////////////////// 152 | // Indices (shifted right one) 153 | // ii: index of the observations in 1, ..., n_obs 154 | // jj: index of the categories in 1, ..., n_cats 155 | // kk: index of the covariates in 1, ..., n_vars 156 | // hh: index of the beta vector in 1, ..., n_vars * n_cats, is collapsed 157 | // row-wise so that the 2x2 identity matrix becomes (1, 0, 0, 1) 158 | int ii, jj, kk, hh; 159 | 160 | // Larger objects such as data and initializations are read from files 161 | 162 | // single file pointer for all files 163 | FILE *fin; 164 | 165 | // reading in YY: the sample-by-category count matrix 166 | // all other files are read in in the same way 167 | char td_YY[200]; 168 | strcpy(td_YY, temp_dir); 169 | strcat(td_YY, "/count_matrix.txt"); 170 | fin = fopen(td_YY, "r"); 171 | // dynamic allocation for YY for an array of pointers 172 | int **YY; 173 | YY = (int **) malloc(n_obs * sizeof(int *)); 174 | for(ii = 0 ; ii < n_obs ; ii++){ 175 | // allocate each row of the matrix YY 176 | YY[ii] = (int *) malloc(n_cats * sizeof(int)); 177 | // fill in each row of the matrix YY 178 | for(jj = 0 ; jj < n_cats ; jj++){ 179 | fscanf(fin, "%u", &YY[ii][jj]); 180 | } 181 | } 182 | 183 | // reading in XX: the sample-by-covariate matrix 184 | char td_XX[200]; 185 | strcpy(td_XX, temp_dir); 186 | strcat(td_XX, "/covariates.txt"); 187 | fin = fopen(td_XX, "r"); 188 | double **XX; 189 | XX = (double **) malloc(n_obs * sizeof(double *)); 190 | for(ii = 0 ; ii < n_obs ; ii++){ 191 | XX[ii] = (double *) malloc(n_vars * sizeof(double)); 192 | for(jj = 0 ; jj < n_vars ; jj++){ 193 | fscanf(fin, "%lf", &XX[ii][jj]); 194 | } 195 | } 196 | 197 | ///////// Initializations from text files 198 | 199 | // initialization for beta 200 | char td_ib[200]; 201 | strcpy(td_ib, temp_dir); 202 | strcat(td_ib, "/init_beta.txt"); 203 | fin = fopen(td_ib, "r"); 204 | double *beta; 205 | beta = malloc((n_vars * n_cats) * sizeof(double)); 206 | for(hh = 0 ; hh < (n_vars * n_cats) ; hh++){ 207 | fscanf(fin, "%lf", &beta[hh]); 208 | } 209 | 210 | // initialization for alpha 211 | char td_ia[200]; 212 | strcpy(td_ia, temp_dir); 213 | strcat(td_ia, "/init_alpha.txt"); 214 | fin = fopen(td_ia, "r"); 215 | double *alpha; 216 | alpha = malloc(n_cats * sizeof(double)); 217 | for(jj = 0 ; jj < n_cats ; jj++){ 218 | fscanf(fin, "%lf", &alpha[jj]); 219 | } 220 | 221 | // for the beta proposal variance 222 | char td_pb[200]; 223 | strcpy(td_pb, temp_dir); 224 | strcat(td_pb, "/proposal_beta.txt"); 225 | fin = fopen(td_pb, "r"); 226 | double *prop_per_beta; 227 | prop_per_beta = malloc((n_vars * n_cats) * sizeof(double)); 228 | for(hh = 0 ; hh < (n_vars * n_cats) ; hh++){ 229 | fscanf(fin, "%lf", &prop_per_beta[hh]); 230 | } 231 | 232 | // for the alpha proposal variance 233 | char td_pa[200]; 234 | strcpy(td_pa, temp_dir); 235 | strcat(td_pa, "/proposal_alpha.txt"); 236 | fin = fopen(td_pa, "r"); 237 | double *prop_per_alpha; 238 | prop_per_alpha = malloc(n_cats * sizeof(double)); 239 | for(jj = 0 ; jj < n_cats ; jj++){ 240 | fscanf(fin, "%lf", &prop_per_alpha[jj]); 241 | } 242 | 243 | // mean and standard deviation of the independent normal priors on alpha and beta 244 | double mu_al[n_cats], sig_al[n_cats]; 245 | double mu_be[n_cats][n_vars], sig_be[n_cats][n_vars]; 246 | for(jj = 0 ; jj < n_cats ; jj++){ 247 | mu_al[jj] = mean_alpha; 248 | sig_al[jj] = alpha_variance; 249 | for(kk = 0 ; kk < n_vars ; kk++){ 250 | mu_be[jj][kk] = mean_slab; 251 | sig_be[jj][kk] = slab_variance; 252 | } 253 | } 254 | 255 | // a flag for functions that can calculate on the log scale 256 | int Log = 1; 257 | 258 | ///////// Initialization finished: commence the MCMC 259 | 260 | if(selection_type == 0){ 261 | dmbvs_ss(XX, YY, alpha, beta, mu_al, sig_al, mu_be, sig_be, aa_hp, bb_hp, 262 | prop_per_alpha, prop_per_beta, GG, burn, thin, Log, temp_dir); 263 | } 264 | else if(selection_type == 1){ 265 | dmbvs_gibbs(XX, YY, alpha, beta, mu_al, sig_al, mu_be, sig_be, aa_hp, bb_hp, 266 | prop_per_alpha, prop_per_beta, GG, burn, thin, Log, temp_dir); 267 | } 268 | else{ 269 | printf("Unrecognized variable selection argument\n"); 270 | } 271 | 272 | 273 | // cleanup (kind of pedantic) 274 | fclose(fin); 275 | free(YY); 276 | free(XX); 277 | free(beta); 278 | free(alpha); 279 | free(prop_per_alpha); 280 | free(prop_per_beta); 281 | gsl_rng_free(rando); 282 | 283 | return(0); 284 | 285 | } // end of main() 286 | 287 | ////////////////////////////////////// 288 | // ----- Function definitions ----- // 289 | ////////////////////////////////////// 290 | 291 | // Gibbs Sampler function 292 | void dmbvs_ss(double **XX, int **YY, double *alpha, double *beta, 293 | double mu_al[n_cats], double sig_al[n_cats], 294 | double mu_be[n_cats][n_vars], double sig_be[n_cats][n_vars], 295 | double aa_hp, double bb_hp, double *prop_per_alpha, 296 | double *prop_per_beta, const int GG, const int burn, 297 | const int thin, int Log, char *temp_dir){ 298 | 299 | // loop indices 300 | int hh, ii, jj, kk, gg; 301 | 302 | // for timing iterations 303 | time_t mytime; 304 | 305 | // adaptive proposal values 306 | double last_mean; 307 | double last_var; 308 | 309 | ///////// Initialize variables that don't require input or can be obtained 310 | ///////// from inputs 311 | 312 | // for MH acceptance/rejection ratios, initialized with zeros 313 | int *accepted_beta_flag; 314 | accepted_beta_flag = malloc((n_vars * n_cats) * sizeof(int)); 315 | int *accepted_beta; 316 | accepted_beta = malloc((n_vars * n_cats) * sizeof(int)); 317 | for(hh = 0 ; hh < (n_vars * n_cats) ; hh++){ 318 | accepted_beta_flag[hh] = 0; 319 | accepted_beta[hh] = 0; 320 | } 321 | int *accepted_alpha_flag; 322 | accepted_alpha_flag = malloc(n_cats * sizeof(int)); 323 | int *accepted_alpha; 324 | accepted_alpha = malloc(n_cats * sizeof(int)); 325 | for(hh = 0 ; hh < n_cats ; hh++){ 326 | accepted_alpha_flag[hh] = 0; 327 | accepted_alpha[hh] = 0; 328 | } 329 | // for adaptive MH need to keep track of target density mean and variance 330 | double *curr_mean; 331 | curr_mean = malloc((n_vars * n_cats) * sizeof(double)); 332 | double *curr_var; 333 | curr_var = malloc((n_vars * n_cats) * sizeof(double)); 334 | for(hh = 0 ; hh < (n_vars * n_cats) ; hh++){ 335 | curr_mean[hh] = 0.0; 336 | curr_var[hh] = 0.5; 337 | } 338 | 339 | // variable inclusion indicator 340 | int *inclusion_indicator; 341 | inclusion_indicator = malloc((n_vars * n_cats) * sizeof(int)); 342 | for(hh = 0 ; hh < (n_vars * n_cats) ; hh++){ 343 | if(beta[hh] == 0){ 344 | inclusion_indicator[hh] = 0; 345 | }else{ 346 | inclusion_indicator[hh] = 1; 347 | } 348 | } 349 | 350 | // row sums of YY 351 | int *Ypiu; 352 | Ypiu = malloc(n_obs * sizeof(int)); 353 | for(ii = 0 ; ii < n_obs ; ii++){ 354 | for(jj = 0 ; jj < n_cats ; jj++){ 355 | Ypiu[ii] = Ypiu[ii] + YY[ii][jj]; 356 | } 357 | } 358 | 359 | // the beta_temp vector is an n_vars long vector used for updating each of the 360 | // jj-th rows in Beta corresponding the jj-th category in YY 361 | double *beta_temp; 362 | beta_temp = malloc(n_vars * sizeof(double)); 363 | 364 | // the linear predictor matrix, represented as a vector, is in the log scale 365 | double **loggamma; 366 | loggamma = malloc(n_obs * sizeof(double *)); 367 | for(ii = 0 ; ii < n_obs ; ii++){ 368 | loggamma[ii] = malloc(n_cats * sizeof(double)); 369 | for(jj = 0 ; jj < n_cats ; jj++){ 370 | loggamma[ii][jj] = calculate_gamma(XX, alpha, beta, jj, ii, Log); 371 | } 372 | } 373 | 374 | // initialize latent variable: JJ ~ gamma() 375 | double **JJ; 376 | JJ = malloc(n_obs * sizeof(double *)); 377 | // TT is the vector of normalization constants 378 | double *TT; 379 | TT = malloc(n_obs * sizeof(double *)); 380 | for(ii = 0 ; ii < n_obs ; ii++){ 381 | JJ[ii] = malloc(n_cats * sizeof(double)); 382 | TT[ii] = 0.0; 383 | for(jj = 0 ; jj < n_cats ; jj++){ 384 | JJ[ii][jj] = (double)YY[ii][jj]; 385 | // a very small number 386 | if(JJ[ii][jj] < pow(10.0, -100.0)){ 387 | JJ[ii][jj] = pow(10.0, -100.0); 388 | } 389 | TT[ii] = TT[ii] + JJ[ii][jj]; 390 | } 391 | } 392 | 393 | // initialize the other clever latent variable: uu 394 | // note that this uses the data to initialize 395 | double *uu; 396 | uu = malloc(n_obs * sizeof(double)); 397 | for(ii = 0 ; ii < n_obs ; ii++){ 398 | uu[ii] = gsl_ran_gamma(rando, Ypiu[ii], 1.0/TT[ii]); 399 | } 400 | 401 | // output file pointers 402 | FILE *fout_alpha, *fout_beta, *fout_alpha_acceptance, *fout_beta_acceptance, *fout_beta_proposal; 403 | 404 | // this is stupid, I'm sure there's a better way 405 | char tf_alpha[200]; 406 | strcpy(tf_alpha, temp_dir); 407 | strcat(tf_alpha, "/alpha.out"); 408 | fout_alpha = fopen(tf_alpha, "w"); 409 | char tf_alpha_acceptance[200]; 410 | strcpy(tf_alpha_acceptance, temp_dir); 411 | strcat(tf_alpha_acceptance, "/alpha_acceptance.out"); 412 | fout_alpha_acceptance = fopen(tf_alpha_acceptance, "w"); 413 | char tf_beta[200]; 414 | strcpy(tf_beta, temp_dir); 415 | strcat(tf_beta, "/beta.out"); 416 | fout_beta = fopen(tf_beta, "w"); 417 | char tf_beta_acceptance[200]; 418 | strcpy(tf_beta_acceptance, temp_dir); 419 | strcat(tf_beta_acceptance, "/beta_acceptance.out"); 420 | fout_beta_acceptance = fopen(tf_beta_acceptance, "w"); 421 | char tf_beta_proposal[200]; 422 | strcpy(tf_beta_proposal, temp_dir); 423 | strcat(tf_beta_proposal, "/beta_proposal.out"); 424 | fout_beta_proposal = fopen(tf_beta_proposal, "w"); 425 | 426 | // the magic starts here 427 | printf("Starting the sampler\n"); 428 | 429 | for(gg = 0 ; gg < GG ; gg++){ 430 | 431 | // timing info to the printed output 432 | mytime = time(NULL); 433 | if(gg % 1000 == 0){ 434 | printf("Starting the %uth iteration out of %u at %s", gg, GG, ctime(&mytime)); 435 | } 436 | 437 | // first a round of the between-model step for every covariate within every taxa 438 | jj = floor(gsl_rng_uniform(rando) * n_cats);//for(jj = 0 ; jj < n_cats ; jj++){ 439 | 440 | // fill in beta_temp 441 | for(kk = 0 ; kk < n_vars ; kk++){ 442 | hh = kk + jj * n_vars; 443 | beta_temp[kk] = beta[hh]; 444 | } 445 | 446 | between_models_jj(XX, JJ, loggamma, beta_temp, accepted_beta_flag, 447 | inclusion_indicator, mu_be, sig_be, aa_hp, bb_hp, jj); 448 | 449 | // update beta with beta_temp 450 | for(kk = 0 ; kk < n_vars ; kk++){ 451 | hh = kk + jj * n_vars; 452 | beta[hh] = beta_temp[kk]; 453 | } 454 | //} 455 | 456 | // now an accelerating within-model step 457 | for(jj = 0 ; jj < n_cats ; jj++){ 458 | 459 | // fill in beta_temp and update the proposal variance 460 | for(kk = 0 ; kk < n_vars ; kk++){ 461 | hh = kk + jj * n_vars; 462 | beta_temp[kk] = beta[hh]; 463 | // wait until each parameter has had a few iterations 464 | if(gg > 2 * n_vars * n_cats){ 465 | // calculate the online variance 466 | last_mean = curr_mean[hh]; 467 | last_var = curr_var[hh]; 468 | curr_mean[hh] = online_mean(gg, last_mean, beta[hh]); 469 | curr_var[hh] = online_var(gg, last_mean, last_var, curr_mean[hh], beta[hh]); 470 | // update proposal variance 471 | prop_per_beta[hh] = curr_var[hh]; 472 | } 473 | } 474 | 475 | update_beta_jj_ss(XX, JJ, loggamma, beta_temp, inclusion_indicator, 476 | prop_per_beta, mu_be, sig_be, aa_hp, bb_hp, jj); 477 | 478 | // update beta with beta_temp and write to file 479 | for(kk = 0 ; kk < n_vars ; kk++){ 480 | hh = kk + jj * n_vars; 481 | beta[hh] = beta_temp[kk]; 482 | if((gg >= burn) & (gg % thin == 0)){ 483 | accepted_beta[hh] = accepted_beta[hh] + accepted_beta_flag[hh]; 484 | accepted_beta_flag[hh] = 0; 485 | fprintf(fout_beta, "%e ", beta[hh]); // space delimited output files 486 | } 487 | } 488 | } 489 | if((gg >= burn) & (gg % thin == 0)){fprintf(fout_beta, "\n");} 490 | 491 | // update alpha and write to file 492 | for(jj = 0 ; jj < n_cats ; jj++){ 493 | update_alpha_jj(JJ, loggamma, alpha, prop_per_alpha, accepted_alpha_flag, 494 | mu_al, sig_al, jj); 495 | if((gg >= burn) & (gg % thin == 0)){ 496 | accepted_alpha[jj] = accepted_alpha[jj] + accepted_alpha_flag[jj]; 497 | accepted_alpha_flag[jj] = 0; 498 | fprintf(fout_alpha,"%e ", alpha[jj]); 499 | } 500 | } 501 | if((gg >= burn) & (gg % thin == 0)){fprintf(fout_alpha, "\n ");} 502 | 503 | // update JJ and consequently TT 504 | for(ii = 0 ; ii < n_obs ; ii++){ 505 | TT[ii] = 0.0; 506 | for(jj = 0 ; jj < n_cats ; jj++){ 507 | JJ[ii][jj] = gsl_ran_gamma(rando, YY[ii][jj] + exp(loggamma[ii][jj]), 1.0/(uu[ii] + 1.0)); 508 | if(JJ[ii][jj] < pow(10.0, -100.0)){ 509 | JJ[ii][jj] = pow(10.0, -100.0); 510 | } 511 | TT[ii] = TT[ii] + JJ[ii][jj]; 512 | } 513 | } 514 | 515 | // update latent variables uu 516 | for(ii = 0 ; ii < n_obs ; ii++){ 517 | uu[ii] = gsl_ran_gamma(rando, Ypiu[ii], 1.0/TT[ii]); 518 | } 519 | 520 | } // end of iterations 521 | 522 | // print acceptance ratio files 523 | for(jj = 0 ; jj < n_cats ; jj++){ 524 | fprintf(fout_alpha_acceptance, "%f\n", (float)accepted_alpha[jj]/((GG - burn)/thin)); 525 | for(kk = 0 ; kk < n_vars ; kk++){ 526 | fprintf(fout_beta_acceptance, "%f\n", (float)accepted_beta[kk + jj * n_vars]/((GG - burn)/thin)); 527 | fprintf(fout_beta_proposal, "%f\n", prop_per_beta[kk + jj * n_vars]); 528 | } 529 | } 530 | 531 | // cleanup (kind of pedantic) 532 | fclose(fout_alpha); 533 | fclose(fout_beta); 534 | fclose(fout_alpha_acceptance); 535 | fclose(fout_beta_acceptance); 536 | free(accepted_alpha); 537 | free(accepted_beta); 538 | free(accepted_alpha_flag); 539 | free(accepted_beta_flag); 540 | free(inclusion_indicator); 541 | free(beta_temp); 542 | free(loggamma); 543 | free(JJ); 544 | free(TT); 545 | free(uu); 546 | free(Ypiu); 547 | 548 | printf("Sampling finished!\n"); 549 | 550 | return; 551 | 552 | } // close Gibbs sampler function 553 | 554 | void dmbvs_gibbs(double **XX, int **YY, double *alpha, double *beta, 555 | double mu_al[n_cats], double sig_al[n_cats], 556 | double mu_be[n_cats][n_vars], double sig_be[n_cats][n_vars], 557 | double aa_hp, double bb_hp, double *prop_per_alpha, 558 | double *prop_per_beta, const int GG, const int burn, 559 | const int thin, int Log, char *temp_dir){ 560 | 561 | // loop indices 562 | int hh, ii, jj, kk, gg; 563 | 564 | // for timing iterations 565 | time_t mytime; 566 | 567 | // adaptive proposal values 568 | double last_mean; 569 | double last_var; 570 | 571 | ///////// Initialize variables that don't require input or can be obtained 572 | ///////// from inputs 573 | 574 | // for MH acceptance/rejection ratios, initialized with zeros 575 | int *accepted_beta_flag; 576 | accepted_beta_flag = malloc((n_vars * n_cats) * sizeof(int)); 577 | int *accepted_beta; 578 | accepted_beta = malloc((n_vars * n_cats) * sizeof(int)); 579 | for(hh = 0 ; hh < (n_vars * n_cats) ; hh++){ 580 | accepted_beta_flag[hh] = 0; 581 | accepted_beta[hh] = 0; 582 | } 583 | int *accepted_alpha_flag; 584 | accepted_alpha_flag = malloc(n_cats * sizeof(int)); 585 | int *accepted_alpha; 586 | accepted_alpha = malloc(n_cats * sizeof(int)); 587 | for(hh = 0 ; hh < n_cats ; hh++){ 588 | accepted_alpha_flag[hh] = 0; 589 | accepted_alpha[hh] = 0; 590 | } 591 | // for adaptive MH need to keep track of target density mean and variance 592 | double *curr_mean; 593 | curr_mean = malloc((n_vars * n_cats) * sizeof(double)); 594 | double *curr_var; 595 | curr_var = malloc((n_vars * n_cats) * sizeof(double)); 596 | for(hh = 0 ; hh < (n_vars * n_cats) ; hh++){ 597 | curr_mean[hh] = 0.0; 598 | curr_var[hh] = 0.5; 599 | } 600 | 601 | // variable inclusion indicator 602 | int *inclusion_indicator; 603 | inclusion_indicator = malloc((n_vars * n_cats) * sizeof(int)); 604 | for(hh = 0 ; hh < (n_vars * n_cats) ; hh++){ 605 | if(beta[hh] == 0){ 606 | inclusion_indicator[hh] = 0; 607 | }else{ 608 | inclusion_indicator[hh] = 1; 609 | } 610 | } 611 | 612 | // row sums of YY 613 | int *Ypiu; 614 | Ypiu = malloc(n_obs * sizeof(int)); 615 | for(ii = 0 ; ii < n_obs ; ii++){ 616 | for(jj = 0 ; jj < n_cats ; jj++){ 617 | Ypiu[ii] = Ypiu[ii] + YY[ii][jj]; 618 | } 619 | } 620 | 621 | // the beta_temp vector is an n_vars long vector used for updating each of the 622 | // jj-th rows in Beta corresponding the jj-th category in YY 623 | double *beta_temp; 624 | beta_temp = malloc(n_vars * sizeof(double)); 625 | 626 | // the linear predictor matrix, represented as a vector, is in the log scale 627 | double **loggamma; 628 | loggamma = malloc(n_obs * sizeof(double *)); 629 | for(ii = 0 ; ii < n_obs ; ii++){ 630 | loggamma[ii] = malloc(n_cats * sizeof(double)); 631 | for(jj = 0 ; jj < n_cats ; jj++){ 632 | loggamma[ii][jj] = calculate_gamma(XX, alpha, beta, jj, ii, Log); 633 | } 634 | } 635 | 636 | // initialize latent variable: JJ ~ gamma() 637 | double **JJ; 638 | JJ = malloc(n_obs * sizeof(double *)); 639 | // TT is the vector of normalization constants 640 | double *TT; 641 | TT = malloc(n_obs * sizeof(double *)); 642 | for(ii = 0 ; ii < n_obs ; ii++){ 643 | JJ[ii] = malloc(n_cats * sizeof(double)); 644 | TT[ii] = 0.0; 645 | for(jj = 0 ; jj < n_cats ; jj++){ 646 | JJ[ii][jj] = (double)YY[ii][jj]; 647 | // a very small number 648 | if(JJ[ii][jj] < pow(10.0, -100.0)){ 649 | JJ[ii][jj] = pow(10.0, -100.0); 650 | } 651 | TT[ii] = TT[ii] + JJ[ii][jj]; 652 | } 653 | } 654 | 655 | // initialize the other clever latent variable: uu 656 | // note that this uses the data to initialize 657 | double *uu; 658 | uu = malloc(n_obs * sizeof(double)); 659 | for(ii = 0 ; ii < n_obs ; ii++){ 660 | uu[ii] = gsl_ran_gamma(rando, Ypiu[ii], 1.0/TT[ii]); 661 | } 662 | 663 | // output file pointers 664 | FILE *fout_alpha, *fout_beta, *fout_alpha_acceptance, *fout_beta_acceptance, *fout_beta_proposal; 665 | 666 | // this is stupid, I'm sure there's a better way 667 | char tf_alpha[200]; 668 | strcpy(tf_alpha, temp_dir); 669 | strcat(tf_alpha, "/alpha.out"); 670 | fout_alpha = fopen(tf_alpha, "w"); 671 | char tf_alpha_acceptance[200]; 672 | strcpy(tf_alpha_acceptance, temp_dir); 673 | strcat(tf_alpha_acceptance, "/alpha_acceptance.out"); 674 | fout_alpha_acceptance = fopen(tf_alpha_acceptance, "w"); 675 | char tf_beta[200]; 676 | strcpy(tf_beta, temp_dir); 677 | strcat(tf_beta, "/beta.out"); 678 | fout_beta = fopen(tf_beta, "w"); 679 | char tf_beta_acceptance[200]; 680 | strcpy(tf_beta_acceptance, temp_dir); 681 | strcat(tf_beta_acceptance, "/beta_acceptance.out"); 682 | fout_beta_acceptance = fopen(tf_beta_acceptance, "w"); 683 | char tf_beta_proposal[200]; 684 | strcpy(tf_beta_proposal, temp_dir); 685 | strcat(tf_beta_proposal, "/beta_proposal.out"); 686 | fout_beta_proposal = fopen(tf_beta_proposal, "w"); 687 | 688 | // the magic starts here 689 | printf("Starting the sampler\n"); 690 | 691 | for(gg = 0 ; gg < GG ; gg++){ 692 | 693 | // timing info to the printed output 694 | mytime = time(NULL); 695 | if(gg % 1000 == 0){ 696 | printf("Starting the %uth iteration out of %u at %s", gg, GG, ctime(&mytime)); 697 | } 698 | 699 | // first a round of the between-model step for every covariate within every taxa 700 | for(jj = 0 ; jj < n_cats ; jj++){ 701 | 702 | // fill in beta_temp 703 | for(kk = 0 ; kk < n_vars ; kk++){ 704 | hh = kk + jj * n_vars; 705 | beta_temp[kk] = beta[hh]; 706 | } 707 | 708 | between_models_jj(XX, JJ, loggamma, beta_temp, accepted_beta_flag, 709 | inclusion_indicator, mu_be, sig_be, aa_hp, bb_hp, jj); 710 | 711 | // update beta with beta_temp 712 | for(kk = 0 ; kk < n_vars ; kk++){ 713 | hh = kk + jj * n_vars; 714 | beta[hh] = beta_temp[kk]; 715 | } 716 | } 717 | 718 | // now an accelerating within-model step 719 | for(jj = 0 ; jj < n_cats ; jj++){ 720 | 721 | // fill in beta_temp and update the proposal variance 722 | for(kk = 0 ; kk < n_vars ; kk++){ 723 | hh = kk + jj * n_vars; 724 | beta_temp[kk] = beta[hh]; 725 | // wait until each parameter has had a few iterations 726 | if(gg > 2 * n_vars * n_cats){ 727 | // calculate the online variance 728 | last_mean = curr_mean[hh]; 729 | last_var = curr_var[hh]; 730 | curr_mean[hh] = online_mean(gg, last_mean, beta[hh]); 731 | curr_var[hh] = online_var(gg, last_mean, last_var, curr_mean[hh], beta[hh]); 732 | // update proposal variance 733 | prop_per_beta[hh] = curr_var[hh]; 734 | } 735 | } 736 | 737 | update_beta_jj(XX, JJ, loggamma, beta_temp, inclusion_indicator, 738 | prop_per_beta, mu_be, sig_be, aa_hp, bb_hp, jj); 739 | 740 | // update beta with beta_temp and write to file 741 | for(kk = 0 ; kk < n_vars ; kk++){ 742 | hh = kk + jj * n_vars; 743 | beta[hh] = beta_temp[kk]; 744 | if((gg >= burn) & (gg % thin == 0)){ 745 | accepted_beta[hh] = accepted_beta[hh] + accepted_beta_flag[hh]; 746 | accepted_beta_flag[hh] = 0; 747 | fprintf(fout_beta, "%e ", beta[hh]); // space delimited output files 748 | } 749 | } 750 | } 751 | if((gg >= burn) & (gg % thin == 0)){fprintf(fout_beta, "\n");} 752 | 753 | // update alpha and write to file 754 | for(jj = 0 ; jj < n_cats ; jj++){ 755 | update_alpha_jj(JJ, loggamma, alpha, prop_per_alpha, accepted_alpha_flag, 756 | mu_al, sig_al, jj); 757 | if((gg >= burn) & (gg % thin == 0)){ 758 | accepted_alpha[jj] = accepted_alpha[jj] + accepted_alpha_flag[jj]; 759 | accepted_alpha_flag[jj] = 0; 760 | fprintf(fout_alpha,"%e ", alpha[jj]); 761 | } 762 | } 763 | if((gg >= burn) & (gg % thin == 0)){fprintf(fout_alpha, "\n ");} 764 | 765 | // update JJ and consequently TT 766 | for(ii = 0 ; ii < n_obs ; ii++){ 767 | TT[ii] = 0.0; 768 | for(jj = 0 ; jj < n_cats ; jj++){ 769 | JJ[ii][jj] = gsl_ran_gamma(rando, YY[ii][jj] + exp(loggamma[ii][jj]), 1.0/(uu[ii] + 1.0)); 770 | if(JJ[ii][jj] < pow(10.0, -100.0)){ 771 | JJ[ii][jj] = pow(10.0, -100.0); 772 | } 773 | TT[ii] = TT[ii] + JJ[ii][jj]; 774 | } 775 | } 776 | 777 | // update latent variables uu 778 | for(ii = 0 ; ii < n_obs ; ii++){ 779 | uu[ii] = gsl_ran_gamma(rando, Ypiu[ii], 1.0/TT[ii]); 780 | } 781 | 782 | } // end of iterations 783 | 784 | // print acceptance ratio files 785 | for(jj = 0 ; jj < n_cats ; jj++){ 786 | fprintf(fout_alpha_acceptance, "%f\n", (float)accepted_alpha[jj]/((GG - burn)/thin)); 787 | for(kk = 0 ; kk < n_vars ; kk++){ 788 | fprintf(fout_beta_acceptance, "%f\n", (float)accepted_beta[kk + jj * n_vars]/((GG - burn)/thin)); 789 | fprintf(fout_beta_proposal, "%f\n", prop_per_beta[kk + jj * n_vars]); 790 | } 791 | } 792 | 793 | // cleanup (kind of pedantic) 794 | fclose(fout_alpha); 795 | fclose(fout_beta); 796 | fclose(fout_alpha_acceptance); 797 | fclose(fout_beta_acceptance); 798 | free(accepted_alpha); 799 | free(accepted_beta); 800 | free(accepted_alpha_flag); 801 | free(accepted_beta_flag); 802 | free(inclusion_indicator); 803 | free(beta_temp); 804 | free(loggamma); 805 | free(JJ); 806 | free(TT); 807 | free(uu); 808 | free(Ypiu); 809 | 810 | printf("Sampling finished!\n"); 811 | 812 | return; 813 | 814 | } // close Gibbs (stochastic search) sampler function 815 | 816 | // 817 | double calculate_gamma(double **XX, double *alpha, double *beta, int jj, int ii, int Log){ 818 | 819 | // loop indices and out 820 | int kk, hh; 821 | double out; 822 | 823 | out = alpha[jj]; 824 | for(kk = 0 ; kk < n_vars ; kk++){ 825 | 826 | hh = kk + jj * n_vars; 827 | out = out + beta[hh] * XX[ii][kk]; 828 | 829 | } 830 | 831 | if(Log == 1){ 832 | return(out); 833 | } 834 | 835 | return(exp(out)); 836 | 837 | } 838 | 839 | // Gibbs variable selection version 840 | void update_beta_jj(double **XX, double **JJ, double **loggamma, 841 | double *beta_temp, int *inclusion_indicator, 842 | double *prop_per_beta, double mu_be[n_cats][n_vars], 843 | double sig_be[n_cats][n_vars], double aa_hp, 844 | double bb_hp, int jj){ 845 | 846 | // this function loops through n_vars so it is called for one taxa at a time 847 | 848 | double sig_prop; 849 | double loggamma_p[n_obs]; 850 | int hh, ii, kk; 851 | 852 | // define proposal variable and acceptance ratio variables 853 | double beta_p; 854 | double lnu, ln_acp; 855 | 856 | double log_full_beta, log_full_beta_p; 857 | 858 | for(kk = 0 ; kk < n_vars ; kk++){ 859 | 860 | // Stride for full (n_vars * n_cats) vector 861 | hh = kk + jj * n_vars; 862 | 863 | if(inclusion_indicator[hh] == 1){ 864 | 865 | log_full_beta = 0; 866 | 867 | for(ii = 0 ; ii < n_obs ; ii++){ 868 | 869 | log_full_beta = log_full_beta - gsl_sf_lngamma(exp(loggamma[ii][jj])); 870 | log_full_beta = log_full_beta + exp(loggamma[ii][jj]) * log(JJ[ii][jj]); 871 | 872 | } 873 | 874 | log_full_beta = log_full_beta + lprior_bbsas(beta_temp[kk], inclusion_indicator[hh], 875 | sig_be[jj][kk], mu_be[jj][kk], aa_hp, bb_hp); 876 | sig_prop = prop_per_beta[hh]; 877 | beta_p = beta_temp[kk] + adap_prop(sig_prop); 878 | 879 | 880 | for(ii = 0 ; ii < n_obs ; ii++){ 881 | loggamma_p[ii] = loggamma[ii][jj] - beta_temp[kk] * XX[ii][kk] + beta_p * XX[ii][kk]; 882 | } 883 | 884 | // calculate proposal probability 885 | log_full_beta_p = 0; 886 | for(ii = 0 ; ii < n_obs ; ii++){ 887 | log_full_beta_p = log_full_beta_p - gsl_sf_lngamma(exp(loggamma_p[ii])); 888 | log_full_beta_p = log_full_beta_p + exp(loggamma_p[ii]) * log(JJ[ii][jj]); 889 | } 890 | log_full_beta_p = log_full_beta_p + lprior_bbsas(beta_p, inclusion_indicator[hh], sig_be[jj][kk], mu_be[jj][kk], aa_hp, bb_hp); 891 | 892 | ln_acp = log_full_beta_p - log_full_beta; 893 | lnu = log(gsl_rng_uniform(rando)); 894 | 895 | if(lnu < ln_acp){ 896 | 897 | beta_temp[kk] = beta_p; 898 | 899 | for(ii = 0 ; ii < n_obs ; ii++){ 900 | 901 | loggamma[ii][jj] = loggamma_p[ii]; 902 | 903 | } 904 | 905 | } // Close MH accept 906 | } // Close inclusion_indicator[hh] == 1 907 | } // Close for kk 908 | } // Close function 909 | 910 | // stochastic search variable selection version 911 | void update_beta_jj_ss(double **XX, double **JJ, double **loggamma, 912 | double *beta_temp, int *inclusion_indicator, 913 | double *prop_per_beta, double mu_be[n_cats][n_vars], 914 | double sig_be[n_cats][n_vars], double aa_hp, 915 | double bb_hp, int jj){ 916 | 917 | // this function loops through n_vars so it is called for one taxa at a time 918 | 919 | double sig_prop; 920 | double loggamma_p[n_obs]; 921 | int hh, ii, kk; 922 | 923 | // define proposal variable and acceptance ratio variables 924 | double beta_p; 925 | double lnu, ln_acp; 926 | 927 | double log_full_beta, log_full_beta_p; 928 | 929 | kk = floor(gsl_rng_uniform(rando) * n_vars);//for(kk = 0 ; kk < n_vars ; kk++){ 930 | 931 | // Stride for full (n_vars * n_cats) vector 932 | hh = kk + jj * n_vars; 933 | 934 | if(inclusion_indicator[hh] == 1){ 935 | 936 | log_full_beta = 0; 937 | 938 | for(ii = 0 ; ii < n_obs ; ii++){ 939 | 940 | log_full_beta = log_full_beta - gsl_sf_lngamma(exp(loggamma[ii][jj])); 941 | log_full_beta = log_full_beta + exp(loggamma[ii][jj]) * log(JJ[ii][jj]); 942 | 943 | } 944 | 945 | log_full_beta = log_full_beta + lprior_bbsas(beta_temp[kk], inclusion_indicator[hh], 946 | sig_be[jj][kk], mu_be[jj][kk], aa_hp, bb_hp); 947 | sig_prop = prop_per_beta[hh]; 948 | beta_p = beta_temp[kk] + adap_prop(sig_prop); 949 | 950 | 951 | for(ii = 0 ; ii < n_obs ; ii++){ 952 | loggamma_p[ii] = loggamma[ii][jj] - beta_temp[kk] * XX[ii][kk] + beta_p * XX[ii][kk]; 953 | } 954 | 955 | // calculate proposal probability 956 | log_full_beta_p = 0; 957 | for(ii = 0 ; ii < n_obs ; ii++){ 958 | log_full_beta_p = log_full_beta_p - gsl_sf_lngamma(exp(loggamma_p[ii])); 959 | log_full_beta_p = log_full_beta_p + exp(loggamma_p[ii]) * log(JJ[ii][jj]); 960 | } 961 | log_full_beta_p = log_full_beta_p + lprior_bbsas(beta_p, inclusion_indicator[hh], sig_be[jj][kk], mu_be[jj][kk], aa_hp, bb_hp); 962 | 963 | ln_acp = log_full_beta_p - log_full_beta; 964 | lnu = log(gsl_rng_uniform(rando)); 965 | 966 | if(lnu < ln_acp){ 967 | 968 | beta_temp[kk] = beta_p; 969 | 970 | for(ii = 0 ; ii < n_obs ; ii++){ 971 | 972 | loggamma[ii][jj] = loggamma_p[ii]; 973 | 974 | } 975 | 976 | } // Close MH accept 977 | } // Close inclusion_indicator[hh] == 1 978 | //} // Close for kk 979 | } // Close function 980 | 981 | void between_models_jj(double **XX, double **JJ, double **loggamma, 982 | double *beta_temp, int *accepted_beta_flag, 983 | int *inclusion_indicator, double mu_be[n_cats][n_vars], 984 | double sig_be[n_cats][n_vars], double aa_hp, 985 | double bb_hp, int jj){ 986 | 987 | // Metropolis-Hastings proposals 988 | double sig_prop; 989 | double mu_prop; 990 | 991 | int hh, kk, ii; 992 | 993 | // proposed value vectors 994 | int inclusion_indicator_p; 995 | double loggamma_p[n_obs]; 996 | double beta_p; 997 | 998 | // acceptance ratio variables 999 | double lnu, ln_acp; 1000 | 1001 | // must update beta_jk for kk = 1, ..., n_vars 1002 | double log_full_beta, log_full_beta_p; 1003 | 1004 | for(kk = 0 ; kk < n_vars ; kk++){ 1005 | 1006 | hh = kk + jj * n_vars; 1007 | 1008 | // calculate the current log full conditional 1009 | log_full_beta = 0; 1010 | for(ii = 0 ; ii < n_obs ; ii++){ 1011 | log_full_beta = log_full_beta - gsl_sf_lngamma(exp(loggamma[ii][jj])); 1012 | log_full_beta = log_full_beta + exp(loggamma[ii][jj]) * log(JJ[ii][jj]); 1013 | } 1014 | log_full_beta = log_full_beta + lprior_bbsas(beta_temp[kk], inclusion_indicator[hh], sig_be[jj][kk], mu_be[jj][kk], aa_hp, bb_hp); 1015 | 1016 | // proposing a new value for beta[jj][kk] using the prior mean and standard deviation 1017 | sig_prop = pow(sig_be[jj][kk], 0.5); 1018 | mu_prop = mu_be[jj][kk]; 1019 | 1020 | // swap and sample beta 1021 | if(inclusion_indicator[hh] == 0){ 1022 | beta_p = mu_prop + gsl_ran_gaussian_ziggurat(rando, sig_prop); 1023 | inclusion_indicator_p = 1; 1024 | } 1025 | else{ 1026 | beta_p = 0.0; 1027 | inclusion_indicator_p = 0; 1028 | } 1029 | 1030 | // update proposal gamma (linear predictor) 1031 | for(ii = 0 ; ii < n_obs ; ii++){ 1032 | loggamma_p[ii] = loggamma[ii][jj] - beta_temp[kk] * XX[ii][kk] + beta_p * XX[ii][kk]; 1033 | } 1034 | 1035 | // calculate the proposed log full conditional 1036 | log_full_beta_p = 0; 1037 | for(ii = 0 ; ii < n_obs ; ii++){ 1038 | log_full_beta_p = log_full_beta_p - gsl_sf_lngamma(exp(loggamma_p[ii])); 1039 | log_full_beta_p = log_full_beta_p + exp(loggamma_p[ii]) * log(JJ[ii][jj]); 1040 | } 1041 | log_full_beta_p = log_full_beta_p + lprior_bbsas(beta_p, inclusion_indicator_p, sig_be[jj][kk], mu_be[jj][kk], aa_hp, bb_hp); 1042 | 1043 | ln_acp = log_full_beta_p - log_full_beta; 1044 | lnu = log(gsl_rng_uniform(rando)); 1045 | 1046 | // Metropolis-Hastings ratio 1047 | if(lnu < ln_acp){ 1048 | 1049 | accepted_beta_flag[hh] = 1; 1050 | 1051 | // update accepted beta into beta_temp 1052 | beta_temp[kk] = beta_p; 1053 | inclusion_indicator[hh] = inclusion_indicator_p; 1054 | 1055 | for(ii = 0 ; ii < n_obs ; ii++){ 1056 | // also update the gamma (linear predictor) 1057 | loggamma[ii][jj] = loggamma_p[ii]; 1058 | } 1059 | } // Close MH if 1060 | } // Close for kk 1061 | } // Close function 1062 | 1063 | void update_alpha_jj(double **JJ, double **loggamma, double *alpha, 1064 | double *prop_per_alpha, int *accepted_alpha_flag, 1065 | double mu_al[n_cats], double sig_al[n_cats], int jj){ 1066 | 1067 | int ii; 1068 | 1069 | double sig_prop = prop_per_alpha[jj]; 1070 | double loggamma_p[n_obs]; 1071 | 1072 | double alpha_p; 1073 | double lnu, ln_acp; 1074 | 1075 | // Prepare the current and proposed full conditional values 1076 | double log_full_alpha, log_full_alpha_p; 1077 | 1078 | // Calculate the full conditional for the current value 1079 | log_full_alpha=0; 1080 | for(ii = 0 ; ii < n_obs ; ii++){ 1081 | 1082 | log_full_alpha = log_full_alpha - gsl_sf_lngamma(exp(loggamma[ii][jj])); 1083 | log_full_alpha = log_full_alpha + exp(loggamma[ii][jj]) * log(JJ[ii][jj]); 1084 | 1085 | } 1086 | log_full_alpha = log_full_alpha - 1.0/(2.0 * sig_al[jj]) * pow(alpha[jj] - mu_al[jj], 2.0); 1087 | 1088 | // Propose a new value for alpha[jj] using a random walk proposal centered on 1089 | // the current value of alpha[jj] 1090 | alpha_p = alpha[jj] + gsl_ran_gaussian_ziggurat(rando, sig_prop); 1091 | 1092 | // Gamma must be updated too 1093 | for(ii = 0 ; ii < n_obs ; ii++){ 1094 | loggamma_p[ii] = loggamma[ii][jj] - alpha[jj] + alpha_p; 1095 | } 1096 | 1097 | // Calculate the full conditional for the proposed value 1098 | log_full_alpha_p = 0; 1099 | for(ii = 0 ; ii < n_obs ; ii++){ 1100 | log_full_alpha_p = log_full_alpha_p - gsl_sf_lngamma(exp(loggamma_p[ii])); 1101 | log_full_alpha_p = log_full_alpha_p + exp(loggamma_p[ii]) * log(JJ[ii][jj]); 1102 | } 1103 | 1104 | log_full_alpha_p = log_full_alpha_p - 1.0/(2.0 * sig_al[jj]) * pow(alpha_p - mu_al[jj], 2.0); 1105 | ln_acp = log_full_alpha_p - log_full_alpha; 1106 | lnu = log(gsl_rng_uniform(rando)); 1107 | 1108 | if(lnu < ln_acp){ 1109 | 1110 | // If accepted, update both alpha[jj] and loggamma[ii][jj], and keep 1111 | // track of acceptances 1112 | accepted_alpha_flag[jj] = 1; 1113 | alpha[jj] = alpha_p; 1114 | 1115 | for(ii = 0 ; ii < n_obs ; ii++){ 1116 | loggamma[ii][jj] = loggamma_p[ii]; 1117 | } 1118 | } 1119 | 1120 | return; 1121 | 1122 | } // Close function 1123 | 1124 | // the probability of the Bernoulli trial is integrated out 1125 | double lprior_bbsas(double betajk, int sjk, double sig_bejk, double mu_bejk, 1126 | double aa_hp, double bb_hp){ 1127 | 1128 | // calculate additional beta factor 1129 | double aa = sjk + aa_hp; 1130 | double bb = 1 - sjk + bb_hp; 1131 | double lbeta_factor = gsl_sf_lnbeta(aa, bb) - gsl_sf_lnbeta(aa_hp, bb_hp); 1132 | 1133 | // piece-wise function 1134 | if(sjk == 0){ 1135 | return(log(1.0 - exp(lbeta_factor))); 1136 | } 1137 | else{ 1138 | return(lbeta_factor - 0.5 * log(2.0 * M_PI * sig_bejk) - 1.0/(2.0 * sig_bejk) * pow(betajk - mu_bejk, 2.0)); 1139 | } 1140 | 1141 | } 1142 | 1143 | // simple adaptive MH proposal 1144 | // this uses equation (3) from Roberts & Rosenthal (2009) but without the full 1145 | // covariance of the target which makes it similar to the component-wise update 1146 | // of Haario et al. (2005) 1147 | double adap_prop(double curr_var){ 1148 | 1149 | // need dimension (???) 1150 | double dd = (double)n_cats * (double)n_vars; 1151 | 1152 | // ensures bounded convergence 1153 | int safeguard = gsl_ran_bernoulli(rando, 0.05); 1154 | double usually = pow(2.38, 2.0) * curr_var/dd; 1155 | double unusually = pow(0.1, 2.0)/dd; 1156 | 1157 | double prop_var = (1 - safeguard) * gsl_ran_gaussian_ziggurat(rando, pow(usually, 0.5)) + safeguard * gsl_ran_gaussian_ziggurat(rando, pow(unusually, 0.5)); 1158 | 1159 | return(prop_var); 1160 | } 1161 | 1162 | // sort of following http://www.johndcook.com/blog/standard_deviation/ 1163 | double online_mean(int iteration, double last_mean, double curr_obs){ 1164 | 1165 | // don't fall off by one 1166 | int n_vals = iteration++; 1167 | 1168 | // assume that iteration > 0 since there's a burnin proposal 1169 | double curr_mean = last_mean + (curr_obs - last_mean)/(double)n_vals; 1170 | 1171 | return(curr_mean); 1172 | } 1173 | 1174 | double online_var(int iteration, double last_mean, double last_var, double curr_mean, double curr_obs){ 1175 | 1176 | // note that iteration == n - 1 and that (n - 1) * last_var == last_ss 1177 | // assume that iteration > 0 since there's a burnin proposal 1178 | double curr_ss = (double)iteration * last_var + (curr_obs - last_mean) * (curr_obs - curr_mean); 1179 | 1180 | return(curr_ss/(double)iteration); 1181 | } 1182 | 1183 | --------------------------------------------------------------------------------