├── documentation.pdf
├── .gitignore
├── code
├── Makefile
├── wrapper.R
├── helper_functions.R
└── dmbvs.c
├── README.md
├── example_analysis_script.R
└── LICENSE
/documentation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/duncanwadsworth/dmbvs/HEAD/documentation.pdf
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | *.RData
4 | *.Rproj
5 | .DS_Store
6 | *.txt
7 | *.x
8 | *.pdf
9 | *.html
10 | !documentation.pdf
11 |
--------------------------------------------------------------------------------
/code/Makefile:
--------------------------------------------------------------------------------
1 | ## Makefile for dmbvs
2 |
3 | ## file and path names
4 | ## LIBS and INCS may need to be changed
5 | INPUT = dmbvs
6 | LIBS = /usr/local/lib
7 | INCS = /usr/local/include
8 |
9 | compile:
10 | #clang -I$(INCS) -L$(LIBS) -Wall $(INPUT).c -o $(INPUT).x -lgsl -lgslcblas -lm
11 | #/local/gcc-5.2.0/bin/gcc -I/local/gsl-2.1/include -L/local/gsl-2.1/lib -Wall $(INPUT).c -o $(INPUT).x -lgsl -lgslcblas -lm
12 | gcc -I$(INCS) -L$(LIBS) -Wall $(INPUT).c -o $(INPUT).x -lgsl -lgslcblas -lm
13 |
14 | clean:
15 | rm -f $(INPUT).x
16 | if test -a $(INPUT).o ; then rm -f $(INPUT).o ; fi
17 |
18 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | dmbvs
2 | ===
3 |
4 | > **Important**\
5 | > This repo is no longer maintained and its contents have been superseded by the library here: [https://github.com/mkoslovsky/MicroBVS](https://github.com/mkoslovsky/MicroBVS). Anything you can do with `dmbvs` can be done better with `MicroBVS`.
6 |
7 | ### Description
8 |
9 | Bayesian variable selection for Dirichlet-Multinomial regression. For $n$
10 | samples, let $Y$ be an $n \times q$ matrix with $q$ taxa. Let $X$ be an
11 | $n \times p$ matrix with $p$ covariates. By applying spike-and-slab priors to
12 | the Dirichlet-Multinomial regression coefficients, we obtain concise
13 | taxa/covariate associations. The methodology is further described in the
14 | manuscript:
15 |
16 | > Wadsworth, W. D., Argiento, R., Guindani, M., Galloway-Pena, J., Samuel, S. A., &
17 | > Vannucci, M. (2016). An Integrative Bayesian Dirichlet-Multinomial Regression
18 | > Model for the Analysis of Taxonomic Abundances in Microbiome data.
19 |
20 | The code has been updated on 10/02/17 with an option to perform a stochastic
21 | search algorithm for Bayesian variable selection instead of the Gibbs sampler.
22 | The stochastic search variable selection approach provides gains in computational
23 | speed.
24 |
25 | ### Contents
26 |
27 | The repository contains
28 |
29 | 1. C code implementing an MCMC sampler for Dirichlet-Multinomial Bayesian
30 | Variable Selection (dmbvs) using spike-and-slab priors,
31 |
32 | 2. R code to wrap and run the sampler from within R,
33 |
34 | 3. a function for simulating data,
35 |
36 | 4. and a "start-to-finish" script (`example_analysis_script.R`) demonstrating
37 | usage of the code. This script gives reasonable default settings for the
38 | hyperparameters and MCMC parameters for the example simulated data. Settings may
39 | change for other data.
40 |
41 | ### Usage
42 |
43 | * The R code requires the `dirmult` and `MASS` packages. Please ensure those are
44 | installed first.
45 |
46 | * The C code relies on the [GNU Scientific Library (GSL)](https://www.gnu.org/software/gsl/).
47 | GSL must be installed and modifications may need to be made to the 'library'
48 | and 'include' paths in the Makefile.
49 |
50 | * The main C file (`dmbvs.c`), as well as the Makefile to be used for compilation
51 | of the code, can be found under the directory `code/`.
52 |
53 | * On Linux and Mac the C code may be compiled with the Makefile from R using:
54 |
55 | ```{r}
56 | # must be in package's root directory
57 | setwd("dmbvs")
58 | system("cd code; make")
59 | ```
60 |
61 | * If compilation has been successful there will be an executable called `dmbvs.x`
62 | the `code/` directory. Data may be simulated and the MCMC code run using:
63 |
64 | ```{r}
65 | source(file.path("code", "wrapper.R"))
66 | source(file.path("code", "helper_functions.R"))
67 | simdata = simulate_dirichlet_multinomial_regression(n_obs = 100, n_vars = 50,
68 | n_taxa = 50, n_relevant_vars = 5,
69 | n_relevant_taxa = 5)
70 | results = dmbvs(XX = simdata$XX[,-1], YY = simdata$YY,
71 | intercept_variance = 10, slab_variance = 10,
72 | bb_alpha = 0.02, bb_beta = 1.98, GG = 1100L, thin = 10L, burn = 100L,
73 | exec = file.path(".", "code", "dmbvs.x"), output_location = ".")
74 | ```
75 |
76 | * Tested on:
77 | * Mac with R version 3.2.2 and GSL version 2.1 with the Apple LLVM version
78 | 7.0.2 compiler
79 | * Red Hat Linux 6.6 with R version 3.1.2 and GSL version 2.1 and gcc
80 | version 5.2.0
81 |
82 |
--------------------------------------------------------------------------------
/example_analysis_script.R:
--------------------------------------------------------------------------------
1 |
2 | ###################################################################
3 | # Example Analysis of simulated data using a Dirichlet-Multinomial
4 | # Bayesian variable selection model
5 | ###################################################################
6 |
7 | # -- README -------------------------------------------------------------------
8 | # This script gives an example of how to run the code on a simulated
9 | # dataset. It assumes several things:
10 | # 1. The Gnu Scientific Library is installed
11 | # 2. The paths in the Makefile are correct
12 | # 3. The data is available
13 | # -----------------------------------------------------------------------------
14 |
15 | # -- simulate data ------------------------------------------------------------
16 | # this simulates a dataset identical in dimension to the ones used in the
17 | # simulation sections of the manuscript
18 | source(file.path("code", "helper_functions.R"))
19 | simdata = simulate_dirichlet_multinomial_regression(n_obs = 100, n_vars = 50,
20 | n_taxa = 50, n_relevant_vars = 5,
21 | n_relevant_taxa = 5)
22 | Ysim = simdata$YY
23 | Xsim = simdata$XX
24 |
25 | # -- run analysis -------------------------------------------------------------
26 | # preliminaries
27 | source(file.path("code", "wrapper.R"))
28 | system("cd code; make")
29 | executable_location = "code/dmbvs.x"
30 | save_prefix = "simulation"
31 |
32 | # prepare and check data
33 | YY = as.matrix(Ysim)
34 | XX = scale(as.matrix(Xsim[,-1]), center = T, scale = T)
35 | colnames(YY) = paste0("taxa", 1:ncol(YY))
36 | colnames(XX) = paste0("covariate", 1:ncol(XX))
37 | dim(YY)
38 | dim(XX)
39 |
40 | # MCMC and hyperparameters
41 | # these values are reasonable for the data simulated here but should be changed
42 | # depending on the characteristics of other datasets
43 | GG = 301L; thin = 2L; burn = 101L; # fast, for testing
44 | #GG = 11001L; thin = 10L; burn = 1001L; # good defaults, in this case
45 | # reasonable default parameters, see further discussion in the manuscript
46 | bb_alpha = 0.02; bb_beta = 2 - bb_alpha
47 | proposal_alpha = 0.5; proposal_beta = 0.5
48 | slab_variance = 10; intercept_variance = 10
49 |
50 | # description
51 | cat("Beta-Binomial mean:", bb_alpha/(bb_alpha + bb_beta), "\n")
52 | cat("Number of kept iterations:", (GG - burn)/thin, "\n")
53 |
54 | # run the algorithm
55 | results = dmbvs(XX = XX, YY = YY, intercept_variance = intercept_variance,
56 | slab_variance = slab_variance, bb_alpha = bb_alpha,
57 | bb_beta = bb_beta, GG = GG, thin = thin, burn = burn,
58 | init_beta = "warmstart", init_alpha = "warmstart",
59 | proposal_alpha = proposal_alpha, proposal_beta = proposal_beta,
60 | exec = executable_location, selection_type = "ss",
61 | output_location = ".")
62 | params = data.frame(GG, burn, thin, intercept_variance,
63 | slab_variance, bb_alpha, bb_beta,
64 | proposal_alpha, proposal_beta)
65 | save(results, params, XX, YY,
66 | file = paste0("results-", save_prefix, "-", Sys.Date(), ".RData"))
67 |
68 | # quick check results
69 | source(file.path("code", "helper_functions.R"))
70 | mppi = colMeans((results$beta != 0) + 0)
71 | (blfdrate = bfdr(mppi, threshold = 0.1)$threshold)
72 | MPPI = data.frame(expand.grid(covariates = colnames(results$hyperparameters$inputdata$XX),
73 | taxa = colnames(results$hyperparameters$inputdata$YY)),
74 | mppi = mppi,
75 | beta = colMeans(results$beta),
76 | truebeta = c(t(simdata$betas[,-1])))
77 | subset(MPPI, mppi > blfdrate | truebeta != 0)
78 | plot(mppi, type = "h", ylab = "MPPI",
79 | xlab = "beta index", main = "Manhattan plot")
80 |
81 | # active variable traceplot
82 | plot.ts(rowSums((results$beta != 0) + 0), main = "Active variables traceplot",
83 | ylab = "number of betas in the model", xlab = "iteration")
84 |
85 | # some of the selected beta traceplots
86 | selected = which(mppi > 0.5)
87 | fortraces = selected[sample(length(selected), 10)]
88 | plot.ts(results$beta[,fortraces], main = "Some selected beta traceplots",
89 | xlab = "iteration", ylab = "")
90 |
91 | # visualize the associations
92 | association_plot(MPPI[,-5], graph_layout = "bipartite", main = "Sample Results")
93 |
94 |
--------------------------------------------------------------------------------
/code/wrapper.R:
--------------------------------------------------------------------------------
1 | #' an R wrapper to C code for spike-and-slab Dirichlet--Multinomial
2 | #' Bayesian variable selection
3 | #'
4 | #' @param XX covariate matrix (without intercept)
5 | #' @param YY count matrix
6 | #' @param intercept_variance a scalar for the prior variance on the intercept
7 | #' of the log-linear predictors
8 | #' @param slab_variance a scalar for the prior variance on the slab of the
9 | #' spike-and-slab
10 | #' @param bb_alpha a scalar for the alpha hyperparameter of the Beta-Bernoulli
11 | #' spike inclusion prior
12 | #' @param bb_beta a scalar for the beta hyperparameter of the Beta-Bernoulli
13 | #' spike inclusion prior
14 | #' @param GG the total number of MCMC iterations
15 | #' @param thin the MCMC thinning interval
16 | #' @param burn the number of MCMC iterations out of GG that will be discarded
17 | #' @param proposal_alpha initial value, either a scalar or a vector of length
18 | #' ncol(YY), if a scalar, that value is used for all proposals on alpha
19 | #' @param proposal_beta initial value, either a scalar or a matrix with
20 | #' ncol(XX) columns and ncol(YY) rows, if a scalar, that value is used for all
21 | #' proposals on beta
22 | #' @param init_alpha either a scalar or a vector of size ncol(YY)
23 | #' @param init_beta either a scalar or a matrix with ncol(YY) rows and
24 | #' ncol(XX) columns, inclusion initialization uses non-zero elements of
25 | #' init_beta
26 | #' @param exec the path to the C executable
27 | #' @param output_location if NULL, output goes to an the directory output/
28 | #' created in the working directory
29 | #' @param r_seed an integer seed to pass to GSL's random number generator
30 | #' @param selection_type either "ss" (Stochastic Search) or "Gibbs", determines
31 | #' the MCMC mechanism for variable selection
32 | #'
33 | #' @return alpha: a matrix with iterations in the rows and the alphas in the
34 | #' columns
35 | #' @return alpha_accept: the Metropolis-Hastings acceptance ratio for the alphas
36 | #' @return beta: a matrix with iterations in the rows and the per-iteration beta
37 | #' matrix -- flattened by rows -- in the columns
38 | #' @return beta_accept: the Metropolis-Hastings acceptance ratio for the betas
39 | #' @return hyperparameters: a list containing the hyperparameters, the MCMC
40 | #' parameters, and the data from the original function call
41 | #'
42 | #' @export
43 | dmbvs = function(XX, YY, intercept_variance, slab_variance, bb_alpha, bb_beta,
44 | GG, thin, burn, proposal_alpha = 0.5, proposal_beta = 0.5,
45 | init_alpha = 0, init_beta = 0, exec = file.path(".","dmbvs.x"),
46 | output_location = NULL, r_seed = NULL, selection_type = "Gibbs"){
47 |
48 | # data dimensions
49 | n_cats = ncol(YY)
50 | n_obs = nrow(XX)
51 | n_vars = ncol(XX)
52 |
53 | # argument checking
54 | #if(!is.integer(YY)){warning("YY is not integer typed: is it a count matrix?")}
55 | if(bb_alpha > bb_beta | bb_alpha == bb_beta){warning("Beta-Bernoulli prior is symmetric or left skewed: right skewed enforces sparsity")}
56 | if(all(XX[,1] == 1)){stop("XX appears to have an intercept column")}
57 | if(n_obs != nrow(YY)){stop("dimensions of XX and YY do not match")}
58 | if(burn > GG){stop("burnin greater than the number of iterations")}
59 | if(any(intercept_variance < 0, slab_variance < 0, bb_alpha < 0, bb_beta < 0)){stop("check hyperparameter values: at least one is negative")}
60 | if((length(proposal_beta) != 1) & (nrow(as.matrix(proposal_beta)) != n_cats) & (ncol(as.matrix(proposal_beta)) != n_vars)){stop("bad dimension for proposal_beta")}
61 | if(!(selection_type %in% c("ss", "Gibbs"))){stop("unrecognized variable selection type: choose ss or Gibbs")}
62 |
63 | # set output location
64 | # for extremely long runs it's necessary to leave output in a subdirectory of the
65 | # working directory since temp directories seems to get cleaned out occasionally
66 | if(is.null(output_location)){
67 | # for parallel simulations need unique output directories
68 | dir.create(paste0(getwd(), "/output-pid-", Sys.getpid()))
69 | out_dir = paste0(getwd(), "/output-pid-", Sys.getpid())
70 | }else{
71 | dir.create(paste0(output_location, "/output-pid-", Sys.getpid()))
72 | out_dir = paste0(output_location, "/output-pid-", Sys.getpid())
73 | }
74 |
75 | # text files for data
76 | utils::write.table(as.matrix(XX), file.path(out_dir, "covariates.txt"),
77 | row.names = F, col.names = F)
78 | utils::write.table(as.matrix(YY), file.path(out_dir, "count_matrix.txt"),
79 | row.names = F, col.names = F)
80 |
81 | # text files for proposals and initialization
82 | # intercept initialization
83 | if(is.null(init_alpha)){
84 | utils::write.table(rep(0, times = n_cats), file.path(out_dir, "init_alpha.txt"),
85 | row.names = F, col.names = F)
86 | }else if(length(init_alpha) == 1 & is.numeric(init_alpha)){
87 | utils::write.table(rep(init_alpha, times = n_cats), file.path(out_dir, "init_alpha.txt"),
88 | row.names = F, col.names = F)
89 | }else if(length(init_alpha) == n_cats & is.numeric(init_alpha)){
90 | utils::write.table(init_alpha, file.path(out_dir, "init_alpha.txt"),
91 | row.names = F, col.names = F)
92 | }else if(init_alpha == "warmstart"){
93 | utils::write.table(scale(log(colSums(YY))), file.path(out_dir, "init_alpha.txt"),
94 | row.names = F, col.names = F)
95 | }else{
96 | stop("init_alpha not recognized")
97 | }
98 | # regression parameter initialization
99 | if(is.null(init_beta)){
100 | # empty initialization
101 | utils::write.table(rep(0, times = n_cats * n_vars),
102 | file.path(out_dir, "init_beta.txt"), row.names = F,
103 | col.names = F)
104 | }else if(length(init_beta) == 1 & is.numeric(init_beta)){
105 | # scalar initialization
106 | utils::write.table(rep(init_beta, times = n_cats * n_vars),
107 | file.path(out_dir, "init_beta.txt"), row.names = F,
108 | col.names = F)
109 | }else if(!is.null(dim(init_beta)) & all(dim(init_beta) == c(n_cats, n_vars))){
110 | # matrix initialization
111 | utils::write.table(c(t(as.matrix(init_beta))),
112 | file.path(out_dir, "init_beta.txt"), row.names = F,
113 | col.names = F)
114 | }else if(is.character(init_beta) | init_beta == "warmstart"){
115 | # false discovery rate on correlation test initialization
116 | cormat = matrix(0, n_cats, n_vars)
117 | pmat = matrix(0, n_cats, n_vars)
118 | yy = YY/rowSums(YY) # compositionalize
119 | for(rr in 1:n_cats){
120 | for(cc in 1:n_vars){
121 | pmat[rr, cc] = stats::cor.test(XX[, cc], yy[, rr], method = "spearman",
122 | exact = F)$p.value
123 | cormat[rr, cc] = stats::cor(XX[, cc], yy[, rr], method = "spearman")
124 | }
125 | }
126 | # defaults to 0.2 false discovery rate
127 | pm = matrix((stats::p.adjust(c(pmat), method = "fdr") <= 0.2) + 0, n_cats,
128 | n_vars)
129 | betmat = cormat * pm
130 | utils::write.table(c(t(betmat)), file.path(out_dir, "init_beta.txt"),
131 | row.names = F, col.names = F)
132 | }else{
133 | stop("init_beta not recognized, if using a matrix, ensure proper dimensions")
134 | }
135 |
136 | # intercept proposal
137 | if(length(proposal_alpha) == 1){
138 | utils::write.table(rep(proposal_alpha, times = n_cats),
139 | file.path(out_dir, "proposal_alpha.txt"), row.names = F,
140 | col.names = F)
141 | }else if(length(proposal_alpha) == n_cats){
142 | utils::write.table(proposal_alpha, file.path(out_dir, "proposal_alpha.txt"),
143 | row.names = F, col.names = F)
144 | }else{
145 | stop("problem with proposal_alpha")
146 | }
147 | # regression proposal
148 | if(length(proposal_beta) == 1){
149 | utils::write.table(rep(proposal_beta, times = n_cats * n_vars),
150 | file.path(out_dir, "proposal_beta.txt"), row.names = F,
151 | col.names = F)
152 | }else if((nrow(proposal_beta) == n_cats) & (ncol(proposal_beta) == n_vars)){
153 | utils::write.table(c(t(proposal_beta)), file.path(out_dir, "proposal_beta.txt"),
154 | row.names = F, col.names = F)
155 | }else{
156 | stop("problem with proposal_beta")
157 | }
158 |
159 | # pass the external seed to GSL which will have its own corresponding seed (see the .out file)
160 | if(is.null(r_seed)){
161 | a_random_seed = sample(1e6, 1)
162 | }else if(is.integer(r_seed)){
163 | a_random_seed = r_seed
164 | }else{
165 | stop("problem with external seed to pass to GSL")
166 | }
167 | # run compiled code
168 | if(selection_type == "ss"){
169 | command = paste(exec, GG, thin, burn, intercept_variance, slab_variance,
170 | bb_alpha, bb_beta, n_cats, n_obs, n_vars, a_random_seed,
171 | out_dir, 0, ">", paste0("dmbvs-pid-", Sys.getpid(), ".out"))
172 | } else if(selection_type == "Gibbs"){
173 | command = paste(exec, GG, thin, burn, intercept_variance, slab_variance,
174 | bb_alpha, bb_beta, n_cats, n_obs, n_vars, a_random_seed,
175 | out_dir, 1, ">", paste0("dmbvs-pid-", Sys.getpid(), ".out"))
176 | }
177 | system(command)
178 | # read output
179 | aa = utils::read.table(file.path(out_dir, "alpha.out"))
180 | aaa = utils::read.table(file.path(out_dir, "alpha_acceptance.out"))
181 | # bb is read as a n_vars * n_cats long list with each element have n_iters
182 | # so when it's unlist()ed you get the first variable for n_iters, then the
183 | # second, etc
184 | bb = utils::read.table(file.path(out_dir, "beta.out"))
185 | bba = utils::read.table(file.path(out_dir, "beta_acceptance.out"))
186 |
187 | # variables in the columns, iterations in the rows
188 | return(list(alpha = t(matrix(unlist(aa), nrow = n_cats, byrow = T)),
189 | alpha_accept = aaa[,1],
190 | # after transpose beta has iterations in the rows
191 | beta = t(matrix(unlist(bb), nrow = (n_cats * n_vars), byrow = T)),
192 | beta_accept = bba[,1],
193 | hyperparameters = list(mcmc = data.frame(GG = GG, thin = thin, burn = burn,
194 | proposal_alpha = proposal_alpha,
195 | proposal_beta = proposal_beta,
196 | random_seed = a_random_seed),
197 | priors = data.frame(intercept_variance = intercept_variance,
198 | slab_variance = slab_variance,
199 | bb_alpha = bb_alpha, bb_beta = bb_beta),
200 | inputdata = list(XX = XX, YY = YY))))
201 | }
202 |
203 |
--------------------------------------------------------------------------------
/code/helper_functions.R:
--------------------------------------------------------------------------------
1 | #' calculate the Bayesian False Discovery Rate
2 | #'
3 | #' @param mppi_vector A vector of marginal posterior probabilities of inclusion.
4 | #' @param threshold The expected false discovery rate threshold
5 | #'
6 | #' @return selected: A boolean vector of selected (= T) and rejected (= F)
7 | #' variables
8 | #' @return threshold: The BFDR threshold
9 | #'
10 | #' @references
11 | #' Newton, M. A., Noueiry, A., Sarkar, D., & Ahlquist, P. (2004). Detecting
12 | #' differential gene expression with a semiparametric hierarchical mixture
13 | #' method. Biostatistics, 5(2), 155-76. doi:10.1093/biostatistics/5.2.155
14 | #'
15 | #' @export
16 |
17 | bfdr = function(mppi_vector, threshold = 0.1){
18 | # arg checking
19 | if(any(mppi_vector > 1 | mppi_vector < 0)){
20 | stop("Bad input: mppi_vector should contain probabilities")
21 | }
22 | if(threshold > 1 | threshold < 0){
23 | stop("Bad input: threshold should be a probability")
24 | }
25 | # sorting the ppi's in decresing order
26 | sorted_mppi_vector = sort(mppi_vector, decreasing = TRUE)
27 | # computing the fdr
28 | fdr = cumsum((1 - sorted_mppi_vector))/seq(1:length(mppi_vector))
29 | # determine index of the largest fdr less than threshold
30 | thecut.index = max(which(fdr < threshold))
31 | ppi_threshold = sorted_mppi_vector[thecut.index]
32 | selected = mppi_vector > ppi_threshold
33 | return(list(selected = selected, threshold = ppi_threshold))
34 | }
35 |
36 | #' a bipartite graph showing the association between X and Y
37 | #'
38 | #' @note requires the igraph package
39 | #'
40 | #' @param MPPI a data.frame with n_vars x n_taxa rows and these four columns:
41 | #' 1) covariate: the names of the columns of X
42 | #' 2) taxa: the names of the columns of Y
43 | #' 3) mppi: the marginal posterior probability of inclusion for each taxa
44 | #' by covariate parameter
45 | #' 4) beta: a point estimate of for each taxa by covariate parameter
46 | #' @param mppi_threshold the threshold for inclusion in the plot
47 | #' @param inc_legend boolean to include the legend
48 | #' @param lwdx a scalar multiplier for growing or shrinking the widths of edges
49 | #' @param graph_layout either "circular" for a round layout or "bipartite" for
50 | #' a side-by-side layout
51 | #' @param lab_dist a scalar argument for the distance between node centers and
52 | #' the node labels
53 | #' @param ... passthrough arguments
54 | #'
55 | #' @return a plot
56 | #'
57 | #' @export
58 | association_plot = function(MPPI, mppi_threshold = 0.5, inc_legend = F,
59 | lwdx = 5, graph_layout = "circular", lab_dist = 2,
60 | ...){
61 | if(any(!(colnames(MPPI) %in% c("covariates", "taxa", "mppi", "beta")))){
62 | stop("please ensure the column names of MPPI are correct\n*** they must be: covariates, taxa, mppi, beta")
63 | }
64 | if(!require(igraph)){
65 | stop("please ensure the igraph package is installed")
66 | }
67 | # first munge the data into a format easily read by graph_from_data_frame()
68 | mm = subset(MPPI, mppi > mppi_threshold)
69 | if(nrow(mm) == 0){ # when there are no relevant associations
70 | graphics::plot(1, type = "n", axes = F, xlab = "", ylab = "",
71 | xlim = c(-1, 1), ylim = c(-1, 1), ...)
72 | graphics::text(0, 0, "No Associations\nAbove MPPI\nThreshold")
73 | }else{ # when there are relevant associations
74 | mm$esign = round((sign(mm$beta) + 2)/3) + 1
75 | mm$emag = abs(mm$beta)
76 | mppi = mm[order(mm$covariates, mm$taxa),]
77 | # readin data.frame
78 | hh = igraph::graph_from_data_frame(mppi, directed = F)
79 | # modify graph characteristics
80 | igraph::E(hh)$weight = igraph::E(hh)$emag*lwdx
81 | #igraph::E(hh)$lty = igraph::E(hh)$esign
82 | igraph::E(hh)$lty = 1
83 | igraph::E(hh)$color = ifelse(igraph::E(hh)$esign == 2, 1, 2)
84 | igraph::V(hh)$type = c(rep(T, times = length(unique(mppi$covariates))),
85 | rep(F, times = length(unique(mppi$taxa))))
86 | # layout as a bipartite graph
87 | if(graph_layout == "bipartite"){
88 | la = layout.bipartite(hh, hgap = 20)
89 | graphics::plot(hh, layout = la[,c(2,1)],
90 | edge.width = igraph::E(hh)$weight,
91 | edge.lty = igraph::E(hh)$lty,
92 | edge.color = c("red", "blue")[igraph::E(hh)$color],
93 | vertex.color = c("green", "yellow")[igraph::V(hh)$type + 1],
94 | vertex.shape = c("circle", "square")[igraph::V(hh)$type + 1],
95 | vertex.frame.color = "black", vertex.label.cex = 1.2,
96 | vertex.label.color = "black", vertex.label.family = "sans",
97 | vertex.label.font = 2,
98 | vertex.label.degree = c(2*pi, pi)[igraph::V(hh)$type + 1],
99 | vertex.label.dist = lab_dist, ...)
100 | }
101 | # layout as a circular graph
102 | if(graph_layout == "circular"){
103 | la = igraph::layout.circle(hh)
104 | # from http://stackoverflow.com/questions/23209802/placing-vertex-label-outside-a-circular-layout-in-igraph
105 | radian.rescale = function(x, start = 0, direction = 1) {
106 | c.rotate = function(x) (x + start) %% (2 * pi) * direction
107 | c.rotate(scales::rescale(x, c(0, 2 * pi), range(x)))
108 | }
109 | lab.locs = radian.rescale(x = 1:nrow(la), start = 0, direction = -1)
110 | graphics::plot(hh, layout = la,
111 | edge.width = igraph::E(hh)$weight,
112 | edge.lty = igraph::E(hh)$lty,
113 | edge.color = c("red", "blue")[igraph::E(hh)$color],
114 | vertex.color = c("green", "yellow")[igraph::V(hh)$type + 1],
115 | vertex.shape = c("circle", "square")[igraph::V(hh)$type + 1],
116 | vertex.frame.color = "black", vertex.label.cex = 1.2,
117 | vertex.label.color = "black", vertex.label.family = "sans",
118 | vertex.label.font = 2, vertex.label.dist = lab_dist,
119 | vertex.label.degree = lab.locs)
120 | }
121 | # a black and white color scheme without vertex shapes
122 | # graphics::plot(hh, layout = la,
123 | # edge.width = igraph::E(hh)$weight,
124 | # edge.color = "black",
125 | # vertex.color = c("grey", "white")[V(hh)$type + 1],
126 | # vertex.label.cex = 1.3,
127 | # vertex.label.color = "black", vertex.label.family = "sans",
128 | # vertex.label.font = c(2, 3)[igraph::V(hh)$type + 1],
129 | # vertex.size = 10, vertex.label.dist = 0.5,
130 | # vertex.label.degree = lab.locs)
131 | if(inc_legend){
132 | #legend("topright", legend = c("positive", "negative"), lty = c(1, 2), col = "grey50", lwd = 3)
133 | graphics::legend("topright", legend = c("positive", "negative"), lwd = c(3, 3), col = c("blue","red"))
134 | }
135 | }
136 | }
137 |
138 | #' abundance table visualization
139 | #'
140 | #' @note requires the ggplot2 package
141 | #'
142 | #' @param count_matrix a matrix of integers
143 | #' @param title an optional title
144 | #'
145 | #' @return a plot
146 | #'
147 | #' @export
148 | abundance_plot = function(count_matrix, title = ""){
149 | if(!require(ggplot2)){
150 | stop("please ensure the ggplot2 package is installed")
151 | }
152 | # sort matrix by column means - make pretty
153 | ss = count_matrix[, order(colMeans(count_matrix), decreasing = T)]
154 | # coordinates
155 | xyz = cbind(expand.grid(1:dim(ss)[1], 1:dim(ss)[2]), as.vector(ss), as.vector(ss) > 0)
156 | names(xyz) = c("Sample.ID","Phylotype","Counts","Presence")
157 | print(ggplot2::ggplot(xyz, ggplot2::aes(y = Sample.ID, x = Phylotype, fill = log(Counts))) +
158 | ggplot2::geom_raster() + ggplot2::theme_bw() + ggplot2::ggtitle(title))
159 | }
160 |
161 |
162 | #' simulate data from a Dirichlet-Multinomial regression model
163 | #'
164 | #' @note Requires the dirmult and MASS packages
165 | #'
166 | #' @param n_obs: the number of samples
167 | #' @param n_vars: number of covariates excluding the intercept
168 | #' @param n_taxa: number of species
169 | #' @param n_relevant_vars: number of relevant nutrients
170 | #' @param n_relevant_taxa: number of relevant species
171 | #' @param beta_min: minimum absolute value of the regression parameters
172 | #' @param beta_max: maximum absolute value of the regression parameters
173 | #' @param signoise: scalar multiplier on the regression parameters
174 | #' @param n_reads_min: lower bound on uniform distribution for number of reads in each sample
175 | #' @param n_reads_max: upper bound on uniform distribution for number of reads in each sample
176 | #' @param theta0: the dispersion parameter
177 | #' @param rho: the correlation between covariates
178 | #'
179 | #' @return XX: (design matrix) with intercept: n_obs * (n_vars + 1)
180 | #' @return YY: (count matrix) rows: n_obs samples, columns: n_taxa species
181 | #' @return alphas: simulated intercept vector
182 | #' @return betas: simulated coefficient matrix n_taxa * (n_vars + 1)
183 | #' @return n_reads_min, n_read_max: row sum parameters
184 | #' @return theta0, phi, rho, signoise: simulation inputs
185 | #'
186 | #' @export
187 | simulate_dirichlet_multinomial_regression = function(n_obs = 100,
188 | n_vars = 100,
189 | n_taxa = 40,
190 | n_relevant_vars = 4,
191 | n_relevant_taxa = 4,
192 | beta_min = 0.5,
193 | beta_max = 1.0,
194 | signoise = 1.0,
195 | n_reads_min = 1000,
196 | n_reads_max = 2000,
197 | theta0 = 0.01,
198 | rho = 0.4){
199 |
200 | # check for required packages
201 | if(!require(dirmult)){
202 | stop("dirmult package required")
203 | }
204 | if(!require(MASS)){
205 | stop("MASS package required")
206 | }
207 | # covariance matrix for predictors
208 | Sigma = matrix(1, n_vars, n_vars)
209 | Sigma = rho^abs(row(Sigma) - col(Sigma))
210 | # include the intercept
211 | XX = cbind(rep(1, n_obs),
212 | scale(MASS::mvrnorm(n = n_obs, mu = rep(0, n_vars), Sigma = Sigma)))
213 | # empties
214 | YY = matrix(0, n_obs, n_taxa)
215 | betas = matrix(0, n_taxa, n_vars)
216 | phi = matrix(0, n_obs, n_taxa)
217 | # parameters with signs alternating
218 | st = 0
219 | low_side = beta_min
220 | high_side = beta_max
221 | if(n_relevant_taxa != 1){
222 | # warning if the lengths don't match
223 | coef = suppressWarnings(seq(low_side, high_side, len = n_relevant_taxa) * c(1, -1))
224 | }else{
225 | coef = (low_side + high_side) / 2
226 | }
227 | coef_g = rep(1.0, len = n_relevant_vars)
228 | for(ii in 1:n_relevant_vars){
229 | # overlap species
230 | betas[(st:(st + n_relevant_taxa - 1)) %% n_taxa + 1, 3 * ii - 2] = coef_g[ii] * sample(coef)[((ii - 1):(ii + n_relevant_taxa - 2)) %% n_relevant_taxa + 1]
231 | st = st + 1
232 | }
233 | # -2.3 and 2.3 so that the intercept varies over three orders of magnitude
234 | intercept = runif(n_taxa, -2.3, 2.3)
235 | Beta = cbind(intercept, signoise * betas)
236 | # row totals
237 | ct0 = sample(n_reads_min:n_reads_max, n_obs, rep = T)
238 | for(ii in 1:n_obs){
239 | thisrow = as.vector(exp(Beta %*% XX[ii, ]))
240 | phi[ii, ] = thisrow/sum(thisrow)
241 | YY[ii, ] = dirmult::simPop(J = 1, n = ct0[ii], pi = phi[ii, ], theta = theta0)$data[1, ]
242 | }
243 |
244 | return(list(XX = XX, YY = YY, alphas = intercept, betas = Beta,
245 | n_reads_min = n_reads_min, n_reads_max = n_reads_max,
246 | theta0 = theta0, phi = phi, rho = rho, signoise = signoise))
247 |
248 | }
249 |
250 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc.
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 | Preamble
9 |
10 | The GNU General Public License is a free, copyleft license for
11 | software and other kinds of works.
12 |
13 | The licenses for most software and other practical works are designed
14 | to take away your freedom to share and change the works. By contrast,
15 | the GNU General Public License is intended to guarantee your freedom to
16 | share and change all versions of a program--to make sure it remains free
17 | software for all its users. We, the Free Software Foundation, use the
18 | GNU General Public License for most of our software; it applies also to
19 | any other work released this way by its authors. You can apply it to
20 | your programs, too.
21 |
22 | When we speak of free software, we are referring to freedom, not
23 | price. Our General Public Licenses are designed to make sure that you
24 | have the freedom to distribute copies of free software (and charge for
25 | them if you wish), that you receive source code or can get it if you
26 | want it, that you can change the software or use pieces of it in new
27 | free programs, and that you know you can do these things.
28 |
29 | To protect your rights, we need to prevent others from denying you
30 | these rights or asking you to surrender the rights. Therefore, you have
31 | certain responsibilities if you distribute copies of the software, or if
32 | you modify it: responsibilities to respect the freedom of others.
33 |
34 | For example, if you distribute copies of such a program, whether
35 | gratis or for a fee, you must pass on to the recipients the same
36 | freedoms that you received. You must make sure that they, too, receive
37 | or can get the source code. And you must show them these terms so they
38 | know their rights.
39 |
40 | Developers that use the GNU GPL protect your rights with two steps:
41 | (1) assert copyright on the software, and (2) offer you this License
42 | giving you legal permission to copy, distribute and/or modify it.
43 |
44 | For the developers' and authors' protection, the GPL clearly explains
45 | that there is no warranty for this free software. For both users' and
46 | authors' sake, the GPL requires that modified versions be marked as
47 | changed, so that their problems will not be attributed erroneously to
48 | authors of previous versions.
49 |
50 | Some devices are designed to deny users access to install or run
51 | modified versions of the software inside them, although the manufacturer
52 | can do so. This is fundamentally incompatible with the aim of
53 | protecting users' freedom to change the software. The systematic
54 | pattern of such abuse occurs in the area of products for individuals to
55 | use, which is precisely where it is most unacceptable. Therefore, we
56 | have designed this version of the GPL to prohibit the practice for those
57 | products. If such problems arise substantially in other domains, we
58 | stand ready to extend this provision to those domains in future versions
59 | of the GPL, as needed to protect the freedom of users.
60 |
61 | Finally, every program is threatened constantly by software patents.
62 | States should not allow patents to restrict development and use of
63 | software on general-purpose computers, but in those that do, we wish to
64 | avoid the special danger that patents applied to a free program could
65 | make it effectively proprietary. To prevent this, the GPL assures that
66 | patents cannot be used to render the program non-free.
67 |
68 | The precise terms and conditions for copying, distribution and
69 | modification follow.
70 |
71 | TERMS AND CONDITIONS
72 |
73 | 0. Definitions.
74 |
75 | "This License" refers to version 3 of the GNU General Public License.
76 |
77 | "Copyright" also means copyright-like laws that apply to other kinds of
78 | works, such as semiconductor masks.
79 |
80 | "The Program" refers to any copyrightable work licensed under this
81 | License. Each licensee is addressed as "you". "Licensees" and
82 | "recipients" may be individuals or organizations.
83 |
84 | To "modify" a work means to copy from or adapt all or part of the work
85 | in a fashion requiring copyright permission, other than the making of an
86 | exact copy. The resulting work is called a "modified version" of the
87 | earlier work or a work "based on" the earlier work.
88 |
89 | A "covered work" means either the unmodified Program or a work based
90 | on the Program.
91 |
92 | To "propagate" a work means to do anything with it that, without
93 | permission, would make you directly or secondarily liable for
94 | infringement under applicable copyright law, except executing it on a
95 | computer or modifying a private copy. Propagation includes copying,
96 | distribution (with or without modification), making available to the
97 | public, and in some countries other activities as well.
98 |
99 | To "convey" a work means any kind of propagation that enables other
100 | parties to make or receive copies. Mere interaction with a user through
101 | a computer network, with no transfer of a copy, is not conveying.
102 |
103 | An interactive user interface displays "Appropriate Legal Notices"
104 | to the extent that it includes a convenient and prominently visible
105 | feature that (1) displays an appropriate copyright notice, and (2)
106 | tells the user that there is no warranty for the work (except to the
107 | extent that warranties are provided), that licensees may convey the
108 | work under this License, and how to view a copy of this License. If
109 | the interface presents a list of user commands or options, such as a
110 | menu, a prominent item in the list meets this criterion.
111 |
112 | 1. Source Code.
113 |
114 | The "source code" for a work means the preferred form of the work
115 | for making modifications to it. "Object code" means any non-source
116 | form of a work.
117 |
118 | A "Standard Interface" means an interface that either is an official
119 | standard defined by a recognized standards body, or, in the case of
120 | interfaces specified for a particular programming language, one that
121 | is widely used among developers working in that language.
122 |
123 | The "System Libraries" of an executable work include anything, other
124 | than the work as a whole, that (a) is included in the normal form of
125 | packaging a Major Component, but which is not part of that Major
126 | Component, and (b) serves only to enable use of the work with that
127 | Major Component, or to implement a Standard Interface for which an
128 | implementation is available to the public in source code form. A
129 | "Major Component", in this context, means a major essential component
130 | (kernel, window system, and so on) of the specific operating system
131 | (if any) on which the executable work runs, or a compiler used to
132 | produce the work, or an object code interpreter used to run it.
133 |
134 | The "Corresponding Source" for a work in object code form means all
135 | the source code needed to generate, install, and (for an executable
136 | work) run the object code and to modify the work, including scripts to
137 | control those activities. However, it does not include the work's
138 | System Libraries, or general-purpose tools or generally available free
139 | programs which are used unmodified in performing those activities but
140 | which are not part of the work. For example, Corresponding Source
141 | includes interface definition files associated with source files for
142 | the work, and the source code for shared libraries and dynamically
143 | linked subprograms that the work is specifically designed to require,
144 | such as by intimate data communication or control flow between those
145 | subprograms and other parts of the work.
146 |
147 | The Corresponding Source need not include anything that users
148 | can regenerate automatically from other parts of the Corresponding
149 | Source.
150 |
151 | The Corresponding Source for a work in source code form is that
152 | same work.
153 |
154 | 2. Basic Permissions.
155 |
156 | All rights granted under this License are granted for the term of
157 | copyright on the Program, and are irrevocable provided the stated
158 | conditions are met. This License explicitly affirms your unlimited
159 | permission to run the unmodified Program. The output from running a
160 | covered work is covered by this License only if the output, given its
161 | content, constitutes a covered work. This License acknowledges your
162 | rights of fair use or other equivalent, as provided by copyright law.
163 |
164 | You may make, run and propagate covered works that you do not
165 | convey, without conditions so long as your license otherwise remains
166 | in force. You may convey covered works to others for the sole purpose
167 | of having them make modifications exclusively for you, or provide you
168 | with facilities for running those works, provided that you comply with
169 | the terms of this License in conveying all material for which you do
170 | not control copyright. Those thus making or running the covered works
171 | for you must do so exclusively on your behalf, under your direction
172 | and control, on terms that prohibit them from making any copies of
173 | your copyrighted material outside their relationship with you.
174 |
175 | Conveying under any other circumstances is permitted solely under
176 | the conditions stated below. Sublicensing is not allowed; section 10
177 | makes it unnecessary.
178 |
179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
180 |
181 | No covered work shall be deemed part of an effective technological
182 | measure under any applicable law fulfilling obligations under article
183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
184 | similar laws prohibiting or restricting circumvention of such
185 | measures.
186 |
187 | When you convey a covered work, you waive any legal power to forbid
188 | circumvention of technological measures to the extent such circumvention
189 | is effected by exercising rights under this License with respect to
190 | the covered work, and you disclaim any intention to limit operation or
191 | modification of the work as a means of enforcing, against the work's
192 | users, your or third parties' legal rights to forbid circumvention of
193 | technological measures.
194 |
195 | 4. Conveying Verbatim Copies.
196 |
197 | You may convey verbatim copies of the Program's source code as you
198 | receive it, in any medium, provided that you conspicuously and
199 | appropriately publish on each copy an appropriate copyright notice;
200 | keep intact all notices stating that this License and any
201 | non-permissive terms added in accord with section 7 apply to the code;
202 | keep intact all notices of the absence of any warranty; and give all
203 | recipients a copy of this License along with the Program.
204 |
205 | You may charge any price or no price for each copy that you convey,
206 | and you may offer support or warranty protection for a fee.
207 |
208 | 5. Conveying Modified Source Versions.
209 |
210 | You may convey a work based on the Program, or the modifications to
211 | produce it from the Program, in the form of source code under the
212 | terms of section 4, provided that you also meet all of these conditions:
213 |
214 | a) The work must carry prominent notices stating that you modified
215 | it, and giving a relevant date.
216 |
217 | b) The work must carry prominent notices stating that it is
218 | released under this License and any conditions added under section
219 | 7. This requirement modifies the requirement in section 4 to
220 | "keep intact all notices".
221 |
222 | c) You must license the entire work, as a whole, under this
223 | License to anyone who comes into possession of a copy. This
224 | License will therefore apply, along with any applicable section 7
225 | additional terms, to the whole of the work, and all its parts,
226 | regardless of how they are packaged. This License gives no
227 | permission to license the work in any other way, but it does not
228 | invalidate such permission if you have separately received it.
229 |
230 | d) If the work has interactive user interfaces, each must display
231 | Appropriate Legal Notices; however, if the Program has interactive
232 | interfaces that do not display Appropriate Legal Notices, your
233 | work need not make them do so.
234 |
235 | A compilation of a covered work with other separate and independent
236 | works, which are not by their nature extensions of the covered work,
237 | and which are not combined with it such as to form a larger program,
238 | in or on a volume of a storage or distribution medium, is called an
239 | "aggregate" if the compilation and its resulting copyright are not
240 | used to limit the access or legal rights of the compilation's users
241 | beyond what the individual works permit. Inclusion of a covered work
242 | in an aggregate does not cause this License to apply to the other
243 | parts of the aggregate.
244 |
245 | 6. Conveying Non-Source Forms.
246 |
247 | You may convey a covered work in object code form under the terms
248 | of sections 4 and 5, provided that you also convey the
249 | machine-readable Corresponding Source under the terms of this License,
250 | in one of these ways:
251 |
252 | a) Convey the object code in, or embodied in, a physical product
253 | (including a physical distribution medium), accompanied by the
254 | Corresponding Source fixed on a durable physical medium
255 | customarily used for software interchange.
256 |
257 | b) Convey the object code in, or embodied in, a physical product
258 | (including a physical distribution medium), accompanied by a
259 | written offer, valid for at least three years and valid for as
260 | long as you offer spare parts or customer support for that product
261 | model, to give anyone who possesses the object code either (1) a
262 | copy of the Corresponding Source for all the software in the
263 | product that is covered by this License, on a durable physical
264 | medium customarily used for software interchange, for a price no
265 | more than your reasonable cost of physically performing this
266 | conveying of source, or (2) access to copy the
267 | Corresponding Source from a network server at no charge.
268 |
269 | c) Convey individual copies of the object code with a copy of the
270 | written offer to provide the Corresponding Source. This
271 | alternative is allowed only occasionally and noncommercially, and
272 | only if you received the object code with such an offer, in accord
273 | with subsection 6b.
274 |
275 | d) Convey the object code by offering access from a designated
276 | place (gratis or for a charge), and offer equivalent access to the
277 | Corresponding Source in the same way through the same place at no
278 | further charge. You need not require recipients to copy the
279 | Corresponding Source along with the object code. If the place to
280 | copy the object code is a network server, the Corresponding Source
281 | may be on a different server (operated by you or a third party)
282 | that supports equivalent copying facilities, provided you maintain
283 | clear directions next to the object code saying where to find the
284 | Corresponding Source. Regardless of what server hosts the
285 | Corresponding Source, you remain obligated to ensure that it is
286 | available for as long as needed to satisfy these requirements.
287 |
288 | e) Convey the object code using peer-to-peer transmission, provided
289 | you inform other peers where the object code and Corresponding
290 | Source of the work are being offered to the general public at no
291 | charge under subsection 6d.
292 |
293 | A separable portion of the object code, whose source code is excluded
294 | from the Corresponding Source as a System Library, need not be
295 | included in conveying the object code work.
296 |
297 | A "User Product" is either (1) a "consumer product", which means any
298 | tangible personal property which is normally used for personal, family,
299 | or household purposes, or (2) anything designed or sold for incorporation
300 | into a dwelling. In determining whether a product is a consumer product,
301 | doubtful cases shall be resolved in favor of coverage. For a particular
302 | product received by a particular user, "normally used" refers to a
303 | typical or common use of that class of product, regardless of the status
304 | of the particular user or of the way in which the particular user
305 | actually uses, or expects or is expected to use, the product. A product
306 | is a consumer product regardless of whether the product has substantial
307 | commercial, industrial or non-consumer uses, unless such uses represent
308 | the only significant mode of use of the product.
309 |
310 | "Installation Information" for a User Product means any methods,
311 | procedures, authorization keys, or other information required to install
312 | and execute modified versions of a covered work in that User Product from
313 | a modified version of its Corresponding Source. The information must
314 | suffice to ensure that the continued functioning of the modified object
315 | code is in no case prevented or interfered with solely because
316 | modification has been made.
317 |
318 | If you convey an object code work under this section in, or with, or
319 | specifically for use in, a User Product, and the conveying occurs as
320 | part of a transaction in which the right of possession and use of the
321 | User Product is transferred to the recipient in perpetuity or for a
322 | fixed term (regardless of how the transaction is characterized), the
323 | Corresponding Source conveyed under this section must be accompanied
324 | by the Installation Information. But this requirement does not apply
325 | if neither you nor any third party retains the ability to install
326 | modified object code on the User Product (for example, the work has
327 | been installed in ROM).
328 |
329 | The requirement to provide Installation Information does not include a
330 | requirement to continue to provide support service, warranty, or updates
331 | for a work that has been modified or installed by the recipient, or for
332 | the User Product in which it has been modified or installed. Access to a
333 | network may be denied when the modification itself materially and
334 | adversely affects the operation of the network or violates the rules and
335 | protocols for communication across the network.
336 |
337 | Corresponding Source conveyed, and Installation Information provided,
338 | in accord with this section must be in a format that is publicly
339 | documented (and with an implementation available to the public in
340 | source code form), and must require no special password or key for
341 | unpacking, reading or copying.
342 |
343 | 7. Additional Terms.
344 |
345 | "Additional permissions" are terms that supplement the terms of this
346 | License by making exceptions from one or more of its conditions.
347 | Additional permissions that are applicable to the entire Program shall
348 | be treated as though they were included in this License, to the extent
349 | that they are valid under applicable law. If additional permissions
350 | apply only to part of the Program, that part may be used separately
351 | under those permissions, but the entire Program remains governed by
352 | this License without regard to the additional permissions.
353 |
354 | When you convey a copy of a covered work, you may at your option
355 | remove any additional permissions from that copy, or from any part of
356 | it. (Additional permissions may be written to require their own
357 | removal in certain cases when you modify the work.) You may place
358 | additional permissions on material, added by you to a covered work,
359 | for which you have or can give appropriate copyright permission.
360 |
361 | Notwithstanding any other provision of this License, for material you
362 | add to a covered work, you may (if authorized by the copyright holders of
363 | that material) supplement the terms of this License with terms:
364 |
365 | a) Disclaiming warranty or limiting liability differently from the
366 | terms of sections 15 and 16 of this License; or
367 |
368 | b) Requiring preservation of specified reasonable legal notices or
369 | author attributions in that material or in the Appropriate Legal
370 | Notices displayed by works containing it; or
371 |
372 | c) Prohibiting misrepresentation of the origin of that material, or
373 | requiring that modified versions of such material be marked in
374 | reasonable ways as different from the original version; or
375 |
376 | d) Limiting the use for publicity purposes of names of licensors or
377 | authors of the material; or
378 |
379 | e) Declining to grant rights under trademark law for use of some
380 | trade names, trademarks, or service marks; or
381 |
382 | f) Requiring indemnification of licensors and authors of that
383 | material by anyone who conveys the material (or modified versions of
384 | it) with contractual assumptions of liability to the recipient, for
385 | any liability that these contractual assumptions directly impose on
386 | those licensors and authors.
387 |
388 | All other non-permissive additional terms are considered "further
389 | restrictions" within the meaning of section 10. If the Program as you
390 | received it, or any part of it, contains a notice stating that it is
391 | governed by this License along with a term that is a further
392 | restriction, you may remove that term. If a license document contains
393 | a further restriction but permits relicensing or conveying under this
394 | License, you may add to a covered work material governed by the terms
395 | of that license document, provided that the further restriction does
396 | not survive such relicensing or conveying.
397 |
398 | If you add terms to a covered work in accord with this section, you
399 | must place, in the relevant source files, a statement of the
400 | additional terms that apply to those files, or a notice indicating
401 | where to find the applicable terms.
402 |
403 | Additional terms, permissive or non-permissive, may be stated in the
404 | form of a separately written license, or stated as exceptions;
405 | the above requirements apply either way.
406 |
407 | 8. Termination.
408 |
409 | You may not propagate or modify a covered work except as expressly
410 | provided under this License. Any attempt otherwise to propagate or
411 | modify it is void, and will automatically terminate your rights under
412 | this License (including any patent licenses granted under the third
413 | paragraph of section 11).
414 |
415 | However, if you cease all violation of this License, then your
416 | license from a particular copyright holder is reinstated (a)
417 | provisionally, unless and until the copyright holder explicitly and
418 | finally terminates your license, and (b) permanently, if the copyright
419 | holder fails to notify you of the violation by some reasonable means
420 | prior to 60 days after the cessation.
421 |
422 | Moreover, your license from a particular copyright holder is
423 | reinstated permanently if the copyright holder notifies you of the
424 | violation by some reasonable means, this is the first time you have
425 | received notice of violation of this License (for any work) from that
426 | copyright holder, and you cure the violation prior to 30 days after
427 | your receipt of the notice.
428 |
429 | Termination of your rights under this section does not terminate the
430 | licenses of parties who have received copies or rights from you under
431 | this License. If your rights have been terminated and not permanently
432 | reinstated, you do not qualify to receive new licenses for the same
433 | material under section 10.
434 |
435 | 9. Acceptance Not Required for Having Copies.
436 |
437 | You are not required to accept this License in order to receive or
438 | run a copy of the Program. Ancillary propagation of a covered work
439 | occurring solely as a consequence of using peer-to-peer transmission
440 | to receive a copy likewise does not require acceptance. However,
441 | nothing other than this License grants you permission to propagate or
442 | modify any covered work. These actions infringe copyright if you do
443 | not accept this License. Therefore, by modifying or propagating a
444 | covered work, you indicate your acceptance of this License to do so.
445 |
446 | 10. Automatic Licensing of Downstream Recipients.
447 |
448 | Each time you convey a covered work, the recipient automatically
449 | receives a license from the original licensors, to run, modify and
450 | propagate that work, subject to this License. You are not responsible
451 | for enforcing compliance by third parties with this License.
452 |
453 | An "entity transaction" is a transaction transferring control of an
454 | organization, or substantially all assets of one, or subdividing an
455 | organization, or merging organizations. If propagation of a covered
456 | work results from an entity transaction, each party to that
457 | transaction who receives a copy of the work also receives whatever
458 | licenses to the work the party's predecessor in interest had or could
459 | give under the previous paragraph, plus a right to possession of the
460 | Corresponding Source of the work from the predecessor in interest, if
461 | the predecessor has it or can get it with reasonable efforts.
462 |
463 | You may not impose any further restrictions on the exercise of the
464 | rights granted or affirmed under this License. For example, you may
465 | not impose a license fee, royalty, or other charge for exercise of
466 | rights granted under this License, and you may not initiate litigation
467 | (including a cross-claim or counterclaim in a lawsuit) alleging that
468 | any patent claim is infringed by making, using, selling, offering for
469 | sale, or importing the Program or any portion of it.
470 |
471 | 11. Patents.
472 |
473 | A "contributor" is a copyright holder who authorizes use under this
474 | License of the Program or a work on which the Program is based. The
475 | work thus licensed is called the contributor's "contributor version".
476 |
477 | A contributor's "essential patent claims" are all patent claims
478 | owned or controlled by the contributor, whether already acquired or
479 | hereafter acquired, that would be infringed by some manner, permitted
480 | by this License, of making, using, or selling its contributor version,
481 | but do not include claims that would be infringed only as a
482 | consequence of further modification of the contributor version. For
483 | purposes of this definition, "control" includes the right to grant
484 | patent sublicenses in a manner consistent with the requirements of
485 | this License.
486 |
487 | Each contributor grants you a non-exclusive, worldwide, royalty-free
488 | patent license under the contributor's essential patent claims, to
489 | make, use, sell, offer for sale, import and otherwise run, modify and
490 | propagate the contents of its contributor version.
491 |
492 | In the following three paragraphs, a "patent license" is any express
493 | agreement or commitment, however denominated, not to enforce a patent
494 | (such as an express permission to practice a patent or covenant not to
495 | sue for patent infringement). To "grant" such a patent license to a
496 | party means to make such an agreement or commitment not to enforce a
497 | patent against the party.
498 |
499 | If you convey a covered work, knowingly relying on a patent license,
500 | and the Corresponding Source of the work is not available for anyone
501 | to copy, free of charge and under the terms of this License, through a
502 | publicly available network server or other readily accessible means,
503 | then you must either (1) cause the Corresponding Source to be so
504 | available, or (2) arrange to deprive yourself of the benefit of the
505 | patent license for this particular work, or (3) arrange, in a manner
506 | consistent with the requirements of this License, to extend the patent
507 | license to downstream recipients. "Knowingly relying" means you have
508 | actual knowledge that, but for the patent license, your conveying the
509 | covered work in a country, or your recipient's use of the covered work
510 | in a country, would infringe one or more identifiable patents in that
511 | country that you have reason to believe are valid.
512 |
513 | If, pursuant to or in connection with a single transaction or
514 | arrangement, you convey, or propagate by procuring conveyance of, a
515 | covered work, and grant a patent license to some of the parties
516 | receiving the covered work authorizing them to use, propagate, modify
517 | or convey a specific copy of the covered work, then the patent license
518 | you grant is automatically extended to all recipients of the covered
519 | work and works based on it.
520 |
521 | A patent license is "discriminatory" if it does not include within
522 | the scope of its coverage, prohibits the exercise of, or is
523 | conditioned on the non-exercise of one or more of the rights that are
524 | specifically granted under this License. You may not convey a covered
525 | work if you are a party to an arrangement with a third party that is
526 | in the business of distributing software, under which you make payment
527 | to the third party based on the extent of your activity of conveying
528 | the work, and under which the third party grants, to any of the
529 | parties who would receive the covered work from you, a discriminatory
530 | patent license (a) in connection with copies of the covered work
531 | conveyed by you (or copies made from those copies), or (b) primarily
532 | for and in connection with specific products or compilations that
533 | contain the covered work, unless you entered into that arrangement,
534 | or that patent license was granted, prior to 28 March 2007.
535 |
536 | Nothing in this License shall be construed as excluding or limiting
537 | any implied license or other defenses to infringement that may
538 | otherwise be available to you under applicable patent law.
539 |
540 | 12. No Surrender of Others' Freedom.
541 |
542 | If conditions are imposed on you (whether by court order, agreement or
543 | otherwise) that contradict the conditions of this License, they do not
544 | excuse you from the conditions of this License. If you cannot convey a
545 | covered work so as to satisfy simultaneously your obligations under this
546 | License and any other pertinent obligations, then as a consequence you may
547 | not convey it at all. For example, if you agree to terms that obligate you
548 | to collect a royalty for further conveying from those to whom you convey
549 | the Program, the only way you could satisfy both those terms and this
550 | License would be to refrain entirely from conveying the Program.
551 |
552 | 13. Use with the GNU Affero General Public License.
553 |
554 | Notwithstanding any other provision of this License, you have
555 | permission to link or combine any covered work with a work licensed
556 | under version 3 of the GNU Affero General Public License into a single
557 | combined work, and to convey the resulting work. The terms of this
558 | License will continue to apply to the part which is the covered work,
559 | but the special requirements of the GNU Affero General Public License,
560 | section 13, concerning interaction through a network will apply to the
561 | combination as such.
562 |
563 | 14. Revised Versions of this License.
564 |
565 | The Free Software Foundation may publish revised and/or new versions of
566 | the GNU General Public License from time to time. Such new versions will
567 | be similar in spirit to the present version, but may differ in detail to
568 | address new problems or concerns.
569 |
570 | Each version is given a distinguishing version number. If the
571 | Program specifies that a certain numbered version of the GNU General
572 | Public License "or any later version" applies to it, you have the
573 | option of following the terms and conditions either of that numbered
574 | version or of any later version published by the Free Software
575 | Foundation. If the Program does not specify a version number of the
576 | GNU General Public License, you may choose any version ever published
577 | by the Free Software Foundation.
578 |
579 | If the Program specifies that a proxy can decide which future
580 | versions of the GNU General Public License can be used, that proxy's
581 | public statement of acceptance of a version permanently authorizes you
582 | to choose that version for the Program.
583 |
584 | Later license versions may give you additional or different
585 | permissions. However, no additional obligations are imposed on any
586 | author or copyright holder as a result of your choosing to follow a
587 | later version.
588 |
589 | 15. Disclaimer of Warranty.
590 |
591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
599 |
600 | 16. Limitation of Liability.
601 |
602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
610 | SUCH DAMAGES.
611 |
612 | 17. Interpretation of Sections 15 and 16.
613 |
614 | If the disclaimer of warranty and limitation of liability provided
615 | above cannot be given local legal effect according to their terms,
616 | reviewing courts shall apply local law that most closely approximates
617 | an absolute waiver of all civil liability in connection with the
618 | Program, unless a warranty or assumption of liability accompanies a
619 | copy of the Program in return for a fee.
620 |
--------------------------------------------------------------------------------
/code/dmbvs.c:
--------------------------------------------------------------------------------
1 | // *********************************************************************** //
2 | // MCMC code implementing integrative Bayesian variable selection for the //
3 | // Dirichlet-Multinomial regression model with application to the analysis //
4 | // of taxonomic abundances in microbiome data //
5 | // *********************************************************************** //
6 | //
7 | // Copyright (C) 2016, Raffaele Argiento and W. Duncan Wadsworth with contributions
8 | // from Michele Guindani
9 | //
10 | // This program is free software: you can redistribute it and/or modify
11 | // it under the terms of the GNU General Public License as published by
12 | // the Free Software Foundation, either version 3 of the License, or
13 | // (at your option) any later version.
14 | //
15 | // This program is distributed in the hope that it will be useful,
16 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | // GNU General Public License for more details.
19 | //
20 | // You should have received a copy of the GNU General Public License
21 | // along with this program. If not, see .
22 | //
23 | // *********************************************************************** //
24 | //
25 | // This version has been updated on October 2, 2017 with an implementation
26 | // of a stochastic search algorithm for variable selection in addition to the
27 | // Gibbs sampler
28 | //
29 | // standard includes
30 | #include
31 | #include
32 | #include
33 | #include
34 | #include
35 | // GSL includes
36 | #include
37 | #include // has both the lngamma and lnbeta functions
38 | #include
39 | #include
40 |
41 | // define the RNG globally
42 | gsl_rng *rando;
43 |
44 | // data dimensions required in the function prototypes below
45 | static int n_cats;
46 | static int n_obs;
47 | static int n_vars;
48 |
49 | // ----- Function declarations ----- //
50 | // adaptive MH helper functions
51 | double online_mean(int iteration, double last_mean, double curr_obs);
52 | double online_var(int iteration, double last_mean, double last_var,
53 | double curr_mean, double curr_obs);
54 | double adap_prop(double curr_var);
55 | // log Beta-Bernoulli spike-and-slab prior evaluation
56 | double lprior_bbsas(double betajk, int sjk, double sig_bejk, double mu_bejk,
57 | double aa_hp, double bb_hp);
58 | // calculates the linear predictor for each category of the Dirichlet-Multinomial
59 | double calculate_gamma(double **XX, double *alpha, double *beta, int jj,
60 | int ii, int Log);
61 | // MH update for the regression parameters, Gibbs version
62 | void update_beta_jj(double **XX, double **JJ, double **loggamma,
63 | double *beta_temp, int *inclusion_indicator,
64 | double *prop_per_beta, double mu_be[n_cats][n_vars],
65 | double sig_be[n_cats][n_vars], double aa_hp, double bb_hp,
66 | int jj);
67 | // MH update for the regression parameters, stochastic search version
68 | void update_beta_jj_ss(double **XX, double **JJ, double **loggamma,
69 | double *beta_temp, int *inclusion_indicator,
70 | double *prop_per_beta, double mu_be[n_cats][n_vars],
71 | double sig_be[n_cats][n_vars], double aa_hp, double bb_hp,
72 | int jj);
73 | // MH update for the intercept parameters
74 | void update_alpha_jj(double **JJ, double **loggamma, double *alpha,
75 | double *prop_per_alpha, int *accepted_alpha_flag,
76 | double mu_al[n_cats], double sig_al[n_cats], int jj);
77 | // for the Savitsky et al. inclusion proposal step
78 | void between_models_jj(double **XX, double **JJ, double **loggamma,
79 | double *beta_temp, int *accepted_beta_flag,
80 | int *inclusion_indicator, double mu_be[n_cats][n_vars],
81 | double sig_be[n_cats][n_vars], double aa_hp, double bb_hp,
82 | int jj);
83 | // Principal function using Gibbs variable selection
84 | void dmbvs_gibbs(double **XX, int **YY, double *alpha, double *beta,
85 | double mu_al[n_cats], double sig_al[n_cats],
86 | double mu_be[n_cats][n_vars], double sig_be[n_cats][n_vars],
87 | double aa_hp, double bb_hp, double *prop_per_alpha,
88 | double *prop_per_beta, const int GG, const int burn,
89 | const int thin, int Log, char *temp_dir);
90 | // Principal function using stochastic search variable selection
91 | void dmbvs_ss(double **XX, int **YY, double *alpha, double *beta,
92 | double mu_al[n_cats], double sig_al[n_cats],
93 | double mu_be[n_cats][n_vars], double sig_be[n_cats][n_vars],
94 | double aa_hp, double bb_hp, double *prop_per_alpha,
95 | double *prop_per_beta, const int GG, const int burn,
96 | const int thin, int Log, char *temp_dir);
97 |
98 | // -------------------------------- //
99 | int main(int argc, char *argv[]){
100 |
101 | // command line argument parsing
102 | if(argc != 14){
103 | fprintf(stderr, "Usage: %s \n \n \n", argv[0]);
104 | exit(EXIT_FAILURE);
105 | }
106 | const int GG = atoi(argv[1]);
107 | const int thin = atoi(argv[2]);
108 | const int burn = atoi(argv[3]);
109 | double alpha_variance = atof(argv[4]);
110 | double slab_variance = atof(argv[5]);
111 | double aa_hp = atof(argv[6]);
112 | double bb_hp = atof(argv[7]);
113 | n_cats = atoi(argv[8]);
114 | n_obs = atoi(argv[9]);
115 | n_vars = atoi(argv[10]);
116 | long int external_seed = atoi(argv[11]);
117 | char *temp_dir = argv[12];
118 | int selection_type = atoi(argv[13]);
119 |
120 | // hyperparameters and Metropolis-Hastings proposal parameters
121 | const double mean_alpha = 0.0;
122 | const double mean_slab = 0.0;
123 |
124 | printf("***********************************************************\n");
125 | printf("********************* Starting dmbvs **********************\n");
126 | // outfile info
127 | printf("***********************************************************\n");
128 | printf("*************************** MCMC **************************\n");
129 | printf("Total iterations = %i, Thinning = %i, Burnin = %i\n", GG, thin, burn);
130 | // random number generator initialization
131 | const gsl_rng_type * Tipo;
132 | Tipo = gsl_rng_default;
133 | rando = gsl_rng_alloc(Tipo);
134 | gsl_rng_set(rando, external_seed);
135 | printf("External seed = %lu, GSL seed = %lu, Generator type: %s\n",
136 | external_seed, gsl_rng_get(rando), gsl_rng_name(rando));
137 | printf("Text output deposited in directory:\n %s\n", temp_dir);
138 | printf("***********************************************************\n");
139 | printf("*************************** Data **************************\n");
140 | printf("# of categories = %i, # of observations = %i, # of covariates = %i\n",
141 | n_cats, n_obs, n_vars);
142 | printf("***********************************************************\n");
143 | printf("********************* Hyperparameters *********************\n");
144 | printf("Slab variance = %0.1f, Intercept variance = %0.1f\n", slab_variance,
145 | alpha_variance);
146 | printf("Beta-Bernoulli alpha = %0.2f, Beta-Bernoulli beta = %0.2f\n", aa_hp,
147 | bb_hp);
148 | printf("***********************************************************\n");
149 | printf("***********************************************************\n");
150 |
151 | ///////////////////////////////////////////////////
152 | // Indices (shifted right one)
153 | // ii: index of the observations in 1, ..., n_obs
154 | // jj: index of the categories in 1, ..., n_cats
155 | // kk: index of the covariates in 1, ..., n_vars
156 | // hh: index of the beta vector in 1, ..., n_vars * n_cats, is collapsed
157 | // row-wise so that the 2x2 identity matrix becomes (1, 0, 0, 1)
158 | int ii, jj, kk, hh;
159 |
160 | // Larger objects such as data and initializations are read from files
161 |
162 | // single file pointer for all files
163 | FILE *fin;
164 |
165 | // reading in YY: the sample-by-category count matrix
166 | // all other files are read in in the same way
167 | char td_YY[200];
168 | strcpy(td_YY, temp_dir);
169 | strcat(td_YY, "/count_matrix.txt");
170 | fin = fopen(td_YY, "r");
171 | // dynamic allocation for YY for an array of pointers
172 | int **YY;
173 | YY = (int **) malloc(n_obs * sizeof(int *));
174 | for(ii = 0 ; ii < n_obs ; ii++){
175 | // allocate each row of the matrix YY
176 | YY[ii] = (int *) malloc(n_cats * sizeof(int));
177 | // fill in each row of the matrix YY
178 | for(jj = 0 ; jj < n_cats ; jj++){
179 | fscanf(fin, "%u", &YY[ii][jj]);
180 | }
181 | }
182 |
183 | // reading in XX: the sample-by-covariate matrix
184 | char td_XX[200];
185 | strcpy(td_XX, temp_dir);
186 | strcat(td_XX, "/covariates.txt");
187 | fin = fopen(td_XX, "r");
188 | double **XX;
189 | XX = (double **) malloc(n_obs * sizeof(double *));
190 | for(ii = 0 ; ii < n_obs ; ii++){
191 | XX[ii] = (double *) malloc(n_vars * sizeof(double));
192 | for(jj = 0 ; jj < n_vars ; jj++){
193 | fscanf(fin, "%lf", &XX[ii][jj]);
194 | }
195 | }
196 |
197 | ///////// Initializations from text files
198 |
199 | // initialization for beta
200 | char td_ib[200];
201 | strcpy(td_ib, temp_dir);
202 | strcat(td_ib, "/init_beta.txt");
203 | fin = fopen(td_ib, "r");
204 | double *beta;
205 | beta = malloc((n_vars * n_cats) * sizeof(double));
206 | for(hh = 0 ; hh < (n_vars * n_cats) ; hh++){
207 | fscanf(fin, "%lf", &beta[hh]);
208 | }
209 |
210 | // initialization for alpha
211 | char td_ia[200];
212 | strcpy(td_ia, temp_dir);
213 | strcat(td_ia, "/init_alpha.txt");
214 | fin = fopen(td_ia, "r");
215 | double *alpha;
216 | alpha = malloc(n_cats * sizeof(double));
217 | for(jj = 0 ; jj < n_cats ; jj++){
218 | fscanf(fin, "%lf", &alpha[jj]);
219 | }
220 |
221 | // for the beta proposal variance
222 | char td_pb[200];
223 | strcpy(td_pb, temp_dir);
224 | strcat(td_pb, "/proposal_beta.txt");
225 | fin = fopen(td_pb, "r");
226 | double *prop_per_beta;
227 | prop_per_beta = malloc((n_vars * n_cats) * sizeof(double));
228 | for(hh = 0 ; hh < (n_vars * n_cats) ; hh++){
229 | fscanf(fin, "%lf", &prop_per_beta[hh]);
230 | }
231 |
232 | // for the alpha proposal variance
233 | char td_pa[200];
234 | strcpy(td_pa, temp_dir);
235 | strcat(td_pa, "/proposal_alpha.txt");
236 | fin = fopen(td_pa, "r");
237 | double *prop_per_alpha;
238 | prop_per_alpha = malloc(n_cats * sizeof(double));
239 | for(jj = 0 ; jj < n_cats ; jj++){
240 | fscanf(fin, "%lf", &prop_per_alpha[jj]);
241 | }
242 |
243 | // mean and standard deviation of the independent normal priors on alpha and beta
244 | double mu_al[n_cats], sig_al[n_cats];
245 | double mu_be[n_cats][n_vars], sig_be[n_cats][n_vars];
246 | for(jj = 0 ; jj < n_cats ; jj++){
247 | mu_al[jj] = mean_alpha;
248 | sig_al[jj] = alpha_variance;
249 | for(kk = 0 ; kk < n_vars ; kk++){
250 | mu_be[jj][kk] = mean_slab;
251 | sig_be[jj][kk] = slab_variance;
252 | }
253 | }
254 |
255 | // a flag for functions that can calculate on the log scale
256 | int Log = 1;
257 |
258 | ///////// Initialization finished: commence the MCMC
259 |
260 | if(selection_type == 0){
261 | dmbvs_ss(XX, YY, alpha, beta, mu_al, sig_al, mu_be, sig_be, aa_hp, bb_hp,
262 | prop_per_alpha, prop_per_beta, GG, burn, thin, Log, temp_dir);
263 | }
264 | else if(selection_type == 1){
265 | dmbvs_gibbs(XX, YY, alpha, beta, mu_al, sig_al, mu_be, sig_be, aa_hp, bb_hp,
266 | prop_per_alpha, prop_per_beta, GG, burn, thin, Log, temp_dir);
267 | }
268 | else{
269 | printf("Unrecognized variable selection argument\n");
270 | }
271 |
272 |
273 | // cleanup (kind of pedantic)
274 | fclose(fin);
275 | free(YY);
276 | free(XX);
277 | free(beta);
278 | free(alpha);
279 | free(prop_per_alpha);
280 | free(prop_per_beta);
281 | gsl_rng_free(rando);
282 |
283 | return(0);
284 |
285 | } // end of main()
286 |
287 | //////////////////////////////////////
288 | // ----- Function definitions ----- //
289 | //////////////////////////////////////
290 |
291 | // Gibbs Sampler function
292 | void dmbvs_ss(double **XX, int **YY, double *alpha, double *beta,
293 | double mu_al[n_cats], double sig_al[n_cats],
294 | double mu_be[n_cats][n_vars], double sig_be[n_cats][n_vars],
295 | double aa_hp, double bb_hp, double *prop_per_alpha,
296 | double *prop_per_beta, const int GG, const int burn,
297 | const int thin, int Log, char *temp_dir){
298 |
299 | // loop indices
300 | int hh, ii, jj, kk, gg;
301 |
302 | // for timing iterations
303 | time_t mytime;
304 |
305 | // adaptive proposal values
306 | double last_mean;
307 | double last_var;
308 |
309 | ///////// Initialize variables that don't require input or can be obtained
310 | ///////// from inputs
311 |
312 | // for MH acceptance/rejection ratios, initialized with zeros
313 | int *accepted_beta_flag;
314 | accepted_beta_flag = malloc((n_vars * n_cats) * sizeof(int));
315 | int *accepted_beta;
316 | accepted_beta = malloc((n_vars * n_cats) * sizeof(int));
317 | for(hh = 0 ; hh < (n_vars * n_cats) ; hh++){
318 | accepted_beta_flag[hh] = 0;
319 | accepted_beta[hh] = 0;
320 | }
321 | int *accepted_alpha_flag;
322 | accepted_alpha_flag = malloc(n_cats * sizeof(int));
323 | int *accepted_alpha;
324 | accepted_alpha = malloc(n_cats * sizeof(int));
325 | for(hh = 0 ; hh < n_cats ; hh++){
326 | accepted_alpha_flag[hh] = 0;
327 | accepted_alpha[hh] = 0;
328 | }
329 | // for adaptive MH need to keep track of target density mean and variance
330 | double *curr_mean;
331 | curr_mean = malloc((n_vars * n_cats) * sizeof(double));
332 | double *curr_var;
333 | curr_var = malloc((n_vars * n_cats) * sizeof(double));
334 | for(hh = 0 ; hh < (n_vars * n_cats) ; hh++){
335 | curr_mean[hh] = 0.0;
336 | curr_var[hh] = 0.5;
337 | }
338 |
339 | // variable inclusion indicator
340 | int *inclusion_indicator;
341 | inclusion_indicator = malloc((n_vars * n_cats) * sizeof(int));
342 | for(hh = 0 ; hh < (n_vars * n_cats) ; hh++){
343 | if(beta[hh] == 0){
344 | inclusion_indicator[hh] = 0;
345 | }else{
346 | inclusion_indicator[hh] = 1;
347 | }
348 | }
349 |
350 | // row sums of YY
351 | int *Ypiu;
352 | Ypiu = malloc(n_obs * sizeof(int));
353 | for(ii = 0 ; ii < n_obs ; ii++){
354 | for(jj = 0 ; jj < n_cats ; jj++){
355 | Ypiu[ii] = Ypiu[ii] + YY[ii][jj];
356 | }
357 | }
358 |
359 | // the beta_temp vector is an n_vars long vector used for updating each of the
360 | // jj-th rows in Beta corresponding the jj-th category in YY
361 | double *beta_temp;
362 | beta_temp = malloc(n_vars * sizeof(double));
363 |
364 | // the linear predictor matrix, represented as a vector, is in the log scale
365 | double **loggamma;
366 | loggamma = malloc(n_obs * sizeof(double *));
367 | for(ii = 0 ; ii < n_obs ; ii++){
368 | loggamma[ii] = malloc(n_cats * sizeof(double));
369 | for(jj = 0 ; jj < n_cats ; jj++){
370 | loggamma[ii][jj] = calculate_gamma(XX, alpha, beta, jj, ii, Log);
371 | }
372 | }
373 |
374 | // initialize latent variable: JJ ~ gamma()
375 | double **JJ;
376 | JJ = malloc(n_obs * sizeof(double *));
377 | // TT is the vector of normalization constants
378 | double *TT;
379 | TT = malloc(n_obs * sizeof(double *));
380 | for(ii = 0 ; ii < n_obs ; ii++){
381 | JJ[ii] = malloc(n_cats * sizeof(double));
382 | TT[ii] = 0.0;
383 | for(jj = 0 ; jj < n_cats ; jj++){
384 | JJ[ii][jj] = (double)YY[ii][jj];
385 | // a very small number
386 | if(JJ[ii][jj] < pow(10.0, -100.0)){
387 | JJ[ii][jj] = pow(10.0, -100.0);
388 | }
389 | TT[ii] = TT[ii] + JJ[ii][jj];
390 | }
391 | }
392 |
393 | // initialize the other clever latent variable: uu
394 | // note that this uses the data to initialize
395 | double *uu;
396 | uu = malloc(n_obs * sizeof(double));
397 | for(ii = 0 ; ii < n_obs ; ii++){
398 | uu[ii] = gsl_ran_gamma(rando, Ypiu[ii], 1.0/TT[ii]);
399 | }
400 |
401 | // output file pointers
402 | FILE *fout_alpha, *fout_beta, *fout_alpha_acceptance, *fout_beta_acceptance, *fout_beta_proposal;
403 |
404 | // this is stupid, I'm sure there's a better way
405 | char tf_alpha[200];
406 | strcpy(tf_alpha, temp_dir);
407 | strcat(tf_alpha, "/alpha.out");
408 | fout_alpha = fopen(tf_alpha, "w");
409 | char tf_alpha_acceptance[200];
410 | strcpy(tf_alpha_acceptance, temp_dir);
411 | strcat(tf_alpha_acceptance, "/alpha_acceptance.out");
412 | fout_alpha_acceptance = fopen(tf_alpha_acceptance, "w");
413 | char tf_beta[200];
414 | strcpy(tf_beta, temp_dir);
415 | strcat(tf_beta, "/beta.out");
416 | fout_beta = fopen(tf_beta, "w");
417 | char tf_beta_acceptance[200];
418 | strcpy(tf_beta_acceptance, temp_dir);
419 | strcat(tf_beta_acceptance, "/beta_acceptance.out");
420 | fout_beta_acceptance = fopen(tf_beta_acceptance, "w");
421 | char tf_beta_proposal[200];
422 | strcpy(tf_beta_proposal, temp_dir);
423 | strcat(tf_beta_proposal, "/beta_proposal.out");
424 | fout_beta_proposal = fopen(tf_beta_proposal, "w");
425 |
426 | // the magic starts here
427 | printf("Starting the sampler\n");
428 |
429 | for(gg = 0 ; gg < GG ; gg++){
430 |
431 | // timing info to the printed output
432 | mytime = time(NULL);
433 | if(gg % 1000 == 0){
434 | printf("Starting the %uth iteration out of %u at %s", gg, GG, ctime(&mytime));
435 | }
436 |
437 | // first a round of the between-model step for every covariate within every taxa
438 | jj = floor(gsl_rng_uniform(rando) * n_cats);//for(jj = 0 ; jj < n_cats ; jj++){
439 |
440 | // fill in beta_temp
441 | for(kk = 0 ; kk < n_vars ; kk++){
442 | hh = kk + jj * n_vars;
443 | beta_temp[kk] = beta[hh];
444 | }
445 |
446 | between_models_jj(XX, JJ, loggamma, beta_temp, accepted_beta_flag,
447 | inclusion_indicator, mu_be, sig_be, aa_hp, bb_hp, jj);
448 |
449 | // update beta with beta_temp
450 | for(kk = 0 ; kk < n_vars ; kk++){
451 | hh = kk + jj * n_vars;
452 | beta[hh] = beta_temp[kk];
453 | }
454 | //}
455 |
456 | // now an accelerating within-model step
457 | for(jj = 0 ; jj < n_cats ; jj++){
458 |
459 | // fill in beta_temp and update the proposal variance
460 | for(kk = 0 ; kk < n_vars ; kk++){
461 | hh = kk + jj * n_vars;
462 | beta_temp[kk] = beta[hh];
463 | // wait until each parameter has had a few iterations
464 | if(gg > 2 * n_vars * n_cats){
465 | // calculate the online variance
466 | last_mean = curr_mean[hh];
467 | last_var = curr_var[hh];
468 | curr_mean[hh] = online_mean(gg, last_mean, beta[hh]);
469 | curr_var[hh] = online_var(gg, last_mean, last_var, curr_mean[hh], beta[hh]);
470 | // update proposal variance
471 | prop_per_beta[hh] = curr_var[hh];
472 | }
473 | }
474 |
475 | update_beta_jj_ss(XX, JJ, loggamma, beta_temp, inclusion_indicator,
476 | prop_per_beta, mu_be, sig_be, aa_hp, bb_hp, jj);
477 |
478 | // update beta with beta_temp and write to file
479 | for(kk = 0 ; kk < n_vars ; kk++){
480 | hh = kk + jj * n_vars;
481 | beta[hh] = beta_temp[kk];
482 | if((gg >= burn) & (gg % thin == 0)){
483 | accepted_beta[hh] = accepted_beta[hh] + accepted_beta_flag[hh];
484 | accepted_beta_flag[hh] = 0;
485 | fprintf(fout_beta, "%e ", beta[hh]); // space delimited output files
486 | }
487 | }
488 | }
489 | if((gg >= burn) & (gg % thin == 0)){fprintf(fout_beta, "\n");}
490 |
491 | // update alpha and write to file
492 | for(jj = 0 ; jj < n_cats ; jj++){
493 | update_alpha_jj(JJ, loggamma, alpha, prop_per_alpha, accepted_alpha_flag,
494 | mu_al, sig_al, jj);
495 | if((gg >= burn) & (gg % thin == 0)){
496 | accepted_alpha[jj] = accepted_alpha[jj] + accepted_alpha_flag[jj];
497 | accepted_alpha_flag[jj] = 0;
498 | fprintf(fout_alpha,"%e ", alpha[jj]);
499 | }
500 | }
501 | if((gg >= burn) & (gg % thin == 0)){fprintf(fout_alpha, "\n ");}
502 |
503 | // update JJ and consequently TT
504 | for(ii = 0 ; ii < n_obs ; ii++){
505 | TT[ii] = 0.0;
506 | for(jj = 0 ; jj < n_cats ; jj++){
507 | JJ[ii][jj] = gsl_ran_gamma(rando, YY[ii][jj] + exp(loggamma[ii][jj]), 1.0/(uu[ii] + 1.0));
508 | if(JJ[ii][jj] < pow(10.0, -100.0)){
509 | JJ[ii][jj] = pow(10.0, -100.0);
510 | }
511 | TT[ii] = TT[ii] + JJ[ii][jj];
512 | }
513 | }
514 |
515 | // update latent variables uu
516 | for(ii = 0 ; ii < n_obs ; ii++){
517 | uu[ii] = gsl_ran_gamma(rando, Ypiu[ii], 1.0/TT[ii]);
518 | }
519 |
520 | } // end of iterations
521 |
522 | // print acceptance ratio files
523 | for(jj = 0 ; jj < n_cats ; jj++){
524 | fprintf(fout_alpha_acceptance, "%f\n", (float)accepted_alpha[jj]/((GG - burn)/thin));
525 | for(kk = 0 ; kk < n_vars ; kk++){
526 | fprintf(fout_beta_acceptance, "%f\n", (float)accepted_beta[kk + jj * n_vars]/((GG - burn)/thin));
527 | fprintf(fout_beta_proposal, "%f\n", prop_per_beta[kk + jj * n_vars]);
528 | }
529 | }
530 |
531 | // cleanup (kind of pedantic)
532 | fclose(fout_alpha);
533 | fclose(fout_beta);
534 | fclose(fout_alpha_acceptance);
535 | fclose(fout_beta_acceptance);
536 | free(accepted_alpha);
537 | free(accepted_beta);
538 | free(accepted_alpha_flag);
539 | free(accepted_beta_flag);
540 | free(inclusion_indicator);
541 | free(beta_temp);
542 | free(loggamma);
543 | free(JJ);
544 | free(TT);
545 | free(uu);
546 | free(Ypiu);
547 |
548 | printf("Sampling finished!\n");
549 |
550 | return;
551 |
552 | } // close Gibbs sampler function
553 |
554 | void dmbvs_gibbs(double **XX, int **YY, double *alpha, double *beta,
555 | double mu_al[n_cats], double sig_al[n_cats],
556 | double mu_be[n_cats][n_vars], double sig_be[n_cats][n_vars],
557 | double aa_hp, double bb_hp, double *prop_per_alpha,
558 | double *prop_per_beta, const int GG, const int burn,
559 | const int thin, int Log, char *temp_dir){
560 |
561 | // loop indices
562 | int hh, ii, jj, kk, gg;
563 |
564 | // for timing iterations
565 | time_t mytime;
566 |
567 | // adaptive proposal values
568 | double last_mean;
569 | double last_var;
570 |
571 | ///////// Initialize variables that don't require input or can be obtained
572 | ///////// from inputs
573 |
574 | // for MH acceptance/rejection ratios, initialized with zeros
575 | int *accepted_beta_flag;
576 | accepted_beta_flag = malloc((n_vars * n_cats) * sizeof(int));
577 | int *accepted_beta;
578 | accepted_beta = malloc((n_vars * n_cats) * sizeof(int));
579 | for(hh = 0 ; hh < (n_vars * n_cats) ; hh++){
580 | accepted_beta_flag[hh] = 0;
581 | accepted_beta[hh] = 0;
582 | }
583 | int *accepted_alpha_flag;
584 | accepted_alpha_flag = malloc(n_cats * sizeof(int));
585 | int *accepted_alpha;
586 | accepted_alpha = malloc(n_cats * sizeof(int));
587 | for(hh = 0 ; hh < n_cats ; hh++){
588 | accepted_alpha_flag[hh] = 0;
589 | accepted_alpha[hh] = 0;
590 | }
591 | // for adaptive MH need to keep track of target density mean and variance
592 | double *curr_mean;
593 | curr_mean = malloc((n_vars * n_cats) * sizeof(double));
594 | double *curr_var;
595 | curr_var = malloc((n_vars * n_cats) * sizeof(double));
596 | for(hh = 0 ; hh < (n_vars * n_cats) ; hh++){
597 | curr_mean[hh] = 0.0;
598 | curr_var[hh] = 0.5;
599 | }
600 |
601 | // variable inclusion indicator
602 | int *inclusion_indicator;
603 | inclusion_indicator = malloc((n_vars * n_cats) * sizeof(int));
604 | for(hh = 0 ; hh < (n_vars * n_cats) ; hh++){
605 | if(beta[hh] == 0){
606 | inclusion_indicator[hh] = 0;
607 | }else{
608 | inclusion_indicator[hh] = 1;
609 | }
610 | }
611 |
612 | // row sums of YY
613 | int *Ypiu;
614 | Ypiu = malloc(n_obs * sizeof(int));
615 | for(ii = 0 ; ii < n_obs ; ii++){
616 | for(jj = 0 ; jj < n_cats ; jj++){
617 | Ypiu[ii] = Ypiu[ii] + YY[ii][jj];
618 | }
619 | }
620 |
621 | // the beta_temp vector is an n_vars long vector used for updating each of the
622 | // jj-th rows in Beta corresponding the jj-th category in YY
623 | double *beta_temp;
624 | beta_temp = malloc(n_vars * sizeof(double));
625 |
626 | // the linear predictor matrix, represented as a vector, is in the log scale
627 | double **loggamma;
628 | loggamma = malloc(n_obs * sizeof(double *));
629 | for(ii = 0 ; ii < n_obs ; ii++){
630 | loggamma[ii] = malloc(n_cats * sizeof(double));
631 | for(jj = 0 ; jj < n_cats ; jj++){
632 | loggamma[ii][jj] = calculate_gamma(XX, alpha, beta, jj, ii, Log);
633 | }
634 | }
635 |
636 | // initialize latent variable: JJ ~ gamma()
637 | double **JJ;
638 | JJ = malloc(n_obs * sizeof(double *));
639 | // TT is the vector of normalization constants
640 | double *TT;
641 | TT = malloc(n_obs * sizeof(double *));
642 | for(ii = 0 ; ii < n_obs ; ii++){
643 | JJ[ii] = malloc(n_cats * sizeof(double));
644 | TT[ii] = 0.0;
645 | for(jj = 0 ; jj < n_cats ; jj++){
646 | JJ[ii][jj] = (double)YY[ii][jj];
647 | // a very small number
648 | if(JJ[ii][jj] < pow(10.0, -100.0)){
649 | JJ[ii][jj] = pow(10.0, -100.0);
650 | }
651 | TT[ii] = TT[ii] + JJ[ii][jj];
652 | }
653 | }
654 |
655 | // initialize the other clever latent variable: uu
656 | // note that this uses the data to initialize
657 | double *uu;
658 | uu = malloc(n_obs * sizeof(double));
659 | for(ii = 0 ; ii < n_obs ; ii++){
660 | uu[ii] = gsl_ran_gamma(rando, Ypiu[ii], 1.0/TT[ii]);
661 | }
662 |
663 | // output file pointers
664 | FILE *fout_alpha, *fout_beta, *fout_alpha_acceptance, *fout_beta_acceptance, *fout_beta_proposal;
665 |
666 | // this is stupid, I'm sure there's a better way
667 | char tf_alpha[200];
668 | strcpy(tf_alpha, temp_dir);
669 | strcat(tf_alpha, "/alpha.out");
670 | fout_alpha = fopen(tf_alpha, "w");
671 | char tf_alpha_acceptance[200];
672 | strcpy(tf_alpha_acceptance, temp_dir);
673 | strcat(tf_alpha_acceptance, "/alpha_acceptance.out");
674 | fout_alpha_acceptance = fopen(tf_alpha_acceptance, "w");
675 | char tf_beta[200];
676 | strcpy(tf_beta, temp_dir);
677 | strcat(tf_beta, "/beta.out");
678 | fout_beta = fopen(tf_beta, "w");
679 | char tf_beta_acceptance[200];
680 | strcpy(tf_beta_acceptance, temp_dir);
681 | strcat(tf_beta_acceptance, "/beta_acceptance.out");
682 | fout_beta_acceptance = fopen(tf_beta_acceptance, "w");
683 | char tf_beta_proposal[200];
684 | strcpy(tf_beta_proposal, temp_dir);
685 | strcat(tf_beta_proposal, "/beta_proposal.out");
686 | fout_beta_proposal = fopen(tf_beta_proposal, "w");
687 |
688 | // the magic starts here
689 | printf("Starting the sampler\n");
690 |
691 | for(gg = 0 ; gg < GG ; gg++){
692 |
693 | // timing info to the printed output
694 | mytime = time(NULL);
695 | if(gg % 1000 == 0){
696 | printf("Starting the %uth iteration out of %u at %s", gg, GG, ctime(&mytime));
697 | }
698 |
699 | // first a round of the between-model step for every covariate within every taxa
700 | for(jj = 0 ; jj < n_cats ; jj++){
701 |
702 | // fill in beta_temp
703 | for(kk = 0 ; kk < n_vars ; kk++){
704 | hh = kk + jj * n_vars;
705 | beta_temp[kk] = beta[hh];
706 | }
707 |
708 | between_models_jj(XX, JJ, loggamma, beta_temp, accepted_beta_flag,
709 | inclusion_indicator, mu_be, sig_be, aa_hp, bb_hp, jj);
710 |
711 | // update beta with beta_temp
712 | for(kk = 0 ; kk < n_vars ; kk++){
713 | hh = kk + jj * n_vars;
714 | beta[hh] = beta_temp[kk];
715 | }
716 | }
717 |
718 | // now an accelerating within-model step
719 | for(jj = 0 ; jj < n_cats ; jj++){
720 |
721 | // fill in beta_temp and update the proposal variance
722 | for(kk = 0 ; kk < n_vars ; kk++){
723 | hh = kk + jj * n_vars;
724 | beta_temp[kk] = beta[hh];
725 | // wait until each parameter has had a few iterations
726 | if(gg > 2 * n_vars * n_cats){
727 | // calculate the online variance
728 | last_mean = curr_mean[hh];
729 | last_var = curr_var[hh];
730 | curr_mean[hh] = online_mean(gg, last_mean, beta[hh]);
731 | curr_var[hh] = online_var(gg, last_mean, last_var, curr_mean[hh], beta[hh]);
732 | // update proposal variance
733 | prop_per_beta[hh] = curr_var[hh];
734 | }
735 | }
736 |
737 | update_beta_jj(XX, JJ, loggamma, beta_temp, inclusion_indicator,
738 | prop_per_beta, mu_be, sig_be, aa_hp, bb_hp, jj);
739 |
740 | // update beta with beta_temp and write to file
741 | for(kk = 0 ; kk < n_vars ; kk++){
742 | hh = kk + jj * n_vars;
743 | beta[hh] = beta_temp[kk];
744 | if((gg >= burn) & (gg % thin == 0)){
745 | accepted_beta[hh] = accepted_beta[hh] + accepted_beta_flag[hh];
746 | accepted_beta_flag[hh] = 0;
747 | fprintf(fout_beta, "%e ", beta[hh]); // space delimited output files
748 | }
749 | }
750 | }
751 | if((gg >= burn) & (gg % thin == 0)){fprintf(fout_beta, "\n");}
752 |
753 | // update alpha and write to file
754 | for(jj = 0 ; jj < n_cats ; jj++){
755 | update_alpha_jj(JJ, loggamma, alpha, prop_per_alpha, accepted_alpha_flag,
756 | mu_al, sig_al, jj);
757 | if((gg >= burn) & (gg % thin == 0)){
758 | accepted_alpha[jj] = accepted_alpha[jj] + accepted_alpha_flag[jj];
759 | accepted_alpha_flag[jj] = 0;
760 | fprintf(fout_alpha,"%e ", alpha[jj]);
761 | }
762 | }
763 | if((gg >= burn) & (gg % thin == 0)){fprintf(fout_alpha, "\n ");}
764 |
765 | // update JJ and consequently TT
766 | for(ii = 0 ; ii < n_obs ; ii++){
767 | TT[ii] = 0.0;
768 | for(jj = 0 ; jj < n_cats ; jj++){
769 | JJ[ii][jj] = gsl_ran_gamma(rando, YY[ii][jj] + exp(loggamma[ii][jj]), 1.0/(uu[ii] + 1.0));
770 | if(JJ[ii][jj] < pow(10.0, -100.0)){
771 | JJ[ii][jj] = pow(10.0, -100.0);
772 | }
773 | TT[ii] = TT[ii] + JJ[ii][jj];
774 | }
775 | }
776 |
777 | // update latent variables uu
778 | for(ii = 0 ; ii < n_obs ; ii++){
779 | uu[ii] = gsl_ran_gamma(rando, Ypiu[ii], 1.0/TT[ii]);
780 | }
781 |
782 | } // end of iterations
783 |
784 | // print acceptance ratio files
785 | for(jj = 0 ; jj < n_cats ; jj++){
786 | fprintf(fout_alpha_acceptance, "%f\n", (float)accepted_alpha[jj]/((GG - burn)/thin));
787 | for(kk = 0 ; kk < n_vars ; kk++){
788 | fprintf(fout_beta_acceptance, "%f\n", (float)accepted_beta[kk + jj * n_vars]/((GG - burn)/thin));
789 | fprintf(fout_beta_proposal, "%f\n", prop_per_beta[kk + jj * n_vars]);
790 | }
791 | }
792 |
793 | // cleanup (kind of pedantic)
794 | fclose(fout_alpha);
795 | fclose(fout_beta);
796 | fclose(fout_alpha_acceptance);
797 | fclose(fout_beta_acceptance);
798 | free(accepted_alpha);
799 | free(accepted_beta);
800 | free(accepted_alpha_flag);
801 | free(accepted_beta_flag);
802 | free(inclusion_indicator);
803 | free(beta_temp);
804 | free(loggamma);
805 | free(JJ);
806 | free(TT);
807 | free(uu);
808 | free(Ypiu);
809 |
810 | printf("Sampling finished!\n");
811 |
812 | return;
813 |
814 | } // close Gibbs (stochastic search) sampler function
815 |
816 | //
817 | double calculate_gamma(double **XX, double *alpha, double *beta, int jj, int ii, int Log){
818 |
819 | // loop indices and out
820 | int kk, hh;
821 | double out;
822 |
823 | out = alpha[jj];
824 | for(kk = 0 ; kk < n_vars ; kk++){
825 |
826 | hh = kk + jj * n_vars;
827 | out = out + beta[hh] * XX[ii][kk];
828 |
829 | }
830 |
831 | if(Log == 1){
832 | return(out);
833 | }
834 |
835 | return(exp(out));
836 |
837 | }
838 |
839 | // Gibbs variable selection version
840 | void update_beta_jj(double **XX, double **JJ, double **loggamma,
841 | double *beta_temp, int *inclusion_indicator,
842 | double *prop_per_beta, double mu_be[n_cats][n_vars],
843 | double sig_be[n_cats][n_vars], double aa_hp,
844 | double bb_hp, int jj){
845 |
846 | // this function loops through n_vars so it is called for one taxa at a time
847 |
848 | double sig_prop;
849 | double loggamma_p[n_obs];
850 | int hh, ii, kk;
851 |
852 | // define proposal variable and acceptance ratio variables
853 | double beta_p;
854 | double lnu, ln_acp;
855 |
856 | double log_full_beta, log_full_beta_p;
857 |
858 | for(kk = 0 ; kk < n_vars ; kk++){
859 |
860 | // Stride for full (n_vars * n_cats) vector
861 | hh = kk + jj * n_vars;
862 |
863 | if(inclusion_indicator[hh] == 1){
864 |
865 | log_full_beta = 0;
866 |
867 | for(ii = 0 ; ii < n_obs ; ii++){
868 |
869 | log_full_beta = log_full_beta - gsl_sf_lngamma(exp(loggamma[ii][jj]));
870 | log_full_beta = log_full_beta + exp(loggamma[ii][jj]) * log(JJ[ii][jj]);
871 |
872 | }
873 |
874 | log_full_beta = log_full_beta + lprior_bbsas(beta_temp[kk], inclusion_indicator[hh],
875 | sig_be[jj][kk], mu_be[jj][kk], aa_hp, bb_hp);
876 | sig_prop = prop_per_beta[hh];
877 | beta_p = beta_temp[kk] + adap_prop(sig_prop);
878 |
879 |
880 | for(ii = 0 ; ii < n_obs ; ii++){
881 | loggamma_p[ii] = loggamma[ii][jj] - beta_temp[kk] * XX[ii][kk] + beta_p * XX[ii][kk];
882 | }
883 |
884 | // calculate proposal probability
885 | log_full_beta_p = 0;
886 | for(ii = 0 ; ii < n_obs ; ii++){
887 | log_full_beta_p = log_full_beta_p - gsl_sf_lngamma(exp(loggamma_p[ii]));
888 | log_full_beta_p = log_full_beta_p + exp(loggamma_p[ii]) * log(JJ[ii][jj]);
889 | }
890 | log_full_beta_p = log_full_beta_p + lprior_bbsas(beta_p, inclusion_indicator[hh], sig_be[jj][kk], mu_be[jj][kk], aa_hp, bb_hp);
891 |
892 | ln_acp = log_full_beta_p - log_full_beta;
893 | lnu = log(gsl_rng_uniform(rando));
894 |
895 | if(lnu < ln_acp){
896 |
897 | beta_temp[kk] = beta_p;
898 |
899 | for(ii = 0 ; ii < n_obs ; ii++){
900 |
901 | loggamma[ii][jj] = loggamma_p[ii];
902 |
903 | }
904 |
905 | } // Close MH accept
906 | } // Close inclusion_indicator[hh] == 1
907 | } // Close for kk
908 | } // Close function
909 |
910 | // stochastic search variable selection version
911 | void update_beta_jj_ss(double **XX, double **JJ, double **loggamma,
912 | double *beta_temp, int *inclusion_indicator,
913 | double *prop_per_beta, double mu_be[n_cats][n_vars],
914 | double sig_be[n_cats][n_vars], double aa_hp,
915 | double bb_hp, int jj){
916 |
917 | // this function loops through n_vars so it is called for one taxa at a time
918 |
919 | double sig_prop;
920 | double loggamma_p[n_obs];
921 | int hh, ii, kk;
922 |
923 | // define proposal variable and acceptance ratio variables
924 | double beta_p;
925 | double lnu, ln_acp;
926 |
927 | double log_full_beta, log_full_beta_p;
928 |
929 | kk = floor(gsl_rng_uniform(rando) * n_vars);//for(kk = 0 ; kk < n_vars ; kk++){
930 |
931 | // Stride for full (n_vars * n_cats) vector
932 | hh = kk + jj * n_vars;
933 |
934 | if(inclusion_indicator[hh] == 1){
935 |
936 | log_full_beta = 0;
937 |
938 | for(ii = 0 ; ii < n_obs ; ii++){
939 |
940 | log_full_beta = log_full_beta - gsl_sf_lngamma(exp(loggamma[ii][jj]));
941 | log_full_beta = log_full_beta + exp(loggamma[ii][jj]) * log(JJ[ii][jj]);
942 |
943 | }
944 |
945 | log_full_beta = log_full_beta + lprior_bbsas(beta_temp[kk], inclusion_indicator[hh],
946 | sig_be[jj][kk], mu_be[jj][kk], aa_hp, bb_hp);
947 | sig_prop = prop_per_beta[hh];
948 | beta_p = beta_temp[kk] + adap_prop(sig_prop);
949 |
950 |
951 | for(ii = 0 ; ii < n_obs ; ii++){
952 | loggamma_p[ii] = loggamma[ii][jj] - beta_temp[kk] * XX[ii][kk] + beta_p * XX[ii][kk];
953 | }
954 |
955 | // calculate proposal probability
956 | log_full_beta_p = 0;
957 | for(ii = 0 ; ii < n_obs ; ii++){
958 | log_full_beta_p = log_full_beta_p - gsl_sf_lngamma(exp(loggamma_p[ii]));
959 | log_full_beta_p = log_full_beta_p + exp(loggamma_p[ii]) * log(JJ[ii][jj]);
960 | }
961 | log_full_beta_p = log_full_beta_p + lprior_bbsas(beta_p, inclusion_indicator[hh], sig_be[jj][kk], mu_be[jj][kk], aa_hp, bb_hp);
962 |
963 | ln_acp = log_full_beta_p - log_full_beta;
964 | lnu = log(gsl_rng_uniform(rando));
965 |
966 | if(lnu < ln_acp){
967 |
968 | beta_temp[kk] = beta_p;
969 |
970 | for(ii = 0 ; ii < n_obs ; ii++){
971 |
972 | loggamma[ii][jj] = loggamma_p[ii];
973 |
974 | }
975 |
976 | } // Close MH accept
977 | } // Close inclusion_indicator[hh] == 1
978 | //} // Close for kk
979 | } // Close function
980 |
981 | void between_models_jj(double **XX, double **JJ, double **loggamma,
982 | double *beta_temp, int *accepted_beta_flag,
983 | int *inclusion_indicator, double mu_be[n_cats][n_vars],
984 | double sig_be[n_cats][n_vars], double aa_hp,
985 | double bb_hp, int jj){
986 |
987 | // Metropolis-Hastings proposals
988 | double sig_prop;
989 | double mu_prop;
990 |
991 | int hh, kk, ii;
992 |
993 | // proposed value vectors
994 | int inclusion_indicator_p;
995 | double loggamma_p[n_obs];
996 | double beta_p;
997 |
998 | // acceptance ratio variables
999 | double lnu, ln_acp;
1000 |
1001 | // must update beta_jk for kk = 1, ..., n_vars
1002 | double log_full_beta, log_full_beta_p;
1003 |
1004 | for(kk = 0 ; kk < n_vars ; kk++){
1005 |
1006 | hh = kk + jj * n_vars;
1007 |
1008 | // calculate the current log full conditional
1009 | log_full_beta = 0;
1010 | for(ii = 0 ; ii < n_obs ; ii++){
1011 | log_full_beta = log_full_beta - gsl_sf_lngamma(exp(loggamma[ii][jj]));
1012 | log_full_beta = log_full_beta + exp(loggamma[ii][jj]) * log(JJ[ii][jj]);
1013 | }
1014 | log_full_beta = log_full_beta + lprior_bbsas(beta_temp[kk], inclusion_indicator[hh], sig_be[jj][kk], mu_be[jj][kk], aa_hp, bb_hp);
1015 |
1016 | // proposing a new value for beta[jj][kk] using the prior mean and standard deviation
1017 | sig_prop = pow(sig_be[jj][kk], 0.5);
1018 | mu_prop = mu_be[jj][kk];
1019 |
1020 | // swap and sample beta
1021 | if(inclusion_indicator[hh] == 0){
1022 | beta_p = mu_prop + gsl_ran_gaussian_ziggurat(rando, sig_prop);
1023 | inclusion_indicator_p = 1;
1024 | }
1025 | else{
1026 | beta_p = 0.0;
1027 | inclusion_indicator_p = 0;
1028 | }
1029 |
1030 | // update proposal gamma (linear predictor)
1031 | for(ii = 0 ; ii < n_obs ; ii++){
1032 | loggamma_p[ii] = loggamma[ii][jj] - beta_temp[kk] * XX[ii][kk] + beta_p * XX[ii][kk];
1033 | }
1034 |
1035 | // calculate the proposed log full conditional
1036 | log_full_beta_p = 0;
1037 | for(ii = 0 ; ii < n_obs ; ii++){
1038 | log_full_beta_p = log_full_beta_p - gsl_sf_lngamma(exp(loggamma_p[ii]));
1039 | log_full_beta_p = log_full_beta_p + exp(loggamma_p[ii]) * log(JJ[ii][jj]);
1040 | }
1041 | log_full_beta_p = log_full_beta_p + lprior_bbsas(beta_p, inclusion_indicator_p, sig_be[jj][kk], mu_be[jj][kk], aa_hp, bb_hp);
1042 |
1043 | ln_acp = log_full_beta_p - log_full_beta;
1044 | lnu = log(gsl_rng_uniform(rando));
1045 |
1046 | // Metropolis-Hastings ratio
1047 | if(lnu < ln_acp){
1048 |
1049 | accepted_beta_flag[hh] = 1;
1050 |
1051 | // update accepted beta into beta_temp
1052 | beta_temp[kk] = beta_p;
1053 | inclusion_indicator[hh] = inclusion_indicator_p;
1054 |
1055 | for(ii = 0 ; ii < n_obs ; ii++){
1056 | // also update the gamma (linear predictor)
1057 | loggamma[ii][jj] = loggamma_p[ii];
1058 | }
1059 | } // Close MH if
1060 | } // Close for kk
1061 | } // Close function
1062 |
1063 | void update_alpha_jj(double **JJ, double **loggamma, double *alpha,
1064 | double *prop_per_alpha, int *accepted_alpha_flag,
1065 | double mu_al[n_cats], double sig_al[n_cats], int jj){
1066 |
1067 | int ii;
1068 |
1069 | double sig_prop = prop_per_alpha[jj];
1070 | double loggamma_p[n_obs];
1071 |
1072 | double alpha_p;
1073 | double lnu, ln_acp;
1074 |
1075 | // Prepare the current and proposed full conditional values
1076 | double log_full_alpha, log_full_alpha_p;
1077 |
1078 | // Calculate the full conditional for the current value
1079 | log_full_alpha=0;
1080 | for(ii = 0 ; ii < n_obs ; ii++){
1081 |
1082 | log_full_alpha = log_full_alpha - gsl_sf_lngamma(exp(loggamma[ii][jj]));
1083 | log_full_alpha = log_full_alpha + exp(loggamma[ii][jj]) * log(JJ[ii][jj]);
1084 |
1085 | }
1086 | log_full_alpha = log_full_alpha - 1.0/(2.0 * sig_al[jj]) * pow(alpha[jj] - mu_al[jj], 2.0);
1087 |
1088 | // Propose a new value for alpha[jj] using a random walk proposal centered on
1089 | // the current value of alpha[jj]
1090 | alpha_p = alpha[jj] + gsl_ran_gaussian_ziggurat(rando, sig_prop);
1091 |
1092 | // Gamma must be updated too
1093 | for(ii = 0 ; ii < n_obs ; ii++){
1094 | loggamma_p[ii] = loggamma[ii][jj] - alpha[jj] + alpha_p;
1095 | }
1096 |
1097 | // Calculate the full conditional for the proposed value
1098 | log_full_alpha_p = 0;
1099 | for(ii = 0 ; ii < n_obs ; ii++){
1100 | log_full_alpha_p = log_full_alpha_p - gsl_sf_lngamma(exp(loggamma_p[ii]));
1101 | log_full_alpha_p = log_full_alpha_p + exp(loggamma_p[ii]) * log(JJ[ii][jj]);
1102 | }
1103 |
1104 | log_full_alpha_p = log_full_alpha_p - 1.0/(2.0 * sig_al[jj]) * pow(alpha_p - mu_al[jj], 2.0);
1105 | ln_acp = log_full_alpha_p - log_full_alpha;
1106 | lnu = log(gsl_rng_uniform(rando));
1107 |
1108 | if(lnu < ln_acp){
1109 |
1110 | // If accepted, update both alpha[jj] and loggamma[ii][jj], and keep
1111 | // track of acceptances
1112 | accepted_alpha_flag[jj] = 1;
1113 | alpha[jj] = alpha_p;
1114 |
1115 | for(ii = 0 ; ii < n_obs ; ii++){
1116 | loggamma[ii][jj] = loggamma_p[ii];
1117 | }
1118 | }
1119 |
1120 | return;
1121 |
1122 | } // Close function
1123 |
1124 | // the probability of the Bernoulli trial is integrated out
1125 | double lprior_bbsas(double betajk, int sjk, double sig_bejk, double mu_bejk,
1126 | double aa_hp, double bb_hp){
1127 |
1128 | // calculate additional beta factor
1129 | double aa = sjk + aa_hp;
1130 | double bb = 1 - sjk + bb_hp;
1131 | double lbeta_factor = gsl_sf_lnbeta(aa, bb) - gsl_sf_lnbeta(aa_hp, bb_hp);
1132 |
1133 | // piece-wise function
1134 | if(sjk == 0){
1135 | return(log(1.0 - exp(lbeta_factor)));
1136 | }
1137 | else{
1138 | return(lbeta_factor - 0.5 * log(2.0 * M_PI * sig_bejk) - 1.0/(2.0 * sig_bejk) * pow(betajk - mu_bejk, 2.0));
1139 | }
1140 |
1141 | }
1142 |
1143 | // simple adaptive MH proposal
1144 | // this uses equation (3) from Roberts & Rosenthal (2009) but without the full
1145 | // covariance of the target which makes it similar to the component-wise update
1146 | // of Haario et al. (2005)
1147 | double adap_prop(double curr_var){
1148 |
1149 | // need dimension (???)
1150 | double dd = (double)n_cats * (double)n_vars;
1151 |
1152 | // ensures bounded convergence
1153 | int safeguard = gsl_ran_bernoulli(rando, 0.05);
1154 | double usually = pow(2.38, 2.0) * curr_var/dd;
1155 | double unusually = pow(0.1, 2.0)/dd;
1156 |
1157 | double prop_var = (1 - safeguard) * gsl_ran_gaussian_ziggurat(rando, pow(usually, 0.5)) + safeguard * gsl_ran_gaussian_ziggurat(rando, pow(unusually, 0.5));
1158 |
1159 | return(prop_var);
1160 | }
1161 |
1162 | // sort of following http://www.johndcook.com/blog/standard_deviation/
1163 | double online_mean(int iteration, double last_mean, double curr_obs){
1164 |
1165 | // don't fall off by one
1166 | int n_vals = iteration++;
1167 |
1168 | // assume that iteration > 0 since there's a burnin proposal
1169 | double curr_mean = last_mean + (curr_obs - last_mean)/(double)n_vals;
1170 |
1171 | return(curr_mean);
1172 | }
1173 |
1174 | double online_var(int iteration, double last_mean, double last_var, double curr_mean, double curr_obs){
1175 |
1176 | // note that iteration == n - 1 and that (n - 1) * last_var == last_ss
1177 | // assume that iteration > 0 since there's a burnin proposal
1178 | double curr_ss = (double)iteration * last_var + (curr_obs - last_mean) * (curr_obs - curr_mean);
1179 |
1180 | return(curr_ss/(double)iteration);
1181 | }
1182 |
1183 |
--------------------------------------------------------------------------------