├── .Rbuildignore ├── .gitignore ├── DESCRIPTION ├── LICENSE ├── LICENSE.md ├── NAMESPACE ├── NEWS.md ├── R ├── codacore.R ├── data.R └── simulations.R ├── README.md ├── cran-comments.md ├── data ├── Crohn.rda ├── HIV.rda └── sCD14.rda ├── inst ├── CITATION └── misc │ ├── guide.Rmd │ └── guide.html ├── man ├── Crohn.Rd ├── HIV.Rd ├── activeInputs.codacore.Rd ├── codacore.Rd ├── getBinaryPartitions.Rd ├── getDenominatorParts.Rd ├── getLogRatios.Rd ├── getNumLogRatios.Rd ├── getNumeratorParts.Rd ├── getSlopes.Rd ├── getTidyTable.Rd ├── plot.codacore.Rd ├── plotROC.Rd ├── predict.codacore.Rd ├── print.codacore.Rd ├── sCD14.Rd └── simulateHTS.Rd ├── tests ├── testthat.R └── testthat │ └── test-codacore.R └── vignettes └── guide.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^LICENSE\.md$ 4 | ^cran-comments\.md$ 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | R-codacore.Rproj 2 | .Rproj.user 3 | .Rhistory 4 | .RData 5 | .Ruserdata 6 | .DS_Store 7 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: codacore 2 | Title: Learning Sparse Log-Ratios for Compositional Data 3 | Version: 0.0.4 4 | Authors@R: c( 5 | person("Elliott", "Gordon-Rodriguez", email = "eg2912@columbia.edu", role = c("aut", "cre")), 6 | person("Thomas", "Quinn", email = "contacttomquinn@gmail.com", role = c("aut")) 7 | ) 8 | Description: In the context of high-throughput genetic data, 9 | CoDaCoRe identifies a set of sparse biomarkers that are 10 | predictive of a response variable of interest (Gordon-Rodriguez 11 | et al., 2021) <doi:10.1101/2021.02.11.430695>. 
More 12 | generally, CoDaCoRe can be applied to any regression problem 13 | where the independent variable is Compositional (CoDa), to 14 | derive a set of scale-invariant log-ratios (ILR or SLR) that 15 | are maximally associated to a dependent variable. 16 | License: MIT + file LICENSE 17 | Encoding: UTF-8 18 | LazyData: true 19 | RoxygenNote: 7.1.1 20 | Depends: 21 | R (>= 3.6.0) 22 | Imports: 23 | tensorflow (>= 2.1), 24 | keras (>= 2.3), 25 | pROC (>= 1.17), 26 | R6 (>= 2.5), 27 | gtools(>= 3.8) 28 | SystemRequirements: TensorFlow (https://www.tensorflow.org/) 29 | Suggests: 30 | zCompositions, 31 | testthat (>= 2.1.0), 32 | knitr, 33 | rmarkdown 34 | VignetteBuilder: knitr 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2021 2 | COPYRIGHT HOLDER: Elliott Gordon-Rodriguez 3 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2021 Elliott Gordon-Rodriguez 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(plot,codacore) 4 | S3method(predict,codacore) 5 | S3method(print,codacore) 6 | export(activeInputs.codacore) 7 | export(codacore) 8 | export(getBinaryPartitions) 9 | export(getDenominatorParts) 10 | export(getLogRatios) 11 | export(getNumLogRatios) 12 | export(getNumeratorParts) 13 | export(getSlopes) 14 | export(getTidyTable) 15 | export(plotROC) 16 | export(simulateHTS) 17 | import(keras) 18 | importFrom(stats,predict) 19 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | ## codacore 0.0.4 2 | --------------------- 3 | * Update vignette. 4 | * Fix title. 5 | * Add incremental fit for cts. target. 6 | * Minor clarifications. 7 | * Updated README to reflect CRAN latest. 8 | * Add cvParams default values to func documentation. 9 | * Add helper funcs `getNumLogRatios()` and `getTidyTable()` 10 | * Add `getBinaryPartitions` function to retrieve SBP-like representation of learned balances. 11 | * Allow tibble inputs. 12 | 13 | ## codacore 0.0.3 14 | --------------------- 15 | * Live on CRAN. 16 | * Updated readme and vignettes to reflect this. 
17 | 18 | ## codacore 0.0.2 19 | --------------------- 20 | * Updated tests. 21 | * Updated guide. 22 | * Covariate adjustment 23 | * Unsupervised learning 24 | * Multi-omics 25 | * Minor bugfix with glm numerics. 26 | * Added numLogRatios param to predict(). 27 | 28 | ## codacore 0.0.1 29 | --------------------- 30 | * Fix a bug in lambda-standard-error rule. 31 | * Estimation of cross-validation prediction error was missing a scaling factor to account for the number of folds. 32 | * As a result, models were over-regularized. 33 | * Update guide. -------------------------------------------------------------------------------- /R/codacore.R: --------------------------------------------------------------------------------

# Here we implement the codacore model

library(keras)
utils::globalVariables(c("self"))

# Fits a single base learner (one log-ratio plus its intercept and slope).
# Private constructor, not to be called by the user.
#
# Steps, in order:
#   1. train the continuous relaxation by gradient descent,
#   2. pick a cutoff by cross-validation,
#   3. "harden" the soft assignment into numerator/denominator masks,
#   4. refit intercept and slope on the hardened log-ratio,
#   5. attach training-set metrics (ROC/AUC/accuracy for binary
#      classification, RMSE/Rsquared otherwise).
#
# Returns a list of class "CoDaBaseLearner".
.CoDaBaseLearner <- function(
    x,
    y,
    boostingOffset,
    logRatioType,
    objective,
    lambda,
    cvParams,
    optParams,
    verbose
){
  
  cdbl = list(
    intercept=NULL,
    slope=NULL,
    weights=NULL,
    softAssignment=NULL,
    hard=NULL,
    x=x,
    y=y,
    boostingOffset=boostingOffset,
    logRatioType=logRatioType,
    objective=objective,
    lambda=lambda,
    cvParams=cvParams,
    optParams=optParams,
    verbose=verbose
  )
  class(cdbl) = "CoDaBaseLearner"
  
  # Train the relaxation model
  cdbl = trainRelaxation.CoDaBaseLearner(cdbl)
  
  # Find optimal cutoff by CV
  cutoff = findBestCutoff.CoDaBaseLearner(cdbl)
  
  # Use cutoff to "harden" the log-ratio
  cdbl = harden.CoDaBaseLearner(cdbl, cutoff)
  
  # And recompute the linear coefficients
  cdbl = setInterceptAndSlope.CoDaBaseLearner(cdbl, cdbl$x, cdbl$y, cdbl$boostingOffset)
  
  # Add some metrics
  yHat = predict(cdbl, x) + boostingOffset
  if (cdbl$objective == 'binary classification') {
    cdbl$ROC = pROC::roc(y, yHat, quiet=TRUE)
    cdbl$AUC = pROC::auc(cdbl$ROC)
    cdbl$accuracy = mean(y == (yHat > 0))
  } else {
    cdbl$RMSE = sqrt(mean((y - yHat)^2))
    cdbl$Rsquared = 1 - cdbl$RMSE^2 / stats::var(y)
  }
  
  return(cdbl)
}


# Trains the continuous relaxation of the log-ratio with keras and stores
# the learned weights, soft assignment, intercept and slope on cdbl.
#' @import keras
trainRelaxation.CoDaBaseLearner = function(cdbl) {
  startTime = Sys.time()
  
  # Set up trainable variables
  inputDim = ncol(cdbl$x)
  numObs = nrow(cdbl$x)
  
  # Initializing the intercept at the average of the data
  # this helps optimization greatly
  # TODO: should experiment with slopeInit parameter for potential gains
  if (cdbl$objective == "binary classification") {
    loss_func = 'binary_crossentropy'
    if (abs(mean(1 / (1 + exp(-cdbl$boostingOffset))) - mean(cdbl$y)) < 0.001) {
      # Protect against numerical errors in glm() call
      interceptInit = 0.0
    } else {
      tempGLM = stats::glm(cdbl$y ~ 1, offset=cdbl$boostingOffset, family='binomial')
      # Spell out `coefficients`; `$coef` only works through partial matching
      interceptInit = tempGLM$coefficients[[1]]
    }
    slopeInit = 0.1
    metrics = c('accuracy')
  } else if (cdbl$objective == "regression") {
    loss_func = 'mean_squared_error'
    interceptInit = mean(cdbl$y - cdbl$boostingOffset)
    slopeInit = 0.1 # * stats::sd(cdbl$y - cdbl$boostingOffset)
    metrics = c('mean_squared_error')
  } else {
    # Same message as setInterceptAndSlope.CoDaBaseLearner(); without this
    # branch the failure would surface later as a missing `loss_func`.
    stop("Not implemented objective=", cdbl$objective)
  }
  
  # Define the forward pass for our relaxation,
  # which differs for balances and amalgamations
  if (cdbl$logRatioType == 'A') {
    epsilon = cdbl$optParams$epsilonA
    forwardPass = function(x, mask = NULL) {
      softAssignment = 2 * keras::k_sigmoid(self$weights) - 1
      # Add the small value to ensure gradient flows at exact zeros (initial values)
      pvePart = keras::k_dot(x, keras::k_relu(softAssignment + 1e-20))
      nvePart = keras::k_dot(x, keras::k_relu(-softAssignment))
      logRatio = keras::k_log(pvePart + epsilon) -
        keras::k_log(nvePart + epsilon)
      eta = self$slope * logRatio + self$intercept + self$boostingOffset
      # keras::k_sigmoid(eta)
      eta
    }
  } else if (cdbl$logRatioType == 'B') {
    epsilon = cdbl$optParams$epsilonB
    forwardPass = function(x, mask = NULL) {
      softAssignment = 2 * keras::k_sigmoid(self$weights) - 1
      # Add the small value to ensure gradient flows at exact zeros (initial values)
      pvePart = keras::k_relu(softAssignment + 1e-20)
      nvePart = keras::k_relu(-softAssignment)
      logRatio = keras::k_dot(keras::k_log(x), pvePart) / keras::k_maximum(keras::k_sum(pvePart), epsilon) -
        keras::k_dot(keras::k_log(x), nvePart) / keras::k_maximum(keras::k_sum(nvePart), epsilon)
      eta = self$slope * logRatio + self$intercept + self$boostingOffset
      # keras::k_sigmoid(eta)
      eta
    }
  } else {
    stop("Invalid logRatioType argument given: ", cdbl$logRatioType)
  }
  
  # Debugging toggle: flip to TRUE to fix the TensorFlow seed
  if (FALSE) {
    tensorflow::tf$random$set_seed(0)
  }
  
  # Set up custom layer
  CustomLayer <- R6::R6Class(
    "CustomLayer",
    
    inherit = keras::KerasLayer,
    
    public = list(
      output_dim = NULL,
      weights = NULL,
      intercept = NULL,
      slope = NULL,
      boostingOffset = NULL,
      # epsilon = NULL,
      
      initialize = function() {
        self$output_dim <- 1
      },
      
      build = function(input_shape) {
        self$weights <- self$add_weight(
          name = 'weights',
          shape = list(as.integer(inputDim), as.integer(1)),
          initializer = keras::initializer_zeros(),
          trainable = TRUE
        )
        self$intercept <- self$add_weight(
          name = 'intercept',
          shape = list(as.integer(1)),
          initializer = keras::initializer_constant(interceptInit),
          trainable = TRUE
        )
        self$slope <- self$add_weight(
          name = 'slope',
          shape = list(as.integer(1)),
          initializer = keras::initializer_constant(slopeInit),
          trainable = TRUE
        )
        self$boostingOffset <- self$add_weight(
          name = 'boostingOffset',
          shape = list(as.integer(numObs), as.integer(1)),
          initializer = keras::initializer_constant(cdbl$boostingOffset),
          trainable = FALSE
        )
        # self$epsilon <- self$add_weight(
        #   name = 'epsilon',
        #   shape = list(as.integer(1)),
        #   initializer = keras::initializer_constant(cdbl$epsilon),
        #   trainable = FALSE
        # )
      },
      
      call = forwardPass,
      
      compute_output_shape = function(input_shape) {
        list(input_shape[[1]], self$output_dim)
      }
    )
  )
  
  # Builds, compiles and fits a fresh model with the given learning rate
  # and number of epochs; returns the fitted keras model.
  .trainKeras = function(lr, epochs) {
    # define layer wrapper function
    codacoreLayer <- function(object) {
      keras::create_layer(CustomLayer, object)
    }
    
    # use it in a model
    model <- keras::keras_model_sequential()
    model %>% codacoreLayer()
    if (cdbl$objective == "binary classification") {
      model %>% layer_activation('sigmoid')
    }
    
    # compile graph
    model %>% keras::compile(
      loss = loss_func,
      optimizer = keras::optimizer_sgd(lr, momentum=cdbl$optParams$momentum),
      # optimizer = keras::optimizer_adam(0.001),
      metrics = metrics
    )
    
    
    model %>% keras::fit(cdbl$x, cdbl$y, epochs=epochs,
                         batch_size=cdbl$optParams$batchSize,
                         verbose=FALSE)# =TRUE) for debugging
    return(model)
  }
  
  epochs = cdbl$optParams$epochs
  # Both operands are scalars, so use the short-circuiting `&&`
  runAdaptively = is.numeric(cdbl$optParams$adaptiveLR) && is.null(cdbl$optParams$vanillaLR)
  if (runAdaptively) {
    # Adaptive learning rate here means that we pick the lr s.t.
    # our first gradient step moves the amalWeights out by a specified amount
    model = .trainKeras(1, 1)
    firstStep = max(abs(as.numeric(model$get_weights()[[1]])))
    if (!is.finite(firstStep) || firstStep == 0) {
      # Dividing by zero below would give lr = Inf; fail with a clear message
      # here instead of passing an infinite learning rate to the optimizer.
      stop("Unable to calibrate the adaptive learning rate: ",
           "the initial gradient step was zero or non-finite.")
    }
    lr = cdbl$optParams$adaptiveLR / firstStep
    model = .trainKeras(lr, epochs)
  } else {
    warning("Using non-adaptive learning rate may hinder optimization.")
    lr = cdbl$optParams$vanillaLR
    model = .trainKeras(lr, epochs)
  }
  
  
  # Save results:
  cdbl$weights = as.numeric(model$get_weights()[[1]])
  cdbl$softAssignment = 2 / (1 + exp(-cdbl$weights)) - 1
  cdbl$intercept = as.numeric(model$get_weights()[[2]])
  cdbl$slope = as.numeric(model$get_weights()[[3]])
  
  # Equalize the largest + and largest - assignment for more 'balanced' balances
  eqRatio = max(cdbl$softAssignment) / min(cdbl$softAssignment) * (-1)
  cdbl$softAssignment[cdbl$softAssignment < 0] = cdbl$softAssignment[cdbl$softAssignment < 0] * eqRatio
  
  endTime = Sys.time()
  if (cdbl$verbose) {
    print('GD time:')
    print(endTime - startTime)
  }
  # cdbl$runTimeGD = endTime - startTime
  
  return(cdbl)
}

# Given a trained softAssignment, which corresponds to running
# the weights through an activation, we find
# the cutoff at which we define our log-ratio.
# Returns a single numeric cutoff; 1.1 (larger than any soft assignment)
# signals that no candidate beat the baseline score.
findBestCutoff.CoDaBaseLearner = function(cdbl) {
  if (any(abs(cdbl$softAssignment) > 0.999999)) {
    warning("Large weights encountered in gradient descent; vanishing gradients likely. Learning rates might need recalibrating - try adaptive rates?")
  }
  
  candidateCutoffs = sort(abs(cdbl$softAssignment), decreasing=TRUE)
  maxCutoffs = cdbl$cvParams$maxCutoffs
  lastCandidate = min(maxCutoffs, length(candidateCutoffs))
  if (lastCandidate < 2) {
    # `2:lastCandidate` would count backwards and produce an NA cutoff
    stop("Cross-validation needs at least two input variables and maxCutoffs >= 2.")
  }
  # Start from 2nd since we equalized +ve and -ve; thus neither side will be empty
  candidateCutoffs = candidateCutoffs[2:lastCandidate]
  
  # TODO: re-implement without passing cdbl to harden()
  # and setInterceptAndSlope() to avoid computational overhead
  # from copying data unnecessarily
  
  # Compute the CV scores:
  startTime = Sys.time()
  numFolds = cdbl$cvParams$numFolds
  # Naive way of splitting equally into folds:
  foldIdx = sample(cut(seq_along(cdbl$y), breaks=numFolds, labels=FALSE))
  if (cdbl$objective == "binary classification") {
    # Instead we randomize with equal # of case/controls in each fold
    # See discussion on stratified CV in page 204 of He & Ma 2013
    numCases = sum(cdbl$y)
    numControls = sum(1 - cdbl$y)
    if (numCases < numFolds || numControls < numFolds) {
      stop("Insufficient samples from each class available for cross-validation.")
    }
    caseIdx = sample(cut(seq_len(numCases), breaks=numFolds, labels=FALSE))
    controlIdx = sample(cut(seq_len(numControls), breaks=numFolds, labels=FALSE))
    foldIdx[cdbl$y == 1] = caseIdx
    foldIdx[cdbl$y == 0] = controlIdx
  }
  # One row per candidate cutoff, one column per fold; higher is better
  # (AUC for classification, negative RMSE for regression).
  scores = matrix(nrow=length(candidateCutoffs), ncol=numFolds)
  i = 0
  for (cutoff in candidateCutoffs) {
    i = i + 1
    cdbl = harden.CoDaBaseLearner(cdbl, cutoff)
    for (j in seq_len(numFolds)) {
      cdbl = setInterceptAndSlope.CoDaBaseLearner(cdbl, cdbl$x[foldIdx != j,], cdbl$y[foldIdx != j], cdbl$boostingOffset[foldIdx != j])
      yHat = predict(cdbl, cdbl$x[foldIdx == j,]) + cdbl$boostingOffset[foldIdx == j]
      if (cdbl$objective == "binary classification") {
        ROC = pROC::roc(cdbl$y[foldIdx == j], yHat, quiet=TRUE)
        scores[i, j] = pROC::auc(ROC)
      } else if (cdbl$objective == "regression") {
        scores[i, j] = -sqrt(mean((cdbl$y[foldIdx == j] - yHat)^2))
      }
    }
  }
  # Now implement lambda-SE rule
  means = apply(scores, 1, mean)
  # see eqn 9.2 here https://www.cs.cmu.edu/~psarkar/sds383c_16/lecture9_scribe.pdf
  stds = apply(scores, 1, stats::sd) / sqrt(numFolds)
  lambdaSeRule = max(means) - stds[which.max(means)] * cdbl$lambda
  # oneSdRule = max(means - stds)
  # First (i.e. largest) cutoff whose mean score clears the threshold
  bestCutoff = candidateCutoffs[means >= lambdaSeRule][1]
  # bestCutoff = candidateCutoffs[which.max(scores)]
  
  
  endTime = Sys.time()
  if (cdbl$verbose) {
    print('CV time:')
    print(endTime - startTime)
    xCoor = 2:(length(means) + 1)
    graphics::plot(xCoor, means, ylim=range(c(means-stds, means+stds)))
    graphics::arrows(xCoor, means-stds, xCoor, means+stds, length=0.05, angle=90, code=3)
    graphics::abline(lambdaSeRule, 0)
  }
  
  if (cdbl$objective == "binary classification") {
    baseLineScore = pROC::auc(pROC::roc(cdbl$y, cdbl$boostingOffset, quiet=TRUE))
  } else if (cdbl$objective == "regression") {
    baseLineScore = -sqrt(mean((cdbl$y - cdbl$boostingOffset)^2))
  }
  noImprovement = lambdaSeRule < baseLineScore
  if (noImprovement) {
    bestCutoff = 1.1 # bigger than the softAssignment
  }
  
  return(bestCutoff)
}


# Converts the soft assignment into logical numerator/denominator masks:
# entries >= cutoff go to the numerator, entries <= -cutoff to the denominator.
harden.CoDaBaseLearner = function(cdbl, cutoff) {
  numPart = cdbl$softAssignment >= cutoff
  denPart = cdbl$softAssignment <= -cutoff
  hard = list(numerator=numPart, denominator=denPart)
  cdbl$hard = hard
  return(cdbl)
}


# Refits intercept and slope of the hardened log-ratio by glm on (x, y),
# using boostingOffset as the glm offset.
setInterceptAndSlope.CoDaBaseLearner = function(cdbl, x, y, boostingOffset) {
  # If our base learner is empty (i.e. couldn't beat the 1SE rule),
  # we simply set to 0:
  if (!any(cdbl$hard$numerator) && !any(cdbl$hard$denominator)) {
    cdbl$slope = 0.0
    cdbl$intercept = 0.0
    return(cdbl)
  }
  # Otherwise, we have a non-empty SLR, so we compute its regression coefficient
  logRatio = computeLogRatio.CoDaBaseLearner(cdbl, x)
  dat = data.frame(x=logRatio, y=y)
  # Named `fit` rather than `glm` so the local does not shadow stats::glm
  if (cdbl$objective == "binary classification") {
    fit = stats::glm(y~x, family='binomial', data=dat, offset=boostingOffset)
    if (any(is.na(fit$coefficients))) {
      fit = list(coefficients=list(0, 0))
      warning("Numerical error during glm fit. Possible data issue.")
    }
  } else if (cdbl$objective == "regression") {
    fit = stats::glm(y~x, family='gaussian', data=dat, offset=boostingOffset)
  } else {
    stop("Not implemented objective=", cdbl$objective)
  }
  cdbl$intercept = fit$coefficients[[1]]
  cdbl$slope = fit$coefficients[[2]]
  return(cdbl)
}


# Evaluates the hardened log-ratio on the rows of x. Returns a vector of
# zeros when either the numerator or the denominator mask is empty.
computeLogRatio.CoDaBaseLearner = function(cdbl, x) {
  
  if (!any(cdbl$hard$numerator) || !any(cdbl$hard$denominator)) {
    logRatio = rowSums(x * 0)
  } else { # we have a bona fide log-ratio
    if (cdbl$logRatioType == 'A') {
      epsilon = cdbl$optParams$epsilonA
      pvePart = rowSums(x[, cdbl$hard$numerator, drop=FALSE]) # drop=FALSE to keep as matrix
      nvePart = rowSums(x[, cdbl$hard$denominator, drop=FALSE])
      logRatio = log(pvePart + epsilon) - log(nvePart + epsilon)
    } else if (cdbl$logRatioType == 'B') {
      pvePart = rowMeans(log(x[, cdbl$hard$numerator, drop=FALSE])) # drop=FALSE to keep as matrix
      nvePart = rowMeans(log(x[, cdbl$hard$denominator, drop=FALSE]))
      logRatio = pvePart - nvePart
    }
  }
  
  return(logRatio)
}


# Linear predictor of a single base learner (slope * log-ratio + intercept).
# With asLogits=FALSE the result is passed through the logistic function,
# which is only allowed for classification.
predict.CoDaBaseLearner = function(cdbl, x, asLogits=TRUE) {
  logRatio = computeLogRatio.CoDaBaseLearner(cdbl, x)
  eta = cdbl$slope * logRatio + cdbl$intercept
  if (asLogits) {
    return(eta)
  } else {
    if (cdbl$objective == 'regression') {
      stop("Logits argument should only be used for classification, not regression.")
    }
    return(1 / (1 + exp(-eta)))
  }
}


#' codacore
#' 
#' This function implements the codacore algorithm described by Gordon-Rodriguez et al. 2021 
#' (https://doi.org/10.1101/2021.02.11.430695).
#' 
#' @param x A data.frame or matrix of the compositional predictor variables.
#' Rows represent observations and columns represent variables.
#' @param y A data.frame, matrix or vector of the response. In the case of a
#' data.frame or matrix, there should be one row for each observation, and
#' just a single column.
#' @param logRatioType A string indicating whether to use "balances" or "amalgamations".
#' Also accepts "balance", "B", "ILR", or "amalgam", "A", "SLR".
#' Note that the current implementation for balances is not strictly an ILR,
#' but rather just a collection of balances (which are possibly non-orthogonal
#' in the Aitchison sense).
#' @param objective A string indicating "binary classification" or "regression". By default,
#' it is NULL and gets inferred from the values in y.
#' @param lambda A numeric. Corresponds to the "lambda-SE" rule. Sets the "regularization strength"
#' used by the algorithm to decide how to harden the ratio. 
#' Larger numbers tend to yield fewer, more sparse ratios.
#' @param offset A numeric vector of the same length as y. Works similarly to the offset in a glm.
#' @param shrinkage A numeric. Shrinkage factor applied to each base learner.
#' Defaults to 1.0, i.e., no shrinkage applied.
#' @param maxBaseLearners An integer. The maximum number of log-ratios that the model will
#' learn before stopping.
#' Automatic stopping based on \code{lambda} may occur sooner.
#' @param optParams A list of named parameters for the optimization of the
#' continuous relaxation. Empty by default. User can override as few or as
#' many of our defaults as desired. Includes adaptiveLR (learning rate under
#' adaptive training scheme), momentum (in the gradient-descent sense),
#' epochs (number of gradient-descent epochs), batchSize (number of
#' observations per minibatch, by default the entire dataset),
#' and vanillaLR (the learning rate to be used if the user does *not* want
#' to use the 'adaptiveLR', to be used at the risk of optimization issues).
#' @param cvParams A list of named parameters for the "hardening" procedure
#' using cross-validation. Includes numFolds (number of folds, default=5) and
#' maxCutoffs (default=20). The candidate cutoff values of 'c' tested during
#' the CV process are the 2nd through maxCutoffs-th largest absolute soft
#' assignments, so, barring ties, a single log-ratio uses between 2 and
#' maxCutoffs input variables.
#' @param verbose A boolean. Toggles whether to display intermediate steps.
#' @param overlap A boolean. Toggles whether successive log-ratios found by 
#' CoDaCoRe may contain repeated input variables. TRUE by default.
#' Changing to FALSE implies that the log-ratios obtained by CoDaCoRe
#' will become orthogonal in the Aitchison sense, analogously to the
#' isometric-log-ratio transformation, while losing a small amount of
#' model flexibility.
#' @param fast A boolean. Whether to run in fast or slow mode. TRUE by
#' default. Running in slow mode will take ~x5 the computation time,
#' but may help identify slightly more accurate log-ratios.
#'
#' @return A \code{codacore} object.
#' 
#' @examples
#' \dontrun{
#' data("Crohn")
#' x <- Crohn[, -ncol(Crohn)]
#' y <- Crohn[, ncol(Crohn)]
#' x <- x + 1
#' model = codacore(x, y)
#' print(model)
#' plot(model)
#' }
#' 
#' @importFrom stats predict
#' 
#' @export
codacore <- function(
  x,
  y,
  logRatioType='balances',
  objective=NULL,
  lambda=1.0,
  offset=NULL,
  shrinkage=1.0,
  maxBaseLearners=5,
  optParams=list(),
  cvParams=list(),
  verbose=FALSE,
  overlap=TRUE,
  fast=TRUE
){
  
  # Convert x and y to the appropriate objects
  x = .prepx(x)
  y = .prepy(y)
  
  # Check whether we are in regression or classification mode by inspecting y
  if (is.null(objective)) {
    distinct_values = length(unique(y))
    if (distinct_values == 2) {
      objective = 'binary classification'
    } else if (inherits(y, 'factor')) {
      stop("Multi-class classification not yet implemented.")
    } else if (is.numeric(y)) {
      # is.numeric() also accepts integer responses, which
      # inherits(y, 'numeric') would reject
      objective = 'regression'
      if (distinct_values <= 10) {
        warning("Response only has ", distinct_values, " distinct values.")
        warning("Consider changing the objective function.")
      }
    } else {
      # Without this branch `objective` stays NULL and the membership test
      # below fails with an unhelpful "argument is of length zero" error.
      stop("Could not infer the objective from y; please supply `objective` explicitly.")
    }
  }
  
  # Make sure we recognize objective
  if (! objective %in% c('binary classification', 'regression')) {
    stop("Objective: ", objective, " not yet implemented.")
  }
  
  # Save names of labels if relevant
  if (objective == 'binary classification' && inherits(y, 'factor')) {
    yLevels = levels(y)
    y = as.numeric(y) - 1
  } else {
    yLevels = NULL
  }
  
  # In the regression case, standardize data and save scale
  if (objective == 'regression') {
    yMean = mean(y)
    yScale = stats::sd(y)
    y = (y - yMean) / yScale
  } else {
    yMean = NULL
    yScale = NULL
  }
  
  # Convert logRatioType to a unique label:
  if (logRatioType %in% c('amalgamations', 'amalgam', 'A', 'SLR')) {
    logRatioType='A'
  } else if (logRatioType %in% c('balances', 'balance', 'B', 'ILR')) {
    logRatioType='B'
  } else {
    stop('Invalid logRatioType argument given: ', logRatioType)
  }
  
  if (any(x == 0)) {
    if (logRatioType == 'A') {
      warning("The data contain zeros. An epsilon is used to prevent divide-by-zero errors.")
    } else if (logRatioType == 'B') {
      stop("The data contain zeros. Balances cannot be used in this case.")
    }
  }
  
  if (!overlap) {
    # We store away the original data, since we will override during
    # the stagewise-additive procedure, zeroing out the input variables
    # that get picked up by each log-ratio.
    xOriginal = x
  }
  
  if (nrow(x) > 10000) {
    warning("Large number of observations; codacore could benefit from minibatching.")
  }
  
  if (nrow(x) < 50) {
    warning("Small number of observations; proceed with care (the likelihood of unstable results may increase).")
  }
  
  # Set up optimization parameters
  optDefaults = list(
    epochs=100,
    batchSize=nrow(x),
    vanillaLR=NULL,
    adaptiveLR=0.5,
    momentum=0.9,
    epsilonA=1e-6,
    epsilonB=1e-2
    # initialization = 'zeros'
  )
  # Take the defaults and override with any user-specified params, if given
  for (param in names(optParams)) {
    if (param %in% names(optDefaults)) {
      optDefaults[param] = optParams[param]
    } else {
      stop('Unknown optimization parameter given: ', param)
    }
  }
  optParams = optDefaults
  
  # Check whether we are running in fast or slow mode
  if (!fast) {
    message("CoDaCoRe is running in slow mode. Switch to fast=TRUE for ~x5 speedup.")
    optParams$epochs = 1000
  }
  
  # Set up cross-validation parameters
  cvDefaults = list(
    maxCutoffs=20,
    numFolds=5
  )
  # Take the defaults and override with any user-specified params, if given
  for (param in names(cvParams)) {
    if (param %in% names(cvDefaults)) {
      cvDefaults[param] = cvParams[param]
    } else {
      stop('Unknown cross-validation parameter given: ', param)
    }
  }
  cvParams = cvDefaults
  
  
  ### Now we train codacore:
  # Initialize from an empty ensemble
  ensemble = list()
  if (is.null(offset)) {
    boostingOffset = y * 0.0
  } else {
    boostingOffset = offset
  }
  maxBaseLearners = maxBaseLearners / shrinkage
  # seq_len() gives an empty loop when the bound is 0, whereas
  # `1:maxBaseLearners` would iterate over c(1, 0). The 1e-10 mirrors the
  # fuzz the `:` operator applies to fractional end points, so the number of
  # iterations is unchanged for every positive bound.
  numIterations = floor(maxBaseLearners + 1e-10)
  for (i in seq_len(numIterations)) {
    startTime = Sys.time()
    cdbl = .CoDaBaseLearner(
      x=x,
      y=y,
      boostingOffset=boostingOffset,
      logRatioType=logRatioType,
      objective=objective,
      lambda=lambda,
      optParams=optParams,
      cvParams=cvParams,
      verbose=verbose
    )
    endTime = Sys.time()
    
    if (verbose) {
      cat('\n\n\nBase Learner', i)
      cat('\nLog-ratio indexes:')
      cat('\nNumerator =', which(cdbl$hard$numerator))
      cat('\nDenominator =', which(cdbl$hard$denominator))
      if (objective == 'binary classification') {
        cat('\nAccuracy:', cdbl$accuracy)
        cat('\nAUC:', cdbl$AUC)
      } else if (objective == 'regression') {
        cat('\nRMSE', cdbl$RMSE)
      }
      cat('\nTime taken:', endTime - startTime)
    }
    
    # If base learner is empty, we stop (no further gain in CV AUC):
    if (!any(cdbl$hard$numerator) && !any(cdbl$hard$denominator)) {break}
    
    # Add the new base learner to ensemble
    boostingOffset = boostingOffset + shrinkage * predict(cdbl, x)
    ensemble[[i]] = cdbl
    
    # If AUC is ~1, we stop (we separated the training data):
    # Note this won't always get caught by previous check since separability can lead to
    # numerical overflow which throws an error rather than finding an empty base learner
    if (cdbl$objective == 'binary classification' && cdbl$AUC > 0.999) {break}
    if (cdbl$objective == 'regression' && cdbl$Rsquared > 0.999) {break}
    
    # To avoid overlapping log-ratios, we "zero-out" the input variables that have
    # already been used
    if (!overlap) {
      x[, cdbl$hard$numerator] = min(x)
      x[, cdbl$hard$denominator] = min(x)
    }
  }
  
  if (!overlap) {
    # Replace the original data frame for saving in the object
    x = xOriginal
  }
  
  cdcr = list(
    ensemble=ensemble,
    x = x,
    y = y,
    objective=objective,
    logRatioType=logRatioType,
    lambda=lambda,
    shrinkage=shrinkage,
    maxBaseLearners=maxBaseLearners,
    optParams=optParams,
    cvParams=cvParams,
    overlap=overlap,
    yLevels=yLevels,
    yMean=yMean,
    yScale=yScale
  )
  class(cdcr) = "codacore"
  
  # If no log-ratios were found, suggest reducing regularization strength
  if (length(ensemble) == 0) {
    warning("No predictive log-ratios were found. Consider using lower values of lambda.")
  }
  
  return(cdcr)
}


#' predict
#'
#' @param object A codacore object.
#' @param newx A set of inputs to our model.
#' @param asLogits Whether to return outputs in logit space
#' (as opposed to probability space). Should always be set
#' to TRUE for regression with continuous outputs, but can
#' be toggled for classification problems.
#' @param numLogRatios How many predictive log-ratios to 
#' include in the prediction. By default, includes the
#' effects of all log-ratios that were obtained during
#' training.
#' Setting this parameter to an integer k will
#' restrict to using only the top k log-ratios in the model.
#' @param ... Not used.
#'
#' @return A numeric vector with one prediction per row of \code{newx}.
#' For binary classification these are logits, or probabilities when
#' \code{asLogits = FALSE}. For regression the summed output is multiplied
#' by the stored \code{yScale} and shifted by the stored \code{yMean}.
#'
#' @export
predict.codacore = function(object, newx, asLogits=TRUE, numLogRatios=NA, ...) {
  # Zeros only trigger a warning for amalgamations ('A') but are an error
  # for balances ('B').
  if (any(newx == 0)) {
    if (object$logRatioType == 'A') {
      warning("The data contain zeros. An epsilon is used to prevent divide-by-zero errors.")
    } else if (object$logRatioType == 'B') {
      stop("The data contain zeros. Balances cannot be used in this case.")
    }
  }

  x = .prepx(newx)
  yHat = rep(0, nrow(x))

  numAvailable = length(object$ensemble)
  if (is.na(numLogRatios)) {
    numLogRatios = numAvailable
  }
  if (numLogRatios > numAvailable) {
    stop("numLogRatios exceeds the number of log-ratios in the model: ", numAvailable)
  }

  # seq_len() instead of 1:n: a model with an empty ensemble must contribute
  # nothing, whereas 1:0 would iterate over c(1, 0) and index out of bounds.
  for (i in seq_len(numLogRatios)) {
    cdbl = object$ensemble[[i]]
    yHat = yHat + object$shrinkage * predict(cdbl, x)
  }

  if (object$objective == 'binary classification') {
    if (asLogits) {
      return(yHat)
    } else {
      return(1 / (1 + exp(-yHat)))
    }
  } else if (object$objective == 'regression') {
    return(yHat * object$yScale + object$yMean)
  }
}


#' print
#'
#' Prints, for each learned log-ratio, the parts in its numerator and
#' denominator together with its slope and its AUC (classification) or
#' R squared (regression).
#'
#' @param x A codacore object.
#' @param ... Not used.
#'
#' @return The input \code{x}, invisibly.
#'
#' @export
print.codacore = function(x, ...) {
  # TODO: Make this into a table to print all at once
  cat("\nNumber of log-ratios found:", length(x$ensemble))
  for (i in seq_along(x$ensemble)) {
    cat("\n***")
    cat("\nLog-ratio rank", i)
    cdbl = x$ensemble[[i]]
    numerator = which(cdbl$hard$numerator)
    denominator = which(cdbl$hard$denominator)
    # The parts are columns of the input, so the decision between printing
    # names and printing indexes must look at the column names.
    partNames = colnames(cdbl$x)
    if (is.null(partNames)) {
      cat("\nNumerator:", numerator)
      cat("\nDenominator:", denominator)
    } else {
      cat("\nNumerator:", partNames[numerator])
      cat("\nDenominator:", partNames[denominator])
    }
    # cat("\nIntercept:", cdbl$intercept)
    if (cdbl$objective == 'binary classification') {
      cat("\nAUC:", cdbl$AUC)
      cat("\nSlope:", cdbl$slope)
    } else if (cdbl$objective == 'regression') {
      cat("\nR squared:", cdbl$Rsquared)
      cat("\nSlope:", cdbl$slope * x$yScale)
    }
  }
  cat("\n") # one final new line at end to finish print block
  invisible(x)
}


#' plot
#'
#' Plots a summary of a fitted codacore model.
#' Credit to the authors of the selbal package (Rivera-Pinto et al., 2018),
#' from whose package these plots were inspired.
#'
#' For regression, the response is plotted against the selected log-ratio;
#' for binary classification, a boxplot of the log-ratio per class is drawn.
#'
#' @param x A codacore object.
#' @param index The index of the log-ratio to plot.
#' @param ... Not used.
#'
#' @export
plot.codacore = function(x, index = 1, ...) {

  # Check against the ensemble itself so that a model without any
  # log-ratios produces this error rather than failing further down.
  if (index > length(x$ensemble)) {
    stop("The selected log-ratio does not exist!")
  }

  allRatios = getLogRatios(x)
  logRatio = allRatios[, index]

  if (x$objective == 'regression') {

    graphics::plot(logRatio, x$y, xlab='Log-ratio score', ylab='Response')
    # Draw the line of the base learner that is being plotted.
    # NOTE(review): print.codacore rescales regression slopes by yScale;
    # confirm that intercept/slope are on the same scale as x$y here.
    fit = x$ensemble[[index]]
    graphics::abline(fit$intercept, fit$slope, lwd=2)

  } else if (x$objective == 'binary classification') {

    # Fall back to the stored 0/1 response when no original labels exist
    y = x$y
    # Convert 0/1 binary output to the original labels, if any
    if (!is.null(x$yLevels)) {
      y = x$yLevels[x$y + 1]
    }

    graphics::boxplot(
      logRatio ~ y,
      col=c('orange','lightblue'),
      main=paste0('Distribution of log-ratio ', index),
      xlab='Log-ratio score',
      ylab='Outcome',
      horizontal=TRUE
    )

  }
}


#' plotROC
#'
#' Draws the ROC curve of the first log-ratio and overlays the curves of
#' up to the first five log-ratios, with a legend giving their AUC.
#'
#' @param cdcr A codacore object.
#'
#' @export
plotROC = function(cdcr) {

  if (cdcr$objective != 'binary classification') {
    stop("ROC curves are only defined for binary classification")
  }
  numBL = length(cdcr$ensemble)
  if (numBL == 0) {
    stop("No log-ratios were found, so there are no ROC curves to plot")
  }

  cols = c("black", "gray50", "gray70", "gray80", "gray90")
  lwds = c(2.0, 1.5, 1.2, 0.8, 0.6)
  oldPar <- graphics::par(no.readonly = TRUE)
  # make sure to restore params even if there's an error
  on.exit(graphics::par(oldPar), add = TRUE)
  graphics::par(pty = 's')
  graphics::plot(cdcr$ensemble[[1]]$ROC)

  # Only as many curves as there are styles (five) are drawn
  shown = seq_len(min(length(cols), numBL))
  legendText = character(length(shown))
  for (i in shown) {
    cdbl = cdcr$ensemble[[i]]
    graphics::lines(cdbl$ROC$specificities, cdbl$ROC$sensitivities, col=cols[i], lwd=lwds[i])
    legendText[i] = paste0("Log-ratio: ", i, ", AUC: ", round(cdbl$AUC, 2))
  }
  graphics::legend(
    "bottomright",
    rev(legendText),
    lty=1,
    col=rev(cols[shown]),
    lwd=rev(lwds[shown]) + 0.5
  )
}


# Helper functions below...


#' activeInputs
#'
#' @param cdcr A codacore object.
#'
#' @return The covariates included in the log-ratios, as sorted unique
#' column indexes (an empty integer vector if no log-ratio was found).
#'
#' @export
activeInputs.codacore = function(cdcr) {

  vars = unlist(
    lapply(cdcr$ensemble, function(cdbl) {
      c(which(cdbl$hard$numerator), which(cdbl$hard$denominator))
    }),
    use.names = FALSE
  )

  if (is.null(vars)) {
    return(integer(0))
  }
  return(sort(unique(vars)))
}


#' getNumeratorParts
#'
#' @param cdcr A codacore object.
#' @param baseLearnerIndex An integer indicating which of the
#' (possibly multiple) log-ratios learned by codacore to be used.
#' @param boolean Whether to return the parts in boolean form
#' (a vector of TRUE/FALSE) or to return the column names of
#' those parts directly.
#'
#' @return The covariates in the numerator of the selected log-ratio.
#'
#' @export
getNumeratorParts <- function(cdcr, baseLearnerIndex=1, boolean=TRUE){

  parts = cdcr$ensemble[[baseLearnerIndex]]$hard$numerator

  if (boolean) {
    return(parts)
  } else {
    # Names are looked up on the input stored in the model
    return(colnames(cdcr$x)[parts])
  }
}

#' getDenominatorParts
#'
#' @param cdcr A codacore object.
#' @param baseLearnerIndex An integer indicating which of the
#' (possibly multiple) log-ratios learned by codacore to be used.
#' @param boolean Whether to return the parts in boolean form
#' (a vector of TRUE/FALSE) or to return the column names of
#' those parts directly.
#'
#' @return The covariates in the denominator of the selected log-ratio.
#'
#' @export
getDenominatorParts <- function(cdcr, baseLearnerIndex=1, boolean=TRUE){

  parts = cdcr$ensemble[[baseLearnerIndex]]$hard$denominator

  if (boolean) {
    return(parts)
  } else {
    # Names are looked up on the input stored in the model
    return(colnames(cdcr$x)[parts])
  }
}

#' getLogRatios
#'
#' @param cdcr A codacore object
#' @param x A set of (possibly unseen) compositional data.
#' The covariates must be passed in the same order as
#' for the original codacore() call. Defaults to the input
#' stored in the model.
#'
#' @return The learned log-ratio features, computed on input x:
#' a matrix with one row per sample and one column per log-ratio
#' (zero columns if the model contains no log-ratios).
#'
#' @export
getLogRatios <- function(cdcr, x=NULL){

  if (is.null(x)) {
    x = cdcr$x
  }

  # With no base learners there is nothing to bind; return an empty matrix
  # instead of failing on 1:ncol(NULL) further down.
  if (length(cdcr$ensemble) == 0) {
    return(matrix(numeric(0), nrow = nrow(x), ncol = 0))
  }

  # Note: x is used as given; no normalization is applied in this function.
  if (cdcr$logRatioType == 'A') {
    # Amalgamations: log of the ratio of summed parts, with epsilonA added
    # to both sums.
    epsilonA = cdcr$optParams$epsilonA
    ratios <- lapply(cdcr$ensemble, function(a){
      num <- rowSums(x[, a$hard$numerator, drop=FALSE]) + epsilonA
      den <- rowSums(x[, a$hard$denominator, drop=FALSE]) + epsilonA
      log(num/den)
    })
  } else if (cdcr$logRatioType == 'B') {
    # Balances: difference of the mean log of the parts on each side
    ratios <- lapply(cdcr$ensemble, function(a){
      num <- rowMeans(log(x[, a$hard$numerator, drop=FALSE]))
      den <- rowMeans(log(x[, a$hard$denominator, drop=FALSE]))
      num - den
    })
  } else {
    stop("Unknown logRatioType: ", cdcr$logRatioType)
  }

  out <- do.call("cbind", ratios)
  colnames(out) <- paste0("log-ratio", seq_len(ncol(out)))
  return(out)
}


#' getSlopes
#'
#' @param cdcr A codacore object
#'
#' @return The slopes (i.e., regression coefficients) for each log-ratio.
#'
#' @export
getSlopes <- function(cdcr){
  # unlist() over an empty ensemble yields NULL, as the original c() did
  return(unlist(lapply(cdcr$ensemble, function(cdbl) cdbl$slope)))
}


#' getNumLogRatios
#'
#' @param cdcr A codacore object
#'
#' @return The number of log-ratios that codacore found.
#' Typically a small integer.
#' Can be zero if codacore
#' found no predictive log-ratios in the data.
#'
#' @export
getNumLogRatios <- function(cdcr){
  length(cdcr$ensemble)
}


#' getTidyTable
#'
#' Builds a long-format table with one row per part of each log-ratio,
#' recording the side it is on and the index of the log-ratio.
#'
#' @param cdcr A codacore object
#'
#' @return A table displaying the log-ratios found, or NULL when the
#' model contains no log-ratios.
#'
#' @export
getTidyTable <- function(cdcr){

  numRatios <- getNumLogRatios(cdcr)

  # Nothing to tabulate for a model without log-ratios
  if (numRatios == 0) {
    return(NULL)
  }

  perRatio <- lapply(1:numRatios, function(ratioIndex) {
    numeratorRows <- data.frame(
      Side = 'Numerator',
      Name = getNumeratorParts(cdcr, ratioIndex, FALSE)
    )
    denominatorRows <- data.frame(
      Side = 'Denominator',
      Name = getDenominatorParts(cdcr, ratioIndex, FALSE)
    )
    tbl <- rbind(numeratorRows, denominatorRows)
    tbl$logRatioIndex <- ratioIndex
    tbl
  })

  do.call(rbind, perRatio)
}

#' getBinaryPartitions
#'
#' @param cdcr A codacore object
#'
#' @return A matrix describing whether each component (as rows) is found in the
#' numerator (1) or denominator (-1) of each learned log-ratio (as columns).
#' This format resembles a serial binary partition matrix frequently used
#' in balance analysis.
#'
#' @export
getBinaryPartitions <- function(cdcr){

  numBaseLearners <- length(cdcr$ensemble)
  # Preallocate one slot per base learner; list(n) would instead create a
  # one-element list holding the number n.
  res <- vector("list", numBaseLearners)
  # seq_len() so that a model without log-ratios skips the loop (1:0 would
  # iterate over c(1, 0)); the result is then NULL.
  for(baseLearner in seq_len(numBaseLearners)){
    thisNumerator <- getNumeratorParts(cdcr, baseLearner)
    thisDenominator <- getDenominatorParts(cdcr, baseLearner)
    # TRUE/FALSE memberships become +1 (numerator), -1 (denominator), 0
    res[[baseLearner]] <- thisNumerator*1 + thisDenominator*-1
  }
  do.call("cbind", res)
}

# Converts the input to a numeric matrix whose rows each sum to one.
.prepx = function(x) {
  # inherits() also covers tibbles and other data.frame subclasses
  if (inherits(x, 'data.frame')) {x = as.matrix(as.data.frame(x))}
  if (is.integer(x)) {x = x * 1.0}

  # If the data is un-normalized (e.g. raw counts),
  # we normalize it to ensure our learning rate is well calibrated
  x = x / rowSums(x)
  return(x)
}

# Reduces the response to a plain vector: one-column data.frames, tibbles
# and matrices are flattened, and character responses become factors.
# Stops if the response has more than one column.
.prepy = function(y) {
  tooWide = "Response should be 1-dimensional (if given
                 as a data.frame or matrix, it should have a
                 row for each sample, and a single column)."
  if (inherits(y, 'tbl_df')) {
    y = as.data.frame(y)
  }
  if (inherits(y, 'data.frame')) {
    if (ncol(y) > 1) {
      stop(tooWide)
    }
    y = y[[1]]
  }
  if (inherits(y, 'matrix')) {
    if (ncol(y) > 1) {
      stop(tooWide)
    }
    # Test the storage type: inherits(y, 'character') and
    # inherits(y, 'numeric') are never TRUE for a matrix, so the
    # one-column matrix was previously returned unflattened.
    if (is.character(y)) {
      y = as.character(y)
    } else if (is.numeric(y)) {
      y = as.numeric(y)
    }
  }
  if (inherits(y, 'character')) {
    y = factor(y)
  }
  return(y)
}

-------------------------------------------------------------------------------- /R/data.R: -------------------------------------------------------------------------------- 1 | #' Microbiome, HIV infection and MSM factor 2 | #' 3 | #' A dataset containing the number of counts of 60 different genera in
a group 4 | #' of 155 samples (including HIV - infected and non - infected patients). 5 | #' The \code{data.frame} is composed by 60 genera and two variables. 6 | #' 7 | #' @format The \code{data.frame} is composed by 60 genera and 2 variables 8 | #' \describe{ 9 | #' \item{genera}{The first 60 columns, from \emph{g_Prevotella} until 10 | #' \emph{o_NB1-n_g_unclassified} referred to different genera.} 11 | #' \item{MSM}{a factor determining if the individual is \code{MSM} (\emph{Men Sex with 12 | #' Men}) or not (\code{nonMSM}).} 13 | #' \item{HIV_Status}{a factor specifying if the individual is infected 14 | #' (\code{Pos}) or not (\code{Neg}).} 15 | #' 16 | #' } 17 | #' @docType data 18 | #' @name HIV 19 | #' @references \url{https://pubmed.ncbi.nlm.nih.gov/27077120/} 20 | #' @keywords data 21 | NULL 22 | 23 | 24 | #' Microbiome and sCD14 inflammation parameter 25 | #' 26 | #' A dataset containing the number of counts of 60 different genera in a group 27 | #' of 151 samples (including HIV - infected and non - infected patients). 28 | #' The \code{data.frame} is composed by 60 genera and a numeric variable 29 | #' 30 | #' @format The \code{data.frame} is composed by 60 genera and a variable 31 | #' \describe{ 32 | #' \item{genera}{The first 60 columns, from \emph{g_Prevotella} until 33 | #' \emph{o_NB1-n_g_unclassified} referred to different genera.} 34 | #' \item{sCD14}{a \code{numeric} variable with the value of the inflammation 35 | #' parameter sCD14 for each sample.} 36 | #' } 37 | #' @name sCD14 38 | #' @docType data 39 | #' @references \doi{10.1016/j.ebiom.2016.01.032} 40 | #' @keywords data 41 | NULL 42 | 43 | 44 | 45 | #' Microbiome composition related to Crohn`s disease study 46 | #' 47 | #' A dataset containing the number of counts of 48 different genera in a group 48 | #' of 975 samples (including 662 samples of patients with Crohn`s disease and 49 | #' 313 controls). 
50 | #' The \code{data.frame} is composed of 48 genera and a factor variable 51 | #' 52 | #' @format The \code{data.frame} is composed of 48 genera and a variable 53 | #' \describe{ 54 | #' \item{genera}{The first 48 columns, from \emph{g_Turicibacter} until 55 | #' \emph{g_Bilophila} refer to different genera.} 56 | #' \item{y}{a \code{factor} indicating if the sample corresponds to a case ( 57 | #' \emph{CD}) or a control (\emph{no}).} 58 | #' } 59 | #' @name Crohn 60 | #' @docType data 61 | #' @references \url{https://qiita.ucsd.edu/} 62 | #' @keywords data 63 | NULL 64 | -------------------------------------------------------------------------------- /R/simulations.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | #' simulateHTS 5 | #' 6 | #' This function simulates a set of (x, y) pairs. 7 | #' The covariates x are compositional, meaning they only 8 | #' carry relative information. 9 | #' The response y is either a binary indicator or a continuous value, depending on \code{outputType}. 10 | #' The rule linking x and y can be a simple log-ratio, a balance, or an amalgamation. 11 | #' 12 | #' @param n Number of observations. 13 | #' @param p Number of covariates. 14 | #' @param outputType A string indicating 'binary' or 'continuous'. 15 | #' @param logratio A string indicating 'simple', 'balance', or 16 | #' 'amalgamation'. 
#'
#' @return A list with two elements: \code{x}, a data.frame of simulated
#' counts with n rows and p columns, and \code{y}, a one-column data.frame
#' holding the simulated response.
#'
#' @export
simulateHTS = function(n, p, outputType = 'binary', logratio = 'simple'){

  # Validate the arguments before simulating anything, so that a bad call
  # fails immediately. Each rule needs a minimum number of covariates.
  minDims = c(simple = 2, balance = 10, amalgamation = 20)
  if (!(logratio %in% names(minDims))) {
    stop("Variable logratio incorrectly specified.")
  }
  if (p < minDims[[logratio]]) {
    stop("Input dimension must be >= ", minDims[[logratio]])
  }
  if (!(outputType %in% c('binary', 'continuous'))) {
    stop("Argument outputType:", outputType, ", not recognized")
  }

  # Simulate independent variables
  alpha0 = rep(1.0, p) / log(p)
  alpha = gtools::rdirichlet(1, alpha0)
  alpha = sort(alpha, decreasing=TRUE)
  X = matrix(0.0, n, p)
  P = matrix(0.0, n, p)
  numCounts = stats::rpois(n, 10 * p)
  for (i in seq_len(n)) {
    # Per-sample class probabilities, then multinomial counts drawn from them
    classProb = gtools::rdirichlet(1, alpha)
    X[i,] = stats::rmultinom(1, numCounts[i], classProb)
    P[i,] = classProb
  }

  # Simulate dependent variable from the underlying probabilities P.
  # drop=FALSE keeps P a matrix when n == 1.
  if (logratio == 'simple') {
    eta = log(P[, 1]) - log(P[, 2])
  } else if (logratio == 'balance') {
    eta = rowMeans(log(P[, c(4, 6), drop=FALSE])) - log(P[, 5])
  } else {
    eta = log(rowSums(P[, c(1,2,6,7,15), drop=FALSE])) -
      log(rowSums(P[, c(3,8,16,17), drop=FALSE]))
  }

  if (outputType == 'binary') {
    # Centre eta before applying the logistic function
    outProb = 1 / (1 + exp(-(eta - mean(eta))))
    y = stats::rbinom(n, 1, outProb)
  } else {
    y = stats::rnorm(n, eta)
  }

  return(list(x=data.frame(X), y=data.frame(y)))
}

-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![CRAN_Status_Badge](https://cranlogs.r-pkg.org:443/badges/codacore)](https://cranlogs.r-pkg.org:443/badges/codacore)
[![Downloads](https://cranlogs.r-pkg.org:443/badges/codacore)](https://cranlogs.r-pkg.org:443/badges/codacore) 3 | [![Total Downloads](https://cranlogs.r-pkg.org:443/badges/grand-total/codacore)](https://cranlogs.r-pkg.org:443/badges/grand-total/codacore) 4 | 5 | # codacore 6 | 7 | *Update: [CoDaCoRe is now live on CRAN](https://CRAN.R-project.org/package=codacore/)* 8 | 9 | A self-contained, up-to-date implementation of [CoDaCoRe](https://doi.org/10.1093/bioinformatics/btab645), in the R programming language, by the original authors. 10 | 11 | The [CoDaCoRe guide](https://egr95.github.io/R-codacore/inst/misc/guide.html) contains a detailed tutorial on installation, usage and functionality. 12 | 13 | Note this repository is under active development. If you would like to use CoDaCoRe on your dataset, and have any questions regarding the installation, usage, implementation, or model itself, do not hesitate to contact . Some previously asked questions are available on the [Issues page](https://github.com/egr95/R-codacore/issues). 14 | Contributions, fixes, and feature requests are also welcome - please create an issue, submit a pull request, or email me. 15 | 16 | ## Quick-start: how to install and run CoDaCoRe 17 | 18 | 1. We can install CoDaCoRe by running (further details in the [guide](https://egr95.github.io/R-codacore/inst/misc/guide.html#installation)): 19 | 20 | ```r 21 | install.packages('codacore') 22 | ``` 23 | 24 | 2. 
To fit codacore on some data and check the results (further details in the [guide](https://egr95.github.io/R-codacore/inst/misc/guide.html#training-the-model)): 25 | ```r 26 | library("codacore") 27 | help(codacore) # if in doubt, check documentation 28 | data("Crohn") # load some data and apply codacore 29 | x <- Crohn[, -ncol(Crohn)] + 1 30 | y <- Crohn[, ncol(Crohn)] 31 | model = codacore( 32 | x, # compositional input, e.g., HTS count data 33 | y, # response variable, typically a 0/1 binary indicator 34 | logRatioType = "balances", # can use "amalgamations" instead, or abbreviations "B" and "A" 35 | lambda = 1 # regularization strength (default corresponds to 1SE rule) 36 | ) 37 | print(model) 38 | plot(model) 39 | ``` 40 | 41 | ## Reference 42 | 43 | Gordon-Rodriguez, Elliott, Thomas P. Quinn, and John P. Cunningham. "Learning sparse log-ratios for high-throughput sequencing data." Bioinformatics 38.1 (2022): 157-163. [[link](https://doi.org/10.1093/bioinformatics/btab645)] 44 | 45 | Quinn, Thomas P., Elliott Gordon-Rodriguez, and Ionas Erb. "A critique of differential abundance analysis, and advocacy for an alternative." arXiv preprint arXiv:2104.07266 (2021). [[link](https://arxiv.org/abs/2104.07266)] 46 | 47 | ## Acknowledgements 48 | Thanks for your contributions to codacore! 49 | 50 | - Marcus Fedarko 51 | - Gregor Seyer 52 | - Nick Youngblut 53 | - Antonio Garrido Fernandez 54 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## Test environments 2 | - local ubuntu 14.04, R 4.0.4 3 | - local ubuntu 16.04, R 4.0.4 4 | - win-builder (devel and release) 5 | 6 | ## R CMD check results 7 | There were no ERRORs or WARNINGs. 8 | 9 | There was 1 NOTE: 10 | 11 | * checking dependencies in R code ... NOTE 12 | Namespace in Imports field not imported from: 'R6' 13 | 14 | This is the initial submission of codacore . 
15 | 16 | ## Downstream dependencies 17 | There are currently no downstream dependencies for this package. -------------------------------------------------------------------------------- /data/Crohn.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/egr95/R-codacore/b87df7921a7fabdb355990e6fc439ff80d30c67d/data/Crohn.rda -------------------------------------------------------------------------------- /data/HIV.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/egr95/R-codacore/b87df7921a7fabdb355990e6fc439ff80d30c67d/data/HIV.rda -------------------------------------------------------------------------------- /data/sCD14.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/egr95/R-codacore/b87df7921a7fabdb355990e6fc439ff80d30c67d/data/sCD14.rda -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | citHeader("To cite codacore in publications use:") 2 | 3 | citEntry(entry = "Article", 4 | title = paste("Learning Sparse Log-Ratios For High-Throughput Sequencing Data"), 5 | author = personList(as.person("Elliott Gordon-Rodriguez"), 6 | as.person("Thomas P. Quinn"), 7 | as.person("John P. 
Cunningham")), 8 | journal = "Bioinformatics", 9 | year = "2021", 10 | 11 | textVersion = 12 | paste("Elliott Gordon-Rodriguez, Thomas P Quinn, John P Cunningham,", 13 | "Learning sparse log-ratios for high-throughput sequencing data,", 14 | "Bioinformatics, 2021;, btab645, doi:10.1093/bioinformatics/btab645") 15 | ) 16 | -------------------------------------------------------------------------------- /inst/misc/guide.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "CoDaCoRe Guide" 3 | author: 4 | - Elliott Gordon Rodriguez^[eg2912@columbia.edu] 5 | date: "`r Sys.Date()`" 6 | output: 7 | rmarkdown::html_vignette: 8 | number_sections: yes 9 | toc: true 10 | toc_depth: 3 11 | pdf_document: 12 | number_sections: no 13 | toc: false 14 | keep_tex: true 15 | md_document: 16 | number_sections: yes 17 | toc: true 18 | toc_depth: 3 19 | vignette: > 20 | %\VignetteIndexEntry{CoDaCoRe Guide} 21 | %\VignetteEncoding{UTF-8} 22 | %\VignetteEngine{knitr::rmarkdown} 23 | --- 24 | 25 | ```{r, include = FALSE} 26 | knitr::opts_chunk$set( 27 | collapse = TRUE, 28 | comment = "#>", 29 | fig.width = 7, 30 | fig.height = 5, 31 | tidy.opts=list(width.cutoff=40), tidy=TRUE 32 | ) 33 | ``` 34 | 35 | # Installation 36 | 37 | You can install ```codacore``` by running: 38 | ```r 39 | install.packages("codacore") 40 | ``` 41 | 42 | You may instead install the [development version](https://github.com/egr95/R-codacore) directly from Github, using the [devtools package](https://www.r-project.org/nosvn/pandoc/devtools.html). 43 | ```r 44 | devtools::install_github("egr95/R-codacore", ref="main") 45 | ``` 46 | 47 | Note that CoDaCoRe requires a working installation of [TensorFlow](https://tensorflow.rstudio.com/). 
48 | If you do not have Tensorflow previously installed, when you run ```codacore()``` for the first time you will likely encounter an error message of the form: 49 | ```r 50 | > codacore(x, y) 51 | 52 | ERROR: Could not find a version that satisfies the requirement tensorflow 53 | ERROR: No matching distribution found for tensorflow 54 | Error: Installation of TensorFlow not found. 55 | 56 | Python environments searched for 'tensorflow' package: 57 | /moto/stats/users/eg2912/miniconda3/envs/r-test/bin/python3.9 58 | /usr/bin/python2.7 59 | 60 | You can install TensorFlow using the install_tensorflow() function. 61 | ``` 62 | 63 | This can be fixed simply by [installing tensorflow](https://tensorflow.rstudio.com/install/), as follows: 64 | ```r 65 | install.packages("tensorflow") 66 | library("tensorflow") 67 | install_tensorflow() 68 | 69 | install.packages("keras") 70 | library("keras") 71 | install_keras() 72 | ``` 73 | 74 | Note also that you may have to restart your R session between installation of ```codacore```, ```tensorflow```, and ```keras```. 75 | 76 | 77 | # Summary of method 78 | 79 | CoDaCoRe is an algorithm to identify predictive log-ratio biomarkers in high-throughput sequencing data. Let $x$ denote HTS input (e.g., $x_{i,j}$ denotes the abundance of the $j$th bacteria in the $i$th subject), and let $y$ denote the outcome of interest (e.g., $y_i$ is equal to 0 or 1 depending on whether the $i$th subject belonged to the case or the control group). Given a set of $(x_i, y_i)$ pairs, CoDaCoRe identifies predictive biomarkers of the form: 80 | $$ 81 | B(x_i; J^+, J^-) = \log \left( \frac{\sum_{j \in J^+} x_{i,j}}{\sum_{j \in J^-} x_{i,j}} \right), 82 | $$ 83 | that are maximally associated with the response variable $y_i$. In other words, CoDaCoRe identifies a numerator set $J^+$ and a denominator set $J^-$, such that their log-ratio is most predictive of the response variable. 
By default, CoDaCoRe uses *balances*, which are defined as the log-ratio of *geometric means* (as opposed to summations): 84 | $$ 85 | B(x_i; J^+, J^-) = \log \left( \frac{(\prod_{j \in J^+} x_{i,j})^{1/|J^+|}}{(\prod_{j \in J^-} x_{i,j})^{1/|J^-|}} \right). 86 | $$ 87 | 88 | For an introduction to balances, we refer the reader to the [selbal paper](https://doi.org/10.1101/219386), and for a more detailed treatment of CoDaCoRe and other log-ratio methodology, we refer the reader to the [codacore paper](https://doi.org/10.1093/bioinformatics/btab645) and [this paper](https://arxiv.org/abs/2104.07266). 89 | 90 | 91 | # Training the model 92 | 93 | We assume a working installation of `codacore` ([link](https://github.com/egr95/R-codacore/blob/main/README.md)). 94 | ```{r} 95 | library("codacore") 96 | help(codacore) 97 | ``` 98 | 99 | In this tutorial, we will showcase `codacore` using three datasets that were also analyzed by the authors of `selbal` [(Rivera-Pinto et al., 2018)](https://doi.org/10.1101/219386). First, we consider the Crohn's disease data from [(Gevers et al., 2014)](http://dx.doi.org/10.1016/j.chom.2014.02.005). 100 | ```{r} 101 | data("Crohn") 102 | x <- Crohn[, -ncol(Crohn)] 103 | y <- Crohn[, ncol(Crohn)] 104 | ``` 105 | 106 | Our goal is to identify ratio-based biomarkers that are predictive of disease status. Our input variable consists of the abundance of 48 microbial species in 975 samples. *As is common in most machine learning libraries, our package expects an input of shape (n, p), with a row for each sample and a column for each variable.* 107 | ```{r} 108 | dim(x) 109 | ``` 110 | 111 | The output variable is a binary indicator (CD stands for Crohn's disease). 112 | ```{r} 113 | table(y) 114 | ``` 115 | 116 | Prior to fitting CoDaCoRe, we must impute any zeros in our input variable (a standard pre-processing step for ratio-based methods). 
117 | ```{r} 118 | x <- x + 1 119 | ``` 120 | 121 | Next, we split our data into a training and a test set (to keep things simple we do this naively at random, though in practice one might consider stratified sampling and class rebalancing). 122 | ```{r} 123 | # For reproducibility, we set a random seed (including in TensorFlow, used by codacore) 124 | set.seed(0); library(tensorflow); tf$random$set_seed(0) 125 | trainIndex <- sample(1:nrow(x), 0.8 * nrow(x)) 126 | xTrain <- x[trainIndex,] 127 | yTrain <- y[trainIndex] 128 | ``` 129 | 130 | We are ready to fit CoDaCoRe. We stick to the default parameters for now. Notice the fast runtime (as compared to, for example, `selbal.cv`). 131 | ```{r} 132 | model <- codacore( 133 | xTrain, 134 | yTrain, 135 | logRatioType = 'balances', # can also use 'amalgamations' 136 | lambda = 1 # regularization parameter (1 corresponds to "1SE rule") 137 | ) 138 | ``` 139 | 140 | # Visualizing results 141 | 142 | Next we can check the learned output of the model: what inputs were included in the learned log-ratios, how strongly associated they are to the response, and how well they classified the data. 143 | ```{r} 144 | print(model) 145 | ``` 146 | 147 | The most predictive ratio identified by CoDaCoRe is Roseburia / Dialister, which can be visualized with the `plot` function. 148 | ```{r} 149 | plot(model) 150 | ``` 151 | 152 | Note that CoDaCoRe is an ensemble model, where multiple log-ratios are learned sequentially in decreasing order of importance (with automatic stopping whenever no additional log-ratio improved the loss function during training). We can visualize the performance of this ensembling procedure by "stacking" the respective ROC curves. 153 | ```{r} 154 | plotROC(model) 155 | ``` 156 | 157 | # Predicting on new data 158 | 159 | We can also use our trained model to classify new samples. 
160 | ```{r} 161 | xTest <- x[-trainIndex,] 162 | yTest <- y[-trainIndex] 163 | yHat <- predict(model, xTest, asLogits=FALSE) 164 | cat("Test set AUC =", pROC::auc(pROC::roc(yTest, yHat, quiet=T))) 165 | # Convert probabilities into a binary class 166 | failure <- yHat < 0.5 167 | success <- yHat >= 0.5 168 | yHat[failure] <- levels(y)[1] 169 | yHat[success] <- levels(y)[2] 170 | cat("Classification accuracy on test set =", round(mean(yHat == yTest), 2)) 171 | ``` 172 | 173 | Note our `predict` function can be restricted to only use the top _k_ log-ratios in the model for prediction. 174 | For example, the following will compute the AUC of a 1-log-ratio model, using only the top log-ratio. 175 | ```{r} 176 | yHat <- predict(model, xTest, asLogits=FALSE, numLogRatios=1) 177 | cat("Test set AUC =", pROC::auc(pROC::roc(yTest, yHat, quiet=T))) 178 | ``` 179 | 180 | Other useful functions include: 181 | ```{r, results=F} 182 | getNumeratorParts(model, 1) 183 | getDenominatorParts(model, 1) 184 | getLogRatios(model, xTest) 185 | getNumLogRatios(model) 186 | getTidyTable(model) 187 | getSlopes(model) 188 | ``` 189 | 190 | # Controlling overlap between log-ratios 191 | 192 | By default, CoDaCoRe allows for "overlapping log-ratios", in other words, an input variable that is included in the first log-ratio may well be included in a second or third log-ratio provided it is sufficiently predictive. However, the user may choose to restrict each successive log-ratio to be constructed from a mutually exclusive set of input variables (e.g., to obtain _orthogonal balances_, in the Aitchison sense). This can be specified with the parameter `overlap`. In our example, note how `g__Dialister` is no longer repeated. 
193 | 194 | ```{r} 195 | model <- codacore(xTrain, yTrain, overlap=F) 196 | print(model) 197 | ``` 198 | 199 | # Using amalgamations (summed-log-ratios) 200 | 201 | CoDaCoRe can be used to learn log-ratios between either geometric means (known as "balances" or "isometric-log-ratio") or summations (known as "amalgamations" or "summed-log-ratio"), depending on the goals of the user. This can be specified with the parameter `logRatioType`. 202 | ```{r} 203 | model <- codacore(xTrain, yTrain, logRatioType = "amalgamations") 204 | print(model) 205 | ``` 206 | 207 | Note that amalgamations/summed-log-ratios are less sensitive to covariates that are small in magnitude (e.g., rare microbes), which can hinder their predictive strength for datasets where small covariates are important. On the other hand, summed-log-ratios have a different interpretation than isometric-log-ratios and may therefore be preferable in some applications (e.g., when the "summed" effect of an aggregated sub-population is the object of interest). In our Crohn's disease data, the rare species Roseburia gets picked up by the isometric-log-ratio, but not by the summed-log-ratio, which is more sensitive to more common bacteria species such as Faecalibacterium. 208 | 209 | # Continuous outcomes 210 | 211 | We consider the HIV data from [(Noguera-Julian et al., 2016)](http://dx.doi.org/10.1016/j.ebiom.2016.01.032). The goal here is to construct a log-ratio of the microbial abundances that is predictive of the inflammation marker "sCD14", a continuous response variable. CoDaCoRe can be applied much in the same way, except the loss function changes from binary cross-entropy to mean-squared-error. This change will happen automatically based on the values inputted as `y` (although it can also be overridden manually via the ```objective``` parameter, for example, if the user wanted to fit a binary response using the mean-squared-error, they could specify ```objective = 'regression'```). 
212 | 213 | ```{r} 214 | data("sCD14") 215 | x <- sCD14[, -ncol(sCD14)] 216 | y <- sCD14[, ncol(sCD14)] 217 | 218 | # Replace zeros as before 219 | x <- x + 1 220 | 221 | # Split the data 222 | trainIndex <- sample(1:nrow(x), 0.8 * nrow(x)) 223 | xTrain <- x[trainIndex,] 224 | yTrain <- y[trainIndex] 225 | 226 | # Fit codacore and inspect results 227 | model <- codacore(xTrain, yTrain) 228 | print(model) 229 | plot(model) 230 | ``` 231 | 232 | # Multiclass classification 233 | 234 | Our implementation does not currently support multiclass targets; however, a multiclass classifier can be constructed from CoDaCoRe by using the One-vs-One or One-vs-Rest strategies. 235 | 236 | # Tuning the regularization parameter lambda 237 | 238 | The parameter `lambda` controls the regularization strength of CoDaCoRe. In particular, `lambda = 1` (the default value) corresponds to applying the 1-standard-error rule in the discretization step of the log-ratio (details in [Section 3.3](https://www.biorxiv.org/content/10.1101/2021.02.11.430695v2.full.pdf)). This is typically a good choice, leading to models that are both sparse and predictive. Sparser models can be achieved by higher values of `lambda`, for example, `lambda = 2` corresponds to applying a "2-standard-error" rule. On the other hand, smaller values of `lambda` result in less sparse, but typically more predictive, models. In particular, `lambda = 0` corresponds to a "0-standard-error" rule, in other words choosing the log-ratio that minimizes the cross-validation score. Such a choice can be good when we seek a maximally predictive model, but care less about sparsity. 239 | ```{r} 240 | model <- codacore(xTrain, yTrain, lambda = 0.0) 241 | print(model) 242 | ``` 243 | 244 | Notice the increased R-squared score relative to the previous model (at the expense of sparsity). 245 | 246 | ## When no predictive log-ratios are found 247 | 248 | On some datasets, CoDaCoRe may have trouble finding _any_ predictive log-ratios.
If none are found, this is typically a sign that the signal in the data is weak. In this case, the analyst may choose to reduce the value of `lambda` (for example, to `lambda = 0`), in order to allow our algorithm to search more aggressively for predictive log-ratios. Doing so will often allow the algorithm to identify at least one predictive log-ratio, at the risk of overfitting the training data. Additional care must be taken in validating such log-ratios on held-out data. 249 | 250 | # Covariate adjustment 251 | 252 | Many applications require accounting for potential confounder variables as well as our ratio-based biomarkers. As an example, we consider a second HIV dataset from [(Noguera-Julian et al. 2016)](http://dx.doi.org/10.1016/j.ebiom.2016.01.032). The goal is to find a microbial signature for HIV status, i.e., a log-ratio that can discriminate between HIV-positive and HIV-negative individuals. However, we have an additional confounder variable, MSM (Men who have Sex with Men). In the context of CoDaCoRe, there are multiple approaches that can be used to adjust for covariates. 253 | 254 | ## Incremental fit 255 | 256 | Given the _stagewise-additive_ (i.e., ensemble) nature of CoDaCoRe, whereby each successive log-ratio is fitted on the residual of the previous iteration, a very natural approach is to fit the covariates _a priori_ and then fit CoDaCoRe on the residual. In other words, we would start by regressing HIV status on MSM, "partialling out" this covariate, and then fit CoDaCoRe on the residual from this model. This can be implemented easily by means of the `offset` parameter. 
257 | ```{r} 258 | data("HIV") 259 | x <- HIV[, 1:(ncol(HIV) - 2)] 260 | z <- HIV[, 'MSM'] 261 | y <- HIV$HIV_Status 262 | 263 | # Replace zeros as before 264 | x <- x + 1 265 | 266 | # Split the data 267 | trainIndex <- sample(1:nrow(x), 0.8 * nrow(x)) 268 | dfTrain <- HIV[trainIndex,] 269 | xTrain <- x[trainIndex,] 270 | yTrain <- y[trainIndex] 271 | 272 | partial <- glm(HIV_Status ~ MSM, data=dfTrain, family='binomial') 273 | # Note the offset must be given in logit space 274 | model <- codacore(xTrain, yTrain, offset=predict(partial)) 275 | print(model) 276 | partialAUC <- pROC::auc(pROC::roc(yTrain, predict(partial), quiet=TRUE)) 277 | codacoreAUC <- model$ensemble[[1]]$AUC 278 | cat("AUC gain:", round(100 * (codacoreAUC - partialAUC)), "%") 279 | ``` 280 | 281 | Note that, when predicting on new data, the contributions of the covariates and the log-ratios should be added up in logit space. 282 | ```{r} 283 | dfTest <- HIV[-trainIndex,] 284 | xTest <- x[-trainIndex,] 285 | yTest <- y[-trainIndex] # held-out HIV status (the response), not the MSM covariate z 286 | yHatLogit <- predict(partial, newdata = dfTest) + predict(model, xTest, asLogits=TRUE) 287 | yHat <- yHatLogit > 0 # in case we need binary predictions e.g.
to compute accuracy 288 | testAUC <- pROC::auc(pROC::roc(yTest, yHatLogit, quiet=TRUE)) 289 | cat("Test AUC:", round(100 * testAUC), "%") 290 | ``` 291 | 292 | When the outcome variable is continuous, this is simpler as there is no logit transformation and the contributions of the partial model can be added directly, e.g., 293 | ```r 294 | # Suppose that, instead of predicting HIV status (a binary target), 295 | # we now have some continuous target, 'yCts' 296 | partial2 <- lm(yCts ~ MSM, data=dfTrain) 297 | model2 <- codacore(xTrain, yCtsTrain, offset=predict(partial2)) # offset comes from the continuous-target partial model 298 | print(model2) 299 | yCtsHat <- predict(partial2, newdata = dfTest) + predict(model2, xTest) 300 | MSE <- mean((yCtsTest - yCtsHat)^2) 301 | ``` 302 | 303 | ## Joint fit 304 | 305 | Depending on the application and the goals of the analyst, it may be of interest to understand the _joint_ effect of the covariates and log-ratios on the response. To do so, one option is to simply regress the outcome jointly against the covariates and the learned log-ratios from the previous step. This can be implemented by running, in addition to the above, an additional `glm` fit. 306 | ```{r} 307 | # Create a new design matrix with response & covariates, as well as log-ratios obtained from codacore 308 | dfJoint = cbind(dfTrain[, c('MSM', 'HIV_Status')], getLogRatios(model)) 309 | 310 | # And fit everything jointly 311 | modelJoint <- glm(HIV_Status ~ ., data=dfJoint, family='binomial') 312 | # Can again use this model to make predictions or to interpret regression coefficients 313 | yHat <- predict(modelJoint, newdata=dfJoint) 314 | summary(modelJoint) 315 | ``` 316 | 317 | Note that, in any case, the CoDaCoRe algorithm itself only optimizes over one log-ratio at a time (in its current implementation). In some applications, it may in fact be beneficial to optimize over the set of log-ratios jointly with the regression coefficients of the covariates. However, this is not yet implemented.
318 | 319 | # Unsupervised learning 320 | 321 | CoDaCoRe can be used as follows to obtain a fast, scalable, interpretable and sparse log-ratio based unsupervised learning algorithm. The idea is to first compute a dense representation of the data using traditional methods, and then regress the data against this representation using CoDaCoRe to obtain a sparse log-ratio representation in its stead. For example, one could take the first principal component of the CLR-transformed data, and use CoDaCoRe to approximate this real-valued representation with a single sparse log-ratio score [(Quinn et al., 2021)](https://arxiv.org/abs/2104.07266). In the present HIV dataset, we find that the learned log-ratio biomarker provides a useful representation of the data, markedly separating the MSM from the non-MSM individuals. 322 | ```{r} 323 | clr <- t(apply(x, 1, function(x) log(x) - mean(log(x)))) 324 | pca <- prcomp(clr, scale=T) 325 | pc1 = clr %*% pca$rotation[, 1] 326 | 327 | model <- codacore(x, as.numeric(pc1)) 328 | logRatio1 <- getLogRatios(model, x)[, 1] 329 | boxplot(logRatio1 ~ z) 330 | ``` 331 | 332 | We can take things one step further and derive a second unsupervised log-ratio biomarker, by simply fitting CoDaCoRe on the second principal component. Taken together, our two log-ratio biomarkers capture important information in the data: 333 | ```{r} 334 | pc2 = clr %*% pca$rotation[, 2] 335 | model <- codacore(x, as.numeric(pc2)) 336 | logRatio2 <- getLogRatios(model, x)[, 1] 337 | plot(logRatio1, logRatio2, col=z) 338 | legend('bottomleft', legend=levels(z), pch=1, col=1:2) 339 | ``` 340 | 341 | 342 | Note also that the CoDaCoRe framework can be applied to the unsupervised learning problem in several other ways, some of which are under active development. 343 | 344 | 345 | # Multi-omics integration 346 | 347 | With a similar approach, CoDaCoRe can be used for scalable, sparse, and interpretable multi-omics data integration. 
We briefly highlight an example multi-omics analysis of paired gut microbiome and metabolomics data, taken from 220 clinical samples of which 88 have Crohn's disease and 76 have ulcerative colitis [(Franzosa et al., 2019)](https://www.nature.com/articles/s41564-018-0306-4). For a full analysis, see Section 5 and the appendix in [Quinn et al., 2021](https://arxiv.org/abs/2104.07266). Again, we will use standard techniques to compute a (dense) latent representation of the data, which we will then approximate using sparse log-ratio biomarkers. Letting $\mathbf T$ denote the microbe abundances and $\mathbf U$ the metabolite abundances, we will use partial least squares (PLS) regression to model the association between $\mathbf T$ and $\mathbf U$. This will result in two latent factors, one for $\mathbf T$ and one for $\mathbf U$, that capture the _joint_ information in the data. These latent factors will constitute the regression targets for CoDaCoRe. 348 | ```{r} 349 | # Load data 350 | download.file("https://github.com/egr95/FranzosaData/blob/main/FranzosaMicrobiome.rda?raw=true", "FranzosaMicrobiome") 351 | download.file("https://github.com/egr95/FranzosaData/blob/main/FranzosaMetabolite.rda?raw=true", "FranzosaMetabolite") 352 | load("FranzosaMicrobiome") 353 | load("FranzosaMetabolite") 354 | 355 | # Note data have already been pre-processed as per (Quinn et al., 2021), 356 | # including zero-replacement and normalization to a unit total.
357 | T <- FranzosaMicrobiome[, -ncol(FranzosaMicrobiome)] # We remove the last column (response variable) 358 | U <- FranzosaMetabolite[, -ncol(FranzosaMetabolite)] 359 | 360 | # Apply clr transform prior to PLS 361 | clrT <- t(apply(T, 1, function(x) log(x) - mean(log(x)))) 362 | clrU <- t(apply(U, 1, function(x) log(x) - mean(log(x)))) 363 | 364 | # Call mixOmics package and plot first PLS components 365 | suppressMessages(library('mixOmics')) 366 | pls <- mixOmics::pls(X = clrT, Y = clrU, ncomp = 1) 367 | plot(pls$variates$X[,1], pls$variates$Y[,1], main = 'PLS multi-omics (dense)') 368 | 369 | # Approximate the dense PLS representations with sparse log-ratio biomarkers 370 | plsX <- pls$variates$X[,1] 371 | modelX <- codacore(T, plsX) 372 | logRatioX <- getLogRatios(modelX)[,1] 373 | 374 | plsY <- pls$variates$Y[,1] 375 | modelY <- codacore(U, plsY, logRatioType = "B") 376 | logRatioY <- getLogRatios(modelY)[,1] 377 | 378 | plot(logRatioX, logRatioY, main = 'CoDaCoRe multi-omics (sparse)') 379 | ``` 380 | 381 | ```{r, include=FALSE} 382 | file.remove("FranzosaMicrobiome") 383 | file.remove("FranzosaMetabolite") 384 | ``` 385 | 386 | Note that CoDaCoRe obtains a sparse representation that also has better statistical properties than the original (dense) PLS components, markedly de-skewing the data. 
387 | 388 | -------------------------------------------------------------------------------- /man/Crohn.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{Crohn} 5 | \alias{Crohn} 6 | \title{Microbiome composition related to Crohn`s disease study} 7 | \format{ 8 | The \code{data.frame} is composed by 48 genera and a variable 9 | \describe{ 10 | \item{genera}{The first 48 columns, from \emph{g_Turicibacter} until 11 | \emph{g_Bilophila} referred to different genera.} 12 | \item{y}{a \code{factor} indicating if the sample corresponds to a case ( 13 | \emph{CD}) or a control (\emph{no}).} 14 | } 15 | } 16 | \description{ 17 | A dataset containing the number of counts of 48 different genera in a group 18 | of 975 samples (including 662 samples of patients with Crohn`s disease and 19 | 313 controls). 20 | The \code{data.frame} is composed by 48 genera and a factor variable 21 | } 22 | \references{ 23 | \url{https://qiita.ucsd.edu/} 24 | } 25 | \keyword{data} 26 | -------------------------------------------------------------------------------- /man/HIV.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{HIV} 5 | \alias{HIV} 6 | \title{Microbiome, HIV infection and MSM factor} 7 | \format{ 8 | The \code{data.frame} is composed by 60 genera and 2 variables 9 | \describe{ 10 | \item{genera}{The first 60 columns, from \emph{g_Prevotella} until 11 | \emph{o_NB1-n_g_unclassified} referred to different genera.} 12 | \item{MSM}{a factor determining if the individual is \code{MSM} (\emph{Men Sex with 13 | Men}) or not (\code{nonMSM}).} 14 | \item{HIV_Status}{a factor specifying if the individual is infected 15 | (\code{Pos}) or not (\code{Neg}).} 16 | 17 | } 18 | } 19 | 
\description{ 20 | A dataset containing the number of counts of 60 different genera in a group 21 | of 155 samples (including HIV - infected and non - infected patients). 22 | The \code{data.frame} is composed by 60 genera and two variables. 23 | } 24 | \references{ 25 | \url{https://pubmed.ncbi.nlm.nih.gov/27077120/} 26 | } 27 | \keyword{data} 28 | -------------------------------------------------------------------------------- /man/activeInputs.codacore.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/codacore.R 3 | \name{activeInputs.codacore} 4 | \alias{activeInputs.codacore} 5 | \title{activeInputs} 6 | \usage{ 7 | activeInputs.codacore(cdcr) 8 | } 9 | \arguments{ 10 | \item{cdcr}{A codacore object.} 11 | } 12 | \value{ 13 | The covariates included in the log-ratios 14 | } 15 | \description{ 16 | activeInputs 17 | } 18 | -------------------------------------------------------------------------------- /man/codacore.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/codacore.R 3 | \name{codacore} 4 | \alias{codacore} 5 | \title{codacore} 6 | \usage{ 7 | codacore( 8 | x, 9 | y, 10 | logRatioType = "balances", 11 | objective = NULL, 12 | lambda = 1, 13 | offset = NULL, 14 | shrinkage = 1, 15 | maxBaseLearners = 5, 16 | optParams = list(), 17 | cvParams = list(), 18 | verbose = FALSE, 19 | overlap = TRUE, 20 | fast = TRUE 21 | ) 22 | } 23 | \arguments{ 24 | \item{x}{A data.frame or matrix of the compositional predictor variables.} 25 | 26 | \item{y}{A data.frame, matrix or vector of the response.} 27 | 28 | \item{logRatioType}{A string indicating whether to use "balances" or "amalgamations". 29 | Also accepts "balance", "B", "ILR", or "amalgam", "A", "SLR". 
30 | Note that the current implementation for balances is not strictly an ILR, 31 | but rather just a collection of balances (which are possibly non-orthogonal 32 | in the Aitchison sense).} 33 | 34 | \item{objective}{A string indicating "binary classification" or "regression". By default, 35 | it is NULL and gets inferred from the values in y.} 36 | 37 | \item{lambda}{A numeric. Corresponds to the "lambda-SE" rule. Sets the "regularization strength" 38 | used by the algorithm to decide how to harden the ratio. 39 | Larger numbers tend to yield fewer, more sparse ratios.} 40 | 41 | \item{offset}{A numeric vector of the same length as y. Works similarly to the offset in a glm.} 42 | 43 | \item{shrinkage}{A numeric. Shrinkage factor applied to each base learner. 44 | Defaults to 1.0, i.e., no shrinkage applied.} 45 | 46 | \item{maxBaseLearners}{An integer. The maximum number of log-ratios that the model will 47 | learn before stopping. Automatic stopping based on \code{seRule} may occur sooner.} 48 | 49 | \item{optParams}{A list of named parameters for the optimization of the 50 | continuous relaxation. Empty by default. User can override as few or as 51 | many of our defaults as desired. Includes adaptiveLR (learning rate under 52 | adaptive training scheme), momentum (in the gradient-descent sense), 53 | epochs (number of gradient-descent epochs), batchSize (number of 54 | observations per minibatch, by default the entire dataset), 55 | and vanillaLR (the learning rate to be used if the user does *not* want 56 | to use the 'adaptiveLR', to be used at the risk of optimization issues).} 57 | 58 | \item{cvParams}{A list of named parameters for the "hardening" procedure 59 | using cross-validation. Includes numFolds (number of folds, default=5) and 60 | maxCutoffs (number of candidate cutoff values of 'c' to be tested out 61 | during CV process, default=20 meaning log-ratios with up to 21 components 62 | can be found by codacore).} 63 | 64 | \item{verbose}{A boolean. 
Toggles whether to display intermediate steps.} 65 | 66 | \item{overlap}{A boolean. Toggles whether successive log-ratios found by 67 | CoDaCoRe may contain repeated input variables. TRUE by default. 68 | Changing to FALSE implies that the log-ratios obtained by CoDaCoRe 69 | will become orthogonal in the Aitchison sense, analogously to the 70 | isometric-log-ratio transformation, while losing a small amount of 71 | model flexibility.} 72 | 73 | \item{fast}{A boolean. Whether to run in fast or slow mode. TRUE by 74 | default. Running in slow mode will take ~x5 the computation time, 75 | but may help identify slightly more accurate log-ratios.} 76 | } 77 | \value{ 78 | A \code{codacore} object. 79 | } 80 | \description{ 81 | This function implements the codacore algorithm described by Gordon-Rodriguez et al. 2021 82 | (https://doi.org/10.1101/2021.02.11.430695). 83 | } 84 | \examples{ 85 | \dontrun{ 86 | data("Crohn") 87 | x <- Crohn[, -ncol(Crohn)] 88 | y <- Crohn[, ncol(Crohn)] 89 | x <- x + 1 90 | model = codacore(x, y) 91 | print(model) 92 | plot(model) 93 | } 94 | 95 | } 96 | -------------------------------------------------------------------------------- /man/getBinaryPartitions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/codacore.R 3 | \name{getBinaryPartitions} 4 | \alias{getBinaryPartitions} 5 | \title{getBinaryPartitions} 6 | \usage{ 7 | getBinaryPartitions(cdcr) 8 | } 9 | \arguments{ 10 | \item{cdcr}{A codacore object} 11 | } 12 | \value{ 13 | A matrix describing whether each component (as rows) is found in the 14 | numerator (1) or denominator (-1) of each learned log-ratio (as columns). 15 | This format resembles a serial binary partition matrix frequently used 16 | in balance analysis. 
17 | } 18 | \description{ 19 | getBinaryPartitions 20 | } 21 | -------------------------------------------------------------------------------- /man/getDenominatorParts.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/codacore.R 3 | \name{getDenominatorParts} 4 | \alias{getDenominatorParts} 5 | \title{getDenominatorParts} 6 | \usage{ 7 | getDenominatorParts(cdcr, baseLearnerIndex = 1, boolean = TRUE) 8 | } 9 | \arguments{ 10 | \item{cdcr}{A codacore object.} 11 | 12 | \item{baseLearnerIndex}{An integer indicating which of the 13 | (possibly multiple) log-ratios learned by codacore to be used.} 14 | 15 | \item{boolean}{Whether to return the parts in boolean form 16 | (a vector of TRUE/FALSE) or to return the column names of 17 | those parts directly.} 18 | } 19 | \value{ 20 | The covariates in the denominator of the selected log-ratio. 21 | } 22 | \description{ 23 | getDenominatorParts 24 | } 25 | -------------------------------------------------------------------------------- /man/getLogRatios.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/codacore.R 3 | \name{getLogRatios} 4 | \alias{getLogRatios} 5 | \title{getLogRatios} 6 | \usage{ 7 | getLogRatios(cdcr, x = NULL) 8 | } 9 | \arguments{ 10 | \item{cdcr}{A codacore object} 11 | 12 | \item{x}{A set of (possibly unseen) compositional data. 13 | The covariates must be passed in the same order as 14 | for the original codacore() call.} 15 | } 16 | \value{ 17 | The learned log-ratio features, computed on input x. 
18 | } 19 | \description{ 20 | getLogRatios 21 | } 22 | -------------------------------------------------------------------------------- /man/getNumLogRatios.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/codacore.R 3 | \name{getNumLogRatios} 4 | \alias{getNumLogRatios} 5 | \title{getNumLogRatios} 6 | \usage{ 7 | getNumLogRatios(cdcr) 8 | } 9 | \arguments{ 10 | \item{cdcr}{A codacore object} 11 | } 12 | \value{ 13 | The number of log-ratios that codacore found. 14 | Typically a small integer. Can be zero if codacore 15 | found no predictive log-ratios in the data. 16 | } 17 | \description{ 18 | getNumLogRatios 19 | } 20 | -------------------------------------------------------------------------------- /man/getNumeratorParts.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/codacore.R 3 | \name{getNumeratorParts} 4 | \alias{getNumeratorParts} 5 | \title{getNumeratorParts} 6 | \usage{ 7 | getNumeratorParts(cdcr, baseLearnerIndex = 1, boolean = TRUE) 8 | } 9 | \arguments{ 10 | \item{cdcr}{A codacore object.} 11 | 12 | \item{baseLearnerIndex}{An integer indicating which of the 13 | (possibly multiple) log-ratios learned by codacore to be used.} 14 | 15 | \item{boolean}{Whether to return the parts in boolean form 16 | (a vector of TRUE/FALSE) or to return the column names of 17 | those parts directly.} 18 | } 19 | \value{ 20 | The covariates in the numerator of the selected log-ratio. 
21 | } 22 | \description{ 23 | getNumeratorParts 24 | } 25 | -------------------------------------------------------------------------------- /man/getSlopes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/codacore.R 3 | \name{getSlopes} 4 | \alias{getSlopes} 5 | \title{getSlopes} 6 | \usage{ 7 | getSlopes(cdcr) 8 | } 9 | \arguments{ 10 | \item{cdcr}{A codacore object} 11 | } 12 | \value{ 13 | The slopes (i.e., regression coefficients) for each log-ratio. 14 | } 15 | \description{ 16 | getSlopes 17 | } 18 | -------------------------------------------------------------------------------- /man/getTidyTable.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/codacore.R 3 | \name{getTidyTable} 4 | \alias{getTidyTable} 5 | \title{getTidyTable} 6 | \usage{ 7 | getTidyTable(cdcr) 8 | } 9 | \arguments{ 10 | \item{cdcr}{A codacore object} 11 | } 12 | \value{ 13 | A table displaying the log-ratios found. 14 | } 15 | \description{ 16 | getTidyTable 17 | } 18 | -------------------------------------------------------------------------------- /man/plot.codacore.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/codacore.R 3 | \name{plot.codacore} 4 | \alias{plot.codacore} 5 | \title{plot} 6 | \usage{ 7 | \method{plot}{codacore}(x, index = 1, ...) 8 | } 9 | \arguments{ 10 | \item{x}{A codacore object.} 11 | 12 | \item{index}{The index of the log-ratio to plot.} 13 | 14 | \item{...}{Not used.} 15 | } 16 | \description{ 17 | Plots a summary of a fitted codacore model. 18 | Credit to the authors of the selbal package (Rivera-Pinto et al., 2018), 19 | from whose package these plots were inspired. 
20 | } 21 | -------------------------------------------------------------------------------- /man/plotROC.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/codacore.R 3 | \name{plotROC} 4 | \alias{plotROC} 5 | \title{plotROC} 6 | \usage{ 7 | plotROC(cdcr) 8 | } 9 | \arguments{ 10 | \item{cdcr}{A codacore object.} 11 | } 12 | \description{ 13 | plotROC 14 | } 15 | -------------------------------------------------------------------------------- /man/predict.codacore.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/codacore.R 3 | \name{predict.codacore} 4 | \alias{predict.codacore} 5 | \title{predict} 6 | \usage{ 7 | \method{predict}{codacore}(object, newx, asLogits = TRUE, numLogRatios = NA, ...) 8 | } 9 | \arguments{ 10 | \item{object}{A codacore object.} 11 | 12 | \item{newx}{A set of inputs to our model.} 13 | 14 | \item{asLogits}{Whether to return outputs in logit space 15 | (as opposed to probability space). Should always be set 16 | to TRUE for regression with continuous outputs, but can 17 | be toggled for classification problems.} 18 | 19 | \item{numLogRatios}{How many predictive log-ratios to 20 | include in the prediction. By default, includes the 21 | effects of all log-ratios that were obtained during 22 | training. 
Setting this parameter to an integer k will 23 | restrict to using only the top k log-ratios in the model.} 24 | 25 | \item{...}{Not used.} 26 | } 27 | \description{ 28 | predict 29 | } 30 | -------------------------------------------------------------------------------- /man/print.codacore.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/codacore.R 3 | \name{print.codacore} 4 | \alias{print.codacore} 5 | \title{print} 6 | \usage{ 7 | \method{print}{codacore}(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{A codacore object.} 11 | 12 | \item{...}{Not used.} 13 | } 14 | \description{ 15 | print 16 | } 17 | -------------------------------------------------------------------------------- /man/sCD14.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{sCD14} 5 | \alias{sCD14} 6 | \title{Microbiome and sCD14 inflammation parameter} 7 | \format{ 8 | The \code{data.frame} is composed by 60 genera and a variable 9 | \describe{ 10 | \item{genera}{The first 60 columns, from \emph{g_Prevotella} until 11 | \emph{o_NB1-n_g_unclassified} referred to different genera.} 12 | \item{sCD14}{a \code{numeric} variable with the value of the inflammation 13 | parameter sCD14 for each sample.} 14 | } 15 | } 16 | \description{ 17 | A dataset containing the number of counts of 60 different genera in a group 18 | of 151 samples (including HIV - infected and non - infected patients). 
19 | The \code{data.frame} is composed by 60 genera and a numeric variable 20 | } 21 | \references{ 22 | \doi{10.1016/j.ebiom.2016.01.032} 23 | } 24 | \keyword{data} 25 | -------------------------------------------------------------------------------- /man/simulateHTS.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/simulations.R 3 | \name{simulateHTS} 4 | \alias{simulateHTS} 5 | \title{simulateHTS} 6 | \usage{ 7 | simulateHTS(n, p, outputType = "binary", logratio = "simple") 8 | } 9 | \arguments{ 10 | \item{n}{Number of observations.} 11 | 12 | \item{p}{Number of covariates.} 13 | 14 | \item{outputType}{A string indicating 'binary' or 'continuous'.} 15 | 16 | \item{logratio}{A string indicating 'simple', 'balance', or 17 | 'amalgamation'.} 18 | } 19 | \value{ 20 | A list containing a matrix of inputs and a vector of outputs 21 | } 22 | \description{ 23 | This function simulates a set of (x, y) pairs. 24 | The covariates x are compositional, meaning they only 25 | carry relative information. 26 | The response y is a binary indicator. 27 | The rule linking x and y can be a balance or an amalgamation. 
28 | } 29 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(codacore) 3 | 4 | test_check("codacore") 5 | -------------------------------------------------------------------------------- /tests/testthat/test-codacore.R: -------------------------------------------------------------------------------- 1 | tensorflow_is_installed <- function(){ 2 | check <- tryCatch({ 3 | # dummy tensorflow code 4 | tensorflow::set_random_seed(0) 5 | TRUE 6 | }, error = function(e){ 7 | FALSE 8 | }) 9 | return(check) 10 | } 11 | 12 | testthat::test_that("simple logratios", { 13 | if (tensorflow_is_installed()){ 14 | set.seed(0) 15 | tensorflow::set_random_seed(0) 16 | n = 1000 17 | p = 100 18 | HTS = simulateHTS(n, p) 19 | x = HTS$x + 1 20 | y = HTS$y 21 | model = codacore(x, y, logRatioType='B') 22 | testthat::expect_true(getNumeratorParts(model, 1)[1]) 23 | testthat::expect_true(getDenominatorParts(model, 1)[2]) 24 | testthat::expect_equal(model$ensemble[[1]]$accuracy, 0.851) 25 | 26 | model = codacore(x, y, logRatioType='A') 27 | testthat::expect_true(getNumeratorParts(model, 1)[1]) 28 | testthat::expect_true(getDenominatorParts(model, 1)[2]) 29 | testthat::expect_equal(model$ensemble[[1]]$accuracy, 0.846) 30 | 31 | # test getBinaryPartitions() function 32 | testthat::expect_true(getBinaryPartitions(model)[1,1] == 1) 33 | testthat::expect_true(getBinaryPartitions(model)[2,1] == -1) 34 | testthat::expect_true(getBinaryPartitions(model)[3,1] == 0) 35 | 36 | # Now test in regression mode 37 | HTS = simulateHTS(n, p, outputType = 'continuous') 38 | x = HTS$x + 1 39 | y = HTS$y 40 | model = codacore(x, y, logRatioType='B', objective='regression') 41 | testthat::expect_true(getNumeratorParts(model, 1)[1]) 42 | testthat::expect_true(getDenominatorParts(model, 1)[2]) 43 | testthat::expect_equal(model$ensemble[[1]]$Rsquared, 
0.349, tolerance=0.001) 44 | 45 | model = codacore(x, y, logRatioType='A', objective='regression') 46 | testthat::expect_true(getNumeratorParts(model, 1)[1]) 47 | testthat::expect_true(getDenominatorParts(model, 1)[2]) 48 | testthat::expect_equal(model$ensemble[[1]]$Rsquared, 0.349, tolerance=0.001) 49 | } 50 | }) 51 | 52 | testthat::test_that("balances", { 53 | if (tensorflow_is_installed()){ 54 | set.seed(0) 55 | tensorflow::set_random_seed(0) 56 | n = 1000 57 | p = 100 58 | HTS = simulateHTS(n, p, logratio='balance') 59 | x = HTS$x + 1 60 | y = HTS$y 61 | model = codacore(x, y, logRatioType='B') 62 | 63 | testthat::expect_true(getNumeratorParts(model, 1)[4]) 64 | testthat::expect_true(getNumeratorParts(model, 1)[6]) 65 | testthat::expect_true(getDenominatorParts(model, 1)[5]) 66 | testthat::expect_equal(model$ensemble[[1]]$accuracy, 0.733) 67 | 68 | # Now test in regression mode 69 | HTS = simulateHTS(n, p, logratio='balance', outputType = 'continuous') 70 | x = HTS$x + 1 71 | y = HTS$y 72 | model = codacore(x, y, logRatioType='B', objective='regression') 73 | testthat::expect_equal(model$ensemble[[1]]$Rsquared, 0.257, tolerance=0.001) 74 | } 75 | }) 76 | 77 | testthat::test_that("amalgamations", { 78 | if (tensorflow_is_installed()){ 79 | set.seed(0) 80 | tensorflow::set_random_seed(0) 81 | n = 1000 82 | p = 100 83 | HTS = simulateHTS(n, p, logratio='amalgamation') 84 | x = HTS$x + 1 85 | y = HTS$y 86 | model = codacore(x, y, logRatioType='A') 87 | 88 | testthat::expect_true(getNumeratorParts(model, 1)[1]) 89 | testthat::expect_true(getNumeratorParts(model, 1)[2]) 90 | testthat::expect_true(getDenominatorParts(model, 1)[3]) 91 | testthat::expect_equal(model$ensemble[[1]]$AUC[1], 0.925, tolerance=0.001) 92 | 93 | 94 | # Now test in regression mode 95 | HTS = simulateHTS(n, p, logratio='amalgamation', outputType = 'continuous') 96 | x = HTS$x + 1 97 | y = HTS$y 98 | model = codacore(x, y, logRatioType='A', objective='regression') 99 | 
testthat::expect_true(getNumeratorParts(model, 1)[1]) 100 | testthat::expect_true(getNumeratorParts(model, 1)[2]) 101 | testthat::expect_true(getDenominatorParts(model, 1)[3]) 102 | testthat::expect_equal(model$ensemble[[1]]$Rsquared, 0.540, tolerance=0.001) 103 | } 104 | }) 105 | 106 | -------------------------------------------------------------------------------- /vignettes/guide.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "CoDaCoRe guide" 3 | output: rmarkdown::html_vignette 4 | vignette: > 5 | %\VignetteIndexEntry{my-vignette} 6 | %\VignetteEngine{knitr::rmarkdown} 7 | %\VignetteEncoding{UTF-8} 8 | --- 9 | 10 | ```{r setup, include=FALSE} 11 | knitr::opts_chunk$set(echo = TRUE) 12 | ``` 13 | 14 | ```{r, echo = FALSE, results = "asis"} 15 | vignette_dir <- "../inst/misc/" 16 | vignette_file <- "guide.md" 17 | lines <- readLines(paste0(vignette_dir, vignette_file)) 18 | lines <- gsub("![](", replacement = paste0("![](", vignette_dir), x = lines, fixed = TRUE) 19 | cat(lines, sep = "\n") 20 | ``` 21 | --------------------------------------------------------------------------------