├── .Rbuildignore
├── .gitignore
├── DESCRIPTION
├── LICENSE
├── LICENSE.md
├── NAMESPACE
├── NEWS.md
├── R
    ├── codacore.R
    ├── data.R
    └── simulations.R
├── README.md
├── cran-comments.md
├── data
    ├── Crohn.rda
    ├── HIV.rda
    └── sCD14.rda
├── inst
    ├── CITATION
    └── misc
    │   ├── guide.Rmd
    │   └── guide.html
├── man
    ├── Crohn.Rd
    ├── HIV.Rd
    ├── activeInputs.codacore.Rd
    ├── codacore.Rd
    ├── getBinaryPartitions.Rd
    ├── getDenominatorParts.Rd
    ├── getLogRatios.Rd
    ├── getNumLogRatios.Rd
    ├── getNumeratorParts.Rd
    ├── getSlopes.Rd
    ├── getTidyTable.Rd
    ├── plot.codacore.Rd
    ├── plotROC.Rd
    ├── predict.codacore.Rd
    ├── print.codacore.Rd
    ├── sCD14.Rd
    └── simulateHTS.Rd
├── tests
    ├── testthat.R
    └── testthat
    │   └── test-codacore.R
└── vignettes
    └── guide.Rmd


/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | ^LICENSE\.md$
4 | ^cran-comments\.md$
5 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | R-codacore.Rproj
2 | .Rproj.user
3 | .Rhistory
4 | .RData
5 | .Ruserdata
6 | .DS_Store
7 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: codacore
 2 | Title: Learning Sparse Log-Ratios for Compositional Data
 3 | Version: 0.0.4
 4 | Authors@R: c(
 5 |     person("Elliott", "Gordon-Rodriguez", email = "eg2912@columbia.edu", role = c("aut", "cre")),
 6 |     person("Thomas", "Quinn", email = "contacttomquinn@gmail.com", role = c("aut"))
 7 |     )
 8 | Description: In the context of high-throughput genetic data,
 9 |     CoDaCoRe identifies a set of sparse biomarkers that are
10 |     predictive of a response variable of interest (Gordon-Rodriguez 
11 |     et al., 2021) <doi:10.1093/bioinformatics/btab645>. More 
12 |     generally, CoDaCoRe can be applied to any regression problem 
13 |     where the independent variable is Compositional (CoDa), to 
14 |     derive a set of scale-invariant log-ratios (ILR or SLR) that 
15 |     are maximally associated to a dependent variable.
16 | License: MIT + file LICENSE
17 | Encoding: UTF-8
18 | LazyData: true
19 | RoxygenNote: 7.1.1
20 | Depends:
21 |     R (>= 3.6.0)
22 | Imports:
23 |     tensorflow (>= 2.1),
24 |     keras (>= 2.3),
25 |     pROC (>= 1.17),
26 |     R6 (>= 2.5),
 27 |     gtools (>= 3.8)
28 | SystemRequirements: TensorFlow (https://www.tensorflow.org/)
29 | Suggests:
30 |     zCompositions,
31 |     testthat (>= 2.1.0),
32 |     knitr,
33 |     rmarkdown
34 | VignetteBuilder: knitr
35 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2021
2 | COPYRIGHT HOLDER: Elliott Gordon-Rodriguez
3 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | Copyright (c) 2021 Elliott Gordon-Rodriguez
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | S3method(plot,codacore)
 4 | S3method(predict,codacore)
 5 | S3method(print,codacore)
 6 | export(activeInputs.codacore)
 7 | export(codacore)
 8 | export(getBinaryPartitions)
 9 | export(getDenominatorParts)
10 | export(getLogRatios)
11 | export(getNumLogRatios)
12 | export(getNumeratorParts)
13 | export(getSlopes)
14 | export(getTidyTable)
15 | export(plotROC)
16 | export(simulateHTS)
17 | import(keras)
18 | importFrom(stats,predict)
19 | 


--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
 1 | ## codacore 0.0.4
 2 | ---------------------
 3 | * Update vignette.
 4 |     * Fix title.
 5 |     * Add incremental fit for cts. target.
 6 |     * Minor clarifications.
 7 | * Updated README to reflect CRAN latest.
 8 | * Add cvParams default values to func documentation.
 9 | * Add helper funcs `getNumLogRatios()` and `getTidyTable()`
10 | * Add `getBinaryPartitions` function to retrieve SBP-like representation of learned balances.
11 | * Allow tibble inputs.
12 | 
13 | ## codacore 0.0.3
14 | ---------------------
15 | * Live on CRAN.
16 | * Updated readme and vignettes to reflect this.
17 | 
18 | ## codacore 0.0.2
19 | ---------------------
20 | * Updated tests.
21 | * Updated guide.
22 |     * Covariate adjustment
23 |     * Unsupervised learning
24 |     * Multi-omics
25 | * Minor bugfix with glm numerics.
26 | * Added numLogRatios param to predict().
27 | 
28 | ## codacore 0.0.1
29 | ---------------------
30 | * Fix a bug in lambda-standard-error rule.
31 |     * Estimation of cross-validation prediction error was missing a scaling factor to account for the number of folds.
32 |     * As a result, models were over-regularized.
33 | * Update guide.


--------------------------------------------------------------------------------
/R/codacore.R:
--------------------------------------------------------------------------------
   1 | 
   2 | # Here we implement the codacore model
   3 | 
   4 | library(keras)
   5 | utils::globalVariables(c("self"))
   6 | 
   7 | # """Fits a single base learner"""
   8 | # Private class not to be called by user
   9 | .CoDaBaseLearner <- function(
  10 |   x,
  11 |   y,
  12 |   boostingOffset,
  13 |   logRatioType,
  14 |   objective,
  15 |   lambda,
  16 |   cvParams,
  17 |   optParams,
  18 |   verbose
  19 | ){
  20 |   
  21 |   cdbl = list(
  22 |     intercept=NULL,
  23 |     slope=NULL,
  24 |     weights=NULL,
  25 |     softAssignment=NULL,
  26 |     hard=NULL,
  27 |     x=x,
  28 |     y=y,
  29 |     boostingOffset=boostingOffset,
  30 |     logRatioType=logRatioType,
  31 |     objective=objective,
  32 |     lambda=lambda,
  33 |     cvParams=cvParams,
  34 |     optParams=optParams,
  35 |     verbose=verbose
  36 |   )
  37 |   class(cdbl) = "CoDaBaseLearner"
  38 |   
  39 |   # Train the relaxation model
  40 |   cdbl = trainRelaxation.CoDaBaseLearner(cdbl)
  41 |   
  42 |   # Find optimal cutoff by CV
  43 |   cutoff = findBestCutoff.CoDaBaseLearner(cdbl)
  44 |   
  45 |   # Use cutoff to "harden" the log-ratio
  46 |   cdbl = harden.CoDaBaseLearner(cdbl, cutoff)
  47 |   
  48 |   # And recompute the linear coefficients
  49 |   cdbl = setInterceptAndSlope.CoDaBaseLearner(cdbl, cdbl$x, cdbl$y, cdbl$boostingOffset)
  50 |   
  51 |   # Add some metrics
  52 |   yHat = predict(cdbl, x) + boostingOffset
  53 |   if (cdbl$objective == 'binary classification') {
  54 |     cdbl$ROC = pROC::roc(y, yHat, quiet=TRUE)
  55 |     cdbl$AUC = pROC::auc(cdbl$ROC)
  56 |     cdbl$accuracy = mean(y == (yHat > 0))
  57 |   } else {
  58 |     cdbl$RMSE = sqrt(mean((y - yHat)^2))
  59 |     cdbl$Rsquared = 1 - cdbl$RMSE^2 / stats::var(y)
  60 |   }
  61 |   
  62 |   return(cdbl)
  63 | }
  64 | 
  65 | 
  66 | #' @import keras
  67 | trainRelaxation.CoDaBaseLearner = function(cdbl) {
  68 |   startTime = Sys.time()
  69 |   
  70 |   # Set up traininable variables
  71 |   inputDim = ncol(cdbl$x)
  72 |   numObs = nrow(cdbl$x)
  73 |   
  74 |   # Initializaing the intercept at the average of the data
  75 |   # this helps optimization greatly
  76 |   # TODO: should experiment with slopeInit parameter for potential gains
  77 |   if (cdbl$objective == "binary classification") {
  78 |     loss_func = 'binary_crossentropy'
  79 |     if (abs(mean(1 / (1 + exp(-cdbl$boostingOffset))) - mean(cdbl$y)) < 0.001) {
  80 |       # Protect against numerical errors in glm() call
  81 |       interceptInit = 0.0
  82 |     } else {
  83 |       tempGLM = stats::glm(cdbl$y ~ 1, offset=cdbl$boostingOffset, family='binomial')
  84 |       interceptInit = tempGLM$coef[[1]]
  85 |     }
  86 |     slopeInit = 0.1
  87 |     metrics = c('accuracy')
  88 |   } else if (cdbl$objective == "regression") {
  89 |     loss_func = 'mean_squared_error'
  90 |     interceptInit = mean(cdbl$y - cdbl$boostingOffset)
  91 |     slopeInit = 0.1 # * stats::sd(cdbl$y - cdbl$boostingOffset)
  92 |     metrics = c('mean_squared_error')
  93 |   }
  94 |   
  95 |   # Define the forward pass for our relaxation,
  96 |   # which differs for balances and amalgamations
  97 |   if (cdbl$logRatioType == 'A') {
  98 |     epsilon = cdbl$optParams$epsilonA
  99 |     forwardPass = function(x, mask = NULL) {
 100 |       softAssignment = 2 * keras::k_sigmoid(self$weights) - 1
 101 |       # Add the small value to ensure gradient flows at exact zeros (initial values)
 102 |       pvePart = keras::k_dot(x, keras::k_relu(softAssignment + 1e-20))
 103 |       nvePart = keras::k_dot(x, keras::k_relu(-softAssignment))
 104 |       logRatio = keras::k_log(pvePart + epsilon) - 
 105 |         keras::k_log(nvePart + epsilon)
 106 |       eta = self$slope * logRatio + self$intercept + self$boostingOffset
 107 |       # keras::k_sigmoid(eta)
 108 |       eta
 109 |     }
 110 |   } else if (cdbl$logRatioType == 'B') {
 111 |     epsilon = cdbl$optParams$epsilonB
 112 |     forwardPass = function(x, mask = NULL) {
 113 |       softAssignment = 2 * keras::k_sigmoid(self$weights) - 1
 114 |       # Add the small value to ensure gradient flows at exact zeros (initial values)
 115 |       pvePart = keras::k_relu(softAssignment + 1e-20)
 116 |       nvePart = keras::k_relu(-softAssignment)
 117 |       logRatio = keras::k_dot(keras::k_log(x), pvePart) / keras::k_maximum(keras::k_sum(pvePart), epsilon) -
 118 |         keras::k_dot(keras::k_log(x), nvePart) / keras::k_maximum(keras::k_sum(nvePart), epsilon)
 119 |       eta = self$slope * logRatio + self$intercept + self$boostingOffset
 120 |       # keras::k_sigmoid(eta)
 121 |       eta
 122 |     }
 123 |   }
 124 |   
 125 |   if (FALSE) {
 126 |     tensorflow::tf$random$set_seed(0)
 127 |   }
 128 |   
 129 |   # Set up custom layer
 130 |   CustomLayer <- R6::R6Class(
 131 |     "CustomLayer",
 132 |     
 133 |     inherit = keras::KerasLayer,
 134 |     
 135 |     public = list(
 136 |       output_dim = NULL,
 137 |       weights = NULL,
 138 |       intercept = NULL,
 139 |       slope = NULL,
 140 |       boostingOffset = NULL,
 141 |       # epsilon = NULL,
 142 |       
 143 |       initialize = function() {
 144 |         self$output_dim <- 1
 145 |       },
 146 |       
 147 |       build = function(input_shape) {
 148 |         self$weights <- self$add_weight(
 149 |           name = 'weights', 
 150 |           shape = list(as.integer(inputDim), as.integer(1)),
 151 |           initializer = keras::initializer_zeros(),
 152 |           trainable = TRUE
 153 |         )
 154 |         self$intercept <- self$add_weight(
 155 |           name = 'intercept', 
 156 |           shape = list(as.integer(1)),
 157 |           initializer = keras::initializer_constant(interceptInit),
 158 |           trainable = TRUE
 159 |         )
 160 |         self$slope <- self$add_weight(
 161 |           name = 'slope', 
 162 |           shape = list(as.integer(1)),
 163 |           initializer = keras::initializer_constant(slopeInit),
 164 |           trainable = TRUE
 165 |         )
 166 |         self$boostingOffset <- self$add_weight(
 167 |           name = 'boostingOffset',
 168 |           shape = list(as.integer(numObs), as.integer(1)),
 169 |           initializer = keras::initializer_constant(cdbl$boostingOffset),
 170 |           trainable = FALSE
 171 |         )
 172 |         # self$epsilon <- self$add_weight(
 173 |         #   name = 'epsilon', 
 174 |         #   shape = list(as.integer(1)),
 175 |         #   initializer = keras::initializer_constant(cdbl$epsilon),
 176 |         #   trainable = FALSE
 177 |         # )
 178 |       },
 179 |       
 180 |       call = forwardPass,
 181 |       
 182 |       compute_output_shape = function(input_shape) {
 183 |         list(input_shape[[1]], self$output_dim)
 184 |       }
 185 |     )
 186 |   )
 187 |   
 188 |   .trainKeras = function(lr, epochs) {
 189 |     # define layer wrapper function
 190 |     codacoreLayer <- function(object) {
 191 |       keras::create_layer(CustomLayer, object)
 192 |     }
 193 |     
 194 |     # use it in a model
 195 |     model <- keras::keras_model_sequential()
 196 |     model %>% codacoreLayer()
 197 |     if (cdbl$objective == "binary classification") {
 198 |       model %>% layer_activation('sigmoid')
 199 |     }
 200 |     
 201 |     # compile graph
 202 |     model %>% keras::compile(
 203 |       loss = loss_func,
 204 |       optimizer = keras::optimizer_sgd(lr, momentum=cdbl$optParams$momentum),
 205 |       # optimizer = keras::optimizer_adam(0.001),
 206 |       metrics = metrics
 207 |     )
 208 |     
 209 |     
 210 |     model %>% keras::fit(cdbl$x, cdbl$y, epochs=epochs, 
 211 |                          batch_size=cdbl$optParams$batchSize, 
 212 |                          verbose=FALSE)# =TRUE) for debugging
 213 |     return(model)
 214 |   }
 215 |   
 216 |   runAdaptively = is.numeric(cdbl$optParams$adaptiveLR) & is.null(cdbl$optParams$vanillaLR)
 217 |   if (runAdaptively) {
 218 |     # Adaptive learning rate here means that we pick the lr s.t.
 219 |     # our first gradient step moves the amalWeights out by a specified amount
 220 |     model = .trainKeras(1, 1)
 221 |     lr = cdbl$optParams$adaptiveLR
 222 |     epochs = cdbl$optParams$epochs
 223 |     lr = lr / max(abs(as.numeric(model$get_weights()[[1]])))
 224 |     model = .trainKeras(lr, epochs)
 225 |   } else {
 226 |     warning("Using non-adaptive learning rate may hinder optimization.")
 227 |     lr = cdbl$optParams$vanillaLR
 228 |     epochs = cdbl$optParams$epochs
 229 |     model = .trainKeras(lr, epochs)
 230 |   }
 231 |   
 232 |   
 233 |   # Save results:
 234 |   cdbl$weights = as.numeric(model$get_weights()[[1]])
 235 |   cdbl$softAssignment = 2 / (1 + exp(-cdbl$weights)) - 1
 236 |   cdbl$intercept = as.numeric(model$get_weights()[[2]])
 237 |   cdbl$slope = as.numeric(model$get_weights()[[3]])
 238 |   
 239 |   # Equalize the largest + and largest - assignment for more 'balanced' balances
 240 |   eqRatio = max(cdbl$softAssignment) / min(cdbl$softAssignment) * (-1)
 241 |   cdbl$softAssignment[cdbl$softAssignment < 0] = cdbl$softAssignment[cdbl$softAssignment < 0] * eqRatio
 242 |   
 243 |   endTime = Sys.time()
 244 |   if (cdbl$verbose) {
 245 |     print('GD time:')
 246 |     print(endTime - startTime)
 247 |   }
 248 |   # cdbl$runTimeGD = endTime - startTime
 249 |   
 250 |   return(cdbl)
 251 | }
 252 | 
 253 | # Given a trained softAssignment, which corresponds to running
 254 | # the weights through an activation, we find
 255 | # the cutoff at which we define our log-ratio
 256 | findBestCutoff.CoDaBaseLearner = function(cdbl) {
 257 |   if (any(abs(cdbl$softAssignment) > 0.999999)) {
 258 |     warning("Large weights encountered in gradient descent;
 259 |             vanishing gradients likely.
 260 |             Learning rates might need recalibrating - try adaptive rates?")
 261 |   }
 262 |   
 263 |   candidateCutoffs = sort(abs(cdbl$softAssignment), decreasing=TRUE)
 264 |   maxCutoffs = cdbl$cvParams$maxCutoffs
 265 |   # Start from 2nd since we equalized +ve and -ve; thus neither side will be empty
 266 |   candidateCutoffs = candidateCutoffs[2:min(maxCutoffs, length(candidateCutoffs))]
 267 |   
 268 |   # TODO: re-implement without passing cdbl to harden()
 269 |   # and setInterceptAndSlope() to avoid computational overhead
 270 |   # from copying data unnecessarily
 271 |   
 272 |   # Compute the CV scores:
 273 |   startTime = Sys.time()
 274 |   numFolds = cdbl$cvParams$numFolds
 275 |   # Naive way of splitting equally into folds:
 276 |   foldIdx = sample(cut(1:length(cdbl$y), breaks=numFolds, labels=FALSE))
 277 |   if (cdbl$objective == "binary classification") {
 278 |     # Instead we randomize with equal # of case/controls in each fold
 279 |     # See discussion on stratified CV in page 204 of He & Ma 2013
 280 |     if (sum(cdbl$y) < numFolds | sum(1 - cdbl$y) < numFolds) {
 281 |       stop("Insufficient samples from each class available for cross-validation.")
 282 |     }
 283 |     caseIdx = sample(cut(1:sum(cdbl$y), breaks=numFolds, labels=FALSE))
 284 |     controlIdx = sample(cut(1:sum(1 - cdbl$y), breaks=numFolds, labels=FALSE))
 285 |     foldIdx[cdbl$y == 1] = caseIdx
 286 |     foldIdx[cdbl$y == 0] = controlIdx
 287 |   } 
 288 |   scores = matrix(nrow=length(candidateCutoffs), ncol=numFolds)
 289 |   i = 0
 290 |   for (cutoff in candidateCutoffs) {
 291 |     i = i + 1
 292 |     cdbl = harden.CoDaBaseLearner(cdbl, cutoff)
 293 |     for (j in 1:numFolds) {
 294 |       cdbl = setInterceptAndSlope.CoDaBaseLearner(cdbl, cdbl$x[foldIdx != j,], cdbl$y[foldIdx != j], cdbl$boostingOffset[foldIdx != j])
 295 |       yHat = predict(cdbl, cdbl$x[foldIdx == j,]) + cdbl$boostingOffset[foldIdx == j]
 296 |       if (cdbl$objective == "binary classification") {
 297 |         ROC = pROC::roc(cdbl$y[foldIdx == j], yHat, quiet=TRUE)
 298 |         scores[i, j] = pROC::auc(ROC)
 299 |       } else if (cdbl$objective == "regression") {
 300 |         scores[i, j] = -sqrt(mean((cdbl$y[foldIdx == j] - yHat)^2))
 301 |       }
 302 |     }
 303 |   }
 304 |   # Now implement lambda-SE rule
 305 |   means = apply(scores, 1, mean)
 306 |   # see eqn 9.2 here https://www.cs.cmu.edu/~psarkar/sds383c_16/lecture9_scribe.pdf
 307 |   stds = apply(scores, 1, stats::sd) / sqrt(numFolds)
 308 |   lambdaSeRule = max(means) - stds[which.max(means)] * cdbl$lambda
 309 |   # oneSdRule = max(means - stds)
 310 |   bestCutoff = candidateCutoffs[means >= lambdaSeRule][1]
 311 |   # bestCutoff = candidateCutoffs[which.max(scores)]
 312 |   
 313 |   
 314 |   endTime = Sys.time()
 315 |   if (cdbl$verbose) {
 316 |     print('CV time:')
 317 |     print(endTime - startTime)
 318 |     xCoor = 2:(length(means) + 1)
 319 |     graphics::plot(xCoor, means, ylim=range(c(means-stds, means+stds)))
 320 |     graphics::arrows(xCoor, means-stds, xCoor, means+stds, length=0.05, angle=90, code=3)
 321 |     graphics::abline(lambdaSeRule, 0)
 322 |   }
 323 |   
 324 |   if (cdbl$objective == "binary classification") {
 325 |     baseLineScore = pROC::auc(pROC::roc(cdbl$y, cdbl$boostingOffset, quiet=TRUE))
 326 |   } else if (cdbl$objective == "regression") {
 327 |     baseLineScore = -sqrt(mean((cdbl$y - cdbl$boostingOffset)^2))
 328 |   }
 329 |   noImprovement = lambdaSeRule < baseLineScore
 330 |   if (noImprovement) {
 331 |     bestCutoff = 1.1 # bigger than the softAssignment
 332 |   }
 333 |   
 334 |   return(bestCutoff)
 335 | }
 336 | 
 337 | 
 338 | harden.CoDaBaseLearner = function(cdbl, cutoff) {
 339 |   numPart = cdbl$softAssignment >= cutoff
 340 |   denPart = cdbl$softAssignment <= -cutoff
 341 |   hard = list(numerator=numPart, denominator=denPart)
 342 |   cdbl$hard = hard
 343 |   return(cdbl)
 344 | }
 345 | 
 346 | 
 347 | setInterceptAndSlope.CoDaBaseLearner = function(cdbl, x, y, boostingOffset) {
 348 |   # If our base learner is empty (i.e. couldn't beat the 1SE rule),
 349 |   # we simply set to 0:
 350 |   if (!any(cdbl$hard$numerator) & !any(cdbl$hard$denominator)) {
 351 |     cdbl$slope = 0.0
 352 |     cdbl$intercept = 0.0
 353 |     return(cdbl)
 354 |   }
 355 |   # Otherwise, we have a non-empty SLR, so we compute it's regression coefficient
 356 |   logRatio = computeLogRatio.CoDaBaseLearner(cdbl, x)
 357 |   dat = data.frame(x=logRatio, y=y)
 358 |   if (cdbl$objective == "binary classification") {
 359 |     glm = stats::glm(y~x, family='binomial', data=dat, offset=boostingOffset)
 360 |     if (any(is.na(glm$coefficients))) {
 361 |       glm = list(coefficients=list(0, 0))
 362 |       warning("Numerical error during glm fit. Possible data issue.") 
 363 |     }
 364 |   } else if (cdbl$objective == "regression") {
 365 |     glm = stats::glm(y~x, family='gaussian', data=dat, offset=boostingOffset)
 366 |   } else {
 367 |     stop("Not implemented objective=", cdbl$objective)
 368 |   }
 369 |   cdbl$intercept = glm$coefficients[[1]]
 370 |   cdbl$slope = glm$coefficients[[2]]
 371 |   return(cdbl)
 372 | }
 373 | 
 374 | 
 375 | computeLogRatio.CoDaBaseLearner = function(cdbl, x) {
 376 |   
 377 |   if (!any(cdbl$hard$numerator) | !any(cdbl$hard$denominator)) {
 378 |     logRatio = rowSums(x * 0)
 379 |   } else { # we have a bona fide log-ratio
 380 |     if (cdbl$logRatioType == 'A') {
 381 |       epsilon = cdbl$optParams$epsilonA
 382 |       pvePart = rowSums(x[, cdbl$hard$numerator, drop=FALSE]) # drop=FALSE to keep as matrix
 383 |       nvePart = rowSums(x[, cdbl$hard$denominator, drop=FALSE])
 384 |       logRatio = log(pvePart + epsilon) - log(nvePart + epsilon)
 385 |     } else if (cdbl$logRatioType == 'B') {
 386 |       pvePart = rowMeans(log(x[, cdbl$hard$numerator, drop=FALSE])) # drop=FALSE to keep as matrix
 387 |       nvePart = rowMeans(log(x[, cdbl$hard$denominator, drop=FALSE]))
 388 |       logRatio = pvePart - nvePart
 389 |     }
 390 |   }
 391 |   
 392 |   return(logRatio)
 393 | }
 394 | 
 395 | 
 396 | predict.CoDaBaseLearner = function(cdbl, x, asLogits=TRUE) {
 397 |   logRatio = computeLogRatio.CoDaBaseLearner(cdbl, x)
 398 |   eta = cdbl$slope * logRatio + cdbl$intercept
 399 |   if (asLogits) {
 400 |     return(eta)
 401 |   } else {
 402 |     if (cdbl$objective == 'regression') {
 403 |       stop("Logits argument should only be used for classification, not regression.")
 404 |     }
 405 |     return(1 / (1 + exp(-eta)))
 406 |   }
 407 | }
 408 | 
 409 | 
 410 | #' codacore
 411 | #' 
 412 | #' This function implements the codacore algorithm described by Gordon-Rodriguez et al. 2021 
 413 | #' (https://doi.org/10.1101/2021.02.11.430695).
 414 | #' 
 415 | #' @param x A data.frame or matrix of the compositional predictor variables.
 416 | #'  Rows represent observations and columns represent variables.
 417 | #' @param y A data.frame, matrix or vector of the response. In the case of a 
 418 | #'  data.frame or matrix, there should be one row for each observation, and
 419 | #'  just a single column.
 420 | #' @param logRatioType A string indicating whether to use "balances" or "amalgamations".
 421 | #'  Also accepts "balance", "B", "ILR", or "amalgam", "A", "SLR".
 422 | #'  Note that the current implementation for balances is not strictly an ILR,
 423 | #'  but rather just a collection of balances (which are possibly non-orthogonal
 424 | #'  in the Aitchison sense).
 425 | #' @param objective A string indicating "binary classification" or "regression". By default,
 426 | #'  it is NULL and gets inferred from the values in y.
 427 | #' @param lambda A numeric. Corresponds to the "lambda-SE" rule. Sets the "regularization strength"
 428 | #'  used by the algorithm to decide how to harden the ratio. 
 429 | #'  Larger numbers tend to yield fewer, more sparse ratios.
 430 | #' @param offset A numeric vector of the same length as y. Works similarly to the offset in a glm.
 431 | #' @param shrinkage A numeric. Shrinkage factor applied to each base learner.
 432 | #'  Defaults to 1.0, i.e., no shrinkage applied.
 433 | #' @param maxBaseLearners An integer. The maximum number of log-ratios that the model will
  434 | #'  learn before stopping. Automatic stopping based on the lambda-SE rule (see \code{lambda}) may occur sooner.
 435 | #' @param optParams A list of named parameters for the optimization of the
 436 | #'  continuous relaxation. Empty by default. User can override as few or as
 437 | #'  many of our defaults as desired. Includes adaptiveLR (learning rate under
 438 | #'  adaptive training scheme), momentum (in the gradient-descent sense), 
 439 | #'  epochs (number of gradient-descent epochs), batchSize (number of 
 440 | #'  observations per minibatch, by default the entire dataset),
 441 | #'  and vanillaLR (the learning rate to be used if the user does *not* want
 442 | #'  to use the 'adaptiveLR', to be used at the risk of optimization issues).
 443 | #' @param cvParams A list of named parameters for the "hardening" procedure
 444 | #'  using cross-validation. Includes numFolds (number of folds, default=5) and
 445 | #'  maxCutoffs (number of candidate cutoff values of 'c' to be tested out
 446 | #'  during CV process, default=20 meaning log-ratios with up to 21 components
 447 | #'  can be found by codacore).
 448 | #' @param verbose A boolean. Toggles whether to display intermediate steps.
 449 | #' @param overlap A boolean. Toggles whether successive log-ratios found by 
 450 | #'  CoDaCoRe may contain repeated input variables. TRUE by default.
 451 | #'  Changing to FALSE implies that the log-ratios obtained by CoDaCoRe
 452 | #'  will become orthogonal in the Aitchison sense, analogously to the
 453 | #'  isometric-log-ratio transformation, while losing a small amount of
 454 | #'  model flexibility.
 455 | #' @param fast A boolean. Whether to run in fast or slow mode. TRUE by
 456 | #'  default. Running in slow mode will take ~x5 the computation time,
 457 | #'  but may help identify slightly more accurate log-ratios.
 458 | #' 
 459 | #' @return A \code{codacore} object.
 460 | #' 
 461 | #' @examples
 462 | #' \dontrun{
 463 | #' data("Crohn")
 464 | #' x <- Crohn[, -ncol(Crohn)]
 465 | #' y <- Crohn[, ncol(Crohn)]
 466 | #' x <- x + 1
 467 | #' model = codacore(x, y)
 468 | #' print(model)
 469 | #' plot(model)
 470 | #' }
 471 | #' 
 472 | #' @importFrom stats predict
 473 | #' 
 474 | #' @export
 475 | codacore <- function(
 476 |   x,
 477 |   y,
 478 |   logRatioType='balances',
 479 |   objective=NULL,
 480 |   lambda=1.0,
 481 |   offset=NULL,
 482 |   shrinkage=1.0,
 483 |   maxBaseLearners=5,
 484 |   optParams=list(),
 485 |   cvParams=list(),
 486 |   verbose=FALSE,
 487 |   overlap=TRUE,
 488 |   fast=TRUE
 489 | ){
 490 |   
 491 |   # Convert x and y to the appropriate objects
 492 |   x = .prepx(x)
 493 |   y = .prepy(y)
 494 |   
 495 |   # Check whether we are in regression or classification mode by inspecting y
 496 |   if (is.null(objective)) {
 497 |     distinct_values = length(unique(y))
 498 |     if (distinct_values == 2) {
 499 |       objective = 'binary classification'
 500 |     } else if (inherits(y, 'factor')) {
 501 |       stop("Multi-class classification note yet implemented.")
 502 |     } else if (inherits(y, 'numeric')) {
 503 |       objective = 'regression'
 504 |       if (distinct_values <= 10) {
 505 |         warning("Response only has ", distinct_values, " distinct values.")
 506 |         warning("Consider changing the objective function.")
 507 |       }
 508 |     }
 509 |   }
 510 |   
 511 |   # Make sure we recognize objective
 512 |   if (! objective %in% c('binary classification', 'regression')) {
 513 |     stop("Objective: ", objective, " not yet implemented.")
 514 |   }
 515 |   
 516 |   # Save names of labels if relevant
 517 |   if (objective == 'binary classification' & inherits(y, 'factor')) {
 518 |     yLevels = levels(y)
 519 |     y = as.numeric(y) - 1
 520 |   } else {
 521 |     yLevels = NULL
 522 |   }
 523 |   
 524 |   # In the regression case, standardize data and save scale
 525 |   if (objective == 'regression') {
 526 |     yMean = mean(y)
 527 |     yScale = stats::sd(y)
 528 |     y = (y - yMean) / yScale
 529 |   } else {
 530 |     yMean = NULL
 531 |     yScale = NULL
 532 |   }
 533 |   
 534 |   # Convert logRatioType to a unique label:
 535 |   if (logRatioType %in% c('amalgamations', 'amalgam', 'A', 'SLR')) {
 536 |     logRatioType='A'
 537 |   } else if (logRatioType %in% c('balances', 'balance', 'B', 'ILR')) {
 538 |     logRatioType='B'
 539 |   } else {
 540 |     stop('Invalid logRatioType argument given: ', logRatioType)
 541 |   }
 542 |   
 543 |   if (any(x == 0)) {
 544 |     if (logRatioType == 'A') {
 545 |       warning("The data contain zeros. An epsilon is used to prevent divide-by-zero errors.")
 546 |     } else if (logRatioType == 'B') {
 547 |       stop("The data contain zeros. Balances cannot be used in this case.")
 548 |     }
 549 |   }
 550 |   
 551 |   if (!overlap) {
 552 |     # We store away the original data, since we will override during
 553 |     # the stagewise-additive procedure, zeroing out the input variables
 554 |     # that get picked up by each log-ratio.
 555 |     xOriginal = x
 556 |   }
 557 |   
 558 |   if (nrow(x) > 10000) {
 559 |     warning("Large number of observations; codacore could benefit from minibatching.")
 560 |   }
 561 |     
 562 |   if (nrow(x) < 50) {
 563 |     warning("Small number of observations; proceed with care (the likelihood of unstable results may increase).")
 564 |   }
 565 |   
 566 |   # Set up optimization parameters
 567 |   optDefaults = list(
 568 |     epochs=100,
 569 |     batchSize=nrow(x),
 570 |     vanillaLR=NULL,
 571 |     adaptiveLR=0.5,
 572 |     momentum=0.9,
 573 |     epsilonA=1e-6,
 574 |     epsilonB=1e-2
 575 |     # initialization = 'zeros'
 576 |   )
 577 |   # Take the defaults and override with any user-specified params, if given
 578 |   for (param in names(optParams)) {
 579 |     if (param %in% names(optDefaults)) {
 580 |       optDefaults[param] = optParams[param]
 581 |     } else {
 582 |       stop('Unknown optimization parameter given:', param)
 583 |     }
 584 |   }
 585 |   optParams = optDefaults
 586 |   
 587 |   # Check whether we are running in fast or slow mode
 588 |   if (!fast) {
 589 |     message("CoDaCoRe is running in slow mode. Switch to fast=TRUE for ~x5 speedup.")
 590 |     optParams$epochs = 1000
 591 |   }
 592 |   
 593 |   # Set up cross-validation parameters
 594 |   cvDefaults = list(
 595 |     maxCutoffs=20,
 596 |     numFolds=5
 597 |   )
 598 |   # Take the defaults and override with any user-specified params, if given
 599 |   for (param in names(cvParams)) {
 600 |     if (param %in% names(cvDefaults)) {
 601 |       cvDefaults[param] = cvParams[param]
 602 |     } else {
 603 |       stop('Unknown optimization parameter given:', param)
 604 |     }
 605 |   }
 606 |   cvParams = cvDefaults
 607 |   
 608 |   
 609 |   ### Now we train codacore:
 610 |   # Initialize from an empty ensemble
 611 |   ensemble = list()
 612 |   if (is.null(offset)) {
 613 |     boostingOffset = y * 0.0
 614 |   } else {
 615 |     boostingOffset = offset
 616 |   }
 617 |   maxBaseLearners = maxBaseLearners / shrinkage
 618 |   for (i in 1:maxBaseLearners) {
 619 |     startTime = Sys.time()
 620 |     cdbl = .CoDaBaseLearner(
 621 |       x=x,
 622 |       y=y,
 623 |       boostingOffset=boostingOffset,
 624 |       logRatioType=logRatioType,
 625 |       objective=objective,
 626 |       lambda=lambda,
 627 |       optParams=optParams,
 628 |       cvParams=cvParams,
 629 |       verbose=verbose
 630 |     )
 631 |     endTime = Sys.time()
 632 |     
 633 |     if (verbose) {
 634 |       cat('\n\n\nBase Learner', i)
 635 |       cat('\nLog-ratio indexes:')
 636 |       cat('\nNumerator =', which(cdbl$hard$numerator))
 637 |       cat('\nDenominator =', which(cdbl$hard$denominator))
 638 |       if (objective == 'binary classification') {
 639 |         cat('\nAccuracy:', cdbl$accuracy)
 640 |         cat('\nAUC:', cdbl$AUC)
 641 |       } else if (objective == 'regression') {
 642 |         cat('\nRMSE', cdbl$RMSE)
 643 |       }
 644 |       cat('\nTime taken:', endTime - startTime)
 645 |     }
 646 |     
 647 |     # If base learner is empty, we stop (no further gain in CV AUC):
 648 |     if (!any(cdbl$hard$numerator) & !any(cdbl$hard$denominator)) {break}
 649 |     
 650 |     # Add the new base learner to ensemble
 651 |     boostingOffset = boostingOffset + shrinkage * predict(cdbl, x)
 652 |     ensemble[[i]] = cdbl
 653 |     
 654 |     # If AUC is ~1, we stop (we separated the training data):
 655 |     # Note this won't always get caught by previous check since separability can lead to
 656 |     # numerical overflow which throws an error rather than finding an empty base learner
 657 |     if (cdbl$objective == 'binary classification' && cdbl$AUC > 0.999) {break}
 658 |     if (cdbl$objective == 'regression' && cdbl$Rsquared > 0.999) {break}
 659 |     
 660 |     # To avoid overlapping log-ratios, we "zero-out" the input variables that have 
 661 |     # already been used
 662 |     if (!overlap) {
 663 |       x[, cdbl$hard$numerator] = min(x)
 664 |       x[, cdbl$hard$denominator] = min(x)
 665 |     }
 666 |   }
 667 |   
 668 |   if (!overlap) {
 669 |     # Replace the original data frame for saving in the object
 670 |     x = xOriginal
 671 |   }
 672 |   
 673 |   cdcr = list(
 674 |     ensemble=ensemble,
 675 |     x = x,
 676 |     y = y,
 677 |     objective=objective,
 678 |     logRatioType=logRatioType,
 679 |     lambda=lambda,
 680 |     shrinkage=shrinkage,
 681 |     maxBaseLearners=maxBaseLearners,
 682 |     optParams=optParams,
 683 |     cvParams=cvParams,
 684 |     overlap=overlap,
 685 |     yLevels=yLevels,
 686 |     yMean=yMean,
 687 |     yScale=yScale
 688 |   )
 689 |   class(cdcr) = "codacore"
 690 |   
 691 |   # If no log-ratios were found, suggest reducing regularization strength
 692 |   if (length(ensemble) == 0) {
 693 |     warning("No predictive log-ratios were found. Consider using lower values of lambda.")
 694 |   }
 695 |   
 696 |   return(cdcr)
 697 | }
 698 | 
 699 | 
 700 | #' predict
 701 | #'
 702 | #' @param object A codacore object.
 703 | #' @param newx A set of inputs to our model.
 704 | #' @param asLogits Whether to return outputs in logit space
 705 | #'  (as opposed to probability space). Should always be set
 706 | #'  to TRUE for regression with continuous outputs, but can
 707 | #'  be toggled for classification problems.
 708 | #' @param numLogRatios How many predictive log-ratios to 
 709 | #'  include in the prediction. By default, includes the
 710 | #'  effects of all log-ratios that were obtained during
 711 | #'  training. Setting this parameter to an integer k will
 712 | #'  restrict to using only the top k log-ratios in the model.
 713 | #' @param ... Not used.
 714 | #'
 715 | #' @export
 716 | predict.codacore = function(object, newx, asLogits=TRUE, numLogRatios=NA, ...) {
 717 |   # Throw an error if zeros are present
 718 |   if (any(newx == 0)) {
 719 |     if (object$logRatioType == 'A') {
 720 |       warning("The data contain zeros. An epsilon is used to prevent divide-by-zero errors.")
 721 |     } else if (object$logRatioType == 'B') {
 722 |       stop("The data contain zeros. Balances cannot be used in this case.")
 723 |     }
 724 |   }
 725 |   
 726 |   x = .prepx(newx)
 727 |   yHat = rep(0, nrow(x))
 728 |   
 729 |   if (is.na(numLogRatios)) {
 730 |     numLogRatios = length(object$ensemble)
 731 |   }
 732 |   
 733 |   for (i in 1:numLogRatios) {
 734 |     cdbl = object$ensemble[[i]]
 735 |     yHat = yHat + object$shrinkage * predict(cdbl, x)
 736 |   }
 737 |   
 738 |   if (object$objective == 'binary classification') {
 739 |     if (asLogits) {
 740 |       return(yHat)
 741 |     } else {
 742 |       return(1 / (1 + exp(-yHat)))
 743 |     }
 744 |   } else if (object$objective == 'regression') {
 745 |     return(yHat * object$yScale + object$yMean)
 746 |   }
 747 | }
 748 | 
 749 | 
 750 | #' print
 751 | #'
 752 | #' @param x A codacore object.
 753 | #' @param ... Not used.
 754 | #'
 755 | #' @export
 756 | print.codacore = function(x, ...) {
 757 |   # TODO: Make this into a table to print all at once
 758 |   cat("\nNumber of log-ratios found:", length(x$ensemble))
 759 |   if (length(x$ensemble) >= 1) {
 760 |     for (i in 1:length(x$ensemble)) {
 761 |       cat("\n***")
 762 |       cat("\nLog-ratio rank", i)
 763 |       cdbl = x$ensemble[[i]]
 764 |       hard = x$ensemble[[i]]$hard
 765 |       if (is.null(rownames(cdbl$x))) {
 766 |         cat("\nNumerator:", which(cdbl$hard$numerator))
 767 |         cat("\nDenominator:", which(cdbl$hard$denominator))
 768 |       } else {
 769 |         cat("\nNumerator:", colnames(cdbl$x)[which(cdbl$hard$numerator)])
 770 |         cat("\nDenominator:", colnames(cdbl$x)[which(cdbl$hard$denominator)])
 771 |       }
 772 |       # cat("\nIntercept:", cdbl$intercept)
 773 |       if (cdbl$objective == 'binary classification') {
 774 |         cat("\nAUC:", cdbl$AUC)
 775 |         cat("\nSlope:", cdbl$slope)
 776 |       } else if (cdbl$objective == 'regression') {
 777 |         cat("\nR squared:", cdbl$Rsquared)
 778 |         cat("\nSlope:", cdbl$slope * x$yScale)
 779 |       }
 780 |     }
 781 |   }
 782 |   cat("\n") # one final new line at end to finish print block
 783 | }
 784 | 
 785 | 
 786 | #' plot
 787 | #' 
 788 | #' Plots a summary of a fitted codacore model.
 789 | #' Credit to the authors of the selbal package (Rivera-Pinto et al., 2018),
 790 | #' from whose package these plots were inspired.
 791 | #'
 792 | #' @param x A codacore object.
 793 | #' @param index The index of the log-ratio to plot.
 794 | #' @param ... Not used.
 795 | #'
 796 | #' @export
 797 | plot.codacore = function(x, index = 1, ...) {
 798 |   
 799 |   allRatios = getLogRatios(x)
 800 |   if(index > ncol(allRatios)){
 801 |     stop("The selected log-ratio does not exist!")
 802 |   }
 803 |   
 804 |   if (x$objective == 'regression') {
 805 |     
 806 |     logRatio = allRatios[, index]
 807 |     graphics::plot(logRatio, x$y, xlab='Log-ratio score', ylab='Response')
 808 |     graphics::abline(x$ensemble[[1]]$intercept, x$ensemble[[1]]$slope, lwd=2)
 809 |     
 810 |   } else if (x$objective == 'binary classification') {
 811 |     
 812 |     logRatio = allRatios[, index]
 813 |     
 814 |     # Convert 0/1 binary output to the original labels, if any
 815 |     if (!is.null(x$yLevels)) {
 816 |       y = x$yLevels[x$y + 1]
 817 |     }
 818 |     
 819 |     graphics::boxplot(
 820 |       logRatio ~ y,
 821 |       col=c('orange','lightblue'),
 822 |       main=paste0('Distribution of log-ratio ', index),
 823 |       xlab='Log-ratio score',
 824 |       ylab='Outcome',
 825 |       horizontal=TRUE
 826 |     )
 827 |     
 828 |   }
 829 | }
 830 | 
 831 | 
 832 | #' plotROC
 833 | #'
 834 | #' @param cdcr A codacore object.
 835 | #'
 836 | #' @export
 837 | plotROC = function(cdcr) {
 838 |   
 839 |   if (cdcr$objective != 'binary classification') {
 840 |     stop("ROC curves undefined for binary classification")
 841 |   }
 842 |   cols = c("black", "gray50", "gray70", "gray80", "gray90")
 843 |   lwds = c(2.0, 1.5, 1.2, 0.8, 0.6)
 844 |   oldPar <- graphics::par(no.readonly = TRUE)
 845 |   on.exit(graphics::par(oldPar)) # make sure to restore params even if there's an error
 846 |   graphics::par(pty = 's')
 847 |   graphics::plot(cdcr$ensemble[[1]]$ROC)
 848 |   legendCols = cols
 849 |   numBL = length(cdcr$ensemble)
 850 |   legendText = c()
 851 |   legendLwds = c()
 852 |   for (i in 1:min(5, numBL)) {
 853 |     cdbl = cdcr$ensemble[[i]]
 854 |     graphics::lines(cdbl$ROC$specificities, cdbl$ROC$sensitivities, col=cols[i], lwd=lwds[i])
 855 |     legendText = c(legendText, paste0("Log-ratio: ", i, ", AUC: ", round(cdbl$AUC, 2)))
 856 |     legendCols = c(legendCols, cols[i])
 857 |     legendLwds = c(legendLwds, lwds[i])
 858 |   }
 859 |   graphics::legend(
 860 |     "bottomright",
 861 |     rev(legendText),
 862 |     lty=1,
 863 |     col=rev(legendCols),
 864 |     lwd=rev(legendLwds) + 0.5
 865 |   )
 866 | }
 867 | 
 868 | 
 869 | # Helper functions below...
 870 | 
 871 | 
 872 | #' activeInputs
 873 | #'
 874 | #' @param cdcr A codacore object.
 875 | #'
 876 | #' @return The covariates included in the log-ratios
 877 | #' 
 878 | #' @export
 879 | activeInputs.codacore = function(cdcr) {
 880 |   
 881 |   vars = c()
 882 |   
 883 |   for (cdbl in cdcr$ensemble) {
 884 |     vars = c(vars, which(cdbl$hard$numerator))
 885 |     vars = c(vars, which(cdbl$hard$denominator))
 886 |   }
 887 |   
 888 |   return(sort(unique(vars)))
 889 | }
 890 | 
 891 | 
 892 | #' getNumeratorParts
 893 | #'
 894 | #' @param cdcr A codacore object.
 895 | #' @param baseLearnerIndex An integer indicating which of the 
 896 | #'     (possibly multiple) log-ratios learned by codacore to be used.
 897 | #' @param boolean Whether to return the parts in boolean form
 898 | #'     (a vector of TRUE/FALSE) or to return the column names of
 899 | #'     those parts directly.
 900 | #'
 901 | #' @return The covariates in the numerator of the selected log-ratio.
 902 | #' 
 903 | #' @export
 904 | getNumeratorParts <- function(cdcr, baseLearnerIndex=1, boolean=TRUE){
 905 |   
 906 |   parts = cdcr$ensemble[[baseLearnerIndex]]$hard$numerator
 907 |   
 908 |   if (boolean) {
 909 |     return(parts)
 910 |   } else {
 911 |     return(colnames(cdcr$x)[parts])
 912 |   }
 913 | }
 914 | 
 915 | #' getDenominatorParts
 916 | #'
 917 | #' @param cdcr A codacore object.
 918 | #' @param baseLearnerIndex An integer indicating which of the 
 919 | #'     (possibly multiple) log-ratios learned by codacore to be used.
 920 | #' @param boolean Whether to return the parts in boolean form
 921 | #'     (a vector of TRUE/FALSE) or to return the column names of
 922 | #'     those parts directly.
 923 | #' 
 924 | #' @return The covariates in the denominator of the selected log-ratio.
 925 | #' 
 926 | #' @export
 927 | getDenominatorParts <- function(cdcr, baseLearnerIndex=1, boolean=TRUE){
 928 |   
 929 |   parts = cdcr$ensemble[[baseLearnerIndex]]$hard$denominator
 930 |   
 931 |   if (boolean) {
 932 |     return(parts)
 933 |   } else {
 934 |     return(colnames(cdcr$x)[parts])
 935 |   }
 936 | }
 937 | 
 938 | #' getLogRatios
 939 | #'
 940 | #' @param cdcr A codacore object
 941 | #' @param x A set of (possibly unseen) compositional data. 
 942 | #'     The covariates must be passed in the same order as 
 943 | #'     for the original codacore() call.
 944 | #'
 945 | #' @return The learned log-ratio features, computed on input x.
 946 | #' 
 947 | #' @export
 948 | getLogRatios <- function(cdcr, x=NULL){
 949 |   
 950 |   if (is.null(x)) {
 951 |     x = cdcr$x
 952 |   }
 953 |   
 954 |   if (cdcr$logRatioType == 'A') {
 955 |     epsilonA = cdcr$optParams$epsilonA
 956 |     ratios <- lapply(cdcr$ensemble, function(a){
 957 |       num <- rowSums(x[, a$hard$numerator, drop=FALSE]) + epsilonA
 958 |       den <- rowSums(x[, a$hard$denominator, drop=FALSE]) + epsilonA
 959 |       log(num/den)
 960 |     })
 961 |   } else if (cdcr$logRatioType == 'B') {
 962 |     ratios <- lapply(cdcr$ensemble, function(a){
 963 |       num <- rowMeans(log(x[, a$hard$numerator, drop=FALSE]))
 964 |       den <- rowMeans(log(x[, a$hard$denominator, drop=FALSE]))
 965 |       num - den
 966 |     })
 967 |   }
 968 |   
 969 |   out <- do.call("cbind", ratios)
 970 |   colnames(out) <- paste0("log-ratio", 1:ncol(out))
 971 |   return(out)
 972 | }
 973 | 
 974 | 
 975 | #' getSlopes
 976 | #'
 977 | #' @param cdcr A codacore object
 978 | #'
 979 | #' @return The slopes (i.e., regression coefficients) for each log-ratio.
 980 | #' 
 981 | #' @export
 982 | getSlopes <- function(cdcr){
 983 |   
 984 |   out = c()
 985 |   
 986 |   for (cdbl in cdcr$ensemble) {
 987 |     out = c(out, cdbl$slope)
 988 |   }
 989 |   
 990 |   return(out)
 991 | }
 992 | 
 993 | 
 994 | #' getNumLogRatios
 995 | #'
 996 | #' @param cdcr A codacore object
 997 | #'
 998 | #' @return The number of log-ratios that codacore found.
 999 | #'     Typically a small integer. Can be zero if codacore
1000 | #'     found no predictive log-ratios in the data.
1001 | #' 
1002 | #' @export
1003 | getNumLogRatios <- function(cdcr){
1004 |   return(length(cdcr$ensemble))
1005 | }
1006 | 
1007 | 
#' getTidyTable
#'
#' Builds a long-format table with one row per part of each learned
#' log-ratio, recording the side of the ratio the part sits on
#' (\code{Side}), the part's column name (\code{Name}) and the index
#' of the log-ratio it belongs to (\code{logRatioIndex}).
#'
#' @param cdcr A codacore object
#'
#' @return A table displaying the log-ratios found
#'     (\code{NULL} if no log-ratios were found).
#' 
#' @export
getTidyTable <- function(cdcr){
  
  numFound = getNumLogRatios(cdcr)
  
  # Nothing to tabulate
  if (numFound == 0) {
    return(NULL)
  }
  
  # One small table per log-ratio: numerator rows first, then denominator rows
  perLogRatio = lapply(seq_len(numFound), function(k) {
    numeratorRows = data.frame(Side = 'Numerator', Name = getNumeratorParts(cdcr, k, FALSE))
    denominatorRows = data.frame(Side = 'Denominator', Name = getDenominatorParts(cdcr, k, FALSE))
    tbl = rbind(numeratorRows, denominatorRows)
    tbl$logRatioIndex = k
    tbl
  })
  
  do.call(rbind, perLogRatio)
}
1034 | 
#' getBinaryPartitions
#'
#' Encodes every learned log-ratio as a column of signs: 1 for parts
#' in the numerator, -1 for parts in the denominator, and 0 otherwise.
#'
#' @param cdcr A codacore object
#'
#' @return A matrix describing whether each component (as rows) is found in the
#'  numerator (1) or denominator (-1) of each learned log-ratio (as columns).
#'  This format resembles a serial binary partition matrix frequently used
#'  in balance analysis. If no log-ratios were learned, the matrix has
#'  zero columns.
#' 
#' @export
getBinaryPartitions <- function(cdcr){
  
  numBaseLearners <- length(cdcr$ensemble)
  
  # No log-ratios: return an empty partition matrix rather than trying to
  # index a non-existent base learner
  if (numBaseLearners == 0) {
    numParts <- if (is.null(ncol(cdcr$x))) 0L else ncol(cdcr$x)
    return(matrix(numeric(0), nrow = numParts, ncol = 0))
  }
  
  res <- vector("list", numBaseLearners)
  for (baseLearner in seq_len(numBaseLearners)) {
    thisNumerator <- getNumeratorParts(cdcr, baseLearner)
    thisDenominator <- getDenominatorParts(cdcr, baseLearner)
    # TRUE/FALSE masks become +1 (numerator), -1 (denominator) or 0 (unused)
    res[[baseLearner]] <- thisNumerator*1 + thisDenominator*-1
  }
  do.call("cbind", res)
}
1056 | 
# Coerces the compositional input to a numeric (double) matrix and
# normalizes each row to sum to one.
.prepx = function(x) {
  # inherits() rather than class(x)[1], so that subclasses of tibbles and
  # data frames (where the first class is something else) are converted too
  if (inherits(x, 'tbl_df')) {x = as.data.frame(x)}
  if (inherits(x, 'data.frame')) {x = as.matrix(x)}
  if (is.integer(x)) {x = x * 1.0}
  
  # If the data is un-normalized (e.g. raw counts),
  # we normalize it to ensure our learning rate is well calibrated
  x = x / rowSums(x)
  return(x)
}
1067 | 
# Flattens the response to a plain vector: tibbles, data frames and matrices
# must have a single column (otherwise an error is raised), and character
# responses are turned into factors.
.prepy = function(y) {
  # Shared by the data.frame and the matrix checks below
  dimError = "Response should be 1-dimensional (if given 
           as a data.frame or matrix, it should have a 
           row for each sample, and a single column)."
  
  if (inherits(y, 'tbl_df')) y = as.data.frame(y)
  
  if (inherits(y, 'data.frame')) {
    if (ncol(y) > 1) stop(dimError)
    # Keep only the (single) column
    y = y[[1]]
  }
  
  if (inherits(y, 'matrix')) {
    if (ncol(y) > 1) stop(dimError)
    # Drop the matrix dimensions
    if (inherits(y, 'character')) y = as.character(y)
    if (inherits(y, 'numeric')) y = as.numeric(y)
  }
  
  if (inherits(y, 'character')) y = factor(y)
  
  return(y)
}
1098 | 
1099 | 


--------------------------------------------------------------------------------
/R/data.R:
--------------------------------------------------------------------------------
 1 | #' Microbiome, HIV infection and MSM factor
 2 | #'
 3 | #' A dataset containing the number of counts of 60 different genera in a group
 4 | #' of 155 samples (including HIV - infected and non - infected patients).
 5 | #' The \code{data.frame} is composed by 60 genera and two variables.
 6 | #'
 7 | #' @format The \code{data.frame} is composed by 60 genera and 2 variables
 8 | #' \describe{
 9 | #'   \item{genera}{The first 60 columns, from \emph{g_Prevotella} until
10 | #'        \emph{o_NB1-n_g_unclassified} referred to different genera.}
11 | #'   \item{MSM}{a factor determining if the individual is \code{MSM} (\emph{Men Sex with
12 | #'    Men}) or not (\code{nonMSM}).}
13 | #'   \item{HIV_Status}{a factor specifying if the individual is infected
14 | #'    (\code{Pos}) or not (\code{Neg}).}
15 | #'
16 | #' }
17 | #' @docType data
18 | #' @name HIV
19 | #' @references \url{https://pubmed.ncbi.nlm.nih.gov/27077120/}
20 | #' @keywords data
21 | NULL
22 | 
23 | 
24 | #' Microbiome and sCD14 inflammation parameter
25 | #'
26 | #' A dataset containing the number of counts of 60 different genera in a group
27 | #' of 151 samples (including HIV - infected and non - infected patients).
28 | #' The \code{data.frame} is composed by 60 genera and a numeric variable
29 | #'
30 | #' @format The \code{data.frame} is composed by 60 genera and a variable
31 | #' \describe{
32 | #'   \item{genera}{The first 60 columns, from \emph{g_Prevotella} until
33 | #'   \emph{o_NB1-n_g_unclassified} referred to different genera.}
34 | #'   \item{sCD14}{a \code{numeric} variable with the value of the inflammation
35 | #'   parameter sCD14 for each sample.}
36 | #' }
37 | #' @name sCD14
38 | #' @docType data
39 | #' @references \doi{10.1016/j.ebiom.2016.01.032}
40 | #' @keywords data
41 | NULL
42 | 
43 | 
44 | 
45 | #' Microbiome composition related to Crohn's disease study
46 | #'
47 | #' A dataset containing the number of counts of 48 different genera in a group
48 | #' of 975 samples (including 662 samples of patients with Crohn's disease and
49 | #' 313 controls).
50 | #' The \code{data.frame} is composed of 48 genera and a factor variable.
51 | #'
52 | #' @format The \code{data.frame} is composed by 48 genera and a variable
53 | #' \describe{
54 | #'   \item{genera}{The first 48 columns, from \emph{g_Turicibacter} until
55 | #'   \emph{g_Bilophila} referred to different genera.}
56 | #'   \item{y}{a \code{factor} indicating if the sample corresponds to a case (
57 | #'   \emph{CD}) or a control (\emph{no}).}
58 | #' }
59 | #' @name Crohn
60 | #' @docType data
61 | #' @references \url{https://qiita.ucsd.edu/}
62 | #' @keywords data
63 | NULL
64 | 


--------------------------------------------------------------------------------
/R/simulations.R:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | 
 4 | #' simulateHTS
 5 | #' 
 6 | #' This function simulates a set of (x, y) pairs.
 7 | #' The covariates x are compositional, meaning they only
 8 | #' carry relative information.
 9 | #' The response y is a binary indicator.
10 | #' The rule linking x and y can be a balance or an amalgamation.
11 | #'
12 | #' @param n Number of observations.
13 | #' @param p Number of covariates.
14 | #' @param outputType A string indicating 'binary' or 'continuous'.
15 | #' @param logratio A string indicating 'simple', 'balance', or 
16 | #'     'amalgamation'.
17 | #'
18 | #' @return A list containing a matrix of inputs and a vector of outputs
19 | #' 
20 | #' @export
21 | simulateHTS = function(n, p, outputType = 'binary', logratio = 'simple'){
22 |   
23 |   # Simulate independent variables
24 |   alpha0 = rep(1.0, p) / log(p)
25 |   alpha = gtools::rdirichlet(1, alpha0)
26 |   alpha = sort(alpha, decreasing=T)
27 |   X = matrix(0.0, n, p)
28 |   P = matrix(0.0, n, p)
29 |   numCounts = stats::rpois(n, 10 * p)
30 |   for (i in 1:n) {
31 |     classProb = gtools::rdirichlet(1, alpha)
32 |     x = stats::rmultinom(1, numCounts[i], classProb)
33 |     # X[i,] = x / sum(x)
34 |     X[i,] = x
35 |     P[i,] = classProb
36 |   }
37 |   
38 |   # Simulate dependent variable
39 |   if (logratio == 'simple') {
40 |     if (p < 2) {
41 |       stop("Input dimension must be >= 2")
42 |     }
43 |     eta = log(P[, 1]) - log(P[, 2])
44 |   } else if (logratio == 'balance') {
45 |     if (p < 10) {
46 |       stop("Input dimension must be >= 10")
47 |     }
48 |     eta = rowMeans(log(P[, c(4, 6)])) - log(P[, 5])
49 |   } else if (logratio == 'amalgamation') {
50 |     if (p < 20) {
51 |       stop("Input dimension must be >= 20")
52 |     }
53 |     eta = log(rowSums(P[, c(1,2,6,7,15)])) - log(rowSums(P[, c(3,8,16,17)]))
54 |   } else {
55 |     stop("Variable logratio incorrectly specified.")
56 |   }
57 |   
58 |   if (outputType == 'binary') {
59 |     outProb = 1 / (1 + exp(-(eta - mean(eta)))) * 1.0
60 |     y = stats::rbinom(n, 1, outProb)
61 |   } else if (outputType == 'continuous') {
62 |     y = stats::rnorm(n, eta)
63 |   } else {
64 |     stop("Argument outputType:", outputType, ", not recognized")
65 |   }
66 |   
67 |   return(list(x=data.frame(X), y=data.frame(y)))
68 | }
69 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | [![CRAN_Status_Badge](https://www.r-pkg.org/badges/version/codacore)](https://CRAN.R-project.org/package=codacore)
 2 | [![Downloads](https://cranlogs.r-pkg.org:443/badges/codacore)](https://cranlogs.r-pkg.org:443/badges/codacore)
 3 | [![Total Downloads](https://cranlogs.r-pkg.org:443/badges/grand-total/codacore)](https://cranlogs.r-pkg.org:443/badges/grand-total/codacore)
 4 | 
 5 | # codacore
 6 | 
 7 | *Update: [CoDaCoRe is now live on CRAN](https://CRAN.R-project.org/package=codacore/)*
 8 | 
 9 | A self-contained, up-to-date implementation of [CoDaCoRe](https://doi.org/10.1093/bioinformatics/btab645), in the R programming language, by the original authors.
10 | 
11 | The [CoDaCoRe guide](https://egr95.github.io/R-codacore/inst/misc/guide.html) contains a detailed tutorial on installation, usage and functionality.
12 | 
13 | Note this repository is under active development. If you would like to use CoDaCoRe on your dataset, and have any questions regarding the installation, usage, implementation, or model itself, do not hesitate to contact <eg2912@columbia.edu>. Some previously asked questions are available on the [Issues page](https://github.com/egr95/R-codacore/issues).
14 | Contributions, fixes, and feature requests are also welcome - please create an issue, submit a pull request, or email me.
15 | 
16 | ## Quick-start: how to install and run CoDaCoRe
17 | 
18 | 1. We can install CoDaCoRe by running (further details in the [guide](https://egr95.github.io/R-codacore/inst/misc/guide.html#installation)):
19 | 
20 | ```r
21 | install.packages('codacore')
22 | ```
23 | 
24 | 2. To fit codacore on some data and check the results (further details in the [guide](https://egr95.github.io/R-codacore/inst/misc/guide.html#training-the-model)):
25 | ```r
26 | library("codacore")
27 | help(codacore) # if in doubt, check documentation
28 | data("Crohn") # load some data and apply codacore
29 | x <- Crohn[, -ncol(Crohn)] + 1
30 | y <- Crohn[, ncol(Crohn)]
31 | model = codacore(
32 |     x, # compositional input, e.g., HTS count data 
33 |     y, # response variable, typically a 0/1 binary indicator 
34 |     logRatioType = "balances", # can use "amalgamations" instead, or abbreviations "B" and "A"
35 |     lambda = 1 # regularization strength (default corresponds to 1SE rule) 
36 | )
37 | print(model)
38 | plot(model)
39 | ```
40 | 
41 | ## Reference
42 | 
43 | Gordon-Rodriguez, Elliott, Thomas P. Quinn, and John P. Cunningham. "Learning sparse log-ratios for high-throughput sequencing data." Bioinformatics 38.1 (2022): 157-163. [[link](https://doi.org/10.1093/bioinformatics/btab645)]
44 | 
45 | Quinn, Thomas P., Elliott Gordon-Rodriguez, and Ionas Erb. "A critique of differential abundance analysis, and advocacy for an alternative." arXiv preprint arXiv:2104.07266 (2021). [[link](https://arxiv.org/abs/2104.07266)]
46 | 
47 | ## Acknowledgements
48 | Thanks for your contributions to codacore!
49 | 
50 | - Marcus Fedarko
51 | - Gregor Seyer
52 | - Nick Youngblut
53 | - Antonio Garrido Fernandez
54 | 


--------------------------------------------------------------------------------
/cran-comments.md:
--------------------------------------------------------------------------------
 1 | ## Test environments
 2 | - local ubuntu 14.04, R 4.0.4
 3 | - local ubuntu 16.04, R 4.0.4
 4 | - win-builder (devel and release)
 5 | 
 6 | ## R CMD check results
 7 | There were no ERRORs or WARNINGs. 
 8 | 
 9 | There was 1 NOTE:
10 | 
11 | * checking dependencies in R code ... NOTE
12 |   Namespace in Imports field not imported from: 'R6'
13 | 
14 |   This is the initial submission of codacore <doi:10.1093/bioinformatics/btab645>.
15 | 
16 | ## Downstream dependencies
17 | There are currently no downstream dependencies for this package.


--------------------------------------------------------------------------------
/data/Crohn.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/egr95/R-codacore/b87df7921a7fabdb355990e6fc439ff80d30c67d/data/Crohn.rda


--------------------------------------------------------------------------------
/data/HIV.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/egr95/R-codacore/b87df7921a7fabdb355990e6fc439ff80d30c67d/data/HIV.rda


--------------------------------------------------------------------------------
/data/sCD14.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/egr95/R-codacore/b87df7921a7fabdb355990e6fc439ff80d30c67d/data/sCD14.rda


--------------------------------------------------------------------------------
/inst/CITATION:
--------------------------------------------------------------------------------
 1 | citHeader("To cite codacore in publications use:")
 2 | 
 3 | citEntry(entry = "Article",
 4 |          title        = paste("Learning Sparse Log-Ratios For High-Throughput Sequencing Data"),
 5 |          author       = personList(as.person("Elliott Gordon-Rodriguez"),
 6 |                                    as.person("Thomas P. Quinn"),
 7 |                                    as.person("John P. Cunningham")),
 8 |          journal      = "Bioinformatics",
 9 |          year         = "2021",
10 | 
11 |          textVersion  =
12 |            paste("Elliott Gordon-Rodriguez, Thomas P Quinn, John P Cunningham,",
13 |                  "Learning sparse log-ratios for high-throughput sequencing data,",
14 |                  "Bioinformatics, 2021;, btab645, doi:10.1093/bioinformatics/btab645")
15 | )
16 | 


--------------------------------------------------------------------------------
/inst/misc/guide.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "CoDaCoRe Guide"
  3 | author: 
  4 |   - Elliott Gordon Rodriguez^[eg2912@columbia.edu]
  5 | date: "`r Sys.Date()`"
  6 | output: 
  7 |   rmarkdown::html_vignette:
  8 |     number_sections: yes
  9 |     toc: true
 10 |     toc_depth: 3
 11 |   pdf_document:
 12 |     number_sections: no
 13 |     toc: false
 14 |     keep_tex: true
 15 |   md_document:
 16 |     number_sections: yes
 17 |     toc: true
 18 |     toc_depth: 3
 19 | vignette: >
 20 |   %\VignetteIndexEntry{CoDaCoRe Guide}
 21 |   %\VignetteEncoding{UTF-8}
 22 |   %\VignetteEngine{knitr::rmarkdown}
 23 | ---
 24 | 
 25 | ```{r, include = FALSE}
 26 | knitr::opts_chunk$set(
 27 |   collapse = TRUE,
 28 |   comment = "#>",
 29 |   fig.width = 7,
 30 |   fig.height = 5,
 31 |   tidy.opts=list(width.cutoff=40), tidy=TRUE
 32 | )
 33 | ```
 34 | 
 35 | # Installation
 36 | 
 37 | You can install ```codacore``` by running:
 38 | ```r
 39 | install.packages("codacore")
 40 | ```
 41 | 
 42 | You may instead install the [development version](https://github.com/egr95/R-codacore) directly from Github, using the [devtools package](https://www.r-project.org/nosvn/pandoc/devtools.html).
 43 | ```r
 44 | devtools::install_github("egr95/R-codacore", ref="main")
 45 | ```
 46 | 
 47 | Note that CoDaCoRe requires a working installation of [TensorFlow](https://tensorflow.rstudio.com/).
 48 | If you do not have Tensorflow previously installed, when you run ```codacore()``` for the first time you will likely encounter an error message of the form:
 49 | ```r
 50 | > codacore(x, y)
 51 | 
 52 | ERROR: Could not find a version that satisfies the requirement tensorflow
 53 | ERROR: No matching distribution found for tensorflow
 54 | Error: Installation of TensorFlow not found.
 55 | 
 56 | Python environments searched for 'tensorflow' package:
 57 |  /moto/stats/users/eg2912/miniconda3/envs/r-test/bin/python3.9
 58 |  /usr/bin/python2.7
 59 | 
 60 | You can install TensorFlow using the install_tensorflow() function.
 61 | ```
 62 | 
 63 | This can be fixed simply by [installing tensorflow](https://tensorflow.rstudio.com/install/), as follows:
 64 | ```r
 65 | install.packages("tensorflow")
 66 | library("tensorflow")
 67 | install_tensorflow()
 68 | 
 69 | install.packages("keras")
 70 | library("keras")
 71 | install_keras()
 72 | ```
 73 | 
 74 | Note also that you may have to restart your R session between installation of ```codacore```, ```tensorflow```, and ```keras```.
 75 | 
 76 | 
 77 | # Summary of method
 78 | 
 79 | CoDaCoRe is an algorithm to identify predictive log-ratio biomarkers in high-throughput sequencing data. Let $x$ denote HTS input (e.g., $x_{i,j}$ denotes the abundance of the $j$th bacteria in the $i$th subject), and let $y$ denote the outcome of interest (e.g., $y_i$ is equal to 0 or 1 depending on whether the $i$th subject belonged to the case or the control group). Given a set of $(x_i, y_i)$ pairs, CoDaCoRe identifies predictive biomarkers of the form:
 80 | $$
 81 | B(x_i; J^+, J^-) = \log \left( \frac{\sum_{j \in J^+} x_{i,j}}{\sum_{j \in J^-} x_{i,j}} \right),
 82 | $$
 83 | that are maximally associated with the response variable $y_i$. In other words, CoDaCoRe identifies a numerator set $J^+$ and a denominator set $J^-$, such that their log-ratio is most predictive of the response variable. By default, CoDaCoRe uses *balances*, which are defined as the log-ratio of *geometric means* (as opposed to summations):
 84 | $$
 85 | B(x_i; J^+, J^-) = \log \left( \frac{(\prod_{j \in J^+} x_{i,j})^{1/|J^+|}}{(\prod_{j \in J^-} x_{i,j})^{1/|J^-|}} \right).
 86 | $$
 87 | 
 88 | For an introduction to balances, we refer the reader to the [selbal paper](https://doi.org/10.1101/219386), and for a more detailed treatment of CoDaCoRe and other log-ratio methodology, we refer the reader to the [codacore paper](https://doi.org/10.1093/bioinformatics/btab645) and [this paper](https://arxiv.org/abs/2104.07266).
 89 | 
 90 | 
 91 | # Training the model
 92 | 
 93 | We assume a working installation of `codacore` ([link](https://github.com/egr95/R-codacore/blob/main/README.md)).
 94 | ```{r}
 95 | library("codacore")
 96 | help(codacore)
 97 | ```
 98 | 
 99 | In this tutorial, we will showcase `codacore` using three datasets that were also analyzed by the authors of `selbal` [(Rivera-Pinto et al., 2018)](https://doi.org/10.1101/219386). First, we consider the Crohn's disease data from [(Gevers et al., 2014)](http://dx.doi.org/10.1016/j.chom.2014.02.005).
100 | ```{r}
101 | data("Crohn")
102 | x <- Crohn[, -ncol(Crohn)]
103 | y <- Crohn[, ncol(Crohn)]
104 | ```
105 | 
106 | Our goal is to identify ratio-based biomarkers that are predictive of disease status. Our input variable consists of the abundance of 48 microbial species in 975 samples. *As is common in most machine learning libraries, our package expects an input of shape (n, p), with a row for each sample and a column for each variable.*
107 | ```{r}
108 | dim(x)
109 | ```
110 | 
111 | The output variable is a binary indicator (CD stands for Crohn's disease).
112 | ```{r}
113 | table(y)
114 | ```
115 | 
116 | Prior to fitting CoDaCoRe, we must impute any zeros in our input variable (a standard pre-processing step for ratio-based methods).
117 | ```{r}
118 | x <- x + 1
119 | ```
120 | 
121 | Next, we split our data into a training and a test set (to keep things simple we do this naively at random, though in practice one might consider stratified sampling and class rebalancing).
122 | ```{r}
123 | # For reproducibility, we set a random seed (including in TensorFlow, used by codacore)
124 | set.seed(0); library(tensorflow); tf$random$set_seed(0)
125 | trainIndex <- sample(1:nrow(x), 0.8 * nrow(x))
126 | xTrain <- x[trainIndex,]
127 | yTrain <- y[trainIndex]
128 | ```
129 | 
130 | We are ready to fit CoDaCoRe. We stick to the default parameters for now. Notice the fast runtime (as compared to, for example, `selbal.cv`).
131 | ```{r}
132 | model <- codacore(
133 |   xTrain,
134 |   yTrain,
135 |   logRatioType = 'balances', # can also use 'amalgamations'
136 |   lambda = 1                 # regularization parameter (1 corresponds to "1SE rule")
137 | )
138 | ```
139 | 
140 | # Visualizing results
141 | 
142 | Next we can check the learned output of the model: what inputs were included in the learned log-ratios, how strongly associated they are to the response, and how well they classified the data.
143 | ```{r}
144 | print(model)
145 | ```
146 | 
147 | The most predictive ratio identified by CoDaCoRe is Roseburia / Dialister, which can be visualized with the `plot` function.
148 | ```{r}
149 | plot(model)
150 | ```
151 | 
152 | Note that CoDaCoRe is an ensemble model, where multiple log-ratios are learned sequentially in decreasing order of importance (with automatic stopping whenever no additional log-ratio improved the loss function during training). We can visualize the performance of this ensembling procedure by "stacking" the respective ROC curves.
153 | ```{r}
154 | plotROC(model)
155 | ```
156 | 
157 | # Predicting on new data
158 | 
159 | We can also use our trained model to classify new samples.
160 | ```{r}
161 | xTest <- x[-trainIndex,]
162 | yTest <- y[-trainIndex]
163 | yHat <- predict(model, xTest, asLogits=F)
164 | cat("Test set AUC =", pROC::auc(pROC::roc(yTest, yHat, quiet=T)))
165 | # Convert probabilities into a binary class
166 | failure <- yHat < 0.5
167 | success <- yHat >= 0.5
168 | yHat[failure] <- levels(y)[1]
169 | yHat[success] <- levels(y)[2]
170 | cat("Classification accuracy on test set =", round(mean(yHat == yTest), 2))
171 | ```
172 | 
173 | Note our `predict` function can be restricted to only use the top _k_ log-ratios in the model for prediction.
174 | For example, the following will compute the AUC of a 1-log-ratio model, using only the top log-ratio.
175 | ```{r}
176 | yHat <- predict(model, xTest, asLogits=F, numLogRatios=1)
177 | cat("Test set AUC =", pROC::auc(pROC::roc(yTest, yHat, quiet=T)))
178 | ```
179 | 
180 | Other useful functions include:
181 | ```{r, results=F}
182 | getNumeratorParts(model, 1)
183 | getDenominatorParts(model, 1)
184 | getLogRatios(model, xTest)
185 | getNumLogRatios(model)
186 | getTidyTable(model)
187 | getSlopes(model)
188 | ```
189 | 
190 | # Controlling overlap between log-ratios
191 | 
192 | By default, CoDaCoRe allows for "overlapping log-ratios", in other words, an input variable that is included in the first log-ratio may well be included in a second or third log-ratio provided it is sufficiently predictive. However, the user may choose to restrict each successive log-ratio to be constructed from a mutually exclusive set of input variables (e.g., to obtain _orthogonal balances_, in the Aitchison sense). This can be specified with the parameter `overlap`. In our example, note how `g__Dialister` is no longer repeated.
193 | 
194 | ```{r}
195 | model <- codacore(xTrain, yTrain, overlap=F)
196 | print(model)
197 | ```
198 | 
199 | # Using amalgamations (summed-log-ratios)
200 | 
201 | CoDaCoRe can be used to learn log-ratios between either geometric means (known as "balances" or "isometric-log-ratio") or summations (known as "amalgamations" or "summed-log-ratio"), depending on the goals of the user. This can be specified with the parameter `logRatioType`.
202 | ```{r}
203 | model <- codacore(xTrain, yTrain, logRatioType = "amalgamations")
204 | print(model)
205 | ```
206 | 
207 | Note that amalgamations/summed-log-ratios are less sensitive to covariates that are small in magnitude (e.g., rare microbes), which can hinder their predictive strength for datasets where small covariates are important. On the other hand, summed-log-ratios have a different interpretation than isometric-log-ratios and may therefore be preferable in some applications (e.g., when the "summed" effect of an aggregated sub-population is the object of interest). In our Crohn's disease data, the rare species Roseburia gets picked up by the isometric-log-ratio, but not by the summed-log-ratio, which is more sensitive to more common bacteria species such as Faecalibacterium.
208 | 
209 | # Continuous outcomes
210 | 
211 | We consider the HIV data from [(Noguera-Julian et al., 2016)](http://dx.doi.org/10.1016/j.ebiom.2016.01.032). The goal here is to construct a log-ratio of the microbial abundances that is predictive of the inflammation marker "sCD14", a continuous response variable. CoDaCoRe can be applied much in the same way, except the loss function changes from binary cross-entropy to mean-squared-error. This change will happen automatically based on the values inputted as `y` (although it can also be overridden manually via the ```objective``` parameter, for example, if the user wanted to fit a binary response using the mean-squared-error, they could specify ```objective = 'regression'```).
212 | 
213 | ```{r}
214 | data("sCD14")
215 | x <- sCD14[, -ncol(sCD14)]
216 | y <- sCD14[, ncol(sCD14)]
217 | 
218 | # Replace zeros as before
219 | x <- x + 1
220 | 
221 | # Split the data
222 | trainIndex <- sample(1:nrow(x), 0.8 * nrow(x))
223 | xTrain <- x[trainIndex,]
224 | yTrain <- y[trainIndex]
225 | 
226 | # Fit codacore and inspect results
227 | model <- codacore(xTrain, yTrain)
228 | print(model)
229 | plot(model)
230 | ```
231 | 
232 | # Multiclass classification
233 | 
234 | Our implementation does not currently support multiclass targets, however a multiclass classifier can be constructed from CoDaCoRe by taking the One-vs-One or One-vs-Rest strategies.
235 | 
236 | # Tuning the regularization parameter lambda
237 | 
238 | The parameter `lambda` controls the regularization strength of CoDaCoRe. In particular, `lambda = 1` (the default value) corresponds to applying the 1-standard-error rule in the discretization step of the log-ratio (details in [Section 3.3](https://www.biorxiv.org/content/10.1101/2021.02.11.430695v2.full.pdf)). This is typically a good choice, leading to models that are both sparse and predictive. Sparser models can be achieved by higher values of `lambda`, for example, `lambda = 2` corresponds to applying a "2-standard-error" rule. On the other hand, smaller values of `lambda` result in less sparse, but typically more predictive, models. In particular, `lambda = 0` corresponds to a "0 standard-error rule", in other words choosing the log-ratio that minimizes the cross-validation loss. Such a choice can be good when we seek a maximally predictive model, but care less about sparsity.
239 | ```{r}
240 | model <- codacore(xTrain, yTrain, lambda = 0.0)
241 | print(model)
242 | ```
243 | 
244 | Notice the increased R-squared score relative to the previous model (at the expense of sparsity).
245 | 
246 | ## When no predictive log-ratios are found
247 | 
248 | On some datasets, CoDaCoRe may have trouble finding _any_ predictive log-ratios. If none are found, this is typically a sign that the signal in the data is weak. In this case, the analyst may choose to reduce the value of `lambda` (for example, to `lambda = 0`), in order to allow our algorithm to search more aggressively for predictive log-ratios. Doing so will often allow the algorithm to identify at least one predictive log-ratio, at the risk of overfitting the training data. Additional care must be taken in validating such log-ratios on held-out data.
249 | 
250 | # Covariate adjustment
251 | 
252 | Many applications require accounting for potential confounder variables as well as our ratio-based biomarkers. As an example, we consider a second HIV dataset from [(Noguera-Julian et al. 2016)](http://dx.doi.org/10.1016/j.ebiom.2016.01.032). The goal is to find a microbial signature for HIV status, i.e., a log-ratio that can discriminate between HIV-positive and HIV-negative individuals. However, we have an additional confounder variable, MSM (Men who have Sex with Men). In the context of CoDaCoRe, there are multiple approaches that can be used to adjust for covariates.
253 | 
254 | ## Incremental fit
255 | 
256 | Given the _stagewise-additive_ (i.e., ensemble) nature of CoDaCoRe, whereby each successive log-ratio is fitted on the residual of the previous iteration, a very natural approach is to fit the covariates _a priori_ and then fit CoDaCoRe on the residual. In other words, we would start by regressing HIV status on MSM, "partialling out" this covariate, and then fit CoDaCoRe on the residual from this model. This can be implemented easily by means of the `offset` parameter.
257 | ```{r}
258 | data("HIV")
259 | x <- HIV[, 1:(ncol(HIV) - 2)]
260 | z <- HIV[, 'MSM']
261 | y <- HIV$HIV_Status
262 | 
263 | # Replace zeros as before
264 | x <- x + 1
265 | 
266 | # Split the data
267 | trainIndex <- sample(1:nrow(x), 0.8 * nrow(x))
268 | dfTrain <- HIV[trainIndex,]
269 | xTrain <- x[trainIndex,]
270 | yTrain <- y[trainIndex]
271 | 
272 | partial <- glm(HIV_Status ~ MSM, data=dfTrain, family='binomial')
273 | # Note the offset must be given in logit space
274 | model <- codacore(xTrain, yTrain, offset=predict(partial))
275 | print(model)
276 | partialAUC <- pROC::auc(pROC::roc(yTrain, predict(partial), quiet=T))
277 | codacoreAUC <- model$ensemble[[1]]$AUC
278 | cat("AUC gain:", round(100 * (codacoreAUC - partialAUC)), "%")
279 | ```
280 | 
281 | Note that, when predicting on new data, the contributions of the covariates and the log-ratios should be added up in logit space.
282 | ```{r}
283 | dfTest <- HIV[-trainIndex,]
284 | xTest <- x[-trainIndex,]
285 | yTest <- y[-trainIndex]
286 | yHatLogit <- predict(partial, newdata = dfTest) + predict(model, xTest, asLogits=T)
287 | yHat <- yHatLogit > 0 # in case we need binary predictions e.g. to compute accuracy
288 | testAUC <- pROC::auc(pROC::roc(yTest, yHatLogit, quiet=T))
289 | cat("Test AUC:", round(100 * testAUC), "%")
290 | ```
291 | 
292 | When the outcome variable is continuous, this is simpler as there is no logit transformation and the contributions of the partial model can be added directly, e.g.,
293 | ```r
294 | # Suppose that, instead of predicting HIV status (a binary target),
295 | # we now have some continuous target, 'yCts'
296 | partial2 <- lm(yCts ~ MSM, data=dfTrain)
297 | model2 <- codacore(xTrain, yCtsTrain, offset=predict(partial2))
298 | print(model2)
299 | yCtsHat <- predict(partial2, newdata = dfTest) + predict(model2, xTest)
300 | MSE <- mean((yCtsTest - yCtsHat)^2)
301 | ```
302 | 
303 | ## Joint fit
304 | 
305 | Depending on the application and the goals of the analyst, it may be of interest to understand the _joint_ effect of the covariates and log-ratios on the response. To do so, one option is to simply regress the outcome jointly against the covariates and the learned log-ratios from the previous step. This can be implemented by running, in addition to the above, an additional `glm` fit.
306 | ```{r}
307 | # Create a new design matrix with response & covariates, as well as log-ratios obtained from codacore
308 | dfJoint = cbind(dfTrain[, c('MSM', 'HIV_Status')], getLogRatios(model))
309 | 
310 | # And fit everything jointly
311 | modelJoint <- glm(HIV_Status ~ ., data=dfJoint, family='binomial')
312 | # Can again use this model to make predictions or to interpret regression coefficients
313 | yHat <- predict(modelJoint, newdata=dfJoint)
314 | summary(modelJoint)
315 | ```
316 | 
317 | Note that, in any case, the CoDaCoRe algorithm itself only optimizes over one log-ratio at a time (in its current implementation). In some applications, it may in fact be beneficial to optimize over the set of log-ratios jointly with the regression coefficients of the covariates. However, this is not yet implemented.
318 | 
319 | # Unsupervised learning
320 | 
321 | CoDaCoRe can be used as follows to obtain a fast, scalable, interpretable and sparse log-ratio based unsupervised learning algorithm. The idea is to first compute a dense representation of the data using traditional methods, and then regress the data against this representation using CoDaCoRe to obtain a sparse log-ratio representation in its stead. For example, one could take the first principal component of the CLR-transformed data, and use CoDaCoRe to approximate this real-valued representation with a single sparse log-ratio score [(Quinn et al., 2021)](https://arxiv.org/abs/2104.07266). In the present HIV dataset, we find that the learned log-ratio biomarker provides a useful representation of the data, markedly separating the MSM from the non-MSM individuals.
322 | ```{r}
323 | clr <- t(apply(x, 1, function(x) log(x) - mean(log(x))))
324 | pca <- prcomp(clr, scale=T)
325 | pc1 = clr %*% pca$rotation[, 1]
326 | 
327 | model <- codacore(x, as.numeric(pc1))
328 | logRatio1 <- getLogRatios(model, x)[, 1]
329 | boxplot(logRatio1 ~ z)
330 | ```
331 | 
332 | We can take things one step further and derive a second unsupervised log-ratio biomarker, by simply fitting CoDaCoRe on the second principal component. Taken together, our two log-ratio biomarkers capture important information in the data:
333 | ```{r}
334 | pc2 = clr %*% pca$rotation[, 2]
335 | model <- codacore(x, as.numeric(pc2))
336 | logRatio2 <- getLogRatios(model, x)[, 1]
337 | plot(logRatio1, logRatio2, col=z)
338 | legend('bottomleft', legend=levels(z), pch=1, col=1:2)
339 | ```
340 | 
341 | 
342 | Note also that the CoDaCoRe framework can be applied to the unsupervised learning problem in several other ways, some of which are under active development.
343 | 
344 | 
345 | # Multi-omics integration
346 | 
347 | With a similar approach, CoDaCoRe can be used for scalable, sparse, and interpretable multi-omics data integration. We briefly highlight an example multi-omics analysis of paired gut microbiome and metabolomics data, taken from 220 clinical samples of which 88 have Crohn's disease and 76 have ulcerative colitis [(Franzosa et al., 2019)](https://www.nature.com/articles/s41564-018-0306-4). For a full analysis, see Section 5 and the appendix in [Quinn et al., 2021](https://arxiv.org/abs/2104.07266). Again, we will use standard techniques to compute a (dense) latent representation of the data, which we will then approximate using sparse log-ratio biomarkers. Letting $\mathbf T$ denote the microbe abundances and $\mathbf U$ the metabolite abundances, we will use partial least squares (PLS) regression to model the association between $\mathbf T$ and $\mathbf U$. This will result in two latent factors, one for $\mathbf T$ and one for $\mathbf U$, that capture the _joint_ information in the data. These latent factors will constitute the regression targets for CoDaCoRe.
348 | ```{r}
349 | # Load data
350 | download.file("https://github.com/egr95/FranzosaData/blob/main/FranzosaMicrobiome.rda?raw=true", "FranzosaMicrobiome")
351 | download.file("https://github.com/egr95/FranzosaData/blob/main/FranzosaMetabolite.rda?raw=true", "FranzosaMetabolite")
352 | load("FranzosaMicrobiome")
353 | load("FranzosaMetabolite")
354 | 
355 | # Note data have already been pre-processed as per (Quinn et al., 2021),
356 | # including zero-replacement and normalization to a unit total.
357 | T <- FranzosaMicrobiome[, -ncol(FranzosaMicrobiome)] # We remove the last column (response variable)
358 | U <- FranzosaMetabolite[, -ncol(FranzosaMetabolite)]
359 | 
360 | # Apply clr transform prior to PLS
361 | clrT <- t(apply(T, 1, function(x) log(x) - mean(log(x))))
362 | clrU <- t(apply(U, 1, function(x) log(x) - mean(log(x))))
363 | 
364 | # Call mixOmics package and plot first PLS components
365 | suppressMessages(library('mixOmics'))
366 | pls <- mixOmics::pls(X = clrT, Y = clrU, ncomp = 1)
367 | plot(pls$variates$X[,1], pls$variates$Y[,1], main = 'PLS multi-omics (dense)')
368 | 
369 | # Approximate the dense PLS representations with sparse log-ratio biomarkers
370 | plsX <- pls$variates$X[,1]
371 | modelX <- codacore(T, plsX)
372 | logRatioX <- getLogRatios(modelX)[,1]
373 | 
374 | plsY <- pls$variates$Y[,1]
375 | modelY <- codacore(U, plsY, logRatioType = "B")
376 | logRatioY <- getLogRatios(modelY)[,1]
377 | 
378 | plot(logRatioX, logRatioY, main = 'CoDaCoRe multi-omics (sparse)')
379 | ```
380 | 
381 | ```{r, include=FALSE}
382 | file.remove("FranzosaMicrobiome")
383 | file.remove("FranzosaMetabolite")
384 | ```
385 | 
386 | Note that CoDaCoRe obtains a sparse representation that also has better statistical properties than the original (dense) PLS components, markedly de-skewing the data.
387 | 
388 | 


--------------------------------------------------------------------------------
/man/Crohn.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{Crohn}
 5 | \alias{Crohn}
 6 | \title{Microbiome composition related to Crohn`s disease study}
 7 | \format{
 8 | The \code{data.frame} is composed by 48 genera and a variable
 9 | \describe{
10 |   \item{genera}{The first 48 columns, from \emph{g_Turicibacter} until
11 |   \emph{g_Bilophila} referred to different genera.}
12 |   \item{y}{a \code{factor} indicating if the sample corresponds to a case (
13 |   \emph{CD}) or a control (\emph{no}).}
14 | }
15 | }
16 | \description{
17 | A dataset containing the number of counts of 48 different genera in a group
18 | of 975 samples (including 662 samples of patients with Crohn`s disease and
19 | 313 controls).
20 | The \code{data.frame} is composed by 48 genera and a factor variable
21 | }
22 | \references{
23 | \url{https://qiita.ucsd.edu/}
24 | }
25 | \keyword{data}
26 | 


--------------------------------------------------------------------------------
/man/HIV.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{HIV}
 5 | \alias{HIV}
 6 | \title{Microbiome, HIV infection and MSM factor}
 7 | \format{
 8 | The \code{data.frame} is composed by 60 genera and 2 variables
 9 | \describe{
10 |   \item{genera}{The first 60 columns, from \emph{g_Prevotella} until
11 |        \emph{o_NB1-n_g_unclassified} referred to different genera.}
12 |   \item{MSM}{a factor determining if the individual is \code{MSM} (\emph{Men Sex with
13 |    Men}) or not (\code{nonMSM}).}
14 |   \item{HIV_Status}{a factor specifying if the individual is infected
15 |    (\code{Pos}) or not (\code{Neg}).}
16 | 
17 | }
18 | }
19 | \description{
20 | A dataset containing the number of counts of 60 different genera in a group
21 | of 155 samples (including HIV - infected and non - infected patients).
22 | The \code{data.frame} is composed by 60 genera and two variables.
23 | }
24 | \references{
25 | \url{https://pubmed.ncbi.nlm.nih.gov/27077120/}
26 | }
27 | \keyword{data}
28 | 


--------------------------------------------------------------------------------
/man/activeInputs.codacore.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/codacore.R
 3 | \name{activeInputs.codacore}
 4 | \alias{activeInputs.codacore}
 5 | \title{activeInputs}
 6 | \usage{
 7 | activeInputs.codacore(cdcr)
 8 | }
 9 | \arguments{
10 | \item{cdcr}{A codacore object.}
11 | }
12 | \value{
13 | The covariates included in the log-ratios
14 | }
15 | \description{
16 | activeInputs
17 | }
18 | 


--------------------------------------------------------------------------------
/man/codacore.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/codacore.R
 3 | \name{codacore}
 4 | \alias{codacore}
 5 | \title{codacore}
 6 | \usage{
 7 | codacore(
 8 |   x,
 9 |   y,
10 |   logRatioType = "balances",
11 |   objective = NULL,
12 |   lambda = 1,
13 |   offset = NULL,
14 |   shrinkage = 1,
15 |   maxBaseLearners = 5,
16 |   optParams = list(),
17 |   cvParams = list(),
18 |   verbose = FALSE,
19 |   overlap = TRUE,
20 |   fast = TRUE
21 | )
22 | }
23 | \arguments{
24 | \item{x}{A data.frame or matrix of the compositional predictor variables.}
25 | 
26 | \item{y}{A data.frame, matrix or vector of the response.}
27 | 
28 | \item{logRatioType}{A string indicating whether to use "balances" or "amalgamations".
29 | Also accepts "balance", "B", "ILR", or "amalgam", "A", "SLR".
30 | Note that the current implementation for balances is not strictly an ILR,
31 | but rather just a collection of balances (which are possibly non-orthogonal
32 | in the Aitchison sense).}
33 | 
34 | \item{objective}{A string indicating "binary classification" or "regression". By default,
35 | it is NULL and gets inferred from the values in y.}
36 | 
37 | \item{lambda}{A numeric. Corresponds to the "lambda-SE" rule. Sets the "regularization strength"
38 | used by the algorithm to decide how to harden the ratio. 
39 | Larger numbers tend to yield fewer, more sparse ratios.}
40 | 
41 | \item{offset}{A numeric vector of the same length as y. Works similarly to the offset in a glm.}
42 | 
43 | \item{shrinkage}{A numeric. Shrinkage factor applied to each base learner.
44 | Defaults to 1.0, i.e., no shrinkage applied.}
45 | 
46 | \item{maxBaseLearners}{An integer. The maximum number of log-ratios that the model will
47 | learn before stopping. Automatic stopping based on \code{lambda} may occur sooner.}
48 | 
49 | \item{optParams}{A list of named parameters for the optimization of the
50 | continuous relaxation. Empty by default. User can override as few or as
51 | many of our defaults as desired. Includes adaptiveLR (learning rate under
52 | adaptive training scheme), momentum (in the gradient-descent sense), 
53 | epochs (number of gradient-descent epochs), batchSize (number of 
54 | observations per minibatch, by default the entire dataset),
55 | and vanillaLR (the learning rate to be used if the user does *not* want
56 | to use the 'adaptiveLR', to be used at the risk of optimization issues).}
57 | 
58 | \item{cvParams}{A list of named parameters for the "hardening" procedure
59 | using cross-validation. Includes numFolds (number of folds, default=5) and
60 | maxCutoffs (number of candidate cutoff values of 'c' to be tested out
61 | during CV process, default=20 meaning log-ratios with up to 21 components
62 | can be found by codacore).}
63 | 
64 | \item{verbose}{A boolean. Toggles whether to display intermediate steps.}
65 | 
66 | \item{overlap}{A boolean. Toggles whether successive log-ratios found by 
67 | CoDaCoRe may contain repeated input variables. TRUE by default.
68 | Changing to FALSE implies that the log-ratios obtained by CoDaCoRe
69 | will become orthogonal in the Aitchison sense, analogously to the
70 | isometric-log-ratio transformation, while losing a small amount of
71 | model flexibility.}
72 | 
73 | \item{fast}{A boolean. Whether to run in fast or slow mode. TRUE by
74 | default. Running in slow mode will take ~x5 the computation time,
75 | but may help identify slightly more accurate log-ratios.}
76 | }
77 | \value{
78 | A \code{codacore} object.
79 | }
80 | \description{
81 | This function implements the codacore algorithm described by Gordon-Rodriguez et al. 2021 
82 | (https://doi.org/10.1101/2021.02.11.430695).
83 | }
84 | \examples{
85 | \dontrun{
86 | data("Crohn")
87 | x <- Crohn[, -ncol(Crohn)]
88 | y <- Crohn[, ncol(Crohn)]
89 | x <- x + 1
90 | model = codacore(x, y)
91 | print(model)
92 | plot(model)
93 | }
94 | 
95 | }
96 | 


--------------------------------------------------------------------------------
/man/getBinaryPartitions.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/codacore.R
 3 | \name{getBinaryPartitions}
 4 | \alias{getBinaryPartitions}
 5 | \title{getBinaryPartitions}
 6 | \usage{
 7 | getBinaryPartitions(cdcr)
 8 | }
 9 | \arguments{
10 | \item{cdcr}{A codacore object}
11 | }
12 | \value{
13 | A matrix describing whether each component (as rows) is found in the
14 |  numerator (1) or denominator (-1) of each learned log-ratio (as columns).
15 |  This format resembles a serial binary partition matrix frequently used
16 |  in balance analysis.
17 | }
18 | \description{
19 | getBinaryPartitions
20 | }
21 | 


--------------------------------------------------------------------------------
/man/getDenominatorParts.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/codacore.R
 3 | \name{getDenominatorParts}
 4 | \alias{getDenominatorParts}
 5 | \title{getDenominatorParts}
 6 | \usage{
 7 | getDenominatorParts(cdcr, baseLearnerIndex = 1, boolean = TRUE)
 8 | }
 9 | \arguments{
10 | \item{cdcr}{A codacore object.}
11 | 
12 | \item{baseLearnerIndex}{An integer indicating which of the 
13 | (possibly multiple) log-ratios learned by codacore to be used.}
14 | 
15 | \item{boolean}{Whether to return the parts in boolean form
16 | (a vector of TRUE/FALSE) or to return the column names of
17 | those parts directly.}
18 | }
19 | \value{
20 | The covariates in the denominator of the selected log-ratio.
21 | }
22 | \description{
23 | getDenominatorParts
24 | }
25 | 


--------------------------------------------------------------------------------
/man/getLogRatios.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/codacore.R
 3 | \name{getLogRatios}
 4 | \alias{getLogRatios}
 5 | \title{getLogRatios}
 6 | \usage{
 7 | getLogRatios(cdcr, x = NULL)
 8 | }
 9 | \arguments{
10 | \item{cdcr}{A codacore object}
11 | 
12 | \item{x}{A set of (possibly unseen) compositional data. 
13 | The covariates must be passed in the same order as 
14 | for the original codacore() call.}
15 | }
16 | \value{
17 | The learned log-ratio features, computed on input x.
18 | }
19 | \description{
20 | getLogRatios
21 | }
22 | 


--------------------------------------------------------------------------------
/man/getNumLogRatios.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/codacore.R
 3 | \name{getNumLogRatios}
 4 | \alias{getNumLogRatios}
 5 | \title{getNumLogRatios}
 6 | \usage{
 7 | getNumLogRatios(cdcr)
 8 | }
 9 | \arguments{
10 | \item{cdcr}{A codacore object}
11 | }
12 | \value{
13 | The number of log-ratios that codacore found.
14 |     Typically a small integer. Can be zero if codacore
15 |     found no predictive log-ratios in the data.
16 | }
17 | \description{
18 | getNumLogRatios
19 | }
20 | 


--------------------------------------------------------------------------------
/man/getNumeratorParts.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/codacore.R
 3 | \name{getNumeratorParts}
 4 | \alias{getNumeratorParts}
 5 | \title{getNumeratorParts}
 6 | \usage{
 7 | getNumeratorParts(cdcr, baseLearnerIndex = 1, boolean = TRUE)
 8 | }
 9 | \arguments{
10 | \item{cdcr}{A codacore object.}
11 | 
12 | \item{baseLearnerIndex}{An integer indicating which of the 
13 | (possibly multiple) log-ratios learned by codacore to be used.}
14 | 
15 | \item{boolean}{Whether to return the parts in boolean form
16 | (a vector of TRUE/FALSE) or to return the column names of
17 | those parts directly.}
18 | }
19 | \value{
20 | The covariates in the numerator of the selected log-ratio.
21 | }
22 | \description{
23 | getNumeratorParts
24 | }
25 | 


--------------------------------------------------------------------------------
/man/getSlopes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/codacore.R
 3 | \name{getSlopes}
 4 | \alias{getSlopes}
 5 | \title{getSlopes}
 6 | \usage{
 7 | getSlopes(cdcr)
 8 | }
 9 | \arguments{
10 | \item{cdcr}{A codacore object}
11 | }
12 | \value{
13 | The slopes (i.e., regression coefficients) for each log-ratio.
14 | }
15 | \description{
16 | getSlopes
17 | }
18 | 


--------------------------------------------------------------------------------
/man/getTidyTable.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/codacore.R
 3 | \name{getTidyTable}
 4 | \alias{getTidyTable}
 5 | \title{getTidyTable}
 6 | \usage{
 7 | getTidyTable(cdcr)
 8 | }
 9 | \arguments{
10 | \item{cdcr}{A codacore object}
11 | }
12 | \value{
13 | A table displaying the log-ratios found.
14 | }
15 | \description{
16 | getTidyTable
17 | }
18 | 


--------------------------------------------------------------------------------
/man/plot.codacore.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/codacore.R
 3 | \name{plot.codacore}
 4 | \alias{plot.codacore}
 5 | \title{plot}
 6 | \usage{
 7 | \method{plot}{codacore}(x, index = 1, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{A codacore object.}
11 | 
12 | \item{index}{The index of the log-ratio to plot.}
13 | 
14 | \item{...}{Not used.}
15 | }
16 | \description{
17 | Plots a summary of a fitted codacore model.
18 | Credit to the authors of the selbal package (Rivera-Pinto et al., 2018),
19 | from whose package these plots were inspired.
20 | }
21 | 


--------------------------------------------------------------------------------
/man/plotROC.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/codacore.R
 3 | \name{plotROC}
 4 | \alias{plotROC}
 5 | \title{plotROC}
 6 | \usage{
 7 | plotROC(cdcr)
 8 | }
 9 | \arguments{
10 | \item{cdcr}{A codacore object.}
11 | }
12 | \description{
13 | plotROC
14 | }
15 | 


--------------------------------------------------------------------------------
/man/predict.codacore.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/codacore.R
 3 | \name{predict.codacore}
 4 | \alias{predict.codacore}
 5 | \title{predict}
 6 | \usage{
 7 | \method{predict}{codacore}(object, newx, asLogits = TRUE, numLogRatios = NA, ...)
 8 | }
 9 | \arguments{
10 | \item{object}{A codacore object.}
11 | 
12 | \item{newx}{A set of inputs to our model.}
13 | 
14 | \item{asLogits}{Whether to return outputs in logit space
15 | (as opposed to probability space). Should always be set
16 | to TRUE for regression with continuous outputs, but can
17 | be toggled for classification problems.}
18 | 
19 | \item{numLogRatios}{How many predictive log-ratios to 
20 | include in the prediction. By default, includes the
21 | effects of all log-ratios that were obtained during
22 | training. Setting this parameter to an integer k will
23 | restrict to using only the top k log-ratios in the model.}
24 | 
25 | \item{...}{Not used.}
26 | }
27 | \description{
28 | predict
29 | }
30 | 


--------------------------------------------------------------------------------
/man/print.codacore.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/codacore.R
 3 | \name{print.codacore}
 4 | \alias{print.codacore}
 5 | \title{print}
 6 | \usage{
 7 | \method{print}{codacore}(x, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{A codacore object.}
11 | 
12 | \item{...}{Not used.}
13 | }
14 | \description{
15 | print
16 | }
17 | 


--------------------------------------------------------------------------------
/man/sCD14.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{sCD14}
 5 | \alias{sCD14}
 6 | \title{Microbiome and sCD14 inflammation parameter}
 7 | \format{
 8 | The \code{data.frame} is composed of 60 genera and a variable
 9 | \describe{
10 |   \item{genera}{The first 60 columns, from \emph{g_Prevotella} until
11 |   \emph{o_NB1-n_g_unclassified} referred to different genera.}
12 |   \item{sCD14}{a \code{numeric} variable with the value of the inflammation
13 |   parameter sCD14 for each sample.}
14 | }
15 | }
16 | \description{
17 | A dataset containing the counts of 60 different genera in a group
18 | of 151 samples (including HIV-infected and non-infected patients).
19 | The \code{data.frame} is composed of 60 genera and a numeric variable.
20 | }
21 | \references{
22 | \doi{10.1016/j.ebiom.2016.01.032}
23 | }
24 | \keyword{data}
25 | 


--------------------------------------------------------------------------------
/man/simulateHTS.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/simulations.R
 3 | \name{simulateHTS}
 4 | \alias{simulateHTS}
 5 | \title{simulateHTS}
 6 | \usage{
 7 | simulateHTS(n, p, outputType = "binary", logratio = "simple")
 8 | }
 9 | \arguments{
10 | \item{n}{Number of observations.}
11 | 
12 | \item{p}{Number of covariates.}
13 | 
14 | \item{outputType}{A string indicating 'binary' or 'continuous'.}
15 | 
16 | \item{logratio}{A string indicating 'simple', 'balance', or 
17 | 'amalgamation'.}
18 | }
19 | \value{
20 | A list containing a matrix of inputs and a vector of outputs
21 | }
22 | \description{
23 | This function simulates a set of (x, y) pairs.
24 | The covariates x are compositional, meaning they only
25 | carry relative information.
26 | The response y is binary or continuous, depending on \code{outputType}.
27 | The rule linking x and y can be a simple log-ratio, a balance, or an amalgamation.
28 | }
29 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(codacore)
3 | # Run the package's testthat suite (the test files under tests/testthat/).
4 | test_check("codacore")
5 | 


--------------------------------------------------------------------------------
/tests/testthat/test-codacore.R:
--------------------------------------------------------------------------------
  1 | # Report whether a trivial TensorFlow call succeeds in this R session.
  2 | tensorflow_is_installed <- function() {
  3 |   tryCatch(
  4 |     {
  5 |       tensorflow::set_random_seed(0)  # dummy call; any error means "no"
  6 |       TRUE
  7 |     },
  8 |     error = function(cond) FALSE
  9 |   )
 10 | }
 11 | 
 12 | testthat::test_that("simple logratios", {
 13 |   # Skip explicitly, rather than run no expectations, without TensorFlow.
 14 |   testthat::skip_if_not(tensorflow_is_installed(), "TensorFlow unavailable")
 15 |   set.seed(0)  # fixed seeds: the expected metrics below are hard-coded
 16 |   tensorflow::set_random_seed(0)
 17 |   n <- 1000
 18 |   p <- 100
 19 |   HTS <- simulateHTS(n, p)
 20 |   x <- HTS$x + 1  # presumably a pseudo-count to avoid zeros; confirm
 21 |   y <- HTS$y
 22 |   model <- codacore(x, y, logRatioType = "B")
 23 |   testthat::expect_true(getNumeratorParts(model, 1)[1])
 24 |   testthat::expect_true(getDenominatorParts(model, 1)[2])
 25 |   testthat::expect_equal(model$ensemble[[1]]$accuracy, 0.851)
 26 |   # Refit the same data with logRatioType = "A"
 27 |   model <- codacore(x, y, logRatioType = "A")
 28 |   testthat::expect_true(getNumeratorParts(model, 1)[1])
 29 |   testthat::expect_true(getDenominatorParts(model, 1)[2])
 30 |   testthat::expect_equal(model$ensemble[[1]]$accuracy, 0.846)
 31 | 
 32 |   # test getBinaryPartitions() function
 33 |   testthat::expect_true(getBinaryPartitions(model)[1, 1] == 1)
 34 |   testthat::expect_true(getBinaryPartitions(model)[2, 1] == -1)
 35 |   testthat::expect_true(getBinaryPartitions(model)[3, 1] == 0)
 36 | 
 37 |   # Now test in regression mode
 38 |   HTS <- simulateHTS(n, p, outputType = "continuous")
 39 |   x <- HTS$x + 1
 40 |   y <- HTS$y
 41 |   model <- codacore(x, y, logRatioType = "B", objective = "regression")
 42 |   testthat::expect_true(getNumeratorParts(model, 1)[1])
 43 |   testthat::expect_true(getDenominatorParts(model, 1)[2])
 44 |   testthat::expect_equal(model$ensemble[[1]]$Rsquared, 0.349, tolerance = 0.001)
 45 |   # Refit the regression data with logRatioType = "A"
 46 |   model <- codacore(x, y, logRatioType = "A", objective = "regression")
 47 |   testthat::expect_true(getNumeratorParts(model, 1)[1])
 48 |   testthat::expect_true(getDenominatorParts(model, 1)[2])
 49 |   testthat::expect_equal(model$ensemble[[1]]$Rsquared, 0.349, tolerance = 0.001)
 50 | })
 51 | 
 52 | testthat::test_that("balances", {
 53 |   # Skip explicitly, rather than run no expectations, without TensorFlow.
 54 |   testthat::skip_if_not(tensorflow_is_installed(), "TensorFlow unavailable")
 55 |   set.seed(0)  # fixed seeds: the expected metrics below are hard-coded
 56 |   tensorflow::set_random_seed(0)
 57 |   n <- 1000
 58 |   p <- 100
 59 |   HTS <- simulateHTS(n, p, logratio = "balance")
 60 |   x <- HTS$x + 1  # presumably a pseudo-count to avoid zeros; confirm
 61 |   y <- HTS$y
 62 |   model <- codacore(x, y, logRatioType = "B")
 63 | 
 64 |   testthat::expect_true(getNumeratorParts(model, 1)[4])
 65 |   testthat::expect_true(getNumeratorParts(model, 1)[6])
 66 |   testthat::expect_true(getDenominatorParts(model, 1)[5])
 67 |   testthat::expect_equal(model$ensemble[[1]]$accuracy, 0.733)
 68 | 
 69 |   # Now test in regression mode
 70 |   HTS <- simulateHTS(n, p, logratio = "balance", outputType = "continuous")
 71 |   x <- HTS$x + 1
 72 |   y <- HTS$y
 73 |   model <- codacore(x, y, logRatioType = "B", objective = "regression")
 74 |   testthat::expect_equal(model$ensemble[[1]]$Rsquared, 0.257, tolerance = 0.001)
 75 | })
 76 | 
 77 | testthat::test_that("amalgamations", {
 78 |   # Skip explicitly when TensorFlow is unusable; wrapping the body in `if`
 79 |   # would instead leave a test that runs no expectations.
 80 |   testthat::skip_if_not(tensorflow_is_installed(), "TensorFlow unavailable")
 81 |   set.seed(0)  # fixed seeds: the expected metrics below are hard-coded
 82 |   tensorflow::set_random_seed(0)
 83 |   n <- 1000
 84 |   p <- 100
 85 |   HTS <- simulateHTS(n, p, logratio = "amalgamation")
 86 |   x <- HTS$x + 1  # presumably a pseudo-count to avoid zeros; confirm
 87 |   y <- HTS$y
 88 |   model <- codacore(x, y, logRatioType = "A")
 89 | 
 90 |   testthat::expect_true(getNumeratorParts(model, 1)[1])
 91 |   testthat::expect_true(getNumeratorParts(model, 1)[2])
 92 |   testthat::expect_true(getDenominatorParts(model, 1)[3])
 93 |   testthat::expect_equal(model$ensemble[[1]]$AUC[1], 0.925, tolerance = 0.001)
 94 | 
 95 |   # Now test in regression mode
 96 |   HTS <- simulateHTS(n, p, logratio = "amalgamation", outputType = "continuous")
 97 |   x <- HTS$x + 1
 98 |   y <- HTS$y
 99 |   model <- codacore(x, y, logRatioType = "A", objective = "regression")
100 |   testthat::expect_true(getNumeratorParts(model, 1)[1])
101 |   testthat::expect_true(getNumeratorParts(model, 1)[2])
102 |   testthat::expect_true(getDenominatorParts(model, 1)[3])
103 |   testthat::expect_equal(model$ensemble[[1]]$Rsquared, 0.540, tolerance = 0.001)
104 | })
105 | 
106 | 


--------------------------------------------------------------------------------
/vignettes/guide.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "CoDaCoRe guide"
 3 | output: rmarkdown::html_vignette
 4 | vignette: >
 5 |   %\VignetteIndexEntry{CoDaCoRe guide}
 6 |   %\VignetteEngine{knitr::rmarkdown}
 7 |   %\VignetteEncoding{UTF-8}
 8 | ---
 9 | 
10 | ```{r setup, include=FALSE}
11 | knitr::opts_chunk$set(echo = TRUE)  # echo chunk source code by default
12 | ```
13 | 
14 | ```{r, echo = FALSE, results = "asis"}
15 | # Inline the markdown guide, prefixing its image links with its directory.
16 | misc_dir <- "../inst/misc/"
17 | guide_md <- readLines(paste0(misc_dir, "guide.md"))
18 | guide_md <- gsub("![](", paste0("![](", misc_dir), guide_md, fixed = TRUE)
19 | cat(guide_md, sep = "\n")
20 | ```
21 | 


--------------------------------------------------------------------------------