├── DESCRIPTION
├── NAMESPACE
├── R
│   ├── PLSDA.R
│   ├── SPLSDA.R
│   ├── ggbiplot.R
│   ├── hello.R
│   ├── normalization.R
│   ├── okernelPLS.fit.R
│   ├── plsmda.R
│   ├── splitVar.R
│   ├── splsmda.R
│   └── stability.R
├── README.md
├── codexample_clrpca1.pdf
├── data
│   ├── ArcticLake.rda
│   └── Hydrochem.rda
├── man
│   ├── alr.Rd
│   ├── clr.Rd
│   ├── compPLS-package.Rd
│   ├── ggbiplot.Rd
│   ├── hello.Rd
│   ├── plsDA.Rd
│   ├── plsDA_main.Rd
│   ├── spls.stars.Rd
│   └── splsDA.Rd
└── tests
    ├── testthat.R
    └── testthat
        ├── test_norms_and_transforms.R
        └── test_plsda.R

/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: compPLS
2 | Type: Package
3 | Title: Conduct Partial Least Squares regression for compositional data
4 | Version: 1.1
5 | Date: 2021-04-18
6 | Author: Zachary Kurtz
7 | Maintainer: Zachary Kurtz
8 | Description: Partial Least Squares methods on log-ratio transformed data.
9 |     Methods for PLS, sparse PLS discriminant analysis and biplot plotting for
10 |     high-dimensional compositions, in particular microbial
11 |     communities/microbiota OTU tables, as generated by 16S amplicon sequencing.
12 | License: GPL (>= 2)
13 | Depends:
14 |     R (>= 4.0.0)
15 | Imports:
16 |     caret,
17 |     ggplot2,
18 |     MASS,
19 |     pls,
20 |     klaR,
21 |     boot,
22 |     grid,
23 |     scales,
24 |     kernlab
25 | Suggests:
26 |     testthat
27 | RoxygenNote: 7.1.2
28 | Encoding: UTF-8
29 | 
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 | 
3 | S3method(clr,data.frame)
4 | S3method(clr,default)
5 | S3method(clr,matrix)
6 | S3method(ggbiplot,default)
7 | S3method(ggbiplot,lda)
8 | S3method(ggbiplot,plsda)
9 | S3method(ggbiplot,prcomp)
10 | S3method(ggbiplot,princomp)
11 | S3method(ggbiplot,splsda)
12 | export(alr)
13 | export(alr.data.frame)
14 | export(alr.default)
15 | export(clr)
16 | export(ggbiplot)
17 | export(ggbiplot.matrix)
18 | export(norm_pseudo)
19 | export(norm_to_total)
20 | export(plsDA)
21 | export(plsDA_main)
22 | export(spls.stars)
23 | export(splsDA)
24 | export(splsDA_main)
25 | importFrom(ggplot2,aes)
26 | importFrom(ggplot2,geom_path)
27 | importFrom(ggplot2,geom_point)
28 | importFrom(ggplot2,geom_segment)
29 | importFrom(ggplot2,geom_text)
30 | importFrom(ggplot2,ggplot)
31 | importFrom(ggplot2,scale_x_continuous)
32 | importFrom(ggplot2,scale_y_continuous)
33 | 
--------------------------------------------------------------------------------
/R/PLSDA.R:
--------------------------------------------------------------------------------
1 | ################################
2 | # @author Zachary Kurtz
3 | # @date 8/14/2013
4 | ## Methods for Partial Least Squares regression for compositional data
5 | 
6 | 
7 | 
8 | 
9 | #' Partial Least Squares Discriminant Analysis
10 | #' PLS regression to discriminate classes (via a logistic model).
11 | #' This is essentially a wrapper for the \code{plsda} function in the caret package,
12 | #' but with a default setup for dealing with unbalanced classes (via the priors option, see details);
13 | #' see caret::plsda for implementation details.
14 | #'
15 | #' Use this function if you do not need to fit parameters by cross-validation.
16 | #'
17 | #' @title plsDA partial least squares discriminant analysis
18 | #' @param x data with samples in rows, features in columns (not necessarily compositional data)
19 | #' @param grouping a numeric vector or factor with sample classes (length should equal \code{nrow(x)})
20 | #' @param usePriors use priors to correct for very unbalanced sample sizes between groups (i.e., put a strong penalty on misclassifying small groups)
21 | #' @param K number of components in the PLS model (default: number of classes - 1)
22 | #' @return a plsda fitted model
23 | #' @seealso \code{\link{plsDA_main}}, \code{\link[caret]{plsda}}
24 | #' @rdname plsDA
25 | #' @export
26 | plsDA <- function(x, grouping, K, usePriors=FALSE, plsfun=caret::plsda, ...) {
27 |     # wrapper function for plsda
28 | 
29 |     if (length(grouping) != nrow(x))
30 |         stop('length of grouping vector does not equal the number of samples')
31 | 
32 |     args <- list(...)
33 |     if (!('probMethod' %in% names(args))) args <- c(args, list(probMethod='Bayes'))
34 |     if (usePriors) {
35 |         # set priors to 1-(class freq)
36 |         prior <- 1-(table(grouping)/length(grouping))
37 |         args <- c(args, list(prior=as.vector(prior)))
38 |     }
39 | 
40 |     y <- as.factor(grouping)
41 | 
42 |     if (missing(K)) {
43 |         K <- length(levels(y))-1
44 |     }
45 |     args <- c(args, ncomp=K)
46 | 
47 |     do.call(plsfun, c(list(x, y), args))
48 | }
49 | 
50 | 
51 | #' The main wrapper for full Partial Least Squares discriminant analysis,
52 | #' performing cross-validation to tune the model parameters (here, the number of components)
53 | #' and running permutation tests (via bootstrapping) to get pseudo p-value estimates for the model coefficients.
54 | #'
55 | #' @title plsDA_main partial least squares discriminant analysis
56 | #' @param x data with samples in rows, features in columns (not necessarily compositional data)
57 | #' @param grouping a numeric vector or factor with sample classes (length should equal \code{nrow(x)})
58 | #' @param usePriors use priors to correct for very unbalanced sample sizes between groups (i.e., put a strong penalty on misclassifying small groups)
59 | #' @param K numeric vector of candidate numbers of components for the PLS model
60 | #' @param fold number of partitions to randomly subsample for cross-validation
61 | #' @param nboots number of bootstraps/permutations for estimating coefficient p-values
62 | #' @param n.core number of cores for parallelization of bootstraps
63 | #' @param noise for very sparse components, some subsamples may have zero variance. Optionally, add some Gaussian noise to avoid PLS errors
64 | #' @param ... additional arguments passed to plsDA
65 | #'
66 | #' @return a \code{plsDA} object that contains: the plsda model/object, \code{pvals}, the original data, \code{x}, and \code{groupings}
67 | #' @seealso \code{\link{plsDA}}
68 | #' @rdname plsDA_main
69 | #' @export
70 | plsDA_main <- function(x, grouping, K, usePriors=FALSE, fold=5, nboots=999, n.core=4, noise=0, ...) {
71 | 
72 |     if (length(K) > 1)
73 |         K <- cv.plsDA.fit(x, grouping, K=K, noise=noise, fold=fold, n.core=n.core, usePriors=usePriors)$K.opt
74 |     .bstat <- function(data, indices, ...) plsDA(data[indices,,drop=FALSE], ...)$coefficients
75 |     .pstat <- function(data, indices, ...) plsDA(apply(data[indices,,drop=FALSE], 2, function(x) sample(x)), ...)$coefficients
76 | 
77 |     if (nboots > 1) {
78 |         sboots <- plsdaboot(x, .bstat, .pstat, R=nboots, n.core, K=K, grouping=grouping, ...)
79 |         pmat <- pval(sboots)
80 |         # keepInd <- which(suppressWarnings(apply(pmat, 1, min, na.rm=TRUE)) <= alpha)
81 |     } else {
82 |         keepInd <- 1:ncol(x)
83 |         pmat <- NULL
84 |     }
85 |     plsmod <- plsDA(x, grouping, usePriors=usePriors, K=K, ...)
86 | 
87 |     # if (length(keepInd) > 0) {
88 |     #     splskmod <- plsDA(x[,keepInd], grouping, usePriors=usePriors, K=K, ...)
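    # A minimal usage sketch (hypothetical objects: `X` is an n x p data
    # matrix, `y` a class factor of length n; settings are illustrative):
    #     mod <- plsDA_main(X, y, K=1:3, nboots=99)
    #     mod$pvals    # two-sided bootstrap pseudo p-values per coefficient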
89 | 
90 |     # } else splskmod <- NULL
91 |     structure(list(plsda=plsmod, pvals=pmat, # keep=keepInd,
92 |                    x=x, y=grouping), class="plsDA")
93 | 
94 | 
95 | }
96 | 
97 | 
98 | plsdaboot <- function(data, statisticboot, statisticperm, R, ncpus=1, ...) {
99 |     res <- boot::boot(data, statisticboot, R=R, parallel="multicore", ncpus=ncpus, noise=0, ...)
100 |     null_av <- boot::boot(data, statisticperm, sim='permutation', R=R, parallel="multicore", ncpus=ncpus, ...)
101 |     class(res) <- 'list'
102 |     structure(c(res, list(null_av=null_av)), class='plsdaboot')
103 | }
104 | 
105 | 
106 | 
107 | pval.plsdaboot <- function(x, sided='both', mar=2) {
108 |     # calculate 1- or 2-way pseudo p-values from a boot object
109 |     # Args: a boot object
110 |     if (sided != "both") stop("only two-sided currently supported")
111 |     x$t0 <- as.matrix(x$t0)
112 |     nparams <- ncol(x$t)
113 |     tmeans <- colMeans(x$null_av$t)
114 |     # check to see whether betas are unstable -- confirm
115 |     niters <- nrow(x$t)
116 |     ind95 <- max(1,round(.025*niters)):round(.975*niters)
117 |     boot_ord <- apply(x$t, 2, sort)
118 |     boot_ord95 <- boot_ord[ind95,]
119 |     outofrange <- unlist(lapply(1:length(x$t0), function(i) {
120 |         betas <- x$t0[i]
121 |         range <- range(boot_ord95[,i])
122 |         range[1] > betas || range[2] < betas
123 |     }))
124 |     # calc whether center of mass is above or below the mean
125 |     bs_above <- unlist(lapply(1:nparams, function(i)
126 |                     length(which(x$t[, i] > tmeans[i]))))
127 |     is_above <- bs_above > x$R/2
128 |     pvals <- ifelse(is_above, 2*(1-bs_above/x$R), 2*bs_above/x$R)
129 |     pvals[pvals > 1] <- 1
130 |     pvals[outofrange] <- NaN
131 |     pmat <- matrix(pvals, ncol=ncol(x$t0))
132 |     rownames(pmat) <- rownames(x$t0)
133 |     colnames(pmat) <- colnames(x$t0)
134 |     pmat
135 | }
136 | 
137 | cv.plsDA.fit <- function(x, grouping, K, noise=0, fold=5, n.core, ...) {
138 |     # Find K by cross-validation
139 |     x <- x + matrix(rnorm(prod(dim(x)), 0, noise), ncol=ncol(x))
140 |     capture.output(out <- .cv.plsDA(x, grouping, fold=fold, K=K, plot.it=FALSE, n.core=n.core, ...))
141 |     out
142 | }
143 | 
144 | #' @keywords internal
145 | .cv.plsDA <- function (x, y, fold = 10, K, kappa = 0.5, classifier = c("lda",
146 |     "logistic"), scale.x = TRUE, n.core = 4, plot.it=FALSE, ...)
147 | {
148 |     result.mat <- c()
149 |     foldi <- .cv.split(y, fold)
150 |     x <- as.matrix(x)
151 |     n <- nrow(x)
152 |     p <- ncol(x)
153 |     ip <- c(1:p)
154 |     # y <- as.matrix(y)
155 |     q <- length(y) #ncol(y)
156 |     # eta.K.pair <- cbind(rep(1, each = length(K)), rep(K, 1))
157 |     # eta.K.list <- split(eta.K.pair, c(1:nrow(eta.K.pair)))
158 |     .fit.plsda <- function(K.val) {
159 |         mspemati <- rep(0, fold)
160 |         Ai <- rep(0, fold)
161 |         for (k in 1:fold) {
162 |             omit <- foldi[[k]]
163 |             train.x <- x[-omit, ]
164 |             train.y <- y[-omit]
165 |             test.x <- x[omit, ]
166 |             test.y <- y[omit]
167 |             plsda.fit <- plsDA(train.x, train.y, K = K.val, ...)
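            # held-out predictions for fold k: mspemati[k] collects the fold's
            # misclassification rate and Ai[k] the model size (the length of
            # the active set A, carried over from the sparse version)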
168 |             pred <- predict(plsda.fit, test.x)
169 |             mspemati[k] <- mean(as.numeric(pred != test.y))
170 |             Ai[k] <- mean(length(plsda.fit$A))
171 |         }
172 |         mspe.ij <- c(mean(mspemati), mean(Ai), 1, K.val)
173 |         return(mspe.ij)
174 |     }
175 |     if (.Platform$OS.type == "unix") {
176 |         result.list <- parallel::mclapply(K, function(k) .fit.plsda(k), mc.cores = n.core)
177 |     }
178 |     else {
179 |         result.list <- lapply(K, function(k) .fit.plsda(k))
180 |     }
181 |     result.mat <- c()
182 |     for (i in 1:length(result.list)) {
183 |         result.mat <- rbind(result.mat, result.list[[i]])
184 |     }
185 | 
186 |     mspemat <- matrix(result.mat[, 1], length(K), 1)
187 |     mspemat <- t(mspemat)
188 |     rownames(mspemat) <- '1'
189 |     colnames(mspemat) <- K
190 |     cands <- result.mat[result.mat[, 1] == min(result.mat[, 1]),
191 |         , drop = FALSE]
192 | 
193 |     cands <- cands[cands[, 2] == min(cands[, 2]), , drop = FALSE]
194 |     cands <- cands[cands[, 4] == min(cands[, 4]), , drop = FALSE]
195 |     cands <- cands[cands[, 3] == max(cands[, 3]), , drop = FALSE]
196 |     K.opt <- cands[, 4]
197 |     eta.opt <- cands[, 3]
198 | 
199 |     # cat(paste("K = ", K.opt, "\n", sep = ""))
200 |     # if (plot.it) {
201 | 
202 |     #     spls::heatmap.spls(t(mspemat), xlab = "K", ylab = "eta", main = "CV MSPE Plot",
203 |     #         coln = 16, as = "n")
204 |     # }
205 |     # rownames(mspemat) <- paste("eta=", eta)
206 |     colnames(mspemat) <- paste("K =", K)
207 |     cv <- list(mspemat = mspemat, K.opt = K.opt)
208 |     invisible(cv)
209 | }
210 | 
211 | #' @keywords internal
212 | .cv.split <- function (y, fold) {
213 |     n <- length(y)
214 |     group <- table(y)
215 |     x <- c()
216 |     for (i in 1:length(group)) {
217 |         x.group <- c(1:n)[y == names(group)[i]]
218 |         x <- c(x, sample(x.group))
219 |     }
220 |     foldi <- split(x, rep(1:fold, length = n))
221 |     return(foldi)
222 | }
223 | 
--------------------------------------------------------------------------------
/R/SPLSDA.R:
--------------------------------------------------------------------------------
1 | ########################################
2 | # SPLS discriminant analysis pipeline
3 | #
4 | #
5 | 
6 | 
7 | 
8 | 
9 | #' sparse Partial Least Squares Discriminant Analysis
10 | #' sPLS regression to discriminate classes (via a logistic model).
11 | #' This is essentially a wrapper for the \code{splsda} function in the caret package,
12 | #' but with a default setup for dealing with unbalanced classes (via the priors option, see details);
13 | #' see caret::splsda for implementation details.
14 | #'
15 | #' Use this function if you do not need to fit parameters by cross-validation.
16 | #'
17 | #' @title splsDA sparse partial least squares discriminant analysis
18 | #' @param x data with samples in rows, features in columns (not necessarily compositional data)
19 | #' @param grouping a numeric vector or factor with sample classes (length should equal \code{nrow(x)})
20 | #' @param usePriors use priors to correct for very unbalanced sample sizes between groups (i.e., put a strong penalty on misclassifying small groups)
21 | #' @param K number of components in the PLS model (default: number of classes - 1)
22 | #' @param eta parameter that adjusts the sparsity of the PLS model (between 0 and 1)
23 | #' @return a splsda fitted model
24 | #' @seealso \code{\link{plsDA_main}}, \code{\link[caret]{plsda}}, \code{\link[caret]{splsda}}
25 | #' @rdname splsDA
26 | #' @export
27 | splsDA <- function(x, grouping, eta, K, usePriors=FALSE, ...) {
28 |     ## splsda wrapper
29 |     # caret::splsda(...)
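    # A hedged sketch of a direct call (the eta/K values are illustrative,
    # not defaults), for a data matrix `X` and class factor `y`:
    #     fit <- splsDA(X, y, eta=0.8, K=2)
    #     fit$betahat    # coefficient matrix of the sparse PLS fit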
30 | 
31 |     if (length(grouping) != nrow(x))
32 |         stop('length of grouping vector does not equal the number of samples')
33 | 
34 |     args <- list(...)
35 |     if (!('probMethod' %in% names(args))) args <- c(args, list(probMethod='Bayes'))
36 |     if (usePriors) {
37 |         # set priors to 1-(class freq)
38 |         prior <- 1-(table(grouping)/length(grouping))
39 |         args <- c(args, list(prior=as.vector(prior)))
40 |     }
41 | 
42 |     y <- as.factor(grouping)
43 | 
44 |     if (missing(K)) {
45 |         K <- length(levels(y))-1
46 |     }
47 |     args <- c(args, K=K, eta=eta)
48 |     do.call(caret::splsda, c(list(x, y), args))
49 | }
50 | 
51 | 
52 | 
53 | #' The main wrapper for full sparse Partial Least Squares discriminant analysis,
54 | #' performing cross-validation to tune the model parameters (the number of components K and the sparsity eta)
55 | #' and running permutation tests (via bootstrapping) to get pseudo p-value estimates for the model coefficients.
56 | #'
57 | #' @title splsDA_main sparse partial least squares discriminant analysis
58 | #' @param x data with samples in rows, features in columns (not necessarily compositional data)
59 | #' @param grouping a numeric vector or factor with sample classes (length should equal \code{nrow(x)})
60 | #' @param usePriors use priors to correct for very unbalanced sample sizes between groups (i.e., put a strong penalty on misclassifying small groups)
61 | #' @param K numeric vector of candidate numbers of components for the PLS model
62 | #' @param fold number of partitions to randomly subsample for cross-validation
63 | #' @param nboots number of bootstraps/permutations for estimating coefficient p-values
64 | #' @param n.core number of cores for parallelization of bootstraps
65 | #' @param noise for very sparse components, some subsamples may have zero variance. Optionally, add some Gaussian noise to avoid PLS errors
66 | #' @param ... additional arguments passed to splsDA
67 | #'
68 | #' @return a \code{splsDA} object that contains: the splsda model/object, \code{pvals}, the tuned \code{eta}, the original data, \code{x}, and \code{groupings}
69 | #' @seealso \code{\link{plsDA}}
70 | #' @rdname plsDA_main
71 | #' @export
72 | splsDA_main <- function(x, grouping, eta, K, usePriors=FALSE, fold=5, nboots=999, n.core=4, noise=0, ...) {
73 |     # find optimal eta & K by cross-validation
74 | 
75 |     opt <- cv.splsDA.fit(x, grouping, eta=eta, K=K, noise=noise, fold=fold, usePriors=usePriors, n.core=n.core)
76 |     eta <- opt$eta.opt
77 |     K <- opt$K.opt
78 |     .bstat <- function(x, indices, ...) splsDA(x[indices,,drop=FALSE], ...)$betahat
79 |     .pstat <- function(x, indices, ...) splsDA(apply(x[indices,,drop=FALSE], 2, function(x) sample(x)), ...)$betahat
80 | 
81 |     if (nboots > 1) {
82 |         sboots <- splsdaboot(x, .bstat, .pstat, R=nboots, n.core, eta=eta, K=K, grouping=grouping, ...)
83 |         pmat <- pval(sboots)
84 |         # keepInd <- which(suppressWarnings(apply(pmat, 1, min, na.rm=TRUE)) <= alpha)
85 |     } else {
86 |         keepInd <- 1:ncol(x)
87 |         pmat <- NULL
88 |     }
89 |     splsmod <- splsDA(x, grouping, usePriors=usePriors, eta=eta, K=K, ...)
90 | 
91 |     # if (length(keepInd) > 0) {
92 |     #     splskmod <- splsDA(x[,keepInd], grouping, usePriors=usePriors, eta=.99, K=K, ...)
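    # eta and K now hold the CV-optimal values chosen above; a typical tuning
    # call is sketched here (the grids are illustrative):
    #     mod <- splsDA_main(X, y, eta=seq(0.1, 0.9, by=0.2), K=1:3, nboots=99)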
93 | 94 | # } else splskmod <- NULL 95 | structure(list(splsda=splsmod, pvals=pmat, 96 | eta=eta, x=x, y=grouping), class="splsDA") 97 | } 98 | 99 | 100 | 101 | predict.splsDA <- function(train.mod, test) { 102 | fullpred <- caret::predict.splsda(train.mod$splsda, test) 103 | 104 | if (!is.null(train.mod$splsdakeep)) { 105 | keeppred <- caret::predict.splsda(train.mod$splsdakeep, test[,train.mod$keep]) 106 | } else { 107 | keeppred <- NULL 108 | } 109 | list(Full=fullpred, Keep=keeppred) 110 | } 111 | 112 | splsdaboot <- function(x, statisticboot, statisticperm, R, ncpus=1, ...) { 113 | res <- boot::boot(x, statisticboot, R=R, parallel="multicore", ncpus=ncpus, ...) 114 | null_av <- boot::boot(x, statisticperm, sim='permutation', R=R, parallel="multicore", ncpus=ncpus, ...) 115 | class(res) <- 'list' 116 | structure(c(res, list(null_av=null_av)), class='splsdaboot') 117 | } 118 | 119 | pval <- function(x, ...) { 120 | UseMethod("pval") 121 | } 122 | 123 | pval.splsdaboot <- function(x, sided='both', mar=2) { 124 | # calculate 1 or 2 way pseudo p-val from boot object 125 | # Args: a boot object 126 | if (sided != "both") stop("only two-sided currently supported") 127 | nparams <- ncol(x$t) 128 | tmeans <- colMeans(x$null_av$t) 129 | # check to see whether betas are unstable -- confirm 130 | niters <- nrow(x$t) 131 | ind95 <- max(1,round(.025*niters)):round(.975*niters) 132 | boot_ord <- apply(x$t, 2, sort) 133 | boot_ord95 <- boot_ord[ind95,] 134 | outofrange <- unlist(lapply(1:length(x$t0), function(i) { 135 | betas <- x$t0[i] 136 | range <- range(boot_ord95[,i]) 137 | range[1] > betas || range[2] < betas 138 | })) 139 | # calc whether center of mass is above or below the mean 140 | bs_above <- unlist(lapply(1:nparams, function(i) 141 | length(which(x$t[, i] > tmeans[i])))) 142 | is_above <- bs_above > x$R/2 143 | pvals <- ifelse(is_above, 2*(1-bs_above/x$R), 2*bs_above/x$R) 144 | pvals[pvals > 1] <- 1 145 | pvals[outofrange] <- NaN 146 | pmat <- matrix(pvals, ncol=ncol(x$t0)) 147 | rownames(pmat) <- rownames(x$t0) 148 | colnames(pmat) <- colnames(x$t0) 149 | pmat 150 | } 151 | 152 | 153 | cv.splsDA.fit <- function(x, grouping, eta, K, noise=0, fold=5, usePriors=FALSE, n.core) { 154 | # Find eta & k by cross validation 155 | ## wrapper for spls::cv.spls, need to add some random noise so variance isn't zero 156 | x <- x + matrix(rnorm(prod(dim(x)), 0, noise), ncol=ncol(x)) 157 | capture.output(out <- spls::cv.splsda(x, grouping, eta=eta, K=K, fold=fold, plot.it=FALSE, n.core=n.core)) 158 | out 159 | } 160 | 161 | 162 | 163 | 164 | 165 | #cross_validation.splsDA <- function(x, grouping, k=5, rep=1, allgroups=FALSE, ...) { 166 | # # Perform n-fold cross validation of LDA decision rule by leaving out samples and returning prediction 167 | # #Args: 168 | # # method -> a method string that takes x and grouping (or X & y) and has a predict method for resulting object 169 | # # x -> N by p x matrix 170 | # # k -> divide x into n even fractions for cross validation 171 | # # rep -> repeat randomized x-validation 172 | # # allgroups -> all groups should be represented at least twice in the subsample (could mean doubling up) 173 | # # ... 
-> additional arguments to lda_r 174 | # # Returns: 175 | # # matrix of n X rep with prediction error 176 | 177 | # nrec <- function(N, k) { 178 | # r <- N %% k 179 | # b <- floor(N/k) 180 | # c(rep(b+1, r), rep(b, k-r)) 181 | # } 182 | 183 | # ind <- 1:nrow(x) 184 | # err <- vector('list', length=rep) 185 | # errKeep <- vector('list', length=rep) 186 | # subsizes <- nrec(length(ind), k) 187 | # groupfact <- as.factor(grouping) 188 | # for (l in 1:rep) { 189 | # rind <- sample(ind) 190 | # mmFull <- vector('list', length=length(subsizes)) 191 | # mmKeep <- vector('list', length=length(subsizes)) 192 | # 193 | # if (allgroups) { 194 | # memlist <- split(1:length(grouping), grouping[rind]) 195 | # rindmat <- suppressWarnings(do.call('rbind', memlist)) 196 | # } 197 | # 198 | # for (i in 1:length(subsizes)) { 199 | # rind.i <- ((subsizes[i]*i)-(subsizes[i]-1)):(subsizes[i]*i) 200 | # 201 | # if (allgroups) { 202 | # rind.i <- rindmat[rind.i] 203 | # } 204 | # 205 | # test.ind <- rind[rind.i] 206 | # test <- x[test.ind,] 207 | # train <- x[-test.ind,] 208 | 209 | # args <- list(...) 210 | 211 | # vars <- apply(train, 2, var) 212 | # zind <- which(vars <= 1e-3) 213 | # vind <- setdiff(1:ncol(train), zind) 214 | 215 | # train.mod <- do.call(splsDA_main, c(list(train[,vind], grouping[-test.ind]), args)) 216 | 217 | # pred <- predict.splsDA(train.mod, test[,vind]) 218 | # if ('class' %in% names(pred)) pred <- pred$class # eg if method is lda_r 219 | # mmFull[[i]] <- table(factor(pred$Full, levels=levels(groupfact)), 220 | # factor(grouping[test.ind], levels=levels(groupfact))) 221 | 222 | # if (!is.null(pred$Keep)) { 223 | # mmKeep[[i]] <- table(factor(pred$Keep, levels=levels(groupfact)), 224 | # factor(grouping[test.ind], levels=levels(groupfact))) 225 | # } 226 | # } 227 | # mfind <- which(unlist(lapply(lapply(mmFull, dim), 228 | # function(x) all(x == rep(length(unique(grouping)), 2) )))) 229 | # mmFull <- mmFull[mfind] 230 | # err[[l]] <- Reduce('+', mmFull) / length(mmFull) 231 | # err[[l]] <- err[[l]] / rowSums(err[[l]]) 232 | # if (!all(unlist(lapply(mmKeep, is.null)))) { 233 | # mfind <- which(unlist(lapply(lapply(mmKeep, dim), 234 | # function(x) all(x == rep(length(unique(grouping)), 2) )))) 235 | # mmKeep <- mmKeep[mfind] 236 | # errKeep[[l]] <- Reduce('+', mmKeep) / length(mmKeep) 237 | # errKeep[[l]] <- errKeep[[l]] / rowSums(errKeep[[l]]) 238 | # } else errKeep <- NULL 239 | # } 240 | # structure(list(errFull = err, errKeep = errKeep), class='cv') 241 | #} 242 | 243 | 244 | #spls.dist <- function(x, ...) { 245 | ## means <- irispls$meanx 246 | ## X <- scale(xobj$x, center = means, scale = FALSE) 247 | # scores <- as.matrix(x$x[,x$A]) %*% as.matrix(x$projection) 248 | # dist(scores) 249 | #} 250 | 251 | 252 | #spls.sil <- function(dist, grouping, ...) 
{ 253 | # x <- list() 254 | # x$clustering <- as.integer(grouping) 255 | # cluster:::silhouette.default(x, dist=dist) 256 | #} 257 | 258 | 259 | spls2pls <- function(x) { 260 | pls(x$x[,x$A]) 261 | } 262 | 263 | 264 | 265 | 266 | -------------------------------------------------------------------------------- /R/ggbiplot.R: -------------------------------------------------------------------------------- 1 | ################################################################ 2 | # Methods for making pretty biplots with ggplot2 package 3 | # 4 | # based on code by http://www.vince.vu/software/#ggbiplot 5 | 6 | 7 | #' Pretty biplots using ggplots 8 | #' @title methods for making biplots from various projection & classification models 9 | #' @param xobj The object to be plotted 10 | #' @rdname ggbiplot 11 | #' @export 12 | ggbiplot <- function(xobj, ...) { 13 | UseMethod("ggbiplot") 14 | } 15 | 16 | #' @rdname ggbiplot 17 | #' @method ggbiplot princomp 18 | #' @export 19 | ggbiplot.princomp <- function(xobj, ...) { 20 | nobs.factor <- sqrt(xobj$n.obs) 21 | d <- xobj$sdev 22 | scores <- sweep(xobj$scores, 2, 6/(d * nobs.factor), FUN = '*') 23 | ggbiplot.default(list(scores=scores, loadings=xobj$loadings), ...) 24 | } 25 | 26 | 27 | #' @rdname ggbiplot 28 | #' @method ggbiplot prcomp 29 | #' @export 30 | ggbiplot.prcomp <- function(xobj, ...) { 31 | nobs.factor <- sqrt(nrow(xobj$x) - 1) 32 | d <- xobj$sdev 33 | scores <- sweep(xobj$x, 2, 1/(d * nobs.factor), FUN = '*') 34 | loadings <- xobj$rotation 35 | ggbiplot.default(list(scores=scores, loadings=loadings), ...) 36 | } 37 | 38 | 39 | #' @rdname ggbiplot 40 | #' @method ggbiplot lda 41 | #' @export 42 | ggbiplot.lda <- function(xobj, ...) { 43 | xname <- xobj$call$x 44 | gname <- xobj$call[[3L]] 45 | X <- eval.parent(xname) 46 | g <- eval.parent(gname) 47 | means <- colMeans(xobj$means) 48 | X <- scale(X, center = means, scale = FALSE) 49 | x <- as.matrix(X) %*% xobj$scaling 50 | nobs.factor <- sqrt(nrow(x)) 51 | d <- apply(x, 2, sd) 52 | x <- sweep(x, 2, 25/(d * nobs.factor), FUN = '*') 53 | ggbiplot.default(list(scores=x, loadings=xobj$scaling), ...) 54 | } 55 | 56 | 57 | #' @rdname ggbiplot 58 | #' @method ggbiplot plsda 59 | #' @export 60 | ggbiplot.plsda <- function(xobj, Yplot=FALSE, ...) { 61 | if (Yplot) { 62 | scores <- xobj$Yscores 63 | loadings <- xobj$Yloadings 64 | } else { 65 | scores <- xobj$scores 66 | loadings <- xobj$loadings 67 | } 68 | nobs.factor <- sqrt(nrow(scores)) 69 | d <- apply(scores, 2, sd) 70 | means <- colMeans(scores) 71 | scores <- scale(scores, center = means, scale = FALSE) 72 | scores <- sweep(scores, 2, 6/(d * nobs.factor), FUN = '*') 73 | ggbiplot.default(list(scores=scores, loadings=loadings), ...) 74 | 75 | } 76 | 77 | 78 | #' @rdname ggbiplot 79 | #' @method ggbiplot splsda 80 | #' @export 81 | ggbiplot.splsda <- function(xobj, ...) { 82 | # means <- irispls$meanx 83 | # X <- scale(xobj$x, center = means, scale = FALSE) 84 | X <- xobj$x 85 | if (is.null(xobj$scores)) 86 | xobj$scores <- as.matrix(X[,xobj$A]) %*% as.matrix(xobj$projection) 87 | xobj$loadings <- xobj$projection 88 | ggbiplot.plsda(xobj, ...) 89 | } 90 | 91 | 92 | #' @rdname ggbiplot 93 | #' @method ggbiplot matrix 94 | #' @export ggbiplot.matrix 95 | ggbiplot.matrix <- function(xobj, ...) { 96 | scores <- xobj 97 | loadings <- matrix(NA, nrow(xobj), ncol(xobj)) 98 | ggbiplot.default(list(scores=scores, loadings=loadings), plot.loadings=FALSE, 99 | equalcoord=FALSE, ...) 
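    # e.g., to plot a bare score matrix `S` colored by a grouping factor `g`
    # (a sketch; no loadings are drawn for raw matrices):
    #     ggbiplot(S, grouping=g)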
100 | } 101 | 102 | 103 | #' @param grouping an optional grouping vector (ie - for coloring points) 104 | #' @param select index of components to be plotted (must be length 2) 105 | #' @param circle enclose points in a circle 106 | #' @param circle.prob controls circle diameter (scales data std dev) if \code{circle = TRUE} 107 | #' @param plot.loadings should loading vectors be plotted 108 | #' @param label.loadings text of loadings labels, taken from rownames of loadings (depends on class of \code{xobj}) 109 | #' @param label.offset absolute offset for loading labels, so labels don't cover loadings vectors 110 | #' @param scale.loadings scale length of loading vectors for plotting purposes 111 | #' @param col.loadings a single value of vector for color of loadings 112 | #' @param alpha controls relative transparency of various plot features 113 | #' @param col color factor for points 114 | #' @param group.ellipse enclose within-group points in an covariance ellipse 115 | #' @param scale.ellipse scale \code{group.ellipse} to 1 standard deviation 116 | #' @param group.cloud connect within-group points to a group mean point with a straight edge 117 | #' @param xlab label for x axis 118 | #' @param ylab label for y axis 119 | #' @param equalcoord equal coordinates, ie should the plot area be square? 120 | #' @param size point size 121 | #' @param size.loadings line width of loading vectors 122 | #' 123 | #' @details additional plotting attributes (eg colors, themes, etc) can be chained on in the usual way for ggplots 124 | #' @examples 125 | #' # an LDA example with iris data 126 | #' ldamod <- lda(iris[,1:4], grouping=iris[,5]) 127 | #' ggbiplot(ldamod, grouping=iris[,5], alpha=.7, group.cloud=TRUE) + theme_bw() 128 | 129 | #' @rdname ggbiplot 130 | #' @method ggbiplot default 131 | #' @importFrom ggplot2 ggplot geom_segment geom_point geom_text geom_path scale_x_continuous scale_y_continuous aes 132 | #' @export 133 | ggbiplot.default <- function(xobj, grouping, select=1:2, circle = FALSE, circle.prob = 0.69, 134 | plot.loadings=TRUE, label.loadings=FALSE, sub.loadings=1:nrow(xobj$loadings), 135 | label.offset=0, label.size=4.5, scale.loadings = 1, col.loadings=scales::muted("red"), 136 | alpha = 1, col=grouping, shape=NULL, group.ellipse=FALSE, scale.ellipse = 1, 137 | group.cloud = FALSE, xlab="", ylab="", equalcoord=TRUE, size=3, size.loadings=1, 138 | loadingsOnTop = FALSE) { 139 | ## get scores and loadings from xobj 140 | if (length(select) > 2) stop("Error: only 2d plots supported") 141 | if (length(select) < 2) stop("Error: need at least 2 coordinates/components") 142 | scores <- data.frame(xvar=xobj$scores[,select[1]], yvar=xobj$scores[,select[2]]) 143 | loadings <- data.frame(xvar=xobj$loadings[sub.loadings,select[1]], yvar=xobj$loadings[sub.loadings,select[2]]) 144 | # standardize scores (?) 145 | # Base plot 146 | g <- ggplot(data = scores, aes(x = xvar, y = yvar)) 147 | 148 | if (plot.loadings) { 149 | loadingslayer <- geom_segment(data = loadings*scale.loadings, 150 | aes(x = 0, y = 0, xend = xvar, yend = yvar), 151 | arrow = grid::arrow(length = grid::unit(1/2, 'picas')), 152 | size = size.loadings, color = col.loadings) 153 | } 154 | if (is.character(label.loadings) || label.loadings) { 155 | if (is.logical(label.loadings)) 156 | labs <- rownames(loadings) 157 | else labs <- label.loadings 158 | # compute angles from orig. 
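    # the offset works in polar coordinates: a loading tip (x, y) has angle
    # ang = atan2(y, x) and radius hyp = sqrt(x^2 + y^2), so placing the text
    # at radius hyp + label.offset shifts the label outward along the vector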
159 | ang <- atan2(loadings$yvar*scale.loadings, loadings$xvar*scale.loadings) 160 | hyp <- sqrt((loadings$yvar*scale.loadings)^2 + (loadings$xvar*scale.loadings)^2) 161 | 162 | labdat <- data.frame(newx=(hyp + label.offset)*cos(ang), 163 | newy=(hyp + label.offset)*sin(ang), 164 | label=labs) 165 | g <- g + 166 | geom_text(aes(x=newx, y=newy, label=label), data=labdat, size=label.size) 167 | } 168 | 169 | if (!missing(grouping)) { 170 | gind <- order(grouping) 171 | grouping <- grouping[gind] 172 | scores <- scores[gind,] 173 | 174 | df <- data.frame(xvar=scores$xvar, yvar=scores$yvar, grouping=grouping) 175 | if (!is.null(shape)) { 176 | aesfun <- aes(color = grouping, shape=shape) ; df$shape <- shape[gind] 177 | } else 178 | aesfun <- aes(color = grouping) 179 | 180 | scoreslayer <- geom_point(data = df, 181 | aesfun, alpha = alpha, size=size) 182 | 183 | } else { 184 | if (!missing(col)) { 185 | df <- data.frame(xvar=scores$xvar, yvar=scores$yvar, col=col) 186 | if (!is.null(shape)) { 187 | aesfun <- aes(color = col, shape=shape) ; df$shape <- shape 188 | } else 189 | aesfun <- aes(color = col) 190 | 191 | scoreslayer <- geom_point(data=df, 192 | aesfun, alpha = alpha, size=size) 193 | 194 | } else { 195 | if (!is.null(shape)) 196 | aesfun <- aes(shape=shape) 197 | else 198 | aesfun <- aes() 199 | scoreslayer <- geom_point(aesfun, alpha = alpha, size=size) 200 | } 201 | } 202 | if (plot.loadings) { 203 | if (!loadingsOnTop) g <- g + loadingslayer + scoreslayer 204 | else g <- g + scoreslayer + loadingslayer 205 | } else 206 | g <- g + scoreslayer 207 | 208 | 209 | if (group.ellipse && !missing(grouping)) { 210 | l <- 200 211 | group.scores <- split(scores[,1:2], grouping) 212 | group.centers <- lapply(group.scores, colMeans) 213 | group.cov <- lapply(group.scores, cov) 214 | group.RR <- lapply(group.cov, chol) 215 | angles <- seq(0, 2*pi, length.out=l) 216 | ell.list <- lapply(group.RR, function(RR) 217 | scale.ellipse * cbind(cos(angles), sin(angles)) %*% RR) 218 | ellCntr <- lapply(1:length(ell.list), function(i) 219 | sweep(ell.list[[i]], 2, group.centers[[i]], "+")) 220 | names(ellCntr) <- names(ell.list) 221 | ell.df <- as.data.frame(do.call("rbind", ellCntr)) 222 | ell.df$grouping <- factor(rep(names(ellCntr), each=l), levels=names(ellCntr)) 223 | g <- g + geom_path(data = ell.df, aes(color = grouping, group = grouping)) 224 | } 225 | 226 | if (group.cloud && !missing(grouping)) { 227 | group.scores <- split(scores[,1:2], grouping) 228 | group.centers <- lapply(group.scores, colMeans) 229 | centers.df <- do.call('rbind', rep(group.centers, table(grouping))) 230 | rownames(centers.df) <- rownames(scores) 231 | colnames(centers.df) <- c("xcntr", "ycntr") 232 | loadCntr.df <- cbind(scores, centers.df, grouping) 233 | g <- g + 234 | geom_segment(data = loadCntr.df, 235 | aes(x = xcntr, y = ycntr, xend = xvar, yend = yvar, 236 | color = grouping), alpha=10^log(alpha/1.4)) 237 | } 238 | 239 | 240 | if (circle) { 241 | # scale circle radius 242 | r1 <- sqrt(qchisq(circle.prob, df = 2)) * max(scores$xvar^2)^(1/2) 243 | r2 <- sqrt(qchisq(circle.prob, df = 2)) * max(scores$yvar^2)^(1/2) 244 | 245 | theta <- c(seq(-pi, pi, length = 50), seq(pi, -pi, length = 50)) 246 | circdat <- data.frame(xvar = r1 * cos(theta), yvar = r2 * sin(theta)) 247 | g <- g + geom_path(aes(x=xvar, y=yvar), data = circdat, color = scales::muted('black'), 248 | size = 0.5, alpha = alpha/3) 249 | } 250 | 251 | if (equalcoord) { 252 | if (circle) { 253 | xrange <- range(circdat$xvar) 254 | yrange <- 
range(circdat$yvar) 255 | } else { 256 | xrange <- c(-max(abs(scores$xvar)), max(abs(scores$xvar))) 257 | yrange <- c(-max(abs(scores$yvar)), max(abs(scores$yvar))) 258 | } 259 | 260 | g <- g + scale_x_continuous(xlab, limits=xrange) + 261 | scale_y_continuous(ylab, limits=yrange) 262 | } else { 263 | g <- g + scale_x_continuous(xlab) + 264 | scale_y_continuous(ylab) 265 | } 266 | g 267 | } 268 | -------------------------------------------------------------------------------- /R/hello.R: -------------------------------------------------------------------------------- 1 | 2 | ## a placeholder 3 | hello <- function(txt = "world") { 4 | cat("Hello, ", txt, "\n") 5 | } 6 | -------------------------------------------------------------------------------- /R/normalization.R: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # Different normalization schemes for microbiome counts (real or fake) 3 | # 4 | # @author Zachary Kurtz 5 | # @date 10/10/2013 6 | ##################################################################### 7 | 8 | 9 | #' @export 10 | norm_pseudo <- function(x) norm_to_total(x+1) 11 | 12 | 13 | #' @keywords internal 14 | norm_diric <- function(x, rep=1) { 15 | require(VGAM) 16 | dmat <- rdiric(rep, x+1) 17 | norm_to_total(colMeans(dmat)) 18 | } 19 | 20 | #' @export 21 | norm_to_total <- function(x) x/sum(x) 22 | 23 | 24 | #' The centered log-ratio transformation for 25 | #' compositional data (not necessarily closed/normalized!) 26 | #' 27 | #' The clr is computed as 28 | #' \code{x[i]} = log (\code{x[i]} / \code{exp(mean(log(x)))}) 29 | #' 30 | #' @title clr The centered log-ratio transformation 31 | #' @param x a numeric data vector containing components of a composition 32 | #' @param ... additional arguments 33 | #' @return clr transformed \code{x} 34 | #' @examples 35 | #' # vector examples: 36 | #' clr(norm_to_total(1:10)) 37 | #' clr(1:10) 38 | #' 39 | #' # matrix examples: 40 | #' dmat <- matrix(exp(rnorm(110)), 10) 41 | # rows are samples/compositions, cols are features/components 42 | #' clr(dmat, 1) 43 | # cols are samples/compositions, rows are features/components 44 | #' clr(dmat, 2) 45 | #' @rdname clr 46 | #' @export 47 | clr <- function(x, ...) { 48 | UseMethod('clr') 49 | } 50 | 51 | 52 | 53 | #' 54 | #' @param base base of log to use, default is natural log 55 | #' @param tol machine tolerance for a zero count, default is machine tol (.Machine$double.eps) 56 | #' @rdname clr 57 | #' @method clr default 58 | #' @export 59 | clr.default <- function(x, base=exp(1), tol=.Machine$double.eps) { 60 | nzero <- (x >= tol) 61 | LOG <- log(ifelse(nzero, x, 1), base) 62 | ifelse(nzero, LOG - mean(LOG)/mean(nzero), 0.0) 63 | } 64 | 65 | 66 | #' 67 | #' 68 | #' @rdname clr 69 | #' @method clr matrix 70 | #' @export 71 | clr.matrix <- function(x, mar=2, base=exp(1), tol=.Machine$double.eps) { 72 | # apply(x, mar, clr, ...) 73 | if (!mar %in% c(1,2)) stop('mar (margin) must be 1 (compositions are rows) or 2 (compositions are columns)') 74 | 75 | if (mar == 1) x <- t(x) 76 | nzero <- (x >= tol) 77 | LOG <- log(ifelse(nzero, x, 1), base) 78 | means <- colMeans(LOG) 79 | x.clr <- LOG - rep(means/colMeans(nzero), each=nrow(x)) 80 | x.clr[!nzero] <- 0.0 81 | if (mar == 1) x.clr <- t(x.clr) 82 | x.clr 83 | } 84 | 85 | 86 | #' 87 | #' 88 | #' @rdname clr 89 | #' @method clr data.frame 90 | #' @export 91 | clr.data.frame <- function(x, mar=2, ...) 
{ 92 | clr(as.matrix(x), mar, ...) 93 | } 94 | 95 | 96 | 97 | #' The additive log-ratio transformation for 98 | #' compositional data (not necessarily closed/normalized!) 99 | #' 100 | #' The alr transformation is computed as: 101 | #' \code{x[i]} = log ( \code{x[i]} / x[D] ) 102 | #' 103 | #' @title alr The additive log-ratio transformation 104 | #' @param x a numeric data vector containing components of a composition 105 | #' @param ... additional arguments 106 | #' @return alr transformed \code{x} 107 | #' @examples 108 | #' # vector examples: 109 | #' alr(norm_to_total(1:10)) 110 | #' alr(1:10) 111 | #' 112 | #' # matrix examples: 113 | #' dmat <- matrix(exp(rnorm(110)), 10) 114 | # rows are samples/compositions, cols are features/components 115 | #' alr(dmat, 1) 116 | # cols are samples/compositions, rows are features/components 117 | #' alr(dmat, 2) 118 | #' @rdname alr 119 | #' @export 120 | alr <- function(x, ...) { 121 | UseMethod("alr", x) 122 | } 123 | 124 | 125 | #' @param divcomp index of the divisor component 126 | #' @param removeDivComp remove divisor component from the resulting data 127 | #' @param base base of log to use, default is natural log 128 | #' @param tol machine tolerance for a zero count, default is machine tol (.Machine$double.eps) 129 | #' @rdname alr 130 | #' @method alr default 131 | #' @export alr.default 132 | alr.default <- function(x, divcomp=1, base=exp(1), removeDivComp=TRUE, 133 | tol=.Machine$double.eps) { 134 | zero <- (x >= tol) 135 | LOG <- log(ifelse(zero, x, 1), base) 136 | x.alr <- ifelse(zero, LOG - LOG[divcomp], 0.0) 137 | if (removeDivComp) x.alr[-divcomp] 138 | else x.alr 139 | } 140 | 141 | #' 142 | #' 143 | #' @rdname alr 144 | #' @method alr matrix 145 | #' @export alr.default 146 | alr.matrix <- function(x, mar=2, divcomp=1, base=exp(1), removeDivComp=TRUE, 147 | tol=.Machine$double.eps) { 148 | if (mar == 1) x <- t(x) 149 | zero <- (x >= tol) 150 | LOG <- log(ifelse(zero, x, 1), base) 151 | x.alr <- ifelse(zero, LOG - rep(LOG[divcomp,], each=nrow(x)), 0.0) 152 | if (removeDivComp) x.alr <- x.alr[-divcomp,] 153 | 154 | if (mar ==1) t(x.alr) 155 | else x.alr 156 | } 157 | 158 | #' 159 | #' 160 | #' @rdname alr 161 | #' @method alr data.frame 162 | #' @export alr.data.frame 163 | alr.data.frame <- function(x, mar=2, ...) { 164 | alr(as.matrix(x), mar, ...) 165 | } 166 | 167 | #' @keywords internal 168 | ilr <- function(x.f, V, ...) { 169 | UseMethod('ilr') 170 | } 171 | 172 | #' @keywords internal 173 | ilr.default <- function(x.f, ...) { 174 | 175 | } 176 | 177 | 178 | 179 | #' @keywords internal 180 | CSS <- function(x, ...) 
{ 181 | UseMethod("CSS") 182 | } 183 | #' @keywords internal 184 | CSS.default <- function(x, p=0.05, sl=1000) { 185 | # Cumulative sum scaling Normalization Paulson et al 2013 (Nature Methods) 186 | xx <- x 187 | xx[x==0] <- NA 188 | qs <- quantile(xx, p=p, na.rm=TRUE) 189 | xx <- x - .Machine$double.eps 190 | normFactor <- sum(xx[xx <= qs]) 191 | (x/normFactor)*sl 192 | } 193 | #' @keywords internal 194 | CSS.matrix <- function(x, p=CSSstat(x), sl=1000, mar=2) { 195 | apply(x, mar, CSS, p=p, sl=sl) 196 | } 197 | #' @keywords internal 198 | CSSstat <- function(mat, rel=0.1) { 199 | smat <- sapply(1:ncol(mat), function(i) { 200 | sort(mat[, i], decreasing = FALSE) 201 | }) 202 | ref <- rowMeans(smat) 203 | yy <- mat 204 | yy[yy == 0] <- NA 205 | ncols <- ncol(mat) 206 | refS <- sort(ref) 207 | k <- which(refS > 0)[1] 208 | lo <- (length(refS) - k + 1) 209 | diffr <- sapply(1:ncols, function(i) { 210 | refS[k:length(refS)] - quantile(yy[, i], p = seq(0, 211 | 1, length.out = lo), na.rm = TRUE)}) 212 | diffr2 <- apply(abs(diffr), 1, median, na.rm = TRUE) 213 | x <- which(abs(diff(diffr2))/diffr2[-1] > rel)[1]/length(diffr2) 214 | names(x) <- NULL 215 | x 216 | } 217 | 218 | #' @keywords internal 219 | DESeq <- function(x, ...) { 220 | UseMethod("DESeq") 221 | } 222 | #' @keywords internal 223 | DESeq.matrix <- function(mat, c) { 224 | # compute geometric mean along columns 225 | matt <- mat 226 | matt[matt == 0] <- NA 227 | k_ref <- apply(matt, 1, function(x) exp(mean(log(x), na.rm=TRUE))) 228 | krefmat <- matrix(rep(k_ref,ncol(mat)), nrow=nrow(mat)) 229 | s_hat <- apply(matt/krefmat, 2, median, na.rm=TRUE) 230 | if (missing(c)) { 231 | fn <- function(c, s_hat) abs(sum(log(c*s_hat))) 232 | c <- optimize(fn, interval=0:10, s_hat=s_hat)$minimum 233 | } 234 | s <- c * s_hat 235 | smat <- matrix(rep(s,nrow(mat)), ncol=ncol(mat), byrow=TRUE) 236 | mat/smat 237 | } 238 | -------------------------------------------------------------------------------- /R/okernelPLS.fit.R: -------------------------------------------------------------------------------- 1 | ########################################### 2 | # Orthonormalized kernel PLS fit algorithms 3 | # 4 | ########################################### 5 | 6 | 7 | okernelpls.fit <- function(X, Y, ncomp, stripped=FALSE, scale=FALSE, center=TRUE, kernelfun='linkern', ...) 
{ 8 | X <- as.matrix(X) 9 | Y <- as.matrix(Y) 10 | if (!stripped) { 11 | dnX <- dimnames(X) 12 | dnY <- dimnames(Y) 13 | } 14 | dimnames(X) <- dimnames(Y) <- NULL 15 | nobj <- dim(X)[1] 16 | npred <- dim(X)[2] 17 | nresp <- dim(Y)[2] 18 | Xmeans <- colMeans(X) 19 | X <- scale(X, scale=scale, center=center) 20 | Ymeans <- colMeans(Y) 21 | Y <- scale(Y, scale=scale, center=center) 22 | 23 | # define a linear kernel for response vectors 24 | K_y <- tcrossprod(Y) 25 | if (kernelfun == "linkern") { 26 | K_x <- tcrossprod(X) 27 | } else { 28 | library(kernlab) 29 | K_x <- kernelMatrix(match.fun(kernelfun), X) 30 | } 31 | 32 | Amat <- K_x %*% K_y %*% K_x 33 | Amat0 <- Amat 34 | Bmat <- K_x %*% K_x 35 | TT <- U <- matrix(0, ncol = ncomp, nrow = nobj) 36 | B <- array(0, c(npred, nresp, ncomp)) 37 | In <- diag(nobj) 38 | nits <- numeric(ncomp) 39 | A <- matrix(0, nobj, ncomp) 40 | if (!stripped) { 41 | fitted <- array(0, dim = c(nobj, nresp, ncomp)) 42 | Xresvar <- numeric(ncomp) 43 | Xtotvar <- sum(diag(K_x)) 44 | } 45 | for (a in 1:ncomp) { 46 | # generalized eigenvalue decomposition of feature and response kernel 47 | out <- .geigen(Amat, Bmat) 48 | vals <- out$values 49 | vecs <- out$vectors 50 | val.max <- which.max((vals)) 51 | alpha <- vecs[,val.max,drop=FALSE] 52 | ## alpha <- alpha/ norm(matrix(alpha), 'F') 53 | beta <- K_y %*% alpha 54 | utmp <- beta/c(crossprod(alpha, beta)) 55 | wpw <- sqrt(c(crossprod(utmp, K_x) %*% utmp)) 56 | TT[, a] <- alpha * wpw 57 | U[, a] <- utmp * wpw 58 | A[,a] <- alpha 59 | Amat <- Amat - (vals[val.max] * (Bmat %*% alpha %*% t(alpha) %*% Bmat)) 60 | } 61 | n <- length(X) 62 | m <- length(X) 63 | ret <- t(t(K_x - rowSums(K_x)/m) - rowSums(K_x)/m) + sum(K_x)/(m * n) 64 | TT <- scale(tcrossprod(ret) %*% TT, scale=TRUE, center=TRUE)[,,drop=FALSE] 65 | TTtTinv <- TT %*% diag(1/colSums(TT * TT), ncol = ncol(TT)) 66 | if (kernelfun != 'linkern') { 67 | XW <- tcrossprod(ret) %*% U 68 | W <- scale(crossprod(X, XW), center=FALSE) 69 | W <- W/rep(sqrt(colSums(W * W)), each = npred) 70 | P <- crossprod(tcrossprod(XW, W), TTtTinv) 71 | } else { 72 | W <- crossprod(X, U) 73 | W <- W/rep(sqrt(colSums(W * W)), each = npred) 74 | P <- crossprod(X, TTtTinv) 75 | } 76 | 77 | Q <- crossprod(Y, TTtTinv) 78 | if (ncomp == 1) { 79 | R <- W 80 | } 81 | else { 82 | PW <- crossprod(P, W) 83 | if (nresp == 1) { 84 | PWinv <- diag(ncomp) 85 | bidiag <- -PW[row(PW) == col(PW) - 1] 86 | for (a in 1:(ncomp - 1)) PWinv[a, (a + 1):ncomp] <- cumprod(bidiag[a:(ncomp - 1)]) 87 | } 88 | else { 89 | PWinv <- backsolve(PW, diag(ncomp)) 90 | } 91 | R <- W %*% PWinv 92 | } 93 | for (a in 1:ncomp) { 94 | B[, , a] <- tcrossprod(R[, 1:a, drop = FALSE], Q[, 1:a, drop = FALSE]) 95 | } 96 | if (stripped) { 97 | list(coefficients = B, Xmeans = Xmeans, Ymeans = Ymeans) 98 | } else { 99 | for (a in 1:ncomp) fitted[, , a] <- tcrossprod(TT[, 1:a, drop = FALSE], Q[, 1:a, drop = FALSE]) 100 | residuals <- -fitted + c(Y) 101 | fitted <- fitted + rep(Ymeans, each = nobj) 102 | Xvar <- diff(-c(Xtotvar, Xresvar)) 103 | objnames <- dnX[[1]] 104 | if (is.null(objnames)) 105 | objnames <- dnY[[1]] 106 | prednames <- dnX[[2]] 107 | respnames <- dnY[[2]] 108 | compnames <- paste("Comp", 1:ncomp) 109 | nCompnames <- paste(1:ncomp, "comps") 110 | dimnames(TT) <- dimnames(U) <- list(objnames, compnames) 111 | dimnames(R) <- dimnames(W) <- dimnames(P) <- list(prednames, 112 | compnames) 113 | dimnames(Q) <- list(respnames, compnames) 114 | dimnames(B) <- list(prednames, respnames, nCompnames) 115 | dimnames(fitted) <- 
dimnames(residuals) <- list(objnames, 116 | respnames, nCompnames) 117 | names(Xvar) <- compnames 118 | class(TT) <- class(U) <- "scores" 119 | class(P) <- class(W) <- class(Q) <- "loadings" 120 | library(MASS) 121 | modbeta <- MASS::ginv(TT) %*% Y 122 | list(coefficients = B, scores = TT, loadings = P, loading.weights = W, 123 | Yscores = U, Yloadings = Q, projection = R, Xmeans = Xmeans, 124 | Ymeans = Ymeans, fitted.values = fitted, residuals = residuals, modbeta=modbeta, 125 | Xvar = Xvar, Xtotvar = Xtotvar, nits = nits, A=A, Kx=K_x, Ky=K_y, ret=ret) 126 | } 127 | 128 | 129 | # if (kernelfun != 'linkern') { 130 | # n <- length(X) 131 | # m <- length(X) 132 | # ret <- t(t(K_x - rowSums(K_x)/m) - rowSums(K_x)/m) + sum(K_x)/(m * n) 133 | # XU <- ret %*% t(ret) %*% A 134 | # # mimic feature loadings using kernel factor analysis on the scores 135 | # # and computing inner prod with X and scores 136 | ## fa <- kfa(XU, features=npred) 137 | ## U <- fa@xmatrix 138 | # U <- scale(t(X) %*% XU, center=FALSE) 139 | # } else { 140 | # ret <- X 141 | # U <- t(ret) %*% A 142 | # XU <- ret %*% U 143 | # } 144 | 145 | # rownames(U) <- colnames(X) 146 | # XUXU_inv <- tryCatch(solve(t(XU) %*% XU), error=function(e) { 147 | # require(MASS) ; ginv(t(XU) %*% XU) }) 148 | 149 | # B <- XUXU_inv %*% t(XU) %*% Y 150 | # residuals <- Y - (XU %*% B) 151 | 152 | 153 | # list(coefficients = B, scores = XU, loadings = U, Xmeans = Xmeans, fitted.values=XU%*%B, 154 | # Ymeans = Ymeans, residuals = residuals, Bmat=Bmat, Amat=Amat0, 155 | # Xvar = colSums(U * U), Xtotvar = sum(X * X), ret=ret) 156 | } 157 | 158 | 159 | 160 | .geigen <- function(A, B, rank=ncol(A)) { 161 | svd1 <- svd(B) 162 | F <- svd1$v 163 | D <- diag(svd1$d[1:rank]^-.5) 164 | T1 <- F[,1:rank] %*% D 165 | svd2 <- svd(t(T1) %*% A %*% T1) 166 | T2 <- svd2$v 167 | T3 <- T1 %*% T2 168 | list(vectors=T3, values=diag(t(T3) %*% A %*% T3)) 169 | } 170 | 171 | 172 | 173 | 174 | #kpls <- function (X, Y, ncomp, stripped = FALSE, tol = .Machine$double.eps^0.5, maxit = 100, ...) 
{ 175 | # Y <- as.matrix(Y) 176 | # if (!stripped) { 177 | # dnX <- dimnames(X) 178 | # dnY <- dimnames(Y) 179 | # } 180 | # dimnames(X) <- dimnames(Y) <- NULL 181 | # nobj <- dim(X)[1] 182 | # npred <- dim(X)[2] 183 | # nresp <- dim(Y)[2] 184 | # TT <- U <- matrix(0, ncol = ncomp, nrow = nobj) 185 | # B <- array(0, c(npred, nresp, ncomp)) 186 | # In <- diag(nobj) 187 | # nits <- numeric(ncomp) 188 | # if (!stripped) { 189 | # fitted <- array(0, dim = c(nobj, nresp, ncomp)) 190 | # Xresvar <- numeric(ncomp) 191 | # } 192 | # Xmeans <- colMeans(X) 193 | # X <- X - rep(Xmeans, each = nobj) 194 | # Ymeans <- colMeans(Y) 195 | # Y <- Y - rep(Ymeans, each = nobj) 196 | # XXt <- tcrossprod(X) 197 | # YYt <- tcrossprod(Y) 198 | # if (!stripped) 199 | # Xtotvar <- sum(diag(XXt)) 200 | # for (a in 1:ncomp) { 201 | # XXtYYt <- XXt %*% YYt 202 | # XXtYYt <- XXtYYt %*% XXtYYt 203 | # t.a.old <- Y[, 1] 204 | # nit <- 0 205 | # repeat { 206 | # nit <- nit + 1 207 | # t.a <- XXtYYt %*% t.a.old 208 | # t.a <- t.a/sqrt(c(crossprod(t.a))) 209 | # if (sum(abs((t.a - t.a.old)/t.a), na.rm = TRUE) < 210 | # tol) 211 | # break 212 | # else t.a.old <- t.a 213 | # if (nit >= maxit) { 214 | # warning("No convergence in", maxit, "iterations\n") 215 | # break 216 | # } 217 | # } 218 | # nits[a] <- nit 219 | # u.a <- YYt %*% t.a 220 | # utmp <- u.a/c(crossprod(t.a, u.a)) 221 | # wpw <- sqrt(c(crossprod(utmp, XXt) %*% utmp)) 222 | # TT[, a] <- t.a * wpw 223 | # U[, a] <- utmp * wpw 224 | # G <- In - tcrossprod(t.a) 225 | # XXt <- G %*% XXt %*% G 226 | # YYt <- G %*% YYt %*% G 227 | # if (!stripped) 228 | # Xresvar[a] <- sum(diag(XXt)) 229 | # } 230 | # W <- crossprod(X, U) 231 | # W <- W/rep(sqrt(colSums(W * W)), each = npred) 232 | # TTtTinv <- TT %*% diag(1/colSums(TT * TT), ncol = ncol(TT)) 233 | # P <- crossprod(X, TTtTinv) 234 | # Q <- crossprod(Y, TTtTinv) 235 | # if (ncomp == 1) { 236 | # R <- W 237 | # } 238 | # else { 239 | # PW <- crossprod(P, W) 240 | # if (nresp == 1) { 241 | # PWinv <- diag(ncomp) 242 | # bidiag <- -PW[row(PW) == col(PW) - 1] 243 | # for (a in 1:(ncomp - 1)) PWinv[a, (a + 1):ncomp] <- cumprod(bidiag[a:(ncomp - 244 | # 1)]) 245 | # } 246 | # else { 247 | # PWinv <- backsolve(PW, diag(ncomp)) 248 | # } 249 | # R <- W %*% PWinv 250 | # } 251 | # for (a in 1:ncomp) { 252 | # B[, , a] <- tcrossprod(R[, 1:a, drop = FALSE], Q[, 1:a, 253 | # drop = FALSE]) 254 | # } 255 | # if (stripped) { 256 | # list(coefficients = B, Xmeans = Xmeans, Ymeans = Ymeans) 257 | # } 258 | # else { 259 | # for (a in 1:ncomp) fitted[, , a] <- tcrossprod(TT[, 1:a, 260 | # drop = FALSE], Q[, 1:a, drop = FALSE]) 261 | # residuals <- -fitted + c(Y) 262 | # fitted <- fitted + rep(Ymeans, each = nobj) 263 | # Xvar <- diff(-c(Xtotvar, Xresvar)) 264 | # objnames <- dnX[[1]] 265 | # if (is.null(objnames)) 266 | # objnames <- dnY[[1]] 267 | # prednames <- dnX[[2]] 268 | # respnames <- dnY[[2]] 269 | # compnames <- paste("Comp", 1:ncomp) 270 | # nCompnames <- paste(1:ncomp, "comps") 271 | # dimnames(TT) <- dimnames(U) <- list(objnames, compnames) 272 | # dimnames(R) <- dimnames(W) <- dimnames(P) <- list(prednames, 273 | # compnames) 274 | # dimnames(Q) <- list(respnames, compnames) 275 | # dimnames(B) <- list(prednames, respnames, nCompnames) 276 | # dimnames(fitted) <- dimnames(residuals) <- list(objnames, 277 | # respnames, nCompnames) 278 | # names(Xvar) <- compnames 279 | # class(TT) <- class(U) <- "scores" 280 | # class(P) <- class(W) <- class(Q) <- "loadings" 281 | # list(coefficients = B, scores = TT, loadings = P, 
loading.weights = W, 282 | # Yscores = U, Yloadings = Q, projection = R, Xmeans = Xmeans, 283 | # Ymeans = Ymeans, fitted.values = fitted, residuals = residuals, 284 | # Xvar = Xvar, Xtotvar = Xtotvar, nits = nits) 285 | # } 286 | #} 287 | 288 | 289 | 290 | -------------------------------------------------------------------------------- /R/plsmda.R: -------------------------------------------------------------------------------- 1 | ####################### 2 | # multilabel PLS-DA 3 | # 4 | ######################### 5 | 6 | 7 | plsmda <- function (x, y, ncomp = 2, probMethod = "softmax", prior = NULL, ...) { 8 | ## partial least squares multilabel discriminant analysis 9 | caret:::requireNamespaceQuietStop("pls") 10 | funcCall <- match.call(expand.dots = TRUE) 11 | if (!is.matrix(x)) 12 | x <- as.matrix(x) 13 | if (length(ncomp) > 1) { 14 | ncomp <- max(ncomp) 15 | warning(paste("A value single ncomp must be specified.", 16 | "max(ncomp) was used.", "Predictions can be obtained for values <= ncomp")) 17 | } 18 | if (probMethod == "softmax") { 19 | if (!is.null(prior)) 20 | warning("Priors are ignored unless probMethod = \"Bayes\"") 21 | } 22 | if (is.factor(y)) { 23 | obsLevels <- levels(y) 24 | oldY <- list(y) 25 | y <- .class2ind(y) 26 | } else if (is.data.frame(y)) { 27 | ## coerce to columns to factors 28 | for (i in 1:ncol(y)) y[,i] <- droplevels(as.factor(y[,i])) 29 | obsLevels <- lapply(y, levels) 30 | oldY <- y 31 | y <- do.call('cbind', lapply(y, .class2ind)) 32 | } 33 | else { 34 | if (is.matrix(y)) { 35 | # test <- apply(y, 1, sum) 36 | # if (any(test != 1)) 37 | # stop("the rows of y must be 0/1 and sum to 1") 38 | obsLevels <- colnames(y) 39 | if (is.null(obsLevels)) 40 | stop("the y matrix must have column names") 41 | oldY <- obsLevels[apply(y, 1, which.max)] 42 | } 43 | else stop("y must be a matrix, data.frame or a factor") 44 | } 45 | tmpData <- data.frame(n = paste("row", 1:nrow(y), sep = "")) 46 | tmpData$y <- y 47 | tmpData$x <- x 48 | out <- .plsr(y ~ x, data = tmpData, ncomp = ncomp, ...) 
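    # y has been expanded column-block-wise into a +/-1 indicator matrix by
    # .class2ind; e.g. a factor c("a","b","a") becomes the rows (1,-1),
    # (-1,1), (1,-1)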
49 | out$obsLevels <- obsLevels 50 | out$probMethod <- probMethod 51 | if (probMethod == "Bayes") { 52 | caret:::requireNamespaceQuietStop("klaR") 53 | makeModels <- function(x, y, pri) { 54 | probModel <- klaR::NaiveBayes(x, y, prior = pri, usekernel = TRUE) 55 | probModel$train <- predict(probModel)$posterior 56 | probModel$x <- NULL 57 | probModel 58 | } 59 | cls <- class(out) 60 | class(out) <- "mvr" 61 | train <- predict(out, as.matrix(tmpData$x), ncomp = 1:ncomp) 62 | # train <- train[, -length(obsLevels), , drop = FALSE] 63 | usedlevs <- 0 64 | probMod <- vector("list", N <- length(obsLevels)) 65 | for (i in 1:N) { 66 | ysub <- oldY[[i]] 67 | nlevs <- nlevels(ysub) 68 | traintemp <- train[, (i+usedlevs):(nlevs+usedlevs), ,drop=FALSE] 69 | traintemp <- traintemp[, -length(obsLevels[[i]]), , drop = FALSE] 70 | usedlevs <- usedlevs + nlevels(ysub) 71 | probMod[[i]] <- apply(traintemp, 3, makeModels, y = ysub, pri = prior) 72 | } 73 | out$probModel <- probMod 74 | names(out$probModel) <- names(oldY) 75 | } 76 | else out$probModel <- NULL 77 | class(out) <- c("plsmda", "plsda", class(out)) 78 | out 79 | } 80 | 81 | 82 | .plsr <- function (..., method = "okernelpls") { 83 | cl <- match.call() 84 | cl$method <- match.arg(method, c("kernelpls", "widekernelpls", "okernelpls", 85 | "simpls", "oscorespls", "model.frame")) 86 | cl[[1]] <- quote(.mvr) 87 | res <- eval(cl, parent.frame()) 88 | if (cl$method != "model.frame") 89 | res$call[[1]] <- as.name(".plsr") 90 | if (missing(method)) 91 | res$call$method <- NULL 92 | res 93 | } 94 | 95 | 96 | .mvr <- function (formula, ncomp, Y.add, data, subset, na.action, method = pls.options()$mvralg, 97 | scale = FALSE, validation = c("none", "CV", "LOO"), model = TRUE, 98 | x = FALSE, y = FALSE, ...) { 99 | caret:::requireNamespaceQuietStop("pls") 100 | ret.x <- x 101 | ret.y <- y 102 | mf <- match.call(expand.dots = FALSE) 103 | if (!missing(Y.add)) { 104 | Y.addname <- as.character(substitute(Y.add)) 105 | mf$formula <- update(formula, paste("~ . 
+", Y.addname)) 106 | } 107 | m <- match(c("formula", "data", "subset", "na.action"), names(mf), 108 | 0) 109 | mf <- mf[c(1, m)] 110 | mf[[1]] <- as.name("model.frame") 111 | mf <- eval(mf, parent.frame()) 112 | method <- match.arg(method, c("kernelpls", "widekernelpls", "okernelpls", 113 | "simpls", "oscorespls", "cppls", "svdpc", "model.frame")) 114 | if (method == "model.frame") 115 | return(mf) 116 | mt <- attr(mf, "terms") 117 | Y <- model.response(mf, "numeric") 118 | if (is.matrix(Y)) { 119 | if (is.null(colnames(Y))) 120 | colnames(Y) <- paste("Y", 1:dim(Y)[2], sep = "") 121 | } 122 | 123 | else { 124 | Y <- as.matrix(Y) 125 | colnames(Y) <- deparse(formula[[2]]) 126 | } 127 | if (missing(Y.add)) { 128 | Y.add <- NULL 129 | } 130 | else { 131 | Y.add <- mf[, Y.addname] 132 | mt <- drop.terms(mt, which(attr(mt, "term.labels") == 133 | Y.addname), keep.response = TRUE) 134 | } 135 | X <- pls:::delete.intercept(model.matrix(mt, mf)) 136 | nobj <- dim(X)[1] 137 | npred <- dim(X)[2] 138 | if (length(attr(mt, "term.labels")) == 1 && !is.null(colnames(mf[[attr(mt, 139 | "term.labels")]]))) 140 | colnames(X) <- sub(attr(mt, "term.labels"), "", colnames(X)) 141 | if (missing(ncomp)) { 142 | ncomp <- min(nobj - 1, npred) 143 | ncompWarn <- FALSE 144 | } 145 | else { 146 | if (ncomp < 1 || ncomp > min(nobj - 1, npred)) 147 | stop("Invalid number of components, ncomp") 148 | ncompWarn <- TRUE 149 | } 150 | sdscale <- identical(TRUE, scale) 151 | if (is.numeric(scale)) 152 | if (length(scale) == npred) 153 | X <- X/rep(scale, each = nobj) 154 | else stop("length of 'scale' must equal the number of x variables") 155 | switch(match.arg(validation), CV = { 156 | val <- mvrCv(X, Y, ncomp, Y.add = Y.add, method = method, 157 | scale = sdscale, ...) 158 | }, LOO = { 159 | segments <- as.list(1:nobj) 160 | attr(segments, "type") <- "leave-one-out" 161 | val <- mvrCv(X, Y, ncomp, Y.add = Y.add, method = method, 162 | scale = sdscale, segments = segments, ...) 163 | }, none = { 164 | val <- NULL 165 | }) 166 | if (identical(TRUE, ncomp > val$ncomp)) { 167 | ncomp <- val$ncomp 168 | if (ncompWarn) 169 | warning("`ncomp' reduced to ", ncomp, " due to cross-validation") 170 | } 171 | fitFunc <- switch(method, kernelpls = pls:::kernelpls.fit, widekernelpls = pls:::widekernelpls.fit, 172 | simpls = pls:::simpls.fit, oscorespls = pls:::oscorespls.fit, 173 | cppls = pls:::cppls.fit, okernelpls=okernelpls.fit, svdpc = pls:::svdpc.fit) 174 | if (sdscale) { 175 | scale <- sqrt(colSums((X - rep(colMeans(X), each = nobj))^2)/(nobj - 176 | 1)) 177 | if (any(abs(scale) < .Machine$double.eps^0.5)) 178 | warning("Scaling with (near) zero standard deviation") 179 | X <- X/rep(scale, each = nobj) 180 | } 181 | start.time <- proc.time()[3] 182 | z <- fitFunc(X, Y, ncomp, Y.add = Y.add, ...) 183 | z$fit.time <- proc.time()[3] - start.time 184 | class(z) <- "mvr" 185 | z$na.action <- attr(mf, "na.action") 186 | z$ncomp <- ncomp 187 | z$method <- method 188 | if (is.numeric(scale)) 189 | z$scale <- scale 190 | z$validation <- val 191 | z$call <- match.call() 192 | z$terms <- mt 193 | if (model) 194 | z$model <- mf 195 | if (ret.x) 196 | z$x <- X 197 | if (ret.y) 198 | z$y <- Y 199 | z 200 | } 201 | 202 | print.plsmda <- function(x, ...) 
{
203 |     if (x$method == "okernelpls") {
204 |         ana = "Orthonormalized Partial Least Squares"
205 |         alg = "kernel"
206 |         kernfun = x$call$kernelfun
207 |         cat(ana, ", fitted with the", alg, "algorithm and using a", kernfun, "kernel function")
208 |         if (!is.null(x$validation)) 
209 |             cat("\nCross-validated using", length(x$validation$segments), 
210 |                 attr(x$validation$segments, "type"), "segments.")
211 |         cat("\nCall:\n", deparse(x$call), "\n", sep = "")
212 |         invisible(x)
213 |     } else {
214 |         pls:::print.mvr(x)
215 |     }
216 | }
217 | 
218 | 
219 | predict.plsmda <- function (object, newdata = NULL, ncomp = NULL, type = "class", ...) {
220 |     caret:::requireNamespaceQuietStop("pls")
221 |     if (is.null(ncomp)) 
222 |         if (!is.null(object$ncomp)) 
223 |             ncomp <- object$ncomp
224 |         else stop("specify ncomp")
225 |     if (!is.null(newdata)) {
226 |         if (!is.matrix(newdata)) 
227 |             newdata <- as.matrix(newdata)
228 |     }
229 |     class(object) <- "mvr"
230 |     tmpPred <- predict(object, newdata = newdata)[, , ncomp, drop = FALSE]
231 |     if (type == "raw") 
232 |         return(tmpPred)
233 |     out <- vector('list', N <- length(object$obsLevels))
234 |     if (is.null(object$probModel)) {
235 |         ## softmax
236 |         for (j in 1:N) {
237 |             switch(type, class = {
238 |                 if (length(dim(tmpPred)) < 3) {
239 |                     # only 1 latent component; index into this response's levels
240 |                     out[[j]] <- object$obsLevels[[j]][apply(tmpPred, 1, which.max)]
241 |                     out[[j]] <- factor(out[[j]], levels = object$obsLevels[[j]])
242 |                 } else {
243 |                     tmpOut <- matrix("", nrow = dim(tmpPred)[1], ncol = dim(tmpPred)[3])
244 |                     for (i in 1:dim(tmpPred)[3]) {
245 |                         tmpOut[, i] <- object$obsLevels[[j]][apply(tmpPred[, , i, drop = FALSE], 1, which.max)]
246 |                     }
247 |                     out[[j]] <- as.data.frame(tmpOut)
248 |                     out[[j]] <- as.data.frame(lapply(out[[j]], function(x, y) factor(x, levels = y), y = object$obsLevels[[j]]))
249 |                     names(out[[j]]) <- paste("ncomp", ncomp, sep = "")
250 |                     rownames(out[[j]]) <- rownames(newdata)
251 |                     if (length(ncomp) == 1) out[[j]] <- out[[j]][, 1]
252 |                 }
253 |             }, prob = {
254 |                 if (length(dim(tmpPred)) < 3) {
255 |                     out[[j]] <- t(apply(tmpPred, 1, function(data) exp(data)/sum(exp(data))))
256 |                 } else {
257 |                     out[[j]] <- tmpPred * NA
258 |                     for (i in 1:dim(tmpPred)[3]) {
259 |                         out[[j]][, , i] <- t(apply(tmpPred[, , i, drop = FALSE], 1, function(data) exp(data)/sum(exp(data))))
260 |                     }
261 |                 }
262 |             })
263 |         }
264 |     }
265 |     else {
266 |         for (j in 1:N) {
267 |             caret:::requireNamespaceQuietStop("klaR")
268 |             tmp <- vector(mode = "list", length = length(ncomp))
269 |             for (i in seq(along = ncomp)) {
270 |                 tmp[[i]] <- predict(object$probModel[[j]][[ncomp[i]]], as.data.frame(tmpPred[, -length(object$obsLevels[[j]]), i]))
271 |             }
272 |             if (type == "class") {
273 |                 out[[j]] <- t(do.call("rbind", lapply(tmp, function(x) as.character(x$class))))
274 |                 rownames(out[[j]]) <- names(tmp[[1]]$class)
275 |                 colnames(out[[j]]) <- paste("ncomp", ncomp, sep = "")
276 |                 out[[j]] <- as.data.frame(out[[j]])
277 |                 out[[j]] <- as.data.frame(lapply(out[[j]], function(x, y) factor(x, levels = y), y = object$obsLevels[[j]]))
278 |                 if (length(ncomp) == 1) 
279 |                     out[[j]] <- out[[j]][, 1]
280 |             } else {
281 |                 out[[j]] <- array(dim = c(dim(tmp[[1]]$posterior), length(ncomp)), 
282 |                     dimnames = list(rownames(tmp[[1]]$posterior), 
283 |                         colnames(tmp[[1]]$posterior), paste("ncomp", ncomp, sep = "")))
284 |                 for (i in seq(along = ncomp)) out[[j]][, , i] <- tmp[[i]]$posterior
285 |             }
286 |         }
287 |     }
288 |     names(out) <- names(object$obsLevels)
289 |     out
290 | }
291 | 
292 | 
293 | .class2ind <- function(y) {
294 |     y <- caret:::class2ind(y)  # 0/1 indicator matrix for the factor levels
295 |     y[y==0] <- -1              # recode to -1/+1 targets for the PLS regression
296 |     y
297 | } 298 | -------------------------------------------------------------------------------- /R/splitVar.R: -------------------------------------------------------------------------------- 1 | 2 | ## multilevel tools, stolen from mixomics package 3 | .Split.variation.one.level <- function (X, Y, sample) { 4 | X = as.matrix(X) 5 | if (is.factor(sample)) { 6 | sample = as.numeric(sample) 7 | warning("the vector sample was converted into a numeric vector", 8 | call. = FALSE) 9 | } 10 | Xmi <- colMeans(X, na.rm=TRUE) 11 | Xm <- matrix(Xmi, nrow = nrow(X), ncol = ncol(X), byrow = T) 12 | indX <- cbind(sample, X) 13 | indsample <- unique(sample) 14 | n.sample <- length(indsample) 15 | Xbi <- t(apply(matrix(indsample, ncol = 1, nrow = n.sample), 16 | MARGIN = 1, FUN = function(x, indX) { 17 | indice <- which(indX[, 1,drop=FALSE] == x[1]) 18 | res <- colMeans(indX[indice, ,drop=FALSE], na.rm=TRUE)[-1] 19 | return(c(x, res)) 20 | }, indX = indX)) 21 | Xb <- apply(matrix(sample, ncol = 1, nrow = length(sample)), 22 | MARGIN = 1, FUN = function(x, Xbi) { 23 | Xbi[which(Xbi[, 1,drop=FALSE] == x), -1] 24 | }, Xbi = Xbi) 25 | Xb <- t(Xb) - Xm 26 | Xw <- X - Xm - Xb 27 | list(Xw = Xw, Xb = Xb, Xm = Xm) 28 | } 29 | 30 | 31 | .Split.variation.two.level <- function (X, factor1, factor2, sample) { 32 | if (is.factor(sample)) { 33 | sample = as.numeric(sample) 34 | warning("the vector sample was converted into a numeric vector", 35 | call. = FALSE) 36 | } 37 | Xmi <- colMeans(X, na.rm=TRUE) 38 | Xm <- matrix(Xmi, nrow = nrow(X), ncol = ncol(X), byrow = T) 39 | indX <- cbind(sample, X) 40 | Xb <- apply(indX, MARGIN = 1, FUN = function(x, indX) { 41 | indice <- which(indX[, 1,drop=FALSE] == x[1]) 42 | res <- colMeans(indX[indice, ,drop=FALSE], na.rm=TRUE) 43 | return(res[-1]) 44 | }, indX = indX) 45 | Xs <- t(Xb) 46 | Xb <- t(Xb) - Xm 47 | xbfactor1 <- X 48 | for (i in levels(factor(factor1))) { 49 | indice <- which(factor1 == i) 50 | indXX <- indX[indice, ] 51 | res1 <- apply(indXX, MARGIN = 1, FUN = function(x, indXX) { 52 | indice <- which(indXX[, 1] == x[1]) 53 | if (length(indice) == 1) { 54 | res <- colMeans(matrix(indXX[indice, ], nrow = 1, 55 | ncol = dim(indXX)[2]), na.rm=TRUE) 56 | } 57 | else { 58 | res <- colMeans(indXX[indice, ,drop=FALSE], na.rm=TRUE) 59 | } 60 | return(res[-1]) 61 | }, indXX = indXX) 62 | xbfactor1[indice, ] <- t(res1) 63 | } 64 | xbfactor2 <- X 65 | for (i in levels(factor(factor2))) { 66 | indice <- which(factor2 == i) 67 | indXX <- indX[indice, ] 68 | res1 <- apply(indXX, MARGIN = 1, FUN = function(x, indXX) { 69 | indice <- which(indXX[, 1,drop=FALSE] == x[1]) 70 | if (length(indice) == 1) { 71 | res <- colMeans(matrix(indXX[indice, ,drop=FALSE], nrow = 1, 72 | ncol = dim(indXX)[2]), na.rm=TRUE) 73 | } 74 | else { 75 | res <- colMeans(indXX[indice, ,drop=FALSE], na.rm=TRUE) 76 | } 77 | return(res[-1]) 78 | }, indXX = indXX) 79 | xbfactor2[indice, ] <- t(res1) 80 | } 81 | matfactor1 <- matrix(factor1, nrow = 1, ncol = length(factor1)) 82 | XFACTOR1 <- apply(matfactor1, MARGIN = 2, FUN = function(x, 83 | matfactor1) { 84 | indice <- which(matfactor1 == x[1]) 85 | res <- colMeans(X[indice, ,drop=FALSE], na.rm=TRUE) 86 | return(res) 87 | }, matfactor1 = matfactor1) 88 | matfactor2 <- matrix(factor2, nrow = 1, ncol = length(factor2)) 89 | XFACTOR2 <- apply(matfactor2, MARGIN = 2, FUN = function(x, 90 | matfactor2) { 91 | indice <- which(matfactor2 == x[1]) 92 | res <- colMeans(X[indice, ,drop=FALSE], na.rm=TRUE) 93 | return(res) 94 | }, matfactor2 = matfactor2) 95 | XCS <- 
xbfactor1 - Xs + Xm - t(XFACTOR1) 96 | XTS <- xbfactor2 - Xs + Xm - t(XFACTOR2) 97 | Xw <- X - Xb - Xm - XCS - XTS 98 | list(Xw = Xw, Xb = Xb, Xm = Xm, XCS = XCS, XTS = XTS) 99 | } 100 | 101 | -------------------------------------------------------------------------------- /R/splsmda.R: -------------------------------------------------------------------------------- 1 | ####################### 2 | # multilabel sPLS-DA 3 | # 4 | ######################### 5 | 6 | 7 | splsmda <- function (x, y, probMethod = "softmax", prior = NULL, ...) { 8 | caret:::requireNamespaceQuietStop("spls") 9 | funcCall <- match.call(expand.dots = TRUE) 10 | if (!is.matrix(x)) 11 | x <- as.matrix(x) 12 | if (probMethod == "softmax") { 13 | if (!is.null(prior)) 14 | warning("Priors are ignored unless probMethod = \"Bayes\"") 15 | } 16 | if (is.factor(y)) { 17 | obsLevels <- list(levels(y)) 18 | oldY <- list(y) 19 | y <- .class2ind(y) 20 | } else if (is.data.frame(y)) { 21 | ## coerce to columns to factors 22 | for (i in 1:ncol(y)) y[,i] <- droplevels(as.factor(y[,i])) 23 | obsLevels <- lapply(y, levels) 24 | oldY <- y 25 | y <- do.call('cbind', lapply(y, .class2ind)) 26 | } 27 | else { 28 | if (is.matrix(y)) { 29 | # test <- apply(y, 1, sum) 30 | # if (any(test != 1)) 31 | # stop("the rows of y must be 0/1 and sum to 1") 32 | obsLevels <- list(colnames(y)) 33 | if (is.null(obsLevels)) 34 | stop("the y matrix must have column names") 35 | oldY <- obsLevels[apply(y, 1, which.max)] 36 | } 37 | else stop("y must be a matrix, data.frame or a factor") 38 | } 39 | tmpData <- data.frame(n = paste("row", 1:nrow(y), sep = "")) 40 | tmpData$y <- y 41 | tmpData$x <- x 42 | out <- .spls(x, y, ...) 43 | out$obsLevels <- obsLevels 44 | out$probMethod <- probMethod 45 | if (probMethod == "Bayes") { 46 | caret:::requireNamespaceQuietStop("klaR") 47 | makeModels <- function(x, y, pri) { 48 | probModel <- klaR::NaiveBayes(x, y, prior = pri, usekernel = TRUE) 49 | probModel$train <- predict(probModel)$posterior 50 | probModel$x <- NULL 51 | probModel 52 | } 53 | cls <- class(out) 54 | ## spoof # obslevels so nothing is dropped in the predict function 55 | out$obsLevels <- ncol(y) + 1 56 | train <- spls:::predict.spls(out, as.matrix(tmpData$x)) 57 | usedlevs <- 0 58 | probMod <- vector("list", N <- length(obsLevels)) 59 | for (i in 1:N) { 60 | ysub <- oldY[[i]] 61 | nlevs <- nlevels(ysub) 62 | traintemp <- train[, (i+usedlevs):(nlevs+usedlevs)] 63 | traintemp <- traintemp[, -length(obsLevels[[i]]), drop = FALSE] 64 | usedlevs <- usedlevs + nlevels(ysub) 65 | probMod[[i]] <- makeModels(traintemp, y = ysub, pri = prior) 66 | } 67 | out$probModel <- probMod 68 | names(out$probModel) <- names(oldY) 69 | } 70 | else out$probModel <- NULL 71 | class(out) <- c("splsmda", "splsda") 72 | out 73 | } 74 | 75 | 76 | .spls <- function (x, y, K, eta, kappa = 0.5, select = "pls2", fit = "okernelpls", 77 | scale.x = TRUE, scale.y = FALSE, eps = 1e-04, maxstep = 100, trace = FALSE, ...) 
{ 78 | caret:::requireNamespaceQuietStop("spls") 79 | caret:::requireNamespaceQuietStop("pls") 80 | x <- as.matrix(x) 81 | n <- nrow(x) 82 | p <- ncol(x) 83 | ip <- c(1:p) 84 | y <- as.matrix(y) 85 | q <- ncol(y) 86 | one <- matrix(1, 1, n) 87 | mu <- one %*% y/n 88 | y <- scale(y, drop(mu), FALSE) 89 | meanx <- drop(one %*% x)/n 90 | x <- scale(x, meanx, FALSE) 91 | if (scale.x) { 92 | normx <- sqrt(drop(one %*% (x^2))/(n - 1)) 93 | if (any(normx < .Machine$double.eps)) { 94 | stop("Some of the columns of the predictor matrix have zero variance.") 95 | } 96 | x <- scale(x, FALSE, normx) 97 | } 98 | else { 99 | normx <- rep(1, p) 100 | } 101 | if (scale.y) { 102 | normy <- sqrt(drop(one %*% (y^2))/(n - 1)) 103 | if (any(normy < .Machine$double.eps)) { 104 | stop("Some of the columns of the response matrix have zero variance.") 105 | } 106 | y <- scale(y, FALSE, normy) 107 | } 108 | else { 109 | normy <- rep(1, q) 110 | } 111 | betahat <- matrix(0, p, q) 112 | betamat <- list() 113 | x1 <- x 114 | y1 <- y 115 | type <- spls:::correctp(x, y, eta, K, kappa, "simpls", "simpls") 116 | eta <- type$eta 117 | K <- type$K 118 | kappa <- type$kappa 119 | # select <- type$select 120 | if (is.null(colnames(x))) { 121 | xnames <- c(1:p) 122 | } 123 | else { 124 | xnames <- colnames(x) 125 | } 126 | new2As <- list() 127 | if (trace) { 128 | cat("The variables that join the set of selected variables at each step:\n") 129 | } 130 | for (k in 1:K) { 131 | Z <- t(x1) %*% y1 132 | what <- spls:::spls.dv(Z, eta, kappa, eps, maxstep) 133 | A <- unique(ip[what != 0 | betahat[, 1] != 0]) 134 | new2A <- ip[what != 0 & betahat[, 1] == 0] 135 | xA <- x[, A, drop = FALSE] 136 | plsfit <- .plsr(y ~ xA, ncomp = min(k, length(A)), 137 | method = fit, scale = FALSE, ...) 138 | betahat <- matrix(0, p, q) 139 | betahat[A, ] <- matrix(coef(plsfit), length(A), q) 140 | betamat[[k]] <- betahat 141 | pj <- plsfit$projection 142 | scores <- plsfit$scores 143 | if (select == "pls2") { 144 | y1 <- y - x %*% betahat 145 | } 146 | if (select == "simpls") { 147 | pw <- pj %*% solve(t(pj) %*% pj) %*% t(pj) 148 | x1 <- x 149 | x1[, A] <- x[, A, drop = FALSE] - x[, A, drop = FALSE] %*% pw 150 | } 151 | if (select == "okernelpls") { 152 | modbeta <- plsfit$modbeta 153 | y1 <- y - scores %*% modbeta 154 | } 155 | new2As[[k]] <- new2A 156 | if (trace) { 157 | if (length(new2A) <= 10) { 158 | cat(paste("- ", k, "th step (K=", k, "):\n", 159 | sep = "")) 160 | cat(xnames[new2A]) 161 | cat("\n") 162 | } 163 | else { 164 | cat(paste("- ", k, "th step (K=", k, "):\n", 165 | sep = "")) 166 | nlines <- ceiling(length(new2A)/10) 167 | for (i in 0:(nlines - 2)) { 168 | cat(xnames[new2A[(10 * i + 1):(10 * (i + 1))]]) 169 | cat("\n") 170 | } 171 | cat(xnames[new2A[(10 * (nlines - 1) + 1):length(new2A)]]) 172 | cat("\n") 173 | } 174 | } 175 | } 176 | if (!is.null(colnames(x))) { 177 | rownames(betahat) <- colnames(x) 178 | } 179 | if (q > 1 & !is.null(colnames(y))) { 180 | colnames(betahat) <- colnames(y) 181 | } 182 | object <- list(x = x, y = y, betahat = betahat, A = A, betamat = betamat, 183 | new2As = new2As, mu = mu, meanx = meanx, normx = normx, 184 | normy = normy, eta = eta, K = K, kappa = kappa, select = select, 185 | fit = fit, projection = pj, scores=plsfit$scores) 186 | class(object) <- "spls" 187 | object 188 | } 189 | 190 | 191 | predict.splsmda <- function (object, newdata, type = "class", ...) 
{
192 |     caret:::requireNamespaceQuietStop("spls")
193 |     tmpPred <- spls:::predict.spls(object, newx = newdata)
194 |     if (type == "raw") 
195 |         return(tmpPred)
196 |     if (is.null(object$probModel)) {
197 |         # softmax
198 |         usedlevs <- 1
199 |         out <- vector("list", N <- length(object$obsLevels))
200 |         for (i in 1:N) {
201 |             levs <- object$obsLevels[[i]]
202 |             nlevs <- length(levs)
203 |             tmpPred2 <- as.data.frame(tmpPred[, (usedlevs):(nlevs+usedlevs-1)])
204 |             out[[i]] <- switch(type, class = {
205 |                 classIndex <- levs[apply(tmpPred2, 1, which.max)]
206 |                 factor(classIndex, levels = levs)
207 |             }, prob = t(apply(tmpPred2, 1, function(data) exp(data)/sum(exp(data))))
208 |             )
209 |             usedlevs <- usedlevs + nlevs
210 |         }
211 |     }
212 |     else {
213 |         # Naive Bayes
214 |         caret:::requireNamespaceQuietStop("klaR")
215 |         usedlevs <- 1
216 |         out <- vector("list", N <- length(object$obsLevels))
217 |         for (i in 1:N) {
218 |             levs <- object$obsLevels[[i]]
219 |             nlevs <- length(levs)
220 |             tmpPred2 <- as.data.frame(tmpPred[, (usedlevs):(nlevs+usedlevs-1)])
221 |             tmpPred2 <- tmpPred2[, -nlevs, drop = FALSE]
222 |             pred <- predict(object$probModel[[i]], tmpPred2)
223 |             out[[i]] <- switch(type, class = pred$class, prob = pred$posterior)
224 |             usedlevs <- usedlevs + nlevs
225 |         }
226 |     }
227 |     names(out) <- names(object$obsLevels)
228 |     out
229 | }
230 | 
231 | 
232 | 
233 | 
234 | 
235 | 
--------------------------------------------------------------------------------
/R/stability.R:
--------------------------------------------------------------------------------
1 | 
2 | #' Stability selection of sparse (s)PLS models
3 | #' via the StARS criterion
4 | #' @export
5 | spls.stars <- function (x, y, fold = 10, K, eta, kappa = 0.5, select = "pls2", 
6 |     fit = "simpls", scale.x = TRUE, scale.y = FALSE, stars.thresh = 0.05, ncores=2, rep.num=20, 
7 |     stars.subsample.ratio = NULL) {
8 |     x <- as.matrix(x)
9 |     n <- nrow(x)
10 |     p <- ncol(x)
11 |     ip <- c(1:p)
12 |     y <- as.matrix(y)
13 |     q <- ncol(y)
14 | 
15 |     if (is.null(stars.subsample.ratio)) {
16 |         if (n > 144) 
17 |             stars.subsample.ratio = 10 * sqrt(n)/n  # StARS default subsample size, 10*sqrt(n)
18 |         if (n <= 144) 
19 |             stars.subsample.ratio = 0.8
20 |     }
21 | 
22 |     mergelist <- vector('list', length(eta))
23 |     ## compute variability
24 |     mergeArr <- array(0, dim=c(p, length(eta), length(K)))
25 |     variability <- matrix(0, length(eta), length(K))
26 | 
27 |     for (j in 1:length(eta)) {
28 |         cat(paste("eta =", eta[j], "\n"))
29 |         merge <- parallel::mclapply(1:rep.num, function(i) {
30 |             ind.sample = sample(c(1:n), floor(n * stars.subsample.ratio), replace = FALSE)
31 |             object <- spls::spls(x[ind.sample, , drop = FALSE], y[ind.sample, 
32 |                 , drop = FALSE], eta = eta[j], kappa = kappa, 
33 |                 K = max(K), select = select, fit = fit, scale.x = scale.x, 
34 |                 scale.y = scale.y, trace = FALSE)
35 |             return(object$betamat)
36 |         }, mc.cores=ncores)
37 |         mergelist[[j]] <- merge
38 |         for (k.ind in 1:length(K)) {
39 |             k <- K[k.ind]
40 |             mm <- sapply(mergelist[[j]][1:rep.num], function(x) sign(abs(x[[k]][,1])))
41 |             mergeArr[,j,k.ind] <- apply(mm, 1, function(x) sum(x)/length(x))  # per-feature selection frequency
42 |             variability[j,k.ind] <- 4 * sum(mergeArr[,j,k.ind] * (1 - mergeArr[,j,k.ind]))/(p)  # StARS-style instability score
43 |         }
44 |     }
45 |     colnames(variability) <- K
46 |     rownames(variability) <- eta
47 |     opt.index = max(which.max(variability >= 
48 |         stars.thresh)[1] - 1, 1)
49 |     return(list(merge=mergeArr, variability=variability, opt.index=opt.index))
50 | }
51 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | compPLS
2 | =======
3 | 
4 | An R package for [sparse] Partial Least Squares discriminant analysis and biplots for compositional data analysis.
5 | 
6 | This package implements the method developed in Lee et al. (2014) [**1**] for classifying independently-sampled microbial compositions by the helminth-infection status of people in Malaysia. Under an assumption of model sparsity (that is, that relatively few microbial populations truly correlate with helminth status), the method yields a factor analysis as well as a classifier.
7 | 
8 | Currently, this package consists of functions for compositionally-robust data transformations that can be applied prior to (sparse) Partial Least Squares discriminant analysis; the PLS-DA functions themselves are wrappers around key functions from the caret package [**2**]. As described in the paper, I've added parameter selection via cross-validation and bootstrap-based p-value calculation for assessing PLS model coefficients (for feature selection). The package also includes ggplot2-based biplot methods for PLS-DA output, and for a few other commonly used projection/ordination/classification methods; this code was forked from vqv's ggbiplot package [**3**].
9 | 
10 | This package is still under development and I am currently adding features, based on some new and exciting microbiome data being generated by the good people at the Loke lab [**4**] and collaborators. Basic installation instructions and examples follow below.
11 | 
12 | 
13 | Installation
14 | ------------
15 | This development package requires the `devtools` package for installation. Additionally, `compPLS` depends on the `caret` and `MASS` packages. Suggested packages are `boot` (for bootstrapping), and `ggplot2`, `grid` and `scales` (for biplots).
16 | 
17 | ```R
18 | library(devtools)
19 | install_github('zdk123/compPLS')
20 | ```
21 | 
22 | Usage
23 | -----
24 | Some minimal examples of running 1) PLS-DA and 2) sparse PLS-DA, with biplots of the results.
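As a quick sanity check of the transformation step (a minimal sketch; by construction, each clr-transformed composition sums to zero):

```R
library(compPLS)
# clr-transform along the row margin (1): each row is one composition/sample
x     <- matrix(exp(rnorm(40)), nrow=8)
x.clr <- clr(x, 1)
rowSums(x.clr)   # approximately zero for every sample
```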
25 | 
26 | ```R
27 | # a (too) low-dimensional example, for code demo purposes only
28 | data(ArcticLake)
29 | # clr transform the data along the row margin (1)
30 | ALake.clr <- clr(ArcticLake[,1:3], 1)
31 | res <- plsDA(ALake.clr, grouping=ArcticLake[,4], K=2)
32 | ggbiplot(res, grouping=ArcticLake[,4], group.ellipse=TRUE, label.loadings=TRUE, label.offset=.2, alpha=.6)
33 | # alternative biplot
34 | biplot(res)
35 | 
36 | ## a higher-dimensional example
37 | set.seed(1100)
38 | data(Hydrochem)
39 | Hchem.clr <- clr(Hydrochem[,6:19], 1)
40 | # try without bootstrapping
41 | res <- plsDA_main(Hchem.clr, grouping=Hydrochem$River, K=8:10, nboots=0) #, n.core=4) # if on a multicore system
42 | ggbiplot(res$plsda, grouping=Hydrochem$River, group.ellipse=TRUE, alpha=.5, plot.loadings=FALSE, label.loadings=TRUE)
43 | optK <- res$plsda$ncomp
44 | 
45 | # do bootstrapping; warning: this can take a long time
46 | res <- plsDA_main(Hchem.clr, grouping=Hydrochem$River, K=optK, nboots=999) #, n.core=4) # if on a multicore system
47 | ```
48 | 
49 | [**1**] http://www.plosntds.org/article/info%3Adoi%2F10.1371%2Fjournal.pntd.0002880
50 | 
51 | [**2**] http://topepo.github.io/caret/index.html
52 | 
53 | [**3**] https://github.com/vqv/ggbiplot
54 | 
55 | [**4**] http://microbiology-parasitology.med.nyu.edu/png-loke
56 | 
57 | 
58 | 
--------------------------------------------------------------------------------
/codexample_clrpca1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zdk123/compPLS/078bda73ae23b3181f4874bea084c2db51ecc7a2/codexample_clrpca1.pdf
--------------------------------------------------------------------------------
/data/ArcticLake.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zdk123/compPLS/078bda73ae23b3181f4874bea084c2db51ecc7a2/data/ArcticLake.rda
--------------------------------------------------------------------------------
/data/Hydrochem.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zdk123/compPLS/078bda73ae23b3181f4874bea084c2db51ecc7a2/data/Hydrochem.rda
--------------------------------------------------------------------------------
/man/alr.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/normalization.R
3 | \name{alr}
4 | \alias{alr}
5 | \alias{alr.default}
6 | \alias{alr.matrix}
7 | \alias{alr.data.frame}
8 | \title{alr The additive log-ratio transformation}
9 | \usage{
10 | alr(x, ...)
11 | 
12 | \method{alr}{default}(
13 |   x,
14 |   divcomp = 1,
15 |   base = exp(1),
16 |   removeDivComp = TRUE,
17 |   tol = .Machine$double.eps
18 | )
19 | 
20 | \method{alr}{matrix}(
21 |   x,
22 |   mar = 2,
23 |   divcomp = 1,
24 |   base = exp(1),
25 |   removeDivComp = TRUE,
26 |   tol = .Machine$double.eps
27 | )
28 | 
29 | \method{alr}{data.frame}(x, mar = 2, ...)
30 | } 31 | \arguments{ 32 | \item{x}{a numeric data vector containing components of a composition} 33 | 34 | \item{...}{additional arguments} 35 | 36 | \item{divcomp}{index of the divisor component} 37 | 38 | \item{base}{base of log to use, default is natural log} 39 | 40 | \item{removeDivComp}{remove divisor component from the resulting data} 41 | 42 | \item{tol}{machine tolerance for a zero count, default is machine tol (.Machine$double.eps)} 43 | } 44 | \value{ 45 | alr transformed \code{x} 46 | } 47 | \description{ 48 | The additive log-ratio transformation for 49 | compositional data (not necessarily closed/normalized!) 50 | } 51 | \details{ 52 | The alr transformation is computed as: 53 | \code{x[i]} = log ( \code{x[i]} / x[D] ) 54 | } 55 | \examples{ 56 | # vector examples: 57 | alr(norm_to_total(1:10)) 58 | alr(1:10) 59 | 60 | # matrix examples: 61 | dmat <- matrix(exp(rnorm(110)), 10) 62 | alr(dmat, 1) 63 | alr(dmat, 2) 64 | } 65 | -------------------------------------------------------------------------------- /man/clr.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/normalization.R 3 | \name{clr} 4 | \alias{clr} 5 | \alias{clr.default} 6 | \alias{clr.matrix} 7 | \alias{clr.data.frame} 8 | \title{clr The centered log-ratio transformation} 9 | \usage{ 10 | clr(x, ...) 11 | 12 | \method{clr}{default}(x, base = exp(1), tol = .Machine$double.eps) 13 | 14 | \method{clr}{matrix}(x, mar = 2, base = exp(1), tol = .Machine$double.eps) 15 | 16 | \method{clr}{data.frame}(x, mar = 2, ...) 17 | } 18 | \arguments{ 19 | \item{x}{a numeric data vector containing components of a composition} 20 | 21 | \item{...}{additional arguments} 22 | 23 | \item{base}{base of log to use, default is natural log} 24 | 25 | \item{tol}{machine tolerance for a zero count, default is machine tol (.Machine$double.eps)} 26 | } 27 | \value{ 28 | clr transformed \code{x} 29 | } 30 | \description{ 31 | The centered log-ratio transformation for 32 | compositional data (not necessarily closed/normalized!) 33 | } 34 | \details{ 35 | The clr is computed as 36 | \code{x[i]} = log (\code{x[i]} / \code{exp(mean(log(x)))}) 37 | } 38 | \examples{ 39 | # vector examples: 40 | clr(norm_to_total(1:10)) 41 | clr(1:10) 42 | 43 | # matrix examples: 44 | dmat <- matrix(exp(rnorm(110)), 10) 45 | clr(dmat, 1) 46 | clr(dmat, 2) 47 | } 48 | -------------------------------------------------------------------------------- /man/compPLS-package.Rd: -------------------------------------------------------------------------------- 1 | \name{compPLS-package} 2 | \alias{compPLS-package} 3 | \alias{compPLS} 4 | \docType{package} 5 | \title{ 6 | A short title line describing what the package does 7 | } 8 | \description{ 9 | A more detailed description of what the package does. A length 10 | of about one to five lines is recommended. 11 | } 12 | \details{ 13 | This section should provide a more detailed overview of how to use the 14 | package, including the most important functions. 15 | } 16 | \author{ 17 | Who wrote it, email optional. 18 | 19 | Maintainer: Your Name 20 | } 21 | \references{ 22 | This optional section can contain literature or other references for 23 | background information. 24 | } 25 | % Optionally other standard keywords, one per line, 26 | % from the file KEYWORDS in the R documentation. 
27 | \keyword{ package } 28 | \seealso{ 29 | Optional links to other man pages 30 | } 31 | \examples{ 32 | # Optional simple examples of the most important functions 33 | } 34 | -------------------------------------------------------------------------------- /man/ggbiplot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ggbiplot.R 3 | \name{ggbiplot} 4 | \alias{ggbiplot} 5 | \alias{ggbiplot.princomp} 6 | \alias{ggbiplot.prcomp} 7 | \alias{ggbiplot.lda} 8 | \alias{ggbiplot.plsda} 9 | \alias{ggbiplot.splsda} 10 | \alias{ggbiplot.matrix} 11 | \alias{ggbiplot.default} 12 | \title{methods for making biplots from various projection & classification models} 13 | \usage{ 14 | ggbiplot(xobj, ...) 15 | 16 | \method{ggbiplot}{princomp}(xobj, ...) 17 | 18 | \method{ggbiplot}{prcomp}(xobj, ...) 19 | 20 | \method{ggbiplot}{lda}(xobj, ...) 21 | 22 | \method{ggbiplot}{plsda}(xobj, Yplot = FALSE, ...) 23 | 24 | \method{ggbiplot}{splsda}(xobj, ...) 25 | 26 | \method{ggbiplot}{matrix}(xobj, ...) 27 | 28 | \method{ggbiplot}{default}( 29 | xobj, 30 | grouping, 31 | select = 1:2, 32 | circle = FALSE, 33 | circle.prob = 0.69, 34 | plot.loadings = TRUE, 35 | label.loadings = FALSE, 36 | sub.loadings = 1:nrow(xobj$loadings), 37 | label.offset = 0, 38 | label.size = 4.5, 39 | scale.loadings = 1, 40 | col.loadings = scales::muted("red"), 41 | alpha = 1, 42 | col = grouping, 43 | shape = NULL, 44 | group.ellipse = FALSE, 45 | scale.ellipse = 1, 46 | group.cloud = FALSE, 47 | xlab = "", 48 | ylab = "", 49 | equalcoord = TRUE, 50 | size = 3, 51 | size.loadings = 1, 52 | loadingsOnTop = FALSE 53 | ) 54 | } 55 | \arguments{ 56 | \item{xobj}{The object to be plotted} 57 | 58 | \item{grouping}{an optional grouping vector (ie - for coloring points)} 59 | 60 | \item{select}{index of components to be plotted (must be length 2)} 61 | 62 | \item{circle}{enclose points in a circle} 63 | 64 | \item{circle.prob}{controls circle diameter (scales data std dev) if \code{circle = TRUE}} 65 | 66 | \item{plot.loadings}{should loading vectors be plotted} 67 | 68 | \item{label.loadings}{text of loadings labels, taken from rownames of loadings (depends on class of \code{xobj})} 69 | 70 | \item{label.offset}{absolute offset for loading labels, so labels don't cover loadings vectors} 71 | 72 | \item{scale.loadings}{scale length of loading vectors for plotting purposes} 73 | 74 | \item{col.loadings}{a single value of vector for color of loadings} 75 | 76 | \item{alpha}{controls relative transparency of various plot features} 77 | 78 | \item{col}{color factor for points} 79 | 80 | \item{group.ellipse}{enclose within-group points in an covariance ellipse} 81 | 82 | \item{scale.ellipse}{scale \code{group.ellipse} to 1 standard deviation} 83 | 84 | \item{group.cloud}{connect within-group points to a group mean point with a straight edge} 85 | 86 | \item{xlab}{label for x axis} 87 | 88 | \item{ylab}{label for y axis} 89 | 90 | \item{equalcoord}{equal coordinates, ie should the plot area be square?} 91 | 92 | \item{size}{point size} 93 | 94 | \item{size.loadings}{line width of loading vectors} 95 | } 96 | \description{ 97 | Pretty biplots using ggplots 98 | } 99 | \details{ 100 | additional plotting attributes (eg colors, themes, etc) can be chained on in the usual way for ggplots 101 | } 102 | \examples{ 103 | # an LDA example with iris data 104 | ldamod <- lda(iris[,1:4], grouping=iris[,5]) 105 | ggbiplot(ldamod, 
grouping=iris[,5], alpha=.7, group.cloud=TRUE) + theme_bw()
106 | }
107 | 
--------------------------------------------------------------------------------
/man/hello.Rd:
--------------------------------------------------------------------------------
1 | \name{hello}
2 | \alias{hello}
3 | \title{
4 | A simple function doing little
5 | }
6 | \description{
7 | This function shows a standard text on the console. In a time-honoured
8 | tradition, it defaults to displaying \emph{hello, world}.
9 | }
10 | \examples{
11 | hello()
12 | hello("and goodbye")
13 | }
14 | 
--------------------------------------------------------------------------------
/man/plsDA.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/PLSDA.R
3 | \name{plsDA}
4 | \alias{plsDA}
5 | \title{plsDA partial least squares discriminant analysis}
6 | \usage{
7 | plsDA(x, grouping, K, usePriors = FALSE, plsfun = caret::plsda, ...)
8 | }
9 | \arguments{
10 | \item{x}{data with samples in rows, features are columns (not necessarily compositional data)}
11 | 
12 | \item{grouping}{a numeric vector or factor with sample classes (length should equal \code{nrow(x)})}
13 | 
14 | \item{K}{number of components in the PLS model (default: number of classes - 1)}
15 | 
16 | \item{usePriors}{use priors for very biased sample size between groups (ie - put strong penalty on misclassifying small groups)}
17 | }
18 | \value{
19 | a plsda fitted model
20 | }
21 | \description{
22 | Partial Least Squares Discriminant Analysis
23 | PLS regression to discriminate classes (via a logistic model)
24 | basically this is a wrapper for the \code{plsda} function in the caret package,
25 | but with default setup for dealing with uneven classes (via the priors option, see details)
26 | see caret::plsda for implementation details
27 | }
28 | \details{
29 | run this code if you don't need to fit parameters by cross-validation
30 | }
31 | \seealso{
32 | \code{\link{plsDA_main}}, \code{\link[caret]{plsda}}
33 | }
34 | 
--------------------------------------------------------------------------------
/man/plsDA_main.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/PLSDA.R, R/SPLSDA.R
3 | \name{plsDA_main}
4 | \alias{plsDA_main}
5 | \alias{splsDA_main}
6 | \title{plsDA_main partial least squares discriminant analysis}
7 | \usage{
8 | plsDA_main(
9 |   x,
10 |   grouping,
11 |   K,
12 |   usePriors = FALSE,
13 |   fold = 5,
14 |   nboots = 999,
15 |   n.core = 4,
16 |   noise = 0,
17 |   ...
18 | )
19 | 
20 | splsDA_main(
21 |   x,
22 |   grouping,
23 |   eta,
24 |   K,
25 |   usePriors = FALSE,
26 |   fold = 5,
27 |   nboots = 999,
28 |   n.core = 4,
29 |   noise = 0,
30 |   ...
31 | )
32 | }
33 | \arguments{
34 | \item{x}{data with samples in rows, features are columns (not necessarily compositional data)}
35 | 
36 | \item{grouping}{a numeric vector or factor with sample classes (length should equal \code{nrow(x)})}
37 | 
38 | \item{K}{numeric vector containing number of components in the PLS model}
39 | 
40 | \item{usePriors}{use priors for very biased sample size between groups (ie - put strong penalty on misclassifying small groups)}
41 | 
42 | \item{fold}{number of partitions to randomly subsample for cross-validation}
43 | 
44 | \item{nboots}{number of bootstraps/permutations for estimating coefficient p-vals}
45 | 
46 | \item{n.core}{number of cores for parallelization of bootstraps}
47 | 
48 | \item{noise}{for very sparse components, some subsamples may have zero variance. Optionally, add some Gaussian noise to avoid PLS errors}
49 | 
50 | \item{...}{additional arguments passed to plsDA}
51 | }
52 | \value{
53 | a \code{plsDA} object that contains: the plsda model/object, \code{pvals}, the original data, \code{x}, and \code{groupings}
54 | }
55 | \description{
56 | The main wrapper for full Partial Least Squares discriminant analysis,
57 | performing cross-validation to tune model parameters (here, number of components)
58 | and do permutation tests (ie bootstrapping) to get pseudo-pval estimates for model coefficients
59 | 
60 | The main wrapper for full sparse Partial Least Squares discriminant analysis,
61 | performing cross-validation to tune model parameters (here, number of components)
62 | and do permutation tests (ie bootstrapping) to get pseudo-pval estimates for model coefficients
63 | }
64 | \seealso{
65 | \code{\link{plsDA}}
66 | }
67 | 
--------------------------------------------------------------------------------
/man/spls.stars.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/stability.R
3 | \name{spls.stars}
4 | \alias{spls.stars}
5 | \title{Stability selection of sparse (s)PLS models
6 | via the StARS criterion}
7 | \usage{
8 | spls.stars(
9 |   x,
10 |   y,
11 |   fold = 10,
12 |   K,
13 |   eta,
14 |   kappa = 0.5,
15 |   select = "pls2",
16 |   fit = "simpls",
17 |   scale.x = TRUE,
18 |   scale.y = FALSE,
19 |   stars.thresh = 0.05,
20 |   ncores = 2,
21 |   rep.num = 20,
22 |   stars.subsample.ratio = NULL
23 | )
24 | }
25 | \description{
26 | Stability selection of sparse (s)PLS models
27 | via the StARS criterion
28 | }
29 | 
--------------------------------------------------------------------------------
/man/splsDA.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/SPLSDA.R
3 | \name{splsDA}
4 | \alias{splsDA}
5 | \title{splsDA sparse partial least squares discriminant analysis}
6 | \usage{
7 | splsDA(x, grouping, eta, K, usePriors = FALSE, ...)
8 | }
9 | \arguments{
10 | \item{x}{data with samples in rows, features are columns (not necessarily compositional data)}
11 | 
12 | \item{grouping}{a numeric vector or factor with sample classes (length should equal \code{nrow(x)})}
13 | 
14 | \item{eta}{parameter that adjusts sparsity of the PLS model (between 0 and 1)}
15 | 
16 | \item{K}{number of components in the PLS model (default: number of classes - 1)}
17 | 
18 | \item{usePriors}{use priors for very biased sample size between groups (ie - put strong penalty on misclassifying small groups)}
19 | 
20 | }
21 | \value{
22 | a plsda fitted model
23 | }
24 | \description{
25 | sparse Partial Least Squares Discriminant Analysis
26 | sPLS regression to discriminate classes (via a logistic model)
27 | basically this is a wrapper for the \code{splsda} function in the caret package,
28 | but with default setup for dealing with uneven classes (via the priors option, see details)
29 | see caret::splsda for implementation details
30 | }
31 | \details{
32 | run this code if you don't need to fit parameters by cross-validation
33 | }
34 | \seealso{
35 | \code{\link{plsDA_main}}, \code{\link[caret]{plsda}}, \code{\link[caret]{splsda}}
36 | }
37 | 
--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | test_check("compPLS")
3 | 
--------------------------------------------------------------------------------
/tests/testthat/test_norms_and_transforms.R:
--------------------------------------------------------------------------------
1 | context("norms and transforms")
2 | 
3 | test_that("total sum norms add up", {
4 |     rvec <- exp(rnorm(10))
5 |     expect_equal(sum(norm_to_total(rvec)), 1)
6 |     expect_equal(sum(norm_pseudo(rvec)), 1)
7 | })
8 | 
9 | 
10 | test_that("clr vector output is expected", {
11 |     testvec1 <- 1:10
12 |     testvecrand <- runif(10, 0, 1)
13 |     # test clr function and various equivalent definitions of the clr
14 |     expect_equal(clr(testvec1), log(testvec1/prod(testvec1)^(1/length(testvec1))))
15 |     expect_equal(clr(testvec1), log(testvec1/exp(mean(log(testvec1)))))
16 |     expect_equal(clr(testvec1), log(testvec1) - mean(log(testvec1)))
17 | 
18 |     expect_equal(clr(testvecrand), log(testvecrand/prod(testvecrand)^(1/length(testvecrand))))
19 |     expect_equal(clr(testvecrand), log(testvecrand/exp(mean(log(testvecrand)))))
20 |     expect_equal(clr(testvecrand), log(testvecrand) - mean(log(testvecrand)))
21 | 
22 |     ## test presence of zero component
23 |     expect_equal(clr(c(0, testvecrand))[1], 0)
24 | 
25 |     ## test that a composition doesn't need to be closed
26 |     expect_equal(clr(testvec1), clr(norm_to_total(testvec1)))
27 |     expect_equal(clr(testvecrand), clr(norm_to_total(testvecrand)))
28 | })
29 | 
30 | test_that("clr matrix output is expected", {
31 | 
32 |     testmat1 <- matrix(1:110, 10)
33 |     testmatrand <- matrix(runif(110, 0, 1), 10)
34 | 
35 |     expect_equal(clr(testmat1, 1), t(apply(testmat1, 1, clr.default)))
36 |     expect_equal(clr(testmat1, 2), apply(testmat1, 2, clr.default))
37 | 
38 |     expect_equal(clr(testmatrand, 1), t(apply(testmatrand, 1, clr.default)))
39 |     expect_equal(clr(testmatrand, 2), apply(testmatrand, 2, clr.default))
40 | 
41 |     ## test zero counts
42 |     expect_equal(clr(rbind(0, testmatrand), 2)[1,], rep(0, ncol(testmatrand)))
43 | 
44 |     ## test that a composition doesn't need to be closed
45 |     expect_equal(clr(testmat1, 2), clr(apply(testmat1, 2, norm_to_total), 2))
46 |     expect_equal(clr(testmatrand, 2), 
clr(apply(testmatrand, 2, norm_to_total), 2))
47 | 
48 | })
49 | 
50 | 
51 | test_that("clr data.frame output is expected", {
52 |     testmatrand <- matrix(runif(110, 0, 1), 10)  # define locally: objects from other test_that blocks are out of scope
53 |     testdf <- as.data.frame(testmatrand)
54 |     colnames(testmatrand) <- colnames(testdf)
55 |     expect_equal(clr(testdf, 2), clr(testmatrand, 2))
56 |     expect_equal(clr(testdf, 1), clr(testmatrand, 1))
57 | })
58 | 
59 | 
60 | 
61 | test_that("alr vector output is expected", {
62 |     testvec1 <- 1:10
63 |     testvecrand <- runif(10, 0, 1)
64 |     # test alr function and various equivalent definitions of the alr
65 |     expect_equal(alr(testvec1, divcomp=1), log(testvec1/testvec1[1])[-1])
66 |     expect_equal(alr(testvec1, divcomp=1), log(testvec1)[-1] - log(testvec1[1]))
67 |     expect_equal(alr(testvec1, divcomp=5), log(testvec1/testvec1[5])[-5])
68 |     expect_equal(alr(testvec1, divcomp=5), log(testvec1)[-5] - log(testvec1[5]))
69 | 
70 |     expect_equal(alr(testvecrand, divcomp=1), log(testvecrand/testvecrand[1])[-1])
71 |     expect_equal(alr(testvecrand, divcomp=1), log(testvecrand)[-1] - log(testvecrand[1]))
72 |     expect_equal(alr(testvecrand, divcomp=5), log(testvecrand/testvecrand[5])[-5])
73 |     expect_equal(alr(testvecrand, divcomp=5), log(testvecrand)[-5] - log(testvecrand[5]))
74 | 
75 | 
76 | 
77 |     ## test presence of zero component
78 |     expect_equal(alr(c(0, testvecrand), divcomp=2)[1], 0)
79 | 
80 |     ## test that a composition doesn't need to be closed
81 |     expect_equal(alr(testvec1), alr(norm_to_total(testvec1)))
82 |     expect_equal(alr(testvecrand), alr(norm_to_total(testvecrand)))
83 | })
84 | 
85 | test_that("alr matrix output is expected", {
86 | 
87 |     testmat1 <- matrix(1:110, 10)
88 |     testmatrand <- matrix(runif(110, 0, 1), 10)
89 | 
90 |     expect_equal(alr(testmat1, 1), t(apply(testmat1, 1, alr.default)))
91 |     expect_equal(alr(testmat1, 2), apply(testmat1, 2, alr.default))
92 | 
93 |     expect_equal(alr(testmatrand, 1), t(apply(testmatrand, 1, alr.default)))
94 |     expect_equal(alr(testmatrand, 2), apply(testmatrand, 2, alr.default))
95 | 
96 |     ## test zero counts
97 |     expect_equal(alr(cbind(0, testmatrand), 1, divcomp=2)[,1], rep(0, nrow(testmatrand)))
98 |     expect_equal(alr(rbind(0, testmatrand), 2, divcomp=2)[1,], rep(0, ncol(testmatrand)))
99 | 
100 |     ## test that a composition doesn't need to be closed
101 |     expect_equal(alr(testmat1, 2), alr(apply(testmat1, 2, norm_to_total), 2))
102 |     expect_equal(alr(testmatrand, 2), alr(apply(testmatrand, 2, norm_to_total), 2))
103 | 
104 | })
105 | 
106 | 
107 | test_that("alr data.frame output is expected", {
108 |     testmatrand <- matrix(runif(110, 0, 1), 10)  # define locally, as above
109 |     testdf <- as.data.frame(testmatrand)
110 |     colnames(testmatrand) <- colnames(testdf)
111 |     expect_equal(alr(testdf, 2), alr(testmatrand, 2))
112 |     expect_equal(alr(testdf, 1), alr(testmatrand, 1))
113 | })
114 | 
115 | 
116 | 
117 | 
--------------------------------------------------------------------------------
/tests/testthat/test_plsda.R:
--------------------------------------------------------------------------------
1 | context("partial least squares discriminant analysis")
2 | 
3 | 
4 | irisdat <- clr(iris[,1:4], 1)
5 | y <- iris[,5]
6 | mod <- plsDA(irisdat, y)
7 | 
8 | test_that("plsDA returns correct class", {
9 | 
10 |     expect_s3_class(mod, 'plsda')
11 |     expect_s3_class(mod, 'mvr')
12 | 
13 | })
14 | 
--------------------------------------------------------------------------------
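An additional smoke test could exercise prediction as well as fitting. A minimal sketch for `tests/testthat/test_plsda.R` (it assumes caret's `predict.plsda` method, whose `type` argument defaults to class predictions):

```R
test_that("plsDA predicts training samples sensibly", {
    preds <- predict(mod, irisdat)             # caret's predict.plsda; type = "class" by default
    expect_equal(length(preds), nrow(irisdat))
    expect_true(all(preds %in% levels(y)))
    # in-sample accuracy should easily beat chance on three balanced classes
    expect_gt(mean(preds == y), 1/3)
})
```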