├── NAMESPACE
├── MultiPowerUsersGuide_v2.pdf
├── MultiML
│   ├── MultiMLUsersGuide.pdf
│   ├── templates
│   │   └── submit_sh.txt
│   ├── README.md
│   └── FoMPredictiveFunctions.R
├── DESCRIPTION
├── man
│   ├── CohenFilter.Rd
│   ├── powerPlot.Rd
│   ├── postMultiPower.Rd
│   ├── optimalRep.Rd
│   ├── MultiPower.Rd
│   └── MultiGroupPower.Rd
├── README.md
├── MultiPower_Examples
│   ├── TCGA_Example.R
│   └── STATegra_Example.R
└── R
    └── MultiOmicsPower15.R
/NAMESPACE:
--------------------------------------------------------------------------------
1 | exportPattern("^[[:alpha:]]+")
2 | 
--------------------------------------------------------------------------------
/MultiPowerUsersGuide_v2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ConesaLab/MultiPower/HEAD/MultiPowerUsersGuide_v2.pdf
--------------------------------------------------------------------------------
/MultiML/MultiMLUsersGuide.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ConesaLab/MultiPower/HEAD/MultiML/MultiMLUsersGuide.pdf
--------------------------------------------------------------------------------
/MultiML/templates/submit_sh.txt:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | #SBATCH --array=0-{{{max_node}}}
4 | #SBATCH --job-name={{{jobname}}}
5 | #SBATCH --output=slurm_%a.out
6 | #SBATCH --time={{{time}}}-00:00:00
7 | {{#flags}}
8 | #SBATCH --{{{name}}}
9 | {{/flags}}
10 | {{#options}}
11 | #SBATCH --{{{name}}}={{{value}}}
12 | {{/options}}
13 | module load R
14 | Rscript --vanilla SlurmCreator_Run.R
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: MultiPower
2 | Type: Package
3 | Title: Estimation of sample size in multi-omic experiments
4 | Version: 1.0
5 | Author: Sonia Tarazona; David Gómez-Cabrero
6 | Maintainer: Sonia Tarazona
7 | Description: In multi-omic experiments, MultiPower estimates the sample size needed to achieve a given statistical power per omic and globally. Parameters required to estimate power can either be set by users or estimated from provided pilot data (recommended). MultiPower accepts count data, normally distributed data or binary data.
8 | License: GPL-2
9 | Encoding: UTF-8
10 | LazyData: true
11 | NeedsCompilation: no
12 | Packaged: 2019-09-18 09:04:40 UTC; sotacam
13 | 
--------------------------------------------------------------------------------
/man/CohenFilter.Rd:
--------------------------------------------------------------------------------
1 | \name{CohenFilter}
2 | \alias{CohenFilter}
3 | \title{
4 | Removal of low-effect features.
5 | }
6 | \description{
7 | Removal of omic features with Cohen’s d (or h) below the threshold set by the user.
8 | }
9 | \usage{
10 | CohenFilter(data, d, parameters)
11 | }
12 | %- maybe also 'usage' for other objects documented here.
13 | \arguments{
14 | \item{data}{
15 | Original data given to the MultiPower function.
16 | }
17 | \item{d}{
18 | Cutoff value for Cohen’s d (or h). Either a scalar (in which case the same cutoff is applied to all the omics) or a vector with as many values as omic data types in the data object.
19 | }
20 | \item{parameters}{
21 | List with as many elements as omic data types, as estimated by the MultiPower function. They can be retrieved from the MultiPower output as output$parameters.
22 | }
23 | }
24 | \details{
25 | %% ~~ If necessary, more details than the description above ~~
26 | }
27 | \value{
28 | %% ~Describe the value returned
29 | %% If it is a LIST, use
30 | %% \item{comp1 }{Description of 'comp1'}
31 | %% \item{comp2 }{Description of 'comp2'}
32 | %% ...
33 | }
34 | \references{
35 | %% ~put references to the literature/web site here ~
36 | }
37 | \author{
38 | Sonia Tarazona; David Gómez-Cabrero
39 | }
40 | \note{
41 | %% ~~further notes~~
42 | }
43 | 
44 | %% ~Make other sections like Warning with \section{Warning }{....} ~
45 | 
46 | \seealso{
47 | \code{\link{MultiPower}}
48 | }
49 | \examples{
50 | %%
51 | }
52 | % Add one or more standard keywords, see file 'KEYWORDS' in the
53 | % R documentation directory.
54 | %\keyword{ ~kwd1 }% use one of RShowDoc("KEYWORDS")
55 | %\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line
56 | 
--------------------------------------------------------------------------------
/man/powerPlot.Rd:
--------------------------------------------------------------------------------
1 | \name{powerPlot}
2 | \alias{powerPlot}
3 | %- Also NEED an '\alias' for EACH other topic documented here.
4 | \title{
5 | Power plots.
6 | }
7 | \description{
8 | Plotting statistical power versus sample size per omic from the optimal sample size results,
9 | and statistical power versus Cohen's d cutoffs (in percentiles).
10 | }
11 | \usage{
12 | powerPlot(parameters, optimalSampleSize, omicCol = NULL)
13 | }
14 | %- maybe also 'usage' for other objects documented here.
15 | \arguments{
16 | \item{parameters}{
17 | List with as many elements as omic data types. For each omic, each element of this list is another list containing the different parameters needed to compute power.
18 | See \code{MultiPower} results for more details.
19 | }
20 | \item{optimalSampleSize}{
21 | R object containing the results of \code{optimalRep}.
22 | }
23 | \item{omicCol}{
24 | The color that will be used to plot each omic. It must be a vector with length equal to the number of omics. If it is NULL (default), default colors are used.
25 | }
26 | }
27 | \details{
28 | %% ~~ If necessary, more details than the description above ~~
29 | }
30 | \value{
31 | List containing the values used for the plot or plots.
32 | }
33 | \references{
34 | %% ~put references to the literature/web site here ~
35 | }
36 | \author{
37 | Sonia Tarazona
38 | }
39 | \note{
40 | %% ~~further notes~~
41 | }
42 | 
43 | %% ~Make other sections like Warning with \section{Warning }{....} ~
44 | 
45 | \seealso{
46 | %% ~~objects to See Also as \code{\link{help}}, ~~~
47 | }
48 | \examples{
49 | }
50 | % Add one or more standard keywords, see file 'KEYWORDS' in the
51 | % R documentation directory.
52 | %\keyword{ plot }% use one of RShowDoc("KEYWORDS")
53 | %\keyword{ power }% __ONLY ONE__ keyword per line
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MultiPower and MultiML: statistical power studies for multi-omics experiments
2 | 
3 | ## MultiPower
4 | 
5 | The MultiPower R method performs statistical power studies for multi-omics experiments,
6 | and is designed to assist users in experimental design as well as in the evaluation of already-generated multi-omics datasets.
7 | More details on the method can be found in our manuscript [[1]](#1) and in the
8 | [MultiPower User’s Guide](https://github.com/ConesaLab/MultiPower/blob/master/MultiPowerUsersGuide_v2.pdf).
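
After installing the package (see the instructions below), a typical call provides a list of pilot data matrices (features in rows, samples in columns), a list of group labels, and a vector coding the data type of each omic. The following minimal sketch uses simulated pilot data and illustrative argument values only; see the User’s Guide and the package documentation for real analyses:

```
# Hypothetical pilot data set: 1000 genes (counts) and 500 proteins (normal data),
# 3 samples per experimental group. Data are simulated only to illustrate the call.
library(MultiPower)
set.seed(123)
pilotdata = list(RNAseq     = matrix(rnbinom(1000*6, mu = 100, size = 1), nrow = 1000),
                 Proteomics = matrix(rnorm(500*6), nrow = 500))
pilotdata$RNAseq[1:100, 4:6]    = pilotdata$RNAseq[1:100, 4:6] + 50   # add some signal
pilotdata$Proteomics[1:50, 4:6] = pilotdata$Proteomics[1:50, 4:6] + 1
pilotgroups = list(RNAseq     = rep(c("Group1", "Group2"), each = 3),
                   Proteomics = rep(c("Group1", "Group2"), each = 3))

mypower = MultiPower(data = pilotdata, groups = pilotgroups, type = c(1, 2),
                     omicPower = 0.6, averagePower = 0.85, fdr = 0.05,
                     equalSize = TRUE, max.size = 200)
mypower$summary                 # summary table per omic
mypower$optimalSampleSize$n     # estimated optimal sample size
```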
9 | 
10 | MultiPower is available as an R package, and can be installed as follows:
11 | 
12 | ```
13 | install.packages("devtools")
14 | devtools::install_github("ConesaLab/MultiPower")
15 | ```
16 | 
17 | ### Installing MultiPower dependencies
18 | 
19 | Some dependencies are required before running MultiPower. They can be installed from R via
20 | `install.packages()` and loaded with `library()`:
21 | 
22 | - FDRsampsize
23 | - lpSolve
24 | 
25 | 
26 | 
27 | 
28 | ## MultiML
29 | 
30 | The MultiML method is included as a complementary tool to MultiPower,
31 | and is designed to help users determine the optimal sample size required to control for
32 | classification error rates when using one or more omics datasets.
33 | Details on the MultiML algorithm and its applications can be found in our manuscript [[1]](#1).
34 | 
35 | If you are interested in using MultiML for your research, please see the [MultiML folder](https://github.com/ConesaLab/MultiPower/tree/master/MultiML)
36 | for scripts and instructions. For detailed information on how to run the tool, please read the
37 | [MultiML User’s Guide](https://github.com/ConesaLab/MultiPower/blob/master/MultiML/MultiMLUsersGuide.pdf).
38 | 
39 | 
40 | 
41 | ## Citation
42 | 
43 | If you are using MultiPower or MultiML in your research, please cite the following publication:
44 | 
45 | [1]
46 | Tarazona, S., Balzano-Nogueira, L., Gómez-Cabrero, D. et al.
47 | Harmonization of quality metrics and power calculation in multi-omic studies.
48 | Nat Commun 11, 3092 (2020). https://doi.org/10.1038/s41467-020-16937-8
49 | 
50 | 
--------------------------------------------------------------------------------
/man/postMultiPower.Rd:
--------------------------------------------------------------------------------
1 | \name{postMultiPower}
2 | \alias{postMultiPower}
3 | \title{
4 | Power study for different sample sizes.
5 | }
6 | \description{
7 | When the optimal sample size estimated by MultiPower exceeds the available budget, an alternative solution is to decrease the sample size at the cost of removing features
8 | with low effect size (magnitude of change). This function answers the question of which effect size users must restrict themselves to for a given maximum sample size.
9 | Pilot multi-omic data sets are needed to perform this analysis.
10 | }
11 | \usage{
12 | postMultiPower(optResults, max.size = 5, omicCol = NULL)
13 | }
14 | %- maybe also 'usage' for other objects documented here.
15 | \arguments{
16 | \item{optResults}{
17 | R object containing the results of the \code{MultiPower} function.
18 | }
19 | \item{max.size}{
20 | Maximum sample size allowed by the user. It will be used to determine the effect size that can be detected (by default, 5).
21 | }
22 | \item{omicCol}{
23 | The color that will be used to plot each omic. It must be a vector with length equal to the number of omics. If it is NULL (default), default colors are used.
24 | } 25 | } 26 | \details{ 27 | %% ~~ If necessary, more details than the description above ~~ 28 | } 29 | \value{ 30 | This function returns a list with the following elements: 31 | \item{SampleSize }{Matrix containing the optimal sample size for each omic data type (in columns) and for different cutoffs of Cohen’s d (in rows).} 32 | \item{Power }{Matrix containing the statistical power at the optimal sample size for each omic data type (in columns) and for different cutoffs of Cohen’s d (in rows).} 33 | \item{NumFeat }{Matrix containing the number of remaining features for each omic data type (in columns) and for different cutoffs of Cohen’s d (in rows).} 34 | \item{d }{Values of Cohen’s d used as cutoffs to remove low effect size features.} 35 | In addition, it also returns two plots that summarize these results: first, the number of replicates (sample size) for each of the tested Cohen's d values; 36 | second, the statistical power for each omic at the previously obtained optimal sample size and at the current sample size set by the user. 37 | } 38 | \references{ 39 | %% ~put references to the literature/web site here ~ 40 | } 41 | \author{ 42 | Sonia Tarazona 43 | } 44 | \note{ 45 | %% ~~further notes~~ 46 | } 47 | 48 | %% ~Make other sections like Warning with \section{Warning }{....} ~ 49 | 50 | \seealso{ 51 | %% ~~objects to See Also as \code{\link{help}}, ~~~ 52 | } 53 | \examples{ 54 | } 55 | % Add one or more standard keywords, see file 'KEYWORDS' in the 56 | % R documentation directory. 57 | %\keyword{ ~kwd1 }% use one of RShowDoc("KEYWORDS") 58 | %\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line 59 | -------------------------------------------------------------------------------- /man/optimalRep.Rd: -------------------------------------------------------------------------------- 1 | \name{optimalRep} 2 | \alias{optimalRep} 3 | \title{ 4 | Estimation of optimal sample size. 5 | } 6 | \description{ 7 | Estimation of the optimal sample size when pilot multi-omic data sets are not available. 8 | } 9 | \usage{ 10 | optimalRep(parameters, omicPower = 0.6, averagePower = 0.85, fdr = 0.05, cost = 1, equalSize = TRUE, max.size = 200) 11 | } 12 | %- maybe also 'usage' for other objects documented here. 13 | \arguments{ 14 | \item{parameters}{ 15 | List with as many elements as omic data types. For each omic, each element of this list is another list containing the different parameters needed to compute power which, in this case, must be set by the user. See \code{MultiPower} for more details. 16 | } 17 | \item{omicPower}{ 18 | The minimum power that must be achieved for each omic. It must be a vector with length equal to the number of omics. If it is a single number, this same number will be used for all the omics. By default, omicPower = 0.6. 19 | } 20 | \item{averagePower}{ 21 | The minimum average power that must be globally achieved. By default, averagePower = 0.85. 22 | } 23 | \item{fdr}{ 24 | False Discovery Rate level to be used. It is the significance level after multiple testing correction. By default, fdr = 0.05.} 25 | \item{cost}{ 26 | The cost to generate a replicate (a sample) for each omic. It must be a vector with length equal to the number of omics. If it is a single number, this same number will be used for all the omics. This argument will only be used when a different sample size per omic is allowed. By default, cost = 1 (which means that all the omics will be assumed to have the same cost). 
27 | } 28 | \item{equalSize}{ 29 | If TRUE (default), the same optimal sample size will be estimated for all the omics. If FALSE, omics are allowed to have different sample sizes. 30 | } 31 | \item{max.size}{ 32 | Maximum allowed sample size. By default, max.size = 30. 33 | } 34 | } 35 | \details{ 36 | %% ~~ If necessary, more details than the description above ~~ 37 | } 38 | \value{ 39 | %% ~Describe the value returned 40 | %% If it is a LIST, use 41 | %% \item{comp1 }{Description of 'comp1'} 42 | %% \item{comp2 }{Description of 'comp2'} 43 | %% ... 44 | } 45 | \references{ 46 | %% ~put references to the literature/web site here ~ 47 | } 48 | \author{ 49 | Sonia Tarazona; David Gómez-Cabrero 50 | } 51 | \note{ 52 | %% ~~further notes~~ 53 | } 54 | 55 | %% ~Make other sections like Warning with \section{Warning }{....} ~ 56 | 57 | \seealso{ 58 | \code{\link{MultiPower}} 59 | } 60 | \examples{ 61 | optimalSS = optimalRep(parameters = myparam, omicPower = 0.6, 62 | averagePower = 0.8, fdr = 0.05, 63 | cost = 1, equalSize = TRUE, max.size = 30) 64 | optimalSS$n # optimal sample size 65 | } 66 | % Add one or more standard keywords, see file 'KEYWORDS' in the 67 | % R documentation directory. 68 | %\keyword{ ~kwd1 }% use one of RShowDoc("KEYWORDS") 69 | %\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line 70 | -------------------------------------------------------------------------------- /man/MultiPower.Rd: -------------------------------------------------------------------------------- 1 | \name{MultiPower} 2 | \alias{MultiPower} 3 | \title{ 4 | Optimal sample size estimation and power study. 5 | } 6 | \description{ 7 | MultiPower computes the optimal sample size for a multi-omic experiment when pilot multi-omic data sets are available for estimating the parameters required to compute power. An optimization problem is solved in order to achieve the desired power while minimizing the cost of the experiment. 8 | } 9 | \usage{ 10 | MultiPower(data, groups, type, omicPower = 0.6, averagePower = 0.85, null.effect = 0, 11 | fdr = 0.05, cost = 1, equalSize = TRUE, max.size = 200, omicCol = NULL, powerPlots = TRUE) 12 | } 13 | %- maybe also 'usage' for other objects documented here. 14 | \arguments{ 15 | \item{data}{ 16 | List with as many elements as omic data types. The names of the omics should be the names of the list. Each element in this list must be a raw count data matrix, and in this case MultiPower will take into account the library sizes to estimate power; a normally distributed data matrix which must have been already pre-processed and normalized; or a binary data matrix (with 0/1 or TRUE/FALSE values). In any case, for each one of these matrices, rows must correspond to omic features (genes, methylation sites, ChIP-seq peaks, etc.) and columns to observations (biological samples, patients, etc.). 17 | } 18 | \item{groups}{ 19 | List with as many elements as omic data types. The names of the omics should be the names of the list. Each element in this list must be a vector with length equal to the number of observations for that omic in data argument. Each element of this vector must indicate the experimental group where each observation belong. Only two experimental groups are allowed. 20 | } 21 | \item{type}{ 22 | Vector with length equal to the number of omic data types. Each element of this vector must be a 1, 2 or 3 to indicate whether the omic data are count data (1), continuous data approximately following a normal distribution (2) or binary data (3). 
23 | } 24 | \item{null.effect}{ 25 | Value of the effect size that corresponds to null hypothesis. By default, 0. 26 | } 27 | \item{omicPower}{ 28 | The minimum power that must be achieved for each omic. It must be a vector with length equal to the number of omics. If it is a single number, this same number will be used for all the omics. By default, omicPower = 0.6. 29 | } 30 | \item{averagePower}{ 31 | The minimum average power that must be globally achieved. By default, averagePower = 0.85. 32 | } 33 | \item{fdr}{ 34 | False Discovery Rate level to be used. It is the significance level after multiple testing correction. By default, fdr = 0.05. If no multiple testing correction is to be applied, this argument must be set to NULL and then alpha argument is required. 35 | } 36 | \item{cost}{ 37 | The cost to generate a replicate (a sample) for each omic. It must be a vector with length equal to the number of omics. If it is a single number, this same number will be used for all the omics. This argument will only be used when a different sample size per omic is allowed. By default, cost = 1 (which means that all the omics will be assumed to have the same cost). 38 | } 39 | \item{equalSize}{ 40 | If TRUE (default), the same optimal sample size will be estimated for all the omics. If FALSE, omics are allowed to have different sample sizes. 41 | } 42 | \item{max.size}{ 43 | Maximum allowed sample size. By default, max.size = 200. 44 | } 45 | \item{omicCol}{ 46 | The color that will be used to plot each omic. It must be a vector with length equal to the number of omics. If it is NULL (default), default colors are used. 47 | } 48 | \item{powerPlots}{ 49 | If TRUE (default), power plots will be generated. 50 | } 51 | } 52 | \details{ 53 | %% ~~ If necessary, more details than the description above ~~ 54 | } 55 | \value{ 56 | When applying \code{MultiPower}, the result is a list containing the following elements: 57 | \item{parameters }{List with as many elements as omic data types. For each omic, each element of the list is another list containing the different parameters used 58 | to compute power, either estimated from the pilot data or provided by the user: type, pooledSD, d, delta, logFC, mu, m, etc.} 59 | \item{optimalSampleSize }{List containing the following elements: n0 (sample size to achieve the minimum omic power, omicPower, for each omic), n (optimal sample size), 60 | finalPower (power at the optimal sample size for each omic), fdr (see fdr argument), omicPower (see omicPower argument), 61 | averagePower (see averagePower argument), and cost (see cost argument).} 62 | \item{summary }{Table summarizing MultiPower results. 
The columns are: the names of the omic data sets (omic), the omic data type (type), 63 | the number of omic features for each omic (numFeat), the minimum and maximum observed Cohen’s d (minCohenD and maxCohenD), the FDR value (FDR), 64 | the minimum power to be achieve for each omic (minPower), the average power to be achieved in the multi-omic experiment (averPower), 65 | the cost per omic (cost), the minimum sample size needed for each omic to achieve minPower (minSampleSize), 66 | the optimal sample size (optSampleSize), and the power at this optimal sample size (power).} 67 | \item{data2plot}{Data generated to create the power plots that are also returned by the function.} 68 | } 69 | \references{ 70 | %% ~put references to the literature/web site here ~ 71 | } 72 | \author{ 73 | Sonia Tarazona; David Gómez-Cabrero 74 | } 75 | \note{ 76 | %% ~~further notes~~ 77 | } 78 | 79 | %% ~Make other sections like Warning with \section{Warning }{....} ~ 80 | 81 | \seealso{ 82 | %% ~~objects to See Also as \code{\link{help}}, ~~~ 83 | } 84 | \examples{ 85 | } 86 | % Add one or more standard keywords, see file 'KEYWORDS' in the 87 | % R documentation directory. 88 | %\keyword{ ~kwd1 }% use one of RShowDoc("KEYWORDS") 89 | %\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line 90 | -------------------------------------------------------------------------------- /man/MultiGroupPower.Rd: -------------------------------------------------------------------------------- 1 | \name{MultiGroupPower} 2 | \alias{MultiGroupPower} 3 | \title{ 4 | Optimal sample size estimation for multiple group comparisons. 5 | } 6 | \description{ 7 | MultiGroupPower estimates the optimal sample size for a multi-omic experiment when pilot multi-omic data sets are available to estimate the parameters required to compute power and there are multiple groups to be compared. 8 | } 9 | \usage{ 10 | MultiGroupPower(data, groups, type, comparisons = NULL, omicPower = 0.6, averagePower = 0.85, 11 | fdr = 0.05, cost = 1, equalSize = TRUE, max.size = 200, omicCol = NULL, 12 | powerPlots = FALSE, summaryPlot = TRUE) 13 | } 14 | %- maybe also 'usage' for other objects documented here. 15 | \arguments{ 16 | \item{data}{ 17 | List with as many elements as omic data types. The names of the omics should be the names of the list. Each element in this list must be a raw count data matrix, and in this case MultiGroupPower will take into account the library sizes to estimate power; a normally distributed data matrix which must have been already pre-processed and normalized; or a binary data matrix (with 0/1 or TRUE/FALSE values). In any case, for each one of these matrices, rows must correspond to omic features (genes, methylation sites, ChIP-seq peaks, etc.) and columns to observations (biological samples, patients, etc.). 18 | } 19 | \item{groups}{ 20 | List with as many elements as omic data types. The names of the omics should be the names of the list. Each element in this list must be a vector with length equal to the number of observations for that omic in data argument. Each element of this vector must indicate the experimental group where each observation belong. 21 | } 22 | \item{type}{ 23 | Vector with length equal to the number of omic data types. Each element of this vector must be a 1, 2 or 3 to indicate whether the omic data are count data (1), continuous data approximately following a normal distribution (2) or binary data (3). 24 | } 25 | \item{comparisons}{ 26 | Pairwise comparisons to be done between groups. 
If NULL (default option), the function will generate all the possible comparisons between the groups that are available for all omics. If users wish to indicate the comparisons to be done, they must provide a matrix with two rows and as many columns as comparisons. Each column will be then a two-element vector with the two groups to be compared. An easy way to generate this matrix is using the combn() function that returns a matrix with all the possible comparisons. Users can then remove the columns of the comparisons that are not interesting for them. 27 | } 28 | \item{omicPower}{ 29 | The minimum power that must be achieved for each omic. It must be a vector with length equal to the number of omics. If it is a single number, this same number will be used for all the omics. By default, omicPower = 0.6. 30 | } 31 | \item{averagePower}{ 32 | The minimum average power that must be globally achieved. By default, averagePower = 0.85. 33 | } 34 | \item{fdr}{ 35 | False Discovery Rate level to be used. It is the significance level after multiple testing correction. By default, fdr = 0.05. 36 | } 37 | 38 | \item{cost}{ 39 | The cost to generate a replicate (a sample) for each omic. It must be a vector with length equal to the number of omics. If it is a single number, this same number will be used for all the omics. This argument will only be used when a different sample size per omic is allowed. By default, cost = 1 (which means that all the omics will be assumed to have the same cost). 40 | } 41 | \item{equalSize}{ 42 | If TRUE (default), the same optimal sample size will be estimated for all the omics. If FALSE, omics are allowed to have different sample sizes. 43 | } 44 | \item{max.size}{ 45 | Maximum allowed sample size. By default, max.size = 200. 46 | } 47 | \item{omicCol}{ 48 | The color that will be used to plot each omic. It must be a vector with length equal to the number of omics. If it is NULL (default), default colors are used. 49 | } 50 | \item{powerPlots}{ 51 | If TRUE (FALSE is the default), power plots will be generated for each individual comparison as in \code{MultiPower} function. 52 | } 53 | \item{summaryPlot}{ 54 | If TRUE (default), summary plots for sample size and power will be generated including the results for all comparisons and the global result, that is, the maximum sample size for all comparisons ("optimal" sample size) and the corresponding statistical power for each omic. 55 | } 56 | } 57 | \details{ 58 | %% ~~ If necessary, more details than the description above ~~ 59 | } 60 | \value{ 61 | When applying \code{MultiGroupPower}, the result is a list containing as many elements as omic data types and an 62 | additional element containing the global summary (GlobalSummary) of the results. The elements corresponding to omic data types are lists with the following elements: 63 | \item{parameters }{List with as many elements as omic data types. 
For each omic, each element of the list is another list containing the different parameters 64 | used to compute power, estimated from the pilot data.} 65 | \item{optimalSampleSize }{List containing the following elements: n0 (sample size to achieve the minimum omic power, omicPower, for each omic), 66 | n (optimal sample size), finalPower (power at the optimal sample size for each omic), fdr (see fdr argument), 67 | omicPower (see omicPower argument), averagePower (see averagePower argument), and cost (see cost argument).} 68 | \item{summary}{Table summarizing MultiPower results.} 69 | \item{data2plot}{Data generated to create the power plots that are also returned by the function.} 70 | The GlobalSummary element is a summary table very similar to the summary table described above. } 71 | \references{ 72 | %% ~put references to the literature/web site here ~ 73 | } 74 | \author{ 75 | Sonia Tarazona; David Gomez-Cabrero 76 | } 77 | \note{ 78 | %% ~~further notes~~ 79 | } 80 | 81 | %% ~Make other sections like Warning with \section{Warning }{....} ~ 82 | 83 | \seealso{ 84 | %% ~~objects to See Also as \code{\link{help}}, ~~~ 85 | } 86 | \examples{ 87 | } 88 | % Add one or more standard keywords, see file 'KEYWORDS' in the 89 | % R documentation directory. 90 | %\keyword{ ~kwd1 }% use one of RShowDoc("KEYWORDS") 91 | %\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line 92 | -------------------------------------------------------------------------------- /MultiPower_Examples/TCGA_Example.R: -------------------------------------------------------------------------------- 1 | ############################################################################ 2 | ######## Power study for TCGA-GBM ######## 3 | ######## Nature Communications ######## 4 | ############################################################################ 5 | 6 | ## By Sonia Tarazona 7 | ## 14-Dic-2017 8 | ## Last modified: 28-nov-2018 9 | 10 | 11 | options(stringsAsFactors = FALSE) 12 | 13 | # Loading MultiPower functions 14 | source("~/Dropbox/STATegra_STC/FiguresOfMerit/MultiPower/R/MultiOmicsPower10.R") 15 | 16 | 17 | 18 | myfigdir = "~/Dropbox/STATegra_STC/FiguresOfMerit/powerResults/" 19 | 20 | miscolores = c("red2", "orchid4", "dodgerblue3", "darkolivegreen4") 21 | omicas = c("expression", "methylation", "miRNAs", "proteomics") 22 | names(miscolores) = omicas 23 | 24 | 25 | 26 | # Required data ----------------------------------------------------------- 27 | 28 | setwd("~/Dropbox/STATegra_STC/FiguresOfMerit/powerResults/") 29 | load("dataTCGA.RData", verbose = TRUE) 30 | # tcgadata 31 | # tcgadesign 32 | # DEresults 33 | 34 | 35 | # Figures for estimated parameters per omic ------------------------------------- 36 | 37 | ## Number of features, %DE features 38 | 39 | par(mar = c(1,7,2,2), mfrow = c(1,2)) 40 | bb = barplot(sapply(tcgadata, nrow)/1000, las = 1, 41 | col = miscolores, horiz = TRUE, beside = TRUE, log = "x", 42 | border = miscolores, main = "# features", axes = FALSE) 43 | text(200, bb, sapply(tcgadata, nrow), adj = 1, 44 | col = c(1, "white", rep(1,3)), las = 2) 45 | 46 | par(mar = c(1,2,2,2)) 47 | 48 | bb = barplot(round(100*sapply(DEresults, function (x) sum(x[,"adj.P.Val"] < 0.05))/sapply(tcgadata, nrow),2), 49 | las = 1, col = miscolores, horiz = TRUE, names.arg = "", 50 | border = miscolores, main = "% DE features", axes = FALSE) 51 | text(10, bb, paste0(round(100*sapply(DEresults, function (x) sum(x[,"adj.P.Val"] < 0.05))/sapply(tcgadata, nrow),0), "%"), 52 | adj = 1, col = rep("white",5), las = 2) 53 | 54 
| 55 | 56 | 57 | 58 | # Global MultiPower results ----------------------------------------------- 59 | 60 | mytype = rep(2,4) 61 | p1omics = c(0.64, 0.14, 0.56, 0.44) 62 | 63 | tcgadesign = lapply(tcgadesign, function (x) x$type) 64 | 65 | ## d0 = 0.8 66 | par(mfrow = c(1,2)) 67 | tcgaResultsEQ = MultiPower(data = tcgadata, groups = tcgadesign, type = mytype, d0 = 0.8, p1 = p1omics, 68 | omicPower = 0.6, averagePower = 0.8, fdr = 0.05, cost = 1, equalSize = TRUE, 69 | max.size = 300, omicCol = miscolores) 70 | tcgaResultsEQ$summary[,c(2:4,6:9,14:16)] 71 | 72 | 73 | save(tcgaResultsEQ, tcgaResultsNE, file = "TCGAresultsMultiPower5.RData") 74 | 75 | 76 | 77 | 78 | 79 | # Post-analysis ----------------------------------------------------------- 80 | 81 | TCGApostEQ = postMultiPower(data = tcgadata, groups = tcgadesign, optResults = tcgaResultsNE, 82 | max.size = 20, omicCol = miscolores) 83 | 84 | 85 | 86 | # Generating figures for the paper ---------------------------------------- 87 | 88 | setwd("~/Dropbox/STATegra_STC/FiguresOfMerit/powerResults") 89 | load("TCGAresultsMultiPower5.RData", verbose = TRUE) 90 | 91 | 92 | ## Supplementary Figure 93 | par(mfrow = c(1,2)) 94 | powerPlot(parameters = tcgaResultsEQ$parameters, optimalSampleSize = tcgaResultsEQ$optimalSampleSize, 95 | omicCol = miscolores) 96 | 97 | 98 | 99 | 100 | 101 | # Filtering low variability features in methylation ----------------------- 102 | 103 | table(tcgadesign$methylation) 104 | dim(tcgadata$methylation) 105 | 106 | head(DEresults$methylation) 107 | plot(density(abs(DEresults$methylation$logFC))) 108 | 109 | min(abs(DEresults$methylation$logFC)) 110 | min(abs(DEresults$methylation$logFC[DEresults$methylation$adj.P.Val < 0.05])) 111 | 112 | sum(abs(DEresults$methylation$logFC) < 0.05) # 77020 (20%) 113 | dim(DEresults$methylation) 114 | # 384349 115 | sum(DEresults$methylation$adj.P.Val < 0.05) 116 | # 53064 117 | 118 | ## New proportion of DE methylation 119 | 53064 / (384349 - 77020) # = 17% 120 | 121 | tcgadata2 = tcgadata 122 | tcgadata2$methylation = tcgadata2$methylation[abs(DEresults$methylation$logFC) >= 0.05,] 123 | 124 | sapply(tcgadata2, dim) 125 | 126 | 127 | 128 | 129 | # Global MultiPower results AFTER filtering ----------------------------------------------- 130 | 131 | p1omics = c(0.64, 0.17, 0.56, 0.44) 132 | 133 | ## d0 = 0.8 134 | par(mfrow = c(1,2)) 135 | tcgaResultsEQ2 = MultiPower(data = tcgadata2, groups = tcgadesign, type = mytype, d0 = 0.8, p1 = p1omics, 136 | omicPower = 0.6, averagePower = 0.8, fdr = 0.05, cost = 1, equalSize = TRUE, 137 | max.size = 300, omicCol = miscolores) 138 | tcgaResultsEQ2$summary[,c(2:4,6:9,14:16)] 139 | 140 | 141 | save(tcgaResultsEQ, tcgaResultsNE, tcgaResultsEQ2, file = "TCGAresultsMultiPower5.RData") 142 | 143 | 144 | 145 | 146 | 147 | # Post-analysis ----------------------------------------------------------- 148 | load("dataTCGA.RData", verbose = TRUE) 149 | load("TCGAresultsMultiPower5.RData", verbose = TRUE) 150 | 151 | TCGApostEQ = postMultiPower(data = tcgadata, groups = tcgadesign, optResults = tcgaResultsEQ, 152 | max.size = 30, omicCol = miscolores) 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | # Answers to reviewers --------------------------- 161 | 162 | p1omics 163 | # 0.64 0.17 0.56 0.44 164 | 165 | 166 | ## Selecting n samples and computing DE --> do this R=5 times 167 | 168 | myN = c(22, 20, 15, 10, 5) 169 | resum = NULL 170 | 171 | for (r in 1:5) { 172 | 173 | dades = tcgadata 174 | disseny = tcgadesign 175 | 176 | print(paste0("r = ", r)) 177 
| 178 | for (n in myN) { 179 | print(paste0("n = ", n)) 180 | 181 | for (i in 1:length(tcgadata)) { 182 | pro = which(disseny[[i]] == "Proneural") 183 | mes = which(disseny[[i]] == "Mesenchymal") 184 | selec = c(sample(pro, n), sample(mes, n)) 185 | dades[[i]] = dades[[i]][,selec] 186 | disseny[[i]] = disseny[[i]][selec] 187 | } 188 | print(sapply(dades, ncol)) 189 | 190 | DEres = lapply(1:4, function (i) apply(dades[[i]], 1, function (x) t.test(x ~ disseny[[i]])$p.value)) 191 | DEresFDR = lapply(DEres, p.adjust, method = "fdr") 192 | tmp = sapply(DEresFDR, function (x) round(sum(x < 0.05)/length(x),2)) 193 | resum = rbind(resum, c(r, n, tmp)) 194 | } 195 | 196 | } 197 | 198 | colnames(resum) = c("r", "n", names(tcgadata)) 199 | 200 | 201 | head(resum) 202 | resum[resum[,"n"] == 22,] 203 | resum[resum[,"n"] == 20,] 204 | resum[resum[,"n"] == 15,] 205 | resum[resum[,"n"] == 10,] 206 | resum[resum[,"n"] == 5,] 207 | 208 | mediana = aggregate(resum[,3:6], by = list("n" = resum[,"n"]), median) 209 | mediana 210 | 211 | mitjana = aggregate(resum[,3:6], by = list("n" = resum[,"n"]), mean) 212 | 213 | minim = aggregate(resum[,3:6], by = list("n" = resum[,"n"]), min) 214 | minim 215 | 216 | maxim = aggregate(resum[,3:6], by = list("n" = resum[,"n"]), max) 217 | maxim 218 | 219 | png(filename = "validationTCGA.png", pointsize = 22) 220 | matplot(mitjana[,"n"], mitjana[,-1]*100, type = "o", col = miscolores, pch = 16, lwd = 4, ylim = c(0,30), 221 | xlab = "Sample size", ylab = "Mean % DE features", main = "TCGA glioblastoma data") 222 | legend("topleft", omicas, col = miscolores, pch = 16, bty = "n") 223 | dev.off() 224 | 225 | -------------------------------------------------------------------------------- /MultiPower_Examples/STATegra_Example.R: -------------------------------------------------------------------------------- 1 | ############################################################################ 2 | ######## Power study for STATegra data types ######## 3 | ######## Nature Communications ######## 4 | ############################################################################ 5 | 6 | ## By Sonia Tarazona 7 | ## 05-Oct-2017 8 | ## Last modified: 28-nov-2018 9 | 10 | 11 | options(stringsAsFactors = FALSE) 12 | 13 | # Loading MultiPower functions 14 | source("~/Dropbox/STATegra_STC/FiguresOfMerit/MultiPower/R/MultiOmicsPower10.R") 15 | 16 | 17 | # Set up 18 | myfigdir = "~/Dropbox/STATegra_STC/FiguresOfMerit/powerResults/" 19 | 20 | miscolores = c("red2", "darkslategray3", "azure4", "sienna2", 21 | "dodgerblue4", "darkolivegreen4") 22 | omicas = c("RNA-seq", "miRNA-seq", "ChIP-seq", "DNase-seq", "Metabolomics", "Proteomics") 23 | names(miscolores) = omicas 24 | 25 | 26 | 27 | 28 | # Required data ----------------------------------------------------------- 29 | statdata = statdesign = vector("list") 30 | 31 | ## RNA-seq (LOG2-transformed) 32 | statdata$RNAseq = read.delim("~/Dropbox/STATegra_STC/STATegraData/RNAseq/STATegra.RNAseq.CQN.Combat.Annotated.positive_2014_09.csv", 33 | row.names = 1, as.is = TRUE, sep = ",")[,-1] 34 | statdata$RNAseq = statdata$RNAseq[,c(grep("_Ik_0H", colnames(statdata$RNAseq)), 35 | grep("_Ik_24H", colnames(statdata$RNAseq)))] 36 | statdesign$RNAseq = rep(c("0h", "24h"), each = 3) 37 | min(statdata$RNAseq) # 1.14 38 | statdata$RNAseq = statdata$RNAseq - min(statdata$RNAseq) # min = 0 39 | 40 | 41 | ## miRNA-seq (LOG2-transformed) 42 | statdata$miRNAseq = read.delim("~/Dropbox/STATegra_STC/STATegraData/miRNA/miRNAseq_FinalData_NOtecrep.txt", 43 | row.names = 1, 
as.is = TRUE) 44 | statdata$miRNAseq = statdata$miRNAseq[,c(grep("I.0.", colnames(statdata$miRNAseq)), 45 | grep("I.24.", colnames(statdata$miRNAseq)))] 46 | statdesign$miRNAseq = rep(c("0h", "24h"), each = 3) 47 | min(statdata$miRNAseq) # 0.08775962 48 | statdata$miRNAseq = statdata$miRNAseq - min(statdata$miRNAseq) # min = 0 49 | 50 | 51 | 52 | ## ChIP-seq 53 | statdata$ChIPseq = read.delim("~/Dropbox/STATegra_STC/STATegraData/ChIPseq/STAT_CPM_ChIPseq_FILT.txt", 54 | row.names = 1) 55 | statdata$ChIPseq = log2(statdata$ChIPseq + 1) 56 | statdesign$ChIPseq = rep(c("0h", "24h"), each = 2) 57 | min(statdata$ChIPseq) # 0 58 | 59 | 60 | ## DNase-seq (LOG2-transformed) 61 | statdata$DNaseSeq = read.delim("~/Dropbox/STATegra_STC/STATegraData/DNaseSeq/STAT_DNaseSeq_homer_RPKM_TMM_ARSyN.txt", 62 | row.names = 1, as.is = TRUE) 63 | statdata$DNaseSeq = statdata$DNaseSeq[,c(grep("Ik0h_", colnames(statdata$DNaseSeq)), 64 | grep("Ik24h_", colnames(statdata$DNaseSeq)))] 65 | statdesign$DNaseSeq = statdesign$RNAseq 66 | min(statdata$DNaseSeq) # 1 67 | statdata$DNaseSeq = statdata$DNaseSeq - min(statdata$DNaseSeq) # min = 0 68 | 69 | 70 | ## Metabolomics 71 | statdata$metabolomics = read.delim("~/Dropbox/STATegra_STC/STATegraData/metabolomics/oldData_original.txt", 72 | as.is = TRUE, header = TRUE, row.names = 1) 73 | statdata$metabolomics = statdata$metabolomics[,c(grep("I.0.", colnames(statdata$metabolomics)), 74 | grep("I.24.", colnames(statdata$metabolomics)))] 75 | statdata$metabolomics = log2(statdata$metabolomics) 76 | statdesign$metabolomics = statdesign$RNAseq 77 | min(statdata$metabolomics) # 8.82e-05 78 | 79 | 80 | ## Proteomics (LOG2-transformed) 81 | statdata$proteomics = read.delim("~/Dropbox/STATegra_STC/STATegraData/proteomics/proteomicsImputedTMM.txt", 82 | as.is = TRUE, header = TRUE, row.names = 1) 83 | statdata$proteomics = statdata$proteomics[,c(grep("IKA_0h", colnames(statdata$proteomics)), 84 | grep("IKA_24h", colnames(statdata$proteomics)))] 85 | statdesign$proteomics = statdesign$RNAseq 86 | min(statdata$proteomics) # 1.97 87 | statdata$proteomics = statdata$proteomics - min(statdata$proteomics) 88 | 89 | 90 | sapply(statdata, dim) 91 | # RNAseq miRNAseq ChIPseq DNaseSeq metabolomics proteomics 92 | # [1,] 12762 469 23875 52788 60 1077 93 | # [2,] 6 6 4 6 6 6 94 | 95 | names(statdata) = names(statdesign) = omicas 96 | 97 | 98 | 99 | # Global MultiPower results ----------------------------------------------- 100 | 101 | par(mfrow = c(1,2)) 102 | type1 = rep(2,6) 103 | p1omics = c(0.4, 0.2, 0.2, 0.2, 0.6, 0.2) 104 | 105 | 106 | statResultsEQ = MultiPower(data = statdata, groups = statdesign, type = type1, d0 = 0.8, p1 = p1omics, 107 | omicPower = 0.6, averagePower = 0.8, fdr = 0.05, cost = 1, equalSize = TRUE, 108 | max.size = 200, omicCol = miscolores, dispPerc = 75) 109 | 110 | statResultsNE = MultiPower(data = statdata, groups = statdesign, type = type1, d0 = 0.8, p1 = p1omics, 111 | omicPower = 0.6, averagePower = 0.8, fdr = 0.05, cost = 1, equalSize = FALSE, 112 | max.size = 200, omicCol = miscolores, dispPerc = 75) 113 | 114 | mycost = c(1, 1.3, 1.5, 1.6, 1, 1) 115 | statResultsNEcost = MultiPower(data = statdata, groups = statdesign, type = type1, d0 = 0.8, p1 = p1omics, 116 | omicPower = 0.6, averagePower = 0.8, fdr = 0.05, cost = mycost, equalSize = FALSE, 117 | max.size = 200, omicCol = miscolores, dispPerc = 75) 118 | 119 | 120 | save(statResultsEQ, statResultsNE, statResultsNEcost, 121 | file = "STATegraResultsMultiPower6.RData") 122 | 123 | 124 | 125 | 126 | # 
Post-analysis ----------------------------------------------------------- 127 | 128 | STATpostEQ = postMultiPower(data = statdata, groups = statdesign, optResults = statResultsEQ, 129 | max.size = 4, omicCol = miscolores) 130 | 131 | STATpostNE = postMultiPower(data = statdata, groups = statdesign, optResults = statResultsNE, 132 | max.size = 5, omicCol = miscolores) 133 | 134 | STATpostNEcost = postMultiPower(data = statdata, groups = statdesign, optResults = statResultsNEcost, 135 | max.size = 3, omicCol = miscolores) 136 | 137 | 138 | save(statResultsEQ, statResultsNE, statResultsNEcost, 139 | STATpostEQ, STATpostNE, 140 | file = "STATegraResultsMultiPower6.RData") 141 | 142 | load("STATegraResultsMultiPower6.RData", verbose = TRUE) 143 | 144 | par(mfrow = c(1,2)) 145 | postPowerPlot(postResults = STATpostEQ, equalSize = TRUE, omicCol = miscolores, max.size = 3) 146 | postPowerPlot(postResults = STATpostNE, equalSize = FALSE, omicCol = miscolores, max.size = 5) 147 | postPowerPlot(postResults = STATpostNEcost, equalSize = FALSE, omicCol = miscolores, max.size = 5) 148 | 149 | 150 | powerPlot(statResultsNEcost$parameters, statResultsNEcost$optimalSampleSize, omicCol = miscolores) 151 | 152 | 153 | 154 | 155 | # Generating figures for the paper ---------------------------------------- 156 | 157 | setwd("~/Dropbox/STATegra_STC/FiguresOfMerit/powerResults") 158 | load("STATegraResultsMultiPower6.RData") 159 | 160 | pdf(file = "plots/stategraEQ.pdf", width = 3.5*2, height = 3.5*2) 161 | par(mfrow = c(2,2)) 162 | powerPlot(parameters = statResultsEQ$parameters, optimalSampleSize = statResultsEQ$optimalSampleSize, 163 | omicCol = miscolores) 164 | postPowerPlot(postResults = STATpostEQ, equalSize = TRUE, omicCol = miscolores, max.size = 4) 165 | dev.off() 166 | 167 | 168 | pdf(file = "plots/stategraNE.pdf", width = 4.5*2, height = 4.5) 169 | par(mfrow = c(1,2)) 170 | powerPlot(parameters = statResultsNE$parameters, optimalSampleSize = statResultsNE$optimalSampleSize, omicCol = miscolores) 171 | dev.off() 172 | 173 | pdf(file = "plots/stategraNEcost.pdf", width = 4.5*2, height = 4.5) 174 | par(mfrow = c(1,2)) 175 | powerPlot(parameters = statResultsNEcost$parameters, optimalSampleSize = statResultsNEcost$optimalSampleSize, omicCol = miscolores) 176 | dev.off() 177 | 178 | 179 | 180 | 181 | 182 | # Figures for estimated parameters per omic ------------------------------------- 183 | 184 | ## Number of features, %DE features 185 | 186 | par(mar = c(1,7,2,2), mfrow = c(1,2)) 187 | 188 | bb = barplot(sapply(statResultsEQ$parameters, function (x) x$m)/1000, las = 1, 189 | col = miscolores, horiz = TRUE, beside = TRUE, log = "x", 190 | border = miscolores, main = "# features", axes = FALSE) 191 | text(0.5, bb, sapply(statResultsEQ$parameters, function (x) x$m), adj = 1, 192 | col = 1, las = 2) 193 | 194 | par(mar = c(1,2,2,2)) 195 | bb = barplot(100*sapply(statResultsEQ$parameters, function (x) x$p1), 196 | las = 1, col = miscolores, horiz = TRUE, names.arg = "", 197 | border = miscolores, main = "% DE features", axes = FALSE) 198 | text(10, bb, paste0(round(100*sapply(statResultsEQ$parameters, function (x) x$p1),0), "%"), 199 | adj = 1, col = "white", las = 2) 200 | 201 | 202 | 203 | ## Dispersion 204 | 205 | par(mar = c(7,4,2,1))#, mfrow = c(1,2)) 206 | boxplot(lapply(statResultsEQ$parameters, function (x) x$allDispersions), col = miscolores, 207 | las = 2, main = "Pooled standard deviation", log = "y", ylab = "log-scale (dispersion)") 208 | 209 | 210 | 211 | 212 | 
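# Added example (not part of the original STATegra analysis): a hedged sketch of how
# low-effect features could be removed with CohenFilter() (see man/CohenFilter.Rd)
# before re-running MultiPower. The cutoff d = 0.5 is arbitrary, and the sketch assumes
# CohenFilter() returns the filtered data list, as its help page suggests.

statdataFilt = CohenFilter(data = statdata, d = 0.5, parameters = statResultsEQ$parameters)
sapply(statdataFilt, nrow)   # features remaining per omic after filtering

statResultsFiltEQ = MultiPower(data = statdataFilt, groups = statdesign, type = type1,
                               d0 = 0.8, p1 = p1omics, omicPower = 0.6, averagePower = 0.8,
                               fdr = 0.05, cost = 1, equalSize = TRUE, max.size = 200,
                               omicCol = miscolores, dispPerc = 75)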
--------------------------------------------------------------------------------
/MultiML/README.md:
--------------------------------------------------------------------------------
1 | # MultiML #
2 | 
3 | This script was created to estimate the minimum sample size required to keep the classification error rate below a desired threshold.
4 | Version: 0.1.0
5 | 
6 | 
7 | ## Introduction ##
8 | 
9 | Although differential expression methods are needed to perform an initial variable selection step, and it is therefore key to be aware of how the statistical power of such methods applied to our data can impact the results, an integrative multi-omic model is the final goal of multi-omic studies in many cases. Dimension reduction techniques or machine learning approaches are a popular choice in these cases, and the biological problem to solve will determine whether the selected methodology is unsupervised or supervised, and whether it is used for classification or prediction. For any of these approaches, the optimal sample size required to optimize the performance of the chosen method is still an open question in the field and, to our knowledge, no tools exist to estimate sample sizes in a systematic way. On top of that, the classical definition of statistical power is not appropriate for machine learning algorithms, where significance can only be assessed a posteriori. Thus, in the particular case of supervised approaches, performance can be measured in terms of classification or prediction error rates.
10 | The MultiML method uses multi-omic pilot data to estimate the minimum sample size required to keep the error rate below the desired threshold. This MultiML version uses Partial Least Squares-Discriminant Analysis (PLS-DA) or Random Forest classification methods to draw K subsamples of different sample sizes (nk) from the available observations, obtaining a classifier for each case and measuring the corresponding error rate (ek). On these (nk, ek) values, a first-order smoothed penalized P-spline regression is performed to estimate the learning curve, which allows the prediction of error rates for larger sample sizes. The MultiML method was created to help users determine the optimal sample size required to control classification error rates when using one or more omics datasets.
11 | 
12 | ## Getting Started ##
13 | 
14 | The MultiML method is available at:
15 | https://github.com/ConesaLab/MultiPower/blob/master/MultiML
16 | https://leobalzano@bitbucket.org/leobalzano/multiml.git
17 | 
18 | ## Required R libraries: ##
19 | These are the libraries needed from the CRAN repository; they can be installed with the
20 | install.packages() function and then loaded with library().
21 | ~~~~ 22 | - plotly 23 | - ggplot2 24 | - gridExtra 25 | - grid 26 | - ggpmisc 27 | - RColorBrewer 28 | - dplyr 29 | - glmnet 30 | - reshape2 31 | - rslurm 32 | - whisker 33 | - randomForest 34 | - caret 35 | - gtable 36 | - pls 37 | ~~~~ 38 | Required libraries from Bioconductor repository that must be installed following the Bioconductor instructions at: 39 | https://bioconductor.org/packages/release/bioc/html/RnaSeqSampleSize.html 40 | ~~~~ 41 | - mixOmics 42 | - Biobase 43 | ~~~~ 44 | Finally, there are some basic packages that do not need to be updated but they need to be loaded: 45 | ~~~~ 46 | - parallel 47 | - splines 48 | - graphics 49 | ~~~~ 50 | After installing all packages, it is necessary to load them all: 51 | ~~~~ 52 | library("plotly") 53 | library("ggplot2") 54 | library ("gridExtra") 55 | library("grid") 56 | library("ggpmisc") 57 | library("RColorBrewer") 58 | library("mixOmics") 59 | library("dplyr") 60 | library ("glmnet") 61 | library ("Biobase") 62 | library("reshape2") 63 | library("parallel") 64 | library("rslurm") 65 | library("whisker") 66 | library("randomForest") 67 | library("caret") 68 | library("gtable") 69 | library("splines") 70 | library("graphics") 71 | library("pls") 72 | ~~~~ 73 | ## Functions: ## 74 | A set of functions developed to perform the MultiML predictive analysis are required and it is necessary to upload them as well. 75 | source ("FoMPredictiveFunctions.R") 76 | 77 | Prediction of the number of samples required to reach a particular classification error rate 78 | 79 | MultiML method requires some input for each omic data type in order to compute a model to understand and predict the adequate number of samples with which the user can reach reasonably low classification errors. Utilizing Omics data results from a pilot study, the user can then extrapolate how many more samples they need to reach an acceptable error of classification. 80 | Next a sequence of the functions present in the method will be explained, in terms of inputs required and outputs obtained: 81 | 82 | ## Input data ## 83 | As unique input data required for MultiML to operate is a list of as many Omics datasets as the user wants and the Response matrix that can be binary or multifactorial. 84 | The names of the omics should be the names of each table in the list. All tables must be arranged in the same way, being in rows, the variables (genes, proteins, etc.) and in columns, the different samples (treatments and/or controls). Each element in this list must be a raw count data frame. 85 | 86 | 87 | 88 | | | TCGA-02-0432-01 | TCGA-08-0245-01 | TCGA-28-1750-01 | TCGA-06-1084-01 | TCGA-02-0064-01 | 89 | |--------|-----------------|-----------------|-----------------|-----------------|---------------- | 90 | |FSTL1 | 6.732094 | 9.172518 | 10.003722 | 9.818185 | 11.002899 | 91 | |AACS | 6.121826 | 6.348193 | 7.067599 | 6.822424 | 6.676739 | 92 | |RPS11 | 10.783535 | 10.533063 | 10.438774 | 11.006742 | 10.662374 | 93 | |CREB3L1 | 4.487414 | 4.620867 | 4.339173 | 4.283061 | 4.711638 | 94 | |ELMO2 | 7.770408 | 6.158398 | 7.035239 | 5.751987 | 7.152336 | 95 | |PNMA1 | 9.921098 | 9.556290 | 9.143960 | 7.626237 | 9.838729 | 96 | 97 | The response matrix must indicate a category for each sample. 
98 | 99 | | | TCGA-02-0432-01 |TCGA-08-0245-01 |TCGA-28-1750-01 |TCGA-06-1084-01 |TCGA-02-0064-01| 100 | |--------|-----------------|----------------|----------------|----------------|---------------| 101 | | Type | Proneural | Proneural | Mesenchymal | Mesenchymal | Mesenchymal| 102 | 103 | 104 | Estimation of the time required for a particular study: 105 | Since MultiML is a computationally expensive, time consuming strategy, it is recommendable to perform an estimation of the required time necessary to analyze a particular study. 106 | For this, a function RequiredtimeTest() was created. Its arguments are described in detail next: 107 | 108 | Input 109 | Predictors: A list of the different Omics Datasets and the Response matrix. 110 | Response: A number indicating the position of the response matrix included in "Predictors" object. 111 | Comps: Number of components to be calculated after each iteration. Just applicable to PLSDA approach. 112 | Function: Modular function used to calculate the error rate. It can be PLSDA.MP or Random.Forest.MP to indicate the approach to be used. 113 | crosval: Type of cross validation to be applied, Leave-One-Out (LOOCV) or ten fold change (TenF). 114 | Ticks: Number of segments (groups) of samples to evaluate. 115 | ERIterations: Number of iterations in which the error rate (ER) will be calculated. 116 | LassoIterations: Number of iterations of the Lasso selection per each error rate analysis. 117 | cpus_per_node: Number of CPUs that will be used in the analysis. 118 | ...: Arguments to be passed to methods. 119 | 120 | Example: 121 | ~~~~ 122 | EstTime<-RequiredtimeTest (Predictors=tcgadata[c(4,6)],Response=2, 123 | cpus_per_node=1, 124 | Ticks = 50, Function = PLSDA.MP,Comps = 10, 125 | crosval = "LOOCV",ERIterations = 50, 126 | LassoIterations=50) 127 | ~~~~ 128 | Output: 129 | Summary: A table indicating the conditions of the analysis, established by the user. 130 | EstimatedTime: A table of the estimated time that the process will last showed in different metrics (seconds, minutes, hours, or days). 131 | 132 | 133 | ## Classification Error Rate calculus ## 134 | 135 | This is calculated through the ER_Calculator() function which is the main function in the MultiML method. 136 | 137 | Input: 138 | Predictors: A list of different Omics Datasets and the Response matrix. 139 | Response: A number indicating the response matrix included in "Predictors" object. 140 | Previous_CER: A previous result of class "ClassificationErrorRate" (CER) to be fused to the one that is going to be calculated. This is useful for merging a pilot study with a posterior analysis. 141 | Ticks: Number of segments (groups) of samples to evaluate. If NULL, the calculation is made on its own considering the "TheoreticER". 142 | WhichTicks: Vector of numbers of ticks the user wants to analyze. If NULL, a random selection between the minimum and maximum number of samples is calculated. It results useful in posterior rounds of analysis. 143 | Function: Modular function used to calculate the error rate. It can be PLSDA.MP or Random.Forest.MP to indicate the approach to be used. 144 | Comps: Number of components to be calculated after each iteration. Just applicable to PLSDA approach. 145 | crosval: Type of cross validation to be applied, Leave-One-Out (LOOCV) or ten fold change (TenF). 146 | ERIterations: Number of iterations in which the error rate (ER) will be calculated. 147 | LassoIterations: Number of iterations of the Lasso selection per each error rate analysis. 
148 | 149 | Example: 150 | ~~~~ 151 | ProtsRF<-ER_Calculator(Predictors=tcgadata[c(4,6)], 152 | Response=2,Previous_CER=NULL,Ticks=NULL, 153 | WhichTicks=NULL,Function=Random.Forest.MP, 154 | Comps=10,crosval = "LOOCV",ERIterations=2, 155 | LassoIterations=2, TheoreticER=0.02) 156 | ~~~~ 157 | 158 | Output: 159 | TestedTicks: A vector of the number of evaluated number of samples. 160 | Omics: An indicator of the Omics evaluated. 161 | Minimums: A list of the minimum value of error rate, balanced (BER) or not (ER) obtained per each ten-component analysis. This is measured through three distance metrics to evaluate the classification performance of the model. Maximal distance "max.dist", distance to centroids "centroids.dist" or Mahalanobis distance "mahalanobis.dist". Thus, each table contains the results per each iteration at different subsets of samples. 162 | TablebyTick: A data frame with information related to the classification error rate obtained by tick ± standard error of the mean "SEM" and the characteristics of the analysis selected by the user. 163 | Prediction_table: A table with the information related to de predicted number of samples required to reach a particular ER ± a margin of error "MOE". 164 | CompWinner: A list of the number of components in which the minimum value of error rate, balanced (BER) or not (ER) was obtained per each iteration. This only applies for PLSDA module and it is measured through the three mentioned distance metrics, Maximal distance, distance to centroids or Mahalanobis distance, to evaluate the classification performance of the model. 165 | 166 | 167 | ## Elaboration of files and folder for running analyzes in a high-performance computer (HPC) ## 168 | 169 | Often, the users need to evaluate the data thoroughly, with analyzes among two or more Omics datasets, or determine if an Omics type is necessary or not for a particular study. 170 | To overcome the fact that the method is computationally demanding, we offer a set of functions that allows the data arrangement and the creation of procedure commands, so the user can be able to run them into an HPC, as long as the HPC can be run through the Slurm workload manager job scheduler. 171 | 172 | Step 1: Determining the comparisons to be evaluated in the dataset. 173 | This can be performed with the CombinationsFunction() function. Here, the user can create the combination of datasets, so they can be included in the HPC. 174 | 175 | Input: 176 | OmicsData: The data as list of predictor datasets including also the response (Y) matrix as the last element of the list. 177 | AnalysisType: Here we offer three types of analyzes that can be called. "exploratory", where all predictor matrices are going to be compared vs response matrix (Y) separately, as well as all omics dataset vs Y. "complete", where simply all combinations are created, even though this is not recommended because of the amount of time that a job of these characteristics implies. "detailed", where the user can choose by levels indicating if two datasets vs Y or three vs Y is required. 178 | levels: A vector indicating the levels to be analyzed. If "AnalysisType" = "detailed", if 2, then two predictors datasets will be contrasted against Y and so on. 179 | 180 | Example: 181 | ~~~~ 182 | ExplTest<-CombinationsFunction (OmicsData=tcgadata, 183 | AnalysisType="detailed", 184 | levels=c(2)) 185 | ~~~~ 186 | 187 | Output: 188 | Combinations: A list of the combinations created by the user. 
189 | Data: All required lists of datasets to be used as input for the error rate analysis. 190 | 191 | Step 2: Creating the files and folders to upload into an HPC. 192 | For this, the user can simply follow a procedure to create an "SlurmFunction" as follows: 193 | ~~~~ 194 | SlurmFunction <- function (X) { 195 | ER_Calculator(Predictors=X, 196 | Response=length(X), Previous_CER=NULL, 197 | Ticks=NULL, WhichTicks=NULL, 198 | Function=Random.Forest.MP, Comps=10, 199 | crosval = "LOOCV", ERIterations=2, 200 | LassoIterations=2, TheoreticER=0.02) 201 | } 202 | ~~~~ 203 | 204 | Then, the user can create their "SlurmJob" using the Slurm_Creator() function. 205 | This function creates the files to be uploaded into a cluster to calculate the results of all desired combinations of predictor datasets. 206 | 207 | Input: 208 | Function: An object class "SlurmFunction" that will calculate the error rates. 209 | Parameters: All required lists of datasets to be used as input for the error rate calculation. These parameters are the ones obtained in step 1. 210 | jobname: The desired name of the Slurm job. If NA, a random name of the form "slr####" will be assigned. 211 | nodes: The maximum number of nodes to be used in the cluster. 212 | cpus_per_node: The number of CPUs per node in the cluster. It determines how many processes are run in parallel per node. 213 | pkgs: A character vector containing the names of packages that must be loaded on each cluster node. By default, it includes all packages loaded by the user when slurm_apply is called. 214 | time: Time in days to run the data in the cluster. By default is 30 days, but according to the user HPC-availability this can be different. 215 | 216 | Example: 217 | ~~~~ 218 | My_Slurm_job <- Slurm_Creator(Function=SlurmFunction, 219 | Parameters=ExplTest$Data, 220 | jobname = 'my_Job', nodes = 8, 221 | cpus_per_node = 100, time = 30, 222 | pkgs = rev(.packages()) ) 223 | ~~~~ 224 | Output: 225 | Four elements to be uploaded into cluster are created: 226 | Function.RDS: A .RDS file with the Slurm Function that will be calculated. 227 | Parameters.RDS: A .RDS file with all lists of datasets that will be used for the error rate calculation. 228 | SlurmCreator_Run.R: A .R file with the parameters to run the job. 229 | SlurmCreator_submit_sh: A .sh file that indicates all parameters that the cluster requires to perform the job. 230 | 231 | Step 3: Upload all four files into an HPC and run. 232 | Step 4: Once the job is done, download all files from the HPC. 233 | Step 5: Plot results. 234 | 235 | 236 | ## Plotting the results ## 237 | 238 | ### Predictive plot. ### 239 | The predictive plots are generated throughout the function called ErrorRateplot(). This function can not only use the information calculated in the previous step but also, it can calculate the samples required for ER values even lower than the ones reached in the pilot experiment. 240 | 241 | Input: 242 | x: An object of class ER_calculator 243 | ErrorRateType: Character to indicate which error rate to plot. It can be error rate "ER", balanced error rate "BER" or "Both". 244 | MetricsType: Character to indicate the metrics to be plotted. This only applies for PLSDA module. It can be Maximal distance "max.dist", distance to centroids "centroids.dist", Mahalanobis distance "mahalanobis.dist" or "All" 245 | DoF: It can be either NULL to indicate that the degrees of freedom of the Spline model are ticks-1, or a value so the model is created with a different degree of freedom. 

Step 3: Upload all four files into the HPC and run the job.
Step 4: Once the job is done, download all files from the HPC.
Step 5: Plot the results.


## Plotting the results ##

### Predictive plot. ###
The predictive plots are generated with the ErrorRateplot() function. This function not only uses the information calculated in the previous step, but can also estimate the number of samples required for ER values even lower than those reached in the pilot experiment.

Input:
x: An object of class ER_calculator.
ErrorRateType: Character to indicate which error rate to plot. It can be the error rate "ER", the balanced error rate "BER" or "Both".
MetricsType: Character to indicate the metrics to be plotted. This only applies to the PLSDA module. It can be maximal distance "max.dist", distance to centroids "centroids.dist", Mahalanobis distance "mahalanobis.dist" or "All".
DoF: Either NULL, to indicate that the degrees of freedom of the Spline model are ticks-1, or a value so that the model is created with a different number of degrees of freedom. The chosen DoF affects the number of samples required, so the user must choose it carefully.
Projection: Indicates whether the Spline model should be projected until the minimum error rate Min_ER is reached (TRUE) or not (FALSE).
Spline: Indicates whether the Spline model should be plotted (TRUE) or not (FALSE).
TheoreticER: The minimum value of ER for which the adequate number of samples is to be calculated.
ConfInt: The confidence level used to calculate the margin of error and to plot the confidence area of the Spline model. The user can use 0.90, 0.95 or 0.99.

Example:
~~~~
plot<-ErrorRateplot(x=ProtsRF, Projection=TRUE, Spline=TRUE,
                    DoF = NULL, ErrorRateType = "ER",
                    MetricsType ="mahalanobis.dist", TheoreticER=NULL,
                    ConfInt=0.95 )
~~~~

Output:
A plot of "number of samples" versus "classification error rate", also indicating the metric used, the Omics evaluated, and the number of samples required to reach a particular error rate ± a margin of error "MOE".


### Comparative plot. ###
The user can also create a comparative plot of all Omics combinations, to determine which Omics contribute most to an accurate classification, by using the Comparative_ERPlot() function.

Input:
L: A list of different Omics-integration results obtained with the ER_Calculator() function.
ErrorRateType: Character to indicate which error rate to plot. It can be the error rate "ER" or the balanced error rate "BER" if the PLSDA approach was the one calculated.
MetricsType: Character to indicate the metrics to be plotted. This only applies to the PLSDA module. It can be maximal distance "max.dist", distance to centroids "centroids.dist" or Mahalanobis distance "mahalanobis.dist".

Example:
First, create a list of the results you want to plot together, as follows:

~~~~
Allplots<-list(
  My_Omics1_Result= My_Omics1_Result,
  My_Omics2_Result= My_Omics2_Result,
  My_Omics3_Result= My_Omics3_Result,
  My_Omics1_Omics2_Result= My_Omics1_Omics2_Result,
  My_Omics1_Omics3_Result= My_Omics1_Omics3_Result,
  My_Omics1_Omics2_Omics3_Result= My_Omics1_Omics2_Omics3_Result,
  My_Omics2_Omics3_Result= My_Omics2_Omics3_Result
)
~~~~
Then, simply call the function:
~~~~
Plot<- Comparative_ERPlot(L=Allplots,
                          ErrorRateType = "ER",
                          MetricsType ="mahalanobis.dist")
~~~~
Output:
A plot of "number of samples" versus "classification error rate" for all the different analyses.



## How to cite MultiML ##
Soon in bioRxiv!

## References ##
[1] Hastie, T., James, G., Witten, D., Tibshirani, R. (2013). An Introduction to Statistical Learning. Springer, New York. pp. 316-321.

[2] Meyer, M. C. (2008). Inference using shape-restricted regression splines. The Annals of Applied Statistics, 2(3), 1013-1033.

[3] Ramsay, J. O. (1988). Monotone regression splines in action. Statistical Science, 3(4), 425-441.
305 | -------------------------------------------------------------------------------- /R/MultiOmicsPower15.R: -------------------------------------------------------------------------------- 1 | ###################################################################################### 2 | ###### MULTIPOWER 3 | ###### Optimization model to maximize power of multi-omics integration models 4 | ###################################################################################### 5 | 6 | 7 | ## By Sonia Tarazona and David Gomez-Cabrero 8 | ## 05-Oct-2017 9 | ## Last modified: March-2023 10 | # 11 | 12 | 13 | 14 | 15 | #### PACKAGES READ 16 | # install.packages("FDRsampsize") 17 | require(FDRsampsize) 18 | require(lpSolve) 19 | 20 | 21 | # install.packages("slam") 22 | # install.packages("lpmodeler") 23 | # require(lpmodeler) 24 | # require(Rsymphony) 25 | # 1) Install SYMPHONY: (now 5.6.16, first MultiPower version was with 5.6.10) 26 | # cd 27 | # svn checkout https://projects.coin-or.org/svn/SYMPHONY/releases/5.6.16 SYMPHONY-5.6.16 28 | # cd SYMPHONY-5.6.16 29 | # ./configure 30 | # make 31 | # make install 32 | # 2) Install other components: 33 | # sudo apt-get install coinor-libcgl-dev coinor-libclp-dev coinor-libcoinutils-dev coinor-libosi-dev 34 | # sudo apt-get install coinor-libsymphony-dev 35 | # sudo apt-get install autotools-dev 36 | # 3) Intall package in R (previously downloaded from CRAN): 37 | # install.packages("Rsymphony_0.1-26.tar.gz", repos = NULL) 38 | 39 | # https://cran.r-project.org/src/contrib/Rsymphony_0.1-26.tar.gz 40 | # https://projects.coin-or.org/SYMPHONY 41 | #Download: wget http://www.coin-or.org/download/source/SYMPHONY/SYMPHONY-5.6.6.tgz 42 | 43 | 44 | # require(boot) 45 | 46 | 47 | 48 | # Auxiliary functions ----------------------------------------------------- 49 | 50 | geomean = function (x) exp(mean(log(x))) 51 | 52 | cohen.h = function (p) abs(2*asin(sqrt(p[1])) - 2*asin(sqrt(p[2]))) # p is a vector with two components 53 | 54 | power.binary = function(n, sig.level=0.05, p1_p2, p1) { 55 | if (length(p1_p2) != length(p1)) stop("p1_p2 and p1 must be of the same length") 56 | p2 = p1 - p1_p2 57 | mypower = sapply(1:length(p1), function (i) power.prop.test(n=n, p1 = p1[i], p2 = p2[i], sig.level = sig.level)$power) 58 | return(mypower) 59 | } 60 | 61 | 62 | 63 | 64 | # Estimating parameters needed for power calculation ---------------------- 65 | 66 | ## Two-group comparison 67 | paramEst = function (data, groups, type = 1) { 68 | 69 | # type = 1 (counts), 2 (gaussian), 3 (binary variables: 0/1 or FALSE/TRUE) 70 | 71 | # Sample size per group 72 | nGroup = table(groups) 73 | 74 | sd0 = apply(data, 1, sd) 75 | sd0 = which(sd0 == 0) 76 | if (length(sd0) > 0) { 77 | print(paste0(length(sd0), " constant features are to be removed from the analysis.")) 78 | data = data[-sd0,] 79 | } 80 | 81 | # Number of features 82 | M = nrow(data) 83 | 84 | # Sequencing depth correction for count data 85 | if (type == 1) { 86 | seqdepth = colSums(data) 87 | med = median(seqdepth) 88 | data = apply(data, 2, function (x) med*x/sum(x)) 89 | } 90 | 91 | 92 | # Mean counts per group 93 | meanPerGroup = t(apply(data, 1, tapply, INDEX = groups, mean, na.rm = TRUE)) 94 | 95 | if (type != 3) { 96 | 97 | # Standard deviation per group 98 | sdPerGroup = t(apply(data, 1, tapply, INDEX = groups, sd, na.rm = TRUE)) 99 | sdPerGroup = sdPerGroup[,names(nGroup)] 100 | 101 | # Pooled Standard Deviation 102 | SDpooled = sqrt((nGroup[1]*sdPerGroup[,1]^2 + 
nGroup[2]*sdPerGroup[,2]^2)/(sum(as.numeric(nGroup))-2)) 103 | 104 | # Cohen's d per feature 105 | deltaPerFeat = abs(meanPerGroup[,1] - meanPerGroup[,2]) 106 | d = deltaPerFeat/SDpooled 107 | } else { 108 | d = apply(meanPerGroup, 1, cohen.h) 109 | } 110 | 111 | 112 | if (type == 1) { ## COUNT DATA 113 | cat("Parameters are to be estimated for count data \n") 114 | 115 | if(min(data, na.rm = TRUE) < 0) stop("Negative values were found. Are you sure these are count data?\n") 116 | 117 | # Fold-change estimation 118 | allFC = log2(apply(meanPerGroup, 1, function (x) max(0.0000001, x[2]) / max(x[1], 0.0000001))) 119 | 120 | # Average counts 121 | mu = rowMeans(data, na.rm = TRUE) 122 | 123 | # CV 124 | myCV = SDpooled/mu 125 | 126 | # Estimated parameters for count data 127 | myparameters = list("type" = type, "logFC" = allFC, "pooledSD" = SDpooled, "CV" = myCV, 128 | "delta" = deltaPerFeat, "mu" = mu,"m" = M, "d" = d, "nGroup" = nGroup) 129 | 130 | } 131 | 132 | if (type == 2) { ## NORMAL DATA 133 | cat("Parameters are to be estimated for normally distributed data \n") 134 | 135 | # Estimated parameters for normal data 136 | myparameters = list("type" = type, "delta" = deltaPerFeat, "pooledSD" = SDpooled, 137 | "m" = M, "d" = d, "nGroup" = nGroup) 138 | } 139 | 140 | if (type == 3) { ## BINARY DATA 141 | cat("Parameters are to be estimated for binary data \n") 142 | 143 | # Estimated parameters for normal data 144 | myparameters = list("type" = type, "p1_p2" = meanPerGroup[,1]-meanPerGroup[,2], "p1" = meanPerGroup[,1], 145 | "m" = M, "d" = d, "nGroup" = nGroup) 146 | } 147 | 148 | return(myparameters) 149 | 150 | } 151 | 152 | 153 | 154 | # Computing power or sample size given the rest of parameters ------------- 155 | 156 | getPower = function (parameters, power = NULL, n = NULL, fdr = 0.05, 157 | null.effect = 0, max.n = 500) { 158 | 159 | # Compute power for given n 160 | 161 | if (is.null(power)) { 162 | if (is.null(n)) stop("Please, indicate a value for either power or n arguments. 
\n") 163 | 164 | if (parameters$type == 1) { # COUNT DATA (Negative Binomial) 165 | potencia = fdr.power(fdr = fdr, n = n, pow.func = power.hart, eff.size = parameters$logFC, null.effect = null.effect, 166 | mu = parameters$mu, sig = parameters$CV) 167 | 168 | } 169 | 170 | if (parameters$type == 2) { # NORMAL DATA 171 | potencia = fdr.power(fdr = fdr, n = n, pow.func = power.twosampt, eff.size = parameters$delta, null.effect = null.effect, 172 | sigma = parameters$pooledSD) 173 | } 174 | 175 | 176 | if (parameters$type == 3) { # BINARY DATA 177 | potencia = fdr.power(fdr = fdr, n = n, pow.func = power.binary, eff.size = parameters$p1_p2, null.effect = null.effect, 178 | p1 = parameters$p1) 179 | } 180 | 181 | 182 | return(potencia) } else { # Compute n for given power 183 | 184 | if (parameters$type == 1) { # COUNT DATA 185 | 186 | tamany = fdr.sampsize(fdr = fdr, ave.pow = power, pow.func = power.hart, eff.size = parameters$logFC, 187 | null.effect = null.effect, mu = parameters$mu, sig = parameters$CV, 188 | max.n = max.n, min.n = 2)$n 189 | 190 | } 191 | 192 | if (parameters$type == 2) { # NORMAL DATA 193 | 194 | tamany = fdr.sampsize(fdr = fdr, ave.pow = power, pow.func = power.twosampt, eff.size = parameters$delta, 195 | null.effect = null.effect, sigma = parameters$pooledSD, 196 | max.n = max.n, min.n = 2)$n 197 | } 198 | 199 | 200 | if (parameters$type == 3) { # BINARY DATA 201 | 202 | tamany = fdr.sampsize(fdr = fdr, ave.pow = power, pow.func = power.binary, eff.size = parameters$p1_p2, 203 | null.effect = null.effect, p1 = parameters$p1, 204 | max.n = max.n, min.n = 2)$n 205 | 206 | } 207 | 208 | return(max(ceiling(tamany), 2)) 209 | 210 | } 211 | } 212 | 213 | 214 | 215 | # Optimal sample size ----------------------------------------------------- 216 | 217 | optimalRep = function (parameters, omicPower = 0.6, averagePower = 0.85, fdr = 0.05, cost = 1, 218 | equalSize = TRUE, max.size = 200, null.effect = 0) { 219 | 220 | omics = names(parameters) 221 | 222 | if (length(omicPower) == 1) omicPower = rep(omicPower, length(omics)) 223 | names(omicPower) = omics 224 | 225 | if (equalSize) { ## Same sample size for all omics 226 | 227 | # Compute n for each omic 228 | n1 = sapply(omics, function (oo) getPower(parameters[[oo]], 229 | power = omicPower[oo], n = NULL, 230 | fdr = fdr, max.n = max.size, 231 | null.effect = null.effect)) 232 | names(n1) = omics 233 | 234 | n1max = max(n1, 2, na.rm = TRUE) 235 | 236 | allPowers = sapply(omics, function (oo) getPower(parameters[[oo]], 237 | power = NULL, n = n1max, 238 | fdr = fdr)) 239 | n2 = n1max 240 | if (n2 > max.size) stop("Maximum size allowed has been exceed. 241 | Please, increase max.size parameter to get the optimal sample size. \n") 242 | 243 | # Compute n to satisfy global power 244 | while(sum(allPowers)/length(omics) < averagePower) { 245 | n2 = n2 + 1 246 | if (n2 > max.size) stop("Maximum size allowed has been exceed. 247 | Please, increase max.size parameter to get the optimal sample size. 
\n") 248 | allPowers = sapply(omics, function (oo) getPower(parameters[[oo]], power = NULL, 249 | n = n2, fdr = fdr, 250 | null.effect = null.effect)) 251 | } 252 | 253 | return(list("n0" = n1, "n" = n2, "finalPower" = allPowers, "fdr" = fdr, 254 | "omicPower" = omicPower, "averagePower" = averagePower, "cost" = cost)) 255 | 256 | } else { ## Different sample size for each omic 257 | 258 | sss = optiSSnotEqual(parameters, fdr, cost, max.size, omicPower, averagePower, 259 | null.effect) 260 | n2 = as.numeric(sss[,"SampleSize"]) 261 | allPowers = as.numeric(sss[,"Power"]) 262 | names(allPowers) = names(n2) = sss[,"Omic"] 263 | 264 | return(list("n0" = NA, "n" = n2, "finalPower" = allPowers, "fdr" = fdr, 265 | "omicPower" = omicPower, "averagePower" = averagePower, "cost" = cost)) 266 | 267 | } 268 | 269 | } 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | # Summary of results ------------------------------------------------------ 278 | 279 | powerSummary = function(parameters, optimalSampleSize) { 280 | 281 | tabla = data.frame("omic" = names(parameters), "type" = sapply(parameters, function (x) x$type), 282 | "numFeat" = sapply(parameters, function (x) x$m), 283 | "minCohenD" = sapply(parameters, function (x) round(min(x$d, na.rm = TRUE),2)), 284 | "maxCohenD" = sapply(parameters, function (x) round(max(x$d, na.rm = TRUE),2)), 285 | "minPower" = optimalSampleSize$omicPower, 286 | "averPower" = optimalSampleSize$averagePower, 287 | "cost" = optimalSampleSize$cost, 288 | "minSampleSize" = optimalSampleSize$n0, 289 | "optSampleSize" = optimalSampleSize$n, 290 | "power" = round(optimalSampleSize$finalPower,4)) 291 | print(tabla) 292 | return(tabla) 293 | } 294 | 295 | 296 | 297 | 298 | 299 | 300 | # Plots for power study --------------------------------------------------- 301 | 302 | powerPlot = function(parameters, optimalSampleSize, omicCol = NULL) { 303 | 304 | if (is.null(omicCol)) { 305 | if (length(parameters) > 12) { 306 | stop("Too many omics to be plotted. Please, select a lower number of omics to plot. 
\n") 307 | } 308 | omicCol = colors()[c(554,89,111,512,17,586,132,428,601,568,86,390)] 309 | omicCol = omicCol[1:length(parameters)] 310 | } 311 | 312 | omicShape = 1:length(parameters) 313 | names(omicCol) = names(omicShape) = names(parameters) 314 | 315 | 316 | ## Power versus Sample Size 317 | 318 | # Sample Sizes 319 | nmax = max(optimalSampleSize$n) 320 | ngroup = unique(as.numeric(sapply(parameters, function (x) x$nGroup))) 321 | xMin = 2 322 | xMax = round(max(nmax+20, (3*nmax - xMin)/2),0) 323 | xValues = c(round(seq(xMin, xMax, (xMax - xMin)/10)), optimalSampleSize$n) 324 | xValues = sort(unique(c(xValues, ngroup))) 325 | 326 | # Powers 327 | yValues = matrix(NA, ncol = length(parameters), nrow = length(xValues)) 328 | rownames(yValues) = xValues 329 | colnames(yValues) = names(parameters) 330 | 331 | for (i in 1:nrow(yValues)) { 332 | for (j in 1:ncol(yValues)) { 333 | yValues[i,j] = getPower(parameters[[j]], power = NULL, n = xValues[i], fdr = optimalSampleSize$fdr) ### null.effect 334 | } 335 | } 336 | 337 | matplot(xValues, yValues, type = "l", lwd = 2, xlab = "Sample size", ylab = "Statistical power", 338 | main = "Power vs Sample Size", col = omicCol, lty = omicShape) 339 | optiSS = optimalSampleSize$n 340 | if (length(optiSS) == 1) optiSS = rep(optiSS, length(parameters)) 341 | points(optiSS, diag(yValues[as.character(optiSS),]), pch = 15, col = omicCol, cex = 1.2) 342 | legend("bottomright", names(parameters), lwd = 2, col = omicCol, lty = omicShape, bty = "n") 343 | 344 | 345 | ## Power vs Effect Size 346 | 347 | # Quantiles of effect size 348 | xValues = seq(0,0.75,0.05) # max P75 349 | 350 | # Powers 351 | yValues2 = matrix(NA, ncol = length(parameters), nrow = length(xValues)) 352 | rownames(yValues2) = xValues 353 | colnames(yValues2) = names(parameters) 354 | 355 | parameters2 = parameters 356 | 357 | optiSS = optimalSampleSize$n 358 | if (length(optiSS) == 1) optiSS = rep(optiSS, length(parameters)) 359 | 360 | percentiles = lapply(parameters, function (x) {quantile(x$d, probs = xValues, na.rm = TRUE)}) 361 | 362 | for (i in 1:nrow(yValues2)) { 363 | for (j in 1:ncol(yValues2)) { 364 | selefeat = which(parameters2[[j]]$d >= percentiles[[j]][i]) 365 | parameters2[[j]]$d = parameters2[[j]]$d[selefeat] 366 | 367 | if (parameters2[[j]]$type == 1) { 368 | parameters2[[j]]$logFC = parameters2[[j]]$logFC[selefeat] 369 | parameters2[[j]]$pooledSD = parameters2[[j]]$pooledSD[selefeat] 370 | parameters2[[j]]$CV = parameters2[[j]]$CV[selefeat] 371 | parameters2[[j]]$delta = parameters2[[j]]$delta[selefeat] 372 | parameters2[[j]]$mu = parameters2[[j]]$mu[selefeat] 373 | } 374 | 375 | if (parameters2[[j]]$type == 2) { 376 | parameters2[[j]]$pooledSD = parameters2[[j]]$pooledSD[selefeat] 377 | parameters2[[j]]$delta = parameters2[[j]]$delta[selefeat] 378 | } 379 | 380 | if (parameters2[[j]]$type == 3) { 381 | parameters2[[j]]$p1_p2 = parameters2[[j]]$p1_p2[selefeat] 382 | parameters2[[j]]$p1 = parameters2[[j]]$p1[selefeat] 383 | } 384 | 385 | yValues2[i,j] = getPower(parameters2[[j]], power = NULL, n = optiSS[j], fdr = optimalSampleSize$fdr) 386 | 387 | } 388 | } 389 | 390 | if (!all(is.na(yValues2))) { 391 | matplot(xValues*100, yValues2, type = "l", lwd = 2, xlab = "Percentiles for effect size cutoff", ylab = "Statistical power", 392 | main = "Power vs Effect size", col = omicCol, lty = omicShape) 393 | points(rep(0, length(parameters)), as.numeric(yValues2[1,]), 394 | pch = 15, col = omicCol, cex = 1.2) 395 | legend("bottomright", names(parameters), lwd = 2, col = omicCol, lty 
= omicShape, bty = "n") 396 | } 397 | 398 | ## Data to plot 399 | return(list("PowerVsSsampleSize" = yValues, 400 | "PowerVsEffectSize" = yValues2)) 401 | 402 | } 403 | 404 | 405 | 406 | 407 | 408 | 409 | # Computing optimal sample size when it is not equal for all omics ---------------------------------------------------------- 410 | 411 | optiSSnotEqual = function (parameters, fdr = 0.05, cost = 1, max.size = 100, 412 | omicPower = 0.6, averagePower = 0.8, null.effect = 0) { 413 | 414 | ##### GENERATION OF MATRICES FOR THE PROBLEM 415 | 416 | K = length(parameters) # number of omics 417 | 418 | if (length(cost) < K) cost = rep(cost[1], K) 419 | if (length(max.size) < K) max.size = rep(max.size[1], K) 420 | if (length(omicPower) < K) omicPower = rep(omicPower[1], K) 421 | 422 | num.var = sum(max.size) - length(max.size) ## sample size = 1 is not considered 423 | # coeffs power; 424 | # constraint sum(vars) = 1 (to have only 1 sample size); coeffs average power 425 | myC = NULL # coeffs of objective function 426 | # A1: coeffs of power per omic 427 | # A3: sum(Zij) = 1 for each omic i 428 | A1 = A3 = matrix(0, nrow = K, ncol = num.var) 429 | # A2: coeffs for average power for all omics 430 | A2 = NULL 431 | 432 | for (k in 1:K) { 433 | # coef.power --> A1, A2 434 | # coef1 --> A3 435 | coef.power = coef1 = rep(0, num.var) 436 | 437 | for (i in 2:max.size[k]) { 438 | myC = c(myC, cost[k]*i*2) # coefficients of objective function 439 | 440 | # power of each (omic, sample size) 441 | my.power = getPower(parameters[[k]], power = NULL, n = i, fdr = fdr, 442 | null.effect = null.effect, max.n = max.size) 443 | 444 | # coeff average power 445 | A2 = c(A2, my.power) 446 | 447 | if (k == 1) { 448 | coef.power[i-1] = my.power 449 | coef1[i-1] = 1 450 | } 451 | 452 | if (k > 1) { 453 | coef.power[i-k+sum(max.size[1:(k-1)])] = my.power 454 | coef1[i-k+sum(max.size[1:(k-1)])] = 1 455 | } 456 | 457 | } 458 | A1[k,] = coef.power 459 | A3[k,] = coef1 460 | } 461 | 462 | A2 = matrix(A2, nrow = 1, byrow = TRUE) 463 | 464 | myA = rbind(A1, A2, A3) 465 | 466 | mydir = c(rep(">=", K+1), rep("=", K)) 467 | 468 | myRHS = c(omicPower, K*averagePower, rep(1,K)) 469 | 470 | 471 | #### SOLUTION OF THE PROBLEM 472 | mysol = lp(direction = "min", objective.in = myC, const.mat = myA, 473 | const.dir = mydir, const.rhs = myRHS, all.int = TRUE, all.bin = TRUE) 474 | 475 | if (mysol$status == 2) { 476 | 477 | stop("No feasible solution was found for these requirements.") 478 | 479 | } else { 480 | 481 | myvars = unlist(sapply(1:K, function (k) paste(names(parameters)[k], 482 | 2:max.size[k], sep = "="))) 483 | mysolution = myvars[which(mysol$solution == 1)] 484 | mysolution = as.data.frame(do.call("rbind", strsplit(mysolution, "=")), 485 | stringsAsFactors = FALSE) 486 | colnames(mysolution) = c("Omic", "SampleSize") 487 | mysolution = data.frame(mysolution, "OmicCost" = cost*as.numeric(mysolution[,2])*2, 488 | "Power" = diag(A1[,mysol$solution == 1])) 489 | 490 | return(mysolution) 491 | 492 | } 493 | 494 | } 495 | 496 | 497 | 498 | 499 | 500 | 501 | # Wrapper function: MULTIPOWER -------------------------------------------- 502 | 503 | MultiPower = function(data, groups, type, omicPower = 0.6, averagePower = 0.85, 504 | fdr = 0.05, cost = 1, equalSize = TRUE, max.size = 200, omicCol = NULL, 505 | powerPlots = TRUE, null.effect = 0) { 506 | 507 | parameters = lapply(1:length(data), function (i) { 508 | cat(paste0("Estimating parameters for omic: ", names(data)[i], " \n")) 509 | paramEst(data[[i]], groups[[i]], type[i])}) 
510 | names(parameters) = names(data) 511 | 512 | cat("Computing optimal sample size... \n") 513 | optimalSampleSize = optimalRep(parameters, omicPower, averagePower, fdr, cost, 514 | equalSize, max.size = max.size, null.effect) 515 | 516 | resum = powerSummary(parameters, optimalSampleSize) 517 | 518 | if (powerPlots) { 519 | cat("Generating power plots... \n") 520 | data2plot = powerPlot(parameters, optimalSampleSize, omicCol) 521 | } else { data2plot = NULL } 522 | 523 | return(list("parameters" = parameters, 524 | "optimalSampleSize" = optimalSampleSize, 525 | "summary" = resum, 526 | "data2plot" = data2plot)) 527 | 528 | } 529 | 530 | 531 | 532 | 533 | 534 | # postMultiPower ---------------------------------------------------------- 535 | 536 | postMultiPower = function(optResults, max.size = 5, omicCol = NULL) { 537 | 538 | omics = names(optResults$parameters) 539 | 540 | equalSS = TRUE 541 | if (sum(is.na(optResults$summary$minSampleSize)) == nrow(optResults$summary)) equalSS = FALSE 542 | 543 | maxD = round(min(optResults$summary$maxCohenD),1) 544 | maxD = min(c(maxD, 4)) 545 | lasD = seq(0,maxD-0.1,0.1) 546 | 547 | mySize = myPower = myM = matrix(NA, ncol = length(omics), nrow = length(lasD)) 548 | colnames(mySize) = colnames(myPower) = colnames(myM) = omics 549 | rownames(mySize) = rownames(myPower) = rownames(myM) = paste0("d=", lasD) 550 | 551 | mySize[1,] = optResults$summary$optSampleSize 552 | myPower[1,] = optResults$summary$power 553 | myM[1,] = sapply(optResults$parameters, function (x) x$m) 554 | 555 | parameters2 = optResults$parameters 556 | 557 | for (i in 2:nrow(mySize)) { 558 | for (j in 1:ncol(mySize)) { 559 | selefeat = which(parameters2[[j]]$d >= lasD[i]) 560 | parameters2[[j]]$d = parameters2[[j]]$d[selefeat] 561 | 562 | if (parameters2[[j]]$type == 1) { 563 | parameters2[[j]]$logFC = parameters2[[j]]$logFC[selefeat] 564 | parameters2[[j]]$pooledSD = parameters2[[j]]$pooledSD[selefeat] 565 | parameters2[[j]]$CV = parameters2[[j]]$CV[selefeat] 566 | parameters2[[j]]$delta = parameters2[[j]]$delta[selefeat] 567 | parameters2[[j]]$mu = parameters2[[j]]$mu[selefeat] 568 | } 569 | 570 | if (parameters2[[j]]$type == 2) { 571 | parameters2[[j]]$pooledSD = parameters2[[j]]$pooledSD[selefeat] 572 | parameters2[[j]]$delta = parameters2[[j]]$delta[selefeat] 573 | } 574 | 575 | if (parameters2[[j]]$type == 3) { 576 | parameters2[[j]]$p1_p2 = parameters2[[j]]$p1_p2[selefeat] 577 | parameters2[[j]]$p1 = parameters2[[j]]$p1[selefeat] 578 | } 579 | 580 | } 581 | 582 | tmp = optimalRep(parameters2, omicPower = optResults$summary$minPower, 583 | averagePower = optResults$summary$averPower[1], 584 | fdr = optResults$optimalSampleSize$fdr, cost = optResults$summary$cost, 585 | equalSize = equalSS, max.size = max(optResults$summary$optSampleSize, na.rm = TRUE)) 586 | 587 | mySize[i,] = tmp$n 588 | myPower[i,] = tmp$finalPower 589 | myM[i,] = sapply(parameters2, function (x) length(x$d)) 590 | } 591 | 592 | # Post-results 593 | myresult = list("SampleSize" = mySize, "Power" = myPower, "NumFeat" = myM, "d" = lasD) 594 | 595 | # Plot 596 | postPowerPlot(postResults = myresult, equalSize = equalSS, omicCol = omicCol, max.size = max.size) 597 | 598 | return(myresult) 599 | 600 | } 601 | 602 | 603 | 604 | 605 | # postPowerPlot ----------------------------------------------------------- 606 | 607 | postPowerPlot = function(postResults, equalSize, omicCol = NULL, max.size = 10) { 608 | 609 | if (is.null(omicCol)) { 610 | omicCol = colors()[c(554,89,111,512,17,586,132,428,601,568,86,390)] 
611 | omicCol = omicCol[1:nrow(postResults$SampleSize)] 612 | } 613 | names(omicCol) = colnames(postResults$SampleSize) 614 | 615 | if (min(postResults$SampleSize) > max.size) { 616 | cat(paste0("The chosen sample size of ", max.size, " is not feasible. \n")) 617 | max.size = min(postResults$SampleSize) 618 | cat(paste0("A sample size of ", max.size, " will be plotted instead. \n")) 619 | } 620 | 621 | if (equalSize) { 622 | 623 | fff = approxfun(postResults$SampleSize[,1], postResults$d) 624 | myD = round(fff(max.size),1) 625 | 626 | cat(paste0("For having a sample size of ", max.size, " and maintain the desired power, you need to remove features with Cohen's d below ", myD, ". \n")) 627 | cat("The number of remaining features in each omic is: \n") 628 | print(postResults$NumFeat[paste0("d=",myD),]) 629 | 630 | plot(postResults$d, postResults$SampleSize[,1], type = "l", lwd = 2, xlab = "Cohen's d cutoff", 631 | ylab = "Number of replicates", main = "Sample size vs Cohen's d", 632 | ylim = c(2, max(postResults$SampleSize))) 633 | arrows(x0 = 0, y0 = max.size, x1 = myD, y1 = max.size, lty = 2, col = 2) 634 | arrows(x0 = myD, y0 = max.size, x1 = myD, y1 = 2, lty = 2, col = 2) 635 | text(min(postResults$d)+1, max(postResults$SampleSize, na.rm = TRUE)-2, paste0("Cohen's d = ", myD), col = 2) 636 | 637 | } else { 638 | 639 | fff = sapply(1:ncol(postResults$SampleSize), function (i) { fff = approxfun(postResults$SampleSize[,i], postResults$d) 640 | return(fff(max.size)) }) 641 | myD = round(max(fff, na.rm = TRUE),2) 642 | 643 | matplot(postResults$d, postResults$SampleSize, type = "l", lwd = 2, col = omicCol, lty = 1, 644 | xlab = "Cohen's d", ylab = "Number of replicates", main = "Sample size vs Cohen's d", 645 | ylim = c(2, max(postResults$SampleSize))) 646 | legend("topright", colnames(postResults$SampleSize), lwd = 2, col = omicCol, bty = "n") 647 | arrows(x0 = 0, y0 = max.size, x1 = myD, y1 = max.size, lty = 2, col = 1) 648 | arrows(x0 = myD, y0 = max.size, x1 = myD, y1 = 2, lty = 2, col = 1) 649 | text(mean(postResults$d), max(postResults$SampleSize, na.rm = TRUE), 650 | paste0("Cohen's d = ", myD), col = 1, adj = 0.5) 651 | } 652 | 653 | mypar = par() 654 | suppressWarnings(par(xpd = TRUE, mar = c(6.2,4,3,0.8))) 655 | barplot(postResults$Power[c(1,min(which(postResults$d >= myD))),], col = rep(omicCol, each = 2), 656 | beside = TRUE, las = 2, ylab = "Statistical power", ylim = c(0,1), 657 | border = rep(omicCol, each = 2), density = rep(c(30,100), ncol(postResults$Power))) 658 | legend(x = 0.5, y = 1.2, c("Optimal SS", "User's SS"), col = 1, density = c(30,100), ncol = 2, bty = "n") 659 | suppressWarnings(par(mypar)) 660 | 661 | } 662 | 663 | 664 | 665 | 666 | 667 | # Wrapper function MultiGroupPower --------------------------------------------------------- 668 | 669 | MultiGroupPower = function(data, groups, type, comparisons = NULL, 670 | omicPower = 0.6, averagePower = 0.85, 671 | fdr = 0.05, cost = 1, equalSize = TRUE, max.size = 200, omicCol = NULL, 672 | powerPlots = FALSE, summaryPlot = TRUE) { 673 | 674 | grupsComuns = Reduce(intersect, groups) 675 | 676 | if (is.null(comparisons)) { # Generating all possible comparisons 677 | 678 | comparisons = combn(grupsComuns, m = 2) 679 | 680 | } else { # Checking that required comparisons are possible for all omics 681 | 682 | grupsComparats = unique(as.vector(comparisons)) 683 | 684 | if (!setequal(grupsComparats, grupsComuns)) stop("Groups to be compared are not available for all omics.") 685 | } 686 | 687 | nomsCompa = apply(comparisons, 
2, paste, collapse = "_") 688 | 689 | output = vector("list", length = length(nomsCompa)); names(output) = nomsCompa 690 | 691 | for (i in 1:length(output)) { 692 | 693 | cat(nomsCompa[i], sep = "\n") 694 | 695 | quines = lapply(1:length(data), function (j) which(is.element(groups[[j]], comparisons[,i]))) 696 | data2 = lapply(1:length(data), function (j) data[[j]][,quines[[j]]]); names(data2) = names(data) 697 | groups2 = lapply(1:length(data), function (j) groups[[j]][quines[[j]]]); names(groups2) = names(groups) 698 | 699 | output[[i]] = MultiPower(data = data2, groups = groups2, type, omicPower, averagePower, 700 | fdr, cost, equalSize, max.size, omicCol, powerPlots) 701 | } 702 | 703 | output$GlobalSummary = output[[1]]$summary 704 | 705 | output$GlobalSummary[,"minSampleSize"] = apply(sapply(output[-length(output)], function(x) x$summary[,"minSampleSize"]), 1, 706 | function (y) { 707 | if (length(unique(y)) == 1) return(unique(y)) 708 | if (length(unique(y)) != 1) return(paste(range(y, na.rm = T), collapse = "-"))}) 709 | 710 | output$GlobalSummary[,"optSampleSize"] = apply(sapply(output[-length(output)], function(x) x$summary[,"optSampleSize"]), 711 | 1, max, na.rm = TRUE) 712 | 713 | tmpSS = sapply(output[-length(output)], function(x) x$summary[,"optSampleSize"]) 714 | tmpPower = sapply(output[-length(output)], function(x) x$summary[,"power"]) 715 | output$GlobalSummary[,"power"] = sapply(1:nrow(tmpSS), function (i) tmpPower[i,which.max(tmpSS[i,])]) 716 | 717 | cat("=============================== \n") 718 | cat("Global summary \n") 719 | cat("=============================== \n") 720 | print(output$GlobalSummary) 721 | 722 | 723 | if (summaryPlot) MultiCompaPlot(multiOutput = output, omicCol = omicCol, equalSize = equalSize, 724 | legendLoc = "bottom") 725 | 726 | return(output) 727 | 728 | } 729 | 730 | 731 | 732 | # Plot for multiple comparisons ------------------------------------------- 733 | 734 | MultiCompaPlot = function(multiOutput, omicCol = NULL, equalSize, legendLoc = "bottomright") { 735 | 736 | if (is.null(omicCol)) { 737 | if (length(multiOutput[[1]]$parameters) > 12) { 738 | stop("Too many omics to be plotted. 
\n") 739 | } 740 | omicCol = colors()[c(554,89,111,512,17,586,132,428,601,568,86,390)] 741 | omicCol = omicCol[1:length(multiOutput[[1]]$parameters)] 742 | } 743 | 744 | omicShape = 1:length(multiOutput[[1]]$parameters) 745 | names(omicCol) = names(omicShape) = names(multiOutput[[1]]$parameters) 746 | 747 | 748 | ## Comparisons versus Sample Size 749 | if (equalSize) { 750 | apintar = sapply(multiOutput[-length(multiOutput)], function (x) x$summary[,"minSampleSize"]) 751 | rownames(apintar) = names(multiOutput[[1]]$parameters) 752 | laopt = multiOutput$GlobalSummary[1,"optSampleSize"] 753 | optComp = sapply(multiOutput[-length(multiOutput)], function (x) x$summary[1,"optSampleSize"]) 754 | 755 | matplot(1:ncol(apintar), t(apintar), type = "l", lwd = 2, xlab = "Comparisons", ylab = "Sample Size", 756 | main = "Sample Size per Comparison", col = omicCol, lty = omicShape, xaxt='n') 757 | axis(side = 1, at=1:ncol(apintar), labels=colnames(apintar)) 758 | abline(h = laopt, lty = 1, lwd = 4) 759 | points(1:ncol(apintar), optComp, pch = 15, col = 1, cex = 1.3) 760 | legend(legendLoc, c(rownames(apintar), " ", "Comparison Optimal SS", "Global Optimal SS"), lwd = 2, col = c(omicCol,"white", 1,1), 761 | lty = c(omicShape,1,NA,1), bty = "n", pch = c(rep(NA, nrow(apintar)+1), 15, NA), title = "Min SS per comparison", cex = 0.8) 762 | 763 | } else { 764 | apintar = sapply(multiOutput[-length(multiOutput)], function (x) x$summary[,"optSampleSize"]) 765 | rownames(apintar) = names(multiOutput[[1]]$parameters) 766 | laOpt = multiOutput$GlobalSummary[,"optSampleSize"] 767 | apintar = data.frame(apintar, "Optimal" = laOpt, check.names = FALSE) 768 | 769 | matplot(1:ncol(apintar), t(apintar), type = "l", lwd = 2, xlab = "Comparisons", ylab = "Sample Size", 770 | main = "Sample Size per Comparison", col = omicCol, lty = omicShape, xaxt='n') 771 | axis(side = 1, at=1:ncol(apintar), labels=colnames(apintar)) 772 | points(rep(ncol(apintar), nrow(apintar)), laOpt, pch = 15, col = omicCol, cex = 1.3) 773 | legend(legendLoc, rownames(apintar), lwd = 2, col = omicCol, cex = 0.8, 774 | lty = omicShape, bty = "n") 775 | 776 | } 777 | 778 | 779 | ## Comparisons vs Power 780 | apintar = sapply(multiOutput[-length(multiOutput)], function (x) x$summary[,"power"]) 781 | rownames(apintar) = names(multiOutput[[1]]$parameters) 782 | optPow = multiOutput$GlobalSummary[,"power"] 783 | apintar = data.frame(apintar, "Optimal" = optPow, check.names = FALSE) 784 | 785 | matplot(1:ncol(apintar), t(apintar), type = "l", lwd = 2, xlab = "Comparisons", ylab = "Statistical Power", 786 | main = "Power per Comparison at Optimal SS", col = omicCol, lty = omicShape, xaxt='n') 787 | axis(side = 1, at=1:ncol(apintar), labels=colnames(apintar)) 788 | points(rep(ncol(apintar), nrow(apintar)), optPow, pch = 15, col = omicCol, cex = 1.3) 789 | legend(legendLoc, rownames(apintar), lwd = 2, col = omicCol, 790 | lty = omicShape, bty = "n", cex = 0.8) 791 | 792 | } 793 | 794 | 795 | 796 | 797 | 798 | # Filtering data with Cohen's d cutoff ------------------------------------ 799 | 800 | CohenFilter = function (data, d, parameters) { 801 | 802 | if (length(d) == 1) d = rep(d, length(data)) 803 | if (length(d) != length(data)) { 804 | stop("Please, provide a single value for d or as many values a omic data types in data") 805 | } 806 | 807 | dataF = data 808 | 809 | for (i in 1:length(data)) { 810 | quitar = which(parameters[[i]]$d < d[i]) 811 | dataF[[i]] = dataF[[i]][-quitar,] 812 | } 813 | 814 | return(dataF) 815 | 816 | } 817 | 
-------------------------------------------------------------------------------- /MultiML/FoMPredictiveFunctions.R: -------------------------------------------------------------------------------- 1 | ########################################################### 2 | ############ FoMPredictiveFunctions.R ############# 3 | ########################################################### 4 | # Author: Leandro Balzano-Nogueira 5 | # Genetics Institute, University of Florida (Gainesville) 6 | # Last update: October/09/2019 7 | 8 | # This script is to gather together the functions used for the MultiML predictive analysis 9 | 10 | ########################################################### 11 | # Functions: 12 | 13 | permutations <- function(n){ 14 | # Internal checkpoint function to create permutations to evaluate 15 | # if all samples included are the same, and if they are in the 16 | # same order so they can be compared across all Omics datasets 17 | 18 | if(n==1){ 19 | return(matrix(1)) 20 | } else { 21 | sp <- permutations(n-1) 22 | p <- nrow(sp) 23 | A <- matrix(nrow=n*p,ncol=n) 24 | for(i in 1:n){ 25 | A[(i-1)*p+1:p,] <- cbind(i,sp+(sp>=i)) 26 | } 27 | return(A) 28 | } 29 | } 30 | 31 | ########################################################### 32 | 33 | g_legend <- function(a.gplot){ 34 | # Function to break ggplots legends and reconstruct them somewhere else 35 | tmp <- ggplot_gtable(ggplot_build(a.gplot)) 36 | leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box") 37 | legend <- tmp$grobs[[leg]] 38 | return(legend) 39 | } 40 | ########################################################### 41 | 42 | CombinationsFunction <- function (OmicsData, AnalysisType="complete", levels=NULL) { 43 | # Function to create the combinations of datasets to be studied 44 | # Input : 45 | # OmicsData: The data as list of predictor datasets including also the response (Y) matrix as the last element of the list 46 | # AnalysisType: The type of analysis, it can be 47 | # "exploratory", where all predictors vs Y along with each dataset of predictors vs Y are evaluated 48 | # "complete", where all combinations are created or 49 | # "detailed", where the user can choose by levels indicating if two datasets vs Y or three vs Y is required 50 | # levels: A vector indicating the levels to be analyzed if Analysis Type is detailed, if 2, then two predictors datasets will be contrasted against Y and so on 51 | 52 | # Output: 53 | # A list of two lists: 54 | # Combinations: A list of the combinations required by the user to be created 55 | # Data: All required lists of datasets to be used as input for the error rate analysis 56 | 57 | vectito<-names(OmicsData)[1:length(OmicsData)-1] 58 | Combinatory<-list() 59 | combita<-combitachica<-NULL 60 | 61 | for (factores in 1:length (vectito)) { 62 | namecito<-paste(factores,"Omics", sep = " ") 63 | #print(namecito) 64 | Secuencia<-seq (1:factores) 65 | combita<-combn(x=vectito, m=factores) 66 | 67 | for (om in 1:ncol(combita)){ 68 | namecitoChico<-paste(namecito,om, sep="") 69 | combitachica<-combita[,om] 70 | Combinatory[namecitoChico]<-list(combitachica) 71 | 72 | } 73 | } 74 | Combinations<-list() 75 | if (AnalysisType=="exploratory") { 76 | Combinations<-Combinatory[c(1:length(OmicsData)-1,(length(Combinatory)))] 77 | print(paste("Performing the analysis of",length(Combinations),"out of",length(Combinatory) ,"possible combinations", sep=" " ) ) 78 | } 79 | if (AnalysisType=="complete") { 80 | Combinations<-Combinatory 81 | print(paste("Performing the analysis with 
all",length(Combinatory) ,"possible combinations", sep=" " ) ) 82 | } 83 | if (AnalysisType=="detailed") { 84 | if (is.null(levels) ) { 85 | stop("If AnalysisType = detailed, you must indicate the levels") 86 | } 87 | SelList<-list() 88 | for (lev in 1:length(levels)) { 89 | levs=levels[lev] 90 | for (All in 1:length(Combinatory)) { 91 | if(length(Combinatory[[All]])==levs) { 92 | neiminho<-names(Combinatory[All]) 93 | SelList<-Combinatory[[All]] 94 | Combinations[neiminho]<-list(SelList) 95 | } 96 | } 97 | } 98 | print(paste("Performing the analysis of",length(Combinations),"out of",length(Combinatory) ,"possible combinations", sep=" " ) ) 99 | } 100 | ListofOmics<-OmicList<-list() 101 | for (branch in 1:length(Combinations)) { 102 | piece<-Combinations[[branch]] 103 | NameOmicsData<-paste0(piece, collapse = "_" ) 104 | OmicsData2<-OmicsData[c(piece,names(OmicsData[length(OmicsData)]) )] 105 | OmicList<-list(OmicsData2) 106 | ListofOmics[NameOmicsData]<-OmicList 107 | } 108 | res<-list(Combinations=Combinations, Data=ListofOmics) 109 | return(res) 110 | } 111 | 112 | ########################################################### 113 | SlurmFunction <- function (X) { 114 | # Function that will be included in the cluster for the required analysis 115 | # Input: 116 | # X: A list of datasets created in "CombinationsFunction" 117 | # comps: Number of componets to be calculated after each iteration through "ClassificationErrorRate" function 118 | # CrosVal: Type of cross validation to be applied in "ClassificationErrorRate" function, Leave-One-Out (LOOCV) or ten fold change (TenF) 119 | # ticks: Number of segments (groups) of samples to evaluate. 120 | # iterations: Number of iterations in which error rate will be calculated 121 | 122 | ER_Calculator(Predictors=X, Response=length(X),Previous_CER=NULL,Ticks=NULL,WhichTicks=NULL,Function=Random.Forest.MP,Comps=10,crosval = "LOOCV",ERIterations=2,LassoIterations=2,TheoreticER=0.02) 123 | } 124 | 125 | ########################################################### 126 | Slurm_Creator<-function (Function, Parameters, jobname = NA, nodes = 2, cpus_per_node = 2, time=30, 127 | pkgs = rev(.packages()) ) 128 | { 129 | # Function to create the files to be uploaded into a cluster to calculate the results of all particular combinations of predictor datasets. 130 | # Input: 131 | # Function: An object class SlurmFunction that will calculate the error rates 132 | # Parameters: All required lists of datasets to be used as input for the error rate calculation 133 | # jobname: The desired name of the Slurm job. If NA, a random name of the form "slr####" will be assigned 134 | # nodes: The maximum number of nodes to be used in the cluster. 135 | # cpus_per_node: The number of CPUs per node in the cluster. It determines how many processes are run in parallel per node 136 | # pkgs: A character vector containing the names of packages that must be loaded on each cluster node. By default, it includes all packages loaded by the user when slurm_apply is called. 
137 | # time: Time in days to run the data in the cluster 138 | 139 | # Output: 140 | # Four elements to be uploaded into cluster: 141 | # Function.RDS: A .RDS file with the Slurm Function that will be calculated 142 | # Parameters.RDS: A .RDS file with all lists of datasets that will be used for the error rate calculation 143 | # SlurmCreator_Run.R: A .R file with the parameters to run the job 144 | # SlurmCreator_submit_sh: A .sh file that indicates all parameters that the cluster requires to perform the job 145 | 146 | if (!is.function(Function)) { 147 | stop("first argument to slurm_apply should be a function") 148 | } 149 | 150 | if (!is.numeric(nodes) || length(nodes) != 1) { 151 | stop("nodes should be a single number") 152 | } 153 | if (!is.numeric(cpus_per_node) || length(cpus_per_node) != 154 | 1) { 155 | stop("cpus_per_node should be a single number") 156 | } 157 | tmpdir <- paste0("rslurm_", jobname) 158 | dir.create(tmpdir, showWarnings = FALSE) 159 | saveRDS(Parameters, file = file.path(tmpdir, "Parameters.RDS")) 160 | saveRDS(Function, file = file.path(tmpdir, "Function.RDS")) 161 | if (length(Parameters) < cpus_per_node * nodes) { 162 | nchunk <- cpus_per_node 163 | } 164 | else { 165 | nchunk <- ceiling(length(Parameters)/nodes) 166 | } 167 | nodes <- ceiling(length(Parameters)/nchunk) 168 | template_r <- readLines("templates/Slurm_run_R.txt") 169 | script_r <- whisker::whisker.render(template_r, list(pkgs = pkgs, 170 | nchunk = nchunk, cpus_per_node = cpus_per_node)) 171 | writeLines(script_r, file.path(tmpdir, "SlurmCreator_Run.R")) 172 | template_sh <- readLines("templates/submit_sh.txt") 173 | 174 | rscript_path <- file.path(R.home("bin"), "Rscript") 175 | script_sh <- whisker::whisker.render(template_sh, list(max_node = nodes - 1, 176 | jobname = jobname, 177 | time=time, 178 | rscript = rscript_path)) 179 | writeLines(script_sh, file.path(tmpdir, "SlurmCreator_submit_sh")) 180 | slurm_job(jobname, nodes) 181 | } 182 | 183 | ########################################################### 184 | ClassificationErrorRate<- function (Predictors, Response=length(Predictors),Function=Random.Forest.MP,Comps=10,crosval = "LOOCV",Ticks=10,WhichTicks=NULL,ERIterations=15,LassoIterations=15,ErrorRateType="ER",...) { 185 | # Function to evaluate through Error rate the predictive capability of the Multipower package 186 | # Input: 187 | # Predictors: A list of different Omics Datasets and the Response matrix 188 | # Response: A number indicating the response matrix included in Predictors 189 | # Comps: Number of componets to be calculated after each iteration 190 | # crosval: Type of cross validation to be applied, Leave-One-Out (LOOCV) or ten fold change (TenF) 191 | # Ticks: Number of segments (groups) of samples to evaluate. 192 | # ERIterations: Number of iterations in which ER will be calculated 193 | # LassoIterations: Number of iterations of the Lasso selection per each Error rate analysis 194 | 195 | # Output: 196 | # Omics: A vector of the evaluated Omics 197 | # A list of two lists: 198 | # Minimums: A list of the minimum value of error rate, balanced (BER) or not (ER) obtained per each ten 199 | # component analysis. This is measured through three distance metrics to evaluate the 200 | # classification performance of the model. 
Maximal distance (max.dist), distance to 201 | # centroids (centroids) or Mahalanobis distance (Mahalanobis) 202 | # Thus, each table contains the results per each iteration at different subsets of samples 203 | 204 | # CompWinner: A list of the number of components in which the minimum value of error rate, 205 | # balanced (BER) or not (ER) was obtainde per each iteration. This is measured through 206 | # the three mentioned distance metrics to evaluate the classification performance 207 | # of the model. Thus, each table contains the components per each iteration at different 208 | # subsets of samples 209 | 210 | ########## 211 | 212 | components=Comps 213 | 214 | if (! is.list(Predictors) ) { 215 | stop("\\nOmics dataset must be a list with at least two elements") 216 | } 217 | ########### 218 | # Y 219 | Y<-t(as.matrix(Predictors[[Response]], drop=FALSE)) 220 | if (is.character(Y[,1])) { 221 | Ycita<-transform(Y, Type = as.numeric(Type)) 222 | Y<-Ycita 223 | rm(Ycita) 224 | } 225 | if (ncol(Y) != 1) { 226 | stop("\\nResponse must be a single variable") 227 | } 228 | if (any(is.na(Y))) { 229 | stop("\\nResponse must not contain missing values") 230 | } 231 | if (is.null(colnames(Y))) { 232 | colnames(Y) = "Y" 233 | } 234 | if (is.null(rownames(Y))) { 235 | rownames(Y) = 1:n 236 | } 237 | 238 | # Step1: Match the sample size 239 | LosIndivs<- rownames(Y) 240 | for (i in 1:length(Predictors)){ 241 | LosIndivs = intersect(LosIndivs, colnames(Predictors[[i]])) 242 | } 243 | #print(paste("This analysis will be performed with",length(LosIndivs),"samples, since those are the ones repeated in all layers")) 244 | 245 | NewList<-Predictors 246 | 247 | for (i in 1:length(NewList)) { 248 | NewList[[i]]<-NewList[[i]][,colnames(NewList[[i]]) %in% LosIndivs] 249 | } 250 | 251 | ########### 252 | # Step2: Match the order 253 | LosColnames<-colnames(NewList[[1]]) 254 | 255 | for (i in 1:length(NewList)) { 256 | NewList[[i]]<-NewList[[i]][,sort(LosColnames) ] 257 | } 258 | 259 | TestdeMatchTable<-unique(matrix(permutations(length(NewList)),ncol=2)) 260 | 261 | for (i in 1:nrow(TestdeMatchTable)) { 262 | a=TestdeMatchTable[i,1] 263 | b=TestdeMatchTable[i,2] 264 | 265 | if (all(colnames(NewList[[a]])==colnames(NewList[[b]]))){ 266 | #print(paste("Columns of lists",a,"and",b,"are equally sorted")) 267 | } else{ 268 | #print("The colnames are not equally sorted") 269 | } 270 | } 271 | if (crosval=="LOOCV"){ 272 | valid="loo" 273 | } 274 | if (crosval=="TenF"){ 275 | valid="Mfold" 276 | } 277 | 278 | # Y 279 | Y<-t(as.matrix(NewList[[Response]], drop=FALSE)) 280 | if (is.character(Y[,1])) { 281 | Ycita<-transform(Y, Type = as.numeric(Type)) 282 | Y<-Ycita 283 | rm(Ycita) 284 | } 285 | 286 | ######### 287 | Omics<-data.frame(Omics=names(NewList[-Response])) 288 | 289 | MinN<-round(length(table(as.factor(Y[,1]))) * 2, digits = 0) 290 | MaxN<-nrow(Y) 291 | Ngroups<-length(table(as.factor(Y[,1]))) 292 | if(!is.null(Ticks)) { 293 | vectTicks<-round(seq(from=MinN,to = MaxN,length.out = Ticks),digits = 0) 294 | } else { 295 | if (is.null(WhichTicks)) { 296 | print("Please insert which ticks you want to calculate") 297 | } else { 298 | vectTicks<-WhichTicks 299 | } 300 | } 301 | muestra<-seq(1:length(vectTicks)) 302 | 303 | muestrita<-muestritaLASSO<-NULL 304 | Muestra<-muestrasas<-list() 305 | MuestraLASSO<-muestrasasLASSO<-list() 306 | Minimums<-minimossas<-list() 307 | winnerSas<-CompWinner<-list() 308 | 309 | for (i in 1:length(vectTicks)) { 310 | 311 | #print(paste("Tick",i)) 312 | print(paste("Processing your 
data... Please wait")) 313 | 314 | Mins<-ComponentWinner<-preMins<-NULL 315 | Minsbigrfc<-NULL 316 | TableMins<-NULL 317 | 318 | for (iter in 1:ERIterations){ 319 | iter=iter 320 | #print(paste("performing iteration ",iter," of tick ",i,sep="")) 321 | #print(paste("Tick ",i,": ", vectTicks[i], " samples", sep = "")) 322 | namecito<-paste(vectTicks[i], " samples", sep = "") 323 | resample <- TRUE 324 | index <- rownames(Y) 325 | fun <- function(x) sample(x, round((vectTicks[i])/Ngroups,digits=0), replace = resample) 326 | a <- aggregate(index, by = list(group = Y[,1]), FUN = fun ) 327 | a<-a[,-1] 328 | aLASSO<-aggregate(index, by = list(group = Y[,1]), FUN = fun ) 329 | aLASSO<-aLASSO[,-1] 330 | Premuestrita<-as.vector(unlist(a)) 331 | PremuestritaLASSO<-as.vector(unlist(aLASSO)) 332 | if(vectTicks[i]>=length(Premuestrita)){ 333 | muestrita<-c(Premuestrita,sample(rownames(Y), size=vectTicks[i]-length(Premuestrita), replace = TRUE)) 334 | muestritaLASSO<-c(PremuestritaLASSO,sample(rownames(Y), size=vectTicks[i]-length(PremuestritaLASSO), replace = TRUE)) 335 | } else{ 336 | muestrita<-Premuestrita[1:(length(Premuestrita)-(length(Premuestrita)-vectTicks[i]))] 337 | muestritaLASSO<-PremuestritaLASSO[1:(length(PremuestritaLASSO)-(length(PremuestritaLASSO)-vectTicks[i]))] 338 | } 339 | muestrasas<-list(muestrita) 340 | muestrasasLASSO<-list(muestritaLASSO) 341 | Muestra[namecito]<-muestrasas 342 | MuestraLASSO[namecito]<-muestrasasLASSO 343 | ##### 344 | Xchica2<-NewList 345 | Xchica2LASSO<-NewList 346 | 347 | # Matching the order again 348 | LasMuestras<-Muestra[[i]] 349 | LasMuestrasLASSO<-MuestraLASSO[[i]] 350 | # Sorting the tables 351 | for (long in 1:length(Xchica2)) { 352 | Xchica2[[long]]<-Xchica2[[long]][,sort(LasMuestras) ] 353 | } 354 | for (long2 in 1:length(Xchica2LASSO)) { 355 | Xchica2LASSO[[long2]]<-Xchica2LASSO[[long2]][,sort(LasMuestrasLASSO) ] 356 | } 357 | 358 | TestdeMatchTable<-unique(matrix(permutations(length(Xchica2)),ncol=2)) 359 | for (lulu in 1:nrow(TestdeMatchTable)) { 360 | cola=TestdeMatchTable[lulu,1] 361 | colb=TestdeMatchTable[lulu,2] 362 | 363 | if (all(colnames(Xchica2[[cola]])==colnames(Xchica2[[colb]]))){ 364 | #print(paste("Columns of lists",cola,"and",colb,"are equally sorted")) 365 | } else{ 366 | print("Columns of lists",cola,"and",colb,"are NOT equally sorted") 367 | } 368 | } 369 | 370 | TestdeMatchTableLASSO<-unique(matrix(permutations(length(Xchica2LASSO)),ncol=2)) 371 | for (lulu in 1:nrow(TestdeMatchTableLASSO)) { 372 | cola=TestdeMatchTableLASSO[lulu,1] 373 | colb=TestdeMatchTableLASSO[lulu,2] 374 | 375 | if (all(colnames(Xchica2LASSO[[cola]])==colnames(Xchica2LASSO[[colb]]))){ 376 | #print(paste("Columns of lists",cola,"and",colb,"are equally sorted")) 377 | } else{ 378 | print("Columns of lists",cola,"and",colb,"are NOT equally sorted") 379 | } 380 | } 381 | # 382 | 383 | for (ii in 1:length(Xchica2)) { 384 | Xchica2[[ii]]<-Xchica2[[ii]][,Muestra[[i]] %in% colnames(Xchica2[[ii]])] 385 | } 386 | for (ii in 1:length(Xchica2LASSO)) { 387 | Xchica2LASSO[[ii]]<-Xchica2LASSO[[ii]][,MuestraLASSO[[i]] %in% colnames(Xchica2LASSO[[ii]])] 388 | } 389 | 390 | #### 391 | Xchica<-Xchica2[-Response] 392 | Ychica<-as.matrix(Xchica2[[Response]]) 393 | rm(Xchica2) 394 | if (is.character(Ychica[1,])) { 395 | Ycita<-as.numeric(as.factor(c(Ychica))) 396 | Ychica<-rbind(Ychica,as.numeric(Ycita)) 397 | } 398 | for (lili in 1:length(Xchica)) { 399 | Xchica[[lili]]<-t(Xchica[[lili]]) 400 | } 401 | 402 | XchicaLASSO<-Xchica2LASSO[-Response] 403 | 
YchicaLASSO<-as.matrix(Xchica2LASSO[[Response]]) 404 | rm(Xchica2LASSO) 405 | if (is.character(YchicaLASSO[1,])) { 406 | Ycita2<-as.numeric(as.factor(c(YchicaLASSO))) 407 | YchicaLASSO<-rbind(YchicaLASSO,as.numeric(Ycita2)) 408 | } 409 | for (lili in 1:length(XchicaLASSO)) { 410 | XchicaLASSO[[lili]]<-t(XchicaLASSO[[lili]]) 411 | } 412 | 413 | PreRes<-operator (X=Xchica,Y=Ychica,XLasso=XchicaLASSO, YLasso=YchicaLASSO, LassoIterations=LassoIterations, FUN=Function,Comps = components) 414 | 415 | if (length(PreRes)>1){ 416 | Mins<-as.matrix(cbind(Mins,PreRes$preMins)) 417 | ComponentWinner<-cbind(ComponentWinner,PreRes$ComponentWinner2) 418 | class(ComponentWinner)<-"numeric" 419 | } else { 420 | Mins<-cbind(Mins,PreRes) 421 | } 422 | } # Closing Iteration 423 | nameSample<-paste(vectTicks[i], " samples", sep = "") 424 | if (dim(Mins)[1]==1){ 425 | colnames(Mins)<-paste0("iter",seq(1:ERIterations));rownames(Mins)<-"OOB_ER" 426 | 427 | } 428 | minimossas<-list(Mins) 429 | Minimums[nameSample]<-minimossas 430 | winnerSas<-list(ComponentWinner) 431 | CompWinner[nameSample]<-winnerSas 432 | } 433 | ##### END of for (i in 1:length(vectTicks)) 434 | ListofMins<-Minimums 435 | SDsas<-StdDev<-list() 436 | for(lista in 1:length(ListofMins)){ 437 | Table<-ListofMins[[lista]] 438 | SDtita<-SDTota<-NULL 439 | for (row in 1:nrow(Table)){ 440 | SDtita<-sd(Table[row,]) /sqrt(ncol(Table)) 441 | SDTota<-rbind(SDTota,SDtita) 442 | } 443 | rownames(SDTota)<-rownames(ListofMins[[1]]) 444 | nameList<-names(ListofMins[lista]) 445 | SDsas<-list(SDTota) 446 | StdDev[nameList]<-SDsas 447 | 448 | } 449 | ### 450 | PreSDstable<-data.frame(matrix(data=NA,nrow=nrow(StdDev[[1]]),ncol = length(StdDev))) 451 | rownames(PreSDstable)<-rownames(StdDev[[1]]) 452 | colnames(PreSDstable)<-t(data.frame(strsplit(names(StdDev),split=" "))[1,])[,1] 453 | 454 | for (iiiii in 1: length(StdDev)) { 455 | PreSDstable[,iiiii]<-rowMeans(StdDev[[iiiii]]) 456 | } 457 | 458 | ### 459 | Premeanstable<-data.frame(matrix(data=NA,nrow=nrow(ListofMins[[1]]),ncol = length(ListofMins))) 460 | rownames(Premeanstable)<-rownames(ListofMins[[1]]) 461 | colnames(Premeanstable)<-t(data.frame(strsplit(names(ListofMins),split=" "))[1,])[,1] 462 | 463 | for (i in 1: length(ListofMins)) { 464 | Premeanstable[,i]<-rowMeans(ListofMins[[i]]) 465 | } 466 | 467 | ListofComps<-CompWinner 468 | if (is.null(ListofComps[[1]]) ){ 469 | PreCompstable<-NULL 470 | } else { 471 | ToNumListofComps2<-PreCompstabletita<-PreCompstable<-NULL 472 | for (i in 1: length(ListofComps)) { 473 | ToNumListofComps2<-ListofComps[[i]] 474 | class(ToNumListofComps2) <- "numeric" 475 | for (row in 1:nrow(ToNumListofComps2)){ 476 | PreCompstabletita<-rowMedians(ToNumListofComps2) 477 | } 478 | PreCompstable<-cbind(PreCompstable,PreCompstabletita) 479 | } 480 | rownames(PreCompstable)<-rownames(ListofComps[[1]]) 481 | colnames(PreCompstable)<-t(data.frame(strsplit(names(ListofComps),split=" "))[1,])[,1] 482 | PreCompstable<-as.data.frame(PreCompstable) 483 | } 484 | 485 | 486 | MeansTable2<-Premeanstable 487 | SDstable2<-PreSDstable 488 | Compstable2<-PreCompstable 489 | 490 | MeansTable2$Category <- rownames(MeansTable2) 491 | SDstable2$Category <- rownames(SDstable2) 492 | Compstable2$Category <- rownames(Compstable2) 493 | meltedMeans<-melt(MeansTable2, id.vars="Category") 494 | meltedSDs<-melt(SDstable2, id.vars="Category") 495 | colnames(meltedSDs)[3]<-"StdDev" 496 | if (is.null(Compstable2) ){ 497 | meltedComps<-NULL 498 | allmelted<-cbind(meltedMeans,StdDev=meltedSDs$StdDev) 499 | 
allmelted$ErrorRate<-t(data.frame(strsplit(allmelted$Category, split="_"))[2,])[,1] 500 | allmelted$Metric<-t(data.frame(strsplit(allmelted$Category, split="_"))[1,])[,1] 501 | } else { 502 | meltedComps<-melt(Compstable2, id.vars="Category") 503 | colnames(meltedComps)[3]<-"Comp" 504 | allmelted<-cbind(meltedMeans,StdDev=meltedSDs$StdDev,Comp=meltedComps$Comp) 505 | allmelted$ErrorRate<-t(data.frame(strsplit(allmelted$Category, split="_"))[2,])[,1] 506 | allmelted$Metric<-t(data.frame(strsplit(allmelted$Category, split="_"))[1,])[,1] 507 | } 508 | 509 | if (!is.null(Ticks)) { 510 | seqdeTicks<-as.numeric(as.character(unique(allmelted$variable))) 511 | MinN<-min(as.numeric(as.character(unique(allmelted$variable)))) 512 | MaxN<-max(as.numeric(as.character(unique(allmelted$variable)))) 513 | vectdeTicks<-round(seq(from=MinN,to = MaxN,length.out = Ticks),digits = 0) 514 | ElAusente<-vectdeTicks[!(vectdeTicks %in% seqdeTicks)] 515 | 516 | Elreemplazito<-Elreemplazo<-NULL 517 | for (ab in 1:length(ElAusente)){ 518 | Elreemplazito<-seqdeTicks[which.min(abs(seqdeTicks - ElAusente[ab])) ] 519 | Elreemplazo<-c(Elreemplazo,Elreemplazito) 520 | } 521 | while (length(Elreemplazo[duplicated(Elreemplazo)])>0) { 522 | Elnuevoreemplazo<-seqdeTicks[seqdeTicks>Elreemplazo[duplicated(Elreemplazo)] & seqdeTicks< Elreemplazo[which(duplicated(Elreemplazo))+1] ] 523 | Elreemplazo<-sort(c(unique(Elreemplazo),Elnuevoreemplazo )) 524 | } 525 | 526 | vectdeTicks2<-vectdeTicks[(vectdeTicks %in% seqdeTicks)] 527 | NvectdeTicks<-sort(c(vectdeTicks2,Elreemplazo)) 528 | allmelted<-allmelted[allmelted$variable %in% NvectdeTicks,] 529 | } 530 | 531 | if (!is.null(WhichTicks)) { 532 | NvectdeTicks<-sort(c(WhichTicks)) 533 | } 534 | 535 | 536 | allmelted2<-allmelted 537 | ################################################## 538 | Metrica<-unique(allmelted2$Metric) 539 | errate<-unique(allmelted2$ErrorRate) 540 | allmelted3<-allmeltedsas<-list() 541 | CItita<-CItota<-CI<-NULL 542 | CL<-c(90,95,99) 543 | Z<-c(1.64,1.96, 2.58) 544 | CItita<-CItota<-NULL 545 | 546 | if (length(Metrica)==1){ 547 | allmelted3<-allmelted2[allmelted2$Metric==Metrica,] 548 | for (zeta in 1:length(Z)) { 549 | namecito=paste("CI_",CL[zeta], sep="") 550 | for (al in 1:dim(allmelted3)[1]) { 551 | M=allmelted3$value[al] 552 | SM=sqrt( (allmelted3$StdDev[al])^2/as.numeric(as.character(allmelted3$variable[al]) ) ) 553 | lower<-round(M-(Z[zeta]* SM),digits=3) 554 | upper<-round(M+(Z[zeta]* SM),digits=3) 555 | CItita<-cbind(namecito,paste("[",lower," , ",upper,"]", sep="") ) 556 | CItota<-rbind(CItota,CItita) 557 | } 558 | allmelted3[namecito]<-CItota[,2] 559 | CItita<-CItota<-NULL 560 | } 561 | 562 | } else { 563 | for(me in 1:length(Metrica) ) { 564 | tab<-allmelted2[allmelted2$Metric==Metrica[me],] 565 | 566 | for(eler in 1:length(errate)) { 567 | namecito<-paste(Metrica[me],errate[eler],sep="_") 568 | tab2<-tab[tab$ErrorRate==errate[eler],] 569 | allmeltedsas<-list(tab2) 570 | allmelted3[namecito]<-allmeltedsas 571 | } 572 | } 573 | for (uu in 1:length(allmelted3)){ 574 | for (zeta in 1:length(Z)) { 575 | namecito=paste("CI_",CL[zeta], sep="") 576 | for (al in 1:dim(allmelted3[[uu]])[1]) { 577 | M=allmelted3[[uu]]$value[al] 578 | SM=sqrt( (allmelted3[[uu]]$StdDev[al])^2/as.numeric(as.character(allmelted3[[uu]]$variable[al]) ) ) 579 | lower<-round(M-(Z[zeta]* SM),digits=3) 580 | upper<-round(M+(Z[zeta]* SM),digits=3) 581 | CItita<-cbind(namecito,paste("[",lower," , ",upper,"]", sep="") ) 582 | CItota<-rbind(CItota,CItita) 583 | } 584 | 
allmelted3[[uu]][namecito]<-CItota[,2] 585 | CItita<-CItota<-NULL 586 | } 587 | } 588 | } 589 | 590 | allmelted3 591 | ####### 592 | 593 | if (is.null(winnerSas[[1]]) ) { 594 | result<-list(TestedTicks=NvectdeTicks,Omics=Omics,Minimums=Minimums, 595 | TablebyTick=allmelted3) 596 | } else { 597 | result<-list(TestedTicks=NvectdeTicks,Omics=Omics,Minimums=Minimums,CompWinner=CompWinner , 598 | TablebyTick=allmelted3) 599 | } 600 | return(result) 601 | } 602 | 603 | 604 | ########################################################### 605 | operator<-function (X,Y, XLasso,YLasso,FUN,LassoIterations,...) { 606 | args<-list(...) 607 | Comps<-args$Comps 608 | iter<-args$iter 609 | FUN (X,Y,XLasso,YLasso,LassoIterations,...) 610 | } 611 | 612 | ########################################################### 613 | PLSDA.MP<-function (X,Y,XLasso, YLasso, LassoIterations,...) { 614 | args<-list(...) 615 | Comps<-args$Comps 616 | iter<-args$iter 617 | 618 | if (length(X) >1){ 619 | summary(X) 620 | ### LassoSelection ### 621 | SelVarstita<-SelVars<-NULL 622 | for (lst in 1:length(X)){ 623 | X[[lst]]<-scale(X[[lst]],center=TRUE, scale = FALSE) 624 | SelVars<-LassoSelection (X=XLasso[[lst]],Y=as.factor(YLasso[2,]), IterationsforVarSelections=LassoIterations) 625 | X[[lst]]<-X[[lst]][,colnames(X[[lst]]) %in% SelVars$SelectedVariables] 626 | } 627 | ###################### 628 | res2<- block.plsda(X,as.vector(Y[2,]), ncomp=Comps) 629 | perf.plsda <- try (perf(res2, validation = valid, folds = 5, 630 | progressBar = FALSE, auc = TRUE, nrepeat = 1) ,silent=TRUE) 631 | ## 632 | #TableMeans<-table<-NULL 633 | Mins<-ComponentWinner<-NULL 634 | if (class(perf.plsda) == "try-error") { 635 | preMins<-matrix(1,nrow=6,ncol=1) 636 | colnames(preMins)<-iter;rownames(preMins)<-c("max.dist_ER","max.dist_BER","centroids.dist_ER","centroids.dist_BER","mahalanobis.dist_ER","mahalanobis.dist_BER") 637 | ComponentWinner2<-matrix(1,nrow=6,ncol=1) 638 | colnames(ComponentWinner2)<-iter;rownames(ComponentWinner2)<-c("max.dist_ER","max.dist_BER","centroids.dist_ER","centroids.dist_BER","mahalanobis.dist_ER","mahalanobis.dist_BER") 639 | } else{ 640 | preMeans<-perf.plsda$WeightedVote.error.rate 641 | 642 | TableMeans<-table<-NULL 643 | for (p in 1: length (preMeans)){ 644 | namecito2<-names(preMeans[p]) 645 | table<-preMeans[[p]][grep(pattern = "Overall",x = rownames(preMeans[[p]])),] 646 | tipodeER<-gsub("Overall.", "", rownames(table)) 647 | rownames(table)<-paste(namecito2,tipodeER, sep="_") 648 | TableMeans<-rbind(TableMeans,table) 649 | } 650 | 651 | preMins<-as.matrix(apply(TableMeans, 1, FUN=min)) 652 | preComponentWinner<-data.frame(ComponentWinner=colnames(TableMeans)[apply(TableMeans,1,which.min)]) 653 | ComponentWinner2<-t(data.frame(strsplit(as.vector(preComponentWinner[,1]),split=" "))[2,]) 654 | rownames(ComponentWinner2)<-rownames(preMins);colnames(ComponentWinner2)<-iter 655 | } 656 | Mins<-as.matrix(cbind(Mins,preMins)) 657 | ComponentWinner<-cbind(ComponentWinner,ComponentWinner2) 658 | ComponentWinner2<-ComponentWinner 659 | class(ComponentWinner2)<-"numeric" 660 | 661 | ################################################ 662 | } else { 663 | ### LassoSelection ### 664 | SelVars<-LassoSelection (X=XLasso[[1]],Y=as.factor(YLasso[2,]), IterationsforVarSelections=LassoIterations) 665 | X[[1]]<-X[[1]][,colnames(X[[1]]) %in% SelVars$SelectedVariables] 666 | ###################### 667 | CompsNEW<-min( (nrow(X[[1]])-1), Comps ) 668 | res2<- plsda(X[[1]],as.factor(Y[2,]), ncomp=CompsNEW) 669 | perf.plsda <- try (perf(res2, 
validation = valid, folds = 5, 670 | progressBar = FALSE, auc = TRUE, nrepeat = 1) ,silent=TRUE) 671 | 672 | if (class(perf.plsda)[[1]] == "try-error") { 673 | preMins<-matrix(1,nrow=6,ncol=1) 674 | colnames(preMins)<-iter;rownames(preMins)<-c("max.dist_ER","max.dist_BER","centroids.dist_ER","centroids.dist_BER","mahalanobis.dist_ER","mahalanobis.dist_BER") 675 | ComponentWinner2<-matrix(1,nrow=6,ncol=1) 676 | colnames(ComponentWinner2)<-iter;rownames(ComponentWinner2)<-c("max.dist_ER","max.dist_BER","centroids.dist_ER","centroids.dist_BER","mahalanobis.dist_ER","mahalanobis.dist_BER") 677 | } else { 678 | preMeans<-perf.plsda$error.rate 679 | 680 | TableMeans<-table<-NULL 681 | for (p in 1: length (preMeans)){ 682 | namecito2<-names(preMeans[p]) 683 | if (namecito2=="overall"){ 684 | namecito2<-"ER" 685 | } 686 | table<-t(preMeans[[p]]) 687 | rownames(table)<-paste(rownames(table),namecito2, sep="_") 688 | TableMeans<-rbind(TableMeans,table) 689 | } 690 | preMins<-as.matrix(apply(TableMeans, 1, FUN=min)) 691 | preComponentWinner<-data.frame(ComponentWinner=colnames(TableMeans)[apply(TableMeans,1,which.min)]) 692 | ComponentWinner2<-t(data.frame(strsplit(as.vector(preComponentWinner[,1]),split=" "))[2,]) 693 | rownames(ComponentWinner2)<-rownames(preMins);colnames(ComponentWinner2)<-iter 694 | } 695 | 696 | 697 | } # Closing separation between block.plsda and plsda 698 | 699 | res=list(preMins=preMins,ComponentWinner2=ComponentWinner2 ) 700 | return (res) 701 | } 702 | 703 | ########################################################### 704 | Random.Forest.MP<-function (X,Y,XLasso,YLasso, LassoIterations,...) { 705 | args<-list(...) 706 | Comps<-args$Comps 707 | iter<-args$iter 708 | Xchicaflat2Lasso<-do.call(cbind.data.frame, XLasso) 709 | Xchicaflat2Lasso<-scale(Xchicaflat2Lasso,center=TRUE, scale = FALSE) 710 | names(Xchicaflat2Lasso) <- make.names(names(Xchicaflat2Lasso)) 711 | YYLasso<-as.data.frame(YLasso[1,]);colnames(YYLasso)<-"Y" 712 | XchicaflatLasso<-merge(YYLasso,Xchicaflat2Lasso,by=0 );rownames(XchicaflatLasso)<-XchicaflatLasso[,1];XchicaflatLasso<-XchicaflatLasso[,-1] 713 | names(XchicaflatLasso) <- make.names(names(XchicaflatLasso)) 714 | rm(Xchicaflat2Lasso,YYLasso) 715 | 716 | Xchicaflat2<-do.call(cbind.data.frame, X); dim(Xchicaflat2) 717 | Xchicaflat2<-scale(Xchicaflat2,center=TRUE, scale = FALSE) 718 | names(Xchicaflat2) <- make.names(names(Xchicaflat2)) 719 | YY<-as.data.frame(Y[1,]);colnames(YY)<-"Y" 720 | Xchicaflat<-merge(YY,Xchicaflat2,by=0 );rownames(Xchicaflat)<-Xchicaflat[,1];Xchicaflat<-Xchicaflat[,-1] 721 | names(Xchicaflat) <- make.names(names(Xchicaflat)) 722 | rm(Xchicaflat2,YY) 723 | 724 | ### LassoSelection ### 725 | SelVars<-LassoSelection (X=XchicaflatLasso[,-1],Y=XchicaflatLasso[,1], IterationsforVarSelections=LassoIterations) 726 | SelVars$SelectedVariables<-c("Y",SelVars$SelectedVariables) 727 | Xchicaflat<-Xchicaflat[,colnames(Xchicaflat) %in% SelVars$SelectedVariables]#; dim(Xchicaflat) 728 | set.seed(1) 729 | model <- randomForest(Y ~., data=Xchicaflat) 730 | 731 | preMins<-model$err.rate[500,1] 732 | rm(model) 733 | return(preMins) 734 | } 735 | ########################################################### 736 | 737 | LassoSelection<- function (X,Y, IterationsforVarSelections=15, ...){ 738 | # Input: 739 | # X: This is the table of predictor variables where n is individuals and p is variables 740 | # Y: This is the vector of response variables 741 | # IterationsforVarSelections: Number of iterations of the variable selection step 742 | 
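# Illustrative call (hypothetical objects: `expr` is a samples-by-features matrix and
# `group` a factor with the class labels; both names are placeholders):
#   sel <- LassoSelection(X = expr, Y = group, IterationsforVarSelections = 15)
#   head(sel$SelectedVariables)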
##################
743 |   require ("glmnet")
744 | 
745 |   X<-as.matrix(X)
746 |   alpha_val=1
747 |   SelVars<- SelectedVariables<-NULL
748 |   SelecVariablesalliterations<-NULL
749 |   CoefficientsVarstita<-CoefficientsVars<-NULL
750 |   for (i in 1:IterationsforVarSelections) {
751 |     #print (paste ("LASSO Variable Selection iteration ",i, sep=" "))
752 | 
753 |     repeat {
754 |       cfit<-try(cv.glmnet(as.matrix(X),as.vector(Y),
755 |                           standardize=TRUE, family="multinomial",
756 |                           alpha = alpha_val, grouped = FALSE,
757 |                           type.measure = "mae"),silent=TRUE )
758 | 
759 |       if (class(cfit) == "try-error") {
760 |         X=rbind(X,X)
761 |         Y=c(Y,Y)
762 |       } else {
763 | 
764 |         for (ls in 1:length(coef(cfit, s = "lambda.min"))) {
765 |           CoefficientsVarstita<-as.matrix(coef(cfit, s = "lambda.min")[[ls]])
766 |           CoefficientsVarstita<-CoefficientsVarstita[-1,,drop=FALSE]
767 |           CoefficientsVars<-rbind(CoefficientsVars,CoefficientsVarstita)
768 |         }
769 |         SelVarsPositions<-which(CoefficientsVars != 0)
770 |         SelVarssmall<-CoefficientsVars[which(CoefficientsVars != 0),,drop=FALSE]
771 |         SelVarssmalllista<- unique(rownames(SelVarssmall))
772 |         SelVars<-c(SelVars,SelVarssmalllista)
773 |         SelVars<-unique(SelVars)
774 |         break
775 |       }
776 |     }
777 |     SelecVariablesalliterations<-c(SelecVariablesalliterations,SelVars)
778 |     SelectedVariables<-unique(c(SelecVariablesalliterations))
779 |   }
780 |   res<-list(SelectedVariables=SelectedVariables)
781 |   return(res)
782 | }
783 | ###########################################################
784 | ErrorRateplot<-function (x, ErrorRateType="BER",MetricsType="max.dist",DoF=NULL,Projection=FALSE,Spline=TRUE, TheoreticER=NULL,ConfInt=0.95) {
785 |   # Input:
786 |   # x: List of tables with the error rate results and the number of components per tick (typically the output of ClassificationErrorRate).
787 |   # ErrorRateType: Character indicating which error rate to plot. It can be the error rate "ER",
788 |   #                the balanced error rate "BER", or "Both".
789 |   # MetricsType: Character indicating the metric to be plotted. It can be the maximal
790 |   #              distance "max.dist", the distance to centroids "centroids.dist", the Mahalanobis
791 |   #              distance "mahalanobis.dist", or "All".
792 |   # DoF: Either NULL, so that the spline model uses ticks-1 degrees of freedom,
793 |   #      or a value to build the model with a different number of degrees of freedom. The chosen DoF affects the estimated number of samples required, so it should be set with care.
794 |   # Projection: Logical indicating whether the spline model should be projected down to
795 |   #             the minimum error rate Min_ER (TRUE) or not (FALSE).
796 |   # Spline: Logical indicating whether the spline model should be plotted (TRUE) or not (FALSE).
797 |   # TheoreticER: Numeric value of the target (minimum) ER used to calculate the adequate number of samples.
798 |   # ConfInt: Confidence level used to compute the margin of error and the confidence area of the spline model. It can be 0.90, 0.95 or 0.99.
799 | 
800 |   # Output:
801 |   # A line chart with standard deviation bars. If the plot represents a single error rate type and a single metric,
802 |   # the number of components that achieved the minimum error rate is also displayed.
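# Illustrative call, assuming `cer` holds the result of a ClassificationErrorRate()
# run (the object name is a placeholder): plot the balanced error rate for the
# maximal distance, with the spline fit and a 95% confidence area.
#   ErrorRateplot(cer, ErrorRateType = "BER", MetricsType = "max.dist",
#                 Spline = TRUE, Projection = FALSE, ConfInt = 0.95)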
803 | suppressWarnings({ 804 | Layers<-x$Omics 805 | rownames(Layers)<-NULL 806 | ListofMins<-x$Minimums 807 | ticks<-DoF 808 | 809 | if(is.null(ticks)){ 810 | DegreesOfFreedom<-length(ListofMins)-1 811 | } else { 812 | DegreesOfFreedom<-ticks-1 813 | } 814 | if (MetricsType=="All" || ErrorRateType=="Both"){ 815 | allmelted3<-do.call(rbind.data.frame, x$TablebyTick) 816 | if (MetricsType!="All" ){ 817 | allmelted3<-allmelted3[allmelted3$Metric==MetricsType,] 818 | } 819 | if (ErrorRateType!="Both" ){ 820 | allmelted3<-allmelted3[allmelted3$ErrorRate==ErrorRateType,] 821 | } 822 | } else{ 823 | selector<-paste(MetricsType,ErrorRateType,sep="_") 824 | } 825 | 826 | if (dim(ListofMins[[1]])[1]==1){ 827 | allmelted3<-x$TablebyTick 828 | 829 | } else{ 830 | allmelted3<-x$TablebyTick[[selector]] 831 | } 832 | 833 | if (ConfInt==0.90){ 834 | df<-allmelted3[,c(7:9)] 835 | df<-lapply (df, function (x) gsub("\\[|\\]","",x)) 836 | df2<-as.data.frame(strsplit(df$CI_90, split=",")) 837 | lowers<-as.numeric(as.character(unname(unlist(df2[1,])))) 838 | uppers<-as.numeric(as.character(unname(unlist(df2[2,])))) 839 | } 840 | if (ConfInt==0.95) { 841 | df<-allmelted3[,c(7:9)] 842 | df<-lapply (df, function (x) gsub("\\[|\\]","",x)) 843 | df2<-as.data.frame(strsplit(df$CI_95, split=",")) 844 | lowers<-as.numeric(as.character(unname(unlist(df2[1,])))) 845 | uppers<-as.numeric(as.character(unname(unlist(df2[2,])))) 846 | } 847 | if(ConfInt==0.99){ 848 | df<-allmelted3[,c(7:9)] 849 | df<-lapply (df, function (x) gsub("\\[|\\]","",x)) 850 | df2<-as.data.frame(strsplit(df$CI_99, split=",")) 851 | lowers<-as.numeric(as.character(unname(unlist(df2[1,])))) 852 | uppers<-as.numeric(as.character(unname(unlist(df2[2,])))) 853 | } 854 | 855 | allmelted3$lower<-lowers 856 | allmelted3$upper<-uppers 857 | 858 | allmeltedmodel<-allmelted3 859 | colnames(allmeltedmodel)[colnames(allmeltedmodel)=="SEM"] <- "StdDev" 860 | colnames(allmeltedmodel)[colnames(allmeltedmodel)=="Samples"] <- "variable" 861 | colnames(allmeltedmodel)[colnames(allmeltedmodel)=="ER_value"] <- "value" 862 | colnames(allmeltedmodel)[colnames(allmeltedmodel)=="ER_type"] <- "ErrorRate" 863 | colnames(allmelted3)[colnames(allmelted3)=="SEM"] <- "StdDev" 864 | colnames(allmelted3)[colnames(allmelted3)=="Samples"] <- "variable" 865 | colnames(allmelted3)[colnames(allmelted3)=="ER_value"] <- "value" 866 | colnames(allmelted3)[colnames(allmelted3)=="ER_type"] <- "ErrorRate" 867 | 868 | ro=dim(allmeltedmodel)[1] 869 | 870 | if (allmeltedmodel$value[ro-1]1) { 885 | p<-ggplot(allmelted3, aes(x=variable, y= value, group=Category)) + 886 | geom_line(aes(linetype=ErrorRate, color=Metric), size=1) + 887 | ylim(NA,1.1) + 888 | ylab("Classification Error Rate") + 889 | xlab("Number of Samples") + 890 | theme(axis.title=element_text(face="bold",size="14"), 891 | axis.text.x = element_text(face="bold", size=18), 892 | axis.text.y = element_text(face="bold", size=18), 893 | plot.title = element_text(size = "16", face = "bold") 894 | ) + 895 | ggtitle(paste(length(unique(allmelted2$variable))-1, "segments", sep= " ")) + 896 | scale_linetype_manual(values=c("solid", "twodash"))+ 897 | geom_errorbar(aes(ymin = value - StdDev, 898 | ymax = value + StdDev, color=Metric)) 899 | legend <- g_legend(p) 900 | grid.newpage() 901 | vp1 <- viewport(width = 0.75, height = 1, x = 0.375, y = .5) 902 | vpleg <- viewport(width = 0.25, height = 0.5, x = 0.85, y = 0.75) 903 | subvp <- viewport(width = 0.3, height = 0.3, x = 0.85, y = 0.25) 904 | print(p + theme(legend.position = "none"), vp 
= vp1) 905 | upViewport(0) 906 | pushViewport(vpleg) 907 | grid.draw(legend) 908 | upViewport(0) 909 | pushViewport(subvp) 910 | my_table <- tableGrob(Layers) 911 | grid.draw(my_table) 912 | } else { 913 | if (Projection==TRUE & Spline==FALSE){ 914 | Spline=TRUE 915 | print("Since Projection==TRUE, the Spline will be plotted") 916 | } 917 | if (Projection==FALSE & Spline==FALSE){ 918 | p<-ggplot() + 919 | geom_line(aes(x=as.numeric(as.character(variable)), y=value, group=Category, 920 | linetype=ErrorRate, color=Metric), allmelted3, size=1) + 921 | geom_errorbar(aes(x=as.numeric(as.character(variable)), group=Category, 922 | ymin = value - StdDev, ymax = value + StdDev, color=Metric), allmelted3) + 923 | ylim(NA,1.1) + 924 | ylab("Classification Error Rate") + 925 | xlab("Number of Samples") + 926 | theme(axis.title=element_text(face="bold",size="14"), 927 | axis.text.x = element_text(face="bold", size=18), 928 | axis.text.y = element_text(face="bold", size=18), 929 | plot.title = element_text(size = "16", face = "bold") 930 | ) + 931 | ggtitle(paste(length(unique(allmelted3$variable))-1, "segments", sep= " ")) + 932 | scale_linetype_manual(values=c("solid", "twodash")) 933 | 934 | legend <- g_legend(p) 935 | grid.newpage() 936 | vp1 <- viewport(width = 0.75, height = 1, x = 0.375, y = .5) 937 | vpleg <- viewport(width = 0.25, height = 0.5, x = 0.85, y = 0.75) 938 | subvp <- viewport(width = 0.3, height = 0.3, x = 0.85, y = 0.25) 939 | print(p + theme(legend.position = "none"), vp = vp1) 940 | upViewport(0) 941 | pushViewport(vpleg) 942 | grid.draw(legend) 943 | upViewport(0) 944 | pushViewport(subvp) 945 | my_table <- tableGrob(Layers) 946 | grid.draw(my_table) 947 | } 948 | if (Projection==FALSE & Spline==TRUE){ 949 | MinimumError=min(allmelted3$value) 950 | fm1 <- lm(value ~ bs(as.numeric(as.character(variable)), degree = 1,df = DegreesOfFreedom), data = allmeltedmodel) 951 | DESVESTprediction<-lm(StdDev ~ bs(as.numeric(as.character(variable)), degree = 1,df = DegreesOfFreedom), data = allmeltedmodel) 952 | 953 | ht01 <- seq(min(as.numeric(as.character(allmeltedmodel$variable))),2000, length.out = 2000) 954 | prediction<-as.data.frame(cbind(predict(fm1, data.frame(variable = ht01)),predict(DESVESTprediction, data.frame(variable = ht01)) ) );colnames(prediction)<-c("Prediction","STDpred") 955 | ht01Table<-cbind(ht01,as.data.frame(prediction));colnames(ht01Table)<-c("ht","ErrorPred", "STDpred") 956 | 957 | ht <- seq(min(as.numeric(as.character(allmeltedmodel$variable))),max(as.numeric(as.character(allmeltedmodel$variable))), length.out = 200) 958 | htTable<-cbind(ht,as.data.frame(predict(fm1, data.frame(variable = ht))));colnames(htTable)<-c("ht","ErrorPred") 959 | 960 | NSampleMaxTable<-as.data.frame(ht01Table[ht01Table[,2]>=MinimumError,]) 961 | NSampleMax<-1+round(NSampleMaxTable$ht[dim(NSampleMaxTable)[1]]) 962 | NSampleMaxPosition<-as.numeric(rownames(NSampleMaxTable)[dim(NSampleMaxTable)[1]]) 963 | 964 | NSampleMaxTable$EplusStd<-NSampleMaxTable$ErrorPred - NSampleMaxTable$STDpred 965 | Theplusminustable<-as.data.frame(NSampleMaxTable[NSampleMaxTable[,4]>=MinimumError,]) 966 | Theplusminus<-round(Theplusminustable$ht[dim(Theplusminustable)[1]]) 967 | TheplusminusPosition<-as.numeric(rownames(Theplusminustable)[dim(Theplusminustable)[1]]) 968 | 969 | SamplesEvaluated<-as.numeric(as.character(allmelted3$variable[dim(allmelted3)[1]])) 970 | SamplesRequired<-round(NSampleMaxTable$ht[NSampleMaxPosition]) 971 | MOE<-paste("±",round(NSampleMax-Theplusminus)) 972 | 973 | p<-ggplot() + 974 | 
geom_line(aes(x=as.numeric(as.character(variable)), y=value, group=Category, 975 | linetype=ErrorRate, color=Metric), allmelted3, size=1) + 976 | geom_errorbar(aes(x=as.numeric(as.character(variable)), group=Category, 977 | ymin = lower, ymax = upper, color=Metric), allmelted3) + 978 | geom_smooth(method="loess", span=0.2, aes(x=ht, y=ErrorPred,color="SPline"), data=NSampleMaxTable, size=1.3 ) + 979 | ylim(NA,1.1) + 980 | ylab("Classification Error Rate") + 981 | xlab("Number of Samples") + 982 | theme(axis.title=element_text(face="bold",size="14"), 983 | axis.text.x = element_text(face="bold", size=18), 984 | axis.text.y = element_text(face="bold", size=18), 985 | plot.title = element_text(size = "16", face = "bold") 986 | ) + 987 | ggtitle(paste(length(unique(allmelted3$variable))-1, "segments", sep= " ")) + 988 | scale_linetype_manual(values=c("solid", "twodash")) 989 | 990 | legend <- g_legend(p) 991 | grid.newpage() 992 | vp1 <- viewport(width = 0.75, height = 1, x = 0.375, y = .5) 993 | vpleg <- viewport(width = 0.25, height = 0.5, x = 0.85, y = 0.75) 994 | subvp <- viewport(width = 0.3, height = 0.3, x = 0.85, y = 0.5) 995 | subvp2 <- viewport(width = 0.3, height = 0.3, x = 0.85, y = 0.25) 996 | print(p + theme(legend.position = "none"), vp = vp1) 997 | upViewport(0) 998 | pushViewport(vpleg) 999 | grid.draw(legend) 1000 | upViewport(0) 1001 | pushViewport(subvp) 1002 | rownames(Layers)<-NULL 1003 | my_table <- tableGrob(Layers) 1004 | grid.draw(my_table) 1005 | 1006 | 1007 | TERtable<-rbind(ERtarget=round(MinimumError,digits = 3), 1008 | PSS=SamplesRequired, 1009 | MOE= MOE) 1010 | TERtableGrob <- tableGrob(TERtable) 1011 | upViewport(0) 1012 | pushViewport(subvp2) 1013 | grid.draw(TERtableGrob) 1014 | } 1015 | 1016 | if (Projection==TRUE & Spline==TRUE){ 1017 | fm1 <- lm(value ~ bs(as.numeric(as.character(variable)), degree = 1,df = DegreesOfFreedom), data = allmeltedmodel) 1018 | DESVESTprediction<-lm(StdDev ~ bs(as.numeric(as.character(variable)), degree = 1,df = DegreesOfFreedom), data = allmeltedmodel) 1019 | 1020 | ht01 <- seq(min(as.numeric(as.character(allmeltedmodel$variable))),2000, length.out = 2000) 1021 | prediction<-as.data.frame(cbind(predict(fm1, data.frame(variable = ht01)),predict(DESVESTprediction, data.frame(variable = ht01)) ) );colnames(prediction)<-c("Prediction","STDpred") 1022 | ht01Table<-cbind(ht01,as.data.frame(prediction));colnames(ht01Table)<-c("ht","ErrorPred", "STDpred") 1023 | 1024 | ht <- seq(min(as.numeric(as.character(allmeltedmodel$variable))),max(as.numeric(as.character(allmeltedmodel$variable))), length.out = 200) 1025 | htTable<-cbind(ht,as.data.frame(predict(fm1, data.frame(variable = ht))));colnames(htTable)<-c("ht","ErrorPred") 1026 | 1027 | NSampleMaxTable<-as.data.frame(ht01Table[ht01Table[,2]>=MinimumError,]) 1028 | rownames(NSampleMaxTable)<-seq(1:dim(NSampleMaxTable)[1]) 1029 | NSampleMax<-1+round(NSampleMaxTable$ht[dim(NSampleMaxTable)[1]]) 1030 | NSampleMaxPosition<-as.numeric(rownames(NSampleMaxTable)[dim(NSampleMaxTable)[1]]) 1031 | 1032 | NSampleMaxTable$EplusStd<-NSampleMaxTable$ErrorPred - NSampleMaxTable$STDpred 1033 | Theplusminustable<-as.data.frame(NSampleMaxTable[NSampleMaxTable[,4]>=MinimumError,]) 1034 | Theplusminus<-round(Theplusminustable$ht[dim(Theplusminustable)[1]]) 1035 | TheplusminusPosition<-as.numeric(rownames(Theplusminustable)[dim(Theplusminustable)[1]]) 1036 | 1037 | SamplesEvaluated<-as.numeric(as.character(allmelted3$variable[dim(allmelted3)[1]])) 1038 | 
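# PSS below is the largest sample size on the prediction grid whose spline-predicted
# error rate is still at or above the target (minimum) ER, i.e. the point at which the
# fitted curve reaches the target; the margin of error (MOE) is the distance, in samples,
# between that point and the point where the prediction minus its predicted SD crosses
# the same target.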
SamplesRequired<-round(NSampleMaxTable$ht[NSampleMaxPosition]) 1039 | MOE<-paste("±",round(NSampleMax-Theplusminus)) 1040 | 1041 | p<-ggplot() + 1042 | geom_line(aes(x=as.numeric(as.character(variable)), y=value, group=Category, 1043 | linetype=ErrorRate, color=Metric), allmelted3, size=1) + 1044 | geom_errorbar(aes(x=as.numeric(as.character(variable)), group=Category, 1045 | ymin = lower, ymax = upper, color=Metric), allmelted3) + 1046 | 1047 | 1048 | geom_smooth(method="loess", span=0.2, aes(x=ht, y=ErrorPred,color="SPline"), data=NSampleMaxTable, size=1.3 ) + 1049 | ylim(NA,1.1) + 1050 | ylab("Classification Error Rate") + 1051 | xlab("Number of Samples") + 1052 | theme(axis.title=element_text(face="bold",size="14"), 1053 | axis.text.x = element_text(face="bold", size=18), 1054 | axis.text.y = element_text(face="bold", size=18), 1055 | plot.title = element_text(size = "16", face = "bold") 1056 | ) + 1057 | ggtitle(paste(length(unique(allmelted3$variable))-1, "segments", sep= " ")) + 1058 | scale_linetype_manual(values=c("solid", "twodash")) 1059 | 1060 | legend <- g_legend(p) 1061 | grid.newpage() 1062 | vp1 <- viewport(width = 0.75, height = 1, x = 0.375, y = .5) 1063 | vpleg <- viewport(width = 0.25, height = 0.5, x = 0.85, y = 0.75) 1064 | subvp <- viewport(width = 0.3, height = 0.3, x = 0.85, y = 0.5) 1065 | subvp2 <- viewport(width = 0.3, height = 0.3, x = 0.85, y = 0.25) 1066 | print(p + theme(legend.position = "none"), vp = vp1) 1067 | upViewport(0) 1068 | pushViewport(vpleg) 1069 | grid.draw(legend) 1070 | upViewport(0) 1071 | pushViewport(subvp) 1072 | rownames(Layers)<-NULL 1073 | my_table <- tableGrob(Layers) 1074 | grid.draw(my_table) 1075 | 1076 | TERtable<-rbind(ERtarget=round(MinimumError,digits = 3), 1077 | PSS=SamplesRequired, 1078 | MOE= MOE) 1079 | TERtableGrob <- tableGrob(TERtable) 1080 | upViewport(0) 1081 | pushViewport(subvp2) 1082 | grid.draw(TERtableGrob) 1083 | 1084 | } 1085 | res<-TERtable 1086 | return(res) 1087 | } 1088 | }) 1089 | } 1090 | 1091 | ########################################################### 1092 | Comparative_ERPlot<-function (L, ErrorRateType = "ER", MetricsType ="mahalanobis.dist"){ 1093 | # Input: 1094 | # L: list of tables obtained from "UntilAllmelted" analysis 1095 | 1096 | # Output: 1097 | # A plot of number of samples vs Classification Error Rate in order to compare and evaluate different analyzes 1098 | 1099 | LosRowNames<-Ltransp1<-Ltransp2<-Ltransp3<-Ltransp4<-NULL 1100 | Ltransp5<-Ltransp<-list() 1101 | 1102 | if (length(unique(L[[1]]$TablebyTick$Category))==1) { 1103 | LosRowNames<-Ltransp1<-Ltransp2<-Ltransp3<-Ltransp4<-NULL 1104 | Ltransp5<-Ltransp<-list() 1105 | for (ls in 1:length(L)){ 1106 | Namecito<-names(L[ls]) 1107 | Ltransp1<-data.frame(L[[ls]]$TablebyTick) 1108 | Ltransp1$Omics<-rep(Namecito, dim(Ltransp1)[1]) 1109 | Ltransp[[Namecito]]<-Ltransp1 1110 | } 1111 | 1112 | TableAll <- do.call(rbind,Ltransp) 1113 | rownames(TableAll)<-seq(1:dim(TableAll)[1]) 1114 | 1115 | 1116 | Metrica<-t(data.frame(cbind(unique(TableAll$Metric),unique(TableAll$ErrorRate)))) 1117 | rownames(Metrica)<-NULL 1118 | p<-ggplot() + 1119 | geom_line(aes(x=as.numeric(as.character(variable)), y=value, group=Omics,colour = Omics), data = TableAll, size=1) + 1120 | geom_errorbar(aes(x=as.numeric(as.character(variable)),group=Omics , 1121 | ymin = value - StdDev, ymax = value + StdDev, color=Omics), TableAll) + 1122 | ylab("Classification Error Rate") + 1123 | xlab("Number of Samples") + 1124 | theme(axis.title=element_text(face="bold",size="14"), 
1125 | axis.text.x = element_text(face="bold", size=18), 1126 | axis.text.y = element_text(face="bold", size=18), 1127 | plot.title = element_text(size = "16", face = "bold") 1128 | ) + 1129 | ggtitle(paste("Comparative Classification Plot ")) 1130 | legend <- g_legend(p) 1131 | grid.newpage() 1132 | vp1 <- viewport(width = 0.75, height = 1, x = 0.375, y = .5) 1133 | vpleg <- viewport(width = 0.25, height = 0.5, x = 0.85, y = 0.75) 1134 | subvp <- viewport(width = 0.3, height = 0.3, x = 0.85, y = 0.25) 1135 | print(p + theme(legend.position = "none"), vp = vp1) 1136 | upViewport(0) 1137 | pushViewport(vpleg) 1138 | grid.draw(legend) 1139 | upViewport(0) 1140 | pushViewport(subvp) 1141 | my_table <- tableGrob(Metrica) 1142 | grid.draw(my_table) 1143 | } else { 1144 | LosRowNames<-Ltransp1<-Ltransp2<-Ltransp3<-Ltransp4<-NULL 1145 | Ltransp5<-Ltransp<-list() 1146 | selector<-paste(MetricsType,ErrorRateType, sep="_") 1147 | loscolnames<-colnames(L[[1]]$TablebyTick[[1]]) 1148 | for (ls in 1:length(L)){ 1149 | Namecito<-names(L[ls]) 1150 | Ltransp1<-data.frame(L[[ls]]$TablebyTick [ names(L[[ls]]$TablebyTick)==selector ]) 1151 | colnames(Ltransp1)<-loscolnames 1152 | Ltransp1$Omics<-rep(Namecito, dim(Ltransp1)[1]) 1153 | Ltransp[[Namecito]]<-Ltransp1 1154 | } 1155 | 1156 | TableAll <- do.call(rbind,Ltransp) 1157 | rownames(TableAll)<-seq(1:dim(TableAll)[1]) 1158 | 1159 | Metrica<-t(data.frame(cbind(unique(TableAll$Metric),unique(TableAll$ErrorRate)))) 1160 | rownames(Metrica)<-NULL 1161 | p<-ggplot() + 1162 | geom_line(aes(x=as.numeric(as.character(variable)), y=value, group=Omics,colour = Omics), data = TableAll, size=1) + 1163 | geom_errorbar(aes(x=as.numeric(as.character(variable)),group=Omics , 1164 | ymin = value - StdDev, ymax = value + StdDev, color=Omics), TableAll) + 1165 | ylab("Classification Error Rate") + 1166 | xlab("Number of Samples") + 1167 | theme(axis.title=element_text(face="bold",size="14"), 1168 | axis.text.x = element_text(face="bold", size=18), 1169 | axis.text.y = element_text(face="bold", size=18), 1170 | plot.title = element_text(size = "16", face = "bold") 1171 | ) + 1172 | ggtitle(paste("Comparative Classification Plot ")) 1173 | 1174 | legend <- g_legend(p) 1175 | grid.newpage() 1176 | vp1 <- viewport(width = 0.75, height = 1, x = 0.375, y = .5) 1177 | vpleg <- viewport(width = 0.25, height = 0.5, x = 0.85, y = 0.75) 1178 | subvp <- viewport(width = 0.3, height = 0.3, x = 0.85, y = 0.25) 1179 | print(p + theme(legend.position = "none"), vp = vp1) 1180 | upViewport(0) 1181 | pushViewport(vpleg) 1182 | grid.draw(legend) 1183 | upViewport(0) 1184 | pushViewport(subvp) 1185 | my_table <- tableGrob(Metrica) 1186 | grid.draw(my_table) 1187 | } 1188 | } 1189 | 1190 | ########################################################### 1191 | RequiredtimeTest<-function (Predictors, Response, Comps = 10, Function,crosval = "LOOCV",Ticks = 20,ERIterations = 20,LassoIterations=50,cpus_per_node=1, ...) { 1192 | # Function to calculate the time required for a particular MultiML analysis. 1193 | # Input: 1194 | # Predictors: A list of the different Omics Datasets and the Response matrix. 1195 | # Response: A number indicating the position of the response matrix included in "Predictors" object. 1196 | # Comps: Number of componets to be calculated after each iteration. Just applicable to PLSDA approach. 1197 | # Function: Modular function used to calculate the error rate. It can be PLSDA.MP or Random.Forest.MP to indicate the approach to be used. 
# crosval: Type of cross-validation to be applied: leave-one-out (LOOCV) or ten-fold (TenF).
1199 |   # Ticks: Number of segments (groups) of samples to evaluate.
1200 |   # ERIterations: Number of iterations in which the error rate (ER) will be calculated.
1201 |   # LassoIterations: Number of iterations of the Lasso selection per error rate analysis.
1202 |   # cpus_per_node: Number of CPUs that will be used in the analysis.
1203 |   # ...: Arguments to be passed to methods.
1204 | 
1205 |   # Output:
1206 |   # Summary: A table with the conditions of the analysis, as set by the user.
1207 |   # EstimatedTime: A table with the estimated duration of the process, expressed in different units (seconds, minutes, hours, or days).
1208 | 
1209 |   components=Comps
1210 |   #Mins=NULL
1211 |   print("Calculating the estimated time required for the classification analysis")
1212 |   Omics<-data.frame(Omics=names(Predictors[-Response]))
1213 |   Omics2<-paste(Omics$Omics,collapse="_")
1214 |   TestSumm<-data.frame(Omics=Omics2,CrossValidation=crosval,Ticks = Ticks,ERIterations=ERIterations,LassoIterations=LassoIterations,CPUs=cpus_per_node)
1215 |   rep<-length(Predictors)-1
1216 |   NewList<-Predictors
1217 |   NewList<-NewList[-Response]
1218 |   if (length(NewList)>1){
1219 |     tabe<-data.frame(sapply(NewList,dim))
1220 |     wti<-as.numeric(tabe[2,][apply(tabe[2,],1,which.min)])
1221 | 
1222 |   } else {
1223 |     tabe<-data.frame(sapply(NewList,dim))
1224 |     wti<-as.numeric(tabe[2,])
1225 |   }
1226 |   TPLSDA<-t(data.frame(summary(system.time(capture.output(ProtsPLSDA<-ClassificationErrorRate(Predictors=Predictors,Response=Response,Comps = components,
1227 |                                       Function=Function,
1228 |                                       crosval = crosval,Ticks = NULL,WhichTicks = wti,ERIterations = 1,LassoIterations = 1) )))))
1229 |   TEval<-(2 + TPLSDA[,3]*ERIterations*LassoIterations*Ticks*rep)/cpus_per_node
1230 | 
1231 |   template<-c("seconds","minutes","hours","days")
1232 |   Result<-c(round(TEval, digits=3),round(TEval/60, digits=3),round(TEval/3600, digits=3),
1233 |             round(TEval/86400, digits=3))
1234 |   Result2<-as.data.frame(cbind(Result,template))
1235 | 
1236 |   res<-list(Summary=TestSumm,EstimatedTime=Result2)
1237 | 
1238 |   return(res)
1239 | 
1240 | }
1241 | 
1242 | ###########################################################
1243 | ER_Calculator<-function(Predictors, Response,Previous_CER=NULL,Ticks=5,WhichTicks=NULL,Function=Random.Forest.MP,Comps=10,crosval = "LOOCV",ERIterations=2,LassoIterations=2,TheoreticER=NULL,ErrorRateType="ER") {
1244 | 
1245 |   # Function to evaluate, through the classification error rate, the predictive capability of the MultiPower package.
1246 |   # It wraps "ClassificationErrorRate" and combines its functionality with that of "ER_Adder".
1247 |   # Input:
1248 |   # Predictors: A list of the different omics datasets and the response matrix.
1249 |   # Response: A number indicating the position of the response matrix within the "Predictors" object.
1250 |   # Previous_CER: A previous result of class "ClassificationErrorRate" (CER) to be merged with the one that is going to be calculated.
1251 |   # Ticks: Number of segments (groups) of samples to evaluate. If NULL, the calculation is performed automatically, taking TheoreticER into account.
1252 |   # WhichTicks: Vector of sample sizes (ticks) the user wants to analyze. If NULL, a random selection between the minimum and maximum number of samples is made. This is useful in later rounds of analysis.
1253 |   # Function: Modular function used to calculate the error rate.
It can be PLSDA.MP or Random.Forest.MP to indicate the approach to be used. 1254 | # Comps: Number of componets to be calculated after each iteration. Just applicable to PLSDA approach. 1255 | # crosval: Type of cross validation to be applied, Leave-One-Out (LOOCV) or ten fold change (TenF) 1256 | # ERIterations: Number of iterations in which the error rate (ER) will be calculated. 1257 | # LassoIterations: Number of iterations of the Lasso selection per each error rate analysis. 1258 | 1259 | # Output: 1260 | # TestedTicks: A vector of the number of evaluated number of samples 1261 | # Omics: A vector of the evaluated Omics 1262 | # A list of two lists: 1263 | # Minimums: A list of the minimum value of error rate, balanced (BER) or not (ER) obtained per each ten 1264 | # component analysis. This is measured through three distance metrics to evaluate the 1265 | # classification performance of the model. Maximal distance (max.dist), distance to 1266 | # centroids (centroids) or Mahalanobis distance (Mahalanobis) 1267 | # Thus, each table contains the results per each iteration at different subsets of samples 1268 | 1269 | # CompWinner: A list of the number of components in which the minimum value of error rate, 1270 | # balanced (BER) or not (ER) was obtainde per each iteration. This is measured through 1271 | # the three mentioned distance metrics to evaluate the classification performance 1272 | # of the model. Thus, each table contains the components per each iteration at different 1273 | # subsets of samples 1274 | suppressWarnings({ 1275 | 1276 | if (! is.list(Predictors) ) { 1277 | stop("\\nOmics dataset must be a list with at least two elements") 1278 | } 1279 | ########### 1280 | # Y 1281 | Y<-t(as.matrix(Predictors[[Response]], drop=FALSE)) 1282 | if (is.character(Y[,1])) { 1283 | Ycita<-transform(Y, Type = as.numeric(Type)) 1284 | Y<-Ycita 1285 | rm(Ycita) 1286 | } 1287 | if (ncol(Y) != 1) { 1288 | stop("\\nResponse must be a single variable") 1289 | } 1290 | if (any(is.na(Y))) { 1291 | stop("\\nResponse must not contain missing values") 1292 | } 1293 | if (is.null(colnames(Y))) { 1294 | colnames(Y) = "Y" 1295 | } 1296 | if (is.null(rownames(Y))) { 1297 | rownames(Y) = 1:n 1298 | } 1299 | # Step1: Match the sample size 1300 | LosIndivs<- rownames(Y) 1301 | for (i in 1:length(Predictors)){ 1302 | LosIndivs = intersect(LosIndivs, colnames(Predictors[[i]])) 1303 | } 1304 | 1305 | print(paste("This analysis will be performed with",length(LosIndivs),"samples, since those are the ones repeated in all layers")) 1306 | 1307 | NewList<-Predictors 1308 | 1309 | for (i in 1:length(NewList)) { 1310 | NewList[[i]]<-NewList[[i]][,colnames(NewList[[i]]) %in% LosIndivs] 1311 | } 1312 | 1313 | ########### 1314 | # Step2: Match the order 1315 | LosColnames<-colnames(NewList[[1]]) 1316 | 1317 | for (i in 1:length(NewList)) { 1318 | NewList[[i]]<-NewList[[i]][,sort(LosColnames) ] 1319 | } 1320 | 1321 | TestdeMatchTable<-unique(matrix(permutations(length(NewList)),ncol=2)) 1322 | 1323 | for (i in 1:nrow(TestdeMatchTable)) { 1324 | a=TestdeMatchTable[i,1] 1325 | b=TestdeMatchTable[i,2] 1326 | 1327 | if (all(colnames(NewList[[a]])==colnames(NewList[[b]]))){ 1328 | #print(paste("Columns of lists",a,"and",b,"are equally sorted")) 1329 | } else{ 1330 | #print("The colnames are not equally sorted") 1331 | } 1332 | } 1333 | if (crosval=="LOOCV"){ 1334 | valid="loo" 1335 | } 1336 | if (crosval=="TenF"){ 1337 | valid="Mfold" 1338 | } 1339 | 1340 | # Y 1341 | Y<-t(as.matrix(NewList[[Response]], drop=FALSE)) 
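# A character response is re-coded to numeric below; the grid of sample sizes ("ticks")
# then runs from MinN = 2 x the number of response classes up to MaxN = the total number
# of samples shared by all omics layers.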
1342 | if (is.character(Y[,1])) { 1343 | Ycita<-transform(Y, Type = as.numeric(Type)) 1344 | Y<-Ycita 1345 | rm(Ycita) 1346 | } 1347 | 1348 | ######### 1349 | Omics<-data.frame(Omics=names(NewList[-Response])) 1350 | 1351 | MinN<-round(length(table(as.factor(Y[,1]))) * 2, digits = 0) 1352 | MaxN<-nrow(Y) 1353 | Ngroups<-length(table(as.factor(Y[,1]))) 1354 | 1355 | if(!is.null(Ticks)) { 1356 | vectTicks<-round(seq(from=MinN,to = MaxN,length.out = Ticks),digits = 0) 1357 | setdeTicks = 1 1358 | DegreesOfFreedom<-Ticks-1 1359 | } else { 1360 | if (is.null(WhichTicks)) { 1361 | setdeTicks<-seq(from=5,to = MaxN,by=2) 1362 | } else { 1363 | vectTicks<-WhichTicks 1364 | setdeTicks<-1 1365 | } 1366 | } 1367 | if (length(setdeTicks)==1) { 1368 | tab<- ClassificationErrorRate (Predictors=Predictors, Response=length(Predictors), 1369 | Comps=Comps,crosval = crosval,ERIterations=ERIterations, 1370 | LassoIterations=LassoIterations,Ticks=NULL, 1371 | WhichTicks=vectTicks,Function = Function) 1372 | 1373 | if (is.null(Previous_CER)){ 1374 | 1375 | Previous_CER<-tab 1376 | allmeltedmodel<-Previous_CER$TablebyTick 1377 | 1378 | } else{ 1379 | ttdticks<-sort(c(tab$TestedTicks,Previous_CER$TestedTicks)) 1380 | 1381 | if (length(tab$TablebyTick)==6) { 1382 | tbyticks2<-NULL 1383 | tbyticks<-list() 1384 | for (tod in 1:length(tab$TablebyTick)) { 1385 | namecito<-names(tab$TablebyTick[tod]) 1386 | tbyticks2<-rbind(tab$TablebyTick[[tod]],Previous_CER$TablebyTick[[tod]]) 1387 | tbyticks2<-tbyticks2[order(as.numeric(as.character(tbyticks2$variable))),] 1388 | rownames(tbyticks2)<-seq(1:dim(tbyticks2)[1]) 1389 | tbyticks[namecito]<-list(tbyticks2) 1390 | } 1391 | orden<-as.numeric(as.character(tbyticks[[1]]$variable)) 1392 | orden2<-paste(orden,"samples", sep=" ") 1393 | orden2 1394 | } else { 1395 | tbyticks<-rbind(tab$TablebyTick,Previous_CER$TablebyTick) 1396 | tbyticks<-tbyticks[order(as.numeric(as.character(tbyticks$variable))),] 1397 | rownames(tbyticks)<-seq(1:dim(tbyticks)[1]) 1398 | 1399 | orden<-as.numeric(as.character(tbyticks$variable)) 1400 | orden2<-paste(orden,"samples", sep=" ") 1401 | 1402 | } 1403 | ###### 1404 | om<-as.data.frame(tab$Omics, drop=F) 1405 | Omics<-om 1406 | ###### 1407 | lstdeMins<-append(tab$Minimums,Previous_CER$Minimums) 1408 | lstdeMins<-lstdeMins[match(orden2,names(lstdeMins))] 1409 | 1410 | 1411 | if (!is.null(tab$CompWinner)) { 1412 | lstdewinners<-append(tab$Minimums,Previous_CER$Minimums) 1413 | lstdewinners<-lstdewinners[match(orden2,names(lstdewinners))] 1414 | } else {} 1415 | 1416 | if (is.null(tab$CompWinner)) { 1417 | Previous_CER<-list(TestedTicks=ttdticks,Omics=Omics,Minimums=lstdeMins,TablebyTick=tbyticks) 1418 | } else { 1419 | Previous_CER<-list(TestedTicks=ttdticks,Omics=Omics,Minimums=lstdeMins, 1420 | CompWinner=lstdewinners,TablebyTick=tbyticks) 1421 | } 1422 | 1423 | allmeltedmodel<-Previous_CER$TablebyTick 1424 | 1425 | } 1426 | 1427 | ###### 1428 | } else { 1429 | Rankk<-ranksas<-list() 1430 | for (sdt in 1:length(setdeTicks)){ 1431 | numTicks<-setdeTicks[sdt] 1432 | vectTicks<-round(seq(from=MinN,to = MaxN,length.out = numTicks),digits = 0) 1433 | howmany<-length(vectTicks) 1434 | 1435 | if (is.null(Previous_CER) ){ 1436 | already=0 1437 | } else { 1438 | already<-length(Previous_CER$TestedTicks) 1439 | howmany<-already+2 1440 | vectTicks<-round(seq(from=MinN,to = MaxN,length.out = howmany),digits = 0) 1441 | vectTicks<-c(setdiff(as.vector(vectTicks),as.vector(Previous_CER$TestedTicks) )) 1442 | vectTicks<-sort(sample(vectTicks,size = 
(howmany-already), replace = FALSE )) 1443 | } 1444 | ###### 1445 | tab<- ClassificationErrorRate (Predictors=Predictors, Response=length(Predictors), 1446 | Comps=Comps,crosval = crosval,ERIterations=ERIterations, 1447 | LassoIterations=LassoIterations,Ticks=NULL, 1448 | WhichTicks=vectTicks,Function = Function) 1449 | 1450 | if (is.data.frame(tab$TablebyTick)){ 1451 | colnames(tab$TablebyTick)[colnames(tab$TablebyTick)=="StdDev"] <- "SEM" 1452 | colnames(tab$TablebyTick)[colnames(tab$TablebyTick)=="variable"] <- "Samples" 1453 | colnames(tab$TablebyTick)[colnames(tab$TablebyTick)=="value"] <- "ER_value" 1454 | colnames(tab$TablebyTick)[colnames(tab$TablebyTick)=="ErrorRate"] <- "ER_type" 1455 | } else {} 1456 | 1457 | if (is.null(Previous_CER)){ 1458 | 1459 | Previous_CER<-tab 1460 | allmeltedmodel<-Previous_CER$TablebyTick 1461 | 1462 | } else{ 1463 | ttdticks<-sort(c(tab$TestedTicks,Previous_CER$TestedTicks)) 1464 | 1465 | if (!is.data.frame(tab$TablebyTick)) { 1466 | tbyticks2<-NULL 1467 | tbyticks<-list() 1468 | colnames <- c("Category","Samples","ER_value","SEM","Comp","ER_type","Metric","CI_90","CI_95","CI_99") 1469 | for (i in seq_along(tab$TablebyTick)){ 1470 | colnames(tab$TablebyTick[[i]]) <- colnames 1471 | } 1472 | for (tod in 1:length(tab$TablebyTick)) { 1473 | namecito<-names(tab$TablebyTick[tod]) 1474 | tbyticks2<-rbind(tab$TablebyTick[[tod]],Previous_CER$TablebyTick[[tod]]) 1475 | tbyticks2<-tbyticks2[order(as.numeric(as.character(tbyticks2$Samples))),] 1476 | rownames(tbyticks2)<-seq(1:dim(tbyticks2)[1]) 1477 | tbyticks[namecito]<-list(tbyticks2) 1478 | } 1479 | orden<-as.numeric(as.character(tbyticks[[1]]$Samples)) 1480 | orden2<-paste(orden,"samples", sep=" ") 1481 | orden2 1482 | } else { 1483 | tbyticks<-rbind(tab$TablebyTick,Previous_CER$TablebyTick) 1484 | tbyticks<-tbyticks[order(as.numeric(as.character(tbyticks$Samples))),] 1485 | rownames(tbyticks)<-seq(1:dim(tbyticks)[1]) 1486 | 1487 | orden<-as.numeric(as.character(tbyticks$Samples)) 1488 | orden2<-paste(orden,"samples", sep=" ") 1489 | } 1490 | ###### 1491 | om<-as.data.frame(tab$Omics, drop=F) 1492 | Omics<-om 1493 | ###### 1494 | lstdeMins<-append(tab$Minimums,Previous_CER$Minimums) 1495 | lstdeMins<-lstdeMins[match(orden2,names(lstdeMins))] 1496 | 1497 | if (!is.null(tab$CompWinner)) { 1498 | lstdewinners<-append(tab$Minimums,Previous_CER$Minimums) 1499 | lstdewinners<-lstdewinners[match(orden2,names(lstdewinners))] 1500 | } else {} 1501 | 1502 | ##### 1503 | if (is.null(tab$CompWinner)) { 1504 | Previous_CER<-list(TestedTicks=ttdticks,Omics=Omics,Minimums=lstdeMins,TablebyTick=tbyticks) 1505 | } else { 1506 | Previous_CER<-list(TestedTicks=ttdticks,Omics=Omics,Minimums=lstdeMins, 1507 | CompWinner=lstdewinners,TablebyTick=tbyticks) 1508 | } 1509 | 1510 | allmeltedmodel<-Previous_CER$TablebyTick 1511 | 1512 | 1513 | } # Close of Previous_CER 1514 | 1515 | if (is.null(TheoreticER) ) { 1516 | MinimumError=min(tbyticks$value) 1517 | SamplesEvaluated=SamplesRequired=as.numeric(as.character( 1518 | Previous_CER$TablebyTick$mahalanobis.dist_ER$variable[Previous_CER$TablebyTick$mahalanobis.dist_ER$value==min(Previous_CER$TablebyTick$mahalanobis.dist_ER$value)] )) 1519 | } else{ 1520 | MinimumError=TheoreticER 1521 | } 1522 | 1523 | 1524 | if (!is.data.frame(allmeltedmodel)){ 1525 | allmeltedmodel<-allmeltedmodel$mahalanobis.dist_ER 1526 | ro=dim(allmeltedmodel)[1] 1527 | colnames(allmeltedmodel)[colnames(allmeltedmodel)=="StdDev"] <- "SEM" 1528 | colnames(allmeltedmodel)[colnames(allmeltedmodel)=="variable"] <- 
"Samples" 1529 | colnames(allmeltedmodel)[colnames(allmeltedmodel)=="value"] <- "ER_value" 1530 | colnames(allmeltedmodel)[colnames(allmeltedmodel)=="ErrorRate"] <- "ER_type" 1531 | } else { 1532 | ro=dim(allmeltedmodel)[1] 1533 | } 1534 | 1535 | ##### 1536 | if (allmeltedmodel$ER_value[ro-1]=MinimumError,]) 1552 | NSampleMax<-1+round(NSampleMaxTable$ht[dim(NSampleMaxTable)[1]]) 1553 | NSampleMaxPosition<-as.numeric(rownames(NSampleMaxTable)[dim(NSampleMaxTable)[1]]) 1554 | 1555 | NSampleMaxTable$EplusStd<-NSampleMaxTable$ErrorPred - NSampleMaxTable$STDpred 1556 | Theplusminustable<-as.data.frame(NSampleMaxTable[NSampleMaxTable[,4]>=MinimumError,]) 1557 | Theplusminus<-round(Theplusminustable$ht[dim(Theplusminustable)[1]]) 1558 | TheplusminusPosition<-as.numeric(rownames(Theplusminustable)[dim(Theplusminustable)[1]]) 1559 | 1560 | SamplesEvaluated<-as.numeric(as.character(allmeltedmodel$variable[dim(allmeltedmodel)[1]])) 1561 | SamplesRequired<-round(NSampleMaxTable$ht[NSampleMaxPosition]) 1562 | MOE<-paste("±",round(NSampleMax-Theplusminus)) 1563 | 1564 | TERtable<-rbind(ERtarget=round(MinimumError,digits = 3), 1565 | PSS=SamplesRequired, 1566 | MOE= MOE) 1567 | #################### 1568 | rango<-seq(SamplesRequired-round(NSampleMax-Theplusminus),SamplesRequired+round(NSampleMax-Theplusminus), by=1) 1569 | ranksas<-list(rango) 1570 | Rankk[namecito]<-ranksas 1571 | 1572 | # STOP protocol 1573 | if (length(Rankk)>2) { 1574 | testedfrags<-testedfragsSas<-list() 1575 | for (lngt in 1:length(Rankk)) { 1576 | namecito<-names(Rankk[lngt]) 1577 | if (length(Rankk[[lngt]])<20) { 1578 | testedfragsSas<- list(Rankk[[lngt]]) 1579 | testedfrags[namecito]<-testedfragsSas 1580 | 1581 | } else {} 1582 | } 1583 | if (length(testedfrags)>2){ 1584 | fin<-length(testedfrags) 1585 | intrsct<-intersect(intersect(testedfrags[[fin]],testedfrags[[fin-1]]),testedfrags[[fin-2]]) 1586 | if(length(intrsct)>=3){ 1587 | if (is.data.frame(tbyticks)){ 1588 | colnames(tbyticks)[colnames(tbyticks)=="StdDev"] <- "SEM" 1589 | colnames(tbyticks)[colnames(tbyticks)=="variable"] <- "Samples" 1590 | colnames(tbyticks)[colnames(tbyticks)=="value"] <- "ER_value" 1591 | colnames(tbyticks)[colnames(tbyticks)=="ErrorRate"] <- "ER_type" 1592 | } else { 1593 | colnames <- c("Category","Samples","ER_value","SEM","Comp","ER_type","Metric","CI_90","CI_95","CI_99") 1594 | for (i in seq_along(tbyticks)){ 1595 | colnames(tbyticks[[i]]) <- colnames 1596 | } 1597 | } 1598 | 1599 | if (is.null(tab$CompWinner)) { 1600 | res<-list(TestedTicks=ttdticks,Omics=Omics,Minimums=lstdeMins,Prediction_table=TERtable,TablebyTick=tbyticks) 1601 | } else { 1602 | res<-list(TestedTicks=ttdticks,Omics=Omics,Minimums=lstdeMins,Prediction_table=TERtable, 1603 | CompWinner=lstdewinners,TablebyTick=tbyticks) 1604 | } 1605 | return(res) 1606 | break 1607 | }else{} 1608 | } else {} 1609 | } 1610 | } 1611 | } 1612 | }) 1613 | } 1614 | 1615 | # END 1616 | 1617 | --------------------------------------------------------------------------------