├── NAMESPACE
├── MultiPowerUsersGuide_v2.pdf
├── MultiML
│   ├── MultiMLUsersGuide.pdf
│   ├── templates
│   │   └── submit_sh.txt
│   ├── README.md
│   └── FoMPredictiveFunctions.R
├── DESCRIPTION
├── man
│   ├── CohenFilter.Rd
│   ├── powerPlot.Rd
│   ├── postMultiPower.Rd
│   ├── optimalRep.Rd
│   ├── MultiPower.Rd
│   └── MultiGroupPower.Rd
├── README.md
├── MultiPower_Examples
│   ├── TCGA_Example.R
│   └── STATegra_Example.R
└── R
    └── MultiOmicsPower15.R
/NAMESPACE:
--------------------------------------------------------------------------------
1 | exportPattern("^[[:alpha:]]+")
2 | 
--------------------------------------------------------------------------------
/MultiPowerUsersGuide_v2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ConesaLab/MultiPower/HEAD/MultiPowerUsersGuide_v2.pdf
--------------------------------------------------------------------------------
/MultiML/MultiMLUsersGuide.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ConesaLab/MultiPower/HEAD/MultiML/MultiMLUsersGuide.pdf
--------------------------------------------------------------------------------
/MultiML/templates/submit_sh.txt:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | #SBATCH --array=0-{{{max_node}}}
4 | #SBATCH --job-name={{{jobname}}}
5 | #SBATCH --output=slurm_%a.out
6 | #SBATCH --time={{{time}}}-00:00:00
7 | {{#flags}}
8 | #SBATCH --{{{name}}}
9 | {{/flags}}
10 | {{#options}}
11 | #SBATCH --{{{name}}}={{{value}}}
12 | {{/options}}
13 | module load R
14 | Rscript --vanilla SlurmCreator_Run.R
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: MultiPower
2 | Type: Package
3 | Title: Estimation of sample size in multi-omic experiments
4 | Version: 1.0
5 | Author: Sonia Tarazona; David Gómez-Cabrero
6 | Maintainer: Sonia Tarazona
7 | Description: In multi-omic experiments, MultiPower estimates the sample size needed to achieve a given statistical power per omic and globally. Parameters required to estimate power can either be set by users or estimated from provided pilot data (recommended). MultiPower accepts count data, normally distributed data or binary data.
8 | License: GPL-2
9 | Encoding: UTF-8
10 | LazyData: true
11 | NeedsCompilation: no
12 | Packaged: 2019-09-18 09:04:40 UTC; sotacam
13 | 
--------------------------------------------------------------------------------
/man/CohenFilter.Rd:
--------------------------------------------------------------------------------
1 | \name{CohenFilter}
2 | \alias{CohenFilter}
3 | \title{
4 | Removal of low-effect features.
5 | }
6 | \description{
7 | Removal of omic features with Cohen’s d (or h) below the threshold set by the user.
8 | }
9 | \usage{
10 | CohenFilter(data, d, parameters)
11 | }
12 | %- maybe also 'usage' for other objects documented here.
13 | \arguments{
14 | \item{data}{
15 | Original data given to the MultiPower function.
16 | }
17 | \item{d}{
18 | Cutoff value for Cohen’s d (or h). Either a scalar (in which case the same cutoff is applied to all the omics) or a vector with as many values as omic data types in the data object.
19 | }
20 | \item{parameters}{
21 | List with as many elements as omic data types, as estimated by the MultiPower function. They can be retrieved from the MultiPower output as output$parameters.
22 | }
23 | }
24 | \details{
25 | %% ~~ If necessary, more details than the description above ~~
26 | }
27 | \value{
28 | %% ~Describe the value returned
29 | %% If it is a LIST, use
30 | %% \item{comp1 }{Description of 'comp1'}
31 | %% \item{comp2 }{Description of 'comp2'}
32 | %% ...
33 | }
34 | \references{
35 | %% ~put references to the literature/web site here ~
36 | }
37 | \author{
38 | Sonia Tarazona; David Gómez-Cabrero
39 | }
40 | \note{
41 | %% ~~further notes~~
42 | }
43 | 
44 | %% ~Make other sections like Warning with \section{Warning }{....} ~
45 | 
46 | \seealso{
47 | \code{\link{MultiPower}}
48 | }
49 | \examples{
50 | %%
51 | }
52 | % Add one or more standard keywords, see file 'KEYWORDS' in the
53 | % R documentation directory.
54 | %\keyword{ ~kwd1 }% use one of RShowDoc("KEYWORDS")
55 | %\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line
56 | 
--------------------------------------------------------------------------------
/man/powerPlot.Rd:
--------------------------------------------------------------------------------
1 | \name{powerPlot}
2 | \alias{powerPlot}
3 | %- Also NEED an '\alias' for EACH other topic documented here.
4 | \title{
5 | Power plots.
6 | }
7 | \description{
8 | Plotting statistical power versus sample size per omic from the optimal sample size results,
9 | and statistical power versus Cohen's d cutoffs (in percentiles).
10 | }
11 | \usage{
12 | powerPlot(parameters, optimalSampleSize, omicCol = NULL)
13 | }
14 | %- maybe also 'usage' for other objects documented here.
15 | \arguments{
16 | \item{parameters}{
17 | List with as many elements as omic data types. For each omic, each element of this list is another list containing the different parameters needed to compute power.
18 | See \code{MultiPower} results for more details.
19 | }
20 | \item{optimalSampleSize}{
21 | R object containing the results of \code{optimalRep}.
22 | }
23 | \item{omicCol}{
24 | The color that will be used to plot each omic. It must be a vector with length equal to the number of omics. If it is NULL (default), default colors are used.
25 | }
26 | }
27 | \details{
28 | %% ~~ If necessary, more details than the description above ~~
29 | }
30 | \value{
31 | List containing the values used for the plot or plots.
32 | }
33 | \references{
34 | %% ~put references to the literature/web site here ~
35 | }
36 | \author{
37 | Sonia Tarazona
38 | }
39 | \note{
40 | %% ~~further notes~~
41 | }
42 | 
43 | %% ~Make other sections like Warning with \section{Warning }{....} ~
44 | 
45 | \seealso{
46 | %% ~~objects to See Also as \code{\link{help}}, ~~~
47 | }
48 | \examples{
49 | }
50 | % Add one or more standard keywords, see file 'KEYWORDS' in the
51 | % R documentation directory.
52 | %\keyword{ plot }% use one of RShowDoc("KEYWORDS")
53 | %\keyword{ power }% __ONLY ONE__ keyword per line
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MultiPower and MultiML: statistical power studies for multi-omics experiments
2 | 
3 | ## MultiPower
4 | 
5 | The MultiPower R method performs statistical power studies for multi-omics experiments,
6 | and is designed to assist users in experimental design as well as in the evaluation of already-generated multi-omics datasets.
7 | More details on the method can be found in our manuscript [[1]](#1) and in the
8 | [MultiPower User’s Guide](https://github.com/ConesaLab/MultiPower/blob/master/MultiPowerUsersGuide_v2.pdf).
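
After installing the package (see the instructions below), a typical call provides a list of pilot data matrices (features in rows, samples in columns), a list of group labels, and a vector coding the data type of each omic. The following minimal sketch uses simulated pilot data and illustrative argument values only; see the User’s Guide and the package documentation for real analyses:

```
# Hypothetical pilot data set: 1000 genes (counts) and 500 proteins (normal data),
# 3 samples per experimental group. Data are simulated only to illustrate the call.
library(MultiPower)
set.seed(123)
pilotdata = list(RNAseq     = matrix(rnbinom(1000*6, mu = 100, size = 1), nrow = 1000),
                 Proteomics = matrix(rnorm(500*6), nrow = 500))
pilotdata$RNAseq[1:100, 4:6]    = pilotdata$RNAseq[1:100, 4:6] + 50   # add some signal
pilotdata$Proteomics[1:50, 4:6] = pilotdata$Proteomics[1:50, 4:6] + 1
pilotgroups = list(RNAseq     = rep(c("Group1", "Group2"), each = 3),
                   Proteomics = rep(c("Group1", "Group2"), each = 3))

mypower = MultiPower(data = pilotdata, groups = pilotgroups, type = c(1, 2),
                     omicPower = 0.6, averagePower = 0.85, fdr = 0.05,
                     equalSize = TRUE, max.size = 200)
mypower$summary                 # summary table per omic
mypower$optimalSampleSize$n     # estimated optimal sample size
```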
9 | 
10 | MultiPower is available as an R package, and can be installed as follows:
11 | 
12 | ```
13 | install.packages("devtools")
14 | devtools::install_github("ConesaLab/MultiPower")
15 | ```
16 | 
17 | ### Installing MultiPower dependencies
18 | 
19 | Some dependencies are required before running MultiPower. They can be installed from R via
20 | `install.packages()` and loaded with `library()`:
21 | 
22 | - FDRsampsize
23 | - lpSolve
24 | 
25 | 
26 | 
27 | 
28 | ## MultiML
29 | 
30 | The MultiML method is included as a complementary tool to MultiPower,
31 | and is designed to help users determine the optimal sample size required to control for
32 | classification error rates when using one or more omics datasets.
33 | Details on the MultiML algorithm and its applications can be found in our manuscript [[1]](#1).
34 | 
35 | If you are interested in using MultiML for your research, please see the [MultiML folder](https://github.com/ConesaLab/MultiPower/tree/master/MultiML)
36 | for scripts and instructions. For detailed information on how to run the tool, please read the
37 | [MultiML User’s Guide](https://github.com/ConesaLab/MultiPower/blob/master/MultiML/MultiMLUsersGuide.pdf).
38 | 
39 | 
40 | 
41 | ## Citation
42 | 
43 | If you are using MultiPower or MultiML in your research, please cite the following publication:
44 | 
45 | [1]
46 | Tarazona, S., Balzano-Nogueira, L., Gómez-Cabrero, D. et al.
47 | Harmonization of quality metrics and power calculation in multi-omic studies.
48 | Nat Commun 11, 3092 (2020). https://doi.org/10.1038/s41467-020-16937-8
49 | 
50 | 
--------------------------------------------------------------------------------
/man/postMultiPower.Rd:
--------------------------------------------------------------------------------
1 | \name{postMultiPower}
2 | \alias{postMultiPower}
3 | \title{
4 | Power study for different sample sizes.
5 | }
6 | \description{
7 | When the optimal sample size estimated by MultiPower exceeds the available budget, an alternative solution is to decrease the sample size at the cost of removing features
8 | with low effect size (magnitude of change). This function answers the question of which effect size users must restrict themselves to for a given maximum sample size.
9 | Pilot multi-omic data sets are needed to perform this analysis.
10 | }
11 | \usage{
12 | postMultiPower(optResults, max.size = 5, omicCol = NULL)
13 | }
14 | %- maybe also 'usage' for other objects documented here.
15 | \arguments{
16 | \item{optResults}{
17 | R object containing the results of the \code{MultiPower} function.
18 | }
19 | \item{max.size}{
20 | Maximum sample size allowed by the user. It will be used to determine the effect size that can be detected (by default, 5).
21 | }
22 | \item{omicCol}{
23 | The color that will be used to plot each omic. It must be a vector with length equal to the number of omics. If it is NULL (default), default colors are used.
24 | } 25 | } 26 | \details{ 27 | %% ~~ If necessary, more details than the description above ~~ 28 | } 29 | \value{ 30 | This function returns a list with the following elements: 31 | \item{SampleSize }{Matrix containing the optimal sample size for each omic data type (in columns) and for different cutoffs of Cohen’s d (in rows).} 32 | \item{Power }{Matrix containing the statistical power at the optimal sample size for each omic data type (in columns) and for different cutoffs of Cohen’s d (in rows).} 33 | \item{NumFeat }{Matrix containing the number of remaining features for each omic data type (in columns) and for different cutoffs of Cohen’s d (in rows).} 34 | \item{d }{Values of Cohen’s d used as cutoffs to remove low effect size features.} 35 | In addition, it also returns two plots that summarize these results: first, the number of replicates (sample size) for each of the tested Cohen's d values; 36 | second, the statistical power for each omic at the previously obtained optimal sample size and at the current sample size set by the user. 37 | } 38 | \references{ 39 | %% ~put references to the literature/web site here ~ 40 | } 41 | \author{ 42 | Sonia Tarazona 43 | } 44 | \note{ 45 | %% ~~further notes~~ 46 | } 47 | 48 | %% ~Make other sections like Warning with \section{Warning }{....} ~ 49 | 50 | \seealso{ 51 | %% ~~objects to See Also as \code{\link{help}}, ~~~ 52 | } 53 | \examples{ 54 | } 55 | % Add one or more standard keywords, see file 'KEYWORDS' in the 56 | % R documentation directory. 57 | %\keyword{ ~kwd1 }% use one of RShowDoc("KEYWORDS") 58 | %\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line 59 | -------------------------------------------------------------------------------- /man/optimalRep.Rd: -------------------------------------------------------------------------------- 1 | \name{optimalRep} 2 | \alias{optimalRep} 3 | \title{ 4 | Estimation of optimal sample size. 5 | } 6 | \description{ 7 | Estimation of the optimal sample size when pilot multi-omic data sets are not available. 8 | } 9 | \usage{ 10 | optimalRep(parameters, omicPower = 0.6, averagePower = 0.85, fdr = 0.05, cost = 1, equalSize = TRUE, max.size = 200) 11 | } 12 | %- maybe also 'usage' for other objects documented here. 13 | \arguments{ 14 | \item{parameters}{ 15 | List with as many elements as omic data types. For each omic, each element of this list is another list containing the different parameters needed to compute power which, in this case, must be set by the user. See \code{MultiPower} for more details. 16 | } 17 | \item{omicPower}{ 18 | The minimum power that must be achieved for each omic. It must be a vector with length equal to the number of omics. If it is a single number, this same number will be used for all the omics. By default, omicPower = 0.6. 19 | } 20 | \item{averagePower}{ 21 | The minimum average power that must be globally achieved. By default, averagePower = 0.85. 22 | } 23 | \item{fdr}{ 24 | False Discovery Rate level to be used. It is the significance level after multiple testing correction. By default, fdr = 0.05.} 25 | \item{cost}{ 26 | The cost to generate a replicate (a sample) for each omic. It must be a vector with length equal to the number of omics. If it is a single number, this same number will be used for all the omics. This argument will only be used when a different sample size per omic is allowed. By default, cost = 1 (which means that all the omics will be assumed to have the same cost). 
27 | } 28 | \item{equalSize}{ 29 | If TRUE (default), the same optimal sample size will be estimated for all the omics. If FALSE, omics are allowed to have different sample sizes. 30 | } 31 | \item{max.size}{ 32 | Maximum allowed sample size. By default, max.size = 30. 33 | } 34 | } 35 | \details{ 36 | %% ~~ If necessary, more details than the description above ~~ 37 | } 38 | \value{ 39 | %% ~Describe the value returned 40 | %% If it is a LIST, use 41 | %% \item{comp1 }{Description of 'comp1'} 42 | %% \item{comp2 }{Description of 'comp2'} 43 | %% ... 44 | } 45 | \references{ 46 | %% ~put references to the literature/web site here ~ 47 | } 48 | \author{ 49 | Sonia Tarazona; David Gómez-Cabrero 50 | } 51 | \note{ 52 | %% ~~further notes~~ 53 | } 54 | 55 | %% ~Make other sections like Warning with \section{Warning }{....} ~ 56 | 57 | \seealso{ 58 | \code{\link{MultiPower}} 59 | } 60 | \examples{ 61 | optimalSS = optimalRep(parameters = myparam, omicPower = 0.6, 62 | averagePower = 0.8, fdr = 0.05, 63 | cost = 1, equalSize = TRUE, max.size = 30) 64 | optimalSS$n # optimal sample size 65 | } 66 | % Add one or more standard keywords, see file 'KEYWORDS' in the 67 | % R documentation directory. 68 | %\keyword{ ~kwd1 }% use one of RShowDoc("KEYWORDS") 69 | %\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line 70 | -------------------------------------------------------------------------------- /man/MultiPower.Rd: -------------------------------------------------------------------------------- 1 | \name{MultiPower} 2 | \alias{MultiPower} 3 | \title{ 4 | Optimal sample size estimation and power study. 5 | } 6 | \description{ 7 | MultiPower computes the optimal sample size for a multi-omic experiment when pilot multi-omic data sets are available for estimating the parameters required to compute power. An optimization problem is solved in order to achieve the desired power while minimizing the cost of the experiment. 8 | } 9 | \usage{ 10 | MultiPower(data, groups, type, omicPower = 0.6, averagePower = 0.85, null.effect = 0, 11 | fdr = 0.05, cost = 1, equalSize = TRUE, max.size = 200, omicCol = NULL, powerPlots = TRUE) 12 | } 13 | %- maybe also 'usage' for other objects documented here. 14 | \arguments{ 15 | \item{data}{ 16 | List with as many elements as omic data types. The names of the omics should be the names of the list. Each element in this list must be a raw count data matrix, and in this case MultiPower will take into account the library sizes to estimate power; a normally distributed data matrix which must have been already pre-processed and normalized; or a binary data matrix (with 0/1 or TRUE/FALSE values). In any case, for each one of these matrices, rows must correspond to omic features (genes, methylation sites, ChIP-seq peaks, etc.) and columns to observations (biological samples, patients, etc.). 17 | } 18 | \item{groups}{ 19 | List with as many elements as omic data types. The names of the omics should be the names of the list. Each element in this list must be a vector with length equal to the number of observations for that omic in data argument. Each element of this vector must indicate the experimental group where each observation belong. Only two experimental groups are allowed. 20 | } 21 | \item{type}{ 22 | Vector with length equal to the number of omic data types. Each element of this vector must be a 1, 2 or 3 to indicate whether the omic data are count data (1), continuous data approximately following a normal distribution (2) or binary data (3). 
23 | } 24 | \item{null.effect}{ 25 | Value of the effect size that corresponds to null hypothesis. By default, 0. 26 | } 27 | \item{omicPower}{ 28 | The minimum power that must be achieved for each omic. It must be a vector with length equal to the number of omics. If it is a single number, this same number will be used for all the omics. By default, omicPower = 0.6. 29 | } 30 | \item{averagePower}{ 31 | The minimum average power that must be globally achieved. By default, averagePower = 0.85. 32 | } 33 | \item{fdr}{ 34 | False Discovery Rate level to be used. It is the significance level after multiple testing correction. By default, fdr = 0.05. If no multiple testing correction is to be applied, this argument must be set to NULL and then alpha argument is required. 35 | } 36 | \item{cost}{ 37 | The cost to generate a replicate (a sample) for each omic. It must be a vector with length equal to the number of omics. If it is a single number, this same number will be used for all the omics. This argument will only be used when a different sample size per omic is allowed. By default, cost = 1 (which means that all the omics will be assumed to have the same cost). 38 | } 39 | \item{equalSize}{ 40 | If TRUE (default), the same optimal sample size will be estimated for all the omics. If FALSE, omics are allowed to have different sample sizes. 41 | } 42 | \item{max.size}{ 43 | Maximum allowed sample size. By default, max.size = 200. 44 | } 45 | \item{omicCol}{ 46 | The color that will be used to plot each omic. It must be a vector with length equal to the number of omics. If it is NULL (default), default colors are used. 47 | } 48 | \item{powerPlots}{ 49 | If TRUE (default), power plots will be generated. 50 | } 51 | } 52 | \details{ 53 | %% ~~ If necessary, more details than the description above ~~ 54 | } 55 | \value{ 56 | When applying \code{MultiPower}, the result is a list containing the following elements: 57 | \item{parameters }{List with as many elements as omic data types. For each omic, each element of the list is another list containing the different parameters used 58 | to compute power, either estimated from the pilot data or provided by the user: type, pooledSD, d, delta, logFC, mu, m, etc.} 59 | \item{optimalSampleSize }{List containing the following elements: n0 (sample size to achieve the minimum omic power, omicPower, for each omic), n (optimal sample size), 60 | finalPower (power at the optimal sample size for each omic), fdr (see fdr argument), omicPower (see omicPower argument), 61 | averagePower (see averagePower argument), and cost (see cost argument).} 62 | \item{summary }{Table summarizing MultiPower results. 
The columns are: the names of the omic data sets (omic), the omic data type (type), 63 | the number of omic features for each omic (numFeat), the minimum and maximum observed Cohen’s d (minCohenD and maxCohenD), the FDR value (FDR), 64 | the minimum power to be achieve for each omic (minPower), the average power to be achieved in the multi-omic experiment (averPower), 65 | the cost per omic (cost), the minimum sample size needed for each omic to achieve minPower (minSampleSize), 66 | the optimal sample size (optSampleSize), and the power at this optimal sample size (power).} 67 | \item{data2plot}{Data generated to create the power plots that are also returned by the function.} 68 | } 69 | \references{ 70 | %% ~put references to the literature/web site here ~ 71 | } 72 | \author{ 73 | Sonia Tarazona; David Gómez-Cabrero 74 | } 75 | \note{ 76 | %% ~~further notes~~ 77 | } 78 | 79 | %% ~Make other sections like Warning with \section{Warning }{....} ~ 80 | 81 | \seealso{ 82 | %% ~~objects to See Also as \code{\link{help}}, ~~~ 83 | } 84 | \examples{ 85 | } 86 | % Add one or more standard keywords, see file 'KEYWORDS' in the 87 | % R documentation directory. 88 | %\keyword{ ~kwd1 }% use one of RShowDoc("KEYWORDS") 89 | %\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line 90 | -------------------------------------------------------------------------------- /man/MultiGroupPower.Rd: -------------------------------------------------------------------------------- 1 | \name{MultiGroupPower} 2 | \alias{MultiGroupPower} 3 | \title{ 4 | Optimal sample size estimation for multiple group comparisons. 5 | } 6 | \description{ 7 | MultiGroupPower estimates the optimal sample size for a multi-omic experiment when pilot multi-omic data sets are available to estimate the parameters required to compute power and there are multiple groups to be compared. 8 | } 9 | \usage{ 10 | MultiGroupPower(data, groups, type, comparisons = NULL, omicPower = 0.6, averagePower = 0.85, 11 | fdr = 0.05, cost = 1, equalSize = TRUE, max.size = 200, omicCol = NULL, 12 | powerPlots = FALSE, summaryPlot = TRUE) 13 | } 14 | %- maybe also 'usage' for other objects documented here. 15 | \arguments{ 16 | \item{data}{ 17 | List with as many elements as omic data types. The names of the omics should be the names of the list. Each element in this list must be a raw count data matrix, and in this case MultiGroupPower will take into account the library sizes to estimate power; a normally distributed data matrix which must have been already pre-processed and normalized; or a binary data matrix (with 0/1 or TRUE/FALSE values). In any case, for each one of these matrices, rows must correspond to omic features (genes, methylation sites, ChIP-seq peaks, etc.) and columns to observations (biological samples, patients, etc.). 18 | } 19 | \item{groups}{ 20 | List with as many elements as omic data types. The names of the omics should be the names of the list. Each element in this list must be a vector with length equal to the number of observations for that omic in data argument. Each element of this vector must indicate the experimental group where each observation belong. 21 | } 22 | \item{type}{ 23 | Vector with length equal to the number of omic data types. Each element of this vector must be a 1, 2 or 3 to indicate whether the omic data are count data (1), continuous data approximately following a normal distribution (2) or binary data (3). 24 | } 25 | \item{comparisons}{ 26 | Pairwise comparisons to be done between groups. 
If NULL (default option), the function will generate all the possible comparisons between the groups that are available for all omics. If users wish to indicate the comparisons to be done, they must provide a matrix with two rows and as many columns as comparisons. Each column will be then a two-element vector with the two groups to be compared. An easy way to generate this matrix is using the combn() function that returns a matrix with all the possible comparisons. Users can then remove the columns of the comparisons that are not interesting for them. 27 | } 28 | \item{omicPower}{ 29 | The minimum power that must be achieved for each omic. It must be a vector with length equal to the number of omics. If it is a single number, this same number will be used for all the omics. By default, omicPower = 0.6. 30 | } 31 | \item{averagePower}{ 32 | The minimum average power that must be globally achieved. By default, averagePower = 0.85. 33 | } 34 | \item{fdr}{ 35 | False Discovery Rate level to be used. It is the significance level after multiple testing correction. By default, fdr = 0.05. 36 | } 37 | 38 | \item{cost}{ 39 | The cost to generate a replicate (a sample) for each omic. It must be a vector with length equal to the number of omics. If it is a single number, this same number will be used for all the omics. This argument will only be used when a different sample size per omic is allowed. By default, cost = 1 (which means that all the omics will be assumed to have the same cost). 40 | } 41 | \item{equalSize}{ 42 | If TRUE (default), the same optimal sample size will be estimated for all the omics. If FALSE, omics are allowed to have different sample sizes. 43 | } 44 | \item{max.size}{ 45 | Maximum allowed sample size. By default, max.size = 200. 46 | } 47 | \item{omicCol}{ 48 | The color that will be used to plot each omic. It must be a vector with length equal to the number of omics. If it is NULL (default), default colors are used. 49 | } 50 | \item{powerPlots}{ 51 | If TRUE (FALSE is the default), power plots will be generated for each individual comparison as in \code{MultiPower} function. 52 | } 53 | \item{summaryPlot}{ 54 | If TRUE (default), summary plots for sample size and power will be generated including the results for all comparisons and the global result, that is, the maximum sample size for all comparisons ("optimal" sample size) and the corresponding statistical power for each omic. 55 | } 56 | } 57 | \details{ 58 | %% ~~ If necessary, more details than the description above ~~ 59 | } 60 | \value{ 61 | When applying \code{MultiGroupPower}, the result is a list containing as many elements as omic data types and an 62 | additional element containing the global summary (GlobalSummary) of the results. The elements corresponding to omic data types are lists with the following elements: 63 | \item{parameters }{List with as many elements as omic data types. 
For each omic, each element of the list is another list containing the different parameters 64 | used to compute power, estimated from the pilot data.} 65 | \item{optimalSampleSize }{List containing the following elements: n0 (sample size to achieve the minimum omic power, omicPower, for each omic), 66 | n (optimal sample size), finalPower (power at the optimal sample size for each omic), fdr (see fdr argument), 67 | omicPower (see omicPower argument), averagePower (see averagePower argument), and cost (see cost argument).} 68 | \item{summary}{Table summarizing MultiPower results.} 69 | \item{data2plot}{Data generated to create the power plots that are also returned by the function.} 70 | The GlobalSummary element is a summary table very similar to the summary table described above. } 71 | \references{ 72 | %% ~put references to the literature/web site here ~ 73 | } 74 | \author{ 75 | Sonia Tarazona; David Gomez-Cabrero 76 | } 77 | \note{ 78 | %% ~~further notes~~ 79 | } 80 | 81 | %% ~Make other sections like Warning with \section{Warning }{....} ~ 82 | 83 | \seealso{ 84 | %% ~~objects to See Also as \code{\link{help}}, ~~~ 85 | } 86 | \examples{ 87 | } 88 | % Add one or more standard keywords, see file 'KEYWORDS' in the 89 | % R documentation directory. 90 | %\keyword{ ~kwd1 }% use one of RShowDoc("KEYWORDS") 91 | %\keyword{ ~kwd2 }% __ONLY ONE__ keyword per line 92 | -------------------------------------------------------------------------------- /MultiPower_Examples/TCGA_Example.R: -------------------------------------------------------------------------------- 1 | ############################################################################ 2 | ######## Power study for TCGA-GBM ######## 3 | ######## Nature Communications ######## 4 | ############################################################################ 5 | 6 | ## By Sonia Tarazona 7 | ## 14-Dic-2017 8 | ## Last modified: 28-nov-2018 9 | 10 | 11 | options(stringsAsFactors = FALSE) 12 | 13 | # Loading MultiPower functions 14 | source("~/Dropbox/STATegra_STC/FiguresOfMerit/MultiPower/R/MultiOmicsPower10.R") 15 | 16 | 17 | 18 | myfigdir = "~/Dropbox/STATegra_STC/FiguresOfMerit/powerResults/" 19 | 20 | miscolores = c("red2", "orchid4", "dodgerblue3", "darkolivegreen4") 21 | omicas = c("expression", "methylation", "miRNAs", "proteomics") 22 | names(miscolores) = omicas 23 | 24 | 25 | 26 | # Required data ----------------------------------------------------------- 27 | 28 | setwd("~/Dropbox/STATegra_STC/FiguresOfMerit/powerResults/") 29 | load("dataTCGA.RData", verbose = TRUE) 30 | # tcgadata 31 | # tcgadesign 32 | # DEresults 33 | 34 | 35 | # Figures for estimated parameters per omic ------------------------------------- 36 | 37 | ## Number of features, %DE features 38 | 39 | par(mar = c(1,7,2,2), mfrow = c(1,2)) 40 | bb = barplot(sapply(tcgadata, nrow)/1000, las = 1, 41 | col = miscolores, horiz = TRUE, beside = TRUE, log = "x", 42 | border = miscolores, main = "# features", axes = FALSE) 43 | text(200, bb, sapply(tcgadata, nrow), adj = 1, 44 | col = c(1, "white", rep(1,3)), las = 2) 45 | 46 | par(mar = c(1,2,2,2)) 47 | 48 | bb = barplot(round(100*sapply(DEresults, function (x) sum(x[,"adj.P.Val"] < 0.05))/sapply(tcgadata, nrow),2), 49 | las = 1, col = miscolores, horiz = TRUE, names.arg = "", 50 | border = miscolores, main = "% DE features", axes = FALSE) 51 | text(10, bb, paste0(round(100*sapply(DEresults, function (x) sum(x[,"adj.P.Val"] < 0.05))/sapply(tcgadata, nrow),0), "%"), 52 | adj = 1, col = rep("white",5), las = 2) 53 | 54 
| 55 | 56 | 57 | 58 | # Global MultiPower results ----------------------------------------------- 59 | 60 | mytype = rep(2,4) 61 | p1omics = c(0.64, 0.14, 0.56, 0.44) 62 | 63 | tcgadesign = lapply(tcgadesign, function (x) x$type) 64 | 65 | ## d0 = 0.8 66 | par(mfrow = c(1,2)) 67 | tcgaResultsEQ = MultiPower(data = tcgadata, groups = tcgadesign, type = mytype, d0 = 0.8, p1 = p1omics, 68 | omicPower = 0.6, averagePower = 0.8, fdr = 0.05, cost = 1, equalSize = TRUE, 69 | max.size = 300, omicCol = miscolores) 70 | tcgaResultsEQ$summary[,c(2:4,6:9,14:16)] 71 | 72 | 73 | save(tcgaResultsEQ, tcgaResultsNE, file = "TCGAresultsMultiPower5.RData") 74 | 75 | 76 | 77 | 78 | 79 | # Post-analysis ----------------------------------------------------------- 80 | 81 | TCGApostEQ = postMultiPower(data = tcgadata, groups = tcgadesign, optResults = tcgaResultsNE, 82 | max.size = 20, omicCol = miscolores) 83 | 84 | 85 | 86 | # Generating figures for the paper ---------------------------------------- 87 | 88 | setwd("~/Dropbox/STATegra_STC/FiguresOfMerit/powerResults") 89 | load("TCGAresultsMultiPower5.RData", verbose = TRUE) 90 | 91 | 92 | ## Supplementary Figure 93 | par(mfrow = c(1,2)) 94 | powerPlot(parameters = tcgaResultsEQ$parameters, optimalSampleSize = tcgaResultsEQ$optimalSampleSize, 95 | omicCol = miscolores) 96 | 97 | 98 | 99 | 100 | 101 | # Filtering low variability features in methylation ----------------------- 102 | 103 | table(tcgadesign$methylation) 104 | dim(tcgadata$methylation) 105 | 106 | head(DEresults$methylation) 107 | plot(density(abs(DEresults$methylation$logFC))) 108 | 109 | min(abs(DEresults$methylation$logFC)) 110 | min(abs(DEresults$methylation$logFC[DEresults$methylation$adj.P.Val < 0.05])) 111 | 112 | sum(abs(DEresults$methylation$logFC) < 0.05) # 77020 (20%) 113 | dim(DEresults$methylation) 114 | # 384349 115 | sum(DEresults$methylation$adj.P.Val < 0.05) 116 | # 53064 117 | 118 | ## New proportion of DE methylation 119 | 53064 / (384349 - 77020) # = 17% 120 | 121 | tcgadata2 = tcgadata 122 | tcgadata2$methylation = tcgadata2$methylation[abs(DEresults$methylation$logFC) >= 0.05,] 123 | 124 | sapply(tcgadata2, dim) 125 | 126 | 127 | 128 | 129 | # Global MultiPower results AFTER filtering ----------------------------------------------- 130 | 131 | p1omics = c(0.64, 0.17, 0.56, 0.44) 132 | 133 | ## d0 = 0.8 134 | par(mfrow = c(1,2)) 135 | tcgaResultsEQ2 = MultiPower(data = tcgadata2, groups = tcgadesign, type = mytype, d0 = 0.8, p1 = p1omics, 136 | omicPower = 0.6, averagePower = 0.8, fdr = 0.05, cost = 1, equalSize = TRUE, 137 | max.size = 300, omicCol = miscolores) 138 | tcgaResultsEQ2$summary[,c(2:4,6:9,14:16)] 139 | 140 | 141 | save(tcgaResultsEQ, tcgaResultsNE, tcgaResultsEQ2, file = "TCGAresultsMultiPower5.RData") 142 | 143 | 144 | 145 | 146 | 147 | # Post-analysis ----------------------------------------------------------- 148 | load("dataTCGA.RData", verbose = TRUE) 149 | load("TCGAresultsMultiPower5.RData", verbose = TRUE) 150 | 151 | TCGApostEQ = postMultiPower(data = tcgadata, groups = tcgadesign, optResults = tcgaResultsEQ, 152 | max.size = 30, omicCol = miscolores) 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | # Answers to reviewers --------------------------- 161 | 162 | p1omics 163 | # 0.64 0.17 0.56 0.44 164 | 165 | 166 | ## Selecting n samples and computing DE --> do this R=5 times 167 | 168 | myN = c(22, 20, 15, 10, 5) 169 | resum = NULL 170 | 171 | for (r in 1:5) { 172 | 173 | dades = tcgadata 174 | disseny = tcgadesign 175 | 176 | print(paste0("r = ", r)) 177 
| 178 | for (n in myN) { 179 | print(paste0("n = ", n)) 180 | 181 | for (i in 1:length(tcgadata)) { 182 | pro = which(disseny[[i]] == "Proneural") 183 | mes = which(disseny[[i]] == "Mesenchymal") 184 | selec = c(sample(pro, n), sample(mes, n)) 185 | dades[[i]] = dades[[i]][,selec] 186 | disseny[[i]] = disseny[[i]][selec] 187 | } 188 | print(sapply(dades, ncol)) 189 | 190 | DEres = lapply(1:4, function (i) apply(dades[[i]], 1, function (x) t.test(x ~ disseny[[i]])$p.value)) 191 | DEresFDR = lapply(DEres, p.adjust, method = "fdr") 192 | tmp = sapply(DEresFDR, function (x) round(sum(x < 0.05)/length(x),2)) 193 | resum = rbind(resum, c(r, n, tmp)) 194 | } 195 | 196 | } 197 | 198 | colnames(resum) = c("r", "n", names(tcgadata)) 199 | 200 | 201 | head(resum) 202 | resum[resum[,"n"] == 22,] 203 | resum[resum[,"n"] == 20,] 204 | resum[resum[,"n"] == 15,] 205 | resum[resum[,"n"] == 10,] 206 | resum[resum[,"n"] == 5,] 207 | 208 | mediana = aggregate(resum[,3:6], by = list("n" = resum[,"n"]), median) 209 | mediana 210 | 211 | mitjana = aggregate(resum[,3:6], by = list("n" = resum[,"n"]), mean) 212 | 213 | minim = aggregate(resum[,3:6], by = list("n" = resum[,"n"]), min) 214 | minim 215 | 216 | maxim = aggregate(resum[,3:6], by = list("n" = resum[,"n"]), max) 217 | maxim 218 | 219 | png(filename = "validationTCGA.png", pointsize = 22) 220 | matplot(mitjana[,"n"], mitjana[,-1]*100, type = "o", col = miscolores, pch = 16, lwd = 4, ylim = c(0,30), 221 | xlab = "Sample size", ylab = "Mean % DE features", main = "TCGA glioblastoma data") 222 | legend("topleft", omicas, col = miscolores, pch = 16, bty = "n") 223 | dev.off() 224 | 225 | -------------------------------------------------------------------------------- /MultiPower_Examples/STATegra_Example.R: -------------------------------------------------------------------------------- 1 | ############################################################################ 2 | ######## Power study for STATegra data types ######## 3 | ######## Nature Communications ######## 4 | ############################################################################ 5 | 6 | ## By Sonia Tarazona 7 | ## 05-Oct-2017 8 | ## Last modified: 28-nov-2018 9 | 10 | 11 | options(stringsAsFactors = FALSE) 12 | 13 | # Loading MultiPower functions 14 | source("~/Dropbox/STATegra_STC/FiguresOfMerit/MultiPower/R/MultiOmicsPower10.R") 15 | 16 | 17 | # Set up 18 | myfigdir = "~/Dropbox/STATegra_STC/FiguresOfMerit/powerResults/" 19 | 20 | miscolores = c("red2", "darkslategray3", "azure4", "sienna2", 21 | "dodgerblue4", "darkolivegreen4") 22 | omicas = c("RNA-seq", "miRNA-seq", "ChIP-seq", "DNase-seq", "Metabolomics", "Proteomics") 23 | names(miscolores) = omicas 24 | 25 | 26 | 27 | 28 | # Required data ----------------------------------------------------------- 29 | statdata = statdesign = vector("list") 30 | 31 | ## RNA-seq (LOG2-transformed) 32 | statdata$RNAseq = read.delim("~/Dropbox/STATegra_STC/STATegraData/RNAseq/STATegra.RNAseq.CQN.Combat.Annotated.positive_2014_09.csv", 33 | row.names = 1, as.is = TRUE, sep = ",")[,-1] 34 | statdata$RNAseq = statdata$RNAseq[,c(grep("_Ik_0H", colnames(statdata$RNAseq)), 35 | grep("_Ik_24H", colnames(statdata$RNAseq)))] 36 | statdesign$RNAseq = rep(c("0h", "24h"), each = 3) 37 | min(statdata$RNAseq) # 1.14 38 | statdata$RNAseq = statdata$RNAseq - min(statdata$RNAseq) # min = 0 39 | 40 | 41 | ## miRNA-seq (LOG2-transformed) 42 | statdata$miRNAseq = read.delim("~/Dropbox/STATegra_STC/STATegraData/miRNA/miRNAseq_FinalData_NOtecrep.txt", 43 | row.names = 1, 
as.is = TRUE) 44 | statdata$miRNAseq = statdata$miRNAseq[,c(grep("I.0.", colnames(statdata$miRNAseq)), 45 | grep("I.24.", colnames(statdata$miRNAseq)))] 46 | statdesign$miRNAseq = rep(c("0h", "24h"), each = 3) 47 | min(statdata$miRNAseq) # 0.08775962 48 | statdata$miRNAseq = statdata$miRNAseq - min(statdata$miRNAseq) # min = 0 49 | 50 | 51 | 52 | ## ChIP-seq 53 | statdata$ChIPseq = read.delim("~/Dropbox/STATegra_STC/STATegraData/ChIPseq/STAT_CPM_ChIPseq_FILT.txt", 54 | row.names = 1) 55 | statdata$ChIPseq = log2(statdata$ChIPseq + 1) 56 | statdesign$ChIPseq = rep(c("0h", "24h"), each = 2) 57 | min(statdata$ChIPseq) # 0 58 | 59 | 60 | ## DNase-seq (LOG2-transformed) 61 | statdata$DNaseSeq = read.delim("~/Dropbox/STATegra_STC/STATegraData/DNaseSeq/STAT_DNaseSeq_homer_RPKM_TMM_ARSyN.txt", 62 | row.names = 1, as.is = TRUE) 63 | statdata$DNaseSeq = statdata$DNaseSeq[,c(grep("Ik0h_", colnames(statdata$DNaseSeq)), 64 | grep("Ik24h_", colnames(statdata$DNaseSeq)))] 65 | statdesign$DNaseSeq = statdesign$RNAseq 66 | min(statdata$DNaseSeq) # 1 67 | statdata$DNaseSeq = statdata$DNaseSeq - min(statdata$DNaseSeq) # min = 0 68 | 69 | 70 | ## Metabolomics 71 | statdata$metabolomics = read.delim("~/Dropbox/STATegra_STC/STATegraData/metabolomics/oldData_original.txt", 72 | as.is = TRUE, header = TRUE, row.names = 1) 73 | statdata$metabolomics = statdata$metabolomics[,c(grep("I.0.", colnames(statdata$metabolomics)), 74 | grep("I.24.", colnames(statdata$metabolomics)))] 75 | statdata$metabolomics = log2(statdata$metabolomics) 76 | statdesign$metabolomics = statdesign$RNAseq 77 | min(statdata$metabolomics) # 8.82e-05 78 | 79 | 80 | ## Proteomics (LOG2-transformed) 81 | statdata$proteomics = read.delim("~/Dropbox/STATegra_STC/STATegraData/proteomics/proteomicsImputedTMM.txt", 82 | as.is = TRUE, header = TRUE, row.names = 1) 83 | statdata$proteomics = statdata$proteomics[,c(grep("IKA_0h", colnames(statdata$proteomics)), 84 | grep("IKA_24h", colnames(statdata$proteomics)))] 85 | statdesign$proteomics = statdesign$RNAseq 86 | min(statdata$proteomics) # 1.97 87 | statdata$proteomics = statdata$proteomics - min(statdata$proteomics) 88 | 89 | 90 | sapply(statdata, dim) 91 | # RNAseq miRNAseq ChIPseq DNaseSeq metabolomics proteomics 92 | # [1,] 12762 469 23875 52788 60 1077 93 | # [2,] 6 6 4 6 6 6 94 | 95 | names(statdata) = names(statdesign) = omicas 96 | 97 | 98 | 99 | # Global MultiPower results ----------------------------------------------- 100 | 101 | par(mfrow = c(1,2)) 102 | type1 = rep(2,6) 103 | p1omics = c(0.4, 0.2, 0.2, 0.2, 0.6, 0.2) 104 | 105 | 106 | statResultsEQ = MultiPower(data = statdata, groups = statdesign, type = type1, d0 = 0.8, p1 = p1omics, 107 | omicPower = 0.6, averagePower = 0.8, fdr = 0.05, cost = 1, equalSize = TRUE, 108 | max.size = 200, omicCol = miscolores, dispPerc = 75) 109 | 110 | statResultsNE = MultiPower(data = statdata, groups = statdesign, type = type1, d0 = 0.8, p1 = p1omics, 111 | omicPower = 0.6, averagePower = 0.8, fdr = 0.05, cost = 1, equalSize = FALSE, 112 | max.size = 200, omicCol = miscolores, dispPerc = 75) 113 | 114 | mycost = c(1, 1.3, 1.5, 1.6, 1, 1) 115 | statResultsNEcost = MultiPower(data = statdata, groups = statdesign, type = type1, d0 = 0.8, p1 = p1omics, 116 | omicPower = 0.6, averagePower = 0.8, fdr = 0.05, cost = mycost, equalSize = FALSE, 117 | max.size = 200, omicCol = miscolores, dispPerc = 75) 118 | 119 | 120 | save(statResultsEQ, statResultsNE, statResultsNEcost, 121 | file = "STATegraResultsMultiPower6.RData") 122 | 123 | 124 | 125 | 126 | # 
Post-analysis ----------------------------------------------------------- 127 | 128 | STATpostEQ = postMultiPower(data = statdata, groups = statdesign, optResults = statResultsEQ, 129 | max.size = 4, omicCol = miscolores) 130 | 131 | STATpostNE = postMultiPower(data = statdata, groups = statdesign, optResults = statResultsNE, 132 | max.size = 5, omicCol = miscolores) 133 | 134 | STATpostNEcost = postMultiPower(data = statdata, groups = statdesign, optResults = statResultsNEcost, 135 | max.size = 3, omicCol = miscolores) 136 | 137 | 138 | save(statResultsEQ, statResultsNE, statResultsNEcost, 139 | STATpostEQ, STATpostNE, 140 | file = "STATegraResultsMultiPower6.RData") 141 | 142 | load("STATegraResultsMultiPower6.RData", verbose = TRUE) 143 | 144 | par(mfrow = c(1,2)) 145 | postPowerPlot(postResults = STATpostEQ, equalSize = TRUE, omicCol = miscolores, max.size = 3) 146 | postPowerPlot(postResults = STATpostNE, equalSize = FALSE, omicCol = miscolores, max.size = 5) 147 | postPowerPlot(postResults = STATpostNEcost, equalSize = FALSE, omicCol = miscolores, max.size = 5) 148 | 149 | 150 | powerPlot(statResultsNEcost$parameters, statResultsNEcost$optimalSampleSize, omicCol = miscolores) 151 | 152 | 153 | 154 | 155 | # Generating figures for the paper ---------------------------------------- 156 | 157 | setwd("~/Dropbox/STATegra_STC/FiguresOfMerit/powerResults") 158 | load("STATegraResultsMultiPower6.RData") 159 | 160 | pdf(file = "plots/stategraEQ.pdf", width = 3.5*2, height = 3.5*2) 161 | par(mfrow = c(2,2)) 162 | powerPlot(parameters = statResultsEQ$parameters, optimalSampleSize = statResultsEQ$optimalSampleSize, 163 | omicCol = miscolores) 164 | postPowerPlot(postResults = STATpostEQ, equalSize = TRUE, omicCol = miscolores, max.size = 4) 165 | dev.off() 166 | 167 | 168 | pdf(file = "plots/stategraNE.pdf", width = 4.5*2, height = 4.5) 169 | par(mfrow = c(1,2)) 170 | powerPlot(parameters = statResultsNE$parameters, optimalSampleSize = statResultsNE$optimalSampleSize, omicCol = miscolores) 171 | dev.off() 172 | 173 | pdf(file = "plots/stategraNEcost.pdf", width = 4.5*2, height = 4.5) 174 | par(mfrow = c(1,2)) 175 | powerPlot(parameters = statResultsNEcost$parameters, optimalSampleSize = statResultsNEcost$optimalSampleSize, omicCol = miscolores) 176 | dev.off() 177 | 178 | 179 | 180 | 181 | 182 | # Figures for estimated parameters per omic ------------------------------------- 183 | 184 | ## Number of features, %DE features 185 | 186 | par(mar = c(1,7,2,2), mfrow = c(1,2)) 187 | 188 | bb = barplot(sapply(statResultsEQ$parameters, function (x) x$m)/1000, las = 1, 189 | col = miscolores, horiz = TRUE, beside = TRUE, log = "x", 190 | border = miscolores, main = "# features", axes = FALSE) 191 | text(0.5, bb, sapply(statResultsEQ$parameters, function (x) x$m), adj = 1, 192 | col = 1, las = 2) 193 | 194 | par(mar = c(1,2,2,2)) 195 | bb = barplot(100*sapply(statResultsEQ$parameters, function (x) x$p1), 196 | las = 1, col = miscolores, horiz = TRUE, names.arg = "", 197 | border = miscolores, main = "% DE features", axes = FALSE) 198 | text(10, bb, paste0(round(100*sapply(statResultsEQ$parameters, function (x) x$p1),0), "%"), 199 | adj = 1, col = "white", las = 2) 200 | 201 | 202 | 203 | ## Dispersion 204 | 205 | par(mar = c(7,4,2,1))#, mfrow = c(1,2)) 206 | boxplot(lapply(statResultsEQ$parameters, function (x) x$allDispersions), col = miscolores, 207 | las = 2, main = "Pooled standard deviation", log = "y", ylab = "log-scale (dispersion)") 208 | 209 | 210 | 211 | 212 | 
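# Added example (not part of the original STATegra analysis): a hedged sketch of how
# low-effect features could be removed with CohenFilter() (see man/CohenFilter.Rd)
# before re-running MultiPower. The cutoff d = 0.5 is arbitrary, and the sketch assumes
# CohenFilter() returns the filtered data list, as its help page suggests.

statdataFilt = CohenFilter(data = statdata, d = 0.5, parameters = statResultsEQ$parameters)
sapply(statdataFilt, nrow)   # features remaining per omic after filtering

statResultsFiltEQ = MultiPower(data = statdataFilt, groups = statdesign, type = type1,
                               d0 = 0.8, p1 = p1omics, omicPower = 0.6, averagePower = 0.8,
                               fdr = 0.05, cost = 1, equalSize = TRUE, max.size = 200,
                               omicCol = miscolores, dispPerc = 75)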
--------------------------------------------------------------------------------
/MultiML/README.md:
--------------------------------------------------------------------------------
1 | # MultiML #
2 | 
3 | This script was created to estimate the minimum sample size required to keep the classification error rate below a desired threshold.
4 | Version: 0.1.0
5 | 
6 | 
7 | ## Introduction ##
8 | 
9 | Although differential expression methods are needed to perform an initial variable selection step, and it is therefore key to be aware of how the statistical power of such methods applied to our data can impact the results, an integrative multi-omic model is the final goal of multi-omic studies in many cases. Dimension reduction techniques or machine learning approaches are a popular choice in these cases, and the biological problem to solve will determine whether the selected methodology is unsupervised or supervised, and whether it is used for classification or prediction. For any of these approaches, the optimal sample size required to optimize the performance of the chosen method is still an open question in the field and, to our knowledge, no tools exist to estimate sample sizes in a systematic way. On top of that, the classical definition of statistical power is not appropriate for machine learning algorithms, where significance can only be assessed a posteriori. Thus, in the particular case of supervised approaches, performance can be measured in terms of classification or prediction error rates.
10 | The MultiML method uses multi-omic pilot data to estimate the minimum sample size required to keep the error rate below the desired threshold. This MultiML version uses Partial Least Squares-Discriminant Analysis (PLS-DA) or Random Forest classification methods to draw K subsamples of different sample sizes (nk) from the available observations, obtaining a classifier for each case and measuring the corresponding error rate (ek). On these (nk, ek) values, a first-order smoothed penalized P-spline regression is performed to estimate the learning curve, which allows the prediction of error rates for larger sample sizes. The MultiML method was created to help users determine the optimal sample size required to control classification error rates when using one or more omics datasets.
11 | 
12 | ## Getting Started ##
13 | 
14 | The MultiML method is available at:
15 | https://github.com/ConesaLab/MultiPower/blob/master/MultiML
16 | https://leobalzano@bitbucket.org/leobalzano/multiml.git
17 | 
18 | ## Required R libraries: ##
19 | These are the libraries needed from the CRAN repository; they can be installed with the
20 | install.packages() function and then loaded with library().
21 | ~~~~ 22 | - plotly 23 | - ggplot2 24 | - gridExtra 25 | - grid 26 | - ggpmisc 27 | - RColorBrewer 28 | - dplyr 29 | - glmnet 30 | - reshape2 31 | - rslurm 32 | - whisker 33 | - randomForest 34 | - caret 35 | - gtable 36 | - pls 37 | ~~~~ 38 | Required libraries from Bioconductor repository that must be installed following the Bioconductor instructions at: 39 | https://bioconductor.org/packages/release/bioc/html/RnaSeqSampleSize.html 40 | ~~~~ 41 | - mixOmics 42 | - Biobase 43 | ~~~~ 44 | Finally, there are some basic packages that do not need to be updated but they need to be loaded: 45 | ~~~~ 46 | - parallel 47 | - splines 48 | - graphics 49 | ~~~~ 50 | After installing all packages, it is necessary to load them all: 51 | ~~~~ 52 | library("plotly") 53 | library("ggplot2") 54 | library ("gridExtra") 55 | library("grid") 56 | library("ggpmisc") 57 | library("RColorBrewer") 58 | library("mixOmics") 59 | library("dplyr") 60 | library ("glmnet") 61 | library ("Biobase") 62 | library("reshape2") 63 | library("parallel") 64 | library("rslurm") 65 | library("whisker") 66 | library("randomForest") 67 | library("caret") 68 | library("gtable") 69 | library("splines") 70 | library("graphics") 71 | library("pls") 72 | ~~~~ 73 | ## Functions: ## 74 | A set of functions developed to perform the MultiML predictive analysis are required and it is necessary to upload them as well. 75 | source ("FoMPredictiveFunctions.R") 76 | 77 | Prediction of the number of samples required to reach a particular classification error rate 78 | 79 | MultiML method requires some input for each omic data type in order to compute a model to understand and predict the adequate number of samples with which the user can reach reasonably low classification errors. Utilizing Omics data results from a pilot study, the user can then extrapolate how many more samples they need to reach an acceptable error of classification. 80 | Next a sequence of the functions present in the method will be explained, in terms of inputs required and outputs obtained: 81 | 82 | ## Input data ## 83 | As unique input data required for MultiML to operate is a list of as many Omics datasets as the user wants and the Response matrix that can be binary or multifactorial. 84 | The names of the omics should be the names of each table in the list. All tables must be arranged in the same way, being in rows, the variables (genes, proteins, etc.) and in columns, the different samples (treatments and/or controls). Each element in this list must be a raw count data frame. 85 | 86 | 87 | 88 | | | TCGA-02-0432-01 | TCGA-08-0245-01 | TCGA-28-1750-01 | TCGA-06-1084-01 | TCGA-02-0064-01 | 89 | |--------|-----------------|-----------------|-----------------|-----------------|---------------- | 90 | |FSTL1 | 6.732094 | 9.172518 | 10.003722 | 9.818185 | 11.002899 | 91 | |AACS | 6.121826 | 6.348193 | 7.067599 | 6.822424 | 6.676739 | 92 | |RPS11 | 10.783535 | 10.533063 | 10.438774 | 11.006742 | 10.662374 | 93 | |CREB3L1 | 4.487414 | 4.620867 | 4.339173 | 4.283061 | 4.711638 | 94 | |ELMO2 | 7.770408 | 6.158398 | 7.035239 | 5.751987 | 7.152336 | 95 | |PNMA1 | 9.921098 | 9.556290 | 9.143960 | 7.626237 | 9.838729 | 96 | 97 | The response matrix must indicate a category for each sample. 
98 | 99 | | | TCGA-02-0432-01 |TCGA-08-0245-01 |TCGA-28-1750-01 |TCGA-06-1084-01 |TCGA-02-0064-01| 100 | |--------|-----------------|----------------|----------------|----------------|---------------| 101 | | Type | Proneural | Proneural | Mesenchymal | Mesenchymal | Mesenchymal| 102 | 103 | 104 | Estimation of the time required for a particular study: 105 | Since MultiML is a computationally expensive, time consuming strategy, it is recommendable to perform an estimation of the required time necessary to analyze a particular study. 106 | For this, a function RequiredtimeTest() was created. Its arguments are described in detail next: 107 | 108 | Input 109 | Predictors: A list of the different Omics Datasets and the Response matrix. 110 | Response: A number indicating the position of the response matrix included in "Predictors" object. 111 | Comps: Number of components to be calculated after each iteration. Just applicable to PLSDA approach. 112 | Function: Modular function used to calculate the error rate. It can be PLSDA.MP or Random.Forest.MP to indicate the approach to be used. 113 | crosval: Type of cross validation to be applied, Leave-One-Out (LOOCV) or ten fold change (TenF). 114 | Ticks: Number of segments (groups) of samples to evaluate. 115 | ERIterations: Number of iterations in which the error rate (ER) will be calculated. 116 | LassoIterations: Number of iterations of the Lasso selection per each error rate analysis. 117 | cpus_per_node: Number of CPUs that will be used in the analysis. 118 | ...: Arguments to be passed to methods. 119 | 120 | Example: 121 | ~~~~ 122 | EstTime<-RequiredtimeTest (Predictors=tcgadata[c(4,6)],Response=2, 123 | cpus_per_node=1, 124 | Ticks = 50, Function = PLSDA.MP,Comps = 10, 125 | crosval = "LOOCV",ERIterations = 50, 126 | LassoIterations=50) 127 | ~~~~ 128 | Output: 129 | Summary: A table indicating the conditions of the analysis, established by the user. 130 | EstimatedTime: A table of the estimated time that the process will last showed in different metrics (seconds, minutes, hours, or days). 131 | 132 | 133 | ## Classification Error Rate calculus ## 134 | 135 | This is calculated through the ER_Calculator() function which is the main function in the MultiML method. 136 | 137 | Input: 138 | Predictors: A list of different Omics Datasets and the Response matrix. 139 | Response: A number indicating the response matrix included in "Predictors" object. 140 | Previous_CER: A previous result of class "ClassificationErrorRate" (CER) to be fused to the one that is going to be calculated. This is useful for merging a pilot study with a posterior analysis. 141 | Ticks: Number of segments (groups) of samples to evaluate. If NULL, the calculation is made on its own considering the "TheoreticER". 142 | WhichTicks: Vector of numbers of ticks the user wants to analyze. If NULL, a random selection between the minimum and maximum number of samples is calculated. It results useful in posterior rounds of analysis. 143 | Function: Modular function used to calculate the error rate. It can be PLSDA.MP or Random.Forest.MP to indicate the approach to be used. 144 | Comps: Number of components to be calculated after each iteration. Just applicable to PLSDA approach. 145 | crosval: Type of cross validation to be applied, Leave-One-Out (LOOCV) or ten fold change (TenF). 146 | ERIterations: Number of iterations in which the error rate (ER) will be calculated. 147 | LassoIterations: Number of iterations of the Lasso selection per each error rate analysis. 
148 | 149 | Example: 150 | ~~~~ 151 | ProtsRF<-ER_Calculator(Predictors=tcgadata[c(4,6)], 152 | Response=2,Previous_CER=NULL,Ticks=NULL, 153 | WhichTicks=NULL,Function=Random.Forest.MP, 154 | Comps=10,crosval = "LOOCV",ERIterations=2, 155 | LassoIterations=2, TheoreticER=0.02) 156 | ~~~~ 157 | 158 | Output: 159 | TestedTicks: A vector of the number of evaluated number of samples. 160 | Omics: An indicator of the Omics evaluated. 161 | Minimums: A list of the minimum value of error rate, balanced (BER) or not (ER) obtained per each ten-component analysis. This is measured through three distance metrics to evaluate the classification performance of the model. Maximal distance "max.dist", distance to centroids "centroids.dist" or Mahalanobis distance "mahalanobis.dist". Thus, each table contains the results per each iteration at different subsets of samples. 162 | TablebyTick: A data frame with information related to the classification error rate obtained by tick ± standard error of the mean "SEM" and the characteristics of the analysis selected by the user. 163 | Prediction_table: A table with the information related to de predicted number of samples required to reach a particular ER ± a margin of error "MOE". 164 | CompWinner: A list of the number of components in which the minimum value of error rate, balanced (BER) or not (ER) was obtained per each iteration. This only applies for PLSDA module and it is measured through the three mentioned distance metrics, Maximal distance, distance to centroids or Mahalanobis distance, to evaluate the classification performance of the model. 165 | 166 | 167 | ## Elaboration of files and folder for running analyzes in a high-performance computer (HPC) ## 168 | 169 | Often, the users need to evaluate the data thoroughly, with analyzes among two or more Omics datasets, or determine if an Omics type is necessary or not for a particular study. 170 | To overcome the fact that the method is computationally demanding, we offer a set of functions that allows the data arrangement and the creation of procedure commands, so the user can be able to run them into an HPC, as long as the HPC can be run through the Slurm workload manager job scheduler. 171 | 172 | Step 1: Determining the comparisons to be evaluated in the dataset. 173 | This can be performed with the CombinationsFunction() function. Here, the user can create the combination of datasets, so they can be included in the HPC. 174 | 175 | Input: 176 | OmicsData: The data as list of predictor datasets including also the response (Y) matrix as the last element of the list. 177 | AnalysisType: Here we offer three types of analyzes that can be called. "exploratory", where all predictor matrices are going to be compared vs response matrix (Y) separately, as well as all omics dataset vs Y. "complete", where simply all combinations are created, even though this is not recommended because of the amount of time that a job of these characteristics implies. "detailed", where the user can choose by levels indicating if two datasets vs Y or three vs Y is required. 178 | levels: A vector indicating the levels to be analyzed. If "AnalysisType" = "detailed", if 2, then two predictors datasets will be contrasted against Y and so on. 179 | 180 | Example: 181 | ~~~~ 182 | ExplTest<-CombinationsFunction (OmicsData=tcgadata, 183 | AnalysisType="detailed", 184 | levels=c(2)) 185 | ~~~~ 186 | 187 | Output: 188 | Combinations: A list of the combinations created by the user. 
189 | Data: All required lists of datasets to be used as input for the error rate analysis. 190 | 191 | Step 2: Creating the files and folders to upload into an HPC. 192 | For this, the user can simply follow a procedure to create an "SlurmFunction" as follows: 193 | ~~~~ 194 | SlurmFunction <- function (X) { 195 | ER_Calculator(Predictors=X, 196 | Response=length(X), Previous_CER=NULL, 197 | Ticks=NULL, WhichTicks=NULL, 198 | Function=Random.Forest.MP, Comps=10, 199 | crosval = "LOOCV", ERIterations=2, 200 | LassoIterations=2, TheoreticER=0.02) 201 | } 202 | ~~~~ 203 | 204 | Then, the user can create their "SlurmJob" using the Slurm_Creator() function. 205 | This function creates the files to be uploaded into a cluster to calculate the results of all desired combinations of predictor datasets. 206 | 207 | Input: 208 | Function: An object class "SlurmFunction" that will calculate the error rates. 209 | Parameters: All required lists of datasets to be used as input for the error rate calculation. These parameters are the ones obtained in step 1. 210 | jobname: The desired name of the Slurm job. If NA, a random name of the form "slr####" will be assigned. 211 | nodes: The maximum number of nodes to be used in the cluster. 212 | cpus_per_node: The number of CPUs per node in the cluster. It determines how many processes are run in parallel per node. 213 | pkgs: A character vector containing the names of packages that must be loaded on each cluster node. By default, it includes all packages loaded by the user when slurm_apply is called. 214 | time: Time in days to run the data in the cluster. By default is 30 days, but according to the user HPC-availability this can be different. 215 | 216 | Example: 217 | ~~~~ 218 | My_Slurm_job <- Slurm_Creator(Function=SlurmFunction, 219 | Parameters=ExplTest$Data, 220 | jobname = 'my_Job', nodes = 8, 221 | cpus_per_node = 100, time = 30, 222 | pkgs = rev(.packages()) ) 223 | ~~~~ 224 | Output: 225 | Four elements to be uploaded into cluster are created: 226 | Function.RDS: A .RDS file with the Slurm Function that will be calculated. 227 | Parameters.RDS: A .RDS file with all lists of datasets that will be used for the error rate calculation. 228 | SlurmCreator_Run.R: A .R file with the parameters to run the job. 229 | SlurmCreator_submit_sh: A .sh file that indicates all parameters that the cluster requires to perform the job. 230 | 231 | Step 3: Upload all four files into an HPC and run. 232 | Step 4: Once the job is done, download all files from the HPC. 233 | Step 5: Plot results. 234 | 235 | 236 | ## Plotting the results ## 237 | 238 | ### Predictive plot. ### 239 | The predictive plots are generated throughout the function called ErrorRateplot(). This function can not only use the information calculated in the previous step but also, it can calculate the samples required for ER values even lower than the ones reached in the pilot experiment. 240 | 241 | Input: 242 | x: An object of class ER_calculator 243 | ErrorRateType: Character to indicate which error rate to plot. It can be error rate "ER", balanced error rate "BER" or "Both". 244 | MetricsType: Character to indicate the metrics to be plotted. This only applies for PLSDA module. It can be Maximal distance "max.dist", distance to centroids "centroids.dist", Mahalanobis distance "mahalanobis.dist" or "All" 245 | DoF: It can be either NULL to indicate that the degrees of freedom of the Spline model are ticks-1, or a value so the model is created with a different degree of freedom. 

Step 3: Upload all four files into the HPC and run the job.
Step 4: Once the job is done, download all files from the HPC.
Step 5: Plot the results.


## Plotting the results ##

### Predictive plot. ###
The predictive plots are generated with the ErrorRateplot() function. This function not only uses the information calculated in the previous step, but can also estimate the number of samples required for ER values even lower than those reached in the pilot experiment.

Input:
x: An object of class ER_calculator.
ErrorRateType: Character to indicate which error rate to plot. It can be the error rate "ER", the balanced error rate "BER" or "Both".
MetricsType: Character to indicate the metrics to be plotted. This only applies to the PLSDA module. It can be maximal distance "max.dist", distance to centroids "centroids.dist", Mahalanobis distance "mahalanobis.dist" or "All".
DoF: Either NULL, to indicate that the degrees of freedom of the Spline model are ticks-1, or a value so that the model is created with a different number of degrees of freedom. The chosen DoF affects the number of samples required, so the user must choose it carefully.
Projection: Indicates whether the Spline model should be projected until the minimum error rate Min_ER is reached (TRUE) or not (FALSE).
Spline: Indicates whether the Spline model should be plotted (TRUE) or not (FALSE).
TheoreticER: The minimum value of ER for which the adequate number of samples is to be calculated.
ConfInt: The confidence level used to calculate the margin of error and to plot the confidence area of the Spline model. The user can use 0.90, 0.95 or 0.99.

Example:
~~~~
plot<-ErrorRateplot(x=ProtsRF, Projection=TRUE, Spline=TRUE,
                    DoF = NULL, ErrorRateType = "ER",
                    MetricsType ="mahalanobis.dist", TheoreticER=NULL,
                    ConfInt=0.95 )
~~~~

Output:
A plot of "number of samples" versus "classification error rate", also indicating the metric used, the Omics evaluated, and the number of samples required to reach a particular error rate ± a margin of error "MOE".


### Comparative plot. ###
The user can also create a comparative plot of all Omics combinations, to determine which Omics contribute most to an accurate classification, by using the Comparative_ERPlot() function.

Input:
L: A list of different Omics-integration results obtained with the ER_Calculator() function.
ErrorRateType: Character to indicate which error rate to plot. It can be the error rate "ER" or the balanced error rate "BER" if the PLSDA approach was the one calculated.
MetricsType: Character to indicate the metrics to be plotted. This only applies to the PLSDA module. It can be maximal distance "max.dist", distance to centroids "centroids.dist" or Mahalanobis distance "mahalanobis.dist".

Example:
First, create a list of the results you want to plot together, as follows:

~~~~
Allplots<-list(
  My_Omics1_Result= My_Omics1_Result,
  My_Omics2_Result= My_Omics2_Result,
  My_Omics3_Result= My_Omics3_Result,
  My_Omics1_Omics2_Result= My_Omics1_Omics2_Result,
  My_Omics1_Omics3_Result= My_Omics1_Omics3_Result,
  My_Omics1_Omics2_Omics3_Result= My_Omics1_Omics2_Omics3_Result,
  My_Omics2_Omics3_Result= My_Omics2_Omics3_Result
)
~~~~
Then, simply call the function:
~~~~
Plot<- Comparative_ERPlot(L=Allplots,
                          ErrorRateType = "ER",
                          MetricsType ="mahalanobis.dist")
~~~~
Output:
A plot of "number of samples" versus "classification error rate" for all the different analyses.



## How to cite MultiML ##
Soon in bioRxiv!

## References ##
[1] Hastie, T., James, G., Witten, D., Tibshirani, R. (2013). An Introduction to Statistical Learning. Springer, New York. pp. 316-321.

[2] Meyer, M. C. (2008). Inference using shape-restricted regression splines. The Annals of Applied Statistics, 2(3), 1013-1033.

[3] Ramsay, J. O. (1988). Monotone regression splines in action. Statistical Science, 3(4), 425-441.
305 | -------------------------------------------------------------------------------- /R/MultiOmicsPower15.R: -------------------------------------------------------------------------------- 1 | ###################################################################################### 2 | ###### MULTIPOWER 3 | ###### Optimization model to maximize power of multi-omics integration models 4 | ###################################################################################### 5 | 6 | 7 | ## By Sonia Tarazona and David Gomez-Cabrero 8 | ## 05-Oct-2017 9 | ## Last modified: March-2023 10 | # 11 | 12 | 13 | 14 | 15 | #### PACKAGES READ 16 | # install.packages("FDRsampsize") 17 | require(FDRsampsize) 18 | require(lpSolve) 19 | 20 | 21 | # install.packages("slam") 22 | # install.packages("lpmodeler") 23 | # require(lpmodeler) 24 | # require(Rsymphony) 25 | # 1) Install SYMPHONY: (now 5.6.16, first MultiPower version was with 5.6.10) 26 | # cd 27 | # svn checkout https://projects.coin-or.org/svn/SYMPHONY/releases/5.6.16 SYMPHONY-5.6.16 28 | # cd SYMPHONY-5.6.16 29 | # ./configure 30 | # make 31 | # make install 32 | # 2) Install other components: 33 | # sudo apt-get install coinor-libcgl-dev coinor-libclp-dev coinor-libcoinutils-dev coinor-libosi-dev 34 | # sudo apt-get install coinor-libsymphony-dev 35 | # sudo apt-get install autotools-dev 36 | # 3) Intall package in R (previously downloaded from CRAN): 37 | # install.packages("Rsymphony_0.1-26.tar.gz", repos = NULL) 38 | 39 | # https://cran.r-project.org/src/contrib/Rsymphony_0.1-26.tar.gz 40 | # https://projects.coin-or.org/SYMPHONY 41 | #Download: wget http://www.coin-or.org/download/source/SYMPHONY/SYMPHONY-5.6.6.tgz 42 | 43 | 44 | # require(boot) 45 | 46 | 47 | 48 | # Auxiliary functions ----------------------------------------------------- 49 | 50 | geomean = function (x) exp(mean(log(x))) 51 | 52 | cohen.h = function (p) abs(2*asin(sqrt(p[1])) - 2*asin(sqrt(p[2]))) # p is a vector with two components 53 | 54 | power.binary = function(n, sig.level=0.05, p1_p2, p1) { 55 | if (length(p1_p2) != length(p1)) stop("p1_p2 and p1 must be of the same length") 56 | p2 = p1 - p1_p2 57 | mypower = sapply(1:length(p1), function (i) power.prop.test(n=n, p1 = p1[i], p2 = p2[i], sig.level = sig.level)$power) 58 | return(mypower) 59 | } 60 | 61 | 62 | 63 | 64 | # Estimating parameters needed for power calculation ---------------------- 65 | 66 | ## Two-group comparison 67 | paramEst = function (data, groups, type = 1) { 68 | 69 | # type = 1 (counts), 2 (gaussian), 3 (binary variables: 0/1 or FALSE/TRUE) 70 | 71 | # Sample size per group 72 | nGroup = table(groups) 73 | 74 | sd0 = apply(data, 1, sd) 75 | sd0 = which(sd0 == 0) 76 | if (length(sd0) > 0) { 77 | print(paste0(length(sd0), " constant features are to be removed from the analysis.")) 78 | data = data[-sd0,] 79 | } 80 | 81 | # Number of features 82 | M = nrow(data) 83 | 84 | # Sequencing depth correction for count data 85 | if (type == 1) { 86 | seqdepth = colSums(data) 87 | med = median(seqdepth) 88 | data = apply(data, 2, function (x) med*x/sum(x)) 89 | } 90 | 91 | 92 | # Mean counts per group 93 | meanPerGroup = t(apply(data, 1, tapply, INDEX = groups, mean, na.rm = TRUE)) 94 | 95 | if (type != 3) { 96 | 97 | # Standard deviation per group 98 | sdPerGroup = t(apply(data, 1, tapply, INDEX = groups, sd, na.rm = TRUE)) 99 | sdPerGroup = sdPerGroup[,names(nGroup)] 100 | 101 | # Pooled Standard Deviation 102 | SDpooled = sqrt((nGroup[1]*sdPerGroup[,1]^2 + 
nGroup[2]*sdPerGroup[,2]^2)/(sum(as.numeric(nGroup))-2)) 103 | 104 | # Cohen's d per feature 105 | deltaPerFeat = abs(meanPerGroup[,1] - meanPerGroup[,2]) 106 | d = deltaPerFeat/SDpooled 107 | } else { 108 | d = apply(meanPerGroup, 1, cohen.h) 109 | } 110 | 111 | 112 | if (type == 1) { ## COUNT DATA 113 | cat("Parameters are to be estimated for count data \n") 114 | 115 | if(min(data, na.rm = TRUE) < 0) stop("Negative values were found. Are you sure these are count data?\n") 116 | 117 | # Fold-change estimation 118 | allFC = log2(apply(meanPerGroup, 1, function (x) max(0.0000001, x[2]) / max(x[1], 0.0000001))) 119 | 120 | # Average counts 121 | mu = rowMeans(data, na.rm = TRUE) 122 | 123 | # CV 124 | myCV = SDpooled/mu 125 | 126 | # Estimated parameters for count data 127 | myparameters = list("type" = type, "logFC" = allFC, "pooledSD" = SDpooled, "CV" = myCV, 128 | "delta" = deltaPerFeat, "mu" = mu,"m" = M, "d" = d, "nGroup" = nGroup) 129 | 130 | } 131 | 132 | if (type == 2) { ## NORMAL DATA 133 | cat("Parameters are to be estimated for normally distributed data \n") 134 | 135 | # Estimated parameters for normal data 136 | myparameters = list("type" = type, "delta" = deltaPerFeat, "pooledSD" = SDpooled, 137 | "m" = M, "d" = d, "nGroup" = nGroup) 138 | } 139 | 140 | if (type == 3) { ## BINARY DATA 141 | cat("Parameters are to be estimated for binary data \n") 142 | 143 | # Estimated parameters for normal data 144 | myparameters = list("type" = type, "p1_p2" = meanPerGroup[,1]-meanPerGroup[,2], "p1" = meanPerGroup[,1], 145 | "m" = M, "d" = d, "nGroup" = nGroup) 146 | } 147 | 148 | return(myparameters) 149 | 150 | } 151 | 152 | 153 | 154 | # Computing power or sample size given the rest of parameters ------------- 155 | 156 | getPower = function (parameters, power = NULL, n = NULL, fdr = 0.05, 157 | null.effect = 0, max.n = 500) { 158 | 159 | # Compute power for given n 160 | 161 | if (is.null(power)) { 162 | if (is.null(n)) stop("Please, indicate a value for either power or n arguments. 
\n") 163 | 164 | if (parameters$type == 1) { # COUNT DATA (Negative Binomial) 165 | potencia = fdr.power(fdr = fdr, n = n, pow.func = power.hart, eff.size = parameters$logFC, null.effect = null.effect, 166 | mu = parameters$mu, sig = parameters$CV) 167 | 168 | } 169 | 170 | if (parameters$type == 2) { # NORMAL DATA 171 | potencia = fdr.power(fdr = fdr, n = n, pow.func = power.twosampt, eff.size = parameters$delta, null.effect = null.effect, 172 | sigma = parameters$pooledSD) 173 | } 174 | 175 | 176 | if (parameters$type == 3) { # BINARY DATA 177 | potencia = fdr.power(fdr = fdr, n = n, pow.func = power.binary, eff.size = parameters$p1_p2, null.effect = null.effect, 178 | p1 = parameters$p1) 179 | } 180 | 181 | 182 | return(potencia) } else { # Compute n for given power 183 | 184 | if (parameters$type == 1) { # COUNT DATA 185 | 186 | tamany = fdr.sampsize(fdr = fdr, ave.pow = power, pow.func = power.hart, eff.size = parameters$logFC, 187 | null.effect = null.effect, mu = parameters$mu, sig = parameters$CV, 188 | max.n = max.n, min.n = 2)$n 189 | 190 | } 191 | 192 | if (parameters$type == 2) { # NORMAL DATA 193 | 194 | tamany = fdr.sampsize(fdr = fdr, ave.pow = power, pow.func = power.twosampt, eff.size = parameters$delta, 195 | null.effect = null.effect, sigma = parameters$pooledSD, 196 | max.n = max.n, min.n = 2)$n 197 | } 198 | 199 | 200 | if (parameters$type == 3) { # BINARY DATA 201 | 202 | tamany = fdr.sampsize(fdr = fdr, ave.pow = power, pow.func = power.binary, eff.size = parameters$p1_p2, 203 | null.effect = null.effect, p1 = parameters$p1, 204 | max.n = max.n, min.n = 2)$n 205 | 206 | } 207 | 208 | return(max(ceiling(tamany), 2)) 209 | 210 | } 211 | } 212 | 213 | 214 | 215 | # Optimal sample size ----------------------------------------------------- 216 | 217 | optimalRep = function (parameters, omicPower = 0.6, averagePower = 0.85, fdr = 0.05, cost = 1, 218 | equalSize = TRUE, max.size = 200, null.effect = 0) { 219 | 220 | omics = names(parameters) 221 | 222 | if (length(omicPower) == 1) omicPower = rep(omicPower, length(omics)) 223 | names(omicPower) = omics 224 | 225 | if (equalSize) { ## Same sample size for all omics 226 | 227 | # Compute n for each omic 228 | n1 = sapply(omics, function (oo) getPower(parameters[[oo]], 229 | power = omicPower[oo], n = NULL, 230 | fdr = fdr, max.n = max.size, 231 | null.effect = null.effect)) 232 | names(n1) = omics 233 | 234 | n1max = max(n1, 2, na.rm = TRUE) 235 | 236 | allPowers = sapply(omics, function (oo) getPower(parameters[[oo]], 237 | power = NULL, n = n1max, 238 | fdr = fdr)) 239 | n2 = n1max 240 | if (n2 > max.size) stop("Maximum size allowed has been exceed. 241 | Please, increase max.size parameter to get the optimal sample size. \n") 242 | 243 | # Compute n to satisfy global power 244 | while(sum(allPowers)/length(omics) < averagePower) { 245 | n2 = n2 + 1 246 | if (n2 > max.size) stop("Maximum size allowed has been exceed. 247 | Please, increase max.size parameter to get the optimal sample size. 
\n") 248 | allPowers = sapply(omics, function (oo) getPower(parameters[[oo]], power = NULL, 249 | n = n2, fdr = fdr, 250 | null.effect = null.effect)) 251 | } 252 | 253 | return(list("n0" = n1, "n" = n2, "finalPower" = allPowers, "fdr" = fdr, 254 | "omicPower" = omicPower, "averagePower" = averagePower, "cost" = cost)) 255 | 256 | } else { ## Different sample size for each omic 257 | 258 | sss = optiSSnotEqual(parameters, fdr, cost, max.size, omicPower, averagePower, 259 | null.effect) 260 | n2 = as.numeric(sss[,"SampleSize"]) 261 | allPowers = as.numeric(sss[,"Power"]) 262 | names(allPowers) = names(n2) = sss[,"Omic"] 263 | 264 | return(list("n0" = NA, "n" = n2, "finalPower" = allPowers, "fdr" = fdr, 265 | "omicPower" = omicPower, "averagePower" = averagePower, "cost" = cost)) 266 | 267 | } 268 | 269 | } 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | # Summary of results ------------------------------------------------------ 278 | 279 | powerSummary = function(parameters, optimalSampleSize) { 280 | 281 | tabla = data.frame("omic" = names(parameters), "type" = sapply(parameters, function (x) x$type), 282 | "numFeat" = sapply(parameters, function (x) x$m), 283 | "minCohenD" = sapply(parameters, function (x) round(min(x$d, na.rm = TRUE),2)), 284 | "maxCohenD" = sapply(parameters, function (x) round(max(x$d, na.rm = TRUE),2)), 285 | "minPower" = optimalSampleSize$omicPower, 286 | "averPower" = optimalSampleSize$averagePower, 287 | "cost" = optimalSampleSize$cost, 288 | "minSampleSize" = optimalSampleSize$n0, 289 | "optSampleSize" = optimalSampleSize$n, 290 | "power" = round(optimalSampleSize$finalPower,4)) 291 | print(tabla) 292 | return(tabla) 293 | } 294 | 295 | 296 | 297 | 298 | 299 | 300 | # Plots for power study --------------------------------------------------- 301 | 302 | powerPlot = function(parameters, optimalSampleSize, omicCol = NULL) { 303 | 304 | if (is.null(omicCol)) { 305 | if (length(parameters) > 12) { 306 | stop("Too many omics to be plotted. Please, select a lower number of omics to plot. 
\n") 307 | } 308 | omicCol = colors()[c(554,89,111,512,17,586,132,428,601,568,86,390)] 309 | omicCol = omicCol[1:length(parameters)] 310 | } 311 | 312 | omicShape = 1:length(parameters) 313 | names(omicCol) = names(omicShape) = names(parameters) 314 | 315 | 316 | ## Power versus Sample Size 317 | 318 | # Sample Sizes 319 | nmax = max(optimalSampleSize$n) 320 | ngroup = unique(as.numeric(sapply(parameters, function (x) x$nGroup))) 321 | xMin = 2 322 | xMax = round(max(nmax+20, (3*nmax - xMin)/2),0) 323 | xValues = c(round(seq(xMin, xMax, (xMax - xMin)/10)), optimalSampleSize$n) 324 | xValues = sort(unique(c(xValues, ngroup))) 325 | 326 | # Powers 327 | yValues = matrix(NA, ncol = length(parameters), nrow = length(xValues)) 328 | rownames(yValues) = xValues 329 | colnames(yValues) = names(parameters) 330 | 331 | for (i in 1:nrow(yValues)) { 332 | for (j in 1:ncol(yValues)) { 333 | yValues[i,j] = getPower(parameters[[j]], power = NULL, n = xValues[i], fdr = optimalSampleSize$fdr) ### null.effect 334 | } 335 | } 336 | 337 | matplot(xValues, yValues, type = "l", lwd = 2, xlab = "Sample size", ylab = "Statistical power", 338 | main = "Power vs Sample Size", col = omicCol, lty = omicShape) 339 | optiSS = optimalSampleSize$n 340 | if (length(optiSS) == 1) optiSS = rep(optiSS, length(parameters)) 341 | points(optiSS, diag(yValues[as.character(optiSS),]), pch = 15, col = omicCol, cex = 1.2) 342 | legend("bottomright", names(parameters), lwd = 2, col = omicCol, lty = omicShape, bty = "n") 343 | 344 | 345 | ## Power vs Effect Size 346 | 347 | # Quantiles of effect size 348 | xValues = seq(0,0.75,0.05) # max P75 349 | 350 | # Powers 351 | yValues2 = matrix(NA, ncol = length(parameters), nrow = length(xValues)) 352 | rownames(yValues2) = xValues 353 | colnames(yValues2) = names(parameters) 354 | 355 | parameters2 = parameters 356 | 357 | optiSS = optimalSampleSize$n 358 | if (length(optiSS) == 1) optiSS = rep(optiSS, length(parameters)) 359 | 360 | percentiles = lapply(parameters, function (x) {quantile(x$d, probs = xValues, na.rm = TRUE)}) 361 | 362 | for (i in 1:nrow(yValues2)) { 363 | for (j in 1:ncol(yValues2)) { 364 | selefeat = which(parameters2[[j]]$d >= percentiles[[j]][i]) 365 | parameters2[[j]]$d = parameters2[[j]]$d[selefeat] 366 | 367 | if (parameters2[[j]]$type == 1) { 368 | parameters2[[j]]$logFC = parameters2[[j]]$logFC[selefeat] 369 | parameters2[[j]]$pooledSD = parameters2[[j]]$pooledSD[selefeat] 370 | parameters2[[j]]$CV = parameters2[[j]]$CV[selefeat] 371 | parameters2[[j]]$delta = parameters2[[j]]$delta[selefeat] 372 | parameters2[[j]]$mu = parameters2[[j]]$mu[selefeat] 373 | } 374 | 375 | if (parameters2[[j]]$type == 2) { 376 | parameters2[[j]]$pooledSD = parameters2[[j]]$pooledSD[selefeat] 377 | parameters2[[j]]$delta = parameters2[[j]]$delta[selefeat] 378 | } 379 | 380 | if (parameters2[[j]]$type == 3) { 381 | parameters2[[j]]$p1_p2 = parameters2[[j]]$p1_p2[selefeat] 382 | parameters2[[j]]$p1 = parameters2[[j]]$p1[selefeat] 383 | } 384 | 385 | yValues2[i,j] = getPower(parameters2[[j]], power = NULL, n = optiSS[j], fdr = optimalSampleSize$fdr) 386 | 387 | } 388 | } 389 | 390 | if (!all(is.na(yValues2))) { 391 | matplot(xValues*100, yValues2, type = "l", lwd = 2, xlab = "Percentiles for effect size cutoff", ylab = "Statistical power", 392 | main = "Power vs Effect size", col = omicCol, lty = omicShape) 393 | points(rep(0, length(parameters)), as.numeric(yValues2[1,]), 394 | pch = 15, col = omicCol, cex = 1.2) 395 | legend("bottomright", names(parameters), lwd = 2, col = omicCol, lty 
= omicShape, bty = "n") 396 | } 397 | 398 | ## Data to plot 399 | return(list("PowerVsSsampleSize" = yValues, 400 | "PowerVsEffectSize" = yValues2)) 401 | 402 | } 403 | 404 | 405 | 406 | 407 | 408 | 409 | # Computing optimal sample size when it is not equal for all omics ---------------------------------------------------------- 410 | 411 | optiSSnotEqual = function (parameters, fdr = 0.05, cost = 1, max.size = 100, 412 | omicPower = 0.6, averagePower = 0.8, null.effect = 0) { 413 | 414 | ##### GENERATION OF MATRICES FOR THE PROBLEM 415 | 416 | K = length(parameters) # number of omics 417 | 418 | if (length(cost) < K) cost = rep(cost[1], K) 419 | if (length(max.size) < K) max.size = rep(max.size[1], K) 420 | if (length(omicPower) < K) omicPower = rep(omicPower[1], K) 421 | 422 | num.var = sum(max.size) - length(max.size) ## sample size = 1 is not considered 423 | # coeffs power; 424 | # constraint sum(vars) = 1 (to have only 1 sample size); coeffs average power 425 | myC = NULL # coeffs of objective function 426 | # A1: coeffs of power per omic 427 | # A3: sum(Zij) = 1 for each omic i 428 | A1 = A3 = matrix(0, nrow = K, ncol = num.var) 429 | # A2: coeffs for average power for all omics 430 | A2 = NULL 431 | 432 | for (k in 1:K) { 433 | # coef.power --> A1, A2 434 | # coef1 --> A3 435 | coef.power = coef1 = rep(0, num.var) 436 | 437 | for (i in 2:max.size[k]) { 438 | myC = c(myC, cost[k]*i*2) # coefficients of objective function 439 | 440 | # power of each (omic, sample size) 441 | my.power = getPower(parameters[[k]], power = NULL, n = i, fdr = fdr, 442 | null.effect = null.effect, max.n = max.size) 443 | 444 | # coeff average power 445 | A2 = c(A2, my.power) 446 | 447 | if (k == 1) { 448 | coef.power[i-1] = my.power 449 | coef1[i-1] = 1 450 | } 451 | 452 | if (k > 1) { 453 | coef.power[i-k+sum(max.size[1:(k-1)])] = my.power 454 | coef1[i-k+sum(max.size[1:(k-1)])] = 1 455 | } 456 | 457 | } 458 | A1[k,] = coef.power 459 | A3[k,] = coef1 460 | } 461 | 462 | A2 = matrix(A2, nrow = 1, byrow = TRUE) 463 | 464 | myA = rbind(A1, A2, A3) 465 | 466 | mydir = c(rep(">=", K+1), rep("=", K)) 467 | 468 | myRHS = c(omicPower, K*averagePower, rep(1,K)) 469 | 470 | 471 | #### SOLUTION OF THE PROBLEM 472 | mysol = lp(direction = "min", objective.in = myC, const.mat = myA, 473 | const.dir = mydir, const.rhs = myRHS, all.int = TRUE, all.bin = TRUE) 474 | 475 | if (mysol$status == 2) { 476 | 477 | stop("No feasible solution was found for these requirements.") 478 | 479 | } else { 480 | 481 | myvars = unlist(sapply(1:K, function (k) paste(names(parameters)[k], 482 | 2:max.size[k], sep = "="))) 483 | mysolution = myvars[which(mysol$solution == 1)] 484 | mysolution = as.data.frame(do.call("rbind", strsplit(mysolution, "=")), 485 | stringsAsFactors = FALSE) 486 | colnames(mysolution) = c("Omic", "SampleSize") 487 | mysolution = data.frame(mysolution, "OmicCost" = cost*as.numeric(mysolution[,2])*2, 488 | "Power" = diag(A1[,mysol$solution == 1])) 489 | 490 | return(mysolution) 491 | 492 | } 493 | 494 | } 495 | 496 | 497 | 498 | 499 | 500 | 501 | # Wrapper function: MULTIPOWER -------------------------------------------- 502 | 503 | MultiPower = function(data, groups, type, omicPower = 0.6, averagePower = 0.85, 504 | fdr = 0.05, cost = 1, equalSize = TRUE, max.size = 200, omicCol = NULL, 505 | powerPlots = TRUE, null.effect = 0) { 506 | 507 | parameters = lapply(1:length(data), function (i) { 508 | cat(paste0("Estimating parameters for omic: ", names(data)[i], " \n")) 509 | paramEst(data[[i]], groups[[i]], type[i])}) 
510 | names(parameters) = names(data) 511 | 512 | cat("Computing optimal sample size... \n") 513 | optimalSampleSize = optimalRep(parameters, omicPower, averagePower, fdr, cost, 514 | equalSize, max.size = max.size, null.effect) 515 | 516 | resum = powerSummary(parameters, optimalSampleSize) 517 | 518 | if (powerPlots) { 519 | cat("Generating power plots... \n") 520 | data2plot = powerPlot(parameters, optimalSampleSize, omicCol) 521 | } else { data2plot = NULL } 522 | 523 | return(list("parameters" = parameters, 524 | "optimalSampleSize" = optimalSampleSize, 525 | "summary" = resum, 526 | "data2plot" = data2plot)) 527 | 528 | } 529 | 530 | 531 | 532 | 533 | 534 | # postMultiPower ---------------------------------------------------------- 535 | 536 | postMultiPower = function(optResults, max.size = 5, omicCol = NULL) { 537 | 538 | omics = names(optResults$parameters) 539 | 540 | equalSS = TRUE 541 | if (sum(is.na(optResults$summary$minSampleSize)) == nrow(optResults$summary)) equalSS = FALSE 542 | 543 | maxD = round(min(optResults$summary$maxCohenD),1) 544 | maxD = min(c(maxD, 4)) 545 | lasD = seq(0,maxD-0.1,0.1) 546 | 547 | mySize = myPower = myM = matrix(NA, ncol = length(omics), nrow = length(lasD)) 548 | colnames(mySize) = colnames(myPower) = colnames(myM) = omics 549 | rownames(mySize) = rownames(myPower) = rownames(myM) = paste0("d=", lasD) 550 | 551 | mySize[1,] = optResults$summary$optSampleSize 552 | myPower[1,] = optResults$summary$power 553 | myM[1,] = sapply(optResults$parameters, function (x) x$m) 554 | 555 | parameters2 = optResults$parameters 556 | 557 | for (i in 2:nrow(mySize)) { 558 | for (j in 1:ncol(mySize)) { 559 | selefeat = which(parameters2[[j]]$d >= lasD[i]) 560 | parameters2[[j]]$d = parameters2[[j]]$d[selefeat] 561 | 562 | if (parameters2[[j]]$type == 1) { 563 | parameters2[[j]]$logFC = parameters2[[j]]$logFC[selefeat] 564 | parameters2[[j]]$pooledSD = parameters2[[j]]$pooledSD[selefeat] 565 | parameters2[[j]]$CV = parameters2[[j]]$CV[selefeat] 566 | parameters2[[j]]$delta = parameters2[[j]]$delta[selefeat] 567 | parameters2[[j]]$mu = parameters2[[j]]$mu[selefeat] 568 | } 569 | 570 | if (parameters2[[j]]$type == 2) { 571 | parameters2[[j]]$pooledSD = parameters2[[j]]$pooledSD[selefeat] 572 | parameters2[[j]]$delta = parameters2[[j]]$delta[selefeat] 573 | } 574 | 575 | if (parameters2[[j]]$type == 3) { 576 | parameters2[[j]]$p1_p2 = parameters2[[j]]$p1_p2[selefeat] 577 | parameters2[[j]]$p1 = parameters2[[j]]$p1[selefeat] 578 | } 579 | 580 | } 581 | 582 | tmp = optimalRep(parameters2, omicPower = optResults$summary$minPower, 583 | averagePower = optResults$summary$averPower[1], 584 | fdr = optResults$optimalSampleSize$fdr, cost = optResults$summary$cost, 585 | equalSize = equalSS, max.size = max(optResults$summary$optSampleSize, na.rm = TRUE)) 586 | 587 | mySize[i,] = tmp$n 588 | myPower[i,] = tmp$finalPower 589 | myM[i,] = sapply(parameters2, function (x) length(x$d)) 590 | } 591 | 592 | # Post-results 593 | myresult = list("SampleSize" = mySize, "Power" = myPower, "NumFeat" = myM, "d" = lasD) 594 | 595 | # Plot 596 | postPowerPlot(postResults = myresult, equalSize = equalSS, omicCol = omicCol, max.size = max.size) 597 | 598 | return(myresult) 599 | 600 | } 601 | 602 | 603 | 604 | 605 | # postPowerPlot ----------------------------------------------------------- 606 | 607 | postPowerPlot = function(postResults, equalSize, omicCol = NULL, max.size = 10) { 608 | 609 | if (is.null(omicCol)) { 610 | omicCol = colors()[c(554,89,111,512,17,586,132,428,601,568,86,390)] 
611 | omicCol = omicCol[1:nrow(postResults$SampleSize)] 612 | } 613 | names(omicCol) = colnames(postResults$SampleSize) 614 | 615 | if (min(postResults$SampleSize) > max.size) { 616 | cat(paste0("The chosen sample size of ", max.size, " is not feasible. \n")) 617 | max.size = min(postResults$SampleSize) 618 | cat(paste0("A sample size of ", max.size, " will be plotted instead. \n")) 619 | } 620 | 621 | if (equalSize) { 622 | 623 | fff = approxfun(postResults$SampleSize[,1], postResults$d) 624 | myD = round(fff(max.size),1) 625 | 626 | cat(paste0("For having a sample size of ", max.size, " and maintain the desired power, you need to remove features with Cohen's d below ", myD, ". \n")) 627 | cat("The number of remaining features in each omic is: \n") 628 | print(postResults$NumFeat[paste0("d=",myD),]) 629 | 630 | plot(postResults$d, postResults$SampleSize[,1], type = "l", lwd = 2, xlab = "Cohen's d cutoff", 631 | ylab = "Number of replicates", main = "Sample size vs Cohen's d", 632 | ylim = c(2, max(postResults$SampleSize))) 633 | arrows(x0 = 0, y0 = max.size, x1 = myD, y1 = max.size, lty = 2, col = 2) 634 | arrows(x0 = myD, y0 = max.size, x1 = myD, y1 = 2, lty = 2, col = 2) 635 | text(min(postResults$d)+1, max(postResults$SampleSize, na.rm = TRUE)-2, paste0("Cohen's d = ", myD), col = 2) 636 | 637 | } else { 638 | 639 | fff = sapply(1:ncol(postResults$SampleSize), function (i) { fff = approxfun(postResults$SampleSize[,i], postResults$d) 640 | return(fff(max.size)) }) 641 | myD = round(max(fff, na.rm = TRUE),2) 642 | 643 | matplot(postResults$d, postResults$SampleSize, type = "l", lwd = 2, col = omicCol, lty = 1, 644 | xlab = "Cohen's d", ylab = "Number of replicates", main = "Sample size vs Cohen's d", 645 | ylim = c(2, max(postResults$SampleSize))) 646 | legend("topright", colnames(postResults$SampleSize), lwd = 2, col = omicCol, bty = "n") 647 | arrows(x0 = 0, y0 = max.size, x1 = myD, y1 = max.size, lty = 2, col = 1) 648 | arrows(x0 = myD, y0 = max.size, x1 = myD, y1 = 2, lty = 2, col = 1) 649 | text(mean(postResults$d), max(postResults$SampleSize, na.rm = TRUE), 650 | paste0("Cohen's d = ", myD), col = 1, adj = 0.5) 651 | } 652 | 653 | mypar = par() 654 | suppressWarnings(par(xpd = TRUE, mar = c(6.2,4,3,0.8))) 655 | barplot(postResults$Power[c(1,min(which(postResults$d >= myD))),], col = rep(omicCol, each = 2), 656 | beside = TRUE, las = 2, ylab = "Statistical power", ylim = c(0,1), 657 | border = rep(omicCol, each = 2), density = rep(c(30,100), ncol(postResults$Power))) 658 | legend(x = 0.5, y = 1.2, c("Optimal SS", "User's SS"), col = 1, density = c(30,100), ncol = 2, bty = "n") 659 | suppressWarnings(par(mypar)) 660 | 661 | } 662 | 663 | 664 | 665 | 666 | 667 | # Wrapper function MultiGroupPower --------------------------------------------------------- 668 | 669 | MultiGroupPower = function(data, groups, type, comparisons = NULL, 670 | omicPower = 0.6, averagePower = 0.85, 671 | fdr = 0.05, cost = 1, equalSize = TRUE, max.size = 200, omicCol = NULL, 672 | powerPlots = FALSE, summaryPlot = TRUE) { 673 | 674 | grupsComuns = Reduce(intersect, groups) 675 | 676 | if (is.null(comparisons)) { # Generating all possible comparisons 677 | 678 | comparisons = combn(grupsComuns, m = 2) 679 | 680 | } else { # Checking that required comparisons are possible for all omics 681 | 682 | grupsComparats = unique(as.vector(comparisons)) 683 | 684 | if (!setequal(grupsComparats, grupsComuns)) stop("Groups to be compared are not available for all omics.") 685 | } 686 | 687 | nomsCompa = apply(comparisons, 
2, paste, collapse = "_") 688 | 689 | output = vector("list", length = length(nomsCompa)); names(output) = nomsCompa 690 | 691 | for (i in 1:length(output)) { 692 | 693 | cat(nomsCompa[i], sep = "\n") 694 | 695 | quines = lapply(1:length(data), function (j) which(is.element(groups[[j]], comparisons[,i]))) 696 | data2 = lapply(1:length(data), function (j) data[[j]][,quines[[j]]]); names(data2) = names(data) 697 | groups2 = lapply(1:length(data), function (j) groups[[j]][quines[[j]]]); names(groups2) = names(groups) 698 | 699 | output[[i]] = MultiPower(data = data2, groups = groups2, type, omicPower, averagePower, 700 | fdr, cost, equalSize, max.size, omicCol, powerPlots) 701 | } 702 | 703 | output$GlobalSummary = output[[1]]$summary 704 | 705 | output$GlobalSummary[,"minSampleSize"] = apply(sapply(output[-length(output)], function(x) x$summary[,"minSampleSize"]), 1, 706 | function (y) { 707 | if (length(unique(y)) == 1) return(unique(y)) 708 | if (length(unique(y)) != 1) return(paste(range(y, na.rm = T), collapse = "-"))}) 709 | 710 | output$GlobalSummary[,"optSampleSize"] = apply(sapply(output[-length(output)], function(x) x$summary[,"optSampleSize"]), 711 | 1, max, na.rm = TRUE) 712 | 713 | tmpSS = sapply(output[-length(output)], function(x) x$summary[,"optSampleSize"]) 714 | tmpPower = sapply(output[-length(output)], function(x) x$summary[,"power"]) 715 | output$GlobalSummary[,"power"] = sapply(1:nrow(tmpSS), function (i) tmpPower[i,which.max(tmpSS[i,])]) 716 | 717 | cat("=============================== \n") 718 | cat("Global summary \n") 719 | cat("=============================== \n") 720 | print(output$GlobalSummary) 721 | 722 | 723 | if (summaryPlot) MultiCompaPlot(multiOutput = output, omicCol = omicCol, equalSize = equalSize, 724 | legendLoc = "bottom") 725 | 726 | return(output) 727 | 728 | } 729 | 730 | 731 | 732 | # Plot for multiple comparisons ------------------------------------------- 733 | 734 | MultiCompaPlot = function(multiOutput, omicCol = NULL, equalSize, legendLoc = "bottomright") { 735 | 736 | if (is.null(omicCol)) { 737 | if (length(multiOutput[[1]]$parameters) > 12) { 738 | stop("Too many omics to be plotted. 
\n") 739 | } 740 | omicCol = colors()[c(554,89,111,512,17,586,132,428,601,568,86,390)] 741 | omicCol = omicCol[1:length(multiOutput[[1]]$parameters)] 742 | } 743 | 744 | omicShape = 1:length(multiOutput[[1]]$parameters) 745 | names(omicCol) = names(omicShape) = names(multiOutput[[1]]$parameters) 746 | 747 | 748 | ## Comparisons versus Sample Size 749 | if (equalSize) { 750 | apintar = sapply(multiOutput[-length(multiOutput)], function (x) x$summary[,"minSampleSize"]) 751 | rownames(apintar) = names(multiOutput[[1]]$parameters) 752 | laopt = multiOutput$GlobalSummary[1,"optSampleSize"] 753 | optComp = sapply(multiOutput[-length(multiOutput)], function (x) x$summary[1,"optSampleSize"]) 754 | 755 | matplot(1:ncol(apintar), t(apintar), type = "l", lwd = 2, xlab = "Comparisons", ylab = "Sample Size", 756 | main = "Sample Size per Comparison", col = omicCol, lty = omicShape, xaxt='n') 757 | axis(side = 1, at=1:ncol(apintar), labels=colnames(apintar)) 758 | abline(h = laopt, lty = 1, lwd = 4) 759 | points(1:ncol(apintar), optComp, pch = 15, col = 1, cex = 1.3) 760 | legend(legendLoc, c(rownames(apintar), " ", "Comparison Optimal SS", "Global Optimal SS"), lwd = 2, col = c(omicCol,"white", 1,1), 761 | lty = c(omicShape,1,NA,1), bty = "n", pch = c(rep(NA, nrow(apintar)+1), 15, NA), title = "Min SS per comparison", cex = 0.8) 762 | 763 | } else { 764 | apintar = sapply(multiOutput[-length(multiOutput)], function (x) x$summary[,"optSampleSize"]) 765 | rownames(apintar) = names(multiOutput[[1]]$parameters) 766 | laOpt = multiOutput$GlobalSummary[,"optSampleSize"] 767 | apintar = data.frame(apintar, "Optimal" = laOpt, check.names = FALSE) 768 | 769 | matplot(1:ncol(apintar), t(apintar), type = "l", lwd = 2, xlab = "Comparisons", ylab = "Sample Size", 770 | main = "Sample Size per Comparison", col = omicCol, lty = omicShape, xaxt='n') 771 | axis(side = 1, at=1:ncol(apintar), labels=colnames(apintar)) 772 | points(rep(ncol(apintar), nrow(apintar)), laOpt, pch = 15, col = omicCol, cex = 1.3) 773 | legend(legendLoc, rownames(apintar), lwd = 2, col = omicCol, cex = 0.8, 774 | lty = omicShape, bty = "n") 775 | 776 | } 777 | 778 | 779 | ## Comparisons vs Power 780 | apintar = sapply(multiOutput[-length(multiOutput)], function (x) x$summary[,"power"]) 781 | rownames(apintar) = names(multiOutput[[1]]$parameters) 782 | optPow = multiOutput$GlobalSummary[,"power"] 783 | apintar = data.frame(apintar, "Optimal" = optPow, check.names = FALSE) 784 | 785 | matplot(1:ncol(apintar), t(apintar), type = "l", lwd = 2, xlab = "Comparisons", ylab = "Statistical Power", 786 | main = "Power per Comparison at Optimal SS", col = omicCol, lty = omicShape, xaxt='n') 787 | axis(side = 1, at=1:ncol(apintar), labels=colnames(apintar)) 788 | points(rep(ncol(apintar), nrow(apintar)), optPow, pch = 15, col = omicCol, cex = 1.3) 789 | legend(legendLoc, rownames(apintar), lwd = 2, col = omicCol, 790 | lty = omicShape, bty = "n", cex = 0.8) 791 | 792 | } 793 | 794 | 795 | 796 | 797 | 798 | # Filtering data with Cohen's d cutoff ------------------------------------ 799 | 800 | CohenFilter = function (data, d, parameters) { 801 | 802 | if (length(d) == 1) d = rep(d, length(data)) 803 | if (length(d) != length(data)) { 804 | stop("Please, provide a single value for d or as many values a omic data types in data") 805 | } 806 | 807 | dataF = data 808 | 809 | for (i in 1:length(data)) { 810 | quitar = which(parameters[[i]]$d < d[i]) 811 | dataF[[i]] = dataF[[i]][-quitar,] 812 | } 813 | 814 | return(dataF) 815 | 816 | } 817 | 
-------------------------------------------------------------------------------- /MultiML/FoMPredictiveFunctions.R: -------------------------------------------------------------------------------- 1 | ########################################################### 2 | ############ FoMPredictiveFunctions.R ############# 3 | ########################################################### 4 | # Author: Leandro Balzano-Nogueira 5 | # Genetics Institute, University of Florida (Gainesville) 6 | # Last update: October/09/2019 7 | 8 | # This script is to gather together the functions used for the MultiML predictive analysis 9 | 10 | ########################################################### 11 | # Functions: 12 | 13 | permutations <- function(n){ 14 | # Internal checkpoint function to create permutations to evaluate 15 | # if all samples included are the same, and if they are in the 16 | # same order so they can be compared across all Omics datasets 17 | 18 | if(n==1){ 19 | return(matrix(1)) 20 | } else { 21 | sp <- permutations(n-1) 22 | p <- nrow(sp) 23 | A <- matrix(nrow=n*p,ncol=n) 24 | for(i in 1:n){ 25 | A[(i-1)*p+1:p,] <- cbind(i,sp+(sp>=i)) 26 | } 27 | return(A) 28 | } 29 | } 30 | 31 | ########################################################### 32 | 33 | g_legend <- function(a.gplot){ 34 | # Function to break ggplots legends and reconstruct them somewhere else 35 | tmp <- ggplot_gtable(ggplot_build(a.gplot)) 36 | leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box") 37 | legend <- tmp$grobs[[leg]] 38 | return(legend) 39 | } 40 | ########################################################### 41 | 42 | CombinationsFunction <- function (OmicsData, AnalysisType="complete", levels=NULL) { 43 | # Function to create the combinations of datasets to be studied 44 | # Input : 45 | # OmicsData: The data as list of predictor datasets including also the response (Y) matrix as the last element of the list 46 | # AnalysisType: The type of analysis, it can be 47 | # "exploratory", where all predictors vs Y along with each dataset of predictors vs Y are evaluated 48 | # "complete", where all combinations are created or 49 | # "detailed", where the user can choose by levels indicating if two datasets vs Y or three vs Y is required 50 | # levels: A vector indicating the levels to be analyzed if Analysis Type is detailed, if 2, then two predictors datasets will be contrasted against Y and so on 51 | 52 | # Output: 53 | # A list of two lists: 54 | # Combinations: A list of the combinations required by the user to be created 55 | # Data: All required lists of datasets to be used as input for the error rate analysis 56 | 57 | vectito<-names(OmicsData)[1:length(OmicsData)-1] 58 | Combinatory<-list() 59 | combita<-combitachica<-NULL 60 | 61 | for (factores in 1:length (vectito)) { 62 | namecito<-paste(factores,"Omics", sep = " ") 63 | #print(namecito) 64 | Secuencia<-seq (1:factores) 65 | combita<-combn(x=vectito, m=factores) 66 | 67 | for (om in 1:ncol(combita)){ 68 | namecitoChico<-paste(namecito,om, sep="") 69 | combitachica<-combita[,om] 70 | Combinatory[namecitoChico]<-list(combitachica) 71 | 72 | } 73 | } 74 | Combinations<-list() 75 | if (AnalysisType=="exploratory") { 76 | Combinations<-Combinatory[c(1:length(OmicsData)-1,(length(Combinatory)))] 77 | print(paste("Performing the analysis of",length(Combinations),"out of",length(Combinatory) ,"possible combinations", sep=" " ) ) 78 | } 79 | if (AnalysisType=="complete") { 80 | Combinations<-Combinatory 81 | print(paste("Performing the analysis with 
all",length(Combinatory) ,"possible combinations", sep=" " ) ) 82 | } 83 | if (AnalysisType=="detailed") { 84 | if (is.null(levels) ) { 85 | stop("If AnalysisType = detailed, you must indicate the levels") 86 | } 87 | SelList<-list() 88 | for (lev in 1:length(levels)) { 89 | levs=levels[lev] 90 | for (All in 1:length(Combinatory)) { 91 | if(length(Combinatory[[All]])==levs) { 92 | neiminho<-names(Combinatory[All]) 93 | SelList<-Combinatory[[All]] 94 | Combinations[neiminho]<-list(SelList) 95 | } 96 | } 97 | } 98 | print(paste("Performing the analysis of",length(Combinations),"out of",length(Combinatory) ,"possible combinations", sep=" " ) ) 99 | } 100 | ListofOmics<-OmicList<-list() 101 | for (branch in 1:length(Combinations)) { 102 | piece<-Combinations[[branch]] 103 | NameOmicsData<-paste0(piece, collapse = "_" ) 104 | OmicsData2<-OmicsData[c(piece,names(OmicsData[length(OmicsData)]) )] 105 | OmicList<-list(OmicsData2) 106 | ListofOmics[NameOmicsData]<-OmicList 107 | } 108 | res<-list(Combinations=Combinations, Data=ListofOmics) 109 | return(res) 110 | } 111 | 112 | ########################################################### 113 | SlurmFunction <- function (X) { 114 | # Function that will be included in the cluster for the required analysis 115 | # Input: 116 | # X: A list of datasets created in "CombinationsFunction" 117 | # comps: Number of componets to be calculated after each iteration through "ClassificationErrorRate" function 118 | # CrosVal: Type of cross validation to be applied in "ClassificationErrorRate" function, Leave-One-Out (LOOCV) or ten fold change (TenF) 119 | # ticks: Number of segments (groups) of samples to evaluate. 120 | # iterations: Number of iterations in which error rate will be calculated 121 | 122 | ER_Calculator(Predictors=X, Response=length(X),Previous_CER=NULL,Ticks=NULL,WhichTicks=NULL,Function=Random.Forest.MP,Comps=10,crosval = "LOOCV",ERIterations=2,LassoIterations=2,TheoreticER=0.02) 123 | } 124 | 125 | ########################################################### 126 | Slurm_Creator<-function (Function, Parameters, jobname = NA, nodes = 2, cpus_per_node = 2, time=30, 127 | pkgs = rev(.packages()) ) 128 | { 129 | # Function to create the files to be uploaded into a cluster to calculate the results of all particular combinations of predictor datasets. 130 | # Input: 131 | # Function: An object class SlurmFunction that will calculate the error rates 132 | # Parameters: All required lists of datasets to be used as input for the error rate calculation 133 | # jobname: The desired name of the Slurm job. If NA, a random name of the form "slr####" will be assigned 134 | # nodes: The maximum number of nodes to be used in the cluster. 135 | # cpus_per_node: The number of CPUs per node in the cluster. It determines how many processes are run in parallel per node 136 | # pkgs: A character vector containing the names of packages that must be loaded on each cluster node. By default, it includes all packages loaded by the user when slurm_apply is called. 
137 | # time: Time in days to run the data in the cluster 138 | 139 | # Output: 140 | # Four elements to be uploaded into cluster: 141 | # Function.RDS: A .RDS file with the Slurm Function that will be calculated 142 | # Parameters.RDS: A .RDS file with all lists of datasets that will be used for the error rate calculation 143 | # SlurmCreator_Run.R: A .R file with the parameters to run the job 144 | # SlurmCreator_submit_sh: A .sh file that indicates all parameters that the cluster requires to perform the job 145 | 146 | if (!is.function(Function)) { 147 | stop("first argument to slurm_apply should be a function") 148 | } 149 | 150 | if (!is.numeric(nodes) || length(nodes) != 1) { 151 | stop("nodes should be a single number") 152 | } 153 | if (!is.numeric(cpus_per_node) || length(cpus_per_node) != 154 | 1) { 155 | stop("cpus_per_node should be a single number") 156 | } 157 | tmpdir <- paste0("rslurm_", jobname) 158 | dir.create(tmpdir, showWarnings = FALSE) 159 | saveRDS(Parameters, file = file.path(tmpdir, "Parameters.RDS")) 160 | saveRDS(Function, file = file.path(tmpdir, "Function.RDS")) 161 | if (length(Parameters) < cpus_per_node * nodes) { 162 | nchunk <- cpus_per_node 163 | } 164 | else { 165 | nchunk <- ceiling(length(Parameters)/nodes) 166 | } 167 | nodes <- ceiling(length(Parameters)/nchunk) 168 | template_r <- readLines("templates/Slurm_run_R.txt") 169 | script_r <- whisker::whisker.render(template_r, list(pkgs = pkgs, 170 | nchunk = nchunk, cpus_per_node = cpus_per_node)) 171 | writeLines(script_r, file.path(tmpdir, "SlurmCreator_Run.R")) 172 | template_sh <- readLines("templates/submit_sh.txt") 173 | 174 | rscript_path <- file.path(R.home("bin"), "Rscript") 175 | script_sh <- whisker::whisker.render(template_sh, list(max_node = nodes - 1, 176 | jobname = jobname, 177 | time=time, 178 | rscript = rscript_path)) 179 | writeLines(script_sh, file.path(tmpdir, "SlurmCreator_submit_sh")) 180 | slurm_job(jobname, nodes) 181 | } 182 | 183 | ########################################################### 184 | ClassificationErrorRate<- function (Predictors, Response=length(Predictors),Function=Random.Forest.MP,Comps=10,crosval = "LOOCV",Ticks=10,WhichTicks=NULL,ERIterations=15,LassoIterations=15,ErrorRateType="ER",...) { 185 | # Function to evaluate through Error rate the predictive capability of the Multipower package 186 | # Input: 187 | # Predictors: A list of different Omics Datasets and the Response matrix 188 | # Response: A number indicating the response matrix included in Predictors 189 | # Comps: Number of componets to be calculated after each iteration 190 | # crosval: Type of cross validation to be applied, Leave-One-Out (LOOCV) or ten fold change (TenF) 191 | # Ticks: Number of segments (groups) of samples to evaluate. 192 | # ERIterations: Number of iterations in which ER will be calculated 193 | # LassoIterations: Number of iterations of the Lasso selection per each Error rate analysis 194 | 195 | # Output: 196 | # Omics: A vector of the evaluated Omics 197 | # A list of two lists: 198 | # Minimums: A list of the minimum value of error rate, balanced (BER) or not (ER) obtained per each ten 199 | # component analysis. This is measured through three distance metrics to evaluate the 200 | # classification performance of the model. 
Maximal distance (max.dist), distance to 201 | # centroids (centroids) or Mahalanobis distance (Mahalanobis) 202 | # Thus, each table contains the results per each iteration at different subsets of samples 203 | 204 | # CompWinner: A list of the number of components in which the minimum value of error rate, 205 | # balanced (BER) or not (ER) was obtainde per each iteration. This is measured through 206 | # the three mentioned distance metrics to evaluate the classification performance 207 | # of the model. Thus, each table contains the components per each iteration at different 208 | # subsets of samples 209 | 210 | ########## 211 | 212 | components=Comps 213 | 214 | if (! is.list(Predictors) ) { 215 | stop("\\nOmics dataset must be a list with at least two elements") 216 | } 217 | ########### 218 | # Y 219 | Y<-t(as.matrix(Predictors[[Response]], drop=FALSE)) 220 | if (is.character(Y[,1])) { 221 | Ycita<-transform(Y, Type = as.numeric(Type)) 222 | Y<-Ycita 223 | rm(Ycita) 224 | } 225 | if (ncol(Y) != 1) { 226 | stop("\\nResponse must be a single variable") 227 | } 228 | if (any(is.na(Y))) { 229 | stop("\\nResponse must not contain missing values") 230 | } 231 | if (is.null(colnames(Y))) { 232 | colnames(Y) = "Y" 233 | } 234 | if (is.null(rownames(Y))) { 235 | rownames(Y) = 1:n 236 | } 237 | 238 | # Step1: Match the sample size 239 | LosIndivs<- rownames(Y) 240 | for (i in 1:length(Predictors)){ 241 | LosIndivs = intersect(LosIndivs, colnames(Predictors[[i]])) 242 | } 243 | #print(paste("This analysis will be performed with",length(LosIndivs),"samples, since those are the ones repeated in all layers")) 244 | 245 | NewList<-Predictors 246 | 247 | for (i in 1:length(NewList)) { 248 | NewList[[i]]<-NewList[[i]][,colnames(NewList[[i]]) %in% LosIndivs] 249 | } 250 | 251 | ########### 252 | # Step2: Match the order 253 | LosColnames<-colnames(NewList[[1]]) 254 | 255 | for (i in 1:length(NewList)) { 256 | NewList[[i]]<-NewList[[i]][,sort(LosColnames) ] 257 | } 258 | 259 | TestdeMatchTable<-unique(matrix(permutations(length(NewList)),ncol=2)) 260 | 261 | for (i in 1:nrow(TestdeMatchTable)) { 262 | a=TestdeMatchTable[i,1] 263 | b=TestdeMatchTable[i,2] 264 | 265 | if (all(colnames(NewList[[a]])==colnames(NewList[[b]]))){ 266 | #print(paste("Columns of lists",a,"and",b,"are equally sorted")) 267 | } else{ 268 | #print("The colnames are not equally sorted") 269 | } 270 | } 271 | if (crosval=="LOOCV"){ 272 | valid="loo" 273 | } 274 | if (crosval=="TenF"){ 275 | valid="Mfold" 276 | } 277 | 278 | # Y 279 | Y<-t(as.matrix(NewList[[Response]], drop=FALSE)) 280 | if (is.character(Y[,1])) { 281 | Ycita<-transform(Y, Type = as.numeric(Type)) 282 | Y<-Ycita 283 | rm(Ycita) 284 | } 285 | 286 | ######### 287 | Omics<-data.frame(Omics=names(NewList[-Response])) 288 | 289 | MinN<-round(length(table(as.factor(Y[,1]))) * 2, digits = 0) 290 | MaxN<-nrow(Y) 291 | Ngroups<-length(table(as.factor(Y[,1]))) 292 | if(!is.null(Ticks)) { 293 | vectTicks<-round(seq(from=MinN,to = MaxN,length.out = Ticks),digits = 0) 294 | } else { 295 | if (is.null(WhichTicks)) { 296 | print("Please insert which ticks you want to calculate") 297 | } else { 298 | vectTicks<-WhichTicks 299 | } 300 | } 301 | muestra<-seq(1:length(vectTicks)) 302 | 303 | muestrita<-muestritaLASSO<-NULL 304 | Muestra<-muestrasas<-list() 305 | MuestraLASSO<-muestrasasLASSO<-list() 306 | Minimums<-minimossas<-list() 307 | winnerSas<-CompWinner<-list() 308 | 309 | for (i in 1:length(vectTicks)) { 310 | 311 | #print(paste("Tick",i)) 312 | print(paste("Processing your 
data... Please wait")) 313 | 314 | Mins<-ComponentWinner<-preMins<-NULL 315 | Minsbigrfc<-NULL 316 | TableMins<-NULL 317 | 318 | for (iter in 1:ERIterations){ 319 | iter=iter 320 | #print(paste("performing iteration ",iter," of tick ",i,sep="")) 321 | #print(paste("Tick ",i,": ", vectTicks[i], " samples", sep = "")) 322 | namecito<-paste(vectTicks[i], " samples", sep = "") 323 | resample <- TRUE 324 | index <- rownames(Y) 325 | fun <- function(x) sample(x, round((vectTicks[i])/Ngroups,digits=0), replace = resample) 326 | a <- aggregate(index, by = list(group = Y[,1]), FUN = fun ) 327 | a<-a[,-1] 328 | aLASSO<-aggregate(index, by = list(group = Y[,1]), FUN = fun ) 329 | aLASSO<-aLASSO[,-1] 330 | Premuestrita<-as.vector(unlist(a)) 331 | PremuestritaLASSO<-as.vector(unlist(aLASSO)) 332 | if(vectTicks[i]>=length(Premuestrita)){ 333 | muestrita<-c(Premuestrita,sample(rownames(Y), size=vectTicks[i]-length(Premuestrita), replace = TRUE)) 334 | muestritaLASSO<-c(PremuestritaLASSO,sample(rownames(Y), size=vectTicks[i]-length(PremuestritaLASSO), replace = TRUE)) 335 | } else{ 336 | muestrita<-Premuestrita[1:(length(Premuestrita)-(length(Premuestrita)-vectTicks[i]))] 337 | muestritaLASSO<-PremuestritaLASSO[1:(length(PremuestritaLASSO)-(length(PremuestritaLASSO)-vectTicks[i]))] 338 | } 339 | muestrasas<-list(muestrita) 340 | muestrasasLASSO<-list(muestritaLASSO) 341 | Muestra[namecito]<-muestrasas 342 | MuestraLASSO[namecito]<-muestrasasLASSO 343 | ##### 344 | Xchica2<-NewList 345 | Xchica2LASSO<-NewList 346 | 347 | # Matching the order again 348 | LasMuestras<-Muestra[[i]] 349 | LasMuestrasLASSO<-MuestraLASSO[[i]] 350 | # Sorting the tables 351 | for (long in 1:length(Xchica2)) { 352 | Xchica2[[long]]<-Xchica2[[long]][,sort(LasMuestras) ] 353 | } 354 | for (long2 in 1:length(Xchica2LASSO)) { 355 | Xchica2LASSO[[long2]]<-Xchica2LASSO[[long2]][,sort(LasMuestrasLASSO) ] 356 | } 357 | 358 | TestdeMatchTable<-unique(matrix(permutations(length(Xchica2)),ncol=2)) 359 | for (lulu in 1:nrow(TestdeMatchTable)) { 360 | cola=TestdeMatchTable[lulu,1] 361 | colb=TestdeMatchTable[lulu,2] 362 | 363 | if (all(colnames(Xchica2[[cola]])==colnames(Xchica2[[colb]]))){ 364 | #print(paste("Columns of lists",cola,"and",colb,"are equally sorted")) 365 | } else{ 366 | print("Columns of lists",cola,"and",colb,"are NOT equally sorted") 367 | } 368 | } 369 | 370 | TestdeMatchTableLASSO<-unique(matrix(permutations(length(Xchica2LASSO)),ncol=2)) 371 | for (lulu in 1:nrow(TestdeMatchTableLASSO)) { 372 | cola=TestdeMatchTableLASSO[lulu,1] 373 | colb=TestdeMatchTableLASSO[lulu,2] 374 | 375 | if (all(colnames(Xchica2LASSO[[cola]])==colnames(Xchica2LASSO[[colb]]))){ 376 | #print(paste("Columns of lists",cola,"and",colb,"are equally sorted")) 377 | } else{ 378 | print("Columns of lists",cola,"and",colb,"are NOT equally sorted") 379 | } 380 | } 381 | # 382 | 383 | for (ii in 1:length(Xchica2)) { 384 | Xchica2[[ii]]<-Xchica2[[ii]][,Muestra[[i]] %in% colnames(Xchica2[[ii]])] 385 | } 386 | for (ii in 1:length(Xchica2LASSO)) { 387 | Xchica2LASSO[[ii]]<-Xchica2LASSO[[ii]][,MuestraLASSO[[i]] %in% colnames(Xchica2LASSO[[ii]])] 388 | } 389 | 390 | #### 391 | Xchica<-Xchica2[-Response] 392 | Ychica<-as.matrix(Xchica2[[Response]]) 393 | rm(Xchica2) 394 | if (is.character(Ychica[1,])) { 395 | Ycita<-as.numeric(as.factor(c(Ychica))) 396 | Ychica<-rbind(Ychica,as.numeric(Ycita)) 397 | } 398 | for (lili in 1:length(Xchica)) { 399 | Xchica[[lili]]<-t(Xchica[[lili]]) 400 | } 401 | 402 | XchicaLASSO<-Xchica2LASSO[-Response] 403 | 
YchicaLASSO<-as.matrix(Xchica2LASSO[[Response]]) 404 | rm(Xchica2LASSO) 405 | if (is.character(YchicaLASSO[1,])) { 406 | Ycita2<-as.numeric(as.factor(c(YchicaLASSO))) 407 | YchicaLASSO<-rbind(YchicaLASSO,as.numeric(Ycita2)) 408 | } 409 | for (lili in 1:length(XchicaLASSO)) { 410 | XchicaLASSO[[lili]]<-t(XchicaLASSO[[lili]]) 411 | } 412 | 413 | PreRes<-operator (X=Xchica,Y=Ychica,XLasso=XchicaLASSO, YLasso=YchicaLASSO, LassoIterations=LassoIterations, FUN=Function,Comps = components) 414 | 415 | if (length(PreRes)>1){ 416 | Mins<-as.matrix(cbind(Mins,PreRes$preMins)) 417 | ComponentWinner<-cbind(ComponentWinner,PreRes$ComponentWinner2) 418 | class(ComponentWinner)<-"numeric" 419 | } else { 420 | Mins<-cbind(Mins,PreRes) 421 | } 422 | } # Closing Iteration 423 | nameSample<-paste(vectTicks[i], " samples", sep = "") 424 | if (dim(Mins)[1]==1){ 425 | colnames(Mins)<-paste0("iter",seq(1:ERIterations));rownames(Mins)<-"OOB_ER" 426 | 427 | } 428 | minimossas<-list(Mins) 429 | Minimums[nameSample]<-minimossas 430 | winnerSas<-list(ComponentWinner) 431 | CompWinner[nameSample]<-winnerSas 432 | } 433 | ##### END of for (i in 1:length(vectTicks)) 434 | ListofMins<-Minimums 435 | SDsas<-StdDev<-list() 436 | for(lista in 1:length(ListofMins)){ 437 | Table<-ListofMins[[lista]] 438 | SDtita<-SDTota<-NULL 439 | for (row in 1:nrow(Table)){ 440 | SDtita<-sd(Table[row,]) /sqrt(ncol(Table)) 441 | SDTota<-rbind(SDTota,SDtita) 442 | } 443 | rownames(SDTota)<-rownames(ListofMins[[1]]) 444 | nameList<-names(ListofMins[lista]) 445 | SDsas<-list(SDTota) 446 | StdDev[nameList]<-SDsas 447 | 448 | } 449 | ### 450 | PreSDstable<-data.frame(matrix(data=NA,nrow=nrow(StdDev[[1]]),ncol = length(StdDev))) 451 | rownames(PreSDstable)<-rownames(StdDev[[1]]) 452 | colnames(PreSDstable)<-t(data.frame(strsplit(names(StdDev),split=" "))[1,])[,1] 453 | 454 | for (iiiii in 1: length(StdDev)) { 455 | PreSDstable[,iiiii]<-rowMeans(StdDev[[iiiii]]) 456 | } 457 | 458 | ### 459 | Premeanstable<-data.frame(matrix(data=NA,nrow=nrow(ListofMins[[1]]),ncol = length(ListofMins))) 460 | rownames(Premeanstable)<-rownames(ListofMins[[1]]) 461 | colnames(Premeanstable)<-t(data.frame(strsplit(names(ListofMins),split=" "))[1,])[,1] 462 | 463 | for (i in 1: length(ListofMins)) { 464 | Premeanstable[,i]<-rowMeans(ListofMins[[i]]) 465 | } 466 | 467 | ListofComps<-CompWinner 468 | if (is.null(ListofComps[[1]]) ){ 469 | PreCompstable<-NULL 470 | } else { 471 | ToNumListofComps2<-PreCompstabletita<-PreCompstable<-NULL 472 | for (i in 1: length(ListofComps)) { 473 | ToNumListofComps2<-ListofComps[[i]] 474 | class(ToNumListofComps2) <- "numeric" 475 | for (row in 1:nrow(ToNumListofComps2)){ 476 | PreCompstabletita<-rowMedians(ToNumListofComps2) 477 | } 478 | PreCompstable<-cbind(PreCompstable,PreCompstabletita) 479 | } 480 | rownames(PreCompstable)<-rownames(ListofComps[[1]]) 481 | colnames(PreCompstable)<-t(data.frame(strsplit(names(ListofComps),split=" "))[1,])[,1] 482 | PreCompstable<-as.data.frame(PreCompstable) 483 | } 484 | 485 | 486 | MeansTable2<-Premeanstable 487 | SDstable2<-PreSDstable 488 | Compstable2<-PreCompstable 489 | 490 | MeansTable2$Category <- rownames(MeansTable2) 491 | SDstable2$Category <- rownames(SDstable2) 492 | Compstable2$Category <- rownames(Compstable2) 493 | meltedMeans<-melt(MeansTable2, id.vars="Category") 494 | meltedSDs<-melt(SDstable2, id.vars="Category") 495 | colnames(meltedSDs)[3]<-"StdDev" 496 | if (is.null(Compstable2) ){ 497 | meltedComps<-NULL 498 | allmelted<-cbind(meltedMeans,StdDev=meltedSDs$StdDev) 499 | 
allmelted$ErrorRate<-t(data.frame(strsplit(allmelted$Category, split="_"))[2,])[,1] 500 | allmelted$Metric<-t(data.frame(strsplit(allmelted$Category, split="_"))[1,])[,1] 501 | } else { 502 | meltedComps<-melt(Compstable2, id.vars="Category") 503 | colnames(meltedComps)[3]<-"Comp" 504 | allmelted<-cbind(meltedMeans,StdDev=meltedSDs$StdDev,Comp=meltedComps$Comp) 505 | allmelted$ErrorRate<-t(data.frame(strsplit(allmelted$Category, split="_"))[2,])[,1] 506 | allmelted$Metric<-t(data.frame(strsplit(allmelted$Category, split="_"))[1,])[,1] 507 | } 508 | 509 | if (!is.null(Ticks)) { 510 | seqdeTicks<-as.numeric(as.character(unique(allmelted$variable))) 511 | MinN<-min(as.numeric(as.character(unique(allmelted$variable)))) 512 | MaxN<-max(as.numeric(as.character(unique(allmelted$variable)))) 513 | vectdeTicks<-round(seq(from=MinN,to = MaxN,length.out = Ticks),digits = 0) 514 | ElAusente<-vectdeTicks[!(vectdeTicks %in% seqdeTicks)] 515 | 516 | Elreemplazito<-Elreemplazo<-NULL 517 | for (ab in 1:length(ElAusente)){ 518 | Elreemplazito<-seqdeTicks[which.min(abs(seqdeTicks - ElAusente[ab])) ] 519 | Elreemplazo<-c(Elreemplazo,Elreemplazito) 520 | } 521 | while (length(Elreemplazo[duplicated(Elreemplazo)])>0) { 522 | Elnuevoreemplazo<-seqdeTicks[seqdeTicks>Elreemplazo[duplicated(Elreemplazo)] & seqdeTicks< Elreemplazo[which(duplicated(Elreemplazo))+1] ] 523 | Elreemplazo<-sort(c(unique(Elreemplazo),Elnuevoreemplazo )) 524 | } 525 | 526 | vectdeTicks2<-vectdeTicks[(vectdeTicks %in% seqdeTicks)] 527 | NvectdeTicks<-sort(c(vectdeTicks2,Elreemplazo)) 528 | allmelted<-allmelted[allmelted$variable %in% NvectdeTicks,] 529 | } 530 | 531 | if (!is.null(WhichTicks)) { 532 | NvectdeTicks<-sort(c(WhichTicks)) 533 | } 534 | 535 | 536 | allmelted2<-allmelted 537 | ################################################## 538 | Metrica<-unique(allmelted2$Metric) 539 | errate<-unique(allmelted2$ErrorRate) 540 | allmelted3<-allmeltedsas<-list() 541 | CItita<-CItota<-CI<-NULL 542 | CL<-c(90,95,99) 543 | Z<-c(1.64,1.96, 2.58) 544 | CItita<-CItota<-NULL 545 | 546 | if (length(Metrica)==1){ 547 | allmelted3<-allmelted2[allmelted2$Metric==Metrica,] 548 | for (zeta in 1:length(Z)) { 549 | namecito=paste("CI_",CL[zeta], sep="") 550 | for (al in 1:dim(allmelted3)[1]) { 551 | M=allmelted3$value[al] 552 | SM=sqrt( (allmelted3$StdDev[al])^2/as.numeric(as.character(allmelted3$variable[al]) ) ) 553 | lower<-round(M-(Z[zeta]* SM),digits=3) 554 | upper<-round(M+(Z[zeta]* SM),digits=3) 555 | CItita<-cbind(namecito,paste("[",lower," , ",upper,"]", sep="") ) 556 | CItota<-rbind(CItota,CItita) 557 | } 558 | allmelted3[namecito]<-CItota[,2] 559 | CItita<-CItota<-NULL 560 | } 561 | 562 | } else { 563 | for(me in 1:length(Metrica) ) { 564 | tab<-allmelted2[allmelted2$Metric==Metrica[me],] 565 | 566 | for(eler in 1:length(errate)) { 567 | namecito<-paste(Metrica[me],errate[eler],sep="_") 568 | tab2<-tab[tab$ErrorRate==errate[eler],] 569 | allmeltedsas<-list(tab2) 570 | allmelted3[namecito]<-allmeltedsas 571 | } 572 | } 573 | for (uu in 1:length(allmelted3)){ 574 | for (zeta in 1:length(Z)) { 575 | namecito=paste("CI_",CL[zeta], sep="") 576 | for (al in 1:dim(allmelted3[[uu]])[1]) { 577 | M=allmelted3[[uu]]$value[al] 578 | SM=sqrt( (allmelted3[[uu]]$StdDev[al])^2/as.numeric(as.character(allmelted3[[uu]]$variable[al]) ) ) 579 | lower<-round(M-(Z[zeta]* SM),digits=3) 580 | upper<-round(M+(Z[zeta]* SM),digits=3) 581 | CItita<-cbind(namecito,paste("[",lower," , ",upper,"]", sep="") ) 582 | CItota<-rbind(CItota,CItita) 583 | } 584 | 
allmelted3[[uu]][namecito]<-CItota[,2] 585 | CItita<-CItota<-NULL 586 | } 587 | } 588 | } 589 | 590 | allmelted3 591 | ####### 592 | 593 | if (is.null(winnerSas[[1]]) ) { 594 | result<-list(TestedTicks=NvectdeTicks,Omics=Omics,Minimums=Minimums, 595 | TablebyTick=allmelted3) 596 | } else { 597 | result<-list(TestedTicks=NvectdeTicks,Omics=Omics,Minimums=Minimums,CompWinner=CompWinner , 598 | TablebyTick=allmelted3) 599 | } 600 | return(result) 601 | } 602 | 603 | 604 | ########################################################### 605 | operator<-function (X,Y, XLasso,YLasso,FUN,LassoIterations,...) { 606 | args<-list(...) 607 | Comps<-args$Comps 608 | iter<-args$iter 609 | FUN (X,Y,XLasso,YLasso,LassoIterations,...) 610 | } 611 | 612 | ########################################################### 613 | PLSDA.MP<-function (X,Y,XLasso, YLasso, LassoIterations,...) { 614 | args<-list(...) 615 | Comps<-args$Comps 616 | iter<-args$iter 617 | 618 | if (length(X) >1){ 619 | summary(X) 620 | ### LassoSelection ### 621 | SelVarstita<-SelVars<-NULL 622 | for (lst in 1:length(X)){ 623 | X[[lst]]<-scale(X[[lst]],center=TRUE, scale = FALSE) 624 | SelVars<-LassoSelection (X=XLasso[[lst]],Y=as.factor(YLasso[2,]), IterationsforVarSelections=LassoIterations) 625 | X[[lst]]<-X[[lst]][,colnames(X[[lst]]) %in% SelVars$SelectedVariables] 626 | } 627 | ###################### 628 | res2<- block.plsda(X,as.vector(Y[2,]), ncomp=Comps) 629 | perf.plsda <- try (perf(res2, validation = valid, folds = 5, 630 | progressBar = FALSE, auc = TRUE, nrepeat = 1) ,silent=TRUE) 631 | ## 632 | #TableMeans<-table<-NULL 633 | Mins<-ComponentWinner<-NULL 634 | if (class(perf.plsda) == "try-error") { 635 | preMins<-matrix(1,nrow=6,ncol=1) 636 | colnames(preMins)<-iter;rownames(preMins)<-c("max.dist_ER","max.dist_BER","centroids.dist_ER","centroids.dist_BER","mahalanobis.dist_ER","mahalanobis.dist_BER") 637 | ComponentWinner2<-matrix(1,nrow=6,ncol=1) 638 | colnames(ComponentWinner2)<-iter;rownames(ComponentWinner2)<-c("max.dist_ER","max.dist_BER","centroids.dist_ER","centroids.dist_BER","mahalanobis.dist_ER","mahalanobis.dist_BER") 639 | } else{ 640 | preMeans<-perf.plsda$WeightedVote.error.rate 641 | 642 | TableMeans<-table<-NULL 643 | for (p in 1: length (preMeans)){ 644 | namecito2<-names(preMeans[p]) 645 | table<-preMeans[[p]][grep(pattern = "Overall",x = rownames(preMeans[[p]])),] 646 | tipodeER<-gsub("Overall.", "", rownames(table)) 647 | rownames(table)<-paste(namecito2,tipodeER, sep="_") 648 | TableMeans<-rbind(TableMeans,table) 649 | } 650 | 651 | preMins<-as.matrix(apply(TableMeans, 1, FUN=min)) 652 | preComponentWinner<-data.frame(ComponentWinner=colnames(TableMeans)[apply(TableMeans,1,which.min)]) 653 | ComponentWinner2<-t(data.frame(strsplit(as.vector(preComponentWinner[,1]),split=" "))[2,]) 654 | rownames(ComponentWinner2)<-rownames(preMins);colnames(ComponentWinner2)<-iter 655 | } 656 | Mins<-as.matrix(cbind(Mins,preMins)) 657 | ComponentWinner<-cbind(ComponentWinner,ComponentWinner2) 658 | ComponentWinner2<-ComponentWinner 659 | class(ComponentWinner2)<-"numeric" 660 | 661 | ################################################ 662 | } else { 663 | ### LassoSelection ### 664 | SelVars<-LassoSelection (X=XLasso[[1]],Y=as.factor(YLasso[2,]), IterationsforVarSelections=LassoIterations) 665 | X[[1]]<-X[[1]][,colnames(X[[1]]) %in% SelVars$SelectedVariables] 666 | ###################### 667 | CompsNEW<-min( (nrow(X[[1]])-1), Comps ) 668 | res2<- plsda(X[[1]],as.factor(Y[2,]), ncomp=CompsNEW) 669 | perf.plsda <- try (perf(res2, 
validation = valid, folds = 5, 670 | progressBar = FALSE, auc = TRUE, nrepeat = 1) ,silent=TRUE) 671 | 672 | if (class(perf.plsda)[[1]] == "try-error") { 673 | preMins<-matrix(1,nrow=6,ncol=1) 674 | colnames(preMins)<-iter;rownames(preMins)<-c("max.dist_ER","max.dist_BER","centroids.dist_ER","centroids.dist_BER","mahalanobis.dist_ER","mahalanobis.dist_BER") 675 | ComponentWinner2<-matrix(1,nrow=6,ncol=1) 676 | colnames(ComponentWinner2)<-iter;rownames(ComponentWinner2)<-c("max.dist_ER","max.dist_BER","centroids.dist_ER","centroids.dist_BER","mahalanobis.dist_ER","mahalanobis.dist_BER") 677 | } else { 678 | preMeans<-perf.plsda$error.rate 679 | 680 | TableMeans<-table<-NULL 681 | for (p in 1: length (preMeans)){ 682 | namecito2<-names(preMeans[p]) 683 | if (namecito2=="overall"){ 684 | namecito2<-"ER" 685 | } 686 | table<-t(preMeans[[p]]) 687 | rownames(table)<-paste(rownames(table),namecito2, sep="_") 688 | TableMeans<-rbind(TableMeans,table) 689 | } 690 | preMins<-as.matrix(apply(TableMeans, 1, FUN=min)) 691 | preComponentWinner<-data.frame(ComponentWinner=colnames(TableMeans)[apply(TableMeans,1,which.min)]) 692 | ComponentWinner2<-t(data.frame(strsplit(as.vector(preComponentWinner[,1]),split=" "))[2,]) 693 | rownames(ComponentWinner2)<-rownames(preMins);colnames(ComponentWinner2)<-iter 694 | } 695 | 696 | 697 | } # Closing separation between block.plsda and plsda 698 | 699 | res=list(preMins=preMins,ComponentWinner2=ComponentWinner2 ) 700 | return (res) 701 | } 702 | 703 | ########################################################### 704 | Random.Forest.MP<-function (X,Y,XLasso,YLasso, LassoIterations,...) { 705 | args<-list(...) 706 | Comps<-args$Comps 707 | iter<-args$iter 708 | Xchicaflat2Lasso<-do.call(cbind.data.frame, XLasso) 709 | Xchicaflat2Lasso<-scale(Xchicaflat2Lasso,center=TRUE, scale = FALSE) 710 | names(Xchicaflat2Lasso) <- make.names(names(Xchicaflat2Lasso)) 711 | YYLasso<-as.data.frame(YLasso[1,]);colnames(YYLasso)<-"Y" 712 | XchicaflatLasso<-merge(YYLasso,Xchicaflat2Lasso,by=0 );rownames(XchicaflatLasso)<-XchicaflatLasso[,1];XchicaflatLasso<-XchicaflatLasso[,-1] 713 | names(XchicaflatLasso) <- make.names(names(XchicaflatLasso)) 714 | rm(Xchicaflat2Lasso,YYLasso) 715 | 716 | Xchicaflat2<-do.call(cbind.data.frame, X); dim(Xchicaflat2) 717 | Xchicaflat2<-scale(Xchicaflat2,center=TRUE, scale = FALSE) 718 | names(Xchicaflat2) <- make.names(names(Xchicaflat2)) 719 | YY<-as.data.frame(Y[1,]);colnames(YY)<-"Y" 720 | Xchicaflat<-merge(YY,Xchicaflat2,by=0 );rownames(Xchicaflat)<-Xchicaflat[,1];Xchicaflat<-Xchicaflat[,-1] 721 | names(Xchicaflat) <- make.names(names(Xchicaflat)) 722 | rm(Xchicaflat2,YY) 723 | 724 | ### LassoSelection ### 725 | SelVars<-LassoSelection (X=XchicaflatLasso[,-1],Y=XchicaflatLasso[,1], IterationsforVarSelections=LassoIterations) 726 | SelVars$SelectedVariables<-c("Y",SelVars$SelectedVariables) 727 | Xchicaflat<-Xchicaflat[,colnames(Xchicaflat) %in% SelVars$SelectedVariables]#; dim(Xchicaflat) 728 | set.seed(1) 729 | model <- randomForest(Y ~., data=Xchicaflat) 730 | 731 | preMins<-model$err.rate[500,1] 732 | rm(model) 733 | return(preMins) 734 | } 735 | ########################################################### 736 | 737 | LassoSelection<- function (X,Y, IterationsforVarSelections=15, ...){ 738 | # Input: 739 | # X: This is the table of predictor variables where n is individuals and p is variables 740 | # Y: This is the vector of response variables 741 | # IterationsforVarSelections: Number of iterations of the variable selection step 742 | 
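# Illustrative call (hypothetical objects: `expr` is a samples-by-features matrix and
# `group` a factor with the class labels; both names are placeholders):
#   sel <- LassoSelection(X = expr, Y = group, IterationsforVarSelections = 15)
#   head(sel$SelectedVariables)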
##################
743 |   require ("glmnet")
744 | 
745 |   X<-as.matrix(X)
746 |   alpha_val=1
747 |   SelVars<- SelectedVariables<-NULL
748 |   SelecVariablesalliterations<-NULL
749 |   CoefficientsVarstita<-CoefficientsVars<-NULL
750 |   for (i in 1:IterationsforVarSelections) {
751 |     #print (paste ("LASSO Variable Selection iteration ",i, sep=" "))
752 | 
753 |     repeat {
754 |       cfit<-try(cv.glmnet(as.matrix(X),as.vector(Y),
755 |                           standardize=TRUE, family="multinomial",
756 |                           alpha = alpha_val, grouped = FALSE,
757 |                           type.measure = "mae"),silent=TRUE )
758 | 
759 |       if (class(cfit) == "try-error") {
760 |         X=rbind(X,X)
761 |         Y=c(Y,Y)
762 |       } else {
763 | 
764 |         for (ls in 1:length(coef(cfit, s = "lambda.min"))) {
765 |           CoefficientsVarstita<-as.matrix(coef(cfit, s = "lambda.min")[[ls]])
766 |           CoefficientsVarstita<-CoefficientsVarstita[-1,,drop=FALSE]
767 |           CoefficientsVars<-rbind(CoefficientsVars,CoefficientsVarstita)
768 |         }
769 |         SelVarsPositions<-which(CoefficientsVars != 0)
770 |         SelVarssmall<-CoefficientsVars[which(CoefficientsVars != 0),,drop=FALSE]
771 |         SelVarssmalllista<- unique(rownames(SelVarssmall))
772 |         SelVars<-c(SelVars,SelVarssmalllista)
773 |         SelVars<-unique(SelVars)
774 |         break
775 |       }
776 |     }
777 |     SelecVariablesalliterations<-c(SelecVariablesalliterations,SelVars)
778 |     SelectedVariables<-unique(c(SelecVariablesalliterations))
779 |   }
780 |   res<-list(SelectedVariables=SelectedVariables)
781 |   return(res)
782 | }
783 | ###########################################################
784 | ErrorRateplot<-function (x, ErrorRateType="BER",MetricsType="max.dist",DoF=NULL,Projection=FALSE,Spline=TRUE, TheoreticER=NULL,ConfInt=0.95) {
785 |   # Input:
786 |   # x: List of tables with the error rate results and the number of components per tick (typically the output of ClassificationErrorRate).
787 |   # ErrorRateType: Character indicating which error rate to plot. It can be the error rate "ER",
788 |   #                the balanced error rate "BER", or "Both".
789 |   # MetricsType: Character indicating the metric to be plotted. It can be the maximal
790 |   #              distance "max.dist", the distance to centroids "centroids.dist", the Mahalanobis
791 |   #              distance "mahalanobis.dist", or "All".
792 |   # DoF: Either NULL, so that the spline model uses ticks-1 degrees of freedom,
793 |   #      or a value to build the model with a different number of degrees of freedom. The chosen DoF affects the estimated number of samples required, so it should be set with care.
794 |   # Projection: Logical indicating whether the spline model should be projected down to
795 |   #             the minimum error rate Min_ER (TRUE) or not (FALSE).
796 |   # Spline: Logical indicating whether the spline model should be plotted (TRUE) or not (FALSE).
797 |   # TheoreticER: Numeric value of the target (minimum) ER used to calculate the adequate number of samples.
798 |   # ConfInt: Confidence level used to compute the margin of error and the confidence area of the spline model. It can be 0.90, 0.95 or 0.99.
799 | 
800 |   # Output:
801 |   # A line chart with standard deviation bars. If the plot represents a single error rate type and a single metric,
802 |   # the number of components that achieved the minimum error rate is also displayed.
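# Illustrative call, assuming `cer` holds the result of a ClassificationErrorRate()
# run (the object name is a placeholder): plot the balanced error rate for the
# maximal distance, with the spline fit and a 95% confidence area.
#   ErrorRateplot(cer, ErrorRateType = "BER", MetricsType = "max.dist",
#                 Spline = TRUE, Projection = FALSE, ConfInt = 0.95)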
803 | suppressWarnings({ 804 | Layers<-x$Omics 805 | rownames(Layers)<-NULL 806 | ListofMins<-x$Minimums 807 | ticks<-DoF 808 | 809 | if(is.null(ticks)){ 810 | DegreesOfFreedom<-length(ListofMins)-1 811 | } else { 812 | DegreesOfFreedom<-ticks-1 813 | } 814 | if (MetricsType=="All" || ErrorRateType=="Both"){ 815 | allmelted3<-do.call(rbind.data.frame, x$TablebyTick) 816 | if (MetricsType!="All" ){ 817 | allmelted3<-allmelted3[allmelted3$Metric==MetricsType,] 818 | } 819 | if (ErrorRateType!="Both" ){ 820 | allmelted3<-allmelted3[allmelted3$ErrorRate==ErrorRateType,] 821 | } 822 | } else{ 823 | selector<-paste(MetricsType,ErrorRateType,sep="_") 824 | } 825 | 826 | if (dim(ListofMins[[1]])[1]==1){ 827 | allmelted3<-x$TablebyTick 828 | 829 | } else{ 830 | allmelted3<-x$TablebyTick[[selector]] 831 | } 832 | 833 | if (ConfInt==0.90){ 834 | df<-allmelted3[,c(7:9)] 835 | df<-lapply (df, function (x) gsub("\\[|\\]","",x)) 836 | df2<-as.data.frame(strsplit(df$CI_90, split=",")) 837 | lowers<-as.numeric(as.character(unname(unlist(df2[1,])))) 838 | uppers<-as.numeric(as.character(unname(unlist(df2[2,])))) 839 | } 840 | if (ConfInt==0.95) { 841 | df<-allmelted3[,c(7:9)] 842 | df<-lapply (df, function (x) gsub("\\[|\\]","",x)) 843 | df2<-as.data.frame(strsplit(df$CI_95, split=",")) 844 | lowers<-as.numeric(as.character(unname(unlist(df2[1,])))) 845 | uppers<-as.numeric(as.character(unname(unlist(df2[2,])))) 846 | } 847 | if(ConfInt==0.99){ 848 | df<-allmelted3[,c(7:9)] 849 | df<-lapply (df, function (x) gsub("\\[|\\]","",x)) 850 | df2<-as.data.frame(strsplit(df$CI_99, split=",")) 851 | lowers<-as.numeric(as.character(unname(unlist(df2[1,])))) 852 | uppers<-as.numeric(as.character(unname(unlist(df2[2,])))) 853 | } 854 | 855 | allmelted3$lower<-lowers 856 | allmelted3$upper<-uppers 857 | 858 | allmeltedmodel<-allmelted3 859 | colnames(allmeltedmodel)[colnames(allmeltedmodel)=="SEM"] <- "StdDev" 860 | colnames(allmeltedmodel)[colnames(allmeltedmodel)=="Samples"] <- "variable" 861 | colnames(allmeltedmodel)[colnames(allmeltedmodel)=="ER_value"] <- "value" 862 | colnames(allmeltedmodel)[colnames(allmeltedmodel)=="ER_type"] <- "ErrorRate" 863 | colnames(allmelted3)[colnames(allmelted3)=="SEM"] <- "StdDev" 864 | colnames(allmelted3)[colnames(allmelted3)=="Samples"] <- "variable" 865 | colnames(allmelted3)[colnames(allmelted3)=="ER_value"] <- "value" 866 | colnames(allmelted3)[colnames(allmelted3)=="ER_type"] <- "ErrorRate" 867 | 868 | ro=dim(allmeltedmodel)[1] 869 | 870 | if (allmeltedmodel$value[ro-1]1) { 885 | p<-ggplot(allmelted3, aes(x=variable, y= value, group=Category)) + 886 | geom_line(aes(linetype=ErrorRate, color=Metric), size=1) + 887 | ylim(NA,1.1) + 888 | ylab("Classification Error Rate") + 889 | xlab("Number of Samples") + 890 | theme(axis.title=element_text(face="bold",size="14"), 891 | axis.text.x = element_text(face="bold", size=18), 892 | axis.text.y = element_text(face="bold", size=18), 893 | plot.title = element_text(size = "16", face = "bold") 894 | ) + 895 | ggtitle(paste(length(unique(allmelted2$variable))-1, "segments", sep= " ")) + 896 | scale_linetype_manual(values=c("solid", "twodash"))+ 897 | geom_errorbar(aes(ymin = value - StdDev, 898 | ymax = value + StdDev, color=Metric)) 899 | legend <- g_legend(p) 900 | grid.newpage() 901 | vp1 <- viewport(width = 0.75, height = 1, x = 0.375, y = .5) 902 | vpleg <- viewport(width = 0.25, height = 0.5, x = 0.85, y = 0.75) 903 | subvp <- viewport(width = 0.3, height = 0.3, x = 0.85, y = 0.25) 904 | print(p + theme(legend.position = "none"), vp 
= vp1) 905 | upViewport(0) 906 | pushViewport(vpleg) 907 | grid.draw(legend) 908 | upViewport(0) 909 | pushViewport(subvp) 910 | my_table <- tableGrob(Layers) 911 | grid.draw(my_table) 912 | } else { 913 | if (Projection==TRUE & Spline==FALSE){ 914 | Spline=TRUE 915 | print("Since Projection==TRUE, the Spline will be plotted") 916 | } 917 | if (Projection==FALSE & Spline==FALSE){ 918 | p<-ggplot() + 919 | geom_line(aes(x=as.numeric(as.character(variable)), y=value, group=Category, 920 | linetype=ErrorRate, color=Metric), allmelted3, size=1) + 921 | geom_errorbar(aes(x=as.numeric(as.character(variable)), group=Category, 922 | ymin = value - StdDev, ymax = value + StdDev, color=Metric), allmelted3) + 923 | ylim(NA,1.1) + 924 | ylab("Classification Error Rate") + 925 | xlab("Number of Samples") + 926 | theme(axis.title=element_text(face="bold",size="14"), 927 | axis.text.x = element_text(face="bold", size=18), 928 | axis.text.y = element_text(face="bold", size=18), 929 | plot.title = element_text(size = "16", face = "bold") 930 | ) + 931 | ggtitle(paste(length(unique(allmelted3$variable))-1, "segments", sep= " ")) + 932 | scale_linetype_manual(values=c("solid", "twodash")) 933 | 934 | legend <- g_legend(p) 935 | grid.newpage() 936 | vp1 <- viewport(width = 0.75, height = 1, x = 0.375, y = .5) 937 | vpleg <- viewport(width = 0.25, height = 0.5, x = 0.85, y = 0.75) 938 | subvp <- viewport(width = 0.3, height = 0.3, x = 0.85, y = 0.25) 939 | print(p + theme(legend.position = "none"), vp = vp1) 940 | upViewport(0) 941 | pushViewport(vpleg) 942 | grid.draw(legend) 943 | upViewport(0) 944 | pushViewport(subvp) 945 | my_table <- tableGrob(Layers) 946 | grid.draw(my_table) 947 | } 948 | if (Projection==FALSE & Spline==TRUE){ 949 | MinimumError=min(allmelted3$value) 950 | fm1 <- lm(value ~ bs(as.numeric(as.character(variable)), degree = 1,df = DegreesOfFreedom), data = allmeltedmodel) 951 | DESVESTprediction<-lm(StdDev ~ bs(as.numeric(as.character(variable)), degree = 1,df = DegreesOfFreedom), data = allmeltedmodel) 952 | 953 | ht01 <- seq(min(as.numeric(as.character(allmeltedmodel$variable))),2000, length.out = 2000) 954 | prediction<-as.data.frame(cbind(predict(fm1, data.frame(variable = ht01)),predict(DESVESTprediction, data.frame(variable = ht01)) ) );colnames(prediction)<-c("Prediction","STDpred") 955 | ht01Table<-cbind(ht01,as.data.frame(prediction));colnames(ht01Table)<-c("ht","ErrorPred", "STDpred") 956 | 957 | ht <- seq(min(as.numeric(as.character(allmeltedmodel$variable))),max(as.numeric(as.character(allmeltedmodel$variable))), length.out = 200) 958 | htTable<-cbind(ht,as.data.frame(predict(fm1, data.frame(variable = ht))));colnames(htTable)<-c("ht","ErrorPred") 959 | 960 | NSampleMaxTable<-as.data.frame(ht01Table[ht01Table[,2]>=MinimumError,]) 961 | NSampleMax<-1+round(NSampleMaxTable$ht[dim(NSampleMaxTable)[1]]) 962 | NSampleMaxPosition<-as.numeric(rownames(NSampleMaxTable)[dim(NSampleMaxTable)[1]]) 963 | 964 | NSampleMaxTable$EplusStd<-NSampleMaxTable$ErrorPred - NSampleMaxTable$STDpred 965 | Theplusminustable<-as.data.frame(NSampleMaxTable[NSampleMaxTable[,4]>=MinimumError,]) 966 | Theplusminus<-round(Theplusminustable$ht[dim(Theplusminustable)[1]]) 967 | TheplusminusPosition<-as.numeric(rownames(Theplusminustable)[dim(Theplusminustable)[1]]) 968 | 969 | SamplesEvaluated<-as.numeric(as.character(allmelted3$variable[dim(allmelted3)[1]])) 970 | SamplesRequired<-round(NSampleMaxTable$ht[NSampleMaxPosition]) 971 | MOE<-paste("±",round(NSampleMax-Theplusminus)) 972 | 973 | p<-ggplot() + 974 | 
geom_line(aes(x=as.numeric(as.character(variable)), y=value, group=Category, 975 | linetype=ErrorRate, color=Metric), allmelted3, size=1) + 976 | geom_errorbar(aes(x=as.numeric(as.character(variable)), group=Category, 977 | ymin = lower, ymax = upper, color=Metric), allmelted3) + 978 | geom_smooth(method="loess", span=0.2, aes(x=ht, y=ErrorPred,color="SPline"), data=NSampleMaxTable, size=1.3 ) + 979 | ylim(NA,1.1) + 980 | ylab("Classification Error Rate") + 981 | xlab("Number of Samples") + 982 | theme(axis.title=element_text(face="bold",size="14"), 983 | axis.text.x = element_text(face="bold", size=18), 984 | axis.text.y = element_text(face="bold", size=18), 985 | plot.title = element_text(size = "16", face = "bold") 986 | ) + 987 | ggtitle(paste(length(unique(allmelted3$variable))-1, "segments", sep= " ")) + 988 | scale_linetype_manual(values=c("solid", "twodash")) 989 | 990 | legend <- g_legend(p) 991 | grid.newpage() 992 | vp1 <- viewport(width = 0.75, height = 1, x = 0.375, y = .5) 993 | vpleg <- viewport(width = 0.25, height = 0.5, x = 0.85, y = 0.75) 994 | subvp <- viewport(width = 0.3, height = 0.3, x = 0.85, y = 0.5) 995 | subvp2 <- viewport(width = 0.3, height = 0.3, x = 0.85, y = 0.25) 996 | print(p + theme(legend.position = "none"), vp = vp1) 997 | upViewport(0) 998 | pushViewport(vpleg) 999 | grid.draw(legend) 1000 | upViewport(0) 1001 | pushViewport(subvp) 1002 | rownames(Layers)<-NULL 1003 | my_table <- tableGrob(Layers) 1004 | grid.draw(my_table) 1005 | 1006 | 1007 | TERtable<-rbind(ERtarget=round(MinimumError,digits = 3), 1008 | PSS=SamplesRequired, 1009 | MOE= MOE) 1010 | TERtableGrob <- tableGrob(TERtable) 1011 | upViewport(0) 1012 | pushViewport(subvp2) 1013 | grid.draw(TERtableGrob) 1014 | } 1015 | 1016 | if (Projection==TRUE & Spline==TRUE){ 1017 | fm1 <- lm(value ~ bs(as.numeric(as.character(variable)), degree = 1,df = DegreesOfFreedom), data = allmeltedmodel) 1018 | DESVESTprediction<-lm(StdDev ~ bs(as.numeric(as.character(variable)), degree = 1,df = DegreesOfFreedom), data = allmeltedmodel) 1019 | 1020 | ht01 <- seq(min(as.numeric(as.character(allmeltedmodel$variable))),2000, length.out = 2000) 1021 | prediction<-as.data.frame(cbind(predict(fm1, data.frame(variable = ht01)),predict(DESVESTprediction, data.frame(variable = ht01)) ) );colnames(prediction)<-c("Prediction","STDpred") 1022 | ht01Table<-cbind(ht01,as.data.frame(prediction));colnames(ht01Table)<-c("ht","ErrorPred", "STDpred") 1023 | 1024 | ht <- seq(min(as.numeric(as.character(allmeltedmodel$variable))),max(as.numeric(as.character(allmeltedmodel$variable))), length.out = 200) 1025 | htTable<-cbind(ht,as.data.frame(predict(fm1, data.frame(variable = ht))));colnames(htTable)<-c("ht","ErrorPred") 1026 | 1027 | NSampleMaxTable<-as.data.frame(ht01Table[ht01Table[,2]>=MinimumError,]) 1028 | rownames(NSampleMaxTable)<-seq(1:dim(NSampleMaxTable)[1]) 1029 | NSampleMax<-1+round(NSampleMaxTable$ht[dim(NSampleMaxTable)[1]]) 1030 | NSampleMaxPosition<-as.numeric(rownames(NSampleMaxTable)[dim(NSampleMaxTable)[1]]) 1031 | 1032 | NSampleMaxTable$EplusStd<-NSampleMaxTable$ErrorPred - NSampleMaxTable$STDpred 1033 | Theplusminustable<-as.data.frame(NSampleMaxTable[NSampleMaxTable[,4]>=MinimumError,]) 1034 | Theplusminus<-round(Theplusminustable$ht[dim(Theplusminustable)[1]]) 1035 | TheplusminusPosition<-as.numeric(rownames(Theplusminustable)[dim(Theplusminustable)[1]]) 1036 | 1037 | SamplesEvaluated<-as.numeric(as.character(allmelted3$variable[dim(allmelted3)[1]])) 1038 | 
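# PSS below is the largest sample size on the prediction grid whose spline-predicted
# error rate is still at or above the target (minimum) ER, i.e. the point at which the
# fitted curve reaches the target; the margin of error (MOE) is the distance, in samples,
# between that point and the point where the prediction minus its predicted SD crosses
# the same target.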
SamplesRequired<-round(NSampleMaxTable$ht[NSampleMaxPosition]) 1039 | MOE<-paste("±",round(NSampleMax-Theplusminus)) 1040 | 1041 | p<-ggplot() + 1042 | geom_line(aes(x=as.numeric(as.character(variable)), y=value, group=Category, 1043 | linetype=ErrorRate, color=Metric), allmelted3, size=1) + 1044 | geom_errorbar(aes(x=as.numeric(as.character(variable)), group=Category, 1045 | ymin = lower, ymax = upper, color=Metric), allmelted3) + 1046 | 1047 | 1048 | geom_smooth(method="loess", span=0.2, aes(x=ht, y=ErrorPred,color="SPline"), data=NSampleMaxTable, size=1.3 ) + 1049 | ylim(NA,1.1) + 1050 | ylab("Classification Error Rate") + 1051 | xlab("Number of Samples") + 1052 | theme(axis.title=element_text(face="bold",size="14"), 1053 | axis.text.x = element_text(face="bold", size=18), 1054 | axis.text.y = element_text(face="bold", size=18), 1055 | plot.title = element_text(size = "16", face = "bold") 1056 | ) + 1057 | ggtitle(paste(length(unique(allmelted3$variable))-1, "segments", sep= " ")) + 1058 | scale_linetype_manual(values=c("solid", "twodash")) 1059 | 1060 | legend <- g_legend(p) 1061 | grid.newpage() 1062 | vp1 <- viewport(width = 0.75, height = 1, x = 0.375, y = .5) 1063 | vpleg <- viewport(width = 0.25, height = 0.5, x = 0.85, y = 0.75) 1064 | subvp <- viewport(width = 0.3, height = 0.3, x = 0.85, y = 0.5) 1065 | subvp2 <- viewport(width = 0.3, height = 0.3, x = 0.85, y = 0.25) 1066 | print(p + theme(legend.position = "none"), vp = vp1) 1067 | upViewport(0) 1068 | pushViewport(vpleg) 1069 | grid.draw(legend) 1070 | upViewport(0) 1071 | pushViewport(subvp) 1072 | rownames(Layers)<-NULL 1073 | my_table <- tableGrob(Layers) 1074 | grid.draw(my_table) 1075 | 1076 | TERtable<-rbind(ERtarget=round(MinimumError,digits = 3), 1077 | PSS=SamplesRequired, 1078 | MOE= MOE) 1079 | TERtableGrob <- tableGrob(TERtable) 1080 | upViewport(0) 1081 | pushViewport(subvp2) 1082 | grid.draw(TERtableGrob) 1083 | 1084 | } 1085 | res<-TERtable 1086 | return(res) 1087 | } 1088 | }) 1089 | } 1090 | 1091 | ########################################################### 1092 | Comparative_ERPlot<-function (L, ErrorRateType = "ER", MetricsType ="mahalanobis.dist"){ 1093 | # Input: 1094 | # L: list of tables obtained from "UntilAllmelted" analysis 1095 | 1096 | # Output: 1097 | # A plot of number of samples vs Classification Error Rate in order to compare and evaluate different analyzes 1098 | 1099 | LosRowNames<-Ltransp1<-Ltransp2<-Ltransp3<-Ltransp4<-NULL 1100 | Ltransp5<-Ltransp<-list() 1101 | 1102 | if (length(unique(L[[1]]$TablebyTick$Category))==1) { 1103 | LosRowNames<-Ltransp1<-Ltransp2<-Ltransp3<-Ltransp4<-NULL 1104 | Ltransp5<-Ltransp<-list() 1105 | for (ls in 1:length(L)){ 1106 | Namecito<-names(L[ls]) 1107 | Ltransp1<-data.frame(L[[ls]]$TablebyTick) 1108 | Ltransp1$Omics<-rep(Namecito, dim(Ltransp1)[1]) 1109 | Ltransp[[Namecito]]<-Ltransp1 1110 | } 1111 | 1112 | TableAll <- do.call(rbind,Ltransp) 1113 | rownames(TableAll)<-seq(1:dim(TableAll)[1]) 1114 | 1115 | 1116 | Metrica<-t(data.frame(cbind(unique(TableAll$Metric),unique(TableAll$ErrorRate)))) 1117 | rownames(Metrica)<-NULL 1118 | p<-ggplot() + 1119 | geom_line(aes(x=as.numeric(as.character(variable)), y=value, group=Omics,colour = Omics), data = TableAll, size=1) + 1120 | geom_errorbar(aes(x=as.numeric(as.character(variable)),group=Omics , 1121 | ymin = value - StdDev, ymax = value + StdDev, color=Omics), TableAll) + 1122 | ylab("Classification Error Rate") + 1123 | xlab("Number of Samples") + 1124 | theme(axis.title=element_text(face="bold",size="14"), 
1125 | axis.text.x = element_text(face="bold", size=18), 1126 | axis.text.y = element_text(face="bold", size=18), 1127 | plot.title = element_text(size = "16", face = "bold") 1128 | ) + 1129 | ggtitle(paste("Comparative Classification Plot ")) 1130 | legend <- g_legend(p) 1131 | grid.newpage() 1132 | vp1 <- viewport(width = 0.75, height = 1, x = 0.375, y = .5) 1133 | vpleg <- viewport(width = 0.25, height = 0.5, x = 0.85, y = 0.75) 1134 | subvp <- viewport(width = 0.3, height = 0.3, x = 0.85, y = 0.25) 1135 | print(p + theme(legend.position = "none"), vp = vp1) 1136 | upViewport(0) 1137 | pushViewport(vpleg) 1138 | grid.draw(legend) 1139 | upViewport(0) 1140 | pushViewport(subvp) 1141 | my_table <- tableGrob(Metrica) 1142 | grid.draw(my_table) 1143 | } else { 1144 | LosRowNames<-Ltransp1<-Ltransp2<-Ltransp3<-Ltransp4<-NULL 1145 | Ltransp5<-Ltransp<-list() 1146 | selector<-paste(MetricsType,ErrorRateType, sep="_") 1147 | loscolnames<-colnames(L[[1]]$TablebyTick[[1]]) 1148 | for (ls in 1:length(L)){ 1149 | Namecito<-names(L[ls]) 1150 | Ltransp1<-data.frame(L[[ls]]$TablebyTick [ names(L[[ls]]$TablebyTick)==selector ]) 1151 | colnames(Ltransp1)<-loscolnames 1152 | Ltransp1$Omics<-rep(Namecito, dim(Ltransp1)[1]) 1153 | Ltransp[[Namecito]]<-Ltransp1 1154 | } 1155 | 1156 | TableAll <- do.call(rbind,Ltransp) 1157 | rownames(TableAll)<-seq(1:dim(TableAll)[1]) 1158 | 1159 | Metrica<-t(data.frame(cbind(unique(TableAll$Metric),unique(TableAll$ErrorRate)))) 1160 | rownames(Metrica)<-NULL 1161 | p<-ggplot() + 1162 | geom_line(aes(x=as.numeric(as.character(variable)), y=value, group=Omics,colour = Omics), data = TableAll, size=1) + 1163 | geom_errorbar(aes(x=as.numeric(as.character(variable)),group=Omics , 1164 | ymin = value - StdDev, ymax = value + StdDev, color=Omics), TableAll) + 1165 | ylab("Classification Error Rate") + 1166 | xlab("Number of Samples") + 1167 | theme(axis.title=element_text(face="bold",size="14"), 1168 | axis.text.x = element_text(face="bold", size=18), 1169 | axis.text.y = element_text(face="bold", size=18), 1170 | plot.title = element_text(size = "16", face = "bold") 1171 | ) + 1172 | ggtitle(paste("Comparative Classification Plot ")) 1173 | 1174 | legend <- g_legend(p) 1175 | grid.newpage() 1176 | vp1 <- viewport(width = 0.75, height = 1, x = 0.375, y = .5) 1177 | vpleg <- viewport(width = 0.25, height = 0.5, x = 0.85, y = 0.75) 1178 | subvp <- viewport(width = 0.3, height = 0.3, x = 0.85, y = 0.25) 1179 | print(p + theme(legend.position = "none"), vp = vp1) 1180 | upViewport(0) 1181 | pushViewport(vpleg) 1182 | grid.draw(legend) 1183 | upViewport(0) 1184 | pushViewport(subvp) 1185 | my_table <- tableGrob(Metrica) 1186 | grid.draw(my_table) 1187 | } 1188 | } 1189 | 1190 | ########################################################### 1191 | RequiredtimeTest<-function (Predictors, Response, Comps = 10, Function,crosval = "LOOCV",Ticks = 20,ERIterations = 20,LassoIterations=50,cpus_per_node=1, ...) { 1192 | # Function to calculate the time required for a particular MultiML analysis. 1193 | # Input: 1194 | # Predictors: A list of the different Omics Datasets and the Response matrix. 1195 | # Response: A number indicating the position of the response matrix included in "Predictors" object. 1196 | # Comps: Number of componets to be calculated after each iteration. Just applicable to PLSDA approach. 1197 | # Function: Modular function used to calculate the error rate. It can be PLSDA.MP or Random.Forest.MP to indicate the approach to be used. 
# crosval: Type of cross-validation to be applied: leave-one-out (LOOCV) or ten-fold (TenF).
1199 |   # Ticks: Number of segments (groups) of samples to evaluate.
1200 |   # ERIterations: Number of iterations in which the error rate (ER) will be calculated.
1201 |   # LassoIterations: Number of iterations of the Lasso selection per error rate analysis.
1202 |   # cpus_per_node: Number of CPUs that will be used in the analysis.
1203 |   # ...: Arguments to be passed to methods.
1204 | 
1205 |   # Output:
1206 |   # Summary: A table with the conditions of the analysis, as set by the user.
1207 |   # EstimatedTime: A table with the estimated duration of the process, expressed in different units (seconds, minutes, hours, or days).
1208 | 
1209 |   components=Comps
1210 |   #Mins=NULL
1211 |   print("Calculating the estimated time required for the classification analysis")
1212 |   Omics<-data.frame(Omics=names(Predictors[-Response]))
1213 |   Omics2<-paste(Omics$Omics,collapse="_")
1214 |   TestSumm<-data.frame(Omics=Omics2,CrossValidation=crosval,Ticks = Ticks,ERIterations=ERIterations,LassoIterations=LassoIterations,CPUs=cpus_per_node)
1215 |   rep<-length(Predictors)-1
1216 |   NewList<-Predictors
1217 |   NewList<-NewList[-Response]
1218 |   if (length(NewList)>1){
1219 |     tabe<-data.frame(sapply(NewList,dim))
1220 |     wti<-as.numeric(tabe[2,][apply(tabe[2,],1,which.min)])
1221 | 
1222 |   } else {
1223 |     tabe<-data.frame(sapply(NewList,dim))
1224 |     wti<-as.numeric(tabe[2,])
1225 |   }
1226 |   TPLSDA<-t(data.frame(summary(system.time(capture.output(ProtsPLSDA<-ClassificationErrorRate(Predictors=Predictors,Response=Response,Comps = components,
1227 |                                       Function=Function,
1228 |                                       crosval = crosval,Ticks = NULL,WhichTicks = wti,ERIterations = 1,LassoIterations = 1) )))))
1229 |   TEval<-(2 + TPLSDA[,3]*ERIterations*LassoIterations*Ticks*rep)/cpus_per_node
1230 | 
1231 |   template<-c("seconds","minutes","hours","days")
1232 |   Result<-c(round(TEval, digits=3),round(TEval/60, digits=3),round(TEval/3600, digits=3),
1233 |             round(TEval/86400, digits=3))
1234 |   Result2<-as.data.frame(cbind(Result,template))
1235 | 
1236 |   res<-list(Summary=TestSumm,EstimatedTime=Result2)
1237 | 
1238 |   return(res)
1239 | 
1240 | }
1241 | 
1242 | ###########################################################
1243 | ER_Calculator<-function(Predictors, Response,Previous_CER=NULL,Ticks=5,WhichTicks=NULL,Function=Random.Forest.MP,Comps=10,crosval = "LOOCV",ERIterations=2,LassoIterations=2,TheoreticER=NULL,ErrorRateType="ER") {
1244 | 
1245 |   # Function to evaluate, through the classification error rate, the predictive capability of the MultiPower package.
1246 |   # It wraps "ClassificationErrorRate" and combines its functionality with that of "ER_Adder".
1247 |   # Input:
1248 |   # Predictors: A list of the different omics datasets and the response matrix.
1249 |   # Response: A number indicating the position of the response matrix within the "Predictors" object.
1250 |   # Previous_CER: A previous result of class "ClassificationErrorRate" (CER) to be merged with the one that is going to be calculated.
1251 |   # Ticks: Number of segments (groups) of samples to evaluate. If NULL, the calculation is performed automatically, taking TheoreticER into account.
1252 |   # WhichTicks: Vector of sample sizes (ticks) the user wants to analyze. If NULL, a random selection between the minimum and maximum number of samples is made. This is useful in later rounds of analysis.
1253 |   # Function: Modular function used to calculate the error rate.
It can be PLSDA.MP or Random.Forest.MP to indicate the approach to be used. 1254 | # Comps: Number of componets to be calculated after each iteration. Just applicable to PLSDA approach. 1255 | # crosval: Type of cross validation to be applied, Leave-One-Out (LOOCV) or ten fold change (TenF) 1256 | # ERIterations: Number of iterations in which the error rate (ER) will be calculated. 1257 | # LassoIterations: Number of iterations of the Lasso selection per each error rate analysis. 1258 | 1259 | # Output: 1260 | # TestedTicks: A vector of the number of evaluated number of samples 1261 | # Omics: A vector of the evaluated Omics 1262 | # A list of two lists: 1263 | # Minimums: A list of the minimum value of error rate, balanced (BER) or not (ER) obtained per each ten 1264 | # component analysis. This is measured through three distance metrics to evaluate the 1265 | # classification performance of the model. Maximal distance (max.dist), distance to 1266 | # centroids (centroids) or Mahalanobis distance (Mahalanobis) 1267 | # Thus, each table contains the results per each iteration at different subsets of samples 1268 | 1269 | # CompWinner: A list of the number of components in which the minimum value of error rate, 1270 | # balanced (BER) or not (ER) was obtainde per each iteration. This is measured through 1271 | # the three mentioned distance metrics to evaluate the classification performance 1272 | # of the model. Thus, each table contains the components per each iteration at different 1273 | # subsets of samples 1274 | suppressWarnings({ 1275 | 1276 | if (! is.list(Predictors) ) { 1277 | stop("\\nOmics dataset must be a list with at least two elements") 1278 | } 1279 | ########### 1280 | # Y 1281 | Y<-t(as.matrix(Predictors[[Response]], drop=FALSE)) 1282 | if (is.character(Y[,1])) { 1283 | Ycita<-transform(Y, Type = as.numeric(Type)) 1284 | Y<-Ycita 1285 | rm(Ycita) 1286 | } 1287 | if (ncol(Y) != 1) { 1288 | stop("\\nResponse must be a single variable") 1289 | } 1290 | if (any(is.na(Y))) { 1291 | stop("\\nResponse must not contain missing values") 1292 | } 1293 | if (is.null(colnames(Y))) { 1294 | colnames(Y) = "Y" 1295 | } 1296 | if (is.null(rownames(Y))) { 1297 | rownames(Y) = 1:n 1298 | } 1299 | # Step1: Match the sample size 1300 | LosIndivs<- rownames(Y) 1301 | for (i in 1:length(Predictors)){ 1302 | LosIndivs = intersect(LosIndivs, colnames(Predictors[[i]])) 1303 | } 1304 | 1305 | print(paste("This analysis will be performed with",length(LosIndivs),"samples, since those are the ones repeated in all layers")) 1306 | 1307 | NewList<-Predictors 1308 | 1309 | for (i in 1:length(NewList)) { 1310 | NewList[[i]]<-NewList[[i]][,colnames(NewList[[i]]) %in% LosIndivs] 1311 | } 1312 | 1313 | ########### 1314 | # Step2: Match the order 1315 | LosColnames<-colnames(NewList[[1]]) 1316 | 1317 | for (i in 1:length(NewList)) { 1318 | NewList[[i]]<-NewList[[i]][,sort(LosColnames) ] 1319 | } 1320 | 1321 | TestdeMatchTable<-unique(matrix(permutations(length(NewList)),ncol=2)) 1322 | 1323 | for (i in 1:nrow(TestdeMatchTable)) { 1324 | a=TestdeMatchTable[i,1] 1325 | b=TestdeMatchTable[i,2] 1326 | 1327 | if (all(colnames(NewList[[a]])==colnames(NewList[[b]]))){ 1328 | #print(paste("Columns of lists",a,"and",b,"are equally sorted")) 1329 | } else{ 1330 | #print("The colnames are not equally sorted") 1331 | } 1332 | } 1333 | if (crosval=="LOOCV"){ 1334 | valid="loo" 1335 | } 1336 | if (crosval=="TenF"){ 1337 | valid="Mfold" 1338 | } 1339 | 1340 | # Y 1341 | Y<-t(as.matrix(NewList[[Response]], drop=FALSE)) 
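# A character response is re-coded to numeric below; the grid of sample sizes ("ticks")
# then runs from MinN = 2 x the number of response classes up to MaxN = the total number
# of samples shared by all omics layers.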
1342 | if (is.character(Y[,1])) { 1343 | Ycita<-transform(Y, Type = as.numeric(Type)) 1344 | Y<-Ycita 1345 | rm(Ycita) 1346 | } 1347 | 1348 | ######### 1349 | Omics<-data.frame(Omics=names(NewList[-Response])) 1350 | 1351 | MinN<-round(length(table(as.factor(Y[,1]))) * 2, digits = 0) 1352 | MaxN<-nrow(Y) 1353 | Ngroups<-length(table(as.factor(Y[,1]))) 1354 | 1355 | if(!is.null(Ticks)) { 1356 | vectTicks<-round(seq(from=MinN,to = MaxN,length.out = Ticks),digits = 0) 1357 | setdeTicks = 1 1358 | DegreesOfFreedom<-Ticks-1 1359 | } else { 1360 | if (is.null(WhichTicks)) { 1361 | setdeTicks<-seq(from=5,to = MaxN,by=2) 1362 | } else { 1363 | vectTicks<-WhichTicks 1364 | setdeTicks<-1 1365 | } 1366 | } 1367 | if (length(setdeTicks)==1) { 1368 | tab<- ClassificationErrorRate (Predictors=Predictors, Response=length(Predictors), 1369 | Comps=Comps,crosval = crosval,ERIterations=ERIterations, 1370 | LassoIterations=LassoIterations,Ticks=NULL, 1371 | WhichTicks=vectTicks,Function = Function) 1372 | 1373 | if (is.null(Previous_CER)){ 1374 | 1375 | Previous_CER<-tab 1376 | allmeltedmodel<-Previous_CER$TablebyTick 1377 | 1378 | } else{ 1379 | ttdticks<-sort(c(tab$TestedTicks,Previous_CER$TestedTicks)) 1380 | 1381 | if (length(tab$TablebyTick)==6) { 1382 | tbyticks2<-NULL 1383 | tbyticks<-list() 1384 | for (tod in 1:length(tab$TablebyTick)) { 1385 | namecito<-names(tab$TablebyTick[tod]) 1386 | tbyticks2<-rbind(tab$TablebyTick[[tod]],Previous_CER$TablebyTick[[tod]]) 1387 | tbyticks2<-tbyticks2[order(as.numeric(as.character(tbyticks2$variable))),] 1388 | rownames(tbyticks2)<-seq(1:dim(tbyticks2)[1]) 1389 | tbyticks[namecito]<-list(tbyticks2) 1390 | } 1391 | orden<-as.numeric(as.character(tbyticks[[1]]$variable)) 1392 | orden2<-paste(orden,"samples", sep=" ") 1393 | orden2 1394 | } else { 1395 | tbyticks<-rbind(tab$TablebyTick,Previous_CER$TablebyTick) 1396 | tbyticks<-tbyticks[order(as.numeric(as.character(tbyticks$variable))),] 1397 | rownames(tbyticks)<-seq(1:dim(tbyticks)[1]) 1398 | 1399 | orden<-as.numeric(as.character(tbyticks$variable)) 1400 | orden2<-paste(orden,"samples", sep=" ") 1401 | 1402 | } 1403 | ###### 1404 | om<-as.data.frame(tab$Omics, drop=F) 1405 | Omics<-om 1406 | ###### 1407 | lstdeMins<-append(tab$Minimums,Previous_CER$Minimums) 1408 | lstdeMins<-lstdeMins[match(orden2,names(lstdeMins))] 1409 | 1410 | 1411 | if (!is.null(tab$CompWinner)) { 1412 | lstdewinners<-append(tab$Minimums,Previous_CER$Minimums) 1413 | lstdewinners<-lstdewinners[match(orden2,names(lstdewinners))] 1414 | } else {} 1415 | 1416 | if (is.null(tab$CompWinner)) { 1417 | Previous_CER<-list(TestedTicks=ttdticks,Omics=Omics,Minimums=lstdeMins,TablebyTick=tbyticks) 1418 | } else { 1419 | Previous_CER<-list(TestedTicks=ttdticks,Omics=Omics,Minimums=lstdeMins, 1420 | CompWinner=lstdewinners,TablebyTick=tbyticks) 1421 | } 1422 | 1423 | allmeltedmodel<-Previous_CER$TablebyTick 1424 | 1425 | } 1426 | 1427 | ###### 1428 | } else { 1429 | Rankk<-ranksas<-list() 1430 | for (sdt in 1:length(setdeTicks)){ 1431 | numTicks<-setdeTicks[sdt] 1432 | vectTicks<-round(seq(from=MinN,to = MaxN,length.out = numTicks),digits = 0) 1433 | howmany<-length(vectTicks) 1434 | 1435 | if (is.null(Previous_CER) ){ 1436 | already=0 1437 | } else { 1438 | already<-length(Previous_CER$TestedTicks) 1439 | howmany<-already+2 1440 | vectTicks<-round(seq(from=MinN,to = MaxN,length.out = howmany),digits = 0) 1441 | vectTicks<-c(setdiff(as.vector(vectTicks),as.vector(Previous_CER$TestedTicks) )) 1442 | vectTicks<-sort(sample(vectTicks,size = 
(howmany-already), replace = FALSE )) 1443 | } 1444 | ###### 1445 | tab<- ClassificationErrorRate (Predictors=Predictors, Response=length(Predictors), 1446 | Comps=Comps,crosval = crosval,ERIterations=ERIterations, 1447 | LassoIterations=LassoIterations,Ticks=NULL, 1448 | WhichTicks=vectTicks,Function = Function) 1449 | 1450 | if (is.data.frame(tab$TablebyTick)){ 1451 | colnames(tab$TablebyTick)[colnames(tab$TablebyTick)=="StdDev"] <- "SEM" 1452 | colnames(tab$TablebyTick)[colnames(tab$TablebyTick)=="variable"] <- "Samples" 1453 | colnames(tab$TablebyTick)[colnames(tab$TablebyTick)=="value"] <- "ER_value" 1454 | colnames(tab$TablebyTick)[colnames(tab$TablebyTick)=="ErrorRate"] <- "ER_type" 1455 | } else {} 1456 | 1457 | if (is.null(Previous_CER)){ 1458 | 1459 | Previous_CER<-tab 1460 | allmeltedmodel<-Previous_CER$TablebyTick 1461 | 1462 | } else{ 1463 | ttdticks<-sort(c(tab$TestedTicks,Previous_CER$TestedTicks)) 1464 | 1465 | if (!is.data.frame(tab$TablebyTick)) { 1466 | tbyticks2<-NULL 1467 | tbyticks<-list() 1468 | colnames <- c("Category","Samples","ER_value","SEM","Comp","ER_type","Metric","CI_90","CI_95","CI_99") 1469 | for (i in seq_along(tab$TablebyTick)){ 1470 | colnames(tab$TablebyTick[[i]]) <- colnames 1471 | } 1472 | for (tod in 1:length(tab$TablebyTick)) { 1473 | namecito<-names(tab$TablebyTick[tod]) 1474 | tbyticks2<-rbind(tab$TablebyTick[[tod]],Previous_CER$TablebyTick[[tod]]) 1475 | tbyticks2<-tbyticks2[order(as.numeric(as.character(tbyticks2$Samples))),] 1476 | rownames(tbyticks2)<-seq(1:dim(tbyticks2)[1]) 1477 | tbyticks[namecito]<-list(tbyticks2) 1478 | } 1479 | orden<-as.numeric(as.character(tbyticks[[1]]$Samples)) 1480 | orden2<-paste(orden,"samples", sep=" ") 1481 | orden2 1482 | } else { 1483 | tbyticks<-rbind(tab$TablebyTick,Previous_CER$TablebyTick) 1484 | tbyticks<-tbyticks[order(as.numeric(as.character(tbyticks$Samples))),] 1485 | rownames(tbyticks)<-seq(1:dim(tbyticks)[1]) 1486 | 1487 | orden<-as.numeric(as.character(tbyticks$Samples)) 1488 | orden2<-paste(orden,"samples", sep=" ") 1489 | } 1490 | ###### 1491 | om<-as.data.frame(tab$Omics, drop=F) 1492 | Omics<-om 1493 | ###### 1494 | lstdeMins<-append(tab$Minimums,Previous_CER$Minimums) 1495 | lstdeMins<-lstdeMins[match(orden2,names(lstdeMins))] 1496 | 1497 | if (!is.null(tab$CompWinner)) { 1498 | lstdewinners<-append(tab$Minimums,Previous_CER$Minimums) 1499 | lstdewinners<-lstdewinners[match(orden2,names(lstdewinners))] 1500 | } else {} 1501 | 1502 | ##### 1503 | if (is.null(tab$CompWinner)) { 1504 | Previous_CER<-list(TestedTicks=ttdticks,Omics=Omics,Minimums=lstdeMins,TablebyTick=tbyticks) 1505 | } else { 1506 | Previous_CER<-list(TestedTicks=ttdticks,Omics=Omics,Minimums=lstdeMins, 1507 | CompWinner=lstdewinners,TablebyTick=tbyticks) 1508 | } 1509 | 1510 | allmeltedmodel<-Previous_CER$TablebyTick 1511 | 1512 | 1513 | } # Close of Previous_CER 1514 | 1515 | if (is.null(TheoreticER) ) { 1516 | MinimumError=min(tbyticks$value) 1517 | SamplesEvaluated=SamplesRequired=as.numeric(as.character( 1518 | Previous_CER$TablebyTick$mahalanobis.dist_ER$variable[Previous_CER$TablebyTick$mahalanobis.dist_ER$value==min(Previous_CER$TablebyTick$mahalanobis.dist_ER$value)] )) 1519 | } else{ 1520 | MinimumError=TheoreticER 1521 | } 1522 | 1523 | 1524 | if (!is.data.frame(allmeltedmodel)){ 1525 | allmeltedmodel<-allmeltedmodel$mahalanobis.dist_ER 1526 | ro=dim(allmeltedmodel)[1] 1527 | colnames(allmeltedmodel)[colnames(allmeltedmodel)=="StdDev"] <- "SEM" 1528 | colnames(allmeltedmodel)[colnames(allmeltedmodel)=="variable"] <- 
"Samples" 1529 | colnames(allmeltedmodel)[colnames(allmeltedmodel)=="value"] <- "ER_value" 1530 | colnames(allmeltedmodel)[colnames(allmeltedmodel)=="ErrorRate"] <- "ER_type" 1531 | } else { 1532 | ro=dim(allmeltedmodel)[1] 1533 | } 1534 | 1535 | ##### 1536 | if (allmeltedmodel$ER_value[ro-1]=MinimumError,]) 1552 | NSampleMax<-1+round(NSampleMaxTable$ht[dim(NSampleMaxTable)[1]]) 1553 | NSampleMaxPosition<-as.numeric(rownames(NSampleMaxTable)[dim(NSampleMaxTable)[1]]) 1554 | 1555 | NSampleMaxTable$EplusStd<-NSampleMaxTable$ErrorPred - NSampleMaxTable$STDpred 1556 | Theplusminustable<-as.data.frame(NSampleMaxTable[NSampleMaxTable[,4]>=MinimumError,]) 1557 | Theplusminus<-round(Theplusminustable$ht[dim(Theplusminustable)[1]]) 1558 | TheplusminusPosition<-as.numeric(rownames(Theplusminustable)[dim(Theplusminustable)[1]]) 1559 | 1560 | SamplesEvaluated<-as.numeric(as.character(allmeltedmodel$variable[dim(allmeltedmodel)[1]])) 1561 | SamplesRequired<-round(NSampleMaxTable$ht[NSampleMaxPosition]) 1562 | MOE<-paste("±",round(NSampleMax-Theplusminus)) 1563 | 1564 | TERtable<-rbind(ERtarget=round(MinimumError,digits = 3), 1565 | PSS=SamplesRequired, 1566 | MOE= MOE) 1567 | #################### 1568 | rango<-seq(SamplesRequired-round(NSampleMax-Theplusminus),SamplesRequired+round(NSampleMax-Theplusminus), by=1) 1569 | ranksas<-list(rango) 1570 | Rankk[namecito]<-ranksas 1571 | 1572 | # STOP protocol 1573 | if (length(Rankk)>2) { 1574 | testedfrags<-testedfragsSas<-list() 1575 | for (lngt in 1:length(Rankk)) { 1576 | namecito<-names(Rankk[lngt]) 1577 | if (length(Rankk[[lngt]])<20) { 1578 | testedfragsSas<- list(Rankk[[lngt]]) 1579 | testedfrags[namecito]<-testedfragsSas 1580 | 1581 | } else {} 1582 | } 1583 | if (length(testedfrags)>2){ 1584 | fin<-length(testedfrags) 1585 | intrsct<-intersect(intersect(testedfrags[[fin]],testedfrags[[fin-1]]),testedfrags[[fin-2]]) 1586 | if(length(intrsct)>=3){ 1587 | if (is.data.frame(tbyticks)){ 1588 | colnames(tbyticks)[colnames(tbyticks)=="StdDev"] <- "SEM" 1589 | colnames(tbyticks)[colnames(tbyticks)=="variable"] <- "Samples" 1590 | colnames(tbyticks)[colnames(tbyticks)=="value"] <- "ER_value" 1591 | colnames(tbyticks)[colnames(tbyticks)=="ErrorRate"] <- "ER_type" 1592 | } else { 1593 | colnames <- c("Category","Samples","ER_value","SEM","Comp","ER_type","Metric","CI_90","CI_95","CI_99") 1594 | for (i in seq_along(tbyticks)){ 1595 | colnames(tbyticks[[i]]) <- colnames 1596 | } 1597 | } 1598 | 1599 | if (is.null(tab$CompWinner)) { 1600 | res<-list(TestedTicks=ttdticks,Omics=Omics,Minimums=lstdeMins,Prediction_table=TERtable,TablebyTick=tbyticks) 1601 | } else { 1602 | res<-list(TestedTicks=ttdticks,Omics=Omics,Minimums=lstdeMins,Prediction_table=TERtable, 1603 | CompWinner=lstdewinners,TablebyTick=tbyticks) 1604 | } 1605 | return(res) 1606 | break 1607 | }else{} 1608 | } else {} 1609 | } 1610 | } 1611 | } 1612 | }) 1613 | } 1614 | 1615 | # END 1616 | 1617 | --------------------------------------------------------------------------------