├── LICENSE
├── .Rbuildignore
├── data
├── charity.rda
└── manifest.rda
├── man
├── figures
│ ├── mlim.png
│ ├── MLIM_text.png
│ ├── charity.png
│ ├── flowchart.pdf
│ ├── flowchart.png
│ ├── procedure3.pdf
│ ├── procedure3.png
│ ├── procedure4.pdf
│ ├── procedure4.png
│ ├── procedure5.png
│ ├── bfi_binary_p15.png
│ ├── flowchart_base.png
│ ├── iris_mpce_p15.png
│ ├── bfi_missrank_p15.png
│ ├── titanic_mpce_p15.png
│ ├── iris_continuous_p15.png
│ ├── sat.act_nrmse_p15.png
│ ├── flowchart_optimization.png
│ ├── wiki.svg
│ ├── manual.svg
│ ├── handbook_book.svg
│ ├── handbook_lamp.svg
│ ├── handbook_stupid.svg
│ ├── Untitled.drawio
│ ├── flowchart_optimization.drawio
│ ├── flowchart_base.drawio
│ ├── flowchart.drawio
│ ├── procedure2.drawio
│ ├── procedure.drawio
│ ├── procedure3.drawio
│ ├── procedure4.drawio
│ └── procedure5.drawio
├── charity.Rd
├── mlim.summarize.Rd
├── mlim.mids.Rd
├── mlim.preimpute.Rd
├── mlim.na.Rd
├── mlim.error.Rd
├── manifest.Rd
└── mlim.Rd
├── .gitignore
├── R
├── defaultCV.R
├── normalize.R
├── getDigits.R
├── javaServer.R
├── factmem.R
├── charity.R
├── threeDots.R
├── addNA.R
├── is.valid.R
├── bootstrapWeight.R
├── is.imbalanced.R
├── zzz.R
├── stochasticFactorImpute.R
├── missclass.R
├── nrmse.R
├── missrank.R
├── getMetrics.R
├── revert.R
├── meanmode.R
├── matching.R
├── h2o.check.R
├── miss_per_class_error.R
├── server.check.R
├── mlim.mids.R
├── selectVariables.R
├── extractMetrics.R
├── iterationNextVar.R
├── init.R
├── mlim.shuffle.R
├── algoSelector.R
├── bootstrapCV.R
├── mlim.summarize.R
├── mlim.postimpute.R
├── mlim.preimpute.R
├── mlim.na.R
├── syntaxProcessing.R
├── stoppingCriteria.R
├── checkNconvert.R
├── manifest.R
├── mlim.error.R
├── iteration_loop.R
└── mlim.R
├── mlim.Rproj
├── NAMESPACE
├── DESCRIPTION
└── README.md
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2022
2 | COPYRIGHT HOLDER: E. F. Haghish
3 |
--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | ^web$
4 | ^.*\.log$
--------------------------------------------------------------------------------
/data/charity.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/data/charity.rda
--------------------------------------------------------------------------------
/data/manifest.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/data/manifest.rda
--------------------------------------------------------------------------------
/man/figures/mlim.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/mlim.png
--------------------------------------------------------------------------------
/man/figures/MLIM_text.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/MLIM_text.png
--------------------------------------------------------------------------------
/man/figures/charity.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/charity.png
--------------------------------------------------------------------------------
/man/figures/flowchart.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/flowchart.pdf
--------------------------------------------------------------------------------
/man/figures/flowchart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/flowchart.png
--------------------------------------------------------------------------------
/man/figures/procedure3.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/procedure3.pdf
--------------------------------------------------------------------------------
/man/figures/procedure3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/procedure3.png
--------------------------------------------------------------------------------
/man/figures/procedure4.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/procedure4.pdf
--------------------------------------------------------------------------------
/man/figures/procedure4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/procedure4.png
--------------------------------------------------------------------------------
/man/figures/procedure5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/procedure5.png
--------------------------------------------------------------------------------
/man/figures/bfi_binary_p15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/bfi_binary_p15.png
--------------------------------------------------------------------------------
/man/figures/flowchart_base.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/flowchart_base.png
--------------------------------------------------------------------------------
/man/figures/iris_mpce_p15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/iris_mpce_p15.png
--------------------------------------------------------------------------------
/man/figures/bfi_missrank_p15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/bfi_missrank_p15.png
--------------------------------------------------------------------------------
/man/figures/titanic_mpce_p15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/titanic_mpce_p15.png
--------------------------------------------------------------------------------
/man/figures/iris_continuous_p15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/iris_continuous_p15.png
--------------------------------------------------------------------------------
/man/figures/sat.act_nrmse_p15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/sat.act_nrmse_p15.png
--------------------------------------------------------------------------------
/man/figures/flowchart_optimization.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/flowchart_optimization.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | .DS_Store
6 | *.log
7 | TEST.R
10 |
--------------------------------------------------------------------------------
/R/defaultCV.R:
--------------------------------------------------------------------------------
1 |
2 |
#' @title default number of CV folds
#' @description suggests a number of cross-validation folds from the sample
#'              size, requiring at least 'min' observations per fold
#' @param n integer. number of observations
#' @param min integer. minimum number of observations per fold
#' @return integer number of folds, or NULL when no recommendation applies
#' @author E. F. Haghish
#' @keywords Internal
#' @noRd
defaultCV <- function(n, min = 10) {
  cv <- NULL
  v <- n / min
  # NOTE(review): only the v >= 150 branch is handled; all other ranges fall
  # through and yield NULL -- confirm whether further branches were intended
  if (v >= 150) cv <- 15
  return(cv)  # explicit return; relying on the trailing if()'s value was fragile
}
8 |
--------------------------------------------------------------------------------
/R/normalize.R:
--------------------------------------------------------------------------------
#' @title normalize
#' @description normalizes a numeric vector, scaling it to range from 0 to 1
#' @param x numeric vector; NAs are ignored when computing the range
#' @return numeric vector of the same length as 'x', scaled to [0, 1]
#' @author E. F. Haghish
#' @keywords Internal
#' @noRd

normalize <- function(x){
  # bug fix: the previous code wrapped only 'x - min(x)' in return(), so the
  # division by the range was never executed and the vector was not scaled
  (x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
}
11 |
--------------------------------------------------------------------------------
/R/getDigits.R:
--------------------------------------------------------------------------------
#' @title getDigits
#' @description extracts the order of magnitude of a number: digits before
#'              the decimal point minus one for |x| >= 1, or the number of
#'              leading decimal places for |x| < 1
#' @param x numeric vector, or NULL
#' @return non-negative numeric vector; 0 for NULL, zero, or non-finite input
#' @author E. F. Haghish
#' @keywords Internal
#' @noRd

getDigits <- function(x) {
  if (!is.null(x)) {
    result <- floor(log10(abs(x)))
    result[!is.finite(result)] <- 0  # log10(0) is -Inf; map it to 0
    return(abs(result))
  }
  else return(0)
}
16 |
--------------------------------------------------------------------------------
/R/javaServer.R:
--------------------------------------------------------------------------------
#' @title java server interaction
#' @description maps an action keyword onto the corresponding command used to
#'              interact with the h2o java server
#' @param action character. currently only "flush" is recognized
#' @return character (java command), or NULL for unrecognized actions
#' @author E. F. Haghish
#' @keywords Internal
#' @noRd
javaServer <- function(action){
  command <- NULL
  if (action == "flush") {
    command <- "h2o:::.h2o.garbageCollect()"
  }
  return(command)
}
14 |
--------------------------------------------------------------------------------
/R/factmem.R:
--------------------------------------------------------------------------------
#' @title factmem
#' @description memorizes every column's factor levels so the factors can be
#'              reconstructed later (see revert())
#' @param df data.frame
#' @return list with one entry per column: the column name, the integer level
#'         indices, and the level labels (levels() is NULL for non-factors)
#' @author E. F. Haghish
#' @keywords Internal
#' @noRd
factmem <- function(df) {
  mem <- list()
  for (i in seq_len(ncol(df))) {  # seq_len() is safe for 0-column frames
    lev <- levels(df[,i])
    # NOTE(review): for non-factor columns levels() is NULL and 1:length(lev)
    # yields c(1, 0); kept as-is because revert() may depend on it -- verify
    len <- 1:length(lev)
    nam <- colnames(df)[i]
    mem[[i]] <- list(names = nam, length = len, level = lev)
  }

  return(mem)
}
18 |
19 |
--------------------------------------------------------------------------------
/R/charity.R:
--------------------------------------------------------------------------------
1 | #' Some items about attitude towards charity
2 | #'
3 | #' A dataset containing Likert-scale items about attitude towards charity
4 | #'
5 | #' @format A data frame with 832 rows and 5 variables:
6 | #' \describe{
7 | #'   \item{ta1}{Charitable Organizations More Effective}
8 | #'   \item{ta2}{Degree of Trust}
9 | #'   \item{ta3}{Charitable Organizations Honest/Ethical}
10 | #'   \item{ta4}{Role Improving Communities}
11 | #'   \item{ta5}{Job Delivering Services}
12 | #' }
13 | #' @source \url{https://www.stata.com/}
14 | "charity"
15 |
16 |
--------------------------------------------------------------------------------
/R/threeDots.R:
--------------------------------------------------------------------------------
#' @title process ... arguments
#' @description looks up a named value inside '...' and falls back to a
#'              default when that name is absent
#' @param name character. the name of the object of interest
#' @param ... arguments list
#' @param default value returned when 'name' is not found in '...'
#' @return value of the object of interest
#' @author E. F. Haghish
#' @keywords Internal
#' @noRd
threeDots <- function(name, ..., default){
  args <- list(...)
  if (name %in% names(args)) args[[name]] else default
}
17 |
--------------------------------------------------------------------------------
/mlim.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: No
4 | SaveWorkspace: No
5 | AlwaysSaveHistory: No
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 |
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 |
18 | BuildType: Package
19 | PackageUseDevtools: Yes
20 | PackageInstallArgs: --no-multiarch --with-keep.source
21 | PackageRoxygenize: rd,collate,namespace,vignette
22 |
23 | QuitChildProcessesOnExit: Yes
24 | DisableExecuteRprofile: Yes
25 |
--------------------------------------------------------------------------------
/R/addNA.R:
--------------------------------------------------------------------------------
#' @title add NA in a vector
#' @description replaces a proportion 'p' of the values of a vector with NA,
#'              optionally stratified by factor level
#' @param x vector
#' @param p numeric. proportion (0 to 1) of values to replace with NA
#' @param stratify logical. if TRUE and 'x' is a factor, NAs are injected
#'        within each level separately, preserving level proportions
#' @return the vector 'x' with NAs injected
#' @author E. F. Haghish
#' @keywords Internal
#' @noRd
addNA <- function(x, p, stratify=FALSE) {

  # NOTE: this masks base::addNA(), which instead adds NA as a factor level
  if (stratify && inherits(x, "factor")) {  # && / inherits(): scalar, robust check
    levs <- levels(x)
    for (l in levs) {
      index <- which(x == l)
      len <- length(index)
      x[index][sample(len, round(p * len))] <- NA
    }
  }
  else {
    len <- length(x)
    x[sample(len, round(p * len))] <- NA # sample without replacement
  }
  return(x)
}
23 |
--------------------------------------------------------------------------------
/R/is.valid.R:
--------------------------------------------------------------------------------
# ----------------------------------------------------------
# is.valid
# ==========================================================
#' @title validate an object
#' @description a function to make sure the value is not NULL,
#'              NA, NaN, or Inf
#' @param x a scalar value to validate
#' @return logical. if TRUE, the object is valid
#' @author E. F. Haghish
#' @keywords Internal
#' @noRd
is.valid <- function(x) {
  # guard clauses: each invalid state short-circuits to FALSE
  if (is.null(x)) return(FALSE)
  if (is.na(x)) return(FALSE)       # also catches NaN
  if (is.infinite(x)) return(FALSE)
  TRUE
}
27 |
--------------------------------------------------------------------------------
/man/charity.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/charity.R
3 | \docType{data}
4 | \name{charity}
5 | \alias{charity}
6 | \title{some items about attitude towards charity}
7 | \format{
8 | A data frame with 832 rows and 5 variables:
9 | \describe{
10 | \item{ta1}{Charitable Organizations More Effective}
11 | \item{ta2}{Degree of Trust}
12 | \item{ta3}{Charitable Organizations Honest/Ethical}
13 | \item{ta4}{Role Improving Communities}
14 | \item{ta5}{Job Delivering Services}
15 | }
16 | }
17 | \source{
18 | \url{https://www.stata.com/}
19 | }
20 | \usage{
21 | charity
22 | }
23 | \description{
24 | A dataset containing likert-scale items about attitude towards charity
25 | }
26 | \keyword{datasets}
27 |
--------------------------------------------------------------------------------
/R/bootstrapWeight.R:
--------------------------------------------------------------------------------
1 | #' @title bootstrap duplication removed and weight_column calculator
2 | #' @description identifies duplicated rows in the bootstrap and creates
3 | #' a weight column counting the duplicated rows to be given
4 | #' as additional weights. this weight column should be
5 | #' added to the weight_column given by the user.
6 | #' @author E. F. Haghish
7 | #' @return numeric data.frame including 'rows' and 'weight'. rows is the row number
8 | #' of the observation and weight is the number of duplications.
9 | #' @keywords Internal
10 | #' @noRd
11 | bootstrapWeight <- function(index) {
12 | weight <- table(index)  # how many times each row index was drawn
13 | rows <- as.numeric(names(weight))  # the distinct row numbers, recovered from the table names
14 | return(as.data.frame(cbind(rows, weight)))  # cbind() coerces the table to plain counts; column names come from the variable names
15 | }
16 |
--------------------------------------------------------------------------------
/R/is.imbalanced.R:
--------------------------------------------------------------------------------
1 |
#' @title is.imbalanced
#' @description examines whether a binary or multinomial feature is imbalanced
#' @param x vector.
#' @author E. F. Haghish
#' @return logical. TRUE when the class proportions are imbalanced
#' @keywords Internal
#' @noRd

is.imbalanced <- function(x) {
  imbalanced <- FALSE
  prev <- sort(table(x) / length(x))

  # get the difference between the most prevalent and the rarest class.
  # bug fix: the former ratio prev[1] / prev[length(prev)] flagged even a
  # perfectly balanced 50/50 feature (ratio 1 >= 0.3) as imbalanced
  diff <- prev[length(prev)] - prev[1]

  # define a criteria for imbalance, to prefer AUCPR to AUC.
  # Here I select any difference more than 0.3 (e.g. 70% - 30%) as
  # imbalanced and prefer AUPRC for tuning
  if (diff >= 0.3) imbalanced <- TRUE

  return(imbalanced)
}
24 |
25 |
26 |
27 |
--------------------------------------------------------------------------------
/R/zzz.R:
--------------------------------------------------------------------------------
1 | #' @title package startup hook
2 | #' @description prints the mlim startup notes and configures download options
3 | #' @importFrom curl curl
4 | #' @author E. F. Haghish
5 | #' @keywords Internal
6 | #' @noRd
7 | #'
8 | .onAttach <- function(lib, pkg) {
9 | packageStartupMessage('Single and Multiple Imputation with Automated Machine Learning\n
10 | \nNote1: This is a free software and comes with no guarantee.
11 | \nNote2: The default algorithm is ELNET, which is the fastest\n to fine-tune. You can reduce imputation error by\n adding other algorithms e.g. "RF", "Ensemble", ...
12 | \nNote3: If you find a bug, post it on GitHub\n https://github.com/haghish/mlim\n')
13 | options("prefer_RCurl" = FALSE)
14 | options(timeout = 3600) # "seconds"
15 | }
16 |
--------------------------------------------------------------------------------
/R/stochasticFactorImpute.R:
--------------------------------------------------------------------------------
# ----------------------------------------------------------
# stochasticFactorImpute
# ==========================================================
#' @title generates random integers with the length of the factor levels
#' @description draws a stochastic level for each missing observation, based
#'              on the estimated probabilities of the factor's levels
#' @param levels levels of the factor
#' @param probMat estimated probability of each level for each missing observation
#' @author E. F. Haghish
#' @keywords Internal
#' @noRd

stochasticFactorImpute <- function(levels, probMat) {
  # one weighted draw per row of the probability matrix
  draws <- lapply(seq_len(nrow(probMat)), function(row) {
    sample(x = levels, size = 1, replace = TRUE, prob = probMat[row, ])
  })
  do.call("c", draws)
}
20 |
21 |
--------------------------------------------------------------------------------
/R/missclass.R:
--------------------------------------------------------------------------------
#' @title missclassification error
#' @description computes, for every variable that had missing values, the
#'              share of imputed values disagreeing with the original values
#' @param imputed the imputed dataframe
#' @param incomplete the dataframe with missing values
#' @param complete the original dataframe with no missing values
#' @param rename logical. if TRUE, the result is named by variable
#' @author E. F. Haghish
#' @keywords Internal
#' @noRd
missclass <- function(imputed, incomplete, complete, rename = TRUE){
  errors <- NULL
  naMap <- as.data.frame(is.na(incomplete))
  affected <- which(colSums(naMap) > 0)
  for (j in affected) {
    # disagreements with the true values, relative to the number of values
    # that were actually missing in this column
    wrong <- sum(as.character(as.matrix(imputed[, j])) !=
                   as.character(as.matrix(complete[, j])))
    nMissing <- sum(is.na(incomplete[, j]))
    errors <- c(errors, wrong / nMissing)
  }
  if (rename) names(errors) <- colnames(incomplete)[affected]
  return(errors)
}
21 |
--------------------------------------------------------------------------------
/R/nrmse.R:
--------------------------------------------------------------------------------
#' @title normalized RMSE
#' @description calculates, per variable, the RMSE of the imputation over the
#'              originally missing observations, normalized by the variance
#'              of the true values at those positions
#' @param imputed the imputed dataframe
#' @param incomplete the dataframe with missing values
#' @param complete the original dataframe with no missing values
#' @author E. F. Haghish
#' @keywords Internal
#' @noRd
nrmse <- function(imputed, incomplete, complete){
  naMap <- as.data.frame(is.na(incomplete))
  affected <- which(colSums(naMap) > 0)
  out <- NULL
  for (j in affected) {
    miss <- is.na(incomplete[, j])
    mse <- mean((imputed[miss, j] - complete[miss, j])^2)
    out <- c(out, sqrt(mse / stats::var(complete[miss, j])))
  }
  names(out) <- colnames(incomplete)[affected]
  return(out)
}
25 |
--------------------------------------------------------------------------------
/R/missrank.R:
--------------------------------------------------------------------------------
#' @title miss ranking error
#' @description calculates the MAE of the imputed ranks, divided by the
#'              number of ordered categories minus one
#' @param imputed the imputed dataframe
#' @param incomplete the dataframe with missing values
#' @param complete the original dataframe with no missing values
#' @author E. F. Haghish
#' @keywords Internal
#' @noRd
missrank <- function(imputed, incomplete, complete){
  out <- NULL
  naMap <- as.data.frame(is.na(incomplete))
  affected <- which(colSums(naMap) > 0)
  for (j in affected) {
    # work on the underlying integer codes of the (ordered) factors
    imputed[, j] <- as.numeric(imputed[, j])
    incomplete[, j] <- as.numeric(incomplete[, j])
    complete[, j] <- as.numeric(complete[, j])
    miss <- is.na(incomplete[, j])
    rankError <- mean(abs(imputed[miss, j] - complete[miss, j])) /
      (length(unique(complete[, j])) - 1)
    out <- c(out, rankError)
  }
  names(out) <- colnames(incomplete)[affected]
  return(out)
}
25 |
--------------------------------------------------------------------------------
/R/getMetrics.R:
--------------------------------------------------------------------------------
# ----------------------------------------------------------
# getMetrics
# ==========================================================
#' @title retrieves metrics data
#' @description reads the 'metrics' attribute of datasets preimputed with
#'              mlim and, for each variable, writes its best (minimum) RMSE
#'              onto the rows of the final iteration
#' @param preimputed.data data.frame previously imputed by mlim or
#'                        other programs
#' @author E. F. Haghish
#' @keywords Internal
#' @noRd
getMetrics <- function(preimputed.data) {
  metrics <- attributes(preimputed.data)$metrics
  if (is.null(metrics)) return(NULL)

  lastIteration <- max(metrics$iteration, na.rm = TRUE)
  for (v in unique(metrics$variable)) {
    best <- min(metrics$RMSE[metrics$variable == v], na.rm = TRUE)
    rows <- metrics$iteration == lastIteration & metrics$variable == v
    metrics[rows, "RMSE"] <- best
  }
  return(metrics)
}
25 |
--------------------------------------------------------------------------------
/R/revert.R:
--------------------------------------------------------------------------------
#' @title revert
#' @description using a factmem object (see factmem()), numeric stand-in
#'              variables are reverted to the original variable type of
#'              "ordered factors".
#' @param df data.frame holding numeric stand-ins for ordered factors
#' @param factmem list produced by factmem(): per column, the name, the
#'        integer level indices, and the level labels
#' @return data.frame
#' @author E. F. Haghish
#' @keywords Internal
#' @noRd

revert <- function(df, factmem) {
  cols <- colnames(df)
  for (i in seq_along(factmem)) {
    # bug fix: each entry IS the list(names, length, level) stored by
    # factmem(); the former 'factmem[[i]][[1]]' indexed one level too deep,
    # extracting only the name (an atomic) and breaking the $-lookups below
    data <- factmem[[i]]
    if (!is.null(data$names)) {
      if (data$names %in% cols) {
        df[, data$names] <- factor(as.character(round(df[, data$names])),
                                   levels = as.character(data$length),
                                   labels = data$level,
                                   ordered = TRUE)
      }
    }
  }

  return(df)
}
26 |
27 | #revert(round(mlimELNET[, 1:3]), MEM)
28 | #print(head(mlimELNET$A1))
29 |
30 | #revert(LAST[, COLS, drop = FALSE], MEM)
31 |
32 |
--------------------------------------------------------------------------------
/R/meanmode.R:
--------------------------------------------------------------------------------
#' @title mean and mode imputation
#' @description replaces missing values with the median (numeric/integer
#'              columns -- despite the historical "mean" in the name) or the
#'              mode (character/factor columns)
#' @importFrom stats median
#' @param data A data frame with dummies or numeric variables.
#' @return imputed dataset
#' @author E. F. Haghish, Maintainer: \email{haghish@@uio.no}
#' @noRd
#' @keywords Internal

meanmode <- function (data) {

  # most frequent value; ties are broken by the first value in table order
  Mode <- function(x) {
    xtab <- table(x)
    names(which.max(xtab))
  }

  # compute a replacement value per column (NULL for unsupported types).
  # inherits() replaces 'class(x) %in% ...', which produced a length-2
  # condition for ordered factors (class c("ordered", "factor")) and
  # errors under R >= 4.2
  values <- sapply(data, function(x) {
    if (inherits(x, c("character", "factor")))
      Mode(x)
    else if (inherits(x, c("numeric", "integer")))
      median(x, na.rm = TRUE)
    else
      NULL
  }, simplify = FALSE)

  # impute the values; columns with no computed replacement are skipped
  # instead of triggering a zero-length-replacement error
  for (i in seq_along(values)) {
    if (!is.null(values[[i]])) {
      data[is.na(data[, i]), i] <- values[[i]]
    }
  }

  return(data)
}
33 |
34 |
35 |
36 |
--------------------------------------------------------------------------------
/man/mlim.summarize.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/mlim.summarize.R
3 | \name{mlim.summarize}
4 | \alias{mlim.summarize}
5 | \title{mlim imputation summary}
6 | \usage{
7 | mlim.summarize(data)
8 | }
9 | \arguments{
10 | \item{data}{dataset imputed with mlim}
11 | }
12 | \value{
13 | estimated imputation accuracy via cross-validation procedure
14 | }
15 | \description{
16 | provides information about estimated accuracy of the imputation as well
17 | as the overall procedure of the imputation.
18 | }
19 | \examples{
20 | \dontrun{
21 | data(iris)
22 |
23 | # add 10\% stratified missing values to one factor variable
24 | irisNA <- iris
25 | irisNA$Species <- mlim.na(irisNA$Species, p = 0.1, stratify = TRUE, seed = 2022)
26 |
27 | # run the ELNET single imputation (fastest imputation via 'mlim')
28 | MLIM <- mlim(irisNA)
29 |
30 | # in single imputation, you can estimate the imputation accuracy via cross validation RMSE
31 | mlim.summarize(MLIM)
32 | }
33 | }
34 | \author{
35 | E. F. Haghish
36 | }
37 |
--------------------------------------------------------------------------------
/man/figures/wiki.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/man/figures/manual.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/man/figures/handbook_book.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/man/figures/handbook_lamp.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/man/figures/handbook_stupid.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | export(mlim)
4 | export(mlim.error)
5 | export(mlim.mids)
6 | export(mlim.na)
7 | export(mlim.preimpute)
8 | export(mlim.summarize)
9 | importFrom(curl,curl)
10 | importFrom(h2o,as.h2o)
11 | importFrom(h2o,h2o.automl)
12 | importFrom(h2o,h2o.clusterIsUp)
13 | importFrom(h2o,h2o.clusterStatus)
14 | importFrom(h2o,h2o.getId)
15 | importFrom(h2o,h2o.get_automl)
16 | importFrom(h2o,h2o.init)
17 | importFrom(h2o,h2o.load_frame)
18 | importFrom(h2o,h2o.ls)
19 | importFrom(h2o,h2o.no_progress)
20 | importFrom(h2o,h2o.predict)
21 | importFrom(h2o,h2o.removeAll)
22 | importFrom(h2o,h2o.rm)
23 | importFrom(h2o,h2o.save_frame)
24 | importFrom(h2o,h2o.shutdown)
25 | importFrom(md.log,md.log)
26 | importFrom(memuse,Sys.meminfo)
27 | importFrom(mice,as.mids)
28 | importFrom(missRanger,imputeUnivariate)
29 | importFrom(missRanger,missRanger)
30 | importFrom(stats,median)
31 | importFrom(stats,na.omit)
32 | importFrom(stats,rnorm)
33 | importFrom(stats,setNames)
34 | importFrom(stats,var)
35 | importFrom(tools,file_ext)
36 | importFrom(utils,capture.output)
37 | importFrom(utils,packageVersion)
38 | importFrom(utils,setTxtProgressBar)
39 | importFrom(utils,txtProgressBar)
40 |
--------------------------------------------------------------------------------
/man/mlim.mids.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/mlim.mids.R
3 | \name{mlim.mids}
4 | \alias{mlim.mids}
5 | \title{prepare "mids" class object}
6 | \usage{
7 | mlim.mids(mlim, incomplete)
8 | }
9 | \arguments{
10 | \item{mlim}{array of class "mlim", returned by "mlim" function}
11 |
12 | \item{incomplete}{the original data.frame with NAs}
13 | }
14 | \value{
15 | object of class 'mids', as required by 'mice' package for analyzing
16 | multiple imputation data
17 | }
18 | \description{
19 | takes "mlim" object and prepares a "mids" class for data analysis with
20 | multiple imputation.
21 | }
22 | \examples{
23 |
24 | \dontrun{
25 | data(iris)
26 | require(mice)
27 | irisNA <- mlim.na(iris, p = 0.1, seed = 2022)
28 |
29 | # adding unstratified NAs to all variables of a data.frame
30 | MLIM <- mlim(irisNA, m=5, tuning_time = 180, doublecheck = TRUE, seed = 2022)
31 |
32 | # create the mids object for MICE package
33 | mids <- mlim.mids(MLIM, irisNA)
34 |
35 | # run an analysis on the mids data (just as example)
36 | fit <- with(data=mids, exp=glm(Species~ Sepal.Length, family = "binomial"))
37 |
38 | # then, pool the results!
39 | summary(pool(fit))
40 | }
41 | }
42 | \author{
43 | E. F. Haghish, based on code from 'prelim' function in missMDA R package
44 | }
45 |
--------------------------------------------------------------------------------
/R/matching.R:
--------------------------------------------------------------------------------
1 |
#' @title match imputed missing observations to non-missing values
#' @description each imputed numeric missing value is replaced with the nearest
#'              non-missing value. this option is particularly
#'              recommended when ordinal variables are imputed as numeric
#'              variables.
#' @param imputed numeric vector of imputed missing values
#' @param nonMiss numeric vector of non-missing values
#' @param md.log logging function, called with a status message
#' @return numeric vector of the imputed values, or NULL (invisibly) when
#'         either input is NULL or non-numeric
#' @author E. F. Haghish
#' @keywords Internal
#' @noRd

matching <- function(imputed, nonMiss, md.log) {

  # scalar conditions: use short-circuiting && instead of elementwise &
  if (!is.null(imputed) && !is.null(nonMiss)) {
    if (is.numeric(imputed) && is.numeric(nonMiss)) {
      # get the unique values
      unqImputed <- unique(imputed)
      unqNonMiss <- unique(nonMiss[!is.na(nonMiss)])

      # skip imputed values that are already in the non-missing set
      unqImputed <- setdiff(unqImputed, unqNonMiss)

      # index the unique values and replace them in the vectors
      for (i in unqImputed) {
        nearest <- which.min(abs(unqNonMiss - i))[1]
        index <- which(imputed == i)
        imputed[index] <- unqNonMiss[nearest]
      }

      md.log("matching successful!")  # log-string typo fixed ("successul")

      return(imputed)
    }
  }
}
38 |
39 |
40 | #nonMiss <- c(1:20, 19:1)
41 | #imputed <- c(11.5, 12.2, 11.51, 14.1, -1, 49, 20, 1, 4)
42 | #ching(imputed, nonMiss, F)
43 |
44 |
45 |
--------------------------------------------------------------------------------
/man/mlim.preimpute.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/mlim.preimpute.R
3 | \name{mlim.preimpute}
4 | \alias{mlim.preimpute}
5 | \title{carries out preimputation}
6 | \usage{
7 | mlim.preimpute(data, preimpute = "RF", seed = NULL)
8 | }
9 | \arguments{
10 | \item{data}{data.frame with missing values}
11 |
12 | \item{preimpute}{character. specify the algorithm for preimputation. the
13 | supported options are "RF" (Random Forest), "mm"
14 | (mean-mode replacement), and "random" (random sampling from available data).
15 | the default is "RF", which carries
16 | a parallel random forest imputation, using all the CPUs available.
17 | the other alternative is "mm" which performs mean/mode
18 | imputation.}
19 |
20 | \item{seed}{integer. specify the random generator seed}
21 | }
22 | \value{
23 | imputed data.frame
24 | }
25 | \description{
26 | instead of replacing missing data with mean and mode, a smarter
27 | start-point would be to use fast imputation algorithms and then
28 | optimize the imputed dataset with mlim. this procedure usually
29 | requires fewer iterations and will save a lot of computation
30 | resources.
31 | }
32 | \examples{
33 | \dontrun{
34 | data(iris)
35 |
36 | # add 10\% stratified missing values to one factor variable
37 | irisNA <- iris
38 | irisNA$Species <- mlim.na(irisNA$Species, p = 0.1, stratify = TRUE, seed = 2022)
39 |
40 | # run the default random forest preimputation
41 | MLIM <- mlim.preimpute(irisNA)
42 | mlim.error(MLIM, irisNA, iris)
43 | }
44 | }
45 | \author{
46 | E. F. Haghish
47 | }
48 |
--------------------------------------------------------------------------------
/R/h2o.check.R:
--------------------------------------------------------------------------------
#' @title server.check
#' @description safely examines the connection status with the h2o server,
#'              retrying several times before giving up
#' @importFrom h2o h2o.clusterIsUp h2o.clusterStatus
#' @author E. F. Haghish
#' @return logical. if TRUE, proceed with the analysis
#' @keywords Internal
#' @noRd

# NOTE(review): this file defines the same 'server.check' as R/server.check.R
# (this variant retries 5 times with 3s sleeps instead of 3 times with 1s);
# only one definition survives when the package loads -- confirm which one
# is intended and remove the other.
server.check <- function() {

  up <- FALSE
  healthy <- FALSE

  # check that the cluster is up, retrying up to 5 times, 3s between tries
  # ============================================================
  for (i in 1:5) {
    # assignment happens INSIDE tryCatch: on error 'up' keeps its previous
    # FALSE value, so the retry loop keeps working after a failed attempt
    if (!up) tryCatch(up <- h2o.clusterIsUp(),
                      error = function(cond){
                        message("trying to connect to JAVA server...\n");
                        return(NULL)})
    if (!up) Sys.sleep(3)
  }
  if (!up) stop("h2o server is down... perhaps heavy RAM consumption crashed the JAVA server?\nNOTE: save your imputation with the 'save' argument to continue \n from where the imputation crashed")

  # make sure the cluster is healthy
  # ============================================================
  for (i in 1:5) {
    # fix: h2o.clusterStatus was used but missing from @importFrom above
    if (!healthy) tryCatch(healthy <- h2o.clusterStatus()$healthy,
                           error = function(cond){
                             message("trying to connect to JAVA server...\n");
                             return(NULL)})
    if (!healthy) Sys.sleep(3)
  }
  if (!healthy) stop("h2o server is down... perhaps heavy RAM consumption crashed the JAVA server?\nNOTE: save your imputation with the 'save' argument to continue \n from where the imputation crashed")


  return(healthy)
}
39 |
40 |
--------------------------------------------------------------------------------
/R/miss_per_class_error.R:
--------------------------------------------------------------------------------
#' @title Mean Per Class Error (MPCE)
#' @description calculates the misclassification rate for each level of an
#'              unordered factor and either returns a vector of the per-level
#'              errors or their mean, if 'mean = TRUE'
#' @param imputed the imputed dataframe
#' @param incomplete the dataframe with missing values
#' @param complete the original dataframe with no missing values
#' @param mean logical. if TRUE, the mean per class error is returned;
#'             the default is FALSE (the original doc wrongly said TRUE)
#' @author E. F. Haghish
#' @keywords Internal
#' @noRd
mean_per_class_error <- function(imputed, incomplete, complete, mean = FALSE){
  mpce <- NULL
  # loop over every column, then over every factor level of that column
  # (seq_len is safe for zero-column input, unlike 1:ncol)
  for (k in seq_len(ncol(complete))) {
    lvl <- levels(complete[, k])
    for (i in lvl) {
      # rows whose true (complete) value is level 'i'
      index <- drop(complete[, k]) == i
      temp <- missclass(imputed[index, k, drop = FALSE],
                        incomplete[index, k, drop = FALSE],
                        complete[index, k, drop = FALSE],
                        rename = FALSE)
      # bug fix: the original executed 'names(temp) <- i' BEFORE the NULL
      # check; 'names(NULL) <- i' raises an error, so any NULL returned by
      # missclass() crashed the loop instead of being skipped
      if (!is.null(temp)) {
        names(temp) <- i
        mpce <- c(mpce, temp)
      }
    }
  }

  # the logical argument 'mean' does not shadow base::mean here: R skips
  # non-function bindings when resolving a name used as a function
  if (mean) mpce <- mean(mpce)
  return(mpce)
}
30 |
31 | # print(mean_per_class_error(mm[,c("education"), drop = FALSE],
32 | # dfNA[,c("education"), drop = FALSE],
33 | # df[,c("education"), drop = FALSE]))
34 |
35 | # index <- df$education == "HS" & is.na(dfNA$education)
36 | # table(index)
37 | # mm$education[index]
38 | #index <- is.na(dfNA$education[index])
39 | #table(index)
40 | #View(cbind(df$education[index], mm$education[index]))
41 |
--------------------------------------------------------------------------------
/R/server.check.R:
--------------------------------------------------------------------------------
#' @title server.check
#' @description safely examines the connection status with the h2o server,
#'              retrying a few times before giving up
#' @importFrom h2o h2o.clusterIsUp h2o.clusterStatus
#' @author E. F. Haghish
#' @return logical. if TRUE, proceed with the analysis
#' @keywords Internal
#' @noRd

server.check <- function() {

  up <- FALSE
  healthy <- FALSE

  # check that the cluster is up, retrying up to 3 times, 1s between tries
  # ============================================================
  for (i in 1:3) {
    # bug fix: the error handler used to return NULL, which made 'up' NULL
    # and crashed 'if (!up)' with "argument is of length zero"; returning
    # FALSE keeps the retry loop working after a failed connection attempt
    if (!up) up <- tryCatch(h2o.clusterIsUp(),
                            error = function(cond) {
                              message("trying to connect to JAVA server...\n")
                              FALSE
                            })
    if (!up) Sys.sleep(1)
  }
  if (!up) stop("h2o server is down... perhaps heavy RAM consumption crashed the JAVA server?\nNOTE: save your imputation with the 'save' argument to continue \n from where the imputation crashed")

  # make sure the cluster is healthy
  # ============================================================
  for (i in 1:3) {
    # same NULL-handler bug fix as above; isTRUE(...) additionally guards
    # against a missing/NA 'healthy' field so 'if (!healthy)' stays scalar
    if (!healthy) healthy <- tryCatch({
                               st <- h2o.clusterStatus()$healthy
                               isTRUE(length(st) > 0 && all(st))
                             },
                             error = function(cond) {
                               message("trying to connect to JAVA server...\n")
                               FALSE
                             })
    if (!healthy) Sys.sleep(1)
  }
  if (!healthy) stop("h2o server is down... perhaps heavy RAM consumption crashed the JAVA server?\nNOTE: save your imputation with the 'save' argument to continue \n from where the imputation crashed")


  return(healthy)
}
39 |
40 |
--------------------------------------------------------------------------------
/R/mlim.mids.R:
--------------------------------------------------------------------------------
#' @title prepare "mids" class object
#' @description takes "mlim" object and prepares a "mids" class for data
#'              analysis with multiple imputation.
#' @importFrom mice as.mids
#' @param mlim array of class "mlim", returned by "mlim" function
#' @param incomplete the original data.frame with NAs
#' @author E. F. Haghish, based on code from 'prelim' function in missMDA R package
#' @examples
#'
#' \dontrun{
#' data(iris)
#' require(mice)
#' irisNA <- mlim.na(iris, p = 0.1, seed = 2022)
#'
#' # adding unstratified NAs to all variables of a data.frame
#' MLIM <- mlim(irisNA, m=5, tuning_time = 180, doublecheck = T, seed = 2022)
#'
#' # create the mids object for MICE package
#' mids <- mlim.mids(MLIM, irisNA)
#'
#' # run an analysis on the mids data (just as example)
#' fit <- with(data=mids, exp=glm(Species~ Sepal.Length, family = "binomial"))
#'
#' # then, pool the results!
#' summary(pool(fit))
#' }
#' @return object of class 'mids', as required by 'mice' package for analyzing
#' multiple imputation data
#' @export

mlim.mids <- function (mlim, incomplete) {
  # guard clause: only multiple-imputation containers are accepted
  supported <- c("MIMCA", "MIFAMD", "MIPCA", "mlim.mi")
  if (!inherits(mlim, supported)) {
    stop("Objects of class mlim.mi, MIPCA, MIFAMD, or MIMCA are required.")
  }

  # stack the original (incomplete) data on top of every imputed dataset,
  # tagging rows with the .imp (0 = original) and .id columns mice expects
  stacked <- rbind(incomplete, do.call(rbind, mlim))
  stacked <- cbind(.imp = rep(0:length(mlim), each = nrow(incomplete)),
                   .id = rep(1:nrow(incomplete), (length(mlim) + 1)),
                   stacked)
  rownames(stacked) <- NULL

  as.mids(stacked)
}
45 |
46 |
47 | #mid <- mlim.mids(ELNET, irisNA)
48 |
--------------------------------------------------------------------------------
/man/figures/Untitled.drawio:
--------------------------------------------------------------------------------
1 | 7Ztdd6I6FIZ/jZftIgnycamIna5px1nVmblOIWJWA3Ehrfb8+pNoVDBY6TkqdLQ3JTshgf3shLwbbCEvXtyleDp55CFhLWiEixbqtSAEpuGIf9LyvrLYpjJEKQ1Vo61hSP8hymgo6ysNyazQMOOcZXRaNAY8SUiQFWw4Tfm82GzMWXHUKY6IZhgGmK2tt+2t/Q8Ns4myA8vdVnwjNJqowR1orSqecfASpfw1USMmPCGrmhivu1F3OZvgkM9zJuS3kJdynq2O4oVHmHTs2mer8/p7aje3kpIkq3KCZYTYeXbHJrRMM0T4Bq56eMPsVTlCXWj2vvbMfEIzMpziQJbngn4LdSdZzEQJiMPljZNQlTY3aMhClvKXjTMdYRnzJFPsoWyBGY0SUQjE9ZNUNqCMeZzxdDk26vs9COxNV/ma5Z+o0V2gvPJG0owsciblkjvCY5Kl76KJqkVAkVSxaxoqFubbOICOQjjJhYCJlBGr6Is2fW8hiAPFoSITpDHpjEZPHW+koRH3lxVpFP20jMSiU5VJc7z0FhXToaMqYhqGcphuGf4tc2OXeZ6wrcpl2Jbt1J2UhPH/ZwgrMrROgdDUEHYZj1rQYpJXSN/EYSQPv5P3OU/D2bpKjJSrLTlhyAOK5diPJKR432lfKk6A+UGcnCA07Kqh4Z4iNKzDKy6jS98Xl0+4b9k9OAU/XGQV59Our2hnbjolAMwSAOAUAOwrALvsAXc2AI4GYCgebz/u/KehRkJ0JXaA+9aXajiKGK2SxefQNqTb75od90NKJ1y/YNst0GubOj0A2iX01saj4nMPz5/8gyZPIsSzyYZR9UeL6GMqe44XkRQft6v9PryVI9IZCTuyLLuXDjJuJeGlMDHlMAnPAukk4QtBiYgduRxMNDP1qCkECiiZyQcfmwL9eDyGQXAc9KZRnLhtG+nojXPtakCFpbNWtdD3DEPf3Rx5OrbdRqkFoC+n3uDHb//pKhc+A7FWuQD0NbXP03hWsv/3MGOzm4zfdIKM8qSybnjASUgTIUGMnzgie/XGl4qYUwuH3SCpVzisc0iXtHG1dlV9rcphc/0XTKBe6bB2eI7A7/vh/WjQWOmAXLvX6dQlHdqoUdIB6qlNDdtVOxxJO1hOk7QD1FOiGvp6tUP3DNrBsRulHdb7mbx2eBgM/S+2DzyrctAQ1qocoJ5O9mNMWdmbA+FvzKq/afjD05cxky9Wr1rh82FRs1a4wCS3syvp69UKelrm4gjUrBX0pMqD3+k1VSj4fRe1UV1CwQGNEgqoQq7jKhSOJBRcq0lCAVVIstT7SZLrGsaH8/QITMS0apRSQHripec/3N99u75l+BTFWsUC0tMv/pu4u7L3DJvvjO6TZ76o/nlSjNNMnOUJT0q/XaXDf4iSerUDqpCp+dt2rgDsqvpaxQPS8zKXh6Be9YD0xIr3azgaPDb3KyXL6Tptry4FAQy3WRKiQgrkKiGOJCEANBulIfTky8+nweNg1NzJ2+1byK3tPSFA9hknryhuf/KyrMv9qAj5/wI=
--------------------------------------------------------------------------------
/R/selectVariables.R:
--------------------------------------------------------------------------------
1 |
#' @title select imputation variables
#' @description selects variables relevant to the imputation process: flags
#'              the columns that need imputing, orders them by number of
#'              missing values, and lists the candidate predictors
#' @importFrom md.log md.log
#' @param data data.frame
#' @param ignore character vector of column names excluded from imputation
#'        and from the predictor set
#' @param verbose logical. currently unused (reserved for verbose output)
#' @param report if not NULL, the selected variables are written to the
#'        md.log report
#' @return list with 'dataNA' (missingness indicator matrix),
#'         'allPredictors', 'vars2impute' (ordered by missing count), and
#'         'X' (predictors that never need imputation)
#' @keywords Internal
#' @noRd

selectVariables <- function(data, ignore=NULL, verbose=FALSE, report=NULL) {

  suppressPackageStartupMessages({requireNamespace("md.log")})

  # flag the variables with missing values, excluding fully missing vars
  vars2impute <- vapply(data[, , drop = FALSE], FUN.VALUE = TRUE,
                        function(z) anyNA(z) && !all(is.na(z)))

  # make sure that these variables were not meant to be ignored.
  # bug fix: the original subset 'vars2impute' (length ncol(data)) with
  # 'colnames(data)[vars2impute] %in% ignore', a logical mask of length
  # sum(vars2impute); R silently recycled the short mask and could unset
  # the wrong columns. index by the full column-name vector instead.
  if (!is.null(ignore)) vars2impute[colnames(data) %in% ignore] <- FALSE

  if (!is.null(report)) {
    md.log(paste("Variables to impute:", paste(colnames(data)[vars2impute],
                                               collapse = ", ")))
  }

  # missingness indicators; order imputation variables by number of missings
  dataNA <- is.na(data[, vars2impute, drop = FALSE])
  vars2impute <- names(sort(colSums(dataNA)))
  # ============================================================

  # all predictors not ignored by the user; 'X' are those never imputed
  allPredictors <- colnames(data)[!colnames(data) %in% ignore]
  X <- setdiff(allPredictors, vars2impute)

  return(list(
    dataNA = dataNA,
    allPredictors = allPredictors,
    vars2impute = vars2impute,
    X = X))
}
54 |
55 |
56 |
57 |
--------------------------------------------------------------------------------
/man/figures/flowchart_optimization.drawio:
--------------------------------------------------------------------------------
1 | 7Vlbd5s4EP41Pmf3ITkYbIwf7dhOs02z7Sa7SR9lkEFrgaiQY7u/viOQuMeXNNdz4gfDjKTR5ZtvmIGOdRZuzjmKgy/Mw7RjGt6mY006ptk1LAcuUrPNNHZvmCl8TjzVqVBck59Yj1TaFfFwUukoGKOCxFWly6IIu6KiQ5yzdbXbgtHqrDHycUNx7SLa1N4STwSZ1jEHhf4TJn6gZ+7aan8h0p3VTpIAeWxdUlnTjnXGGRPZXbg5w1Qenj6X24vtLb1c2ud/fUt+oH/Hn2+u/jvJjM2OGZJvgeNIPK1pKzN9j+hKnVfHtClMMl4wmEvCQhlPW+wfK7nTcce0sl9ZZfvyOkECwZg1gZMzjauRtgWLysxl3dRxiq3GiLNV5GG5zi40rwMi8HWMXNm6Bq8EXSBCqpqlJeVmXcBg7FOUSB8x4D4RnC1znGXvHDTZvCCUnuUbsnD6y4eVWizbGlpevtS9J68Qusdc4E3J7xQS55iFWPAtdNGttnKqbU1eFz7a1Y4XlPzTUjqkaOHnpgvo4Uahf4Qn9Fo84UGYjCok/2MhtgoTtBIMVIyLgPksQvSSsbgFOinjyBtJioM8p8xdZqoZkStP5wBJ9Xf2o0vRHNMxcpd+ulANZ8QiiTH2ICyovRSLmxbaMd4QcSdtnfaV9F1NI+8nG70mKWzzBcLZ35WF74UFKRbDUkmPO86vErbiLt5PY7nHnd6nvE07FscUCXJfjZRtnqUsfWUkDQq6C1ssEiyqUUf3UY6ez6Wn7tUcONubGrXLUL9myKkZEoj7WDQMgYOhbalbLDskLZt6aMFOJazCTWaxYFp+vo8nX38/+fb77/um5++Qr/uc5KuR6gA2DnaSr/ty5LOGtaeM+UjyWYOXIV/PeAXyDZ40ByJhvBLAQsiY02zoTWdAM2cymg7bMqChPbCQ/YwZkNV7xQzo0htQfjMx7/wR7qLg87317WdLBvSVJSLFE4jKoiZoAQvnq2Q/YFUIAL0ZCgmVG/+E6T0WxEUtsCJK/AgEF04e83ZsYUoS+SDZhXSTBnTYTgPu8WhqTO02uA3sGI4M4h5bzdP9HR8oD8e+8SzvyaB9EPr1wPNk6DcfwX/HAD7gIbmseTznmsIXNZ6/DUJ7fex4vTaEHXNu2c9J6Dqo1qsT2t6fVb3vnOlZSprj/KOcDO2KqntLE+VFxmmv6kUvly3Vk4/eo7OlboMJOrztyZcekdK0HrrzYEoTV/xfJy+URPhEc3IEXYzTYZbT1LIbbQYeNVGrJcmHkyR1cGmna8abnYYek2JdlB7J+mWTZAT8c/nijnF5j6jPODSGyYNJWD2mH7qoxaK5qOnl1fQGBBTK4B7Nk7hksDHz7HemOniWXJ3BdSCE8zzenKhlSSRJRARBNDVgPDXKbbs8H395+Y2V9Nyf/9GHQAMMNIrrn0cfQA2q2Q703iNMd+ev4H8fMB0L0+TyA6W3j9I/sw+U3j5K0yjBoaySnxarY5fezFsemBHUcaGrVUBQ3Ul9ezVq9rSsBsh2WRISF9GRej8REs9Lq43mC4tyLVMpapdYuEGphNX2VanFmcrvnvk9RO0rXF4BlEtWo6VkrX/EOKBkBbH4VJzl+cUHd2v6Cw==
--------------------------------------------------------------------------------
/man/figures/flowchart_base.drawio:
--------------------------------------------------------------------------------
1 | 7VpbV9s4EP41OWf3IRxfgkkeyRW2wGmBLrRviq3YWmTLlRWS9NfvyJYcO3ZuNKG0JQ9YMx6N5PlmPl0ODbsXzkccxcE18zBtWIY3b9j9hmWZht2Gh9QsMo3T6mQKnxNPGS0Vd+Q71j2Vdko8nJQMBWNUkLisdFkUYVeUdIhzNiubTRgtjxojH1cUdy6iVe0D8USQadvW2VJ/gYkf6JFNR31fiLSx+pIkQB6bFVT2oGH3OGMia4XzHqYyeDouD5eLB3r15Iz++ZR8Q5+7H+5v/m1mzob7dMk/geNIHNa1lbl+RnSq4qW+VSx0ADmbRh6WToyG3Q1ESKFpQvM/LMRCAY6mgoGKcREwn0WIXjEWK7sJi4QyM6WMI+9cAgvymDL3KVMNCaVqDJCUfRukRHD2lGMnHeRASGOKxph2kfvkpxPtMco4vIpYhKUrD5JBfctycoOltrtjbBUGCZtyF2+ws1WKI+7jTf6czE7Or5CnCrkRZiEWfAEGHFMkyHM5mZGqCT+3W+IODQX9Hmlg16SBQ4UCL63OLKyg/jaVCQ9xs7NfUeX48tlHAkGfGQHELOPmXPuCqWXuMrMNmSZRngVE4LsYpeGeATmVs6+YVRC4rk9Rkqic2JIyE0i1Xv5BNk5/ebfCG9uxO7a3d5I8Yy7wfCOs+q2juEWRa0eJsyVTmZp+ggJL2caREqH1x/MBnhPxKH2dnCrpixpGtvtzPScpLPIJQuwLnaT4pfhu2S2VdL8Dc4+zI/fopXkr+ais1Bm4MxcpTx8ZSclDm7DJJIGJlRYpbaMKIt826KFXMz0Lguq1wZFtrDhqrzjKolRxBJmIFgWzWBoklbrKg/TyUnMqpfaRYxLGUwFBZlG17gIWjqfJdmYscx0U2xCFhMo4XGD6jAVxUQ1/Ikr8CAQXchHz+jqDIUnkg+Qspfu0roE4Krw6mUws163jVc8ZO6fOK/JqLm8j1tUsORixtv94Yj0w2Wmi2M525o5sp3LHOGmVcsc+Gvsdikl0KArJdftbMUnP6neGvTomcdw2Hk9ekUlyhvhpTKLzuYD2pcQamANO6enW+23sq4ft/vmgU4ta+ntF1FrGK26sr7wzyu/71qN/jk0UfHi2P31vVkGrwASxjGXTnXK66HJgWslr2/Aqg1vCowQdOHODAlCarNWC4nPkEUBhhdaPBpC9skDXlJXm2CI+rWPhU7c+Z6fWuISSPutSEuGmnhmccA3jpJMdgVcOw9oNkFxU60kWXDNJK076Ma14XuvoFsc0Y015pk7yA3bmfsxXB4QwZGMeeB4wi2lCxtBbhSvZMiKo47VnfsgXsZaArJaWVQf5XiYZLDz0XC0zIfG8dINRXXdeWgycqZUTlqPO/ge13cvg1NxaBaZRQ1NHKwNz/U3QW6mDlZupqqNia7+Lq8vCnklfX0kehL9c3ggzLtuI+ozDy7CY+OWLrXU1+YNznkyqcx5c3QzuG3ItWzOX4REHhycK5YIUjZO4XPJ1AdmXksb5aaOppiUzg0REEESz6b4wa/b6ylH3+vU/rKDn/vgvyRRQ0sby+ffeAViBargBvV8RpsfRT8i/d5j2hal/9Y7S20fpdviO0ttHaRAlOEy34i/B6n1jvnZjbnVaJ6fbL37q9uZnB9ib39x+tT7fcCsaDTyBo/6FuL9uVm/5fokrhCJm8oCjQauAsSuOe4Bm19z7WKfa6AdPVCAu//kju8td/guNPfgf
--------------------------------------------------------------------------------
/R/extractMetrics.R:
--------------------------------------------------------------------------------
1 |
#' @title extractMetrics
#' @description extracts performance metrics from a cross-validation
#'              performance object into a one-row data.frame
#' @param hex data frame (H2OFrame) holding the imputation data
#' @param k integer. current iteration number
#' @param v character. name of the variable being imputed
#' @param perf performance object; metrics are read from its 'metrics' slot
#' @param family model family (currently unused in this function)
#' @return data.frame of error metrics.
#' @author E. F. Haghish
#' @keywords Internal
#' @noRd

extractMetrics <- function(hex, k, v, perf, family) {

  # helper: coerce a metric to numeric, substituting NA when it is absent
  # (replaces the repeated 'if (length(x) < 1) x <- NA' pattern)
  getMetric <- function(x) {
    x <- as.numeric(x)
    if (length(x) < 1) NA else x
  }

  RMSE <- getMetric(perf@metrics$RMSE)

  # NRMSE: RMSE normalized by the variance of the imputed variable.
  # bug fix: the original tested 'is.numeric(var)' -- at that point 'var'
  # was the base R function (never numeric), so NRMSE was always NA and
  # the variance branch was unreachable; test the column itself instead.
  y <- as.vector(hex[, v])
  if (is.numeric(y)) {
    NRMSE <- RMSE / var(y, na.rm = TRUE)
  } else {
    NRMSE <- NA
  }

  metrics <- data.frame(iteration = k,
                        variable = v,
                        NRMSE = NRMSE,
                        RMSE = RMSE,
                        MSE = getMetric(perf@metrics$MSE),
                        MAE = getMetric(perf@metrics$mae),
                        RMSLE = getMetric(perf@metrics$rmsle),
                        Mean_Residual_Deviance =
                          getMetric(perf@metrics$mean_residual_deviance),
                        R2 = getMetric(perf@metrics$r2),
                        #AIC = numeric()
                        logloss = getMetric(perf@metrics$logloss),
                        mean_per_class_error =
                          getMetric(perf@metrics$mean_per_class_error),
                        AUC = getMetric(perf@metrics$AUC),
                        pr_auc = getMetric(perf@metrics$pr_auc))

  return(metrics)
}
55 |
--------------------------------------------------------------------------------
/man/figures/flowchart.drawio:
--------------------------------------------------------------------------------
1 | 7VtdW+I4FP41PM/uhT5tUypc8um447gzo7vqZaChdA1Npw0C/vpNmqTfUMCCOsKFNqfpaXLOe855T9AG6M2WlwH0p9+IjXDD0OxlA/QbhtHWdfaTC1ZC0DSbQuAEri1EeiK4dV+QFGpSOndtFGYmUkIwdf2scEw8D41pRgaDgCyy0yYEZ9/qQwcVBLdjiIvSe9emUyFtGReJ/Atynal6s261xZ0ZVJPlTsIptMkiJQKDBugFhFBxNVv2EOa2U3a5v1rd4+sn6/KvH+Ev+E/3693Nv2dC2XCXR+ItBMij9ao2hOpniOfSXnKvdKUMGJC5ZyOuRGuA7pTOMLvU2eV/iNKVdDicU8JEJKBT4hAP4mtCfDlvQjwqp+l8jDy7wx3LxiNMxk9CNHQxlu9gIzm/xUYhDchT7DuuIHYEn4zhCOEuHD850UJ7BJOA3fKIh7gqm4FB7iVZ3CCRdre0rfRBSObBGG2YByTEYeCgTfosMY+vL4VT6blLRGaIBis2IUAYUvc5C2YoY8KJ5yV+ZxfS9TvAAJTAwMJUOi+KTmFWJv4154BndgPikxZZDv/dhxSyZxYu85ih3XSULrY0oU5M24A07uXF1KXo1oeRuRcsN2XRl0YVM1zXwTAMJSYqIDNhUOvFGwIo+sSPpe4AC7SBvTNInlFA0XKjW9VdS+YWmVzbcrhIMpWu0s80laWAdiAgmJ8+H6ClSx+4rvOmHD3K1/Dr/lKtiQ9W8QKZ7R/Sg8dEAx8mj0Uj9VzNucfaMveo0lyZfCQqFQK3zkVS03fiRslDTSGTScgWlilSao4MiJg2qFfnkS6MIJ/aoAhoOUWtnCJhpYIihkS4Sk3z+YSwZFNqwUb5e5IgFBqTkIztu3+UNqujtBroHzuOXxOl+iGjtPbo048XfaCdBXNce3aOvjVRUXf0mVr5gtetKz//KNFqFaL1e4DcmT+nzJ/EKxbYKZmN5mE1BcqSGhaNQzhzMd/aF4SfEXXHsIQoQew6HhuMGZxRUB6I7JWu57CRlYzuosBnDKFIoHS7iS7KCFTbugDQOiKBisdVDCoPyNoY1EWtVFohheVp1oJHvPp9c+m3gwIw3xmXbn16Ll0zv1WVsLrE6luWWAkd7dzMQAccrOTWVVOUKVLg+vlb1ZSe0W8Pe2WJxBq30GhywERiNHMkwXzrmqLwnPL2Va4qvI/0P2z1O4N2qdeizxG9BrZlAnXk/2v7Agd3fePB6SAdTr8+gx8vZ0WnFdzEbOnzy/E8wKtuwDItz2tV/so6N+OPjOuYsvE05SiVrGVBcQJou8wLubS+xvDb+mw9VTPWNBcpBwG96J+Y0tXuoLJTz89VoF932JV0zo/pexvb6FxJrpMdlDTgpW433wkXqOxOzb3b79wpmmlu137vwU7WWzjb8ZKQ/k70pNsZaAOrrNBpqKW1eOjaZD6K9peAv/6kmq96JjgiVyn1ffFs8p1TFa4y/Z2P6MSrKczhndk02+fNNyYxxcOrU42srpE11rqPXcIsLYbwa4tYs3ncIlZ2kHei7wmoSs7Xjkvfy87XxOGon3GTOlPFrofO1NI6bIp23hZHrblDV6WGsQCvVBNPWGdhlIG4Ht3wlxsVyZPfoqL01W4Hw7eYgcbQrlKcSvvbp2Tmvqih+PuLNcfFQjwKal7qZFJc6mX3W8UymFjYekv7y9dz2wfO6I8mgx2DkJb8/lMsc3s/aRyK/Od+C9oTEDuZ8eGyezLj683Yvz5Z8fVW/Dk8WfHgiXPghWjGe8hdk/iavTKxv/YLQ0YXaGNdr2WYaiwf4Pc5x2D9Nu7I7nrm2nbEmIvt9r4cJyCyuLEuvH3IPtoE2Ta6WdJGayV9V/5gpb7D4/V/kvkJ2M0QhvSDsJvB9c3gruZkuKsHThmg9gygtw6XAdgw+a8B0a4m/3oBBv8D
--------------------------------------------------------------------------------
/man/figures/procedure2.drawio:
--------------------------------------------------------------------------------
1 | 7Vpbd6M2EP41frQPAnF79DV7ybY5zZ62p28yyKAGkAMidvbXVwKBkZETN7GzThvvORsYSSNpvk8zo7EH1jTdXuVoHX+jIU4GphFuB9ZsYJrAAgb/IySPtcRz3FoQ5SSUnXaCW/IDS6EcF5UkxIXSkVGaMLJWhQHNMhwwRYbynG7UbiuaqLOuUYR7gtsAJX3pHyRksZQCw9g1fMIkiuXUni0blii4i3JaZnK+jGa4bklRo0Z2LWIU0k1HZM0H1jSnlNVP6XaKE2HWxmL1uMWB1nbJOc7YMQNCcPPZgOB2Yi/vhmHpumG8HFq1lgeUlNIUA9NJuL5JzCdwIvHUSJb5voRP1u1W7ZI9NkZleFspYmnCBYA/FmsUkCzib/bu7Ttdc8GQr96abGLC8C2XCxUbzjUuow84XyWV5WIShjgTQ1lO71qoLC5Z0YxJXgHxjhIS8Z6zgFsH51xQgYRDuZIVSZIpTWheLdVa2eKfVNOR1592wk6LU33abfM18u0eRAa0ePMjhGmKWf7Iu8gB0K5HyMPTMmazo2J7wuIOC+2GhUjSP2pV73jAHyQV9LRwAws5wPWAG9jByrKHXo8Sv6EspCmXLWiOC9YDWmye8MN0jZY4uaEFYYQK2y8pY3ycNWk6jCUoTECuECNGa6Es3UbCwYxItqLVEwlG4tzgcFoueYcJKeg4i6qJgd3H0YXAnNg9vKpTWYO7QClJhKE/4eQBi1Wp5Klo2COPyjezhb2HsYYJB2EHlq/gDj27hzswNbi754AdHOMKbnJM0nXJUIWw3gvUwvW+QNhYoY1zX9KmYVhU1h/zDsBbb+thsr1RhHjjfUmCO7HzJKI5YXFaBQoRGHgUEKalpZiExVhsNeNERNWmO2ueCgNzXsVCXghHXVbPKee20BwEZY4YrrSgTDRg8WeR8qDH33K8Trh7SgXsu/3Xm+s5x/XP943m3iGRR4EfrpDgna/rnJDGi0LdQTgB7R3V29l+n/Utw7usB83A19D+bzy9/vJ4ld9vcrD81fXKv278Iegxf4E0bq5xUnS1uuH5wLTORYT5tNh0sN3zUrOxby+8PXNXqNfPxsg2fMu1Dfmp2vQ+tBeXPHNpVXGpTTcMJfgZh8FvxDOS833VLnwj/P1JghwAqrczG+/X9XaOBnd4Am+nzX1MjcM7APkqwduxyDO5LXAWysdZkKCiIIEKtmrrvYSiBvSJBORwosHnbQ41sEcugDwLbf6363bJJWcEleZdvMJhLw9+Fr8OPrYuCZGyHCfcwz6o6nWYyRluKKkCQuMVoKnQw3Lckd/9QFVjQcs8wFJJN+vd0+uCf6eXoTzCrKe3IlhrlJdzzn+ec1wLv/bg572KSjTlvAt23UqVqOQR8mgizuy5N4M/zUlYnqHA5fVTYW1K5J3ASWhjg3NBTsKBruO5OidhAWjZ8D/vJCxXZQeE/sj0X+YYILCf1XVmZ9C4pg9yXQK5XPt05HJ9Dbm8TtRx3pRo/Sv9O6ry6HIhTf3niYJOty50qMDzTL3oBLENmnsJ8M8t8/Qzkfn1L/PvPXwvp7jD4fe8xTsp7th71x1oQzXvPC6vaSvQr8F++ePOg1+DSfa1yO7ZJ/wlzdyhe4RP+PxR6/nf13pe5/MgVE7BBRR7+jnXrTDq2Yo9njf25/vfUGi+amjzstcWgZ5G/bz3N+hfWpEHfqQ+r0x9Aj4MkUy0VPeJE9DEMy4qFWpm6pDkavJtIBalxLOgNd4uXlVfOS3tfggTyzkUNIw/ryaUihLz+aaYXZ9T+zwrcLpM8AWni8Acj9/Nd4G+YanpovGydPEs50PnQy8Wdh/Orcn8LWDXFFnPwATLdI/E3jxBytTDXndNuFjs21zrPR55U5Mbn63Wrb0SHlMm4viyj58AfFwLX5f/md6lXQv7v375HVdLLs56O2yj1RvfDk8Borfnv8541+Ovux9K1lXx3Q9Rrfk/
--------------------------------------------------------------------------------
/man/figures/procedure.drawio:
--------------------------------------------------------------------------------
1 | 7Vpbc6M2FP41frQHAeLy6Gv2km0zzU7b6ZsMslEDyAERJ/31lUBgLiLx7hrbuxNnJoEjcSSf79O5kZExj55vErQLvlAfhyNd859HxmKk68AAGv8jJC+FxLHsQrBNiC8nHQT35D8shfK5bUZ8nDYmMkpDRnZNoUfjGHusIUNJQvfNaRsaNlfdoS3uCO49FHalfxGfBVIKNO0w8AGTbSCXdqAcWCPvYZvQLJbrxTTGxUiESjVyahogn+5rImM5MuYJpay4ip7nOBRmLS1WPLfqGa22nOCYHfOAD+4+aia4n8H1w9jPbNsP1mOj0PKEwkyaYqRbIdc3C/gC1lZclZJ10pbwxerT8m/JXkqjMvycK2JRyAWAX6Y75JF4y+/g4e4r3XHBmO/emO0DwvA9lwsVe841LqNPONmEueUC4vs4Fo+yhD5UUBlcsqExk7wC4h6FZMtnLjxuHZxwQQ4S9uVONiQM5zSkSb5VYwPFj1RTkxefasHaiJV/qq/N98i/bi8yoMKbHyFMI8ySFz5FPmDC4gl5eIAjGbM/ULE6YUGNhdCUQiTpv61UH3jALyQV1LSwPQNZwHaA7UFvY8Cx06HEHyj2acRlK5rglHWAFl+e8MN0i9Y4vKMpYYQK268pY/w5Y1ZOmEpQmIC8QYwA7YSy6HkrHMyExBuaXxFvIs4N9ufZmk+YkZRO422+MIBdHG0T6DPYwSs/lQW4KxSRUBj6Aw6fsNhVkzw5DTvkafJNr2DvYKxgQi/swHAbuJsO7OAOdAXu9hCwg2NcwV2CSbTLGMoRVnuBQrgrBYjrfMyI9yA2HG5pQlgQ5f5d+HPuvIVFaMaEKMBihzHnD8r3WltqLuzC6RAIeSr8a5ZfR5ySQrPnZQliONeCYjGAxZ9VxGMVv0vwLuReJRJoHba9u7zv0lskllTl5PcJPviiGoNLL2eqiHoCWlpNbwTdLisrBtZZCU7hjf7F89tPLzfJ4z4B699tJ/vnzh2DDjNXSOGGSidCN5s7Hq/nRa4gzKfEpoZty4sspi5cOS1z56gX19oEaq5hQ01+8jG1j+vEDUdfG3ncqNIBrRGctH7wS/GCJPx7FS52L/zxSYIQAE1vpGvdKAQsBe6ngF2Zm+gKh9QD+SbEz1ORB3Jb4NiXlwsvRGlKvCbYTVu3An4B6CsJQn8iwNctDzWAExuYPEssf8NiXHLJmpiN4UM8wX4nT30Tvxo+UJUkSFmCQ+5Kn5rqVZjJFe4oEb6y8gqm3qCHYdkTt/4xmxpTmiUelkrqWWlLrw2+TS9DyRazjt6cYJVRvp9z8G3OiXR/13vgZIWC1uV07WggS4uUX/mb00EIBzqJytzg5GbpB6TfaXVj05lNYx6RNV1tAaVyY4rS6pVaqV5y9dVOb5RinBoMkViM5PdHxbJXaPqqhzxjHVWuVKPGzezLSDC/AF/YKz8apcmsx4wWRMgLmTWsi0qOrGrcKTSUI9rfNzNKRWI03BKL2yG1L+MUR8JFXG+FCfTp9FIV5uu++egEvxVfeAbXjLjdtE9VhAI4xKFRudOr5YJrLo3Z8hxcEAvfSxOgjJfO56MHKCG4DCHsn4kQjjN1l9124U/sHFxFEagC3zkB9uv/HhzzszeLP6fxI/uAP0WxrehGKrpSPOi9t6Wuvy11ypyuVNNqoSoKgjM3q7ql0p8433EqDD5Yz6qKRe2eVV+D40d7WRL6wZC1W1nKpbtR7tuBiGshuxS/jV2zBdXoBCoi/ZEtqgVcOgvzYu1DowwB5SvgM0YOJWBHZA5Dt3JMC7RI3DWK7jpdo0BjsFZOtyodwCz9gPTaamw0G4FjxYk/c2/nmNzjvbfzWm/nB6KDpmbLZXo5Xee/vP1t+bWD6vWUIhx0x1ldUymi9ZChvxRpBhSeEXxXn0IbqjRRBZi2e/j4Xpb8mmVJj3tqvZe4nrJE75D1ftB6pGqFnLkeGSj5/R4S2C3/dcYKRkkB6+3M72zv0y3TthxblawYwDSgOfrV
36cbLXaYpjvR3aaWY9+hm213o9A18Htz0PUv7+S6GLlseDpy2a6CXE4tDbNORDR+e/hn6GL64Z/NjeX/
--------------------------------------------------------------------------------
/man/figures/procedure3.drawio:
--------------------------------------------------------------------------------
1 | 7Vpbd5s4EP41frQPN3F5tB073TZtc5qc7p594yKMNoAIiNjJr9+REBgMTt3Ebp3dkHMcGAlJzPfNaGZgpM+TzWXuZtFnGuB4pCnBZqRfjDTNsgz45YLHSoA0pRKschJUInUruCFPWArrbiUJcNHpyCiNGcm6Qp+mKfZZR+bmOV13u4U07s6auSvcE9z4btyX/kkCFkmpqijbhg+YrCI5tY1kg+f6d6uclqmcL6UprloStx5Gdi0iN6DrlkhfjPR5TimrzpLNHMdcq7XGqvuWe1qbJec4ZYfcEKjXfyiGejND3t04KC0riLyxXo3y4MalVMVIM2MYbxbBBOaKn9USL9+VwGTtbuIp2WOtVIY3YiCWxCBQ4bTIXJ+kK7hC26tbmoFgDKvXZ+uIMHwDcj7EGqgGMvqA8zAWmotIEOCU38pyetdApYMkpCmTvFL5tRuTFfS88EE7OAeBAAkHciUhieM5jWkulqqHiP/JYVry6mgmbLWY4mgeuw2DRAaWDRpoiSQsl5gmmOWP0EW2Gqi6QxpPw5j1lop6bSlRi4WoZqEr6b9qht7yAE4kFYZpYfm6a6qWrVo+8kMdje0eJb65aUATkC1pjgvWA5o/KQFjunI9HF/TgjBCue49yhjcp8/qDlMJCuOQd4gRuRkfLNmsuH+ZkDSk4oz4E243OJiXHnSYkYJO05WYWEV9HC1D1Waoh5ewygrcpZuQmCv6A44fMF9VlzyChj3ydPmmHQd2VXc6uBs26uGuagO4W6eAXT3EFVznmCRZyVyB8LAXqITZroDruEMb876kdcO4ENqfQgfVzjbVbbK9Hog33pfEv4P/PUYqbryiOWERF5ICfsoCjJ1vI3zbgD2CK56WvCuLMO+UAk1doZLWE825+oF1UT1MUopzDzPOBL5+F7gMPgzApyStxnNT3hPzf8sE9ka4ynEWgxtLOD22eqqU0HOi2e/3odqOMUmTASMMCN76xJYl1d7WGDKYI5iH2fWKyNF61tFYQts61DoYeY15/IPnVx8fL/P7da56Xy27/PvaGas9C1m6A+6wdmY0DK8hbphXMQtX3yA2LWx3vNnF1EFLe0fdAvXqXJkgxdEtpMhDtA372t7+ZWueLvavJixROpuksh/8WnxBcniuytWvuRUexyuqXa+o1V6y7RXNAdyNI3jFwRhJG3CMeyAPY7yZ8ngUdIHTQJ5e+LFbFMTvgt3V9U7gUQH6TKCyPyCBeWujVtHEUg2IVutfVLVLLpkTo9P87L6Gg04I3cevhQ8aClakLMcx+NqHbuA9hJmc4Vp62dorGFqHHrppTZz2sWP8BS1zH8tB2tHxzriW+nPjwjawwqw3riBYo5SXuxrzjDhnGpZpW0Oc01VDR8Z/nnN6vc3UgZrlTDTnZTwzVPTDsY7HrUF/pp6TQ/vfk8tCxyOX5QyQy245MfOXEq2fSb6h4sLQ1jpQdnimjtAuR+yrK/ygTHGEeMrQduKp31tdcHqcWFx9Wdz28D2fmgLAb9vLN1JTQDvRs4GMbhjTw36wwtAUPl+Dvfd0Zxuf/Fn6qUjv2Qf8MUmtsXWAT/j2+4oMC9jYAEDo8gUPlhVCkuIxK1NRXAgpLwnQjJGEPFVFAVlc4OsXXcSD8htp2LT2qxfb553A1W0kpspy6uOgzHE9dwGOShQhZMUha1djJu9lhpf5R8PoWMxP1BnQ621kMPjvx2c3XKknqzPY9tRZ7BbRB6rhTQz32vrD86iftr5gOOdWXzDew6RXhkk+3OaCY85l7nEEmtjKWYVN9UwtklzOPo/4ojo7n98ob7uzibciHupvdnw5+zYN5a/LGaViczrZFBdXpxx9kRY48WJ8xqGlqk2nb+Z1laPo3dBSeVloeRL7GPKhZwu7Yywg0vwVsPOJb6QK3JLR0zBB16wDsdeOEDL1sB9KKc4W+ybWeosmrzkHvqG2T5U+HlJSAnzZyRNIZ08C+ZV//6OIvDChInWLsPvw2E4leSrniwTO4+3t
F9VhmUOK2EoqcZM0yjLJgakiYPwoM8b3xPCFEaBmvzAxNE+VGPY/0fi+xfl0+WGzX/3i/PAYINo7HuyE2R5cbr/mq2ro248l9cW/
--------------------------------------------------------------------------------
/man/figures/procedure4.drawio:
--------------------------------------------------------------------------------
1 | 7Vpbd5s4EP41frQPN3F5tB073TZtc5qc7p594yKMNoAIiNjJr9+REBgMTt3Ebp3dkHMcGAlJzPfNaGZgpM+TzWXuZtFnGuB4pCnBZqRfjDTNsgz45YLHSoA0pRKschJUInUruCFPWArrbiUJcNHpyCiNGcm6Qp+mKfZZR+bmOV13u4U07s6auSvcE9z4btyX/kkCFkmpqijbhg+YrCI5tY1kg+f6d6uclqmcL6UprloStx5Gdi0iN6DrlkhfjPR5TimrzpLNHMdcq7XGqvuWe1qbJec4ZYfcEKjXfyiGejND3t04KC0riLyxXo3y4MalVMVIM2MYbxbBBOaKn9USL9+VwGTtbuIp2WOtVIY3YiCWxCBQ4bTIXJ+kK7hC26tbmoFgDKvXZ+uIMHwDcj7EGqgGMvqA8zAWmotIEOCU38pyetdApYMkpCmTvFL5tRuTFfS88EE7OAeBAAkHciUhieM5jWkulqqHiP/JYVry6mgmbLWY4mgeuw2DRAaWDRpoiSQsl5gmmOWP0EW2Gqi6QxpPw5j1lop6bSlRi4WoZqEr6b9qht7yAE4kFYZpYfm6a6qWrVo+8kMdje0eJb65aUATkC1pjgvWA5o/KQFjunI9HF/TgjBCue49yhjcp8/qDlMJCuOQd4gRuRkfLNmsuH+ZkDSk4oz4E243OJiXHnSYkYJO05WYWEV9HC1D1Waoh5ewygrcpZuQmCv6A44fMF9VlzyChj3ydPmmHQd2VXc6uBs26uGuagO4W6eAXT3EFVznmCRZyVyB8LAXqITZroDruEMb876kdcO4ENqfQgfVzjbVbbK9Hog33pfEv4P/PUYqbryiOWERF5ICfsoCjJ1vI3zbgD2CK56WvCuLMO+UAk1doZLWE825+oF1UT1MUopzDzPOBL5+F7gMPgzApyStxnNT3hPzf8sE9ka4ynEWgxtLOD22eqqU0HOi2e/3odqOMUmTASMMCN76xJYl1d7WGDKYI5iH2fWKyNF61tFYQts61DoYeY15/IPnVx8fL/P7da56Xy27/PvaGas9C1m6A+6wdmY0DK8hbphXMQtX3yA2LWx3vNnF1EFLe0fdAvXqXJkgxdEtpMhDtA372t7+ZWueLvavJixROpuksh/8WnxBcniuytWvuRUexyuqXa+o1V6y7RXNAdyNI3jFwRhJG3CMeyAPY7yZ8ngUdIHTQJ5e+LFbFMTvgt3V9U7gUQH6TKCyPyCBeWujVtHEUg2IVutfVLVLLpkTo9P87L6Gg04I3cevhQ8aClakLMcx+NqHbuA9hJmc4Vp62dorGFqHHrppTZz2sWP8BS1zH8tB2tHxzriW+nPjwjawwqw3riBYo5SXuxrzjDhnGpZpW0Oc01VDR8Z/nnN6vc3UgZrlTDTnZTwzVPTDsY7HrUF/pp6TQ/vfk8tCxyOX5QyQy245MfOXEq2fSb6h4sLQ1jpQdnimjtAuR+yrK/ygTHGEeMrQduKp31tdcHqcWFx9Wdz28D2fmgLAb9vLN1JTQDvRs4GMbhjTw36wwtAUPl+Dvfd0Zxuf/Fn6qUjv2Qf8MUmtsXWAT/j2+4oMC9jYAEDo8gUPlhVCkuIxK1NRXAgpLwnQjJGEPFVFAVlc4OsXXcSD8htp2LT2qxfb553A1W0kpspy6uOgzHE9dwGOShQhZMUha1djJu9lhpf5R8PoWMxP1BnQ621kMPjvx2c3XKknqzPY9tRZ7BbRB6rhTQz32vrD86iftr5gOOdWXzDew6RXhkk+3OaCY85l7nEEmtjKWYVN9UwtklzOPo/4ojo7n98ob7uzibciHupvdnw5+zYN5a/LGaViczrZFBdXpxx9kRY48WJ8xqGlqk2nb+Z1laPo3dBSeVloeRL7GPKhZwu7Yywg0vwVsPOJb6QK3JLR0zBB16wDsdeOEDL1sB9KKc4W+ybWeosmrzkHvqG2T5U+HlJSAnzZyRNIZ08C+ZV//6OIvDChInWLsPvw2E4leSrniwTO4+3t
F9VhmUOK2EoqcZM0yjLJgakiYPwoM8b3xPCFEaBmvzAxNE+VGPY/0fi+xfl0+WGzX/3i/PAYINo7HuyE2R5cbr/mq2ro248l9cW/
--------------------------------------------------------------------------------
/man/figures/procedure5.drawio:
--------------------------------------------------------------------------------
1 | 7Vptc6M2EP41nmk/2GPxzkfbsZO7y10zyc20028CBFYDiAgcO/frKwmBkZGTXGInvjZkxoGVWIl9Hq12FwbmLNucU1gsv5IIpQNjHG0G5tnAMIDlmewflzzUEt9wa0FCcSQ7bQU3+AeSwrGUrnCESqVjRUha4UIVhiTPUVgpMkgpWavdYpKqoxYwQT3BTQjTvvRPHFVLKQXj8bbhAuFkKYf2bNkQwPA2oWSVy/FykqO6JYONGtm1XMKIrDsicz4wZ5SQqj7LNjOUcrM2FqvvW+xpbadMUV4954YIXH0aW+Bmage3w2jlutEyGErc7mG6kqYYGE7K9E2XbAAn4WeNJKC7EjZYt5t4yuqhMWqFNkJRlaVMANhpWcAQ5wm7srdX30nBBEM2e3O6XuIK3TA5V7FmXGMyco9onArLLXEUoZzfWlFy20JlMklM8kryCvBrmOKE9TwLmXUQZQIBEorkTGKcpjOSEiqmasY2/5NqOvL6aAfstDjiaB+7C4NEhk2bWaAjkrCcI5Khij6wLrLVsus75OJpGbPeUtFsVsqyw0K7YSGU9E9a1VsesBNJBT0t3NCEDnA94IZ2GJv20OtR4iuCOVPDl/2Az23Bfq8XPbz5A2O2pi5hgNIrUuIKEw5BQKqKZMxUTYeJxKbiyCv8WMKCK8s2CfczI5zHRJzhcMSXD4pmq4B1mOKSTPJEDAzsPpyuBYyp3YNNLM4a4wXMcMrtfYHSe8RnpXJIsLHHIZV2xmHQB6avwG95ffiBoYHfPQb64Dke4YoinBWrCgqE9c6gFha7Arboc4U2zt2Ke0Bh/GEprD9hHYBXbOrbZHuj6NukFE6fU5GiImW+IhLGYrZiSKRwVeIgRc0jlGIr4Z4aJTiXHUdMGcw4+fKgLFSHVk+w5+eK93dzxg7RJZ3ZAokw2rqtDssbh2jpyHwA6jqq43KA3WNuy9Iuc4Hlvp66/6DZ5eeHc3q3piD4w/VWf1/5Q9BjL+Mqe0bJ1B3oGn9D4viK7fCzOrrgVtRC1IF4x+GcTXx74e1YXYBfn49H9tg3XXssD9Gmd4e9ncYzAlPsNG0AMVa2s/F+DjTiM0zZc9XeeI3K6jDoA6A6LsO0+o7L0cBvHcBxaaMZQ+O79kAep2gz4ZEjswXKI3l6FqawLHGogq3aeidEqAF9JKTYHzqwcZu1DeyRCywWVza/dt0uueSMLKX50a0HRUqw28evg4+tCyukjKKULZt7NUTWYSZHuCKYzWTrHCxDoYfpuCO/e1iqxpKsaIikkm4cu6PXBT+nt4I0QVVPryBYa5SXexznhDjnWK7juTrOmcAybes/zzmz2W2aWMr1R4b/Mp5ZwH5S1+G4pfVn4JQc2v+eXK59OHK5voZcXseJOW9KtH7O9wuVAXRbq6ZA8EjG3y0c7KsAPFFQOEA8ZRk78dT71gH8Hifml9/m3/elTtxuokLXmG6buIl8PLD7uZwoJbQcqjW01FJGuV68zTjn06/v84D91fXX+ZSQsnqf6Zxdvs+487xEmUjfT7a4xJyM5y163uc0i0v2To5mWb4aLPc8jLbU1BbCX+Nhgh+3nvUlnOZfyvyuukCfs9wdus/Yea5fXG2SZD14tQkmUBSTAliKaz6vcUFRhMNKCNraE4l5AWrJ7y8LFOIYi/aMv0L5rfx9/7L4KDw9tlValkJrfeXJ0RD5aJWnfqg+CasV5Pd96rD3WBUoz5v4890XIZo3Gm10/9rK1OMkOG7lyfJPrfJkfQTQrwygQ3Ybc6q8RWSlB6CJNz6pgLoZ6SOi/oioPyLqjosAxmTyy7yu9cem4lRM/2UR9VFcjG4bOlnYfWvO6PkWsPOBb6QJ4KoiR2JC89HTk9gb9hGw12VSJ4t9G67+ikve8PvZhhZo71hZ83PqtQzf6uh5s78nbxYpj5oJh+LDj4BnwqRgM2NKokH74cYSwXvMIGTGShNCmTArRwNu1Kc3SceFgR/rNkkZATSC70vMZyJ++BxILvIz
mPOJxISKJL9EkIbiW5IVLUgp5pszJm4N2OobPZ2+f2T1PxGvA/UFiy6r18brwD6AQ9Vm9f2voeabAlGcsafk3DlWOt/ujW+czh8CRW/HWx4xOWeX2w9o65dh2w+Uzfm/
--------------------------------------------------------------------------------
/R/iterationNextVar.R:
--------------------------------------------------------------------------------
1 | #' @title get the next variable in the iteration to impute
2 | #' @description this function evaluates the saved mlim object and
3 | #' decides which loop and iteration to carry out next.
4 | #' @param m integer. number of imputation datasets
5 | #' @param m.it integer. current number of dataset that is being imputed
6 | #' @param k integer. current global loop iteration. each dataset goes up to
7 | #' 'maxiter' number of global iterations
8 | #' @param z integer. current number of imputation variable in the 'ITERATIONVARS'.
9 | #' this specifies the local iteration number.
10 | #' @param Y character. name of the variable that is to be imputed (or was lastly
11 | #' imputed, in case of loading mlim object)
12 | #' @param ITERATIONVARS character vector, specifying variable names to be imputed
13 | #' @param maxiter maximum number of global iterations for each dataset
14 | #' @author E. F. Haghish
15 | #' @keywords Internal
16 | #' @noRd
iterationNextVar <- function(m, m.it, k, z, Y, ITERATIONVARS, maxiter) {

  # Advance the imputation state machine by one step:
  #   - while variables remain in the current global iteration, move to the
  #     next variable
  #   - at the last variable, either start the next global iteration or,
  #     once 'maxiter' is reached, move on to the next MI dataset and reset
  #     the counters
  #   - when no datasets remain either, the imputation is complete

  if (z < length(ITERATIONVARS)) {
    # more variables left in the current global iteration
    z <- z + 1
  } else if (z == length(ITERATIONVARS)) {
    if (k == maxiter) {
      # this dataset has used up all its global iterations
      if (m.it >= m) stop("all datasets were already generated!")
      # start imputing the next MI dataset from the first variable
      m.it <- m.it + 1
      k <- 1
      z <- 1
    } else {
      # begin the next global iteration of the current dataset
      k <- k + 1
      z <- 1
    }
  }

  # the variable that should be imputed next
  Y <- ITERATIONVARS[z]

  return(list(m = m,
              m.it = m.it,
              k = k,
              z = z,
              Y = Y))
}
55 |
56 | # print(paste(object$m, object$m.it, object$k, object$z, object$Y, object$ITERATIONVARS, object$maxiter))
57 | # iterationNextVar(object$m, object$m.it, object$k, object$z, object$Y, object$ITERATIONVARS, object$maxiter)
58 |
--------------------------------------------------------------------------------
/R/init.R:
--------------------------------------------------------------------------------
1 | #' @title extractMetrics
2 | #' @description extracts performance metrics from cross-validation
3 | #' @author E. F. Haghish
4 | #' @return connection object
5 | #' @keywords Internal
6 | #' @noRd
7 |
init <- function(nthreads, min_mem_size, max_mem_size, ignore_config = TRUE,
                 java = NULL, report, debug) {

  # Starts (or attaches to) a local H2O cluster, retrying up to ten times
  # before giving up. Returns the H2O connection object.

  # optionally point the JVM to a user-supplied Java installation
  if (!is.null(java)) {
    Sys.setenv(JAVA_HOME = java)
  }

  # Run H2O on the statistics server
  # ============================================================
  connection <- NULL
  attempt <- 1
  repeat {
    # on failure the handler yields NULL and 'connection' stays empty
    connection <- tryCatch(
      h2o::h2o.init(nthreads = nthreads,
                    min_mem_size = min_mem_size,
                    max_mem_size = max_mem_size,
                    ignore_config = ignore_config,
                    insecure = TRUE,
                    https = FALSE,
                    # "FATA" is H2O's abbreviation for the FATAL log level
                    log_level = if (debug) "DEBUG" else "FATA",
                    bind_to_localhost = FALSE),
      error = function(cond) NULL)

    if (!is.null(connection)) break

    attempt <- attempt + 1
    message("The Java server could not be initiated. It will retry in 3 seconds...\n")
    Sys.sleep(3)

    # give up after ten failed attempts
    if (attempt > 10) stop("The attempt to start the H2O server was unsuccessful \ndue to an issue within your system...\n")
  }

  return(connection)
}
62 |
--------------------------------------------------------------------------------
/man/mlim.na.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/mlim.na.R
3 | \name{mlim.na}
4 | \alias{mlim.na}
5 | \title{add stratified/unstratified artificial missing observations}
6 | \usage{
7 | mlim.na(x, p = 0.1, stratify = FALSE, classes = NULL, seed = NULL)
8 | }
9 | \arguments{
10 | \item{x}{data.frame. x must be strictly a data.frame and any other
11 | data.table classes will be rejected}
12 |
13 | \item{p}{percentage of missingness to be added to the data}
14 |
\item{stratify}{logical. if TRUE, stratified sampling will be
                carried out when adding NA values to 'factor' variables
17 | (either ordered or unordered). this feature makes evaluation
18 | of missing data imputation algorithms more fair, especially
19 | when the factor levels are imbalanced.}
20 |
21 | \item{classes}{character vector, specifying the variable classes that should
22 | be selected for adding NA values. the default value is NULL,
23 | meaning all variables will receive NA values with probability of 'p'.
24 | however, if you wish to add NA values only to a specific classes, e.g.
25 | 'numeric' variables or 'ordered' factors, specify them in this argument.
26 | e.g. write "classes = c('numeric', 'ordered')" if you wish to add NAs
27 | only to numeric and ordered factors.}
28 |
29 | \item{seed}{integer. a random seed number for reproducing the result (recommended)}
30 | }
31 | \value{
32 | data.frame
33 | }
34 | \description{
35 | to examine the performance of imputation algorithms, artificial
36 | missing data are added to datasets and then imputed, to compare
37 | the original observations with the imputed values. this function
38 | can add stratified or unstratified artificial missing data. stratified
39 | missing data can be particularly useful if your categorical or ordinal
40 | variables are imbalanced, i.e., one category appears at a much higher
41 | rate than others.
42 | }
43 | \examples{
44 |
45 | \dontrun{
46 | # adding stratified NA to an atomic vector
47 | x <- as.factor(c(rep("M", 100), rep("F", 900)))
48 | table(mlim.na(x, p=.5, stratify = TRUE))
49 |
50 | # adding unstratified NAs to all variables of a data.frame
51 | data(iris)
52 | mlim.na(iris, p=0.5, stratify = FALSE, seed = 1)
53 |
54 | # or add stratified NAs only to factor variables, ignoring other variables
55 | mlim.na(iris, p=0.5, stratify = TRUE, classes = "factor", seed = 1)
56 |
57 | # or add NAs to numeric variables
58 | mlim.na(iris, p=0.5, classes = "numeric", seed = 1)
59 | }
60 | }
61 | \author{
62 | E. F. Haghish
63 | }
64 |
--------------------------------------------------------------------------------
/R/mlim.shuffle.R:
--------------------------------------------------------------------------------
1 |
2 | #' @title generate multiple imputation datasets from a single imputation
3 | #' @description mlim estimates Root Mean Squared Error (RMSE), which can provide
4 | #' a hint about the standard deviation of the error. if normal
5 | #' distribution of the error is assumed, stochastic imputed values
6 | #' can be generated, where each predicted value is considered the
7 | #' 'mean' of the normal distribution and the standard deviation is
8 | #' assumed to be equal to 'RMSE'.
9 | #' @param df original dataframe with NAs
10 | #' @param imputed single imputed dataset of the class 'mlim'. the dataset must have
#'                been imputed with the mlim R package to include the imputation metrics
12 | #' needed for the computation
13 | #' @param m integer, specifying number of multiple imputations. the default value is
14 | #' 1, carrying out a single imputation.
15 | #' @param report filename. if a filename is specified (e.g. report = "mlim.md"), the \code{"md.log"} R
16 | #' package is used to generate a Markdown progress report for the
17 | #' imputation. the format of the report is adopted based on the
18 | #' \code{'verbosity'} argument. the higher the verbosity, the more
19 | #' technical the report becomes. if verbosity equals "debug", then
20 | #' a log file is generated, which includes time stamp and shows
21 | #' the function that has generated the message. otherwise, a
22 | #' reduced markdown-like report is generated. default is NULL.
23 | #' @importFrom md.log md.log
24 | #' @importFrom stats rnorm
25 | #' @return list, including multiple imputed datasets
26 | #' @author E. F. Haghish
27 | #' @keywords Internal
28 | #' @noRd
29 |
30 |
mlim.shuffle <- function(df, imputed, m, report = NULL) {

  # Generates 'm' stochastic imputations from a single 'mlim' imputation.
  # Each imputed value is treated as the mean of a normal distribution whose
  # standard deviation equals the variable's best cross-validated RMSE.
  #
  # df       original data.frame containing the NA cells
  # imputed  a single imputation carrying the 'metrics' attribute written
  #          by mlim (one row per tuned model, with a 'variable' and an
  #          'RMSE' column)
  # m        number of stochastic datasets to generate
  # report   optional md.log report file name
  #
  # Returns a list of class 'mlim.mi' when m > 1, otherwise a single
  # data.frame of class c('mlim', 'data.frame').

  MI <- vector("list", m)
  if (!is.null(report)) md.log("Stochastic Shuffling", section="paragraph", trace=FALSE)

  # get the imputation metrics
  metrics <- attributes(imputed)$metrics
  vars2impute <- unique(metrics$variable)
  dfNA <- is.na(df)   # TRUE wherever the original data was missing

  for (i in seq_len(m)) {
    MI[[i]] <- imputed

    for (Y in vars2impute) {
      v.na <- dfNA[, Y]
      # best (smallest) cross-validated RMSE obtained for this variable
      RMSE <- min(metrics[metrics$variable == Y, "RMSE"], na.rm = TRUE)
      VEK <- imputed[which(v.na), Y]

      # add noise: draw N(mean = imputed value, sd = RMSE) per missing cell
      MI[[i]][which(v.na), Y] <- rnorm(
        n = length(VEK),
        mean = VEK,
        sd = RMSE)
    }
  }

  if (m > 1) {
    class(MI) <- "mlim.mi"
  }
  else {
    # FIX: for a single imputation return the data.frame itself; previously
    # a one-element *list* was tagged with class c("mlim", "data.frame"),
    # producing a malformed object
    MI <- MI[[1]]
    class(MI) <- c("mlim", "data.frame")
  }

  return(MI)
}
60 |
61 |
62 |
--------------------------------------------------------------------------------
/R/algoSelector.R:
--------------------------------------------------------------------------------
1 | #' @title selects algorithms for imputation and post imputation
2 | #' @description automatically sorts algorithms for imputation and post imputation
3 | #' @author E. F. Haghish
4 | #' @return integer
5 | #' @keywords Internal
6 | #' @noRd
7 |
algoSelector <- function(algos, postimpute) {
  # Sorts the requested algorithms into those used for the main imputation
  # and those reserved for the post-imputation optimization.
  #
  # algos       character vector; both mlim's shorthands ("ELNET", "RF",
  #             "DL", "XGB", "Ensemble") and the H2O names ("GLM", "DRF",
  #             "DeepLearning", "XGBoost", "StackedEnsemble") are accepted
  # postimpute  logical; if FALSE, all algorithms are used for imputation
  #
  # Returns list(impute = ..., postimpute = ...)

  impute <- NULL
  postimputealgos <- NULL

  supportedAlgos <- c("ELNET","RF","DL","GBM","XGB", "Ensemble")
  actualNames <- c("GLM","DRF","DeepLearning","GBM","XGBoost", "StackedEnsemble")

  # translate mlim's shorthand names to the H2O algorithm names
  for (i in supportedAlgos) {
    if (i %in% algos) algos[which(algos == i)] <- actualNames[which(supportedAlgos == i)]
  }

  if (length(setdiff(x=algos, y=c("ELNET","GLM",
                                  "RF","DRF",
                                  "GBM",
                                  "XGB","XGBoost",
                                  "DL","DeepLearning",
                                  "Ensemble","StackedEnsemble"))) > 0) stop("some of the 'algos' are not recognised")

  if (postimpute) {
    # FIX: the shorthands were already translated above, so every check
    # below must use the H2O names. The old code also tested the shorthands
    # ("ELNET", "RF", "XGB", "DL", "Ensemble") at this point, which could
    # never match and silently dropped XGBoost/DeepLearning/StackedEnsemble
    # from the schedule.

    # impute (by default both GLM and DRF are used for imputing)
    # ------------------------------------------------------------
    if ("GLM" %in% algos) impute <- c("GLM")
    if ("DRF" %in% algos) impute <- c(impute, "DRF")

    # postimpute (start with GBM, then XGB, then DL, then Ensemble)
    # ------------------------------------------------------------
    if ("GBM" %in% algos) {
      if (is.null(impute)) impute <- "GBM"
      else postimputealgos <- c(postimputealgos, "GBM")
    }
    if ("XGBoost" %in% algos) {
      if (Sys.info()["sysname"] == "Windows") {
        stop("XGB is not available in Windows")
      }
      if (is.null(impute)) impute <- "XGBoost"
      else postimputealgos <- c(postimputealgos, "XGBoost")
    }
    if ("DeepLearning" %in% algos) {
      if (is.null(impute)) impute <- "DeepLearning"
      else postimputealgos <- c(postimputealgos, "DeepLearning")
    }

    # if Ensemble is specified, include the 'impute' algorithms
    if ("StackedEnsemble" %in% algos) {
      if (is.null(impute)) stop("Ensemble is a meta learner and requires other algorithms. ")
      else postimputealgos <- c(impute, postimputealgos, "StackedEnsemble")
    }
  }

  # use all algorithms for imputation
  else {
    impute <- algos
  }

  return(list(impute=impute, postimpute=postimputealgos))
}
71 |
72 |
73 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: mlim
2 | Type: Package
3 | Title: Single and Multiple Imputation with Automated Machine Learning
4 | Version: 0.4.0
5 | Authors@R:
6 | person("E. F. Haghish",
7 | role = c("aut", "cre", "cph"),
8 | email = "haghish@hotmail.com")
9 | Depends:
10 | R (>= 3.5.0)
11 | Description: Machine learning algorithms have been used for performing
12 | single missing data imputation and most recently, multiple imputations.
13 | However, this is the first attempt for using automated machine learning algorithms
14 | for performing both
15 | single and multiple imputation. Automated machine learning is a procedure for
  fine-tuning the model automatically, performing a random search for a model that
17 | results in less error, without overfitting the data. The main idea is
18 | to allow the model to set its own parameters for imputing each variable separately
19 | instead of setting fixed predefined parameters to impute all variables
20 | of the dataset.
21 | Using automated machine learning, the package fine-tunes an Elastic
22 | Net (default) or Gradient Boosting, Random Forest, Deep Learning, Extreme Gradient Boosting,
23 | or Stacked Ensemble machine learning model (from one or a combination of other
24 | supported algorithms) for imputing the missing
25 | observations. This procedure has been implemented for the
26 | first time by this package and is expected to outperform other packages for
27 | imputing missing data that do not fine-tune their models. The multiple imputation
  is implemented via bootstrapping without letting the duplicated observations
29 | harm the cross-validation procedure, which is the way imputed variables are evaluated.
  Most notably, the package implements an automated procedure for imputing imbalanced
31 | data (class rarity problem), which happens when a factor variable has a level that is far more
32 | prevalent than the other(s). This is known to result in biased predictions, hence, biased
33 | imputation of missing data. However, the autobalancing procedure ensures that instead of
34 | focusing on maximizing accuracy (classification error) in imputing factor variables,
35 | a fairer procedure and imputation method is practiced.
36 | License: MIT + file LICENSE
37 | Encoding: UTF-8
38 | Imports:
39 | h2o (>= 3.34.0.0),
40 | curl (>= 4.3.2),
41 | mice,
42 | missRanger,
43 | memuse,
44 | md.log (>= 0.2.0)
45 | RoxygenNote: 7.3.1
46 | LazyData: true
47 | URL: https://github.com/haghish/mlim, https://www.sv.uio.no/psi/english/people/academic/haghish/
48 | BugReports: https://github.com/haghish/mlim/issues
49 |
--------------------------------------------------------------------------------
/R/bootstrapCV.R:
--------------------------------------------------------------------------------
#' @title assigns bootstrap data to cross-validation groups
2 | #' @description identifies duplicated rows in the bootstrap and creates
3 | #' a column in the bootstrap data assigning the cross-validation
4 | #' groups to avoid having duplicates in the training and testing
5 | #' @author E. F. Haghish
6 | #' @param index numeric. bootstrapped rows numbers from the original dataset
7 | #' @param cv numeric. number of Cross-Validation (CV)
8 | #' @return numeric data.frame including 'CV' amount of columns,
#'          where each column shows the row number of observations
10 | #' that belong to that CV group
11 | #' @keywords Internal
12 | #' @noRd
bootstrapCV <- function(index, cv = 10) {
  # Distributes bootstrapped row indices over 'cv' cross-validation folds so
  # that all duplicates of a row end up in the SAME fold (otherwise the same
  # observation could land in both the training and the testing set).
  #
  # index  integer/numeric vector of bootstrapped row numbers
  # cv     number of cross-validation folds
  #
  # Returns a data.frame with 'cv' columns, where each column holds the row
  # numbers of the observations belonging to that fold (NA-padded when the
  # folds are uneven).

  index <- sort(index)
  weight <- table(index)             # how many times each row was drawn
  rows <- as.numeric(names(weight))
  tab <- as.data.frame(cbind(rows, weight))
  tab <- tab[order(tab$weight, decreasing = TRUE), ]

  # fold container: one column per CV group
  # (explicit nrow avoids the recycling warning when length(index) is not
  # divisible by cv)
  mat <- as.data.frame(matrix(NA_real_, nrow = ceiling(length(index) / cv), ncol = cv))

  # distribute the values that have duplications first, snaking back and
  # forth over the folds to keep fold sizes balanced
  tab.dup <- tab[tab$weight > 1, ]
  tab.unq <- tab[tab$weight == 1, ]
  rows <- tab.unq$rows

  j <- 1
  reverse <- FALSE
  for (i in seq_len(nrow(tab.dup))) {   # FIX: 1:nrow() crashed when no duplicates exist
    # which cv group should the rows be assigned to (bounce at either end)
    if (j > cv) {
      reverse <- TRUE
      j <- cv
    }
    if (j < 1) {
      reverse <- FALSE
      j <- 1
    }
    val <- rep(tab.dup[i, 1], tab.dup[i, 2])   # all copies of this row
    place <- which(is.na(mat[, j]))
    replace <- place[1]:(place[1] + length(val) - 1)
    mat[replace, j] <- val

    if (!reverse) j <- j + 1
    else j <- j - 1
  }

  # Filling the remaining places with random rows
  # =============================================
  for (j in seq_len(cv)) {
    place <- which(is.na(mat[, j]))
    # FIX: a fold already filled by duplicates produced place[1] == NA and
    # crashed the NA:n sequence below
    if (length(place) == 0) next

    if (j < cv) {
      replace <- place[1]:nrow(mat)
      # NOTE(review): if 'rows' ever shrinks to a single value, sample()
      # would treat it as 1:rows — assumed not to occur with realistic
      # bootstrap samples; confirm for tiny datasets
      val <- sample(rows, size = length(replace), replace = FALSE)
      mat[replace, j] <- val
      rows <- rows[! rows %in% val]
    }
    # filling the last column, if it has any empty rows
    else if (length(rows) >= 1) {
      replace <- place[1]:(place[1] + length(rows) - 1)
      mat[replace, j] <- rows
    }
  }
  return(mat)
}
75 |
76 | # sampling_index <- sample.int(150, 145, replace=TRUE)
77 | # sampling_index <- sort(sampling_index, decreasing = TRUE)
78 | # View(bootstrapCV(sampling_index, cv = 5))
79 |
--------------------------------------------------------------------------------
/R/mlim.summarize.R:
--------------------------------------------------------------------------------
1 |
2 | #' @title mlim imputation summary
3 | #' @description provides information about estimated accuracy of the imputation as well
4 | #' as the overall procedure of the imputation.
5 | #' @param data dataset imputed with mlim
6 | # @param method character. the default is NULL, returning RMSE. the supported
#          arguments are "scaled" and "normalize".
#' @return estimated imputation accuracy via cross-validation procedure
9 | #' @examples
10 | #' \dontrun{
11 | #' data(iris)
12 | #'
13 | #' # add 10% stratified missing values to one factor variable
14 | #' irisNA <- iris
15 | #' irisNA$Species <- mlim.na(irisNA$Species, p = 0.1, stratify = TRUE, seed = 2022)
16 | #'
17 | #' # run the ELNET single imputation (fastest imputation via 'mlim')
18 | #' MLIM <- mlim(irisNA)
19 | #'
20 | #' # in single imputation, you can estimate the imputation accuracy via cross validation RMSE
21 | #' mlim.summarize(MLIM)
22 | #' }
23 | #' @author E. F. Haghish
24 | #' @export
25 |
mlim.summarize <- function(data #, method = NULL
                           ) {

  # Summarizes the estimated imputation accuracy of a 'mlim' (single
  # imputation) or 'mlim.mi' (multiple imputation) object, based on the
  # cross-validated RMSE stored in the object's 'metrics' attribute.
  #
  # data  dataset (or list of datasets) imputed with mlim
  #
  # Returns a data.frame with one row per imputed variable; for 'mlim.mi'
  # objects, one RMSE column per imputed dataset. NULL for any other input.

  if (inherits(data, "mlim")) {
    att <- attributes(data)
    metrics <- att$metrics
    # only summarize variables that exist in the data AND were imputed
    VARS <- colnames(data)[colnames(data) %in% metrics$variable]

    results <- data.frame(variable = character(), rmse = numeric())
    for (i in VARS) {
      index <- metrics$variable == i
      # best (smallest) cross-validated RMSE among all tuned models.
      # NOTE(review): factor variables are summarized via RMSE as well,
      # matching the previous behavior (both branches computed RMSE)
      err <- round(min(metrics[index, "RMSE"], na.rm = TRUE), 6)
      results <- rbind(results, data.frame(variable = i, rmse = err))
    }
    rownames(results) <- NULL
    return(results)
  }
  else if (inherits(data, "mlim.mi")) {
    results <- mlim.summarize(data[[1]])
    # append the RMSE column of every additional imputed dataset.
    # FIX: guard the loop — 2:length(data) evaluated to c(2, 1) for a
    # single-dataset mlim.mi and indexed out of bounds
    if (length(data) > 1) {
      for (i in 2:length(data)) {
        results <- cbind(results, mlim.summarize(data[[i]])[, 2])
      }
    }
    # FIX: first column name used to be misspelled as "varable"
    colnames(results) <- c("variable", paste0("rmse_", seq_along(data)))
    return(results)
  }

}
74 |
75 | #print(mlim.summarize(default))
76 |
77 |
78 |
79 |
--------------------------------------------------------------------------------
/R/mlim.postimpute.R:
--------------------------------------------------------------------------------
1 | # THIS FUNCTION IS STILL UNDER DEVELOPMENT AND WILL BE RELEASED IN VERSION 0.2
2 |
3 |
4 | #' @title carries out postimputation to minimize imputation error
5 | #' @description this is a function for carrying out postimputation on datasets
6 | #' that are already imputed, either by mlim or other software.
7 | #' @param data data.frame with missing values
8 | #' @param preimputed.data data.frame. if you have used another software for missing
9 | #' data imputation, you can still optimize the imputation
10 | #' by handing the data.frame to this argument, which will
11 | #' bypass the "preimpute" procedure.
12 | #' @param algos character vector of algorithms to be used for imputation. the default
13 | #' is 'c("ELNET", "GBM", "RF", "DL", "Ensemble")'.
14 | #' @param ... arguments to be passed to mlim
15 | #' @return imputed data.frame
16 | #' @examples
17 | #' \dontrun{
18 | #' data(iris)
19 | #'
20 | #' # add 10% stratified missing values to one factor variable
21 | #' irisNA <- iris
22 | #' irisNA$Species <- mlim.na(irisNA$Species, p = 0.1, stratify = TRUE, seed = 2022)
23 | #'
24 | #' # run the default imputation
25 | #' MLIM <- mlim(irisNA)
26 | #' mlim.error(MLIM, irisNA, iris)
27 | #'
28 | #' # carry out postimputation. for real-life applications, you should increase
29 | #' # the 'tuning_time' to at least 3600 seconds, even for small datasets
30 | #' post <- mlim.postimpute(data=irisNA, preimputed.data=MLIM, tuning_time=120,
31 | #' algos = c("GBM", "RF", "ELNET", "Ensemble"),
32 | #' seed = 2022)
33 | #' }
34 | #'
35 | #' @author E. F. Haghish
36 | #' @keywords Internal
37 | #' @noRd
38 |
39 |
40 | # mlim prints the data number. when postimpute is used, the data number
41 | # must be passed to mlim
42 | # mlim.postimpute should also allow saving and loading. this requires adding
43 | # other elements in the saving object
mlim.postimpute <- function(data, preimputed.data,
                            algos = c("ELNET", "GBM", "RF", "DL", "Ensemble"),
                            ... ) {
  # Runs mlim's optimization on data that was already imputed (by mlim or
  # other software), bypassing the preimputation step.
  #
  # data             data.frame with missing values
  # preimputed.data  an imputed dataset ('mlim') or a list of imputed
  #                  datasets ('mlim.mi')
  # algos            algorithms used for the post-imputation optimization
  # ...              further arguments passed to mlim()
  #
  # Returns an object of the same class as 'preimputed.data' (NULL when the
  # input is neither 'mlim' nor 'mlim.mi').

  results <- NULL

  if (inherits(preimputed.data, "mlim.mi")) {
    results <- list()
    # optimize every imputed dataset separately, keeping the H2O server
    # alive between runs (shutdown = FALSE)
    # FIX: seq_along() instead of 1:length(), which yields c(1, 0) on an
    # empty list
    for (i in seq_along(preimputed.data)) {
      results[[i]] <- mlim(data = data,
                           preimputed.data = preimputed.data[[i]],
                           algos = algos,
                           postimpute = FALSE,
                           shutdown = FALSE,
                           ...)

    }
    class(results) <- "mlim.mi"
  }

  # if the preimputation was done with mlim, extract the metrics
  else if (inherits(preimputed.data, "mlim")) {
    results <- mlim(data = data,
                    preimputed.data = preimputed.data,
                    algos = algos,
                    postimpute = FALSE,
                    ...)
  }

  return(results)
}
74 |
75 |
76 |
77 |
--------------------------------------------------------------------------------
/R/mlim.preimpute.R:
--------------------------------------------------------------------------------
1 |
2 | #' @title carries out preimputation
3 | #' @description instead of replacing missing data with mean and mode, a smarter
4 | #' start-point would be to use fast imputation algorithms and then
5 | #' optimize the imputed dataset with mlim. this procedure usually
#'              requires fewer iterations and will save a lot of computation
7 | #' resources.
8 | # @importFrom VIM kNN
9 | #' @importFrom missRanger missRanger imputeUnivariate
10 | # @importFrom missForest missForest
11 | #' @param data data.frame with missing values
12 | #' @param preimpute character. specify the algorithm for preimputation. the
13 | #' supported options are "RF" (Random Forest), "mm"
14 | #' (mean-mode replacement), and "random" (random sampling from available data).
15 | #' the default is "RF", which carries
16 | #' a parallel random forest imputation, using all the CPUs available.
17 | #' the other alternative is "mm" which performs mean/mode
18 | #' imputation.
19 | #' @param seed integer. specify the random generator seed
20 | #' @return imputed data.frame
21 | #' @author E. F. Haghish
22 | #' @examples
23 | #' \dontrun{
24 | #' data(iris)
25 | #'
26 | #' # add 10% stratified missing values to one factor variable
27 | #' irisNA <- iris
28 | #' irisNA$Species <- mlim.na(irisNA$Species, p = 0.1, stratify = TRUE, seed = 2022)
29 | #'
30 | #' # run the default random forest preimputation
31 | #' MLIM <- mlim.preimpute(irisNA)
32 | #' mlim.error(MLIM, irisNA, iris)
33 |
34 | #' }
35 | #' @export
36 |
mlim.preimpute <- function(data, preimpute = "RF", seed = NULL) {

  # Carries out a fast preimputation so that mlim's optimization starts from
  # a reasonable starting point instead of raw mean/mode replacement.
  #
  # data       data.frame with missing values
  # preimpute  "RF" (parallel random forest via missRanger), "mm" (mean/mode
  #            replacement), or "random"/"sample" (random draws from the
  #            observed values of each variable). case-insensitive.
  # seed       integer seed for reproducibility
  #
  # Returns the imputed data.frame.

  method <- tolower(preimpute)

  if (method == "rf") {
    message("\nPreimputation: Random Forest")
    pb <- txtProgressBar(0, 1, style = 3)
    data <- missRanger::missRanger(data, num.trees=500, mtry=1,
                                   verbose = 0, returnOOB=TRUE, seed = seed)
    setTxtProgressBar(pb, 1)
  }
  else if (method == "mm") {
    message("\nPreimputation: Mean/Mode")
    pb <- txtProgressBar(0, 1, style = 3)
    data <- meanmode(data)
    setTxtProgressBar(pb, 1)
  }
  # FIX: the documentation advertises "random", but only "sample" used to
  # be accepted; both spellings are now recognized
  else if (method %in% c("sample", "random")) {
    message("\nPreimputation: Random Sampling")
    if (!is.null(seed)) set.seed(seed)
    # replace the NAs of a vector with random draws from its observed values
    rsample <- function(x) replace(x, is.na(x), sample(x[!is.na(x)],sum(is.na(x))))
    pb <- txtProgressBar(0, 1, style = 3)
    for (i in colnames(data)) {
      if (sum(is.na(data[,i])) > 0) data[,i] <- rsample(data[,i])
    }
    setTxtProgressBar(pb, 1)
  }
  else stop(paste(preimpute, "is not a recognized preimputation argument"))

  return(data)
}
74 |
75 |
76 |
77 |
--------------------------------------------------------------------------------
/man/mlim.error.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/mlim.error.R
3 | \name{mlim.error}
4 | \alias{mlim.error}
5 | \title{imputation error}
6 | \usage{
7 | mlim.error(
8 | imputed,
9 | incomplete,
10 | complete,
11 | transform = NULL,
12 | varwise = FALSE,
13 | ignore.missclass = TRUE,
14 | ignore.rank = FALSE
15 | )
16 | }
17 | \arguments{
18 | \item{imputed}{the imputed dataframe}
19 |
20 | \item{incomplete}{the dataframe with missing values}
21 |
22 | \item{complete}{the original dataframe with no missing values}
23 |
24 | \item{transform}{character. it can be either "standardize", which standardizes the
25 | numeric variables before evaluating the imputation error, or
26 | "normalize", which change the scale of continuous variables to
27 | range from 0 to 1. the default is NULL.}
28 |
29 | \item{varwise}{logical, default is FALSE. if TRUE, in addition to
30 | mean accuracy for each variable type, the algorithm's
                performance for each variable (column) of the dataset is
                also returned. if TRUE, instead of a numeric vector, a
                list is returned.}
34 |
35 | \item{ignore.missclass}{logical. the default is TRUE. if FALSE, the overall
36 | missclassification rate for imputed unordered factors will be
37 | returned. in general, missclassification is not recommended,
38 | particularly for multinomial factors because it is not robust
39 | to imbalanced data. in other words, an imputation might show
40 | a very high accuracy, because it is biased towards the majority
41 | class, ignoring the minority levels. to avoid this error,
42 | Mean Per Class Error (MPCE) is returned, which is the average
43 | missclassification of each class and thus, it is a fairer
44 | criteria for evaluating multinomial classes.}
45 |
46 | \item{ignore.rank}{logical (default is FALSE, which is recommended). if TRUE,
47 | the accuracy of imputation of ordered factors (ordinal variables)
48 | will be evaluated based on 'missclassification rate' instead of
49 | normalized euclidean distance. this practice is not recommended
50 | because higher classification rate for ordinal variables does not
51 | guarantee lower distances between the imputed levels, despite the
52 | popularity of evaluating ordinal variables based on missclassification
53 | rate. in other words, assume an ordinal variable has 5 levels (1. strongly
54 | disagree, 2. disagree, 3. uncertain, 4. agree, 5.strongly agree). in this
55 | example, if "ignore.rank = TRUE", then an imputation that imputes level
56 | "5" as "4" is equally inaccurate as other algorithm that imputes level "5"
57 | as "1". therefore, if you have ordinal variables in your dataset, make sure
58 | you declare them as "ordered" factors to get the best imputation accuracy.}
59 | }
60 | \value{
61 | numeric vector
62 | }
63 | \description{
64 | calculates NRMSE, missclassification rate, and miss-ranking
65 | absolute mean distance, scaled between 0 to 1, where 1 means
66 | maximum distance between the actual rank of a level and the
67 | imputed level.
68 | }
69 | \examples{
70 |
71 | \dontrun{
72 | data(iris)
73 |
74 | # add 10\% missing values, ensure missingness is stratified for factors
75 | irisNA <- mlim.na(iris, p = 0.1, stratify = TRUE, seed = 2022)
76 |
77 | # run the default imputation
78 | MLIM <- mlim(irisNA)
79 | mlim.error(MLIM, irisNA, iris)
80 |
81 | # get error estimations for each variable
82 | mlim.error(MLIM, irisNA, iris, varwise = TRUE)
83 | }
84 | }
85 | \author{
86 | E. F. Haghish
87 | }
88 |
--------------------------------------------------------------------------------
/R/mlim.na.R:
--------------------------------------------------------------------------------
1 | #' @title add stratified/unstratified artificial missing observations
2 | #' @description to examine the performance of imputation algorithms, artificial
3 | #' missing data are added to datasets and then imputed, to compare
4 | #' the original observations with the imputed values. this function
5 | #' can add stratified or unstratified artificial missing data. stratified
6 | #' missing data can be particularly useful if your categorical or ordinal
7 | #' variables are imbalanced, i.e., one category appears at a much higher
8 | #' rate than others.
9 | #' @param x data.frame. x must be strictly a data.frame and any other
10 | #' data.table classes will be rejected
11 | #' @param p percentage of missingness to be added to the data
#' @param stratify logical. if TRUE, stratified sampling will be
#'                  carried out when adding NA values to 'factor' variables
14 | #' (either ordered or unordered). this feature makes evaluation
15 | #' of missing data imputation algorithms more fair, especially
16 | #' when the factor levels are imbalanced.
17 | #' @param classes character vector, specifying the variable classes that should
18 | #' be selected for adding NA values. the default value is NULL,
19 | #' meaning all variables will receive NA values with probability of 'p'.
20 | #' however, if you wish to add NA values only to a specific classes, e.g.
21 | #' 'numeric' variables or 'ordered' factors, specify them in this argument.
22 | #' e.g. write "classes = c('numeric', 'ordered')" if you wish to add NAs
23 | #' only to numeric and ordered factors.
24 | #' @param seed integer. a random seed number for reproducing the result (recommended)
25 | #' @author E. F. Haghish
26 | #' @examples
27 | #'
28 | #' \dontrun{
29 | #' # adding stratified NA to an atomic vector
30 | #' x <- as.factor(c(rep("M", 100), rep("F", 900)))
31 | #' table(mlim.na(x, p=.5, stratify = TRUE))
32 | #'
33 | #' # adding unstratified NAs to all variables of a data.frame
34 | #' data(iris)
35 | #' mlim.na(iris, p=0.5, stratify = FALSE, seed = 1)
36 | #'
37 | #' # or add stratified NAs only to factor variables, ignoring other variables
38 | #' mlim.na(iris, p=0.5, stratify = TRUE, classes = "factor", seed = 1)
39 | #'
40 | #' # or add NAs to numeric variables
41 | #' mlim.na(iris, p=0.5, classes = "numeric", seed = 1)
42 | #' }
43 | #' @return data.frame
44 | #' @export
45 |
46 | mlim.na <- function(x, p = 0.1, stratify=FALSE, classes=NULL, seed = NULL) {
47 |
48 | # Syntax processing
49 | # ------------------------------------------------------------
50 | stopifnot(
51 | "'p' should be between 0 and 1" = p >= 0 & p <= 1,
52 | "'x' type is not recognized" = is.atomic(x) || is.data.frame(x),
53 | "clas of 'x' must be strictly data.frame" = ! class(x) %in% c("tbl", "tbl_df")
54 | )
55 |
56 | # set the seed for reproducibility
57 | if (!is.null(seed)) set.seed(seed)
58 |
59 | # if 'x' is not a dataframe:
60 | if (is.atomic(x)) {
61 | x <- addNA(x, p, stratify = stratify)
62 | return(x)
63 | }
64 | else {
65 | for (i in colnames(x)) {
66 | if (is.null(classes)) x[, i] <- addNA(x[, i], p, stratify = stratify)
67 | else {
68 | # force drop in case 'x' is not a data.frame
69 | if (class(x[, i, drop=TRUE])[1] %in% classes) x[, i] <- addNA(x[, i], p, stratify = stratify)
70 | }
71 | }
72 | }
73 |
74 | return(x)
75 | }
76 |
77 |
78 |
--------------------------------------------------------------------------------
/R/syntaxProcessing.R:
--------------------------------------------------------------------------------
1 |
#' @title syntaxProcessing
#' @description validates mlim's arguments and prepares runtime settings:
#'              Java server RAM limits, whether cross-validation predictions
#'              must be kept (stacked ensembles), and the logging level
#' @importFrom memuse Sys.meminfo
#' @importFrom tools file_ext
#' @author E. F. Haghish
#' @keywords Internal
#' @noRd


syntaxProcessing <- function(data, preimpute, impute, ram,
                             matching, miniter, maxiter, max_models,
                             tuning_time, cv,
                             verbosity, report, save) {

  # argument validation
  # ------------------------------------------------------------
  stopifnot(
    "'cv' must be a positive integer equal or more than 10" = cv >= 10,
    "'save' argument must have '*.mlim' file extension" = tools::file_ext(save) == "mlim"
  )

  # RAM limits for the Java server
  # ------------------------------------------------------------
  min_ram <- NULL
  max_ram <- NULL
  if (!is.null(ram)) {
    if (!is.numeric(ram)) stop("'ram' must be an integer, specifying amount of RAM in Gigabytes")
    min_ram <- paste0(ram - 1, "G")
    max_ram <- paste0(ram, "G")

    # warn when more than 2/3 of the total RAM is dedicated to the Java server
    totalGB <- round(as.numeric(memuse::Sys.meminfo()$totalram) * 9.31 * 1e-10)
    if (ram > 0.666 * totalGB) {
      message("NOTE: you have dedicated more than 2/3 of your total RAM to mlim.\n This is fine as long as you do not use 'XGB' algorithm.\n You are advised to monitor your RAM during the imputation...\n\n")
    }
  }

  # stacked ensembles require the cross-validation predictions to be kept
  keep_cross_validation_predictions <- "StackEnsemble" %in% impute

  # logging level and debugging
  # ------------------------------------------------------------
  debug <- !is.null(verbosity) && verbosity == "debug"
  if (is.null(verbosity)) {
    verbose <- 0
  }
  else {
    # unrecognized verbosity values leave 'verbose' as NULL, as before
    verbose <- switch(verbosity, warn = 1, info = 2, debug = 3)
  }

  return(list(min_ram = min_ram,
              max_ram = max_ram,
              keep_cross_validation_predictions = keep_cross_validation_predictions,
              verbose = verbose,
              debug = debug))
}
96 |
--------------------------------------------------------------------------------
/R/stoppingCriteria.R:
--------------------------------------------------------------------------------
#' @title iteration stopping criteria
#' @description evaluates the stopping criteria of the imputation loop
#' @param method character. either "varwise_NA" (keep iterating while any
#'        variable still has a non-NA error estimate) or "iteration_RMSE"
#'        (stop when the average error stops improving)
#' @param miniter minimum number of iterations (currently unused here)
#' @param maxiter maximum number of iterations
#' @param metrics data.frame of errors estimated from CV, with an
#'        'iteration' column and one column per error metric
#' @param k iteration round
#' @param vars2impute character vector of variables being imputed
#' @param error_metric character. stopping metric for the iteration, e.g. "RMSE"
#' @param tolerance numeric. minimum relative improvement required to keep iterating
#' @param postimpute postimputation algorithm(s), or NULL
#' @param runpostimpute logical. whether the postimputation phase should run next
#' @param md.log path of the markdown log (currently unused here)
#' @return list with elements 'running' (logical: if TRUE, the imputation goes
#'         on to the next iteration), 'error', 'vars2impute', 'improvement',
#'         and 'runpostimpute'
#' @author E. F. Haghish
#' @keywords Internal
#' @noRd

stoppingCriteria <- function(method = "iteration_RMSE",
                             miniter, maxiter,
                             metrics, k, vars2impute,
                             error_metric,
                             tolerance,
                             postimpute, runpostimpute,
                             md.log) {

  # keep running unless a criterion below decides otherwise
  running <- TRUE
  error <- NA
  errImprovement <- NA
  percentImprove <- NA

  # ............................................................
  # as long as there is a variable that has been improving,
  # keep iterating. however, if "double.check = FALSE", ignore
  # variables that do not improve in any iteration throughout
  # the rest of the iterations
  # ............................................................
  if (method == "varwise_NA") {
    # as long as there is a variable whose error is not NA, keep going!
    error <- mean(metrics[metrics$iteration == k,
                          error_metric], na.rm = TRUE)

    if (is.na(error)) {
      # all variables are saturated (every error is NA)
      if (is.null(postimpute)) {
        # no postimputation requested: stop
        running <- FALSE
      }
      else if (!runpostimpute) {
        # trigger the postimputation phase exactly once
        runpostimpute <- TRUE
        vars2impute <- NULL # avoid the loops on the base imputer
      }
      else {
        # postimputation has already run: stop
        running <- FALSE
        runpostimpute <- FALSE
      }
    }
  }

  # ............................................................
  # Decide the stopping criteria based on average improvement of
  # RMSE per iteration
  # ............................................................
  if (method == "iteration_RMSE") {
    error <- mean(metrics[metrics$iteration == k,
                          error_metric], na.rm = TRUE)

    # NOTE: message() has no 'sep' argument; the previous 'sep = ""'
    # was silently pasted into the message text as an empty string
    if (k == 1) message("\n ", error_metric,
                        " = ", round(error, 4), "\n")

    if (k >= 2) {
      # ignore variables whose error became NA because of saturation
      available <- !is.na(metrics[metrics$iteration == k, error_metric])
      errPrevious <- mean(metrics[metrics$iteration == k - 1 & available,
                                  error_metric],
                          na.rm = TRUE)

      # negative improvement means the error decreased (got better)
      errImprovement <- error - errPrevious
      if (!is.na(errImprovement)) {
        percentImprove <- errImprovement / errPrevious
        if (percentImprove < 0) {
          message("\n ", error_metric,
                  " = ", round(error, 4), " (improved by ",
                  round(-percentImprove * 100, 3), "%)", "\n")
        }
        else {
          message("\n ", error_metric,
                  " = ", round(error, 4), " (increased by ",
                  round(percentImprove * 100, 3), "%)", "\n")
        }

        # keep iterating only while the relative improvement exceeds tolerance
        running <- percentImprove < (-tolerance)
      }
    }
  }

  # if maximum iteration has been reached and still is running...
  # ------------------------------------------------------------
  if (k == maxiter && running) {
    warning("the imputation could be further improved by increasing number of iterations")
  }

  # maximum iteration has been reached
  # ------------------------------------------------------------
  if (k == maxiter) running <- FALSE

  if (!running) runpostimpute <- FALSE

  return(list(running = running,
              error = error,
              vars2impute = vars2impute,
              improvement = errImprovement,
              runpostimpute = runpostimpute))
}
125 |
--------------------------------------------------------------------------------
/man/manifest.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/manifest.R
3 | \docType{data}
4 | \name{manifest}
5 | \alias{manifest}
6 | \title{Manifest Anxiety Scale}
7 | \format{
8 | A data frame with 4469 rows and 52 variables:
9 | \describe{
10 | \item{gender}{participants' gender}
11 | \item{age}{participants' age in years}
12 | \item{Q1}{I do not tire quickly.}
13 | \item{Q2}{I am troubled by attacks of nausea.}
14 | \item{Q3}{I believe I am no more nervous than most others.}
15 | \item{Q4}{I have very few headaches.}
16 | \item{Q5}{I work under a great deal of tension.}
17 | \item{Q6}{I cannot keep my mind on one thing.}
18 | \item{Q7}{I worry over money and business.}
19 | \item{Q8}{I frequently notice my hand shakes when I try to do something.}
20 | \item{Q9}{I blush no more often than others.}
21 | \item{Q10}{I have diarrhea once a month or more.}
22 | \item{Q11}{I worry quite a bit over possible misfortunes.}
23 | \item{Q12}{I practically never blush.}
24 | \item{Q13}{I am often afraid that I am going to blush.}
25 | \item{Q14}{I have nightmares every few nights.}
26 | \item{Q15}{My hands and feet are usually warm.}
27 | \item{Q16}{I sweat very easily even on cool days.}
28 | \item{Q17}{Sometimes when embarrassed, I break out in a sweat.}
29 | \item{Q18}{I hardly ever notice my heart pounding and I am seldom short of breath.}
30 | \item{Q19}{I feel hungry almost all the time.}
31 | \item{Q20}{I am very seldom troubled by constipation.}
32 | \item{Q21}{I have a great deal of stomach trouble.}
33 | \item{Q22}{I have had periods in which I lost sleep over worry.}
34 | \item{Q23}{My sleep is fitful and disturbed.}
35 | \item{Q24}{I dream frequently about things that are best kept to myself.}
36 | \item{Q25}{I am easily embarrassed.}
37 | \item{Q26}{I am more sensitive than most other people.}
38 | \item{Q27}{I frequently find myself worrying about something.}
39 | \item{Q28}{I wish I could be as happy as others seem to be.}
40 | \item{Q29}{I am usually calm and not easily upset.}
41 | \item{Q30}{I cry easily.}
42 | \item{Q31}{I feel anxiety about something or someone almost all the time.}
43 | \item{Q32}{I am happy most of the time.}
44 | \item{Q33}{It makes me nervous to have to wait.}
45 | \item{Q34}{I have periods of such great restlessness that I cannot sit long I a chair.}
46 | \item{Q35}{Sometimes I become so excited that I find it hard to get to sleep.}
47 | \item{Q36}{I have sometimes felt that difficulties were piling up so high that I could not overcome them.}
48 | \item{Q37}{I must admit that I have at times been worried beyond reason over something that really did not matter.}
49 | \item{Q38}{I have very few fears compared to my friends.}
50 | \item{Q39}{I have been afraid of things or people that I know could not hurt me.}
51 | \item{Q40}{I certainly feel useless at times.}
52 | \item{Q41}{I find it hard to keep my mind on a task or job.}
53 | \item{Q42}{I am usually self-conscious.}
54 | \item{Q43}{I am inclined to take things hard.}
55 | \item{Q44}{I am a high-strung person.}
56 | \item{Q45}{Life is a trial for me much of the time.}
57 | \item{Q46}{At times I think I am no good at all.}
58 | \item{Q47}{I am certainly lacking in self-confidence.}
59 | \item{Q48}{I sometimes feel that I am about to go to pieces.}
60 | \item{Q49}{I shrink from facing crisis of difficulty.}
61 | \item{Q50}{I am entirely self-confident.}
62 | }
63 | }
64 | \source{
65 | \url{https://openpsychometrics.org/tests/TMAS/}
66 | }
67 | \usage{
68 | manifest
69 | }
70 | \description{
71 | The Taylor Manifest Anxiety Scale was first developed in 1953 to identify
72 | individuals who would be good subjects for studies of stress and other
73 | related psychological phenomenon. Since then it has been used as a measure of
74 | anxiety as general personality trait. Anxiety is a complex psychological
75 | construct that includes a multiple of different facets related to extensive
76 | worrying that may impair normal functioning. The test has been widely studied
77 | and used in research, however there are some concerns that it does not measure
78 | a single trait, but instead, measures a basket of loosely related ones and so
79 | the score is not that meaningful.
80 | }
81 | \details{
82 | The data comes from an online offering of the Taylor Manifest Anxiety Scale.
83 | At the end of the test users were asked if their answers were accurate and
84 | could be used for research, 76% said yes and have been published on
85 | \url{https://openpsychometrics.org/}.
86 |
87 | items 1 to 50 were rated 1=True and 2=False. gender, chosen from
88 | a drop down menu (1=male, 2=female, 3=other) and age was
89 | entered as a free response (ages<14 have been removed)
90 | }
91 | \references{
92 | Taylor, J. (1953). "A personality scale of manifest anxiety".
93 | The Journal of Abnormal and Social Psychology, 48(2), 285-290.
94 | }
95 | \keyword{datasets}
96 |
--------------------------------------------------------------------------------
/R/checkNconvert.R:
--------------------------------------------------------------------------------
1 |
#' @title check data.frame features and convert variable types
#' @description extracts the feature class and model family of each variable
#'              to be imputed, converting incompatible variable types
#'              (ordered factors and integers) to numeric representations
#'              where needed
#' @param data data.frame
#' @param vars2impute character vector. names of the variables that will be imputed
#' @param ignore character vector of variables to ignore (currently unused here)
#' @param ignore.rank (RECOMMENDED) model ordinal variable as gaussian, but force
#'        the imputed values to be integers and are reverted later. if FALSE, the
#'        feature will be imputed as a multinomial factor.
#' @param report optional report object (currently unused here)
#' @return list with the class, family, and column names of 'vars2impute'
#'         (in that order), the converted data, the stored factor levels
#'         ('mem', for reverting ordered factors), and the indices of ordered
#'         columns
#' @examples
#' \dontrun{
#' data(iris)
#'
#' # add an integer variable to iris
#' iris$int <- as.integer(round(iris$Sepal.Length + iris$Sepal.Width))
#'
#' # add an ordered variable to iris
#' iris$order <- factor(iris$Species, order = TRUE,
#'                      levels = c("setosa","versicolor","virginica"))
#'
#' # add a binary variable to iris
#' iris$binary <- iris$Species
#' iris$binary[iris$binary=="versicolor"] <- "setosa"
#'
#' #print(checkNconvert(iris))
#' }
#' @author E. F. Haghish
#' @keywords Internal
#' @noRd

checkNconvert <- function(data, vars2impute, ignore,
                          ignore.rank=FALSE, report=NULL) {

  # convert an ordered factor to numeric: use the numeric level labels when
  # ALL levels parse as numbers, otherwise fall back to the level indices.
  # BUGFIX: the previous check 'is.numeric(as.character(...))' was always
  # FALSE (as.character() never returns a numeric), so the label branch was
  # dead code and level indices were always used.
  ord2numeric <- function(f) {
    lv <- suppressWarnings(as.numeric(levels(f)))
    if (length(lv) > 0 && !anyNA(lv)) lv[f]
    else as.numeric(f)
  }

  mem <- NULL
  orderedIndex <- 0

  ncl <- ncol(data)
  features <- character(ncl)
  family <- character(ncl)
  classes <- lapply(data, class)
  COLNAMES <- colnames(data)

  # get the vartype of the variables that should be imputed and
  # convert incompatible variable types
  # ============================================================
  j <- 0

  for (i in COLNAMES) {
    j <- j + 1

    # first evaluate the factors and numeric variables
    if ("factor" %in% classes[[i]]) {
      features[j] <- 'factor'
      # cardinality is computed on the observed (non-NA) values only
      cardinality <- length(unique(data[!is.na(data[, i]), i]))
      if (cardinality <= 2) family[j] <- 'binomial'
      else family[j] <- 'multinomial'
    }
    else if ("numeric" %in% classes[[i]]) {
      features[j] <- 'numeric'
      family[j] <- 'gaussian'
    }

    # then search for ordinal & integer variables
    # (an ordered factor also matched the 'factor' branch above; the
    # settings below override it)
    if ("ordered" %in% classes[[i]]) {
      features[j] <- 'ordered'
      cardinality <- length(unique(data[!is.na(data[, i]), i]))
      if (cardinality <= 2 && ignore.rank) {
        family[j] <- 'binomial'
        data[,i] <- factor(data[,i], ordered = FALSE)
      }
      else if (cardinality <= 2 && !ignore.rank) {
        # remember the factor levels so imputed values can be reverted later
        orderedIndex <- orderedIndex + 1
        mem[[orderedIndex]] <- factmem(data[, i, drop = FALSE])
        family[j] <- 'quasibinomial'
        data[,i] <- ord2numeric(data[,i])
      }
      else if (cardinality > 2 && ignore.rank) {
        family[j] <- 'multinomial'
        data[,i] <- factor(data[,i], ordered = FALSE)
      }
      else if (cardinality > 2 && !ignore.rank) {
        orderedIndex <- orderedIndex + 1
        mem[[orderedIndex]] <- factmem(data[, i, drop = FALSE])
        family[j] <- 'gaussian_integer'
        data[,i] <- ord2numeric(data[,i])
      }
    }
    else if ("integer" %in% classes[[i]]) {
      features[j] <- 'integer'
      family[j] <- 'gaussian_integer'
      data[,i] <- as.numeric(data[,i])
    }
    else if ("character" %in% classes[[i]]) {
      msg <- paste(i, "variable is of class 'character', which",
                   "is not supported. either convert it to a 'factor'",
                   "or give it to the 'ignore' argument.")
      stop(msg)
    }
  }

  # only return the class and family of the vars2impute, if not ignored
  # ============================================================
  # now match the location of the variables according to vars2impute order
  index <- match(vars2impute, COLNAMES)
  return(list(class = features[index],
              family = family[index],
              COLNAMES = COLNAMES[index],
              data = data,
              mem = mem,
              orderedCols = which(features == "ordered")))
}
127 |
128 |
129 | #iim <- selectVariables(data=irisWithNA, ignore = c("Sepal.Length","Sepal.Width" ,"Petal.Length","Petal.Width"))
130 | #get <- checkNconvert(data=DATA1, vars2impute=vars2impute, ignore = NULL)
131 | #print(get$family)
132 | #print(get$class)
133 | #data(cars)
134 | #cars$dist[c(1,4,7,13,16,22,26,29,35,44,45)] <- NA
135 | #a <- mlim:::selectVariables(data=cars)
136 | #b <- checkNconvert(data=cars, vars2impute=a$vars2impute, ignore = NULL)
137 |
--------------------------------------------------------------------------------
/R/manifest.R:
--------------------------------------------------------------------------------
1 | #' Manifest Anxiety Scale
2 | #'
3 | #' The Taylor Manifest Anxiety Scale was first developed in 1953 to identify
4 | #' individuals who would be good subjects for studies of stress and other
5 | #' related psychological phenomenon. Since then it has been used as a measure of
6 | #' anxiety as general personality trait. Anxiety is a complex psychological
7 | #' construct that includes a multiple of different facets related to extensive
8 | #' worrying that may impair normal functioning. The test has been widely studied
9 | #' and used in research, however there are some concerns that it does not measure
10 | #' a single trait, but instead, measures a basket of loosely related ones and so
11 | #' the score is not that meaningful.
12 | #'
13 | #' The data comes from an online offering of the Taylor Manifest Anxiety Scale.
14 | #' At the end of the test users were asked if their answers were accurate and
15 | #' could be used for research, 76% said yes and have been published on
16 | #' \url{https://openpsychometrics.org/}.
17 | #'
18 | #' items 1 to 50 were rated 1=True and 2=False. gender, chosen from
19 | #' a drop down menu (1=male, 2=female, 3=other) and age was
20 | #' entered as a free response (ages<14 have been removed)
21 | #'
22 | #' @format A data frame with 4469 rows and 52 variables:
23 | #' \describe{
24 | #' \item{gender}{participants' gender}
25 | #' \item{age}{participants' age in years}
26 | #' \item{Q1}{I do not tire quickly.}
27 | #' \item{Q2}{I am troubled by attacks of nausea.}
28 | #' \item{Q3}{I believe I am no more nervous than most others.}
29 | #' \item{Q4}{I have very few headaches.}
30 | #' \item{Q5}{I work under a great deal of tension.}
31 | #' \item{Q6}{I cannot keep my mind on one thing.}
32 | #' \item{Q7}{I worry over money and business.}
33 | #' \item{Q8}{I frequently notice my hand shakes when I try to do something.}
34 | #' \item{Q9}{I blush no more often than others.}
35 | #' \item{Q10}{I have diarrhea once a month or more.}
36 | #' \item{Q11}{I worry quite a bit over possible misfortunes.}
37 | #' \item{Q12}{I practically never blush.}
38 | #' \item{Q13}{I am often afraid that I am going to blush.}
39 | #' \item{Q14}{I have nightmares every few nights.}
40 | #' \item{Q15}{My hands and feet are usually warm.}
41 | #' \item{Q16}{I sweat very easily even on cool days.}
42 | #' \item{Q17}{Sometimes when embarrassed, I break out in a sweat.}
43 | #' \item{Q18}{I hardly ever notice my heart pounding and I am seldom short of breath.}
44 | #' \item{Q19}{I feel hungry almost all the time.}
45 | #' \item{Q20}{I am very seldom troubled by constipation.}
46 | #' \item{Q21}{I have a great deal of stomach trouble.}
47 | #' \item{Q22}{I have had periods in which I lost sleep over worry.}
48 | #' \item{Q23}{My sleep is fitful and disturbed.}
49 | #' \item{Q24}{I dream frequently about things that are best kept to myself.}
50 | #' \item{Q25}{I am easily embarrassed.}
51 | #' \item{Q26}{I am more sensitive than most other people.}
52 | #' \item{Q27}{I frequently find myself worrying about something.}
53 | #' \item{Q28}{I wish I could be as happy as others seem to be.}
54 | #' \item{Q29}{I am usually calm and not easily upset.}
55 | #' \item{Q30}{I cry easily.}
56 | #' \item{Q31}{I feel anxiety about something or someone almost all the time.}
57 | #' \item{Q32}{I am happy most of the time.}
58 | #' \item{Q33}{It makes me nervous to have to wait.}
59 | #' \item{Q34}{I have periods of such great restlessness that I cannot sit long I a chair.}
60 | #' \item{Q35}{Sometimes I become so excited that I find it hard to get to sleep.}
61 | #' \item{Q36}{I have sometimes felt that difficulties were piling up so high that I could not overcome them.}
62 | #' \item{Q37}{I must admit that I have at times been worried beyond reason over something that really did not matter.}
63 | #' \item{Q38}{I have very few fears compared to my friends.}
64 | #' \item{Q39}{I have been afraid of things or people that I know could not hurt me.}
65 | #' \item{Q40}{I certainly feel useless at times.}
66 | #' \item{Q41}{I find it hard to keep my mind on a task or job.}
67 | #' \item{Q42}{I am usually self-conscious.}
68 | #' \item{Q43}{I am inclined to take things hard.}
69 | #' \item{Q44}{I am a high-strung person.}
70 | #' \item{Q45}{Life is a trial for me much of the time.}
71 | #' \item{Q46}{At times I think I am no good at all.}
72 | #' \item{Q47}{I am certainly lacking in self-confidence.}
73 | #' \item{Q48}{I sometimes feel that I am about to go to pieces.}
74 | #' \item{Q49}{I shrink from facing crisis of difficulty.}
75 | #' \item{Q50}{I am entirely self-confident.}
76 | #' }
77 | #' @source \url{https://openpsychometrics.org/tests/TMAS/}
78 | #' @references Taylor, J. (1953). "A personality scale of manifest anxiety".
79 | #' The Journal of Abnormal and Social Psychology, 48(2), 285-290.
80 | "manifest"
81 |
82 | # TMA <- read_csv("TMA.csv", col_types = cols(score = col_skip()))
83 | # TMA <- as.data.frame(TMA)
84 | # head(TMA$gender)
85 | # TMA$gender[TMA$gender == 0] <- NA
86 | # TMA$gender <- factor(TMA$gender,
87 | # levels = c(1,2,3),
88 | # labels = c("Male","Female","Other"))
89 | #
90 | # # code the missing responses
91 | # for (i in 1:50) {
92 | # index <- TMA[, paste0("Q",i)] == 0
93 | # TMA[index, paste0("Q",i)] <- NA
94 | # TMA[, paste0("Q",i)] <-factor(TMA[, paste0("Q",i)],
95 | # levels = c(1,2),
96 | # labels = c("TRUE","FALSE"))
97 | # }
98 | # manifest <- na.omit(manifest)
99 |
100 |
101 |
--------------------------------------------------------------------------------
/R/mlim.error.R:
--------------------------------------------------------------------------------
1 | #' @title imputation error
2 | #' @description calculates NRMSE, missclassification rate, and miss-ranking
3 | #' absolute mean distance, scaled between 0 to 1, where 1 means
4 | #' maximum distance between the actual rank of a level and the
5 | #' imputed level.
6 | #' @param imputed the imputed dataframe
7 | #' @param incomplete the dataframe with missing values
8 | #' @param complete the original dataframe with no missing values
9 | #' @param transform character. it can be either "standardize", which standardizes the
10 | #' numeric variables before evaluating the imputation error, or
11 | #' "normalize", which change the scale of continuous variables to
12 | #' range from 0 to 1. the default is NULL.
13 | #' @param varwise logical, default is FALSE. if TRUE, in addition to
14 | #' mean accuracy for each variable type, the algorithm's
15 | #' performance for each variable (column) of the datast is
16 | #' also returned. if TRUE, instead of a numeric vector, a
17 | #' list is retuned.
18 | #' @param ignore.missclass logical. the default is TRUE. if FALSE, the overall
19 | #' missclassification rate for imputed unordered factors will be
20 | #' returned. in general, missclassification is not recommended,
21 | #' particularly for multinomial factors because it is not robust
22 | #' to imbalanced data. in other words, an imputation might show
23 | #' a very high accuracy, because it is biased towards the majority
24 | #' class, ignoring the minority levels. to avoid this error,
25 | #' Mean Per Class Error (MPCE) is returned, which is the average
26 | #' missclassification of each class and thus, it is a fairer
27 | #' criteria for evaluating multinomial classes.
28 | #' @param ignore.rank logical (default is FALSE, which is recommended). if TRUE,
29 | #' the accuracy of imputation of ordered factors (ordinal variables)
30 | #' will be evaluated based on 'missclassification rate' instead of
31 | #' normalized euclidean distance. this practice is not recommended
32 | #' because higher classification rate for ordinal variables does not
33 | #' guarantee lower distances between the imputed levels, despite the
34 | #' popularity of evaluating ordinal variables based on missclassification
35 | #' rate. in other words, assume an ordinal variable has 5 levels (1. strongly
36 | #' disagree, 2. disagree, 3. uncertain, 4. agree, 5.strongly agree). in this
37 | #' example, if "ignore.rank = TRUE", then an imputation that imputes level
38 | #' "5" as "4" is equally inaccurate as other algorithm that imputes level "5"
39 | #' as "1". therefore, if you have ordinal variables in your dataset, make sure
40 | #' you declare them as "ordered" factors to get the best imputation accuracy.
41 |
42 | #' @author E. F. Haghish
43 | #' @examples
44 | #'
45 | #' \dontrun{
46 | #' data(iris)
47 | #'
48 | #' # add 10% missing values, ensure missingness is stratified for factors
49 | #' irisNA <- mlim.na(iris, p = 0.1, stratify = TRUE, seed = 2022)
50 | #'
51 | #' # run the default imputation
52 | #' MLIM <- mlim(irisNA)
53 | #' mlim.error(MLIM, irisNA, iris)
54 | #'
55 | #' # get error estimations for each variable
56 | #' mlim.error(MLIM, irisNA, iris, varwise = TRUE)
57 | #' }
58 | #' @return numeric vector
59 | #' @export
mlim.error <- function(imputed, incomplete, complete, transform = NULL,
                       varwise = FALSE, ignore.missclass = TRUE,
                       ignore.rank=FALSE) {

  # per-type error estimates; NA until computed below
  rankerror <- NA
  classerror <- NA
  meanclasserr <- NA
  nrmse <- NA
  err <- NULL

  # single imputed dataset: a 'mlim' object or a plain data.frame
  if ("mlim" %in% class(imputed) | "data.frame" %in% class(imputed) ) {
    # make sure the complete dataset is complete!
    if (length(which(colSums(is.na(complete)) > 0)) > 0)
      stop("'complete dataset has missing values")

    # get the variables with missing data, ignoring the rest
    naCols <- which(colSums(is.na(incomplete)) > 0)
    imputed <- imputed[, naCols, drop = FALSE]
    incomplete <- incomplete[, naCols, drop = FALSE]
    complete <- complete[, naCols, drop = FALSE]

    # the first class of each column determines how its error is measured;
    # integers are evaluated the same way as numeric variables
    classes <- lapply(complete, class)
    types <- character(length(classes))
    for (i in 1:length(classes)) types[i] <- classes[[i]][1]
    if ("integer" %in% types) types[which(types == "integer")] <- "numeric"

    n <- nrow(imputed)

    # preallocate the named error vector; when ranks are ignored, ordered
    # factors are evaluated like unordered factors and 'missrank' is dropped
    if (!ignore.rank) {
      err <- rep(NA, 4)
      names(err) <- c('nrmse', 'mpce', 'missclass', 'missrank')
    }
    else {
      err <- rep(NA, 3)
      names(err) <- c('nrmse', 'mpce', 'missclass')
      if ("ordered" %in% types) types[which(types == "ordered")] <- "factor"
    }

    # NOTE: iterating over 'types' revisits duplicated type values; each
    # extra pass recomputes the same columns, so the result is unchanged
    for (t in types){
      ind <- which(types == t)

      if (t == "numeric") {
        if (is.null(transform)) {
          nrmse <- nrmse(imputed[,ind, drop = FALSE],
                         incomplete[,ind, drop = FALSE],
                         complete[,ind, drop = FALSE])
          if (!is.null(nrmse)) err[1] <- mean(nrmse, na.rm = TRUE)
        }
        else {
          # optionally rescale the numeric variables before computing NRMSE
          if (transform == "standardize") {
            v1 <- scale(imputed[,ind, drop = FALSE])
            v2 <- scale(incomplete[,ind, drop = FALSE])
            v3 <- scale(complete[,ind, drop = FALSE])
          }
          else if (transform == "normalize") {
            v1 <- normalize(imputed[,ind, drop = FALSE])
            v2 <- normalize(incomplete[,ind, drop = FALSE])
            v3 <- normalize(complete[,ind, drop = FALSE])
          }
          else {
            stop(paste(transform, "is not a recognized transformation"))
          }
          nrmse <- nrmse(v1, v2, v3)
          if (!is.null(nrmse)) err[1] <- mean(nrmse, na.rm = TRUE)
        }

      }
      else if (t == 'ordered' & !ignore.rank) {
        # scaled mean absolute rank distance for ordinal variables
        rankerror <- missrank(imputed[,ind, drop = FALSE],
                              incomplete[,ind, drop = FALSE],
                              complete[,ind, drop = FALSE])
        if (!is.null(rankerror)) err[4] <- mean(rankerror, na.rm = TRUE)
      }

      # ??? this does not mean that it is necessarily a factor variable. IMPROVE IT
      else {
        # mean per class error: fairer than missclassification rate for
        # imbalanced factor levels
        meanclasserr <- mean_per_class_error(imputed[,ind, drop = FALSE],
                                             incomplete[,ind, drop = FALSE],
                                             complete[,ind, drop = FALSE])
        if (!is.null(meanclasserr)) err[2] <- mean(meanclasserr, na.rm = TRUE)

        if (!ignore.missclass) {
          classerror <- missclass(imputed[,ind, drop = FALSE],
                                  incomplete[,ind, drop = FALSE],
                                  complete[,ind, drop = FALSE])
          if (!is.null(classerror)) err[3] <- mean(classerror, na.rm = TRUE)
        }
      }
    }

    if (varwise) {
      # collect variable-wise errors; scalar placeholders are appended
      # only when they hold a valid (computed) value
      vwa <- NULL
      if (length(nrmse) > 1) vwa <- c(vwa, nrmse)
      else if (is.valid(nrmse)) vwa <- c(vwa, nrmse)
      if (ignore.missclass) {
        if (length(meanclasserr) > 1) vwa <- c(vwa, meanclasserr)
        else if (is.valid(meanclasserr)) vwa <- c(vwa, meanclasserr)
      }
      else {
        if (length(classerror) > 1) vwa <- c(vwa, classerror)
        else if (is.valid(classerror)) vwa <- c(vwa, classerror)
      }
      if (length(rankerror) > 1) vwa <- c(vwa, rankerror)
      else if (is.valid(rankerror)) vwa <- c(vwa, rankerror)
      return(list(error = err[!is.na(err)],
                  nrmse = nrmse,
                  missclass = classerror,
                  missrank = rankerror,
                  all = vwa))
    }
    else return(err[!is.na(err)])
  }

  else if ("mlim.mi" %in% class(imputed) |
           "list" %in% class(imputed) |
           "mids" %in% class(imputed)) {
    # multiple imputation: evaluate each imputed dataset separately and
    # stack the results row-wise
    mat <- NULL
    for (i in 1:length(imputed)) {
      tmp <- imputed[[i]]
      class(tmp) <- c("mlim", "data.frame")
      mat <- rbind(mat, mlim.error(tmp, incomplete, complete,
                                   varwise = varwise,
                                   transform = transform,
                                   ignore.rank=ignore.rank))
    }
    return(mat)
  }
  else stop("'imputed' must be of class 'data.frame', 'list', 'mlim', 'mlim.mi', or 'mids'")
}
189 |
190 | # data(charity)
191 | # charity$ta1 <- factor(charity$ta1, ordered = FALSE)
192 | # for (i in colnames(charity))
193 | # dfNA <- mlim.na(charity, p = 0.1, stratify = TRUE, seed = 2022)
194 | # imp <- missRanger::missRanger(dfNA)
195 | # mlim.error(imp, dfNA, charity)
196 | #print((ELNETerror <- mlim.error(ELNET, dfNA, df)))
197 |
198 | # if we standardize numeric vars, RMSE and MAE can be more than 1
199 |
200 |
201 |
202 |
203 |
204 |
205 |
--------------------------------------------------------------------------------
/man/mlim.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/mlim.R
3 | \name{mlim}
4 | \alias{mlim}
5 | \title{missing data imputation with automated machine learning}
6 | \usage{
7 | mlim(
8 | data = NULL,
9 | m = 1,
10 | algos = c("ELNET"),
11 | postimpute = FALSE,
12 | stochastic = m > 1,
13 | ignore = NULL,
14 | tuning_time = 900,
15 | max_models = NULL,
16 | maxiter = 10L,
17 | cv = 10L,
18 | matching = "AUTO",
19 | autobalance = TRUE,
20 | balance = NULL,
21 | seed = NULL,
22 | verbosity = NULL,
23 | report = NULL,
24 | tolerance = 0.001,
25 | doublecheck = TRUE,
26 | preimpute = "RF",
27 | cpu = -1,
28 | ram = NULL,
29 | flush = FALSE,
30 | preimputed.data = NULL,
31 | save = NULL,
32 | load = NULL,
33 | shutdown = TRUE,
34 | java = NULL,
35 | ...
36 | )
37 | }
38 | \arguments{
39 | \item{data}{a \code{data.frame} (strictly) with missing data to be
40 | imputed. if \code{'load'} argument is provided, this argument will be ignored.}
41 |
42 | \item{m}{integer, specifying number of multiple imputations. the default value is
43 | 1, carrying out a single imputation.}
44 |
45 | \item{algos}{character vector, specifying algorithms to be used for missing data
46 | imputation. supported algorithms are "ELNET", "RF", "GBM", "DL",
47 | "XGB", and "Ensemble". if more than one algorithm is specified,
48 | mlim changes behavior to save on runtime. for example,
49 | the default is "ELNET", which fine-tunes an Elastic Net model.
50 | In general, "ELNET" is expected to
51 | be the best algorithm because it fine-tunes very fast, it is
52 | very robust to over-fitting, and hence, it generalizes very well.
53 | However, if your data has many factor variables, each with several
54 | levels, it is recommended to have c("ELNET", "RF") as your imputation
55 | algorithms (and possibly add "Ensemble" as well, to make the most out
56 | of tuning the models).
57 |
Note that "XGB" is only available in Mac OS and Linux. moreover,
"GBM", "DL" and "XGB" take the full given "tuning_time" (see below) to
tune the best model for imputing the given variable, whereas "ELNET"
will produce only one fine-tuned model, often at less time than
other algorithms need for developing a single model, which is why "ELNET"
is the workhorse of the mlim imputation package.}
64 |
\item{postimpute}{(EXPERIMENTAL FEATURE) logical. if TRUE, mlim uses algorithms other than 'ELNET' for carrying out
postimputation optimization. however, if FALSE, all specified algorithms will
be used in the process of 'reimputation' together. the 'Ensemble' algorithm
is encouraged when other algorithms are used. However, for general users
unspecialized in machine learning, postimpute is NOT recommended because this
feature is currently experimental, prone to over-fitting, and highly computationally intensive.}
71 |
\item{stochastic}{logical. by default it is set to TRUE for multiple imputation and FALSE for
single imputation. stochastic argument is currently under testing and is intended to
avoid inflating the correlation between imputed variables.}
75 |
\item{ignore}{character vector of column names or index of columns that
should be ignored in the process of imputation.}
78 |
79 | \item{tuning_time}{integer. maximum runtime (in seconds) for fine-tuning the
80 | imputation model for each variable in each iteration. the default
81 | time is 900 seconds but for a large dataset, you
82 | might need to provide a larger model development
83 | time. this argument also influences \code{max_models},
84 | see below. If you are using 'ELNET' algorithm (default),
85 | you can be generous with the 'tuning_time' argument because
86 | 'ELNET' tunes much faster than the rest and will only
87 | produce one model.}
88 |
\item{max_models}{integer. maximum number of models that can be generated in
the process of fine-tuning the parameters. this value
defaults to 100, meaning that for imputing each variable in
92 | each iteration, up to 100 models can be fine-tuned. increasing
93 | this value should be consistent with increasing
94 | \code{max_model_runtime_secs}, allowing the model to spend
95 | more time in the process of individualized fine-tuning.
96 | as a result, the better tuned the model, the more accurate
97 | the imputed values are expected to be}
98 |
\item{maxiter}{integer. maximum number of iterations. the default value is \code{10},
but it can be reduced to \code{3} (not recommended, see below).}
101 |
\item{cv}{integer. specify number of k-fold Cross-Validation (CV). values of
10 or higher are recommended. default is 10.}
104 |
105 | \item{matching}{logical. if \code{TRUE}, imputed values are coerced to the
106 | closest value to the non-missing values of the variable.
107 | if set to "AUTO", 'mlim' decides whether to match
108 | or not, based on the variable classes. the default is "AUTO".}
109 |
110 | \item{autobalance}{logical. if TRUE (default), binary and multinomial factor variables
111 | will be balanced before the imputation to obtain fairer
112 | and less-biased imputations, which are typically in favor
113 | of the majority class.
114 | if FALSE, imputation fairness will be sacrificed for overall accuracy, which
115 | is not recommended, although it is commonly practiced in other missing data
116 | imputation software. MLIM is highly concerned with imputation fairness for
117 | factor variables and autobalancing is generally recommended.
118 | in fact, higher overall accuracy does not mean a better imputation as
119 | long as minority classes are neglected, which increases the bias in favor of the
120 | majority class. if you do not wish to autobalance all the
121 | factor variables, you can manually specify the variables
122 | that should be balanced using the 'balance' argument (see below).}
123 |
124 | \item{balance}{character vector, specifying variable names that should be
125 | balanced before imputation. balancing the prevalence might
126 | decrease the overall accuracy of the imputation, because it
127 | attempts to ensure the representation of the rare outcome.
this argument is optional and intended for advanced users that
impute a severely imbalanced categorical (nominal) variable.}
130 |
131 | \item{seed}{integer. specify the random generator seed}
132 |
\item{verbosity}{character. controls how much information is printed to console.
the value can be "warn" (default), "info", "debug", or NULL.}
136 |
137 | \item{report}{filename. if a filename is specified (e.g. report = "mlim.md"), the \code{"md.log"} R
138 | package is used to generate a Markdown progress report for the
139 | imputation. the format of the report is adopted based on the
140 | \code{'verbosity'} argument. the higher the verbosity, the more
141 | technical the report becomes. if verbosity equals "debug", then
142 | a log file is generated, which includes time stamp and shows
143 | the function that has generated the message. otherwise, a
144 | reduced markdown-like report is generated. default is NULL.}
145 |
146 | \item{tolerance}{numeric. the minimum rate of improvement in estimated error metric
147 | of a variable to qualify the imputation for another round of iteration,
148 | if the \code{maxiter} is not yet reached. any improvement of imputation
149 | is desirable. however, specifying values above 0 can reduce the number
150 | of required iterations at a marginal increase of imputation error.
151 | for larger datasets, value of "1e-3" is recommended to reduce number
152 | of iterations. the default value is '1e-3'.}
153 |
154 | \item{doublecheck}{logical. default is TRUE (which is conservative). if FALSE, if the estimated
155 | imputation error of a variable does not improve, the variable
156 | will be not reimputed in the following iterations. in general,
157 | deactivating this argument will slightly reduce the imputation
158 | accuracy, however, it significantly reduces the computation time.
159 | if your dataset is large, you are advised to set this argument to
160 | FALSE. (EXPERIMENTAL: consider that by avoiding several iterations
161 | that marginally improve the imputation accuracy, you might gain
162 | higher accuracy by investing your computational resources in fine-tuning
163 | better algorithms such as "GBM")}
164 |
165 | \item{preimpute}{character. specifies the 'primary' procedure of handling the missing
166 | data. before 'mlim' begins imputing the missing observations, they should
167 | be prepared for the imputation algorithms and thus, they should be replaced
168 | with some values.
169 | the default procedure is a quick "RF", which models the missing
170 | data with parallel Random Forest model. this is a very fast procedure,
171 | which later on, will be replaced within the "reimputation" procedure (see below).
172 | possible other alternative is \code{"mm"},
173 | which carries out mean/mode replacement, as practiced by most imputation algorithms.
174 | "mm" is much faster than "RF". if your dataset is very
175 | large, consider pre-imputing it before hand using 'mlim.preimpute()'
176 | function and passing the preimputed dataset to mlim (see "preimputed.data" argument).}
177 |
178 | \item{cpu}{integer. number of CPUs to be dedicated for the imputation.
179 | the default takes all of the available CPUs.}
180 |
181 | \item{ram}{integer. specifies the maximum size, in Gigabytes, of the
182 | memory allocation. by default, all the available memory is
183 | used for the imputation.
184 | large memory size is particularly advised, especially
185 | for multicore processes. the more you give the more you get!}
186 |
187 | \item{flush}{logical (experimental). if TRUE, after each model, the server is
188 | cleaned to retrieve RAM. this feature is in testing mode and is
189 | currently set to FALSE by default, but it is recommended if you
190 | have limited amount of RAM or large datasets.}
191 |
192 | \item{preimputed.data}{data.frame. if you have used another software for missing
193 | data imputation, you can still optimize the imputation
194 | by handing the data.frame to this argument, which will
195 | bypass the "preimpute" procedure.}
196 |
197 | \item{save}{filename (with .mlim extension). if a filename is specified, an \code{mlim} object is
198 | saved after the end of each variable imputation. this object not only
199 | includes the imputed dataframe and estimated cross-validation error, but also
200 | includes the information needed for continuing the imputation,
201 | which is very useful feature for imputing large datasets, with a
202 | long runtime. this argument is activated by default and an
203 | mlim object is stored in the local directory named \code{"mlim.rds"}.}
204 |
205 | \item{load}{filename (with .mlim extension). an object of class "mlim", which includes the data, arguments,
206 | and settings for re-running the imputation, from where it was
207 | previously stopped. the "mlim" object saves the current state of
208 | the imputation and is particularly recommended for large datasets
209 | or when the user specifies a computationally extensive settings
210 | (e.g. specifying several algorithms, increasing tuning time, etc.).}
211 |
212 | \item{shutdown}{logical. if TRUE, h2o server is closed after the imputation.
213 | the default is TRUE}
214 |
215 | \item{java}{character, specifying path to the executable 64bit Java JDK on the
216 | Microsoft Windows machines, if JDK is installed but the path environment
217 | variable is not set.}
218 |
219 | \item{...}{arguments that are used internally between 'mlim' and 'mlim.postimpute'.
220 | these arguments are not documented in the help file and are not
221 | intended to be used by end user.}
222 | }
\value{
a \code{data.frame}, showing the
estimated imputation error from the cross validation within the data.frame's
attributes
}
228 | \description{
229 | imputes data.frame with mixed variable types using automated
230 | machine learning (AutoML)
231 | }
232 | \examples{
233 |
234 | \dontrun{
235 | data(iris)
236 |
237 | # add stratified missing observations to the data. to make the example run
238 | # faster, I add NAs only to a single variable.
239 | dfNA <- iris
240 | dfNA$Species <- mlim.na(dfNA$Species, p = 0.1, stratify = TRUE, seed = 2022)
241 |
242 | # run the ELNET single imputation (fastest imputation via 'mlim')
243 | MLIM <- mlim(dfNA, shutdown = FALSE)
244 |
245 | # in single imputation, you can estimate the imputation accuracy via cross validation RMSE
246 | mlim.summarize(MLIM)
247 |
248 | ### or if you want to carry out ELNET multiple imputation with 5 datasets.
249 | ### next, to carry out analysis on the multiple imputation, use the 'mlim.mids' function
250 | ### minimum of 5 datasets
251 | MLIM2 <- mlim(dfNA, m = 5)
252 | mids <- mlim.mids(MLIM2, dfNA)
253 | fit <- with(data=mids, exp=glm(Species ~ Sepal.Length, family = "binomial"))
254 | res <- mice::pool(fit)
255 | summary(res)
256 |
257 | # you can check the accuracy of the imputation, if you have the original dataset
258 | mlim.error(MLIM2, dfNA, iris)
259 | }
260 | }
261 | \author{
262 | E. F. Haghish
263 | }
264 |
--------------------------------------------------------------------------------
/R/iteration_loop.R:
--------------------------------------------------------------------------------
1 | #' @title iteration_loop
2 | #' @description runs imputation iteration loop to fully impute a dataframe
3 | #' @importFrom utils setTxtProgressBar txtProgressBar capture.output packageVersion
4 | #' @importFrom h2o h2o.init as.h2o h2o.predict h2o.ls
5 | #' h2o.removeAll h2o.rm h2o.shutdown h2o.get_automl
6 | #' @importFrom md.log md.log
7 | #' @importFrom memuse Sys.meminfo
8 | #' @importFrom stats var setNames na.omit rnorm
9 | #' @return list
10 | #' @author E. F. Haghish
11 | #' @keywords Internal
12 | #' @noRd
13 |
14 | iteration_loop <- function(MI, dataNA, preimputed.data, data, bdata, boot, metrics, tolerance, doublecheck,
15 | m, k, X, Y, z, m.it,
16 |
17 | # loop data
18 | vars2impute, vars2postimpute, storeVars2impute,
19 | allPredictors, preimpute, impute, postimputealgos,
20 |
21 | # settings
22 | error_metric, FAMILY, cv, tuning_time,
23 | max_models,
24 | keep_cv,
25 | autobalance, balance, seed, save, flush,
26 | verbose, debug, report, sleep,
27 |
28 | # saving settings
29 | mem, orderedCols, ignore, maxiter,
30 | miniter, matching, ignore.rank,
31 | verbosity, error, cpu, max_ram, min_ram, shutdown, clean,
32 | stochastic) {
33 |
34 | FACTORPREDCTIONS <- NULL
35 |
36 | # ------------------------------------------------------------
37 | # bootrtap
38 | #
39 | # Bootstrap from the original dataset, hold the original NA values,
40 | # and then use the preimputed dataset, and then gradually improve it
41 | #
42 | #### PROBLEM
43 | ############
44 | #### drop the duplicates because they screw up the k-fold cross-validation.
45 | #### multiple identical observations might go to train and test datasets.
46 | #### here I suggest several 'work-in-progress' solutions
47 | # ============================================================
48 |
49 | ####### ===============================================
50 | ####### BOOTSTRAP AND BALANCING DRAMA
51 | ####### ===============================================
52 | #??? THIS NEEDS FURTHER UPDATE IF 'autobalance' IS ACTIVATED
53 | # THE SOlUTION WOULD BE TO CALCULATE BALANCING WEIGHTS FOR
54 | # EACH OBSERVATION AND THEN MULTIPLY IT BY THE WEIGHTS_COLUMN.
55 | # OR CARRY OUT BALANCED STRATIFIED SAMPLING FOR CATEGORICAL
56 | # VARIABLES...
57 |
58 | if (boot) {
59 | rownames(data) <- 1:nrow(data) #remember the rows that are missing
60 | sampling_index <- sample(x = nrow(data), size = nrow(data), replace=TRUE)
61 |
62 |
63 |
64 | ## SOLUTION 1: DROP THE DUPLICATES AND DO UNDERSAMPLING
65 | ## ----------------------------------------------------
66 | # bdata <- data[sampling_index, ]
67 | # bdataNA <- is.na(bdata[, vars2impute, drop = FALSE])
68 | # bdata <- mlim.preimpute(data=bdata, preimpute=preimpute, seed = NULL)
69 | # sampling_index <- sampling_index[!duplicated(sampling_index)]
70 | # bdata <- data[sampling_index, ]
71 | # bdata[, "mlim_bootstrap_weights_column_"] <- 1
72 | # bdataNA <- is.na(bdata[, vars2impute, drop = FALSE])
73 |
74 | ## SOLUTION 2: ADD THE DUPLICATES TO THE WEIGHT_COLUMN
75 | ## ----------------------------------------------------
76 | dups <- bootstrapWeight(sampling_index)
77 | bdata <- data[1:nrow(data) %in% dups[,1], ]
78 | bdataNA <- is.na(bdata[, vars2impute, drop = FALSE])
79 | message("\n")
80 | bdata <- mlim.preimpute(data=bdata, preimpute=preimpute, seed = NULL)
81 | bdata[, "mlim_bootstrap_weights_column_"] <- dups[,2] #OR ALTERNATIVELY #dups[,2] / sum(dups[,2])
82 |
83 | ## SOLUTION 3: Assign CV folding manually instead of weight_column
84 | ## ----------------------------------------------------
85 | # bdata <- data[sampling_index, ]
86 | # bdataNA <- is.na(bdata[, vars2impute, drop = FALSE])
87 | # bdata <- mlim.preimpute(data=bdata, preimpute=preimpute, seed = NULL)
88 | # bdata[, "mlim_bootstrap_fold_assignment_"] <- 0
89 | # folds <- bootstrapCV(index = sampling_index, cv = cv)
90 | # for (i in 1:cv) {
91 | # indexcv <- sampling_index %in% folds[,i]
92 | # bdata[indexcv, "mlim_bootstrap_fold_assignment_"] <- i
93 | # }
94 | }
95 |
96 | # update the fresh data
97 | # ------------------------------------------------------------
98 | running <- TRUE
99 | runpostimpute <- FALSE
100 |
101 | if (debug) md.log("data was sent to h2o cloud", date=debug, time=debug, trace=FALSE)
102 |
103 | # define iteration var. this is a vector of varnames that should be imputed
104 | # if 'doublecheck' argument is FALSE, everytime a variable stops improving,
105 | # remove it from ITERATIONVARS. When you get to postimputation, reset the
106 | # ITERATIONVARS.
107 | ITERATIONVARS <- vars2impute
108 |
109 | # ------------------------------------------------------------
110 | # Generate the HEX datasets if there is NO FLUSHING
111 | # ------------------------------------------------------------
112 | if (!flush) {
113 | tryCatch(hex <- h2o::as.h2o(data),
114 | error = function(cond) {
115 | message("trying to upload data to JAVA server...\n");
116 | message("ERROR: Data could not be uploaded to the Java Server\nJava server returned the following error:\n")
117 | return(stop(cond))})
118 |
119 | bhex <- NULL
120 | if (!is.null(bdata)) {
121 | tryCatch(bhex<- h2o::as.h2o(bdata),
122 | error = function(cond) {
123 | message("trying to upload data to JAVA server...\n");
124 | message("ERROR: Data could not be uploaded to the Java Server\nJava server returned the following error:\n")
125 | return(stop(cond))})
126 | }
127 | }
128 | else {
129 | hex <- NULL
130 | bhex <- NULL
131 | }
132 |
133 | # ============================================================
134 | # ============================================================
135 | # global iteration loop
136 | # ============================================================
137 | # ============================================================
138 | while (running) {
139 |
140 | # always print the iteration
141 | message(paste0("\ndata ", m.it, ", iteration ", k, " (RAM = ", memuse::Sys.meminfo()$freeram,")", ":"), sep = "") #":\t"
142 | md.log(paste("Iteration", k), section="subsection")
143 |
144 | # ## AVOID THIS PRACTICE BECAUSE DOWNLOADING DATA FROM THE SERVER IS SLOW
145 | # # store the last data
146 | # if (debug) md.log("store last data", date=debug, time=debug, trace=FALSE)
147 | # dataLast <- as.data.frame(hex)
148 | # attr(dataLast, "metrics") <- metrics
149 | # attr(dataLast, "rmse") <- error
150 |
151 | # .........................................................
152 | # IMPUTATION & POSTIMPUTATION LOOP
153 | # .........................................................
154 | if (runpostimpute) {
155 | procedure <- "postimpute"
156 | if (debug) md.log("Running POSTIMPUTATION", date = TRUE, time = TRUE, print = FALSE, trace = FALSE)
157 | }
158 | else procedure <- "impute"
159 |
160 | for (Y in ITERATIONVARS[z:length(ITERATIONVARS)]) {
161 | start <- as.integer(Sys.time())
162 |
163 | # Prepare the progress bar and iteration console text
164 | # ============================================================
165 | if (verbose==0) pb <- txtProgressBar((which(ITERATIONVARS == Y))-1, length(vars2impute), style = 3)
166 | if (verbose!=0) message(paste0(" ",Y))
167 |
168 | it <- NULL
169 | tryCatch(capture.output(
170 | it <- iterate(
171 | procedure = procedure,
172 | MI, dataNA, bdataNA,
173 | preimputed.data, data, bdata, boot, hex, bhex, metrics, tolerance, doublecheck,
174 | m, k, X, Y, z=which(ITERATIONVARS == Y), m.it,
175 | # loop data
176 | ITERATIONVARS, vars2impute,
177 | allPredictors, preimpute, impute, postimputealgos,
178 | # settings
179 | error_metric, FAMILY=FAMILY, cv, tuning_time,
180 | max_models,
181 | keep_cv,
182 | autobalance, balance, seed, save, flush,
183 | verbose, debug, report, sleep,
184 | # saving settings
185 | mem, orderedCols, ignore, maxiter,
186 | miniter, matching, ignore.rank,
187 | verbosity, error, cpu, max_ram, min_ram, stochastic)
188 | , file = report, append = TRUE)
189 | , error = function(cond) {
190 | message(paste0("\nReimputing '", Y, "' with the current specified algorithms failed and this variable will be skipped! \nSee Java server's error below:"));
191 | md.log(paste("Reimputing", Y, "failed and the variable will be skipped!"),
192 | date = TRUE, time = TRUE, print = TRUE)
193 | message(cond)
194 |
195 | ### ??? activate the code below if you allow "iterate" preimputation
196 | ### ??? or should it be ignored...
197 | # if (preimpute == "iterate" && k == 1L && (Y %in% allPredictors)) {
198 | # X <- union(X, Y)
199 | # if (debug) md.log("x was updated", date=debug, time=debug, trace=FALSE)
200 | # }
201 | return(NULL)
202 | })
203 |
204 |
205 |
206 | # If there was no error, update the variables
207 | # else make sure the model is cleared
208 | # --------------------------------------------------------------
209 | #IT <<- it
210 | if (!is.null(it)) {
211 | X <- it$X
212 | ITERATIONVARS <- it$iterationvars
213 | metrics <- it$metrics
214 | data <- it$data
215 | bdata <- it$bdata
216 | hex <- it$hex
217 | bhex <- it$bhex
218 |
219 | # if 'factorPred' is not NULL, update the list:
220 | if (!is.null(it$factorPred)) {
221 | #remove the 'predict' column, which is the first column in predict dataframe
222 | #Ok <<- it$factorPred[,2:ncol(it$factorPred)]
223 | if (length(FACTORPREDCTIONS) > 0) FACTORPREDCTIONS <- list(FACTORPREDCTIONS, Y = it$factorPred[,2:ncol(it$factorPred)])
224 | else FACTORPREDCTIONS <- list(Y = it$factorPred[,2:ncol(it$factorPred)])
225 |
226 | # update the name of the new item
227 | names(FACTORPREDCTIONS)[length(FACTORPREDCTIONS)] <- Y
228 | }
229 | }
230 |
231 | else tryCatch(h2o::h2o.rm(h2o::h2o.get_automl("mlim")),
232 | error = function(cond) {
233 | return(NULL)})
234 |
235 | # log & statusbar
236 | # --------------------------------------------------------------
237 | time = as.integer(Sys.time()) - start
238 | if (debug) md.log(paste("done! after: ", time, " seconds"),
239 | date = TRUE, time = TRUE, print = FALSE, trace = FALSE)
240 |
241 | # update the statusbar
242 | if (verbose==0) setTxtProgressBar(pb, (which(ITERATIONVARS == Y)))
243 |
244 |
245 | }
246 |
247 | # CHECK CRITERIA FOR RUNNING THE NEXT ITERATION
248 | # --------------------------------------------------------------
249 | if (debug) md.log("evaluating stopping criteria", date=debug, time=debug, trace=FALSE)
250 | SC <- stoppingCriteria(method="varwise_NA", miniter, maxiter,
251 | metrics, k, vars2impute,
252 | error_metric,
253 | tolerance,
254 | postimputealgos,
255 | runpostimpute,
256 | md.log = report)
257 | if (debug) {
258 | md.log(paste("running: ", SC$running), date=debug, time=debug, trace=FALSE)
259 | md.log(paste("\nEstimated", error_metric, "error:", SC$error), section="paragraph", date=debug, time=debug, trace=FALSE)
260 | }
261 |
262 | running <- SC$running
263 | error <- SC$error
264 | runpostimpute <- SC$runpostimpute
265 | ITERATIONVARS <- SC$vars2impute #only sets it to NULL
266 |
267 | # indication of postimpute
268 | if (length(ITERATIONVARS) == 0) {
269 | ITERATIONVARS <- vars2impute
270 | procedure <- "postimpute"
271 | }
272 |
273 | # update the loop number
274 | k <- k + 1L
275 | }
276 |
277 | # ............................................................
278 | # END OF THE ITERATIONS
279 | # ............................................................
280 | if (verbose) message("\n")
281 | md.log("", section="paragraph", trace=FALSE)
282 |
283 | # # if the iterations stops on minimum or maximum, return the last data
284 | # if (k == miniter || (k == maxiter && running) || maxiter == 1) {
285 | ###### ALWAYS RETURN THE LAST DATA. THIS WAS A BUG, REMAINING AFTER I INDIVIDUALIZED IMPUTATION EVALUATION
286 |
287 | ### Workaround for buggy 'as.data.frame' function
288 | ### =============================================
289 |
290 | # INSTEAD OF DEFINING A NEW VARIABLE 'dataLast', just use the 'data' returned
291 | # FROM iteration and most importantly, AVOID THE BLOODY 'as.data.frame' function
292 | # which IS SO BUGGY
293 | attr(data, "metrics") <- metrics
294 | attr(data, error_metric) <- error
295 | # dataLast <- as.data.frame(hex)
296 | # Sys.sleep(sleep)
297 | # attr(dataLast, "metrics") <- metrics
298 | # attr(dataLast, error_metric) <- error
299 | # }
300 | # else {
301 | # md.log("return previous iteration's data", date=debug, time=debug, trace=FALSE)
302 | # }
303 |
304 |
305 | if (clean) {
306 | tryCatch(h2o::h2o.removeAll(),
307 | error = function(cond) {
308 | message("trying to connect to JAVA server...\n");
309 | return(stop("Java server has crashed (low RAM?)"))})
310 | md.log("server was cleaned", section="paragraph", trace=FALSE)
311 | }
312 |
313 | if (shutdown) {
314 | md.log("shutting down the server", section="paragraph", trace=FALSE)
315 | tryCatch(h2o::h2o.shutdown(prompt = FALSE),
316 | error = function(cond) {
317 | message("trying to connect to JAVA server...\n");
318 | return(warning("Java server has crashed (low RAM?)"))})
319 | Sys.sleep(sleep)
320 | }
321 |
322 | # ------------------------------------------------------------
323 | # Adding stochastic variation
324 | #
325 | # Note: for continuous variables, RMSE is used as indication of
326 | # standard deviation. However, for binomial and multinomial
327 | # variables, the estimated probability of each level for
328 | # each missing observation is needed and thus, the predictions
329 | # of these variables should be stored and used in this section.
330 | # ============================================================
331 | if (stochastic) {
332 | md.log("STOCHASTIC TIMES", section="paragraph", trace=FALSE)
333 |
334 | # evaluate each variable and add stochastic variation based on variable types
335 | # ---------------------------------------------------------------------------
336 | for (Y in vars2impute) {
337 | v.na <- dataNA[, Y]
338 | VEK <- data[which(v.na), Y]
339 |
340 | if (FAMILY[which(ITERATIONVARS == Y)] == 'gaussian' ||
341 | FAMILY[which(ITERATIONVARS == Y)] == 'gaussian_integer'
342 | || FAMILY[which(ITERATIONVARS == Y)] == 'quasibinomial' ) {
343 |
344 | RMSE <- min(metrics[metrics$variable == Y, "RMSE"], na.rm = TRUE)
345 | data[which(v.na), Y] <- rnorm(
346 | n = length(VEK),
347 | mean = VEK,
348 | sd = RMSE)
349 | }
350 |
351 | else if (FAMILY[which(ITERATIONVARS == Y)] == 'binomial' ||
352 | FAMILY[which(ITERATIONVARS == Y)] == 'multinomial') {
353 |
354 | #NOK <<- FACTORPREDCTIONS
355 | #> #column names represent the estimated levels' probabilities
356 | stochFactors <- stochasticFactorImpute(levels = colnames(FACTORPREDCTIONS[[Y]]),
357 | probMat = as.matrix(FACTORPREDCTIONS[[Y]]))
358 |
359 | # replace the missing observations with the stochastic data
360 | data[which(v.na), Y] <- stochFactors
361 | }
362 |
363 | }
364 | }
365 |
366 | # ------------------------------------------------------------
367 | # Auto-Matching specifications
368 | # ============================================================
369 | if (matching == "AUTO") {
370 | mtc <- 0
371 | for (Y in vars2impute) {
372 | mtc <- mtc + 1
373 | v.na <- dataNA[, Y]
374 |
375 | if ((FAMILY[mtc] == 'gaussian_integer') | (FAMILY[mtc] == 'quasibinomial')) {
376 | if (debug) md.log(paste("matching", Y), section="paragraph")
377 |
378 | matchedVal <- matching(imputed=data[v.na, Y],
379 | nonMiss=unique(data[!v.na,Y]),
380 | md.log)
381 | #message(matchedVal)
382 | if (!is.null(matchedVal)) data[v.na, Y] <- matchedVal
383 | else {
384 | md.log("matching failed", section="paragraph", trace=FALSE)
385 | }
386 | }
387 | }
388 | }
389 |
390 | # ------------------------------------------------------------
391 | # Revert ordinal transformation
392 | # ============================================================
393 | if (!ignore.rank) {
394 | data[, orderedCols] <- revert(data[, orderedCols, drop = FALSE], mem)
395 | }
396 |
397 | class(data) <- c("mlim", "data.frame")
398 |
399 | return(dataLast=data)
400 | }
401 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | **`mlim`** : Single and Multiple Imputation with Automated Machine Learning
5 | ================================================================
6 |
7 |
8 | [](https://github.com/haghish/mlim/releases/?include_prereleases&sort=semver "View GitHub releases")
9 | [](https://cran.r-project.org/package=mlim) [](https://cran.r-project.org/package=mlim) [](https://CRAN.R-project.org/package=mlim)
10 |
16 |
17 |
18 | **`mlim`** is the first missing data imputation software to implement automated machine learning for performing _multiple imputation_ or _single imputation_ of missing data. The software, which is currently implemented as an R package, brings the state-of-the-arts of machine learning to provide a versatile missing data solution for various data types (continuous, binary, multinomial, and ordinal). In a nutshell, **`mlim`** is expected to outperform any other available missing data imputation software on many grounds. For example, **`mlim`** is expected to deliver:
19 |
20 | 1. **Lower imputation error** compared to other missing data imputation software.
21 | 2. **Higher imputation fairness**, when the data suffers from severe class imbalance, unnormal destribution, or the variables (features) have interactions with one another.
22 | 3. **Faster imputation of big datasets** because **`mlim`** excels in making efficient use of available CPU cores and the runtime scales fairly well as the size of data becomes huge.
23 |
24 | The high performance of **`mlim`** is mainly achieved by **fine-tuning** an **`ELNET`** algorithm, which often outperforms any standard statistical procedure or untuned machine learning algorithm and generalizes very well. However, **`mlim`** is an active research project and hence, it comes with an **experimental optimization toolkit** for exploring the possibility of performing multiple imputation with industry-standard machine learning algorithms such as _Deep Learning_, _Gradient Boosting Machine_, _Extreme Gradient Boosting_, and _Stacked Ensembles_. These algorithms can be used for either imputing missing data or optimizing already imputed data, but are **NOT used by default NOR recommended to all users**. Advanced users who are interested in exploring the possibilities of imputing missing data with these algorithms are recommended to read the free handbook (see below). These algorithms, as noted, are experimental, and the author intends to examine their effectiveness for academic research (at this point). If you are interested in collaborating, [get in touch with the author](https://www.sv.uio.no/psi/english/people/aca/haghish/).
25 |
26 |
29 |
30 | Fine-tuning missing data imputation
31 | -----------------------------------
32 |
33 | Simply put, for each variable in the dataset, **`mlim`** automatically fine-tunes a fast machine learning model, which results in significantly lower imputation error compared to classical statistical models or even untuned machine learning imputation software that use Random Forest or unsupervised learning algorithms. Moreover, **`mlim`** is intended to give social scientists a powerful solution to their missing data problem, a tool that can automatically adapt to different variable types, that can appear at different rates, with unknown distributions and have high correlations or interactions with one another. **But it is not just about higher accuracy! `mlim` also delivers fairer imputation, particularly for categorical and ordinal variables because it automatically balances the levels of the variable, minimizing the bias resulting from class imbalance, which can often be seen in social science data and has been commonly ignored by missing data imputation software.**
34 |
35 |
36 |
37 |
38 | **`mlim`** outperforms other R packages for all variable types, continuous, binary (factor), multinomial (factor), and ordinal (ordered factor). The reason for this improved performance is that **`mlim`**:
39 |
40 | - Automatically fine-tunes the parameters of the Machine Learning models
41 | - Delivers a very high prediction accuracy
42 | - Does not make any assumption about the distribution of the data
43 | - Takes the interactions between the variables into account
44 | - Can to some extent take the hierarchical structure of the data into account
45 | + Imputes missing data in nested observations with higher accuracy compared to the HLM imputation methods
46 | - Does not force a particular linear model
47 | - Uses a blend of different machine learning models
48 |
49 |
61 |
62 | Procedure: From preimputation to imputation and postimputation
63 | ---------------------------------------------------------
64 |
65 | When a dataframe with NAs is given to **`mlim`**, the NAs are replaced with plausible values (e.g. Mean and Mode) to prepare the dataset for the imputation, as shown in the flowchart below:
66 |
67 |
68 |
69 | **`mlim`** follows three steps to optimize the missing data imputation. This procedure is _optional_, depending on the amount of computing resources available to you. In general, **`ELNET`** imputation already outperforms other available single and multiple imputation methods available in **R**. However, the imputation error can be further improved by training stronger algorithms such as **`GBM`**, **`XGB`**, **`DL`**, or even **`Ensemble`**, stacking several models on top of one another. For the majority of the users, the **`GBM`** or **`XGB`** (XGB is available only in Mac OSX and Linux) will significantly improve the **`ELNET`** imputation, if long-enough time is provided to generate a lot of models to fine-tune them.
70 |
71 |
72 |
73 | You do not necessarily need the post-imputation. Once you have _reimputed_ the data with `ELNET`, you can stop there. `ELNET` is a relatively fast algorithm and it is easy to fine-tune it compared to `GBM`, `XGB`, `DL`, or `Ensemble`. In addition, `ELNET` generalizes nicely and is less prone to overfitting. In the flowchart below the procedure of the **mlim** algorithm is drawn. When using **mlim**, you can use `ELNET` to impute a dataset with NA or optimize the imputed values of a dataset that is already imputed. If you wish to go the extra mile, you can use heavier algorithms as well to activate the postimputation procedure, but it is strictly optional and by default, **mlim** does not use postimputation.
74 |
75 |
76 |
77 |
78 |
79 | Fast imputation with **`ELNET`** (without postimputation)
80 | ---------------------------------------------------------
81 |
82 | Below are some comparisons between different R packages for carrying out multiple imputations (bars with error) and single imputation. In these analyses, I only used the **`ELNET`** algorithm, which fine-tunes much faster than other algorithms (**`GBM`**, **`XGBoost`**, and **`DL`**). As is evident, **`ELNET`** already outperforms all other single and multiple imputation procedures available in the **R** language. However, the performance of **`mlim`** can still be improved, by adding another algorithm, which activates the _postimputation_ procedure.
83 |
84 |
85 |
86 |
87 |
88 |
89 | Installation
90 | ------------
91 |
92 | **`mlim`** is under fast development. The package receives monthly updates on CRAN. Therefore, it is recommended that you install the GitHub version until version 0.1 is released. To install the latest development version from GitHub:
93 |
94 | ``` r
95 | library(devtools)
96 | install_github("haghish/mlim")
97 | ```
98 |
99 | Or alternatively, install the latest stable version from CRAN:
100 | ``` r
101 | install.packages("mlim")
102 | ```
103 |
104 | Supported algorithms
105 | --------------------
106 |
107 | **`mlim`** supports several algorithms:
108 |
109 | - `ELNET` (Elastic Net)
110 | - `RF` (Random Forest and Extremely Randomized Trees)
111 | - `GBM` (Gradient Boosting Machine)
112 | - `XGB` (Extreme Gradient Boosting, available in Mac OS and Linux)
113 | - `DL` (Deep Learning)
114 | - `Ensemble` (Stacked Ensemble)
115 |
116 | > `ELNET` is the default imputation algorithm. Among all of the above, ELNET is the simplest model, fastest to fine-tune, requires the least amount of RAM and CPU, and yet, it is the most stable one, which also makes it one of the most generalizable algorithms. By default, **`mlim`** uses only `ELNET`, however, you can add another algorithm to activate the post-imputation procedure.
117 |
118 |
120 |
126 |
128 |
129 | ### `GBM` vs `ELNET`
130 |
131 | But which one should you choose, assuming computation resources are not in question? Well, __`GBM`__ is very likely to outperform __`ELNET`__, if you specify a large enough `max_models` argument to well-tune the algorithm for imputing each feature. That basically means generating more than 100 models, at least. But you will enjoy a slight -- yet probably statistically significant -- improvement in the imputation accuracy. The option is there, for those who can use it, and to my knowledge, fine-tuning __`GBM`__ with a large enough number of models will be the most accurate imputation algorithm compared to any other procedure I know. But __`ELNET`__ comes second and compared to its speed advantage, it is indeed charming!
132 |
133 | Both of these algorithms offer one advantage over all the other machine learning missing data imputation methods such as kNN, K-Means, PCA, Random Forest, etc... Simply put, you do not need to specify any parameter yourself, everything is automatic and **`mlim`** searches for the optimal parameters for imputing each variable within each iteration. For all the aforementioned packages, some parameters need to be specified, which influence the imputation accuracy. Number of _k_ for kNN, number of components for PCA, number of trees (and other parameters) for Random Forest, etc... This is why `elnet` outperforms the other packages. You get software that optimizes its models on its own.
134 |
135 | Advantages and limitations
136 | --------------------------
137 |
138 | **`mlim`** fine-tunes models for imputation, a procedure that has never been implemented in other R packages. This procedure often yields much higher accuracy compared to other machine learning imputation methods or missing data imputation procedures because of using more accurate models that are fine-tuned for each feature in the dataset. The cost, however, is computational resources. If you have access to a very powerful machine, with a huge amount of RAM per CPU, then try __`GBM`__. If you specify a high enough number of models in each fine-tuning process, you are likely to get a more accurate imputation than __`ELNET`__. However, for personal machines and laptops, __`ELNET`__ is generally recommended (see below). __If your machine is not powerful enough, it is likely that the imputation crashes due to memory problems...__. So, perhaps begin with __`ELNET`__, unless you are working with a powerful server. This is my general advice as long as **`mlim`** is in Beta version and under development.
139 |
140 |
141 | Citation
142 | --------
143 |
144 | - Haghish, E. F. (2022). mlim: Multiple Imputation with Automated Machine Learning [Computer software]. https://CRAN.R-project.org/package=mlim.
145 |
146 |
147 |
160 |
161 | Example
162 | -------
163 |
164 | `iris` is a small dataset with 150 rows only. Let's add 50% of artificial missing data and compare several state-of-the-art machine learning missing data imputation procedures. __`ELNET`__ comes up as a winner for a very simple reason! Because it was fine-tuned and all the rest were not. The larger the dataset and the higher the number of features, the more vivid the difference between __`ELNET`__ and the others becomes.
165 |
166 | ### Single imputation
167 |
168 | In a single imputation, the NAs are replaced with the most plausible values according to the model. You do not get the diversity of the multiple imputation, but you still get an estimated imputation error based on a 10-fold (or higher, if specified) cross-validation procedure for each variable (column) in the dataset. As shown below, `mlim` provides the **`mlim.error()`** function to summarize the imputation error for the entire dataset or each variable.
169 |
170 | ```R
171 | # Comparison of different R packages imputing iris dataset
172 | # ===============================================================================
173 | rm(list = ls())
174 | library(mlim)
175 | library(mice)
176 | library(missForest)
177 | library(VIM)
178 |
179 | # Add artificial missing data
180 | # ===============================================================================
181 | irisNA <- mlim.na(iris, p = 0.5, stratify = TRUE, seed = 2022)
182 |
183 | # Single imputation with mlim, giving it 180 seconds to fine-tune each imputation
184 | # ===============================================================================
185 | MLIM <- mlim(irisNA, m=1, seed = 2022, tuning_time = 180)
186 | print(MLIMerror <- mlim.error(MLIM, irisNA, iris))
187 |
188 | # kNN Imputation with VIM
189 | # ===============================================================================
190 | kNN <- kNN(irisNA, imp_var=FALSE)
191 | print(kNNerror <- mlim.error(kNN, irisNA, iris))
192 |
193 | # Single imputation with MICE (for the sake of demonstration)
194 | # ===============================================================================
195 | MC <- mice(irisNA, m=1, maxit = 50, method = 'pmm', seed = 500)
196 | print(MCerror <- mlim.error(MC, irisNA, iris))
197 |
198 | # Random Forest Imputation with missForest
199 | # ===============================================================================
200 | set.seed(2022)
201 | RF <- missForest(irisNA)
202 | print(RFerror <- mlim.error(RF$ximp, irisNA, iris))
203 | ```
204 |
205 | ### Multiple imputation
206 |
207 | `mlim` supports multiple imputation. All you need to do is to specify an integer higher than 1 for the value of `m`. For example, set `m = 5` in the `mlim` function to impute 5 datasets. Then, `mlim` returns a list including 5 datasets. You can convert this list to a `mids` object using the **`mlim.mids()`** function and then follow up the analysis with the `mids` object the same way it is carried out by the [`mice`](https://CRAN.R-project.org/package=mice) R package. Here is an example:
208 |
209 | ```R
210 | # Comparison of different R packages imputing iris dataset
211 | # ===============================================================================
212 | rm(list = ls())
213 | library(mlim)
214 | library(mice)
215 |
216 | # Add artificial missing data
217 | # ===============================================================================
218 | irisNA <- mlim.na(iris, p = 0.5, stratify = TRUE, seed = 2022)
219 |
220 | # multiple imputation with mlim, giving it 180 seconds to fine-tune each imputation
221 | # ===============================================================================
222 | MLIM2 <- mlim(irisNA, m = 5, seed = 2022, tuning_time = 180)
223 | print(MLIMerror2 <- mlim.error(MLIM2, irisNA, iris))
224 | mids <- mlim.mids(MLIM2, irisNA)
225 | fit <- with(data=mids, exp=glm(Species ~ Sepal.Length, family = "binomial"))
226 | res <- mice::pool(fit)
227 | summary(res)
228 | ```
229 |
230 |
231 |
--------------------------------------------------------------------------------
/R/mlim.R:
--------------------------------------------------------------------------------
1 | #' @title missing data imputation with automated machine learning
2 | #' @description imputes data.frame with mixed variable types using automated
3 | #' machine learning (AutoML)
4 | # @param impute character. specify a vector of algorithms to be used
5 | # in the process of auto-tuning. the supported main algorithms are
6 | # "ELNET", "RF",
7 | # "GBM", "DL", "XGB" (available for Mac and Linux), and "Ensemble".
8 | #
9 | # the default is "AUTO", which is mostly based on "ELNET", but also
10 | # uses Extremely Randomized Forests, in addition to Random Forest, before
11 | # concluding the imputation, when "ELNET" stops improving. This procedure is
12 | # relatively fast and yields charming results, often equal to specifying "impute = c('ELNET', 'RF')",
13 | # which at each step of the imputation, uses both "ELNET" and "RF", doubling the imputation
14 | # time, and thus, it is advised.
15 | #
16 | # "GBM", "DL", "XGB", and "Ensemble" take the full given "tuning_time" (see below) to
17 | # tune the best model for imputing the given variable. it is advised to use these extensive
18 | # algorithms in the process of "postimputation" and let "ELNET" do most of the legwork to save
19 | # computational resources.
20 | #' @importFrom utils setTxtProgressBar txtProgressBar capture.output packageVersion
21 | #' @importFrom tools file_ext
22 | #' @importFrom h2o h2o.init as.h2o h2o.automl h2o.predict h2o.ls
23 | #' h2o.removeAll h2o.rm h2o.shutdown h2o.no_progress
24 | #' @importFrom md.log md.log
25 | #' @importFrom memuse Sys.meminfo
26 | #' @importFrom stats var setNames na.omit
27 | #' @importFrom curl curl
28 | #' @param data a \code{data.frame} (strictly) with missing data to be
29 | #' imputed. if \code{'load'} argument is provided, this argument will be ignored.
30 | #' @param m integer, specifying number of multiple imputations. the default value is
31 | #' 1, carrying out a single imputation.
32 | #' @param algos character vector, specifying algorithms to be used for missing data
33 | #' imputation. supported algorithms are "ELNET", "RF", "GBM", "DL",
34 | #' "XGB", and "Ensemble". if more than one algorithm is specified,
35 | #' mlim changes behavior to save on runtime. for example,
36 | #' the default is "ELNET", which fine-tunes an Elastic Net model.
37 | #' In general, "ELNET" is expected to
38 | #' be the best algorithm because it fine-tunes very fast, it is
39 | #' very robust to over-fitting, and hence, it generalizes very well.
40 | #' However, if your data has many factor variables, each with several
41 | #' levels, it is recommended to have c("ELNET", "RF") as your imputation
42 | #' algorithms (and possibly add "Ensemble" as well, to make the most out
43 | #' of tuning the models).
44 | #'
45 | #' Note that "XGB" is only available in Mac OS and Linux. moreover,
46 | #' "GBM", "DL" and "XGB" take the full given "tuning_time" (see below) to
47 | #' tune the best model for imputing the given variable, whereas "ELNET"
48 | #' will produce only one fine-tuned model, often at less time than
49 | #' other algorithms need for developing a single model, which is why "ELNET"
50 | #' is the workhorse of the mlim imputation package.
51 | #' @param preimpute character. specifies the 'primary' procedure of handling the missing
52 | #' data. before 'mlim' begins imputing the missing observations, they should
53 | #' be prepared for the imputation algorithms and thus, they should be replaced
54 | #' with some values.
55 | #' the default procedure is a quick "RF", which models the missing
56 | #' data with parallel Random Forest model. this is a very fast procedure,
57 | #' which later on, will be replaced within the "reimputation" procedure (see below).
58 | #' possible other alternative is \code{"mm"},
59 | #' which carries out mean/mode replacement, as practiced by most imputation algorithms.
60 | #' "mm" is much faster than "RF". if your dataset is very
61 | #' large, consider pre-imputing it before hand using 'mlim.preimpute()'
62 | #' function and passing the preimputed dataset to mlim (see "preimputed.data" argument).
63 | #
64 | # another alternative is "iterate", which instead of filling the missing observations with mean and mode, it
65 | # gradually adds the imputed variables to the vector of predictors, as it carries out the
66 | # first iteration.
67 | #' @param postimpute (EXPERIMENTAL FEATURE) logical. if TRUE, mlim uses algorithms rather than 'ELNET' for carrying out
68 | #' postimputation optimization. however, if FALSE, all specified algorithms will
69 | #' be used in the process of 'reimputation' together. the 'Ensemble' algorithm
70 | #' is encouraged when other algorithms are used. However, for general users
71 | #' unspecialized in machine learning, postimpute is NOT recommended because this
72 | #' feature is currently experimental, prone to over-fitting, and highly computationally intensive.
73 | #' @param stochastic logical. by default it is set to TRUE for multiple imputation and FALSE for
74 | #' single imputation. stochastic argument is currently under testing and is intended to
75 | #' avoid inflating the correlation between imputed variables.
76 | #' @param ignore character vector of column names or index of columns that
77 | #' should be ignored in the process of imputation.
78 | #' @param tuning_time integer. maximum runtime (in seconds) for fine-tuning the
79 | #' imputation model for each variable in each iteration. the default
80 | #' time is 900 seconds but for a large dataset, you
81 | #' might need to provide a larger model development
82 | #' time. this argument also influences \code{max_models},
83 | #' see below. If you are using 'ELNET' algorithm (default),
84 | #' you can be generous with the 'tuning_time' argument because
85 | #' 'ELNET' tunes much faster than the rest and will only
86 | #' produce one model.
87 | #' @param max_models integer. maximum number of models that can be generated in
88 | #' the process of fine-tuning the parameters. this value
89 | #' defaults to 100, meaning that for imputing each variable in
90 | #' each iteration, up to 100 models can be fine-tuned. increasing
91 | #' this value should be consistent with increasing
92 | #' \code{max_model_runtime_secs}, allowing the model to spend
93 | #' more time in the process of individualized fine-tuning.
94 | #' as a result, the better tuned the model, the more accurate
95 | #' the imputed values are expected to be
96 | #' @param autobalance logical. if TRUE (default), binary and multinomial factor variables
97 | #' will be balanced before the imputation to obtain fairer
98 | #' and less-biased imputations, which are typically in favor
99 | #' of the majority class.
100 | #' if FALSE, imputation fairness will be sacrificed for overall accuracy, which
101 | #' is not recommended, although it is commonly practiced in other missing data
102 | #' imputation software. MLIM is highly concerned with imputation fairness for
103 | #' factor variables and autobalancing is generally recommended.
104 | #' in fact, higher overall accuracy does not mean a better imputation as
105 | #' long as minority classes are neglected, which increases the bias in favor of the
106 | #' majority class. if you do not wish to autobalance all the
107 | #' factor variables, you can manually specify the variables
108 | #' that should be balanced using the 'balance' argument (see below).
109 | #'
110 | # NOTE: when a variable is balanced prior to the imputation, a different
111 | # bootstrap sampling procedure will be used. in doing so, instead of
112 | # carrying out bootstrap subsamples with replacement and adding the
113 | # duplicated observations as weights in the imputation, undersampling
114 | # bootstrap procedure without replacement is performed because the weights
115 | # of the artificially balanced data will conflicts the weights of the
116 | # bootstrap data.
117 | #' @param balance character vector, specifying variable names that should be
118 | #' balanced before imputation. balancing the prevalence might
119 | #' decrease the overall accuracy of the imputation, because it
120 | #' attempts to ensure the representation of the rare outcome.
121 | #' this argument is optional and intended for advanced users that
122 | #' impute a severely imbalance categorical (nominal) variable.
123 | #' @param matching logical. if \code{TRUE}, imputed values are coerced to the
124 | #' closest value to the non-missing values of the variable.
125 | #' if set to "AUTO", 'mlim' decides whether to match
126 | #' or not, based on the variable classes. the default is "AUTO".
127 | # @param ignore.rank logical, if FALSE (default), ordinal variables
128 | # are imputed as continuous integers with regression plus matching
129 | # and are reverted to ordinal later again. this procedure is
130 | # recommended. if FALSE, the rank of the categories will be ignored
131 | # then the algorithm will try to optimize for classification accuracy.
132 | # WARNING: the latter often results in very high classification accuracy but at
133 | # the cost of higher rank error. see the "mlim.error" function
134 | # documentation to see how rank error is computed. therefore, if you
135 | # intend to carry out analysis on the rank data as numeric, it is
136 | # recommended that you set this argument to FALSE.
137 | #' @param maxiter integer. maximum number of iterations. the default value is \code{15},
138 | #' but it can be reduced to \code{3} (not recommended, see below).
139 | # @param miniter integer. minimum number of iterations. the default value is
140 | # 2.
141 | #' @param flush logical (experimental). if TRUE, after each model, the server is
142 | #' cleaned to retrieve RAM. this feature is in testing mode and is
143 | #' currently set to FALSE by default, but it is recommended if you
144 | #' have limited amount of RAM or large datasets.
145 | #' @param cv logical. specify number of k-fold Cross-Validation (CV). values of
146 | #' 10 or higher are recommended. default is 10.
147 | # @param error_metric character. specify the minimum improvement
148 | # in the estimated error to proceed to the
149 | # following iteration or stop the imputation.
150 | # the default is 10^-4 for \code{"MAE"}
151 | # (Mean Absolute Error). this criteria is only
152 | # applied from the end of the fourth iteration.
153 | # \code{"RMSE"} (Root Mean Square
154 | # Error). other possible values are \code{"MSE"},
155 | # \code{"MAE"}, \code{"RMSLE"}.
156 | #' @param tolerance numeric. the minimum rate of improvement in estimated error metric
157 | #' of a variable to qualify the imputation for another round of iteration,
158 | #' if the \code{maxiter} is not yet reached. any improvement of imputation
159 | #' is desirable. however, specifying values above 0 can reduce the number
160 | #' of required iterations at a marginal increase of imputation error.
161 | #' for larger datasets, value of "1e-3" is recommended to reduce number
162 | #' of iterations. the default value is '1e-3'.
163 | #' @param doublecheck logical. default is TRUE (which is conservative). if FALSE, if the estimated
164 | #' imputation error of a variable does not improve, the variable
165 | #' will be not reimputed in the following iterations. in general,
166 | #' deactivating this argument will slightly reduce the imputation
167 | #' accuracy, however, it significantly reduces the computation time.
168 | #' if your dataset is large, you are advised to set this argument to
169 | #' FALSE. (EXPERIMENTAL: consider that by avoiding several iterations
170 | #' that marginally improve the imputation accuracy, you might gain
171 | #' higher accuracy by investing your computational resources in fine-tuning
172 | #' better algorithms such as "GBM")
173 | #'
174 | # @param stopping_metric character.
175 | # @param stopping_rounds integer.
176 | # @param stopping_tolerance numeric.
177 | # @param weights_column non-negative integer. a vector of observation weights
178 | # can be provided, which should be of the same length
179 | # as the dataframe. giving an observation a weight of
180 | # Zero is equivalent of ignoring that observation in the
181 | # model. in contrast, a weight of 2 is equivalent of
182 | # repeating that observation twice in the dataframe.
183 | # the higher the weight, the more important an observation
184 | # becomes in the modeling process. the default is NULL.
185 | #' @param seed integer. specify the random generator seed
186 | # @param plot logical. If TRUE, estimated error of the imputed dataset is plotted,
187 | # showing the reduction in CV error
188 |
189 | #' @param report filename. if a filename is specified (e.g. report = "mlim.md"), the \code{"md.log"} R
190 | #' package is used to generate a Markdown progress report for the
191 | #' imputation. the format of the report is adopted based on the
192 | #' \code{'verbosity'} argument. the higher the verbosity, the more
193 | #' technical the report becomes. if verbosity equals "debug", then
194 | #' a log file is generated, which includes time stamp and shows
195 | #' the function that has generated the message. otherwise, a
196 | #' reduced markdown-like report is generated. default is NULL.
197 | #' @param verbosity character. controls how much information is printed to console.
198 | #' the value can be "warn" (default), "info", "debug", or NULL.
199 | # @param init logical. should h2o Java server be initiated? the default is TRUE.
200 | # however, if the Java server is already running, set this argument
201 | # to FALSE.
202 | #' @param cpu integer. number of CPUs to be dedicated for the imputation.
203 | #' the default takes all of the available CPUs.
204 | #' @param ram integer. specifies the maximum size, in Gigabytes, of the
205 | #' memory allocation. by default, all the available memory is
206 | #' used for the imputation.
207 | #' large memory size is particularly advised, especially
208 | #' for multicore processes. the more you give the more you get!
209 | #' @param preimputed.data data.frame. if you have used another software for missing
210 | #' data imputation, you can still optimize the imputation
211 | #' by handing the data.frame to this argument, which will
212 | #' bypass the "preimpute" procedure.
213 | #' @param save filename (with .mlim extension). if a filename is specified, an \code{mlim} object is
214 | #' saved after the end of each variable imputation. this object not only
215 | #' includes the imputed dataframe and estimated cross-validation error, but also
216 | #' includes the information needed for continuing the imputation,
217 | #' which is very useful feature for imputing large datasets, with a
218 | #' long runtime. this argument is activated by default and an
219 | #' mlim object is stored in the local directory named \code{"mlim.rds"}.
220 | #' @param load filename (with .mlim extension). an object of class "mlim", which includes the data, arguments,
221 | #' and settings for re-running the imputation, from where it was
222 | #' previously stopped. the "mlim" object saves the current state of
223 | #' the imputation and is particularly recommended for large datasets
224 | #' or when the user specifies a computationally extensive settings
225 | #' (e.g. specifying several algorithms, increasing tuning time, etc.).
226 | # @param force.load (NOT YET IMPLEMENTED FOR R).logical (default is TRUE). if TRUE, when loading the mlim class
227 | # object, its preserved settings are used for restoring and saving the
228 | # following itterations. otherwise, if FALSE, the current arguments of
229 | # mlim are used to overpower the settings of the mlim object. the settings
230 | # include the full list of the mlim arguments.
231 | #' @param shutdown logical. if TRUE, h2o server is closed after the imputation.
232 | #' the default is TRUE
233 | #' @param java character, specifying path to the executable 64bit Java JDK on the
234 | #' Microsoft Windows machines, if JDK is installed but the path environment
235 | #' variable is not set.
236 | #' @param ... arguments that are used internally between 'mlim' and 'mlim.postimpute'.
237 | #' these arguments are not documented in the help file and are not
238 | #' intended to be used by end user.
239 | #' @return a \code{data.frame}, showing the
240 | #' estimated imputation error from the cross validation within the data.frame's
241 | #' attribution
242 | #' @author E. F. Haghish
243 | #'
244 | #' @examples
245 | #'
246 | #' \dontrun{
247 | #' data(iris)
248 | #'
249 | #' # add stratified missing observations to the data. to make the example run
250 | #' # faster, I add NAs only to a single variable.
251 | #' dfNA <- iris
252 | #' dfNA$Species <- mlim.na(dfNA$Species, p = 0.1, stratify = TRUE, seed = 2022)
253 | #'
254 | #' # run the ELNET single imputation (fastest imputation via 'mlim')
255 | #' MLIM <- mlim(dfNA, shutdown = FALSE)
256 | #'
257 | #' # in single imputation, you can estimate the imputation accuracy via cross validation RMSE
258 | #' mlim.summarize(MLIM)
259 | #'
260 | #' ### or if you want to carry out ELNET multiple imputation with 5 datasets.
261 | #' ### next, to carry out analysis on the multiple imputation, use the 'mlim.mids' function
262 | #' ### minimum of 5 datasets
263 | #' MLIM2 <- mlim(dfNA, m = 5)
264 | #' mids <- mlim.mids(MLIM2, dfNA)
265 | #' fit <- with(data=mids, exp=glm(Species ~ Sepal.Length, family = "binomial"))
266 | #' res <- mice::pool(fit)
267 | #' summary(res)
268 | #'
269 | #' # you can check the accuracy of the imputation, if you have the original dataset
270 | #' mlim.error(MLIM2, dfNA, iris)
271 | #
272 | # ### run GBM, RF, ELNET, and Ensemble algos and allow 60 minutes of tuning for each variable
273 | # ### this requires a lot of RAM on your machine and a lot of time!
274 | # MLIM <- mlim(dfNA, algos = c("GBM", "RF","ELNET","Ensemble"), tuning_time=60*60)
275 | # mlim.error(MLIM, dfNA, iris)
276 | #
# ### if you have a larger dataset, there are a few things you can set to make the
# ### algorithm faster, yet having only a marginal accuracy reduction as a trade-off
279 | # MLIM <- mlim(dfNA, algos = 'ELNET', tolerance = 1e-3, doublecheck = FALSE)
280 | #' }
281 | #' @export
282 |
283 |
mlim <- function(data = NULL,
                 m = 1,
                 algos = c("ELNET"), #impute, postimpute
                 postimpute = FALSE,
                 stochastic = m > 1,
                 ignore = NULL,

                 # computational resources
                 tuning_time = 900,
                 max_models = NULL, # run all that you can
                 maxiter = 10L,
                 #miniter = 2L,
                 cv = 10L,
                 #validation = 0,

                 matching = "AUTO", #EXPERIMENTAL
                 autobalance = TRUE,
                 balance = NULL, #EXPERIMENTAL
                 #ignore.rank = FALSE, #to ignore it, they should make it unordered!
                 # weights_column = NULL,

                 # report and reproducibility
                 seed = NULL,
                 verbosity = NULL,
                 report = NULL,

                 # stopping criteria
                 tolerance = 1e-3,
                 doublecheck = TRUE,

                 ## simplify the settings by taking these arguments out
                 preimpute = "RF",
                 #impute = "AUTO",
                 #postimpute = "AUTO",
                 #error_metric = "RMSE", #??? mormalize it
                 #stopping_metric = "AUTO",
                 #stopping_rounds = 3,
                 #stopping_tolerance=1e-3,

                 # setup the h2o cluster
                 cpu = -1,
                 ram = NULL,
                 flush = FALSE,


                 # NOT YET IMPLEMENTED
                 preimputed.data = NULL,
                 save = NULL,
                 load = NULL,
                 #init = TRUE,
                 shutdown = TRUE,
                 java = NULL,
                 #force.load = TRUE,
                 ...
                 ) {

  # mlim: single (m == 1) or multiple (m > 1) imputation of missing data via
  # automated machine learning on the h2o backend. Returns an object of class
  # c("mlim", "data.frame") for single imputation, or "mlim.mi" (a list of
  # imputed data.frames) for multiple imputation (see the class assignment at
  # the bottom of this function).

  # improvements for the next release
  # ============================================================
  # instead of using all the algorithms at each iteration, add the
  # other algorithms when the first algorithm stops being useful.
  # perhaps this will help optimizing, while reducing the computation burden
  # h2o DRF does not give OOB error, so initial comparison preimputation is not possible
  # HOWEVER, I can estimate the CV for the preimputation procedure
  #
  # instead of adding postimpute_algos, extract it from specified algorithms

  # check the ... arguments
  # ============================================================
  # only the whitelisted internal argument names below may be passed through
  # '...'; anything else is an error (protects against typos in named args).
  hidden_args <- c("superdebug", "init", "ignore.rank", "sleep", "stochastic")
  stopifnot(
    "incompatible '...' arguments" = (names(list(...)) %in% hidden_args)
  )

  # Simplify the syntax by taking arguments that are less relevant to the majority
  # of the users out
  # ============================================================
  #stopping_metric <- "AUTO"
  #stopping_rounds <- 3
  #stopping_tolerance <- 1e-3
  MI <- list()
  bdata <- NULL
  metrics <- NULL
  error <- NULL
  debug <- FALSE
  miniter <- 2L
  # threeDots() pulls a hidden argument out of '...' or falls back to the
  # given default
  init <- threeDots(name = "init", ..., default = TRUE)
  # cv <- threeDots(name = "cv", ..., default = 10L)
  # flush <- threeDots(name = "flush", ..., default = TRUE)
  verbose <- 0
  error_metric<- "RMSE"
  #preimpute <- "RF"
  ignore.rank <- threeDots(name = "ignore.rank", ..., default = FALSE) #EXPERIMENTAL
  sleep <- threeDots(name = "sleep", ..., default = .25)
  superdebug <- threeDots(name = "superdebug", ..., default = FALSE)
  #stochastic <- threeDots(name = "stochastic", ..., default = FALSE)



  # ============================================================
  # ============================================================
  # LOAD SETTINGS FROM mlim class object
  # ============================================================
  # ============================================================
  # 'load' resumes an interrupted run: it is either a path to a saved RDS file
  # or an 'mlim' object. Every piece of state below (data, loop counters, and
  # settings) is restored from it, then iterationNextVar() advances to the
  # next variable to impute.
  if (!is.null(load)) {
    if (inherits(load, "character")) load <- readRDS(load)
    if (!inherits(load, "mlim")) stop("loaded object must be of class 'mlim'")

    # Data
    # ----------------------------------
    MI <- load$MI # dataLast or multiple-imputation data
    dataNA <- load$dataNA
    preimputed.data<- load$preimputed.data
    data <- load$data # preimputed dataset that is constantly updated
    #bdata <- load$bdata
    #dataLast <- load$dataLast
    metrics <- load$metrics
    mem <- load$mem
    orderedCols <- load$orderedCols

    # Loop data
    # ----------------------------------
    m <- load$m # number of datasets to impute
    m.it <- load$m.it # current dataset to impute
    k <- load$k # current loop number (global imputation iteration)
    z <- load$z # current local iteration number
    X <- load$X
    Y <- load$Y # last-imputed imputed variable. outside the 'load' argument, it means current variable to be imputed
    vars2impute <- load$vars2impute
    FAMILY <- load$FAMILY

    # settings
    # ----------------------------------
    ITERATIONVARS <- load$ITERATIONVARS# variables to be imputed
    impute <- load$impute # reimputation algorithm(s)
    postimputealgos<- load$postimputealgos
    autobalance <- load$autobalance
    balance <- load$balance
    ignore <- load$ignore
    save <- load$save
    maxiter <- load$maxiter
    miniter <- load$miniter
    cv <- load$cv
    tuning_time <- load$tuning_time
    max_models <- load$max_models
    matching <- load$matching
    ignore.rank <- load$ignore.rank #KEEP IT HIDDEN
    #weights_column <- load$weights_column
    seed <- load$seed
    verbosity <- load$verbosity
    verbose <- load$verbose #KEEP IT HIDDEN
    debug <- load$debug #KEEP IT HIDDEN
    report <- load$report
    flush <- load$flush
    error_metric <- load$error_metric #KEEP IT HIDDEN
    error <- load$error #KEEP IT HIDDEN
    tolerance <- load$tolerance
    cpu <- load$cpu
    max_ram <- load$max_ram
    min_ram <- load$min_ram #KEEP IT HIDDEN
    keep_cv <- load$keep_cv
    pkg <- load$pkg #KEEP IT HIDDEN


    # MOVE-ON to the next variable after loading an mlim object
    # ---------------------------------------------------------
    moveOn <- iterationNextVar(m, m.it, k, z, Y, ITERATIONVARS, maxiter)
    m <- moveOn$m
    m.it <- moveOn$m.it
    k <- moveOn$k
    z <- moveOn$z
    Y <- moveOn$Y
  }

  # ============================================================
  # ============================================================
  # Prepare the imputation settings
  # ============================================================
  # ============================================================
  # (NOTE: this 'else' legally continues the 'if (!is.null(load))' above;
  # R keeps the if-statement open across comments inside a braced body.)
  else {
    if (!is.null(seed)) set.seed(seed) # avoid setting seed by default if it is a continuation

    # translate user-facing 'algos'/'postimpute' into imputation and
    # postimputation algorithm sets
    alg <- algoSelector(algos, postimpute)
    # preimpute <- "RF" #alg$preimpute ## for now, make this global
    impute <- alg$impute
    postimputealgos <- alg$postimpute

    # validate the argument combination and derive RAM limits / verbosity flags
    synt <- syntaxProcessing(data, preimpute, impute, ram,
                             matching=matching, miniter, maxiter, max_models,
                             tuning_time, cv, verbosity=verbosity, report, save)
    min_ram <- synt$min_ram
    max_ram <- synt$max_ram
    keep_cv <- synt$keep_cross_validation_predictions
    verbose <- synt$verbose
    debug <- synt$debug
  }

  # disable h2o progress_bar
  #if (!debug) h2o::h2o.no_progress()
  if (!superdebug) h2o::h2o.no_progress()

  # ============================================================
  # Initialize the Markdown report
  # ============================================================
  # no 'report' requested: log to a throwaway tempfile; fresh run: start a new
  # report file; resumed run: append to the existing report.
  if (is.null(report)) md.log("System information", file=tempfile(),
                              trace=TRUE, sys.info = TRUE, date=TRUE, time=TRUE)

  else if (is.null(load)) md.log("System information", file=report,
                                 append = FALSE, trace=TRUE, sys.info = TRUE,
                                 date=TRUE, time=TRUE) #, print=TRUE

  else if (!is.null(load)) md.log("\nContinuing from where it was left...", file=report,
                                  append = TRUE, trace=TRUE, sys.info = TRUE,
                                  date=TRUE, time=TRUE)

  # Run H2O on the statistics server
  # ============================================================
  # NOTE(review): the logical flag 'init' shadows the package's init()
  # function; R skips non-function bindings when resolving a call, so
  # init(...) below still dispatches to the function — confusing but valid.
  if (init) {
    #sink(file = report, append = TRUE)
    #message("\n") # for Markdown styling
    capture.output(connection <- init(nthreads = cpu,
                                      min_mem_size = min_ram,
                                      max_mem_size = max_ram,
                                      ignore_config = TRUE,
                                      java = java,
                                      report, debug),
                   file = report, append = TRUE)
    #sink()

    ## ??? DO NOT CLOSE ALL THE CONNECTIONS
    #sink.reset <- function(){
    #  for(i in seq_len(sink.number())){
    #    sink(NULL)
    #  }
    #}
    #sink.reset()


    ##closeAllConnections()
    ##print(connection)
  }

  # Identify variables for imputation and their models' families
  # ============================================================
  # only for fresh runs; a resumed run already restored these from 'load'
  if (is.null(load)) {
    VARS <- selectVariables(data, ignore, verbose, report)

    dataNA <- VARS$dataNA # the missing data placeholder
    allPredictors <- VARS$allPredictors
    vars2impute <- VARS$vars2impute
    vars2postimpute <- VARS$vars2impute
    storeVars2impute <- vars2impute
    X <- VARS$X
    bdata <- NULL

    # if there is only one variable to impute, there is no need to iterate!
    if (length(vars2impute) < 1) stop("\nthere is nothing to impute!\n")
    else if (length(vars2impute) == 1) {
      if (!is.valid(postimputealgos)) {
        maxiter <- 1
      }
    }

    # .........................................................
    # check the variables for compatibility
    # .........................................................
    # if preimputed data is provided, take it into consideration!
    if (!is.null(preimputed.data)) {

      # if a multiple imputation object is given, take the first dataset
      # ??? in the future, consider that each of the given datasets can
      # be fed independently as a separate "m". for now, this is NOT AN
      # announced feature and thus, just take the first dataset as preimputation
      if (inherits(preimputed.data, "mlim.mi")) {
        #preimputed.data <- preimputed.data[[1]]
        #stop("use 'mlim.postimpute' function for postimputing multiple imputation datasets\n")
        stop("multiple imputation datasets cannot be used as 'preimputed.data'\n")
      }

      # if the preimputation was done with mlim, extract the metrics
      else if (inherits(preimputed.data, "mlim")) {


        # remove the NAs of the last imputation and replace them with
        # the minimum
        metrics <- getMetrics(preimputed.data)
      }

      # SAVE RAM: if preimputed.data is given, replace the original data because
      # its missing data is reserved within dataNA
      data <- preimputed.data

      # reset the relevant predictors
      X <- allPredictors
    }

    # coerce/validate variable types and record factor memory + ordered columns
    Features <- checkNconvert(data, vars2impute, ignore,
                              ignore.rank=ignore.rank, report)

    FAMILY<- Features$family

    # data <- Features$data ##> this will be moved inside the loop because
    #                           in multiple imputation, we want to start over
    #                           every time!
    mem <- Features$mem
    orderedCols <- Features$orderedCols

    # .........................................................
    # PREIMPUTATION: replace data with preimputed data
    # .........................................................
    if (preimpute != "iterate" & is.null(preimputed.data)) {

      # preimpute in single imputation ONLY. for multiple imputation, each
      # bootstrap dataset is imputed separately
      if (m == 1) {
        data <- mlim.preimpute(data=data, preimpute=preimpute, seed = NULL) # DO NOT RESET THE SEED!
      }

      # reset the relevant predictors
      X <- allPredictors
    }

    # .........................................................
    # Remove 'Features', but keep 'preimputed.data' in MI
    # .........................................................
    if (m > 1) preimputed.data <- Features$data
    else preimputed.data <- NULL
    rm(Features)
    gc()
  }


  # ............................................................
  # ............................................................
  # ITERATION LOOP
  # ............................................................
  # ............................................................
  # fresh runs start counters at 1 and seed the per-variable error vector
  # at 1; resumed runs keep the counters restored from 'load'
  if (is.null(load)) {
    k <- 1L
    z <- 1L
    m.it <- 1L
    MI <- NULL
    error <- setNames(rep(1, length(vars2impute)), vars2impute)
  }

  # drop 'load' from the memory
  # ---------------------------
  rm(load)
  gc()
  load <- NULL

  # ??? bdata must be NULL at the beginning of each iteration. Currently
  # this is NOT happening when the 'mlim' object is loaded

  # resume-aware loop over the m imputed datasets (m.it may start > 1 after
  # loading). NOTE(review): if a restored m.it ever exceeded m, 'm.it:m'
  # would count backwards and re-run datasets — presumably iterationNextVar()
  # prevents that state; confirm.
  for (m.it in m.it:m) {

    # Start the new imputation data fresh, if it is multiple imputation
    if (k == 1 & z == 1) {
      if (!is.null(preimputed.data)) data <- preimputed.data
      md.log(paste("Dataset", m.it), section="section")
    }

    #it is always NULL. It doesn't have to be saved
    bdata <- NULL
    # iteration_loop() carries out the full iterative (re)imputation for the
    # current dataset and returns the final imputed data.frame
    dataLast <- iteration_loop(MI, dataNA, preimputed.data, data, bdata, boot=m>1,
                               metrics, tolerance, doublecheck,
                               m, k, X, Y, z, m.it,
                               # loop data
                               vars2impute, vars2postimpute, storeVars2impute,
                               allPredictors, preimpute, impute, postimputealgos,
                               # settings
                               error_metric, FAMILY=FAMILY, cv, tuning_time,
                               max_models,
                               keep_cv,
                               autobalance, balance, seed, save, flush,
                               verbose, debug, report, sleep,
                               # saving settings
                               mem, orderedCols, ignore, maxiter,
                               miniter, matching, ignore.rank,
                               verbosity, error, cpu, max_ram=max_ram, min_ram=min_ram,
                               #??? shutdown has to be fixed in future updates
                               shutdown=FALSE, clean = TRUE,
                               stochastic=stochastic)

    if (m > 1) MI[[m.it]] <- dataLast
    else MI <- dataLast
  }

  message("\n")

  # optionally shut down the h2o cluster once all datasets are imputed
  if (shutdown) {
    md.log("shutting down the server", trace=FALSE)
    h2o::h2o.shutdown(prompt = FALSE)
    Sys.sleep(sleep)
  }

  # tag the result with mlim's S3 classes so downstream functions
  # (e.g. mlim.summarize, mlim.mids) can recognize it
  if (m > 1) class(MI) <- "mlim.mi"
  else class(MI) <- c("mlim", "data.frame")

  return(MI)
}
684 |
685 |
686 |
687 |
688 |
--------------------------------------------------------------------------------