├── LICENSE
├── .Rbuildignore
├── data
│   ├── charity.rda
│   └── manifest.rda
├── man
│   ├── figures
│   │   ├── mlim.png
│   │   ├── MLIM_text.png
│   │   ├── charity.png
│   │   ├── flowchart.pdf
│   │   ├── flowchart.png
│   │   ├── procedure3.pdf
│   │   ├── procedure3.png
│   │   ├── procedure4.pdf
│   │   ├── procedure4.png
│   │   ├── procedure5.png
│   │   ├── bfi_binary_p15.png
│   │   ├── flowchart_base.png
│   │   ├── iris_mpce_p15.png
│   │   ├── bfi_missrank_p15.png
│   │   ├── titanic_mpce_p15.png
│   │   ├── iris_continuous_p15.png
│   │   ├── sat.act_nrmse_p15.png
│   │   ├── flowchart_optimization.png
│   │   ├── wiki.svg
│   │   ├── manual.svg
│   │   ├── handbook_book.svg
│   │   ├── handbook_lamp.svg
│   │   ├── handbook_stupid.svg
│   │   ├── Untitled.drawio
│   │   ├── flowchart_optimization.drawio
│   │   ├── flowchart_base.drawio
│   │   ├── flowchart.drawio
│   │   ├── procedure2.drawio
│   │   ├── procedure.drawio
│   │   ├── procedure3.drawio
│   │   ├── procedure4.drawio
│   │   └── procedure5.drawio
│   ├── charity.Rd
│   ├── mlim.summarize.Rd
│   ├── mlim.mids.Rd
│   ├── mlim.preimpute.Rd
│   ├── mlim.na.Rd
│   ├── mlim.error.Rd
│   ├── manifest.Rd
│   └── mlim.Rd
├── .gitignore
├── R
│   ├── defaultCV.R
│   ├── normalize.R
│   ├── getDigits.R
│   ├── javaServer.R
│   ├── factmem.R
│   ├── charity.R
│   ├── threeDots.R
│   ├── addNA.R
│   ├── is.valid.R
│   ├── bootstrapWeight.R
│   ├── is.imbalanced.R
│   ├── zzz.R
│   ├── stochasticFactorImpute.R
│   ├── missclass.R
│   ├── nrmse.R
│   ├── missrank.R
│   ├── getMetrics.R
│   ├── revert.R
│   ├── meanmode.R
│   ├── matching.R
│   ├── h2o.check.R
│   ├── miss_per_class_error.R
│   ├── server.check.R
│   ├── mlim.mids.R
│   ├── selectVariables.R
│   ├── extractMetrics.R
│   ├── iterationNextVar.R
│   ├── init.R
│   ├── mlim.shuffle.R
│   ├── algoSelector.R
│   ├── bootstrapCV.R
│   ├── mlim.summarize.R
│   ├── mlim.postimpute.R
│   ├── mlim.preimpute.R
│   ├── mlim.na.R
│   ├── syntaxProcessing.R
│   ├── stoppingCriteria.R
│   ├── checkNconvert.R
│   ├── manifest.R
│   ├── mlim.error.R
│   ├── iteration_loop.R
│   └── mlim.R
├── mlim.Rproj
├── NAMESPACE
├── DESCRIPTION
└── README.md

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
YEAR: 2022
COPYRIGHT HOLDER: E. F. Haghish
--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
^.*\.Rproj$
^\.Rproj\.user$
^\web
^.log

--------------------------------------------------------------------------------
/data/charity.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/data/charity.rda

--------------------------------------------------------------------------------
/data/manifest.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/data/manifest.rda

--------------------------------------------------------------------------------
/man/figures/mlim.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/mlim.png

--------------------------------------------------------------------------------
/man/figures/MLIM_text.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/MLIM_text.png

--------------------------------------------------------------------------------
/man/figures/charity.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/charity.png

--------------------------------------------------------------------------------
/man/figures/flowchart.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/flowchart.pdf

--------------------------------------------------------------------------------
/man/figures/flowchart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/flowchart.png

--------------------------------------------------------------------------------
/man/figures/procedure3.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/procedure3.pdf

--------------------------------------------------------------------------------
/man/figures/procedure3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/procedure3.png

--------------------------------------------------------------------------------
/man/figures/procedure4.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/procedure4.pdf

--------------------------------------------------------------------------------
/man/figures/procedure4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/procedure4.png

--------------------------------------------------------------------------------
/man/figures/procedure5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/procedure5.png

--------------------------------------------------------------------------------
/man/figures/bfi_binary_p15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/bfi_binary_p15.png

--------------------------------------------------------------------------------
/man/figures/flowchart_base.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/flowchart_base.png

--------------------------------------------------------------------------------
/man/figures/iris_mpce_p15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/iris_mpce_p15.png

--------------------------------------------------------------------------------
/man/figures/bfi_missrank_p15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/bfi_missrank_p15.png

--------------------------------------------------------------------------------
/man/figures/titanic_mpce_p15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/titanic_mpce_p15.png

--------------------------------------------------------------------------------
/man/figures/iris_continuous_p15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/iris_continuous_p15.png

--------------------------------------------------------------------------------
/man/figures/sat.act_nrmse_p15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/sat.act_nrmse_p15.png

--------------------------------------------------------------------------------
/man/figures/flowchart_optimization.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/flowchart_optimization.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.Rproj.user
.Rhistory
.RData
.Ruserdata
.DS_Store
*.log
TEST.R

--------------------------------------------------------------------------------
/R/defaultCV.R:
--------------------------------------------------------------------------------
defaultCV <- function(n, min = 10) {
  cv <- NULL
  v <- n / min
  if (v >= 150) cv <- 15
  return(cv)
}

--------------------------------------------------------------------------------
/R/normalize.R:
--------------------------------------------------------------------------------
#' @title normalize
#' @description normalizes a variable, scaling it to range from 0 to 1
#' @return numeric vector
#' @author E. F. Haghish
#' @keywords Internal
#' @noRd

normalize <- function(x){
  # the parentheses must wrap the full ratio; otherwise return() exits
  # with the numerator alone and the division is never evaluated
  return((x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE)))
}
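A minimal sanity check for normalize() (editor's sketch, not a file in the package; assumes the function above is sourced and that x holds at least two distinct non-NA values, so the denominator is nonzero):

x <- c(2, 4, 6, 10)   # hypothetical input vector
normalize(x)
# 0.00 0.25 0.50 1.00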
--------------------------------------------------------------------------------
/R/getDigits.R:
--------------------------------------------------------------------------------
#' @title getDigits
#' @description extracts the number of digits
#' @author E. F. Haghish
#' @return integer
#' @keywords Internal
#' @noRd

getDigits <- function(x) {
  if (!is.null(x)) {
    result <- floor(log10(abs(x)))
    result[!is.finite(result)] <- 0
    return(abs(result))
  }
  else return(0)
}

--------------------------------------------------------------------------------
/R/javaServer.R:
--------------------------------------------------------------------------------
#' @title java server interaction
#' @description provides commands to interact with the java server
#' @param action character
#' @return character (java command)
#' @author E. F. Haghish
#' @keywords Internal
#' @noRd
javaServer <- function(action){
  result <- NULL
  if (action == "flush") result <- "h2o:::.h2o.garbageCollect()"
  return(result)
}

--------------------------------------------------------------------------------
/R/factmem.R:
--------------------------------------------------------------------------------
#' @title factmem
#' @description memorizes factors' levels
#' @return list
#' @author E. F. Haghish
#' @keywords Internal
#' @noRd
factmem <- function(df) {
  mem <- list()
  for (i in 1:ncol(df)) {
    lev <- levels(df[,i])
    len <- 1:length(lev)
    nam <- colnames(df)[i]
    mem[[i]] <- list(names = nam, length = len, level = lev)
  }

  return(mem)
}
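A hedged sketch of the factmem() bookkeeping (editor's addition; assumes the definition above is sourced; the data frame is invented):

df <- data.frame(g = factor(c("low", "high", "low")), y = 1:3)
mem <- factmem(df)
mem[[1]]
# $names  : "g"
# $length : 1 2
# $level  : "high" "low"
# non-factor columns such as y get level = NULL, so revert() (further below) skips them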
--------------------------------------------------------------------------------
/R/charity.R:
--------------------------------------------------------------------------------
#' some items about attitude towards charity
#'
#' A dataset containing Likert-scale items about attitude towards charity
#'
#' @format A data frame with 832 rows and 5 variables:
#' \describe{
#'   \item{ta1}{Charitable Organizations More Effective}
#'   \item{ta2}{Degree of Trust}
#'   \item{ta3}{Charitable Organizations Honest/Ethical}
#'   \item{ta4}{Role Improving Communities}
#'   \item{ta5}{Job Delivering Services}
#' }
#' @source \url{https://www.stata.com/}
"charity"

--------------------------------------------------------------------------------
/R/threeDots.R:
--------------------------------------------------------------------------------
#' @title process ... arguments
#' @description extracts a value from the '...' argument and passes it to the scalars
#' @param name character. the name of the object of interest
#' @param ... arguments list
#' @param default the value to return when 'name' is not supplied in '...'
#' @return value of the object of interest
#' @author E. F. Haghish
#' @keywords Internal
#' @noRd
threeDots <- function(name, ..., default){
  dotList <- list(...)
  dotName <- names(dotList)
  if (name %in% dotName) {
    return(dotList[[name]])
  }
  else return(default)
}
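A short usage sketch of threeDots() (editor's addition; the argument names are invented):

f <- function(...) threeDots("alpha", ..., default = 0.05)
f(alpha = 0.01, beta = 2)   # 0.01 -- picked out of '...'
f(beta = 2)                 # 0.05 -- falls back to the default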
--------------------------------------------------------------------------------
/mlim.Rproj:
--------------------------------------------------------------------------------
Version: 1.0

RestoreWorkspace: No
SaveWorkspace: No
AlwaysSaveHistory: No

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX

AutoAppendNewline: Yes
StripTrailingWhitespace: Yes

BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
PackageRoxygenize: rd,collate,namespace,vignette

QuitChildProcessesOnExit: Yes
DisableExecuteRprofile: Yes

--------------------------------------------------------------------------------
/R/addNA.R:
--------------------------------------------------------------------------------
#' @title add NA in a vector
#' @description generates NA and replaces the actual values of a vector
#'              with NA
#' @author E. F. Haghish
#' @keywords Internal
#' @noRd
addNA <- function(x, p, stratify=FALSE) {

  if (stratify & "factor" %in% class(x)) {
    levs <- levels(x)
    for (l in levs) {
      index <- which(x == l)
      len <- length(index)
      x[index][sample(len, round(p * len))] <- NA
    }
  }
  else {
    len <- length(x)
    x[sample(len, round(p * len))] <- NA # sample without replacement
  }
  return(x)
}
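A minimal sketch of addNA() (editor's addition; note that this helper masks base::addNA; the vector is invented):

set.seed(2022)
x <- factor(rep(c("low", "high"), each = 50))
xNA <- addNA(x, p = 0.2, stratify = TRUE)
sum(is.na(xNA))   # 20 missing values, 10 per level (stratified)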
--------------------------------------------------------------------------------
/R/is.valid.R:
--------------------------------------------------------------------------------
# ----------------------------------------------------------
# is.valid
# ==========================================================
#' @title validate an object
#' @description a function to make sure the value is not NULL,
#'              NA, NaN, or Inf
#' @return logical. if TRUE, the object is valid
#' @author E. F. Haghish
#' @keywords Internal
#' @noRd
is.valid = function(x) {
  if (!is.null(x)) {
    if (is.na(x)) {
      return(FALSE)
    }
    else if (is.infinite(x)) {
      return(FALSE)
    }
    else {
      return(TRUE)
    }
  }
  else {
    return(FALSE)
  }
}

--------------------------------------------------------------------------------
/man/charity.Rd:
--------------------------------------------------------------------------------
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/charity.R
\docType{data}
\name{charity}
\alias{charity}
\title{some items about attitude towards charity}
\format{
A data frame with 832 rows and 5 variables:
\describe{
  \item{ta1}{Charitable Organizations More Effective}
  \item{ta2}{Degree of Trust}
  \item{ta3}{Charitable Organizations Honest/Ethical}
  \item{ta4}{Role Improving Communities}
  \item{ta5}{Job Delivering Services}
}
}
\source{
\url{https://www.stata.com/}
}
\usage{
charity
}
\description{
A dataset containing Likert-scale items about attitude towards charity
}
\keyword{datasets}

--------------------------------------------------------------------------------
/R/bootstrapWeight.R:
--------------------------------------------------------------------------------
#' @title bootstrap duplication and weight_column calculator
#' @description identifies duplicated rows in the bootstrap and creates
#'              a weight column counting the duplicated rows to be given
#'              as additional weights. this weight column should be
#'              added to the weight_column given by the user.
#' @author E. F. Haghish
#' @return numeric data.frame including 'rows' and 'weight'. rows is the row number
#'         of the observation and weight is the number of duplications.
#' @keywords Internal
#' @noRd
bootstrapWeight <- function(index) {
  weight <- table(index)
  rows <- as.numeric(names(weight))
  return(as.data.frame(cbind(rows, weight)))
}
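A tiny sketch of bootstrapWeight() (editor's addition; the index is an invented bootstrap draw of row numbers):

idx <- c(1, 1, 2, 5, 5, 5)
bootstrapWeight(idx)   # data.frame: rows = 1, 2, 5; weight = 2, 1, 3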
--------------------------------------------------------------------------------
/R/is.imbalanced.R:
--------------------------------------------------------------------------------
#' @title is.imbalanced
#' @description examines whether a binary or multinomial feature is imbalanced
#' @param x vector.
#' @author E. F. Haghish
#' @return logical. TRUE if the feature is imbalanced.
#' @keywords Internal
#' @noRd

is.imbalanced <- function(x) {
  imbalanced <- FALSE
  prev <- sort(table(x) / length(x))

  # get the difference between the first and last scalars
  diff <- prev[length(prev)] - prev[1]

  # define a criterion for imbalance, to prefer AUCPR to AUC.
  # Here, any difference of 0.3 or more (e.g. 70% vs 30%) is treated as
  # imbalanced, and AUCPR is preferred for tuning
  if (diff >= 0.3) imbalanced <- TRUE

  return(imbalanced)
}

--------------------------------------------------------------------------------
/R/zzz.R:
--------------------------------------------------------------------------------
#' @title package startup
#' @description displays the startup message and sets the package options
#' @importFrom curl curl
#' @author E. F. Haghish
#' @keywords Internal
#' @noRd
#'
.onAttach <- function(lib, pkg) {
  packageStartupMessage('Single and Multiple Imputation with Automated Machine Learning\n
\nNote1: This is free software and comes with no guarantee.
\nNote2: The default algorithm is ELNET, which is the fastest\n       to fine-tune. You can reduce imputation error by\n       adding other algorithms e.g. "RF", "Ensemble", ...
\nNote3: If you find a bug, post it on GitHub\n       https://github.com/haghish/mlim\n')
  options("prefer_RCurl" = FALSE)
  options(timeout = 3600) # "seconds"
}

--------------------------------------------------------------------------------
/R/stochasticFactorImpute.R:
--------------------------------------------------------------------------------
# ----------------------------------------------------------
# stochasticFactorImpute
# ==========================================================
#' @title draws random levels for imputing a factor
#' @description generates stochastic draws from the factor's levels, based on the
#'              estimated probabilities of each level for each missing observation
#' @param levels levels of the factor
#' @param probMat estimated probability of each level for each missing observation
#' @author E. F. Haghish
#' @keywords Internal
#' @noRd

stochasticFactorImpute <- function(levels, probMat)
{
  do.call("c", lapply(seq(nrow(probMat)), function(i)
  {
    sample(x = levels, size = 1, replace = TRUE, prob = probMat[i,])
  }))
}
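A hedged sketch of stochasticFactorImpute() (editor's addition; the probability matrix is invented, one row per missing observation):

probMat <- rbind(c(0.9, 0.1),
                 c(0.2, 0.8))
set.seed(2022)
stochasticFactorImpute(levels = c("no", "yes"), probMat = probMat)
# each missing observation draws one level with its estimated probability,
# e.g. "no" "yes"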
--------------------------------------------------------------------------------
/R/missclass.R:
--------------------------------------------------------------------------------
#' @title misclassification error
#' @description calculates the misclassification rate for each variable
#' @param imputed the imputed dataframe
#' @param incomplete the dataframe with missing values
#' @param complete the original dataframe with no missing values
#' @author E. F. Haghish
#' @keywords Internal
#' @noRd
missclass <- function(imputed, incomplete, complete, rename = TRUE){
  classerror <- NULL
  mis <- as.data.frame(is.na(incomplete))
  index <- which(colSums(mis) > 0)
  for (i in index) {
    missclass <- sum(as.character(as.matrix(imputed[,i])) != as.character(as.matrix(complete[,i])))
    NAs <- sum(is.na(incomplete[, i]))
    classerror <- c(classerror, (missclass / NAs))
  }
  if (rename) names(classerror) <- colnames(incomplete)[index]
  return(classerror)
}

--------------------------------------------------------------------------------
/R/nrmse.R:
--------------------------------------------------------------------------------
#' @title normalized RMSE
#' @description calculates the normalized RMSE
#' @param imputed the imputed dataframe
#' @param incomplete the dataframe with missing values
#' @param complete the original dataframe with no missing values
#' @author E. F. Haghish
#' @keywords Internal
#' @noRd
nrmse <- function(imputed, incomplete, complete){
  nrmse <- NULL
  mis <- as.data.frame(is.na(incomplete))
  index <- which(colSums(mis) > 0)
  for (i in index) {
    v.na <- is.na(incomplete[, i])
    nrmse <- c(nrmse,
               sqrt(mean((imputed[v.na,i] - complete[v.na,i])^{2}) /
                      stats::var(complete[v.na,i]))
    )

  }
  names(nrmse) <- colnames(incomplete)[index]
  return(nrmse)
  #sqrt(mean((imputed[mis] - complete[mis])^{2}) / stats::var(complete[mis]))
}

--------------------------------------------------------------------------------
/R/missrank.R:
--------------------------------------------------------------------------------
#' @title miss ranking error
#' @description calculates the MAE of the missranking, divided by the N-1 ordered categories
#' @param imputed the imputed dataframe
#' @param incomplete the dataframe with missing values
#' @param complete the original dataframe with no missing values
#' @author E. F. Haghish
#' @keywords Internal
#' @noRd
missrank <- function(imputed, incomplete, complete){
  MAE <- NULL
  mis <- as.data.frame(is.na(incomplete))
  index <- which(colSums(mis) > 0)
  for (i in index) {
    imputed[,i] <- as.numeric(imputed[,i])
    incomplete[,i] <- as.numeric(incomplete[,i])
    complete[,i] <- as.numeric(complete[,i])
    v.na <- is.na(incomplete[, i])
    MAE <- c(MAE, (mean(abs(imputed[v.na,i] - complete[v.na,i])) /
                     (length(unique(complete[,i]))-1)
    ))
  }
  names(MAE) <- colnames(incomplete)[index]
  return(MAE)
}
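A small worked example of the error helpers above (editor's sketch; assumes nrmse() and missrank() are sourced; the data frames are invented and must share the same shape):

complete   <- data.frame(y = c(1, 2, 3, 4, 5))
incomplete <- complete; incomplete$y[c(2, 4)] <- NA
imputed    <- complete; imputed$y[c(2, 4)] <- c(2.5, 3.5)
nrmse(imputed, incomplete, complete)      # sqrt(0.25 / var(c(2, 4))) = sqrt(0.125) ~ 0.354
missrank(imputed, incomplete, complete)   # mean(c(0.5, 0.5)) / (5 - 1) = 0.125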
--------------------------------------------------------------------------------
/R/getMetrics.R:
--------------------------------------------------------------------------------
# ----------------------------------------------------------
# getMetrics
# ==========================================================
#' @title retrieves metrics data
#' @description a function to retrieve the metrics data from
#'              datasets preimputed with mlim
#' @param preimputed.data data.frame previously imputed by mlim or
#'                        other programs
#' @author E. F. Haghish
#' @keywords Internal
#' @noRd
getMetrics = function(preimputed.data) {
  metrics <- attributes(preimputed.data)$metrics
  if (!is.null(metrics)) {
    unc <- unique(metrics$variable)
    maxmetrics <- max(metrics$iteration, na.rm = TRUE)
    for (i in unc) {
      metrics[(metrics$iteration == maxmetrics & metrics$variable == i),
              "RMSE"] <- min(metrics$RMSE[metrics$variable == i], na.rm = TRUE)
    }
    return(metrics)
  }
  else return(NULL)
}

--------------------------------------------------------------------------------
/R/revert.R:
--------------------------------------------------------------------------------
#' @title revert
#' @description using the factmem object, integer variables are reverted to the
#'              original variable type of "ordered factors".
#' @return data.frame
#' @author E. F. Haghish
#' @keywords Internal
#' @noRd

revert <- function(df, factmem) {
  cols <- colnames(df)
  for (i in 1:length(factmem)) {
    data <- factmem[[i]]   # each element is a list(names, length, level), as built by factmem()
    if (!is.null(data$names) & !is.null(data$level)) {
      if (data$names %in% cols) {
        df[, data$names] <- factor(as.character(round(df[, data$names])),
                                   levels = as.character(data$length),
                                   labels = data$level,
                                   ordered = TRUE)
      }

    }
  }

  return(df)
}

#revert(round(mlimELNET[, 1:3]), MEM)
#print(head(mlimELNET$A1))

#revert(LAST[, COLS, drop = FALSE], MEM)

--------------------------------------------------------------------------------
/R/meanmode.R:
--------------------------------------------------------------------------------
#' @title mean and mode imputation
#' @description imputes the missing values with median (numeric variables) and
#'              mode (categorical variables) replacement
#' @importFrom stats median
#' @param data A data frame with dummies or numeric variables.
#' @return imputed dataset
#' @author E. F. Haghish, Maintainer: \email{haghish@@uio.no}
#' @noRd
#' @keywords Internal

meanmode <- function (data) {

  Mode <- function(x) {
    xtab <- table(x)
    xmode <- names(which(xtab == max(xtab)))
    return(xmode[1])
  }

  values <- sapply(data, function(x) {
    if (class(x) %in% c("character", "factor"))
      Mode(x)
    else if (class(x) %in% c("numeric", "integer"))
      median(x, na.rm = TRUE)

  }, simplify = FALSE)

  #impute the values ??? improve it in the future release and make it more secure
  for (i in 1:length(values)) {
    data[is.na(data[,i]), i] <- values[[i]]
  }

  return(data)
}
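A quick sketch of meanmode() (editor's addition; the toy data are invented; note that numeric columns are filled with the median, despite the function's name):

df <- data.frame(num = c(1, 2, NA, 10),
                 cat = factor(c("a", "a", "b", NA)))
meanmode(df)
# num: NA -> median(1, 2, 10) = 2;  cat: NA -> the mode, "a"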
--------------------------------------------------------------------------------
/man/mlim.summarize.Rd:
--------------------------------------------------------------------------------
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/mlim.summarize.R
\name{mlim.summarize}
\alias{mlim.summarize}
\title{mlim imputation summary}
\usage{
mlim.summarize(data)
}
\arguments{
\item{data}{dataset imputed with mlim}
}
\value{
estimated imputation accuracy via cross-validation procedure
}
\description{
provides information about the estimated accuracy of the imputation as well
as the overall procedure of the imputation.
}
\examples{
\dontrun{
data(iris)

# add 10\% stratified missing values to one factor variable
irisNA <- iris
irisNA$Species <- mlim.na(irisNA$Species, p = 0.1, stratify = TRUE, seed = 2022)

# run the ELNET single imputation (fastest imputation via 'mlim')
MLIM <- mlim(irisNA)

# in single imputation, you can estimate the imputation accuracy via cross validation RMSE
mlim.summarize(MLIM)
}
}
\author{
E. F. Haghish
}

--------------------------------------------------------------------------------
/man/figures/wiki.svg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/wiki.svg

--------------------------------------------------------------------------------
/man/figures/manual.svg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/manual.svg

--------------------------------------------------------------------------------
/man/figures/handbook_book.svg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/handbook_book.svg

--------------------------------------------------------------------------------
/man/figures/handbook_lamp.svg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/handbook_lamp.svg

--------------------------------------------------------------------------------
/man/figures/handbook_stupid.svg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/handbook_stupid.svg

--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
# Generated by roxygen2: do not edit by hand

export(mlim)
export(mlim.error)
export(mlim.mids)
export(mlim.na)
export(mlim.preimpute)
export(mlim.summarize)
importFrom(curl,curl)
importFrom(h2o,as.h2o)
importFrom(h2o,h2o.automl)
importFrom(h2o,h2o.clusterIsUp)
importFrom(h2o,h2o.clusterStatus)
importFrom(h2o,h2o.getId)
importFrom(h2o,h2o.get_automl)
importFrom(h2o,h2o.init)
importFrom(h2o,h2o.load_frame)
importFrom(h2o,h2o.ls)
importFrom(h2o,h2o.no_progress)
importFrom(h2o,h2o.predict)
importFrom(h2o,h2o.removeAll)
importFrom(h2o,h2o.rm)
importFrom(h2o,h2o.save_frame)
importFrom(h2o,h2o.shutdown)
importFrom(md.log,md.log)
importFrom(memuse,Sys.meminfo)
importFrom(mice,as.mids)
importFrom(missRanger,imputeUnivariate)
importFrom(missRanger,missRanger)
importFrom(stats,median)
importFrom(stats,na.omit)
importFrom(stats,rnorm)
importFrom(stats,setNames)
importFrom(stats,var)
importFrom(tools,file_ext)
importFrom(utils,capture.output)
importFrom(utils,packageVersion)
importFrom(utils,setTxtProgressBar)
importFrom(utils,txtProgressBar)

--------------------------------------------------------------------------------
/man/mlim.mids.Rd:
--------------------------------------------------------------------------------
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/mlim.mids.R
\name{mlim.mids}
\alias{mlim.mids}
\title{prepare "mids" class object}
\usage{
mlim.mids(mlim, incomplete)
}
\arguments{
\item{mlim}{array of class "mlim", returned by the "mlim" function}

\item{incomplete}{the original data.frame with NAs}
}
\value{
object of class 'mids', as required by the 'mice' package for analyzing
multiple imputation data
}
\description{
takes a "mlim" object and prepares a "mids" class object for data analysis with
multiple imputation.
}
\examples{

\dontrun{
data(iris)
require(mice)
irisNA <- mlim.na(iris, p = 0.1, seed = 2022)

# adding unstratified NAs to all variables of a data.frame
MLIM <- mlim(irisNA, m=5, tuning_time = 180, doublecheck = T, seed = 2022)

# create the mids object for MICE package
mids <- mlim.mids(MLIM, irisNA)

# run an analysis on the mids data (just as an example)
fit <- with(data=mids, exp=glm(Species~ Sepal.Length, family = "binomial"))

# then, pool the results!
summary(pool(fit))
}
}
\author{
E. F. Haghish, based on code from the 'prelim' function in the missMDA R package
}
--------------------------------------------------------------------------------
/R/matching.R:
--------------------------------------------------------------------------------
#' @title match imputed missing observations to non-missing values
#' @description each imputed numeric missing value is replaced with the nearest
#'              non-missing value. this option is particularly
#'              recommended when ordinal variables are imputed as numeric
#'              variables.
#' @param imputed numeric vector of imputed missing values
#' @param nonMiss numeric vector of non-missing values
#' @return numeric vector of the imputed values
#' @author E. F. Haghish
#' @keywords Internal
#' @noRd

matching <- function(imputed, nonMiss, md.log) {

  if (!is.null(imputed) & !is.null(nonMiss)) {
    if (is.numeric(imputed) & is.numeric(nonMiss)) {
      # get the unique values
      unqImputed <- unique(imputed)
      unqNonMiss <- unique(nonMiss[!is.na(nonMiss)])

      # avoid imputed values that are already in the nonMiss set
      unqImputed <- setdiff(unqImputed, unqNonMiss)

      # index the unique values and replace them in the vectors
      for (i in unqImputed) {
        nearest <- which.min(abs(unqNonMiss - i))[1]
        index <- which(imputed == i)
        imputed[index] <- unqNonMiss[nearest]
      }

      md.log("matching successful!")

      return(imputed)
    }
  }
}

#nonMiss <- c(1:20, 19:1)
#imputed <- c(11.5, 12.2, 11.51, 14.1, -1, 49, 20, 1, 4)
#matching(imputed, nonMiss, F)

--------------------------------------------------------------------------------
/man/mlim.preimpute.Rd:
--------------------------------------------------------------------------------
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/mlim.preimpute.R
\name{mlim.preimpute}
\alias{mlim.preimpute}
\title{carries out preimputation}
\usage{
mlim.preimpute(data, preimpute = "RF", seed = NULL)
}
\arguments{
\item{data}{data.frame with missing values}

\item{preimpute}{character. specify the algorithm for preimputation. the
supported options are "RF" (Random Forest), "mm"
(mean-mode replacement), and "random" (random sampling from available data).
the default is "RF", which carries out
a parallel random forest imputation, using all the CPUs available.
the other alternative is "mm" which performs mean/mode
imputation.}

\item{seed}{integer. specify the random generator seed}
}
\value{
imputed data.frame
}
\description{
instead of replacing missing data with mean and mode, a smarter
starting point would be to use fast imputation algorithms and then
optimize the imputed dataset with mlim. this procedure usually
requires fewer iterations and will save a lot of computation
resources.
}
\examples{
\dontrun{
data(iris)

# add 10\% stratified missing values to one factor variable
irisNA <- iris
irisNA$Species <- mlim.na(irisNA$Species, p = 0.1, stratify = TRUE, seed = 2022)

# run the default random forest preimputation
MLIM <- mlim.preimpute(irisNA)
mlim.error(MLIM, irisNA, iris)
}
}
\author{
E. F. Haghish
}

--------------------------------------------------------------------------------
/R/h2o.check.R:
--------------------------------------------------------------------------------
#' @title server.check
#' @description safely examines the connection status with h2o server
#' @importFrom h2o h2o.clusterIsUp h2o.clusterStatus
#' @author E. F. Haghish
#' @return logical. if TRUE, proceed with the analysis
#' @keywords Internal
#' @noRd

server.check <- function() {

  up <- FALSE
  healthy <- FALSE

  # check that the cluster is up
  # ============================================================
  for (i in 1:5) {
    if (!up) tryCatch(up <- h2o.clusterIsUp(),
                      error = function(cond){
                        message("trying to connect to JAVA server...\n");
                        return(NULL)})
    if (!up) Sys.sleep(3)
  }
  if (!up) stop("h2o server is down... perhaps heavy RAM consumption crashed the JAVA server?\nNOTE: save your imputation with the 'save' argument to continue \n      from where the imputation crashed")

  # make sure the cluster is healthy
  # ============================================================
  for (i in 1:5) {
    if (!healthy) tryCatch(healthy <- h2o.clusterStatus()$healthy,
                           error = function(cond){
                             message("trying to connect to JAVA server...\n");
                             return(NULL)})
    if (!healthy) Sys.sleep(3)
  }
  if (!healthy) stop("h2o server is down... perhaps heavy RAM consumption crashed the JAVA server?\nNOTE: save your imputation with the 'save' argument to continue \n      from where the imputation crashed")


  return(healthy)
}
--------------------------------------------------------------------------------
/R/miss_per_class_error.R:
--------------------------------------------------------------------------------
#' @title Mean Per Class Error (MPCE)
#' @description calculates the misclassification rate for each level of an unordered
#'              factor and either returns a vector of the errors or returns the mean,
#'              if 'mean = TRUE'
#' @param imputed the imputed dataframe
#' @param incomplete the dataframe with missing values
#' @param complete the original dataframe with no missing values
#' @param mean logical. if TRUE, the mean per class error is returned (default is FALSE)
#' @author E. F. Haghish
#' @keywords Internal
#' @noRd
mean_per_class_error <- function(imputed, incomplete, complete, mean = FALSE){
  mpce <- NULL
  for (k in 1:ncol(complete)) {
    lvl <- levels(complete[, k])
    for (i in lvl) {
      index <- drop(complete[, k]) == i
      temp <- missclass(imputed[index, k, drop = FALSE],
                        incomplete[index, k, drop = FALSE],
                        complete[index, k, drop = FALSE],
                        rename = FALSE)
      names(temp) <- i
      if (!is.null(temp)) mpce <- c(mpce, temp)
    }
  }

  if (mean) mpce <- mean(mpce)
  return(mpce)
}

# print(mean_per_class_error(mm[,c("education"), drop = FALSE],
#                            dfNA[,c("education"), drop = FALSE],
#                            df[,c("education"), drop = FALSE]))

# index <- df$education == "HS" & is.na(dfNA$education)
# table(index)
# mm$education[index]
#index <- is.na(dfNA$education[index])
#table(index)
#View(cbind(df$education[index], mm$education[index]))
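A compact sketch of mean_per_class_error() (editor's addition; assumes missclass() above is sourced; the data are invented):

complete   <- data.frame(g = factor(c("a", "a", "b", "b")))
incomplete <- complete; incomplete$g[c(1, 3)] <- NA
imputed    <- complete; imputed$g[1] <- "b"   # one of the two imputations is wrong
mean_per_class_error(imputed, incomplete, complete)                # a = 1, b = 0
mean_per_class_error(imputed, incomplete, complete, mean = TRUE)   # 0.5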
--------------------------------------------------------------------------------
/R/server.check.R:
--------------------------------------------------------------------------------
#' @title server.check
#' @description safely examines the connection status with h2o server
#' @importFrom h2o h2o.clusterIsUp h2o.clusterStatus
#' @author E. F. Haghish
#' @return logical. if TRUE, proceed with the analysis
#' @keywords Internal
#' @noRd

server.check <- function() {

  up <- FALSE
  healthy <- FALSE

  # check that the cluster is up
  # ============================================================
  for (i in 1:3) {
    if (!up) up <- tryCatch(h2o.clusterIsUp(),
                            error = function(cond){
                              message("trying to connect to JAVA server...\n");
                              return(NULL)})
    if (!up) Sys.sleep(1)
  }
  if (!up) stop("h2o server is down... perhaps heavy RAM consumption crashed the JAVA server?\nNOTE: save your imputation with the 'save' argument to continue \n      from where the imputation crashed")

  # make sure the cluster is healthy
  # ============================================================
  for (i in 1:3) {
    if (!healthy) healthy <- tryCatch(h2o.clusterStatus()$healthy,
                                      error = function(cond){
                                        message("trying to connect to JAVA server...\n");
                                        return(NULL)})
    if (!healthy) Sys.sleep(1)
  }
  if (!healthy) stop("h2o server is down... perhaps heavy RAM consumption crashed the JAVA server?\nNOTE: save your imputation with the 'save' argument to continue \n      from where the imputation crashed")


  return(healthy)
}

--------------------------------------------------------------------------------
/R/mlim.mids.R:
--------------------------------------------------------------------------------
#' @title prepare "mids" class object
#' @description takes a "mlim" object and prepares a "mids" class object for data
#'              analysis with multiple imputation.
#' @importFrom mice as.mids
#' @param mlim array of class "mlim", returned by the "mlim" function
#' @param incomplete the original data.frame with NAs
#' @author E. F. Haghish, based on code from the 'prelim' function in the missMDA R package
#' @examples
#'
#' \dontrun{
#' data(iris)
#' require(mice)
#' irisNA <- mlim.na(iris, p = 0.1, seed = 2022)
#'
#' # adding unstratified NAs to all variables of a data.frame
#' MLIM <- mlim(irisNA, m=5, tuning_time = 180, doublecheck = T, seed = 2022)
#'
#' # create the mids object for MICE package
#' mids <- mlim.mids(MLIM, irisNA)
#'
#' # run an analysis on the mids data (just as an example)
#' fit <- with(data=mids, exp=glm(Species~ Sepal.Length, family = "binomial"))
#'
#' # then, pool the results!
#' summary(pool(fit))
#' }
#' @return object of class 'mids', as required by the 'mice' package for analyzing
#'         multiple imputation data
#' @export

mlim.mids <- function (mlim, incomplete) {
  if (any(c("MIMCA", "MIFAMD", "MIPCA", "mlim.mi") %in% class(mlim))) {
    longformat <- rbind(incomplete, do.call(rbind, mlim))
    longformat <- cbind(.imp = rep(0:length(mlim), each = nrow(incomplete)),
                        .id = rep(1:nrow(incomplete), (length(mlim) + 1)), longformat)
    rownames(longformat) <- NULL
    mids <- as.mids(longformat)
  }
  else {
    stop("Objects of class mlim.mi, MIPCA, MIFAMD, or MIMCA are required.")
  }

  return(mids)
}


#mid <- mlim.mids(ELNET, irisNA)

--------------------------------------------------------------------------------
/man/figures/Untitled.drawio:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/Untitled.drawio
--------------------------------------------------------------------------------
/R/selectVariables.R:
--------------------------------------------------------------------------------
#' @title select imputation variables
#' @description selects variables relevant to the imputation process
#' @importFrom md.log md.log
#' @param data data.frame
#' @return list
#' @keywords Internal
#' @noRd

selectVariables <- function(data, ignore=NULL, verbose=FALSE, report=NULL) {

  suppressPackageStartupMessages({requireNamespace("md.log")})

  # select the variables with missing, excluding fully missing vars
  vars2impute <- vapply(data[, , drop = FALSE], FUN.VALUE = TRUE,
                        function(z) anyNA(z) && !all(is.na(z)))

  # make sure that these variables were not meant to be ignored
  if (!is.null(ignore)) vars2impute[colnames(data)[vars2impute] %in% ignore] <- FALSE



  #data[, vars2impute] <- converted$X

  #if (verbose) {
  #  message("\n  Variables to impute:\t\t")
  #  message(colnames(data)[vars2impute], sep = ", ")
  #  message("\n")
  #
  #}

  if (!is.null(report)) {
    md.log(paste("Variables to impute:", paste(colnames(data)[vars2impute],
                                               collapse = ", ")))
  }

  # Get missing indicators and order variables by number of missings
  dataNA <- is.na(data[, vars2impute, drop = FALSE])
  vars2impute <- names(sort(colSums(dataNA)))
  # ============================================================

  # specify the list of all predictors, which were not ignored by the user
  allPredictors <- colnames(data)[!colnames(data) %in% ignore]
  X <- setdiff(allPredictors, vars2impute)

  return(list(
    #data <- data,
    dataNA = dataNA,
    #converted = converted,
    allPredictors = allPredictors,
    vars2impute = vars2impute,
    X = X))
}
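A hedged sketch of selectVariables() (editor's addition; the data frame is invented; the md.log package is only used when a report is requested):

df <- data.frame(a = c(1, NA, 3),   # one NA
                 b = c(NA, NA, 6),  # two NAs
                 c = 1:3)           # complete
sel <- selectVariables(df)
sel$vars2impute   # "a" "b" -- ordered by number of missing values
sel$X             # "c"     -- complete columns serving as initial predictors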
--------------------------------------------------------------------------------
/man/figures/flowchart_optimization.drawio:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/flowchart_optimization.drawio

--------------------------------------------------------------------------------
/man/figures/flowchart_base.drawio:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/flowchart_base.drawio

--------------------------------------------------------------------------------
/R/extractMetrics.R:
--------------------------------------------------------------------------------
# ----------------------------------------------------------
# extractMetrics
# ==========================================================
#' @title extractMetrics
#' @description extracts performance metrics from cross-validation
#' @return data.frame of error metrics.
#' @author E. F. Haghish
#' @keywords Internal
#' @noRd

extractMetrics <- function(hex, k, v, perf, family) {

  # calculate the variance of v (NRMSE is only defined for numeric variables)
  if (is.numeric(as.vector(hex[, v]))) {
    var <- var(as.vector(hex[, v]), na.rm = TRUE)
    NRMSE <- as.numeric(perf@metrics$RMSE)/var
  } else {
    NRMSE <- NA
  }

  RMSE = as.numeric(perf@metrics$RMSE)
  MSE = as.numeric(perf@metrics$MSE)
  MAE = as.numeric(perf@metrics$mae)
  if (length(MAE) < 1) MAE <- NA
  RMSLE = as.numeric(perf@metrics$rmsle)
  if (length(RMSLE) < 1) RMSLE <- NA
  Mean_Residual_Deviance = as.numeric(perf@metrics$mean_residual_deviance)
  if (length(Mean_Residual_Deviance) < 1) Mean_Residual_Deviance <- NA
  R2 = as.numeric(perf@metrics$r2)
  #AIC = numeric()
  logloss = as.numeric(perf@metrics$logloss)
  if (length(logloss) < 1) logloss <- NA
  mean_per_class_error = as.numeric(perf@metrics$mean_per_class_error)
  if (length(mean_per_class_error) < 1) mean_per_class_error <- NA
  AUC = as.numeric(perf@metrics$AUC)
  if (length(AUC) < 1) AUC <- NA
  pr_auc = as.numeric(perf@metrics$pr_auc)
  if (length(pr_auc) < 1) pr_auc <- NA

  metrics <- data.frame(iteration=k,
                        variable = v,
                        NRMSE = NRMSE,
                        RMSE = RMSE,
                        MSE = MSE,
                        MAE = MAE,
                        RMSLE = RMSLE,
                        Mean_Residual_Deviance = Mean_Residual_Deviance,
                        R2 = R2,
                        #AIC = numeric()
                        logloss = logloss,
                        mean_per_class_error = mean_per_class_error,
                        AUC = AUC,
                        pr_auc = pr_auc)

  return(metrics)
}
--------------------------------------------------------------------------------
/man/figures/flowchart.drawio:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/flowchart.drawio
--------------------------------------------------------------------------------
/man/figures/procedure2.drawio:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/procedure2.drawio

--------------------------------------------------------------------------------
/man/figures/procedure.drawio:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/procedure.drawio
--------------------------------------------------------------------------------
/man/figures/procedure3.drawio:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/procedure3.drawio
--------------------------------------------------------------------------------
/man/figures/procedure4.drawio:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/haghish/mlim/HEAD/man/figures/procedure4.drawio
7Vpbd5s4EP41frQPN3F5tB073TZtc5qc7p594yKMNoAIiNjJr9+REBgMTt3Ebp3dkHMcGAlJzPfNaGZgpM+TzWXuZtFnGuB4pCnBZqRfjDTNsgz45YLHSoA0pRKschJUInUruCFPWArrbiUJcNHpyCiNGcm6Qp+mKfZZR+bmOV13u4U07s6auSvcE9z4btyX/kkCFkmpqijbhg+YrCI5tY1kg+f6d6uclqmcL6UprloStx5Gdi0iN6DrlkhfjPR5TimrzpLNHMdcq7XGqvuWe1qbJec4ZYfcEKjXfyiGejND3t04KC0riLyxXo3y4MalVMVIM2MYbxbBBOaKn9USL9+VwGTtbuIp2WOtVIY3YiCWxCBQ4bTIXJ+kK7hC26tbmoFgDKvXZ+uIMHwDcj7EGqgGMvqA8zAWmotIEOCU38pyetdApYMkpCmTvFL5tRuTFfS88EE7OAeBAAkHciUhieM5jWkulqqHiP/JYVry6mgmbLWY4mgeuw2DRAaWDRpoiSQsl5gmmOWP0EW2Gqi6QxpPw5j1lop6bSlRi4WoZqEr6b9qht7yAE4kFYZpYfm6a6qWrVo+8kMdje0eJb65aUATkC1pjgvWA5o/KQFjunI9HF/TgjBCue49yhjcp8/qDlMJCuOQd4gRuRkfLNmsuH+ZkDSk4oz4E243OJiXHnSYkYJO05WYWEV9HC1D1Waoh5ewygrcpZuQmCv6A44fMF9VlzyChj3ydPmmHQd2VXc6uBs26uGuagO4W6eAXT3EFVznmCRZyVyB8LAXqITZroDruEMb876kdcO4ENqfQgfVzjbVbbK9Hog33pfEv4P/PUYqbryiOWERF5ICfsoCjJ1vI3zbgD2CK56WvCuLMO+UAk1doZLWE825+oF1UT1MUopzDzPOBL5+F7gMPgzApyStxnNT3hPzf8sE9ka4ynEWgxtLOD22eqqU0HOi2e/3odqOMUmTASMMCN76xJYl1d7WGDKYI5iH2fWKyNF61tFYQts61DoYeY15/IPnVx8fL/P7da56Xy27/PvaGas9C1m6A+6wdmY0DK8hbphXMQtX3yA2LWx3vNnF1EFLe0fdAvXqXJkgxdEtpMhDtA372t7+ZWueLvavJixROpuksh/8WnxBcniuytWvuRUexyuqXa+o1V6y7RXNAdyNI3jFwRhJG3CMeyAPY7yZ8ngUdIHTQJ5e+LFbFMTvgt3V9U7gUQH6TKCyPyCBeWujVtHEUg2IVutfVLVLLpkTo9P87L6Gg04I3cevhQ8aClakLMcx+NqHbuA9hJmc4Vp62dorGFqHHrppTZz2sWP8BS1zH8tB2tHxzriW+nPjwjawwqw3riBYo5SXuxrzjDhnGpZpW0Oc01VDR8Z/nnN6vc3UgZrlTDTnZTwzVPTDsY7HrUF/pp6TQ/vfk8tCxyOX5QyQy245MfOXEq2fSb6h4sLQ1jpQdnimjtAuR+yrK/ygTHGEeMrQduKp31tdcHqcWFx9Wdz28D2fmgLAb9vLN1JTQDvRs4GMbhjTw36wwtAUPl+Dvfd0Zxuf/Fn6qUjv2Qf8MUmtsXWAT/j2+4oMC9jYAEDo8gUPlhVCkuIxK1NRXAgpLwnQjJGEPFVFAVlc4OsXXcSD8htp2LT2qxfb553A1W0kpspy6uOgzHE9dwGOShQhZMUha1djJu9lhpf5R8PoWMxP1BnQ621kMPjvx2c3XKknqzPY9tRZ7BbRB6rhTQz32vrD86iftr5gOOdWXzDew6RXhkk+3OaCY85l7nEEmtjKWYVN9UwtklzOPo/4ojo7n98ob7uzibciHupvdnw5+zYN5a/LGaViczrZFBdXpxx9kRY48WJ8xqGlqk2nb+Z1laPo3dBSeVloeRL7GPKhZwu7Yywg0vwVsPOJb6QK3JLR0zBB16wDsdeOEDL1sB9KKc4W+ybWeosmrzkHvqG2T5U+HlJSAnzZyRNIZ08C+ZV//6OIvDChInWLsPvw2E4leSrniwTO4+3tF9VhmUOK2EoqcZM0yjLJgakiYPwoM8b3xPCFEaBmvzAxNE+VGPY/0fi+xfl0+WGzX/3i/PAYINo7HuyE2R5cbr/mq2ro248l9cW/ -------------------------------------------------------------------------------- /man/figures/procedure5.drawio: -------------------------------------------------------------------------------- 1 | 
--------------------------------------------------------------------------------
/R/iterationNextVar.R:
--------------------------------------------------------------------------------
1 | #' @title get the next variable in the iteration to impute
2 | #' @description this function evaluates the saved mlim object and
3 | #' decides which loop and iteration to carry out next.
4 | #' @param m integer. number of imputation datasets
5 | #' @param m.it integer. index of the dataset that is currently being imputed
6 | #' @param k integer. current global loop iteration. each dataset goes up to
7 | #' 'maxiter' number of global iterations
8 | #' @param z integer. current index of the imputation variable in 'ITERATIONVARS'.
9 | #' this specifies the local iteration number.
10 | #' @param Y character. name of the variable that is to be imputed (or was last
11 | #' imputed, when loading a saved mlim object)
12 | #' @param ITERATIONVARS character vector, specifying variable names to be imputed
13 | #' @param maxiter maximum number of global iterations for each dataset
14 | #' @author E. F. Haghish
15 | #' @keywords Internal
16 | #' @noRd
17 | iterationNextVar <- function(m, m.it, k, z, Y, ITERATIONVARS, maxiter) {
18 |
19 | # if 'z' is smaller than the length of 'ITERATIONVARS', move on to the next variable
20 | # if 'z' is the last in the iteration, first check the iteration limits
21 | # if the iteration limit is reached, check the MI datasets limit
22 | # if there is room for multiple imputation, move on to the next dataset
23 | # and reset the iterations
24 | # else stop the imputation
25 | # else move on to the next global loop iteration
26 | # if there is a continuation, specify the next imputation variable
27 |
28 | if (z < length(ITERATIONVARS)) {
29 | z <- z + 1
30 | }
31 | else if (z == length(ITERATIONVARS)) {
32 | if (k == maxiter) {
33 | if (m.it < m) {
34 | m.it <- m.it + 1
35 | k <- 1
36 | z <- 1
37 | }
38 | else stop("all datasets were already generated!")
39 | }
40 | else {
41 | k <- k + 1
42 | z <- 1
43 | }
44 | }
45 |
46 | # update the next variable to be imputed
47 | Y <- ITERATIONVARS[z]
48 |
49 | return(list(m=m,
50 | m.it=m.it,
51 | k=k,
52 | z=z,
53 | Y=Y))
54 | }
55 |
56 | # print(paste(object$m, object$m.it, object$k, object$z, object$Y, object$ITERATIONVARS, object$maxiter))
57 | # iterationNextVar(object$m, object$m.it, object$k, object$z, object$Y, object$ITERATIONVARS, object$maxiter)
58 |
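59 | # a worked example of the state transitions (hypothetical values): with
60 | # ITERATIONVARS = c("x1", "x2"), m = 2, and maxiter = 3, the call
61 | # iterationNextVar(m = 2, m.it = 1, k = 3, z = 2, Y = "x2",
62 | #                  ITERATIONVARS = c("x1", "x2"), maxiter = 3)
63 | # returns list(m = 2, m.it = 2, k = 1, z = 1, Y = "x1"): the last variable of
64 | # the last global iteration was reached, so the next dataset begins and the counters reset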
--------------------------------------------------------------------------------
/R/init.R:
--------------------------------------------------------------------------------
1 | #' @title init
2 | #' @description initializes the local H2O Java server and returns the connection object, retrying upon failure
3 | #' @author E. F. Haghish
4 | #' @return connection object
5 | #' @keywords Internal
6 | #' @noRd
7 |
8 | init <- function(nthreads, min_mem_size, max_mem_size, ignore_config = TRUE,
9 | java = NULL, report, debug) {
10 |
11 | if (!is.null(java)) {
12 | Sys.setenv(JAVA_HOME = java)
13 | }
14 | # Run H2O on the statistics server
15 | # ============================================================
16 | #try(h2o.shutdown(FALSE), silent = TRUE)
17 | keepTrying <- TRUE
18 | connection <- NULL
19 | test <- 1
20 | while (keepTrying) {
21 | # h2o.init(jvm_custom_args = c("-help"))
22 | # h2o.init(jvm_custom_args = c("-session_timeout=100"))
23 | # bind_to_localhost = FALSE
24 | # h2o.init(jvm_custom_args=c("-Dsys.ai.h2o.heartbeat.benchmark.enabled=true"))
25 | tryCatch(connection <- h2o::h2o.init(nthreads = nthreads,
26 | #name = "mlim_connection",
27 | min_mem_size = min_mem_size,
28 | max_mem_size = max_mem_size,
29 | ignore_config = ignore_config,
30 | insecure = TRUE,
31 | https = FALSE,
32 | log_level = if (debug) "DEBUG" else "FATA",
33 | bind_to_localhost = FALSE),
34 | error = function(cond) {
35 | #message("connection to JAVA server failed...\n");
36 | return()})
37 | if (!is.null(connection)) {
38 | keepTrying <- FALSE
39 | }
40 | else {
41 | test <- test + 1
42 | message("The Java server could not be initiated. It will retry in 3 seconds...\n")
43 | Sys.sleep(3)
44 | }
45 |
46 | if (test > 10) stop("The attempt to start the H2O server was unsuccessful \ndue to an issue within your system...\n")
47 |
48 | }
49 |
50 | #if (!is.null(report)) {
51 | # if (!is.null(connection)) {
52 | # if (attributes(connection)$ip == "localhost") {
53 | # md.log("h2o.cluster was initialized", section="subsection")
54 | # md.log(paste("IP:",attributes(connection)$ip,
55 | # " port:",attributes(connection)$port))
56 | # }
57 | # }
58 | #}
59 |
60 | return(connection)
61 | }
62 |
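63 | # a minimal usage sketch (internal function; the argument values below are
64 | # illustrative): start a local H2O server with 2 threads and up to 4 GB of
65 | # RAM, retrying up to 10 times before giving up
66 | # con <- init(nthreads = 2, min_mem_size = "3G", max_mem_size = "4G",
67 | #             report = NULL, debug = FALSE)
68 | # h2o::h2o.shutdown(prompt = FALSE)  # shut the server down when finished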
--------------------------------------------------------------------------------
/man/mlim.na.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/mlim.na.R
3 | \name{mlim.na}
4 | \alias{mlim.na}
5 | \title{add stratified/unstratified artificial missing observations}
6 | \usage{
7 | mlim.na(x, p = 0.1, stratify = FALSE, classes = NULL, seed = NULL)
8 | }
9 | \arguments{
10 | \item{x}{data.frame. x must be strictly a data.frame and any other
11 | data.table classes will be rejected}
12 |
13 | \item{p}{percentage of missingness to be added to the data}
14 |
15 | \item{stratify}{logical. if TRUE, stratified sampling will be
16 | carried out when adding NA values to 'factor' variables
17 | (either ordered or unordered). this feature makes the evaluation
18 | of missing data imputation algorithms fairer, especially
19 | when the factor levels are imbalanced.}
20 |
21 | \item{classes}{character vector, specifying the variable classes that should
22 | be selected for adding NA values. the default value is NULL,
23 | meaning all variables will receive NA values with probability of 'p'.
24 | however, if you wish to add NA values only to specific classes, e.g.
25 | 'numeric' variables or 'ordered' factors, specify them in this argument.
26 | e.g. write "classes = c('numeric', 'ordered')" if you wish to add NAs
27 | only to numeric and ordered factors.}
28 |
29 | \item{seed}{integer. a random seed number for reproducing the result (recommended)}
30 | }
31 | \value{
32 | data.frame
33 | }
34 | \description{
35 | to examine the performance of imputation algorithms, artificial
36 | missing data are added to datasets and then imputed, to compare
37 | the original observations with the imputed values. this function
38 | can add stratified or unstratified artificial missing data. stratified
39 | missing data can be particularly useful if your categorical or ordinal
40 | variables are imbalanced, i.e., one category appears at a much higher
41 | rate than others.
42 | }
43 | \examples{
44 |
45 | \dontrun{
46 | # adding stratified NA to an atomic vector
47 | x <- as.factor(c(rep("M", 100), rep("F", 900)))
48 | table(mlim.na(x, p=.5, stratify = TRUE))
49 |
50 | # adding unstratified NAs to all variables of a data.frame
51 | data(iris)
52 | mlim.na(iris, p=0.5, stratify = FALSE, seed = 1)
53 |
54 | # or add stratified NAs only to factor variables, ignoring other variables
55 | mlim.na(iris, p=0.5, stratify = TRUE, classes = "factor", seed = 1)
56 |
57 | # or add NAs to numeric variables
58 | mlim.na(iris, p=0.5, classes = "numeric", seed = 1)
59 | }
60 | }
61 | \author{
62 | E. F. Haghish
63 | }
64 |
--------------------------------------------------------------------------------
/R/mlim.shuffle.R:
--------------------------------------------------------------------------------
1 |
2 | #' @title generate multiple imputation datasets from a single imputation
3 | #' @description mlim estimates Root Mean Squared Error (RMSE), which can provide
4 | #' a hint about the standard deviation of the error. if normal
5 | #' distribution of the error is assumed, stochastic imputed values
6 | #' can be generated, where each predicted value is considered the
7 | #' 'mean' of the normal distribution and the standard deviation is
8 | #' assumed to be equal to 'RMSE'.
9 | #' @param df original dataframe with NAs
10 | #' @param imputed single imputed dataset of the class 'mlim'. the dataset must have
11 | #' been imputed with the mlim R package to include the imputation metrics
12 | #' needed for the computation
13 | #' @param m integer, specifying number of multiple imputations. the default value is
14 | #' 1, carrying out a single imputation.
15 | #' @param report filename. if a filename is specified (e.g. report = "mlim.md"), the \code{"md.log"} R
16 | #' package is used to generate a Markdown progress report for the
17 | #' imputation. the format of the report is adapted based on the
18 | #' \code{'verbosity'} argument. the higher the verbosity, the more
19 | #' technical the report becomes. if verbosity equals "debug", then
20 | #' a log file is generated, which includes a time stamp and shows
21 | #' the function that has generated the message. otherwise, a
22 | #' reduced markdown-like report is generated. default is NULL.
23 | #' @importFrom md.log md.log
24 | #' @importFrom stats rnorm
25 | #' @return list, including multiple imputed datasets
26 | #' @author E. F. Haghish
27 | #' @keywords Internal
28 | #' @noRd
29 |
30 |
31 | mlim.shuffle <- function(df, imputed, m, report = NULL) {
32 | MI <- NULL
33 | if (!is.null(report)) md.log("Stochastic Shuffling", section="paragraph", trace=FALSE)
34 |
35 | # get the imputation metrics
36 | metrics <- attributes(imputed)$metrics
37 | vars2impute <- unique(metrics$variable)
38 | dfNA <- is.na(df)
39 |
40 | for (i in 1:m) {
41 | MI[[i]] <- imputed
42 |
43 | for (Y in vars2impute) {
44 | v.na <- dfNA[, Y]
45 | RMSE <- min(metrics[metrics$variable == Y, "RMSE"], na.rm = TRUE)
46 | VEK <- imputed[which(v.na), Y]
47 |
48 | MI[[i]][which(v.na), Y] <- rnorm(
49 | n = length(VEK),
50 | mean = VEK,
51 | sd = RMSE)
52 | }
53 | }
54 |
55 | if (m > 1) class(MI) <- "mlim.mi"
56 | else class(MI) <- c("mlim", "data.frame")
57 |
58 | return(MI)
59 | }
60 |
61 |
62 |
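63 | # the principle behind the stochastic shuffling, in isolation (illustrative
64 | # numbers): if a missing cell was imputed as 5.1 and the cross-validated RMSE
65 | # of the model was 0.3, each of the m datasets draws its value from N(5.1, 0.3)
66 | # set.seed(2022)
67 | # rnorm(n = 3, mean = 5.1, sd = 0.3)  # three stochastic versions of one cell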
--------------------------------------------------------------------------------
/R/algoSelector.R:
--------------------------------------------------------------------------------
1 | #' @title selects algorithms for imputation and post imputation
2 | #' @description automatically sorts algorithms for imputation and post imputation
3 | #' @author E. F. Haghish
4 | #' @return list, with 'impute' and 'postimpute' algorithm vectors
5 | #' @keywords Internal
6 | #' @noRd
7 |
8 | algoSelector <- function(algos, postimpute) {
9 | #preimpute <- NULL
10 | impute <- NULL
11 | postimputealgos <- NULL
12 |
13 | supportedAlgos <- c("ELNET","RF","DL","GBM","XGB", "Ensemble")
14 | actualNames <- c("GLM","DRF","DeepLearning","GBM","XGBoost", "StackedEnsemble")
15 | #algos <- toupper(algos)
16 |
17 | # convert the names to the actuals
18 | for (i in supportedAlgos) {
19 | if (i %in% algos) algos[which(algos == i)] <- actualNames[which(supportedAlgos == i)]
20 | }
21 |
22 | if (length(setdiff(x=algos, y=c("ELNET","GLM",
23 | "RF","DRF",
24 | "GBM",
25 | "XGB","XGBoost",
26 | "DL","DeepLearning",
27 | "Ensemble","StackedEnsemble"))) > 0) stop("some of the 'algos' are not recognized")
28 |
29 | if (postimpute) {
30 | # impute (by default both GLM and DRF are used for imputing)
31 | # ------------------------------------------------------------
32 | if ("ELNET" %in% algos) impute <- c("GLM")
33 | else if ("GLM" %in% algos) impute <- c("GLM")
34 |
35 | if ("RF" %in% algos) impute <- c(impute, "DRF")
36 | else if ("DRF" %in% algos) impute <- c(impute, "DRF")
37 |
38 | # postimpute (start with GBM, then XGB, then DL, then Ensemble)
39 | # NOTE: the names were converted above, so check both spellings
40 | if ("GBM" %in% algos) {
41 | if (is.null(impute)) impute <- "GBM"
42 | else postimputealgos <- c(postimputealgos, "GBM")
43 | }
44 | if ("XGB" %in% algos | "XGBoost" %in% algos) {
45 | if (Sys.info()["sysname"] == "Windows") {
46 | stop("XGB is not available on Windows")
47 | }
48 | if (is.null(impute)) impute <- "XGBoost"
49 | else postimputealgos <- c(postimputealgos, "XGBoost")
50 | }
51 | if ("DL" %in% algos | "DeepLearning" %in% algos) {
52 | if (is.null(impute)) impute <- "DeepLearning"
53 | else postimputealgos <- c(postimputealgos, "DeepLearning")
54 | }
55 |
56 | # if Ensemble is specified, include the 'impute' algorithms
57 | if ("Ensemble" %in% algos | "StackedEnsemble" %in% algos) {
58 | if (is.null(impute)) stop("Ensemble is a meta learner and requires other algorithms. ")
59 | else postimputealgos <- c(impute, postimputealgos, "StackedEnsemble")
60 | }
61 | }
62 |
63 | # use all algorithms for imputation
64 | else {
65 | impute <- algos
66 | }
67 |
68 |
69 | return(list(impute=impute, postimpute=postimputealgos))
70 | }
71 |
72 |
73 |
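74 | # a usage sketch (internal function): with postimputation enabled, the
75 | # user-facing names are mapped to the H2O names and split between the base
76 | # imputer and the postimputation algorithms:
77 | # algoSelector(c("ELNET", "RF", "GBM", "Ensemble"), postimpute = TRUE)
78 | # # $impute:     "GLM" "DRF"
79 | # # $postimpute: "GLM" "DRF" "GBM" "StackedEnsemble"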
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: mlim
2 | Type: Package
3 | Title: Single and Multiple Imputation with Automated Machine Learning
4 | Version: 0.4.0
5 | Authors@R:
6 | person("E. F. Haghish",
7 | role = c("aut", "cre", "cph"),
8 | email = "haghish@hotmail.com")
9 | Depends:
10 | R (>= 3.5.0)
11 | Description: Machine learning algorithms have been used for performing
12 | single missing data imputation and, most recently, multiple imputation.
13 | However, this is the first attempt to use automated machine learning algorithms
14 | for performing both
15 | single and multiple imputation. Automated machine learning is a procedure for
16 | fine-tuning models automatically, performing a random search for a model that
17 | results in lower error, without overfitting the data. The main idea is
18 | to allow the model to set its own parameters for imputing each variable separately
19 | instead of setting fixed predefined parameters to impute all variables
20 | of the dataset.
21 | Using automated machine learning, the package fine-tunes an Elastic
22 | Net (default) or Gradient Boosting, Random Forest, Deep Learning, Extreme Gradient Boosting,
23 | or Stacked Ensemble machine learning model (from one or a combination of other
24 | supported algorithms) for imputing the missing
25 | observations. This procedure has been implemented for the
26 | first time by this package and is expected to outperform other packages for
27 | imputing missing data that do not fine-tune their models. The multiple imputation
28 | is implemented via bootstrapping without letting the duplicated observations
29 | harm the cross-validation procedure, which is the way imputed variables are evaluated.
30 | Most notably, the package implements an automated procedure for imputing imbalanced
31 | data (the class rarity problem), which happens when a factor variable has a level that is far more
32 | prevalent than the other(s). This is known to result in biased predictions, hence, biased
33 | imputation of missing data. However, the autobalancing procedure ensures that instead of
34 | focusing on maximizing accuracy (minimizing the classification error) in imputing factor variables,
35 | a fairer imputation procedure is practiced.
36 | License: MIT + file LICENSE
37 | Encoding: UTF-8
38 | Imports:
39 | h2o (>= 3.34.0.0),
40 | curl (>= 4.3.2),
41 | mice,
42 | missRanger,
43 | memuse,
44 | md.log (>= 0.2.0)
45 | RoxygenNote: 7.3.1
46 | LazyData: true
47 | URL: https://github.com/haghish/mlim, https://www.sv.uio.no/psi/english/people/academic/haghish/
48 | BugReports: https://github.com/haghish/mlim/issues
--------------------------------------------------------------------------------
/R/bootstrapCV.R:
--------------------------------------------------------------------------------
1 | #' @title assigns bootstrap data to cross-validation groups
2 | #' @description identifies duplicated rows in the bootstrap and creates
3 | #' a column in the bootstrap data assigning the cross-validation
4 | #' groups to avoid having duplicates in both the training and testing sets
5 | #' @author E. F. Haghish
6 | #' @param index numeric. bootstrapped row numbers from the original dataset
7 | #' @param cv numeric. number of cross-validation (CV) folds
8 | #' @return numeric data.frame including 'cv' columns,
9 | #' where each column shows the row numbers of observations
10 | #' that belong to that CV group
11 | #' @keywords Internal
12 | #' @noRd
13 | bootstrapCV <- function(index, cv = 10) {
14 | index <- sort(index)
15 | weight <- table(index)
16 | rows <- as.numeric(names(weight))
17 | tab <- as.data.frame(cbind(rows, weight))
18 | tab <- tab[order(tab$weight, decreasing = TRUE), ]
19 |
20 | # calculate the length of each group and start the groups
21 | col <- length(index) %/% cv
22 | mat <- as.data.frame(matrix(data = rep(NA, length(index)), ncol = cv))
23 |
24 | # distribute the values that have duplications
25 | tab.dup <- tab[tab$weight > 1, ]
26 | tab.unq <- tab[tab$weight == 1, ]
27 | rows <- tab.unq$rows
28 |
29 | j <- 1
30 | reverse <- FALSE
31 | for (i in 1:nrow(tab.dup)) {
32 | # which cv group should the rows be assigned to
33 | if (j > cv) {
34 | reverse <- TRUE
35 | j <- cv
36 | }
37 | if (j < 1) {
38 | reverse <- FALSE
39 | j <- 1
40 | }
41 | val <- rep(tab.dup[i,1], tab.dup[i,2])
42 | place <- which(is.na(mat[,j]))
43 | replace <- place[1]:(place[1]+length(val)-1)
44 | mat[replace,j] <- val
45 |
46 | if (!reverse) j <- j + 1
47 | else j <- j - 1
48 | }
49 |
50 | # Filling the remaining places with random rows
51 | # =============================================
52 | for (j in 1:cv) {
53 | place <- which(is.na(mat[,j]))
54 | replace <- place[1]:(nrow(mat))
55 |
56 | if (j < cv) {
57 | if (length(replace) >= 1) {
58 | val <- sample(rows, size = length(replace), replace = FALSE)
59 | mat[replace, j] <- val
60 | rows <- rows[! rows %in% val]
61 | }
62 | }
63 | # filling the last column, if it has any empty rows
64 | else {
65 | if (length(replace) >= 1) {
66 | if (length(rows) >= 1) {
67 | replace <- place[1]:(place[1]+length(rows)-1)
68 | mat[replace,j] <- rows
69 | }
70 | }
71 | }
72 | }
73 | return(mat)
74 | }
75 |
76 | # sampling_index <- sample.int(150, 145, replace=TRUE)
77 | # sampling_index <- sort(sampling_index, decreasing = TRUE)
78 | # View(bootstrapCV(sampling_index, cv = 5))
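79 | # a quick sanity check (sketch): no bootstrapped row number should be shared
80 | # between two folds, so duplicates never sit in training and testing at once
81 | # folds <- bootstrapCV(sort(sample.int(150, 150, replace = TRUE)), cv = 5)
82 | # any(duplicated(na.omit(unlist(lapply(folds, unique)))))  # expected: FALSE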
--------------------------------------------------------------------------------
/R/mlim.summarize.R:
--------------------------------------------------------------------------------
1 |
2 | #' @title mlim imputation summary
3 | #' @description provides information about the estimated accuracy of the imputation as well
4 | #' as the overall procedure of the imputation.
5 | #' @param data dataset imputed with mlim
6 | # @param method character. the default is NULL, returning RMSE. the supported
7 | # arguments are "scaled" and "normalize".
8 | #' @return estimated imputation accuracy via the cross-validation procedure
9 | #' @examples
10 | #' \dontrun{
11 | #' data(iris)
12 | #'
13 | #' # add 10% stratified missing values to one factor variable
14 | #' irisNA <- iris
15 | #' irisNA$Species <- mlim.na(irisNA$Species, p = 0.1, stratify = TRUE, seed = 2022)
16 | #'
17 | #' # run the ELNET single imputation (fastest imputation via 'mlim')
18 | #' MLIM <- mlim(irisNA)
19 | #'
20 | #' # in single imputation, you can estimate the imputation accuracy via cross-validation RMSE
21 | #' mlim.summarize(MLIM)
22 | #' }
23 | #' @author E. F. Haghish
24 | #' @export
25 |
26 | mlim.summarize <- function(data #, method = NULL
27 | ) {
28 |
29 | results <- data.frame(variable = character(),
30 | nrmse = numeric(),
31 | mpce = numeric())
32 |
33 | if (inherits(data, "mlim")) {
34 | att <- attributes(data)
35 | metrics <- att$metrics
36 | VARS <- colnames(data)[colnames(data) %in% metrics$variable]
37 | for (i in VARS) {
38 | err <- NULL
39 | index <- metrics$variable == i
40 | if (inherits(data[,i], "factor")) {
41 | #err <- min(metrics[index, "mean_per_class_error"], na.rm = TRUE)
42 | minimum <- min(metrics[index, "RMSE"], na.rm = TRUE)
43 | # if (is.valid(method)) {
44 | # if (method == "scale") err <- minimum / (length(levels(data[,i])) - 1)
45 | # }
46 | err <- round(minimum, 6)
47 | results <- rbind(results, c(i, err))
48 | }
49 | else {
50 | minimum <- min(metrics[index, "RMSE"], na.rm = TRUE)
51 | # if (is.valid(method)) {
52 | # if (method == "scale") err <- minimum / diff(range(data[,i])) #sd(data[,i], na.rm = TRUE)
53 | # else if (method == "scale") err <- minimum / sd(data[,i], na.rm = TRUE)
54 | # }
55 |
56 | err <- round(minimum, 6)
57 | results <- rbind(results, c(i, err))
58 | }
59 | }
60 |
61 | colnames(results) <- c("variable", "rmse")
62 | return(results)
63 | }
64 | else if (inherits(data, "mlim.mi")) {
65 | results <- mlim.summarize(data[[1]])
66 | for (i in 2:length(data)) {
67 | results <- cbind(results, mlim.summarize(data[[i]])[,2])
68 | }
69 | colnames(results) <- c("variable", paste0("rmse_", 1:length(data)))
70 | return(results)
71 | }
72 |
73 | }
74 |
75 | #print(mlim.summarize(default))
76 |
77 |
78 |
79 |
--------------------------------------------------------------------------------
/R/mlim.postimpute.R:
--------------------------------------------------------------------------------
1 | # THIS FUNCTION IS STILL UNDER DEVELOPMENT AND WILL BE RELEASED IN A FUTURE VERSION
2 |
3 |
4 | #' @title carries out postimputation to minimize imputation error
5 | #' @description this is a function for carrying out postimputation on datasets
6 | #' that are already imputed, either by mlim or other software.
7 | #' @param data data.frame with missing values
8 | #' @param preimputed.data data.frame. if you have used other software for missing
9 | #' data imputation, you can still optimize the imputation
10 | #' by handing the data.frame to this argument, which will
11 | #' bypass the "preimpute" procedure.
12 | #' @param algos character vector of algorithms to be used for imputation. the default
13 | #' is 'c("ELNET", "GBM", "RF", "DL", "Ensemble")'.
14 | #' @param ... arguments to be passed to mlim
15 | #' @return imputed data.frame
16 | #' @examples
17 | #' \dontrun{
18 | #' data(iris)
19 | #'
20 | #' # add 10% stratified missing values to one factor variable
21 | #' irisNA <- iris
22 | #' irisNA$Species <- mlim.na(irisNA$Species, p = 0.1, stratify = TRUE, seed = 2022)
23 | #'
24 | #' # run the default imputation
25 | #' MLIM <- mlim(irisNA)
26 | #' mlim.error(MLIM, irisNA, iris)
27 | #'
28 | #' # carry out postimputation. for real-life applications, you should increase
29 | #' # the 'tuning_time' to at least 3600 seconds, even for small datasets
30 | #' post <- mlim.postimpute(data=irisNA, preimputed.data=MLIM, tuning_time=120,
31 | #' algos = c("GBM", "RF", "ELNET", "Ensemble"),
32 | #' seed = 2022)
33 | #' }
34 | #'
35 | #' @author E. F. Haghish
36 | #' @keywords Internal
37 | #' @noRd
38 |
39 |
40 | # mlim prints the data number. when postimpute is used, the data number
41 | # must be passed to mlim
42 | # mlim.postimpute should also allow saving and loading. this requires adding
43 | # other elements to the saving object
44 | mlim.postimpute <- function(data, preimputed.data,
45 | algos = c("ELNET", "GBM", "RF", "DL", "Ensemble"),
46 | ... ) {
47 | results <- NULL
48 |
49 | if (inherits(preimputed.data, "mlim.mi")) {
50 | results <- list()
51 | for (i in 1:length(preimputed.data)) {
52 | results[[i]] <- mlim(data = data,
53 | preimputed.data = preimputed.data[[i]],
54 | algos = algos,
55 | postimpute = FALSE,
56 | shutdown = FALSE,
57 | ...)
58 |
59 | }
60 | class(results) <- "mlim.mi"
61 | }
62 |
63 | # if the preimputation was done with mlim, extract the metrics
64 | else if (inherits(preimputed.data, "mlim")) {
65 | results <- mlim(data = data,
66 | preimputed.data = preimputed.data,
67 | algos = algos,
68 | postimpute = FALSE,
69 | ...)
70 | }
71 |
72 | return(results)
73 | }
74 |
75 |
76 |
77 |
--------------------------------------------------------------------------------
/R/mlim.preimpute.R:
--------------------------------------------------------------------------------
1 |
2 | #' @title carries out preimputation
3 | #' @description instead of replacing missing data with mean and mode, a smarter
4 | #' starting point would be to use fast imputation algorithms and then
5 | #' optimize the imputed dataset with mlim. this procedure usually
6 | #' requires fewer iterations and will save a lot of computational
7 | #' resources.
8 | # @importFrom VIM kNN
9 | #' @importFrom missRanger missRanger imputeUnivariate
10 | # @importFrom missForest missForest
11 | #' @param data data.frame with missing values
12 | #' @param preimpute character. specify the algorithm for preimputation. the
13 | #' supported options are "RF" (Random Forest), "mm"
14 | #' (mean-mode replacement), and "sample" (random sampling from the observed values).
15 | #' the default is "RF", which carries out
16 | #' a parallel random forest imputation, using all the CPUs available.
17 | #' the other alternative is "mm", which performs mean/mode
18 | #' imputation.
19 | #' @param seed integer. specify the random generator seed
20 | #' @return imputed data.frame
21 | #' @author E. F. Haghish
22 | #' @examples
23 | #' \dontrun{
24 | #' data(iris)
25 | #'
26 | #' # add 10% stratified missing values to one factor variable
27 | #' irisNA <- iris
28 | #' irisNA$Species <- mlim.na(irisNA$Species, p = 0.1, stratify = TRUE, seed = 2022)
29 | #'
30 | #' # run the default random forest preimputation
31 | #' MLIM <- mlim.preimpute(irisNA)
32 | #' mlim.error(MLIM, irisNA, iris)
33 | #'
34 | #' }
35 | #' @export
36 |
37 | mlim.preimpute <- function(data, preimpute = "RF", seed = NULL) {
38 |
39 | #if (tolower(preimpute) == "knn") {
40 | # set.seed(seed)
41 | # data <- VIM::kNN(data, imp_var=FALSE)
42 | # if (!is.null(report)) md.log("kNN preimputation is done", date=debug, time=debug, trace=FALSE)
43 | #}
44 |
45 | if (tolower(preimpute) == "rf") {
46 | message("\nPreimputation: Random Forest")
47 | pb <- txtProgressBar(0, 1, style = 3)
48 | data <- missRanger::missRanger(data, num.trees=500, mtry=1,
49 | verbose = 0, returnOOB=TRUE, seed = seed)
50 | setTxtProgressBar(pb, 1)
51 | #if (!is.null(report)) md.log("RF preimputation is done", date=debug, time=debug, trace=FALSE)
52 | }
53 | else if (tolower(preimpute) == "mm") {
54 | message("\nPreimputation: Mean/Mode")
55 | pb <- txtProgressBar(0, 1, style = 3)
56 | data <- meanmode(data)
57 | setTxtProgressBar(pb, 1)
58 | #if (!is.null(report)) md.log("Mean/Mode preimputation is done", date=debug, time=debug, trace=FALSE)
59 | }
60 | else if (tolower(preimpute) == "sample") {
61 | message("\nPreimputation: Random Sampling")
62 | if (!is.null(seed)) set.seed(seed)
63 | rsample <- function(x) replace(x, is.na(x), sample(x[!is.na(x)],sum(is.na(x))))
64 | pb <- txtProgressBar(0, 1, style = 3)
65 | for (i in colnames(data)) {
66 | if (sum(is.na(data[,i])) > 0) data[,i] <- rsample(data[,i])
67 | }
68 | setTxtProgressBar(pb, 1)
69 | }
70 | else stop(paste(preimpute, "is not a recognized preimputation argument"))
71 |
72 | return(data)
73 | }
74 |
75 |
76 |
77 |
--------------------------------------------------------------------------------
/man/mlim.error.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/mlim.error.R
3 | \name{mlim.error}
4 | \alias{mlim.error}
5 | \title{imputation error}
6 | \usage{
7 | mlim.error(
8 | imputed,
9 | incomplete,
10 | complete,
11 | transform = NULL,
12 | varwise = FALSE,
13 | ignore.missclass = TRUE,
14 | ignore.rank = FALSE
15 | )
16 | }
17 | \arguments{
18 | \item{imputed}{the imputed dataframe}
19 |
20 | \item{incomplete}{the dataframe with missing values}
21 |
22 | \item{complete}{the original dataframe with no missing values}
23 |
24 | \item{transform}{character. it can be either "standardize", which standardizes the
25 | numeric variables before evaluating the imputation error, or
26 | "normalize", which changes the scale of continuous variables to
27 | range from 0 to 1. the default is NULL.}
28 |
29 | \item{varwise}{logical, default is FALSE. if TRUE, in addition to the
30 | mean accuracy for each variable type, the algorithm's
31 | performance for each variable (column) of the dataset is
32 | also returned. if TRUE, instead of a numeric vector, a
33 | list is returned.}
34 |
35 | \item{ignore.missclass}{logical. the default is TRUE. if FALSE, the overall
36 | misclassification rate for imputed unordered factors will be
37 | returned. in general, the misclassification rate is not recommended,
38 | particularly for multinomial factors, because it is not robust
39 | to imbalanced data. in other words, an imputation might show
40 | a very high accuracy, because it is biased towards the majority
41 | class, ignoring the minority levels. to avoid this error,
42 | Mean Per Class Error (MPCE) is returned, which is the average
43 | misclassification of each class and thus, it is a fairer
44 | criterion for evaluating multinomial classes.}
45 |
46 | \item{ignore.rank}{logical (default is FALSE, which is recommended). if TRUE,
47 | the accuracy of imputation of ordered factors (ordinal variables)
48 | will be evaluated based on the 'misclassification rate' instead of
49 | normalized Euclidean distance. this practice is not recommended
50 | because a higher classification rate for ordinal variables does not
51 | guarantee lower distances between the imputed levels, despite the
52 | popularity of evaluating ordinal variables based on misclassification
53 | rate. in other words, assume an ordinal variable has 5 levels (1. strongly
54 | disagree, 2. disagree, 3. uncertain, 4. agree, 5. strongly agree). in this
55 | example, if "ignore.rank = TRUE", then an imputation that imputes level
56 | "5" as "4" is equally inaccurate as another algorithm that imputes level "5"
57 | as "1". therefore, if you have ordinal variables in your dataset, make sure
58 | you declare them as "ordered" factors to get the best imputation accuracy.}
59 | }
60 | \value{
61 | numeric vector
62 | }
63 | \description{
64 | calculates NRMSE, misclassification rate, and miss-ranking
65 | absolute mean distance, scaled between 0 and 1, where 1 means
66 | maximum distance between the actual rank of a level and the
67 | imputed level.
68 | }
69 | \examples{
70 |
71 | \dontrun{
72 | data(iris)
73 |
74 | # add 10\% missing values, ensure missingness is stratified for factors
75 | irisNA <- mlim.na(iris, p = 0.1, stratify = TRUE, seed = 2022)
76 |
77 | # run the default imputation
78 | MLIM <- mlim(irisNA)
79 | mlim.error(MLIM, irisNA, iris)
80 |
81 | # get error estimations for each variable
82 | mlim.error(MLIM, irisNA, iris, varwise = TRUE)
83 | }
84 | }
85 | \author{
86 | E. F. Haghish
87 | }
88 |
--------------------------------------------------------------------------------
/R/mlim.na.R:
--------------------------------------------------------------------------------
1 | #' @title add stratified/unstratified artificial missing observations
2 | #' @description to examine the performance of imputation algorithms, artificial
3 | #' missing data are added to datasets and then imputed, to compare
4 | #' the original observations with the imputed values. this function
5 | #' can add stratified or unstratified artificial missing data. stratified
6 | #' missing data can be particularly useful if your categorical or ordinal
7 | #' variables are imbalanced, i.e., one category appears at a much higher
8 | #' rate than others.
9 | #' @param x data.frame. x must be strictly a data.frame and any other
10 | #' data.table classes will be rejected
11 | #' @param p percentage of missingness to be added to the data
12 | #' @param stratify logical. if TRUE, stratified sampling will be
13 | #' carried out when adding NA values to 'factor' variables
14 | #' (either ordered or unordered). this feature makes the evaluation
15 | #' of missing data imputation algorithms fairer, especially
16 | #' when the factor levels are imbalanced.
17 | #' @param classes character vector, specifying the variable classes that should
18 | #' be selected for adding NA values. the default value is NULL,
19 | #' meaning all variables will receive NA values with probability of 'p'.
20 | #' however, if you wish to add NA values only to specific classes, e.g.
21 | #' 'numeric' variables or 'ordered' factors, specify them in this argument.
22 | #' e.g. write "classes = c('numeric', 'ordered')" if you wish to add NAs
23 | #' only to numeric and ordered factors.
24 | #' @param seed integer. a random seed number for reproducing the result (recommended)
25 | #' @author E. F. Haghish
26 | #' @examples
27 | #'
28 | #' \dontrun{
29 | #' # adding stratified NA to an atomic vector
30 | #' x <- as.factor(c(rep("M", 100), rep("F", 900)))
31 | #' table(mlim.na(x, p=.5, stratify = TRUE))
32 | #'
33 | #' # adding unstratified NAs to all variables of a data.frame
34 | #' data(iris)
35 | #' mlim.na(iris, p=0.5, stratify = FALSE, seed = 1)
36 | #'
37 | #' # or add stratified NAs only to factor variables, ignoring other variables
38 | #' mlim.na(iris, p=0.5, stratify = TRUE, classes = "factor", seed = 1)
39 | #'
40 | #' # or add NAs to numeric variables
41 | #' mlim.na(iris, p=0.5, classes = "numeric", seed = 1)
42 | #' }
43 | #' @return data.frame
44 | #' @export
45 |
46 | mlim.na <- function(x, p = 0.1, stratify=FALSE, classes=NULL, seed = NULL) {
47 |
48 | # Syntax processing
49 | # ------------------------------------------------------------
50 | stopifnot(
51 | "'p' should be between 0 and 1" = p >= 0 & p <= 1,
52 | "'x' type is not recognized" = is.atomic(x) || is.data.frame(x),
53 | "class of 'x' must be strictly a data.frame" = ! class(x) %in% c("tbl", "tbl_df")
54 | )
55 |
56 | # set the seed for reproducibility
57 | if (!is.null(seed)) set.seed(seed)
58 |
59 | # if 'x' is not a dataframe:
60 | if (is.atomic(x)) {
61 | x <- addNA(x, p, stratify = stratify)
62 | return(x)
63 | }
64 | else {
65 | for (i in colnames(x)) {
66 | if (is.null(classes)) x[, i] <- addNA(x[, i], p, stratify = stratify)
67 | else {
68 | # force drop in case 'x' is not a data.frame
69 | if (class(x[, i, drop=TRUE])[1] %in% classes) x[, i] <- addNA(x[, i], p, stratify = stratify)
70 | }
71 | }
72 | }
73 |
74 | return(x)
75 | }
76 |
77 |
78 |
--------------------------------------------------------------------------------
/R/syntaxProcessing.R:
--------------------------------------------------------------------------------
1 |
2 | #' @title syntaxProcessing
3 | #' @description processes and validates the arguments passed to mlim, returning memory limits and logging settings
4 | #' @importFrom memuse Sys.meminfo
5 | #' @importFrom tools file_ext
6 | #' @author E. F. Haghish
7 | #' @keywords Internal
8 | #' @noRd
9 |
10 |
11 | syntaxProcessing <- function(data, preimpute, impute, ram,
12 | matching, miniter, maxiter, max_models,
13 | tuning_time, cv,
14 | verbosity, report, save) {
15 |
16 | #if ("GBM" %in% impute) {
17 | #if (nrow(data) < 200) stop("too few rows... use 'ELNET' or 'DRF' instead")
18 | #}
19 |
20 | # default values
21 | verbose <- NULL
22 | debug <- FALSE
23 | min_ram <- NULL
24 | max_ram <- NULL
25 |
26 | stopifnot(
27 | #"'data' is not a data.frame" = is.data.frame(data) | inherits(data, "mlim"),
28 | #"'data' has no observations" = dim(data) >= 1, #not applicable to mlim object
29 | #"'formula' should be a formula!" = inherits(formula, "formula"),
30 | #length(formula <- as.character(formula)) == 3L,
31 | #"'max_models' must be a positive integer equal to or greater than 1" = max_models >= 1,
32 | #"'tuning_time' must be a positive integer" = tuning_time >= 2,
33 | "'cv' must be a positive integer equal to or greater than 10" = cv >= 10,
34 | "'save' argument must have '*.mlim' file extension" = tools::file_ext(save) == "mlim"
35 | )
36 |
37 | #if (miniter < 2 & preimpute == "iterate") stop("'miniter' must not be less than 2")
38 | #if ( maxiter < 2 & preimpute == "iterate") stop("'maxiter' must not be less than 2")
39 |
40 | # if (!is.null(weights_column)) {
41 | # stopifnot(
42 | # "'weights_column' must have equal length with the data.frame" = length(weights_column) == nrow(data),
43 | # "'weights_column' must not have any missing observations" = !anyNA(weights_column)
44 | # )
45 | # }
46 |
47 |
48 | if (!is.null(ram)) {
49 | if (!is.numeric(ram)) stop("'ram' must be an integer, specifying amount of RAM in Gigabytes")
50 | min_ram <- paste0(ram - 1, "G")
51 | max_ram <- paste0(ram, "G")
52 |
53 | # if more than 2/3 of the RAM is dedicated to Java server, give a warning
54 | if (ram > 0.666*round(as.numeric(memuse::Sys.meminfo()$totalram)*9.31*1e-10)) {
55 | message("NOTE: you have dedicated more than 2/3 of your total RAM to mlim.\n This is fine as long as you do not use the 'XGB' algorithm.\n You are advised to monitor your RAM during the imputation...\n\n")
56 | }
57 | }
58 | else {
59 | ## NOTE: memuse::Sys.meminfo() can return RAM near zero, which fails
60 | ## initiating the Java server
61 | # ram <- floor(as.numeric(memuse::Sys.meminfo()$freeram)*9.31*1e-10)
62 | # if (ram > 4) {
63 | # min_ram <- paste0(ram - 1, "G")
64 | # max_ram <- paste0(ram, "G")
65 | # }
66 | # else {
67 | # min_ram <- NULL
68 | # max_ram <- NULL
69 | # }
70 | min_ram <- NULL
71 | max_ram <- NULL
72 | }
73 |
74 | if ("StackedEnsemble" %in% impute | "Ensemble" %in% impute) { # the ensemble may appear under either name
75 | keep_cross_validation_predictions <- TRUE
76 | }
77 | else {
78 | keep_cross_validation_predictions <- FALSE
79 | }
80 |
81 | # define logging levels and debugging
82 | if (is.null(verbosity)) verbose <- 0
83 | else if (verbosity == "warn") verbose <- 1
84 | else if (verbosity == "info") verbose <- 2
85 | else if (verbosity == "debug") {
86 | verbose <- 3
87 | debug <- TRUE
88 | }
89 |
90 | return(list(min_ram=min_ram,
91 | max_ram=max_ram,
92 | keep_cross_validation_predictions=keep_cross_validation_predictions,
93 | verbose=verbose,
94 | debug=debug))
95 | }
96 |
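97 | # a sketch of the RAM handling above (illustrative value): if the user
98 | # dedicates ram = 8 (GB), the Java server limits become
99 | # paste0(8 - 1, "G")  # "7G" (min_ram)
100 | # paste0(8, "G")      # "8G" (max_ram)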
--------------------------------------------------------------------------------
/R/stoppingCriteria.R:
--------------------------------------------------------------------------------
1 | #' @title iteration stopping criteria
2 | #' @description evaluates the stopping criteria
3 | #' @param metrics estimated error from CV
4 | #' @param k iteration round
5 | #' @param error_metric character. stopping metric for the iteration. default is "RMSE"
6 | #' @return logical. if TRUE, the imputation goes on to the next iteration
7 | #' @author E. F. Haghish
8 | #' @keywords Internal
9 | #' @noRd
10 |
11 | stoppingCriteria <- function(method = "iteration_RMSE",
12 | miniter, maxiter,
13 | metrics, k, vars2impute,
14 | error_metric,
15 | tolerance,
16 | postimpute, runpostimpute,
17 | md.log) {
18 |
19 | # keep running unless...
20 | running <- TRUE
21 | error <- NA
22 | errImprovement <- NA
23 |
24 | # ............................................................
25 | # as long as there is a variable that has been improving,
26 | # keep iterating. however, if "double.check = FALSE", ignore
27 | # variables that do not improve in any iteration throughout
28 | # the rest of the iterations
29 | # ............................................................
30 | if (method == "varwise_NA") {
31 | # as long as there is a variable whose RMSE is not NA, keep going!
32 | if (running) {
33 | error <- mean(metrics[metrics$iteration == k,
34 | error_metric], na.rm = TRUE)
35 |
36 | if (is.na(error)) {
37 |
38 | # if all values were NA, well, stop then, if there is no postimpute!
39 | if (is.null(postimpute)) {
40 | if (is.na(error)) running <- FALSE
41 | }
42 | else {
43 | if (!runpostimpute) {
44 | runpostimpute <- TRUE
45 | vars2impute <- NULL #avoid the loops on the base imputer
46 | }
47 | else {
48 | running <- FALSE
49 | runpostimpute <- FALSE
50 | }
51 | }
52 | }
53 |
54 |
55 |
56 | }
57 | }
58 |
59 | # ............................................................
60 | # Decide the stopping criteria based on average improvement of
61 | # RMSE per iteration
62 | #
63 | # ............................................................
64 | if (method == "iteration_RMSE") {
65 | # if there has been no (or too little) improvement, stop!
66 | if (running) {
67 | error <- mean(metrics[metrics$iteration == k,
68 | error_metric], na.rm = TRUE)
69 |
70 | if (k == 1) message("\n ",error_metric,
71 | " = ", round(error,4), "\n", sep = "")
72 |
73 | if (k >= 2) {
74 | # get the RMSEs that became NA because of saturation
75 | available <- !is.na(metrics[metrics$iteration == k, error_metric])
76 | errPrevious <- mean(metrics[metrics$iteration == k-1 & available,
77 | error_metric],
78 | na.rm = TRUE)
79 |
80 | errImprovement <- error - errPrevious
81 | if (!is.na(error) & !is.na(errImprovement)) {
82 | percentImprove <- (errImprovement / errPrevious)
83 | }
84 |
85 | if (!is.na(errImprovement)) {
86 | if (percentImprove < 0) {
87 | message("\n ",error_metric,
88 | " = ", round(error,4), " (improved by ",
89 | round(-percentImprove*100, 3),"%)", "\n", sep = "")
90 | }
91 | else {
92 | message("\n ",error_metric,
93 | " = ", round(error,4), " (increased by ",
94 | round(percentImprove*100, 3),"%)", "\n", sep = "")
95 | }
96 |
97 | #message(paste0(error_metric,
98 | # " improved by: ", round(-percentImprove*100,4),"%"))
99 | #running <- errImprovement < (-tolerance)
100 | running <- percentImprove < (-tolerance)
101 | }
102 | }
103 | }
104 | }
105 |
106 |
107 | # if the maximum iteration has been reached and it is still running...
108 | # ------------------------------------------------------------
109 | if (k == maxiter & running) {
110 | warning("the imputation could be further improved by increasing the number of iterations")
111 | }
112 |
113 | # maximum iteration has been reached
114 | # ------------------------------------------------------------
115 | if (k == maxiter) running <- FALSE
116 |
117 | if (!running) runpostimpute <- FALSE
118 |
119 | return(list(running = running,
120 | error = error,
121 | vars2impute = vars2impute,
122 | improvement = errImprovement,
123 | runpostimpute = runpostimpute))
124 | }
125 |
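126 | # a worked example of the relative-improvement rule (illustrative numbers):
127 | # if the mean RMSE was 0.50 in iteration k-1 and is 0.48 in iteration k, then
128 | # errImprovement = 0.48 - 0.50 = -0.02 and percentImprove = -0.02/0.50 = -4%;
129 | # with tolerance = 0.001 (0.1%), percentImprove < -tolerance holds, so
130 | # 'running' stays TRUE and the imputation proceeds to another iteration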
--------------------------------------------------------------------------------
/man/manifest.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/manifest.R
3 | \docType{data}
4 | \name{manifest}
5 | \alias{manifest}
6 | \title{Manifest Anxiety Scale}
7 | \format{
8 | A data frame with 4469 rows and 52 variables:
9 | \describe{
10 | \item{gender}{participants' gender}
11 | \item{age}{participants' age in years}
12 | \item{Q1}{I do not tire quickly.}
13 | \item{Q2}{I am troubled by attacks of nausea.}
14 | \item{Q3}{I believe I am no more nervous than most others.}
15 | \item{Q4}{I have very few headaches.}
16 | \item{Q5}{I work under a great deal of tension.}
17 | \item{Q6}{I cannot keep my mind on one thing.}
18 | \item{Q7}{I worry over money and business.}
19 | \item{Q8}{I frequently notice my hand shakes when I try to do something.}
20 | \item{Q9}{I blush no more often than others.}
21 | \item{Q10}{I have diarrhea once a month or more.}
22 | \item{Q11}{I worry quite a bit over possible misfortunes.}
23 | \item{Q12}{I practically never blush.}
24 | \item{Q13}{I am often afraid that I am going to blush.}
25 | \item{Q14}{I have nightmares every few nights.}
26 | \item{Q15}{My hands and feet are usually warm.}
27 | \item{Q16}{I sweat very easily even on cool days.}
28 | \item{Q17}{Sometimes when embarrassed, I break out in a sweat.}
29 | \item{Q18}{I hardly ever notice my heart pounding and I am seldom short of breath.}
30 | \item{Q19}{I feel hungry almost all the time.}
31 | \item{Q20}{I am very seldom troubled by constipation.}
32 | \item{Q21}{I have a great deal of stomach trouble.}
33 | \item{Q22}{I have had periods in which I lost sleep over worry.}
34 | \item{Q23}{My sleep is fitful and disturbed.}
35 | \item{Q24}{I dream frequently about things that are best kept to myself.}
36 | \item{Q25}{I am easily embarrassed.}
37 | \item{Q26}{I am more sensitive than most other people.}
38 | \item{Q27}{I frequently find myself worrying about something.}
39 | \item{Q28}{I wish I could be as happy as others seem to be.}
40 | \item{Q29}{I am usually calm and not easily upset.}
41 | \item{Q30}{I cry easily.}
42 | \item{Q31}{I feel anxiety about something or someone almost all the time.}
43 | \item{Q32}{I am happy most of the time.}
44 | \item{Q33}{It makes me nervous to have to wait.}
45 | \item{Q34}{I have periods of such great restlessness that I cannot sit long in a chair.}
46 | \item{Q35}{Sometimes I become so excited that I find it hard to get to sleep.}
47 | \item{Q36}{I have sometimes felt that difficulties were piling up so high that I could not overcome them.}
48 | \item{Q37}{I must admit that I have at times been worried beyond reason over something that really did not matter.}
49 | \item{Q38}{I have very few fears compared to my friends.}
50 | \item{Q39}{I have been afraid of things or people that I know could not hurt me.}
51 | \item{Q40}{I certainly feel useless at times.}
52 | \item{Q41}{I find it hard to keep my mind on a task or job.}
53 | \item{Q42}{I am usually self-conscious.}
54 | \item{Q43}{I am inclined to take things hard.}
55 | \item{Q44}{I am a high-strung person.}
56 | \item{Q45}{Life is a trial for me much of the time.}
57 | \item{Q46}{At times I think I am no good at all.}
58 | \item{Q47}{I am certainly lacking in self-confidence.}
59 | \item{Q48}{I sometimes feel that I am about to go to pieces.}
60 | \item{Q49}{I shrink from facing a crisis or difficulty.}
61 | \item{Q50}{I am entirely self-confident.}
62 | }
63 | }
64 | \source{
65 | \url{https://openpsychometrics.org/tests/TMAS/}
66 | }
67 | \usage{
68 | manifest
69 | }
70 | \description{
71 | The Taylor Manifest Anxiety Scale was first developed in 1953 to identify
72 | individuals who would be good subjects for studies of stress and other
73 | related psychological phenomena. Since then it has been used as a measure of
74 | anxiety as a general personality trait. Anxiety is a complex psychological
75 | construct that includes a multitude of different facets related to extensive
76 | worrying that may impair normal functioning. The test has been widely studied
77 | and used in research; however, there are some concerns that it does not measure
78 | a single trait, but instead, measures a basket of loosely related ones, and so
79 | the score is not that meaningful.
80 | }
81 | \details{
82 | The data comes from an online offering of the Taylor Manifest Anxiety Scale.
83 | At the end of the test users were asked if their answers were accurate and
84 | could be used for research; 76% said yes, and their answers have been published on
85 | \url{https://openpsychometrics.org/}.
86 |
87 | items 1 to 50 were rated 1=True and 2=False. gender, chosen from
88 | a drop-down menu (1=male, 2=female, 3=other) and age was
89 | entered as a free response (ages<14 have been removed)
90 | }
91 | \references{
92 | Taylor, J. (1953). "A personality scale of manifest anxiety".
93 | The Journal of Abnormal and Social Psychology, 48(2), 285-290.
94 | }
95 | \keyword{datasets}
--------------------------------------------------------------------------------
/R/checkNconvert.R:
--------------------------------------------------------------------------------
1 |
2 | #' @title check data.frame features and convert incompatible variable types
3 | #' @description extracts feature classes and converts incompatible variable types
4 | #' @param data data.frame
5 | #' @param ignore.rank (RECOMMENDED) model ordinal variables as gaussian, but force
6 | #' the imputed values to be integers, which are reverted later. if FALSE, the
7 | #' feature will be imputed as a multinomial factor.
8 | #' @return character vector of feature types.
9 | #' @examples
10 | #' \dontrun{
11 | #' data(iris)
12 | #'
13 | #' # add an integer variable to iris
14 | #' iris$int <- as.integer(round(iris$Sepal.Length + iris$Sepal.Width))
15 | #'
16 | #' # add an ordered factor to iris
17 | #' iris$order <- factor(iris$Species, order = TRUE,
18 | #' levels = c("setosa","versicolor","virginica"))
19 | #'
20 | #' # add a binary variable to iris
21 | #' iris$binary <- iris$Species
22 | #' iris$binary[iris$binary=="versicolor"] <- "setosa"
23 | #'
24 | #' #print(checkNconvert(iris))
25 | #' }
26 | #' @author E. F.
Haghish 27 | #' @keywords Internal 28 | #' @noRd 29 | 30 | checkNconvert <- function(data, vars2impute, ignore, 31 | ignore.rank=FALSE, report=NULL) { 32 | 33 | # CHECK THAT 'data' is a data.frame 34 | 35 | mem <- NULL 36 | orderedIndex <- 0 37 | 38 | ncl <- ncol(data) 39 | features <- character(ncl) 40 | family <- character(ncl) 41 | classes <- lapply(data, class) 42 | COLNAMES <- colnames(data) 43 | 44 | # get the vartype of the variables that should be imputed 45 | # convert incompatible variable types 46 | # ============================================================ 47 | j <- 0 48 | 49 | for (i in COLNAMES) { 50 | j <- j + 1 51 | # first evaluate the factors and numeric 52 | if ("factor" %in% classes[[i]]) { 53 | features[j] <- 'factor' 54 | cardinality <- length(unique(data[!is.na(data[, i]), i])) 55 | if (cardinality <= 2) family[j] <- 'binomial' 56 | else family[j] <- 'multinomial' 57 | } 58 | else if ("numeric" %in% classes[[i]]){ 59 | features[j] <- 'numeric' 60 | family[j] <- 'gaussian' 61 | } 62 | 63 | # then search for ordinal & integers 64 | if ("ordered" %in% classes[[i]]) { 65 | features[j] <- 'ordered' 66 | cardinality <- length(unique(data[!is.na(data[, i]), i])) 67 | if (cardinality <= 2 & ignore.rank) { 68 | family[j] <- 'binomial' 69 | data[,i] <- factor(data[,i], ordered = FALSE) 70 | } 71 | else if (cardinality <= 2 & !ignore.rank) { 72 | orderedIndex <- orderedIndex + 1 73 | mem[[orderedIndex]] <- factmem(data[, i, drop = FALSE]) 74 | family[j] <- 'quasibinomial' 75 | # take the labels if numeric 76 | if (is.numeric(as.character(data[,i]))) { 77 | data[,i] <- as.numeric(levels(data[,i]))[data[,i]] 78 | } 79 | # otherwise take the levels 80 | else { 81 | data[,i] <- as.numeric(data[,i]) 82 | } 83 | } 84 | else if (cardinality > 2 & ignore.rank) { 85 | family[j] <- 'multinomial' 86 | data[,i] <- factor(data[,i], ordered = FALSE) 87 | } 88 | else if (cardinality > 2 & !ignore.rank) { 89 | orderedIndex <- orderedIndex + 1 90 | mem[[orderedIndex]] <- factmem(data[, i, drop = FALSE]) 91 | family[j] <- 'gaussian_integer' 92 | 93 | # take the labels if numeric 94 | if (is.numeric(as.character(data[,i]))) { 95 | data[,i] <- as.numeric(levels(data[,i]))[data[,i]] 96 | } 97 | # otherwise take the levels 98 | else { 99 | data[,i] <- as.numeric(data[,i]) 100 | } 101 | } 102 | } 103 | else if ("integer" %in% classes[[i]]) { 104 | features[j] <- 'integer' 105 | family[j] <- 'gaussian_integer' 106 | data[,i] <- as.numeric(data[,i]) 107 | } 108 | else if ("character" %in% classes[[i]]) { 109 | msg <- paste(i, "variable is of class 'character', which", 110 | "is not supported. 
either convert it to a 'factor'",
111 | "or give it to the 'ignore' argument.")
112 | stop(msg)
113 | }
114 | }
115 |
116 | # only return the class and family of the vars2impute, if not ignored
117 | # ============================================================
118 | # now match the location of the variables according to vars2impute order
119 | index <- match(vars2impute, COLNAMES)
120 | return(list(class = features[index],
121 | family = family[index],
122 | COLNAMES = COLNAMES[index],
123 | data = data,
124 | mem = mem,
125 | orderedCols = which(features == "ordered")))
126 | }
127 |
128 |
129 | #iim <- selectVariables(data=irisWithNA, ignore = c("Sepal.Length","Sepal.Width" ,"Petal.Length","Petal.Width"))
130 | #get <- checkNconvert(data=DATA1, vars2impute=vars2impute, ignore = NULL)
131 | #print(get$family)
132 | #print(get$class)
133 | #data(cars)
134 | #cars$dist[c(1,4,7,13,16,22,26,29,35,44,45)] <- NA
135 | #a <- mlim:::selectVariables(data=cars)
136 | #b <- checkNconvert(data=cars, vars2impute=a$vars2impute, ignore = NULL)
--------------------------------------------------------------------------------
/R/manifest.R:
--------------------------------------------------------------------------------
1 | #' Manifest Anxiety Scale
2 | #'
3 | #' The Taylor Manifest Anxiety Scale was first developed in 1953 to identify
4 | #' individuals who would be good subjects for studies of stress and other
5 | #' related psychological phenomena. Since then it has been used as a measure of
6 | #' anxiety as a general personality trait. Anxiety is a complex psychological
7 | #' construct that includes a multitude of different facets related to extensive
8 | #' worrying that may impair normal functioning. The test has been widely studied
9 | #' and used in research; however, there are some concerns that it does not measure
10 | #' a single trait, but instead, measures a basket of loosely related ones, and so
11 | #' the score is not that meaningful.
12 | #'
13 | #' The data comes from an online offering of the Taylor Manifest Anxiety Scale.
14 | #' At the end of the test users were asked if their answers were accurate and
15 | #' could be used for research; 76% said yes, and their answers have been published on
16 | #' \url{https://openpsychometrics.org/}.
17 | #'
18 | #' items 1 to 50 were rated 1=True and 2=False.
gender, chosen from 19 | #' a drop down menu (1=male, 2=female, 3=other) and age was 20 | #' entered as a free response (ages<14 have been removed) 21 | #' 22 | #' @format A data frame with 4469 rows and 52 variables: 23 | #' \describe{ 24 | #' \item{gender}{participants' gender} 25 | #' \item{age}{participants' age in years} 26 | #' \item{Q1}{I do not tire quickly.} 27 | #' \item{Q2}{I am troubled by attacks of nausea.} 28 | #' \item{Q3}{I believe I am no more nervous than most others.} 29 | #' \item{Q4}{I have very few headaches.} 30 | #' \item{Q5}{I work under a great deal of tension.} 31 | #' \item{Q6}{I cannot keep my mind on one thing.} 32 | #' \item{Q7}{I worry over money and business.} 33 | #' \item{Q8}{I frequently notice my hand shakes when I try to do something.} 34 | #' \item{Q9}{I blush no more often than others.} 35 | #' \item{Q10}{I have diarrhea once a month or more.} 36 | #' \item{Q11}{I worry quite a bit over possible misfortunes.} 37 | #' \item{Q12}{I practically never blush.} 38 | #' \item{Q13}{I am often afraid that I am going to blush.} 39 | #' \item{Q14}{I have nightmares every few nights.} 40 | #' \item{Q15}{My hands and feet are usually warm.} 41 | #' \item{Q16}{I sweat very easily even on cool days.} 42 | #' \item{Q17}{Sometimes when embarrassed, I break out in a sweat.} 43 | #' \item{Q18}{I hardly ever notice my heart pounding and I am seldom short of breath.} 44 | #' \item{Q19}{I feel hungry almost all the time.} 45 | #' \item{Q20}{I am very seldom troubled by constipation.} 46 | #' \item{Q21}{I have a great deal of stomach trouble.} 47 | #' \item{Q22}{I have had periods in which I lost sleep over worry.} 48 | #' \item{Q23}{My sleep is fitful and disturbed.} 49 | #' \item{Q24}{I dream frequently about things that are best kept to myself.} 50 | #' \item{Q25}{I am easily embarrassed.} 51 | #' \item{Q26}{I am more sensitive than most other people.} 52 | #' \item{Q27}{I frequently find myself worrying about something.} 53 | #' \item{Q28}{I wish I could be as happy as others seem to be.} 54 | #' \item{Q29}{I am usually calm and not easily upset.} 55 | #' \item{Q30}{I cry easily.} 56 | #' \item{Q31}{I feel anxiety about something or someone almost all the time.} 57 | #' \item{Q32}{I am happy most of the time.} 58 | #' \item{Q33}{It makes me nervous to have to wait.} 59 | #' \item{Q34}{I have periods of such great restlessness that I cannot sit long I a chair.} 60 | #' \item{Q35}{Sometimes I become so excited that I find it hard to get to sleep.} 61 | #' \item{Q36}{I have sometimes felt that difficulties were piling up so high that I could not overcome them.} 62 | #' \item{Q37}{I must admit that I have at times been worried beyond reason over something that really did not matter.} 63 | #' \item{Q38}{I have very few fears compared to my friends.} 64 | #' \item{Q39}{I have been afraid of things or people that I know could not hurt me.} 65 | #' \item{Q40}{I certainly feel useless at times.} 66 | #' \item{Q41}{I find it hard to keep my mind on a task or job.} 67 | #' \item{Q42}{I am usually self-conscious.} 68 | #' \item{Q43}{I am inclined to take things hard.} 69 | #' \item{Q44}{I am a high-strung person.} 70 | #' \item{Q45}{Life is a trial for me much of the time.} 71 | #' \item{Q46}{At times I think I am no good at all.} 72 | #' \item{Q47}{I am certainly lacking in self-confidence.} 73 | #' \item{Q48}{I sometimes feel that I am about to go to pieces.} 74 | #' \item{Q49}{I shrink from facing crisis of difficulty.} 75 | #' \item{Q50}{I am entirely self-confident.} 76 | #' 
}
77 | #' @source \url{https://openpsychometrics.org/tests/TMAS/}
78 | #' @references Taylor, J. (1953). "A personality scale of manifest anxiety".
79 | #' The Journal of Abnormal and Social Psychology, 48(2), 285-290.
80 | "manifest"
81 | 
82 | # TMA <- read_csv("TMA.csv", col_types = cols(score = col_skip()))
83 | # TMA <- as.data.frame(TMA)
84 | # head(TMA$gender)
85 | # TMA$gender[TMA$gender == 0] <- NA
86 | # TMA$gender <- factor(TMA$gender,
87 | #                      levels = c(1,2,3),
88 | #                      labels = c("Male","Female","Other"))
89 | #
90 | # # code the missing responses
91 | # for (i in 1:50) {
92 | #   index <- TMA[, paste0("Q",i)] == 0
93 | #   TMA[index, paste0("Q",i)] <- NA
94 | #   TMA[, paste0("Q",i)] <- factor(TMA[, paste0("Q",i)],
95 | #                                  levels = c(1,2),
96 | #                                  labels = c("TRUE","FALSE"))
97 | # }
98 | # manifest <- na.omit(manifest)
99 | 
100 | 
101 | 
--------------------------------------------------------------------------------
/R/mlim.error.R:
--------------------------------------------------------------------------------
1 | #' @title imputation error
2 | #' @description calculates NRMSE, misclassification rate, and mis-ranking
3 | #'              absolute mean distance, scaled between 0 and 1, where 1 means
4 | #'              maximum distance between the actual rank of a level and the
5 | #'              imputed level.
6 | #' @param imputed the imputed dataframe
7 | #' @param incomplete the dataframe with missing values
8 | #' @param complete the original dataframe with no missing values
9 | #' @param transform character. it can be either "standardize", which standardizes the
10 | #'                  numeric variables before evaluating the imputation error, or
11 | #'                  "normalize", which changes the scale of continuous variables to
12 | #'                  range from 0 to 1. the default is NULL.
13 | #' @param varwise logical, default is FALSE. if TRUE, in addition to
14 | #'                the mean accuracy for each variable type, the algorithm's
15 | #'                performance for each variable (column) of the dataset is
16 | #'                also returned. if TRUE, instead of a numeric vector, a
17 | #'                list is returned.
18 | #' @param ignore.missclass logical. the default is TRUE. if FALSE, the overall
19 | #'                misclassification rate for imputed unordered factors will be
20 | #'                returned. in general, the misclassification rate is not recommended,
21 | #'                particularly for multinomial factors, because it is not robust
22 | #'                to imbalanced data. in other words, an imputation might show
23 | #'                a very high accuracy because it is biased towards the majority
24 | #'                class, ignoring the minority levels. to avoid this error,
25 | #'                Mean Per Class Error (MPCE) is returned, which is the average
26 | #'                misclassification of each class and thus a fairer
27 | #'                criterion for evaluating multinomial classes.
28 | #' @param ignore.rank logical (default is FALSE, which is recommended). if TRUE,
29 | #'                the accuracy of imputation of ordered factors (ordinal variables)
30 | #'                will be evaluated based on the 'misclassification rate' instead of
31 | #'                normalized euclidean distance. this practice is not recommended
32 | #'                because a higher classification rate for ordinal variables does not
33 | #'                guarantee lower distances between the imputed levels, despite the
34 | #'                popularity of evaluating ordinal variables based on misclassification
35 | #'                rate. in other words, assume an ordinal variable has 5 levels (1. strongly
36 | #'                disagree, 2. disagree, 3. uncertain, 4. agree, 5. strongly agree).
in this 37 | #' example, if "ignore.rank = TRUE", then an imputation that imputes level 38 | #' "5" as "4" is equally inaccurate as other algorithm that imputes level "5" 39 | #' as "1". therefore, if you have ordinal variables in your dataset, make sure 40 | #' you declare them as "ordered" factors to get the best imputation accuracy. 41 | 42 | #' @author E. F. Haghish 43 | #' @examples 44 | #' 45 | #' \dontrun{ 46 | #' data(iris) 47 | #' 48 | #' # add 10% missing values, ensure missingness is stratified for factors 49 | #' irisNA <- mlim.na(iris, p = 0.1, stratify = TRUE, seed = 2022) 50 | #' 51 | #' # run the default imputation 52 | #' MLIM <- mlim(irisNA) 53 | #' mlim.error(MLIM, irisNA, iris) 54 | #' 55 | #' # get error estimations for each variable 56 | #' mlim.error(MLIM, irisNA, iris, varwise = TRUE) 57 | #' } 58 | #' @return numeric vector 59 | #' @export 60 | mlim.error <- function(imputed, incomplete, complete, transform = NULL, 61 | varwise = FALSE, ignore.missclass = TRUE, 62 | ignore.rank=FALSE) { 63 | 64 | rankerror <- NA 65 | classerror <- NA 66 | meanclasserr <- NA 67 | nrmse <- NA 68 | err <- NULL 69 | 70 | if ("mlim" %in% class(imputed) | "data.frame" %in% class(imputed) ) { 71 | # make sure the complete dataset is complete! 72 | if (length(which(colSums(is.na(complete)) > 0)) > 0) 73 | stop("'complete dataset has missing values") 74 | 75 | # get the variables with missing data, ignoring the rest 76 | naCols <- which(colSums(is.na(incomplete)) > 0) 77 | imputed <- imputed[, naCols, drop = FALSE] 78 | incomplete <- incomplete[, naCols, drop = FALSE] 79 | complete <- complete[, naCols, drop = FALSE] 80 | 81 | classes <- lapply(complete, class) 82 | types <- character(length(classes)) 83 | for (i in 1:length(classes)) types[i] <- classes[[i]][1] 84 | if ("integer" %in% types) types[which(types == "integer")] <- "numeric" 85 | 86 | n <- nrow(imputed) 87 | 88 | if (!ignore.rank) { 89 | err <- rep(NA, 4) 90 | names(err) <- c('nrmse', 'mpce', 'missclass', 'missrank') 91 | } 92 | else { 93 | err <- rep(NA, 3) 94 | names(err) <- c('nrmse', 'mpce', 'missclass') 95 | if ("ordered" %in% types) types[which(types == "ordered")] <- "factor" 96 | } 97 | 98 | for (t in types){ 99 | ind <- which(types == t) 100 | 101 | if (t == "numeric") { 102 | if (is.null(transform)) { 103 | nrmse <- nrmse(imputed[,ind, drop = FALSE], 104 | incomplete[,ind, drop = FALSE], 105 | complete[,ind, drop = FALSE]) 106 | if (!is.null(nrmse)) err[1] <- mean(nrmse, na.rm = TRUE) 107 | } 108 | else { 109 | if (transform == "standardize") { 110 | v1 <- scale(imputed[,ind, drop = FALSE]) 111 | v2 <- scale(incomplete[,ind, drop = FALSE]) 112 | v3 <- scale(complete[,ind, drop = FALSE]) 113 | } 114 | else if (transform == "normalize") { 115 | v1 <- normalize(imputed[,ind, drop = FALSE]) 116 | v2 <- normalize(incomplete[,ind, drop = FALSE]) 117 | v3 <- normalize(complete[,ind, drop = FALSE]) 118 | } 119 | else { 120 | stop(paste(transform, "is not a recognized transformation")) 121 | } 122 | nrmse <- nrmse(v1, v2, v3) 123 | if (!is.null(nrmse)) err[1] <- mean(nrmse, na.rm = TRUE) 124 | } 125 | 126 | } 127 | else if (t == 'ordered' & !ignore.rank) { 128 | rankerror <- missrank(imputed[,ind, drop = FALSE], 129 | incomplete[,ind, drop = FALSE], 130 | complete[,ind, drop = FALSE]) 131 | if (!is.null(rankerror)) err[4] <- mean(rankerror, na.rm = TRUE) 132 | } 133 | 134 | # ??? this does not mean that it is necessarily a factor variable. 
IMPROVE IT 135 | else { 136 | meanclasserr <- mean_per_class_error(imputed[,ind, drop = FALSE], 137 | incomplete[,ind, drop = FALSE], 138 | complete[,ind, drop = FALSE]) 139 | if (!is.null(meanclasserr)) err[2] <- mean(meanclasserr, na.rm = TRUE) 140 | 141 | if (!ignore.missclass) { 142 | classerror <- missclass(imputed[,ind, drop = FALSE], 143 | incomplete[,ind, drop = FALSE], 144 | complete[,ind, drop = FALSE]) 145 | if (!is.null(classerror)) err[3] <- mean(classerror, na.rm = TRUE) 146 | } 147 | } 148 | } 149 | 150 | if (varwise) { 151 | vwa <- NULL 152 | if (length(nrmse) > 1) vwa <- c(vwa, nrmse) 153 | else if (is.valid(nrmse)) vwa <- c(vwa, nrmse) 154 | if (ignore.missclass) { 155 | if (length(meanclasserr) > 1) vwa <- c(vwa, meanclasserr) 156 | else if (is.valid(meanclasserr)) vwa <- c(vwa, meanclasserr) 157 | } 158 | else { 159 | if (length(classerror) > 1) vwa <- c(vwa, classerror) 160 | else if (is.valid(classerror)) vwa <- c(vwa, classerror) 161 | } 162 | if (length(rankerror) > 1) vwa <- c(vwa, rankerror) 163 | else if (is.valid(rankerror)) vwa <- c(vwa, rankerror) 164 | return(list(error = err[!is.na(err)], 165 | nrmse = nrmse, 166 | missclass = classerror, 167 | missrank = rankerror, 168 | all = vwa)) 169 | } 170 | else return(err[!is.na(err)]) 171 | } 172 | 173 | else if ("mlim.mi" %in% class(imputed) | 174 | "list" %in% class(imputed) | 175 | "mids" %in% class(imputed)) { 176 | mat <- NULL 177 | for (i in 1:length(imputed)) { 178 | tmp <- imputed[[i]] 179 | class(tmp) <- c("mlim", "data.frame") 180 | mat <- rbind(mat, mlim.error(tmp, incomplete, complete, 181 | varwise = varwise, 182 | transform = transform, 183 | ignore.rank=ignore.rank)) 184 | } 185 | return(mat) 186 | } 187 | else stop("'imputed' must be of class 'data.frame', 'list', 'mlim', 'mlim.mi', or 'mids'") 188 | } 189 | 190 | # data(charity) 191 | # charity$ta1 <- factor(charity$ta1, ordered = FALSE) 192 | # for (i in colnames(charity)) 193 | # dfNA <- mlim.na(charity, p = 0.1, stratify = TRUE, seed = 2022) 194 | # imp <- missRanger::missRanger(dfNA) 195 | # mlim.error(imp, dfNA, charity) 196 | #print((ELNETerror <- mlim.error(ELNET, dfNA, df))) 197 | 198 | # if we standardize numeric vars, RMSE and MAE can be more than 1 199 | 200 | 201 | 202 | 203 | 204 | 205 | -------------------------------------------------------------------------------- /man/mlim.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/mlim.R 3 | \name{mlim} 4 | \alias{mlim} 5 | \title{missing data imputation with automated machine learning} 6 | \usage{ 7 | mlim( 8 | data = NULL, 9 | m = 1, 10 | algos = c("ELNET"), 11 | postimpute = FALSE, 12 | stochastic = m > 1, 13 | ignore = NULL, 14 | tuning_time = 900, 15 | max_models = NULL, 16 | maxiter = 10L, 17 | cv = 10L, 18 | matching = "AUTO", 19 | autobalance = TRUE, 20 | balance = NULL, 21 | seed = NULL, 22 | verbosity = NULL, 23 | report = NULL, 24 | tolerance = 0.001, 25 | doublecheck = TRUE, 26 | preimpute = "RF", 27 | cpu = -1, 28 | ram = NULL, 29 | flush = FALSE, 30 | preimputed.data = NULL, 31 | save = NULL, 32 | load = NULL, 33 | shutdown = TRUE, 34 | java = NULL, 35 | ... 36 | ) 37 | } 38 | \arguments{ 39 | \item{data}{a \code{data.frame} (strictly) with missing data to be 40 | imputed. if \code{'load'} argument is provided, this argument will be ignored.} 41 | 42 | \item{m}{integer, specifying number of multiple imputations. 
the default value is
43 | 1, carrying out a single imputation.}
44 | 
45 | \item{algos}{character vector, specifying algorithms to be used for missing data
46 | imputation. supported algorithms are "ELNET", "RF", "GBM", "DL",
47 | "XGB", and "Ensemble". if more than one algorithm is specified,
48 | mlim changes behavior to save on runtime. for example,
49 | the default is "ELNET", which fine-tunes an Elastic Net model.
50 | In general, "ELNET" is expected to
51 | be the best algorithm because it fine-tunes very fast, is
52 | very robust to over-fitting, and hence generalizes very well.
53 | However, if your data has many factor variables, each with several
54 | levels, it is recommended to have c("ELNET", "RF") as your imputation
55 | algorithms (and possibly add "Ensemble" as well, to make the most out
56 | of tuning the models).
57 | 
58 | Note that "XGB" is only available in Mac OS and Linux. moreover,
59 | "GBM", "DL" and "XGB" take the full given "tuning_time" (see below) to
60 | tune the best model for imputing the given variable, whereas "ELNET"
61 | will produce only one fine-tuned model, often in less time than
62 | other algorithms need for developing a single model, which is why "ELNET"
63 | is the workhorse of the mlim imputation package.}
64 | 
65 | \item{postimpute}{(EXPERIMENTAL FEATURE) logical. if TRUE, mlim uses algorithms other than 'ELNET' for carrying out
66 | postimputation optimization. however, if FALSE, all specified algorithms will
67 | be used in the process of 'reimputation' together. the 'Ensemble' algorithm
68 | is encouraged when other algorithms are used. However, for general users
69 | unspecialized in machine learning, postimpute is NOT recommended because this
70 | feature is currently experimental, prone to over-fitting, and highly computationally extensive.}
71 | 
72 | \item{stochastic}{logical. by default it is set to TRUE for multiple imputation and FALSE for
73 | single imputation. the stochastic argument is currently under testing and is intended to
74 | avoid inflating the correlation between imputed values.}
75 | 
76 | \item{ignore}{character vector of column names or index of columns that
77 | should be ignored in the process of imputation.}
78 | 
79 | \item{tuning_time}{integer. maximum runtime (in seconds) for fine-tuning the
80 | imputation model for each variable in each iteration. the default
81 | time is 900 seconds, but for a large dataset, you
82 | might need to provide a larger model development
83 | time. this argument also influences \code{max_models},
84 | see below. If you are using the 'ELNET' algorithm (default),
85 | you can be generous with the 'tuning_time' argument because
86 | 'ELNET' tunes much faster than the rest and will only
87 | produce one model.}
88 | 
89 | \item{max_models}{integer. maximum number of models that can be generated in
90 | the process of fine-tuning the parameters. this value
91 | defaults to 100, meaning that for imputing each variable in
92 | each iteration, up to 100 models can be fine-tuned. increasing
93 | this value should be consistent with increasing
94 | \code{max_model_runtime_secs}, allowing the model to spend
95 | more time in the process of individualized fine-tuning.
96 | as a result, the better tuned the model, the more accurate
97 | the imputed values are expected to be.}
98 | 
99 | \item{maxiter}{integer. maximum number of iterations. the default value is \code{10},
100 | but it can be reduced to \code{3} (not recommended, see below).}
101 | 
102 | \item{cv}{integer. 
specify the number of folds for k-fold Cross-Validation (CV). values of
103 | 10 or higher are recommended. the default is 10.}
104 | 
105 | \item{matching}{logical or character. if \code{TRUE}, imputed values are coerced to the
106 | closest value among the non-missing values of the variable.
107 | if set to "AUTO", 'mlim' decides whether to match
108 | or not, based on the variable classes. the default is "AUTO".}
109 | 
110 | \item{autobalance}{logical. if TRUE (default), binary and multinomial factor variables
111 | will be balanced before the imputation to obtain fairer
112 | and less-biased imputations, which are typically in favor
113 | of the majority class.
114 | if FALSE, imputation fairness will be sacrificed for overall accuracy, which
115 | is not recommended, although it is commonly practiced in other missing data
116 | imputation software. MLIM is highly concerned with imputation fairness for
117 | factor variables, and autobalancing is generally recommended.
118 | in fact, higher overall accuracy does not mean a better imputation as
119 | long as minority classes are neglected, which increases the bias in favor of the
120 | majority class. if you do not wish to autobalance all the
121 | factor variables, you can manually specify the variables
122 | that should be balanced using the 'balance' argument (see below).}
123 | 
124 | \item{balance}{character vector, specifying variable names that should be
125 | balanced before imputation. balancing the prevalence might
126 | decrease the overall accuracy of the imputation, because it
127 | attempts to ensure the representation of the rare outcome.
128 | this argument is optional and intended for advanced users who
129 | impute a severely imbalanced categorical (nominal) variable.}
130 | 
131 | \item{seed}{integer. specifies the random generator seed.}
132 | 
133 | \item{verbosity}{character. controls how much information is printed to the console.
134 | the value can be "warn" (default), "info", "debug", or NULL.}
135 | 
136 | 
137 | \item{report}{filename. if a filename is specified (e.g. report = "mlim.md"), the \code{"md.log"} R
138 | package is used to generate a Markdown progress report for the
139 | imputation. the format of the report is adapted based on the
140 | \code{'verbosity'} argument. the higher the verbosity, the more
141 | technical the report becomes. if verbosity equals "debug", then
142 | a log file is generated, which includes a time stamp and shows
143 | the function that has generated the message. otherwise, a
144 | reduced markdown-like report is generated. the default is NULL.}
145 | 
146 | \item{tolerance}{numeric. the minimum rate of improvement in the estimated error metric
147 | of a variable to qualify the imputation for another round of iteration,
148 | if \code{maxiter} is not yet reached. any improvement of the imputation
149 | is desirable. however, specifying values above 0 can reduce the number
150 | of required iterations at a marginal increase of imputation error.
151 | for larger datasets, a value of "1e-3" is recommended to reduce the number
152 | of iterations. the default value is '1e-3'.}
153 | 
154 | \item{doublecheck}{logical. default is TRUE (which is conservative). if FALSE, a variable whose estimated
155 | imputation error does not improve
156 | will not be reimputed in the following iterations. in general,
157 | deactivating this argument will slightly reduce the imputation
158 | accuracy; however, it significantly reduces the computation time. 
159 | if your dataset is large, you are advised to set this argument to
160 | FALSE. (EXPERIMENTAL: consider that by avoiding several iterations
161 | that marginally improve the imputation accuracy, you might gain
162 | higher accuracy by investing your computational resources in fine-tuning
163 | better algorithms such as "GBM".)}
164 | 
165 | \item{preimpute}{character. specifies the 'primary' procedure for handling the missing
166 | data. before 'mlim' begins imputing the missing observations, they should
167 | be prepared for the imputation algorithms and thus, they should be replaced
168 | with some values.
169 | the default procedure is a quick "RF", which models the missing
170 | data with a parallel Random Forest model. this is a very fast procedure,
171 | which later on will be replaced within the "reimputation" procedure (see below).
172 | another possible alternative is \code{"mm"},
173 | which carries out mean/mode replacement, as practiced by most imputation algorithms.
174 | "mm" is much faster than "RF". if your dataset is very
175 | large, consider pre-imputing it beforehand using the 'mlim.preimpute()'
176 | function and passing the preimputed dataset to mlim (see the "preimputed.data" argument).}
177 | 
178 | \item{cpu}{integer. number of CPUs to be dedicated to the imputation.
179 | the default takes all of the available CPUs.}
180 | 
181 | \item{ram}{integer. specifies the maximum size, in Gigabytes, of the
182 | memory allocation. by default, all the available memory is
183 | used for the imputation.
184 | a large memory size is particularly advised, especially
185 | for multicore processes. the more you give, the more you get!}
186 | 
187 | \item{flush}{logical (experimental). if TRUE, after each model, the server is
188 | cleaned to retrieve RAM. this feature is in testing mode and is
189 | currently set to FALSE by default, but it is recommended if you
190 | have a limited amount of RAM or a large dataset.}
191 | 
192 | \item{preimputed.data}{data.frame. if you have used other software for missing
193 | data imputation, you can still optimize the imputation
194 | by handing the data.frame to this argument, which will
195 | bypass the "preimpute" procedure.}
196 | 
197 | \item{save}{filename (with .mlim extension). if a filename is specified, an \code{mlim} object is
198 | saved after the imputation of each variable. this object not only
199 | includes the imputed dataframe and the estimated cross-validation error, but also
200 | includes the information needed for continuing the imputation,
201 | which is a very useful feature for imputing large datasets with a
202 | long runtime. this argument is activated by default, and an
203 | mlim object named \code{"mlim.rds"} is stored in the local directory.}
204 | 
205 | \item{load}{filename (with .mlim extension). an object of class "mlim", which includes the data, arguments,
206 | and settings for re-running the imputation from where it was
207 | previously stopped. the "mlim" object saves the current state of
208 | the imputation and is particularly recommended for large datasets
209 | or when the user specifies computationally extensive settings
210 | (e.g. specifying several algorithms, increasing tuning time, etc.).}
211 | 
212 | \item{shutdown}{logical. if TRUE, the h2o server is closed after the imputation. 
213 | the default is TRUE} 214 | 215 | \item{java}{character, specifying path to the executable 64bit Java JDK on the 216 | Microsoft Windows machines, if JDK is installed but the path environment 217 | variable is not set.} 218 | 219 | \item{...}{arguments that are used internally between 'mlim' and 'mlim.postimpute'. 220 | these arguments are not documented in the help file and are not 221 | intended to be used by end user.} 222 | } 223 | \value{ 224 | a \code{data.frame}, showing the 225 | estimated imputation error from the cross validation within the data.frame's 226 | attribution 227 | } 228 | \description{ 229 | imputes data.frame with mixed variable types using automated 230 | machine learning (AutoML) 231 | } 232 | \examples{ 233 | 234 | \dontrun{ 235 | data(iris) 236 | 237 | # add stratified missing observations to the data. to make the example run 238 | # faster, I add NAs only to a single variable. 239 | dfNA <- iris 240 | dfNA$Species <- mlim.na(dfNA$Species, p = 0.1, stratify = TRUE, seed = 2022) 241 | 242 | # run the ELNET single imputation (fastest imputation via 'mlim') 243 | MLIM <- mlim(dfNA, shutdown = FALSE) 244 | 245 | # in single imputation, you can estimate the imputation accuracy via cross validation RMSE 246 | mlim.summarize(MLIM) 247 | 248 | ### or if you want to carry out ELNET multiple imputation with 5 datasets. 249 | ### next, to carry out analysis on the multiple imputation, use the 'mlim.mids' function 250 | ### minimum of 5 datasets 251 | MLIM2 <- mlim(dfNA, m = 5) 252 | mids <- mlim.mids(MLIM2, dfNA) 253 | fit <- with(data=mids, exp=glm(Species ~ Sepal.Length, family = "binomial")) 254 | res <- mice::pool(fit) 255 | summary(res) 256 | 257 | # you can check the accuracy of the imputation, if you have the original dataset 258 | mlim.error(MLIM2, dfNA, iris) 259 | } 260 | } 261 | \author{ 262 | E. F. Haghish 263 | } 264 | -------------------------------------------------------------------------------- /R/iteration_loop.R: -------------------------------------------------------------------------------- 1 | #' @title iteration_loop 2 | #' @description runs imputation iteration loop to fully impute a dataframe 3 | #' @importFrom utils setTxtProgressBar txtProgressBar capture.output packageVersion 4 | #' @importFrom h2o h2o.init as.h2o h2o.predict h2o.ls 5 | #' h2o.removeAll h2o.rm h2o.shutdown h2o.get_automl 6 | #' @importFrom md.log md.log 7 | #' @importFrom memuse Sys.meminfo 8 | #' @importFrom stats var setNames na.omit rnorm 9 | #' @return list 10 | #' @author E. F. 
Haghish 11 | #' @keywords Internal 12 | #' @noRd 13 | 14 | iteration_loop <- function(MI, dataNA, preimputed.data, data, bdata, boot, metrics, tolerance, doublecheck, 15 | m, k, X, Y, z, m.it, 16 | 17 | # loop data 18 | vars2impute, vars2postimpute, storeVars2impute, 19 | allPredictors, preimpute, impute, postimputealgos, 20 | 21 | # settings 22 | error_metric, FAMILY, cv, tuning_time, 23 | max_models, 24 | keep_cv, 25 | autobalance, balance, seed, save, flush, 26 | verbose, debug, report, sleep, 27 | 28 | # saving settings 29 | mem, orderedCols, ignore, maxiter, 30 | miniter, matching, ignore.rank, 31 | verbosity, error, cpu, max_ram, min_ram, shutdown, clean, 32 | stochastic) { 33 | 34 | FACTORPREDCTIONS <- NULL 35 | 36 | # ------------------------------------------------------------ 37 | # bootrtap 38 | # 39 | # Bootstrap from the original dataset, hold the original NA values, 40 | # and then use the preimputed dataset, and then gradually improve it 41 | # 42 | #### PROBLEM 43 | ############ 44 | #### drop the duplicates because they screw up the k-fold cross-validation. 45 | #### multiple identical observations might go to train and test datasets. 46 | #### here I suggest several 'work-in-progress' solutions 47 | # ============================================================ 48 | 49 | ####### =============================================== 50 | ####### BOOTSTRAP AND BALANCING DRAMA 51 | ####### =============================================== 52 | #??? THIS NEEDS FURTHER UPDATE IF 'autobalance' IS ACTIVATED 53 | # THE SOlUTION WOULD BE TO CALCULATE BALANCING WEIGHTS FOR 54 | # EACH OBSERVATION AND THEN MULTIPLY IT BY THE WEIGHTS_COLUMN. 55 | # OR CARRY OUT BALANCED STRATIFIED SAMPLING FOR CATEGORICAL 56 | # VARIABLES... 57 | 58 | if (boot) { 59 | rownames(data) <- 1:nrow(data) #remember the rows that are missing 60 | sampling_index <- sample(x = nrow(data), size = nrow(data), replace=TRUE) 61 | 62 | 63 | 64 | ## SOLUTION 1: DROP THE DUPLICATES AND DO UNDERSAMPLING 65 | ## ---------------------------------------------------- 66 | # bdata <- data[sampling_index, ] 67 | # bdataNA <- is.na(bdata[, vars2impute, drop = FALSE]) 68 | # bdata <- mlim.preimpute(data=bdata, preimpute=preimpute, seed = NULL) 69 | # sampling_index <- sampling_index[!duplicated(sampling_index)] 70 | # bdata <- data[sampling_index, ] 71 | # bdata[, "mlim_bootstrap_weights_column_"] <- 1 72 | # bdataNA <- is.na(bdata[, vars2impute, drop = FALSE]) 73 | 74 | ## SOLUTION 2: ADD THE DUPLICATES TO THE WEIGHT_COLUMN 75 | ## ---------------------------------------------------- 76 | dups <- bootstrapWeight(sampling_index) 77 | bdata <- data[1:nrow(data) %in% dups[,1], ] 78 | bdataNA <- is.na(bdata[, vars2impute, drop = FALSE]) 79 | message("\n") 80 | bdata <- mlim.preimpute(data=bdata, preimpute=preimpute, seed = NULL) 81 | bdata[, "mlim_bootstrap_weights_column_"] <- dups[,2] #OR ALTERNATIVELY #dups[,2] / sum(dups[,2]) 82 | 83 | ## SOLUTION 3: Assign CV folding manually instead of weight_column 84 | ## ---------------------------------------------------- 85 | # bdata <- data[sampling_index, ] 86 | # bdataNA <- is.na(bdata[, vars2impute, drop = FALSE]) 87 | # bdata <- mlim.preimpute(data=bdata, preimpute=preimpute, seed = NULL) 88 | # bdata[, "mlim_bootstrap_fold_assignment_"] <- 0 89 | # folds <- bootstrapCV(index = sampling_index, cv = cv) 90 | # for (i in 1:cv) { 91 | # indexcv <- sampling_index %in% folds[,i] 92 | # bdata[indexcv, "mlim_bootstrap_fold_assignment_"] <- i 93 | # } 94 | } 95 | 96 | # update the fresh data 97 
| # ------------------------------------------------------------ 98 | running <- TRUE 99 | runpostimpute <- FALSE 100 | 101 | if (debug) md.log("data was sent to h2o cloud", date=debug, time=debug, trace=FALSE) 102 | 103 | # define iteration var. this is a vector of varnames that should be imputed 104 | # if 'doublecheck' argument is FALSE, everytime a variable stops improving, 105 | # remove it from ITERATIONVARS. When you get to postimputation, reset the 106 | # ITERATIONVARS. 107 | ITERATIONVARS <- vars2impute 108 | 109 | # ------------------------------------------------------------ 110 | # Generate the HEX datasets if there is NO FLUSHING 111 | # ------------------------------------------------------------ 112 | if (!flush) { 113 | tryCatch(hex <- h2o::as.h2o(data), 114 | error = function(cond) { 115 | message("trying to upload data to JAVA server...\n"); 116 | message("ERROR: Data could not be uploaded to the Java Server\nJava server returned the following error:\n") 117 | return(stop(cond))}) 118 | 119 | bhex <- NULL 120 | if (!is.null(bdata)) { 121 | tryCatch(bhex<- h2o::as.h2o(bdata), 122 | error = function(cond) { 123 | message("trying to upload data to JAVA server...\n"); 124 | message("ERROR: Data could not be uploaded to the Java Server\nJava server returned the following error:\n") 125 | return(stop(cond))}) 126 | } 127 | } 128 | else { 129 | hex <- NULL 130 | bhex <- NULL 131 | } 132 | 133 | # ============================================================ 134 | # ============================================================ 135 | # global iteration loop 136 | # ============================================================ 137 | # ============================================================ 138 | while (running) { 139 | 140 | # always print the iteration 141 | message(paste0("\ndata ", m.it, ", iteration ", k, " (RAM = ", memuse::Sys.meminfo()$freeram,")", ":"), sep = "") #":\t" 142 | md.log(paste("Iteration", k), section="subsection") 143 | 144 | # ## AVOID THIS PRACTICE BECAUSE DOWNLOADING DATA FROM THE SERVER IS SLOW 145 | # # store the last data 146 | # if (debug) md.log("store last data", date=debug, time=debug, trace=FALSE) 147 | # dataLast <- as.data.frame(hex) 148 | # attr(dataLast, "metrics") <- metrics 149 | # attr(dataLast, "rmse") <- error 150 | 151 | # ......................................................... 152 | # IMPUTATION & POSTIMPUTATION LOOP 153 | # ......................................................... 
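  # Each pass below walks over the variables that still need (re)imputation:
  # for one variable Y at a time, 'iterate' fine-tunes a model on the other
  # columns and replaces Y's missing cells with the model's predictions.
  # Roughly (a simplified sketch, ignoring the bootstrap and flushing details):
  #
  #   for (Y in ITERATIONVARS) {
  #     it <- iterate(procedure, ..., Y = Y, ...)  # refit and reimpute Y
  #     if (!is.null(it)) { data <- it$data; metrics <- it$metrics }
  #   }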
154 | if (runpostimpute) { 155 | procedure <- "postimpute" 156 | if (debug) md.log("Running POSTIMPUTATION", date = TRUE, time = TRUE, print = FALSE, trace = FALSE) 157 | } 158 | else procedure <- "impute" 159 | 160 | for (Y in ITERATIONVARS[z:length(ITERATIONVARS)]) { 161 | start <- as.integer(Sys.time()) 162 | 163 | # Prepare the progress bar and iteration console text 164 | # ============================================================ 165 | if (verbose==0) pb <- txtProgressBar((which(ITERATIONVARS == Y))-1, length(vars2impute), style = 3) 166 | if (verbose!=0) message(paste0(" ",Y)) 167 | 168 | it <- NULL 169 | tryCatch(capture.output( 170 | it <- iterate( 171 | procedure = procedure, 172 | MI, dataNA, bdataNA, 173 | preimputed.data, data, bdata, boot, hex, bhex, metrics, tolerance, doublecheck, 174 | m, k, X, Y, z=which(ITERATIONVARS == Y), m.it, 175 | # loop data 176 | ITERATIONVARS, vars2impute, 177 | allPredictors, preimpute, impute, postimputealgos, 178 | # settings 179 | error_metric, FAMILY=FAMILY, cv, tuning_time, 180 | max_models, 181 | keep_cv, 182 | autobalance, balance, seed, save, flush, 183 | verbose, debug, report, sleep, 184 | # saving settings 185 | mem, orderedCols, ignore, maxiter, 186 | miniter, matching, ignore.rank, 187 | verbosity, error, cpu, max_ram, min_ram, stochastic) 188 | , file = report, append = TRUE) 189 | , error = function(cond) { 190 | message(paste0("\nReimputing '", Y, "' with the current specified algorithms failed and this variable will be skipped! \nSee Java server's error below:")); 191 | md.log(paste("Reimputing", Y, "failed and the variable will be skipped!"), 192 | date = TRUE, time = TRUE, print = TRUE) 193 | message(cond) 194 | 195 | ### ??? activate the code below if you allow "iterate" preimputation 196 | ### ??? or should it be ignored... 197 | # if (preimpute == "iterate" && k == 1L && (Y %in% allPredictors)) { 198 | # X <- union(X, Y) 199 | # if (debug) md.log("x was updated", date=debug, time=debug, trace=FALSE) 200 | # } 201 | return(NULL) 202 | }) 203 | 204 | 205 | 206 | # If there was no error, update the variables 207 | # else make sure the model is cleared 208 | # -------------------------------------------------------------- 209 | #IT <<- it 210 | if (!is.null(it)) { 211 | X <- it$X 212 | ITERATIONVARS <- it$iterationvars 213 | metrics <- it$metrics 214 | data <- it$data 215 | bdata <- it$bdata 216 | hex <- it$hex 217 | bhex <- it$bhex 218 | 219 | # if 'factorPred' is not NULL, update the list: 220 | if (!is.null(it$factorPred)) { 221 | #remove the 'predict' column, which is the first column in predict dataframe 222 | #Ok <<- it$factorPred[,2:ncol(it$factorPred)] 223 | if (length(FACTORPREDCTIONS) > 0) FACTORPREDCTIONS <- list(FACTORPREDCTIONS, Y = it$factorPred[,2:ncol(it$factorPred)]) 224 | else FACTORPREDCTIONS <- list(Y = it$factorPred[,2:ncol(it$factorPred)]) 225 | 226 | # update the name of the new item 227 | names(FACTORPREDCTIONS)[length(FACTORPREDCTIONS)] <- Y 228 | } 229 | } 230 | 231 | else tryCatch(h2o::h2o.rm(h2o::h2o.get_automl("mlim")), 232 | error = function(cond) { 233 | return(NULL)}) 234 | 235 | # log & statusbar 236 | # -------------------------------------------------------------- 237 | time = as.integer(Sys.time()) - start 238 | if (debug) md.log(paste("done! 
after: ", time, " seconds"), 239 | date = TRUE, time = TRUE, print = FALSE, trace = FALSE) 240 | 241 | # update the statusbar 242 | if (verbose==0) setTxtProgressBar(pb, (which(ITERATIONVARS == Y))) 243 | 244 | 245 | } 246 | 247 | # CHECK CRITERIA FOR RUNNING THE NEXT ITERATION 248 | # -------------------------------------------------------------- 249 | if (debug) md.log("evaluating stopping criteria", date=debug, time=debug, trace=FALSE) 250 | SC <- stoppingCriteria(method="varwise_NA", miniter, maxiter, 251 | metrics, k, vars2impute, 252 | error_metric, 253 | tolerance, 254 | postimputealgos, 255 | runpostimpute, 256 | md.log = report) 257 | if (debug) { 258 | md.log(paste("running: ", SC$running), date=debug, time=debug, trace=FALSE) 259 | md.log(paste("\nEstimated", error_metric, "error:", SC$error), section="paragraph", date=debug, time=debug, trace=FALSE) 260 | } 261 | 262 | running <- SC$running 263 | error <- SC$error 264 | runpostimpute <- SC$runpostimpute 265 | ITERATIONVARS <- SC$vars2impute #only sets it to NULL 266 | 267 | # indication of postimpute 268 | if (length(ITERATIONVARS) == 0) { 269 | ITERATIONVARS <- vars2impute 270 | procedure <- "postimpute" 271 | } 272 | 273 | # update the loop number 274 | k <- k + 1L 275 | } 276 | 277 | # ............................................................ 278 | # END OF THE ITERATIONS 279 | # ............................................................ 280 | if (verbose) message("\n") 281 | md.log("", section="paragraph", trace=FALSE) 282 | 283 | # # if the iterations stops on minimum or maximum, return the last data 284 | # if (k == miniter || (k == maxiter && running) || maxiter == 1) { 285 | ###### ALWAYS RETURN THE LAST DATA. THIS WAS A BUG, REMAINING AFTER I INDIVIDUALIZED IMPUTATION EVALUATION 286 | 287 | ### Workaround for buggy 'as.data.frame' function 288 | ### ============================================= 289 | 290 | # INSTEAD OF DEFINING A NEW VARIABLE 'dataLast', just use the 'data' returned 291 | # FROM iteration and most importantly, AVOID THE BLOODY 'as.data.frame' function 292 | # which IS SO BUGGY 293 | attr(data, "metrics") <- metrics 294 | attr(data, error_metric) <- error 295 | # dataLast <- as.data.frame(hex) 296 | # Sys.sleep(sleep) 297 | # attr(dataLast, "metrics") <- metrics 298 | # attr(dataLast, error_metric) <- error 299 | # } 300 | # else { 301 | # md.log("return previous iteration's data", date=debug, time=debug, trace=FALSE) 302 | # } 303 | 304 | 305 | if (clean) { 306 | tryCatch(h2o::h2o.removeAll(), 307 | error = function(cond) { 308 | message("trying to connect to JAVA server...\n"); 309 | return(stop("Java server has crashed (low RAM?)"))}) 310 | md.log("server was cleaned", section="paragraph", trace=FALSE) 311 | } 312 | 313 | if (shutdown) { 314 | md.log("shutting down the server", section="paragraph", trace=FALSE) 315 | tryCatch(h2o::h2o.shutdown(prompt = FALSE), 316 | error = function(cond) { 317 | message("trying to connect to JAVA server...\n"); 318 | return(warning("Java server has crashed (low RAM?)"))}) 319 | Sys.sleep(sleep) 320 | } 321 | 322 | # ------------------------------------------------------------ 323 | # Adding stochastic variation 324 | # 325 | # Note: for continuous variables, RMSE is used as indication of 326 | # standard deviation. However, for binomial and multinomial 327 | # variables, the estimated probability of each level for 328 | # each missing observation is needed and thus, the predictions 329 | # of these variables should be stored and used in this section. 
330 | # ============================================================ 331 | if (stochastic) { 332 | md.log("STOCHASTIC TIMES", section="paragraph", trace=FALSE) 333 | 334 | # evaluate each variable and add stochastic variation based on variable types 335 | # --------------------------------------------------------------------------- 336 | for (Y in vars2impute) { 337 | v.na <- dataNA[, Y] 338 | VEK <- data[which(v.na), Y] 339 | 340 | if (FAMILY[which(ITERATIONVARS == Y)] == 'gaussian' || 341 | FAMILY[which(ITERATIONVARS == Y)] == 'gaussian_integer' 342 | || FAMILY[which(ITERATIONVARS == Y)] == 'quasibinomial' ) { 343 | 344 | RMSE <- min(metrics[metrics$variable == Y, "RMSE"], na.rm = TRUE) 345 | data[which(v.na), Y] <- rnorm( 346 | n = length(VEK), 347 | mean = VEK, 348 | sd = RMSE) 349 | } 350 | 351 | else if (FAMILY[which(ITERATIONVARS == Y)] == 'binomial' || 352 | FAMILY[which(ITERATIONVARS == Y)] == 'multinomial') { 353 | 354 | #NOK <<- FACTORPREDCTIONS 355 | #> #column names represent the estimated levels' probabilities 356 | stochFactors <- stochasticFactorImpute(levels = colnames(FACTORPREDCTIONS[[Y]]), 357 | probMat = as.matrix(FACTORPREDCTIONS[[Y]])) 358 | 359 | # replace the missing observations with the stochastic data 360 | data[which(v.na), Y] <- stochFactors 361 | } 362 | 363 | } 364 | } 365 | 366 | # ------------------------------------------------------------ 367 | # Auto-Matching specifications 368 | # ============================================================ 369 | if (matching == "AUTO") { 370 | mtc <- 0 371 | for (Y in vars2impute) { 372 | mtc <- mtc + 1 373 | v.na <- dataNA[, Y] 374 | 375 | if ((FAMILY[mtc] == 'gaussian_integer') | (FAMILY[mtc] == 'quasibinomial')) { 376 | if (debug) md.log(paste("matching", Y), section="paragraph") 377 | 378 | matchedVal <- matching(imputed=data[v.na, Y], 379 | nonMiss=unique(data[!v.na,Y]), 380 | md.log) 381 | #message(matchedVal) 382 | if (!is.null(matchedVal)) data[v.na, Y] <- matchedVal 383 | else { 384 | md.log("matching failed", section="paragraph", trace=FALSE) 385 | } 386 | } 387 | } 388 | } 389 | 390 | # ------------------------------------------------------------ 391 | # Revert ordinal transformation 392 | # ============================================================ 393 | if (!ignore.rank) { 394 | data[, orderedCols] <- revert(data[, orderedCols, drop = FALSE], mem) 395 | } 396 | 397 | class(data) <- c("mlim", "data.frame") 398 | 399 | return(dataLast=data) 400 | } 401 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | **`mlim`** : Single and Multiple Imputation with Automated Machine Learning 5 | ================================================================ 6 | 7 | 8 | [![GitHub dev](https://img.shields.io/github/v/release/haghish/mlim.svg?color=2eb885)](https://github.com/haghish/mlim/releases/?include_prereleases&sort=semver "View GitHub releases") 9 | [![CRAN version](https://www.r-pkg.org/badges/version/mlim?color=2eb8b3)](https://cran.r-project.org/package=mlim) [![](https://cranlogs.r-pkg.org/badges/grand-total/mlim?color=a958d1)](https://cran.r-project.org/package=mlim) [![](man/figures/manual.svg)](https://CRAN.R-project.org/package=mlim) 10 | 16 | 17 | 18 | **`mlim`** is the first missing data imputation software to implement automated machine learning for performing _multiple imputation_ or _single imputation_ of missing data. 
The software, which is currently implemented as an R package, brings the state of the art in machine learning to provide a versatile missing data solution for various data types (continuous, binary, multinomial, and ordinal). In a nutshell, **`mlim`** is expected to outperform any other available missing data imputation software on many grounds. For example, **`mlim`** is expected to deliver:
19 | 
20 | 1. **Lower imputation error** compared to other missing data imputation software.
21 | 2. **Higher imputation fairness**, when the data suffer from severe class imbalance, non-normal distributions, or interactions between the variables (features).
22 | 3. **Faster imputation of big datasets**, because **`mlim`** excels in making efficient use of the available CPU cores, and its runtime scales fairly well as datasets grow large.
23 | 
24 | The high performance of **`mlim`** is achieved mainly by **fine-tuning** an **`ELNET`** algorithm, which often outperforms any standard statistical procedure or untuned machine learning algorithm and generalizes very well. However, **`mlim`** is an active research project and hence comes with an **experimental optimization toolkit** for exploring the possibility of performing multiple imputation with industry-standard machine learning algorithms such as _Deep Learning_, _Gradient Boosting Machine_, _Extreme Gradient Boosting_, and _Stacked Ensembles_. These algorithms can be used for either imputing missing data or optimizing already imputed data, but are **NOT used by default NOR recommended to all users**. Advanced users who are interested in exploring the possibilities of imputing missing data with these algorithms are recommended to read the free handbook (see below). These algorithms, as noted, are experimental, and the author intends to examine their effectiveness for academic research (at this point). If you are interested in collaborating, [get in touch with the author](https://www.sv.uio.no/psi/english/people/aca/haghish/).
25 | 
26 | 
29 | 
30 | Fine-tuning missing data imputation
31 | -----------------------------------
32 | 
33 | Simply put, for each variable in the dataset, **`mlim`** automatically fine-tunes a fast machine learning model, which results in significantly lower imputation error compared to classical statistical models or even untuned machine learning imputation software that uses Random Forest or unsupervised learning algorithms. Moreover, **`mlim`** is intended to give social scientists a powerful solution to their missing data problem: a tool that automatically adapts to different variable types, which may appear at different missingness rates, have unknown distributions, and be highly correlated or interact with one another. **But it is not just about higher accuracy! `mlim` also delivers fairer imputation, particularly for categorical and ordinal variables, because it automatically balances the levels of the variable, minimizing the bias resulting from class imbalance, which can often be seen in social science data and has been commonly ignored by missing data imputation software.** A sketch of how this balancing can be controlled is shown below.
34 | 
35 | 
36 | 
37 | 
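For instance, if you would rather keep the default behavior for most variables but still protect a single severely imbalanced factor, `autobalance` can be turned off and the `balance` argument used instead (both are documented in `?mlim`). A minimal sketch, not taken from the package documentation: `dfNA` stands for any data.frame with missing values, and `"diagnosis"` is a hypothetical factor name:

```R
library(mlim)

# balance only the hypothetical 'diagnosis' factor instead of autobalancing
# every factor variable in the dataset
MLIM <- mlim(dfNA, autobalance = FALSE, balance = c("diagnosis"))
```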
38 | **`mlim`** outperforms other R packages for all variable types: continuous, binary (factor), multinomial (factor), and ordinal (ordered factor). The reason for this improved performance is that **`mlim`**:
39 | 
40 | - Automatically fine-tunes the parameters of the machine learning models
41 | - Delivers a very high prediction accuracy
42 | - Does not make any assumption about the distribution of the data
43 | - Takes the interactions between the variables into account
44 | - Can, to some extent, take the hierarchical structure of the data into account
45 |   + Imputes missing data in nested observations with higher accuracy compared to the HLM imputation methods
46 | - Does not force a particular linear model
47 | - Uses a blend of different machine learning models
48 | 
49 | 
61 | 
62 | Procedure: From preimputation to imputation and postimputation
63 | ---------------------------------------------------------
64 | 
65 | When a dataframe with NAs is given to **`mlim`**, the NAs are replaced with plausible values (e.g. mean and mode) to prepare the dataset for the imputation, as shown in the flowchart below:
66 | 
67 | 
68 | 
69 | **`mlim`** follows three steps to optimize the missing data imputation. This procedure is _optional_, depending on the amount of computing resources available to you. In general, **`ELNET`** imputation already outperforms the other single and multiple imputation methods available in **R**. However, the imputation error can be further improved by training stronger algorithms such as **`GBM`**, **`XGB`**, **`DL`**, or even **`Ensemble`**, which stacks several models on top of one another. For the majority of users, **`GBM`** or **`XGB`** (XGB is available only in Mac OSX and Linux) will significantly improve the **`ELNET`** imputation, if enough time is provided to generate and fine-tune a large number of models.
70 | 
71 | 
72 | 
73 | You do not necessarily need the post-imputation. Once you have _reimputed_ the data with `ELNET`, you can stop there. `ELNET` is a relatively fast algorithm that is easy to fine-tune compared to `GBM`, `XGB`, `DL`, or `Ensemble`. In addition, `ELNET` generalizes nicely and is less prone to overfitting. The flowchart below draws the procedure of the **mlim** algorithm. When using **mlim**, you can use `ELNET` to impute a dataset with NAs or to optimize the imputed values of a dataset that is already imputed. If you wish to go the extra mile, you can use heavier algorithms as well to activate the postimputation procedure, but it is strictly optional, and by default **mlim** does not use postimputation.
74 | 
75 | 
76 | 
77 | 
78 | 
79 | Fast imputation with **`ELNET`** (without postimputation)
80 | ---------------------------------------------------------
81 | 
82 | Below are some comparisons between different R packages carrying out multiple imputation (bars with error) and single imputation. In these analyses, I only used the **`ELNET`** algorithm, which fine-tunes much faster than the other algorithms (**`GBM`**, **`XGBoost`**, and **`DL`**). As is evident, **`ELNET`** already outperforms all other single and multiple imputation procedures available in the **R** language. However, the performance of **`mlim`** can still be improved by adding another algorithm, which activates the _postimputation_ procedure.
83 | 
84 | 
85 | 
86 | 
87 | 
88 | 
89 | Installation
90 | ------------
91 | 
92 | **`mlim`** is under fast development. The package receives monthly updates on CRAN. Therefore, it is recommended that you install the GitHub version until version 0.1 is released. 
To install the latest development version from GitHub:
93 | 
94 | ``` r
95 | library(devtools)
96 | install_github("haghish/mlim")
97 | ```
98 | 
99 | Or alternatively, install the latest stable version from CRAN:
100 | ``` r
101 | install.packages("mlim")
102 | ```
103 | 
104 | Supported algorithms
105 | --------------------
106 | 
107 | **`mlim`** supports several algorithms:
108 | 
109 | - `ELNET` (Elastic Net)
110 | - `RF` (Random Forest and Extremely Randomized Trees)
111 | - `GBM` (Gradient Boosting Machine)
112 | - `XGB` (Extreme Gradient Boosting, available in Mac OS and Linux)
113 | - `DL` (Deep Learning)
114 | - `Ensemble` (Stacked Ensemble)
115 | 
116 | > `ELNET` is the default imputation algorithm. Among all of the above, ELNET is the simplest model, the fastest to fine-tune, and the one that requires the least amount of RAM and CPU; yet it is the most stable one, which also makes it one of the most generalizable algorithms. By default, **`mlim`** uses only `ELNET`; however, you can add another algorithm to activate the post-imputation procedure.
117 | 
118 | 
120 | 
126 | 
128 | 
129 | ### `GBM` vs `ELNET`
130 | 
131 | But which one should you choose, assuming computational resources are not in question? Well, __`GBM`__ is very likely to outperform __`ELNET`__ if you specify a large enough `max_models` argument to tune the algorithm well for imputing each feature. That basically means generating more than 100 models, at least. In return, you will enjoy a slight -- yet probably statistically significant -- improvement in the imputation accuracy. The option is there for those who can use it, and to my knowledge, fine-tuning __`GBM`__ with a large enough number of models yields a more accurate imputation than any other procedure I know. But __`ELNET`__ comes second, and given its speed advantage, it is indeed charming!
132 | 
133 | Both of these algorithms offer one advantage over all the other machine learning missing data imputation methods such as kNN, K-Means, PCA, Random Forest, etc.: simply put, you do not need to specify any parameter yourself. Everything is automatic, and **`mlim`** searches for the optimal parameters for imputing each variable within each iteration. For all the aforementioned packages, some parameters need to be specified, and they influence the imputation accuracy: the number of _k_ for kNN, the number of components for PCA, the number of trees (and other parameters) for Random Forest, etc. This is why `ELNET` outperforms the other packages: you get software that optimizes its models on its own.
134 | 
135 | Advantages and limitations
136 | --------------------------
137 | 
138 | **`mlim`** fine-tunes models for imputation, a procedure that has never been implemented in other R packages. This procedure often yields much higher accuracy compared to other machine learning imputation methods or missing data imputation procedures, because it uses more accurate models that are fine-tuned for each feature in the dataset. The cost, however, is computational resources. If you have access to a very powerful machine with a huge amount of RAM per CPU, then try __`GBM`__. If you specify a high enough number of models in each fine-tuning process, you are likely to get a more accurate imputation than __`ELNET`__. However, for personal machines and laptops, __`ELNET`__ is generally recommended (see below). __If your machine is not powerful enough, it is likely that the imputation crashes due to memory problems...__. So, perhaps begin with __`ELNET`__, unless you are working with a powerful server. This is my general advice as long as **`mlim`** is in Beta version and under development. A sketch of such a heavier server setup follows.
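For instance, a heavier setup on a powerful server might look like the following. This is only an illustrative sketch: the argument values are arbitrary examples rather than recommendations (all arguments are documented in `?mlim`), and `dfNA` stands for any data.frame with missing values:

```R
library(mlim)

# let ELNET do the legwork, then let GBM polish the imputed values in the
# (experimental) postimputation step; the values below are illustrative only
MLIM <- mlim(dfNA, algos = c("ELNET", "GBM"), postimpute = TRUE,
             tuning_time = 1800, max_models = 200, ram = 32)
```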
139 | 
140 | 
141 | Citation
142 | --------
143 | 
144 | - Haghish, E. F. (2022). mlim: Multiple Imputation with Automated Machine Learning [Computer software]. https://CRAN.R-project.org/package=mlim.
145 | 
146 | 
147 | 
160 | 
161 | Example
162 | -------
163 | 
164 | `iris` is a small dataset with only 150 rows. Let's add 50% artificial missing data and compare several state-of-the-art machine learning missing data imputation procedures. __`ELNET`__ comes out as the winner for a very simple reason: it was fine-tuned and all the rest were not. The larger the dataset and the higher the number of features, the more vivid the difference between __`ELNET`__ and the others becomes.
165 | 
166 | ### Single imputation
167 | 
168 | In a single imputation, the NAs are replaced with the most plausible values according to the model. You do not get the diversity of multiple imputation, but you still get an estimated imputation error based on a 10-fold (or higher, if specified) cross-validation procedure for each variable (column) in the dataset. As shown below, `mlim` provides the **`mlim.error()`** function to summarize the imputation error for the entire dataset or for each variable.
169 | 
170 | ```R
171 | # Comparison of different R packages imputing iris dataset
172 | # ===============================================================================
173 | rm(list = ls())
174 | library(mlim)
175 | library(mice)
176 | library(missForest)
177 | library(VIM)
178 | 
179 | # Add artificial missing data
180 | # ===============================================================================
181 | irisNA <- mlim.na(iris, p = 0.5, stratify = TRUE, seed = 2022)
182 | 
183 | # Single imputation with mlim, giving it 180 seconds to fine-tune each imputation
184 | # ===============================================================================
185 | MLIM <- mlim(irisNA, m=1, seed = 2022, tuning_time = 180)
186 | print(MLIMerror <- mlim.error(MLIM, irisNA, iris))
187 | 
188 | # kNN Imputation with VIM
189 | # ===============================================================================
190 | kNN <- kNN(irisNA, imp_var=FALSE)
191 | print(kNNerror <- mlim.error(kNN, irisNA, iris))
192 | 
193 | # Single imputation with MICE (for the sake of demonstration)
194 | # ===============================================================================
195 | MC <- mice(irisNA, m=1, maxit = 50, method = 'pmm', seed = 500)
196 | print(MCerror <- mlim.error(MC, irisNA, iris))
197 | 
198 | # Random Forest Imputation with missForest
199 | # ===============================================================================
200 | set.seed(2022)
201 | RF <- missForest(irisNA)
202 | print(RFerror <- mlim.error(RF$ximp, irisNA, iris))
203 | ```
204 | 
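Beyond the overall summary, `mlim.error()` can also break the error down by variable through its `varwise` argument (see `?mlim.error`). A minimal sketch, reusing the `MLIM` object created above:

```R
# per-variable error estimates instead of one summary per variable type
mlim.error(MLIM, irisNA, iris, varwise = TRUE)
```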
205 | ### Multiple imputation
206 | 
207 | `mlim` supports multiple imputation. All you need to do is to specify an integer higher than 1 for the value of `m`. For example, set `m = 5` in the `mlim` function to impute 5 datasets. `mlim` then returns a list including the 5 datasets. You can convert this list to a `mids` object using the **`mlim.mids()`** function and then follow up the analysis with the `mids` object the same way it is carried out by the [`mice`](https://CRAN.R-project.org/package=mice) R package. Here is an example:
208 | 
209 | ```R
210 | # Multiple imputation of the iris dataset with mlim
211 | # ===============================================================================
212 | rm(list = ls())
213 | library(mlim)
214 | library(mice)
215 | 
216 | # Add artificial missing data
217 | # ===============================================================================
218 | irisNA <- mlim.na(iris, p = 0.5, stratify = TRUE, seed = 2022)
219 | 
220 | # multiple imputation with mlim, giving it 180 seconds to fine-tune each imputation
221 | # ===============================================================================
222 | MLIM2 <- mlim(irisNA, m = 5, seed = 2022, tuning_time = 180)
223 | print(MLIMerror2 <- mlim.error(MLIM2, irisNA, iris))
224 | mids <- mlim.mids(MLIM2, irisNA)
225 | fit <- with(data=mids, exp=glm(Species ~ Sepal.Length, family = "binomial"))
226 | res <- mice::pool(fit)
227 | summary(res)
228 | ```
229 | 
230 | 
231 | 
--------------------------------------------------------------------------------
/R/mlim.R:
--------------------------------------------------------------------------------
1 | #' @title missing data imputation with automated machine learning
2 | #' @description imputes data.frame with mixed variable types using automated
3 | #'              machine learning (AutoML)
4 | # @param impute character. specify a vector of algorithms to be used
5 | #        in the process of auto-tuning. the supported main algorithms are
6 | #        "ELNET", "RF",
7 | #        "GBM", "DL", "XGB" (available for Mac and Linux), and "Ensemble".
8 | #
9 | #        the default is "AUTO", which is mostly based on "ELNET", but also
10 | #        uses Extremely Randomized Forests, in addition to Random Forest, before
11 | #        concluding the imputation, when "ELNET" stops improving. This procedure is
12 | #        relatively fast and yields charming results, often equal to specifying "impute = c('ELNET', 'RF')",
13 | #        which at each step of the imputation uses both "ELNET" and "RF", doubling the imputation
14 | #        time, and thus, it is advised.
15 | #
16 | #        "GBM", "DL", "XGB", and "Ensemble" take the full given "tuning_time" (see below) to
17 | #        tune the best model for imputing the given variable. it is advised to use these extensive
18 | #        algorithms in the process of "postimputation" and let "ELNET" do most of the legwork to save
19 | #        computational resources.
20 | #' @importFrom utils setTxtProgressBar txtProgressBar capture.output packageVersion
21 | #' @importFrom tools file_ext
22 | #' @importFrom h2o h2o.init as.h2o h2o.automl h2o.predict h2o.ls
23 | #'             h2o.removeAll h2o.rm h2o.shutdown h2o.no_progress
24 | #' @importFrom md.log md.log
25 | #' @importFrom memuse Sys.meminfo
26 | #' @importFrom stats var setNames na.omit
27 | #' @importFrom curl curl
28 | #' @param data a \code{data.frame} (strictly) with missing data to be
29 | #'             imputed. if the \code{'load'} argument is provided, this argument will be ignored.
30 | #' @param m integer, specifying the number of multiple imputations. the default value is
31 | #'          1, carrying out a single imputation.
32 | #' @param algos character vector, specifying algorithms to be used for missing data
33 | #'              imputation. supported algorithms are "ELNET", "RF", "GBM", "DL",
34 | #'              "XGB", and "Ensemble". if more than one algorithm is specified,
35 | #'              mlim changes behavior to save on runtime. for example,
36 | #'              the default is "ELNET", which fine-tunes an Elastic Net model. 
37 | #' In general, "ELNET" is expected to 38 | #' be the best algorithm because it fine-tunes very fast, it is 39 | #' very robust to over-fitting, and hence, it generalizes very well. 40 | #' However, if your data has many factor variables, each with several 41 | #' levels, it is recommended to have c("ELNET", "RF") as your imputation 42 | #' algorithms (and possibly add "Ensemble" as well, to make the most out 43 | #' of tuning the models). 44 | #' 45 | #' Note that "XGB" is only available on Mac OS and Linux. moreover, 46 | #' "GBM", "DL" and "XGB" take the full given "tuning_time" (see below) to 47 | #' tune the best model for imputing the given variable, whereas "ELNET" 48 | #' will produce only one fine-tuned model, often in less time than 49 | #' other algorithms need for developing a single model, which is why "ELNET" 50 | #' is the workhorse of the mlim imputation package. 51 | #' @param preimpute character. specifies the 'primary' procedure for handling the missing 52 | #' data. before 'mlim' begins imputing the missing observations, they should 53 | #' be prepared for the imputation algorithms and thus, they should be replaced 54 | #' with some values. 55 | #' the default procedure is a quick "RF", which models the missing 56 | #' data with a parallel Random Forest model. this is a very fast procedure, 57 | #' which later on will be replaced within the "reimputation" procedure (see below). 58 | #' another possible alternative is \code{"mm"}, 59 | #' which carries out mean/mode replacement, as practiced by most imputation algorithms. 60 | #' "mm" is much faster than "RF". if your dataset is very 61 | #' large, consider pre-imputing it beforehand using the 'mlim.preimpute()' 62 | #' function and passing the preimputed dataset to mlim (see the "preimputed.data" argument). 63 | # 64 | # another alternative is "iterate", which, instead of filling the missing observations with mean and mode, 65 | # gradually adds the imputed variables to the vector of predictors as it carries out the 66 | # first iteration. 67 | #' @param postimpute (EXPERIMENTAL FEATURE) logical. if TRUE, mlim uses algorithms other than 'ELNET' for carrying out 68 | #' postimputation optimization. however, if FALSE, all specified algorithms will 69 | #' be used in the process of 'reimputation' together. the 'Ensemble' algorithm 70 | #' is encouraged when other algorithms are used. However, for general users 71 | #' unspecialized in machine learning, postimpute is NOT recommended because this 72 | #' feature is currently experimental, prone to over-fitting, and computationally very intensive. 73 | #' @param stochastic logical. by default it is set to TRUE for multiple imputation and FALSE for 74 | #' single imputation. the stochastic argument is currently under testing and is intended to 75 | #' avoid inflating the correlation between imputed variables. 76 | #' @param ignore character vector of column names or indices of columns that 77 | #' should be ignored in the process of imputation. 78 | #' @param tuning_time integer. maximum runtime (in seconds) for fine-tuning the 79 | #' imputation model for each variable in each iteration. the default 80 | #' time is 900 seconds but for a large dataset, you 81 | #' might need to provide a longer model development 82 | #' time. this argument also influences \code{max_models}, 83 | #' see below.
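#' for example, \code{tuning_time = 3600} (an illustrative value) would allow
#' up to 60 minutes of model tuning for each variable in each iteration.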
If you are using the 'ELNET' algorithm (default), 84 | #' you can be generous with the 'tuning_time' argument because 85 | #' 'ELNET' tunes much faster than the rest and will only 86 | #' produce one model. 87 | #' @param max_models integer. maximum number of models that can be generated in 88 | #' the process of fine-tuning the parameters. this value 89 | #' defaults to 100, meaning that for imputing each variable in 90 | #' each iteration, up to 100 models can be fine-tuned. increasing 91 | #' this value should be consistent with increasing 92 | #' \code{max_model_runtime_secs}, allowing the model to spend 93 | #' more time in the process of individualized fine-tuning. 94 | #' as a result, the better tuned the model, the more accurate 95 | #' the imputed values are expected to be. 96 | #' @param autobalance logical. if TRUE (default), binary and multinomial factor variables 97 | #' will be balanced before the imputation to obtain fairer 98 | #' and less-biased imputations, which are typically in favor 99 | #' of the majority class. 100 | #' if FALSE, imputation fairness will be sacrificed for overall accuracy, which 101 | #' is not recommended, although it is commonly practiced in other missing data 102 | #' imputation software. MLIM is highly concerned with imputation fairness for 103 | #' factor variables and autobalancing is generally recommended. 104 | #' in fact, higher overall accuracy does not mean a better imputation as 105 | #' long as minority classes are neglected, which increases the bias in favor of the 106 | #' majority class. if you do not wish to autobalance all the 107 | #' factor variables, you can manually specify the variables 108 | #' that should be balanced using the 'balance' argument (see below). 109 | #' 110 | # NOTE: when a variable is balanced prior to the imputation, a different 111 | # bootstrap sampling procedure will be used. in doing so, instead of 112 | # carrying out bootstrap subsamples with replacement and adding the 113 | # duplicated observations as weights in the imputation, an undersampling 114 | # bootstrap procedure without replacement is performed because the weights 115 | # of the artificially balanced data would conflict with the weights of the 116 | # bootstrap data. 117 | #' @param balance character vector, specifying variable names that should be 118 | #' balanced before imputation. balancing the prevalence might 119 | #' decrease the overall accuracy of the imputation, because it 120 | #' attempts to ensure the representation of the rare outcome. 121 | #' this argument is optional and intended for advanced users who 122 | #' impute a severely imbalanced categorical (nominal) variable. 123 | #' @param matching logical (or "AUTO"). if \code{TRUE}, imputed values are coerced to the 124 | #' closest value to the non-missing values of the variable. 125 | #' if set to "AUTO", 'mlim' decides whether to match 126 | #' or not, based on the variable classes. the default is "AUTO". 127 | # @param ignore.rank logical, if FALSE (default), ordinal variables 128 | # are imputed as continuous integers with regression plus matching 129 | # and are reverted to ordinal later again. this procedure is 130 | # recommended. if TRUE, the rank of the categories will be ignored 131 | # and the algorithm will try to optimize for classification accuracy. 132 | # WARNING: the latter often results in very high classification accuracy but at 133 | # the cost of higher rank error. see the "mlim.error" function 134 | # documentation to see how rank error is computed.
therefore, if you 135 | # intend to carry out analysis on the rank data as numeric, it is 136 | # recommended that you set this argument to FALSE. 137 | #' @param maxiter integer. maximum number of iterations. the default value is \code{10}, 138 | #' but it can be reduced to \code{3} (not recommended, see below). 139 | # @param miniter integer. minimum number of iterations. the default value is 140 | # 2. 141 | #' @param flush logical (experimental). if TRUE, after each model, the server is 142 | #' cleaned to free up RAM. this feature is in testing mode and is 143 | #' currently set to FALSE by default, but it is recommended if you 144 | #' have a limited amount of RAM or large datasets. 145 | #' @param cv integer. specifies the number of folds for k-fold Cross-Validation (CV). values of 146 | #' 10 or higher are recommended. default is 10. 147 | # @param error_metric character. specify the minimum improvement 148 | # in the estimated error required to proceed to the 149 | # following iteration or stop the imputation. 150 | # the default is 10^-4 for \code{"MAE"} 151 | # (Mean Absolute Error). this criterion is only 152 | # applied from the end of the fourth iteration. 153 | # other possible metrics are \code{"RMSE"} (Root Mean Square 154 | # Error), \code{"MSE"}, 155 | # \code{"MAE"}, and \code{"RMSLE"}. 156 | #' @param tolerance numeric. the minimum rate of improvement in the estimated error metric 157 | #' of a variable to qualify the imputation for another round of iteration, 158 | #' if \code{maxiter} is not yet reached. any improvement of the imputation 159 | #' is desirable. however, specifying values above 0 can reduce the number 160 | #' of required iterations at a marginal increase in imputation error. 161 | #' for larger datasets, a value of '1e-3' is recommended to reduce the number 162 | #' of iterations. the default value is '1e-3'. 163 | #' @param doublecheck logical. default is TRUE (which is conservative). if FALSE, a variable whose estimated 164 | #' imputation error does not improve 165 | #' will not be reimputed in the following iterations. in general, 166 | #' deactivating this argument will slightly reduce the imputation 167 | #' accuracy; however, it significantly reduces the computation time. 168 | #' if your dataset is large, you are advised to set this argument to 169 | #' FALSE. (EXPERIMENTAL: consider that by avoiding several iterations 170 | #' that marginally improve the imputation accuracy, you might gain 171 | #' higher accuracy by investing your computational resources in fine-tuning 172 | #' better algorithms such as "GBM") 173 | #' 174 | # @param stopping_metric character. 175 | # @param stopping_rounds integer. 176 | # @param stopping_tolerance numeric. 177 | # @param weights_column non-negative integer. a vector of observation weights 178 | # can be provided, which should be of the same length 179 | # as the dataframe. giving an observation a weight of 180 | # zero is equivalent to ignoring that observation in the 181 | # model. in contrast, a weight of 2 is equivalent to 182 | # repeating that observation twice in the dataframe. 183 | # the higher the weight, the more important an observation 184 | # becomes in the modeling process. the default is NULL. 185 | #' @param seed integer. specify the random generator seed. 186 | # @param plot logical. If TRUE, the estimated error of the imputed dataset is plotted, 187 | # showing the reduction in CV error. 188 | 189 | #' @param report filename. if a filename is specified (e.g.
report = "mlim.md"), the \code{"md.log"} R 190 | #' package is used to generate a Markdown progress report for the 191 | #' imputation. the format of the report is adopted based on the 192 | #' \code{'verbosity'} argument. the higher the verbosity, the more 193 | #' technical the report becomes. if verbosity equals "debug", then 194 | #' a log file is generated, which includes time stamp and shows 195 | #' the function that has generated the message. otherwise, a 196 | #' reduced markdown-like report is generated. default is NULL. 197 | #' @param verbosity character. controls how much information is printed to console. 198 | #' the value can be "warn" (default), "info", "debug", or NULL. 199 | # @param init logical. should h2o Java server be initiated? the default is TRUE. 200 | # however, if the Java server is already running, set this argument 201 | #' to FALSE. 202 | #' @param cpu integer. number of CPUs to be dedicated for the imputation. 203 | #' the default takes all of the available CPUs. 204 | #' @param ram integer. specifies the maximum size, in Gigabytes, of the 205 | #' memory allocation. by default, all the available memory is 206 | #' used for the imputation. 207 | #' large memory size is particularly advised, especially 208 | #' for multicore processes. the more you give the more you get! 209 | #' @param preimputed.data data.frame. if you have used another software for missing 210 | #' data imputation, you can still optimize the imputation 211 | #' by handing the data.frame to this argument, which will 212 | #' bypass the "preimpute" procedure. 213 | #' @param save filename (with .mlim extension). if a filename is specified, an \code{mlim} object is 214 | #' saved after the end of each variable imputation. this object not only 215 | #' includes the imputed dataframe and estimated cross-validation error, but also 216 | #' includes the information needed for continuing the imputation, 217 | #' which is very useful feature for imputing large datasets, with a 218 | #' long runtime. this argument is activated by default and an 219 | #' mlim object is stored in the local directory named \code{"mlim.rds"}. 220 | #' @param load filename (with .mlim extension). an object of class "mlim", which includes the data, arguments, 221 | #' and settings for re-running the imputation, from where it was 222 | #' previously stopped. the "mlim" object saves the current state of 223 | #' the imputation and is particularly recommended for large datasets 224 | #' or when the user specifies a computationally extensive settings 225 | #' (e.g. specifying several algorithms, increasing tuning time, etc.). 226 | # @param force.load (NOT YET IMPLEMENTED FOR R).logical (default is TRUE). if TRUE, when loading the mlim class 227 | # object, its preserved settings are used for restoring and saving the 228 | # following itterations. otherwise, if FALSE, the current arguments of 229 | # mlim are used to overpower the settings of the mlim object. the settings 230 | # include the full list of the mlim arguments. 231 | #' @param shutdown logical. if TRUE, h2o server is closed after the imputation. 232 | #' the default is TRUE 233 | #' @param java character, specifying path to the executable 64bit Java JDK on the 234 | #' Microsoft Windows machines, if JDK is installed but the path environment 235 | #' variable is not set. 236 | #' @param ... arguments that are used internally between 'mlim' and 'mlim.postimpute'. 
226 | # @param force.load (NOT YET IMPLEMENTED FOR R). logical (default is TRUE). if TRUE, when loading the mlim class 227 | # object, its preserved settings are used for restoring and saving the 228 | # following iterations. otherwise, if FALSE, the current arguments of 229 | # mlim are used to override the settings of the mlim object. the settings 230 | # include the full list of the mlim arguments. 231 | #' @param shutdown logical. if TRUE, the h2o server is closed after the imputation. 232 | #' the default is TRUE. 233 | #' @param java character, specifying the path to the executable 64-bit Java JDK on 234 | #' Microsoft Windows machines, if JDK is installed but the path environment 235 | #' variable is not set. 236 | #' @param ... arguments that are used internally between 'mlim' and 'mlim.postimpute'. 237 | #' these arguments are not documented in the help file and are not 238 | #' intended to be used by the end user. 239 | #' @return a \code{data.frame}, with the 240 | #' estimated imputation error from the cross-validation stored within the data.frame's 241 | #' attributes 242 | #' @author E. F. Haghish 243 | #' 244 | #' @examples 245 | #' 246 | #' \dontrun{ 247 | #' data(iris) 248 | #' 249 | #' # add stratified missing observations to the data. to make the example run 250 | #' # faster, I add NAs only to a single variable. 251 | #' dfNA <- iris 252 | #' dfNA$Species <- mlim.na(dfNA$Species, p = 0.1, stratify = TRUE, seed = 2022) 253 | #' 254 | #' # run the ELNET single imputation (fastest imputation via 'mlim') 255 | #' MLIM <- mlim(dfNA, shutdown = FALSE) 256 | #' 257 | #' # in single imputation, you can estimate the imputation accuracy via the cross-validation RMSE 258 | #' mlim.summarize(MLIM) 259 | #' 260 | #' ### or, if you want to carry out ELNET multiple imputation with 5 datasets 261 | #' ### (a minimum of 5 datasets is recommended), use the 'mlim.mids' function 262 | #' ### next, to carry out the analysis on the multiple imputation 263 | #' MLIM2 <- mlim(dfNA, m = 5) 264 | #' mids <- mlim.mids(MLIM2, dfNA) 265 | #' fit <- with(data=mids, exp=glm(Species ~ Sepal.Length, family = "binomial")) 266 | #' res <- mice::pool(fit) 267 | #' summary(res) 268 | #' 269 | #' # you can check the accuracy of the imputation, if you have the original dataset 270 | #' mlim.error(MLIM2, dfNA, iris) 271 | # 272 | # ### run GBM, RF, ELNET, and Ensemble algos and allow 60 minutes of tuning for each variable 273 | # ### this requires a lot of RAM on your machine and a lot of time! 274 | # MLIM <- mlim(dfNA, algos = c("GBM", "RF","ELNET","Ensemble"), tuning_time=60*60) 275 | # mlim.error(MLIM, dfNA, iris) 276 | # 277 | # ### if you have larger data, there are a few things you can set to make the 278 | # ### algorithm faster, with only a marginal accuracy reduction as a trade-off 279 | # MLIM <- mlim(dfNA, algos = 'ELNET', tolerance = 1e-3, doublecheck = FALSE) 280 | #' } 281 | #' @export 282 | 283 | 284 | mlim <- function(data = NULL, 285 | m = 1, 286 | algos = c("ELNET"), #impute, postimpute 287 | postimpute = FALSE, 288 | stochastic = m > 1, 289 | ignore = NULL, 290 | 291 | # computational resources 292 | tuning_time = 900, 293 | max_models = NULL, # run all that you can 294 | maxiter = 10L, 295 | #miniter = 2L, 296 | cv = 10L, 297 | #validation = 0, 298 | 299 | matching = "AUTO", #EXPERIMENTAL 300 | autobalance = TRUE, 301 | balance = NULL, #EXPERIMENTAL 302 | #ignore.rank = FALSE, #to ignore it, they should make it unordered! 303 | # weights_column = NULL, 304 | 305 | # report and reproducibility 306 | seed = NULL, 307 | verbosity = NULL, 308 | report = NULL, 309 | 310 | # stopping criteria 311 | tolerance = 1e-3, 312 | doublecheck = TRUE, 313 | 314 | ## simplify the settings by taking these arguments out 315 | preimpute = "RF", 316 | #impute = "AUTO", 317 | #postimpute = "AUTO", 318 | #error_metric = "RMSE", #??? normalize it 319 | #stopping_metric = "AUTO", 320 | #stopping_rounds = 3, 321 | #stopping_tolerance=1e-3, 322 | 323 | # set up the h2o cluster 324 | cpu = -1, 325 | ram = NULL, 326 | flush = FALSE, 327 | 328 | 329 | # NOT YET IMPLEMENTED 330 | preimputed.data = NULL, 331 | save = NULL, 332 | load = NULL, 333 | #init = TRUE, 334 | shutdown = TRUE, 335 | java = NULL, 336 | #force.load = TRUE, 337 | ...
338 | ) { 339 | 340 | # improvements for the next release 341 | # ============================================================ 342 | # instead of using all the algorithms at each iteration, add the 343 | # other algorithms when the first algorithm stops being useful. 344 | # perhaps this will help optimization, while reducing the computational burden 345 | # h2o DRF does not give an OOB error, so an initial comparison with the preimputation is not possible 346 | # HOWEVER, I can estimate the CV for the preimputation procedure 347 | # 348 | # instead of adding postimpute_algos, extract it from the specified algorithms 349 | 350 | # check the ... arguments 351 | # ============================================================ 352 | hidden_args <- c("superdebug", "init", "ignore.rank", "sleep", "stochastic") 353 | stopifnot( 354 | "incompatible '...' arguments" = (names(list(...)) %in% hidden_args) 355 | ) 356 | 357 | # Simplify the syntax by leaving out arguments that are less relevant to the 358 | # majority of users 359 | # ============================================================ 360 | #stopping_metric <- "AUTO" 361 | #stopping_rounds <- 3 362 | #stopping_tolerance <- 1e-3 363 | MI <- list() 364 | bdata <- NULL 365 | metrics <- NULL 366 | error <- NULL 367 | debug <- FALSE 368 | miniter <- 2L 369 | init <- threeDots(name = "init", ..., default = TRUE) 370 | # cv <- threeDots(name = "cv", ..., default = 10L) 371 | # flush <- threeDots(name = "flush", ..., default = TRUE) 372 | verbose <- 0 373 | error_metric <- "RMSE" 374 | #preimpute <- "RF" 375 | ignore.rank <- threeDots(name = "ignore.rank", ..., default = FALSE) #EXPERIMENTAL 376 | sleep <- threeDots(name = "sleep", ..., default = .25) 377 | superdebug <- threeDots(name = "superdebug", ..., default = FALSE) 378 | #stochastic <- threeDots(name = "stochastic", ..., default = FALSE) 379 | 380 | 381 | 382 | # ============================================================ 383 | # ============================================================ 384 | # LOAD SETTINGS FROM mlim class object 385 | # ============================================================ 386 | # ============================================================ 387 | if (!is.null(load)) { 388 | if (inherits(load, "character")) load <- readRDS(load) 389 | if (!inherits(load, "mlim")) stop("loaded object must be of class 'mlim'") 390 | 391 | # Data 392 | # ---------------------------------- 393 | MI <- load$MI # dataLast or multiple-imputation data 394 | dataNA <- load$dataNA 395 | preimputed.data <- load$preimputed.data 396 | data <- load$data # preimputed dataset that is constantly updated 397 | #bdata <- load$bdata 398 | #dataLast <- load$dataLast 399 | metrics <- load$metrics 400 | mem <- load$mem 401 | orderedCols <- load$orderedCols 402 | 403 | # Loop data 404 | # ---------------------------------- 405 | m <- load$m # number of datasets to impute 406 | m.it <- load$m.it # current dataset to impute 407 | k <- load$k # current loop number (global imputation iteration) 408 | z <- load$z # current local iteration number 409 | X <- load$X 410 | Y <- load$Y # last imputed variable; outside the 'load' context, the current variable to be imputed
411 | vars2impute <- load$vars2impute 412 | FAMILY <- load$FAMILY 413 | 414 | # settings 415 | # ---------------------------------- 416 | ITERATIONVARS <- load$ITERATIONVARS # variables to be imputed 417 | impute <- load$impute # reimputation algorithm(s) 418 | postimputealgos <- load$postimputealgos 419 | autobalance <- load$autobalance 420 | balance <- load$balance 421 | ignore <- load$ignore 422 | save <- load$save 423 | maxiter <- load$maxiter 424 | miniter <- load$miniter 425 | cv <- load$cv 426 | tuning_time <- load$tuning_time 427 | max_models <- load$max_models 428 | matching <- load$matching 429 | ignore.rank <- load$ignore.rank #KEEP IT HIDDEN 430 | #weights_column <- load$weights_column 431 | seed <- load$seed 432 | verbosity <- load$verbosity 433 | verbose <- load$verbose #KEEP IT HIDDEN 434 | debug <- load$debug #KEEP IT HIDDEN 435 | report <- load$report 436 | flush <- load$flush 437 | error_metric <- load$error_metric #KEEP IT HIDDEN 438 | error <- load$error #KEEP IT HIDDEN 439 | tolerance <- load$tolerance 440 | cpu <- load$cpu 441 | max_ram <- load$max_ram 442 | min_ram <- load$min_ram #KEEP IT HIDDEN 443 | keep_cv <- load$keep_cv 444 | pkg <- load$pkg #KEEP IT HIDDEN 445 | 446 | 447 | # MOVE-ON to the next variable after loading an mlim object 448 | # --------------------------------------------------------- 449 | moveOn <- iterationNextVar(m, m.it, k, z, Y, ITERATIONVARS, maxiter) 450 | m <- moveOn$m 451 | m.it <- moveOn$m.it 452 | k <- moveOn$k 453 | z <- moveOn$z 454 | Y <- moveOn$Y 455 | } 456 | 457 | # ============================================================ 458 | # ============================================================ 459 | # Prepare the imputation settings 460 | # ============================================================ 461 | # ============================================================ 462 | else { 463 | if (!is.null(seed)) set.seed(seed) # avoid setting the seed by default if it is a continuation 464 | 465 | alg <- algoSelector(algos, postimpute) 466 | # preimpute <- "RF" #alg$preimpute ## for now, make this global 467 | impute <- alg$impute 468 | postimputealgos <- alg$postimpute 469 | 470 | synt <- syntaxProcessing(data, preimpute, impute, ram, 471 | matching=matching, miniter, maxiter, max_models, 472 | tuning_time, cv, verbosity=verbosity, report, save) 473 | min_ram <- synt$min_ram 474 | max_ram <- synt$max_ram 475 | keep_cv <- synt$keep_cross_validation_predictions 476 | verbose <- synt$verbose 477 | debug <- synt$debug 478 | } 479 | 480 | # disable the h2o progress bar 481 | #if (!debug) h2o::h2o.no_progress() 482 | if (!superdebug) h2o::h2o.no_progress() 483 | 484 | # ============================================================ 485 | # Initialize the Markdown report 486 | # ============================================================ 487 | if (is.null(report)) md.log("System information", file=tempfile(), 488 | trace=TRUE, sys.info = TRUE, date=TRUE, time=TRUE) 489 | 490 | else if (is.null(load)) md.log("System information", file=report, 491 | append = FALSE, trace=TRUE, sys.info = TRUE, 492 | date=TRUE, time=TRUE) #, print=TRUE 493 | 494 | else if (!is.null(load)) md.log("\nContinuing from where it was left...", file=report, 495 | append = TRUE, trace=TRUE, sys.info = TRUE, 496 | date=TRUE, time=TRUE) 497 | 498 | # Run H2O on the statistics server 499 | # ============================================================ 500 | if (init) { 501 | #sink(file = report, append = TRUE)
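# NOTE: capture.output() below diverts the server start-up messages away
# from the console, appending them to the 'report' file when one is given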
502 | #message("\n") # for Markdown styling 503 | capture.output(connection <- init(nthreads = cpu, 504 | min_mem_size = min_ram, 505 | max_mem_size = max_ram, 506 | ignore_config = TRUE, 507 | java = java, 508 | report, debug), 509 | file = report, append = TRUE) 510 | #sink() 511 | 512 | ## ??? DO NOT CLOSE ALL THE CONNECTIONS 513 | #sink.reset <- function(){ 514 | # for(i in seq_len(sink.number())){ 515 | # sink(NULL) 516 | # } 517 | #} 518 | #sink.reset() 519 | 520 | 521 | ##closeAllConnections() 522 | ##print(connection) 523 | } 524 | 525 | # Identify variables for imputation and their models' families 526 | # ============================================================ 527 | if (is.null(load)) { 528 | VARS <- selectVariables(data, ignore, verbose, report) 529 | 530 | dataNA <- VARS$dataNA # the missing data placeholder 531 | allPredictors <- VARS$allPredictors 532 | vars2impute <- VARS$vars2impute 533 | vars2postimpute <- VARS$vars2impute 534 | storeVars2impute <- vars2impute 535 | X <- VARS$X 536 | bdata <- NULL 537 | 538 | # if there is only one variable to impute, there is no need to iterate! 539 | if (length(vars2impute) < 1) stop("\nthere is nothing to impute!\n") 540 | else if (length(vars2impute) == 1) { 541 | if (!is.valid(postimputealgos)) { 542 | maxiter <- 1 543 | } 544 | } 545 | 546 | # ......................................................... 547 | # check the variables for compatibility 548 | # ......................................................... 549 | # if preimputed data is provided, take it into consideration! 550 | if (!is.null(preimputed.data)) { 551 | 552 | # if a multiple imputation object is given, take the first dataset 553 | # ??? in the future, consider that each of the given datasets can 554 | # be fed independently as a separate "m". for now, this is NOT AN 555 | # announced feature and thus, just take the first dataset as the preimputation 556 | if (inherits(preimputed.data, "mlim.mi")) { 557 | #preimputed.data <- preimputed.data[[1]] 558 | #stop("use 'mlim.postimpute' function for postimputing multiple imputation datasets\n") 559 | stop("multiple imputation datasets cannot be used as 'preimputed.data'\n") 560 | } 561 | 562 | # if the preimputation was done with mlim, extract the metrics 563 | else if (inherits(preimputed.data, "mlim")) { 564 | 565 | 566 | # remove the NAs of the last imputation and replace them with 567 | # the minimum 568 | metrics <- getMetrics(preimputed.data) 569 | } 570 | 571 | # SAVE RAM: if preimputed.data is given, replace the original data because 572 | # its missing data is preserved within dataNA 573 | data <- preimputed.data 574 | 575 | # reset the relevant predictors 576 | X <- allPredictors 577 | } 578 | 579 | Features <- checkNconvert(data, vars2impute, ignore, 580 | ignore.rank=ignore.rank, report) 581 | 582 | FAMILY <- Features$family 583 | 584 | # data <- Features$data ##> this will be moved inside the loop because 585 | # in multiple imputation, we want to start over 586 | # every time! 587 | mem <- Features$mem 588 | orderedCols <- Features$orderedCols 589 | 590 | # ......................................................... 591 | # PREIMPUTATION: replace data with preimputed data 592 | # ......................................................... 593 | if (preimpute != "iterate" & is.null(preimputed.data)) { 594 | 595 | # preimpute in single imputation ONLY.
for multiple imputation, each 596 | # bootstrap dataset is imputed separately 597 | if (m == 1) { 598 | data <- mlim.preimpute(data=data, preimpute=preimpute, seed = NULL) # DO NOT RESET THE SEED! 599 | } 600 | 601 | # reset the relevant predictors 602 | X <- allPredictors 603 | } 604 | 605 | # ......................................................... 606 | # Remove 'Features', but keep 'preimputed.data' in MI 607 | # ......................................................... 608 | if (m > 1) preimputed.data <- Features$data 609 | else preimputed.data <- NULL 610 | rm(Features) 611 | gc() 612 | } 613 | 614 | 615 | # ............................................................ 616 | # ............................................................ 617 | # ITERATION LOOP 618 | # ............................................................ 619 | # ............................................................ 620 | if (is.null(load)) { 621 | k <- 1L 622 | z <- 1L 623 | m.it <- 1L 624 | MI <- NULL 625 | error <- setNames(rep(1, length(vars2impute)), vars2impute) 626 | } 627 | 628 | # drop 'load' from the memory 629 | # --------------------------- 630 | rm(load) 631 | gc() 632 | load <- NULL 633 | 634 | # ??? bdata must be NULL at the beginning of each iteration. Currently 635 | # this is NOT happening when the 'mlim' object is loaded 636 | 637 | for (m.it in m.it:m) { 638 | 639 | # Start the new imputation with fresh data, if it is a multiple imputation 640 | if (k == 1 & z == 1) { 641 | if (!is.null(preimputed.data)) data <- preimputed.data 642 | md.log(paste("Dataset", m.it), section="section") 643 | } 644 | 645 | # it is always NULL. It doesn't have to be saved 646 | bdata <- NULL 647 | dataLast <- iteration_loop(MI, dataNA, preimputed.data, data, bdata, boot=m>1, 648 | metrics, tolerance, doublecheck, 649 | m, k, X, Y, z, m.it, 650 | # loop data 651 | vars2impute, vars2postimpute, storeVars2impute, 652 | allPredictors, preimpute, impute, postimputealgos, 653 | # settings 654 | error_metric, FAMILY=FAMILY, cv, tuning_time, 655 | max_models, 656 | keep_cv, 657 | autobalance, balance, seed, save, flush, 658 | verbose, debug, report, sleep, 659 | # saving settings 660 | mem, orderedCols, ignore, maxiter, 661 | miniter, matching, ignore.rank, 662 | verbosity, error, cpu, max_ram=max_ram, min_ram=min_ram, 663 | #??? shutdown has to be fixed in future updates 664 | shutdown=FALSE, clean = TRUE, 665 | stochastic=stochastic) 666 | 667 | if (m > 1) MI[[m.it]] <- dataLast 668 | else MI <- dataLast 669 | } 670 | 671 | message("\n") 672 | 673 | if (shutdown) { 674 | md.log("shutting down the server", trace=FALSE) 675 | h2o::h2o.shutdown(prompt = FALSE) 676 | Sys.sleep(sleep) 677 | } 678 | 679 | if (m > 1) class(MI) <- "mlim.mi" 680 | else class(MI) <- c("mlim", "data.frame") 681 | 682 | return(MI) 683 | } 684 | 685 | 686 | 687 | 688 | --------------------------------------------------------------------------------