├── simulation_generation.pdf ├── vignette_files ├── figure-markdown_github │ ├── compare.eps │ ├── Missmap_targeted.png │ ├── unnamed-chunk-4-1.png │ ├── unnamed-chunk-4-2.png │ ├── unnamed-chunk-4-3.png │ ├── unnamed-chunk-5-1.png │ ├── unnamed-chunk-5-2.png │ ├── unnamed-chunk-5-3.png │ ├── unnamed-chunk-9-1.png │ ├── unnamed-chunk-9-2.png │ ├── unnamed-chunk-9-3.png │ ├── unnamed-chunk-9-4.png │ ├── unnamed-chunk-9-5.png │ ├── unnamed-chunk-9-6.png │ ├── Missmap_untargeted.png │ ├── unnamed-chunk-12-1.png │ └── desktop.ini └── desktop.ini ├── Prediction_funcs.R ├── simulation_generation.Rmd ├── Trunc_KNN ├── Simulation.r ├── Final_tests.r └── Imput_funcs.r ├── Impute_wrapper.R ├── MVI_global.R ├── GSimp.R ├── targeted_data.csv ├── GSimp_evaluation.R ├── README.md └── untargeted_data.csv /simulation_generation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WandeRum/GSimp/HEAD/simulation_generation.pdf -------------------------------------------------------------------------------- /vignette_files/figure-markdown_github/compare.eps: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WandeRum/GSimp/HEAD/vignette_files/figure-markdown_github/compare.eps -------------------------------------------------------------------------------- /vignette_files/figure-markdown_github/Missmap_targeted.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WandeRum/GSimp/HEAD/vignette_files/figure-markdown_github/Missmap_targeted.png -------------------------------------------------------------------------------- /vignette_files/figure-markdown_github/unnamed-chunk-4-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WandeRum/GSimp/HEAD/vignette_files/figure-markdown_github/unnamed-chunk-4-1.png -------------------------------------------------------------------------------- /vignette_files/figure-markdown_github/unnamed-chunk-4-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WandeRum/GSimp/HEAD/vignette_files/figure-markdown_github/unnamed-chunk-4-2.png -------------------------------------------------------------------------------- /vignette_files/figure-markdown_github/unnamed-chunk-4-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WandeRum/GSimp/HEAD/vignette_files/figure-markdown_github/unnamed-chunk-4-3.png -------------------------------------------------------------------------------- /vignette_files/figure-markdown_github/unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WandeRum/GSimp/HEAD/vignette_files/figure-markdown_github/unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /vignette_files/figure-markdown_github/unnamed-chunk-5-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WandeRum/GSimp/HEAD/vignette_files/figure-markdown_github/unnamed-chunk-5-2.png -------------------------------------------------------------------------------- /vignette_files/figure-markdown_github/unnamed-chunk-5-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WandeRum/GSimp/HEAD/vignette_files/figure-markdown_github/unnamed-chunk-5-3.png -------------------------------------------------------------------------------- /vignette_files/figure-markdown_github/unnamed-chunk-9-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WandeRum/GSimp/HEAD/vignette_files/figure-markdown_github/unnamed-chunk-9-1.png -------------------------------------------------------------------------------- /vignette_files/figure-markdown_github/unnamed-chunk-9-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WandeRum/GSimp/HEAD/vignette_files/figure-markdown_github/unnamed-chunk-9-2.png -------------------------------------------------------------------------------- /vignette_files/figure-markdown_github/unnamed-chunk-9-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WandeRum/GSimp/HEAD/vignette_files/figure-markdown_github/unnamed-chunk-9-3.png -------------------------------------------------------------------------------- /vignette_files/figure-markdown_github/unnamed-chunk-9-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WandeRum/GSimp/HEAD/vignette_files/figure-markdown_github/unnamed-chunk-9-4.png -------------------------------------------------------------------------------- /vignette_files/figure-markdown_github/unnamed-chunk-9-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WandeRum/GSimp/HEAD/vignette_files/figure-markdown_github/unnamed-chunk-9-5.png -------------------------------------------------------------------------------- /vignette_files/figure-markdown_github/unnamed-chunk-9-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WandeRum/GSimp/HEAD/vignette_files/figure-markdown_github/unnamed-chunk-9-6.png -------------------------------------------------------------------------------- /vignette_files/figure-markdown_github/Missmap_untargeted.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WandeRum/GSimp/HEAD/vignette_files/figure-markdown_github/Missmap_untargeted.png -------------------------------------------------------------------------------- /vignette_files/figure-markdown_github/unnamed-chunk-12-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WandeRum/GSimp/HEAD/vignette_files/figure-markdown_github/unnamed-chunk-12-1.png -------------------------------------------------------------------------------- /vignette_files/desktop.ini: -------------------------------------------------------------------------------- 1 | [.ShellClassInfo] 2 | InfoTip=This folder is shared online. 3 | IconFile=C:\Program Files (x86)\Google\Drive\googledrivesync.exe 4 | IconIndex=12 5 | 6 | -------------------------------------------------------------------------------- /vignette_files/figure-markdown_github/desktop.ini: -------------------------------------------------------------------------------- 1 | [.ShellClassInfo] 2 | InfoTip=This folder is shared online. 3 | IconFile=C:\Program Files (x86)\Google\Drive\googledrivesync.exe 4 | IconIndex=12 5 | -------------------------------------------------------------------------------- /Prediction_funcs.R: -------------------------------------------------------------------------------- 1 | # Packages ---------------------------------------------------------------- 2 | require(randomForest) 3 | require(glmnet) 4 | require(rpart) 5 | require(FNN) 6 | 7 | lm_pred <- function(x, y) { 8 | data <- data.frame(y=y, x) 9 | model <- lm(y ~ ., data=data) 10 | y_hat <- predict(model, newdata=data) 11 | return(y_hat) 12 | } 13 | 14 | rlm_pred <- function(x, y) { 15 | data <- data.frame(y=y, x) 16 | model <- rlm(y ~ ., data=data) 17 | y_hat <- predict(model, newdata=data) 18 | return(y_hat) 19 | } 20 | 21 | rf_pred <- function(x, y, ntree=200, ...) { 22 | model <- randomForest(x=x, y=y, ntree=ntree, ...) 23 | y_hat <- predict(model, newdata=x) 24 | return(y_hat) 25 | } 26 | 27 | glmnet_pred <- function(x, y, alpha=.5, lambda=.01) { 28 | x_mat <- as.matrix(x) 29 | model <- glmnet(x=x_mat, y=y, alpha=alpha, lambda=lambda) 30 | y_hat <- predict(model, newx=x_mat)[, 1] 31 | return(y_hat) 32 | } 33 | 34 | rpart_pred <- function(x, y) { 35 | data <- data.frame(y=y, x) 36 | model <- rpart(y ~ ., data=data) 37 | y_hat <- predict(model, newdata=data) 38 | return(y_hat) 39 | } 40 | 41 | knn_pred <- function(x, y) { 42 | model <- knn.reg(train=x, y=y, k=5) 43 | y_hat <- model$pred 44 | return(y_hat) 45 | } 46 | -------------------------------------------------------------------------------- /simulation_generation.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Simulation data generation" 3 | author: "Rum Wei" 4 | date: "12/18/2017" 5 | output: 6 | pdf_document: 7 | latex_engine: xelatex 8 | html_document: default 9 | word_document: default 10 | --- 11 | 12 | ## 1.Import real data and calculate covariance 13 | ```{r cache=T} 14 | options(stringsAsFactors = F) 15 | require(magrittr) 16 | require(pheatmap) 17 | require(mvtnorm) 18 | 19 | data_raw <- read.csv('real_data.csv', row.names=1) 20 | data_lg_sc <- data_raw %>% log %>% scale 21 | cov_mat <- cov(data_lg_sc) 22 | pheatmap(cov_mat, cluster_rows=F, cluster_cols=F, show_rownames=F, show_colnames=F, 23 | main='Covariance of real data') 24 | ``` 25 | 26 | ## 2.Simulation dataset generation (part-1, first 80 samples as first group) 27 | ```{r cache=T} 28 | set.seed(123) 29 | data_sim_0 <- rmvnorm(80, mean=rnorm(nrow(cov_mat), sd=.5), sigma=cov_mat, method='svd') 30 | cov_mat_sim_0 <- cov(data_sim_0) 31 | pheatmap(cov_mat_sim_0, cluster_rows=F, cluster_cols=F, show_rownames=F, show_colnames=F, 32 | main='Covariance of sim data-0 (group-0)') 33 | ``` 34 | 35 | ## 3.Simulation dataset generation (part-2, second 80 samples as second group) 36 | ```{r cache=T} 37 | set.seed(321) 38 | data_sim_1 <- rmvnorm(80, mean=rnorm(nrow(cov_mat), sd=.5), sigma=cov_mat, method='svd') 39 | cov_mat_sim_1 <- cov(data_sim_1) 40 | pheatmap(cov_mat_sim_1, cluster_rows=F, cluster_cols=F, show_rownames=F, show_colnames=F, 41 | main='Covariance of sim data-1 (group-1)') 42 | ``` 43 | 44 | ## 4.Simulation dataset generation (part-3, stack two groups together) 45 | ```{r cache=T} 46 | data_sim <- rbind(data_sim_0, data_sim_1) 47 | data_sim_sc <- scale(data_sim) 48 | cov_mat_sim <- cov(data_sim_sc) 49 | pheatmap(cov_mat_sim, cluster_rows=F, cluster_cols=F, show_rownames=F, show_colnames=F, 50 | main='Covariance of complete sim data (group-0 & 1)') 51 | ``` 52 | 53 | ## 5.T-test on two groups 54 | ```{r cache=T} 55 | group <- rep(c(0, 1), each=80) %>% as.factor 56 | sim_pvals <- apply(data_sim_sc, 2, function(x) t.test(x ~ group)$p.value) 57 | sim_pvals[1:10] 58 | ``` 59 | -------------------------------------------------------------------------------- /Trunc_KNN/Simulation.r: -------------------------------------------------------------------------------- 1 | #### SimulationFunctions 2 | CorrMatrixNegFixed <- function(blocks, a, corr, off) { 3 | Corr <- NULL 4 | for (i in 1:blocks) { 5 | Corr[[i]] <- matrix(NA, ncol = a, nrow = a) 6 | diag= rep(1,a) 7 | offdiag = corr 8 | Corr[[i]][lower.tri(Corr[[i]])] <- offdiag 9 | Corr[[i]][upper.tri(Corr[[i]])] <- t(Corr[[i]])[upper.tri(t(Corr[[i]]))] 10 | diag(Corr[[i]]) <- diag 11 | for ( k in ((ncol(Corr[[i]])/2)+1):(ncol(Corr[[i]])) ) { 12 | for( j in 1:(nrow(Corr[[i]])/2) ) { 13 | Corr[[i]][j,k] <- -1*Corr[[i]][j,k] 14 | } 15 | } 16 | 17 | for ( k in ((nrow(Corr[[i]])/2)+1):(nrow(Corr[[i]])) ) { 18 | for( j in 1:(ncol(Corr[[i]])/2) ) { 19 | Corr[[i]][k,j] <- -1*Corr[[i]][k,j] 20 | } 21 | } 22 | 23 | } 24 | res <- as.matrix(bdiag(Corr)) 25 | res[which(res == 0, arr.ind = TRUE)] <- off 26 | return(res) 27 | } 28 | 29 | 30 | ################################################################################## 31 | ## This function Simulates complete data sets. 32 | ## We are creating a dataset of size N (samples) by p (Metabolites) 33 | ################################################################################## 34 | 35 | SimulatedData <- function(n, p, covar, low, high) { 36 | ## No of Samples == N 37 | ## No of Metabolites = p 38 | Means <- runif(p, min = low, max = high) 39 | data <- rmvnorm(n, mean = Means, sigma = covar) 40 | return(data) 41 | } 42 | 43 | 44 | ################################################################################## 45 | ###### Create MissingValues Datasets 46 | ################################################################################## 47 | 48 | MissingData <- function(data, totalmiss, mar, perc) { 49 | missdata <- data 50 | 51 | ## First create Missing Not At Random (MNAR) 52 | belowthresh <- which(data < quantile(data, probs = c(totalmiss-mar)/100), arr.ind = TRUE) 53 | missdata[belowthresh] <- NA 54 | 55 | 56 | ## Next create Missing At Random (MAR) 57 | dataVector <- c(1:length(data)) 58 | randommiss <- sample(dataVector[-which(is.na(missdata))], (mar/100)*length(data) ) 59 | missdata[randommiss] <- NA 60 | 61 | ## Next exclude metabolites with more that 'perc' missing 62 | NumberMissing <- apply(missdata, 2, function(x) length(which(is.na(x)))) 63 | 64 | ind <- which(NumberMissing >= (nrow(data)*perc)) 65 | if(length(ind) > 0 ){ 66 | missdata <- missdata[,-ind] 67 | data <- data[,-ind] 68 | } 69 | 70 | return(list(missdata, data, NumberMissing)) 71 | } 72 | 73 | -------------------------------------------------------------------------------- /Impute_wrapper.R: -------------------------------------------------------------------------------- 1 | # Imputation Wrapper ------------------------------------------------------ 2 | require(missForest) 3 | require(impute) 4 | require(magrittr) 5 | require(imputeLCMD) 6 | source('MVI_global.R') 7 | source('GSimp.R') 8 | 9 | RF_wrapper <- function(data, ...) { 10 | result <- missForest(data, ...)[[1]] 11 | return (result) 12 | } 13 | 14 | kNN_wrapper <- function(data, ...) { 15 | result <- data %>% data.matrix %>% impute.knn(., ...) %>% extract2(1) 16 | return(result) 17 | } 18 | 19 | SVD_wrapper <- function(data, K = 5) { 20 | data_sc_res <- scale_recover(data, method = 'scale') 21 | data_sc <- data_sc_res[[1]] 22 | data_sc_param <- data_sc_res[[2]] 23 | result <- data_sc %>% impute.wrapper.SVD(., K = K) %>% 24 | scale_recover(., method = 'recover', param_df = data_sc_param) %>% extract2(1) 25 | return(result) 26 | } 27 | 28 | Mean_wrapper <- function(data) { 29 | result <- data 30 | result[] <- lapply(result, function(x) { 31 | x[is.na(x)] <- mean(x, na.rm = T) 32 | x 33 | }) 34 | return(result) 35 | } 36 | 37 | Median_wrapper <- function(data) { 38 | result <- data 39 | result[] <- lapply(result, function(x) { 40 | x[is.na(x)] <- median(x, na.rm = T) 41 | x 42 | }) 43 | return(result) 44 | } 45 | 46 | HM_wrapper <- function(data) { 47 | result <- data 48 | result[] <- lapply(result, function(x) { 49 | x[is.na(x)] <- min(x, na.rm = T)/2 50 | x 51 | }) 52 | return(result) 53 | } 54 | 55 | Zero_wrapper <- function(data) { 56 | result <- data 57 | result[is.na(result)] <- 0 58 | return(result) 59 | } 60 | 61 | QRILC_wrapper <- function(data, ...) { 62 | result <- data %>% log %>% impute.QRILC(., ...) %>% extract2(1) %>% exp 63 | return(result) 64 | } 65 | 66 | # pre_processing_GS_wrapper ----------------------------------------------- 67 | pre_processing_GS_wrapper <- function(data) { 68 | data_raw <- data 69 | ## log transformation ## 70 | data_raw_log <- data_raw %>% log() 71 | ## Initialization ## 72 | data_raw_log_qrilc <- impute.QRILC(data_raw_log) %>% extract2(1) 73 | ## Centralization and scaling ## 74 | data_raw_log_qrilc_sc <- scale_recover(data_raw_log_qrilc, method = 'scale') 75 | ## Data after centralization and scaling ## 76 | data_raw_log_qrilc_sc_df <- data_raw_log_qrilc_sc[[1]] 77 | ## Parameters for centralization and scaling ## 78 | ## For scaling recovery ## 79 | data_raw_log_qrilc_sc_df_param <- data_raw_log_qrilc_sc[[2]] 80 | ## NA position ## 81 | NA_pos <- which(is.na(data_raw), arr.ind = T) 82 | ## bala bala bala ## 83 | data_raw_log_sc <- data_raw_log_qrilc_sc_df 84 | data_raw_log_sc[NA_pos] <- NA 85 | ## GSimp imputation with initialized data and missing data ## 86 | result <- data_raw_log_sc %>% GS_impute(., iters_each=50, iters_all=10, 87 | initial = data_raw_log_qrilc_sc_df, 88 | lo=-Inf, hi= 'min', n_cores=2, 89 | imp_model='glmnet_pred') 90 | data_imp_log_sc <- result$data_imp 91 | ## Data recovery ## 92 | data_imp <- data_imp_log_sc %>% 93 | scale_recover(., method = 'recover', 94 | param_df = data_raw_log_qrilc_sc_df_param) %>% 95 | extract2(1) %>% exp() 96 | return(data_imp) 97 | } 98 | -------------------------------------------------------------------------------- /MVI_global.R: -------------------------------------------------------------------------------- 1 | require(magrittr) 2 | require(missForest) 3 | require(abind) 4 | # Missing at Random Function ---------------------------------------------- 5 | MAR_generate <- function(data, mis_prop = 0.5) { 6 | all_idx <- which(data != Inf, arr.ind = T) 7 | rdm_idx <- sample(1:nrow(all_idx), round(nrow(all_idx)*mis_prop)) 8 | slc_idx <- all_idx[rdm_idx, ] 9 | data_res <- data 10 | data_res[slc_idx] <- NA 11 | return(list(data_res = data_res, mis_idx = slc_idx)) 12 | } 13 | 14 | # column-wise NRMSE calculation ------------------------------------------- 15 | nrmse_col <- function(data_imp, data_miss, data_true) { 16 | idx_miss <- data_miss %>% colSums %>% is.na %>% which 17 | nrmse_vec <- c() 18 | for (idx in idx_miss) { 19 | nrmse_vec <- c(nrmse_vec, nrmse(data_imp[, idx], data_miss[, idx], data_true[, idx])) 20 | } 21 | names(nrmse_vec) <- names(idx_miss) 22 | return(nrmse_vec) 23 | } 24 | 25 | # MNAR imputation compare ------------------------------------------------- 26 | MNAR_generate <- function (data_c, mis_var = 0.5, var_prop = seq(.3, .6, .1)) { 27 | data_mis <- data_c 28 | if (is.numeric(mis_var)) var_mis_list <- sample(1:ncol(data_c), round(ncol(data_c)*mis_var)) 29 | else if (is.character(mis_var)) var_mis_list <- which(colnames(data_c) %in% mis_var) 30 | for (i in 1:length(var_mis_list)) { 31 | var_idx <- var_mis_list[i] 32 | cur_var <- data_mis[, var_idx] 33 | cutoff <- quantile(cur_var, sample(var_prop, 1)) 34 | cur_var[cur_var < cutoff] <- NA 35 | data_mis[, var_idx] <- cur_var 36 | } 37 | mis_idx_df <- which(is.na(data_mis), arr.ind = T) 38 | return (list(data_mis = data_mis, mis_idx_df = mis_idx_df)) 39 | } 40 | 41 | # Scale and recover ------------------------------------------------------- 42 | scale_recover <- function(data, method='scale', param_df = NULL) { 43 | results <- list() 44 | data_res <- data 45 | if (!is.null(param_df)) { 46 | if (method=='scale') { 47 | data_res[] <- scale(data, center=param_df$mean, scale=param_df$std) 48 | } else if (method=='recover') { 49 | data_res[] <- t(t(data)*param_df$std+param_df$mean) 50 | } 51 | } else { 52 | if (method=='scale') { 53 | param_df <- data.frame(mean=apply(data, 2, function(x) mean(x, na.rm=T)), 54 | std=apply(data, 2, function(x) sd(x, na.rm=T))) 55 | data_res[] <- scale(data, center=param_df$mean, scale=param_df$std) 56 | } else {stop('no param_df found for recover...')} 57 | } 58 | results[[1]] <- data_res 59 | results[[2]] <- param_df 60 | return(results) 61 | } 62 | 63 | # Multiplot 4 ggplot2 ----------------------------------------------------- 64 | multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) { 65 | library(grid) 66 | 67 | # Make a list from the ... arguments and plotlist 68 | plots <- c(list(...), plotlist) 69 | 70 | numPlots = length(plots) 71 | 72 | # If layout is NULL, then use 'cols' to determine layout 73 | if (is.null(layout)) { 74 | # Make the panel 75 | # ncol: Number of columns of plots 76 | # nrow: Number of rows needed, calculated from # of cols 77 | layout <- matrix(seq(1, cols * ceiling(numPlots/cols)), 78 | ncol = cols, nrow = ceiling(numPlots/cols)) 79 | } 80 | 81 | if (numPlots==1) { 82 | print(plots[[1]]) 83 | 84 | } else { 85 | # Set up the page 86 | grid.newpage() 87 | pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout)))) 88 | 89 | # Make each plot, in the correct location 90 | for (i in 1:numPlots) { 91 | # Get the i,j matrix positions of the regions that contain this subplot 92 | matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE)) 93 | 94 | print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row, 95 | layout.pos.col = matchidx$col)) 96 | } 97 | } 98 | } 99 | 100 | # Parallel combination ---------------------------------------------------- 101 | require(abind) 102 | cbind_abind <- function(a, b) { 103 | res <- list() 104 | res$y_imp <- cbind(a$y_imp, b$y_imp) 105 | res$gibbs_res <- abind(a$gibbs_res, b$gibbs_res, along=2) 106 | return(res) 107 | } 108 | -------------------------------------------------------------------------------- /Trunc_KNN/Final_tests.r: -------------------------------------------------------------------------------- 1 | ############################################################################################################################ 2 | #### This Scipt describes an example of running a dataset to impute values 3 | #### based on KNN-CR, KNN-TN and KNN-EU 4 | ############################################################################################################################ 5 | 6 | source("Trunc_KNN/Imput_funcs.r") 7 | source("Trunc_KNN/Simulation.r") 8 | 9 | ### Load libraries as some functions are dependent on them 10 | 11 | library(MASS) 12 | library(mvtnorm) 13 | library(Matrix) 14 | library(magrittr) 15 | 16 | ########################################################################### 17 | ##### Here we Simulate a dataset of size 50 Samples by 400 Metabolites 18 | ##### We will have a Complete Dataset and a Dataset with Missing Values 19 | ##### For this example we generate a Dataset with 15% total Missing where 20 | ##### 10% of the Data is missing below the Threshold and 5% missing above 21 | ##### the threshold. We also discard if there is missing greater than 75%. 22 | ############################################################################ 23 | set.seed(505) 24 | CorrelationMatrix0.7 <- CorrMatrixNegFixed(20, 20, 0.7, 0.2) 25 | DataSimul <- SimulatedData(50, 400, as.matrix(nearPD(CorrelationMatrix0.7)[[1]]), low = -5, high =5) 26 | 27 | ## Simulate 15% overall missing, 10% due to below LOD (MNAR), 5% MAR 28 | ## Only retain metabolites with <75% MVs 29 | Missingness <- MissingData(DataSimul, 15, 5, 0.75) 30 | 31 | CompleteData <- Missingness[[2]] 32 | MissData <- Missingness[[1]] 33 | dim(MissData) ## 50x390; After Screening we have 50 Samples and 390 Metabolites 34 | 35 | sum(is.na(MissData)/length(MissData)) ## 13.5%, below 15% due to screening 36 | 37 | LOD <- quantile(DataSimul, probs = 0.1) 38 | 39 | 40 | ################################################################################## 41 | ### We now do the different imputations and we provide the type of imputation and 42 | ### k neighbors 43 | ################################################################################## 44 | 45 | (kNN_Corr_Imp <- imputeKNN(t(MissData), k=10 , distance = "correlation")) 46 | (kNN_Euc_Imp <- KNNEuc(t(MissData), k=10)) 47 | (kNN_Trunc_Imp <- imputeKNN(t(MissData), k=10 , distance = "truncation", perc= 0.75)) 48 | 49 | ################################################################################### 50 | ### We compute the RMSE since we have the CompleteData, MissingData and ImputedData 51 | #################################################################################### 52 | 53 | RMSError <- ErrorsComputation(trunc=kNN_Trunc_Imp, corr=kNN_Corr_Imp, euc=kNN_Euc_Imp, 54 | miss=MissData, complete = CompleteData) 55 | 56 | names(RMSError) <- c("KNN-TN", "KNN-CR", "KNN-EU") 57 | RMSError 58 | ## KNN-TN KNN-CR KNN-EU 59 | ## 1.161180 1.345218 1.606502 60 | 61 | ###################################################################################### 62 | ### We look at the Distribution of Metabolites based on the Imputation Method 63 | ###################################################################################### 64 | 65 | 66 | ## This plots the first 20 Metabolites of the Dataset and overlays the imputed values 67 | ## from the different methods in different colors 68 | ## Reproduces Figure 6 in the manuscript 69 | 70 | dim(MissData) 71 | sum(is.na(MissData[,1:20])) ## 190 72 | 73 | png("Figure6.png", width = 7, height = 7, units = 'in', res = 600) 74 | 75 | col.black <- rgb(0,0,0,alpha=20,maxColorValue=255) 76 | col.black2 <- rgb(0,0,0,alpha=255,maxColorValue=255) 77 | 78 | col.blue <- rgb(0,0,255,alpha=200,maxColorValue=255) 79 | col.red <- rgb(255,0,0,alpha=200,maxColorValue=255) 80 | col.green <- rgb(0,255,0,alpha=200,maxColorValue=255) 81 | 82 | idx.na <- which(is.na(MissData[,1:20])) 83 | 84 | plot(rep(1:20, each = 50)[-idx.na], CompleteData[,1:20][-idx.na], 85 | xlab = "Metabolite", ylab = "Intensity Values", xlim=c(0, 20.5), 86 | pch = 1, ylim = c(-8,8), col = col.black, cex = 0.9) ##, xaxt='n') 87 | 88 | ## Just use 'points' to add to a plot rather than the 'plot' command 89 | points(rep(1:20, each = 50)[idx.na], CompleteData[,1:20][idx.na], 90 | pch = 8, col = col.black2, cex = 0.9) 91 | 92 | ## Need to make these colors more transparent so we can see all of them 93 | ## Shift these slightly to the right when plotting ... 94 | 95 | points(rep(1:20, each = 50)[idx.na]+0.23, t(kNN_Trunc_Imp[1:20,])[idx.na], 96 | pch=17, ylim = c(-8,8), col = col.blue, cex = 0.9) ## Trun 97 | points(rep(1:20, each = 50)[idx.na]+0.46, t(kNN_Corr_Imp[1:20,])[idx.na], 98 | pch=15, ylim = c(-8,8),col = col.red, cex = 0.9) ## Corr 99 | points(rep(1:20, each = 50)[idx.na]+0.7, t(kNN_Euc_Imp[1:20,])[idx.na], 100 | pch=18, ylim = c(-8,8),col = col.green, cex = 0.9) ## EUC 101 | 102 | legend("topleft", c("Original-Observed", "Orginal-Missing", "KNN-TN", "KNN-CR", "KNN-EU"), 103 | pch = c(1, 8, 17, 15, 18), col = c(1, 1, 4, 2, 3), bty = "n", 104 | text.col = c(1, 1, 4, 2, 3), pt.cex = 1) 105 | 106 | ## Color region below LOD light red 107 | col.red2 <- rgb(255,0,0,alpha=25,maxColorValue=255) 108 | lims <- par("usr") 109 | polygon(x = c(lims[1], lims[2], lims[2], lims[1]), y = c(LOD, LOD, lims[3], lims[3]), col = col.red2, 110 | border = NA) 111 | 112 | dev.off() 113 | 114 | -------------------------------------------------------------------------------- /GSimp.R: -------------------------------------------------------------------------------- 1 | require(missForest) 2 | require(imputeLCMD) 3 | require(magrittr) 4 | require(foreach) 5 | require(doParallel) 6 | require(MASS) 7 | 8 | ## Source ## 9 | source('MVI_global.R') 10 | source('Prediction_funcs.R') 11 | 12 | ## Draw n samples from a truncated normal distribution N(mu, std^2|[lo, hi]) ## 13 | rnorm_trunc <- function (n, mu, std, lo=-Inf, hi=Inf) { 14 | p_lo <- pnorm(lo, mu, std) 15 | p_hi <- pnorm(hi, mu, std) 16 | p_hi[p_hi < .01] <- .01 17 | u <- runif(n, p_lo, p_hi) 18 | return(qnorm(u, mu, std)) 19 | } 20 | 21 | ## Initialize the missing data ## 22 | ## lsym will draw samples from the right tail of the distribution and transformed to the left tail 23 | miss_init <- function(miss_data, method=c('lsym', 'qrilc', 'rsym')[1]) { 24 | init_data <- miss_data 25 | if (method=='lsym') { 26 | for (i in 1:ncol(init_data)) { 27 | col_temp <- init_data[, i] 28 | na_idx <- which(is.na(col_temp)) 29 | prop <- mean(is.na(col_temp)) 30 | min_temp <- min(col_temp, na.rm=T) 31 | col_temp[na_idx] <- min_temp - 1 32 | med_temp <- median(col_temp) 33 | col_temp[na_idx] <- med_temp - (sample(col_temp[col_temp >= quantile(col_temp, 1-prop)], length(na_idx), replace=T) - med_temp) 34 | init_data[, i] <- col_temp 35 | } 36 | } 37 | if (method=='rsym') { 38 | for (i in 1:ncol(init_data)) { 39 | col_temp <- init_data[, i] 40 | na_idx <- which(is.na(col_temp)) 41 | prop <- mean(is.na(col_temp)) 42 | max_temp <- max(col_temp, na.rm=T) 43 | col_temp[na_idx] <- max_temp + 1 44 | med_temp <- median(col_temp) 45 | col_temp[na_idx] <- med_temp + (med_temp - sample(col_temp[col_temp<=quantile(col_temp, prop)], length(na_idx), replace=T)) 46 | init_data[, i] <- col_temp 47 | } 48 | } 49 | if (method=='qrilc') { 50 | init_data <- impute.QRILC(miss_data)[[1]] 51 | } 52 | return(init_data) 53 | } 54 | 55 | ## Single missing variable imputation based on Gibbs sampler ## 56 | single_impute_iters <- function(x, y, y_miss, y_real=NULL, imp_model='glmnet_pred', lo=-Inf, hi=Inf, iters_each=100, gibbs=c()) { 57 | y_res <- y 58 | x <- as.matrix(x) 59 | na_idx <- which(is.na(y_miss)) 60 | imp_model_func <- getFunction(imp_model) 61 | nrmse_vec <- c() 62 | gibbs_res <- array(NA, dim=c(3, length(gibbs), iters_each)) 63 | dimnames(gibbs_res) <- list(c('std', 'yhat', 'yres'), NULL, NULL) 64 | 65 | for (i in 1:iters_each) { 66 | y_hat <- imp_model_func(x, y_res) 67 | std <- sqrt(sum((y_hat[na_idx]-y_res[na_idx])^2)/length(na_idx)) 68 | y_res[na_idx] <- rnorm_trunc(length(na_idx), y_hat[na_idx], std, lo, hi) 69 | if (length(gibbs)>0) { 70 | gibbs_res[1, , i] <- std 71 | gibbs_res[2, , i] <- y_hat[gibbs] 72 | gibbs_res[3, , i] <- y_res[gibbs] 73 | } 74 | ## The following code is for prediction function testing when y_real availabe ## 75 | if (!is.null(y_real)) { 76 | Sys.sleep(.5) 77 | par(mfrow=c(2, 2)) 78 | nrmse_vec <- c(nrmse_vec, nrmse(y_res, y_miss, y_real)) 79 | plot(y_real~y_res) 80 | plot(y_real~y_hat) 81 | plot(y_hat~y_res) 82 | plot(nrmse_vec) 83 | } 84 | } 85 | return(list(y_imp=y_res, gibbs_res=gibbs_res)) 86 | } 87 | 88 | 89 | ## Multiple missing variables imputation ## 90 | ## iters_each=number (100); vector of numbers, e.g. rep(100, 20) while iters_all=20 91 | ## lo/hi=numer; vector; functions like min/max/median/mean... 92 | ## initial=character ('qrilc'/'lysm'); initialized data maatrix 93 | ## n_cores=1 is sequentially (non-parallel) computing 94 | multi_impute <- function(data_miss, iters_each=100, iters_all=20, initial='qrilc', lo=-Inf, hi='min', 95 | n_cores=1, imp_model='glmnet_pred', gibbs=data.frame(row=integer(), col=integer())) { 96 | ## Convert to data.frame ## 97 | data_miss %<>% data.frame() 98 | 99 | ## Make vector for iters_each ## 100 | if (length(iters_each)==1) { 101 | iters_each <- rep(iters_each, iters_all) 102 | } else if (length(iters_each)==iters_all) { 103 | iters_each <- iters_each 104 | } else {stop('improper argument: iters_each')} 105 | 106 | 107 | ## Missing count in each column ## 108 | miss_count <- data_miss %>% apply(., 2, function(x) sum(is.na(x))) 109 | ## Index of missing variables, sorted (increasing) by the number of missings 110 | miss_col_idx <- order(miss_count, decreasing = T) %>% extract(1:sum(miss_count!=0)) %>% rev() 111 | 112 | if (!all(gibbs$col %in% miss_col_idx)) {stop('improper argument: gibbs')} 113 | gibbs_sort <- gibbs 114 | if (nrow(gibbs_sort)>0) { 115 | gibbs_sort$order <- c(1:nrow(gibbs_sort)) 116 | gibbs_sort <- gibbs_sort[order(gibbs_sort$row), ] 117 | gibbs_sort <- gibbs_sort[order(match(gibbs_sort$col, miss_col_idx)), ] 118 | } else {gibbs_sort$order <- integer()} 119 | 120 | ## Make vectors for lo and hi ## 121 | if (length(lo)>1) { 122 | if (length(lo)!=ncol(data_miss)) {stop('Length of lo should equal to one or the number of variables')} 123 | else {lo_vec <- lo} 124 | } else if (is.numeric(lo)) { 125 | lo_vec <- rep(lo, ncol(data_miss)) 126 | } else if (is.character(lo)) { 127 | lo_fun <- getFunction(lo) 128 | lo_vec <- apply(data_miss, 2, function(x) x %>% na.omit %>% lo_fun) 129 | } 130 | 131 | if (length(hi)>1) { 132 | if (length(hi)!=ncol(data_miss)) {stop('Length of hi should equal to one or the number of variables')} 133 | else {hi_vec <- hi} 134 | } else if (is.numeric(hi)) { 135 | hi_vec <- rep(hi, ncol(data_miss)) 136 | } else if (is.character(hi)) { 137 | hi_fun <- getFunction(hi) 138 | hi_vec <- apply(data_miss, 2, function(x) x %>% na.omit %>% hi_fun) 139 | } 140 | 141 | # Check whether lo is lower than hi 142 | if(!all(lo_vec < hi_vec)) {stop('lo should be lower than hi')} 143 | 144 | ## Initialization using build-in method or input initial matrix ## 145 | if(is.character(initial)) { 146 | data_init <- miss_init(data_miss, method=initial) 147 | } else if(is.data.frame(initial) & identical(data_miss[!is.na(data_miss)], initial[!is.na(data_miss)])) { 148 | data_init <- initial 149 | } else {stop('improper argument: initial')} 150 | 151 | data_imp <- data_init 152 | gibbs_res_final <- array(NA, dim=c(3, nrow(gibbs), 0)) 153 | 154 | ## Iterations for the whole data matrix ## 155 | for (i in 1:iters_all) { 156 | cat('Iteration', i, 'start...') 157 | 158 | ## Parallel computing ## 159 | if (n_cores>1) { 160 | cat(paste0('Parallel computing (n_cores=', n_cores, ')...')) 161 | ## Parallel on missing variables 162 | cl <- makeCluster(n_cores) 163 | registerDoParallel(cl) 164 | core_res <- foreach (k=miss_col_idx, .combine='cbind_abind', .export=c('single_impute_iters', 'rnorm_trunc'), .packages=c('magrittr')) %dopar% { 165 | source('Prediction_funcs.R') 166 | gibbs_sort_temp <- gibbs_sort[gibbs_sort$col==k, ] 167 | y_imp_res <- single_impute_iters(data_imp[, -k], data_imp[, k], data_miss[, k], imp_model=imp_model, 168 | lo=lo_vec[k], hi=hi_vec[k], iters_each=iters_each[i], gibbs=gibbs_sort_temp$row) 169 | y_imp_df <- y_imp_res$y_imp %>% data.frame 170 | colnames(y_imp_df) <- colnames(data_miss)[k] 171 | gibbs_res <- y_imp_res$gibbs_res 172 | list(y_imp=y_imp_df, gibbs_res=gibbs_res) 173 | } 174 | stopCluster(cl) 175 | y_imp_df <- core_res$y_imp 176 | gibbs_res_final <- abind(gibbs_res_final, core_res$gibbs_res, along=3) 177 | miss_col_idx_match <- match(colnames(y_imp_df), colnames(data_miss)) 178 | data_imp[, miss_col_idx_match] <- y_imp_df 179 | } else { 180 | ## Sequential computing ## 181 | gibbs_res_j <- array(NA, dim=c(3, 0, iters_each[i])) 182 | for (j in miss_col_idx) { 183 | gibbs_sort_temp <- gibbs_sort[gibbs_sort$col==j, ] 184 | y_miss <- data_miss[, j] 185 | y_imp_res <- single_impute_iters(data_imp[, -j], data_imp[, j], y_miss, imp_model=imp_model, lo=lo_vec[j], hi=hi_vec[j], 186 | iters_each=iters_each[i], gibbs=gibbs_sort_temp$row) 187 | y_imp <- y_imp_res$y_imp 188 | gibbs_res_j <- abind(gibbs_res_j, y_imp_res$gibbs_res, along=2) 189 | data_imp[is.na(y_miss), j] <- y_imp[is.na(y_miss)] 190 | } 191 | gibbs_res_final <- abind(gibbs_res_final, gibbs_res_j, along=3) 192 | } 193 | cat('end!\n') 194 | } 195 | gibbs_res_final_reorder <- gibbs_res_final[, gibbs_sort$order, ] 196 | return(list(data_imp=data_imp, gibbs_res=gibbs_res_final_reorder)) 197 | } 198 | 199 | 200 | # GS_impute --------------------------------------------------------------- 201 | GS_impute <- multi_impute 202 | -------------------------------------------------------------------------------- /targeted_data.csv: -------------------------------------------------------------------------------- 1 | ,cmpd_1,cmpd_2,cmpd_3,cmpd_4,cmpd_5,cmpd_6,cmpd_7,cmpd_8,cmpd_9,cmpd_10,cmpd_11,cmpd_12,cmpd_13,cmpd_14,cmpd_15,cmpd_16,cmpd_17,cmpd_18,cmpd_19,cmpd_20,cmpd_21,cmpd_22,cmpd_23,cmpd_24,cmpd_25,cmpd_26,cmpd_27,cmpd_28,cmpd_29,cmpd_30,cmpd_31,cmpd_32,cmpd_33,cmpd_34,cmpd_35,cmpd_36,cmpd_37,cmpd_38,cmpd_39,cmpd_40,cmpd_41 2 | S_1,5.174,6.02,4.748,39.37,3.458,89.591,0.318,NA,0.298,8.198,0.193,0.07,0.309,NA,4.456,0.436,314.773,64.233,0.146,0.574,NA,0.833,231.716,191.016,18.616,83.793,0.673,1.764,2.615,2.96,13.623,0.61,5.442,51.379,364.91,20.49,18.316,1.317,19.519,1.978,5.952 3 | S_2,39.718,0.944,1.419,34.127,4.326,67.739,1.534,NA,0.365,10.988,0.201,1.402,2.354,0.062,5.76,1.793,331.675,129.223,1.12,1.671,0.129,0.333,201.174,171.271,16.742,64.277,2.926,1.45,1.222,2.029,6.312,1.225,9.89,79.128,209.535,34.141,37.701,7.47,86.55,2.813,4.835 4 | S_3,20.301,0.329,0.684,49.892,2.428,58.887,1.611,0.699,0.127,9.456,0.303,2.231,8.175,0.593,6.549,9.042,153.59,139.014,0.584,0.861,0.059,0.324,69.377,27.322,18.077,50.369,NA,1.61,0.561,0.439,0.673,0.403,4.131,83.212,171.39,51.544,21.034,6.446,112.794,3.835,12.319 5 | S_4,20.837,0.541,0.765,35.853,2.44,17.473,0.263,1.047,NA,1.261,0.259,2.633,8.725,1.181,3.255,9.174,60.514,106.88,1.133,4.812,NA,0.415,112.557,48.72,23.533,26.465,0.753,1.504,0.53,0.336,0.597,0.379,6.5,31.306,108.458,27.742,7.181,1.94,100.889,5.067,14.813 6 | S_5,45.223,0.72,0.653,9.704,0.799,19.411,0.032,0.171,0.085,2.187,0.744,4.366,4.467,0.335,1.554,2.784,39.351,129.151,3.018,2.557,0.431,0.316,47.3,33.529,27.452,13.994,1.145,1.803,0.244,0.739,1.506,0.196,0.812,5.171,159.035,53.354,10.547,7.823,143.802,0.914,3.327 7 | S_6,62.464,0.258,0.327,44.7,2.04,37.057,1.8,0.724,0.194,6.179,NA,0.88,3.565,0.287,0.41,1.172,8.655,71.585,2.832,1.743,1.044,0.21,60.209,13.012,8.534,6.533,0.318,1.775,0.164,0.816,NA,0.239,0.466,2.039,220.046,124.045,19.321,18.797,134.378,0.731,2.298 8 | S_7,13.494,2.612,1.139,55.735,2.232,182.772,0.502,0.14,0.11,15.805,0.218,4.027,14.195,0.296,3.686,6.625,50.221,383.257,19.567,6.84,1.275,0.351,162.698,40.668,52.9,38.4,NA,1.848,0.644,2.999,0.769,0.69,2.043,17.999,467.935,247.656,18.989,14.43,160.21,2.974,9.557 9 | S_8,11.554,1.799,0.989,28.458,1.512,176.7,0.927,0.199,NA,11.424,0.22,5.793,8.632,0.204,2.15,3.241,22.511,186.676,5.452,1.743,0.005,0.691,201.906,20.223,37.056,28.33,NA,0.65,0.399,0.968,0.781,0.699,1.621,7.624,173.908,69.201,13.429,7.043,98.463,2.876,10.935 10 | S_9,47.865,0.292,1.575,65.121,5.485,60.606,1.993,0.812,1.188,6.173,0.342,4.868,1.631,0.63,7.663,1.583,108.176,93.859,2.517,4.605,0.204,0.087,62.283,40.711,19.14,52.994,NA,2.041,0.977,0.466,0.653,0.36,3.874,25.771,271.643,57.826,28.56,5.924,129.484,0.944,2.497 11 | S_10,13.312,0.232,0.46,15.75,1.459,10.616,0.002,NA,NA,1.505,0.287,1.979,0.897,0.68,1.96,0.418,59.023,52.92,1.23,2.048,NA,NA,62.329,15.88,5.977,11.016,0.34,1.266,0.753,0.172,0.466,0.453,2.549,19.491,68.809,23.233,6.869,2.497,64.278,0.605,3.321 12 | S_11,1.298,5.851,6.133,252.956,9.987,316.321,11.606,NA,1.265,39.731,0.174,NA,NA,NA,7.386,NA,199.751,15.198,NA,0.042,NA,0.265,211.943,49.463,4.604,105.822,0.363,1.492,1.802,0.479,1.815,1.977,5.782,45.318,766.385,7.525,84.313,1.07,5.568,4.126,12.569 13 | S_12,5.754,3.338,2.469,274.048,8.583,165.402,17.405,0.467,0.762,22.156,0.171,0.233,0.232,NA,1.715,0.206,45.447,17.672,0.088,0.151,NA,0.159,335.569,49.69,3.79,41.105,1.578,0.975,0.834,0.546,0.414,3.411,3.263,3.58,781.469,35.337,102.362,5.946,13.397,1.901,7.871 14 | S_13,0.92,5.006,4.022,194.742,6.19,84.355,5.52,0.144,0.529,10.317,0.402,NA,0.184,NA,8.846,NA,300.015,NA,NA,0.015,NA,0.797,342.823,104.082,NA,147.109,0.153,2.077,1.581,1.216,6.554,2.961,5.675,39.312,577.057,NA,64.805,NA,2.794,1.219,4.41 15 | S_14,0.687,7.835,5.241,105.538,1.426,24.812,2.586,NA,0.098,3.051,0.262,NA,NA,NA,5.552,NA,93.956,NA,NA,NA,NA,1.067,372.969,67.955,NA,95.63,0.203,1.521,1.351,1.346,3.754,3.911,5.02,17.141,237.88,NA,17.873,NA,1.943,1.647,4.533 16 | S_15,1.634,1.803,1.237,30.915,2.073,29.323,1.319,NA,0.454,5.752,2.557,2.413,1.97,0.492,2.845,0.434,58.456,10.603,4.036,1.894,0.241,0.456,450.848,142.41,NA,35.014,0.61,0.416,2.183,0.985,1.78,1.897,1.942,10.197,89.661,2.887,11.186,0.973,8.887,1.05,3.256 17 | S_16,0.793,1.646,1.656,34.936,0.86,41.562,1.871,NA,0.126,4.676,1.162,0.569,0.603,0.167,1.551,0.281,38.311,1.339,1.915,0.587,0.055,0.173,221.731,73.185,NA,23.524,0.206,0.428,1.18,0.489,1.743,0.985,1.776,12.556,119.517,NA,20.407,0.116,3.516,0.687,2.399 18 | S_17,4.639,0.703,0.996,15.028,2.539,68.035,0.462,0.469,0.13,8.617,0.606,0.559,0.831,0.18,4.311,0.934,170.712,53.427,1.298,0.941,NA,0.353,80.348,187.133,NA,32.442,NA,0.792,1.174,1.242,4.547,0.277,2.641,20.361,145.208,15.97,23.137,2.987,23.13,11.069,36.744 19 | S_18,29.584,0.254,0.381,29.131,0.926,25.176,0.769,0.331,0.16,6.46,0.597,3.056,3.155,0.233,1.076,2.8,19.282,55.477,4.613,2.943,0.486,0.202,46.547,23.598,17.695,8.72,0.97,1.564,0.389,0.031,0.739,0.128,1.717,7.69,207.89,32.425,18.332,4.117,99.052,6.626,23.525 20 | S_19,2.247,1.355,1.062,35.494,2.2,236.575,2.02,0.176,0.303,51.759,0.388,0.128,1.351,NA,12.805,1.076,506.565,130.806,1.314,0.917,0.242,0.2,33.339,39.633,7.822,42.158,0.339,2.12,2.339,1.811,4.55,1.037,7.306,279.09,551.469,53.018,93.238,8.731,9.264,14.734,49.116 21 | S_20,10.312,1.149,0.906,51.682,2.117,112.575,3.034,0.672,0.211,26.947,0.313,1.597,2.4,0.411,3.501,2.32,126.721,125.593,2.364,1.046,0.228,0.22,85.775,18.685,12.157,32.172,0.72,1.12,0.451,0.946,1.231,1.733,4.048,75.427,260.953,105.616,43.753,17.667,44.125,10.435,29.533 22 | S_21,2.465,0.563,0.623,16.507,2.511,36.887,0.581,0.232,0.314,5.351,0.276,0.256,0.532,NA,0.886,0.557,55.083,34.074,0.512,0.172,0.083,0.017,21.425,25.906,6.014,15.858,0.203,0.881,0.285,0.199,0.741,0.064,2.67,8.815,178.557,16.02,25.397,2.47,6.615,3.739,10.467 23 | S_22,16.122,0.609,0.462,25.255,5.391,39.374,0.856,1.066,1.239,5.903,0.292,2.593,1.878,0.274,1.05,1.664,38.032,88.687,1.194,0.784,0.356,0.298,55.324,64.297,32.58,16.67,0.777,1.19,0.213,0.817,2.36,0.154,2.998,5.595,228.82,53.788,29.397,9.642,30.307,2.104,6.779 24 | S_23,13.876,0.436,0.391,8.748,0.002,7.56,NA,0.168,NA,1.229,0.243,1.3,1.583,0.144,1.243,1.073,53.766,66.909,0.429,0.628,0.046,0.158,62.643,57.661,27.15,17.775,0.604,1.105,NA,0.661,2.25,0.011,1.194,4.185,79.008,13.644,7.851,2.265,21.161,2.573,9.616 25 | S_24,28.111,0.544,0.347,17.914,0.276,4.517,0.196,0.136,NA,0.943,NA,8.344,9.574,0.49,0.575,2.536,13.283,93.641,3.814,4.43,0.143,0.238,105.397,18.654,26.156,20.663,1.389,1.034,NA,0.343,0.463,0.266,1.761,2.931,39.157,22.966,5.231,3.536,151.119,2.757,10.715 26 | S_25,6.556,1.355,0.624,109.368,5.348,171.778,2.167,1.448,0.317,9.154,0.271,1.821,1.7,0.391,3.576,1.597,72.615,125.298,0.975,1.196,0.081,0.442,153.716,24.885,17.719,39.553,0.834,0.745,0.644,1.46,1.015,1.79,3.341,35.805,637.192,151.805,38.842,9.87,44.79,1.349,4.221 27 | S_26,23.768,0.747,0.516,26.574,3.084,38.849,0.148,1.395,0.132,2.165,0.281,3.755,1.936,0.558,1.674,1.253,101.525,142.283,0.781,5.298,0.049,0.221,75.046,17.34,22.179,26.364,1.696,0.698,1.043,0.949,1.6,2.083,8.725,99.959,202.592,52,9.898,4.518,89.169,0.921,3.193 28 | S_27,1.217,0.338,0.571,43.032,5.363,114.289,1.652,0.501,0.651,13.696,NA,0.179,0.282,0.053,6.411,0.538,529.31,61.619,0.077,0.091,NA,0.292,42.058,49.197,6.384,55.808,0.184,0.624,0.798,1.088,2.066,0.81,4.907,274.201,447.99,14.635,48.403,2.214,5,0.489,2.948 29 | S_28,33.759,0.364,0.391,91.458,18.366,183.816,3.666,3.933,2.129,27.982,0.391,6.677,5.421,0.19,3.674,4.191,45.088,155.907,2.544,0.684,0.654,1.056,121.298,32.704,26.447,26.494,NA,0.964,0.383,2.026,1.043,0.984,2.541,13.741,585.296,184.318,79.569,29.592,93.908,0.858,2.05 30 | S_29,1.3,6.124,4.91,198.638,5.868,398.288,18.18,0.286,1.489,119.941,0.279,NA,0.082,NA,5.556,0.23,705.262,11.469,0.121,0.285,NA,0.563,158.357,49.01,NA,137.6,0.092,3.016,1.394,1.385,5.361,0.999,8.872,139.511,710.813,12.143,193.474,1.997,2.621,1.994,5.585 31 | S_30,10.524,1.887,2.151,252.906,3.787,299.865,23.928,0.462,0.601,62.472,0.267,2.824,4.203,0.33,6.36,2.331,141.179,100.992,1.057,2.831,0.115,0.563,226.767,50.233,28.029,106.985,0.63,1.783,1.337,1.609,3.052,1.759,6.344,33.005,639.664,93.228,164.417,15.657,42.335,2.241,5.004 32 | S_31,3.884,0.463,2.278,21.386,2.103,137.837,2.677,2.766,1.089,158.059,0.218,NA,0.332,0.161,12.838,0.902,610.853,232.315,0.136,2.412,NA,0.125,37.374,46.004,6.547,85.897,0.732,0.677,9.106,2.2,5.795,2.408,18.85,897.304,372.799,69.022,153.888,26.172,6.481,1.201,7.78 33 | S_32,11.163,0.453,1.123,108.63,12.03,584.856,6.769,4.404,1.646,146.173,0.198,1.022,1.427,0.273,2.269,2.418,209.48,150.898,3.017,2.811,0.112,0.191,46.713,33.778,14.715,46.937,1.641,1.064,2.097,5.079,3.358,0.775,18.195,196.763,1565.334,403.11,210.591,43.66,40.522,3.895,10.157 34 | S_33,2.567,0.241,0.611,37.273,1.936,20.662,0.756,NA,0.113,1.486,0.395,6.655,1.992,0.295,8.028,1.203,206.988,107.935,0.76,0.78,0.005,0.194,82.083,45.99,32.743,67.539,0.451,0.881,1.478,0.404,0.767,0.856,7.88,188.903,119.738,16.385,7.901,1.226,28.376,0.914,3.025 35 | S_34,0.93,0.82,0.576,65.002,5.521,39.753,0.95,1.508,0.371,2.219,0.176,0.23,0.136,0.238,0.773,0.353,101.911,24.206,0.125,0.067,NA,NA,56.476,51.987,9.395,25.05,0.284,2.038,0.392,1.015,3.737,0.358,5.098,27.036,204.883,20.787,13.904,1.875,5.524,1.958,4.52 36 | S_35,17.579,0.903,0.365,17.297,1.841,25.455,0.168,1.022,0.284,2.919,NA,8.921,6.821,0.252,1.411,2.046,30.813,272.549,10.461,1.732,0.748,0.5,121.724,28.403,50.879,22.269,NA,1.19,0.313,0.533,NA,0.886,1.165,4.041,134.701,76.801,8.606,7.004,94.19,0.413,1.222 37 | S_36,33.231,0.65,0.669,35.592,4.054,101.295,0.476,0.954,0.203,11.477,0.215,3.91,2.669,0.275,3.097,0.849,164.07,216.932,11.871,1.953,0.859,0.172,78.618,45.216,29.266,31.666,0.549,1.478,1.059,2.115,1.367,0.82,2.153,24.807,414.127,122.889,26.451,11.75,115.91,1.032,1.839 38 | S_37,229.991,0.227,0.393,17.042,6.033,56.722,5.168,4.443,6.404,31.846,0.715,4.922,5.895,0.272,0.246,2.209,20.043,26.864,5.407,1.667,2.873,0.324,21.366,13.045,6.74,5.379,2.051,1.402,NA,0.468,NA,0.227,2.888,4.466,222.928,29.023,159.985,32.34,147.602,1.002,2.591 39 | S_38,89.414,NA,NA,14.657,6.341,26.124,1.371,2.293,2.611,9.735,0.606,2.278,2.474,0.159,0.805,0.684,28.3,6.237,3.15,1.69,0.34,0.07,4.851,4.646,0.137,3.286,2.17,3.013,0.422,0.035,0.438,0.202,12.082,19.453,178.653,9.243,58.867,6.041,69.683,1.32,3.299 40 | S_39,0.952,1.319,4.325,86.893,3.336,32.777,1.843,0.157,0.224,2.429,0.244,NA,0.16,NA,4.327,0.4,214.91,23.578,0.102,0.502,NA,0.109,57.626,16.705,0.514,65.994,0.193,2.14,1.178,0.712,1.339,1.168,11.177,169.394,300.794,7.423,20.172,0.63,3.68,9.628,32.488 41 | S_40,2.377,1.201,1.704,57.98,3.469,34.348,2.319,0.119,0.799,3.384,0.219,0.055,0.096,NA,8.046,0.293,174.113,12.783,NA,0.998,NA,0.245,168.39,31.883,NA,52.677,0.281,1.931,3.153,0.823,2.935,2.313,19.475,165.938,248.123,1.386,26.169,0.051,7.054,10.523,34.653 -------------------------------------------------------------------------------- /Trunc_KNN/Imput_funcs.r: -------------------------------------------------------------------------------- 1 | ################################################################################## 2 | #### MLE for the Truncated Normal 3 | #### Creating a Function that Returns the Log Likelihood, Gradient and 4 | #### Hessian Functions 5 | ################################################################################## 6 | 7 | ## data = numeric vector 8 | ## t = truncation limits 9 | mklhood <- function(data, t, ...) { 10 | 11 | data <- na.omit(data) 12 | n <- length(data) 13 | t <- sort(t) 14 | 15 | psi<-function(y, mu, sigma){ 16 | exp(-(y-mu)^2/(2*sigma^2))/(sigma*sqrt(2*pi)) 17 | } 18 | 19 | psi.mu<-function(y,mu,sigma){ 20 | exp(-(y-mu)^2/(2*sigma^2)) * ((y-mu)/(sigma^3*sqrt(2*pi))) 21 | } 22 | 23 | psi.sigma<-function(y,mu,sigma){ 24 | exp(-(y-mu)^2/(2*sigma^2)) * 25 | (((y-mu)^2)/(sigma^4*sqrt(2*pi)) - 1/(sigma^2*sqrt(2*pi))) 26 | } 27 | 28 | psi2.mu<-function(y,mu,sigma){ 29 | exp(-(y - mu)^2/(2*sigma^2)) * 30 | (((y - mu)^2)/(sigma^5*sqrt(2*pi))-1/(sigma^3*sqrt(2*pi))) 31 | } 32 | 33 | psi2.sigma<-function(y,mu,sigma){ 34 | exp(-(y-mu)^2/(2*sigma^2)) * 35 | ((2)/(sigma^3*sqrt(2*pi)) - (5*(y-mu))/(sigma^5*sqrt(2*pi)) + 36 | ((y-mu)^4)/(sigma^7*sqrt(2*pi))) 37 | } 38 | 39 | psi12.musig<-function(y,mu,sigma){ 40 | exp(-(y-mu)^2/(2*sigma^2)) * 41 | (((y-mu)^3)/(sigma^6*sqrt(2*pi)) - (3*(y-mu))/(sigma^4*sqrt(2*pi))) 42 | } 43 | 44 | ll.tnorm2<-function(p){ 45 | out <- (-n*log(pnorm(t[2],p[1],p[2])-pnorm(t[1],p[1],p[2]))) - 46 | (n*log(sqrt(2*pi*p[2]^2))) - (sum((data-p[1])^2)/(2*p[2]^2)) 47 | -1*out 48 | } 49 | 50 | grad.tnorm<-function(p){ 51 | g1 <- (-n*(integrate(psi.mu,t[1],t[2],mu=p[1],sigma=p[2], stop.on.error = FALSE)$value) / 52 | (pnorm(max(t),p[1],p[2])-pnorm(min(t),p[1],p[2]))) - ((n*p[1]-sum(data))/p[2]^2) 53 | g2 <- (-n*(integrate(psi.sigma,t[1],t[2],mu=p[1],sigma=p[2], stop.on.error = FALSE)$value) / 54 | (pnorm(max(t),p[1],p[2])-pnorm(min(t),p[1],p[2]))) - ((n)/(p[2])) + ((sum((data-p[1])^2))/(p[2]^3)) 55 | out <- c(g1,g2) 56 | return(out) 57 | } 58 | 59 | hessian.tnorm<-function(p){ 60 | 61 | h1<- -n*(integrate(psi,t[1],t[2],mu=p[1],sigma=p[2], stop.on.error = FALSE)$value * 62 | integrate(psi2.mu,t[1],t[2],mu=p[1],sigma=p[2], stop.on.error = FALSE)$value - 63 | integrate(psi.mu,t[1],t[2],mu=p[1],sigma=p[2], stop.on.error = FALSE)$value^2) / 64 | (integrate(psi,t[1],t[2],mu=p[1],sigma=p[2], stop.on.error = FALSE)$value^2) - 65 | n/(p[2]^2) 66 | 67 | h3<- -n*(integrate(psi,t[1],t[2],mu=p[1],sigma=p[2], stop.on.error = FALSE)$value * 68 | integrate(psi12.musig,t[1],t[2],mu=p[1],sigma=p[2], stop.on.error = FALSE)$value - 69 | integrate(psi.mu,t[1],t[2],mu=p[1],sigma=p[2], stop.on.error = FALSE)$value * 70 | integrate(psi.sigma,t[1],t[2],mu=p[1],sigma=p[2], stop.on.error = FALSE)$value) / 71 | (integrate(psi,t[1],t[2],mu=p[1],sigma=p[2], stop.on.error = FALSE)$value^2) + 72 | (2*(n*p[1]-sum(data)))/(p[2]^3) 73 | 74 | h2<- -n*(integrate(psi,t[1],t[2],mu=p[1],sigma=p[2], stop.on.error = FALSE)$value * 75 | integrate(psi2.sigma,t[1],t[2],mu=p[1],sigma=p[2], stop.on.error = FALSE)$value - 76 | integrate(psi.sigma,t[1],t[2],mu=p[1],sigma=p[2], stop.on.error = FALSE)$value^2) / 77 | (integrate(psi,t[1],t[2],mu=p[1],sigma=p[2], stop.on.error = FALSE)$value^2) + 78 | (n)/(p[2]^2)-(3*sum((data-p[1])^2))/(p[2]^4) 79 | 80 | H<-matrix(0,nrow=2,ncol=2) 81 | H[1,1]<-h1 82 | H[2,2]<-h2 83 | H[1,2]<-H[2,1]<-h3 84 | return(H) 85 | } 86 | 87 | 88 | return(list(ll.tnorm2 = ll.tnorm2, grad.tnorm = grad.tnorm, hessian.tnorm = hessian.tnorm)) 89 | } 90 | ################################################################################## 91 | ###### Newton Raphson Function 92 | ###### This takes in the Objects Returned from mklhood Function above 93 | ################################################################################## 94 | 95 | NewtonRaphsonLike <- function(lhood, p, tol = 1e-07, maxit = 100) { 96 | 97 | cscore <- lhood$grad.tnorm(p) 98 | if(sum(abs(cscore)) < tol) 99 | return(list(estimate = p, value = lhood$ll.tnorm2(p), iter = 0)) 100 | cur <- p 101 | for(i in 1:maxit) { 102 | inverseHess <- solve(lhood$hessian.tnorm(cur)) 103 | cscore <- lhood$grad.tnorm(cur) 104 | new <- cur - cscore %*% inverseHess 105 | if (new[2] <= 0) stop("Sigma < 0") 106 | cscore <- lhood$grad.tnorm(new) 107 | 108 | if(((abs(lhood$ll.tnorm2(cur)- lhood$ll.tnorm2(new))/(lhood$ll.tnorm2(cur))) < tol)) 109 | return(list(estimate = new, value= lhood$ll.tnorm2(new), iter = i)) 110 | cur <- new 111 | } 112 | 113 | return(list(estimate = new, value= lhood$ll.tnorm2(new), iter = i)) 114 | } 115 | 116 | ################################################################################## 117 | ###### Based on the MLE Functions (mklhood) and NewtonRaphson Function 118 | ###### (NewtonRaphsonLike), This function estimates the MEAN and SD from the 119 | ###### Truncated using Newton Raphson. 120 | ################################################################################## 121 | 122 | ## missingdata = matrix where rows = features, columns = samples 123 | ## perc = if %MVs > perc then just sample mean / SD 124 | ## iter = # iterations in NR algorithm 125 | 126 | EstimatesComputation <- function(missingdata, perc, iter=50) { 127 | 128 | ## 2 column matrix where column 1 = means, column 2 = SD 129 | ParamEstim <- matrix(NA, nrow = nrow(missingdata), ncol = 2) 130 | nsamp <- ncol(missingdata) 131 | 132 | ## sample means / SDs 133 | ParamEstim[,1] <- rowMeans(missingdata, na.rm = TRUE) 134 | ParamEstim[,2] <- apply(missingdata, 1, function(x) sd(x, na.rm = TRUE)) 135 | 136 | ## Case 1: missing % > perc => use sample mean / SD 137 | na.sum <- apply(missingdata, 1, function(x) sum(is.na(x))) 138 | idx1 <- which(na.sum/nsamp >= perc) 139 | 140 | ## Case 2: sample mean > 3 SD away from LOD => use sample mean / SD 141 | lod <- min(missingdata, na.rm=TRUE) ## why use the min of whole data set?????? 142 | idx2 <- which(ParamEstim[,1] > 3*ParamEstim[,2] + lod) 143 | 144 | ## Case 3: for all others, use NR method to obtain truncated mean / SD estimate 145 | idx.nr <- setdiff(1:nrow(missingdata), c(idx1, idx2)) 146 | ## t = limits of integration (LOD and upper) 147 | upplim <- max(missingdata, na.rm=TRUE) + 2*max(ParamEstim[,2]) 148 | for (i in idx.nr) { 149 | Likelihood <- mklhood(missingdata[i,], t=c(lod, upplim)) 150 | res <- tryCatch(NewtonRaphsonLike(Likelihood, p = ParamEstim[i,]), 151 | error = function(e) 1000) 152 | 153 | if (length(res) == 1) { 154 | next 155 | } else if (res$iter >= iter) { 156 | next 157 | } else { 158 | ParamEstim[i,] <- as.numeric(res$estimate) 159 | } 160 | } 161 | return(ParamEstim) 162 | } 163 | 164 | 165 | 166 | #################################################################### 167 | #### This Function imputes the data BASED on KNN-EUCLIDEAN 168 | #################################################################### 169 | 170 | ## data = data set to be imputed, where rows = features, columns = samples 171 | ## k = number of neighbors for imputing values 172 | ## rm.na, rm.nan, rm.inf = whether NA, NaN, and Inf values should be imputed 173 | 174 | 175 | KNNEuc <- function (data, k, rm.na = TRUE, rm.nan = TRUE, rm.inf = TRUE) { 176 | 177 | nr <- dim(data)[1] 178 | 179 | imp.knn <- data 180 | imp.knn[is.finite(data) == FALSE] <- NA 181 | t.data<-t(data) 182 | 183 | mv.ind <- which(is.na(imp.knn), arr.ind = TRUE) 184 | arrays <- unique(mv.ind[, 2]) 185 | array.ind <- match(arrays, mv.ind[, 2]) 186 | nfeatures <- 1:nr 187 | 188 | for (i in 1:length(arrays)) { 189 | set <- array.ind[i]:min((array.ind[(i + 1)] - 1), dim(mv.ind)[1], na.rm = TRUE) 190 | cand.features <- nfeatures[-unique(mv.ind[set, 1])] 191 | cand.vectors <- t.data[,cand.features] 192 | exp.num <- arrays[i] 193 | 194 | for (j in set) { 195 | feature.num <- mv.ind[j, 1] 196 | tar.vector <- data[feature.num,] 197 | 198 | dist <- sqrt(colMeans((tar.vector-cand.vectors)^2, na.rm = TRUE)) 199 | dist[is.nan(dist) | is.na(dist)] <- Inf 200 | dist[dist==0] <- ifelse(is.finite(min(dist[dist>0])), min(dist[dist>0])/2, 1) 201 | 202 | if (sum(is.finite(dist)) < k) { 203 | stop(message = "Fewer than K finite distances found") 204 | } 205 | k.features.ind <- order(dist)[1:k] 206 | k.features <- cand.features[k.features.ind] 207 | wghts <- 1/dist[k.features.ind]/sum(1/dist[k.features.ind]) 208 | imp.knn[feature.num, exp.num] <- wghts %*% data[k.features, exp.num] 209 | } 210 | } 211 | 212 | if (!rm.na) { 213 | imp.knn[is.na(data) == TRUE & is.nan(data) == FALSE] <- NA 214 | } 215 | if (!rm.inf) { 216 | index <- is.finite(data) == FALSE & is.na(data) == FALSE & 217 | is.nan(data) == FALSE 218 | imp.knn[index] <- data[index] 219 | } 220 | if (!rm.nan) { 221 | imp.knn[is.nan(data) == TRUE] <- NaN 222 | } 223 | return(imp.knn) 224 | } 225 | 226 | #################################################################### 227 | #### This Function imputes the data based on KNN-CORRELATION or 228 | #### KNN-TRUNCATION. The Parameter Estimates based on the Truncated 229 | #### Normal from EstimateComputation function is run on this function 230 | #################################################################### 231 | 232 | imputeKNN <- function (data, k , distance = "correlation", 233 | rm.na = TRUE, rm.nan = TRUE, rm.inf = TRUE, perc=1,...) { 234 | 235 | if (!(is.matrix(data))) { 236 | stop(message = paste(deparse(substitute(data)), 237 | " is not a matrix.", sep = "")) 238 | } 239 | 240 | distance <- match.arg(distance, c("correlation","truncation")) 241 | 242 | nr <- dim(data)[1] 243 | if (k < 1 | k > nr) { 244 | stop(message = "k should be between 1 and the number of rows") 245 | } 246 | 247 | if (distance=="correlation"){ 248 | genemeans<-rowMeans(data,na.rm=TRUE) 249 | genesd<-apply(data, 1, function(x) sd(x, na.rm = TRUE)) 250 | data<-(data-genemeans)/genesd 251 | } 252 | 253 | if (distance=="truncation"){ 254 | 255 | ParamMat <- EstimatesComputation(data, perc = perc) 256 | 257 | genemeans<-ParamMat[,1] 258 | genesd<-ParamMat[,2] 259 | data<-(data-genemeans)/genesd 260 | } 261 | 262 | imp.knn <- data 263 | imp.knn[is.finite(data) == FALSE] <- NA 264 | t.data<-t(data) 265 | 266 | mv.ind <- which(is.na(imp.knn), arr.ind = TRUE) 267 | arrays <- unique(mv.ind[, 2]) 268 | array.ind <- match(arrays, mv.ind[, 2]) 269 | ngenes <- 1:nr 270 | 271 | for (i in 1:length(arrays)) { 272 | set <- array.ind[i]:min((array.ind[(i + 1)] - 1), dim(mv.ind)[1], 273 | na.rm = TRUE) 274 | cand.genes <- ngenes[-unique(mv.ind[set, 1])] 275 | cand.vectors <- t.data[,cand.genes] 276 | exp.num<- arrays[i] 277 | for (j in set) { 278 | 279 | gene.num <- mv.ind[j, 1] 280 | tar.vector <- data[gene.num,] 281 | 282 | r <- (cor(cand.vectors,tar.vector, use = "pairwise.complete.obs")) 283 | dist <- switch(distance, 284 | correlation = (1 - abs(r)), 285 | truncation = (1 - abs(r))) 286 | dist[is.nan(dist) | is.na(dist)] <- Inf 287 | dist[dist==0]<-ifelse(is.finite(min(dist[dist>0])), min(dist[dist>0])/2, 1) 288 | dist[abs(r) == 1] <- Inf 289 | 290 | if (sum(is.finite(dist)) < k) { 291 | stop(message = "Fewer than K finite distances found") 292 | } 293 | k.genes.ind <- order(dist)[1:k] 294 | k.genes <- cand.genes[k.genes.ind] 295 | 296 | wghts <- (1/dist[k.genes.ind]/sum(1/dist[k.genes.ind])) * sign(r[k.genes.ind]) 297 | imp.knn[gene.num, exp.num] <- wghts %*% data[k.genes, exp.num] 298 | } 299 | } 300 | 301 | if (distance=="correlation") { 302 | imp.knn <- (imp.knn * genesd) + genemeans 303 | } 304 | 305 | if(distance=="truncation") { 306 | imp.knn <- (imp.knn * genesd) + genemeans 307 | } 308 | 309 | if (!rm.na) { 310 | imp.knn[is.na(data) == TRUE & is.nan(data) == FALSE] <- NA 311 | } 312 | if (!rm.inf) { 313 | index <- is.finite(data) == FALSE & is.na(data) == FALSE & 314 | is.nan(data) == FALSE 315 | imp.knn[index] <- data[index] 316 | } 317 | if (!rm.nan) { 318 | imp.knn[is.nan(data) == TRUE] <- NaN 319 | } 320 | return(imp.knn) 321 | } 322 | 323 | ################################################################################## 324 | #### Root Mean Squared Error Function 325 | ################################################################################## 326 | Rmse <- function(imp, mis, true, norm = FALSE) { 327 | imp <- as.matrix(imp) 328 | mis <- as.matrix(mis) 329 | true <- as.matrix(true) 330 | missIndex <- which(is.na(mis)) 331 | errvec <- imp[missIndex] - true[missIndex] 332 | rmse <- sqrt(mean(errvec^2)) 333 | if (norm) { 334 | rmse <- rmse/sd(true[missIndex]) 335 | } 336 | return(rmse) 337 | } 338 | ################################################################################## 339 | #### Compute the Errors for the Different Imputation Methods 340 | ################################################################################## 341 | ErrorsComputation <- function(trunc, corr, euc, miss, complete) { 342 | 343 | ImputeErrors <- NULL 344 | missing <- t(miss) 345 | original <- t(complete) 346 | KnnTrunc <- Rmse(trunc, missing, original) 347 | KnnCorr <- Rmse(corr, missing, original) 348 | KnnEuc <- Rmse(euc, missing, original) 349 | ImputeErrors <- c(KnnTrunc, KnnCorr, KnnEuc) 350 | return(ImputeErrors) 351 | 352 | } 353 | 354 | 355 | 356 | -------------------------------------------------------------------------------- /GSimp_evaluation.R: -------------------------------------------------------------------------------- 1 | require(vegan) 2 | require(foreach) 3 | require(doParallel) 4 | require(reshape2) 5 | require(ggplot2) 6 | require(ropls) 7 | source('MVI_global.R') 8 | source('Impute_wrapper.R') 9 | 10 | # MNAR generation and imputation ------------------------------------------ 11 | MNAR_gen_imp <- function(data_c, mis_var_prop=seq(.1, .8, .1), var_mis_prop=seq(.1, .8, .1), 12 | impute_list=c('kNN_wrapper', 'SVD_wrapper', 'HM_wrapper', 'QRILC_wrapper'), cores=5) { 13 | if (cores==1) { 14 | results <- list() 15 | for (i in seq_along(mis_var_prop)) { 16 | prop <- mis_var_prop[i] 17 | data_m_res <- MNAR_generate(data_c, mis_var=prop, var_prop=var_mis_prop) 18 | data_m <- data_m_res[[1]] 19 | mis_idx <- data_m_res[[2]] 20 | result_list <- list() 21 | for (j in seq_along(impute_list)) { 22 | method=impute_list[j] 23 | print(method) 24 | result_list[[j]] <- do.call(method, list(data_m)) 25 | } 26 | result_list[[j+1]] <- mis_idx 27 | results[[i]] <- result_list 28 | } 29 | } else if (cores>1) { 30 | cl <- makeCluster(cores) 31 | registerDoParallel(cl) 32 | results <- foreach(prop=mis_var_prop, .export=impute_list, .packages=c('magrittr', 'impute', 'imputeLCMD')) %dopar% { 33 | source('MVI_global.R') 34 | data_m_res <- MNAR_generate(data_c, mis_var=prop, var_prop=var_mis_prop) 35 | data_m <- data_m_res[[1]] 36 | mis_idx <- data_m_res[[2]] 37 | result_list <- list() 38 | for (i in seq_along(impute_list)) { 39 | method=impute_list[i] 40 | print(method) 41 | result_list[[i]] <- do.call(method, list(data_m)) 42 | } 43 | result_list[[i+1]] <- mis_idx 44 | result_list 45 | } 46 | stopCluster(cl) 47 | } else {cat('Improper argument: cores!')} 48 | return(list(results=results, data_c=data_c, impute_list=impute_list)) 49 | } 50 | 51 | # NRMSE rank calculate and plot ------------------------------------------- 52 | NRMSE_rank_cal_plot <- function(impute_results, plot=T, x='Miss_Prop', colors=NULL, shapes=NULL) { 53 | name_list=gsub('(.*)_.*', '\\1', impute_results$impute_list) 54 | data_c <- impute_results$data_c 55 | impute_results_list <- impute_results$results 56 | nrmse_rank_df <- data.frame(matrix(0, length(impute_results_list), length(name_list) + 2)) 57 | colnames(nrmse_rank_df) <- c(name_list, 'Miss_Prop', 'Miss_Num') 58 | for (i in seq_along(impute_results_list)) { 59 | # print(i) 60 | results_temp <- impute_results_list[[i]] 61 | mis_idx_temp <- results_temp[[length(results_temp)]] 62 | var_idx_temp <- unique(mis_idx_temp[, 2]) 63 | data_m_temp <- data_c 64 | data_m_temp[results_temp[[length(results_temp)]]] <- NA 65 | temp_list <- rep(0, length(name_list)) 66 | for (j in var_idx_temp) { 67 | for (k in seq_along(name_list)) { 68 | temp_list[k] <- nrmse(results_temp[[k]][, j], data_m_temp[, j], data_c[, j]) 69 | } 70 | rank_temp <- rank(temp_list) 71 | nrmse_rank_df[i, seq_along(name_list)] <- nrmse_rank_df[i, seq_along(name_list)] + rank_temp 72 | } 73 | nrmse_rank_df$Miss_Prop[i] <- mean(is.na(data_m_temp)) 74 | nrmse_rank_df$Miss_Num[i] <- length(var_idx_temp) 75 | } 76 | 77 | nrmse_rank_df_melt <- melt(nrmse_rank_df[, c(name_list, x)], id.vars=x) 78 | colnames(nrmse_rank_df_melt)[-1] <- c('Method', 'NRMSE_Rank') 79 | if (plot) { 80 | if (is.null(colors) & is.null(shapes)) { 81 | print(ggplot(aes_string(x=x, y='NRMSE_Rank', color='Method'), data=nrmse_rank_df_melt) + 82 | geom_point(aes(shape=Method), size=4) + geom_line() + theme_bw()) 83 | } else if (!is.null(colors) & is.null(shapes)) { 84 | print(ggplot(aes_string(x=x, y='NRMSE_Rank', color='Method'), data=nrmse_rank_df_melt) + 85 | geom_point(aes(shape=Method), size=4) + geom_line() + theme_bw() + scale_color_manual(values=colors)) 86 | } else if (is.null(colors) & !is.null(shapes)) { 87 | print(ggplot(aes_string(x=x, y='NRMSE_Rank', color='Method'), data=nrmse_rank_df_melt) + 88 | geom_point(aes(shape=Method), size=4) + geom_line() + theme_bw() + scale_shape_manual(values=shapes)) 89 | } else { 90 | print(ggplot(aes_string(x=x, y='NRMSE_Rank', color='Method'), data=nrmse_rank_df_melt) + 91 | geom_point(aes(shape=Method), size=4) + geom_line() + theme_bw() + 92 | scale_shape_manual(values=shapes) + scale_color_manual(values=colors)) 93 | } 94 | } 95 | return(list(NRMSE_rank=nrmse_rank_df, NRMSE_rank_melt=nrmse_rank_df_melt)) 96 | } 97 | 98 | # Procrustes and plot ----------------------------------------------------- 99 | Procrustes_cal_plot <- function(impute_results, DR='PCA', lg=F, nPCs=2, outcome=NULL, x='Miss_Prop', plot=T, colors=NULL, shapes=NULL) { 100 | name_list=gsub('(.*)_.*', '\\1', impute_results$impute_list) 101 | data_c <- impute_results$data_c 102 | if (lg) {data_c %<>% log1p} 103 | impute_results_list <- impute_results$result 104 | procruste_df <- data.frame(matrix(NA, length(impute_results_list), length(name_list) + 2)) 105 | colnames(procruste_df) <- c(name_list, 'Miss_Prop', 'Miss_Num') 106 | if (DR == 'PCA') { 107 | data_c_pca <- prcomp(data_c, scale.=T, center=T)$x[, 1:nPCs] 108 | for (i in seq_along(impute_results_list)) { 109 | # print(i) 110 | results_temp <- impute_results_list[[i]] 111 | mis_idx_temp <- results_temp[[length(results_temp)]] 112 | var_idx_temp <- unique(mis_idx_temp[, 2]) 113 | data_m_temp <- data_c 114 | data_m_temp[results_temp[[length(results_temp)]]] <- NA 115 | for (j in seq_along(name_list)) { 116 | # print(name_list[j]) 117 | data_imp_temp <- results_temp[[j]] 118 | if (lg) {data_imp_temp %<>% log1p} 119 | procruste_df[i, j] <- procrustes(data_c_pca, prcomp(data_imp_temp, scale.=T, center=T)$x[, 1:nPCs], symmetric=T)$ss 120 | } 121 | procruste_df$Miss_Prop[i] <- mean(is.na(data_m_temp)) 122 | procruste_df$Miss_Num[i] <- length(var_idx_temp) 123 | } 124 | } 125 | else if (DR == 'PLS' & !is.null(outcome)) { 126 | data_c_pls <- data.frame(opls(data_c, outcome, predI=nPCs, permI=0, plotL=F, printL=F)@scoreMN) 127 | for (i in seq_along(impute_results_list)) { 128 | # print(i) 129 | results_temp <- impute_results_list[[i]] 130 | mis_idx_temp <- results_temp[[length(results_temp)]] 131 | var_idx_temp <- unique(mis_idx_temp[, 2]) 132 | data_m_temp <- data_c 133 | data_m_temp[results_temp[[length(results_temp)]]] <- NA 134 | for (j in seq_along(name_list)) { 135 | data_imp_temp <- results_temp[[j]] 136 | if (lg) {data_imp_temp %<>% log1p} 137 | procruste_df[i, j] <- procrustes(data_c_pls, 138 | data.frame(opls(data_imp_temp, outcome, predI=nPCs, permI=0, plotL=F, printL=F)@scoreMN), 139 | symmetric=T)$ss 140 | } 141 | procruste_df$Miss_Prop[i] <- mean(is.na(data_m_temp)) 142 | procruste_df$Miss_Num[i] <- length(var_idx_temp) 143 | } 144 | } 145 | procruste_df_melt <- melt(procruste_df[, c(name_list, x)], id.vars=x) 146 | colnames(procruste_df_melt)[-1] <- c('Method', 'Pro_SS') 147 | if (plot) { 148 | if (is.null(colors) & is.null(shapes)) { 149 | print(ggplot(aes_string(x=x, y='Pro_SS', color='Method'), data=procruste_df_melt) + geom_point(aes(shape=Method), size=4) + geom_line() + theme_bw()) 150 | } else if (!is.null(colors) & is.null(shapes)) { 151 | print(ggplot(aes_string(x=x, y='Pro_SS', color='Method'), data=procruste_df_melt) + geom_point(aes(shape=Method), size=4) + geom_line() + theme_bw() + scale_color_manual(values=colors)) 152 | } else if (is.null(colors) & !is.null(shapes)) { 153 | print(ggplot(aes_string(x=x, y='Pro_SS', color='Method'), data=procruste_df_melt) + geom_point(aes(shape=Method), size=4) + geom_line() + theme_bw() + scale_shape_manual(values=shapes)) 154 | } else { 155 | print(ggplot(aes_string(x=x, y='Pro_SS', color='Method'), data=procruste_df_melt) + geom_point(aes(shape=Method), size=4) + geom_line() + theme_bw() + 156 | scale_shape_manual(values=shapes) + scale_color_manual(values=colors)) 157 | } 158 | } 159 | return(list(Pro_SS=procruste_df, Pro_SS_melt=procruste_df_melt)) 160 | } 161 | 162 | 163 | # Univariate analysis and plot -------------------------------------------- 164 | Ttest_cor_cal_plot <- function(impute_results, group=NULL, plot=T, x='Miss_Prop', cor='P', colors=NULL, shapes=NULL) { 165 | name_list <- gsub('(.*)_.*', '\\1', impute_results$impute_list) 166 | data_c <- impute_results$data_c 167 | impute_results_list <- impute_results$results 168 | Ttest_p_c <- apply(data_c, 2, function(x) t.test(x ~ group)$p.value) 169 | 170 | pcor_df <- data.frame(matrix(NA, length(impute_results_list), length(name_list) + 2)) 171 | scor_df <- data.frame(matrix(NA, length(impute_results_list), length(name_list) + 2)) 172 | colnames(pcor_df) <- c(name_list, 'Miss_Prop', 'Miss_Num') 173 | colnames(scor_df) <- c(name_list, 'Miss_Prop', 'Miss_Num') 174 | 175 | for (i in seq_along(impute_results_list)) { 176 | # print(i) 177 | results_temp <- impute_results_list[[i]] 178 | mis_idx_temp <- results_temp[[length(results_temp)]] 179 | var_idx_temp <- unique(mis_idx_temp[, 2]) 180 | data_m_temp <- data_c 181 | data_m_temp[results_temp[[length(results_temp)]]] <- NA 182 | for (j in seq_along(name_list)) { 183 | pcor_df[i, j] <- cor.test(log(Ttest_p_c[var_idx_temp]), 184 | log(apply(results_temp[[j]][, var_idx_temp], 2, 185 | function(x) t.test(x ~ group)$p.value)))$estimate 186 | scor_df[i, j] <- cor.test(Ttest_p_c[var_idx_temp], 187 | apply(results_temp[[j]][, var_idx_temp], 2, 188 | function(x) t.test(x ~ group)$p.value), method='spearman')$estimate 189 | } 190 | pcor_df$Miss_Prop[i] <- mean(is.na(data_m_temp)) 191 | scor_df$Miss_Prop[i] <- mean(is.na(data_m_temp)) 192 | pcor_df$Miss_Num[i] <- length(var_idx_temp) 193 | scor_df$Miss_Num[i] <- length(var_idx_temp) 194 | } 195 | pcor_df_melt <- melt(pcor_df[, c(name_list, x)], id.var=x) 196 | colnames(pcor_df_melt)[-1] <- c('Method', 'Cor') 197 | scor_df_melt <- melt(scor_df[, c(name_list, x)], id.var=x) 198 | colnames(scor_df_melt)[-1] <- c('Method', 'Cor') 199 | if (plot & cor == 'P') { 200 | if (is.null(colors) & is.null(shapes)) { 201 | print(ggplot(aes_string(x=x, y='Cor', color='Method'), data=pcor_df_melt) + geom_point(aes(shape=Method), size=4) + geom_line() + theme_bw() + geom_point(aes(shape=Method), size=4)) 202 | } else if (!is.null(colors) & is.null(shapes)) { 203 | print(ggplot(aes_string(x=x, y='Cor', color='Method'), data=pcor_df_melt) + geom_point(aes(shape=Method), size=4) + geom_line() + theme_bw() + geom_point(aes(shape=Method), size=4) + scale_color_manual(values=colors)) 204 | } else if (is.null(colors) & !is.null(shapes)) { 205 | print(ggplot(aes_string(x=x, y='Cor', color='Method'), data=pcor_df_melt) + geom_point(aes(shape=Method), size=4) + geom_line() + theme_bw() + geom_point(aes(shape=Method), size=4) + scale_shape_manual(values=shapes)) 206 | } else { 207 | print(ggplot(aes_string(x=x, y='Cor', color='Method'), data=pcor_df_melt) + geom_point(aes(shape=Method), size=4) + geom_line() + theme_bw() + geom_point(aes(shape=Method), size=4) + scale_shape_manual(values=shapes) + scale_color_manual(values=colors)) 208 | } 209 | } else if (plot & cor == 'S') { 210 | if (is.null(colors) & is.null(shapes)) { 211 | print(ggplot(aes_string(x=x, y='Cor', color='Method'), data=scor_df_melt) + geom_point(aes(shape=Method), size=4) + geom_line() + theme_bw() + geom_point(aes(shape=Method), size=4)) 212 | } else if (!is.null(colors) & is.null(shapes)) { 213 | print(ggplot(aes_string(x=x, y='Cor', color='Method'), data=scor_df_melt) + geom_point(aes(shape=Method), size=4) + geom_line() + theme_bw() + geom_point(aes(shape=Method), size=4) + scale_color_manual(values=colors)) 214 | } else if (is.null(colors) & !is.null(shapes)) { 215 | print(ggplot(aes_string(x=x, y='Cor', color='Method'), data=scor_df_melt) + geom_point(aes(shape=Method), size=4) + geom_line() + theme_bw() + geom_point(aes(shape=Method), size=4) + scale_shape_manual(values=shapes)) 216 | } else { 217 | print(ggplot(aes_string(x=x, y='Cor', color='Method'), data=scor_df_melt) + geom_point(aes(shape=Method), size=4) + geom_line() + theme_bw() + geom_point(aes(shape=Method), size=4) + scale_shape_manual(values=shapes) + scale_color_manual(values=colors)) 218 | } 219 | } 220 | return(list(P_cor=pcor_df, S_cor=scor_df, P_cor_melt=pcor_df_melt, S_cor_melt=scor_df_melt)) 221 | } 222 | 223 | Ttest_P_cal_plot <- function(impute_results, group=NULL, plot=T, p_cut=.05, x='Miss_Prop', colors=NULL, shapes=NULL) { 224 | name_list <- gsub('(.*)_.*', '\\1', impute_results$impute_list) 225 | data_c <- impute_results$data_c 226 | impute_results_list <- impute_results$results 227 | Ttest_p_c <- apply(data_c, 2, function(x) t.test(x ~ group)$p.value) 228 | sig_idx <- which(Ttest_p_c < p_cut) 229 | 230 | P_df <- data.frame(matrix(NA, length(impute_results_list), length(name_list) + 2)) 231 | colnames(P_df) <- c(name_list, 'Miss_Prop', 'Miss_Num') 232 | 233 | for (i in seq_along(impute_results_list)) { 234 | # print(i) 235 | results_temp <- impute_results_list[[i]] 236 | mis_idx_temp <- results_temp[[length(results_temp)]] 237 | var_idx_temp <- unique(mis_idx_temp[, 2]) 238 | data_m_temp <- data_c 239 | data_m_temp[results_temp[[length(results_temp)]]] <- NA 240 | for (j in seq_along(name_list)) { 241 | sig_idx_temp <- which(apply(results_temp[[j]], 2, 242 | function(x) t.test(x ~ group)$p.value) < p_cut) 243 | P_df[i, j] <- mean(sig_idx %in% sig_idx_temp) 244 | } 245 | P_df$Miss_Prop[i] <- mean(is.na(data_m_temp)) 246 | P_df$Miss_Num[i] <- length(var_idx_temp) 247 | } 248 | P_df_melt <- melt(P_df[, c(name_list, x)], id.var=x) 249 | colnames(P_df_melt)[-1] <- c('Method', 'Po') 250 | if (plot) { 251 | if (is.null(colors) & is.null(shapes)) { 252 | print(ggplot(aes_string(x=x, y='Po', color='Method'), data=P_df_melt) + geom_point(aes(shape=Method), size=4) + geom_line() + theme_bw() + geom_point(aes(shape=Method), size=4)) 253 | } else if (!is.null(colors) & is.null(shapes)) { 254 | print(ggplot(aes_string(x=x, y='Po', color='Method'), data=P_df_melt) + geom_point(aes(shape=Method), size=4) + geom_line() + theme_bw() + geom_point(aes(shape=Method), size=4) + scale_color_manual(values=colors)) 255 | } else if (is.null(colors) & !is.null(shapes)) { 256 | print(ggplot(aes_string(x=x, y='Po', color='Method'), data=P_df_melt) + geom_point(aes(shape=Method), size=4) + geom_line() + theme_bw() + geom_point(aes(shape=Method), size=4) + scale_shape_manual(values=shapes)) 257 | } else { 258 | print(ggplot(aes_string(x=x, y='Po', color='Method'), data=P_df_melt) + geom_point(aes(shape=Method), size=4) + geom_line() + theme_bw() + geom_point(aes(shape=Method), size=4) + scale_shape_manual(values=shapes) + scale_color_manual(values=colors)) 259 | } 260 | } 261 | return(list(P_df=P_df, P_df_melt=P_df_melt)) 262 | } 263 | 264 | 265 | # Calculate PCA_Pro on a list of imputed data ----------------------------- 266 | PCA_pro_list <- function(imp_list, nPCs=2, method_names) { 267 | if(length(imp_list)!=length(method_names)) { 268 | stop('imp_list not comparable with method_names') 269 | } 270 | 271 | pro_res <- list() 272 | pca_data <- lapply(imp_list, function(x) prcomp(x, center=T, scale.=T)$x[, 1:nPCs]) %>% set_names(method_names) 273 | 274 | pro_ss_df <- matrix(NA, length(method_names), length(method_names)) %>% as.data.frame() 275 | colnames(pro_ss_df) <- method_names 276 | row.names(pro_ss_df) <- method_names 277 | method_combn <- combn(method_names, 2) 278 | 279 | for(i in 1:ncol(method_combn)) { 280 | name_temp <- paste0(method_combn[2, i], '_', method_combn[1, i]) 281 | pro_res[[i]] <- procrustes(pca_data[[method_combn[2, i]]], pca_data[[method_combn[1, i]]], symmetric=T) 282 | names(pro_res)[i] <- name_temp 283 | pro_ss_df[method_combn[1, i], method_combn[2, i]] <- pro_ss_df[method_combn[2, i], method_combn[1, i]] <- pro_res[[i]]$ss 284 | } 285 | return(list(pro_ss_df = pro_ss_df, pro_res = pro_res)) 286 | } 287 | 288 | # Calculate NRMSE on a list of imputed data ------------------------------- 289 | NRMSE_list <- function(imp_list, miss_df, method_names) { 290 | if(length(imp_list)!=length(method_names)) { 291 | stop('imp_list not comparable with method_names') 292 | } 293 | 294 | nrmse_res <- matrix(NA, length(method_names), length(method_names)) %>% as.data.frame() 295 | colnames(nrmse_res) <- method_names 296 | row.names(nrmse_res) <- method_names 297 | method_combn <- combn(method_names, 2) 298 | 299 | imp_list_sc <- list() 300 | log_sc_1 <- imp_list[[1]] %>% scale_recover() 301 | sc_param <- log_sc_1[[2]] 302 | for(i in 1:length(method_names)) { 303 | imp_list_sc[[i]] <- scale_recover(imp_list[[i]], param_df = sc_param)[[1]] 304 | } 305 | imp_list_sc <- imp_list_sc %>% set_names(method_names) 306 | 307 | for(i in 1:ncol(method_combn)) { 308 | name_temp <- paste0(method_combn[1, i], '_', method_combn[2, i]) 309 | nrmse_res[method_combn[1, i], method_combn[2, i]] <- nrmse_res[method_combn[2, i], method_combn[1, i]] <- 310 | nrmse(imp_list_sc[[method_combn[1, i]]], miss_df[,col_na], imp_list_sc[[method_combn[2, i]]]) 311 | } 312 | return(nrmse_res = nrmse_res) 313 | } 314 | 315 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![DOI:10.1371/journal.pcbi.1005973](https://zenodo.org/badge/DOI/10.1371/journal.pcbi.1005973.svg)](https://doi.org/10.1371/journal.pcbi.1005973) 2 | 3 | # A Quick Tour of GSimp 4 | 5 | # 1. Introduction 6 | 7 | GSimp is a gibbs sampler based left-censored missing value imputation 8 | approach for metabolomics studies. This vignette provides a quick tour 9 | of GSimp that contains, data pre-processing, simulated data generation, 10 | missing not at random (MNAR) generation, wrapper functions for different 11 | MNAR imputation methods (GSimp, QRILC, and kNN-TN) and evaluations of 12 | these methods. Core functions for GSimp and the real-world metabolomics 13 | datasets are available here. 14 | All content of GSimp is licensed under the [Creative Commons 15 | Attribution-NonCommercial-ShareAlike 4.0 International 16 | License](https://creativecommons.org/licenses/by-nc-sa/4.0/). 17 | 18 | Please cite our paper once it is published: Wei, R., Wang, J., Jia, E., 19 | Chen, T., Ni, Y., & Jia, W. (2017). GSimp: A Gibbs sampler based 20 | left-censored missing value imputation approach for metabolomics 21 | studies. PLOS Computational Biology (under revision). 22 | 23 | **GSimp.R** contains the core functions for GSimp. 24 | 25 | **GSimp_evaluation.R** contains MNAR generation and evaluation functions which are part of our missing value imputation evaluation pipeline. 26 | 27 | **Impute_wrapper.R** contains wrapper functions for different imputation methods (contains *pre_processing_GS_wrapper*). 28 | 29 | **MVI_global.R** contains some basic global functions. 30 | 31 | **Prediction_funcs.R** contains wrapper functions for different prediction models. 32 | 33 | **simulation_generation.Rmd** contains a simulation dataset generation algorithm. 34 | 35 | **simulation_generation.pdf** was knited by **simulation_generation.Rmd**. 36 | 37 | **real_data.csv** is a real-world targeted metabolomics dataset contains both FFAs and BAs. 38 | 39 | **data_sim.csv** is the simulated dataset (details could be found in **simulation_generation.Rmd**). 40 | 41 | **targeted_data.csv** is a targeted LC/MS data with missing values. 42 | 43 | **untargeted_data.csv** is an untargeted GC/MS data with missing values after manually missing retrieval. 44 | 45 | **Trunc_KNN** contains kNN-TN algorithm and related functions developed by Jasmit S. Shah (https://doi.org/10.1186/s12859-017-1547-6). 46 | 47 | **vignette_files** contains files for README.md. 48 | 49 | # 2. Preparation 50 | ## 2.1 Packages and source code importing 51 | -------------------------------- 52 | 53 | ``` r 54 | options(stringsAsFactors = F) 55 | source('Trunc_KNN/Imput_funcs.r') 56 | source('GSimp_evaluation.R') 57 | source('GSimp.R') 58 | ``` 59 | 60 | Package Dependencies and Version 61 | 62 | | Package | Version | 63 | |:-------------|:-----------| 64 | | Amelia | 1.7.4 | 65 | | abind | 1.4-5 | 66 | | doParallel | 1.0.11 | 67 | | FNN | 1.1 | 68 | | foreach | 1.4.4 | 69 | | ggplot2 | 2.2.1.9000 | 70 | | glmnet | 2.0-13 | 71 | | impute | 1.50.1 | 72 | | imputeLCMD | 2 | 73 | | knitr | 1.17 | 74 | | magrittr | 1.5 | 75 | | markdown | 0.8 | 76 | | missForest | 1.4 | 77 | | pheatmap | 1.0.8 | 78 | | randomForest | 4.6-12 | 79 | | reshape2 | 1.4.3 | 80 | | ropls | 1.8.0 | 81 | | vegan | 2.4-5 | 82 | 83 | 84 | ## 2.2 Data Pre-processing 85 | 86 | In GSimp, we recommend data pre-processing steps as following: 87 | 88 | - Log-transformation (for non-normal data) 89 | - Initialization for missing values (e.g., QRILC) 90 | - Centralization and scaling (for elastic-net prediction) 91 | - Imputation using GSimp 92 | - Scaling recovery 93 | - Exponential recovery 94 | - Imputed data output 95 | 96 | All aboved steps has been wrapped into the 97 | *pre\_processing\_GS\_wrapper* function for a one-step processing and 98 | imputation. The function will give the final imputed dataset. 99 | 100 | ``` r 101 | # wrapper function with data pre-processing 102 | pre_processing_GS_wrapper <- function(data) { 103 | data_raw <- data 104 | # Log transformation # 105 | data_raw_log <- data_raw %>% log() 106 | # Initialization # 107 | data_raw_log_qrilc <- impute.QRILC(data_raw_log) %>% extract2(1) 108 | # Centralization and scaling # 109 | data_raw_log_qrilc_sc <- scale_recover(data_raw_log_qrilc, method = 'scale') 110 | # Data after centralization and scaling # 111 | data_raw_log_qrilc_sc_df <- data_raw_log_qrilc_sc[[1]] 112 | # Parameters for centralization and scaling (for scaling recovery) # 113 | data_raw_log_qrilc_sc_df_param <- data_raw_log_qrilc_sc[[2]] 114 | # NA position # 115 | NA_pos <- which(is.na(data_raw), arr.ind = T) 116 | # NA introduced to log-scaled-initialized data # 117 | data_raw_log_sc <- data_raw_log_qrilc_sc_df 118 | data_raw_log_sc[NA_pos] <- NA 119 | # Feed initialized and missing data into GSimp imputation # 120 | result <- data_raw_log_sc %>% GS_impute(., iters_each=50, iters_all=10, 121 | initial = data_raw_log_qrilc_sc_df, 122 | lo=-Inf, hi= 'min', n_cores=2, 123 | imp_model='glmnet_pred') 124 | data_imp_log_sc <- result$data_imp 125 | # Data recovery # 126 | data_imp <- data_imp_log_sc %>% 127 | scale_recover(., method = 'recover', 128 | param_df = data_raw_log_qrilc_sc_df_param) %>% 129 | extract2(1) %>% exp() 130 | return(data_imp) 131 | } 132 | ``` 133 | 134 | # 3. GSimp in a nutshell 135 | 136 | The function *GS\_impute* is the core function for the imputation of 137 | missing data and tracing the Gibbs sampler with certain missing 138 | positions. 139 | 140 | Some arguments of *GS\_impute*: 141 | 142 | - **iters\_each** is the number of iterations for imputing each 143 | missing variable (default=100). 144 | 145 | - **iters\_all** is the number of iterations for imputing the whole 146 | data matrix (default=20). 147 | 148 | - Although a large number of iterations (e.g., iters\_all=20 and 149 | iters\_each=100) is recommended for the convergence of MCMC, a smaller 150 | number of iterations (iters\_all=10, iters\_each=50) won’t severely 151 | affect the imputation accuracy as we tested on the simulation data. 152 | 153 | - **initial** is the initialization method for missing values 154 | (default=‘qrilc’). We provided three ways: ‘lsym’, ‘qrilc’, ‘rsym’. 155 | ‘lsym’ will draw samples from the right tail of the distribution and 156 | symmetrically transformed to the left tail; ‘rsym’ will draw samples 157 | from the left tail of the distribution and symmetrically transformed 158 | to the right tail, this is for the right-censored missing; ‘qrilc’ 159 | will use QRILC imputed values as initial. An pre-initilized data 160 | frame is also acceptable for this argument. 161 | 162 | - **lo** is the lower limits (default=‘-Inf’) and **hi** 163 | (default=‘min’) is the upper limits for missing values. These two 164 | arguments can be defined as -Inf/Inf/‘min’/‘max’/‘median’/‘mean’ or 165 | any single determined value or a vector of values (same length with number 166 | of variables, the values of non-missing vairables won’t affect 167 | results). Here, lo=-Inf, hi=‘min’ are default setting for 168 | left-censored missing values where the upper bound is set to the 169 | minimum value of non-missing part (notably, quantile values can be 170 | applied if minimum is too strict. For example, hi=sapply(data, 171 | function(x) quantile(x, .1, na.rm=T)) represent the 10% quantiles of 172 | each variable are set to the upper bounds). When non-informative 173 | bounds for both upper and lower limits (e.g., +∞, -∞) were applied, 174 | GSimp could be extended to the situation of MCAR/MAR. 175 | 176 | - **n\_cores** is the number of cores for computing (parallel 177 | computing will impute all missing variables simultaneously while 178 | non-parallel computing will impute missing variables sequentially 179 | from the least number of missings to the most). 180 | 181 | - **gibbs** is the missing elements you want to trace across the whole 182 | MCMC (default=data.frame(row=integer(), col=integer())). This 183 | argument must be set as the positions of missing elements. For 184 | example, gibbs=data.frame(row=c(1, 3), col=c(2, 5)) represent you 185 | want to trace the missing elements in row 1 column 2 and row 3 186 | column 5. 187 | 188 | Outputs of *GS\_impute*: 189 | 190 | - **data\_imp** is the imputed data frame. 191 | 192 | - **gibbs\_res** is a three dimensional array that records the whole 193 | process of specified missing elements across MCMC iterations. The 194 | first dimension represents std/yhat/yres which stands for the 195 | stadard deviation/predicted value/sampling value. The second 196 | dimension represents missing elements you specified and the third 197 | dimension represents the iterations. 198 | 199 | ## 3.1 GSimp in real-world missing data sets 200 | ### 3.1.1 Data sets 201 | 202 | The untargeted GC/MS dataset contains 37 samples and 112 variables with 203 | 317 missing elements and 221 of them were retrieved manually. From the following 204 | kernel density plot, we found overlaps between 205 | non-missing values and retrieved missing values. Thus, we assumed that the 206 | majority of missingness in untargeted GC/MS-based metabolomics data are 207 | MCAR/MAR. The targeted LC/MS dataset contains 40 samples and 41 variables with 88 208 | missing elements are failed to be quantified due to LOQ/LOD. 209 | 210 | ![](vignette_files/figure-markdown_github/compare.eps) 211 | 212 | The following analyses are tested on post-missing retrieval untargeted GC/MS dataset and targted LC/MS dataset. 213 | 214 | ### 3.1.2 Other Wrapper functions 215 | 216 | We compared GSimp with other left-censored missing 217 | imputation/substitution methods: 218 | 219 | - QRILC (Quantile Regression Imputation of Left-Censored data) imputes 220 | missing elements randomly drawing from a truncated distribution 221 | estimated by a quantile regression. Function impute.QRILC in R 222 | package imputeLCMD was applied for this imputation approach. The 223 | function *sim\_QRILC\_wrapper* was used in this method. 224 | 225 | - kNN-TN (Truncation k-nearest neighbors imputation) applied a 226 | Newton-Raphson (NR) optimization to estimate the truncated mean and 227 | standard deviation. Then, Pearson correlation was calculated based 228 | on standardized data followed by correlation-based kNN imputation. 229 | kNN-TN algorithm and related functions developed by Jasmit S. Shah 230 | () was used for this 231 | imputation approach. The function *sim\_trKNN\_wrapper* was used in 232 | this method. 233 | 234 | - HM (Half of minimum) replace missingness with half of the minimum of 235 | the variable. 236 | 237 | ``` r 238 | # QRILC 239 | sim_QRILC_wrapper <- function(data, ...) { 240 | result <- data %>% impute.QRILC(., ...) %>% extract2(1) 241 | return(result) 242 | } 243 | 244 | # trKNN 245 | sim_trKNN_wrapper <- function(data) { 246 | result <- data %>% as.matrix %>% t %>% imputeKNN(., k=3, distance='truncation', 247 | perc=0) %>% t 248 | return(result) 249 | } 250 | 251 | # HM 252 | sim_HM_wrapper <- function(data) { 253 | result <- data 254 | result[] <- lapply(result, function(x) { 255 | x[is.na(x)] <- min(x, na.rm = T)/2 256 | x 257 | }) 258 | return(result) 259 | } 260 | ``` 261 | 262 | ### 3.1.3 Imputation comparison 263 | 264 | #### 3.1.3.1 Untargeted GC/MS dataset 265 | 266 | ``` r 267 | # load dataset 268 | untargeted_data <- read.csv('untargeted_data.csv', row.names=1) 269 | missmap(untargeted_data, col=c("black", "grey"), legend=FALSE) 270 | ``` 271 | ![](vignette_files/figure-markdown_github/Missmap_untargeted.png) 272 | 273 | Each black cell represents a missing value. 274 | 275 | ``` r 276 | # record positions of missing values 277 | NA_pos <- which(is.na(untargeted_data), arr.ind = T) 278 | col_na <- NA_pos[, 2] 279 | 280 | # imputations 281 | set.seed(123) 282 | after_GS_imp <- pre_processing_GS_wrapper(untargeted_data) 283 | ``` 284 | 285 | ## Iteration 1 start...Parallel computing (n_cores=2)...end! 286 | ## Iteration 2 start...Parallel computing (n_cores=2)...end! 287 | ## Iteration 3 start...Parallel computing (n_cores=2)...end! 288 | ## Iteration 4 start...Parallel computing (n_cores=2)...end! 289 | ## Iteration 5 start...Parallel computing (n_cores=2)...end! 290 | ## Iteration 6 start...Parallel computing (n_cores=2)...end! 291 | ## Iteration 7 start...Parallel computing (n_cores=2)...end! 292 | ## Iteration 8 start...Parallel computing (n_cores=2)...end! 293 | ## Iteration 9 start...Parallel computing (n_cores=2)...end! 294 | ## Iteration 10 start...Parallel computing (n_cores=2)...end! 295 | 296 | ``` r 297 | data_raw_log <- untargeted_data %>% log() 298 | after_trKNN_imp <- sim_trKNN_wrapper(data_raw_log) %>% data.frame() %>% exp() 299 | after_QRILC_imp <- sim_QRILC_wrapper(data_raw_log) %>% exp() 300 | after_HM_imp <- sim_HM_wrapper(untargeted_data) 301 | 302 | # NRMSE and PCA procruste comparisons 303 | imp_list <- list() 304 | imp_list[[1]] <- after_GS_imp[,col_na] %>% log() 305 | imp_list[[2]] <- after_trKNN_imp[,col_na] %>% log() 306 | imp_list[[3]] <- after_QRILC_imp[,col_na] %>% log() 307 | imp_list[[4]] <- after_HM_imp[,col_na] %>% log() 308 | method_names <- c('GSimp', 'trKNN', 'QRILC', 'HM') 309 | 310 | # NRMSE Results 311 | NRMSE_res <- NRMSE_list(imp_list, untargeted_data, method_names) 312 | NRMSE_res <- round(NRMSE_res, digits = 3) 313 | kable(NRMSE_res) 314 | ``` 315 | 316 | NRMSE between imputation methods 317 | 318 | | | GSimp| trKNN| QRILC| HM| 319 | |-------|------:|------:|------:|------:| 320 | | GSimp | NA| 2.175| 1.272| 1.674| 321 | | trKNN | 2.175| NA| 2.995| 3.918| 322 | | QRILC | 1.272| 2.995| NA| 1.361| 323 | | HM | 1.674| 3.918| 1.361| NA| 324 | 325 | ``` r 326 | # PCA procrustes results 327 | PCA_res <- PCA_pro_list(imp_list, nPCs=3, method_names) 328 | procrustes_df <- round(PCA_res$pro_ss_df, digits = 3) 329 | kable(procrustes_df) 330 | ``` 331 | PCA Procrustes results between imputation methods 332 | 333 | | | GSimp| trKNN| QRILC| HM| 334 | |-------|------:|------:|------:|------:| 335 | | GSimp | NA| 0.119| 0.018| 0.029| 336 | | trKNN | 0.119| NA| 0.136| 0.214| 337 | | QRILC | 0.018| 0.136| NA| 0.031| 338 | | HM | 0.029| 0.214| 0.031| NA| 339 | 340 | ``` r 341 | plot(PCA_res$pro_res$trKNN_GSimp) 342 | ``` 343 | 344 | ![](vignette_files/figure-markdown_github/unnamed-chunk-4-1.png) 345 | 346 | PCA-Procrustes between GSimp (points) and kNN-TN (targets) 347 | 348 | 349 | ``` r 350 | plot(PCA_res$pro_res$QRILC_GSimp) 351 | ``` 352 | 353 | ![](vignette_files/figure-markdown_github/unnamed-chunk-4-2.png) 354 | 355 | PCA-Procrustes between GSimp (points) and QRILC (targets) 356 | 357 | 358 | ``` r 359 | plot(PCA_res$pro_res$HM_GSimp) 360 | ``` 361 | 362 | ![](vignette_files/figure-markdown_github/unnamed-chunk-4-3.png) 363 | 364 | PCA-Procrustes between GSimp (points) and HM (targets) 365 | 366 | 367 | #### 3.1.3.2 Targeted LC/MS dataset 368 | 369 | ``` r 370 | targeted_data <- read.csv('targeted_data.csv', row.names=1) 371 | missmap(targeted_data, col=c("black", "grey"), legend=FALSE) 372 | ``` 373 | ![](vignette_files/figure-markdown_github/Missmap_targeted.png) 374 | 375 | Each black cell represents a missing value. 376 | 377 | ```r 378 | # record positions of missing values 379 | NA_pos <- which(is.na(targeted_data), arr.ind = T) 380 | col_na <- unique(NA_pos[, 2]) 381 | 382 | # imputations 383 | set.seed(123) 384 | after_GS_imp <- pre_processing_GS_wrapper(targeted_data) 385 | ``` 386 | 387 | ## Iteration 1 start...Parallel computing (n_cores=2)...end! 388 | ## Iteration 2 start...Parallel computing (n_cores=2)...end! 389 | ## Iteration 3 start...Parallel computing (n_cores=2)...end! 390 | ## Iteration 4 start...Parallel computing (n_cores=2)...end! 391 | ## Iteration 5 start...Parallel computing (n_cores=2)...end! 392 | ## Iteration 6 start...Parallel computing (n_cores=2)...end! 393 | ## Iteration 7 start...Parallel computing (n_cores=2)...end! 394 | ## Iteration 8 start...Parallel computing (n_cores=2)...end! 395 | ## Iteration 9 start...Parallel computing (n_cores=2)...end! 396 | ## Iteration 10 start...Parallel computing (n_cores=2)...end! 397 | 398 | ``` r 399 | data_raw_log <- targeted_data %>% log() 400 | after_trKNN_imp <- sim_trKNN_wrapper(data_raw_log) %>% 401 | data.frame() %>% exp() 402 | after_QRILC_imp <- sim_QRILC_wrapper(data_raw_log) %>% exp() 403 | after_HM_imp <- sim_HM_wrapper(targeted_data) 404 | 405 | # NRMSE and PCA procruste comparisons 406 | imp_list <- list() 407 | imp_list[[1]] <- after_GS_imp[,col_na] %>% log() 408 | imp_list[[2]] <- after_trKNN_imp[,col_na] %>% log() 409 | imp_list[[3]] <- after_QRILC_imp[,col_na] %>% log() 410 | imp_list[[4]] <- after_HM_imp[,col_na] %>% log() 411 | method_names <- c('GSimp', 'trKNN', 'QRILC', 'HM') 412 | 413 | 414 | # NRMSE results 415 | NRMSE_res <- NRMSE_list(imp_list, targeted_data, method_names) 416 | NRMSE_res <- round(NRMSE_res, digits = 3) 417 | kable(NRMSE_res) 418 | ``` 419 | 420 | NRMSE between imputation methods 421 | 422 | 423 | | | GSimp| trKNN| QRILC| HM| 424 | |-------|------:|------:|------:|------:| 425 | | GSimp | NA| 2.005| 1.403| 1.085| 426 | | trKNN | 2.005| NA| 1.628| 2.449| 427 | | QRILC | 1.403| 1.628| NA| 1.222| 428 | | HM | 1.085| 2.449| 1.222| NA| 429 | 430 | ``` r 431 | # PCA procrustes results 432 | PCA_res <- PCA_pro_list(imp_list, nPCs=3, method_names) 433 | procrustes_df <- round(PCA_res$pro_ss_df, digits = 3) 434 | kable(procrustes_df) 435 | ``` 436 | 437 | PCA Procrustes results between imputation methods 438 | 439 | 440 | | | GSimp| trKNN| QRILC| HM| 441 | |-------|------:|------:|------:|------:| 442 | | GSimp | NA| 0.171| 0.094| 0.029| 443 | | trKNN | 0.171| NA| 0.051| 0.113| 444 | | QRILC | 0.094| 0.051| NA| 0.039| 445 | | HM | 0.029| 0.113| 0.039| NA| 446 | 447 | ``` r 448 | plot(PCA_res$pro_res$trKNN_GSimp) 449 | ``` 450 | 451 | ![](vignette_files/figure-markdown_github/unnamed-chunk-5-1.png) 452 | 453 | PCA-Procrustes between GSimp (points) and kNN-TN (targets) 454 | 455 | 456 | ``` r 457 | plot(PCA_res$pro_res$QRILC_GSimp) 458 | ``` 459 | 460 | ![](vignette_files/figure-markdown_github/unnamed-chunk-5-2.png) 461 | 462 | PCA-Procrustes between GSimp (points) and QRILC (targets) 463 | 464 | 465 | ``` r 466 | plot(PCA_res$pro_res$HM_GSimp) 467 | ``` 468 | 469 | ![](vignette_files/figure-markdown_github/unnamed-chunk-5-3.png) 470 | 471 | PCA-Procrustes between GSimp (points) and HM (targets) 472 | 473 | 474 | ## 3.2 GSimp in simulated data sets 475 | ### 3.2.1 Simulated dataset generation 476 | 477 | For the simulation dataset, we first calculated the covariance matrix 478 | Cov based on the whole diabetes dataset (*P*=76) where *P* represents the 479 | number of variables. Then we generated two separated data matrices with 480 | the same number of 80 observations from multivariate normal 481 | distributions, representing two different biological groups. For each 482 | data matrix, the sample mean of each variable was drawn from a normal 483 | distribution *N*(0, 0.5^2^) and *Cov* was kept using SVD. Then, two data 484 | matrices were horizontally (column-wise) stacked together as a complete 485 | data matrix (*N*×*P*=160×76) so that group differences were simulated and 486 | covariance was kept. 487 | 488 | ```r 489 | data_sim <- read.csv('data_sim.csv', row.names=1) 490 | data_sim_sc <- scale(data_sim) 491 | ``` 492 | 493 | ```r 494 | group <- rep(c(0, 1), each=80) %>% as.factor 495 | sim_pvals <- apply(data_sim_sc, 2, function(x) t.test(x ~ group)$p.value) 496 | ``` 497 | 498 | ``` 499 | # P-values for two groups on simulation dataset is: 500 | # 7.746669e-11 1.963778e-01 8.522196e-06 9.885971e-02 1.165720e-01 501 | 5.295838e-06 5.510601e-01 3.338875e-06 1.934161e-04 1.385230e-01 ... 502 | ``` 503 | 504 | ### 3.2.2 Imputation comparison 505 | 506 | First, we generated a series of MNAR datasets by using the missing 507 | proportion from 0.1 to 0.8 step by 0.1 with MNAR cut-off drawn from 508 | *U*(0.3, 0.6). Then, GSimp, QRILC, and kNN-TN was used to impute MNAR 509 | datasets, respectively. 510 | 511 | ``` r 512 | # Simply package GSimp into a wrapper function which requires no data preprocessing 513 | considering how we generated the simulation data 514 | 515 | sim_GS_wrapper <- function(data) { 516 | result <- data %>% GS_impute(., iters_each=50, iters_all=10, initial='qrilc', 517 | lo=-Inf, hi='min', n_cores=2, imp_model='glmnet_pred') 518 | return(result$data_imp) 519 | } 520 | 521 | # a list of MNAR datasets generation and imputation with different imputation wrapper functions 522 | sim_MNAR_list <- MNAR_gen_imp(data_c=data_sim_sc, mis_var_prop=seq(.1, .8, .1), 523 | var_mis_prop=seq(.3, .6, .1), 524 | impute_list=c('sim_QRILC_wrapper', 'sim_GS_wrapper', 525 | 'sim_trKNN_wrapper'), cores=1) 526 | # cores should be 1 in MNAR_gen_imp function, since sim_GS_wrapper() use multiple cores 527 | ``` 528 | 529 | Unlabeled measurements include the NRMSE-based sum of ranks (SOR), 530 | principal component analysis (PCA)-Procrustes analysis while labeled 531 | measurements include correlation analysis for univariate results, 532 | partial least square (PLS)-Procurstes analysis. 533 | 534 | ``` r 535 | ## SOR calculation and plot 536 | sim_MNAR_NRMSE_rank_list <- NRMSE_rank_cal_plot(sim_MNAR_list, plot=T, x='Miss_Num', 537 | colors=c('#7CAE00', '#00BFC4', '#C77CFF'), 538 | shapes=c(17, 15, 3)) 539 | ``` 540 | 541 | ![](vignette_files/figure-markdown_github/unnamed-chunk-9-1.png) 542 | 543 | SOR of three imputation methods with the increasing number of missing variables 544 | 545 | 546 | 547 | ``` r 548 | ## PCA-Procrustes and plot 549 | sim_MNAR_PCA_ProSS_list <- Procrustes_cal_plot(sim_MNAR_list, DR='PCA', nPCs=2, x='Miss_Num', 550 | plot=T, 551 | colors=c('#7CAE00', '#00BFC4', '#C77CFF'), 552 | shapes=c(17, 15, 3)) 553 | ``` 554 | 555 | ![](vignette_files/figure-markdown_github/unnamed-chunk-9-2.png) 556 | 557 | PCA-Procrustes results of three imputation methods with the increasing number of missing variables 558 | 559 | 560 | ``` r 561 | ## PLS-Procrustes and plot 562 | sim_MNAR_PLS_ProSS_list <- Procrustes_cal_plot(sim_MNAR_list, DR='PLS', nPCs=2, outcome=group, 563 | x='Miss_Num', plot=T, 564 | colors=c('#7CAE00', '#00BFC4', '#C77CFF'), 565 | shapes=c(17, 15, 3)) 566 | ``` 567 | 568 | ![](vignette_files/figure-markdown_github/unnamed-chunk-9-3.png) 569 | 570 | PLS-Procrustes results of three imputation methods with the increasing number of missing variables 571 | 572 | 573 | ``` r 574 | ## T-test correlation and plot 575 | sim_MNAR_Ttest_Cor_list <- Ttest_cor_cal_plot(sim_MNAR_list, group=group, plot=T, x='Miss_Num', 576 | cor='P', 577 | colors=c('#7CAE00', '#00BFC4', '#C77CFF'), 578 | shapes=c(17, 15, 3)) 579 | ``` 580 | 581 | ![](vignette_files/figure-markdown_github/unnamed-chunk-9-4.png) 582 | 583 | Correlation of T-test p-values of three imputation methods with the increasing number of missing variables 584 | 585 | 586 | 587 | ``` r 588 | ## TPR calculation and plot 589 | sim_MNAR_Ttest_TPR_list_2 <- Ttest_P_cal_plot(impute_results=sim_MNAR_list, group=group, 590 | plot=T, p_cut=.05, x = 'Miss_Num', 591 | colors=c('#7CAE00', '#00BFC4', '#C77CFF'), 592 | shapes=c(17, 15, 3)) 593 | ``` 594 | 595 | ![](vignette_files/figure-markdown_github/unnamed-chunk-9-5.png) 596 | 597 | TPR (p-value=.05) of three imputation methods with the increasing number of missing variables 598 | 599 | 600 | 601 | ``` r 602 | sim_MNAR_Ttest_TPR_list_3 <- Ttest_P_cal_plot(impute_results=sim_MNAR_list, group=group, 603 | plot=T, p_cut=.01, x = 'Miss_Num', 604 | colors=c('#7CAE00', '#00BFC4', '#C77CFF'), 605 | shapes=c(17, 15, 3)) 606 | ``` 607 | 608 | ![](vignette_files/figure-markdown_github/unnamed-chunk-9-6.png) 609 | 610 | TPR (p-value=.01) of three imputation methods with the increasing number of missing variables 611 | 612 | 613 | ## 3.3 GSimp with different iterations 614 | 615 | Since GSimp employed an iterative Gibbs sampler method, a large number 616 | of iterations (iters\_all=20, iters\_each=100) are preferable for the 617 | convergence of parameters estimation. However, as we tested on the simulation 618 | dataset with different number of iterations (iters\_each=50 and 619 | iters\_all=20, iters\_each=100 and iters\_all=20, iters\_each=50 and 620 | iters\_all=10, iters\_each=100 and iters\_all=10), a much less 621 | iterations won’t severely affect the imputation accuracy. 622 | 623 | ``` r 624 | GSimp_50_20_wrapper <- function(data) { 625 | result <- data %>% GS_impute(., iters_each=50, iters_all=20, initial='qrilc', 626 | lo=-Inf, hi='min', 627 | n_cores=2, imp_model='glmnet_pred') 628 | return(result$data_imp) 629 | } 630 | 631 | GSimp_100_20_wrapper <- function(data) { 632 | result <- data %>% GS_impute(., iters_each=100, iters_all=20, initial='qrilc', 633 | lo=-Inf, hi='min', 634 | n_cores=2, imp_model='glmnet_pred') 635 | return(result$data_imp) 636 | } 637 | 638 | GSimp_50_10_wrapper <- function(data) { 639 | result <- data %>% GS_impute(., iters_each=50, iters_all=10, initial='qrilc', 640 | lo=-Inf, hi='min', 641 | n_cores=2, imp_model='glmnet_pred') 642 | return(result$data_imp) 643 | } 644 | 645 | GSimp_100_10_wrapper <- function(data) { 646 | result <- data %>% GS_impute(., iters_each=100, iters_all=10, initial='qrilc', 647 | lo=-Inf, hi='min', 648 | n_cores=2, imp_model='glmnet_pred') 649 | return(result$data_imp) 650 | } 651 | ``` 652 | 653 | ``` r 654 | GSimp_iters_MNAR_list <- MNAR_gen_imp(data_c=data_sim_sc, mis_var_prop=seq(.2, .5, .1), 655 | var_mis_prop=seq(.1, .5, .1), 656 | impute_list=c('GSimp_50_20_wrapper', 657 | 'GSimp_100_20_wrapper', 658 | 'GSimp_50_10_wrapper', 659 | 'GSimp_100_10_wrapper'), 660 | cores=1) 661 | # cores should be 1 in MNAR_gen_imp function, since GS_impute() use multiple cores 662 | ``` 663 | 664 | ``` r 665 | GSimp_iters_MNAR_NRMSE_rank_list <- NRMSE_rank_cal_plot(GSimp_iters_MNAR_list, 666 | plot=T, x='Miss_Num') 667 | ``` 668 | 669 | ![](vignette_files/figure-markdown_github/unnamed-chunk-12-1.png) 670 | 671 | SOR of different interations of GSimp with the increasing number of missing variables 672 | 673 | 674 | 675 | # 4. Concluding remarks 676 | 677 | GSimp is a convenient software for the imputation of left-censored MNAR data. With 678 | proper modifications (e.g., truncation points (**lo** and **hi**), pre-processing approaches), GSimp can be applicable 679 | to handle different types of missingness (e.g., right-censored MNAR, MCAR, MAR) and different types of -omics 680 | studies, which is deserved to be further explored in the future. 681 | -------------------------------------------------------------------------------- /untargeted_data.csv: -------------------------------------------------------------------------------- 1 | ,Cmpd_2,Cmpd_3,Cmpd_4,Cmpd_5,Cmpd_7,Cmpd_10,Cmpd_11,Cmpd_12,Cmpd_13,Cmpd_14,Cmpd_15,Cmpd_16,Cmpd_17,Cmpd_18,Cmpd_19,Cmpd_20,Cmpd_21,Cmpd_22,Cmpd_23,Cmpd_24,Cmpd_25,Cmpd_27,Cmpd_28,Cmpd_29,Cmpd_30,Cmpd_31,Cmpd_32,Cmpd_34,Cmpd_35,Cmpd_36,Cmpd_38,Cmpd_40,Cmpd_41,Cmpd_42,Cmpd_43,Cmpd_44,Cmpd_45,Cmpd_47,Cmpd_48,Cmpd_49,Cmpd_50,Cmpd_52,Cmpd_54,Cmpd_55,Cmpd_56,Cmpd_59,Cmpd_60,Cmpd_61,Cmpd_68,Cmpd_73,Cmpd_75,Cmpd_76,Cmpd_77,Cmpd_78,Cmpd_80,Cmpd_82,Cmpd_83,Cmpd_86,Cmpd_87,Cmpd_89,Cmpd_92,Cmpd_94,Cmpd_95,Cmpd_99,Cmpd_101,Cmpd_105,Cmpd_106,Cmpd_107,Cmpd_109,Cmpd_111,Cmpd_119,Cmpd_120,Cmpd_123,Cmpd_127,Cmpd_130,Cmpd_131,Cmpd_139,Cmpd_141,Cmpd_142,Cmpd_144,Cmpd_146,Cmpd_147,Cmpd_148,Cmpd_150,Cmpd_151,Cmpd_152,Cmpd_153,Cmpd_155,Cmpd_162,Cmpd_169,Cmpd_171,Cmpd_174,Cmpd_180,Cmpd_182,Cmpd_184,Cmpd_188,Cmpd_190,Cmpd_194,Cmpd_196,Cmpd_199,Cmpd_201,Cmpd_202,Cmpd_212,Cmpd_213,Cmpd_222,Cmpd_224,Cmpd_226,Cmpd_230,Cmpd_232,Cmpd_257,Cmpd_267,Cmpd_277 2 | S_1,451606,23576680,8395928,4290993,725363,316024,1038629,7719123,8339645,5040907,29615649,3686649,33991849,1199947,15648443,1893128,4407705,491505,259439,4329582,619205,177025,1702299,182758,15296471,14299882,13017735,2361907,69259,13572115,509285,1667983,400695,7473245,9835878,2114413,1266700,95655,972647,1555681,3738592,1472105,1776117,3954130,206310,12510042,6107514,955395,258666,4558188,1669161,15535400,272104,322794,32142372,103054,136406,97544,382404,727184,4836657,2011636,261298,532418,84099,230681,40180,1159157,20434371,1051193,295103,281821,202733,327194,11684817,2831927,3397090,1148256,2544637,253731,28971,1140014,144461,2268582,36451,9638669,12556523,76282,1490416,3339322,5484977,93680,1423651,29413,34144,17882921,408681,142748,16663357,233709,1588952,749310,418785,317019,854099,433537,23349428,7736207,95295,2935180,39319,4942515 3 | S_2,3030467,29024419,6718542,436017,2235837,1416281,5992464,44123341,51486642,2241881,43218555,1637867,34406555,918629,86777074,16924097,6330014,2292406,7572052,26112970,2407948,2620687,12244529,2819252,71422743,102628762,12245103,36008505,30957053,3355353,16236502,2868876,376373,11262530,2480824,1667124,10485254,110262,805098,800912,25852339,205882,1873810,49134958,175768,10156772,11581700,7309245,2307882,12259581,153059,54566776,691526,74041,4316627,2358329,220481,457773,709344,1787390,6689723,2517318,1149079,1295645,177687,1193926,135727,1842135,41433060,116818,1277154,873377,109272,351239,3666162,18989856,433572,1058780,411049,6190562,107700,1024917,46440,1516466,136968,38175346,85618322,1435551,1825090,157109,546185,231072,209182,210195,40310,1040945,NA,402098,37800237,376725,124257,NA,261730,563506,145049,628141,74430,1715926,NA,68908,519612,5952713 4 | S_3,2008460,28655043,4775583,207973,1292487,1137059,4964721,35546422,50613619,1885775,88870295,2034694,25221665,954626,74527813,17694486,4871571,1991394,6875743,19773485,4447568,2311713,10660333,1983936,72340500,95462851,7925805,56065346,31039573,2611204,5448545,1405981,525829,6262414,1986688,1298779,9924737,96264,889843,773138,25096620,451640,1317071,45287363,199700,30977820,11195501,5069604,2046981,4536138,205763,60259718,352439,170181,8274227,179163,261492,242621,669164,1066022,7539830,1016305,526516,963023,173637,1189873,105792,1940839,41745924,103935,456464,528125,78503,249568,4854372,19466421,235620,848837,1851041,3273204,29501,1025526,37994,943221,64817,36067792,79367412,1734559,4852039,136040,677638,136088,NA,139254,32683,2044297,314170,NA,32840919,209805,115697,41888,732453,480796,103513,1048600,64802,2122727,NA,57356,107751,4894229 5 | S_4,1918709,21549940,4814789,371737,1061385,935640,6270720,23490183,39963323,1371526,32303970,1002233,12096495,667123,80698618,15280855,2516722,1435204,5116201,11638969,968353,2198252,8879061,3221221,56256711,73103498,3385644,30677786,22608053,2301382,2972099,2482784,203061,20547170,1860724,2011601,8373404,76647,464813,1108803,19107019,101125,821361,24721705,107908,8757200,6536503,6756145,3978262,11399760,406368,49230113,1037774,87662,7351813,2388013,255332,175809,266644,1645440,6854064,436603,792432,1053326,407037,1662942,67685,3602555,27509635,108686,297880,395995,66383,154520,2512061,25619893,557922,364899,842389,3623783,125161,946030,34676,1007754,21055,23181439,55913698,1641391,NA,115769,95615,159616,150801,193982,24390,446815,466001,145780,23116900,19519,140315,42958,283177,434035,53340,451453,84772,2449488,37405,NA,309119,13906946 6 | S_5,1773122,20226589,4317341,264041,945112,566402,6030522,8777954,33411431,1228318,24305370,2156949,7652535,675515,61539083,10633431,1321904,1431693,6199090,7110129,5074119,1908685,10140788,1173804,53296161,62063355,2922945,18126853,34060362,2040144,3046229,1964332,354792,9981359,1656897,1469191,9394494,58925,366508,1190808,15438286,335633,358242,18343367,212155,7833038,5508228,5074723,2717034,12754398,298207,43811568,582025,52786,6520973,828469,211000,116793,396525,1409582,6167218,1113893,501974,617037,313414,708330,84587,2527491,37985332,72286,450841,169641,96541,68015,15870685,21830789,861466,302357,1389328,3128587,62572,947354,41875,589805,39210,19483723,36956991,708667,5945629,90839,86654,145705,401381,126973,22276,2760530,289729,111375,29135927,162331,116930,58408,404051,533255,70494,182562,88325,1752119,48113,NA,NA,8934009 7 | S_6,1762716,20139629,4221171,246977,1064454,592697,4635157,13791821,35434072,1374099,21370605,1736008,10198235,787336,58934247,11013897,1804989,1397109,6251924,7887020,7050774,1747305,8225283,1209845,54559961,60277112,4333220,22468036,13870148,1478633,2222788,6399241,379376,6537822,2578906,1053931,7877804,67006,615655,1306434,15445788,481989,1074252,18644319,229239,28989597,6694446,4857931,1722736,41392781,604293,43609221,572492,38372,21731151,70690,550214,81342,599550,1310541,6142709,321117,311131,697909,264346,813220,53171,2885327,8695654,140253,392544,244091,65832,178610,8916798,17076890,675126,220867,466346,3746500,175681,750042,40213,739048,20553,19234903,40106131,1682822,3083466,97739,415896,147462,624378,114838,16043,2226636,NA,362530,66270125,61538,240179,85287,251063,710354,85931,109679,90392,5547881,44484,62869,NA,2773331 8 | S_7,2560132,18950335,5077924,250519,927611,990839,5024756,8291654,34772415,1124929,18006358,989378,6654046,865921,66919482,11755443,1134752,956960,4492617,10485749,2202956,3887810,7895011,2172953,49722768,65778589,2874185,32482687,35172139,3573774,2190650,1835363,440898,7477804,1695400,1794455,6846236,98670,1110497,1171412,16814033,188134,382439,21534202,111647,7402589,6212051,4864086,3526093,18507209,684422,41281975,650907,31678,5333324,6804317,254388,311873,304001,1658657,6825130,1138437,794173,1020653,273921,1126210,87122,2395785,16543340,144908,377238,189427,67532,77917,4356500,21557746,1412806,394181,480525,2016909,1050476,804737,52391,742136,101026,19318043,48142571,504175,974791,89942,322056,197587,205676,129676,31944,656911,546905,123219,31966776,355107,49251,25001,177678,746129,70182,1161693,181768,1656846,NA,NA,386430,2679834 9 | S_8,2393069,22285464,3562682,355140,1340277,979095,6418320,20108758,38331279,1718934,39483338,2532491,16430643,785355,71215032,13564366,2600655,1490187,6834045,8362188,7445812,2356204,9756457,1824329,59874685,65127658,6114287,22047196,33340053,2354295,2859005,3192514,334321,11226977,2715265,1553880,7782372,71183,832301,805622,17838979,416578,1390876,18545391,308323,31862705,6522588,4601819,1870852,17709994,341411,45831154,511274,69100,14968863,216820,617228,191166,460892,1276118,6051660,2289532,203222,639830,286113,938875,34530,2482506,35477791,155828,260112,283041,87965,243277,18239365,18812721,138940,930924,2814454,2677684,104384,937704,68131,1394977,18224,21632152,45059412,1829496,7687649,155301,1146189,186499,28750,160681,12133,3116073,598120,131948,28592584,198562,139115,97990,966052,307113,125154,82700,104375,3700053,48210,66515,NA,9918138 10 | S_9,1472087,17558611,3779994,349289,1563553,571459,3194285,6409859,31960147,1798772,20572744,1926483,10886894,506501,53156132,10469941,1163798,839582,2579063,5366192,3159488,2046736,5549810,945643,35739679,47648546,3436275,13014420,35421635,5728694,2317364,1483297,230124,11107955,5440645,1366375,4578311,68504,868857,1404305,13594687,279693,1098967,8724974,248532,25432803,4665871,3047025,836774,10703658,236007,39574753,745142,41701,99244243,85177,672643,167377,551886,1427418,4189644,3296781,518512,442974,197477,542314,58662,1752991,33357396,364669,130264,250619,248265,186691,15565766,6952155,812246,942678,1825860,1594414,65330,893594,104878,1556727,42796,14497781,32208375,449470,11899869,115473,2290511,188250,313137,98765,29630,2324922,632017,NA,11765577,68077,95281,77711,511747,310107,170558,63836,73376,25843508,NA,NA,NA,7690149 11 | S_10,1919904,23390396,5993740,438346,1399495,770161,7358552,13356062,37479831,1602580,34708056,2742051,13015848,666096,74632557,12844322,1930717,1710264,7280150,11560373,3360242,3388480,12000369,1371567,73318589,71300496,2891967,30590438,42750849,3756127,4100320,962824,374065,12558824,2571056,1941539,7866791,94517,1004474,1255102,19530105,345748,426017,25681760,369447,23263570,7798575,4280331,1850863,3106641,120611,45684025,476297,95967,50125684,1214571,153450,483469,416473,1258528,6970054,6476467,572417,781472,273332,804709,105240,1551996,51093881,99181,344940,207091,NA,77613,8494905,26327238,640453,910492,1953145,2997104,26886,1108113,40806,1898018,38494,27082500,54084848,520178,5271783,96372,256894,164072,269245,113689,36232,3483900,493391,69162,16833314,78699,107679,79272,551091,359260,121299,230044,96663,12530503,NA,NA,62595,3504753 12 | S_11,1726682,19831800,3859858,254836,2057484,788060,4656087,18699876,34410177,2104420,57378307,2428878,13806937,874265,57051716,12051243,2428142,1113403,4316479,11257353,2078441,2550276,7021768,1461593,48012095,59456542,2981568,30816878,28132761,2251510,3148906,1013886,388253,10085008,1955619,1636780,6939883,86368,956022,1136292,17131401,674695,621312,21694696,130846,16446316,6730339,3416228,2098700,5378626,93304,41966378,578757,53800,1930803,744072,82838,256382,251441,1264944,6202609,566959,227827,336634,187660,627630,33986,1406247,54001445,84981,287917,212111,68718,117679,6237287,17113740,1115376,912242,2624000,2742000,23881,1229946,37032,1204976,18118,19587347,44059469,222205,1389842,127923,85578,172082,370459,119914,15838,1341690,NA,NA,21633570,157838,169580,NA,782536,354894,116062,194932,51075,1795578,NA,52435,265148,3944130 13 | S_12,1998137,26274339,8210561,616779,2290100,931005,6830574,32428882,45102481,2905191,57691453,1536442,32182497,679824,70229501,14616624,5142695,2292386,8182000,17688202,4297878,3106114,14851537,2244556,67087136,84366343,5890352,29226586,33560499,4150125,5976728,1474975,92509,14440139,2946384,2758433,14365696,73886,901616,751107,22443930,307201,1147872,28375159,27637,13693123,7156605,10041526,4636177,3008226,76537,50024602,1375379,74153,3872807,3801383,297668,377964,619047,2289351,6017305,1291666,637628,1136762,384050,977531,122587,4308509,50906183,127084,1256809,298982,98838,194326,6013127,25490681,556504,508964,1222990,5291378,13300,1216352,38380,1094556,34946,30097981,68077743,988157,2408985,207062,437428,154931,154055,306203,32919,1675219,382240,121768,32413095,20843,173494,82544,473454,331427,165988,187760,58471,1373462,NA,75734,634032,2587909 14 | S_13,1983463,33721858,5274626,230284,2880376,1347169,5818748,61399914,59629755,2178570,79509467,1704911,47586155,728047,88784802,20759339,10020065,3007326,9432473,28837652,3353862,2899963,14999470,2858417,81246912,114327456,7937738,35337182,18173246,1852989,12997837,2203204,365614,4702578,2082918,1567157,12802993,82377,862207,769276,29186256,280839,1335908,61385604,171728,44322381,12818036,9565339,2288453,12086107,258687,67704635,716926,248927,2089187,263403,260749,427751,491937,1913550,7023666,NA,335842,1176160,211371,1117514,56103,4090754,46838532,90464,692644,499344,90665,257031,5214784,35556680,332484,654855,2775975,5014182,165329,1091209,32264,933919,40027,43101036,96324864,3829746,2667263,260185,361545,161566,NA,287084,40276,1592930,291932,208974,35954884,21374,171565,50581,921056,795010,NA,1460269,NA,2635560,39720,100717,203185,10335281 15 | S_14,2627819,18469523,4631535,250471,938249,1113557,5438493,13067134,34924072,1275569,15453387,1200555,5562376,1065410,77905579,12376667,1065445,768104,2907791,11873543,1419004,3274109,7364844,2488097,48202266,73520793,2035510,55348905,36906071,4469467,2336502,705954,391712,15729468,1845217,2032584,5254813,116676,661452,1089207,17752255,125216,491527,24230002,135994,4802424,7288312,4682310,529629,5076899,99478,39973359,769121,156401,1308727,8000724,97536,321030,235020,1231942,7698421,1379777,551473,1032552,287607,1286699,70920,1533311,26149993,129335,353380,201936,NA,100496,4797717,11534829,795529,472556,NA,1135308,29323,869724,48442,1025319,78130,21109957,55493508,314135,NA,69213,65752,181690,277620,98860,35737,NA,420424,131949,23259207,335420,64120,29637,157648,264385,90172,NA,73639,NA,35727,NA,865840,3910950 16 | S_15,2935258,21712694,10078799,426488,2338317,1135337,4660913,13536183,37671653,1534874,42815265,900113,15771822,765068,67400582,12163759,2003503,1363473,4479388,15851116,3440173,3364146,9851132,2302424,60541275,77276028,4788958,49090420,33868939,8042224,8470121,691435,386761,12018374,2811781,1887718,7139610,100106,553532,859171,20702308,182904,666851,30885460,160524,6443472,8267500,5721922,2571380,2173703,52625,42020910,767957,17920,7252047,5564436,62928,563493,517278,1995785,6702513,4467893,743287,1252178,189999,753306,117426,1949741,57261151,206156,854803,227772,NA,121898,7303433,18862500,1560154,731927,531947,2986947,46505,1318716,40960,1212141,110852,25786223,61042248,1043366,2140870,131655,804144,213334,691340,177325,50026,2020024,484651,193208,585791,330646,133067,NA,297841,337033,240794,213645,51009,2248852,NA,57978,591310,1846131 17 | S_16,3156567,32775134,5032558,552057,1686903,1578827,9513198,37164766,56356214,1688372,38737625,1533330,33105398,973448,105715251,19802191,5011074,2751630,13617430,31808343,1885683,1603603,15225052,4458753,92699661,116870224,10648119,32295300,13885011,1891695,16005572,4950964,421200,13891910,1921120,1382934,13563757,115546,397650,696375,30170060,353386,1680930,67177543,141452,12392296,13403047,10205318,2437129,19566976,196400,59849966,680977,110253,1009882,774439,349699,599716,540509,2016847,7296312,74129326,10602690,1151751,150316,1253907,1711353,2261464,43998906,68525,680074,666895,58574,310810,5084937,21457391,243706,NA,343701,12606021,64032,1037453,38089,1641553,224304,39796477,80063228,1900611,483970,250216,102716,198192,56562,195153,27454,255659,259217,346522,59418525,448110,208619,26468,188933,240898,94226,NA,59840,NA,51583,119501,310160,7497339 18 | S_17,1926092,20543198,3786502,197043,1833060,640257,4536377,11506814,35996923,1750563,65101430,3265183,9531373,698748,61117795,11705826,1136501,1295896,4282297,6847833,5376964,2898647,8534669,1112980,49075472,61229516,2913886,26829807,29155288,2318427,2941617,654490,290829,4369884,2127265,1247201,6741997,67748,547985,1253056,16278837,544416,412352,13336459,271722,53041268,5996329,3749053,1128177,6809789,337612,44008700,611950,133584,4115403,118496,214604,217101,493812,1598551,5793532,1462134,406302,356262,167114,542219,56247,2041793,45583878,76461,286690,114319,135334,77248,14377130,14232363,1029603,467094,3917870,1905207,245752,1069521,38493,841370,36931,20142296,41526620,533453,8160840,68647,407312,194096,244295,103539,19616,1762152,261613,51492,19937013,62702,77658,34462,1374193,454364,166135,430043,69446,1068645,NA,NA,NA,11272550 19 | S_18,2497807,27207027,5992133,273013,2700143,1149955,4343033,33340712,47246040,2539445,78511540,2544051,33193139,936356,72725507,15695934,6395546,1897719,5130208,20943590,3888982,3022319,10889225,2241092,64892619,87359694,7469857,38958856,30168018,5758013,6888933,1535646,255856,3805537,2234641,1554937,8057925,94015,861980,766811,22970053,413934,1345439,35954897,215496,39111808,13486952,5130642,1619768,10122285,252289,53979131,830336,167970,3556346,97401,342885,415446,611064,1619269,5789332,765047,396134,889158,170782,700290,60519,2160893,58913205,138353,388420,373258,85841,246840,6302542,15588512,443729,1025487,2939307,5007387,209567,1291961,45827,1144338,42764,32976011,73294246,1490671,3873170,234892,943861,204254,212000,129347,18621,1569389,402714,143600,38832956,78400,159250,NA,896674,478983,161562,836087,73186,1063053,NA,104599,241307,10930165 20 | S_19,1669901,19614240,4904610,540793,2327708,700752,3679612,25531558,35456198,1853951,26493460,1904265,23056782,690479,63660255,11141428,2477445,1390996,2709815,14872620,5014999,1025165,7378917,1279477,41412877,67217267,11543472,33642669,15557112,5374532,5362441,2251514,349908,2480871,2362413,1207810,5826019,95164,486526,1001834,16161667,242116,1072273,29595667,312890,8299946,8933064,3635994,965530,8493161,176121,38067378,500533,65232,83413302,75683,219121,239415,605555,913971,6629382,3603624,611851,668992,124480,636927,109635,1310990,39225148,199490,715413,472238,132968,191178,6293809,8390928,1119615,894915,495143,3808207,38782,952479,58585,1923876,54145,26240875,55778533,1651124,2122697,126879,1200367,123134,308869,73220,27862,962062,379043,236179,515675,202888,99298,32549,189055,234408,107266,407293,40214,22444866,NA,70176,198264,3415210 21 | S_20,1428616,17229365,4097811,183816,1645551,586371,3595757,6948126,30820521,2076782,40825639,3022564,17328461,588119,58534159,9828603,1192371,991599,2278864,8705055,2358784,1320178,6142170,885537,28601951,50839990,9890023,21358666,24140726,10777190,1986326,1162513,236140,1816595,2822413,970574,4808192,80601,776062,1412252,12360172,381984,463666,14375677,281168,12248664,7097672,3179962,414248,4033205,223498,36253744,650847,84825,83299507,125777,300325,282127,431905,1220577,5412741,2066714,987397,293095,126655,417443,107818,1332490,26115273,258773,579283,319961,141802,82663,6222520,6256235,2339573,1035486,1186914,664279,20099,798186,67914,2502170,92124,17509436,36489023,164025,1448631,99186,2031582,178703,360729,85110,43102,753286,297638,127557,9966052,547039,108787,105504,365314,389779,205968,202066,51779,21570314,262264,63557,39932,3000035 22 | S_21,1872543,22138961,6740227,409954,2386563,718140,4846008,16249695,31634359,2128310,33391500,2741688,8559930,718894,61019697,9928547,2027117,1161600,4035140,6684036,9410527,2141219,7244844,1177037,41160388,53354514,2962524,21444606,40331043,4216837,2641750,1651992,439146,7816758,2913876,1240270,5453356,67379,500601,1186296,13479370,449796,922389,14889292,543297,19660555,6875505,2778230,1953381,10412285,282588,42420831,457969,19685,38894807,77763,306550,217038,382763,1151215,5936386,1561074,266500,439630,136533,689228,50186,1271875,23546496,129834,143459,358773,201352,166248,40844741,10835817,204672,653959,2525102,4461585,54095,751344,64928,1241209,24270,19879719,36367874,614302,6547141,188959,525710,184067,253748,66629,17067,7256119,365199,161681,11938431,80169,166111,109714,876585,183311,NA,124870,NA,9669794,48455,125945,NA,8026511 23 | S_22,1901938,41467087,10580551,1844209,2469248,1377171,11268669,90981324,66808919,3620987,92370934,1693877,44828995,1052444,104753500,23392169,7028189,5305104,31118237,36052496,4343502,1325861,27480650,3850312,89893855,128875504,11035224,15196943,19477205,15895881,10922409,8735042,48589,3167413,4596352,4865813,21710222,69700,735142,858910,32031683,1381664,3024382,36017234,63343,31313457,8371547,23784645,2670184,28259058,776512,72505663,1939802,164478,139017444,94680,972391,570508,1004490,3057502,6503698,8227726,1161821,1135819,466827,950814,239241,11202269,24859083,205842,2145077,1637423,138041,479127,5706591,23953442,906675,594416,1350224,9650794,80144,876474,76465,1118756,100259,49316973,105650648,6121471,2089724,392721,578962,198305,193874,801861,41969,1554674,441638,1471712,39910563,332365,278768,204189,415695,546950,110948,144879,41918,38746437,NA,205551,NA,15836288 24 | S_23,1655502,18842736,4447652,217989,1460984,638123,4073947,5505768,32079376,2375285,69334907,2565555,10945868,669463,50873340,9764451,616798,1263334,3684123,5147519,7991611,1632356,7664918,1095884,32732757,53367810,6193815,23491903,28411559,10970174,1302309,1479713,357972,1982192,4340430,1209345,7306129,51658,414607,1121635,11618453,794956,873171,10781634,325339,29253584,4920343,4435974,1229117,7202679,262489,41517075,637529,20150,100121367,152936,307497,260277,629098,1613833,6276844,2432153,462294,586855,177391,531825,102170,2732961,9096251,181475,625596,124730,436051,148433,28430674,9232651,1921508,988073,903081,809424,20611,490902,118231,708616,53967,16402705,37231467,NA,2299427,75664,312156,174021,213296,163207,35621,1882485,297395,191377,5093982,223926,242312,112216,324817,465186,200349,137994,74457,27783262,57400,47917,NA,3409458 25 | S_24,1606156,16975943,3535572,351780,2146074,558646,3889661,6388891,31025385,2355985,64331907,2815546,8178396,710614,55355488,9543748,1115174,1009616,3334946,4797521,13921597,1741051,6548014,865235,36361061,50149709,3472740,22905881,23293437,5676432,1288447,2030697,253998,4227466,3882988,971624,5441067,47386,430271,1049583,11821016,802161,458605,10240359,648722,24943746,5934790,3582012,729086,17385873,475516,40579440,640792,25769,30972128,105374,276805,211360,311370,1228227,5976446,1296938,294701,255459,153413,512354,55274,2276369,14704161,213236,193532,103412,263199,90935,48080170,8059959,720320,525300,1378331,2002042,120780,673147,81228,874952,22795,15470466,33132324,450825,1446631,125198,383522,166447,564442,106827,100836,3730484,300435,94085,16360300,84228,132721,115064,600832,220673,NA,NA,107177,7726858,NA,90323,NA,13819442 26 | S_25,1829985,16223858,7435423,160459,1612389,667629,3504690,9363181,28042055,1736995,35659272,2375247,13453694,525588,57494982,8838454,1251782,1064502,2568957,12213092,2785308,1382298,6577676,1316544,28425534,57694837,7843865,26667684,22708858,7101486,2469866,1645236,357294,2594787,2695841,1157388,5979943,83930,371738,1321489,12375528,181685,607795,20905983,184519,19485283,6896693,4553038,1266268,7668660,262650,32379561,608348,61973,24026546,1554348,262703,287462,731585,1744307,6451324,2143453,1383930,637820,107848,699812,210627,1732241,23766647,139677,1141576,395223,100243,108098,5152434,9345121,2927894,749264,440435,1032065,29185,787947,59231,1354833,118221,20793209,47590606,451652,892359,95687,1087266,182428,740446,142027,51105,692078,292354,441037,17036320,205020,111451,47587,140246,818674,135697,173709,NA,6236880,65207,53172,167728,4734636 27 | S_26,1664321,19359918,5490828,176994,1610449,781887,4539648,17249727,33010849,2028633,46981678,2345936,25663830,590281,61659216,10860110,1941845,1569799,3887556,11879898,4081992,1103735,7813585,1109456,39237649,60450371,14448308,29272323,13728455,10858811,3324523,3236316,403740,3263291,2987062,1166584,6921405,85567,274422,1275506,14129078,422734,997809,21983438,217036,12512756,9138112,4304418,NA,13653636,320925,37428550,648695,51160,62582599,165272,337534,431366,659482,2015368,6090247,2802680,1103038,1712287,83811,654561,174973,1854150,32085131,431599,1125666,332444,132831,172558,8193982,12120459,1315501,1255016,453606,1071722,31493,876604,93103,2174157,108569,23089049,47612818,914827,1950454,122852,2204344,194134,764041,168693,52047,1352972,307466,253776,22227960,129832,106959,65974,146821,360879,458585,174268,NA,16908735,NA,60803,137303,4954613 28 | S_27,1594675,16995510,4253666,183354,1396334,676956,4384579,7144414,29447080,1737668,57568999,2634331,9481660,640613,58502201,10622949,754194,1086413,3199734,5550998,6153237,1451870,6431824,805352,27642281,48209471,5351992,18976727,24871178,2935454,1325492,1394553,302908,1349101,2345309,1048506,5908219,69101,864011,1110649,10841943,473950,470780,11119697,323445,17241940,6015735,3074838,NA,6673979,201841,38756206,631477,23537,16376773,102556,239920,195318,482495,1461759,5713605,2558214,678281,NA,118957,459741,103476,1719372,7725504,64127,961838,262684,203094,89002,21459869,8039201,1398914,554709,476062,636681,19101,439823,70268,750862,55924,16337800,33453569,357988,1166322,57974,274816,201745,235196,88360,31153,948188,224946,119069,5123838,83562,263267,70324,172324,553116,76224,93432,53345,4266042,NA,46685,NA,3071567 29 | S_28,1527608,21551930,3058839,304790,1414641,745012,4444426,16529394,37903679,1627918,37842159,1824684,19893271,802431,70912439,13643435,1711724,1206177,4474293,10980145,3694846,776717,6997833,1064796,41886773,62871639,5885005,30172513,3702378,4272720,1972970,5040914,221377,4862071,10337053,1265047,6123732,78723,867557,1494556,17443371,591381,691086,18336017,264355,25205828,10027408,3452096,264572,22306297,409521,45470731,912095,124522,165486239,44670,760162,267441,517838,1572205,5863278,652295,436881,392952,126077,642018,66289,2061748,23404322,406745,312399,253587,86456,128730,7366087,8931602,241693,398835,1070732,1925679,52613,786731,145790,1451215,36022,20710158,47044139,1265473,798940,239203,1401650,184203,81298,73722,18035,1091443,351962,117684,36230549,237980,163833,110973,330041,445057,118893,283187,63917,47739549,36758,171400,NA,9571739 30 | S_29,1805069,17372989,3540625,257759,1097786,578690,3259404,7465029,30775419,1697656,42758987,3826733,7481273,831783,47773433,9246794,1149632,1013301,3442757,5971273,6221162,1220713,6031612,1145767,31939864,51600785,2440532,25312751,25152372,2099352,1587024,1578336,349469,5525895,1894845,1149189,6016357,79407,838207,1317013,12467461,624803,569150,13579093,317918,23275613,5828003,2841694,1095514,11568747,352364,38430744,613008,33356,1681773,130502,142864,138123,291190,991638,6588284,584707,245505,589445,133088,606315,42452,1637943,24117178,78541,507785,137320,147675,101228,14825644,9594304,870135,522945,1328337,1638251,91583,766001,33754,739513,9704,15239764,35017366,479364,3764943,102430,265536,149175,312440,79191,11654,1256701,NA,60919,13701489,29748,112791,35150,462293,192889,NA,109683,74013,556212,40901,67628,91124,11979943 31 | S_30,2144191,19275210,13154891,1695629,1930804,949022,6880594,28079438,30264256,3982951,20370742,1649479,17687548,655227,57117390,10049385,2014064,1995090,6062105,4040915,3714572,1665602,13928484,1735962,32049963,52168076,6562319,1597621,29244549,3175981,6035834,2166344,97516,10907185,3235648,10732556,13466082,76433,878752,1357880,12703123,565988,1279524,2715055,113969,2296757,1497724,13765951,2079734,7101014,199237,36149428,2740334,15727,26309769,6838624,285639,290379,329226,3182943,3244871,1749445,1124911,556385,616964,376529,144978,6865835,35389642,137184,2018372,501489,NA,178817,5769227,16978362,1761762,230025,739924,3179617,40507,1189702,47452,2114384,62313,16571046,33722949,421299,3031230,NA,335870,220036,567298,501869,93096,2070609,531375,304225,11005741,111301,137780,102955,248794,1375621,89571,622811,56345,7295314,NA,NA,630232,5289474 32 | S_31,1741485,32410775,5732365,210084,2314478,1370050,5055904,74550533,56668484,2036899,41349573,1749240,33693095,1078463,85210900,20488521,4004872,2352115,8216167,26833227,4806216,1692728,11680291,3022454,76849415,109707087,8245478,48286328,23427811,3268306,14788107,12225466,346378,11021512,2240590,1248140,11148319,104094,680943,820579,27344070,880630,1592847,59370915,165469,25011393,12746085,6421232,1451121,25630537,284212,58479241,589497,82995,12314507,488281,1436591,245632,845103,1318803,7438259,2003453,910169,1509559,111924,845121,142037,2274998,29383870,NA,1728011,865352,94278,290635,3311474,28185099,724784,1227718,1092847,11631049,51955,855900,66450,882973,82473,43759831,93606030,3871492,2568710,192050,142665,170736,451076,254298,34235,1523074,224349,554754,41722022,100231,143169,34235,299250,590981,NA,400770,NA,3371419,50668,107035,NA,5980668 33 | S_32,1356496,25859648,5563156,271823,1474797,784275,5023523,29747451,42455483,2015780,26142831,1763583,18304769,631543,69953015,14761162,2622836,2141677,5526462,11691075,5158722,1069686,10142955,1757955,50634529,77261350,3774477,35579310,17620500,2657378,3994346,4699488,79049,4812962,2352211,1497051,10775756,58565,362356,778875,19237312,606824,797870,23180127,131776,24644171,6250167,7204222,1473250,18182239,337774,53684945,1203543,75218,12561250,105560,450574,205005,429807,2133171,6619403,1077275,479275,807906,263346,649416,92687,4367918,18876106,NA,1008615,442505,65864,132485,5864070,15276847,327722,416690,869261,4377926,56029,711543,37500,644186,40020,27997627,58977051,2242171,2310875,125251,161336,153438,55582,201141,24248,1796036,193598,243905,28265891,62519,103975,69657,215990,316784,71389,192621,53515,3358540,NA,81866,NA,9881417 34 | S_33,1427305,9402421,8083460,139015,1339214,359907,2448797,5576309,15203550,781348,11581647,835513,8662498,419803,23738684,3571943,762022,852668,1587596,4872686,2949107,661858,5271758,646868,13422243,30406754,3208079,14035463,6274768,4190559,1367201,2098906,361886,1695544,2352996,1069351,4867304,65102,294816,1341225,6645971,78533,340684,9823082,105584,6998135,3623703,3362993,646151,8072472,148818,19610635,594306,44013,37634254,892236,118729,198331,492372,1381495,4840795,1674815,835061,507093,134883,275916,159719,1513883,28512371,92850,1718256,255154,80844,50721,7091125,6178844,1676021,253469,417293,1471919,22601,785702,61450,814663,66625,9948558,22346515,1063861,3488897,49785,557313,113727,304559,67247,43874,527251,181347,275650,15848567,56337,79830,19280,102956,192887,82968,NA,36889,10188261,44352,NA,77652,3761433 35 | S_34,1354744,12822972,4682078,148574,1429245,497235,2628319,11623732,20262005,1447967,23918978,1600800,8749188,529912,31146180,5475798,1027565,972044,2113176,5168312,2105354,1410985,5242914,974185,20011454,34974739,2373931,18752837,24977268,1901507,2381929,1368541,488870,3331174,1858021,1143743,4859225,60923,419987,1328789,8755633,304917,409169,10143784,89045,17416561,4104021,2907902,993630,3940622,150677,26095144,379729,57328,18494706,77828,137421,157801,283530,1080357,5527208,1108950,353963,272836,115010,296217,52675,1546193,25364258,64049,447560,182940,50896,71440,5764866,6570596,1334340,524839,717946,2252770,45383,770413,45546,514762,24207,12924297,25421669,326425,969982,94610,176159,124518,394558,92281,18100,811603,186163,155183,10320608,96517,97503,31931,199859,574871,69726,72460,108013,4868665,NA,51539,NA,4417599 36 | S_35,1830964,17469374,5532110,199139,1013817,585344,4762377,20702548,30025214,1678633,30350428,1759776,13502199,765590,59283968,9800655,1518204,1386616,4089512,7938895,3267071,940164,7280343,1237644,34964939,53928423,4817586,21108736,13159710,1794409,3221717,3015100,409326,4638545,1635923,1174427,8379722,83987,439070,1480433,13151972,472645,822285,17672670,154664,15778752,5923269,4221019,830658,23624521,920099,35107056,752091,55285,10715179,345569,334642,169175,313830,1934782,6187043,718768,807731,581192,120844,717768,110241,2226505,28142772,49284,720789,390810,53999,147182,4236728,15550088,298833,331445,509949,3636148,560533,854705,42205,678751,45526,20140778,38712806,1681077,2567087,111152,125872,190952,NA,100332,22464,902161,189941,241412,30065501,75323,107474,34823,176335,646284,57258,170767,118625,2876194,NA,67644,137103,34565002 37 | S_36,1238782,23815462,4899110,207004,1635908,902186,4435444,42061486,41808675,1804841,27106396,1387668,19712077,752543,74143338,14859453,2013008,1815777,5015504,13056589,3069776,889472,8100686,1873361,49931915,76356337,6176968,30499073,7668540,9149519,4291797,4891068,340448,4927832,1748218,1001457,8574486,81130,350444,972262,18551862,580385,1565130,25953292,170061,17542652,7822486,5129656,218302,15996654,210735,48170584,577198,25432,33361515,39048,414312,185834,511530,1229473,6149747,NA,308682,535323,101734,845150,35139,2426139,12791723,138782,1730131,656264,67467,272996,3268308,17833326,228127,707224,171475,6412489,54207,583808,56659,621113,35324,28848449,58691133,2209260,1718235,115270,209704,194871,NA,202534,14305,1100952,NA,401703,13870867,129121,102366,28934,76404,1042371,68937,196049,NA,8621675,42527,59597,107996,4784790 38 | S_37,1823761,24771183,5308982,212982,1125479,1007882,5609255,45426395,47329922,1793194,24751206,2133923,17852927,690281,82456655,16742315,2329398,2093759,4707501,16808642,3117121,1317413,9634511,1664197,53064047,84500369,9780018,44012486,23029337,1964256,3653264,2052823,297681,3900680,1870749,1295382,9043763,91655,474749,827077,21543190,301972,1384270,35176866,261198,11526386,9194830,4916037,962140,8872202,127191,52040944,820323,52340,6001984,219113,320869,280356,317813,1581524,5887876,1303519,599210,822314,141706,981267,87164,1957195,24292497,75500,1144107,847964,93181,238955,4786616,11358428,247912,869992,436923,4636381,34800,740761,51193,1628580,50863,31780495,67882293,3719240,1526794,NA,393402,197503,50494,133357,30053,1105967,253243,505028,13341339,60003,148866,40466,188723,295784,77566,152077,NA,1652145,NA,58648,338703,4933904 --------------------------------------------------------------------------------