├── .Rbuildignore ├── .gitattributes ├── .github ├── .gitignore └── workflows │ └── check-standard.yaml ├── .gitignore ├── .travis.yml ├── DESCRIPTION ├── NAMESPACE ├── R ├── apply_tmle_to_validation.R ├── cleanup-latex.R ├── compile-results.R ├── create_cv_folds.R ├── estimate_pooled_results.R ├── estimate_tmle.R ├── estimate_tmle2.R ├── exportLatex.R ├── factors_to_indicators.R ├── globals.R ├── length-unique.R ├── max_sqr.R ├── plot-var.R ├── print.varimpact.R ├── process-factors.R ├── process-numerics.R ├── quantiles_equivalent.R ├── reduce_dimensions.R ├── restrict_by_quantiles.R ├── results-by-level.R ├── separate_factors_numerics.R ├── sum_na.R ├── tmle_bound.R ├── tmle_estimate_g.R ├── tmle_estimate_q.R ├── tmle_init_stage1.R ├── varimpact.R ├── vim-factors.R └── vim-numerics.R ├── README.md ├── TODO.txt ├── appveyor.yml ├── images ├── README-example_1-1.png └── README-example_5-1.png ├── man ├── cleanup_latex_files.Rd ├── create_cv_folds.Rd ├── estimate_tmle.Rd ├── estimate_tmle2.Rd ├── exportLatex.Rd ├── factors_to_indicators.Rd ├── max_sqr.Rd ├── plot_var.Rd ├── print.varimpact.Rd ├── quantiles_equivalent.Rd ├── reduce_dimensions.Rd ├── restrict_by_quantiles.Rd ├── results_by_level.Rd ├── separate_factors_numerics.Rd ├── sum_na.Rd ├── tmle_estimate_q.Rd └── varimpact.Rd ├── readme.Rmd └── tests ├── testthat.R └── testthat ├── test-estimate_tmle2.R ├── test-exportLatex.R ├── test-factorsToIndicators.R ├── test-reduce_dimensions.R ├── test-varimpact-breastcancer.R └── test-varimpact.R /.Rbuildignore: -------------------------------------------------------------------------------- 1 | # RStudio files. 2 | ^.*\.Rproj$ 3 | ^\.Rproj\.user$ 4 | 5 | # Automated testing. 6 | ^\.travis\.yml$ 7 | ^appveyor\.yml$ 8 | 9 | # Misc. 10 | ^TODO.txt$ 11 | ^images$ 12 | 13 | # Github files. 14 | ^readme.Rmd$ 15 | ^\.github$ 16 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Set the default behavior, in case people don't have core.autocrlf set. 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/check-standard.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: R-CMD-check 10 | 11 | jobs: 12 | R-CMD-check: 13 | runs-on: ${{ matrix.config.os }} 14 | 15 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 16 | 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | config: 21 | - {os: macOS-latest, r: 'release'} 22 | - {os: windows-latest, r: 'release'} 23 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 24 | - {os: ubuntu-latest, r: 'release'} 25 | 26 | env: 27 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 28 | R_KEEP_PKG_SOURCE: yes 29 | 30 | steps: 31 | - uses: actions/checkout@v2 32 | 33 | - uses: r-lib/actions/setup-pandoc@v2 34 | 35 | - uses: r-lib/actions/setup-r@v2 36 | with: 37 | r-version: ${{ matrix.config.r }} 38 | http-user-agent: ${{ matrix.config.http-user-agent }} 39 | use-public-rspm: true 40 | 41 | - uses: r-lib/actions/setup-r-dependencies@v2 42 | with: 43 | extra-packages: any::rcmdcheck 44 | needs: check 45 | 46 | - uses: r-lib/actions/check-r-package@v2 47 | with: 48 | upload-snapshots: true 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .DS_Store 5 | *.Rproj 6 | results 7 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Docs: https://docs.travis-ci.com/user/languages/r/ 2 | # Validate your .travis.yml file at http://lint.travis-ci.org/ 3 | 4 | language: r 5 | cache: packages 6 | sudo: false 7 | 8 | # Install nlopt package to help nloptr install. 9 | addons: 10 | apt: 11 | packages: 12 | - libnlopt-dev 13 | 14 | #r_github_packages: 15 | # - jimhester/covr 16 | 17 | r_packages: 18 | - covr 19 | 20 | bioc_packages: 21 | - hopach 22 | - multtest 23 | 24 | # Test on 3 versions of R. 25 | r: 26 | - release 27 | - devel 28 | # glmnet no longer works on R 3.5 :/ 29 | #- oldrel 30 | 31 | env: 32 | global: 33 | # Allow build to continue even if a suggested package cannot be installed. 34 | - _R_CHECK_FORCE_SUGGESTS_=false 35 | 36 | matrix: 37 | # Report build completion status once non-"allow_failures" builds are done. 38 | fast_finish: true 39 | # Allow failures on OSX. 40 | allow_failures: 41 | - os: osx 42 | 43 | # Allow build to take up to 40 minutes without returning output (esp. during example testing) 44 | # Without this section, travis build will fail to lack of output for 10 minutes. 45 | script: 46 | - | 47 | R CMD build . 48 | travis_wait 40 R CMD check varimpact*.tar.gz 49 | 50 | # code coverage testing is timing out after 10 minutes, also need to 51 | # enable travis_wait for this. 52 | #after_success: 53 | # Only check test coverage for R = release and OS = linux. 54 | # - test ${TRAVIS_R_VERSION_STRING} == "release" && test ${TRAVIS_OS_NAME} == "linux" && Rscript -e 'covr::codecov()' 55 | 56 | # Get error logs in case of failure. 
57 | after_failure: 58 | find *Rcheck -name '*.fail' -print -exec cat '{}' \; 59 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: varimpact 2 | Type: Package 3 | Title: Variable Importance Estimation Using Targeted Causal Inference (TMLE) 4 | Version: 1.3.0-9006 5 | Date: 2022-06-21 6 | Authors@R: c( 7 | person("Alan", "Hubbard", email = "hubbard@berkeley.edu", 8 | role = c("aut"), 9 | comment = c(ORCID = "0000-0002-3769-0127")), 10 | person("Chris", "Kennedy", email = "chrisken@gmail.com", 11 | role = c("aut", "cre"), 12 | comment = c(ORCID = "0000-0001-7444-2766"))) 13 | URL: http://github.com/ck37/varimpact 14 | BugReports: https://github.com/ck37/varimpact/issues 15 | Description: Automated data processing and variable importance using data- 16 | adaptive target parameter methods combined with TMLE. Returns an ordered 17 | variable list of importance. 18 | License: GPL (>= 2) 19 | Encoding: UTF-8 20 | Depends: 21 | SuperLearner 22 | Imports: 23 | arules, 24 | caret, 25 | cvTools, 26 | dplyr, 27 | future, 28 | future.apply, 29 | ggplot2, 30 | glmnet, 31 | histogram, 32 | hopach, 33 | magrittr, 34 | MASS, 35 | modeest, 36 | multtest, 37 | RANN, 38 | tmle, 39 | xtable 40 | biocViews: 41 | Suggests: 42 | doSNOW, 43 | foreach, 44 | mlbench, 45 | RhpcBLASctl, 46 | snow, 47 | testthat 48 | RoxygenNote: 7.3.2 49 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(print,varimpact) 4 | export(.bound) 5 | export(cleanup_latex_files) 6 | export(estimate_tmle2) 7 | export(exportLatex) 8 | export(factors_to_indicators) 9 | export(max_sqr) 10 | export(plot_var) 11 | export(quantiles_equivalent) 12 | export(reduce_dimensions) 13 | export(restrict_by_quantiles) 14 | export(separate_factors_numerics) 15 | export(tmle_estimate_q) 16 | export(varimpact) 17 | import(ggplot2) 18 | importFrom(SuperLearner,All) 19 | importFrom(cvTools,cvFolds) 20 | importFrom(dplyr,first) 21 | importFrom(dplyr,funs) 22 | importFrom(dplyr,group_by) 23 | importFrom(dplyr,mutate) 24 | importFrom(dplyr,select) 25 | importFrom(dplyr,summarize_all) 26 | importFrom(hopach,distancematrix) 27 | importFrom(hopach,hopach) 28 | importFrom(magrittr,"%>%") 29 | importFrom(modeest,mlv) 30 | importFrom(stats,as.formula) 31 | importFrom(stats,binomial) 32 | importFrom(stats,coef) 33 | importFrom(stats,cor) 34 | importFrom(stats,glm) 35 | importFrom(stats,model.matrix) 36 | importFrom(stats,na.omit) 37 | importFrom(stats,plogis) 38 | importFrom(stats,pnorm) 39 | importFrom(stats,poisson) 40 | importFrom(stats,predict) 41 | importFrom(stats,qlogis) 42 | importFrom(stats,quantile) 43 | importFrom(stats,var) 44 | importFrom(tmle,tmle) 45 | importFrom(utils,packageDescription) 46 | -------------------------------------------------------------------------------- /R/apply_tmle_to_validation.R: -------------------------------------------------------------------------------- 1 | apply_tmle_to_validation = 2 | function(Y, 3 | A, 4 | W, 5 | family, 6 | delta = rep(1, length(Y)), 7 | tmle, 8 | id = 1:length(Y), 9 | verbose = FALSE) { 10 | 11 | ########### 12 | # Transform Y to Y_star, needed for later fluctuation step. 
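# In short, for the logistic fluctuation the outcome must lie in [0, 1], so a
# continuous Y is rescaled with the Qbounds learned on the training split. A minimal
# sketch with illustrative names (the actual code below uses tmle$Qbounds):
#   y_star <- (y - qbounds[1]) / (qbounds[2] - qbounds[1])
# For a binary Y the bounds are already c(0, 1), so this mapping leaves Y unchanged.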
13 | Y_star = Y 14 | if (tmle$map_to_ystar) { 15 | # Use the Qbounds from the full range of Y, 16 | # not the tmle$ab that is based only on training data. 17 | # Y_star = (Y_star - tmle$ab[1]) / diff(tmle$ab) 18 | # If Y is binary this will have no effect as bounds & outcome are already {0, 1}. 19 | Y_star = (Y_star - tmle$Qbounds[1]) / diff(tmle$Qbounds) 20 | 21 | if (verbose) { 22 | cat("Mapped Y to Y_star using Qbounds:", tmle$Qbounds, "\n") 23 | cat("New Y_star range:", range(Y_star), "\n") 24 | } 25 | } 26 | 27 | if (max(Y_star) > 1 | min(Y_star) < 0) { 28 | cat("Error on Y_star's range for logistic fluctuation\n") 29 | cat("Y_star distribution:\n") 30 | print(summary(Y_star)) 31 | cat("Qbounds:", tmle$Qbounds, "\n") 32 | cat("Stage1 Qbounds:", tmle$stage1_Qbounds, "\n") 33 | cat("Stage1 ab:", tmle$ab, "\n") 34 | cat("Validation Y range:", range(Y), "\n") 35 | stop("Y values must be 0 <= y <= 1") 36 | } 37 | 38 | # We only include this because TMLE functions use Z. 39 | Z = rep(0, length(Y)) 40 | 41 | q_df = data.frame(Z, A = 1, W) 42 | 43 | # Predict Q(1, W) 44 | tryCatch({ 45 | sl_pred = predict(tmle$q_model, q_df, onlySL = T) 46 | Q_hat = sl_pred$pred 47 | }, error = function(e) { 48 | print(e) 49 | print(tmle$q_model) 50 | #browser() 51 | stop("apply_tmle_to_validation() failed during prediction of Q(1, W).") 52 | }) 53 | 54 | if (verbose) cat("Bounding Q_hat to", tmle$stage1_Qbounds, "\n") 55 | Q_hat = .bound(Q_hat, tmle$stage1_Qbounds) 56 | 57 | if (min(Q_hat) < 0 || max(Q_hat) > 1) { 58 | cat("Error: predicted Q_hat outside of [0, 1] bounds.\n") 59 | #browser() 60 | } 61 | 62 | # Predict g 63 | tryCatch({ 64 | # Check specifically for a g_model that doesn't exist. 65 | if (is.null(tmle$g_model)) { 66 | stop("tmle$g_model has class = NULL") 67 | } 68 | sl_pred = predict(tmle$g_model, W, type = "response", onlySL = TRUE) 69 | g1W_hat = sl_pred$pred 70 | }, error = function(e) { 71 | print(e) 72 | print(tmle$g_model) 73 | #browser() 74 | stop("apply_tmle_to_validation() failed during prediction of g.") 75 | }) 76 | 77 | if (verbose) cat("Current range of g1W on test:", range(g1W_hat), "\n") 78 | 79 | # Truncate g1W_hat 80 | # TODO: double-check this with Alan. 81 | g1W_hat_truncated = .bound(g1W_hat, tmle$gbounds) 82 | if (verbose) cat("Truncating g1W on test using bounds:", tmle$gbounds, "\n") 83 | 84 | # Create clever covariate. 85 | # H1W = (A == 1) / g1W_hat_truncated 86 | # Based on Jeremy Coyle's CV-TMLE implementation. 87 | H1W = 1 / g1W_hat_truncated 88 | HAW = A * H1W 89 | 90 | # TODO: handle delta in some way? 91 | 92 | if (verbose) { 93 | cat("Mean Y_a on validation:", mean(Y[A == 1]), "\n") 94 | cat("Mean Y_star_a on validation:", mean(Y_star[A == 1]), "\n") 95 | cat("Mean Q_bar_a on validation:", mean(Q_hat[A == 1]), "\n") 96 | } 97 | 98 | #################### 99 | # Return results 100 | 101 | # We return Y_star rather than Y, for use in pooled fluctuation step. 102 | data = data.frame(Y_star = Y_star, A = A, Q_hat = Q_hat, 103 | g1W_hat = g1W_hat_truncated, 104 | H1W = H1W, 105 | HAW = HAW) 106 | 107 | results = data 108 | 109 | return(results) 110 | } 111 | -------------------------------------------------------------------------------- /R/cleanup-latex.R: -------------------------------------------------------------------------------- 1 | #' Clean up LaTeX files created by exportLatex 2 | #' 3 | #' This function removes LaTeX files that are typically created by the exportLatex() function. 
4 | #' It's designed to be used after exportLatex() calls to clean up temporary files. 5 | #' 6 | #' @param dir Directory where LaTeX files are located (default: current directory) 7 | #' @param outname Prefix for the LaTeX files (default: empty string) 8 | #' @param verbose If TRUE, print messages about which files were removed 9 | #' 10 | #' @return Invisibly returns a logical vector indicating which files were successfully removed 11 | #' 12 | #' @examples 13 | #' \dontrun{ 14 | #' # After calling exportLatex() 15 | #' exportLatex(vim) 16 | #' cleanup_latex_files() 17 | #' 18 | #' # With custom directory and prefix 19 | #' exportLatex(vim, outname = "myresults_", dir = "output/") 20 | #' cleanup_latex_files(dir = "output/", outname = "myresults_") 21 | #' } 22 | #' 23 | #' @export 24 | cleanup_latex_files <- function(dir = ".", outname = "", verbose = FALSE) { 25 | 26 | # Define the standard LaTeX file names that exportLatex() creates 27 | latex_files <- c( 28 | paste0(dir, "/", outname, "varimpByFold.tex"), 29 | paste0(dir, "/", outname, "varimpAll.tex"), 30 | paste0(dir, "/", outname, "varimpConsistent.tex") 31 | ) 32 | 33 | # Check which files exist 34 | existing_files <- latex_files[file.exists(latex_files)] 35 | 36 | if (length(existing_files) == 0) { 37 | if (verbose) { 38 | cat("No LaTeX files found to clean up.\n") 39 | } 40 | return(invisible(logical(0))) 41 | } 42 | 43 | if (verbose) { 44 | cat("Cleaning up LaTeX files:\n") 45 | cat(paste(" -", basename(existing_files), collapse = "\n"), "\n") 46 | } 47 | 48 | # Remove the files 49 | removal_success <- suppressWarnings({ 50 | file.remove(existing_files) 51 | }) 52 | 53 | if (verbose) { 54 | successful_removals <- sum(removal_success) 55 | cat("Successfully removed", successful_removals, "of", length(existing_files), "files.\n") 56 | } 57 | 58 | return(invisible(removal_success)) 59 | } -------------------------------------------------------------------------------- /R/compile-results.R: -------------------------------------------------------------------------------- 1 | compile_results = 2 | function(colnames_numeric, 3 | colnames_factor, 4 | vim_numeric, 5 | vim_factor, 6 | V, 7 | verbose = FALSE) { 8 | 9 | num_numeric = length(colnames_numeric) 10 | num_factor = length(colnames_factor) 11 | 12 | variable_types = c(rep("Ordered", num_numeric), rep("Factor", num_factor)) 13 | variable_names = c(colnames_numeric, colnames_factor) 14 | 15 | all_vims = c(vim_numeric, vim_factor) 16 | names(all_vims) = variable_names 17 | 18 | element_length = sapply(all_vims, length) 19 | 20 | # May increase this to 8, hence >= operator in following lines. 21 | expected_length = 8 22 | 23 | num_short_vars = sum(element_length < expected_length) 24 | if (verbose && num_short_vars > 0) { 25 | # Identify which variables do not have expected length. 26 | cat(num_short_vars, "variables did not contain >=", expected_length, 27 | "result elements. Removing those variables from the results.\n") 28 | # Likely missing EY1V, EY0V, labV, and fold_results. 29 | print(element_length[element_length < expected_length]) 30 | # browser() 31 | } 32 | 33 | # Restrict to vim results that have at least 7 elements. 34 | vim_combined = all_vims[element_length >= expected_length] 35 | variable_names = variable_names[element_length >= expected_length] 36 | variable_types = variable_types[element_length >= expected_length] 37 | 38 | # Set defaults for variables we want to return. 
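# These defaults map onto the list returned at the end of this function:
#   outres -> results_raw, outres.all -> results_all,
#   outres.cons -> results_consistent, outres.byV -> results_by_fold.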
39 | outres = outres.all = outres.cons = outres.byV = NULL 40 | 41 | # Get rid of any variables that have a validation sample with no 42 | # estimates of variable importance. 43 | if (length(vim_combined) == 0) { 44 | error_msg = "No variable importance estimates could be calculated due to sample size, etc." 45 | if (verbose) { 46 | cat(error_msg, "\n") 47 | #cat("Lengths:", element_length, "\n") 48 | } 49 | warning(error_msg) 50 | # TODO: also write output to the file in a separate function call. 51 | #write("No VIM's could be calculated due to sample size, etc", 52 | # file = "AllReslts.tex") 53 | } else { 54 | length_ey1 = sapply(vim_combined, function(x) { 55 | # In some cases class(x$EY1) is NULL, so this avoids a warning. 56 | if (is.null(x$EY1V)) { 57 | return(0) 58 | } 59 | ey1_vector = na.omit(x$EY1V) 60 | length(ey1_vector) 61 | }) 62 | 63 | valid_results = vim_combined[length_ey1 == V] 64 | variable_names = variable_names[length_ey1 == V] 65 | variable_types = variable_types[length_ey1 == V] 66 | 67 | 68 | if (length(valid_results) == 0) { 69 | error_msg = "No VIMs could be calculated due to sample size, etc." 70 | if (verbose) { 71 | cat(error_msg, "\n") 72 | cat("EY1 lengths:", length_ey1, "\n") 73 | } 74 | warning(error_msg) 75 | } else { 76 | 77 | # Order of results: 78 | # 1. EY1V, #2. EY0V, #3. thetaV, #4. varICV, #5, labV 79 | # #6. nV, #7. type, #8. name. 80 | 81 | theta = do.call(rbind, lapply(valid_results, function(x) x$thetaV)) 82 | 83 | theta_sum_na = apply(theta, 1, sum_na) 84 | results_no_na = valid_results[theta_sum_na == 0] 85 | variable_names = variable_names[theta_sum_na == 0] 86 | variable_types = variable_types[theta_sum_na == 0] 87 | 88 | # Extract the various components of the results. 89 | EY1 = do.call(rbind, lapply(results_no_na, function(x) x$EY1)) 90 | EY0 = do.call(rbind, lapply(results_no_na, function(x) x$EY0)) 91 | 92 | # Risk difference 93 | theta = do.call(rbind, lapply(results_no_na, function(x) x$thetaV)) 94 | varIC = do.call(rbind, lapply(results_no_na, function(x) x$varICV)) 95 | 96 | # Relative risk 97 | theta_rr = do.call(rbind, lapply(results_no_na, function(x) x$thetaV_rr)) 98 | varIC_log_rr = do.call(rbind, lapply(results_no_na, function(x) x$varICV_log_rr)) 99 | 100 | nV = do.call(rbind, lapply(results_no_na, function(x) x$nV)) 101 | n = sum(nV[1, ]) 102 | 103 | # SE: Risk difference 104 | SEV = sqrt(varIC / nV) 105 | 106 | # SE: relative risk 107 | SEV_log_rr = sqrt(varIC_log_rr / nV) 108 | 109 | # Get labels for each of the training samples. 110 | extract_labels = function(x, total_folds) { 111 | # Repeat each fold id twice, one for low level and once for high. 112 | labels = rep(1:total_folds, 2) 113 | # Re-order in ascending order. 114 | # TODO: isn't there another rep() function that would sort automatically? 115 | oo = order(labels) 116 | labels = labels[oo] 117 | # Convert labels from vector elements to columns in a single row. 118 | out = as.vector(t(x)) 119 | names(out) = paste0(rep(c("Low", "High"), total_folds), "_v", labels) 120 | out 121 | } 122 | 123 | # labV is result element 5. 124 | tst = lapply(results_no_na, function(x) x$labV) 125 | tst = lapply(tst, extract_labels, total_folds = V) 126 | labels = do.call(rbind, tst) 127 | 128 | # Parameteter: risk difference 129 | psi = apply(theta, 1, mean) 130 | # Each row is a variable and each column in a fold estimate. 
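# Averaging the per-fold IC variances gives the pooled variance used for inference
# below; roughly, with illustrative names:
#   se <- sqrt(mean(var_ic_across_folds) / n)   # pooled IC variance over n observations
#   ci <- psi + c(-1.96, 1.96) * se             # Wald 95% interval
#   p  <- 1 - pnorm(psi / se)                   # one-sided p-value, as noted below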
131 | meanvarIC = apply(varIC, 1, mean) 132 | SE = sqrt(meanvarIC / n) 133 | 134 | ci_lower = psi - 1.96 * SE 135 | ci_upper = psi + 1.96 * SE 136 | # 1-sided p-value 137 | pvalue = 1 - pnorm(psi / SE) 138 | 139 | # Parameter: relative risk 140 | psi_rr = apply(theta_rr, 1, mean) 141 | # Each row is a variable and each column in a fold estimate. 142 | meanvarIC_log_rr = apply(varIC_log_rr, 1, mean) 143 | se_log_rr = sqrt(meanvarIC_log_rr / n) 144 | # Calculate CI on log-RR scale. 145 | ci_lower_rr = exp(log(psi_rr) - 1.96 * se_log_rr) 146 | ci_upper_rr = exp(log(psi_rr) + 1.96 * se_log_rr) 147 | 148 | # TODO: double-check this. 149 | #pvalue_rr = 1 - pnorm(abs(log(psi_rr)) / se_log_rr) 150 | pvalue_rr = 1 - pnorm(log(psi_rr) / se_log_rr) 151 | 152 | 153 | # Number of significant digits. 154 | signif_digits = 3 155 | 156 | # TODO: provide ci_lower and ci_upper as separate elements. 157 | CI95 = paste0("(", signif(ci_lower, signif_digits), " - ", 158 | signif(ci_upper, signif_digits), ")") 159 | CI95_rr = paste0("(", signif(ci_lower_rr, signif_digits), " - ", 160 | signif(ci_upper_rr, signif_digits), ")") 161 | 162 | 163 | ##### FOR THETA (generalize to chi-square test?) 164 | # TT = (theta[,1] - theta[,2]) / sqrt(SEV[,1]^2 + SEV[,2]^2) 165 | # pval.comp=2*(1-pnorm(abs(TT))) FOR levels 166 | # (just make sure in same order) 167 | 168 | num_continuous = sum(variable_types == "Ordered") 169 | num_vars = length(variable_types) 170 | 171 | length.uniq = function(x) { 172 | length(unique(x)) 173 | } 174 | 175 | ################## 176 | # Ordered variables first 177 | cons = NULL 178 | if (num_continuous > 0) { 179 | dir = NULL 180 | # Check for consistency. 181 | for (i in 1:V) { 182 | lower_temp = labels[1:num_continuous, i * 2 - 1] 183 | xx = regexpr(",", lower_temp) 184 | lwr = as.numeric(substr(lower_temp, 2, xx - 1)) 185 | 186 | upper_temp = labels[1:num_continuous, i * 2] 187 | xx = regexpr(",", upper_temp) 188 | nx = nchar(upper_temp) 189 | uwr = as.numeric(substr(upper_temp, xx + 1, nx - 1)) 190 | 191 | dir = cbind(dir, uwr > lwr) 192 | } 193 | 194 | # For numeric variables, consistency means each fold finds the same directionality. 195 | cons = apply(dir, 1, length.uniq) 196 | } 197 | 198 | ################## 199 | # Factors 200 | num_factors = num_vars - num_continuous 201 | if (num_factors > 0) { 202 | lwr = NULL 203 | uwr = NULL 204 | for (i in 1:V) { 205 | # The 2 here is because we have the a_l and a_h labels, not because V = 2. 206 | lwr = cbind(lwr, labels[(num_continuous + 1):num_vars, i * 2 - 1]) 207 | uwr = cbind(uwr, labels[(num_continuous + 1):num_vars, i * 2]) 208 | } 209 | # Count have many unique levels are used for the lower bin - want it to be 1. 210 | conslwr = apply(lwr, 1, length.uniq) 211 | # Count how many unique levels are used for the upper bin - want it to be 1. 212 | consupr = apply(uwr, 1, length.uniq) 213 | # If conslwr * consupr == 1 then the variable is consistent. 214 | cons = c(cons, conslwr * consupr) 215 | } 216 | # consist= (cons==1 & pval.comp > 0.05) 217 | signsum = function(x) { 218 | sum(sign(x)) 219 | } 220 | 221 | # Consistent results need to have all positive all negative thetas, 222 | # And use the same factor levels for the low and high bins in each CV fold. 223 | # CK: but really, shouldn't they all be positive? May want to remove abs() 224 | consist = cons == 1 & abs(apply(theta, 1, signsum)) == V 225 | 226 | procedures = c("Holm", "BH") 227 | if (num_vars > 1) { 228 | # Adjust p-values for multiple testing. 
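# multtest::mt.rawp2adjp() returns a list with $adjp (raw plus adjusted p-values,
# rows sorted by increasing raw p-value) and $index (the permutation back to the
# original variable order); the res$adj references below reach $adjp via R's
# partial matching of the $ operator.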
229 | res = multtest::mt.rawp2adjp(pvalue, procedures) 230 | res_rr = multtest::mt.rawp2adjp(pvalue_rr, procedures) 231 | 232 | # Attempt to prepend these names with "rr_" for the relative risk version. 233 | colnames(res_rr$adj) = paste0("rr_", colnames(res_rr$adj)) 234 | 235 | # This indexing sorts the results in ascending order of unadjusted p-value, 236 | # then descending by impact estimate. 237 | # TODO: this may need to be fixed. 238 | #sorted_rows = base::order(res$index, -psi) 239 | sorted_rows = res$index 240 | 241 | #browser() 242 | outres = data.frame(var_type = variable_types[sorted_rows], 243 | theta[sorted_rows, , drop = FALSE], 244 | psi[sorted_rows], 245 | CI95[sorted_rows], 246 | res$adj, 247 | labels[sorted_rows, , drop = FALSE], 248 | # Relative risk parameter. 249 | "AvePsi_rr" = psi_rr[sorted_rows], 250 | "CI95_rr" = CI95_rr[sorted_rows], 251 | res_rr$adj, 252 | # Consistency 253 | consist[sorted_rows]) 254 | } else if (num_vars == 1) { 255 | # No need for multiple testing adjustment. 256 | # TODO: just integrate into previous part? 257 | outres = data.frame(var_type = variable_types, 258 | theta, 259 | psi, 260 | CI95, 261 | rawp = pvalue, 262 | Holm = pvalue, 263 | BH = pvalue, 264 | labels, 265 | # TODO: implement these. 266 | "AvePsi_rr" = NA, 267 | "CI95_rr" = NA, 268 | "rr_rawp" = NA, 269 | "rr_Holm" = NA, 270 | "rr_BH" = NA, 271 | consist) 272 | } else { 273 | outres = NULL 274 | } 275 | 276 | # TODO: this will give an error if we have no results. 277 | 278 | # Restrict to variables that aren't missing their p-value. 279 | outres = outres[!is.na(outres[, "rawp"]), , drop = FALSE] 280 | #print(colnames(outres)) 281 | #print(ncol(outres)) 282 | 283 | #names(outres)[1:(1 + 2*V)] = c("VarType", paste0("Est_v", 1:V), "AvePsi", "CI95") 284 | names(outres)[1:(3 + V)] = c("VarType", paste0("Est_v", 1:V), "AvePsi", "CI95") 285 | 286 | #names(outres)[(9 + 2 * V)] = "Consistent" 287 | names(outres)[ncol(outres)] = "Consistent" 288 | 289 | #print(colnames(outres)) 290 | #print(ncol(outres)) 291 | 292 | # Save rownames, because dplyr will discard them. 293 | outres$varname = rownames(outres) 294 | 295 | # Sort by raw p-value, then BH p-value, then effect size. 296 | outres = outres %>% dplyr::arrange(rawp, BH, desc(AvePsi)) %>% as.data.frame() 297 | 298 | # Restore rownames. 299 | rownames(outres) = outres$varname 300 | outres$varname = NULL 301 | 302 | # drops = c('VarType','description','Holm,') 303 | # outres.all=outres[,!(names(outres) %in% drops)] 304 | # We want to extract the per-fold psi estimates, per-fold levels, and consistency flag. 305 | # TODO: update for relative risk parameter. 306 | outres.byV = outres[, c(2:(2 + V - 1), (7 + V):(7 + 3 * V)), drop = FALSE] 307 | 308 | outres.all = outres[, c("VarType", 309 | # Risk difference: 310 | "AvePsi", "CI95", "rawp", "BH", 311 | # Relative risk: 312 | "AvePsi_rr", "CI95_rr", "rr_rawp", "rr_BH", 313 | "Consistent"), drop = FALSE] 314 | 315 | colnames(outres.all) = c("Type", 316 | # Risk difference: 317 | "Estimate", "CI95", "P-value", "Adj. p-value", 318 | # Relative risk: 319 | "Est. RR", "CI95 RR", "P-value RR", "Adj. p-value RR", 320 | "Consistent") 321 | 322 | ################ 323 | # Get Consistency Measure and only significant 324 | # TODO: Make BH cut-off flexible in future versions (default at 0.05) 325 | outres.cons = outres.all[outres[, "BH"] < 0.05 & 326 | outres$Consistent, , drop = FALSE] 327 | outres.cons = subset(outres.cons, select = -c(Consistent)) 328 | } 329 | } 330 | 331 | # Return results. 
332 | results = list(results_consistent = outres.cons, 333 | results_all = outres.all, 334 | results_by_fold = outres.byV, 335 | results_raw = outres, 336 | all_vims = all_vims) 337 | 338 | 339 | return(results) 340 | } 341 | -------------------------------------------------------------------------------- /R/create_cv_folds.R: -------------------------------------------------------------------------------- 1 | #' Stratified CV to insure balance (by one grouping variable, Y) 2 | #' 3 | #' @param V number of folds 4 | #' @param Y Outcome variable. If binary will be used for stratification. 5 | #' @param verbose If T will display extra output. 6 | #' 7 | #' @return Vector of fold assignments. 8 | #' 9 | #' @importFrom cvTools cvFolds 10 | create_cv_folds = function(V, Y, verbose = F) { 11 | Ys = unique(Y) 12 | nys = length(Ys) 13 | nn = length(Y) 14 | # Binary outcome so we can do stratified fold generation. 15 | if (nys == 2) { 16 | out = rep(NA, nn) 17 | for (i in 1:nys) { 18 | # Record how many observations have this Y value. 19 | n = sum(Y == Ys[i]) 20 | folds = cvTools::cvFolds(n, K = V, R = 1, type = "random")$which 21 | #if (verbose) { 22 | # cat("Y", i, "is", Ys[i], "count:", sum(Y == Ys[i]), "n=", n, "fold length:", 23 | # length(folds), "\n") 24 | #} 25 | out[Y == Ys[i]] = folds 26 | } 27 | if (verbose) { 28 | cat("Cross-validation fold breakdown:\n") 29 | print(table(Y, "Fold"=out, useNA="ifany")) 30 | } 31 | } else { 32 | # More than 2 Ys, so don't stratify. 33 | xx = cvTools::cvFolds(nn, K = V, R = 1, type = "random")$which 34 | out = xx 35 | } 36 | return(out) 37 | } 38 | -------------------------------------------------------------------------------- /R/estimate_pooled_results.R: -------------------------------------------------------------------------------- 1 | estimate_pooled_results = function(fold_results, 2 | fluctuation = "logistic", 3 | verbose = FALSE) { 4 | # Fold results is a list with test results from each fold. 5 | 6 | # Each fold result should have at least this element: 7 | # val_preds dataframe, with Y_star, g, Q, H. 8 | # TODO: need to change this check, it doesn't work correctly. 9 | #num_fails = sum(is.null(sapply(fold_results, `[[`, "level"))) 10 | #if (verbose) { 11 | # cat("Number of fold failures:", num_fails, "of", length(fold_results), "\n") 12 | #} 13 | 14 | # Placeholder results to return in case of error. 15 | results = list( 16 | thetas = NULL, 17 | influence_curves = NULL, 18 | epsilon = NULL 19 | ) 20 | 21 | #if (num_fails == length(fold_results)) { 22 | # Every fold failed. 23 | # if (verbose) cat("Error: every fold failed.\n") 24 | # return(results) 25 | #} 26 | 27 | # browser() 28 | 29 | # Extract the results from each CV-TMLE fold and rbind into a single dataframe. 30 | data = do.call(rbind, lapply(1:length(fold_results), function(i) { 31 | fold = fold_results[[i]] 32 | # Save the fold number so we can use it to generate fold-specific estimates. 33 | if (is.null(fold$val_preds)) { 34 | # Skip folds that failed. 35 | NULL 36 | } else { 37 | # val_preds is a dataframe with columns: Y_star, g, Q, H 38 | df = cbind(fold$val_preds, fold_num = i) 39 | df 40 | } 41 | })) 42 | 43 | if (is.null(data)) { 44 | # Every fold failed. 
45 | if (verbose) cat("Error: every fold failed.\n") 46 | return(results) 47 | } 48 | 49 | if (min(data$Q_hat) < 0 || max(data$Q_hat) > 1) { 50 | cat("Error: some predicted values of Q_hat are out of bounds.", 51 | "They should be in [0, 1].\n") 52 | print(summary(data$Q_hat)) 53 | browser() 54 | } 55 | 56 | # Set some default values in case of a future error. 57 | thetas = NULL 58 | influence_curves = NULL 59 | epsilon = NULL 60 | 61 | 62 | if (!is.null(data)) { 63 | n = nrow(data) 64 | 65 | # If Y is binary, take logit of Q. 66 | #if (length(unique(data$Y)) == 2) { 67 | 68 | # Look at thetas prior to fluctuation. 69 | pre_thetas = tapply(data$Q_hat, data$fold_num, mean, na.rm = TRUE) 70 | if (verbose) cat("Pre-fluctuation thetas:", pre_thetas, "\n") 71 | 72 | # If Q is binary or continuous we still want to take logit of predicted values. 73 | # See tmle::estimateQ where it does this after predicting Q. 74 | data$logit_Q_hat = try(stats::qlogis(data$Q_hat)) 75 | if (inherits(data$logit_Q_hat, "try-error")) { 76 | cat("Error in estimate_pooled_results() with qlogis()\n") 77 | print(summary(data$Q_hat)) 78 | browser() 79 | } 80 | #} 81 | 82 | # Estimate epsilon 83 | if (verbose) cat("Estimating epsilon: ") 84 | 85 | if (fluctuation == "logistic") { 86 | suppressWarnings({ 87 | #epsilon = coef(glm(Y_star ~ -1 + offset(logit_Q_hat) + H1W, 88 | #epsilon = coef(glm(Y_star ~ -1 + offset(logit_Q_hat) + HAW, 89 | # data = data, family = "binomial")) 90 | reg = try(stats::glm(Y_star ~ -1 + stats::offset(logit_Q_hat) + HAW, 91 | data = data, family = "binomial")) 92 | if ("try-error" %in% class(reg)) { 93 | cat("Error in epsilon regression.\n") 94 | browser() 95 | } 96 | epsilon = try(stats::coef(reg)) 97 | }) 98 | # Use more stable version where clever covariate is the weight, and now we 99 | # have an intercept. Causal 2, Lecture 3, slide 51. 100 | # We have to suppressWarnings about "non-integrate #successes in binomial glm". 101 | #suppressWarnings({ 102 | # Catch an error if one occurs here. 103 | #epsilon = try(coef(glm(Y_star ~ offset(logit_Q_hat), 104 | # epsilon = try(coef(glm(Y_star ~ ., 105 | # offset = logit_Q_hat, 106 | # weights = H1W, 107 | # data = data, family = "binomial"))) 108 | #}) 109 | if (verbose) cat(epsilon, "\n") 110 | } else { 111 | # No need to support linear fluctuation as it does not respect model bounds. 112 | stop("Only support logistic fluctuation currently.") 113 | # TBD. 114 | } 115 | 116 | if ("try-error" %in% class(epsilon)) { 117 | if (verbose) cat("Error when estimating epsilon.\n") 118 | print(summary(data$Y_star)) 119 | browser() 120 | } else { 121 | 122 | if (verbose) cat("Fluctuating Q_star\n") 123 | 124 | # Fluctuate Q to get Q_star 125 | Q_star = data$logit_Q_hat + epsilon * data$H1W 126 | #Q_star = data$logit_Q_hat + epsilon * data$HAW 127 | 128 | if (verbose) cat("Transforming Q_star\n") 129 | #if (length(unique(data$Y)) == 2) { 130 | Q_star = plogis(Q_star) 131 | #} 132 | 133 | if (verbose) cat("Estimating per-fold thetas: ") 134 | 135 | # Estimate treatment-specific mean parameter on every validation fold. 136 | thetas = tapply(Q_star, data$fold_num, mean, na.rm = TRUE) 137 | if (verbose) cat(thetas, "\n") 138 | 139 | # Take average across folds to get final estimate. 140 | #theta = mean(thetas) 141 | 142 | # Move Q_star into the data so that it can be analyzed per-fold. 
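# Recap of the pooled CV-TMLE targeting above: one epsilon is fit on the stacked
# validation predictions (Y_star regressed on the clever covariate HAW with offset
# logit(Q_hat)), Q_star = plogis(logit(Q_hat) + epsilon * H1W) is the targeted
# prediction, and the per-fold thetas are the fold-specific means of Q_star.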
143 | data$Q_star = Q_star 144 | rm(Q_star) 145 | 146 | if (verbose) cat("Calculating per-fold influence curves\n") 147 | 148 | # Get influence curve per fold - for treatment-specific mean. 149 | # Influence_curves here is a list, where each element is a result. 150 | # We can't convert to a matrix because lengths are different. 151 | # TODO: figure out why this can generate NaNs 152 | influence_curves = base::by(data, data$fold_num, function(fold_data) { 153 | if (F && verbose) { 154 | with(fold_data, 155 | cat("A:", length(A), "g1W_hat:", length(g1W_hat), "Y_star:", length(Y_star), 156 | "Q_star:", length(Q_star), "\n")) 157 | } 158 | #with(fold_data, (A / g1W_hat) * (Y - Q_star) + Q_star - theta) 159 | result = with(fold_data, (A / g1W_hat) * (Y_star - Q_star) + 160 | Q_star - mean(Q_star, na.rm = TRUE)) 161 | #if (verbose) cat("Result:", class(result), "Length:", length(result), "\n") 162 | result 163 | }) 164 | 165 | # Check for NaNs. 166 | num_nans = sum(sapply(influence_curves, function(curve) sum(is.nan(curve)))) 167 | if (num_nans > 0) { 168 | if (verbose) { 169 | cat("Error: influence curves contain", num_nans, "NaNs.\n") 170 | cat("g1W_hat zeros:", sum(data$g1W_hat == 0), "\n") 171 | } 172 | } 173 | 174 | #if (verbose) cat("IC class:", class(influence_curves), "\n") 175 | 176 | # Old version: 177 | #influence_curve = with(data, (A / g1W_hat) * (Y - Q_star) + Q_star - theta) 178 | 179 | # Calculate standard error. 180 | #std_err = stats::var(influence_curves) / n 181 | } 182 | } 183 | 184 | if (is.null(thetas)) { 185 | # All folds must have failed. 186 | if (verbose) cat("No pooled results. All folds seemed to have failed.\n") 187 | } 188 | 189 | # Compile results 190 | results = list( 191 | #theta = theta, 192 | thetas = thetas, 193 | influence_curves = influence_curves, 194 | #std_err = std_err, 195 | epsilon = epsilon 196 | ) 197 | 198 | return(results) 199 | } 200 | -------------------------------------------------------------------------------- /R/estimate_tmle.R: -------------------------------------------------------------------------------- 1 | #' Get TMLE estimate: E[Y | A = 1, W]. 2 | #' 3 | #' @param Y Outcome variable 4 | #' @param A Treatment indicator 5 | #' @param W Dataframe of adjustment covariates 6 | #' @param family Binomial or gaussian 7 | #' @param delta Indicator of missing outcome or treatment assignment. 1 - observed, 0 - missing. 8 | #' @param Q.lib SuperLearner library for estimating Q (potential outcome) 9 | #' @param g.lib SuperLearner library for estimating g (propensity score) 10 | #' @param verbose If true output extra information during execution. 11 | #' @importFrom tmle tmle 12 | estimate_tmle = function(Y, 13 | A, 14 | W, 15 | family, 16 | delta = NULL, 17 | Q.lib, 18 | g.lib, 19 | verbose = F) { 20 | 21 | if (!family %in% c("binomial", "gaussian")) { 22 | stop('Estimate_tmle: family must be either "binomial" or "gaussian".') 23 | } 24 | 25 | # Because of quirk of program, delete observations with delta=0 if #>0 26 | # & < 10 27 | n = length(Y) 28 | inc = rep(TRUE, n) 29 | 30 | if (!is.null(delta)) { 31 | ss = sum(delta == 0) 32 | if (ss > 0 & ss < 10) { 33 | inc[delta == 0] = FALSE 34 | } 35 | } 36 | 37 | Y = Y[inc] 38 | A = A[inc] 39 | W = W[inc, , drop = F] 40 | 41 | delta = delta[inc] 42 | 43 | # Here we are using tmle but not using the treatment effect estimate. 44 | # We're actually using the underlying variables to estimate Y_a. 
45 | tmle.1 = tmle::tmle(Y, A, W, Delta = delta, g.SL.library = g.lib, 46 | Q.SL.library = Q.lib, family = family, verbose = verbose) 47 | 48 | # Propensity score for treatment. 49 | g1 = tmle.1$g$g1W 50 | 51 | # Unit's estimated outcome under treatment: hat(Y) | A = 1, W 52 | # This is after the fluctuation step, so it is targeted. 53 | Qst = tmle.1$Qstar[, 2] 54 | 55 | # E[Y | A = 1, W] 56 | theta = mean(Qst) 57 | 58 | # Influence curve 59 | IC = (A / g1) * (Y - Qst) + Qst - theta 60 | 61 | # Compile results. 62 | result = list(theta = theta, IC = IC) 63 | 64 | return(result) 65 | } 66 | -------------------------------------------------------------------------------- /R/estimate_tmle2.R: -------------------------------------------------------------------------------- 1 | #' Get TMLE estimate: E[Y | A = 1, W]. 2 | #' 3 | #' @param Y Outcome variable 4 | #' @param A Treatment indicator 5 | #' @param W Dataframe of adjustment covariates 6 | #' @param family Outcome family - binomial or gaussian 7 | #' @param delta Indicator of missing outcome or treatment assignment. 1 - observed, 0 - missing. 8 | #' @param Q.lib SuperLearner library for estimating Q (potential outcome) 9 | #' @param g.lib SuperLearner library for estimating g (propensity score) 10 | #' @param id Optional subject-level identifier. 11 | #' @param Qbounds Bounds on Q 12 | #' @param gbound Bounds on G 13 | #' @param alpha TBD, from TMLE package 14 | #' @param fluctuation Only logistic is currently supported. 15 | #' @param V Number of folds for SuperLearner 16 | #' @param verbose If true output extra information during execution. 17 | #' @importFrom tmle tmle 18 | #' @importFrom stats as.formula binomial coef glm plogis poisson predict qlogis 19 | #' @importFrom utils packageDescription 20 | #' @export 21 | estimate_tmle2 = 22 | function(Y, 23 | A, 24 | W, 25 | family, 26 | delta = rep(1, length(Y)), 27 | Q.lib, 28 | g.lib, 29 | id = 1:length(Y), 30 | Qbounds = NULL, 31 | gbound = 0.025, 32 | alpha = 0.995, 33 | fluctuation = "logistic", 34 | V = 10, 35 | verbose = F) { 36 | 37 | if (!family %in% c("binomial", "gaussian")) { 38 | stop('Estimate_tmle: family must be either "binomial" or "gaussian".') 39 | } 40 | 41 | # Because of quirk of program, delete observations with delta=0 if #>0 42 | # & < 10 43 | n = length(Y) 44 | inc = rep(T, n) 45 | 46 | # TODO: revisit this decision. 47 | if (!is.null(delta)) { 48 | num_missing = sum(delta == 0) 49 | if (num_missing > 0 && num_missing < 10) { 50 | inc[delta == 0] = F 51 | } 52 | } 53 | 54 | if (length(dim(W)) != 2) { 55 | stop("Error: W should have two dimensions. Instead its dimensions are:", paste(dim(W)), "\n") 56 | } 57 | 58 | Y = Y[inc] 59 | A = A[inc] 60 | W = W[inc, , drop = F] 61 | 62 | delta = delta[inc] 63 | 64 | # Check for any remaining missing data. 65 | # It is technically ok for Y or A to include missingness, because the 66 | # delta estimation is intended to estimate the missingness mechanism. 67 | # This needs more testing though. 
68 | 69 | missing_vals = sum(is.na(W)) 70 | if (missing_vals != 0) { 71 | cat("Warning: found", missing_vals, "NAs in W.\n") 72 | 73 | na_sum = sapply(W, function(col) sum(is.na(col))) 74 | cat("Columns with NAs:\n") 75 | print(na_sum[na_sum > 0]) 76 | } 77 | 78 | missing_vals = sum(is.na(A)) 79 | if (missing_vals != 0) { 80 | cat("Warning: found", missing_vals, "NAs in A.\n") 81 | } 82 | 83 | missing_vals = sum(is.na(Y[delta])) 84 | if (missing_vals != 0) { 85 | cat("Warning: found", missing_vals, "NAs in Y.\n") 86 | } 87 | 88 | # Here we are using tmle but not using the treatment effect estimate. 89 | # We're actually using the underlying variables to estimate Y_a. 90 | # TODO: disable this call, we're just running it now to double-check 91 | # the custom results. 92 | if (F) { 93 | tmle.1 = tmle::tmle(Y, A, W, Delta = delta, g.SL.library = g.lib, 94 | Q.SL.library = Q.lib, family = family, verbose = verbose) 95 | } else { 96 | tmle.1 = NULL 97 | } 98 | 99 | if (verbose) { 100 | cat("Estimating g. A distribution:\n") 101 | print(table(A)) 102 | } 103 | 104 | min_g_cell = min(table(A)) 105 | g_V = V 106 | if (min_g_cell < V) { 107 | g_V = max(min_g_cell, 2) 108 | if (verbose) { 109 | cat("A's minimum cell sizes", min_g_cell, "is less than the number of CV", 110 | "folds", V, "\n. Reducing folds to", g_V, "\n") 111 | } 112 | } 113 | 114 | # Using modified version of tmle::estimateG 115 | #g_model = SuperLearner::SuperLearner(Y = A, X = W, SL.library = g.lib, 116 | #g = tmle_estimate_g(d = cbind(A[delta == 1], W[delta == 1, ]), 117 | g = tmle_estimate_g(d = cbind(A, W), 118 | SL.library = g.lib, 119 | verbose = verbose, 120 | V = g_V, 121 | outcome = "A", 122 | message = "g") 123 | 124 | # Handle gBounds - code from tmle::tmle(). 125 | if (length(gbound) == 1) { 126 | if (length(unique(A)) == 1) { 127 | # EY1 only, no controlled direct effect 128 | gbound = c(gbound, 1) 129 | } else { 130 | gbound = c(gbound, 1 - gbound) 131 | } 132 | } 133 | g$bound = gbound 134 | 135 | # Propensity score for treatment. 136 | #g1 = tmle.1$g$g1W 137 | g1 = g$g1W 138 | 139 | # Check if these two are highly correlated. 140 | if (!is.null(tmle.1) && verbose) { 141 | # This will generate a warning of SD is zero for either vector. 142 | # If this is the case we'll see an NA here. 143 | suppressWarnings(cat("Correlation of custom g to tmle-based g:", 144 | stats::cor(tmle.1$g$g1W, g1), "\n")) 145 | } 146 | 147 | # This is copied from within tmle::tmle() 148 | map_to_ystar = fluctuation == "logistic" 149 | 150 | if (verbose) cat("TMLE init stage1\n") 151 | 152 | # Run tmle stage 1 - this is basically just bounding & transforming Y. 153 | stage1 = tmle_init_stage1(Y = Y, Q = NULL, 154 | A = A, 155 | Delta = delta, 156 | alpha = alpha, 157 | Qbounds = Qbounds, 158 | maptoYstar = map_to_ystar, 159 | family = family) 160 | 161 | if (verbose) cat("TMLE q\n") 162 | 163 | # Estimate Qinit 164 | # cvQinit = F by default, meaning that we don't need CV.SuperLearner. 165 | q = tmle_estimate_q(Y = stage1$Ystar, 166 | A = A, 167 | W = W, 168 | Q = stage1$Q, 169 | Delta = delta, 170 | SL.library = Q.lib, 171 | family = family, 172 | V = V, 173 | verbose = verbose, 174 | maptoYstar = map_to_ystar, 175 | Qbounds = stage1$Qbounds) 176 | 177 | # Convert from a matrix to a df, so we can use $ to access elements. 178 | # Nevermind, this breaks the plogis code. 179 | # q$Q = data.frame(q$Q) 180 | 181 | # Check for all NaN in QAW. 
182 | stopifnot(mean(is.nan(q$Q[, "QAW"])) == 0) 183 | 184 | # cat(class(q$Q), paste(dim(q$Q)), paste(colnames(q$Q)), "\n") 185 | # TODO: check if our custom q$QAW equals the tmle Q. 186 | 187 | # Specify random arguments from tmle::tmle() 188 | pDelta1 = NULL 189 | g.Deltaform = NULL 190 | 191 | # From tmle::tmle() 192 | ############################################ 193 | if (verbose) cat("Estimating g.Delta (missingness mechanism)\n") 194 | 195 | g.z <- NULL 196 | g.z$type="No intermediate variable" 197 | g.z$coef=NA 198 | g.Delta <- suppressWarnings({ 199 | tmle_estimate_g(d = data.frame(delta, Z=1, A, W), 200 | pDelta1, 201 | g.Deltaform, 202 | g.lib, 203 | id = id, V = V, 204 | verbose = verbose, 205 | message = "missingness mechanism", 206 | outcome="D") 207 | }) 208 | g1W.total <- .bound(g$g1W*g.Delta$g1W[,"Z0A1"], gbound) 209 | if (sum(is.na(g1W.total)) > 0) { 210 | if (verbose) { 211 | cat("Error, g1W.total has NAs:", sum(is.na(g1W.total)), "\n") 212 | } 213 | } 214 | g0W.total <- .bound((1-g$g1W)*g.Delta$g1W[,"Z0A0"], gbound) 215 | if(all(g1W.total==0)){g1W.total <- rep(10^-9, length(g1W.total))} 216 | if(all(g0W.total==0)){g0W.total <- rep(10^-9, length(g0W.total))} 217 | H1W <- A/g1W.total 218 | H0W <- (1-A)/g0W.total 219 | 220 | if (verbose) cat("Estimating epsilon\n") 221 | 222 | suppressWarnings( 223 | epsilon <- coef(glm(stage1$Ystar~-1 + offset(q$Q[,"QAW"]) + H0W + H1W, family=q$family, subset=delta==1)) 224 | ) 225 | epsilon[is.na(epsilon)] <- 0 # needed for EY1 calculation 226 | Qstar <- q$Q + c((epsilon[1]*H0W + epsilon[2]*H1W), epsilon[1]/g0W.total, epsilon[2]/g1W.total) 227 | colnames(Qstar) <- c("QAW", "Q0W", "Q1W") 228 | Ystar <- stage1$Ystar 229 | if (map_to_ystar) { 230 | Qstar <- plogis(Qstar)*diff(stage1$ab)+stage1$ab[1] 231 | Qstar <- plogis(Qstar)*diff(stage1$ab)+stage1$ab[1] 232 | q$Q <- plogis(q$Q)*diff(stage1$ab)+stage1$ab[1] 233 | Ystar <- Ystar*diff(stage1$ab)+stage1$ab[1] 234 | } 235 | colnames(q$Q) <- c("QAW", "Q0W", "Q1W") 236 | q$Q <- q$Q[,-1] 237 | 238 | if (verbose) cat("tmle::calcParameters\n") 239 | res <- tmle::calcParameters(Ystar, A, I.Z=rep(1, length(Ystar)), delta, g1W.total, g0W.total, Qstar, 240 | mu1=mean(Qstar[,"Q1W"]), mu0=mean(Qstar[,"Q0W"]), id, family, 241 | obsWeights=rep(1, length(Ystar))) 242 | 243 | #returnVal <- list(estimates=res, Qinit=Q, g=g, g.Z=g.z, g.Delta=g.Delta, Qstar=Qstar[,-1], epsilon=epsilon) 244 | #class(returnVal) <- "tmle" 245 | ############################################ 246 | 247 | 248 | 249 | 250 | # Calculate Qstar 251 | #QbarAW_star = plogis(qlogis(q$Q$QAW) + epsilon * h_aw) 252 | #Qbar1W_star = plogis(qlogis(q$Q$Q1W) + epsilon * h_1w) 253 | #Qbar0W_star = plogis(qlogis(q$Q$Q0W) + epsilon * h_0w) 254 | 255 | # Unit's estimated outcome under treatment: hat(Y) | A = 1, W 256 | # This is after the fluctuation step, so it is targeted. 257 | # Qst = tmle.1$Qstar[, 2] 258 | # Qst = tmle.1$Qstar$Q1W 259 | # Qst = Qbar1W_star 260 | Qst = Qstar[, "Q1W"] 261 | 262 | # E[Y | A = 1, W] 263 | theta = mean(Qst) 264 | 265 | # Influence curve 266 | IC = (A / g1) * (Y - Qst) + Qst - theta 267 | 268 | # Compile results. 269 | result = list(theta = theta, IC = IC, g_model = g$model, q_model = q$model, 270 | tmle = tmle.1, alpha = alpha, 271 | Qbounds = Qbounds, 272 | stage1_Qbounds = stage1$Qbounds, 273 | gbounds = g$bound, 274 | V = V, 275 | # May have had to reduce the # of SL folds for g, due to sparsity 276 | # in A. 
277 | g_V = g_V, 278 | fluctuation = fluctuation, 279 | map_to_ystar = map_to_ystar, ab = stage1$ab) 280 | 281 | return(result) 282 | } 283 | 284 | -------------------------------------------------------------------------------- /R/exportLatex.R: -------------------------------------------------------------------------------- 1 | 2 | #' Export varimpact results as Latex tables 3 | #' 4 | #' Outputs results from varimpact() into three Latex tables: consistent results, 5 | #' all results, and per-fold results. 6 | #' 7 | #' Creates three Latex table files: 8 | #' \itemize{ 9 | #' \item varimpConsistent.tex - the ``consistent'' significant results, meaning those with consistent categories chosen as comparison groups among factors and consistent ordering for numeric variables. 10 | #' \item varimpAll.tex - the file with cross-validated average variable impacts ordered by statistical significance. 11 | #' \item varimpByV.tex - the comparison levels used within each validation sample. Either integer ordering of factors or short-hand for percentile cut-off (0-1 is the 10th percentile, 10+ is the 100th percentile) 12 | #' } 13 | #' 14 | #' @param impact_results Result object from previous varimpact() call. 15 | #' @param outname (Optional) String that is prepended to filenames. 16 | #' @param dir (Optional) Directory to save the results, defaults to current directory. 17 | #' @param digits Digits to round numbers, passed through to xtable. 18 | #' @param ... Additional parameters passed to print.xtable(). 19 | #' 20 | #' @seealso 21 | #' \code{\link[varimpact]{varimpact}} 22 | #' 23 | #' @export 24 | # TODO: document return object. 25 | exportLatex = function(impact_results, outname = "", dir = ".", digits = 4, ...) { 26 | 27 | # Check if results are valid 28 | if (is.null(impact_results$results_by_fold) || is.null(impact_results$results_all)) { 29 | warning("Cannot export LaTeX: varimpact results are NULL or incomplete") 30 | return(invisible(NULL)) 31 | } 32 | 33 | table_byfold = cbind("Variable" = rownames(impact_results$results_by_fold), 34 | impact_results$results_by_fold) 35 | 36 | xtable_byfold = xtable::xtable(table_byfold, 37 | caption = "Variable Importance Results By Estimation Sample", 38 | label = "byFold", 39 | digits = digits) 40 | 41 | print(xtable_byfold, 42 | type = "latex", 43 | file = paste0(dir, "/", outname, "varimpByFold.tex"), 44 | caption.placement = "top", 45 | include.rownames = F, 46 | ...) 47 | 48 | 49 | # Use hline.after to add a line after the p = 0.05 cut-off. 50 | signif_cutoff = which(impact_results$results_all[, "Adj. p-value"] > 0.05) 51 | if (length(signif_cutoff) > 0) { 52 | signif_row = min(signif_cutoff) - 1 53 | hline.after = c(-1,0, signif_row, nrow(impact_results$results_all)) 54 | } else { 55 | # All variables are important. 56 | hline.after = NULL 57 | } 58 | 59 | table_all = cbind("Rank" = 1:nrow(impact_results$results_all), 60 | "Variable" = rownames(impact_results$results_all), 61 | impact_results$results_all) 62 | 63 | xtable_all = xtable::xtable(table_all, 64 | caption = "Variable Importance Results For Combined Estimates", 65 | label = "allRes", 66 | digits = digits) 67 | 68 | print(xtable_all, 69 | type = "latex", 70 | file = paste0(dir, "/", outname, "varimpAll.tex"), 71 | caption.placement = "top", 72 | include.rownames = F, 73 | hline.after = hline.after, 74 | ...) 
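  # At this point the by-fold and combined tables have been written; the consistent
  # table follows. In total this function writes three files (later removable with
  # cleanup_latex_files()): <outname>varimpByFold.tex, <outname>varimpAll.tex, and
  # <outname>varimpConsistent.tex.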
75 | 76 | if (nrow(impact_results$results_consistent) > 0) { 77 | table_consistent = cbind("Rank" = 1:nrow(impact_results$results_consistent), 78 | "Variable" = rownames(impact_results$results_consistent), 79 | impact_results$results_consistent) 80 | xtable_consistent = xtable::xtable(table_consistent, 81 | caption = "Subset of of Significant and ``Consistent'' Results", 82 | label = "consisRes", 83 | digits = digits) 84 | } else { 85 | # Create a blank dataframe. 86 | table_consistent = data.frame() 87 | # Create a blank xtable. 88 | xtable_consistent = NULL 89 | } 90 | 91 | print(xtable_consistent, 92 | type = "latex", 93 | file = paste0(dir, "/", outname, "varimpConsistent.tex"), 94 | caption.placement = "top", 95 | include.rownames = F, 96 | ...) 97 | 98 | # Return a list with the output results. 99 | results = list(tables = list( 100 | consistent = table_consistent, 101 | all = table_all, 102 | byfold = table_byfold 103 | ), 104 | xtables = list( 105 | consistent = xtable_consistent, 106 | all = xtable_all, 107 | byfold = xtable_byfold 108 | )) 109 | 110 | 111 | 112 | return(invisible(results)) 113 | } 114 | -------------------------------------------------------------------------------- /R/factors_to_indicators.R: -------------------------------------------------------------------------------- 1 | #' Convert a dataframe of factor into separate indicators. 2 | #' 3 | #' More details 4 | #' 5 | #' @param factor_df Dataframe consisting only of factor variables. 6 | #' @param miss_name_prefix Starting name for each missing indicator. 7 | #' @param verbose Set to T for detailed output. 8 | #' 9 | #' @return A list of results. 10 | #' 11 | #' @export 12 | factors_to_indicators = function(factor_df, miss_name_prefix = "Imiss_", 13 | verbose = F) { 14 | 15 | # TODO: confirm that factor_df contains only factor variables. 16 | 17 | sum_nas = apply(factor_df, 2, sum_na) 18 | 19 | if (verbose) cat("Factors with missingness:", sum(sum_nas > 0), "\n") 20 | 21 | facnames = names(factor_df) 22 | 23 | nam.fac = function(x, name) { 24 | num_chars = nchar(x) 25 | out = paste(name, substr(x, 2, num_chars), sep = "XX") 26 | # Remove spaces in variable names. 27 | out = gsub(" ", "", out) 28 | return(out) 29 | } 30 | 31 | ############ 32 | # Missing Basis for Factors 33 | 34 | factor_names = colnames(factor_df) 35 | miss.fac = NULL 36 | names_miss = NULL 37 | 38 | # Calculate the number of missing values for each column. 39 | sum_nas = apply(factor_df, 2, sum_na) 40 | 41 | # Loop over each column in our new indicator dataframe. 42 | # TODO: use which() to only loop over indices with missing data. 43 | for (col_k in 1:ncol(factor_df)) { 44 | if (sum_nas[col_k] > 0) { 45 | # if (verbose) cat("Missing data in", factor_names[col_k], "\n") 46 | # Again, we are flagging non-missing as 1 and missing as 0 here. 47 | indicator_column = as.numeric(is.na(factor_df[, col_k]) == F) 48 | 49 | miss.fac = cbind(miss.fac, indicator_column) 50 | 51 | names_miss = c(names_miss, paste0(miss_name_prefix, factor_names[col_k])) 52 | } 53 | } 54 | colnames(miss.fac) = names_miss 55 | 56 | newX = NULL 57 | factor_names = NULL 58 | 59 | # TODO: remove this line? 60 | options(na.action = "na.pass") 61 | 62 | # Loop over each factor variable. 63 | for (i in 1:ncol(factor_df)) { 64 | # Note: we want to keep this variable name as x so that the names are short 65 | # from model.matrix. We rely on this variable name being only 1 character. 66 | x = factor_df[, i] 67 | # CK: looks like we are omitting the first level? 
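# model.matrix(~ x - 1) below expands the factor into one 0/1 column per level;
# dropping the first column afterwards yields standard dummy coding with the first
# level as the reference category. A small illustration:
#   x <- factor(c("a", "b", "c", "b"))
#   model.matrix(~ x - 1)[, -1]   # keeps columns xb and xc; level "a" is the reference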
68 | if (T || sum_nas[i] == 0) { 69 | omit_levels = -1 70 | } else { 71 | # if there is missing data, also omit the last level (NA) 72 | omit_levels = -1 * c(1, length(levels(x))) 73 | } 74 | 75 | # Convert to a series of indicators. 76 | indicators = model.matrix(~ x - 1)[, omit_levels] 77 | names = colnames(indicators) 78 | 79 | # Any remaining missing data is set to 0. 80 | remaining_nas = sum(sapply(indicators, function(col) sum(is.na(col)))) 81 | if (remaining_nas > 0) { 82 | if (verbose) cat("Replacing", remaining_nas, "remaining nas with 0s.\n") 83 | indicators[is.na(indicators)] = 0 84 | } 85 | 86 | # CK: why do we need this option? 87 | if (is.null(names)) { 88 | names2 = facnames[i] 89 | } else { 90 | # Clean up the names for each indicator. 91 | names2 = nam.fac(names, facnames[i]) 92 | } 93 | 94 | # Accumulate the names. 95 | factor_names = c(factor_names, names2) 96 | # Append the next set of indicators to the resulting matrix. 97 | newX = cbind(newX, indicators) 98 | } 99 | 100 | colnames(newX) = factor_names 101 | 102 | ################## 103 | # Indexing vector for dummy basis back to original factors. 104 | cc = regexpr("XX", factor_names) 105 | ncc = nchar(factor_names) 106 | cc[cc < 0] = ncc[cc < 0] + 1 107 | 108 | # NOTE: we don't seem to actually use this in varImpact() at the moment. 109 | factor_index = substr(factor_names, 1, cc - 1) 110 | 111 | # Create a list to hold multi-variable results. 112 | results = list(data = newX, factor_index = factor_index, missing_indicators = miss.fac) 113 | 114 | results 115 | } 116 | -------------------------------------------------------------------------------- /R/globals.R: -------------------------------------------------------------------------------- 1 | # Global variable declarations to avoid R CMD check NOTEs 2 | # These variables are used in dplyr operations and ggplot2 3 | 4 | # Variables used in dplyr operations 5 | utils::globalVariables(c( 6 | "name", "level", "level_label", "test_msg", "train_msg", 7 | "cv_fold", "train_cell_size", "test_cell_size", 8 | "rawp", "BH", "AvePsi", "Consistent", 9 | "test_theta_tmle", "color" 10 | )) 11 | 12 | # Function used in dplyr operations 13 | utils::globalVariables("desc") -------------------------------------------------------------------------------- /R/length-unique.R: -------------------------------------------------------------------------------- 1 | # Function that counts # of unique values. 2 | # Do not export. 3 | length_unique = function(x) { 4 | # Skip NAs. 5 | length(setdiff(unique(x), NA)) 6 | } 7 | -------------------------------------------------------------------------------- /R/max_sqr.R: -------------------------------------------------------------------------------- 1 | #' Find the maximum of the squared values. 2 | #' 3 | #' @param x Numeric vector 4 | #' 5 | #' @return Maximum of the squared values, or -Inf if all elements are NA. 6 | #' @export 7 | max_sqr = function(x) { 8 | # Handle missing data manually so we don't get warnings when it occurs. 9 | x = na.omit(x) 10 | if (length(x) == 0) { 11 | -Inf 12 | } else { 13 | max(x^2) 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /R/plot-var.R: -------------------------------------------------------------------------------- 1 | #' Plot the varimpact results for a given variable 2 | #' 3 | #' Displays the adjusted treatment-specific means and the impact estimate. 
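#'
#' A minimal usage sketch is included below; the outcome, covariate data frame, and
#' variable name in it are illustrative.
#'
#' @examples
#' \dontrun{
#' # Illustrative names; see ?varimpact for the full interface.
#' vim <- varimpact(Y = y, data = covariates)
#' plot_var("age", vim)
#' }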
4 | #' @param var_name String name of the variable 5 | #' @param vim Varimpact result object that contains the variable 6 | #' @param digits Number of digits for rounding purposes. 7 | #' @param verbose If true, display extra output. 8 | # TODO: @example 9 | #' @import ggplot2 10 | #' @export 11 | plot_var = 12 | function(var_name, vim, digits = 2L, verbose = FALSE) { 13 | 14 | # Confirm that we can plot this variable. 15 | if (is.null(vim$numeric_vims$results_by_level) && 16 | is.null(vim$factor_vims$results_by_level)) { 17 | stop(paste("No results_by_level found in vim object.", 18 | "Please post an issue to github if this is persistent.")) 19 | } 20 | 21 | # Can only plot numerics currently, need to expand to factors. 22 | numeric_vars = unique(vim$numeric_vims$results_by_level$name) 23 | factor_vars = unique(vim$factor_vims$results_by_level$name) 24 | if (var_name %in% numeric_vars) { 25 | results = vim$numeric_vims$results_by_level 26 | } else if (var_name %in% factor_vars) { 27 | results = vim$factor_vims$results_by_level 28 | } else { 29 | stop("There is no variable called", var_name, ".\n") 30 | } 31 | 32 | 33 | # Create plot dataframe. 34 | plot_data = results[results$name == var_name, 35 | c("level", "level_label", "test_theta_tmle", 36 | "test_var_tmle")] 37 | 38 | # Create color column based on min, max, and other. 39 | plot_data$color = "Other" 40 | plot_data$color[which.min(plot_data$test_theta_tmle)] = "Low risk" 41 | plot_data$color[which.max(plot_data$test_theta_tmle)] = "High risk" 42 | 43 | # Add "Impact" row to dataframe. 44 | result_row = rownames(vim$results_all) == var_name 45 | uniq_vals = nrow(plot_data) 46 | plot_data = rbind(plot_data, 47 | list(level = NA, 48 | level_label = "Impact", 49 | test_theta_tmle = vim$results_all$Estimate[result_row], 50 | # Don't have this yet. 51 | test_var_tmle = NA, 52 | color = "Impact" 53 | )) 54 | 55 | # red, blue, orange, gray 56 | # impact, max, min, other 57 | plot_palette = c("#d09996", # red 58 | "#95b4df", # blue 59 | "#f2c197", # orange 60 | "#d9d9d9") # gray 61 | 62 | # New: 63 | # high risk, impact, low risk, "other" 64 | plot_palette = plot_palette[c(1, 3, 2, 4)] 65 | 66 | # Plot TSMs from $numeric_vims$results_by_level and the varimpact estimate. 67 | p = ggplot(data = plot_data, 68 | aes(x = factor(level_label), 69 | y = test_theta_tmle, 70 | #label = round(test_theta_tmle, 1L), 71 | fill = factor(color))) + 72 | geom_col(width = 0.4) + 73 | geom_label(aes(label = round(test_theta_tmle, digits)), 74 | size = 6, 75 | fill = "white", 76 | color = "gray10", 77 | # label.color = "gray50", 78 | hjust = -0.2) + 79 | # Make the bar char horizontal rather than vertical 80 | coord_flip() + 81 | theme_minimal() + 82 | theme(axis.title.x = element_blank(), 83 | axis.title.y = element_blank(), 84 | axis.text = element_text(size = 18), 85 | plot.title = element_text(size = 20), 86 | panel.background = element_rect(fill = "white", color = "gray50"), 87 | #plot.background = element_rect(fill = "gray95"), 88 | plot.background = element_rect(fill = "#f2f2f2"), 89 | panel.grid.major = element_blank(), 90 | panel.grid.minor = element_blank(), 91 | legend.title = element_blank(), 92 | #legend.position = "none") + 93 | NULL) + 94 | theme(legend.position = "bottom", 95 | plot.title = element_text(size = 15), 96 | axis.title.y = element_text(size = 14, angle = 270, 97 | # This should center the var label vertically. 
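# For example, with uniq_vals = 4 levels this evaluates to (4 / 5) / 2 = 0.4,
# just under the 0.5 that would center the title along the axis.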
98 | hjust = (uniq_vals / (uniq_vals + 1)) / 2, 99 | color = "gray30"), 100 | axis.title.x = element_text(size = 12, color = "gray40"), 101 | #margin = margin(t = 1)), 102 | legend.spacing.x = unit(0.2, 'cm'), 103 | plot.margin = unit(c(0.5, 1, 0, 0.5), "lines"), 104 | axis.text.x = element_text(size = 14, color = "gray40")) + 105 | geom_hline(yintercept = 0, color = "gray90") + 106 | scale_x_discrete(limits = c("Impact", rev(setdiff(unique(plot_data$level_label), "Impact")))) + 107 | scale_y_continuous(expand = c(0, 0.15)) + 108 | scale_fill_manual(values = plot_palette) + 109 | guides(fill = guide_legend(label.position = "bottom")) + 110 | labs(title = paste("Impact of", var_name), 111 | y = "Adjusted outcome mean", 112 | x = var_name) 113 | 114 | p 115 | } 116 | -------------------------------------------------------------------------------- /R/print.varimpact.R: -------------------------------------------------------------------------------- 1 | #' Custom printing of the varimpact results. 2 | #' 3 | #' Shows the significant and consistent results by default. If there are no 4 | #' consistent results it shows all results. 5 | #' 6 | #' @param x Results object from varimpact. 7 | #' @param ... Further arguments passed to or from other methods. 8 | #' 9 | #' @export 10 | print.varimpact = function(x, ...) { 11 | # Just print the significant and consistent results. 12 | if (!is.null(x$results_consistent) && nrow(x$results_consistent) > 0) { 13 | cat("Significant and consistent results:\n") 14 | print(x$results_consistent) 15 | } else if (!is.null(x$results_all)) { 16 | cat("No significant and consistent results.\n") 17 | cat("All results:\n") 18 | print(x$results_all) 19 | } else { 20 | cat("No results could be calculated.\n") 21 | } 22 | invisible(x) 23 | } 24 | -------------------------------------------------------------------------------- /R/process-factors.R: -------------------------------------------------------------------------------- 1 | process_factors = function(data.fac, 2 | quantile_probs_factor, 3 | miss.cut, 4 | verbose = FALSE) { 5 | 6 | # Set some default return values. 7 | # TODO: improve names for these objects. 8 | num_factors = 0L 9 | miss.fac = NULL 10 | datafac.dum = NULL 11 | 12 | ##################### 13 | if (ncol(data.fac) > 0L) { 14 | 15 | if (verbose) cat("Processing factors. Start count:", ncol(data.fac), "\n") 16 | 17 | ###################################### 18 | # Replace blank factor values with NA's. 19 | 20 | # We re-use this num_cols variable in the next section. 21 | num_cols = ncol(data.fac) 22 | for (i in 1:num_cols) { 23 | new_factor = as.character(data.fac[, i]) 24 | # The exclude argument replaces any empty strings with NAs. 25 | new_factor = factor(new_factor, exclude = "") 26 | data.fac[, i] = new_factor 27 | } 28 | 29 | ################### 30 | # For each factor, apply function and get rid of those where 31 | # 'true' data.fac is data frame of variables that are factors 32 | if (!is.null(quantile_probs_factor)) { 33 | data.fac = restrict_by_quantiles(data.fac, quantile_probs = quantile_probs_factor, 34 | verbose = verbose) 35 | } 36 | 37 | dropped_cols = num_cols - ncol(data.fac) 38 | 39 | if (verbose) { 40 | if (dropped_cols > 0) { 41 | cat("Dropped", dropped_cols, "factors due to lack of variation.\n") 42 | } else { 43 | cat("No factors dropped due to lack of variation.\n") 44 | } 45 | } 46 | 47 | # We don't seem to use this yet. 
48 | # num.cat = sapply(data.fac, length_unique) 49 | 50 | ###################### 51 | # Remove columns with missing data % greater than the threshold. 52 | sum_nas = sapply(data.fac, sum_na) 53 | 54 | if (length(sum_nas) == 0L) { 55 | if (verbose) { 56 | cat("All factors were dropped.\n") 57 | } 58 | data.fac = NULL 59 | } else { 60 | 61 | if (verbose) cat("Factors with missingness:", sum(sum_nas > 0L), "\n") 62 | 63 | miss_pct = sum_nas / nrow(data.fac) 64 | 65 | data.fac = data.fac[, miss_pct < miss.cut, drop = FALSE] 66 | 67 | if (verbose) { 68 | cat("Dropped", sum(miss_pct >= miss.cut), "factors due to the missingness threshold.\n") 69 | } 70 | 71 | # Save how many separate factors we have in this dataframe. 72 | num_factors = ncol(data.fac) 73 | 74 | factor_results = factors_to_indicators(data.fac, verbose = verbose) 75 | 76 | datafac.dum = factor_results$data 77 | # Here 1 = defined, 0 = missing. 78 | miss.fac = factor_results$missing_indicators 79 | 80 | if (verbose) { 81 | cat("End factor count:", num_factors, "Indicators:", ncol(datafac.dum), 82 | "Missing indicators:", ncol(miss.fac), "\n") 83 | } 84 | } 85 | } else { 86 | data.fac = NULL 87 | } 88 | 89 | (results = 90 | list( 91 | num_factors = num_factors, 92 | miss.fac = miss.fac, 93 | datafac.dum = datafac.dum, 94 | data.fac = data.fac 95 | )) 96 | 97 | } 98 | -------------------------------------------------------------------------------- /R/process-numerics.R: -------------------------------------------------------------------------------- 1 | process_numerics = 2 | function(data.num, 3 | quantile_probs_numeric, 4 | miss.cut, 5 | bins_numeric, 6 | impute, 7 | verbose = FALSE) { 8 | 9 | n = nrow(data.num) 10 | 11 | if (ncol(data.num) > 0) { 12 | num_cols = ncol(data.num) 13 | if (verbose) cat("Processing numerics. Start count:", num_cols, "\n") 14 | 15 | # Remove columns where the 0.1 and 0.9 quantiles have the same value, i.e. insufficent variation. 16 | # TODO: set this is a configurable setting? 17 | if (!is.null(quantile_probs_numeric)) { 18 | data.num = restrict_by_quantiles(data.num, quantile_probs = quantile_probs_numeric) 19 | } 20 | 21 | if (verbose) { 22 | num_dropped = num_cols - ncol(data.num) 23 | if (num_dropped > 0) { 24 | cat("Dropped", num_dropped, "numerics due to lack of variation.\n") 25 | } else { 26 | cat("No numerics dropped due to lack of variation.\n") 27 | } 28 | } 29 | 30 | # Save how many numeric variables we have in this dataframe. 31 | num_numeric = ncol(data.num) 32 | } else { 33 | num_numeric = 0L 34 | } 35 | 36 | if (num_numeric > 0) { 37 | if (verbose) cat("Cleaning up", num_numeric, "numeric variables.\n") 38 | # Make deciles for continuous variables 39 | X = data.num 40 | xc = dim(X)[2] 41 | # We don't seem to use this qt variable. 42 | qt = apply(na.omit(X), 2, quantile, probs = seq(0.1, 0.9, 0.1)) 43 | newX = NULL 44 | coln = NULL 45 | varn = colnames(X) 46 | 47 | num.cat = apply(X, 2, length_unique) 48 | 49 | # Matrix to store the numeric columns converted to bin levels (integer value per quantile). 50 | numerics_binned = matrix(nrow = n, ncol = num_numeric) 51 | 52 | # Make a list to store the levels for each numeric variable. 53 | numeric_levels = vector("list", num_numeric) 54 | 55 | for (numeric_i in 1:num_numeric) { 56 | # Because we do not specify "drop" within the brackets, Xt is now a vector. 
57 | Xt = X[, numeric_i] 58 | 59 | name = colnames(data.num)[numeric_i] 60 | 61 | if (verbose) { 62 | cat("Processing", name, numeric_i, "of", num_numeric, "\n") 63 | } 64 | 65 | # Suppress the warning that can occur when there are fewer than the desired 66 | # maximum number of bins, as specified by bins_numeric. We should be able to 67 | # see this as var_binned containing fewer than bins_numeric columns. 68 | # Warning is in .cut2(): min(xx[xx > upper]) 69 | # "no non-missing arguments to min; returning Inf" 70 | num_unique_vals = length(setdiff(unique(Xt), NA)) 71 | 72 | num_breaks = min(bins_numeric, num_unique_vals) 73 | 74 | arules_method = "frequency" 75 | 76 | # No need to apply tiling, we already have a limited # of unique vals. 77 | # TODO: return info on this, and possible a notice message. 78 | if (num_unique_vals <= bins_numeric) { 79 | arules_method = "interval" 80 | } 81 | 82 | suppressWarnings({ 83 | # Discretize into up to 10 quantiles (by default), configurable based on 84 | # bins_numeric argument. 85 | # This returns a factor version of the discretized variable. 86 | tryCatch({ var_binned_names = arules::discretize(Xt, 87 | method = arules_method, 88 | breaks = num_breaks, 89 | ordered = TRUE) 90 | }, error = function(error) { 91 | # This can happen with skewed distributions where multiple breaks are not unique. 92 | print(error) 93 | cat("Error: could not discretize numeric", numeric_i, "", name, "\n") 94 | cat("Unique values:", length(unique(Xt)), "\n") 95 | cat("Switching to cluster-based discretization.\n") 96 | tryCatch({ 97 | var_binned_names = arules::discretize(Xt, method = "cluster", 98 | breaks = num_breaks, 99 | ordered = TRUE)}, 100 | error = function(error2) { 101 | # TODO: use another package/function to discretize. 102 | print(error2) 103 | cat("Cluster-based discretization failed - using all levels.") 104 | var_binned_names = factor(Xt) 105 | }) 106 | 107 | }) 108 | }) 109 | 110 | # Save the levels for future usage. 111 | numeric_levels[[numeric_i]] = levels(var_binned_names) 112 | # This converts the factor variable to just the quantile numbers. 113 | var_binned = as.numeric(var_binned_names) 114 | numerics_binned[, numeric_i] = var_binned 115 | 116 | if (verbose) { 117 | cat("New levels:", paste(levels(var_binned_names), collapse = ", "), "\n") 118 | } 119 | } 120 | colnames(numerics_binned) = varn 121 | data.cont.dist = numerics_binned 122 | 123 | ############### 124 | # Missing Basis for numeric variables, post-binning. 125 | 126 | n.cont = nrow(data.cont.dist) 127 | 128 | sum_nas = apply(data.cont.dist, 2, sum_na) 129 | nmesX = colnames(data.cont.dist) 130 | miss.cont = NULL 131 | nmesm = NULL 132 | 133 | # Create imputed version of the numeric dataframe. 134 | # This is used as the adjustment set, but not used when generating the treatment assignment vector. 135 | data.numW = data.num 136 | 137 | # Loop over each binned numeric variable. 138 | # TODO: do this as part of the binning process. 139 | for (k in 1:num_numeric) { 140 | # Check if that variable has any missing values. 141 | if (sum_nas[k] > 0) { 142 | # The effect is that the basis is set to 1 if it exists and 0 if it's missing. 143 | ix = as.numeric(!is.na(data.cont.dist[, k])) 144 | miss.cont = cbind(miss.cont, ix) 145 | # TODO: convert to paste0 146 | nmesm = c(nmesm, paste("Imiss_", nmesX[k], sep = "")) 147 | } 148 | } 149 | # if(is.null(miss.cont)){miss.cont= rep(1,n.cont)} 150 | colnames(miss.cont) = nmesm 151 | 152 | # Impute missing data in numeric columns. 
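# Summary of the options below: "zero" simply overwrites NAs with 0; "median" and "knn"
# use caret::preProcess() plus predict() to fill in values, and caret's knnImpute also
# centers and scales the covariates as a side effect.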
153 | if (impute == "zero") { 154 | data.numW[is.na(data.num)] = 0 155 | impute_info = 0 156 | } else if (impute == "median") { 157 | impute_info = caret::preProcess(data.num, method = "medianImpute") 158 | data.numW = predict(impute_info, data.num) 159 | } else if (impute == "mean") { 160 | stop("Mean imputation not implemented yet. Please use another imputation method.") 161 | } else if (impute == "knn") { 162 | # NOTE: this also results in caret centering and scaling the data. 163 | impute_info = caret::preProcess(data.num, method = "knnImpute") 164 | data.numW = predict(impute_info, data.num) 165 | } 166 | 167 | # Confirm that there are no missing values remaining in data.numW 168 | stopifnot(sum(is.na(data.numW)) == 0) 169 | } else { 170 | data.cont.dist = NULL 171 | miss.cont = NULL 172 | data.numW = NULL 173 | impute_info = NULL 174 | data.num = NULL 175 | } 176 | 177 | (results = 178 | list( 179 | data.cont.dist = data.cont.dist, 180 | num_numeric = num_numeric, 181 | miss.cont = miss.cont, 182 | data.num = data.num, 183 | data.numW = data.numW, 184 | impute_info = impute_info 185 | )) 186 | } 187 | -------------------------------------------------------------------------------- /R/quantiles_equivalent.R: -------------------------------------------------------------------------------- 1 | #' Checks if two quantiles have the same sample value for a given vector. 2 | #' This indicates that the vector has little variation. 3 | #' 4 | #' @param x Data vector. If a factor it is converted to numeric using unclass(). 5 | #' @param quantile_probs Vector with two probabilities that specify the quantiles. 6 | #' 7 | #' @return True if the two quantiles are equal, indicating a lack of variation in the sample data. 8 | #' 9 | #' @seealso restrict_by_quantiles 10 | #' 11 | #' @export 12 | quantiles_equivalent = function(x, quantile_probs = c(0.1, 0.9)) { 13 | if (length(quantile_probs) != 2) { 14 | warning("Quantiles_equivalent() expects quantile_probs to be a 2-element vector.") 15 | } 16 | if (is.factor(x)) { 17 | x = unclass(x) 18 | } 19 | quantiles = quantile(x, probs = quantile_probs, na.rm = T) 20 | # Returns True if there is no differences between the first and second quantiles. 21 | (quantiles[2] - quantiles[1]) == 0 22 | } 23 | -------------------------------------------------------------------------------- /R/reduce_dimensions.R: -------------------------------------------------------------------------------- 1 | #' Reduce variables in a dataframe to a target number of covariates. 2 | #' 3 | #' Currently uses HOPACH hierarchical clustering but could be generalized. 4 | #' 5 | #' @param data Dataframe 6 | #' @param newX Optional second dataframe to receive the same reduction. 7 | #' @param max_variables Maximum we want to allow, after which dimension reduction 8 | #' will take place. Cannot be more than 15 due to HOPACH limitations. Set to NULL 9 | #' to disable any dimension reduction. 10 | #' @param verbose If true will output additional information during execution. 11 | #' 12 | #' @importFrom hopach hopach distancematrix 13 | #' 14 | #' @export 15 | reduce_dimensions = function(data, newX = NULL, max_variables, verbose = FALSE) { 16 | 17 | # Identify constant columns in training data. 18 | is_constant = sapply(data, function(col) var(col, na.rm = TRUE) == 0) 19 | 20 | if (sum(is_constant) > 0) { 21 | if (verbose) cat("First removing", sum(is_constant), "constant columns.\n") 22 | 23 | # Remove constant columns. 
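# Constant (zero-variance) columns carry no information for clustering or adjustment and,
# presumably, can make the cosine-angle distance used by HOPACH below undefined, so they
# are removed from both the training and validation data first.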
24 | data = data[, !is_constant, drop = FALSE] 25 | 26 | # Remove those same constant columns from the test data, if it was provided. 27 | if (!is.null(newX)) { 28 | # Here we have to operate by names in case the validation data has different columns. 29 | newX = newX[, !names(newX) %in% names(is_constant[is_constant]), drop = FALSE] 30 | } 31 | } 32 | 33 | # Set this by default, then override it if we do reduce dimensions. 34 | variables = colnames(data) 35 | 36 | num_columns = ncol(data) 37 | 38 | # Skip if the number of covariates is within the target maximum or if the maximum is NULL; check for NULL first so the comparison is never evaluated against a NULL cutoff. 39 | if (is.null(max_variables) || num_columns <= max_variables) { 40 | Wtsht = data 41 | Wvsht = newX 42 | 43 | # We still need to restrict validation W to contain only columns that 44 | # were in the training data. I.e. remove any extra missingness indicators. 45 | # TODO: should we do this at the very beginning? 46 | Wvsht = Wvsht[, colnames(Wvsht) %in% colnames(Wtsht), drop = FALSE] 47 | 48 | } else { 49 | if (verbose) cat("Reducing dimensions via clustering.\n") 50 | 51 | #mydist = as.matrix(hopach::distancematrix(t(Wt), d = "cosangle", na.rm = T)) 52 | 53 | # Compute pairwise distances between each variable in the dataframe. 54 | # We transpose Wt because we want to cluster columns rather than rows. 55 | mydist = try(hopach::distancematrix(t(data), d = "cosangle", na.rm = T), 56 | silent = !verbose) 57 | if (inherits(mydist, "try-error")) { 58 | cat("Error in HOPACH clustering: failed to calculate distance matrix.\n") 59 | } 60 | 61 | # Attempt #1. 62 | # We transpose Wt to cluster the columns rather than rows. 63 | # K = number of variables to choose. 64 | # kmax = maximum number of children at each node in the tree. 65 | # khigh = max # of children at each node when computing mss, usually the same. 66 | suppressWarnings({ # Suppress warnings about newmed = "medsil" in collap(). 67 | hopach.1 = try(hopach::hopach(t(data), dmat = mydist, mss = "mean", verbose = verbose, 68 | K = max_variables, kmax = 3, khigh = 3), 69 | silent = !verbose) 70 | }) 71 | if (inherits(hopach.1, "try-error")) { 72 | if (verbose) { 73 | cat("Hopach attempt 1 failed.\n") 74 | print(hopach.1) 75 | } 76 | 77 | # Attempt #2. 78 | # We transpose Wt to cluster the columns rather than rows. 79 | suppressWarnings({ # Suppress warnings about newmed = "medsil" in collap(). 80 | hopach.1 <- try(hopach::hopach(t(data), dmat = mydist, mss = "med", 81 | verbose = verbose, 82 | K = max_variables, kmax = 3, khigh = 3), 83 | silent = !verbose) 84 | }) 85 | } 86 | if (inherits(hopach.1, "try-error")) { 87 | if (verbose) { 88 | cat("Attempt 2 failed.\n") 89 | print(hopach.1) 90 | } 91 | 92 | # Attempt #3. Last try! 93 | # We transpose Wt to cluster the columns rather than rows. 94 | suppressWarnings({ # Suppress warnings about newmed = "medsil" in collap(). 95 | hopach.1 <- try(hopach::hopach(t(data), dmat = mydist, mss = "med", 96 | verbose = F, 97 | K = max_variables, kmax = 3, khigh = 3, 98 | newmed="nn"), 99 | silent = !verbose) 100 | }) 101 | } 102 | if (inherits(hopach.1, "try-error")) { 103 | if (verbose) { 104 | cat("Attempt 3 failed. Reverting to original W dataframe.\n") 105 | # Now try to debug this. 106 | # stop() 107 | } 108 | #warning("Dimensionality reduction failed. i=", i, "V=", kk, "A=", nameA) 109 | Wtsht = data 110 | Wvsht = newX 111 | } else { 112 | # TODO: annotate what is going on here with the HOPACH result object.
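# Roughly what the block below appears to do with the HOPACH result: each digit of
# final$labels encodes cluster membership at one level of the tree, so truncating the
# labels by powers of 10 groups the variables into the clusters at each level. The code
# then finds the shallowest level with at least max_variables clusters and keeps the
# medoid variable (from final$medoids) of each cluster at that level as the reduced
# covariate set.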
113 | nlvls = nchar(max(hopach.1$final$labels)) 114 | no = trunc(mean(log10(hopach.1$final$labels))) 115 | 116 | # Find highest level of tree where minimum number of covariates is >= adjust_cutoff. 117 | lvl = 1:nlvls 118 | ncv = NULL 119 | for (ii in lvl) { 120 | ncv = c(ncv, length(unique(trunc(hopach.1$final$labels/10^(no - (ii - 1)))))) 121 | } 122 | ncv = unique(ncv) 123 | # Suppress possible "no non-missing arguments to min; returning Inf" 124 | # warning from min(). 125 | # TODO: investigate more and confirm that this is ok. 126 | suppressWarnings({ 127 | lev = min(min(nlvls, dim(data)[2]), min(lvl[ncv >= max_variables])) 128 | }) 129 | two.clust <- unique(trunc(hopach.1$final$labels/(10^(no - (lev - 1))))) 130 | md <- hopach.1$final$medoids 131 | mm = md[, 1] %in% two.clust 132 | incc = md[mm, 2] 133 | 134 | # Restrict to those variables in the training and validation data. 135 | Wtsht = data[, incc, drop = F] 136 | Wvsht = newX[, incc, drop = F] 137 | 138 | # Save the chosen variables so that we can return them in the result list. 139 | variables = colnames(data)[incc] 140 | } 141 | } 142 | 143 | # If training data contains any columsn that don't exist in the validation data 144 | # create them and assign a value of 0. 145 | 146 | missing_cols = setdiff(colnames(Wtsht), colnames(Wvsht)) 147 | if (length(missing_cols) > 0) { 148 | if (verbose) cat(paste("Adding missing columns in prediction data:", 149 | paste(missing_cols, collapse = ", "))) 150 | 151 | new_cols = matrix(0, nrow = nrow(Wvsht), ncol = length(missing_cols)) 152 | colnames(new_cols) = missing_cols 153 | Wvsht = cbind(Wvsht, new_cols) 154 | 155 | # Sort columns in the correct order so that matrix multiplication is correct. 156 | Wvsht = Wvsht[, colnames(Wtsht), drop = FALSE] 157 | } 158 | 159 | if (verbose) cat(" Updated columns:", ncol(Wtsht), "training", 160 | ncol(Wvsht), "validation.\n") 161 | 162 | results = list(data = Wtsht, newX = Wvsht, variables = variables) 163 | results 164 | } 165 | -------------------------------------------------------------------------------- /R/restrict_by_quantiles.R: -------------------------------------------------------------------------------- 1 | #' Remove columns from a dataframe if they do not have sufficient variation. 2 | #' 3 | #' @param data Dataframe or matrix 4 | #' @param quantile_probs The probabilities corresponding to the quantiles that will be compared. 5 | #' @param verbose If TRUE output additional details during execution. 6 | #' 7 | #' @return New dataframe with the restriction applied. 8 | #' 9 | #' @seealso quantiles_equivalent 10 | #' 11 | #' @export 12 | restrict_by_quantiles = 13 | function(data, 14 | quantile_probs = c(0.1, 0.9), 15 | verbose = FALSE) { 16 | 17 | # Drop column if the two quantiles have the same sample value (i.e. difference = 0). 18 | # True = remove, False = keep 19 | # TODO: support parallelization, e.g. for very wide datasets. 20 | drop_cols = sapply(1:ncol(data), 21 | function(i) quantiles_equivalent(data[, i], quantile_probs)) 22 | 23 | if (verbose) { 24 | if (sum(drop_cols) > 0) { 25 | cat("restrict_by_quantiles(): dropping", paste(names(data)[drop_cols], collapse = ", "), "\n") 26 | } 27 | } 28 | 29 | # Restrict to variables with sufficient variation. 
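# For example, a 0/1 indicator that is zero for more than 90% of observations has
# identical 0.1 and 0.9 quantiles under the default quantile_probs = c(0.1, 0.9),
# so it would be dropped here.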
30 | data = data[, !drop_cols, drop = FALSE] 31 | 32 | return(data) 33 | } 34 | -------------------------------------------------------------------------------- /R/results-by-level.R: -------------------------------------------------------------------------------- 1 | #' Aggregate the results_by_fold_and_level df into a results_by_level df. 2 | #' 3 | #' @param results_by_fold_and_level Dataframe containing the VIM results for 4 | #' all levels of each variable across all CV folds. 5 | #' @param verbose If true, display extra output. 6 | #' @importFrom magrittr %>% 7 | #' @importFrom dplyr group_by summarize_all funs select mutate first 8 | #' @importFrom modeest mlv 9 | results_by_level = 10 | function(results_by_fold_and_level, 11 | verbose = FALSE) { 12 | tryCatch({ 13 | results = results_by_fold_and_level %>% 14 | # NOTE: because we group by level_label, we are assuming that any histogram 15 | # penalization happened outside of the CV to ensure that the levels are 16 | # the same across training folds. 17 | group_by(name, level) %>% 18 | # TEMP: restrict to the most common label. 19 | #mutate(level_label = as.character(mlv(as.factor(level_label), method = "mfv")$M)) %>% 20 | # TEMP: restrict to the first label 21 | mutate(level_label = first(level_label)) %>% 22 | # Now we can also group by level_label because they will be the same for a given level. 23 | group_by(name, level, level_label) %>% 24 | # Remove test_msg for now. 25 | # TODO: take mode of test_msg or first value, rather than mean. 26 | select(-c(test_msg, train_msg)) %>% 27 | # this generates a warning in mean() because test_msg is a character not a numeric. 28 | summarize_all(dplyr::funs(mean)) %>% 29 | select(-c(cv_fold, train_cell_size, test_cell_size)) 30 | 31 | # Don't keep this as a tibble. 32 | as.data.frame(results) 33 | }, error = function(error) { 34 | cat("Failed in results_by_level()\n") 35 | print(error) 36 | NULL 37 | }) 38 | } 39 | -------------------------------------------------------------------------------- /R/separate_factors_numerics.R: -------------------------------------------------------------------------------- 1 | #' Split df into one of only factors and one of only numerics. 2 | #' 3 | #' @param data A dataframe or matrix. 4 | #' @param strings_to_factors Convert character strings to factors if True, 5 | #' otherwise ignore character string variables. 6 | #' @param verbose If TRUE output additional details during execution. 7 | #' 8 | #' @return Results list. 9 | #' 10 | #' Convert strings to factors as well. 11 | #' 12 | #' @export 13 | separate_factors_numerics = 14 | function(data, strings_to_factors = TRUE, 15 | verbose = FALSE) { 16 | 17 | if (strings_to_factors) { 18 | # Identify character strings. 19 | is_char = sapply(data, is.character) 20 | 21 | # Convert strings to factors. 22 | data[, is_char] = sapply(data[, is_char], as.factor) 23 | } 24 | 25 | # Identify factors. 26 | is_factor = sapply(data, is.factor) 27 | df_factors = data[, is_factor, drop = FALSE] 28 | 29 | # Identify numerics. 
30 | is_numeric = sapply(data, is.numeric) 31 | df_numerics = data[, is_numeric, drop = FALSE] 32 | 33 | result = list(df_factors = df_factors, df_numerics = df_numerics) 34 | return(result) 35 | } 36 | -------------------------------------------------------------------------------- /R/sum_na.R: -------------------------------------------------------------------------------- 1 | #' Get missingness for each column 2 | #' 3 | #' Function for getting total number missing values for vector 4 | #' 5 | #' @param x Vector, matrix, or dataframe 6 | sum_na = function(x) { 7 | sum(is.na(x)) 8 | } 9 | -------------------------------------------------------------------------------- /R/tmle_bound.R: -------------------------------------------------------------------------------- 1 | # CK: copied from tmle package. 2 | #---------- function .bound --------------- 3 | # set outliers to min/max allowable values 4 | # assumes x contains only numerical data 5 | #----------------------------------------- 6 | #' @export 7 | .bound <- function(x, bounds) { 8 | x[x>max(bounds)] <- max(bounds) 9 | x[x 1 64 | type <- "user-supplied values" 65 | if(is.null(Q)){ 66 | if(verbose) { cat("\tEstimating initial regression of Y on A and W\n")} 67 | Q <- matrix(NA, nrow=length(Y), ncol = 5) 68 | colnames(Q)<- c("QAW", "Q0W", "Q1W", "Q0W.Z1", "Q1W.Z1") 69 | if(!(is.null(Qform))){ 70 | if(identical(as.character(as.formula(Qform)), c("~","Y", "."))){ 71 | if(CDE){ 72 | Qform <- paste("Y~Z+A+", paste(colnames(W), collapse="+")) 73 | } else { 74 | Qform <- paste("Y~A+", paste(colnames(W), collapse="+")) 75 | } 76 | } 77 | m <- suppressWarnings(glm(Qform, data=data.frame(Y,Z,A,W, Delta), family=family, subset=Delta==1)) 78 | Q[,"QAW"] <- predict(m, newdata=data.frame(Y,Z,A,W), type="response") 79 | Q[,"Q0W"] <- predict(m, newdata=data.frame(Y,Z=0,A=0,W), type="response") 80 | Q[,"Q1W"] <- predict(m, newdata=data.frame(Y,Z=0,A=1,W), type="response") 81 | Q[,"Q0W.Z1"] <- predict(m, newdata=data.frame(Y,Z=1,A=0,W), type="response") 82 | Q[,"Q1W.Z1"] <- predict(m, newdata=data.frame(Y,Z=1,A=1,W), type="response") 83 | coef <- coef(m) 84 | type="glm, user-supplied model" 85 | } else { 86 | if(cvQinit){ 87 | stop("cvQinit = T not supported sadly.") 88 | #m <- try(tmle::.estQcvSL(Y,X=cbind(Z,A,W),SL.library, family=family, 89 | # Delta=Delta, Qbounds=Qbounds,id=id, verbose=verbose)) 90 | #if(!(identical(class(m), "try-error"))){ 91 | # type <- "cross-validated SL" 92 | # Qinit <- m 93 | # Q <- Qinit$Q 94 | #} 95 | } else { 96 | if(verbose) {cat("\t using SuperLearner\n")} 97 | n <- length(Y) 98 | X <- data.frame(Z,A,W) 99 | X00 <- data.frame(Z=0,A=0, W) 100 | X01 <- data.frame(Z=0,A=1, W) 101 | newX <- rbind(X, X00, X01) 102 | if(CDE) { 103 | X10 <- data.frame(Z=1,A=0, W) 104 | X11 <- data.frame(Z=1,A=1, W) 105 | newX <- rbind(newX, X10, X11) 106 | } 107 | if(packageDescription("SuperLearner")$Version < SL.version){ 108 | arglist <- list(Y=Y[Delta==1],X=X[Delta==1,], newX=newX, SL.library=SL.library, 109 | V=V, family=family, save.fit.library=T, id=id[Delta==1]) 110 | } else { 111 | arglist <- list(Y=Y[Delta==1],X=X[Delta==1,], newX=newX, SL.library=SL.library, 112 | cvControl=list(V=V), family=family, control = list(saveFitLibrary=T), id=id[Delta==1]) 113 | } 114 | suppressWarnings({ 115 | # CK: try to eliminate messages from loading packages. 
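# capture.output() plus suppressPackageStartupMessages() below swallow the console noise
# that SuperLearner's learner wrappers can produce when they load their dependencies,
# so repeated fits of Q stay quiet.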
116 | out = utils::capture.output({ 117 | suppressPackageStartupMessages({ 118 | m <- try(do.call(SuperLearner::SuperLearner, arglist)) 119 | }) 120 | }) 121 | # Set call to null because do.call() messes up that element. 122 | m$call = NULL 123 | }) 124 | if (identical(class(m),"SuperLearner")){ 125 | #if (verbose) print(m) 126 | Q[,"QAW"] <- m$SL.predict[1:n] 127 | Q[,"Q0W"] <- m$SL.predict[(n+1):(2*n)] 128 | Q[,"Q1W"] <- m$SL.predict[(2*n+1):(3*n)] 129 | if(CDE){ 130 | Q[,"Q0W.Z1"] <- m$SL.predict[(3*n+1):(4*n)] 131 | Q[,"Q1W.Z1"] <- m$SL.predict[(4*n+1):(5*n)] 132 | } 133 | type <- "SuperLearner" 134 | } else { 135 | stop("Super Learner failed when estimating Q. Exiting program\n") 136 | } 137 | } } 138 | } 139 | if(is.na(Q[1,1]) | identical(class(m), "try-error")){ 140 | if(verbose) {cat("\t Running main terms regression for 'Q' using glm\n")} 141 | Qform <- paste("Y~Z+A+", paste(colnames(W), collapse="+")) 142 | m <- glm(Qform, data=data.frame(Y,Z,A,W, Delta), family=family, subset=Delta==1) 143 | Q[,"QAW"] <- predict(m, newdata=data.frame(Y,Z,A,W), type="response") 144 | Q[,"Q1W"] <- predict(m, newdata=data.frame(Y,Z=0,A=1,W), type="response") 145 | Q[,"Q0W"] <- predict(m, newdata=data.frame(Y,Z=0,A=0,W), type="response") 146 | Q[,"Q0W.Z1"] <- predict(m, newdata=data.frame(Y,Z=1,A=0,W), type="response") 147 | Q[,"Q1W.Z1"] <- predict(m, newdata=data.frame(Y,Z=1,A=1,W), type="response") 148 | 149 | coef <- coef(m) 150 | type="glm, main terms model" 151 | } 152 | Q <- varimpact::.bound(Q, Qbounds) 153 | if(maptoYstar | identical(Qfamily,"binomial") | identical(Qfamily, binomial)){ 154 | Q <- qlogis(Q) 155 | Qfamily <- "binomial" 156 | } else if (identical(Qfamily, "poisson") | identical(Qfamily, poisson)) { 157 | Q <- log(Q) 158 | Qfamily <- "poisson" 159 | } 160 | if(!CDE){ 161 | Q <- Q[,1:3] 162 | } 163 | if(cvQinit){ 164 | Qinit$Q <- Q 165 | } else { 166 | Qinit <- list(Q=Q, family=Qfamily, coef=coef, type=type, model = m) 167 | if(type=="SuperLearner"){ 168 | Qinit$SL.library=SL.library 169 | Qinit$coef=m$coef 170 | } 171 | } 172 | return(Qinit) 173 | } 174 | -------------------------------------------------------------------------------- /R/tmle_init_stage1.R: -------------------------------------------------------------------------------- 1 | # Copied from tmle package. 
2 | #---------- function .initStage1 --------------- 3 | # Bound Y, map to Ystar if applicable, and 4 | # set bounds on Q and enforce them on user-specified values 5 | # returns 6 | # Ystar - outcome values (between [0,1] if maptoYstar=TRUE) 7 | # Q - matrix of user-specified values 8 | # Qbounds - bounds on predicted values for Q (10% wider at each end than 9 | # observed range of Y) 10 | # (-Inf,+Inf) is default for linear regression 11 | # ab - bounding levels used to transform Y to Ystar 12 | #----------------------------------------------- 13 | tmle_init_stage1 <- function(Y,A, Q, Q.Z1=NULL, Delta, Qbounds, alpha, maptoYstar, family){ 14 | if(family=="binomial") {Qbounds <- c(0,1)} 15 | if(is.null(Qbounds)) { 16 | if(maptoYstar){ 17 | Qbounds <- range(Y[Delta==1]) 18 | Qbounds <- Qbounds + .1*c(-abs(Qbounds[1]),abs(Qbounds[2])) 19 | } else { 20 | Qbounds <- c(-Inf, Inf) 21 | } 22 | } 23 | if(!is.null(Q)){ 24 | QAW <- (1-A)*Q[,1] + A*Q[,2] 25 | Q <- cbind(QAW, Q0W=Q[,1], Q1W=Q[,2]) 26 | } 27 | if(!is.null(Q.Z1)){ 28 | Q <- cbind(Q, Q0W.Z1=Q.Z1[,1], Q1W.Z1=Q.Z1[,2]) 29 | } 30 | ab <- c(0,1) 31 | Ystar <- Y 32 | if(maptoYstar){ 33 | Ystar <- .bound(Y, Qbounds) 34 | if(!is.null(Q)){ 35 | Q <- .bound(Q, Qbounds) 36 | } 37 | if(0 >= alpha | 1 <= alpha){ 38 | alpha <- .995 39 | warning(paste("\n\talpha must be between 0 and 1, alpha reset to",alpha,"\n"), 40 | immediate. = TRUE) 41 | } 42 | ab <- range(Ystar, na.rm=TRUE) 43 | Ystar[is.na(Ystar)] <- 0 44 | Ystar <- (Ystar-ab[1])/diff(ab) 45 | if(!is.null(Q)){Q <- (Q-ab[1])/diff(ab)} 46 | Qbounds <- c(alpha, 1-alpha) 47 | } 48 | return(list(Ystar=Ystar, Q=Q, Qbounds=Qbounds, ab=ab)) 49 | } 50 | -------------------------------------------------------------------------------- /R/varimpact.R: -------------------------------------------------------------------------------- 1 | #' @title Variable importance estimation using causal inference (targeted learning) 2 | #' 3 | #' @description \code{varimpact} returns variable importance statistics ordered 4 | #' by statistical significance using a combination of data-adaptive target 5 | #' parameters and cross-validated targeted maximum likelihood estimation (CV-TMLE). 6 | #' 7 | #' @details 8 | #' The function performs the following steps. 9 | #' \enumerate{ 10 | #' \item Drops variables missing more than the miss.cut proportion of the time (tuneable). 11 | #' \item Separates covariates into factors and continuous (ordered) variables. 12 | #' \item Drops variables whose distribution is too uneven - e.g., nearly all one 13 | #' value (tuneable) - separately for factors and numeric variables (ADD MORE 14 | #' DETAIL HERE) 15 | #' \item Makes dummy variable basis for factors, including naming dummies 16 | #' to be traceable to original factor variables later. 17 | #' \item Makes a new ordered variable of integers mapped to intervals defined by 18 | #' deciles for the ordered numeric variables (automatically makes fewer 19 | #' categories if the original variable has < 10 values). 20 | #' \item Creates an associated list of the number of unique values, and the values themselves, 21 | #' for each variable, for use in the variable importance part. 22 | #' \item Makes missing covariate basis for both factors and ordered variables. 23 | #' \item For each variable, after assigning it as A, uses an optimal histogram 24 | #' function to combine values using the distribution of A | Y=1 to avoid very 25 | #' small cell sizes in the distribution of Y vs. A (tuneable) (ADD DETAIL) 26 | #' \item Uses HOPACH* to cluster the variables of the associated confounder/missingness 27 | #' basis W, reducing it to roughly the specified number of adjustment variables.
28 | #' \item Finds the min and max estimates of E(Ya) with respect to a, after looping through 29 | #' all values of A* (after processing by the histogram step). 30 | #' \item Returns the estimate of E(Ya(max)-Ya(min)) with its SE using CV-TMLE. 31 | #' } 32 | #' *HOPACH is "Hierarchical Ordered Partitioning and Collapsing Hybrid" 33 | #' 34 | #' @param Y Outcome of interest (numeric vector). 35 | #' @param data Data frame of predictor variables of interest for 36 | #' which the function returns VIMs (possibly a matrix?). 37 | #' @param A_names Names of the variables for which we want to estimate importance, 38 | #' a subset of the data argument. 39 | #' @param V Number of cross-validation folds. 40 | #' @param Q.library Library used by SuperLearner for the model of the outcome 41 | #' versus predictors. 42 | #' @param g.library Library used by SuperLearner for the model of the 43 | #' predictor variable of interest versus the other predictors. 44 | #' @param family Outcome family ('binomial' or 'gaussian'). 45 | #' @param minYs Minimum # of obs with the event - if there are fewer than minYs, skip the VIM. 46 | #' @param minCell Cut-off for including a category of A in the analysis: the 47 | #' minimum cell count in a 2x2 table of the indicator for that level 48 | #' versus the outcome, computed separately in the training and validation samples. 49 | #' @param adjust_cutoff Maximum number of adjustment variables during TMLE. If 50 | #' more than this cutoff varimpact will attempt to reduce the dimensions to 51 | #' that number (using HOPACH hierarchical clustering). Must not be more than 52 | #' 15 due to HOPACH constraints. Set to NULL to disable any dimension 53 | #' reduction. 54 | #' @param corthres Cut-off correlation with the explanatory 55 | #' variable for inclusion of an adjustment variable. 56 | #' @param impute Type of missing value imputation to conduct. One of: "zero", 57 | #' "median" (default), "knn". Note: knn results in the covariate data being centered/scaled. 58 | #' @param miss.cut Eliminates explanatory (X) variables with proportion 59 | #' of missing obs > this cut-off. 60 | #' @param bins_numeric Number of bins when discretizing numeric variables. 61 | #' @param quantile_probs_factor Quantiles used to check if factors have 62 | #' sufficient variation. 63 | #' @param quantile_probs_numeric Quantiles used to check if numerics have 64 | #' sufficient variation. 65 | #' @param parallel Use parallel processing if a backend is registered; enabled 66 | #' by default. 67 | #' @param verbose Boolean - if TRUE the method will display more detailed 68 | #' output. 69 | #' @param verbose_tmle Boolean - if TRUE, will display even more detail on the TMLE 70 | #' estimation process. 71 | #' @param verbose_reduction Boolean - if TRUE, will display more detail during the 72 | #' variable reduction step (clustering). 73 | #' @param digits Number of digits to round the value labels. 74 | #' 75 | #' @return Results object. TODO: add more detail here. 76 | #' 77 | #' @importFrom stats cor model.matrix na.omit pnorm quantile var 78 | #' @importFrom SuperLearner All 79 | #' 80 | #' @seealso 81 | #' \code{\link[varimpact]{exportLatex}}, \code{\link[varimpact]{print.varimpact}} 82 | #' 83 | #' @encoding utf8 84 | #' 85 | #' @section Authors: 86 | #' Alan E. Hubbard and Chris J. Kennedy, University of California, Berkeley 87 | #' 88 | #' 89 | #' @section References: 90 | #' Benjamini, Y., & Hochberg, Y. (1995). \emph{Controlling the false discovery 91 | #' rate: a practical and powerful approach to multiple testing}. Journal of the 92 | #' Royal Statistical Society.
Series B (Methodological), 289-300. 93 | #' 94 | #' Gruber, S., & van der Laan, M. J. (2012). \emph{tmle: An R Package for 95 | #' Targeted Maximum Likelihood Estimation}. Journal of Statistical Software, 96 | #' 51(i13). 97 | #' 98 | #' Hubbard, A. E., Kherad-Pajouh, S., & van der Laan, M. J. (2016). 99 | #' \emph{Statistical Inference for Data Adaptive Target Parameters}. The 100 | #' international journal of biostatistics, 12(1), 3-19. 101 | #' 102 | #' Hubbard, A., Munoz, I. D., Decker, A., Holcomb, J. B., Schreiber, M. A., 103 | #' Bulger, E. M., ... & Rahbar, M. H. (2013). \emph{Time-Dependent Prediction 104 | #' and Evaluation of Variable Importance Using SuperLearning in High Dimensional 105 | #' Clinical Data}. The journal of trauma and acute care surgery, 75(1 0 1), S53. 106 | #' 107 | #' Hubbard, A. E., & van der Laan, M. J. (2016). \emph{Mining with inference: 108 | #' data-adaptive target parameters (pp. 439-452)}. In P. Buhlmann et al. (Ed.), 109 | #' \emph{Handbook of Big Data}. CRC Press, Taylor & Francis Group, LLC: Boca 110 | #' Raton, FL. 111 | #' 112 | #' van der Laan, M. J. (2006). \emph{Statistical inference for variable 113 | #' importance}. The International Journal of Biostatistics, 2(1). 114 | #' 115 | #' van der Laan, M. J., & Pollard, K. S. (2003). \emph{A new algorithm for 116 | #' hybrid hierarchical clustering with visualization and the bootstrap}. Journal 117 | #' of Statistical Planning and Inference, 117(2), 275-303. 118 | #' 119 | #' van der Laan, M. J., Polley, E. C., & Hubbard, A. E. (2007). \emph{Super 120 | #' learner}. Statistical applications in genetics and molecular biology, 6(1). 121 | #' 122 | #' van der Laan, M. J., & Rose, S. (2011). \emph{Targeted learning: causal 123 | #' inference for observational and experimental data}. Springer Science & 124 | #' Business Media. 125 | #' 126 | #' @examples 127 | #' #################################### 128 | #' # Create test dataset. 129 | #' set.seed(1) 130 | #' N <- 100 131 | #' num_normal <- 5 132 | #' X <- as.data.frame(matrix(rnorm(N * num_normal), N, num_normal)) 133 | #' Y <- rbinom(N, 1, plogis(.2*X[, 1] + .1*X[, 2] - .2*X[, 3] + .1*X[, 3]*X[, 4] - .2*abs(X[, 4]))) 134 | #' # Add some missing data to X so we can test imputation. 135 | #' for (i in 1:10) X[sample(nrow(X), 1), sample(ncol(X), 1)] <- NA 136 | #' 137 | #' #################################### 138 | #' # Basic example 139 | #' 140 | #' vim <- varimpact(Y = Y, data = X[, 1:3]) 141 | #' vim 142 | #' vim$results_all 143 | #' exportLatex(vim) 144 | #' cleanup_latex_files() 145 | #' 146 | #' # Impute by median rather than knn. 147 | #' \dontrun{ 148 | #' vim <- varimpact(Y = Y, data = X[, 1:3], impute = "median") 149 | #' } 150 | #' 151 | #' #################################### 152 | #' # Multicore parallel example. 153 | #' \dontrun{ 154 | #' # Setup multicore parallelization. 155 | #' library(future) 156 | #' plan("multisession", workers = 2) 157 | #' 158 | #' vim <- varimpact(Y = Y, data = X[, 1:3]) 159 | #' } 160 | #' 161 | #' #################################### 162 | #' # Cluster parallel example. 163 | #' \dontrun{ 164 | #' cl = parallel::makeCluster(2L) 165 | #' plan(cluster, workers = cl) 166 | #' vim <- varimpact(Y = Y, data = X[, 1:3]) 167 | #' parallel::stopCluster(cl) 168 | #' } 169 | #' 170 | #' #################################### 171 | #' # mlbench BreastCancer example. 
172 | #' \dontrun{ 173 | #' data(BreastCancer, package="mlbench") 174 | #' data <- BreastCancer 175 | #' 176 | #' set.seed(1, "L'Ecuyer-CMRG") 177 | #' # Reduce to a dataset of 100 observations to speed up testing. 178 | # data = data[sample(nrow(data), 100), ] 179 | # 180 | #' # Create a numeric outcome variable. 181 | #' data$Y <- as.numeric(data$Class == "malignant") 182 | # 183 | #' # Use multicore parallelization to speed up processing. 184 | #' future::plan("multiprocess", workers = 2) 185 | #' vim <- varimpact(Y = data$Y, data = subset(data, select=-c(Y, Class, Id))) 186 | #' } 187 | #' 188 | #' @export 189 | varimpact = 190 | function(Y, 191 | data, 192 | A_names = colnames(data), 193 | V = 2L, 194 | #Q.library = c("SL.glmnet", "SL.mean"), 195 | #g.library = c("SL.glmnet", "SL.mean"), 196 | Q.library = c("SL.glm", "SL.mean"), 197 | g.library = c("SL.glm", "SL.mean"), 198 | #g.library = c("SL.stepAIC"), 199 | family = "binomial", 200 | minYs = 15L, 201 | minCell = 0L, 202 | adjust_cutoff = 10L, 203 | corthres = 0.8, 204 | impute = "median", 205 | miss.cut = 0.5, 206 | bins_numeric = 10L, 207 | quantile_probs_factor = c(0.1, 0.9), 208 | quantile_probs_numeric = quantile_probs_factor, 209 | verbose = FALSE, 210 | verbose_tmle = FALSE, 211 | verbose_reduction = FALSE, 212 | parallel = TRUE, 213 | digits = 4L) { 214 | 215 | # Time the full function execution. 216 | time_start = proc.time() 217 | 218 | ###################### 219 | # Argument checks. 220 | 221 | # Confirm that data has at least two columns. 222 | if (ncol(data) < 2L) { 223 | stop("Data argument must have at least two columns.") 224 | } 225 | 226 | # Ensure that Y is numeric; e.g. can't be a factor. 227 | stopifnot(class(Y) %in% c("numeric", "integer")) 228 | 229 | if (family == "binomial" && 230 | (min(Y, na.rm = TRUE) < 0 || max(Y, na.rm = TRUE) > 1)) { 231 | stop("With binomial family Y must be bounded by [0, 1]. Specify family=\"gaussian\" otherwise.") 232 | } 233 | 234 | if (!family %in% c("binomial", "gaussian")) { 235 | stop('Family must be either "binomial" or "gaussian".') 236 | } 237 | 238 | if (parallel && verbose) { 239 | cat("Future backend set to the following:\n") 240 | print(future::plan()) 241 | } 242 | 243 | # Save bounds on the full Y variables for later transformation if Y is not binary. 244 | if (family == "binomial" || length(unique(Y)) == 2) { 245 | #Qbounds = NULL 246 | Qbounds = c(0, 1) 247 | } else { 248 | # This part is duplicated from the TMLE code in tmle_init_stage1. 249 | 250 | # Define Qbounds just for continuous (non-binary) outcomes. 251 | Qbounds = range(Y, na.rm = TRUE) 252 | # Extend bounds 10% beyond the observed range. 253 | # NOTE: if one of the bounds is zero then it won't be extended. 254 | Qbounds = Qbounds + 0.1 * c(-abs(Qbounds[1]), abs(Qbounds[2])) 255 | } 256 | 257 | ######## 258 | # Applied to Explanatory (X) data frame 259 | sna = sapply(data, sum_na) 260 | 261 | n = nrow(data) 262 | 263 | ####### 264 | # Missing proportion by variable. 265 | mis.prop = sna / n 266 | 267 | ####### 268 | # Cut-off for eliminating variable for proportion of obs missing. 269 | data = data[, mis.prop < miss.cut, drop = FALSE] 270 | 271 | # TODO: move this stuff into a separate function. 272 | if (verbose) cat("Removed", sum(mis.prop >= miss.cut), "variables due to high", 273 | "missing value proportion.\n") 274 | 275 | # Separate dataframe into factors-only and numerics-only. 276 | # Also converts characters to factors automatically. 
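# separate_factors_numerics() (defined earlier in this package) returns a list with
# $df_factors and $df_numerics, which feed the factor and numeric pre-processing steps below.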
277 | separated_data = separate_factors_numerics(data) 278 | 279 | factors = process_factors(separated_data$df_factors, 280 | quantile_probs_factor = quantile_probs_factor, 281 | miss.cut = miss.cut, 282 | verbose = verbose) 283 | 284 | # Pre-process numeric/continuous variables. 285 | numerics = 286 | process_numerics(separated_data$df_numerics, 287 | quantile_probs_numeric = quantile_probs_numeric, 288 | miss.cut = miss.cut, 289 | bins_numeric = bins_numeric, 290 | impute = impute, 291 | verbose = verbose) 292 | 293 | cat("Finished pre-processing variables.\n") 294 | 295 | cat("\nProcessing results:\n") 296 | cat("- Factor variables:", factors$num_factors, "\n") 297 | cat("- Numeric variables:", numerics$num_numeric, "\n\n") 298 | 299 | # Create cross-validation folds (2 by default). 300 | folds = create_cv_folds(V, Y, verbose = verbose) 301 | 302 | # VIM for factors. 303 | factor_vims = 304 | vim_factors(Y = Y, numerics = numerics, factors = factors, 305 | V = V, folds = folds, 306 | A_names = A_names, 307 | family = family, 308 | minCell = minCell, 309 | minYs = minYs, 310 | Q.library = Q.library, 311 | g.library = g.library, 312 | Qbounds = Qbounds, 313 | corthres = corthres, 314 | adjust_cutoff = adjust_cutoff, 315 | verbose = verbose, 316 | verbose_tmle = verbose_tmle, 317 | verbose_reduction = verbose_reduction) 318 | 319 | # Repeat for numerics. 320 | numeric_vims = 321 | vim_numerics(Y = Y, numerics = numerics, factors = factors, 322 | V = V, folds = folds, 323 | A_names = A_names, 324 | family = family, 325 | minCell = minCell, 326 | minYs = minYs, 327 | Q.library = Q.library, 328 | g.library = g.library, 329 | Qbounds = Qbounds, 330 | corthres = corthres, 331 | adjust_cutoff = adjust_cutoff, 332 | verbose = verbose, 333 | verbose_tmle = verbose_tmle, 334 | verbose_reduction = verbose_reduction) 335 | 336 | # Combine the separate continuous and factor results. 337 | results = 338 | compile_results(numeric_vims$colnames_numeric, 339 | factor_vims$colnames_factor, 340 | numeric_vims$vim_numeric, 341 | factor_vims$vim_factor, 342 | V = V, 343 | verbose = verbose) 344 | 345 | # End timing the full execution. 346 | time_end = proc.time() 347 | 348 | # Final compilation of results. 349 | results = c(results, 350 | # Append additional settings to the results object. 351 | # TODO: make this a sublist? 352 | list(V = V, 353 | g.library = g.library, 354 | Q.library = Q.library, 355 | minCell = minCell, 356 | minYs = minYs, 357 | family = family, 358 | datafac.dumW = factors$datafac.dumW, 359 | miss.fac = factors$miss.fac, 360 | data.numW = numerics$data.numW, 361 | numeric_vims = numeric_vims, 362 | factor_vims = factor_vims, 363 | impute_info = numerics$impute_info, 364 | time = time_end - time_start, 365 | cv_folds = folds)) 366 | 367 | # Set a custom class so that we can override print and summary. 368 | class(results) = "varimpact" 369 | 370 | invisible(results) 371 | } 372 | -------------------------------------------------------------------------------- /R/vim-factors.R: -------------------------------------------------------------------------------- 1 | vim_factors = 2 | function(Y, 3 | numerics, 4 | factors, 5 | V, 6 | folds, 7 | A_names, 8 | family, 9 | minCell, 10 | minYs, 11 | Q.library, 12 | g.library, 13 | Qbounds, 14 | corthres, 15 | adjust_cutoff, 16 | verbose = FALSE, 17 | verbose_tmle = FALSE, 18 | verbose_reduction = FALSE) { 19 | 20 | # NOTE: we use && so that conditional will short-circuit if num_factors == 0. 
21 | if (factors$num_factors > 0L && ncol(factors$data.fac) > 0L) { 22 | cat("Estimating variable importance for", factors$num_factors, "factors.\n") 23 | 24 | # Find the level of covariate that has lowest risk 25 | datafac.dumW = factors$datafac.dum 26 | # NOTE: can't we skip this line because we already imputed missing data to 0? 27 | datafac.dumW[is.na(factors$datafac.dum)] = 0 28 | 29 | ############################# 30 | # Below is to get indexing vectors so that any basis functions related to current A 31 | # that are in covariate matrix can be removed. 32 | names.fac = colnames(factors$data.fac) 33 | nmes.facW = colnames(datafac.dumW) 34 | nmes.mfacW = colnames(factors$miss.fac) 35 | nchar.facW = nchar(nmes.facW) + 1 36 | nchar.mfacW = nchar(nmes.mfacW) + 1 37 | 38 | XXm = regexpr("XX", nmes.facW) 39 | XXm[XXm < 0] = nchar.facW[XXm < 0] 40 | 41 | XXm2 = regexpr("XX", nmes.mfacW) 42 | XXm2[XXm2 < 0] = nchar.mfacW[XXm2 < 0] 43 | 44 | vars.facW = substr(nmes.facW, 1, XXm - 1) 45 | vars.mfacW = substr(nmes.mfacW, 7, XXm2 - 1) 46 | 47 | xc = ncol(factors$data.fac) 48 | n.fac = nrow(factors$data.fac) 49 | 50 | # vim_factor = lapply(1:xc, function(i) { 51 | 52 | # vim_factor will be a list of results, one element per factor variable. 53 | # Define var_i just to avoid automated NOTEs, will be overwritten by foreach. 54 | var_i = NULL 55 | #vim_factor = foreach::foreach(var_i = 1:xc, .verbose = verbose, .errorhandling = "stop") %do_op% { 56 | vim_factor = future.apply::future_lapply(1:xc, future.seed = TRUE, function(var_i) { 57 | #vim_factor = lapply(1:xc, function(var_i) { 58 | nameA = names.fac[var_i] 59 | 60 | if (verbose) cat("Var:", nameA, var_i, "out of", xc, "factor variables\n") 61 | 62 | if (!nameA %in% A_names) { 63 | if (verbose) cat("Skipping", nameA, "as it is not in A_names.\n") 64 | return(NULL) 65 | } 66 | 67 | # Loop over each fold. 68 | # TODO: incorporate this loop into parallelization. 69 | # for (fold_k in 1:V) { 70 | 71 | # This is looping sequentially for now. 72 | #fold_results = foreach::foreach(fold_k = 1:V) foreach::`%do%` { 73 | fold_results = lapply(1:V, function(fold_k) { 74 | if (verbose) cat("i =", var_i, "V =", fold_k, "\n") 75 | 76 | # All data not in this fold is the training data. 77 | At = factors$data.fac[folds != fold_k, var_i] 78 | 79 | # All data in this fold is the validation data. 80 | Av = factors$data.fac[folds == fold_k, var_i] 81 | 82 | Yt = Y[folds != fold_k] 83 | Yv = Y[folds == fold_k] 84 | 85 | 86 | ####################################### 87 | # Create adjustment dataframe. 88 | 89 | ### acit.numW is just same as acit.cont.dist except with NA's replaced by 90 | ### 0's. 91 | mtch1 = match(vars.facW, nameA) 92 | mtch2 = match(vars.mfacW, nameA) 93 | Adum = data.frame(factors$datafac.dum[, is.na(mtch1) == FALSE]) 94 | dumW = factors$datafac.dum[, is.na(mtch1)] 95 | missdumW = factors$miss.fac[, is.na(mtch2)] 96 | 97 | if (is.null(missdumW)) { 98 | missdumW = rep(NA, n.fac) 99 | } 100 | if (is.null(numerics$miss.cont)) { 101 | numerics$miss.cont = rep(NA, n.fac) 102 | } 103 | if (is.null(dumW)) { 104 | dumW = rep(NA, n.fac) 105 | } 106 | if (is.null(numerics$data.numW)) { 107 | numerics$data.numW = rep(NA, n.fac) 108 | } 109 | 110 | W = data.frame(numerics$data.numW, numerics$miss.cont, dumW, missdumW) 111 | 112 | # Restrict to columns in which there is less than 100% missingness. 113 | W = W[, !apply(is.na(W), 2, all), drop = FALSE] 114 | 115 | ####################################### 116 | 117 | # Divide into training and validation subsets. 
118 | Wt = W[folds != fold_k, , drop = FALSE] 119 | Wv = W[folds == fold_k, , drop = FALSE] 120 | 121 | Adum = data.frame(Adum[folds != fold_k, ]) 122 | 123 | ### 124 | # Pull out any variables that are overly correlated with At (corr coef < corthes) 125 | #if (sd(Adum) == 0) { 126 | # if (verbose) cat("Warning: sd of Adum = 0.\n") 127 | #} 128 | 129 | # Suppress possible "the standard deviation is zero" warning from cor(). 130 | # TODO: investigate more and confirm that this is ok. 131 | suppressWarnings({ 132 | corAt = apply(stats::cor(Adum, Wt, use = "complete.obs"), 2, max_sqr) 133 | }) 134 | corAt[corAt < -1] = 0 135 | # cat('i = ',i,' maxCor = ',max(corAt,na.rm=T),'\n') 136 | incc = abs(corAt) < corthres & !is.na(corAt) 137 | 138 | if (verbose && sum(!incc) > 0) { 139 | cat("Removed", sum(!incc), "columns based on correlation threshold", corthres, "\n") 140 | } 141 | 142 | Wv = Wv[, incc, drop = F] 143 | Wt = Wt[, incc, drop = F] 144 | 145 | if (verbose) { 146 | cat("Columns:", ncol(Wt)) 147 | if (!is.null(adjust_cutoff)) cat(" Reducing dimensions to", adjust_cutoff) 148 | cat("\n") 149 | } 150 | 151 | # Use HOPACH to reduce dimension of W to some level of tree 152 | reduced_results = reduce_dimensions(Wt, Wv, adjust_cutoff, verbose = verbose_reduction) 153 | 154 | Wtsht = reduced_results$data 155 | Wvsht = reduced_results$newX 156 | 157 | # We should have no constant columns after calling reduce_dimensions(). 158 | # Remove any NA values - but shouldn't these already be imputed? 159 | is_constant = sapply(Wtsht, function(col) var(col, na.rm = TRUE) == 0) 160 | # Restrict to just the TRUE variables - those that are constant. 161 | is_constant = is_constant[is_constant] 162 | 163 | if (verbose) { 164 | cat("Updated ncols -- training:", ncol(Wtsht), "test:", ncol(Wvsht), "\n") 165 | 166 | # We should have no constant columns after calling reduce_dimensions(). 167 | if (length(is_constant) > 0) { 168 | cat("Constant columns (", length(is_constant), "):\n") 169 | print(is_constant) 170 | } 171 | } 172 | 173 | # Finished with any needed clustering for variable reduction. 174 | 175 | deltat = as.numeric(!is.na(Yt) & !is.na(At)) 176 | deltav = as.numeric(!is.na(Yv) & !is.na(Av)) 177 | 178 | # TODO (CK): don't do this, in order to use the delta missingness estimation. 179 | # To avoid crashing TMLE function just drop obs missing A or Y if the 180 | # total number of missing is < 10 181 | if (sum(deltat == 0) < 10) { 182 | Yt = Yt[deltat == 1] 183 | At = At[deltat == 1] 184 | Wtsht = Wtsht[deltat == 1, , drop = FALSE] 185 | deltat = deltat[deltat == 1] 186 | } 187 | 188 | levA = levels(At) 189 | 190 | if (length(unique(Yt)) == 2) { 191 | # Binary outcome. 192 | 193 | # Minimum numer of observations for each cell in validation fold. 194 | minc = apply(table(Av, Yv), 1, min) 195 | 196 | # Minimum numer of observations for each cell in training fold. 197 | minc2 = apply(table(At, Yt), 1, min) 198 | 199 | # Restrict analysis to levels of this variable in which 200 | # there are sufficient observations in each fold 201 | # across each treatment and outcome combination. 202 | vals = levA[pmin(minc, minc2) > minCell] 203 | } else { 204 | # Continuous outcome. 205 | vals = levA 206 | } 207 | num.cat = length(vals) 208 | 209 | # CK TODO 6/6: don't assume that positive outcome is the rare 210 | # outcome. (e.g. via table) 211 | 212 | # Number of positive outcomes in training data. 213 | nYt = sum(Yt[!is.na(At)]) 214 | 215 | # Number of positive outcomes in validation data. 
216 | nYv = sum(Yv[!is.na(Av)]) 217 | 218 | # Create a list to hold the results we calculate in this fold. 219 | # Set them to default values and update as they are calculated. 220 | fold_result = list( 221 | failed = TRUE, 222 | # Message should report on the status for this fold. 223 | message = "", 224 | obs_training = length(Yt), 225 | obs_validation = length(Yv), 226 | error_count = 0, 227 | # Save the variables chosen by reduce_dimensions(). 228 | variables = reduced_results$variables, 229 | # Results for estimating the maximum level / treatment. 230 | level_max = list( 231 | # Level is which bin was chosen. 232 | level = NULL, 233 | # Label is the description of that bin. 234 | label = NULL, 235 | # val_preds contains the g, Q, and H predictions on the validation data. 236 | val_preds = NULL, 237 | # Estimate of EY on the training data. 238 | estimate_training = NULL, 239 | # Risk from SuperLearner on Q. 240 | risk_Q = NULL, 241 | # Risk from SuperLearner on g. 242 | risk_g = NULL 243 | ) 244 | ) 245 | # Copy the blank result to a second element for the minimum level/bin. 246 | fold_result$level_min = fold_result$level_max 247 | 248 | ############################ 249 | # Don't do if 1) no more than one category of A left or 250 | # 2) if missingness pattern for A is such that there are few death events left 251 | # in either (< minYs) 252 | # Applies only to binary outcomes, not continuous. 253 | if ((length(unique(Yt)) == 2L && 254 | (num.cat < 2L || min(nYt, nYv) < minYs)) || 255 | (length(is_constant) > 0 && mean(is_constant) == 1)) { 256 | if (length(is_constant) > 0 && mean(is_constant) == 1) { 257 | error_msg = paste("Skipping", nameA, "because HOPACH reduced W to", 258 | "all constant columns.") 259 | } else if (num.cat < 2L) { 260 | error_msg = paste("Skipping", nameA, "due to lack of variation.") 261 | } else { 262 | error_msg = paste("Skipping", nameA, "due to minY constraint.", min(nYt, nYv), "<", minYs) 263 | } 264 | if (verbose) cat(error_msg, "\n") 265 | 266 | fold_result$message = error_msg 267 | # At this point here we are skipping to the end of the loop. 268 | 269 | } else { 270 | if (verbose) cat("Estimating TMLE on training", paste0("(", num.cat, ")")) 271 | 272 | error_count = 0 273 | 274 | # TODO: stop using training_estimates and switch to using bin_results. 275 | training_estimates = list() 276 | bin_results = list() 277 | 278 | # Estimate Y_a, Q hat and g hat for each level of our current variable, 279 | # on the training data. 280 | for (bin_j in 1:num.cat) { 281 | 282 | # Create a list to hold the results for this level. 283 | bin_result = list( 284 | name = nameA, 285 | cv_fold = fold_k, 286 | level = bin_j, 287 | #level_label = At_bin_labels[bin_j], 288 | level_label = vals[bin_j], 289 | 290 | # Training: Estimates 291 | # Our default value needs to be NA rather than NULL, 292 | # to allow rbinding into a dataframe later on. 293 | train_theta_tmle = NA, 294 | # TODO: implement these. 295 | #train_theta_iptw = NA, 296 | #train_theta_gcomp = NA, 297 | train_theta_unadj = NA, 298 | 299 | # Training: Misc 300 | train_cell_size = NA, 301 | train_msg = NA, 302 | 303 | # Test: Estimates 304 | test_theta_tmle = NA, 305 | # TODO: implement these. 
306 | #test_theta_iptw = NA,
307 | #test_theta_gcomp = NA,
308 | test_theta_unadj = NA,
309 |
310 | # Test: Misc
311 | test_cell_size = NA,
312 | test_msg = NA,
313 |
314 | # Test: Predicted values (g, Q_bar, h)
315 | test_predictions = NULL
316 | #test_pred_g = NULL,
317 | #test_pred_Q_bar = NULL,
318 | #test_pred_h = NULL
319 | )
320 |
321 |
322 | # Create a treatment indicator, where 1 = obs in this bin
323 | # and 0 = obs not in this bin.
324 | IA = as.numeric(At == vals[bin_j])
325 |
326 | # Save how many obs have this level/bin in this training fold.
327 | bin_result$train_cell_size = sum(IA)
328 |
329 | # Any observations missing At are assigned to 0.
330 | IA[is.na(IA)] = 0
331 |
332 | # Save unadjusted estimate: outcome mean among observations
333 | # at the desired treatment level, who are not missing their outcome value.
334 | bin_result$train_theta_unadj = mean(Yt[IA & deltat])
335 |
336 | # if(min(table(IA,Yt))>=)
337 |
338 | # CV-TMLE: we are using this for three reasons:
339 | # 1. Estimate Y_a on training data.
340 | # 2. Estimate Q on training data.
341 | # 3. Estimate g on training data.
342 | tmle_result = try(estimate_tmle2(Yt, IA, Wtsht, family, deltat,
343 | Q.lib = Q.library,
344 | Qbounds = Qbounds,
345 | g.lib = g.library, verbose = verbose_tmle),
346 | silent = !verbose)
347 |
348 | if (inherits(tmle_result, "try-error")) {
349 | # TMLE estimation failed.
350 | if (verbose) cat("X")
351 | error_count = error_count + 1
352 |
353 | # Initialize to NULL so validation code doesn't get subscript error
354 | training_estimates[[bin_j]] = NULL
355 | } else {
356 | # TMLE estimation succeeded.
357 |
358 | # Save label
359 | tmle_result$label = vals[bin_j]
360 |
361 | training_estimates[[bin_j]] = tmle_result
362 |
363 | # TODO: may also want to save the TMLE object to this bin_result list.
364 | bin_result$train_theta_tmle = tmle_result$theta
365 |
366 | if (verbose) {
367 | cat(".")
368 | }
369 | }
370 |
371 | #######################################################
372 | # NEW: also run code on corresponding validation fold.
373 |
374 | # TODO: remove the later validation code and use these results instead.
375 |
376 | # Indicator for having the desired treatment bin on validation
377 | IA = as.numeric(Av == vals[bin_j])
378 |
379 | # Missing values are not taken to be in this level.
380 | IA[is.na(IA)] = 0
381 |
382 | # Save how many obs have this level/bin in this validation fold.
383 | bin_result$test_cell_size = sum(IA)
384 |
385 | ##################
386 | # Run estimates on validation data (TMLE, IPTW, G-Comp, Unadj)
387 | # TODO: move into its own function.
388 |
389 | # Save unadjusted estimate: outcome mean among observations
390 | # at the desired treatment level, who are not missing their outcome value.
391 | bin_result$test_theta_unadj = mean(Yv[IA & deltav])
392 |
393 | # CV-TMLE: predict g, Q, and clever covariate on validation data.
394 | if (!is.null(training_estimates[[bin_j]])) {
395 | preds = try(apply_tmle_to_validation(Yv, IA, Wvsht, family,
396 | deltav, training_estimates[[bin_j]],
397 | verbose = verbose))
398 | if (inherits(preds, "try-error")) {
399 | bin_result$test_msg = paste("CV-TMLE prediction on validation failed")
400 | } else {
401 | # Save the result and mark this bin as successful.
402 | bin_result$test_predictions = preds
403 | bin_result$test_msg = "success"
404 | }
405 | }
406 |
407 |
408 | # Save to the main list.
409 | bin_results[[bin_j]] = bin_result
410 |
411 | }
412 | # Finished looping over each level of the assignment variable.
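# At this point training_estimates holds one TMLE fit per usable level of A
# (fit on the training folds), and bin_results holds the per-level training
# and validation summaries, including the validation-fold predictions of g,
# Q-bar, and the clever covariate used by the pooled CV-TMLE step later on.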
413 | if (verbose) cat(" done. Errors:", error_count, "\n") 414 | 415 | # Save individual bin results. 416 | fold_result$bin_results = bin_results 417 | 418 | # Create a dataframe version of the bin results. 419 | fold_result$bin_df = 420 | do.call(rbind, lapply(bin_results, function(result) { 421 | # Exclude certain elements from the list - here the prediction vectors. 422 | # These should be saved separately. 423 | data.frame(result[!names(result) %in% c("test_predictions")], 424 | stringsAsFactors = FALSE) 425 | })) 426 | 427 | # Save test_predictions for each bin into a combined dataframe. 428 | fold_result$test_predictions = 429 | do.call(rbind, lapply(1:length(bin_results), function(bin) { 430 | result = bin_results[[bin]] 431 | tryCatch({ 432 | data.frame(bin = bin, 433 | bin_label = vals[bin], 434 | fold = fold_k, 435 | result$test_predictions, 436 | stringsAsFactors = FALSE) 437 | }, error = function(error) { 438 | NULL 439 | }) 440 | }) 441 | ) 442 | 443 | ##################################### 444 | # Resume normal varimpact algorithm. 445 | 446 | fold_result$error_count = error_count 447 | 448 | # Extract theta estimates. 449 | theta_estimates = sapply(training_estimates, function(result) { 450 | # Handle errors in the tmle estimation by returning NA. 451 | if (is.null(result)) { 452 | NA 453 | } else { 454 | ifelse("theta" %in% names(result), result$theta, NA) 455 | } 456 | }) 457 | 458 | if (!all(is.na(theta_estimates))) { 459 | # Identify maximum EY1 (theta) 460 | # Note: this may be NA if the tmle estimation failed. 461 | maxj = which.max(theta_estimates) 462 | 463 | # Identify minimum EY1 (theta) 464 | # Note: this may be NA if the tmle estimation failed. 465 | minj = which.min(theta_estimates) 466 | if (verbose) { 467 | cat("Max level:", vals[maxj], paste0("(", maxj, ")"), 468 | "Min level:", vals[minj], paste0("(", minj, ")"), "\n") 469 | } 470 | } else { 471 | maxj = NA 472 | minj = NA 473 | } 474 | 475 | # This fold failed if we got an error for each category 476 | # Or if the minimum and maximum bin is the same. 477 | # Or if the min/max training estimates are NULL. 478 | if (error_count == num.cat || 479 | (is.na(minj) && is.na(maxj)) || 480 | minj == maxj || 481 | is.null(training_estimates[[minj]]) || is.null(training_estimates[[maxj]])) { 482 | message = paste("Fold", fold_k, "failed,") 483 | if (length(theta_estimates) == 0 || error_count == num.cat) { 484 | message = paste(message, "all", num.cat, "levels had errors.") 485 | } else if (minj == maxj) { 486 | message = paste(message, "min and max level are the same. (j = ", minj, ")") 487 | } else { 488 | message = paste(message, "min or max training estimate is NULL.") 489 | } 490 | fold_result$message = message 491 | 492 | if (verbose) { 493 | cat(message, "\n") 494 | } 495 | } else { 496 | 497 | # Extract max items. 498 | maxEY1 = training_estimates[[maxj]]$theta 499 | labmax = vals[maxj] 500 | 501 | # Save these items into the fold_result list. 502 | fold_result$level_max$level = maxj 503 | fold_result$level_max$estimate_training = maxEY1 504 | fold_result$level_max$label = labmax 505 | # Save the Q risk for the discrete SuperLearner. 506 | # We don't have the CV.SL results for the full SuperLearner as it's too 507 | # computationallity intensive. 508 | fold_result$level_max$risk_Q = 509 | training_estimates[[maxj]]$q_model$cvRisk[ 510 | which.min(training_estimates[[maxj]]$q_model$cvRisk)] 511 | # And the g's discrete SL risk. 
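# (cvRisk from a SuperLearner fit is the cross-validated risk of each
# candidate learner, so its minimum is the risk of the discrete SuperLearner
# selector - the quantity stored as risk_Q and risk_g here.)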
512 | fold_result$level_max$risk_g =
513 | training_estimates[[maxj]]$g_model$cvRisk[
514 | which.min(training_estimates[[maxj]]$g_model$cvRisk)]
515 |
516 | # Extract TMLE results.
517 | #fold_result$level_max
518 |
519 | #fold_result$level_max$tmle = training_estimates[[maxj]]
520 |
521 | # Extract min items.
522 | minEY1 = training_estimates[[minj]]$theta
523 | labmin = vals[minj]
524 |
525 | # Save these items into the fold_result list.
526 | fold_result$level_min$level = minj
527 | fold_result$level_min$estimate_training = minEY1
528 | fold_result$level_min$label = labmin
529 | #fold_result$level_min$tmle = training_estimates[[minj]]
530 |
531 | # Save the Q risk for the discrete SuperLearner.
532 | # We don't have the CV.SL results for the full SuperLearner as it's too
533 | # computationally intensive.
534 | fold_result$level_min$risk_Q =
535 | training_estimates[[minj]]$q_model$cvRisk[
536 | which.min(training_estimates[[minj]]$q_model$cvRisk)]
537 | # And the g's discrete SL risk.
538 | fold_result$level_min$risk_g =
539 | training_estimates[[minj]]$g_model$cvRisk[
540 | which.min(training_estimates[[minj]]$g_model$cvRisk)]
541 |
542 | # Turn to validation data.
543 |
544 | # Estimate minimum level (control).
545 |
546 | # Indicator for having the desired control bin on validation.
547 | IA = as.numeric(Av == vals[minj])
548 |
549 | # Missing values are not taken to be in this level.
550 | IA[is.na(IA)] = 0
551 |
552 | if (verbose) cat("\nMin level prediction - apply_tmle_to_validation()\n")
553 |
554 | # CV-TMLE: predict g, Q, and clever covariate on validation data.
555 | min_preds = try(apply_tmle_to_validation(Yv, IA, Wvsht, family,
556 | deltav, training_estimates[[minj]],
557 | verbose = verbose))
558 |
559 | # Old version:
560 | #res = try(estimate_tmle(Yv, IA, Wvsht, family, deltav,
561 | # Q.lib = Q.library,
562 | # g.lib = g.library, verbose = verbose),
563 | # silent = T)
564 |
565 | if (inherits(min_preds, "try-error")) {
566 | message = paste("CV-TMLE prediction on validation failed during",
567 | "low/control level.")
568 | fold_result$message = message
569 | if (verbose) cat(message, "\n")
570 | } else {
571 | # Save the result.
572 | fold_result$level_min$val_preds = min_preds
573 |
574 | # Switch to maximum level (treatment).
575 |
576 | # Indicator for having the desired treatment bin on validation
577 | IA = as.numeric(Av == vals[maxj])
578 |
579 | # Missing values are not taken to be in this level.
580 | IA[is.na(IA)] = 0
581 |
582 | if (verbose) cat("\nMax level prediction - apply_tmle_to_validation()\n")
583 |
584 | # CV-TMLE: predict g, Q, and clever covariate on validation data.
585 | max_preds = try(apply_tmle_to_validation(Yv, IA, Wvsht, family,
586 | deltav, training_estimates[[maxj]],
587 | verbose = verbose))
588 | # Old code:
589 | #res2 = try(estimate_tmle(Yv, IA, Wvsht, family, deltav,
590 | # Q.lib = Q.library,
591 | # g.lib = g.library, verbose = verbose),
592 | # silent = !verbose)
593 |
594 |
595 | if (inherits(max_preds, "try-error")) {
596 | message = paste("CV-TMLE prediction on validation failed",
597 | "during high/treatment level.")
598 | fold_result$message = message
599 | if (verbose) cat(message, "\n")
600 | } else {
601 | # Save the result.
602 | fold_result$level_max$val_preds = max_preds
603 | fold_result$message = "Success"
604 | # Fold succeeded.
605 | fold_result$failed = FALSE
606 | }
607 | }
608 | }
609 | }
610 | if (verbose) cat("Completed fold", fold_k, "\n\n")
611 |
612 | # Return results for this fold.
613 | fold_result 614 | } 615 | ) # End lapply if we're not using foreach. 616 | # Done looping over each fold. 617 | 618 | ######################## 619 | # Reconstruct CV-TMLE treatment-specific mean estimate for each validation fold. 620 | # Loop over bins and calculate bin-specific CV-TMLE estimate. 621 | if (verbose) cat("Estimating CV-TMLE treatment-specific means.\n") 622 | #for (bin in 1:numcat.cont[var_i]) { 623 | num_bins = length(levels(factors$data.fac[, var_i])) 624 | for (bin in 1:num_bins) { 625 | 626 | if (verbose) { 627 | cat("Bin", bin, "of", num_bins, "\n") 628 | } 629 | # Expects a list of results by fold. 630 | # Each element of that list should have the val_preds list, which 631 | # is calculated by apply_tmle_to_validation and currently saved in 632 | # fold_results[[*]]$test_predictions (separately by fold * level). 633 | compile_rows = lapply(fold_results, function(fold_r) { 634 | # Check if we even have test predictions for this fold. 635 | # We may not if the estimation failed on the training data. 636 | if (!"test_predictions" %in% names(fold_r)) { 637 | if (verbose) cat("(failed) ") 638 | return(NULL) 639 | } 640 | # Extract the rows specific to this bin/level. 641 | rows = fold_r$test_predictions[fold_r$test_predictions$bin == bin, , drop = FALSE] 642 | if (verbose) cat("Rows:", nrow(rows), " ") 643 | # If we have 0 rows for this bin in this fold, we need to debug. 644 | # if (nrow(rows) == 0) browser() 645 | rows 646 | }) 647 | bin_df = do.call(rbind, compile_rows) 648 | if (verbose) cat("\n") 649 | 650 | if (!inherits(bin_df, "data.frame") || nrow(bin_df) == 0L) { 651 | if (verbose) { 652 | cat("Skipping bin", bin, "- no rows are available.\n") 653 | } 654 | # We have no val_preds for this bin, so skip pooled result estimation. 655 | 656 | # Temporary simplification for debugging purposes. 657 | #pooled_bin = list(thetas = 1:V) 658 | pooled_bin = list(thetas = rep(NA, V)) 659 | } else { 660 | 661 | # Create a list with one element ($val_preds df) per fold. 662 | bin_list = lapply(1:V, function(fold_i) { 663 | # Return with an enclosing list. 664 | list(bin_df[bin_df$fold == fold_i, ]) 665 | }) 666 | 667 | # Rename the element to be $val_preds 668 | for (fold in 1:V) { 669 | names(bin_list[[fold]]) = c("val_preds") 670 | } 671 | 672 | # bin_df can be NULL if the variable is skipped due to errors, 673 | # e.g. lack of variation. 674 | if (!is.null(bin_df) && nrow(bin_df) > 0L) { 675 | pooled_bin = estimate_pooled_results(bin_list, verbose = verbose) 676 | # Now we have $thetas and $influence_curves 677 | 678 | # Save the vector of estimates into the appropriate spot. 679 | # $thetas has the treatment-specific means 680 | # $influence_curves can be used to calculate the SE's, but shouldn't those 681 | # already be calculated by estimate_pooled_results() 682 | 683 | # Loop over fold results and insert the thetas into appropriate df. 684 | for (fold in 1:length(bin_list)) { 685 | bin_df = fold_results[[fold]]$bin_df 686 | row = bin_df$level == bin & bin_df$cv_fold == fold 687 | fold_results[[fold]]$bin_df[row, "test_theta_tmle"] = pooled_bin$thetas[fold] 688 | fold_results[[fold]]$bin_df[row, "test_var_tmle"] = var(pooled_bin$influence_curves[[fold]]) 689 | } 690 | 691 | } else { 692 | if (verbose) { 693 | cat("Skipping bin", bin, "- no rows are available.\n") 694 | } 695 | # We have no val_preds for this bin, so skip pooled result estimation. 696 | 697 | # Temporary simplification for debugging purposes. 
698 | #pooled_bin = list(thetas = 1:V) 699 | pooled_bin = list(thetas = rep(NA, V)) 700 | } 701 | 702 | if (verbose) { 703 | cat("\n") 704 | } 705 | 706 | } 707 | } 708 | 709 | # Combine results for each fold into a single dataframe. 710 | # This can fail if all bins for this variable failed. 711 | tryCatch({ 712 | results_by_fold_and_level = do.call(rbind, lapply(fold_results, `[[`, "bin_df")) 713 | }, error = function(e) { 714 | results_by_fold_and_level = NULL 715 | }) 716 | 717 | # Aggregate into a results_by_level dataframe. 718 | # This can fail if all bins for this variable failed. 719 | tryCatch({ 720 | results_by_level = results_by_level(results_by_fold_and_level, 721 | verbose = verbose) 722 | }, error = function(e) { 723 | results_by_level = NULL 724 | }) 725 | 726 | # Create list to save results for this variable. 727 | var_results = list( 728 | EY1V = NULL, 729 | EY0V = NULL, 730 | thetaV = NULL, 731 | thetaV_rr = NULL, 732 | varICV = NULL, 733 | labV = NULL, 734 | nV = NULL, 735 | fold_results = fold_results, 736 | type = "factor", 737 | results_by_fold_and_level = results_by_fold_and_level, 738 | results_by_level = results_by_level, 739 | name = nameA 740 | ) 741 | 742 | # TODO: compile results into the new estimate. 743 | 744 | if (verbose) cat("Estimating pooled min.\n") 745 | pooled_min = estimate_pooled_results(lapply(fold_results, function(x) x$level_min), 746 | verbose = verbose) 747 | cat("\n") 748 | if (verbose) cat("Estimating pooled max.\n") 749 | pooled_max = estimate_pooled_results(lapply(fold_results, function(x) x$level_max), 750 | verbose = verbose) 751 | cat("\n") 752 | 753 | var_results$EY0V = pooled_min$thetas 754 | var_results$EY1V = pooled_max$thetas 755 | 756 | if (length(var_results$EY1V) == length(var_results$EY0V)) { 757 | var_results$thetaV = var_results$EY1V - var_results$EY0V 758 | 759 | # Calculate relative risk parameter (aka risk ratio). 760 | # This only makes sense if Y is binary. 761 | var_results$thetaV_rr = var_results$EY1V / var_results$EY0V 762 | 763 | } else { 764 | if (verbose) { 765 | cat("Error: EY1V and EY0V are different lengths. EY1V =", 766 | length(var_results$EY1V), "EY0V =", length(var_results$EY0V), "\n") 767 | } 768 | var_results$thetaV = rep(NA, max(length(var_results$EY1V), 769 | length(var_results$EY0V))) 770 | 771 | var_results$thetaV_rr = rep(NA, max(length(var_results$EY1V), 772 | length(var_results$EY0V))) 773 | } 774 | 775 | 776 | # Save how many observations were in each validation fold. 777 | var_results$nV = sapply(fold_results, function(x) x$obs_validation) 778 | 779 | # Combine labels into a two-column matrix. 780 | # First column is min and second is max. 781 | # TODO: not sure if data structure for this part is correct. 782 | labels = do.call(rbind, 783 | lapply(fold_results, function(x) c(x$level_min$label, x$level_max$label))) 784 | 785 | var_results$labV = labels 786 | 787 | # If either of the thetas is null it means that all CV-TMLE folds failed. 788 | if (!is.null(pooled_min$thetas)) { 789 | 790 | # Influence_curves here is a list, with an element for each fold. 791 | var_results$varICV = sapply(1:V, function(index) { 792 | if (length(pooled_max$influence_curves) >= index && 793 | length(pooled_min$influence_curves) >= index) { 794 | var(pooled_max$influence_curves[[index]] - pooled_min$influence_curves[[index]]) 795 | } else { 796 | NA 797 | } 798 | }) 799 | 800 | # Parameter: relative risk 801 | # TODO: only calculate if Y is binary. 
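# Delta method: for log(RR) = log(EY1) - log(EY0), the influence curve is
# IC_max / theta_max - IC_min / theta_min, so the per-fold variance of that
# difference (computed below) estimates the variance of log(RR).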
802 | var_results$varICV_log_rr = sapply(1:V, function(index) { 803 | if (length(pooled_max$influence_curves) >= index && 804 | length(pooled_min$influence_curves) >= index) { 805 | # Variance for the risk difference (maximal contrast parameter). 806 | # TODO: double-check this. 807 | var(pooled_max$influence_curves[[index]] / pooled_max$thetas[[index]] - 808 | pooled_min$influence_curves[[index]] / pooled_min$thetas[[index]]) 809 | } else { 810 | NA 811 | } 812 | }) 813 | 814 | if (verbose) { 815 | signif_digits = 4 816 | 817 | ey0_mean = mean(pooled_min$thetas) 818 | if (is.numeric(ey0_mean)) { 819 | cat("[Min] EY0:", signif(ey0_mean, signif_digits)) 820 | if (is.numeric(pooled_min$epsilon)) { 821 | cat(" Epsilon:", signif(pooled_min$epsilon, signif_digits)) 822 | } 823 | cat("\n") 824 | } 825 | 826 | ey1_mean = mean(pooled_max$thetas) 827 | if (is.numeric(ey1_mean)) { 828 | cat("[Max] EY1:", signif(ey1_mean, signif_digits)) 829 | if (is.numeric(pooled_max$epsilon)) { 830 | cat(" Epsilon:", signif(pooled_max$epsilon, signif_digits)) 831 | } 832 | cat("\n") 833 | } 834 | 835 | cat("ATEs:", signif(var_results$thetaV, signif_digits), "\n") 836 | cat("Variances:", signif(var_results$varICV, signif_digits), "\n") 837 | cat("Labels:\n") 838 | print(labels) 839 | cat("\n") 840 | cat("RRs:", signif(var_results$thetaV_rr, signif_digits), "\n") 841 | cat("Variances:", signif(var_results$varICV_log_rr, signif_digits), "\n") 842 | cat("\n") 843 | } 844 | } 845 | 846 | # Return results for this factor variable. 847 | var_results 848 | #} # End foreach loop over all variables. 849 | }) # End lapply or future_lapply if we're not using foreach. 850 | 851 | if (verbose) cat("Factor VIMs:", length(vim_factor), "\n\n") 852 | 853 | # Confirm that we have the correct number of results, otherwise fail out. 854 | stopifnot(length(vim_factor) == xc) 855 | 856 | 857 | # Dataframe to hold all of the variable-by-fold-by-level results. 858 | results_by_fold_and_level_obj = do.call(rbind, lapply(vim_factor, function(result) { 859 | # Only extract results_by_fold_and_level if it's not NULL 860 | if ("results_by_fold_and_level" %in% names(result) && 861 | !is.null(result$results_by_fold_and_level)# && 862 | #!is.na(result$results_by_fold_and_level) 863 | ) { 864 | result$results_by_fold_and_level 865 | } else { 866 | NULL 867 | } 868 | })) 869 | #"results_by_fold_and_level")) 870 | 871 | #results_by_level = do.call(rbind, lapply(vim_factor, `[[`, "results_by_level")) 872 | compile_results_by_level = lapply(vim_factor, function(result) { 873 | # Only extract results_by_fold_and_level if it's not NULL 874 | if ("results_by_level" %in% names(result) && 875 | !is.null(result$results_by_level) && 876 | # TODO: figure out why expressions are going into this element. 877 | !is.expression(result$results_by_level) 878 | # !is.na(result$results_by_level) 879 | ) { 880 | result$results_by_level 881 | } else { 882 | NULL 883 | } 884 | }) 885 | 886 | results_by_level_obj = NULL 887 | tryCatch({ 888 | results_by_level_obj = do.call(rbind, compile_results_by_level) 889 | }, error = function(e) { 890 | 891 | # TODO: add browser? 892 | # TODO: figure out why this happens - presumably due to covariate that failed. 
893 | # Error message: 894 | # Error in rep(xi, length.out = nvar) : 895 | # attempt to replicate an object of type 'closure' 896 | cat("Errored while compiling results by level.\n") 897 | }) 898 | 899 | colnames_factor = colnames(factors$data.fac) 900 | } else { 901 | colnames_factor = NULL 902 | vim_factor = NULL 903 | results_by_fold_and_level_obj = NULL 904 | results_by_level_obj = NULL 905 | cat("No factor variables - skip VIM estimation.\n\n") 906 | } 907 | 908 | # Compile and return results. 909 | (results = list( 910 | vim_factor = vim_factor, 911 | results_by_fold_and_level = results_by_fold_and_level_obj, 912 | results_by_level = results_by_level_obj, 913 | colnames_factor = colnames_factor 914 | )) 915 | } 916 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # varimpact - variable importance through causal inference 5 | 6 | [![Build 7 | Status](https://travis-ci.org/ck37/varimpact.svg?branch=master)](https://travis-ci.org/ck37/varimpact) 8 | [![AppVeyor Build 9 | Status](https://ci.appveyor.com/api/projects/status/github/ck37/varimpact?branch=master&svg=true)](https://ci.appveyor.com/project/ck37/varimpact) 10 | [![codecov](https://codecov.io/gh/ck37/varimpact/branch/master/graph/badge.svg)](https://codecov.io/gh/ck37/varimpact) 11 | 12 | ## Summary 13 | 14 | varimpact uses causal inference statistics to generate variable 15 | importance estimates for a given dataset and outcome. It answers the 16 | question: which of my Xs are most related to my Y? Each variable’s 17 | influence on the outcome is estimated semiparametrically, without 18 | assuming a linear relationship or other functional form, and the 19 | covariate list is ranked by order of importance. This can be used for 20 | exploratory data analysis, for dimensionality reduction, for 21 | experimental design (e.g. to determine blocking and re-randomization), 22 | to reduce variance in an estimation procedure, etc. See Hubbard, 23 | Kennedy, & van der Laan (2018) for more details, or Hubbard & van der 24 | Laan (2016) for an earlier description. 25 | 26 | ## Details 27 | 28 | Each covariate is analyzed using targeted minimum loss-based estimation 29 | ([TMLE](https://CRAN.R-project.org/package=tmle)) as though it were a 30 | treatment, with all other variables serving as adjustment variables via 31 | [SuperLearner](https://github.com/ecpolley/SuperLearner). Then the 32 | statistical significance of the estimated treatment effect for each 33 | covariate determines the variable importance ranking. This formulation 34 | allows the asymptotics of TMLE to provide valid standard errors and 35 | p-values, unlike other variable importance algorithms. 36 | 37 | The results provide raw p-values as well as p-values adjusted for false 38 | discovery rate using the Benjamini-Hochberg (1995) procedure. Adjustment 39 | variables are automatically clustered hierarchically using HOPACH (van 40 | der Laan & Pollard 2003) in order to reduce dimensionality. The package 41 | supports multi-core and multi-node parallelization, which are detected 42 | and used automatically when a parallel backend is registered. Missing 43 | values are automatically imputed using K-nearest neighbors (Troyanskaya 44 | et al. 2001, Jerez et al. 2010) and missingness indicator variables are 45 | incorporated into the analysis. 
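As a quick illustration of the multiple-testing step described above, the
sketch below applies the Benjamini-Hochberg correction to a vector of raw
p-values with `p.adjust()`. This is only meant to show what the reported
`Adj. p-value` column corresponds to: `varimpact()` performs the adjustment
internally, and the `vim$results_all` object and its `P-value` column are
assumed to look as in the example output further below.

``` r
# Illustration only: varimpact() already reports BH-adjusted p-values.
# Assumes `vim` is a fitted varimpact object, as in the examples below.
raw_p <- vim$results_all[["P-value"]]
p.adjust(raw_p, method = "BH")  # Benjamini & Hochberg (1995) FDR adjustment
```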
46 | 47 | varimpact is under active development so please submit any bug reports 48 | or feature requests to the [issue 49 | queue](https://github.com/ck37/varimpact/issues), or email Alan and/or 50 | Chris directly. 51 | 52 | ## Installation 53 | 54 | ### GitHub 55 | 56 | ``` r 57 | # Install remotes if necessary: 58 | # install.packages("remotes") 59 | remotes::install_github("ck37/varimpact") 60 | ``` 61 | 62 | ### CRAN 63 | 64 | Forthcoming fall 2022 65 | 66 | ## Examples 67 | 68 | ### Example: basic functionality 69 | 70 | ``` r 71 | library(varimpact) 72 | 73 | #################################### 74 | # Create test dataset. 75 | set.seed(1, "L'Ecuyer-CMRG") 76 | N <- 200 77 | num_normal <- 4 78 | X <- as.data.frame(matrix(rnorm(N * num_normal), N, num_normal)) 79 | Y <- rbinom(N, 1, plogis(.2*X[, 1] + .1*X[, 2] - .2*X[, 3] + .1*X[, 3]*X[, 4] - .2*abs(X[, 4]))) 80 | # Add some missing data to X so we can test imputation. 81 | for (i in 1:10) X[sample(nrow(X), 1), sample(ncol(X), 1)] <- NA 82 | 83 | #################################### 84 | # Basic example 85 | vim <- varimpact(Y = Y, data = X) 86 | #> Finished pre-processing variables. 87 | #> 88 | #> Processing results: 89 | #> - Factor variables: 0 90 | #> - Numeric variables: 4 91 | #> 92 | #> No factor variables - skip VIM estimation. 93 | #> 94 | #> Estimating variable importance for 4 numerics. 95 | 96 | # Review consistent and significant results. 97 | vim 98 | #> Significant and consistent results: 99 | #> Type Estimate CI95 P-value Adj. p-value Est. RR 100 | #> V3 Ordered 0.4986136 (0.255 - 0.742) 2.935984e-05 0.0001174394 2.908162 101 | #> CI95 RR P-value RR Adj. p-value RR 102 | #> V3 (1.92 - 4.4) 2.214955e-07 8.859819e-07 103 | 104 | # Look at all results. 105 | vim$results_all 106 | #> Type Estimate CI95 P-value Adj. p-value Est. RR 107 | #> V3 Ordered 0.49861358 (0.255 - 0.742) 2.935984e-05 0.0001174394 2.9081617 108 | #> V4 Ordered 0.21853793 (-0.167 - 0.604) 1.334006e-01 0.2668012809 1.4110231 109 | #> V2 Ordered 0.04733746 (-0.276 - 0.371) 3.872064e-01 0.5162752138 1.0698709 110 | #> V1 Ordered -0.10939221 (-0.494 - 0.275) 7.116162e-01 0.7116161584 0.8266168 111 | #> CI95 RR P-value RR Adj. p-value RR Consistent 112 | #> V3 (1.92 - 4.4) 2.214955e-07 8.859819e-07 TRUE 113 | #> V4 (0.833 - 2.39) 1.000755e-01 2.001509e-01 TRUE 114 | #> V2 (0.664 - 1.72) 3.905406e-01 5.207208e-01 TRUE 115 | #> V1 (0.441 - 1.55) 7.238555e-01 7.238555e-01 TRUE 116 | 117 | # Plot the V2 impact. 118 | plot_var("V2", vim) 119 | ``` 120 | 121 | ![](images/README-example_1-1.png) 122 | 123 | ``` r 124 | 125 | # Generate latex tables with results. 126 | exportLatex(vim) 127 | 128 | # Clean up LaTeX files 129 | cleanup_latex_files() 130 | ``` 131 | 132 | ### Example: customize outcome and propensity score estimation 133 | 134 | ``` r 135 | Q_lib = c("SL.mean", "SL.glmnet", "SL.ranger", "SL.rpartPrune") 136 | g_lib = c("SL.mean", "SL.glmnet") 137 | set.seed(1, "L'Ecuyer-CMRG") 138 | (vim = varimpact(Y = Y, data = X, Q.library = Q_lib, g.library = g_lib)) 139 | #> Finished pre-processing variables. 140 | #> 141 | #> Processing results: 142 | #> - Factor variables: 0 143 | #> - Numeric variables: 4 144 | #> 145 | #> No factor variables - skip VIM estimation. 146 | #> 147 | #> Estimating variable importance for 4 numerics. 148 | #> Significant and consistent results: 149 | #> Type Estimate CI95 P-value Adj. p-value Est. RR 150 | #> V3 Ordered 0.56401 (0.326 - 0.802) 1.749015e-06 6.996059e-06 3.644982 151 | #> CI95 RR P-value RR Adj. 
p-value RR 152 | #> V3 (2.34 - 5.69) 6.234554e-09 2.493822e-08 153 | ``` 154 | 155 | ### Example: parallel via multicore 156 | 157 | ``` r 158 | library(future) 159 | plan("multisession") 160 | vim = varimpact(Y = Y, data = X) 161 | #> Finished pre-processing variables. 162 | #> 163 | #> Processing results: 164 | #> - Factor variables: 0 165 | #> - Numeric variables: 4 166 | #> 167 | #> No factor variables - skip VIM estimation. 168 | #> 169 | #> Estimating variable importance for 4 numerics. 170 | ``` 171 | 172 | ### Example: mlbench breast cancer 173 | 174 | ``` r 175 | data(BreastCancer, package = "mlbench") 176 | data = BreastCancer 177 | 178 | # Create a numeric outcome variable. 179 | data$Y = as.integer(data$Class == "malignant") 180 | 181 | # Use multicore parallelization to speed up processing. 182 | plan("multisession") 183 | (vim = varimpact(Y = data$Y, data = subset(data, select = -c(Y, Class, Id)))) 184 | #> Finished pre-processing variables. 185 | #> 186 | #> Processing results: 187 | #> - Factor variables: 9 188 | #> - Numeric variables: 0 189 | #> 190 | #> Estimating variable importance for 9 factors. 191 | #> Significant and consistent results: 192 | #> Type Estimate CI95 P-value Adj. p-value 193 | #> Bare.nuclei Factor 0.6174459 (0.5 - 0.735) 0.000000e+00 0.000000e+00 194 | #> Mitoses Factor 0.4092028 (0.333 - 0.486) 0.000000e+00 0.000000e+00 195 | #> Cl.thickness Factor 0.5245860 (0.382 - 0.667) 3.027578e-13 9.082735e-13 196 | #> Cell.size Factor 0.5650275 (0.395 - 0.735) 3.313050e-11 5.963490e-11 197 | #> Est. RR CI95 RR P-value RR Adj. p-value RR 198 | #> Bare.nuclei 3.682218 (2.21 - 6.14) 0.000000e+00 0.000000e+00 199 | #> Mitoses 2.093929 (1.85 - 2.37) 3.023193e-11 1.360437e-10 200 | #> Cl.thickness 2.952087 (2.13 - 4.08) 2.956850e-07 8.870549e-07 201 | #> Cell.size 3.445132 (1.98 - 6) 4.465977e-06 8.038759e-06 202 | plot_var("Mitoses", vim) 203 | ``` 204 | 205 | ![](images/README-example_5-1.png) 206 | 207 | ## Authors 208 | 209 | Alan E. Hubbard and Chris J. Kennedy, University of California, Berkeley 210 | 211 | ## References 212 | 213 | Benjamini, Y., & Hochberg, Y. (1995). Controlling the false discovery 214 | rate: a practical and powerful approach to multiple testing. Journal of 215 | the royal statistical society. Series B (Methodological), 289-300. 216 | 217 | Gruber, S., & van der Laan, M. J. (2012). tmle: An R Package for 218 | Targeted Maximum Likelihood Estimation. Journal of Statistical Software, 219 | 51(i13). 220 | 221 | Hubbard, A. E., Kennedy, C. J., van der Laan, M. J. (2018). 222 | Data-adaptive target parameters. In M. van der Laan & S. Rose (2018) 223 | Targeted Learning in Data Science. Springer. 224 | 225 | Hubbard, A. E., Kherad-Pajouh, S., & van der Laan, M. J. (2016). 226 | Statistical Inference for Data Adaptive Target Parameters. The 227 | international journal of biostatistics, 12(1), 3-19. 228 | 229 | Hubbard, A., Munoz, I. D., Decker, A., Holcomb, J. B., Schreiber, M. A., 230 | Bulger, E. M., … & Rahbar, M. H. (2013). Time-Dependent Prediction and 231 | Evaluation of Variable Importance Using SuperLearning in High 232 | Dimensional Clinical Data. The journal of trauma and acute care surgery, 233 | 75(1 0 1), S53. 234 | 235 | Hubbard, A. E., & van der Laan, M. J. (2016). Mining with inference: 236 | data-adaptive target parameters (pp. 439-452). In P. Bühlmann et 237 | al. (Ed.), Handbook of Big Data. CRC Press, Taylor & Francis Group, LLC: 238 | Boca Raton, FL. 239 | 240 | Jerez, J. M., Molina, I., García-Laencina, P. 
J., Alba, E., Ribelles, 241 | N., Martín, M., & Franco, L. (2010). Missing data imputation using 242 | statistical and machine learning methods in a real breast cancer 243 | problem. Artificial intelligence in medicine, 50(2), 105-115. 244 | 245 | Rozenholc, Y., Mildenberger, T., & Gather, U. (2010). Combining regular 246 | and irregular histograms by penalized likelihood. Computational 247 | Statistics & Data Analysis, 54(12), 3313-3323. 248 | 249 | Troyanskaya, O., Cantor, M., Sherlock, G., Brown, P., Hastie, T., 250 | Tibshirani, R., Botstein, D., & Altman, R. B. (2001). Missing value 251 | estimation methods for DNA microarrays. Bioinformatics, 17(6), 520-525. 252 | 253 | van der Laan, M. J. (2006). Statistical inference for variable 254 | importance. The International Journal of Biostatistics, 2(1). 255 | 256 | van der Laan, M. J., & Pollard, K. S. (2003). A new algorithm for hybrid 257 | hierarchical clustering with visualization and the bootstrap. Journal of 258 | Statistical Planning and Inference, 117(2), 275-303. 259 | 260 | van der Laan, M. J., Polley, E. C., & Hubbard, A. E. (2007). Super 261 | learner. Statistical applications in genetics and molecular biology, 262 | 6(1). 263 | 264 | van der Laan, M. J., & Rose, S. (2011). Targeted learning: causal 265 | inference for observational and experimental data. Springer Science & 266 | Business Media. 267 | -------------------------------------------------------------------------------- /TODO.txt: -------------------------------------------------------------------------------- 1 | VarImpact TODOs 2 | 3 | Alan priorities: 4 | - Display true bin ranges for continuous variables in results tables. 5 | - Unadjusted difference in means estimate (t-test) to compare to TMLE estimate 6 | - Cell sizes in the bins/levels that it chooses (A = 0 and A = 1) 7 | - Calculate RR treatment effects (see tmle, ltmle, Ben Arnold's package). 8 | * OR doesn't need to be in there, byproduct of parametric regression. 9 | - Test CV-TMLE with non-binary outcomes. E.g. transform theta? 10 | 11 | Mark priorities: 12 | - Support for 10-fold CV-TMLE - IN PROGRESS 13 | 14 | Chris items: 15 | * PRIORITY: for continuous outcomes, need to transform effect size and CIs back to the original scale (multiply by range of the original bounds.) 16 | * PRIORITY: Track and report why variables are removed from analysis 17 | - minCell, minYs, error during estimation, missing values, restrict_by_quantiles(), etc. 18 | * Simulation study 19 | * Investigate penalized histogramming of numeric variables 20 | - Needs more tests and probably its own function. 21 | * Give a warning if data is a matrix rather than dataframe. Causes an error in separate_factors_numerics. 22 | * Methods 23 | - Rename exportLatex to export_latex; support but deprecate old method name. 24 | - Data reduction: switch from HOPACH to a more reliable clustering algorithm. 25 | - Or upgrade HOPACH to support more than 10 (or 15?) variables. 26 | * Arguments 27 | - Argument for p-value cutoff for consistent results. 28 | * Coding style 29 | - Create more subfunctions. 30 | * Output 31 | - Add "Variable" column name to $results_by_fold. 32 | - Results by fold should be divided into two tables: 33 | - by_fold_estimates - psi estimates for each fold 34 | - by_fold_levels - high and low levels for each fold 35 | - Show a_L and a_H in consistent results table - critical for interpretation. 
36 | * Unfortunately this requires per-fold results for numeric variables, due to 37 | the "directionality" definition of consistency. 38 | - Sort consistent results by p-value then estimate size? 39 | - Support summary() 40 | - Return SEs in results tables. 41 | - Fix exportTable() when vim_result is NULL due to lack of results. 42 | * Quality 43 | - Warn/error if there are character columns in the data, as a bonus check. 44 | - Figure out how to reduce the number of potential failures in various steps. 45 | - Automatically handle rows that are all NAs? 46 | - Automatically remove any data column that is 100% equal to the outcome (so it can stay in df). 47 | * Prediction Accuracy 48 | - Save risk estimates for g and Q during CV-TMLE. - IN PROGRESS 49 | - Save and report range of g to examine sparsity. 50 | * Testing 51 | - Need tests with single factors and numerics. Lots of lingering bugs. 52 | - Add other tests. 53 | * Visualization 54 | - Visualize the treatment-specific means for each level of the treatment, with CIs 55 | - Label the high and low level 56 | - Layer on the raw density of the variable 57 | - Plot of BH correction ala figure 10.6 in All of Statistics. 58 | * Performance 59 | - Try Rprofvis 60 | - Investigate Jeremy's method of not wasting SL folds during CV-TMLE. 61 | * Documentation 62 | - Improve varImpact() roxygen description. 63 | - Document the return object 64 | - Vignette 65 | * Clarify consistency definition 66 | * Examples 67 | * Benchmarking 68 | - Benchmark to random forest, lasso, OLS and SL-perturbation algorithms. 69 | * BUGS/ERRORS 70 | - HOPACH can fail twice in a row. Need to figure out why and fix (see factor test 1). 71 | - HOPACH (Minh example): Error in rowMeans(dmat[labels == ordlabels[j], labels == labels[medoid2]]) : 72 | * seem to occur in hopach::mssnextlevel() 73 | * possibly due to call to msssplitcluster() 74 | * may need a ", drop = F" when subsetting a matrix somewhere. 75 | * TODO: fork HOPACH and add a dimension check / debug stuff near there. 76 | 'x' must be an array of at least two dimensions 77 | - Error in predmat[which, seq(nlami)] = preds : replacement has length zero 78 | - Occurs when estimating TMLE on training samples. 79 | - Error in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : 80 | one multinomial or binomial class has 1 or 0 observations; not allowed 81 | -- Can happen with BreastCancer dataset, seems to depend on seed. 82 | -- Happens in parkinson's competition dataset. 83 | -- Seems to occur when a variable has value with a small number of obs. 84 | Then when estimating the treatment propensity score for a given fold 85 | there may be only a single value for all observations. 86 | - Error in y %*% rep(1, nc) : non-conformable arguments 87 | -- When estimating TMLE 88 | - "Missing value(s) in medoidsdata in collap()" 89 | * WARNINGS 90 | - Warning in collap(data, level, d, dmat, newmed) (conducting HOPACH, at least on factors): 91 | Not enough medoids to use newmed='medsil' in collap() - 92 | using newmed='nn' instead 93 | (CK: suppressing warnings currently to avoid this one.) 94 | 95 | From existing documentation: 96 | - Allow reporting of results that randomly do not have estimates for some of validation samples. 97 | 98 | Wishlist 99 | - Support clustering or variable reduction algorithms other than HOPACH. 100 | - Support more than 2 external (CV-TMLE) folds. 
101 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | # DO NOT CHANGE the "init" and "install" sections below 2 | 3 | # Download script file from GitHub 4 | init: 5 | ps: | 6 | $ErrorActionPreference = "Stop" 7 | Invoke-WebRequest http://raw.github.com/krlmlr/r-appveyor/master/scripts/appveyor-tool.ps1 -OutFile "..\appveyor-tool.ps1" 8 | Import-Module '..\appveyor-tool.ps1' 9 | install: 10 | ps: Bootstrap 11 | 12 | cache: 13 | - C:\RLibrary 14 | 15 | build_script: 16 | - git config --global user.name "travis" 17 | - git config --global user.email "travis@example.org" 18 | - travis-tool.sh install_bioc_deps 19 | - travis-tool.sh install_deps 20 | #- travis-tool.sh github_package jimhester/covr 21 | 22 | test_script: 23 | - travis-tool.sh run_tests 24 | 25 | on_failure: 26 | - 7z a failure.zip *.Rcheck\* 27 | - appveyor PushArtifact failure.zip 28 | 29 | #on_success: 30 | # - Rscript -e 'covr::codecov()' 31 | 32 | environment: 33 | global: 34 | CRAN: http://cran.rstudio.com 35 | WARNINGS_ARE_ERRORS: 1 36 | _R_CHECK_FORCE_SUGGESTS_: 0 37 | R_ARCH: x64 38 | USE_RTOOLS: true ## to be able to use Remotes (i.e. packages from non-CRAN sources) 39 | 40 | matrix: 41 | - R_VERSION: release 42 | 43 | artifacts: 44 | - path: '*.Rcheck\**\*.log' 45 | name: Logs 46 | 47 | - path: '*.Rcheck\**\*.out' 48 | name: Logs 49 | 50 | - path: '*.Rcheck\**\*.fail' 51 | name: Logs 52 | 53 | - path: '*.Rcheck\**\*.Rout' 54 | name: Logs 55 | 56 | - path: '\*_*.tar.gz' 57 | name: Bits 58 | 59 | - path: '\*_*.zip' 60 | name: Bits 61 | 62 | -------------------------------------------------------------------------------- /images/README-example_1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ck37/varimpact/b4b78751d944b59f01362f43c5838ffe9255e3b5/images/README-example_1-1.png -------------------------------------------------------------------------------- /images/README-example_5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ck37/varimpact/b4b78751d944b59f01362f43c5838ffe9255e3b5/images/README-example_5-1.png -------------------------------------------------------------------------------- /man/cleanup_latex_files.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cleanup-latex.R 3 | \name{cleanup_latex_files} 4 | \alias{cleanup_latex_files} 5 | \title{Clean up LaTeX files created by exportLatex} 6 | \usage{ 7 | cleanup_latex_files(dir = ".", outname = "", verbose = FALSE) 8 | } 9 | \arguments{ 10 | \item{dir}{Directory where LaTeX files are located (default: current directory)} 11 | 12 | \item{outname}{Prefix for the LaTeX files (default: empty string)} 13 | 14 | \item{verbose}{If TRUE, print messages about which files were removed} 15 | } 16 | \value{ 17 | Invisibly returns a logical vector indicating which files were successfully removed 18 | } 19 | \description{ 20 | This function removes LaTeX files that are typically created by the exportLatex() function. 21 | It's designed to be used after exportLatex() calls to clean up temporary files. 
22 | } 23 | \examples{ 24 | \dontrun{ 25 | # After calling exportLatex() 26 | exportLatex(vim) 27 | cleanup_latex_files() 28 | 29 | # With custom directory and prefix 30 | exportLatex(vim, outname = "myresults_", dir = "output/") 31 | cleanup_latex_files(dir = "output/", outname = "myresults_") 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /man/create_cv_folds.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/create_cv_folds.R 3 | \name{create_cv_folds} 4 | \alias{create_cv_folds} 5 | \title{Stratified CV to insure balance (by one grouping variable, Y)} 6 | \usage{ 7 | create_cv_folds(V, Y, verbose = F) 8 | } 9 | \arguments{ 10 | \item{V}{number of folds} 11 | 12 | \item{Y}{Outcome variable. If binary will be used for stratification.} 13 | 14 | \item{verbose}{If T will display extra output.} 15 | } 16 | \value{ 17 | Vector of fold assignments. 18 | } 19 | \description{ 20 | Stratified CV to insure balance (by one grouping variable, Y) 21 | } 22 | -------------------------------------------------------------------------------- /man/estimate_tmle.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/estimate_tmle.R 3 | \name{estimate_tmle} 4 | \alias{estimate_tmle} 5 | \title{Get TMLE estimate: E[Y | A = 1, W].} 6 | \usage{ 7 | estimate_tmle(Y, A, W, family, delta = NULL, Q.lib, g.lib, verbose = F) 8 | } 9 | \arguments{ 10 | \item{Y}{Outcome variable} 11 | 12 | \item{A}{Treatment indicator} 13 | 14 | \item{W}{Dataframe of adjustment covariates} 15 | 16 | \item{family}{Binomial or gaussian} 17 | 18 | \item{delta}{Indicator of missing outcome or treatment assignment. 1 - observed, 0 - missing.} 19 | 20 | \item{Q.lib}{SuperLearner library for estimating Q (potential outcome)} 21 | 22 | \item{g.lib}{SuperLearner library for estimating g (propensity score)} 23 | 24 | \item{verbose}{If true output extra information during execution.} 25 | } 26 | \description{ 27 | Get TMLE estimate: E[Y | A = 1, W]. 28 | } 29 | -------------------------------------------------------------------------------- /man/estimate_tmle2.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/estimate_tmle2.R 3 | \name{estimate_tmle2} 4 | \alias{estimate_tmle2} 5 | \title{Get TMLE estimate: E[Y | A = 1, W].} 6 | \usage{ 7 | estimate_tmle2( 8 | Y, 9 | A, 10 | W, 11 | family, 12 | delta = rep(1, length(Y)), 13 | Q.lib, 14 | g.lib, 15 | id = 1:length(Y), 16 | Qbounds = NULL, 17 | gbound = 0.025, 18 | alpha = 0.995, 19 | fluctuation = "logistic", 20 | V = 10, 21 | verbose = F 22 | ) 23 | } 24 | \arguments{ 25 | \item{Y}{Outcome variable} 26 | 27 | \item{A}{Treatment indicator} 28 | 29 | \item{W}{Dataframe of adjustment covariates} 30 | 31 | \item{family}{Outcome family - binomial or gaussian} 32 | 33 | \item{delta}{Indicator of missing outcome or treatment assignment. 
1 - observed, 0 - missing.} 34 | 35 | \item{Q.lib}{SuperLearner library for estimating Q (potential outcome)} 36 | 37 | \item{g.lib}{SuperLearner library for estimating g (propensity score)} 38 | 39 | \item{id}{Optional subject-level identifier.} 40 | 41 | \item{Qbounds}{Bounds on Q} 42 | 43 | \item{gbound}{Bounds on G} 44 | 45 | \item{alpha}{TBD, from TMLE package} 46 | 47 | \item{fluctuation}{Only logistic is currently supported.} 48 | 49 | \item{V}{Number of folds for SuperLearner} 50 | 51 | \item{verbose}{If true output extra information during execution.} 52 | } 53 | \description{ 54 | Get TMLE estimate: E[Y | A = 1, W]. 55 | } 56 | -------------------------------------------------------------------------------- /man/exportLatex.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/exportLatex.R 3 | \name{exportLatex} 4 | \alias{exportLatex} 5 | \title{Export varimpact results as Latex tables} 6 | \usage{ 7 | exportLatex(impact_results, outname = "", dir = ".", digits = 4, ...) 8 | } 9 | \arguments{ 10 | \item{impact_results}{Result object from previous varimpact() call.} 11 | 12 | \item{outname}{(Optional) String that is prepended to filenames.} 13 | 14 | \item{dir}{(Optional) Directory to save the results, defaults to current directory.} 15 | 16 | \item{digits}{Digits to round numbers, passed through to xtable.} 17 | 18 | \item{...}{Additional parameters passed to print.xtable().} 19 | } 20 | \description{ 21 | Outputs results from varimpact() into three Latex tables: consistent results, 22 | all results, and per-fold results. 23 | } 24 | \details{ 25 | Creates three Latex table files: 26 | \itemize{ 27 | \item varimpConsistent.tex - the ``consistent'' significant results, meaning those with consistent categories chosen as comparison groups among factors and consistent ordering for numeric variables. 28 | \item varimpAll.tex - the file with cross-validated average variable impacts ordered by statistical significance. 29 | \item varimpByV.tex - the comparison levels used within each validation sample. Either integer ordering of factors or short-hand for percentile cut-off (0-1 is the 10th percentile, 10+ is the 100th percentile) 30 | } 31 | } 32 | \seealso{ 33 | \code{\link[varimpact]{varimpact}} 34 | } 35 | -------------------------------------------------------------------------------- /man/factors_to_indicators.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/factors_to_indicators.R 3 | \name{factors_to_indicators} 4 | \alias{factors_to_indicators} 5 | \title{Convert a dataframe of factor into separate indicators.} 6 | \usage{ 7 | factors_to_indicators(factor_df, miss_name_prefix = "Imiss_", verbose = F) 8 | } 9 | \arguments{ 10 | \item{factor_df}{Dataframe consisting only of factor variables.} 11 | 12 | \item{miss_name_prefix}{Starting name for each missing indicator.} 13 | 14 | \item{verbose}{Set to T for detailed output.} 15 | } 16 | \value{ 17 | A list of results. 
18 | } 19 | \description{ 20 | More details 21 | } 22 | -------------------------------------------------------------------------------- /man/max_sqr.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/max_sqr.R 3 | \name{max_sqr} 4 | \alias{max_sqr} 5 | \title{Find the maximum of the squared values.} 6 | \usage{ 7 | max_sqr(x) 8 | } 9 | \arguments{ 10 | \item{x}{Numeric vector} 11 | } 12 | \value{ 13 | Maximum of the squared values, or -Inf if all elements are NA. 14 | } 15 | \description{ 16 | Find the maximum of the squared values. 17 | } 18 | -------------------------------------------------------------------------------- /man/plot_var.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot-var.R 3 | \name{plot_var} 4 | \alias{plot_var} 5 | \title{Plot the varimpact results for a given variable} 6 | \usage{ 7 | plot_var(var_name, vim, digits = 2L, verbose = FALSE) 8 | } 9 | \arguments{ 10 | \item{var_name}{String name of the variable} 11 | 12 | \item{vim}{Varimpact result object that contains the variable} 13 | 14 | \item{digits}{Number of digits for rounding purposes.} 15 | 16 | \item{verbose}{If true, display extra output.} 17 | } 18 | \description{ 19 | Displays the adjusted treatment-specific means and the impact estimate. 20 | } 21 | -------------------------------------------------------------------------------- /man/print.varimpact.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/print.varimpact.R 3 | \name{print.varimpact} 4 | \alias{print.varimpact} 5 | \title{Custom printing of the varimpact results.} 6 | \usage{ 7 | \method{print}{varimpact}(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{Results object from varimpact.} 11 | 12 | \item{...}{Further arguments passed to or from other methods.} 13 | } 14 | \description{ 15 | Shows the significant and consistent results by default. If there are no 16 | consistent results it shows all results. 17 | } 18 | -------------------------------------------------------------------------------- /man/quantiles_equivalent.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/quantiles_equivalent.R 3 | \name{quantiles_equivalent} 4 | \alias{quantiles_equivalent} 5 | \title{Checks if two quantiles have the same sample value for a given vector. 6 | This indicates that the vector has little variation.} 7 | \usage{ 8 | quantiles_equivalent(x, quantile_probs = c(0.1, 0.9)) 9 | } 10 | \arguments{ 11 | \item{x}{Data vector. If a factor it is converted to numeric using unclass().} 12 | 13 | \item{quantile_probs}{Vector with two probabilities that specify the quantiles.} 14 | } 15 | \value{ 16 | True if the two quantiles are equal, indicating a lack of variation in the sample data. 17 | } 18 | \description{ 19 | Checks if two quantiles have the same sample value for a given vector. 20 | This indicates that the vector has little variation. 
21 | } 22 | \seealso{ 23 | restrict_by_quantiles 24 | } 25 | -------------------------------------------------------------------------------- /man/reduce_dimensions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/reduce_dimensions.R 3 | \name{reduce_dimensions} 4 | \alias{reduce_dimensions} 5 | \title{Reduce variables in a dataframe to a target number of covariates.} 6 | \usage{ 7 | reduce_dimensions(data, newX = NULL, max_variables, verbose = FALSE) 8 | } 9 | \arguments{ 10 | \item{data}{Dataframe} 11 | 12 | \item{newX}{Optional second dataframe to receive the same reduction.} 13 | 14 | \item{max_variables}{Maximum we want to allow, after which dimension reduction 15 | will take place. Cannot be more than 15 due to HOPACH limitations. Set to NULL 16 | to disable any dimension reduction.} 17 | 18 | \item{verbose}{If true will output additional information during execution.} 19 | } 20 | \description{ 21 | Currently uses HOPACH hierarchical clustering but could be generalized. 22 | } 23 | -------------------------------------------------------------------------------- /man/restrict_by_quantiles.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/restrict_by_quantiles.R 3 | \name{restrict_by_quantiles} 4 | \alias{restrict_by_quantiles} 5 | \title{Remove columns from a dataframe if they do not have sufficient variation.} 6 | \usage{ 7 | restrict_by_quantiles(data, quantile_probs = c(0.1, 0.9), verbose = FALSE) 8 | } 9 | \arguments{ 10 | \item{data}{Dataframe or matrix} 11 | 12 | \item{quantile_probs}{The probabilities corresponding to the quantiles that will be compared.} 13 | 14 | \item{verbose}{If TRUE output additional details during execution.} 15 | } 16 | \value{ 17 | New dataframe with the restriction applied. 18 | } 19 | \description{ 20 | Remove columns from a dataframe if they do not have sufficient variation. 21 | } 22 | \seealso{ 23 | quantiles_equivalent 24 | } 25 | -------------------------------------------------------------------------------- /man/results_by_level.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/results-by-level.R 3 | \name{results_by_level} 4 | \alias{results_by_level} 5 | \title{Aggregate the results_by_fold_and_level df into a results_by_level df.} 6 | \usage{ 7 | results_by_level(results_by_fold_and_level, verbose = FALSE) 8 | } 9 | \arguments{ 10 | \item{results_by_fold_and_level}{Dataframe containing the VIM results for 11 | all levels of each variable across all CV folds.} 12 | 13 | \item{verbose}{If true, display extra output.} 14 | } 15 | \description{ 16 | Aggregate the results_by_fold_and_level df into a results_by_level df. 
17 | } 18 | -------------------------------------------------------------------------------- /man/separate_factors_numerics.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/separate_factors_numerics.R 3 | \name{separate_factors_numerics} 4 | \alias{separate_factors_numerics} 5 | \title{Split df into one of only factors and one of only numerics.} 6 | \usage{ 7 | separate_factors_numerics(data, strings_to_factors = TRUE, verbose = FALSE) 8 | } 9 | \arguments{ 10 | \item{data}{A dataframe or matrix.} 11 | 12 | \item{strings_to_factors}{Convert character strings to factors if TRUE, 13 | otherwise ignore character string variables.} 14 | 15 | \item{verbose}{If TRUE output additional details during execution.} 16 | } 17 | \value{ 18 | Results list. 19 | 20 | Character strings are converted to factors as well when strings_to_factors is TRUE. 21 | } 22 | \description{ 23 | Split df into one of only factors and one of only numerics. 24 | } 25 | -------------------------------------------------------------------------------- /man/sum_na.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sum_na.R 3 | \name{sum_na} 4 | \alias{sum_na} 5 | \title{Get missingness for each column} 6 | \usage{ 7 | sum_na(x) 8 | } 9 | \arguments{ 10 | \item{x}{Vector, matrix, or dataframe} 11 | } 12 | \description{ 13 | Function for getting the total number of missing values in a vector 14 | } 15 | -------------------------------------------------------------------------------- /man/tmle_estimate_q.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tmle_estimate_q.R 3 | \name{tmle_estimate_q} 4 | \alias{tmle_estimate_q} 5 | \title{Estimate Q = E(Y | Z, A, W) data-adaptively} 6 | \usage{ 7 | tmle_estimate_q( 8 | Y, 9 | Z = rep(0, length(Y)), 10 | A, 11 | W, 12 | Delta, 13 | Q = NULL, 14 | Qbounds, 15 | Qform = NULL, 16 | maptoYstar, 17 | SL.library, 18 | cvQinit = F, 19 | family, 20 | id = 1:length(Y), 21 | V = 10, 22 | verbose = F 23 | ) 24 | } 25 | \arguments{ 26 | \item{Y}{outcome} 27 | 28 | \item{Z}{intermediate variable between A and Y (default= 0 when no int. 
var.)} 29 | 30 | \item{A}{treatment indicator (1=treatment, 0=control)} 31 | 32 | \item{W}{baseline covariates} 33 | 34 | \item{Delta}{missingness indicator} 35 | 36 | \item{Q}{optional externally estimated values for Q} 37 | 38 | \item{Qbounds}{bounds for predicted values} 39 | 40 | \item{Qform}{optional regression formula to use for glm instead of SuperLearner} 41 | 42 | \item{maptoYstar}{if TRUE, use the logistic fluctuation for bounded, continuous outcomes} 43 | 44 | \item{SL.library}{library of prediction algorithms for SuperLearner} 45 | 46 | \item{cvQinit}{flag; if TRUE, cross-validate the initial SuperLearner fit.} 47 | 48 | \item{family}{regression family} 49 | 50 | \item{id}{subject identifier} 51 | 52 | \item{V}{number of folds for SuperLearner} 53 | 54 | \item{verbose}{Set to TRUE for extra output} 55 | } 56 | \description{ 57 | Estimates Q = E(Y | Z, A, W) data-adaptively using SuperLearner, unless 58 | SuperLearner is not available or the user specifies initial values or a 59 | regression formula. 60 | } 61 | -------------------------------------------------------------------------------- /man/varimpact.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/varimpact.R 3 | \encoding{utf8} 4 | \name{varimpact} 5 | \alias{varimpact} 6 | \title{Variable importance estimation using causal inference (targeted learning)} 7 | \usage{ 8 | varimpact( 9 | Y, 10 | data, 11 | A_names = colnames(data), 12 | V = 2L, 13 | Q.library = c("SL.glm", "SL.mean"), 14 | g.library = c("SL.glm", "SL.mean"), 15 | family = "binomial", 16 | minYs = 15L, 17 | minCell = 0L, 18 | adjust_cutoff = 10L, 19 | corthres = 0.8, 20 | impute = "median", 21 | miss.cut = 0.5, 22 | bins_numeric = 10L, 23 | quantile_probs_factor = c(0.1, 0.9), 24 | quantile_probs_numeric = quantile_probs_factor, 25 | verbose = FALSE, 26 | verbose_tmle = FALSE, 27 | verbose_reduction = FALSE, 28 | parallel = TRUE, 29 | digits = 4L 30 | ) 31 | } 32 | \arguments{ 33 | \item{Y}{outcome of interest (numeric vector)} 34 | 35 | \item{data}{data frame of predictor variables of interest for 36 | which the function returns VIMs (possibly a matrix?)} 37 | 38 | \item{A_names}{Names of the variables for which we want to estimate importance, 39 | a subset of the data argument.} 40 | 41 | \item{V}{Number of cross-validation folds.} 42 | 43 | \item{Q.library}{library used by SuperLearner for the model of outcome 44 | versus predictors} 45 | 46 | \item{g.library}{library used by SuperLearner for the model of the 47 | predictor variable of interest versus the other predictors} 48 | 49 | \item{family}{family ('binomial' or 'gaussian')} 50 | 51 | \item{minYs}{minimum # of obs with the event - if it is < minYs, skip the VIM} 52 | 53 | \item{minCell}{the cut-off for including a category of A in the analysis: the 54 | minimum count per cell in a 2x2 table of the indicator of that level 55 | versus the outcome, separately by training and validation sample.} 56 | 57 | \item{adjust_cutoff}{Maximum number of adjustment variables during TMLE. If 58 | more than this cutoff varimpact will attempt to reduce the dimensions to 59 | that number (using HOPACH hierarchical clustering). Must not be more than 60 | 15 due to HOPACH constraints. Set to NULL to disable any dimension 61 | reduction.} 62 | 63 | \item{corthres}{cut-off correlation with the explanatory 64 | variable for inclusion of an adjustment variable} 65 | 66 | \item{impute}{Type of missing value imputation to conduct. One of: "zero", 67 | "median" (default), or "knn". 
Note: knn results in the covariate data being centered/scaled.} 68 | 69 | \item{miss.cut}{eliminates explanatory (X) variables with a proportion 70 | of missing obs > miss.cut} 71 | 72 | \item{bins_numeric}{Number of bins when discretizing numeric variables.} 73 | 74 | \item{quantile_probs_factor}{Quantiles used to check if factors have 75 | sufficient variation.} 76 | 77 | \item{quantile_probs_numeric}{Quantiles used to check if numerics have 78 | sufficient variation.} 79 | 80 | \item{verbose}{Boolean - if TRUE the method will display more detailed 81 | output.} 82 | 83 | \item{verbose_tmle}{Boolean - if TRUE, will display even more detail on the TMLE 84 | estimation process.} 85 | 86 | \item{verbose_reduction}{Boolean - if TRUE, will display more detail during the 87 | variable reduction step (clustering).} 88 | 89 | \item{parallel}{Use parallel processing if a backend is registered; enabled 90 | by default.} 91 | 92 | \item{digits}{Number of digits to round the value labels.} 93 | } 94 | \value{ 95 | Results object. TODO: add more detail here. 96 | } 97 | \description{ 98 | \code{varimpact} returns variable importance statistics ordered 99 | by statistical significance, using a combination of data-adaptive target 100 | parameters and targeted maximum likelihood estimation (TMLE). 101 | } 102 | \details{ 103 | The function performs the following steps. 104 | \enumerate{ 105 | \item Drops variables missing more than miss.cut proportion of the time (tuneable). 106 | \item Separates covariates into factors and continuous (ordered) variables. 107 | \item Drops variables whose distribution is too uneven - e.g., nearly all one 108 | value - separately for factors and numeric variables (tuneable). (ADD MORE 109 | DETAIL HERE) 110 | \item Makes a dummy variable basis for factors, naming the dummies so that 111 | they are traceable to the original factor variables later. 112 | \item Makes a new ordered variable of integers mapped to intervals defined by 113 | deciles for the ordered numeric variables (automatically makes fewer 114 | categories if the original variable has < 10 values). 115 | \item Creates an associated list of the number of unique values, and the values 116 | themselves, for each variable, for use in the variable importance step. 117 | \item Makes a missing-data indicator basis for both factors and ordered variables. 118 | \item For each variable, after assigning it as A, uses an optimal histogram 119 | function to combine values using the distribution of A | Y=1, to avoid very 120 | small cell sizes in the distribution of Y vs. A (tuneable). (ADD DETAIL) 121 | \item Uses HOPACH* to cluster the associated confounder/missingness 122 | basis W down to the specified maximum number of adjustment variables. 123 | \item Finds the minimum and maximum estimates of E(Ya) with respect to a, after 124 | looping through all values of A* (as processed by the histogram). 125 | \item Returns an estimate of E(Ya(max)-Ya(min)) with its SE using CV-TMLE. 126 | } 127 | *HOPACH is "Hierarchical Ordered Partitioning and Collapsing Hybrid". 128 | } 129 | \section{Authors}{ 130 | 131 | Alan E. Hubbard and Chris J. Kennedy, University of California, Berkeley 132 | } 133 | 134 | \section{References}{ 135 | 136 | Benjamini, Y., & Hochberg, Y. (1995). \emph{Controlling the false discovery 137 | rate: a practical and powerful approach to multiple testing}. Journal of the 138 | royal statistical society. Series B (Methodological), 289-300. 139 | 140 | Gruber, S., & van der Laan, M. J. (2012). \emph{tmle: An R Package for 141 | Targeted Maximum Likelihood Estimation}. Journal of Statistical Software, 142 | 51(i13). 143 | 144 | Hubbard, A. 
E., Kherad-Pajouh, S., & van der Laan, M. J. (2016). 145 | \emph{Statistical Inference for Data Adaptive Target Parameters}. The 146 | international journal of biostatistics, 12(1), 3-19. 147 | 148 | Hubbard, A., Munoz, I. D., Decker, A., Holcomb, J. B., Schreiber, M. A., 149 | Bulger, E. M., ... & Rahbar, M. H. (2013). \emph{Time-Dependent Prediction 150 | and Evaluation of Variable Importance Using SuperLearning in High Dimensional 151 | Clinical Data}. The journal of trauma and acute care surgery, 75(1 0 1), S53. 152 | 153 | Hubbard, A. E., & van der Laan, M. J. (2016). \emph{Mining with inference: 154 | data-adaptive target parameters (pp. 439-452)}. In P. Buhlmann et al. (Ed.), 155 | \emph{Handbook of Big Data}. CRC Press, Taylor & Francis Group, LLC: Boca 156 | Raton, FL. 157 | 158 | van der Laan, M. J. (2006). \emph{Statistical inference for variable 159 | importance}. The International Journal of Biostatistics, 2(1). 160 | 161 | van der Laan, M. J., & Pollard, K. S. (2003). \emph{A new algorithm for 162 | hybrid hierarchical clustering with visualization and the bootstrap}. Journal 163 | of Statistical Planning and Inference, 117(2), 275-303. 164 | 165 | van der Laan, M. J., Polley, E. C., & Hubbard, A. E. (2007). \emph{Super 166 | learner}. Statistical applications in genetics and molecular biology, 6(1). 167 | 168 | van der Laan, M. J., & Rose, S. (2011). \emph{Targeted learning: causal 169 | inference for observational and experimental data}. Springer Science & 170 | Business Media. 171 | } 172 | 173 | \examples{ 174 | #################################### 175 | # Create test dataset. 176 | set.seed(1) 177 | N <- 100 178 | num_normal <- 5 179 | X <- as.data.frame(matrix(rnorm(N * num_normal), N, num_normal)) 180 | Y <- rbinom(N, 1, plogis(.2*X[, 1] + .1*X[, 2] - .2*X[, 3] + .1*X[, 3]*X[, 4] - .2*abs(X[, 4]))) 181 | # Add some missing data to X so we can test imputation. 182 | for (i in 1:10) X[sample(nrow(X), 1), sample(ncol(X), 1)] <- NA 183 | 184 | #################################### 185 | # Basic example 186 | 187 | vim <- varimpact(Y = Y, data = X[, 1:3]) 188 | vim 189 | vim$results_all 190 | exportLatex(vim) 191 | cleanup_latex_files() 192 | 193 | # Impute by knn rather than the default median. 194 | \dontrun{ 195 | vim <- varimpact(Y = Y, data = X[, 1:3], impute = "knn") 196 | } 197 | 198 | #################################### 199 | # Multicore parallel example. 200 | \dontrun{ 201 | # Setup multicore parallelization. 202 | library(future) 203 | plan("multisession", workers = 2) 204 | 205 | vim <- varimpact(Y = Y, data = X[, 1:3]) 206 | } 207 | 208 | #################################### 209 | # Cluster parallel example. 210 | \dontrun{ 211 | cl = parallel::makeCluster(2L) 212 | plan(cluster, workers = cl) 213 | vim <- varimpact(Y = Y, data = X[, 1:3]) 214 | parallel::stopCluster(cl) 215 | } 216 | 217 | #################################### 218 | # mlbench BreastCancer example. 219 | \dontrun{ 220 | data(BreastCancer, package="mlbench") 221 | data <- BreastCancer 222 | 223 | set.seed(1, "L'Ecuyer-CMRG") 224 | data <- data[sample(nrow(data), 100), ] # Reduce to a dataset of 100 observations to speed up testing. 225 | # Create a numeric outcome variable. 226 | data$Y <- as.numeric(data$Class == "malignant") 227 | # Use multicore parallelization to speed up processing. 
228 | future::plan("multisession", workers = 2) 229 | vim <- varimpact(Y = data$Y, data = subset(data, select=-c(Y, Class, Id))) 230 | } 231 | 232 | } 233 | \seealso{ 234 | \code{\link[varimpact]{exportLatex}}, \code{\link[varimpact]{print.varimpact}} 235 | } 236 | -------------------------------------------------------------------------------- /readme.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r init_knitr, echo = FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>", 11 | fig.path = "images/README-", 12 | fig.width = 5, 13 | fig.height = 3.5, 14 | message = FALSE 15 | ) 16 | 17 | # Hide warnings when knitting. 18 | options(warn = -1) 19 | ``` 20 | 21 | # varimpact - variable importance through causal inference 22 | 23 | [![Build Status](https://travis-ci.org/ck37/varimpact.svg?branch=master)](https://travis-ci.org/ck37/varimpact) 24 | [![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/ck37/varimpact?branch=master&svg=true)](https://ci.appveyor.com/project/ck37/varimpact) 25 | [![codecov](https://codecov.io/gh/ck37/varimpact/branch/master/graph/badge.svg)](https://codecov.io/gh/ck37/varimpact) 26 | 27 | ## Summary 28 | 29 | varimpact uses causal inference statistics to generate variable importance estimates for a given dataset and outcome. It answers the question: which of my Xs are most related to my Y? Each variable's influence on the outcome is estimated semiparametrically, without assuming a linear relationship or other functional form, and the covariate list is ranked by order of importance. This can be used for exploratory data analysis, for dimensionality reduction, for experimental design (e.g. to determine blocking and re-randomization), to reduce variance in an estimation procedure, etc. See Hubbard, Kennedy, & van der Laan (2018) for more details, or Hubbard & van der Laan (2016) for an earlier description. 30 | 31 | ## Details 32 | 33 | Each covariate is analyzed using targeted minimum loss-based estimation ([TMLE](https://CRAN.R-project.org/package=tmle)) as though it were a treatment, with all other variables serving as adjustment variables via [SuperLearner](https://github.com/ecpolley/SuperLearner). Then the statistical significance of the estimated treatment effect for each covariate determines the variable importance ranking. This formulation allows the asymptotics of TMLE to provide valid standard errors and p-values, unlike other variable importance algorithms. 34 | 35 | The results provide raw p-values as well as p-values adjusted for false discovery rate using the Benjamini-Hochberg (1995) procedure. Adjustment variables are automatically clustered hierarchically using HOPACH (van der Laan & Pollard 2003) in order to reduce dimensionality. The package supports multi-core and multi-node parallelization, which are detected and used automatically when a parallel backend is registered. Missing values are automatically imputed using K-nearest neighbors (Troyanskaya et al. 2001, Jerez et al. 2010) and missingness indicator variables are incorporated into the analysis. 36 | 37 | varimpact is under active development so please submit any bug reports or feature requests to the [issue queue](https://github.com/ck37/varimpact/issues), or email Alan and/or Chris directly. 
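As a rough illustration of the false discovery rate adjustment mentioned above, the sketch below applies the Benjamini-Hochberg procedure to a handful of hypothetical raw p-values using base R's `p.adjust()`; the values are made up and simply stand in for the per-variable p-values that varimpact reports.

```{r fdr_sketch, eval = FALSE}
# Hypothetical raw p-values, one per covariate (stand-ins for varimpact's output).
raw_p <- c(0.001, 0.02, 0.04, 0.30, 0.77)

# Benjamini-Hochberg (1995) adjustment, as implemented in base R.
p.adjust(raw_p, method = "BH")
```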
38 | 39 | ## Installation 40 | 41 | ### GitHub 42 | 43 | ```{r install, eval=FALSE} 44 | # Install remotes if necessary: 45 | # install.packages("remotes") 46 | remotes::install_github("ck37/varimpact") 47 | ``` 48 | 49 | ### CRAN 50 | 51 | Forthcoming fall 2022 52 | 53 | ## Examples 54 | 55 | ### Example: basic functionality 56 | 57 | ```{r example_1, fig.width = 6.5, fig.height = 3.5} 58 | library(varimpact) 59 | 60 | #################################### 61 | # Create test dataset. 62 | set.seed(1, "L'Ecuyer-CMRG") 63 | N <- 200 64 | num_normal <- 4 65 | X <- as.data.frame(matrix(rnorm(N * num_normal), N, num_normal)) 66 | Y <- rbinom(N, 1, plogis(.2*X[, 1] + .1*X[, 2] - .2*X[, 3] + .1*X[, 3]*X[, 4] - .2*abs(X[, 4]))) 67 | # Add some missing data to X so we can test imputation. 68 | for (i in 1:10) X[sample(nrow(X), 1), sample(ncol(X), 1)] <- NA 69 | 70 | #################################### 71 | # Basic example 72 | vim <- varimpact(Y = Y, data = X) 73 | 74 | # Review consistent and significant results. 75 | vim 76 | 77 | # Look at all results. 78 | vim$results_all 79 | 80 | # Plot the V2 impact. 81 | plot_var("V2", vim) 82 | 83 | # Generate latex tables with results. 84 | exportLatex(vim) 85 | 86 | # Clean up LaTeX files 87 | cleanup_latex_files() 88 | ``` 89 | 90 | ### Example: customize outcome and propensity score estimation 91 | 92 | ```{r example_2} 93 | Q_lib = c("SL.mean", "SL.glmnet", "SL.ranger", "SL.rpartPrune") 94 | g_lib = c("SL.mean", "SL.glmnet") 95 | set.seed(1, "L'Ecuyer-CMRG") 96 | (vim = varimpact(Y = Y, data = X, Q.library = Q_lib, g.library = g_lib)) 97 | ``` 98 | 99 | ### Example: parallel via multicore 100 | 101 | ```{r example_3} 102 | library(future) 103 | plan("multisession") 104 | vim = varimpact(Y = Y, data = X) 105 | ``` 106 | 107 | ### Example: mlbench breast cancer 108 | 109 | ```{r example_5, fig.width = 6, fig.height = 3.5} 110 | data(BreastCancer, package = "mlbench") 111 | data = BreastCancer 112 | 113 | # Create a numeric outcome variable. 114 | data$Y = as.integer(data$Class == "malignant") 115 | 116 | # Use multicore parallelization to speed up processing. 117 | plan("multisession") 118 | (vim = varimpact(Y = data$Y, data = subset(data, select = -c(Y, Class, Id)))) 119 | plot_var("Mitoses", vim) 120 | ``` 121 | 122 | ## Authors 123 | 124 | Alan E. Hubbard and Chris J. Kennedy, University of California, Berkeley 125 | 126 | ## References 127 | 128 | Benjamini, Y., & Hochberg, Y. (1995). Controlling the false discovery rate: a practical and powerful approach to multiple testing. Journal of the royal statistical society. Series B (Methodological), 289-300. 129 | 130 | Gruber, S., & van der Laan, M. J. (2012). tmle: An R Package for Targeted Maximum Likelihood Estimation. Journal of Statistical Software, 51(i13). 131 | 132 | Hubbard, A. E., Kennedy, C. J., van der Laan, M. J. (2018). Data-adaptive target parameters. In M. van der Laan & S. Rose (2018) Targeted Learning in Data Science. Springer. 133 | 134 | Hubbard, A. E., Kherad-Pajouh, S., & van der Laan, M. J. (2016). Statistical Inference for Data Adaptive Target Parameters. The international journal of biostatistics, 12(1), 3-19. 135 | 136 | Hubbard, A., Munoz, I. D., Decker, A., Holcomb, J. B., Schreiber, M. A., Bulger, E. M., ... & Rahbar, M. H. (2013). Time-Dependent Prediction and Evaluation of Variable Importance Using SuperLearning in High Dimensional Clinical Data. The journal of trauma and acute care surgery, 75(1 0 1), S53. 137 | 138 | Hubbard, A. E., & van der Laan, M. J. (2016). 
Mining with inference: data-adaptive target parameters (pp. 439-452). In P. Bühlmann et al. (Ed.), Handbook of Big Data. CRC Press, Taylor & Francis Group, LLC: Boca Raton, FL. 139 | 140 | Jerez, J. M., Molina, I., García-Laencina, P. J., Alba, E., Ribelles, N., Martín, M., & Franco, L. (2010). Missing data imputation using statistical and machine learning methods in a real breast cancer problem. Artificial intelligence in medicine, 50(2), 105-115. 141 | 142 | Rozenholc, Y., Mildenberger, T., & Gather, U. (2010). Combining regular and irregular histograms by penalized likelihood. Computational Statistics & Data Analysis, 54(12), 3313-3323. 143 | 144 | Troyanskaya, O., Cantor, M., Sherlock, G., Brown, P., Hastie, T., Tibshirani, R., Botstein, D., & Altman, R. B. (2001). Missing value estimation methods for DNA microarrays. Bioinformatics, 17(6), 520-525. 145 | 146 | van der Laan, M. J. (2006). Statistical inference for variable importance. The International Journal of Biostatistics, 2(1). 147 | 148 | van der Laan, M. J., & Pollard, K. S. (2003). A new algorithm for hybrid hierarchical clustering with visualization and the bootstrap. Journal of Statistical Planning and Inference, 117(2), 275-303. 149 | 150 | van der Laan, M. J., Polley, E. C., & Hubbard, A. E. (2007). Super learner. Statistical applications in genetics and molecular biology, 6(1). 151 | 152 | van der Laan, M. J., & Rose, S. (2011). Targeted learning: causal inference for observational and experimental data. Springer Science & Business Media. 153 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(varimpact) 2 | 3 | # Only run tests if testthat package is installed. 4 | # This is in compliance with "Writing R Extensions" §1.1.3.1. 5 | if (requireNamespace("testthat", quietly = TRUE)) { 6 | testthat::test_check("varimpact", reporter = "check") 7 | } 8 | -------------------------------------------------------------------------------- /tests/testthat/test-estimate_tmle2.R: -------------------------------------------------------------------------------- 1 | library(varimpact) 2 | library(SuperLearner) 3 | library(tmle) 4 | library(testthat) 5 | 6 | # Create test dataset. 7 | context("Dataset A: continuous variables") 8 | 9 | # Set multicore-compatible seed. 
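# The L'Ecuyer-CMRG generator provides parallel-safe random number streams, so results remain reproducible if a parallel future plan is registered later.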
10 | set.seed(1, "L'Ecuyer-CMRG") 11 | 12 | # Simulation code from PH 295, Fall 2016 -- 13 | # https://github.com/wilsoncai1992/PH295-lab/blob/master/lab3/Lab3_Lecture.Rmd 14 | 15 | # structural equation for W_1 16 | # takes as input a vector U_W1 and returns a vector evaluating 17 | # f_{W,1}(U_W1) 18 | f_W1 <- function(U_W1){ 19 | return(U_W1) 20 | } 21 | 22 | # structural equation for W_2 23 | # takes as input a vector U_W2 and returns a vector evaluating 24 | # f_{W,2}(U_W2) 25 | f_W2 <- function(U_W2){ 26 | return(U_W2) 27 | } 28 | 29 | # structural equation for A 30 | f_A <- function(W_1, W_2, U_A){ 31 | return(as.numeric(plogis(W_1 - W_2 + U_A) > 0.5)) 32 | } 33 | 34 | # structural equation for Y 35 | f_Y <- function(W_1, W_2, A, U_Y){ 36 | return(-W_1 + W_2 + A - U_Y) 37 | } 38 | 39 | # function to draw n observations from an scm 40 | # n = the number of observations to draw 41 | # returns a data.frame with named columns 42 | simObsSCM <- function(n){ 43 | ## first we draw the errors 44 | # draw U_{W,1} 45 | U_W1 <- rbinom(n,1,0.5) 46 | # draw U_{W,2} 47 | U_W2 <- rbinom(n,1,0.5) 48 | # draw U_A 49 | U_A <- rnorm(n,0,1) 50 | # draw U_Y 51 | U_Y <- rnorm(n,0,1) 52 | 53 | ## now we can evaluate the observations sequentially 54 | # evaluate W_1 55 | W_1 <- f_W1(U_W1) 56 | #evaluate W_2 57 | W_2 <- f_W2(U_W2) 58 | # evaluate A 59 | A <- f_A(W_1 = W_1, W_2 = W_2, U_A = U_A) 60 | # evaluate Y 61 | Y <- f_Y(W_1 = W_1, W_2 = W_2, A = A, U_Y = U_Y) 62 | 63 | ## return a data.frame object 64 | out <- data.frame(W_1 = W_1, W_2 = W_2, A = A, Y = Y) 65 | return(out) 66 | } 67 | 68 | data = simObsSCM(100) 69 | summary(data) 70 | 71 | sl_lib = c("SL.glmnet", "SL.glm", "SL.mean") 72 | 73 | # Estimate g and Q 74 | result = varimpact::estimate_tmle2(Y = data$Y, A = data$A, 75 | W = data[, c("W_1", "W_2")], family = "gaussian", 76 | Q.lib = sl_lib, 77 | g.lib = sl_lib, 78 | verbose = T) 79 | -------------------------------------------------------------------------------- /tests/testthat/test-exportLatex.R: -------------------------------------------------------------------------------- 1 | # Test exportLatex functionality 2 | library(testthat) 3 | library(varimpact) 4 | 5 | context("exportLatex() function") 6 | 7 | # Skip exportLatex tests during R CMD check to avoid creating LaTeX files 8 | skip_if(identical(Sys.getenv("_R_CHECK_PACKAGE_NAME_"), "varimpact"), 9 | "Skipping exportLatex tests during R CMD check") 10 | 11 | test_that("exportLatex creates and cleans up LaTeX files", { 12 | # Create test dataset 13 | set.seed(1, "L'Ecuyer-CMRG") 14 | N = 100 15 | X = data.frame( 16 | x1 = rnorm(N), 17 | x2 = rnorm(N), 18 | x3 = rnorm(N) 19 | ) 20 | Y = rbinom(N, 1, plogis(0.2 * X$x1 + 0.1 * X$x2 - 0.2 * X$x3)) 21 | 22 | # Run varimpact with minimal settings for speed 23 | future::plan("sequential") 24 | vim = varimpact(Y = Y, data = X, V = 2L, verbose = FALSE, 25 | Q.library = c("SL.mean", "SL.glm"), 26 | g.library = c("SL.mean", "SL.glm"), 27 | bins_numeric = 3L) 28 | 29 | # Skip test if no results were generated (due to sample size constraints) 30 | skip_if(is.null(vim$results_all), "No varimpact results generated") 31 | 32 | # Define cleanup function to ensure files are always removed 33 | tex_files = c("varimpByFold.tex", "varimpAll.tex", "varimpConsistent.tex") 34 | cleanup_files = function() { 35 | cleanup_latex_files(verbose = FALSE) 36 | } 37 | 38 | # Ensure cleanup happens even if test fails 39 | on.exit(cleanup_files()) 40 | 41 | # Test 1: exportLatex should create files 42 | exportLatex(vim) 
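# By default, exportLatex() writes its .tex files to the current working directory, which is why the check below looks for bare filenames.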
43 | 44 | existing_files = tex_files[file.exists(tex_files)] 45 | 46 | expect_true(length(existing_files) > 0, 47 | info = "exportLatex should create at least some LaTeX files") 48 | expect_true("varimpByFold.tex" %in% existing_files, 49 | info = "varimpByFold.tex should be created") 50 | expect_true("varimpAll.tex" %in% existing_files, 51 | info = "varimpAll.tex should be created") 52 | 53 | # Test 2: Manual cleanup should work 54 | cleanup_files() 55 | 56 | remaining_files = tex_files[file.exists(tex_files)] 57 | expect_equal(length(remaining_files), 0, 58 | info = "Manual cleanup should remove all LaTeX files") 59 | 60 | # Test 3: Manual cleanup after exportLatex should work 61 | exportLatex(vim) 62 | cleanup_files() 63 | 64 | remaining_files_after_cleanup = tex_files[file.exists(tex_files)] 65 | expect_equal(length(remaining_files_after_cleanup), 0, 66 | info = "Manual cleanup after exportLatex should remove LaTeX files") 67 | }) 68 | 69 | test_that("exportLatex handles NULL results gracefully", { 70 | # Create a mock varimpact object with NULL results 71 | mock_vim = list( 72 | results_by_fold = NULL, 73 | results_all = NULL, 74 | results_consistent = data.frame() 75 | ) 76 | 77 | # Should return NULL and give a warning 78 | expect_warning( 79 | result <- exportLatex(mock_vim), 80 | "Cannot export LaTeX: varimpact results are NULL or incomplete" 81 | ) 82 | expect_null(result) 83 | 84 | # Should not create any files 85 | tex_files = c("varimpByFold.tex", "varimpAll.tex", "varimpConsistent.tex") 86 | existing_files = tex_files[file.exists(tex_files)] 87 | expect_equal(length(existing_files), 0, 88 | info = "exportLatex with NULL results should not create files") 89 | }) 90 | 91 | test_that("exportLatex with custom outname and directory", { 92 | # Create test dataset 93 | set.seed(2, "L'Ecuyer-CMRG") 94 | N = 100 95 | X = data.frame(x1 = rnorm(N), x2 = rnorm(N)) 96 | Y = rbinom(N, 1, plogis(0.3 * X$x1)) 97 | 98 | # Run varimpact 99 | future::plan("sequential") 100 | vim = varimpact(Y = Y, data = X, V = 2L, verbose = FALSE, 101 | Q.library = "SL.mean", g.library = "SL.mean", 102 | bins_numeric = 3L) 103 | 104 | # Skip test if no results were generated 105 | skip_if(is.null(vim$results_all), "No varimpact results generated") 106 | 107 | # Create temporary directory 108 | temp_dir = tempdir() 109 | custom_prefix = "test_" 110 | 111 | # Check for files with custom names in custom directory 112 | expected_files = c( 113 | file.path(temp_dir, paste0(custom_prefix, "varimpByFold.tex")), 114 | file.path(temp_dir, paste0(custom_prefix, "varimpAll.tex")), 115 | file.path(temp_dir, paste0(custom_prefix, "varimpConsistent.tex")) 116 | ) 117 | 118 | # Ensure cleanup happens even if test fails 119 | on.exit({ 120 | cleanup_latex_files(dir = temp_dir, outname = custom_prefix, verbose = FALSE) 121 | }) 122 | 123 | # Test with custom outname and directory 124 | exportLatex(vim, outname = custom_prefix, dir = temp_dir) 125 | 126 | existing_custom_files = expected_files[file.exists(expected_files)] 127 | expect_true(length(existing_custom_files) > 0, 128 | info = "Custom named files should be created in custom directory") 129 | 130 | # Test manual cleanup with custom names 131 | # Files should be cleaned up by on.exit() handler 132 | }) -------------------------------------------------------------------------------- /tests/testthat/test-factorsToIndicators.R: -------------------------------------------------------------------------------- 1 | library(varimpact) 2 | library(testthat) 3 | 4 | 
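# factors_to_indicators() expands each factor into indicator (dummy) columns and also returns missing-data indicator columns; these tests check both pieces.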
context("factorsToIndicators") 5 | 6 | # Create test dataset. 7 | 8 | set.seed(1, "L'Ecuyer-CMRG") 9 | N = 200 10 | 11 | num_normal = 7 12 | X = as.data.frame(matrix(rnorm(N * num_normal), N, num_normal)) 13 | 14 | # Add some missing data to X. 15 | miss_num = 10 16 | for (i in 1:miss_num) X[sample(nrow(X), 1), sample(ncol(X), 1)] = NA 17 | 18 | X_fac = data.frame(lapply(1:ncol(X), FUN=function(col_i) as.factor(floor(abs(pmin(pmax(X[, col_i], -1), 1)*3))))) 19 | dim(X_fac) 20 | colnames(X_fac) = paste0("fac_", 1:ncol(X_fac)) 21 | colnames(X_fac) 22 | summary(X_fac) 23 | 24 | ######################### 25 | 26 | table(X_fac[, 1], useNA="ifany") 27 | 28 | # Test a single factor. 29 | # Use column 3 which has 3 missing values based on the seed 30 | results = factors_to_indicators(X_fac[, 3, drop = F], verbose = T) 31 | dim(results$data) 32 | # We should have indicators for 1, 2, 3. 33 | colnames(results$data) 34 | 35 | dim(results$missing_indicators) 36 | colnames(results$missing_indicators) 37 | 38 | # We should have 1 missing data indicator, with 3 0s and the rest 1s. 39 | table(results$missing_indicators[, 1]) 40 | expect_equal(min(table(results$missing_indicators[, 1])), 3) 41 | 42 | expect_gt(ncol(results$missing_indicators), 0) 43 | 44 | # Test multiple factors. 45 | results = factors_to_indicators(X_fac[, 1:2, drop = F], verbose = T) 46 | 47 | dim(results$data) 48 | colnames(results$data) 49 | 50 | dim(results$missing_indicators) 51 | colnames(results$missing_indicators) 52 | 53 | table(results$missing_indicators[, 1]) 54 | 55 | # We expect our missing indicator matrix to have more than 0 columns. 56 | expect_is(results$missing_indicators, "matrix") 57 | 58 | # Test the full factor dataframe. 59 | results = factors_to_indicators(X_fac, verbose = T) 60 | dim(results$data) 61 | colnames(results$data) 62 | 63 | dim(results$missing_indicators) 64 | colnames(results$missing_indicators) 65 | -------------------------------------------------------------------------------- /tests/testthat/test-reduce_dimensions.R: -------------------------------------------------------------------------------- 1 | # TODO: need to test reduce_dimensions function. 2 | -------------------------------------------------------------------------------- /tests/testthat/test-varimpact-breastcancer.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(varimpact) 3 | 4 | ################################# 5 | # mlbench BreastCancer dataset. 6 | 7 | context("BreastCancer dataset") 8 | 9 | data(BreastCancer, package = "mlbench") 10 | data = BreastCancer 11 | names(data) = tolower(names(data)) 12 | 13 | set.seed(3, "L'Ecuyer-CMRG") 14 | 15 | # Reduce to a dataset of 200 observations to speed up testing. 16 | data = data[sample(nrow(data), 200), ] 17 | 18 | # Create a numeric outcome variable. 19 | data$y = as.numeric(data$class == "malignant") 20 | table(data$y) 21 | 22 | x = subset(data, select = -c(y, class, id)) 23 | dim(x) 24 | 25 | # Only run in RStudio so that automated CRAN checks don't give errors. 26 | if (.Platform$GUI == "RStudio") { 27 | # Use multicore parallelization to speed up processing. 
28 | future::plan("multiprocess", workers = 2) 29 | } 30 | 31 | test_that("varimpact runs on BreastCancer dataset", { 32 | # Speed up the test with moderate optimization: 33 | # - Use only 2 variables to reduce computation time 34 | # - Use faster SuperLearner libraries 35 | # - Reduce bins and use minimal cross-validation 36 | vim = varimpact(Y = data$y, x[, 1:2], 37 | Q.library = c("SL.glm", "SL.mean"), 38 | g.library = c("SL.glm", "SL.mean"), 39 | bins_numeric = 2L, 40 | V = 2L, 41 | verbose = FALSE, 42 | verbose_tmle = FALSE) 43 | 44 | # Test that the function completes without error 45 | expect_is(vim, "varimpact") 46 | expect_is(vim$time, "proc_time") 47 | 48 | # Test that some VIMs were actually calculated 49 | # The test should produce meaningful results 50 | if (!is.null(vim$results_all)) { 51 | expect_gte(nrow(vim$results_all), 1) 52 | cat("Successfully calculated", nrow(vim$results_all), "VIMs\n") 53 | } else { 54 | cat("Warning: No VIMs calculated\n") 55 | } 56 | 57 | # Print timing for reference 58 | cat("Test execution time:", vim$time[3], "seconds\n") 59 | }) 60 | 61 | test_that("varimpact works with A_names parameter", { 62 | # Test a subset of columns for A_names (use just 1 variable). 63 | vim = varimpact(Y = data$y, x, 64 | A_names = colnames(x)[1], 65 | Q.library = c("SL.glm", "SL.mean"), 66 | g.library = c("SL.glm", "SL.mean"), 67 | bins_numeric = 2L, 68 | V = 2L, 69 | verbose = FALSE) 70 | 71 | # Test that the function completes without error 72 | expect_is(vim, "varimpact") 73 | expect_is(vim$time, "proc_time") 74 | 75 | # Test that some VIMs were actually calculated 76 | if (!is.null(vim$results_all)) { 77 | expect_gte(nrow(vim$results_all), 1) 78 | cat("Successfully calculated", nrow(vim$results_all), "VIMs with A_names\n") 79 | } else { 80 | cat("Warning: No VIMs calculated with A_names\n") 81 | } 82 | 83 | # Print timing for reference 84 | cat("A_names test execution time:", vim$time[3], "seconds\n") 85 | }) 86 | 87 | # Return to single core usage. 88 | future::plan("sequential") 89 | -------------------------------------------------------------------------------- /tests/testthat/test-varimpact.R: -------------------------------------------------------------------------------- 1 | # Make sure we're using the rebuilt package. Suppress any error if it isn't loaded. 2 | try(detach(package:varimpact), silent = TRUE) 3 | library(varimpact) 4 | library(testthat) 5 | 6 | # Create test dataset. 7 | context("varimpact(). Dataset A: continuous variables") 8 | 9 | # Set multicore-compatible seed. 10 | set.seed(1, "L'Ecuyer-CMRG") 11 | # Can't go below 90 without changing more varimpact default settings. 12 | N = 100 13 | num_normal = 5 14 | X = data.frame(matrix(rnorm(N * num_normal), N, num_normal)) 15 | 16 | # Systematic Y generation. 17 | Y = .2 * X[, 1] + 1 * X[, 2] - 0.8 * X[, 3] + .1 * X[, 3] * X[, 4] - .2 * abs(X[, 4]) 18 | 19 | # Binary distribution via the binomial. 20 | Y_bin = rbinom(N, 1, plogis(Y)) 21 | 22 | # Gaussian distribution. 23 | Y_gaus = Y + rnorm(N, 0, 1) 24 | 25 | # Add some missing data to X. 26 | miss_num = 10 27 | for (i in 1:miss_num) X[sample(nrow(X), 1), sample(ncol(X), 1)] = NA 28 | 29 | # Basic test - binary outcome. 30 | #future::plan("multiprocess") 31 | future::plan("sequential") 32 | vim = varimpact(Y = Y_bin, data = X[, 1:3], V = 3L, 33 | Q.library = c("SL.mean", "SL.glm"), 34 | g.library = c("SL.mean", "SL.glm"), 35 | verbose = TRUE, 36 | verbose_tmle = FALSE, bins_numeric = 3L) 37 | # Takes 25 seconds. 
38 | vim$time 39 | # Be explict about printing for code coverage of tests. 40 | print(vim) 41 | vim$results_all 42 | vim$results_by_fold 43 | # names(vim) 44 | # exportLatex testing moved to test-exportLatex.R 45 | 46 | # And try a gaussian outcome. 47 | vim = varimpact(Y = Y_gaus, data = X[, 1:3], V = 3L, verbose = TRUE, 48 | family = "gaussian") 49 | print(vim) 50 | 51 | # Test imputation 52 | vim = varimpact(Y = Y_bin, data = X[, 1:3], verbose = TRUE, impute = "zero") 53 | vim = varimpact(Y = Y_bin, data = X[, 1:3], verbose = TRUE, impute = "median") 54 | vim = varimpact(Y = Y_bin, data = X[, 1:4], verbose = TRUE, impute = "knn") 55 | 56 | # Test a subset of columns using A_names. 57 | vim = varimpact(Y = Y_bin, data = X, A_names = colnames(X)[1:2], verbose = TRUE) 58 | print(vim) 59 | 60 | # Only run in RStudio so that automated CRAN checks don't give errors. 61 | if (.Platform$GUI == "RStudio") { 62 | # Test parallelization 63 | future::plan("multiprocess", workers = 2) 64 | vim = varimpact(Y = Y_bin, data = X[, 1:3], verbose = TRUE) 65 | print(vim) 66 | } 67 | 68 | # Only run in RStudio so that automated CRAN checks don't give errors. 69 | if (.Platform$GUI == "RStudio") { 70 | # Test parallelization via snow. 71 | cl = snow::makeCluster(2L) 72 | future::plan("cluster", workers = cl) 73 | vim = varimpact(Y = Y_bin, data = X[, 1:4], verbose = TRUE) 74 | vim 75 | snow::stopCluster(cl) 76 | } 77 | 78 | context("varimpact(). Dataset B: factor variables") 79 | 80 | # Set a new multicore-compatible seed. 81 | set.seed(2, "L'Ecuyer-CMRG") 82 | 83 | X_fac = data.frame(lapply(1:ncol(X), 84 | function(col_i) 85 | as.factor(floor(abs(pmin(pmax(X[, col_i], -1), 1) * 3))))) 86 | dim(X_fac) 87 | colnames(X_fac) = paste0("fac_", 1:ncol(X_fac)) 88 | colnames(X_fac) 89 | summary(X_fac) 90 | 91 | # Return to sequential execution for now. 92 | future::plan("sequential") 93 | 94 | # Basic factor test. 95 | vim = varimpact(Y = Y_bin, data = X_fac[, 1:3], V = 2L, verbose = TRUE) 96 | print(vim) 97 | 98 | # And gaussian 99 | vim = varimpact(Y = Y_gaus, data = X_fac[, 1:3], V = 2L, verbose = TRUE, 100 | family = "gaussian") 101 | print(vim) 102 | 103 | # Only run in RStudio so that automated CRAN checks don't give errors. 104 | # Disabled for now - need to review. 105 | if (F && .Platform$GUI == "RStudio") { 106 | # Test parallelization. 107 | future::plan("multiprocess") 108 | 109 | # Try a snow cluster, which does return the output to STDOUT. 110 | if (F) { 111 | # Run manually when debugging. 112 | cores = RhpcBLASctl::get_num_cores() 113 | capture.output({ cl = snow::makeCluster(cores, type="SOCK", outfile = "") }) 114 | doSNOW::registerDoSNOW(cl) 115 | parallel::setDefaultCluster(cl) 116 | foreach::getDoParName() 117 | } 118 | 119 | # Factor variables with parallelization. 120 | # TOFIX: This does not complete currently if fac_4 is included. 121 | # I think it is due to HOPACH never completing. 122 | vim = varimpact(Y = Y_bin, data = X_fac[, 1:3], 123 | #A_names = c(_4", "fac_2"), 124 | verbose = TRUE) 125 | vim 126 | 127 | # Return to single core usage. 128 | 129 | # Run manually when debugging, if the snow cluster was used. 130 | if (F) { 131 | ck37r::stop_cluster(cl) 132 | } 133 | 134 | } 135 | 136 | context("varimpact(). Dataset C: numeric and factor variables") 137 | 138 | ################################# 139 | # Combined numeric and factor test. 140 | X_combined = cbind(X[1:3], X_fac[4:5]) 141 | 142 | # Basic combined test. 
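# Combining numeric (X) and factor (X_fac) covariates exercises both the numeric and factor variable-importance code paths in a single call.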
143 | vim = varimpact(Y = Y_bin, data = X_combined, V = 2, verbose = TRUE) 144 | print(vim) 145 | 146 | # And gaussian 147 | vim = varimpact(Y = Y_gaus, data = X_combined, V = 2, verbose = TRUE, 148 | family = "gaussian") 149 | print(vim) 150 | 151 | context("varimpact(). Dataset D: basic example") 152 | 153 | #################################### 154 | # Create test dataset. 155 | set.seed(1, "L'Ecuyer-CMRG") 156 | 157 | N = 100 158 | num_normal = 7 159 | X = as.data.frame(matrix(rnorm(N * num_normal), N, num_normal)) 160 | Y = rbinom(N, 1, plogis(.2*X[, 1] + .1*X[, 2] - .2*X[, 3] + .1*X[, 3]*X[, 4] - .2*abs(X[, 4]))) 161 | # Add some missing data to X so we can test imputation. 162 | for (i in 1:10) X[sample(nrow(X), 1), sample(ncol(X), 1)] = NA 163 | 164 | #################################### 165 | # Basic example 166 | # TODO: fix warnings here, due to failed folds. 167 | # TOFIX: there is an error here on the numeric variables. 168 | # task 3 failed - "attempt to select less than one element in get1index" 169 | # X_3 seems to be causing the problem - need to investigate why. 170 | vim = varimpact(Y = Y, data = X, A_names = colnames(X)[c(1, 2, 4:7)], 171 | verbose = TRUE, parallel = FALSE) 172 | print(vim) 173 | vim$results_all 174 | vim$results_by_fold 175 | # In this test all variables are significant, which is rare. 176 | # exportLatex testing moved to test-exportLatex.R 177 | --------------------------------------------------------------------------------