├── .Rbuildignore ├── .gitattributes ├── .github ├── .gitignore └── workflows │ └── check-standard.yaml ├── .gitignore ├── .travis.yml ├── DESCRIPTION ├── NAMESPACE ├── R ├── apply_tmle_to_validation.R ├── cleanup-latex.R ├── compile-results.R ├── create_cv_folds.R ├── estimate_pooled_results.R ├── estimate_tmle.R ├── estimate_tmle2.R ├── exportLatex.R ├── factors_to_indicators.R ├── globals.R ├── length-unique.R ├── max_sqr.R ├── plot-var.R ├── print.varimpact.R ├── process-factors.R ├── process-numerics.R ├── quantiles_equivalent.R ├── reduce_dimensions.R ├── restrict_by_quantiles.R ├── results-by-level.R ├── separate_factors_numerics.R ├── sum_na.R ├── tmle_bound.R ├── tmle_estimate_g.R ├── tmle_estimate_q.R ├── tmle_init_stage1.R ├── varimpact.R ├── vim-factors.R └── vim-numerics.R ├── README.md ├── TODO.txt ├── appveyor.yml ├── images ├── README-example_1-1.png └── README-example_5-1.png ├── man ├── cleanup_latex_files.Rd ├── create_cv_folds.Rd ├── estimate_tmle.Rd ├── estimate_tmle2.Rd ├── exportLatex.Rd ├── factors_to_indicators.Rd ├── max_sqr.Rd ├── plot_var.Rd ├── print.varimpact.Rd ├── quantiles_equivalent.Rd ├── reduce_dimensions.Rd ├── restrict_by_quantiles.Rd ├── results_by_level.Rd ├── separate_factors_numerics.Rd ├── sum_na.Rd ├── tmle_estimate_q.Rd └── varimpact.Rd ├── readme.Rmd └── tests ├── testthat.R └── testthat ├── test-estimate_tmle2.R ├── test-exportLatex.R ├── test-factorsToIndicators.R ├── test-reduce_dimensions.R ├── test-varimpact-breastcancer.R └── test-varimpact.R /.Rbuildignore: -------------------------------------------------------------------------------- 1 | # RStudio files. 2 | ^.*\.Rproj$ 3 | ^\.Rproj\.user$ 4 | 5 | # Automated testing. 6 | ^\.travis\.yml$ 7 | ^appveyor\.yml$ 8 | 9 | # Misc. 10 | ^TODO.txt$ 11 | ^images$ 12 | 13 | # Github files. 14 | ^readme.Rmd$ 15 | ^\.github$ 16 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Set the default behavior, in case people don't have core.autocrlf set. 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/check-standard.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: R-CMD-check 10 | 11 | jobs: 12 | R-CMD-check: 13 | runs-on: ${{ matrix.config.os }} 14 | 15 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 16 | 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | config: 21 | - {os: macOS-latest, r: 'release'} 22 | - {os: windows-latest, r: 'release'} 23 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 24 | - {os: ubuntu-latest, r: 'release'} 25 | 26 | env: 27 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 28 | R_KEEP_PKG_SOURCE: yes 29 | 30 | steps: 31 | - uses: actions/checkout@v2 32 | 33 | - uses: r-lib/actions/setup-pandoc@v2 34 | 35 | - uses: r-lib/actions/setup-r@v2 36 | with: 37 | r-version: ${{ matrix.config.r }} 38 | http-user-agent: ${{ matrix.config.http-user-agent }} 39 | use-public-rspm: true 40 | 41 | - uses: r-lib/actions/setup-r-dependencies@v2 42 | with: 43 | extra-packages: any::rcmdcheck 44 | needs: check 45 | 46 | - uses: r-lib/actions/check-r-package@v2 47 | with: 48 | upload-snapshots: true 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .DS_Store 5 | *.Rproj 6 | results 7 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Docs: https://docs.travis-ci.com/user/languages/r/ 2 | # Validate your .travis.yml file at http://lint.travis-ci.org/ 3 | 4 | language: r 5 | cache: packages 6 | sudo: false 7 | 8 | # Install nlopt package to help nloptr install. 9 | addons: 10 | apt: 11 | packages: 12 | - libnlopt-dev 13 | 14 | #r_github_packages: 15 | # - jimhester/covr 16 | 17 | r_packages: 18 | - covr 19 | 20 | bioc_packages: 21 | - hopach 22 | - multtest 23 | 24 | # Test on 3 versions of R. 25 | r: 26 | - release 27 | - devel 28 | # glmnet no longer works on R 3.5 :/ 29 | #- oldrel 30 | 31 | env: 32 | global: 33 | # Allow build to continue even if a suggested package cannot be installed. 34 | - _R_CHECK_FORCE_SUGGESTS_=false 35 | 36 | matrix: 37 | # Report build completion status once non-"allow_failures" builds are done. 38 | fast_finish: true 39 | # Allow failures on OSX. 40 | allow_failures: 41 | - os: osx 42 | 43 | # Allow build to take up to 40 minutes without returning output (esp. during example testing) 44 | # Without this section, travis build will fail to lack of output for 10 minutes. 45 | script: 46 | - | 47 | R CMD build . 48 | travis_wait 40 R CMD check varimpact*.tar.gz 49 | 50 | # code coverage testing is timing out after 10 minutes, also need to 51 | # enable travis_wait for this. 52 | #after_success: 53 | # Only check test coverage for R = release and OS = linux. 54 | # - test ${TRAVIS_R_VERSION_STRING} == "release" && test ${TRAVIS_OS_NAME} == "linux" && Rscript -e 'covr::codecov()' 55 | 56 | # Get error logs in case of failure. 
57 | after_failure: 58 | find *Rcheck -name '*.fail' -print -exec cat '{}' \; 59 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: varimpact 2 | Type: Package 3 | Title: Variable Importance Estimation Using Targeted Causal Inference (TMLE) 4 | Version: 1.3.0-9006 5 | Date: 2022-06-21 6 | Authors@R: c( 7 | person("Alan", "Hubbard", email = "hubbard@berkeley.edu", 8 | role = c("aut"), 9 | comment = c(ORCID = "0000-0002-3769-0127")), 10 | person("Chris", "Kennedy", email = "chrisken@gmail.com", 11 | role = c("aut", "cre"), 12 | comment = c(ORCID = "0000-0001-7444-2766"))) 13 | URL: http://github.com/ck37/varimpact 14 | BugReports: https://github.com/ck37/varimpact/issues 15 | Description: Automated data processing and variable importance using data- 16 | adaptive target parameter methods combined with TMLE. Returns an ordered 17 | variable list of importance. 18 | License: GPL (>= 2) 19 | Encoding: UTF-8 20 | Depends: 21 | SuperLearner 22 | Imports: 23 | arules, 24 | caret, 25 | cvTools, 26 | dplyr, 27 | future, 28 | future.apply, 29 | ggplot2, 30 | glmnet, 31 | histogram, 32 | hopach, 33 | magrittr, 34 | MASS, 35 | modeest, 36 | multtest, 37 | RANN, 38 | tmle, 39 | xtable 40 | biocViews: 41 | Suggests: 42 | doSNOW, 43 | foreach, 44 | mlbench, 45 | RhpcBLASctl, 46 | snow, 47 | testthat 48 | RoxygenNote: 7.3.2 49 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(print,varimpact) 4 | export(.bound) 5 | export(cleanup_latex_files) 6 | export(estimate_tmle2) 7 | export(exportLatex) 8 | export(factors_to_indicators) 9 | export(max_sqr) 10 | export(plot_var) 11 | export(quantiles_equivalent) 12 | export(reduce_dimensions) 13 | export(restrict_by_quantiles) 14 | export(separate_factors_numerics) 15 | export(tmle_estimate_q) 16 | export(varimpact) 17 | import(ggplot2) 18 | importFrom(SuperLearner,All) 19 | importFrom(cvTools,cvFolds) 20 | importFrom(dplyr,first) 21 | importFrom(dplyr,funs) 22 | importFrom(dplyr,group_by) 23 | importFrom(dplyr,mutate) 24 | importFrom(dplyr,select) 25 | importFrom(dplyr,summarize_all) 26 | importFrom(hopach,distancematrix) 27 | importFrom(hopach,hopach) 28 | importFrom(magrittr,"%>%") 29 | importFrom(modeest,mlv) 30 | importFrom(stats,as.formula) 31 | importFrom(stats,binomial) 32 | importFrom(stats,coef) 33 | importFrom(stats,cor) 34 | importFrom(stats,glm) 35 | importFrom(stats,model.matrix) 36 | importFrom(stats,na.omit) 37 | importFrom(stats,plogis) 38 | importFrom(stats,pnorm) 39 | importFrom(stats,poisson) 40 | importFrom(stats,predict) 41 | importFrom(stats,qlogis) 42 | importFrom(stats,quantile) 43 | importFrom(stats,var) 44 | importFrom(tmle,tmle) 45 | importFrom(utils,packageDescription) 46 | -------------------------------------------------------------------------------- /R/apply_tmle_to_validation.R: -------------------------------------------------------------------------------- 1 | apply_tmle_to_validation = 2 | function(Y, 3 | A, 4 | W, 5 | family, 6 | delta = rep(1, length(Y)), 7 | tmle, 8 | id = 1:length(Y), 9 | verbose = FALSE) { 10 | 11 | ########### 12 | # Transform Y to Y_star, needed for later fluctuation step. 
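# In short, for the logistic fluctuation the outcome must lie in [0, 1], so a
# continuous Y is rescaled with the Qbounds learned on the training split. A minimal
# sketch with illustrative names (the actual code below uses tmle$Qbounds):
#   y_star <- (y - qbounds[1]) / (qbounds[2] - qbounds[1])
# For a binary Y the bounds are already c(0, 1), so this mapping leaves Y unchanged.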
13 | Y_star = Y 14 | if (tmle$map_to_ystar) { 15 | # Use the Qbounds from the full range of Y, 16 | # not the tmle$ab that is based only on training data. 17 | # Y_star = (Y_star - tmle$ab[1]) / diff(tmle$ab) 18 | # If Y is binary this will have no effect as bounds & outcome are already {0, 1}. 19 | Y_star = (Y_star - tmle$Qbounds[1]) / diff(tmle$Qbounds) 20 | 21 | if (verbose) { 22 | cat("Mapped Y to Y_star using Qbounds:", tmle$Qbounds, "\n") 23 | cat("New Y_star range:", range(Y_star), "\n") 24 | } 25 | } 26 | 27 | if (max(Y_star) > 1 | min(Y_star) < 0) { 28 | cat("Error on Y_star's range for logistic fluctuation\n") 29 | cat("Y_star distribution:\n") 30 | print(summary(Y_star)) 31 | cat("Qbounds:", tmle$Qbounds, "\n") 32 | cat("Stage1 Qbounds:", tmle$stage1_Qbounds, "\n") 33 | cat("Stage1 ab:", tmle$ab, "\n") 34 | cat("Validation Y range:", range(Y), "\n") 35 | stop("Y values must be 0 <= y <= 1") 36 | } 37 | 38 | # We only include this because TMLE functions use Z. 39 | Z = rep(0, length(Y)) 40 | 41 | q_df = data.frame(Z, A = 1, W) 42 | 43 | # Predict Q(1, W) 44 | tryCatch({ 45 | sl_pred = predict(tmle$q_model, q_df, onlySL = T) 46 | Q_hat = sl_pred$pred 47 | }, error = function(e) { 48 | print(e) 49 | print(tmle$q_model) 50 | #browser() 51 | stop("apply_tmle_to_validation() failed during prediction of Q(1, W).") 52 | }) 53 | 54 | if (verbose) cat("Bounding Q_hat to", tmle$stage1_Qbounds, "\n") 55 | Q_hat = .bound(Q_hat, tmle$stage1_Qbounds) 56 | 57 | if (min(Q_hat) < 0 || max(Q_hat) > 1) { 58 | cat("Error: predicted Q_hat outside of [0, 1] bounds.\n") 59 | #browser() 60 | } 61 | 62 | # Predict g 63 | tryCatch({ 64 | # Check specifically for a g_model that doesn't exist. 65 | if (is.null(tmle$g_model)) { 66 | stop("tmle$g_model has class = NULL") 67 | } 68 | sl_pred = predict(tmle$g_model, W, type = "response", onlySL = TRUE) 69 | g1W_hat = sl_pred$pred 70 | }, error = function(e) { 71 | print(e) 72 | print(tmle$g_model) 73 | #browser() 74 | stop("apply_tmle_to_validation() failed during prediction of g.") 75 | }) 76 | 77 | if (verbose) cat("Current range of g1W on test:", range(g1W_hat), "\n") 78 | 79 | # Truncate g1W_hat 80 | # TODO: double-check this with Alan. 81 | g1W_hat_truncated = .bound(g1W_hat, tmle$gbounds) 82 | if (verbose) cat("Truncating g1W on test using bounds:", tmle$gbounds, "\n") 83 | 84 | # Create clever covariate. 85 | # H1W = (A == 1) / g1W_hat_truncated 86 | # Based on Jeremy Coyle's CV-TMLE implementation. 87 | H1W = 1 / g1W_hat_truncated 88 | HAW = A * H1W 89 | 90 | # TODO: handle delta in some way? 91 | 92 | if (verbose) { 93 | cat("Mean Y_a on validation:", mean(Y[A == 1]), "\n") 94 | cat("Mean Y_star_a on validation:", mean(Y_star[A == 1]), "\n") 95 | cat("Mean Q_bar_a on validation:", mean(Q_hat[A == 1]), "\n") 96 | } 97 | 98 | #################### 99 | # Return results 100 | 101 | # We return Y_star rather than Y, for use in pooled fluctuation step. 102 | data = data.frame(Y_star = Y_star, A = A, Q_hat = Q_hat, 103 | g1W_hat = g1W_hat_truncated, 104 | H1W = H1W, 105 | HAW = HAW) 106 | 107 | results = data 108 | 109 | return(results) 110 | } 111 | -------------------------------------------------------------------------------- /R/cleanup-latex.R: -------------------------------------------------------------------------------- 1 | #' Clean up LaTeX files created by exportLatex 2 | #' 3 | #' This function removes LaTeX files that are typically created by the exportLatex() function. 
4 | #' It's designed to be used after exportLatex() calls to clean up temporary files. 5 | #' 6 | #' @param dir Directory where LaTeX files are located (default: current directory) 7 | #' @param outname Prefix for the LaTeX files (default: empty string) 8 | #' @param verbose If TRUE, print messages about which files were removed 9 | #' 10 | #' @return Invisibly returns a logical vector indicating which files were successfully removed 11 | #' 12 | #' @examples 13 | #' \dontrun{ 14 | #' # After calling exportLatex() 15 | #' exportLatex(vim) 16 | #' cleanup_latex_files() 17 | #' 18 | #' # With custom directory and prefix 19 | #' exportLatex(vim, outname = "myresults_", dir = "output/") 20 | #' cleanup_latex_files(dir = "output/", outname = "myresults_") 21 | #' } 22 | #' 23 | #' @export 24 | cleanup_latex_files <- function(dir = ".", outname = "", verbose = FALSE) { 25 | 26 | # Define the standard LaTeX file names that exportLatex() creates 27 | latex_files <- c( 28 | paste0(dir, "/", outname, "varimpByFold.tex"), 29 | paste0(dir, "/", outname, "varimpAll.tex"), 30 | paste0(dir, "/", outname, "varimpConsistent.tex") 31 | ) 32 | 33 | # Check which files exist 34 | existing_files <- latex_files[file.exists(latex_files)] 35 | 36 | if (length(existing_files) == 0) { 37 | if (verbose) { 38 | cat("No LaTeX files found to clean up.\n") 39 | } 40 | return(invisible(logical(0))) 41 | } 42 | 43 | if (verbose) { 44 | cat("Cleaning up LaTeX files:\n") 45 | cat(paste(" -", basename(existing_files), collapse = "\n"), "\n") 46 | } 47 | 48 | # Remove the files 49 | removal_success <- suppressWarnings({ 50 | file.remove(existing_files) 51 | }) 52 | 53 | if (verbose) { 54 | successful_removals <- sum(removal_success) 55 | cat("Successfully removed", successful_removals, "of", length(existing_files), "files.\n") 56 | } 57 | 58 | return(invisible(removal_success)) 59 | } -------------------------------------------------------------------------------- /R/compile-results.R: -------------------------------------------------------------------------------- 1 | compile_results = 2 | function(colnames_numeric, 3 | colnames_factor, 4 | vim_numeric, 5 | vim_factor, 6 | V, 7 | verbose = FALSE) { 8 | 9 | num_numeric = length(colnames_numeric) 10 | num_factor = length(colnames_factor) 11 | 12 | variable_types = c(rep("Ordered", num_numeric), rep("Factor", num_factor)) 13 | variable_names = c(colnames_numeric, colnames_factor) 14 | 15 | all_vims = c(vim_numeric, vim_factor) 16 | names(all_vims) = variable_names 17 | 18 | element_length = sapply(all_vims, length) 19 | 20 | # May increase this to 8, hence >= operator in following lines. 21 | expected_length = 8 22 | 23 | num_short_vars = sum(element_length < expected_length) 24 | if (verbose && num_short_vars > 0) { 25 | # Identify which variables do not have expected length. 26 | cat(num_short_vars, "variables did not contain >=", expected_length, 27 | "result elements. Removing those variables from the results.\n") 28 | # Likely missing EY1V, EY0V, labV, and fold_results. 29 | print(element_length[element_length < expected_length]) 30 | # browser() 31 | } 32 | 33 | # Restrict to vim results that have at least 7 elements. 34 | vim_combined = all_vims[element_length >= expected_length] 35 | variable_names = variable_names[element_length >= expected_length] 36 | variable_types = variable_types[element_length >= expected_length] 37 | 38 | # Set defaults for variables we want to return. 
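# These defaults map onto the list returned at the end of this function:
#   outres -> results_raw, outres.all -> results_all,
#   outres.cons -> results_consistent, outres.byV -> results_by_fold.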
39 | outres = outres.all = outres.cons = outres.byV = NULL 40 | 41 | # Get rid of any variables that have a validation sample with no 42 | # estimates of variable importance. 43 | if (length(vim_combined) == 0) { 44 | error_msg = "No variable importance estimates could be calculated due to sample size, etc." 45 | if (verbose) { 46 | cat(error_msg, "\n") 47 | #cat("Lengths:", element_length, "\n") 48 | } 49 | warning(error_msg) 50 | # TODO: also write output to the file in a separate function call. 51 | #write("No VIM's could be calculated due to sample size, etc", 52 | # file = "AllReslts.tex") 53 | } else { 54 | length_ey1 = sapply(vim_combined, function(x) { 55 | # In some cases class(x$EY1) is NULL, so this avoids a warning. 56 | if (is.null(x$EY1V)) { 57 | return(0) 58 | } 59 | ey1_vector = na.omit(x$EY1V) 60 | length(ey1_vector) 61 | }) 62 | 63 | valid_results = vim_combined[length_ey1 == V] 64 | variable_names = variable_names[length_ey1 == V] 65 | variable_types = variable_types[length_ey1 == V] 66 | 67 | 68 | if (length(valid_results) == 0) { 69 | error_msg = "No VIMs could be calculated due to sample size, etc." 70 | if (verbose) { 71 | cat(error_msg, "\n") 72 | cat("EY1 lengths:", length_ey1, "\n") 73 | } 74 | warning(error_msg) 75 | } else { 76 | 77 | # Order of results: 78 | # 1. EY1V, #2. EY0V, #3. thetaV, #4. varICV, #5, labV 79 | # #6. nV, #7. type, #8. name. 80 | 81 | theta = do.call(rbind, lapply(valid_results, function(x) x$thetaV)) 82 | 83 | theta_sum_na = apply(theta, 1, sum_na) 84 | results_no_na = valid_results[theta_sum_na == 0] 85 | variable_names = variable_names[theta_sum_na == 0] 86 | variable_types = variable_types[theta_sum_na == 0] 87 | 88 | # Extract the various components of the results. 89 | EY1 = do.call(rbind, lapply(results_no_na, function(x) x$EY1)) 90 | EY0 = do.call(rbind, lapply(results_no_na, function(x) x$EY0)) 91 | 92 | # Risk difference 93 | theta = do.call(rbind, lapply(results_no_na, function(x) x$thetaV)) 94 | varIC = do.call(rbind, lapply(results_no_na, function(x) x$varICV)) 95 | 96 | # Relative risk 97 | theta_rr = do.call(rbind, lapply(results_no_na, function(x) x$thetaV_rr)) 98 | varIC_log_rr = do.call(rbind, lapply(results_no_na, function(x) x$varICV_log_rr)) 99 | 100 | nV = do.call(rbind, lapply(results_no_na, function(x) x$nV)) 101 | n = sum(nV[1, ]) 102 | 103 | # SE: Risk difference 104 | SEV = sqrt(varIC / nV) 105 | 106 | # SE: relative risk 107 | SEV_log_rr = sqrt(varIC_log_rr / nV) 108 | 109 | # Get labels for each of the training samples. 110 | extract_labels = function(x, total_folds) { 111 | # Repeat each fold id twice, one for low level and once for high. 112 | labels = rep(1:total_folds, 2) 113 | # Re-order in ascending order. 114 | # TODO: isn't there another rep() function that would sort automatically? 115 | oo = order(labels) 116 | labels = labels[oo] 117 | # Convert labels from vector elements to columns in a single row. 118 | out = as.vector(t(x)) 119 | names(out) = paste0(rep(c("Low", "High"), total_folds), "_v", labels) 120 | out 121 | } 122 | 123 | # labV is result element 5. 124 | tst = lapply(results_no_na, function(x) x$labV) 125 | tst = lapply(tst, extract_labels, total_folds = V) 126 | labels = do.call(rbind, tst) 127 | 128 | # Parameteter: risk difference 129 | psi = apply(theta, 1, mean) 130 | # Each row is a variable and each column in a fold estimate. 
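# Averaging the per-fold IC variances gives the pooled variance used for inference
# below; roughly, with illustrative names:
#   se <- sqrt(mean(var_ic_across_folds) / n)   # pooled IC variance over n observations
#   ci <- psi + c(-1.96, 1.96) * se             # Wald 95% interval
#   p  <- 1 - pnorm(psi / se)                   # one-sided p-value, as noted below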
131 | meanvarIC = apply(varIC, 1, mean) 132 | SE = sqrt(meanvarIC / n) 133 | 134 | ci_lower = psi - 1.96 * SE 135 | ci_upper = psi + 1.96 * SE 136 | # 1-sided p-value 137 | pvalue = 1 - pnorm(psi / SE) 138 | 139 | # Parameter: relative risk 140 | psi_rr = apply(theta_rr, 1, mean) 141 | # Each row is a variable and each column in a fold estimate. 142 | meanvarIC_log_rr = apply(varIC_log_rr, 1, mean) 143 | se_log_rr = sqrt(meanvarIC_log_rr / n) 144 | # Calculate CI on log-RR scale. 145 | ci_lower_rr = exp(log(psi_rr) - 1.96 * se_log_rr) 146 | ci_upper_rr = exp(log(psi_rr) + 1.96 * se_log_rr) 147 | 148 | # TODO: double-check this. 149 | #pvalue_rr = 1 - pnorm(abs(log(psi_rr)) / se_log_rr) 150 | pvalue_rr = 1 - pnorm(log(psi_rr) / se_log_rr) 151 | 152 | 153 | # Number of significant digits. 154 | signif_digits = 3 155 | 156 | # TODO: provide ci_lower and ci_upper as separate elements. 157 | CI95 = paste0("(", signif(ci_lower, signif_digits), " - ", 158 | signif(ci_upper, signif_digits), ")") 159 | CI95_rr = paste0("(", signif(ci_lower_rr, signif_digits), " - ", 160 | signif(ci_upper_rr, signif_digits), ")") 161 | 162 | 163 | ##### FOR THETA (generalize to chi-square test?) 164 | # TT = (theta[,1] - theta[,2]) / sqrt(SEV[,1]^2 + SEV[,2]^2) 165 | # pval.comp=2*(1-pnorm(abs(TT))) FOR levels 166 | # (just make sure in same order) 167 | 168 | num_continuous = sum(variable_types == "Ordered") 169 | num_vars = length(variable_types) 170 | 171 | length.uniq = function(x) { 172 | length(unique(x)) 173 | } 174 | 175 | ################## 176 | # Ordered variables first 177 | cons = NULL 178 | if (num_continuous > 0) { 179 | dir = NULL 180 | # Check for consistency. 181 | for (i in 1:V) { 182 | lower_temp = labels[1:num_continuous, i * 2 - 1] 183 | xx = regexpr(",", lower_temp) 184 | lwr = as.numeric(substr(lower_temp, 2, xx - 1)) 185 | 186 | upper_temp = labels[1:num_continuous, i * 2] 187 | xx = regexpr(",", upper_temp) 188 | nx = nchar(upper_temp) 189 | uwr = as.numeric(substr(upper_temp, xx + 1, nx - 1)) 190 | 191 | dir = cbind(dir, uwr > lwr) 192 | } 193 | 194 | # For numeric variables, consistency means each fold finds the same directionality. 195 | cons = apply(dir, 1, length.uniq) 196 | } 197 | 198 | ################## 199 | # Factors 200 | num_factors = num_vars - num_continuous 201 | if (num_factors > 0) { 202 | lwr = NULL 203 | uwr = NULL 204 | for (i in 1:V) { 205 | # The 2 here is because we have the a_l and a_h labels, not because V = 2. 206 | lwr = cbind(lwr, labels[(num_continuous + 1):num_vars, i * 2 - 1]) 207 | uwr = cbind(uwr, labels[(num_continuous + 1):num_vars, i * 2]) 208 | } 209 | # Count have many unique levels are used for the lower bin - want it to be 1. 210 | conslwr = apply(lwr, 1, length.uniq) 211 | # Count how many unique levels are used for the upper bin - want it to be 1. 212 | consupr = apply(uwr, 1, length.uniq) 213 | # If conslwr * consupr == 1 then the variable is consistent. 214 | cons = c(cons, conslwr * consupr) 215 | } 216 | # consist= (cons==1 & pval.comp > 0.05) 217 | signsum = function(x) { 218 | sum(sign(x)) 219 | } 220 | 221 | # Consistent results need to have all positive all negative thetas, 222 | # And use the same factor levels for the low and high bins in each CV fold. 223 | # CK: but really, shouldn't they all be positive? May want to remove abs() 224 | consist = cons == 1 & abs(apply(theta, 1, signsum)) == V 225 | 226 | procedures = c("Holm", "BH") 227 | if (num_vars > 1) { 228 | # Adjust p-values for multiple testing. 
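# multtest::mt.rawp2adjp() returns a list with $adjp (raw plus adjusted p-values,
# rows sorted by increasing raw p-value) and $index (the permutation back to the
# original variable order); the res$adj references below reach $adjp via R's
# partial matching of the $ operator.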
229 | res = multtest::mt.rawp2adjp(pvalue, procedures) 230 | res_rr = multtest::mt.rawp2adjp(pvalue_rr, procedures) 231 | 232 | # Attempt to prepend these names with "rr_" for the relative risk version. 233 | colnames(res_rr$adj) = paste0("rr_", colnames(res_rr$adj)) 234 | 235 | # This indexing sorts the results in ascending order of unadjusted p-value, 236 | # then descending by impact estimate. 237 | # TODO: this may need to be fixed. 238 | #sorted_rows = base::order(res$index, -psi) 239 | sorted_rows = res$index 240 | 241 | #browser() 242 | outres = data.frame(var_type = variable_types[sorted_rows], 243 | theta[sorted_rows, , drop = FALSE], 244 | psi[sorted_rows], 245 | CI95[sorted_rows], 246 | res$adj, 247 | labels[sorted_rows, , drop = FALSE], 248 | # Relative risk parameter. 249 | "AvePsi_rr" = psi_rr[sorted_rows], 250 | "CI95_rr" = CI95_rr[sorted_rows], 251 | res_rr$adj, 252 | # Consistency 253 | consist[sorted_rows]) 254 | } else if (num_vars == 1) { 255 | # No need for multiple testing adjustment. 256 | # TODO: just integrate into previous part? 257 | outres = data.frame(var_type = variable_types, 258 | theta, 259 | psi, 260 | CI95, 261 | rawp = pvalue, 262 | Holm = pvalue, 263 | BH = pvalue, 264 | labels, 265 | # TODO: implement these. 266 | "AvePsi_rr" = NA, 267 | "CI95_rr" = NA, 268 | "rr_rawp" = NA, 269 | "rr_Holm" = NA, 270 | "rr_BH" = NA, 271 | consist) 272 | } else { 273 | outres = NULL 274 | } 275 | 276 | # TODO: this will give an error if we have no results. 277 | 278 | # Restrict to variables that aren't missing their p-value. 279 | outres = outres[!is.na(outres[, "rawp"]), , drop = FALSE] 280 | #print(colnames(outres)) 281 | #print(ncol(outres)) 282 | 283 | #names(outres)[1:(1 + 2*V)] = c("VarType", paste0("Est_v", 1:V), "AvePsi", "CI95") 284 | names(outres)[1:(3 + V)] = c("VarType", paste0("Est_v", 1:V), "AvePsi", "CI95") 285 | 286 | #names(outres)[(9 + 2 * V)] = "Consistent" 287 | names(outres)[ncol(outres)] = "Consistent" 288 | 289 | #print(colnames(outres)) 290 | #print(ncol(outres)) 291 | 292 | # Save rownames, because dplyr will discard them. 293 | outres$varname = rownames(outres) 294 | 295 | # Sort by raw p-value, then BH p-value, then effect size. 296 | outres = outres %>% dplyr::arrange(rawp, BH, desc(AvePsi)) %>% as.data.frame() 297 | 298 | # Restore rownames. 299 | rownames(outres) = outres$varname 300 | outres$varname = NULL 301 | 302 | # drops = c('VarType','description','Holm,') 303 | # outres.all=outres[,!(names(outres) %in% drops)] 304 | # We want to extract the per-fold psi estimates, per-fold levels, and consistency flag. 305 | # TODO: update for relative risk parameter. 306 | outres.byV = outres[, c(2:(2 + V - 1), (7 + V):(7 + 3 * V)), drop = FALSE] 307 | 308 | outres.all = outres[, c("VarType", 309 | # Risk difference: 310 | "AvePsi", "CI95", "rawp", "BH", 311 | # Relative risk: 312 | "AvePsi_rr", "CI95_rr", "rr_rawp", "rr_BH", 313 | "Consistent"), drop = FALSE] 314 | 315 | colnames(outres.all) = c("Type", 316 | # Risk difference: 317 | "Estimate", "CI95", "P-value", "Adj. p-value", 318 | # Relative risk: 319 | "Est. RR", "CI95 RR", "P-value RR", "Adj. p-value RR", 320 | "Consistent") 321 | 322 | ################ 323 | # Get Consistency Measure and only significant 324 | # TODO: Make BH cut-off flexible in future versions (default at 0.05) 325 | outres.cons = outres.all[outres[, "BH"] < 0.05 & 326 | outres$Consistent, , drop = FALSE] 327 | outres.cons = subset(outres.cons, select = -c(Consistent)) 328 | } 329 | } 330 | 331 | # Return results. 
332 | results = list(results_consistent = outres.cons, 333 | results_all = outres.all, 334 | results_by_fold = outres.byV, 335 | results_raw = outres, 336 | all_vims = all_vims) 337 | 338 | 339 | return(results) 340 | } 341 | -------------------------------------------------------------------------------- /R/create_cv_folds.R: -------------------------------------------------------------------------------- 1 | #' Stratified CV to insure balance (by one grouping variable, Y) 2 | #' 3 | #' @param V number of folds 4 | #' @param Y Outcome variable. If binary will be used for stratification. 5 | #' @param verbose If T will display extra output. 6 | #' 7 | #' @return Vector of fold assignments. 8 | #' 9 | #' @importFrom cvTools cvFolds 10 | create_cv_folds = function(V, Y, verbose = F) { 11 | Ys = unique(Y) 12 | nys = length(Ys) 13 | nn = length(Y) 14 | # Binary outcome so we can do stratified fold generation. 15 | if (nys == 2) { 16 | out = rep(NA, nn) 17 | for (i in 1:nys) { 18 | # Record how many observations have this Y value. 19 | n = sum(Y == Ys[i]) 20 | folds = cvTools::cvFolds(n, K = V, R = 1, type = "random")$which 21 | #if (verbose) { 22 | # cat("Y", i, "is", Ys[i], "count:", sum(Y == Ys[i]), "n=", n, "fold length:", 23 | # length(folds), "\n") 24 | #} 25 | out[Y == Ys[i]] = folds 26 | } 27 | if (verbose) { 28 | cat("Cross-validation fold breakdown:\n") 29 | print(table(Y, "Fold"=out, useNA="ifany")) 30 | } 31 | } else { 32 | # More than 2 Ys, so don't stratify. 33 | xx = cvTools::cvFolds(nn, K = V, R = 1, type = "random")$which 34 | out = xx 35 | } 36 | return(out) 37 | } 38 | -------------------------------------------------------------------------------- /R/estimate_pooled_results.R: -------------------------------------------------------------------------------- 1 | estimate_pooled_results = function(fold_results, 2 | fluctuation = "logistic", 3 | verbose = FALSE) { 4 | # Fold results is a list with test results from each fold. 5 | 6 | # Each fold result should have at least this element: 7 | # val_preds dataframe, with Y_star, g, Q, H. 8 | # TODO: need to change this check, it doesn't work correctly. 9 | #num_fails = sum(is.null(sapply(fold_results, `[[`, "level"))) 10 | #if (verbose) { 11 | # cat("Number of fold failures:", num_fails, "of", length(fold_results), "\n") 12 | #} 13 | 14 | # Placeholder results to return in case of error. 15 | results = list( 16 | thetas = NULL, 17 | influence_curves = NULL, 18 | epsilon = NULL 19 | ) 20 | 21 | #if (num_fails == length(fold_results)) { 22 | # Every fold failed. 23 | # if (verbose) cat("Error: every fold failed.\n") 24 | # return(results) 25 | #} 26 | 27 | # browser() 28 | 29 | # Extract the results from each CV-TMLE fold and rbind into a single dataframe. 30 | data = do.call(rbind, lapply(1:length(fold_results), function(i) { 31 | fold = fold_results[[i]] 32 | # Save the fold number so we can use it to generate fold-specific estimates. 33 | if (is.null(fold$val_preds)) { 34 | # Skip folds that failed. 35 | NULL 36 | } else { 37 | # val_preds is a dataframe with columns: Y_star, g, Q, H 38 | df = cbind(fold$val_preds, fold_num = i) 39 | df 40 | } 41 | })) 42 | 43 | if (is.null(data)) { 44 | # Every fold failed. 
45 | if (verbose) cat("Error: every fold failed.\n") 46 | return(results) 47 | } 48 | 49 | if (min(data$Q_hat) < 0 || max(data$Q_hat) > 1) { 50 | cat("Error: some predicted values of Q_hat are out of bounds.", 51 | "They should be in [0, 1].\n") 52 | print(summary(data$Q_hat)) 53 | browser() 54 | } 55 | 56 | # Set some default values in case of a future error. 57 | thetas = NULL 58 | influence_curves = NULL 59 | epsilon = NULL 60 | 61 | 62 | if (!is.null(data)) { 63 | n = nrow(data) 64 | 65 | # If Y is binary, take logit of Q. 66 | #if (length(unique(data$Y)) == 2) { 67 | 68 | # Look at thetas prior to fluctuation. 69 | pre_thetas = tapply(data$Q_hat, data$fold_num, mean, na.rm = TRUE) 70 | if (verbose) cat("Pre-fluctuation thetas:", pre_thetas, "\n") 71 | 72 | # If Q is binary or continuous we still want to take logit of predicted values. 73 | # See tmle::estimateQ where it does this after predicting Q. 74 | data$logit_Q_hat = try(stats::qlogis(data$Q_hat)) 75 | if (inherits(data$logit_Q_hat, "try-error")) { 76 | cat("Error in estimate_pooled_results() with qlogis()\n") 77 | print(summary(data$Q_hat)) 78 | browser() 79 | } 80 | #} 81 | 82 | # Estimate epsilon 83 | if (verbose) cat("Estimating epsilon: ") 84 | 85 | if (fluctuation == "logistic") { 86 | suppressWarnings({ 87 | #epsilon = coef(glm(Y_star ~ -1 + offset(logit_Q_hat) + H1W, 88 | #epsilon = coef(glm(Y_star ~ -1 + offset(logit_Q_hat) + HAW, 89 | # data = data, family = "binomial")) 90 | reg = try(stats::glm(Y_star ~ -1 + stats::offset(logit_Q_hat) + HAW, 91 | data = data, family = "binomial")) 92 | if ("try-error" %in% class(reg)) { 93 | cat("Error in epsilon regression.\n") 94 | browser() 95 | } 96 | epsilon = try(stats::coef(reg)) 97 | }) 98 | # Use more stable version where clever covariate is the weight, and now we 99 | # have an intercept. Causal 2, Lecture 3, slide 51. 100 | # We have to suppressWarnings about "non-integrate #successes in binomial glm". 101 | #suppressWarnings({ 102 | # Catch an error if one occurs here. 103 | #epsilon = try(coef(glm(Y_star ~ offset(logit_Q_hat), 104 | # epsilon = try(coef(glm(Y_star ~ ., 105 | # offset = logit_Q_hat, 106 | # weights = H1W, 107 | # data = data, family = "binomial"))) 108 | #}) 109 | if (verbose) cat(epsilon, "\n") 110 | } else { 111 | # No need to support linear fluctuation as it does not respect model bounds. 112 | stop("Only support logistic fluctuation currently.") 113 | # TBD. 114 | } 115 | 116 | if ("try-error" %in% class(epsilon)) { 117 | if (verbose) cat("Error when estimating epsilon.\n") 118 | print(summary(data$Y_star)) 119 | browser() 120 | } else { 121 | 122 | if (verbose) cat("Fluctuating Q_star\n") 123 | 124 | # Fluctuate Q to get Q_star 125 | Q_star = data$logit_Q_hat + epsilon * data$H1W 126 | #Q_star = data$logit_Q_hat + epsilon * data$HAW 127 | 128 | if (verbose) cat("Transforming Q_star\n") 129 | #if (length(unique(data$Y)) == 2) { 130 | Q_star = plogis(Q_star) 131 | #} 132 | 133 | if (verbose) cat("Estimating per-fold thetas: ") 134 | 135 | # Estimate treatment-specific mean parameter on every validation fold. 136 | thetas = tapply(Q_star, data$fold_num, mean, na.rm = TRUE) 137 | if (verbose) cat(thetas, "\n") 138 | 139 | # Take average across folds to get final estimate. 140 | #theta = mean(thetas) 141 | 142 | # Move Q_star into the data so that it can be analyzed per-fold. 
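# Recap of the pooled CV-TMLE targeting above: one epsilon is fit on the stacked
# validation predictions (Y_star regressed on the clever covariate HAW with offset
# logit(Q_hat)), Q_star = plogis(logit(Q_hat) + epsilon * H1W) is the targeted
# prediction, and the per-fold thetas are the fold-specific means of Q_star.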
143 | data$Q_star = Q_star 144 | rm(Q_star) 145 | 146 | if (verbose) cat("Calculating per-fold influence curves\n") 147 | 148 | # Get influence curve per fold - for treatment-specific mean. 149 | # Influence_curves here is a list, where each element is a result. 150 | # We can't convert to a matrix because lengths are different. 151 | # TODO: figure out why this can generate NaNs 152 | influence_curves = base::by(data, data$fold_num, function(fold_data) { 153 | if (F && verbose) { 154 | with(fold_data, 155 | cat("A:", length(A), "g1W_hat:", length(g1W_hat), "Y_star:", length(Y_star), 156 | "Q_star:", length(Q_star), "\n")) 157 | } 158 | #with(fold_data, (A / g1W_hat) * (Y - Q_star) + Q_star - theta) 159 | result = with(fold_data, (A / g1W_hat) * (Y_star - Q_star) + 160 | Q_star - mean(Q_star, na.rm = TRUE)) 161 | #if (verbose) cat("Result:", class(result), "Length:", length(result), "\n") 162 | result 163 | }) 164 | 165 | # Check for NaNs. 166 | num_nans = sum(sapply(influence_curves, function(curve) sum(is.nan(curve)))) 167 | if (num_nans > 0) { 168 | if (verbose) { 169 | cat("Error: influence curves contain", num_nans, "NaNs.\n") 170 | cat("g1W_hat zeros:", sum(data$g1W_hat == 0), "\n") 171 | } 172 | } 173 | 174 | #if (verbose) cat("IC class:", class(influence_curves), "\n") 175 | 176 | # Old version: 177 | #influence_curve = with(data, (A / g1W_hat) * (Y - Q_star) + Q_star - theta) 178 | 179 | # Calculate standard error. 180 | #std_err = stats::var(influence_curves) / n 181 | } 182 | } 183 | 184 | if (is.null(thetas)) { 185 | # All folds must have failed. 186 | if (verbose) cat("No pooled results. All folds seemed to have failed.\n") 187 | } 188 | 189 | # Compile results 190 | results = list( 191 | #theta = theta, 192 | thetas = thetas, 193 | influence_curves = influence_curves, 194 | #std_err = std_err, 195 | epsilon = epsilon 196 | ) 197 | 198 | return(results) 199 | } 200 | -------------------------------------------------------------------------------- /R/estimate_tmle.R: -------------------------------------------------------------------------------- 1 | #' Get TMLE estimate: E[Y | A = 1, W]. 2 | #' 3 | #' @param Y Outcome variable 4 | #' @param A Treatment indicator 5 | #' @param W Dataframe of adjustment covariates 6 | #' @param family Binomial or gaussian 7 | #' @param delta Indicator of missing outcome or treatment assignment. 1 - observed, 0 - missing. 8 | #' @param Q.lib SuperLearner library for estimating Q (potential outcome) 9 | #' @param g.lib SuperLearner library for estimating g (propensity score) 10 | #' @param verbose If true output extra information during execution. 11 | #' @importFrom tmle tmle 12 | estimate_tmle = function(Y, 13 | A, 14 | W, 15 | family, 16 | delta = NULL, 17 | Q.lib, 18 | g.lib, 19 | verbose = F) { 20 | 21 | if (!family %in% c("binomial", "gaussian")) { 22 | stop('Estimate_tmle: family must be either "binomial" or "gaussian".') 23 | } 24 | 25 | # Because of quirk of program, delete observations with delta=0 if #>0 26 | # & < 10 27 | n = length(Y) 28 | inc = rep(TRUE, n) 29 | 30 | if (!is.null(delta)) { 31 | ss = sum(delta == 0) 32 | if (ss > 0 & ss < 10) { 33 | inc[delta == 0] = FALSE 34 | } 35 | } 36 | 37 | Y = Y[inc] 38 | A = A[inc] 39 | W = W[inc, , drop = F] 40 | 41 | delta = delta[inc] 42 | 43 | # Here we are using tmle but not using the treatment effect estimate. 44 | # We're actually using the underlying variables to estimate Y_a. 
45 | tmle.1 = tmle::tmle(Y, A, W, Delta = delta, g.SL.library = g.lib, 46 | Q.SL.library = Q.lib, family = family, verbose = verbose) 47 | 48 | # Propensity score for treatment. 49 | g1 = tmle.1$g$g1W 50 | 51 | # Unit's estimated outcome under treatment: hat(Y) | A = 1, W 52 | # This is after the fluctuation step, so it is targeted. 53 | Qst = tmle.1$Qstar[, 2] 54 | 55 | # E[Y | A = 1, W] 56 | theta = mean(Qst) 57 | 58 | # Influence curve 59 | IC = (A / g1) * (Y - Qst) + Qst - theta 60 | 61 | # Compile results. 62 | result = list(theta = theta, IC = IC) 63 | 64 | return(result) 65 | } 66 | -------------------------------------------------------------------------------- /R/estimate_tmle2.R: -------------------------------------------------------------------------------- 1 | #' Get TMLE estimate: E[Y | A = 1, W]. 2 | #' 3 | #' @param Y Outcome variable 4 | #' @param A Treatment indicator 5 | #' @param W Dataframe of adjustment covariates 6 | #' @param family Outcome family - binomial or gaussian 7 | #' @param delta Indicator of missing outcome or treatment assignment. 1 - observed, 0 - missing. 8 | #' @param Q.lib SuperLearner library for estimating Q (potential outcome) 9 | #' @param g.lib SuperLearner library for estimating g (propensity score) 10 | #' @param id Optional subject-level identifier. 11 | #' @param Qbounds Bounds on Q 12 | #' @param gbound Bounds on G 13 | #' @param alpha TBD, from TMLE package 14 | #' @param fluctuation Only logistic is currently supported. 15 | #' @param V Number of folds for SuperLearner 16 | #' @param verbose If true output extra information during execution. 17 | #' @importFrom tmle tmle 18 | #' @importFrom stats as.formula binomial coef glm plogis poisson predict qlogis 19 | #' @importFrom utils packageDescription 20 | #' @export 21 | estimate_tmle2 = 22 | function(Y, 23 | A, 24 | W, 25 | family, 26 | delta = rep(1, length(Y)), 27 | Q.lib, 28 | g.lib, 29 | id = 1:length(Y), 30 | Qbounds = NULL, 31 | gbound = 0.025, 32 | alpha = 0.995, 33 | fluctuation = "logistic", 34 | V = 10, 35 | verbose = F) { 36 | 37 | if (!family %in% c("binomial", "gaussian")) { 38 | stop('Estimate_tmle: family must be either "binomial" or "gaussian".') 39 | } 40 | 41 | # Because of quirk of program, delete observations with delta=0 if #>0 42 | # & < 10 43 | n = length(Y) 44 | inc = rep(T, n) 45 | 46 | # TODO: revisit this decision. 47 | if (!is.null(delta)) { 48 | num_missing = sum(delta == 0) 49 | if (num_missing > 0 && num_missing < 10) { 50 | inc[delta == 0] = F 51 | } 52 | } 53 | 54 | if (length(dim(W)) != 2) { 55 | stop("Error: W should have two dimensions. Instead its dimensions are:", paste(dim(W)), "\n") 56 | } 57 | 58 | Y = Y[inc] 59 | A = A[inc] 60 | W = W[inc, , drop = F] 61 | 62 | delta = delta[inc] 63 | 64 | # Check for any remaining missing data. 65 | # It is technically ok for Y or A to include missingness, because the 66 | # delta estimation is intended to estimate the missingness mechanism. 67 | # This needs more testing though. 
68 | 69 | missing_vals = sum(is.na(W)) 70 | if (missing_vals != 0) { 71 | cat("Warning: found", missing_vals, "NAs in W.\n") 72 | 73 | na_sum = sapply(W, function(col) sum(is.na(col))) 74 | cat("Columns with NAs:\n") 75 | print(na_sum[na_sum > 0]) 76 | } 77 | 78 | missing_vals = sum(is.na(A)) 79 | if (missing_vals != 0) { 80 | cat("Warning: found", missing_vals, "NAs in A.\n") 81 | } 82 | 83 | missing_vals = sum(is.na(Y[delta])) 84 | if (missing_vals != 0) { 85 | cat("Warning: found", missing_vals, "NAs in Y.\n") 86 | } 87 | 88 | # Here we are using tmle but not using the treatment effect estimate. 89 | # We're actually using the underlying variables to estimate Y_a. 90 | # TODO: disable this call, we're just running it now to double-check 91 | # the custom results. 92 | if (F) { 93 | tmle.1 = tmle::tmle(Y, A, W, Delta = delta, g.SL.library = g.lib, 94 | Q.SL.library = Q.lib, family = family, verbose = verbose) 95 | } else { 96 | tmle.1 = NULL 97 | } 98 | 99 | if (verbose) { 100 | cat("Estimating g. A distribution:\n") 101 | print(table(A)) 102 | } 103 | 104 | min_g_cell = min(table(A)) 105 | g_V = V 106 | if (min_g_cell < V) { 107 | g_V = max(min_g_cell, 2) 108 | if (verbose) { 109 | cat("A's minimum cell sizes", min_g_cell, "is less than the number of CV", 110 | "folds", V, "\n. Reducing folds to", g_V, "\n") 111 | } 112 | } 113 | 114 | # Using modified version of tmle::estimateG 115 | #g_model = SuperLearner::SuperLearner(Y = A, X = W, SL.library = g.lib, 116 | #g = tmle_estimate_g(d = cbind(A[delta == 1], W[delta == 1, ]), 117 | g = tmle_estimate_g(d = cbind(A, W), 118 | SL.library = g.lib, 119 | verbose = verbose, 120 | V = g_V, 121 | outcome = "A", 122 | message = "g") 123 | 124 | # Handle gBounds - code from tmle::tmle(). 125 | if (length(gbound) == 1) { 126 | if (length(unique(A)) == 1) { 127 | # EY1 only, no controlled direct effect 128 | gbound = c(gbound, 1) 129 | } else { 130 | gbound = c(gbound, 1 - gbound) 131 | } 132 | } 133 | g$bound = gbound 134 | 135 | # Propensity score for treatment. 136 | #g1 = tmle.1$g$g1W 137 | g1 = g$g1W 138 | 139 | # Check if these two are highly correlated. 140 | if (!is.null(tmle.1) && verbose) { 141 | # This will generate a warning of SD is zero for either vector. 142 | # If this is the case we'll see an NA here. 143 | suppressWarnings(cat("Correlation of custom g to tmle-based g:", 144 | stats::cor(tmle.1$g$g1W, g1), "\n")) 145 | } 146 | 147 | # This is copied from within tmle::tmle() 148 | map_to_ystar = fluctuation == "logistic" 149 | 150 | if (verbose) cat("TMLE init stage1\n") 151 | 152 | # Run tmle stage 1 - this is basically just bounding & transforming Y. 153 | stage1 = tmle_init_stage1(Y = Y, Q = NULL, 154 | A = A, 155 | Delta = delta, 156 | alpha = alpha, 157 | Qbounds = Qbounds, 158 | maptoYstar = map_to_ystar, 159 | family = family) 160 | 161 | if (verbose) cat("TMLE q\n") 162 | 163 | # Estimate Qinit 164 | # cvQinit = F by default, meaning that we don't need CV.SuperLearner. 165 | q = tmle_estimate_q(Y = stage1$Ystar, 166 | A = A, 167 | W = W, 168 | Q = stage1$Q, 169 | Delta = delta, 170 | SL.library = Q.lib, 171 | family = family, 172 | V = V, 173 | verbose = verbose, 174 | maptoYstar = map_to_ystar, 175 | Qbounds = stage1$Qbounds) 176 | 177 | # Convert from a matrix to a df, so we can use $ to access elements. 178 | # Nevermind, this breaks the plogis code. 179 | # q$Q = data.frame(q$Q) 180 | 181 | # Check for all NaN in QAW. 
182 | stopifnot(mean(is.nan(q$Q[, "QAW"])) == 0) 183 | 184 | # cat(class(q$Q), paste(dim(q$Q)), paste(colnames(q$Q)), "\n") 185 | # TODO: check if our custom q$QAW equals the tmle Q. 186 | 187 | # Specify random arguments from tmle::tmle() 188 | pDelta1 = NULL 189 | g.Deltaform = NULL 190 | 191 | # From tmle::tmle() 192 | ############################################ 193 | if (verbose) cat("Estimating g.Delta (missingness mechanism)\n") 194 | 195 | g.z <- NULL 196 | g.z$type="No intermediate variable" 197 | g.z$coef=NA 198 | g.Delta <- suppressWarnings({ 199 | tmle_estimate_g(d = data.frame(delta, Z=1, A, W), 200 | pDelta1, 201 | g.Deltaform, 202 | g.lib, 203 | id = id, V = V, 204 | verbose = verbose, 205 | message = "missingness mechanism", 206 | outcome="D") 207 | }) 208 | g1W.total <- .bound(g$g1W*g.Delta$g1W[,"Z0A1"], gbound) 209 | if (sum(is.na(g1W.total)) > 0) { 210 | if (verbose) { 211 | cat("Error, g1W.total has NAs:", sum(is.na(g1W.total)), "\n") 212 | } 213 | } 214 | g0W.total <- .bound((1-g$g1W)*g.Delta$g1W[,"Z0A0"], gbound) 215 | if(all(g1W.total==0)){g1W.total <- rep(10^-9, length(g1W.total))} 216 | if(all(g0W.total==0)){g0W.total <- rep(10^-9, length(g0W.total))} 217 | H1W <- A/g1W.total 218 | H0W <- (1-A)/g0W.total 219 | 220 | if (verbose) cat("Estimating epsilon\n") 221 | 222 | suppressWarnings( 223 | epsilon <- coef(glm(stage1$Ystar~-1 + offset(q$Q[,"QAW"]) + H0W + H1W, family=q$family, subset=delta==1)) 224 | ) 225 | epsilon[is.na(epsilon)] <- 0 # needed for EY1 calculation 226 | Qstar <- q$Q + c((epsilon[1]*H0W + epsilon[2]*H1W), epsilon[1]/g0W.total, epsilon[2]/g1W.total) 227 | colnames(Qstar) <- c("QAW", "Q0W", "Q1W") 228 | Ystar <- stage1$Ystar 229 | if (map_to_ystar) { 230 | Qstar <- plogis(Qstar)*diff(stage1$ab)+stage1$ab[1] 231 | Qstar <- plogis(Qstar)*diff(stage1$ab)+stage1$ab[1] 232 | q$Q <- plogis(q$Q)*diff(stage1$ab)+stage1$ab[1] 233 | Ystar <- Ystar*diff(stage1$ab)+stage1$ab[1] 234 | } 235 | colnames(q$Q) <- c("QAW", "Q0W", "Q1W") 236 | q$Q <- q$Q[,-1] 237 | 238 | if (verbose) cat("tmle::calcParameters\n") 239 | res <- tmle::calcParameters(Ystar, A, I.Z=rep(1, length(Ystar)), delta, g1W.total, g0W.total, Qstar, 240 | mu1=mean(Qstar[,"Q1W"]), mu0=mean(Qstar[,"Q0W"]), id, family, 241 | obsWeights=rep(1, length(Ystar))) 242 | 243 | #returnVal <- list(estimates=res, Qinit=Q, g=g, g.Z=g.z, g.Delta=g.Delta, Qstar=Qstar[,-1], epsilon=epsilon) 244 | #class(returnVal) <- "tmle" 245 | ############################################ 246 | 247 | 248 | 249 | 250 | # Calculate Qstar 251 | #QbarAW_star = plogis(qlogis(q$Q$QAW) + epsilon * h_aw) 252 | #Qbar1W_star = plogis(qlogis(q$Q$Q1W) + epsilon * h_1w) 253 | #Qbar0W_star = plogis(qlogis(q$Q$Q0W) + epsilon * h_0w) 254 | 255 | # Unit's estimated outcome under treatment: hat(Y) | A = 1, W 256 | # This is after the fluctuation step, so it is targeted. 257 | # Qst = tmle.1$Qstar[, 2] 258 | # Qst = tmle.1$Qstar$Q1W 259 | # Qst = Qbar1W_star 260 | Qst = Qstar[, "Q1W"] 261 | 262 | # E[Y | A = 1, W] 263 | theta = mean(Qst) 264 | 265 | # Influence curve 266 | IC = (A / g1) * (Y - Qst) + Qst - theta 267 | 268 | # Compile results. 269 | result = list(theta = theta, IC = IC, g_model = g$model, q_model = q$model, 270 | tmle = tmle.1, alpha = alpha, 271 | Qbounds = Qbounds, 272 | stage1_Qbounds = stage1$Qbounds, 273 | gbounds = g$bound, 274 | V = V, 275 | # May have had to reduce the # of SL folds for g, due to sparsity 276 | # in A. 
277 | g_V = g_V, 278 | fluctuation = fluctuation, 279 | map_to_ystar = map_to_ystar, ab = stage1$ab) 280 | 281 | return(result) 282 | } 283 | 284 | -------------------------------------------------------------------------------- /R/exportLatex.R: -------------------------------------------------------------------------------- 1 | 2 | #' Export varimpact results as Latex tables 3 | #' 4 | #' Outputs results from varimpact() into three Latex tables: consistent results, 5 | #' all results, and per-fold results. 6 | #' 7 | #' Creates three Latex table files: 8 | #' \itemize{ 9 | #' \item varimpConsistent.tex - the ``consistent'' significant results, meaning those with consistent categories chosen as comparison groups among factors and consistent ordering for numeric variables. 10 | #' \item varimpAll.tex - the file with cross-validated average variable impacts ordered by statistical significance. 11 | #' \item varimpByV.tex - the comparison levels used within each validation sample. Either integer ordering of factors or short-hand for percentile cut-off (0-1 is the 10th percentile, 10+ is the 100th percentile) 12 | #' } 13 | #' 14 | #' @param impact_results Result object from previous varimpact() call. 15 | #' @param outname (Optional) String that is prepended to filenames. 16 | #' @param dir (Optional) Directory to save the results, defaults to current directory. 17 | #' @param digits Digits to round numbers, passed through to xtable. 18 | #' @param ... Additional parameters passed to print.xtable(). 19 | #' 20 | #' @seealso 21 | #' \code{\link[varimpact]{varimpact}} 22 | #' 23 | #' @export 24 | # TODO: document return object. 25 | exportLatex = function(impact_results, outname = "", dir = ".", digits = 4, ...) { 26 | 27 | # Check if results are valid 28 | if (is.null(impact_results$results_by_fold) || is.null(impact_results$results_all)) { 29 | warning("Cannot export LaTeX: varimpact results are NULL or incomplete") 30 | return(invisible(NULL)) 31 | } 32 | 33 | table_byfold = cbind("Variable" = rownames(impact_results$results_by_fold), 34 | impact_results$results_by_fold) 35 | 36 | xtable_byfold = xtable::xtable(table_byfold, 37 | caption = "Variable Importance Results By Estimation Sample", 38 | label = "byFold", 39 | digits = digits) 40 | 41 | print(xtable_byfold, 42 | type = "latex", 43 | file = paste0(dir, "/", outname, "varimpByFold.tex"), 44 | caption.placement = "top", 45 | include.rownames = F, 46 | ...) 47 | 48 | 49 | # Use hline.after to add a line after the p = 0.05 cut-off. 50 | signif_cutoff = which(impact_results$results_all[, "Adj. p-value"] > 0.05) 51 | if (length(signif_cutoff) > 0) { 52 | signif_row = min(signif_cutoff) - 1 53 | hline.after = c(-1,0, signif_row, nrow(impact_results$results_all)) 54 | } else { 55 | # All variables are important. 56 | hline.after = NULL 57 | } 58 | 59 | table_all = cbind("Rank" = 1:nrow(impact_results$results_all), 60 | "Variable" = rownames(impact_results$results_all), 61 | impact_results$results_all) 62 | 63 | xtable_all = xtable::xtable(table_all, 64 | caption = "Variable Importance Results For Combined Estimates", 65 | label = "allRes", 66 | digits = digits) 67 | 68 | print(xtable_all, 69 | type = "latex", 70 | file = paste0(dir, "/", outname, "varimpAll.tex"), 71 | caption.placement = "top", 72 | include.rownames = F, 73 | hline.after = hline.after, 74 | ...) 
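  # At this point the by-fold and combined tables have been written; the consistent
  # table follows. In total this function writes three files (later removable with
  # cleanup_latex_files()): <outname>varimpByFold.tex, <outname>varimpAll.tex, and
  # <outname>varimpConsistent.tex.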
75 | 76 | if (nrow(impact_results$results_consistent) > 0) { 77 | table_consistent = cbind("Rank" = 1:nrow(impact_results$results_consistent), 78 | "Variable" = rownames(impact_results$results_consistent), 79 | impact_results$results_consistent) 80 | xtable_consistent = xtable::xtable(table_consistent, 81 | caption = "Subset of of Significant and ``Consistent'' Results", 82 | label = "consisRes", 83 | digits = digits) 84 | } else { 85 | # Create a blank dataframe. 86 | table_consistent = data.frame() 87 | # Create a blank xtable. 88 | xtable_consistent = NULL 89 | } 90 | 91 | print(xtable_consistent, 92 | type = "latex", 93 | file = paste0(dir, "/", outname, "varimpConsistent.tex"), 94 | caption.placement = "top", 95 | include.rownames = F, 96 | ...) 97 | 98 | # Return a list with the output results. 99 | results = list(tables = list( 100 | consistent = table_consistent, 101 | all = table_all, 102 | byfold = table_byfold 103 | ), 104 | xtables = list( 105 | consistent = xtable_consistent, 106 | all = xtable_all, 107 | byfold = xtable_byfold 108 | )) 109 | 110 | 111 | 112 | return(invisible(results)) 113 | } 114 | -------------------------------------------------------------------------------- /R/factors_to_indicators.R: -------------------------------------------------------------------------------- 1 | #' Convert a dataframe of factor into separate indicators. 2 | #' 3 | #' More details 4 | #' 5 | #' @param factor_df Dataframe consisting only of factor variables. 6 | #' @param miss_name_prefix Starting name for each missing indicator. 7 | #' @param verbose Set to T for detailed output. 8 | #' 9 | #' @return A list of results. 10 | #' 11 | #' @export 12 | factors_to_indicators = function(factor_df, miss_name_prefix = "Imiss_", 13 | verbose = F) { 14 | 15 | # TODO: confirm that factor_df contains only factor variables. 16 | 17 | sum_nas = apply(factor_df, 2, sum_na) 18 | 19 | if (verbose) cat("Factors with missingness:", sum(sum_nas > 0), "\n") 20 | 21 | facnames = names(factor_df) 22 | 23 | nam.fac = function(x, name) { 24 | num_chars = nchar(x) 25 | out = paste(name, substr(x, 2, num_chars), sep = "XX") 26 | # Remove spaces in variable names. 27 | out = gsub(" ", "", out) 28 | return(out) 29 | } 30 | 31 | ############ 32 | # Missing Basis for Factors 33 | 34 | factor_names = colnames(factor_df) 35 | miss.fac = NULL 36 | names_miss = NULL 37 | 38 | # Calculate the number of missing values for each column. 39 | sum_nas = apply(factor_df, 2, sum_na) 40 | 41 | # Loop over each column in our new indicator dataframe. 42 | # TODO: use which() to only loop over indices with missing data. 43 | for (col_k in 1:ncol(factor_df)) { 44 | if (sum_nas[col_k] > 0) { 45 | # if (verbose) cat("Missing data in", factor_names[col_k], "\n") 46 | # Again, we are flagging non-missing as 1 and missing as 0 here. 47 | indicator_column = as.numeric(is.na(factor_df[, col_k]) == F) 48 | 49 | miss.fac = cbind(miss.fac, indicator_column) 50 | 51 | names_miss = c(names_miss, paste0(miss_name_prefix, factor_names[col_k])) 52 | } 53 | } 54 | colnames(miss.fac) = names_miss 55 | 56 | newX = NULL 57 | factor_names = NULL 58 | 59 | # TODO: remove this line? 60 | options(na.action = "na.pass") 61 | 62 | # Loop over each factor variable. 63 | for (i in 1:ncol(factor_df)) { 64 | # Note: we want to keep this variable name as x so that the names are short 65 | # from model.matrix. We rely on this variable name being only 1 character. 66 | x = factor_df[, i] 67 | # CK: looks like we are omitting the first level? 
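# model.matrix(~ x - 1) below expands the factor into one 0/1 column per level;
# dropping the first column afterwards yields standard dummy coding with the first
# level as the reference category. A small illustration:
#   x <- factor(c("a", "b", "c", "b"))
#   model.matrix(~ x - 1)[, -1]   # keeps columns xb and xc; level "a" is the reference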
68 | if (T || sum_nas[i] == 0) { 69 | omit_levels = -1 70 | } else { 71 | # if there is missing data, also omit the last level (NA) 72 | omit_levels = -1 * c(1, length(levels(x))) 73 | } 74 | 75 | # Convert to a series of indicators. 76 | indicators = model.matrix(~ x - 1)[, omit_levels] 77 | names = colnames(indicators) 78 | 79 | # Any remaining missing data is set to 0. 80 | remaining_nas = sum(sapply(indicators, function(col) sum(is.na(col)))) 81 | if (remaining_nas > 0) { 82 | if (verbose) cat("Replacing", remaining_nas, "remaining nas with 0s.\n") 83 | indicators[is.na(indicators)] = 0 84 | } 85 | 86 | # CK: why do we need this option? 87 | if (is.null(names)) { 88 | names2 = facnames[i] 89 | } else { 90 | # Clean up the names for each indicator. 91 | names2 = nam.fac(names, facnames[i]) 92 | } 93 | 94 | # Accumulate the names. 95 | factor_names = c(factor_names, names2) 96 | # Append the next set of indicators to the resulting matrix. 97 | newX = cbind(newX, indicators) 98 | } 99 | 100 | colnames(newX) = factor_names 101 | 102 | ################## 103 | # Indexing vector for dummy basis back to original factors. 104 | cc = regexpr("XX", factor_names) 105 | ncc = nchar(factor_names) 106 | cc[cc < 0] = ncc[cc < 0] + 1 107 | 108 | # NOTE: we don't seem to actually use this in varImpact() at the moment. 109 | factor_index = substr(factor_names, 1, cc - 1) 110 | 111 | # Create a list to hold multi-variable results. 112 | results = list(data = newX, factor_index = factor_index, missing_indicators = miss.fac) 113 | 114 | results 115 | } 116 | -------------------------------------------------------------------------------- /R/globals.R: -------------------------------------------------------------------------------- 1 | # Global variable declarations to avoid R CMD check NOTEs 2 | # These variables are used in dplyr operations and ggplot2 3 | 4 | # Variables used in dplyr operations 5 | utils::globalVariables(c( 6 | "name", "level", "level_label", "test_msg", "train_msg", 7 | "cv_fold", "train_cell_size", "test_cell_size", 8 | "rawp", "BH", "AvePsi", "Consistent", 9 | "test_theta_tmle", "color" 10 | )) 11 | 12 | # Function used in dplyr operations 13 | utils::globalVariables("desc") -------------------------------------------------------------------------------- /R/length-unique.R: -------------------------------------------------------------------------------- 1 | # Function that counts # of unique values. 2 | # Do not export. 3 | length_unique = function(x) { 4 | # Skip NAs. 5 | length(setdiff(unique(x), NA)) 6 | } 7 | -------------------------------------------------------------------------------- /R/max_sqr.R: -------------------------------------------------------------------------------- 1 | #' Find the maximum of the squared values. 2 | #' 3 | #' @param x Numeric vector 4 | #' 5 | #' @return Maximum of the squared values, or -Inf if all elements are NA. 6 | #' @export 7 | max_sqr = function(x) { 8 | # Handle missing data manually so we don't get warnings when it occurs. 9 | x = na.omit(x) 10 | if (length(x) == 0) { 11 | -Inf 12 | } else { 13 | max(x^2) 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /R/plot-var.R: -------------------------------------------------------------------------------- 1 | #' Plot the varimpact results for a given variable 2 | #' 3 | #' Displays the adjusted treatment-specific means and the impact estimate. 
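#'
#' A minimal usage sketch is included below; the outcome, covariate data frame, and
#' variable name in it are illustrative.
#'
#' @examples
#' \dontrun{
#' # Illustrative names; see ?varimpact for the full interface.
#' vim <- varimpact(Y = y, data = covariates)
#' plot_var("age", vim)
#' }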
4 | #' @param var_name String name of the variable 5 | #' @param vim Varimpact result object that contains the variable 6 | #' @param digits Number of digits for rounding purposes. 7 | #' @param verbose If true, display extra output. 8 | # TODO: @example 9 | #' @import ggplot2 10 | #' @export 11 | plot_var = 12 | function(var_name, vim, digits = 2L, verbose = FALSE) { 13 | 14 | # Confirm that we can plot this variable. 15 | if (is.null(vim$numeric_vims$results_by_level) && 16 | is.null(vim$factor_vims$results_by_level)) { 17 | stop(paste("No results_by_level found in vim object.", 18 | "Please post an issue to github if this is persistent.")) 19 | } 20 | 21 | # Can only plot numerics currently, need to expand to factors. 22 | numeric_vars = unique(vim$numeric_vims$results_by_level$name) 23 | factor_vars = unique(vim$factor_vims$results_by_level$name) 24 | if (var_name %in% numeric_vars) { 25 | results = vim$numeric_vims$results_by_level 26 | } else if (var_name %in% factor_vars) { 27 | results = vim$factor_vims$results_by_level 28 | } else { 29 | stop("There is no variable called", var_name, ".\n") 30 | } 31 | 32 | 33 | # Create plot dataframe. 34 | plot_data = results[results$name == var_name, 35 | c("level", "level_label", "test_theta_tmle", 36 | "test_var_tmle")] 37 | 38 | # Create color column based on min, max, and other. 39 | plot_data$color = "Other" 40 | plot_data$color[which.min(plot_data$test_theta_tmle)] = "Low risk" 41 | plot_data$color[which.max(plot_data$test_theta_tmle)] = "High risk" 42 | 43 | # Add "Impact" row to dataframe. 44 | result_row = rownames(vim$results_all) == var_name 45 | uniq_vals = nrow(plot_data) 46 | plot_data = rbind(plot_data, 47 | list(level = NA, 48 | level_label = "Impact", 49 | test_theta_tmle = vim$results_all$Estimate[result_row], 50 | # Don't have this yet. 51 | test_var_tmle = NA, 52 | color = "Impact" 53 | )) 54 | 55 | # red, blue, orange, gray 56 | # impact, max, min, other 57 | plot_palette = c("#d09996", # red 58 | "#95b4df", # blue 59 | "#f2c197", # orange 60 | "#d9d9d9") # gray 61 | 62 | # New: 63 | # high risk, impact, low risk, "other" 64 | plot_palette = plot_palette[c(1, 3, 2, 4)] 65 | 66 | # Plot TSMs from $numeric_vims$results_by_level and the varimpact estimate. 67 | p = ggplot(data = plot_data, 68 | aes(x = factor(level_label), 69 | y = test_theta_tmle, 70 | #label = round(test_theta_tmle, 1L), 71 | fill = factor(color))) + 72 | geom_col(width = 0.4) + 73 | geom_label(aes(label = round(test_theta_tmle, digits)), 74 | size = 6, 75 | fill = "white", 76 | color = "gray10", 77 | # label.color = "gray50", 78 | hjust = -0.2) + 79 | # Make the bar char horizontal rather than vertical 80 | coord_flip() + 81 | theme_minimal() + 82 | theme(axis.title.x = element_blank(), 83 | axis.title.y = element_blank(), 84 | axis.text = element_text(size = 18), 85 | plot.title = element_text(size = 20), 86 | panel.background = element_rect(fill = "white", color = "gray50"), 87 | #plot.background = element_rect(fill = "gray95"), 88 | plot.background = element_rect(fill = "#f2f2f2"), 89 | panel.grid.major = element_blank(), 90 | panel.grid.minor = element_blank(), 91 | legend.title = element_blank(), 92 | #legend.position = "none") + 93 | NULL) + 94 | theme(legend.position = "bottom", 95 | plot.title = element_text(size = 15), 96 | axis.title.y = element_text(size = 14, angle = 270, 97 | # This should center the var label vertically. 
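# For example, with uniq_vals = 4 levels this evaluates to (4 / 5) / 2 = 0.4,
# just under the 0.5 that would center the title along the axis.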
98 | hjust = (uniq_vals / (uniq_vals + 1)) / 2, 99 | color = "gray30"), 100 | axis.title.x = element_text(size = 12, color = "gray40"), 101 | #margin = margin(t = 1)), 102 | legend.spacing.x = unit(0.2, 'cm'), 103 | plot.margin = unit(c(0.5, 1, 0, 0.5), "lines"), 104 | axis.text.x = element_text(size = 14, color = "gray40")) + 105 | geom_hline(yintercept = 0, color = "gray90") + 106 | scale_x_discrete(limits = c("Impact", rev(setdiff(unique(plot_data$level_label), "Impact")))) + 107 | scale_y_continuous(expand = c(0, 0.15)) + 108 | scale_fill_manual(values = plot_palette) + 109 | guides(fill = guide_legend(label.position = "bottom")) + 110 | labs(title = paste("Impact of", var_name), 111 | y = "Adjusted outcome mean", 112 | x = var_name) 113 | 114 | p 115 | } 116 | -------------------------------------------------------------------------------- /R/print.varimpact.R: -------------------------------------------------------------------------------- 1 | #' Custom printing of the varimpact results. 2 | #' 3 | #' Shows the significant and consistent results by default. If there are no 4 | #' consistent results it shows all results. 5 | #' 6 | #' @param x Results object from varimpact. 7 | #' @param ... Further arguments passed to or from other methods. 8 | #' 9 | #' @export 10 | print.varimpact = function(x, ...) { 11 | # Just print the significant and consistent results. 12 | if (!is.null(x$results_consistent) && nrow(x$results_consistent) > 0) { 13 | cat("Significant and consistent results:\n") 14 | print(x$results_consistent) 15 | } else if (!is.null(x$results_all)) { 16 | cat("No significant and consistent results.\n") 17 | cat("All results:\n") 18 | print(x$results_all) 19 | } else { 20 | cat("No results could be calculated.\n") 21 | } 22 | invisible(x) 23 | } 24 | -------------------------------------------------------------------------------- /R/process-factors.R: -------------------------------------------------------------------------------- 1 | process_factors = function(data.fac, 2 | quantile_probs_factor, 3 | miss.cut, 4 | verbose = FALSE) { 5 | 6 | # Set some default return values. 7 | # TODO: improve names for these objects. 8 | num_factors = 0L 9 | miss.fac = NULL 10 | datafac.dum = NULL 11 | 12 | ##################### 13 | if (ncol(data.fac) > 0L) { 14 | 15 | if (verbose) cat("Processing factors. Start count:", ncol(data.fac), "\n") 16 | 17 | ###################################### 18 | # Replace blank factor values with NA's. 19 | 20 | # We re-use this num_cols variable in the next section. 21 | num_cols = ncol(data.fac) 22 | for (i in 1:num_cols) { 23 | new_factor = as.character(data.fac[, i]) 24 | # The exclude argument replaces any empty strings with NAs. 25 | new_factor = factor(new_factor, exclude = "") 26 | data.fac[, i] = new_factor 27 | } 28 | 29 | ################### 30 | # For each factor, apply function and get rid of those where 31 | # 'true' data.fac is data frame of variables that are factors 32 | if (!is.null(quantile_probs_factor)) { 33 | data.fac = restrict_by_quantiles(data.fac, quantile_probs = quantile_probs_factor, 34 | verbose = verbose) 35 | } 36 | 37 | dropped_cols = num_cols - ncol(data.fac) 38 | 39 | if (verbose) { 40 | if (dropped_cols > 0) { 41 | cat("Dropped", dropped_cols, "factors due to lack of variation.\n") 42 | } else { 43 | cat("No factors dropped due to lack of variation.\n") 44 | } 45 | } 46 | 47 | # We don't seem to use this yet. 
48 | # num.cat = sapply(data.fac, length_unique) 49 | 50 | ###################### 51 | # Remove columns with missing data % greater than the threshold. 52 | sum_nas = sapply(data.fac, sum_na) 53 | 54 | if (length(sum_nas) == 0L) { 55 | if (verbose) { 56 | cat("All factors were dropped.\n") 57 | } 58 | data.fac = NULL 59 | } else { 60 | 61 | if (verbose) cat("Factors with missingness:", sum(sum_nas > 0L), "\n") 62 | 63 | miss_pct = sum_nas / nrow(data.fac) 64 | 65 | data.fac = data.fac[, miss_pct < miss.cut, drop = FALSE] 66 | 67 | if (verbose) { 68 | cat("Dropped", sum(miss_pct >= miss.cut), "factors due to the missingness threshold.\n") 69 | } 70 | 71 | # Save how many separate factors we have in this dataframe. 72 | num_factors = ncol(data.fac) 73 | 74 | factor_results = factors_to_indicators(data.fac, verbose = verbose) 75 | 76 | datafac.dum = factor_results$data 77 | # Here 1 = defined, 0 = missing. 78 | miss.fac = factor_results$missing_indicators 79 | 80 | if (verbose) { 81 | cat("End factor count:", num_factors, "Indicators:", ncol(datafac.dum), 82 | "Missing indicators:", ncol(miss.fac), "\n") 83 | } 84 | } 85 | } else { 86 | data.fac = NULL 87 | } 88 | 89 | (results = 90 | list( 91 | num_factors = num_factors, 92 | miss.fac = miss.fac, 93 | datafac.dum = datafac.dum, 94 | data.fac = data.fac 95 | )) 96 | 97 | } 98 | -------------------------------------------------------------------------------- /R/process-numerics.R: -------------------------------------------------------------------------------- 1 | process_numerics = 2 | function(data.num, 3 | quantile_probs_numeric, 4 | miss.cut, 5 | bins_numeric, 6 | impute, 7 | verbose = FALSE) { 8 | 9 | n = nrow(data.num) 10 | 11 | if (ncol(data.num) > 0) { 12 | num_cols = ncol(data.num) 13 | if (verbose) cat("Processing numerics. Start count:", num_cols, "\n") 14 | 15 | # Remove columns where the 0.1 and 0.9 quantiles have the same value, i.e. insufficent variation. 16 | # TODO: set this is a configurable setting? 17 | if (!is.null(quantile_probs_numeric)) { 18 | data.num = restrict_by_quantiles(data.num, quantile_probs = quantile_probs_numeric) 19 | } 20 | 21 | if (verbose) { 22 | num_dropped = num_cols - ncol(data.num) 23 | if (num_dropped > 0) { 24 | cat("Dropped", num_dropped, "numerics due to lack of variation.\n") 25 | } else { 26 | cat("No numerics dropped due to lack of variation.\n") 27 | } 28 | } 29 | 30 | # Save how many numeric variables we have in this dataframe. 31 | num_numeric = ncol(data.num) 32 | } else { 33 | num_numeric = 0L 34 | } 35 | 36 | if (num_numeric > 0) { 37 | if (verbose) cat("Cleaning up", num_numeric, "numeric variables.\n") 38 | # Make deciles for continuous variables 39 | X = data.num 40 | xc = dim(X)[2] 41 | # We don't seem to use this qt variable. 42 | qt = apply(na.omit(X), 2, quantile, probs = seq(0.1, 0.9, 0.1)) 43 | newX = NULL 44 | coln = NULL 45 | varn = colnames(X) 46 | 47 | num.cat = apply(X, 2, length_unique) 48 | 49 | # Matrix to store the numeric columns converted to bin levels (integer value per quantile). 50 | numerics_binned = matrix(nrow = n, ncol = num_numeric) 51 | 52 | # Make a list to store the levels for each numeric variable. 53 | numeric_levels = vector("list", num_numeric) 54 | 55 | for (numeric_i in 1:num_numeric) { 56 | # Because we do not specify "drop" within the brackets, Xt is now a vector. 
57 | Xt = X[, numeric_i] 58 | 59 | name = colnames(data.num)[numeric_i] 60 | 61 | if (verbose) { 62 | cat("Processing", name, numeric_i, "of", num_numeric, "\n") 63 | } 64 | 65 | # Suppress the warning that can occur when there are fewer than the desired 66 | # maximum number of bins, as specified by bins_numeric. We should be able to 67 | # see this as var_binned containing fewer than bins_numeric columns. 68 | # Warning is in .cut2(): min(xx[xx > upper]) 69 | # "no non-missing arguments to min; returning Inf" 70 | num_unique_vals = length(setdiff(unique(Xt), NA)) 71 | 72 | num_breaks = min(bins_numeric, num_unique_vals) 73 | 74 | arules_method = "frequency" 75 | 76 | # No need to apply tiling, we already have a limited # of unique vals. 77 | # TODO: return info on this, and possible a notice message. 78 | if (num_unique_vals <= bins_numeric) { 79 | arules_method = "interval" 80 | } 81 | 82 | suppressWarnings({ 83 | # Discretize into up to 10 quantiles (by default), configurable based on 84 | # bins_numeric argument. 85 | # This returns a factor version of the discretized variable. 86 | tryCatch({ var_binned_names = arules::discretize(Xt, 87 | method = arules_method, 88 | breaks = num_breaks, 89 | ordered = TRUE) 90 | }, error = function(error) { 91 | # This can happen with skewed distributions where multiple breaks are not unique. 92 | print(error) 93 | cat("Error: could not discretize numeric", numeric_i, "", name, "\n") 94 | cat("Unique values:", length(unique(Xt)), "\n") 95 | cat("Switching to cluster-based discretization.\n") 96 | tryCatch({ 97 | var_binned_names = arules::discretize(Xt, method = "cluster", 98 | breaks = num_breaks, 99 | ordered = TRUE)}, 100 | error = function(error2) { 101 | # TODO: use another package/function to discretize. 102 | print(error2) 103 | cat("Cluster-based discretization failed - using all levels.") 104 | var_binned_names = factor(Xt) 105 | }) 106 | 107 | }) 108 | }) 109 | 110 | # Save the levels for future usage. 111 | numeric_levels[[numeric_i]] = levels(var_binned_names) 112 | # This converts the factor variable to just the quantile numbers. 113 | var_binned = as.numeric(var_binned_names) 114 | numerics_binned[, numeric_i] = var_binned 115 | 116 | if (verbose) { 117 | cat("New levels:", paste(levels(var_binned_names), collapse = ", "), "\n") 118 | } 119 | } 120 | colnames(numerics_binned) = varn 121 | data.cont.dist = numerics_binned 122 | 123 | ############### 124 | # Missing Basis for numeric variables, post-binning. 125 | 126 | n.cont = nrow(data.cont.dist) 127 | 128 | sum_nas = apply(data.cont.dist, 2, sum_na) 129 | nmesX = colnames(data.cont.dist) 130 | miss.cont = NULL 131 | nmesm = NULL 132 | 133 | # Create imputed version of the numeric dataframe. 134 | # This is used as the adjustment set, but not used when generating the treatment assignment vector. 135 | data.numW = data.num 136 | 137 | # Loop over each binned numeric variable. 138 | # TODO: do this as part of the binning process. 139 | for (k in 1:num_numeric) { 140 | # Check if that variable has any missing values. 141 | if (sum_nas[k] > 0) { 142 | # The effect is that the basis is set to 1 if it exists and 0 if it's missing. 143 | ix = as.numeric(!is.na(data.cont.dist[, k])) 144 | miss.cont = cbind(miss.cont, ix) 145 | # TODO: convert to paste0 146 | nmesm = c(nmesm, paste("Imiss_", nmesX[k], sep = "")) 147 | } 148 | } 149 | # if(is.null(miss.cont)){miss.cont= rep(1,n.cont)} 150 | colnames(miss.cont) = nmesm 151 | 152 | # Impute missing data in numeric columns. 
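# Summary of the options below: "zero" simply overwrites NAs with 0; "median" and "knn"
# use caret::preProcess() plus predict() to fill in values, and caret's knnImpute also
# centers and scales the covariates as a side effect.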
153 | if (impute == "zero") { 154 | data.numW[is.na(data.num)] = 0 155 | impute_info = 0 156 | } else if (impute == "median") { 157 | impute_info = caret::preProcess(data.num, method = "medianImpute") 158 | data.numW = predict(impute_info, data.num) 159 | } else if (impute == "mean") { 160 | stop("Mean imputation not implemented yet. Please use another imputation method.") 161 | } else if (impute == "knn") { 162 | # NOTE: this also results in caret centering and scaling the data. 163 | impute_info = caret::preProcess(data.num, method = "knnImpute") 164 | data.numW = predict(impute_info, data.num) 165 | } 166 | 167 | # Confirm that there are no missing values remaining in data.numW 168 | stopifnot(sum(is.na(data.numW)) == 0) 169 | } else { 170 | data.cont.dist = NULL 171 | miss.cont = NULL 172 | data.numW = NULL 173 | impute_info = NULL 174 | data.num = NULL 175 | } 176 | 177 | (results = 178 | list( 179 | data.cont.dist = data.cont.dist, 180 | num_numeric = num_numeric, 181 | miss.cont = miss.cont, 182 | data.num = data.num, 183 | data.numW = data.numW, 184 | impute_info = impute_info 185 | )) 186 | } 187 | -------------------------------------------------------------------------------- /R/quantiles_equivalent.R: -------------------------------------------------------------------------------- 1 | #' Checks if two quantiles have the same sample value for a given vector. 2 | #' This indicates that the vector has little variation. 3 | #' 4 | #' @param x Data vector. If a factor it is converted to numeric using unclass(). 5 | #' @param quantile_probs Vector with two probabilities that specify the quantiles. 6 | #' 7 | #' @return True if the two quantiles are equal, indicating a lack of variation in the sample data. 8 | #' 9 | #' @seealso restrict_by_quantiles 10 | #' 11 | #' @export 12 | quantiles_equivalent = function(x, quantile_probs = c(0.1, 0.9)) { 13 | if (length(quantile_probs) != 2) { 14 | warning("Quantiles_equivalent() expects quantile_probs to be a 2-element vector.") 15 | } 16 | if (is.factor(x)) { 17 | x = unclass(x) 18 | } 19 | quantiles = quantile(x, probs = quantile_probs, na.rm = T) 20 | # Returns True if there is no differences between the first and second quantiles. 21 | (quantiles[2] - quantiles[1]) == 0 22 | } 23 | -------------------------------------------------------------------------------- /R/reduce_dimensions.R: -------------------------------------------------------------------------------- 1 | #' Reduce variables in a dataframe to a target number of covariates. 2 | #' 3 | #' Currently uses HOPACH hierarchical clustering but could be generalized. 4 | #' 5 | #' @param data Dataframe 6 | #' @param newX Optional second dataframe to receive the same reduction. 7 | #' @param max_variables Maximum we want to allow, after which dimension reduction 8 | #' will take place. Cannot be more than 15 due to HOPACH limitations. Set to NULL 9 | #' to disable any dimension reduction. 10 | #' @param verbose If true will output additional information during execution. 11 | #' 12 | #' @importFrom hopach hopach distancematrix 13 | #' 14 | #' @export 15 | reduce_dimensions = function(data, newX = NULL, max_variables, verbose = FALSE) { 16 | 17 | # Identify constant columns in training data. 18 | is_constant = sapply(data, function(col) var(col, na.rm = TRUE) == 0) 19 | 20 | if (sum(is_constant) > 0) { 21 | if (verbose) cat("First removing", sum(is_constant), "constant columns.\n") 22 | 23 | # Remove constant columns. 
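# Constant (zero-variance) columns carry no information for clustering or adjustment and,
# presumably, can make the cosine-angle distance used by HOPACH below undefined, so they
# are removed from both the training and validation data first.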
24 | data = data[, !is_constant, drop = FALSE] 25 | 26 | # Remove those same constant columns from the test data, if it was provided. 27 | if (!is.null(newX)) { 28 | # Here we have to operate by names in case the validation data has different columns. 29 | newX = newX[, !names(newX) %in% names(is_constant[is_constant]), drop = FALSE] 30 | } 31 | } 32 | 33 | # Set this by default, then override it if we do reduce dimensions. 34 | variables = colnames(data) 35 | 36 | num_columns = ncol(data) 37 | 38 | # Skip if the number of covariates is within the target maximum or if the maximum is NULL; check for NULL first so the comparison is never evaluated against a NULL cutoff. 39 | if (is.null(max_variables) || num_columns <= max_variables) { 40 | Wtsht = data 41 | Wvsht = newX 42 | 43 | # We still need to restrict validation W to contain only columns that 44 | # were in the training data. I.e. remove any extra missingness indicators. 45 | # TODO: should we do this at the very beginning? 46 | Wvsht = Wvsht[, colnames(Wvsht) %in% colnames(Wtsht), drop = FALSE] 47 | 48 | } else { 49 | if (verbose) cat("Reducing dimensions via clustering.\n") 50 | 51 | #mydist = as.matrix(hopach::distancematrix(t(Wt), d = "cosangle", na.rm = T)) 52 | 53 | # Compute pairwise distances between each variable in the dataframe. 54 | # We transpose Wt because we want to cluster columns rather than rows. 55 | mydist = try(hopach::distancematrix(t(data), d = "cosangle", na.rm = T), 56 | silent = !verbose) 57 | if (inherits(mydist, "try-error")) { 58 | cat("Error in HOPACH clustering: failed to calculate distance matrix.\n") 59 | } 60 | 61 | # Attempt #1. 62 | # We transpose Wt to cluster the columns rather than rows. 63 | # K = number of variables to choose. 64 | # kmax = maximum number of children at each node in the tree. 65 | # khigh = max # of children at each node when computing mss, usually the same. 66 | suppressWarnings({ # Suppress warnings about newmed = "medsil" in collap(). 67 | hopach.1 = try(hopach::hopach(t(data), dmat = mydist, mss = "mean", verbose = verbose, 68 | K = max_variables, kmax = 3, khigh = 3), 69 | silent = !verbose) 70 | }) 71 | if (inherits(hopach.1, "try-error")) { 72 | if (verbose) { 73 | cat("Hopach attempt 1 failed.\n") 74 | print(hopach.1) 75 | } 76 | 77 | # Attempt #2. 78 | # We transpose Wt to cluster the columns rather than rows. 79 | suppressWarnings({ # Suppress warnings about newmed = "medsil" in collap(). 80 | hopach.1 <- try(hopach::hopach(t(data), dmat = mydist, mss = "med", 81 | verbose = verbose, 82 | K = max_variables, kmax = 3, khigh = 3), 83 | silent = !verbose) 84 | }) 85 | } 86 | if (inherits(hopach.1, "try-error")) { 87 | if (verbose) { 88 | cat("Attempt 2 failed.\n") 89 | print(hopach.1) 90 | } 91 | 92 | # Attempt #3. Last try! 93 | # We transpose Wt to cluster the columns rather than rows. 94 | suppressWarnings({ # Suppress warnings about newmed = "medsil" in collap(). 95 | hopach.1 <- try(hopach::hopach(t(data), dmat = mydist, mss = "med", 96 | verbose = F, 97 | K = max_variables, kmax = 3, khigh = 3, 98 | newmed="nn"), 99 | silent = !verbose) 100 | }) 101 | } 102 | if (inherits(hopach.1, "try-error")) { 103 | if (verbose) { 104 | cat("Attempt 3 failed. Reverting to original W dataframe.\n") 105 | # Now try to debug this. 106 | # stop() 107 | } 108 | #warning("Dimensionality reduction failed. i=", i, "V=", kk, "A=", nameA) 109 | Wtsht = data 110 | Wvsht = newX 111 | } else { 112 | # TODO: annotate what is going on here with the HOPACH result object.
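# Roughly what the block below appears to do with the HOPACH result: each digit of
# final$labels encodes cluster membership at one level of the tree, so truncating the
# labels by powers of 10 groups the variables into the clusters at each level. The code
# then finds the shallowest level with at least max_variables clusters and keeps the
# medoid variable (from final$medoids) of each cluster at that level as the reduced
# covariate set.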
113 | nlvls = nchar(max(hopach.1$final$labels)) 114 | no = trunc(mean(log10(hopach.1$final$labels))) 115 | 116 | # Find highest level of tree where minimum number of covariates is >= adjust_cutoff. 117 | lvl = 1:nlvls 118 | ncv = NULL 119 | for (ii in lvl) { 120 | ncv = c(ncv, length(unique(trunc(hopach.1$final$labels/10^(no - (ii - 1)))))) 121 | } 122 | ncv = unique(ncv) 123 | # Suppress possible "no non-missing arguments to min; returning Inf" 124 | # warning from min(). 125 | # TODO: investigate more and confirm that this is ok. 126 | suppressWarnings({ 127 | lev = min(min(nlvls, dim(data)[2]), min(lvl[ncv >= max_variables])) 128 | }) 129 | two.clust <- unique(trunc(hopach.1$final$labels/(10^(no - (lev - 1))))) 130 | md <- hopach.1$final$medoids 131 | mm = md[, 1] %in% two.clust 132 | incc = md[mm, 2] 133 | 134 | # Restrict to those variables in the training and validation data. 135 | Wtsht = data[, incc, drop = F] 136 | Wvsht = newX[, incc, drop = F] 137 | 138 | # Save the chosen variables so that we can return them in the result list. 139 | variables = colnames(data)[incc] 140 | } 141 | } 142 | 143 | # If training data contains any columsn that don't exist in the validation data 144 | # create them and assign a value of 0. 145 | 146 | missing_cols = setdiff(colnames(Wtsht), colnames(Wvsht)) 147 | if (length(missing_cols) > 0) { 148 | if (verbose) cat(paste("Adding missing columns in prediction data:", 149 | paste(missing_cols, collapse = ", "))) 150 | 151 | new_cols = matrix(0, nrow = nrow(Wvsht), ncol = length(missing_cols)) 152 | colnames(new_cols) = missing_cols 153 | Wvsht = cbind(Wvsht, new_cols) 154 | 155 | # Sort columns in the correct order so that matrix multiplication is correct. 156 | Wvsht = Wvsht[, colnames(Wtsht), drop = FALSE] 157 | } 158 | 159 | if (verbose) cat(" Updated columns:", ncol(Wtsht), "training", 160 | ncol(Wvsht), "validation.\n") 161 | 162 | results = list(data = Wtsht, newX = Wvsht, variables = variables) 163 | results 164 | } 165 | -------------------------------------------------------------------------------- /R/restrict_by_quantiles.R: -------------------------------------------------------------------------------- 1 | #' Remove columns from a dataframe if they do not have sufficient variation. 2 | #' 3 | #' @param data Dataframe or matrix 4 | #' @param quantile_probs The probabilities corresponding to the quantiles that will be compared. 5 | #' @param verbose If TRUE output additional details during execution. 6 | #' 7 | #' @return New dataframe with the restriction applied. 8 | #' 9 | #' @seealso quantiles_equivalent 10 | #' 11 | #' @export 12 | restrict_by_quantiles = 13 | function(data, 14 | quantile_probs = c(0.1, 0.9), 15 | verbose = FALSE) { 16 | 17 | # Drop column if the two quantiles have the same sample value (i.e. difference = 0). 18 | # True = remove, False = keep 19 | # TODO: support parallelization, e.g. for very wide datasets. 20 | drop_cols = sapply(1:ncol(data), 21 | function(i) quantiles_equivalent(data[, i], quantile_probs)) 22 | 23 | if (verbose) { 24 | if (sum(drop_cols) > 0) { 25 | cat("restrict_by_quantiles(): dropping", paste(names(data)[drop_cols], collapse = ", "), "\n") 26 | } 27 | } 28 | 29 | # Restrict to variables with sufficient variation. 
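# For example, a 0/1 indicator that is zero for more than 90% of observations has
# identical 0.1 and 0.9 quantiles under the default quantile_probs = c(0.1, 0.9),
# so it would be dropped here.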
30 | data = data[, !drop_cols, drop = FALSE] 31 | 32 | return(data) 33 | } 34 | -------------------------------------------------------------------------------- /R/results-by-level.R: -------------------------------------------------------------------------------- 1 | #' Aggregate the results_by_fold_and_level df into a results_by_level df. 2 | #' 3 | #' @param results_by_fold_and_level Dataframe containing the VIM results for 4 | #' all levels of each variable across all CV folds. 5 | #' @param verbose If true, display extra output. 6 | #' @importFrom magrittr %>% 7 | #' @importFrom dplyr group_by summarize_all funs select mutate first 8 | #' @importFrom modeest mlv 9 | results_by_level = 10 | function(results_by_fold_and_level, 11 | verbose = FALSE) { 12 | tryCatch({ 13 | results = results_by_fold_and_level %>% 14 | # NOTE: because we group by level_label, we are assuming that any histogram 15 | # penalization happened outside of the CV to ensure that the levels are 16 | # the same across training folds. 17 | group_by(name, level) %>% 18 | # TEMP: restrict to the most common label. 19 | #mutate(level_label = as.character(mlv(as.factor(level_label), method = "mfv")$M)) %>% 20 | # TEMP: restrict to the first label 21 | mutate(level_label = first(level_label)) %>% 22 | # Now we can also group by level_label because they will be the same for a given level. 23 | group_by(name, level, level_label) %>% 24 | # Remove test_msg for now. 25 | # TODO: take mode of test_msg or first value, rather than mean. 26 | select(-c(test_msg, train_msg)) %>% 27 | # this generates a warning in mean() because test_msg is a character not a numeric. 28 | summarize_all(dplyr::funs(mean)) %>% 29 | select(-c(cv_fold, train_cell_size, test_cell_size)) 30 | 31 | # Don't keep this as a tibble. 32 | as.data.frame(results) 33 | }, error = function(error) { 34 | cat("Failed in results_by_level()\n") 35 | print(error) 36 | NULL 37 | }) 38 | } 39 | -------------------------------------------------------------------------------- /R/separate_factors_numerics.R: -------------------------------------------------------------------------------- 1 | #' Split df into one of only factors and one of only numerics. 2 | #' 3 | #' @param data A dataframe or matrix. 4 | #' @param strings_to_factors Convert character strings to factors if True, 5 | #' otherwise ignore character string variables. 6 | #' @param verbose If TRUE output additional details during execution. 7 | #' 8 | #' @return Results list. 9 | #' 10 | #' Convert strings to factors as well. 11 | #' 12 | #' @export 13 | separate_factors_numerics = 14 | function(data, strings_to_factors = TRUE, 15 | verbose = FALSE) { 16 | 17 | if (strings_to_factors) { 18 | # Identify character strings. 19 | is_char = sapply(data, is.character) 20 | 21 | # Convert strings to factors. 22 | data[, is_char] = sapply(data[, is_char], as.factor) 23 | } 24 | 25 | # Identify factors. 26 | is_factor = sapply(data, is.factor) 27 | df_factors = data[, is_factor, drop = FALSE] 28 | 29 | # Identify numerics. 
30 | is_numeric = sapply(data, is.numeric) 31 | df_numerics = data[, is_numeric, drop = FALSE] 32 | 33 | result = list(df_factors = df_factors, df_numerics = df_numerics) 34 | return(result) 35 | } 36 | -------------------------------------------------------------------------------- /R/sum_na.R: -------------------------------------------------------------------------------- 1 | #' Get missingness for each column 2 | #' 3 | #' Function for getting total number missing values for vector 4 | #' 5 | #' @param x Vector, matrix, or dataframe 6 | sum_na = function(x) { 7 | sum(is.na(x)) 8 | } 9 | -------------------------------------------------------------------------------- /R/tmle_bound.R: -------------------------------------------------------------------------------- 1 | # CK: copied from tmle package. 2 | #---------- function .bound --------------- 3 | # set outliers to min/max allowable values 4 | # assumes x contains only numerical data 5 | #----------------------------------------- 6 | #' @export 7 | .bound <- function(x, bounds) { 8 | x[x>max(bounds)] <- max(bounds) 9 | x[x 1 64 | type <- "user-supplied values" 65 | if(is.null(Q)){ 66 | if(verbose) { cat("\tEstimating initial regression of Y on A and W\n")} 67 | Q <- matrix(NA, nrow=length(Y), ncol = 5) 68 | colnames(Q)<- c("QAW", "Q0W", "Q1W", "Q0W.Z1", "Q1W.Z1") 69 | if(!(is.null(Qform))){ 70 | if(identical(as.character(as.formula(Qform)), c("~","Y", "."))){ 71 | if(CDE){ 72 | Qform <- paste("Y~Z+A+", paste(colnames(W), collapse="+")) 73 | } else { 74 | Qform <- paste("Y~A+", paste(colnames(W), collapse="+")) 75 | } 76 | } 77 | m <- suppressWarnings(glm(Qform, data=data.frame(Y,Z,A,W, Delta), family=family, subset=Delta==1)) 78 | Q[,"QAW"] <- predict(m, newdata=data.frame(Y,Z,A,W), type="response") 79 | Q[,"Q0W"] <- predict(m, newdata=data.frame(Y,Z=0,A=0,W), type="response") 80 | Q[,"Q1W"] <- predict(m, newdata=data.frame(Y,Z=0,A=1,W), type="response") 81 | Q[,"Q0W.Z1"] <- predict(m, newdata=data.frame(Y,Z=1,A=0,W), type="response") 82 | Q[,"Q1W.Z1"] <- predict(m, newdata=data.frame(Y,Z=1,A=1,W), type="response") 83 | coef <- coef(m) 84 | type="glm, user-supplied model" 85 | } else { 86 | if(cvQinit){ 87 | stop("cvQinit = T not supported sadly.") 88 | #m <- try(tmle::.estQcvSL(Y,X=cbind(Z,A,W),SL.library, family=family, 89 | # Delta=Delta, Qbounds=Qbounds,id=id, verbose=verbose)) 90 | #if(!(identical(class(m), "try-error"))){ 91 | # type <- "cross-validated SL" 92 | # Qinit <- m 93 | # Q <- Qinit$Q 94 | #} 95 | } else { 96 | if(verbose) {cat("\t using SuperLearner\n")} 97 | n <- length(Y) 98 | X <- data.frame(Z,A,W) 99 | X00 <- data.frame(Z=0,A=0, W) 100 | X01 <- data.frame(Z=0,A=1, W) 101 | newX <- rbind(X, X00, X01) 102 | if(CDE) { 103 | X10 <- data.frame(Z=1,A=0, W) 104 | X11 <- data.frame(Z=1,A=1, W) 105 | newX <- rbind(newX, X10, X11) 106 | } 107 | if(packageDescription("SuperLearner")$Version < SL.version){ 108 | arglist <- list(Y=Y[Delta==1],X=X[Delta==1,], newX=newX, SL.library=SL.library, 109 | V=V, family=family, save.fit.library=T, id=id[Delta==1]) 110 | } else { 111 | arglist <- list(Y=Y[Delta==1],X=X[Delta==1,], newX=newX, SL.library=SL.library, 112 | cvControl=list(V=V), family=family, control = list(saveFitLibrary=T), id=id[Delta==1]) 113 | } 114 | suppressWarnings({ 115 | # CK: try to eliminate messages from loading packages. 
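# capture.output() plus suppressPackageStartupMessages() below swallow the console noise
# that SuperLearner's learner wrappers can produce when they load their dependencies,
# so repeated fits of Q stay quiet.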
116 | out = utils::capture.output({ 117 | suppressPackageStartupMessages({ 118 | m <- try(do.call(SuperLearner::SuperLearner, arglist)) 119 | }) 120 | }) 121 | # Set call to null because do.call() messes up that element. 122 | m$call = NULL 123 | }) 124 | if (identical(class(m),"SuperLearner")){ 125 | #if (verbose) print(m) 126 | Q[,"QAW"] <- m$SL.predict[1:n] 127 | Q[,"Q0W"] <- m$SL.predict[(n+1):(2*n)] 128 | Q[,"Q1W"] <- m$SL.predict[(2*n+1):(3*n)] 129 | if(CDE){ 130 | Q[,"Q0W.Z1"] <- m$SL.predict[(3*n+1):(4*n)] 131 | Q[,"Q1W.Z1"] <- m$SL.predict[(4*n+1):(5*n)] 132 | } 133 | type <- "SuperLearner" 134 | } else { 135 | stop("Super Learner failed when estimating Q. Exiting program\n") 136 | } 137 | } } 138 | } 139 | if(is.na(Q[1,1]) | identical(class(m), "try-error")){ 140 | if(verbose) {cat("\t Running main terms regression for 'Q' using glm\n")} 141 | Qform <- paste("Y~Z+A+", paste(colnames(W), collapse="+")) 142 | m <- glm(Qform, data=data.frame(Y,Z,A,W, Delta), family=family, subset=Delta==1) 143 | Q[,"QAW"] <- predict(m, newdata=data.frame(Y,Z,A,W), type="response") 144 | Q[,"Q1W"] <- predict(m, newdata=data.frame(Y,Z=0,A=1,W), type="response") 145 | Q[,"Q0W"] <- predict(m, newdata=data.frame(Y,Z=0,A=0,W), type="response") 146 | Q[,"Q0W.Z1"] <- predict(m, newdata=data.frame(Y,Z=1,A=0,W), type="response") 147 | Q[,"Q1W.Z1"] <- predict(m, newdata=data.frame(Y,Z=1,A=1,W), type="response") 148 | 149 | coef <- coef(m) 150 | type="glm, main terms model" 151 | } 152 | Q <- varimpact::.bound(Q, Qbounds) 153 | if(maptoYstar | identical(Qfamily,"binomial") | identical(Qfamily, binomial)){ 154 | Q <- qlogis(Q) 155 | Qfamily <- "binomial" 156 | } else if (identical(Qfamily, "poisson") | identical(Qfamily, poisson)) { 157 | Q <- log(Q) 158 | Qfamily <- "poisson" 159 | } 160 | if(!CDE){ 161 | Q <- Q[,1:3] 162 | } 163 | if(cvQinit){ 164 | Qinit$Q <- Q 165 | } else { 166 | Qinit <- list(Q=Q, family=Qfamily, coef=coef, type=type, model = m) 167 | if(type=="SuperLearner"){ 168 | Qinit$SL.library=SL.library 169 | Qinit$coef=m$coef 170 | } 171 | } 172 | return(Qinit) 173 | } 174 | -------------------------------------------------------------------------------- /R/tmle_init_stage1.R: -------------------------------------------------------------------------------- 1 | # Copied from tmle package. 
2 | #---------- function .initStage1 --------------- 3 | # Bound Y, map to Ystar if applicable, and 4 | # set bounds on Q and enforce them on user-specified values 5 | # returns 6 | # Ystar - outcome values (between [0,1] if maptoYstar=TRUE) 7 | # Q - matrix of user-specified values 8 | # Qbounds - bounds on predicted values for Q (10% wider at each end than 9 | # observed range of Y) 10 | # (-Inf,+Inf) is default for linear regression 11 | # ab - bounding levels used to transform Y to Ystar 12 | #----------------------------------------------- 13 | tmle_init_stage1 <- function(Y,A, Q, Q.Z1=NULL, Delta, Qbounds, alpha, maptoYstar, family){ 14 | if(family=="binomial") {Qbounds <- c(0,1)} 15 | if(is.null(Qbounds)) { 16 | if(maptoYstar){ 17 | Qbounds <- range(Y[Delta==1]) 18 | Qbounds <- Qbounds + .1*c(-abs(Qbounds[1]),abs(Qbounds[2])) 19 | } else { 20 | Qbounds <- c(-Inf, Inf) 21 | } 22 | } 23 | if(!is.null(Q)){ 24 | QAW <- (1-A)*Q[,1] + A*Q[,2] 25 | Q <- cbind(QAW, Q0W=Q[,1], Q1W=Q[,2]) 26 | } 27 | if(!is.null(Q.Z1)){ 28 | Q <- cbind(Q, Q0W.Z1=Q.Z1[,1], Q1W.Z1=Q.Z1[,2]) 29 | } 30 | ab <- c(0,1) 31 | Ystar <- Y 32 | if(maptoYstar){ 33 | Ystar <- .bound(Y, Qbounds) 34 | if(!is.null(Q)){ 35 | Q <- .bound(Q, Qbounds) 36 | } 37 | if(0 >= alpha | 1 <= alpha){ 38 | alpha <- .995 39 | warning(paste("\n\talpha must be between 0 and 1, alpha reset to",alpha,"\n"), 40 | immediate. = TRUE) 41 | } 42 | ab <- range(Ystar, na.rm=TRUE) 43 | Ystar[is.na(Ystar)] <- 0 44 | Ystar <- (Ystar-ab[1])/diff(ab) 45 | if(!is.null(Q)){Q <- (Q-ab[1])/diff(ab)} 46 | Qbounds <- c(alpha, 1-alpha) 47 | } 48 | return(list(Ystar=Ystar, Q=Q, Qbounds=Qbounds, ab=ab)) 49 | } 50 | -------------------------------------------------------------------------------- /R/varimpact.R: -------------------------------------------------------------------------------- 1 | #' @title Variable importance estimation using causal inference (targeted learning) 2 | #' 3 | #' @description \code{varimpact} returns variable importance statistics ordered 4 | #' by statistical significance using a combination of data-adaptive target 5 | #' parameters and cross-validated targeted maximum likelihood estimation (CV-TMLE). 6 | #' 7 | #' @details 8 | #' The function performs the following steps. 9 | #' \enumerate{ 10 | #' \item Drops variables missing more than the miss.cut proportion of the time (tuneable). 11 | #' \item Separates covariates into factors and continuous (ordered) variables. 12 | #' \item Drops variables whose distribution is too uneven - e.g., nearly all one 13 | #' value (tuneable) - separately for factors and numeric variables (ADD MORE 14 | #' DETAIL HERE) 15 | #' \item Makes dummy variable basis for factors, including naming dummies 16 | #' to be traceable to original factor variables later. 17 | #' \item Makes a new ordered variable of integers mapped to intervals defined by 18 | #' deciles for the ordered numeric variables (automatically makes fewer 19 | #' categories if the original variable has < 10 values). 20 | #' \item Creates an associated list of the number of unique values, and the values themselves, 21 | #' for each variable, for use in the variable importance part. 22 | #' \item Makes missing covariate basis for both factors and ordered variables. 23 | #' \item For each variable, after assigning it as A, uses an optimal histogram 24 | #' function to combine values using the distribution of A | Y=1 to avoid very 25 | #' small cell sizes in the distribution of Y vs. A (tuneable) (ADD DETAIL) 26 | #' \item Uses HOPACH* to cluster the variables of the associated confounder/missingness 27 | #' basis W, reducing it to roughly the specified number of adjustment variables.
28 | #' \item Finds the min and max estimates of E(Ya) with respect to a, after looping through 29 | #' all values of A* (after processing by the histogram step). 30 | #' \item Returns the estimate of E(Ya(max)-Ya(min)) with its SE using CV-TMLE. 31 | #' } 32 | #' *HOPACH is "Hierarchical Ordered Partitioning and Collapsing Hybrid" 33 | #' 34 | #' @param Y Outcome of interest (numeric vector). 35 | #' @param data Data frame of predictor variables of interest for 36 | #' which the function returns VIMs (possibly a matrix?). 37 | #' @param A_names Names of the variables for which we want to estimate importance, 38 | #' a subset of the data argument. 39 | #' @param V Number of cross-validation folds. 40 | #' @param Q.library Library used by SuperLearner for the model of the outcome 41 | #' versus predictors. 42 | #' @param g.library Library used by SuperLearner for the model of the 43 | #' predictor variable of interest versus the other predictors. 44 | #' @param family Outcome family ('binomial' or 'gaussian'). 45 | #' @param minYs Minimum # of obs with the event - if there are fewer than minYs, skip the VIM. 46 | #' @param minCell Cut-off for including a category of A in the analysis: the 47 | #' minimum cell count in a 2x2 table of the indicator for that level 48 | #' versus the outcome, computed separately in the training and validation samples. 49 | #' @param adjust_cutoff Maximum number of adjustment variables during TMLE. If 50 | #' more than this cutoff varimpact will attempt to reduce the dimensions to 51 | #' that number (using HOPACH hierarchical clustering). Must not be more than 52 | #' 15 due to HOPACH constraints. Set to NULL to disable any dimension 53 | #' reduction. 54 | #' @param corthres Cut-off correlation with the explanatory 55 | #' variable for inclusion of an adjustment variable. 56 | #' @param impute Type of missing value imputation to conduct. One of: "zero", 57 | #' "median" (default), "knn". Note: knn results in the covariate data being centered/scaled. 58 | #' @param miss.cut Eliminates explanatory (X) variables with proportion 59 | #' of missing obs > this cut-off. 60 | #' @param bins_numeric Number of bins when discretizing numeric variables. 61 | #' @param quantile_probs_factor Quantiles used to check if factors have 62 | #' sufficient variation. 63 | #' @param quantile_probs_numeric Quantiles used to check if numerics have 64 | #' sufficient variation. 65 | #' @param parallel Use parallel processing if a backend is registered; enabled 66 | #' by default. 67 | #' @param verbose Boolean - if TRUE the method will display more detailed 68 | #' output. 69 | #' @param verbose_tmle Boolean - if TRUE, will display even more detail on the TMLE 70 | #' estimation process. 71 | #' @param verbose_reduction Boolean - if TRUE, will display more detail during the 72 | #' variable reduction step (clustering). 73 | #' @param digits Number of digits to round the value labels. 74 | #' 75 | #' @return Results object. TODO: add more detail here. 76 | #' 77 | #' @importFrom stats cor model.matrix na.omit pnorm quantile var 78 | #' @importFrom SuperLearner All 79 | #' 80 | #' @seealso 81 | #' \code{\link[varimpact]{exportLatex}}, \code{\link[varimpact]{print.varimpact}} 82 | #' 83 | #' @encoding utf8 84 | #' 85 | #' @section Authors: 86 | #' Alan E. Hubbard and Chris J. Kennedy, University of California, Berkeley 87 | #' 88 | #' 89 | #' @section References: 90 | #' Benjamini, Y., & Hochberg, Y. (1995). \emph{Controlling the false discovery 91 | #' rate: a practical and powerful approach to multiple testing}. Journal of the 92 | #' Royal Statistical Society.
Series B (Methodological), 289-300. 93 | #' 94 | #' Gruber, S., & van der Laan, M. J. (2012). \emph{tmle: An R Package for 95 | #' Targeted Maximum Likelihood Estimation}. Journal of Statistical Software, 96 | #' 51(i13). 97 | #' 98 | #' Hubbard, A. E., Kherad-Pajouh, S., & van der Laan, M. J. (2016). 99 | #' \emph{Statistical Inference for Data Adaptive Target Parameters}. The 100 | #' international journal of biostatistics, 12(1), 3-19. 101 | #' 102 | #' Hubbard, A., Munoz, I. D., Decker, A., Holcomb, J. B., Schreiber, M. A., 103 | #' Bulger, E. M., ... & Rahbar, M. H. (2013). \emph{Time-Dependent Prediction 104 | #' and Evaluation of Variable Importance Using SuperLearning in High Dimensional 105 | #' Clinical Data}. The journal of trauma and acute care surgery, 75(1 0 1), S53. 106 | #' 107 | #' Hubbard, A. E., & van der Laan, M. J. (2016). \emph{Mining with inference: 108 | #' data-adaptive target parameters (pp. 439-452)}. In P. Buhlmann et al. (Ed.), 109 | #' \emph{Handbook of Big Data}. CRC Press, Taylor & Francis Group, LLC: Boca 110 | #' Raton, FL. 111 | #' 112 | #' van der Laan, M. J. (2006). \emph{Statistical inference for variable 113 | #' importance}. The International Journal of Biostatistics, 2(1). 114 | #' 115 | #' van der Laan, M. J., & Pollard, K. S. (2003). \emph{A new algorithm for 116 | #' hybrid hierarchical clustering with visualization and the bootstrap}. Journal 117 | #' of Statistical Planning and Inference, 117(2), 275-303. 118 | #' 119 | #' van der Laan, M. J., Polley, E. C., & Hubbard, A. E. (2007). \emph{Super 120 | #' learner}. Statistical applications in genetics and molecular biology, 6(1). 121 | #' 122 | #' van der Laan, M. J., & Rose, S. (2011). \emph{Targeted learning: causal 123 | #' inference for observational and experimental data}. Springer Science & 124 | #' Business Media. 125 | #' 126 | #' @examples 127 | #' #################################### 128 | #' # Create test dataset. 129 | #' set.seed(1) 130 | #' N <- 100 131 | #' num_normal <- 5 132 | #' X <- as.data.frame(matrix(rnorm(N * num_normal), N, num_normal)) 133 | #' Y <- rbinom(N, 1, plogis(.2*X[, 1] + .1*X[, 2] - .2*X[, 3] + .1*X[, 3]*X[, 4] - .2*abs(X[, 4]))) 134 | #' # Add some missing data to X so we can test imputation. 135 | #' for (i in 1:10) X[sample(nrow(X), 1), sample(ncol(X), 1)] <- NA 136 | #' 137 | #' #################################### 138 | #' # Basic example 139 | #' 140 | #' vim <- varimpact(Y = Y, data = X[, 1:3]) 141 | #' vim 142 | #' vim$results_all 143 | #' exportLatex(vim) 144 | #' cleanup_latex_files() 145 | #' 146 | #' # Impute by median rather than knn. 147 | #' \dontrun{ 148 | #' vim <- varimpact(Y = Y, data = X[, 1:3], impute = "median") 149 | #' } 150 | #' 151 | #' #################################### 152 | #' # Multicore parallel example. 153 | #' \dontrun{ 154 | #' # Setup multicore parallelization. 155 | #' library(future) 156 | #' plan("multisession", workers = 2) 157 | #' 158 | #' vim <- varimpact(Y = Y, data = X[, 1:3]) 159 | #' } 160 | #' 161 | #' #################################### 162 | #' # Cluster parallel example. 163 | #' \dontrun{ 164 | #' cl = parallel::makeCluster(2L) 165 | #' plan(cluster, workers = cl) 166 | #' vim <- varimpact(Y = Y, data = X[, 1:3]) 167 | #' parallel::stopCluster(cl) 168 | #' } 169 | #' 170 | #' #################################### 171 | #' # mlbench BreastCancer example. 
172 | #' \dontrun{ 173 | #' data(BreastCancer, package="mlbench") 174 | #' data <- BreastCancer 175 | #' 176 | #' set.seed(1, "L'Ecuyer-CMRG") 177 | #' # Reduce to a dataset of 100 observations to speed up testing. 178 | # data = data[sample(nrow(data), 100), ] 179 | # 180 | #' # Create a numeric outcome variable. 181 | #' data$Y <- as.numeric(data$Class == "malignant") 182 | # 183 | #' # Use multicore parallelization to speed up processing. 184 | #' future::plan("multiprocess", workers = 2) 185 | #' vim <- varimpact(Y = data$Y, data = subset(data, select=-c(Y, Class, Id))) 186 | #' } 187 | #' 188 | #' @export 189 | varimpact = 190 | function(Y, 191 | data, 192 | A_names = colnames(data), 193 | V = 2L, 194 | #Q.library = c("SL.glmnet", "SL.mean"), 195 | #g.library = c("SL.glmnet", "SL.mean"), 196 | Q.library = c("SL.glm", "SL.mean"), 197 | g.library = c("SL.glm", "SL.mean"), 198 | #g.library = c("SL.stepAIC"), 199 | family = "binomial", 200 | minYs = 15L, 201 | minCell = 0L, 202 | adjust_cutoff = 10L, 203 | corthres = 0.8, 204 | impute = "median", 205 | miss.cut = 0.5, 206 | bins_numeric = 10L, 207 | quantile_probs_factor = c(0.1, 0.9), 208 | quantile_probs_numeric = quantile_probs_factor, 209 | verbose = FALSE, 210 | verbose_tmle = FALSE, 211 | verbose_reduction = FALSE, 212 | parallel = TRUE, 213 | digits = 4L) { 214 | 215 | # Time the full function execution. 216 | time_start = proc.time() 217 | 218 | ###################### 219 | # Argument checks. 220 | 221 | # Confirm that data has at least two columns. 222 | if (ncol(data) < 2L) { 223 | stop("Data argument must have at least two columns.") 224 | } 225 | 226 | # Ensure that Y is numeric; e.g. can't be a factor. 227 | stopifnot(class(Y) %in% c("numeric", "integer")) 228 | 229 | if (family == "binomial" && 230 | (min(Y, na.rm = TRUE) < 0 || max(Y, na.rm = TRUE) > 1)) { 231 | stop("With binomial family Y must be bounded by [0, 1]. Specify family=\"gaussian\" otherwise.") 232 | } 233 | 234 | if (!family %in% c("binomial", "gaussian")) { 235 | stop('Family must be either "binomial" or "gaussian".') 236 | } 237 | 238 | if (parallel && verbose) { 239 | cat("Future backend set to the following:\n") 240 | print(future::plan()) 241 | } 242 | 243 | # Save bounds on the full Y variables for later transformation if Y is not binary. 244 | if (family == "binomial" || length(unique(Y)) == 2) { 245 | #Qbounds = NULL 246 | Qbounds = c(0, 1) 247 | } else { 248 | # This part is duplicated from the TMLE code in tmle_init_stage1. 249 | 250 | # Define Qbounds just for continuous (non-binary) outcomes. 251 | Qbounds = range(Y, na.rm = TRUE) 252 | # Extend bounds 10% beyond the observed range. 253 | # NOTE: if one of the bounds is zero then it won't be extended. 254 | Qbounds = Qbounds + 0.1 * c(-abs(Qbounds[1]), abs(Qbounds[2])) 255 | } 256 | 257 | ######## 258 | # Applied to Explanatory (X) data frame 259 | sna = sapply(data, sum_na) 260 | 261 | n = nrow(data) 262 | 263 | ####### 264 | # Missing proportion by variable. 265 | mis.prop = sna / n 266 | 267 | ####### 268 | # Cut-off for eliminating variable for proportion of obs missing. 269 | data = data[, mis.prop < miss.cut, drop = FALSE] 270 | 271 | # TODO: move this stuff into a separate function. 272 | if (verbose) cat("Removed", sum(mis.prop >= miss.cut), "variables due to high", 273 | "missing value proportion.\n") 274 | 275 | # Separate dataframe into factors-only and numerics-only. 276 | # Also converts characters to factors automatically. 
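# separate_factors_numerics() (defined earlier in this package) returns a list with
# $df_factors and $df_numerics, which feed the factor and numeric pre-processing steps below.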
277 | separated_data = separate_factors_numerics(data) 278 | 279 | factors = process_factors(separated_data$df_factors, 280 | quantile_probs_factor = quantile_probs_factor, 281 | miss.cut = miss.cut, 282 | verbose = verbose) 283 | 284 | # Pre-process numeric/continuous variables. 285 | numerics = 286 | process_numerics(separated_data$df_numerics, 287 | quantile_probs_numeric = quantile_probs_numeric, 288 | miss.cut = miss.cut, 289 | bins_numeric = bins_numeric, 290 | impute = impute, 291 | verbose = verbose) 292 | 293 | cat("Finished pre-processing variables.\n") 294 | 295 | cat("\nProcessing results:\n") 296 | cat("- Factor variables:", factors$num_factors, "\n") 297 | cat("- Numeric variables:", numerics$num_numeric, "\n\n") 298 | 299 | # Create cross-validation folds (2 by default). 300 | folds = create_cv_folds(V, Y, verbose = verbose) 301 | 302 | # VIM for factors. 303 | factor_vims = 304 | vim_factors(Y = Y, numerics = numerics, factors = factors, 305 | V = V, folds = folds, 306 | A_names = A_names, 307 | family = family, 308 | minCell = minCell, 309 | minYs = minYs, 310 | Q.library = Q.library, 311 | g.library = g.library, 312 | Qbounds = Qbounds, 313 | corthres = corthres, 314 | adjust_cutoff = adjust_cutoff, 315 | verbose = verbose, 316 | verbose_tmle = verbose_tmle, 317 | verbose_reduction = verbose_reduction) 318 | 319 | # Repeat for numerics. 320 | numeric_vims = 321 | vim_numerics(Y = Y, numerics = numerics, factors = factors, 322 | V = V, folds = folds, 323 | A_names = A_names, 324 | family = family, 325 | minCell = minCell, 326 | minYs = minYs, 327 | Q.library = Q.library, 328 | g.library = g.library, 329 | Qbounds = Qbounds, 330 | corthres = corthres, 331 | adjust_cutoff = adjust_cutoff, 332 | verbose = verbose, 333 | verbose_tmle = verbose_tmle, 334 | verbose_reduction = verbose_reduction) 335 | 336 | # Combine the separate continuous and factor results. 337 | results = 338 | compile_results(numeric_vims$colnames_numeric, 339 | factor_vims$colnames_factor, 340 | numeric_vims$vim_numeric, 341 | factor_vims$vim_factor, 342 | V = V, 343 | verbose = verbose) 344 | 345 | # End timing the full execution. 346 | time_end = proc.time() 347 | 348 | # Final compilation of results. 349 | results = c(results, 350 | # Append additional settings to the results object. 351 | # TODO: make this a sublist? 352 | list(V = V, 353 | g.library = g.library, 354 | Q.library = Q.library, 355 | minCell = minCell, 356 | minYs = minYs, 357 | family = family, 358 | datafac.dumW = factors$datafac.dumW, 359 | miss.fac = factors$miss.fac, 360 | data.numW = numerics$data.numW, 361 | numeric_vims = numeric_vims, 362 | factor_vims = factor_vims, 363 | impute_info = numerics$impute_info, 364 | time = time_end - time_start, 365 | cv_folds = folds)) 366 | 367 | # Set a custom class so that we can override print and summary. 368 | class(results) = "varimpact" 369 | 370 | invisible(results) 371 | } 372 | -------------------------------------------------------------------------------- /R/vim-factors.R: -------------------------------------------------------------------------------- 1 | vim_factors = 2 | function(Y, 3 | numerics, 4 | factors, 5 | V, 6 | folds, 7 | A_names, 8 | family, 9 | minCell, 10 | minYs, 11 | Q.library, 12 | g.library, 13 | Qbounds, 14 | corthres, 15 | adjust_cutoff, 16 | verbose = FALSE, 17 | verbose_tmle = FALSE, 18 | verbose_reduction = FALSE) { 19 | 20 | # NOTE: we use && so that conditional will short-circuit if num_factors == 0. 
21 | if (factors$num_factors > 0L && ncol(factors$data.fac) > 0L) { 22 | cat("Estimating variable importance for", factors$num_factors, "factors.\n") 23 | 24 | # Find the level of covariate that has lowest risk 25 | datafac.dumW = factors$datafac.dum 26 | # NOTE: can't we skip this line because we already imputed missing data to 0? 27 | datafac.dumW[is.na(factors$datafac.dum)] = 0 28 | 29 | ############################# 30 | # Below is to get indexing vectors so that any basis functions related to current A 31 | # that are in covariate matrix can be removed. 32 | names.fac = colnames(factors$data.fac) 33 | nmes.facW = colnames(datafac.dumW) 34 | nmes.mfacW = colnames(factors$miss.fac) 35 | nchar.facW = nchar(nmes.facW) + 1 36 | nchar.mfacW = nchar(nmes.mfacW) + 1 37 | 38 | XXm = regexpr("XX", nmes.facW) 39 | XXm[XXm < 0] = nchar.facW[XXm < 0] 40 | 41 | XXm2 = regexpr("XX", nmes.mfacW) 42 | XXm2[XXm2 < 0] = nchar.mfacW[XXm2 < 0] 43 | 44 | vars.facW = substr(nmes.facW, 1, XXm - 1) 45 | vars.mfacW = substr(nmes.mfacW, 7, XXm2 - 1) 46 | 47 | xc = ncol(factors$data.fac) 48 | n.fac = nrow(factors$data.fac) 49 | 50 | # vim_factor = lapply(1:xc, function(i) { 51 | 52 | # vim_factor will be a list of results, one element per factor variable. 53 | # Define var_i just to avoid automated NOTEs, will be overwritten by foreach. 54 | var_i = NULL 55 | #vim_factor = foreach::foreach(var_i = 1:xc, .verbose = verbose, .errorhandling = "stop") %do_op% { 56 | vim_factor = future.apply::future_lapply(1:xc, future.seed = TRUE, function(var_i) { 57 | #vim_factor = lapply(1:xc, function(var_i) { 58 | nameA = names.fac[var_i] 59 | 60 | if (verbose) cat("Var:", nameA, var_i, "out of", xc, "factor variables\n") 61 | 62 | if (!nameA %in% A_names) { 63 | if (verbose) cat("Skipping", nameA, "as it is not in A_names.\n") 64 | return(NULL) 65 | } 66 | 67 | # Loop over each fold. 68 | # TODO: incorporate this loop into parallelization. 69 | # for (fold_k in 1:V) { 70 | 71 | # This is looping sequentially for now. 72 | #fold_results = foreach::foreach(fold_k = 1:V) foreach::`%do%` { 73 | fold_results = lapply(1:V, function(fold_k) { 74 | if (verbose) cat("i =", var_i, "V =", fold_k, "\n") 75 | 76 | # All data not in this fold is the training data. 77 | At = factors$data.fac[folds != fold_k, var_i] 78 | 79 | # All data in this fold is the validation data. 80 | Av = factors$data.fac[folds == fold_k, var_i] 81 | 82 | Yt = Y[folds != fold_k] 83 | Yv = Y[folds == fold_k] 84 | 85 | 86 | ####################################### 87 | # Create adjustment dataframe. 88 | 89 | ### acit.numW is just same as acit.cont.dist except with NA's replaced by 90 | ### 0's. 91 | mtch1 = match(vars.facW, nameA) 92 | mtch2 = match(vars.mfacW, nameA) 93 | Adum = data.frame(factors$datafac.dum[, is.na(mtch1) == FALSE]) 94 | dumW = factors$datafac.dum[, is.na(mtch1)] 95 | missdumW = factors$miss.fac[, is.na(mtch2)] 96 | 97 | if (is.null(missdumW)) { 98 | missdumW = rep(NA, n.fac) 99 | } 100 | if (is.null(numerics$miss.cont)) { 101 | numerics$miss.cont = rep(NA, n.fac) 102 | } 103 | if (is.null(dumW)) { 104 | dumW = rep(NA, n.fac) 105 | } 106 | if (is.null(numerics$data.numW)) { 107 | numerics$data.numW = rep(NA, n.fac) 108 | } 109 | 110 | W = data.frame(numerics$data.numW, numerics$miss.cont, dumW, missdumW) 111 | 112 | # Restrict to columns in which there is less than 100% missingness. 113 | W = W[, !apply(is.na(W), 2, all), drop = FALSE] 114 | 115 | ####################################### 116 | 117 | # Divide into training and validation subsets. 
118 | Wt = W[folds != fold_k, , drop = FALSE] 119 | Wv = W[folds == fold_k, , drop = FALSE] 120 | 121 | Adum = data.frame(Adum[folds != fold_k, ]) 122 | 123 | ### 124 | # Pull out any variables that are overly correlated with At (corr coef < corthes) 125 | #if (sd(Adum) == 0) { 126 | # if (verbose) cat("Warning: sd of Adum = 0.\n") 127 | #} 128 | 129 | # Suppress possible "the standard deviation is zero" warning from cor(). 130 | # TODO: investigate more and confirm that this is ok. 131 | suppressWarnings({ 132 | corAt = apply(stats::cor(Adum, Wt, use = "complete.obs"), 2, max_sqr) 133 | }) 134 | corAt[corAt < -1] = 0 135 | # cat('i = ',i,' maxCor = ',max(corAt,na.rm=T),'\n') 136 | incc = abs(corAt) < corthres & !is.na(corAt) 137 | 138 | if (verbose && sum(!incc) > 0) { 139 | cat("Removed", sum(!incc), "columns based on correlation threshold", corthres, "\n") 140 | } 141 | 142 | Wv = Wv[, incc, drop = F] 143 | Wt = Wt[, incc, drop = F] 144 | 145 | if (verbose) { 146 | cat("Columns:", ncol(Wt)) 147 | if (!is.null(adjust_cutoff)) cat(" Reducing dimensions to", adjust_cutoff) 148 | cat("\n") 149 | } 150 | 151 | # Use HOPACH to reduce dimension of W to some level of tree 152 | reduced_results = reduce_dimensions(Wt, Wv, adjust_cutoff, verbose = verbose_reduction) 153 | 154 | Wtsht = reduced_results$data 155 | Wvsht = reduced_results$newX 156 | 157 | # We should have no constant columns after calling reduce_dimensions(). 158 | # Remove any NA values - but shouldn't these already be imputed? 159 | is_constant = sapply(Wtsht, function(col) var(col, na.rm = TRUE) == 0) 160 | # Restrict to just the TRUE variables - those that are constant. 161 | is_constant = is_constant[is_constant] 162 | 163 | if (verbose) { 164 | cat("Updated ncols -- training:", ncol(Wtsht), "test:", ncol(Wvsht), "\n") 165 | 166 | # We should have no constant columns after calling reduce_dimensions(). 167 | if (length(is_constant) > 0) { 168 | cat("Constant columns (", length(is_constant), "):\n") 169 | print(is_constant) 170 | } 171 | } 172 | 173 | # Finished with any needed clustering for variable reduction. 174 | 175 | deltat = as.numeric(!is.na(Yt) & !is.na(At)) 176 | deltav = as.numeric(!is.na(Yv) & !is.na(Av)) 177 | 178 | # TODO (CK): don't do this, in order to use the delta missingness estimation. 179 | # To avoid crashing TMLE function just drop obs missing A or Y if the 180 | # total number of missing is < 10 181 | if (sum(deltat == 0) < 10) { 182 | Yt = Yt[deltat == 1] 183 | At = At[deltat == 1] 184 | Wtsht = Wtsht[deltat == 1, , drop = FALSE] 185 | deltat = deltat[deltat == 1] 186 | } 187 | 188 | levA = levels(At) 189 | 190 | if (length(unique(Yt)) == 2) { 191 | # Binary outcome. 192 | 193 | # Minimum numer of observations for each cell in validation fold. 194 | minc = apply(table(Av, Yv), 1, min) 195 | 196 | # Minimum numer of observations for each cell in training fold. 197 | minc2 = apply(table(At, Yt), 1, min) 198 | 199 | # Restrict analysis to levels of this variable in which 200 | # there are sufficient observations in each fold 201 | # across each treatment and outcome combination. 202 | vals = levA[pmin(minc, minc2) > minCell] 203 | } else { 204 | # Continuous outcome. 205 | vals = levA 206 | } 207 | num.cat = length(vals) 208 | 209 | # CK TODO 6/6: don't assume that positive outcome is the rare 210 | # outcome. (e.g. via table) 211 | 212 | # Number of positive outcomes in training data. 213 | nYt = sum(Yt[!is.na(At)]) 214 | 215 | # Number of positive outcomes in validation data. 
216 | nYv = sum(Yv[!is.na(Av)]) 217 | 218 | # Create a list to hold the results we calculate in this fold. 219 | # Set them to default values and update as they are calculated. 220 | fold_result = list( 221 | failed = TRUE, 222 | # Message should report on the status for this fold. 223 | message = "", 224 | obs_training = length(Yt), 225 | obs_validation = length(Yv), 226 | error_count = 0, 227 | # Save the variables chosen by reduce_dimensions(). 228 | variables = reduced_results$variables, 229 | # Results for estimating the maximum level / treatment. 230 | level_max = list( 231 | # Level is which bin was chosen. 232 | level = NULL, 233 | # Label is the description of that bin. 234 | label = NULL, 235 | # val_preds contains the g, Q, and H predictions on the validation data. 236 | val_preds = NULL, 237 | # Estimate of EY on the training data. 238 | estimate_training = NULL, 239 | # Risk from SuperLearner on Q. 240 | risk_Q = NULL, 241 | # Risk from SuperLearner on g. 242 | risk_g = NULL 243 | ) 244 | ) 245 | # Copy the blank result to a second element for the minimum level/bin. 246 | fold_result$level_min = fold_result$level_max 247 | 248 | ############################ 249 | # Don't do if 1) no more than one category of A left or 250 | # 2) if missingness pattern for A is such that there are few death events left 251 | # in either (< minYs) 252 | # Applies only to binary outcomes, not continuous. 253 | if ((length(unique(Yt)) == 2L && 254 | (num.cat < 2L || min(nYt, nYv) < minYs)) || 255 | (length(is_constant) > 0 && mean(is_constant) == 1)) { 256 | if (length(is_constant) > 0 && mean(is_constant) == 1) { 257 | error_msg = paste("Skipping", nameA, "because HOPACH reduced W to", 258 | "all constant columns.") 259 | } else if (num.cat < 2L) { 260 | error_msg = paste("Skipping", nameA, "due to lack of variation.") 261 | } else { 262 | error_msg = paste("Skipping", nameA, "due to minY constraint.", min(nYt, nYv), "<", minYs) 263 | } 264 | if (verbose) cat(error_msg, "\n") 265 | 266 | fold_result$message = error_msg 267 | # At this point here we are skipping to the end of the loop. 268 | 269 | } else { 270 | if (verbose) cat("Estimating TMLE on training", paste0("(", num.cat, ")")) 271 | 272 | error_count = 0 273 | 274 | # TODO: stop using training_estimates and switch to using bin_results. 275 | training_estimates = list() 276 | bin_results = list() 277 | 278 | # Estimate Y_a, Q hat and g hat for each level of our current variable, 279 | # on the training data. 280 | for (bin_j in 1:num.cat) { 281 | 282 | # Create a list to hold the results for this level. 283 | bin_result = list( 284 | name = nameA, 285 | cv_fold = fold_k, 286 | level = bin_j, 287 | #level_label = At_bin_labels[bin_j], 288 | level_label = vals[bin_j], 289 | 290 | # Training: Estimates 291 | # Our default value needs to be NA rather than NULL, 292 | # to allow rbinding into a dataframe later on. 293 | train_theta_tmle = NA, 294 | # TODO: implement these. 295 | #train_theta_iptw = NA, 296 | #train_theta_gcomp = NA, 297 | train_theta_unadj = NA, 298 | 299 | # Training: Misc 300 | train_cell_size = NA, 301 | train_msg = NA, 302 | 303 | # Test: Estimates 304 | test_theta_tmle = NA, 305 | # TODO: implement these. 
306 | #test_theta_iptw = NA,
307 | #test_theta_gcomp = NA,
308 | test_theta_unadj = NA,
309 |
310 | # Test: Misc
311 | test_cell_size = NA,
312 | test_msg = NA,
313 |
314 | # Test: Predicted values (g, Q_bar, h)
315 | test_predictions = NULL
316 | #test_pred_g = NULL,
317 | #test_pred_Q_bar = NULL,
318 | #test_pred_h = NULL
319 | )
320 |
321 |
322 | # Create a treatment indicator, where 1 = obs in this bin
323 | # and 0 = obs not in this bin.
324 | IA = as.numeric(At == vals[bin_j])
325 |
326 | # Save how many obs have this level/bin in this training fold.
327 | bin_result$train_cell_size = sum(IA)
328 |
329 | # Any observations missing At are assigned to 0.
330 | IA[is.na(IA)] = 0
331 |
332 | # Save unadjusted estimate: outcome mean among observations
333 | # at the desired treatment level, who are not missing their outcome value.
334 | bin_result$train_theta_unadj = mean(Yt[IA & deltat])
335 |
336 | # if(min(table(IA,Yt))>=)
337 |
338 | # CV-TMLE: we are using this for three reasons:
339 | # 1. Estimate Y_a on training data.
340 | # 2. Estimate Q on training data.
341 | # 3. Estimate g on training data.
342 | tmle_result = try(estimate_tmle2(Yt, IA, Wtsht, family, deltat,
343 | Q.lib = Q.library,
344 | Qbounds = Qbounds,
345 | g.lib = g.library, verbose = verbose_tmle),
346 | silent = !verbose)
347 |
348 | if (inherits(tmle_result, "try-error")) {
349 | # TMLE estimation failed.
350 | if (verbose) cat("X")
351 | error_count = error_count + 1
352 |
353 | # Initialize to NULL so validation code doesn't get subscript error
354 | training_estimates[[bin_j]] = NULL
355 | } else {
356 | # TMLE estimation succeeded.
357 |
358 | # Save label
359 | tmle_result$label = vals[bin_j]
360 |
361 | training_estimates[[bin_j]] = tmle_result
362 |
363 | # TODO: may also want to save the TMLE object to this bin_result list.
364 | bin_result$train_theta_tmle = tmle_result$theta
365 |
366 | if (verbose) {
367 | cat(".")
368 | }
369 | }
370 |
371 | #######################################################
372 | # NEW: also run code on corresponding validation fold.
373 |
374 | # TODO: remove the later validation code and use these results instead.
375 |
376 | # Indicator for having the desired treatment bin on validation
377 | IA = as.numeric(Av == vals[bin_j])
378 |
379 | # Missing values are not taken to be in this level.
380 | IA[is.na(IA)] = 0
381 |
382 | # Save how many obs have this level/bin in this validation fold.
383 | bin_result$test_cell_size = sum(IA)
384 |
385 | ##################
386 | # Run estimates on validation data (TMLE, IPTW, G-Comp, Unadj)
387 | # TODO: move into its own function.
388 |
389 | # Save unadjusted estimate: outcome mean among observations
390 | # at the desired treatment level, who are not missing their outcome value.
391 | bin_result$test_theta_unadj = mean(Yv[IA & deltav])
392 |
393 | # CV-TMLE: predict g, Q, and clever covariate on validation data.
394 | if (!is.null(training_estimates[[bin_j]])) {
395 | preds = try(apply_tmle_to_validation(Yv, IA, Wvsht, family,
396 | deltav, training_estimates[[bin_j]],
397 | verbose = verbose))
398 | if (inherits(preds, "try-error")) {
399 | bin_result$test_msg = paste("CV-TMLE prediction on validation failed")
400 | } else {
401 | # Save the result and mark this bin as successful.
402 | bin_result$test_predictions = preds
403 | bin_result$test_msg = "success"
404 | }
405 | }
406 |
407 |
408 | # Save to the main list.
409 | bin_results[[bin_j]] = bin_result
410 |
411 | }
412 | # Finished looping over each level of the assignment variable.
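# At this point training_estimates holds one TMLE fit per usable level of A
# (fit on the training folds), and bin_results holds the per-level training
# and validation summaries, including the validation-fold predictions of g,
# Q-bar, and the clever covariate used by the pooled CV-TMLE step later on.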
413 | if (verbose) cat(" done. Errors:", error_count, "\n") 414 | 415 | # Save individual bin results. 416 | fold_result$bin_results = bin_results 417 | 418 | # Create a dataframe version of the bin results. 419 | fold_result$bin_df = 420 | do.call(rbind, lapply(bin_results, function(result) { 421 | # Exclude certain elements from the list - here the prediction vectors. 422 | # These should be saved separately. 423 | data.frame(result[!names(result) %in% c("test_predictions")], 424 | stringsAsFactors = FALSE) 425 | })) 426 | 427 | # Save test_predictions for each bin into a combined dataframe. 428 | fold_result$test_predictions = 429 | do.call(rbind, lapply(1:length(bin_results), function(bin) { 430 | result = bin_results[[bin]] 431 | tryCatch({ 432 | data.frame(bin = bin, 433 | bin_label = vals[bin], 434 | fold = fold_k, 435 | result$test_predictions, 436 | stringsAsFactors = FALSE) 437 | }, error = function(error) { 438 | NULL 439 | }) 440 | }) 441 | ) 442 | 443 | ##################################### 444 | # Resume normal varimpact algorithm. 445 | 446 | fold_result$error_count = error_count 447 | 448 | # Extract theta estimates. 449 | theta_estimates = sapply(training_estimates, function(result) { 450 | # Handle errors in the tmle estimation by returning NA. 451 | if (is.null(result)) { 452 | NA 453 | } else { 454 | ifelse("theta" %in% names(result), result$theta, NA) 455 | } 456 | }) 457 | 458 | if (!all(is.na(theta_estimates))) { 459 | # Identify maximum EY1 (theta) 460 | # Note: this may be NA if the tmle estimation failed. 461 | maxj = which.max(theta_estimates) 462 | 463 | # Identify minimum EY1 (theta) 464 | # Note: this may be NA if the tmle estimation failed. 465 | minj = which.min(theta_estimates) 466 | if (verbose) { 467 | cat("Max level:", vals[maxj], paste0("(", maxj, ")"), 468 | "Min level:", vals[minj], paste0("(", minj, ")"), "\n") 469 | } 470 | } else { 471 | maxj = NA 472 | minj = NA 473 | } 474 | 475 | # This fold failed if we got an error for each category 476 | # Or if the minimum and maximum bin is the same. 477 | # Or if the min/max training estimates are NULL. 478 | if (error_count == num.cat || 479 | (is.na(minj) && is.na(maxj)) || 480 | minj == maxj || 481 | is.null(training_estimates[[minj]]) || is.null(training_estimates[[maxj]])) { 482 | message = paste("Fold", fold_k, "failed,") 483 | if (length(theta_estimates) == 0 || error_count == num.cat) { 484 | message = paste(message, "all", num.cat, "levels had errors.") 485 | } else if (minj == maxj) { 486 | message = paste(message, "min and max level are the same. (j = ", minj, ")") 487 | } else { 488 | message = paste(message, "min or max training estimate is NULL.") 489 | } 490 | fold_result$message = message 491 | 492 | if (verbose) { 493 | cat(message, "\n") 494 | } 495 | } else { 496 | 497 | # Extract max items. 498 | maxEY1 = training_estimates[[maxj]]$theta 499 | labmax = vals[maxj] 500 | 501 | # Save these items into the fold_result list. 502 | fold_result$level_max$level = maxj 503 | fold_result$level_max$estimate_training = maxEY1 504 | fold_result$level_max$label = labmax 505 | # Save the Q risk for the discrete SuperLearner. 506 | # We don't have the CV.SL results for the full SuperLearner as it's too 507 | # computationallity intensive. 508 | fold_result$level_max$risk_Q = 509 | training_estimates[[maxj]]$q_model$cvRisk[ 510 | which.min(training_estimates[[maxj]]$q_model$cvRisk)] 511 | # And the g's discrete SL risk. 
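# (cvRisk from a SuperLearner fit is the cross-validated risk of each
# candidate learner, so its minimum is the risk of the discrete SuperLearner
# selector - the quantity stored as risk_Q and risk_g here.)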
512 | fold_result$level_max$risk_g =
513 | training_estimates[[maxj]]$g_model$cvRisk[
514 | which.min(training_estimates[[maxj]]$g_model$cvRisk)]
515 |
516 | # Extract TMLE results.
517 | #fold_result$level_max
518 |
519 | #fold_result$level_max$tmle = training_estimates[[maxj]]
520 |
521 | # Extract min items.
522 | minEY1 = training_estimates[[minj]]$theta
523 | labmin = vals[minj]
524 |
525 | # Save these items into the fold_result list.
526 | fold_result$level_min$level = minj
527 | fold_result$level_min$estimate_training = minEY1
528 | fold_result$level_min$label = labmin
529 | #fold_result$level_min$tmle = training_estimates[[minj]]
530 |
531 | # Save the Q risk for the discrete SuperLearner.
532 | # We don't have the CV.SL results for the full SuperLearner as it's too
533 | # computationally intensive.
534 | fold_result$level_min$risk_Q =
535 | training_estimates[[minj]]$q_model$cvRisk[
536 | which.min(training_estimates[[minj]]$q_model$cvRisk)]
537 | # And the g's discrete SL risk.
538 | fold_result$level_min$risk_g =
539 | training_estimates[[minj]]$g_model$cvRisk[
540 | which.min(training_estimates[[minj]]$g_model$cvRisk)]
541 |
542 | # Turn to validation data.
543 |
544 | # Estimate minimum level (control).
545 |
546 | # Indicator for having the desired control bin on validation.
547 | IA = as.numeric(Av == vals[minj])
548 |
549 | # Missing values are not taken to be in this level.
550 | IA[is.na(IA)] = 0
551 |
552 | if (verbose) cat("\nMin level prediction - apply_tmle_to_validation()\n")
553 |
554 | # CV-TMLE: predict g, Q, and clever covariate on validation data.
555 | min_preds = try(apply_tmle_to_validation(Yv, IA, Wvsht, family,
556 | deltav, training_estimates[[minj]],
557 | verbose = verbose))
558 |
559 | # Old version:
560 | #res = try(estimate_tmle(Yv, IA, Wvsht, family, deltav,
561 | # Q.lib = Q.library,
562 | # g.lib = g.library, verbose = verbose),
563 | # silent = T)
564 |
565 | if (inherits(min_preds, "try-error")) {
566 | message = paste("CV-TMLE prediction on validation failed during",
567 | "low/control level.")
568 | fold_result$message = message
569 | if (verbose) cat(message, "\n")
570 | } else {
571 | # Save the result.
572 | fold_result$level_min$val_preds = min_preds
573 |
574 | # Switch to maximum level (treatment).
575 |
576 | # Indicator for having the desired treatment bin on validation
577 | IA = as.numeric(Av == vals[maxj])
578 |
579 | # Missing values are not taken to be in this level.
580 | IA[is.na(IA)] = 0
581 |
582 | if (verbose) cat("\nMax level prediction - apply_tmle_to_validation()\n")
583 |
584 | # CV-TMLE: predict g, Q, and clever covariate on validation data.
585 | max_preds = try(apply_tmle_to_validation(Yv, IA, Wvsht, family,
586 | deltav, training_estimates[[maxj]],
587 | verbose = verbose))
588 | # Old code:
589 | #res2 = try(estimate_tmle(Yv, IA, Wvsht, family, deltav,
590 | # Q.lib = Q.library,
591 | # g.lib = g.library, verbose = verbose),
592 | # silent = !verbose)
593 |
594 |
595 | if (inherits(max_preds, "try-error")) {
596 | message = paste("CV-TMLE prediction on validation failed",
597 | "during high/treatment level.")
598 | fold_result$message = message
599 | if (verbose) cat(message, "\n")
600 | } else {
601 | # Save the result.
602 | fold_result$level_max$val_preds = max_preds
603 | fold_result$message = "Success"
604 | # Fold succeeded.
605 | fold_result$failed = FALSE
606 | }
607 | }
608 | }
609 | }
610 | if (verbose) cat("Completed fold", fold_k, "\n\n")
611 |
612 | # Return results for this fold.
613 | fold_result 614 | } 615 | ) # End lapply if we're not using foreach. 616 | # Done looping over each fold. 617 | 618 | ######################## 619 | # Reconstruct CV-TMLE treatment-specific mean estimate for each validation fold. 620 | # Loop over bins and calculate bin-specific CV-TMLE estimate. 621 | if (verbose) cat("Estimating CV-TMLE treatment-specific means.\n") 622 | #for (bin in 1:numcat.cont[var_i]) { 623 | num_bins = length(levels(factors$data.fac[, var_i])) 624 | for (bin in 1:num_bins) { 625 | 626 | if (verbose) { 627 | cat("Bin", bin, "of", num_bins, "\n") 628 | } 629 | # Expects a list of results by fold. 630 | # Each element of that list should have the val_preds list, which 631 | # is calculated by apply_tmle_to_validation and currently saved in 632 | # fold_results[[*]]$test_predictions (separately by fold * level). 633 | compile_rows = lapply(fold_results, function(fold_r) { 634 | # Check if we even have test predictions for this fold. 635 | # We may not if the estimation failed on the training data. 636 | if (!"test_predictions" %in% names(fold_r)) { 637 | if (verbose) cat("(failed) ") 638 | return(NULL) 639 | } 640 | # Extract the rows specific to this bin/level. 641 | rows = fold_r$test_predictions[fold_r$test_predictions$bin == bin, , drop = FALSE] 642 | if (verbose) cat("Rows:", nrow(rows), " ") 643 | # If we have 0 rows for this bin in this fold, we need to debug. 644 | # if (nrow(rows) == 0) browser() 645 | rows 646 | }) 647 | bin_df = do.call(rbind, compile_rows) 648 | if (verbose) cat("\n") 649 | 650 | if (!inherits(bin_df, "data.frame") || nrow(bin_df) == 0L) { 651 | if (verbose) { 652 | cat("Skipping bin", bin, "- no rows are available.\n") 653 | } 654 | # We have no val_preds for this bin, so skip pooled result estimation. 655 | 656 | # Temporary simplification for debugging purposes. 657 | #pooled_bin = list(thetas = 1:V) 658 | pooled_bin = list(thetas = rep(NA, V)) 659 | } else { 660 | 661 | # Create a list with one element ($val_preds df) per fold. 662 | bin_list = lapply(1:V, function(fold_i) { 663 | # Return with an enclosing list. 664 | list(bin_df[bin_df$fold == fold_i, ]) 665 | }) 666 | 667 | # Rename the element to be $val_preds 668 | for (fold in 1:V) { 669 | names(bin_list[[fold]]) = c("val_preds") 670 | } 671 | 672 | # bin_df can be NULL if the variable is skipped due to errors, 673 | # e.g. lack of variation. 674 | if (!is.null(bin_df) && nrow(bin_df) > 0L) { 675 | pooled_bin = estimate_pooled_results(bin_list, verbose = verbose) 676 | # Now we have $thetas and $influence_curves 677 | 678 | # Save the vector of estimates into the appropriate spot. 679 | # $thetas has the treatment-specific means 680 | # $influence_curves can be used to calculate the SE's, but shouldn't those 681 | # already be calculated by estimate_pooled_results() 682 | 683 | # Loop over fold results and insert the thetas into appropriate df. 684 | for (fold in 1:length(bin_list)) { 685 | bin_df = fold_results[[fold]]$bin_df 686 | row = bin_df$level == bin & bin_df$cv_fold == fold 687 | fold_results[[fold]]$bin_df[row, "test_theta_tmle"] = pooled_bin$thetas[fold] 688 | fold_results[[fold]]$bin_df[row, "test_var_tmle"] = var(pooled_bin$influence_curves[[fold]]) 689 | } 690 | 691 | } else { 692 | if (verbose) { 693 | cat("Skipping bin", bin, "- no rows are available.\n") 694 | } 695 | # We have no val_preds for this bin, so skip pooled result estimation. 696 | 697 | # Temporary simplification for debugging purposes. 
698 | #pooled_bin = list(thetas = 1:V) 699 | pooled_bin = list(thetas = rep(NA, V)) 700 | } 701 | 702 | if (verbose) { 703 | cat("\n") 704 | } 705 | 706 | } 707 | } 708 | 709 | # Combine results for each fold into a single dataframe. 710 | # This can fail if all bins for this variable failed. 711 | tryCatch({ 712 | results_by_fold_and_level = do.call(rbind, lapply(fold_results, `[[`, "bin_df")) 713 | }, error = function(e) { 714 | results_by_fold_and_level = NULL 715 | }) 716 | 717 | # Aggregate into a results_by_level dataframe. 718 | # This can fail if all bins for this variable failed. 719 | tryCatch({ 720 | results_by_level = results_by_level(results_by_fold_and_level, 721 | verbose = verbose) 722 | }, error = function(e) { 723 | results_by_level = NULL 724 | }) 725 | 726 | # Create list to save results for this variable. 727 | var_results = list( 728 | EY1V = NULL, 729 | EY0V = NULL, 730 | thetaV = NULL, 731 | thetaV_rr = NULL, 732 | varICV = NULL, 733 | labV = NULL, 734 | nV = NULL, 735 | fold_results = fold_results, 736 | type = "factor", 737 | results_by_fold_and_level = results_by_fold_and_level, 738 | results_by_level = results_by_level, 739 | name = nameA 740 | ) 741 | 742 | # TODO: compile results into the new estimate. 743 | 744 | if (verbose) cat("Estimating pooled min.\n") 745 | pooled_min = estimate_pooled_results(lapply(fold_results, function(x) x$level_min), 746 | verbose = verbose) 747 | cat("\n") 748 | if (verbose) cat("Estimating pooled max.\n") 749 | pooled_max = estimate_pooled_results(lapply(fold_results, function(x) x$level_max), 750 | verbose = verbose) 751 | cat("\n") 752 | 753 | var_results$EY0V = pooled_min$thetas 754 | var_results$EY1V = pooled_max$thetas 755 | 756 | if (length(var_results$EY1V) == length(var_results$EY0V)) { 757 | var_results$thetaV = var_results$EY1V - var_results$EY0V 758 | 759 | # Calculate relative risk parameter (aka risk ratio). 760 | # This only makes sense if Y is binary. 761 | var_results$thetaV_rr = var_results$EY1V / var_results$EY0V 762 | 763 | } else { 764 | if (verbose) { 765 | cat("Error: EY1V and EY0V are different lengths. EY1V =", 766 | length(var_results$EY1V), "EY0V =", length(var_results$EY0V), "\n") 767 | } 768 | var_results$thetaV = rep(NA, max(length(var_results$EY1V), 769 | length(var_results$EY0V))) 770 | 771 | var_results$thetaV_rr = rep(NA, max(length(var_results$EY1V), 772 | length(var_results$EY0V))) 773 | } 774 | 775 | 776 | # Save how many observations were in each validation fold. 777 | var_results$nV = sapply(fold_results, function(x) x$obs_validation) 778 | 779 | # Combine labels into a two-column matrix. 780 | # First column is min and second is max. 781 | # TODO: not sure if data structure for this part is correct. 782 | labels = do.call(rbind, 783 | lapply(fold_results, function(x) c(x$level_min$label, x$level_max$label))) 784 | 785 | var_results$labV = labels 786 | 787 | # If either of the thetas is null it means that all CV-TMLE folds failed. 788 | if (!is.null(pooled_min$thetas)) { 789 | 790 | # Influence_curves here is a list, with an element for each fold. 791 | var_results$varICV = sapply(1:V, function(index) { 792 | if (length(pooled_max$influence_curves) >= index && 793 | length(pooled_min$influence_curves) >= index) { 794 | var(pooled_max$influence_curves[[index]] - pooled_min$influence_curves[[index]]) 795 | } else { 796 | NA 797 | } 798 | }) 799 | 800 | # Parameter: relative risk 801 | # TODO: only calculate if Y is binary. 
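# Delta method: for log(RR) = log(EY1) - log(EY0), the influence curve is
# IC_max / theta_max - IC_min / theta_min, so the per-fold variance of that
# difference (computed below) estimates the variance of log(RR).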
802 | var_results$varICV_log_rr = sapply(1:V, function(index) { 803 | if (length(pooled_max$influence_curves) >= index && 804 | length(pooled_min$influence_curves) >= index) { 805 | # Variance for the risk difference (maximal contrast parameter). 806 | # TODO: double-check this. 807 | var(pooled_max$influence_curves[[index]] / pooled_max$thetas[[index]] - 808 | pooled_min$influence_curves[[index]] / pooled_min$thetas[[index]]) 809 | } else { 810 | NA 811 | } 812 | }) 813 | 814 | if (verbose) { 815 | signif_digits = 4 816 | 817 | ey0_mean = mean(pooled_min$thetas) 818 | if (is.numeric(ey0_mean)) { 819 | cat("[Min] EY0:", signif(ey0_mean, signif_digits)) 820 | if (is.numeric(pooled_min$epsilon)) { 821 | cat(" Epsilon:", signif(pooled_min$epsilon, signif_digits)) 822 | } 823 | cat("\n") 824 | } 825 | 826 | ey1_mean = mean(pooled_max$thetas) 827 | if (is.numeric(ey1_mean)) { 828 | cat("[Max] EY1:", signif(ey1_mean, signif_digits)) 829 | if (is.numeric(pooled_max$epsilon)) { 830 | cat(" Epsilon:", signif(pooled_max$epsilon, signif_digits)) 831 | } 832 | cat("\n") 833 | } 834 | 835 | cat("ATEs:", signif(var_results$thetaV, signif_digits), "\n") 836 | cat("Variances:", signif(var_results$varICV, signif_digits), "\n") 837 | cat("Labels:\n") 838 | print(labels) 839 | cat("\n") 840 | cat("RRs:", signif(var_results$thetaV_rr, signif_digits), "\n") 841 | cat("Variances:", signif(var_results$varICV_log_rr, signif_digits), "\n") 842 | cat("\n") 843 | } 844 | } 845 | 846 | # Return results for this factor variable. 847 | var_results 848 | #} # End foreach loop over all variables. 849 | }) # End lapply or future_lapply if we're not using foreach. 850 | 851 | if (verbose) cat("Factor VIMs:", length(vim_factor), "\n\n") 852 | 853 | # Confirm that we have the correct number of results, otherwise fail out. 854 | stopifnot(length(vim_factor) == xc) 855 | 856 | 857 | # Dataframe to hold all of the variable-by-fold-by-level results. 858 | results_by_fold_and_level_obj = do.call(rbind, lapply(vim_factor, function(result) { 859 | # Only extract results_by_fold_and_level if it's not NULL 860 | if ("results_by_fold_and_level" %in% names(result) && 861 | !is.null(result$results_by_fold_and_level)# && 862 | #!is.na(result$results_by_fold_and_level) 863 | ) { 864 | result$results_by_fold_and_level 865 | } else { 866 | NULL 867 | } 868 | })) 869 | #"results_by_fold_and_level")) 870 | 871 | #results_by_level = do.call(rbind, lapply(vim_factor, `[[`, "results_by_level")) 872 | compile_results_by_level = lapply(vim_factor, function(result) { 873 | # Only extract results_by_fold_and_level if it's not NULL 874 | if ("results_by_level" %in% names(result) && 875 | !is.null(result$results_by_level) && 876 | # TODO: figure out why expressions are going into this element. 877 | !is.expression(result$results_by_level) 878 | # !is.na(result$results_by_level) 879 | ) { 880 | result$results_by_level 881 | } else { 882 | NULL 883 | } 884 | }) 885 | 886 | results_by_level_obj = NULL 887 | tryCatch({ 888 | results_by_level_obj = do.call(rbind, compile_results_by_level) 889 | }, error = function(e) { 890 | 891 | # TODO: add browser? 892 | # TODO: figure out why this happens - presumably due to covariate that failed. 
893 | # Error message: 894 | # Error in rep(xi, length.out = nvar) : 895 | # attempt to replicate an object of type 'closure' 896 | cat("Errored while compiling results by level.\n") 897 | }) 898 | 899 | colnames_factor = colnames(factors$data.fac) 900 | } else { 901 | colnames_factor = NULL 902 | vim_factor = NULL 903 | results_by_fold_and_level_obj = NULL 904 | results_by_level_obj = NULL 905 | cat("No factor variables - skip VIM estimation.\n\n") 906 | } 907 | 908 | # Compile and return results. 909 | (results = list( 910 | vim_factor = vim_factor, 911 | results_by_fold_and_level = results_by_fold_and_level_obj, 912 | results_by_level = results_by_level_obj, 913 | colnames_factor = colnames_factor 914 | )) 915 | } 916 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # varimpact - variable importance through causal inference 5 | 6 | [![Build 7 | Status](https://travis-ci.org/ck37/varimpact.svg?branch=master)](https://travis-ci.org/ck37/varimpact) 8 | [![AppVeyor Build 9 | Status](https://ci.appveyor.com/api/projects/status/github/ck37/varimpact?branch=master&svg=true)](https://ci.appveyor.com/project/ck37/varimpact) 10 | [![codecov](https://codecov.io/gh/ck37/varimpact/branch/master/graph/badge.svg)](https://codecov.io/gh/ck37/varimpact) 11 | 12 | ## Summary 13 | 14 | varimpact uses causal inference statistics to generate variable 15 | importance estimates for a given dataset and outcome. It answers the 16 | question: which of my Xs are most related to my Y? Each variable’s 17 | influence on the outcome is estimated semiparametrically, without 18 | assuming a linear relationship or other functional form, and the 19 | covariate list is ranked by order of importance. This can be used for 20 | exploratory data analysis, for dimensionality reduction, for 21 | experimental design (e.g. to determine blocking and re-randomization), 22 | to reduce variance in an estimation procedure, etc. See Hubbard, 23 | Kennedy, & van der Laan (2018) for more details, or Hubbard & van der 24 | Laan (2016) for an earlier description. 25 | 26 | ## Details 27 | 28 | Each covariate is analyzed using targeted minimum loss-based estimation 29 | ([TMLE](https://CRAN.R-project.org/package=tmle)) as though it were a 30 | treatment, with all other variables serving as adjustment variables via 31 | [SuperLearner](https://github.com/ecpolley/SuperLearner). Then the 32 | statistical significance of the estimated treatment effect for each 33 | covariate determines the variable importance ranking. This formulation 34 | allows the asymptotics of TMLE to provide valid standard errors and 35 | p-values, unlike other variable importance algorithms. 36 | 37 | The results provide raw p-values as well as p-values adjusted for false 38 | discovery rate using the Benjamini-Hochberg (1995) procedure. Adjustment 39 | variables are automatically clustered hierarchically using HOPACH (van 40 | der Laan & Pollard 2003) in order to reduce dimensionality. The package 41 | supports multi-core and multi-node parallelization, which are detected 42 | and used automatically when a parallel backend is registered. Missing 43 | values are automatically imputed using K-nearest neighbors (Troyanskaya 44 | et al. 2001, Jerez et al. 2010) and missingness indicator variables are 45 | incorporated into the analysis. 
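As a quick illustration of the multiple-testing step described above, the
sketch below applies the Benjamini-Hochberg correction to a vector of raw
p-values with `p.adjust()`. This is only meant to show what the reported
`Adj. p-value` column corresponds to: `varimpact()` performs the adjustment
internally, and the `vim$results_all` object and its `P-value` column are
assumed to look as in the example output further below.

``` r
# Illustration only: varimpact() already reports BH-adjusted p-values.
# Assumes `vim` is a fitted varimpact object, as in the examples below.
raw_p <- vim$results_all[["P-value"]]
p.adjust(raw_p, method = "BH")  # Benjamini & Hochberg (1995) FDR adjustment
```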
46 | 47 | varimpact is under active development so please submit any bug reports 48 | or feature requests to the [issue 49 | queue](https://github.com/ck37/varimpact/issues), or email Alan and/or 50 | Chris directly. 51 | 52 | ## Installation 53 | 54 | ### GitHub 55 | 56 | ``` r 57 | # Install remotes if necessary: 58 | # install.packages("remotes") 59 | remotes::install_github("ck37/varimpact") 60 | ``` 61 | 62 | ### CRAN 63 | 64 | Forthcoming fall 2022 65 | 66 | ## Examples 67 | 68 | ### Example: basic functionality 69 | 70 | ``` r 71 | library(varimpact) 72 | 73 | #################################### 74 | # Create test dataset. 75 | set.seed(1, "L'Ecuyer-CMRG") 76 | N <- 200 77 | num_normal <- 4 78 | X <- as.data.frame(matrix(rnorm(N * num_normal), N, num_normal)) 79 | Y <- rbinom(N, 1, plogis(.2*X[, 1] + .1*X[, 2] - .2*X[, 3] + .1*X[, 3]*X[, 4] - .2*abs(X[, 4]))) 80 | # Add some missing data to X so we can test imputation. 81 | for (i in 1:10) X[sample(nrow(X), 1), sample(ncol(X), 1)] <- NA 82 | 83 | #################################### 84 | # Basic example 85 | vim <- varimpact(Y = Y, data = X) 86 | #> Finished pre-processing variables. 87 | #> 88 | #> Processing results: 89 | #> - Factor variables: 0 90 | #> - Numeric variables: 4 91 | #> 92 | #> No factor variables - skip VIM estimation. 93 | #> 94 | #> Estimating variable importance for 4 numerics. 95 | 96 | # Review consistent and significant results. 97 | vim 98 | #> Significant and consistent results: 99 | #> Type Estimate CI95 P-value Adj. p-value Est. RR 100 | #> V3 Ordered 0.4986136 (0.255 - 0.742) 2.935984e-05 0.0001174394 2.908162 101 | #> CI95 RR P-value RR Adj. p-value RR 102 | #> V3 (1.92 - 4.4) 2.214955e-07 8.859819e-07 103 | 104 | # Look at all results. 105 | vim$results_all 106 | #> Type Estimate CI95 P-value Adj. p-value Est. RR 107 | #> V3 Ordered 0.49861358 (0.255 - 0.742) 2.935984e-05 0.0001174394 2.9081617 108 | #> V4 Ordered 0.21853793 (-0.167 - 0.604) 1.334006e-01 0.2668012809 1.4110231 109 | #> V2 Ordered 0.04733746 (-0.276 - 0.371) 3.872064e-01 0.5162752138 1.0698709 110 | #> V1 Ordered -0.10939221 (-0.494 - 0.275) 7.116162e-01 0.7116161584 0.8266168 111 | #> CI95 RR P-value RR Adj. p-value RR Consistent 112 | #> V3 (1.92 - 4.4) 2.214955e-07 8.859819e-07 TRUE 113 | #> V4 (0.833 - 2.39) 1.000755e-01 2.001509e-01 TRUE 114 | #> V2 (0.664 - 1.72) 3.905406e-01 5.207208e-01 TRUE 115 | #> V1 (0.441 - 1.55) 7.238555e-01 7.238555e-01 TRUE 116 | 117 | # Plot the V2 impact. 118 | plot_var("V2", vim) 119 | ``` 120 | 121 | ![](images/README-example_1-1.png) 122 | 123 | ``` r 124 | 125 | # Generate latex tables with results. 126 | exportLatex(vim) 127 | 128 | # Clean up LaTeX files 129 | cleanup_latex_files() 130 | ``` 131 | 132 | ### Example: customize outcome and propensity score estimation 133 | 134 | ``` r 135 | Q_lib = c("SL.mean", "SL.glmnet", "SL.ranger", "SL.rpartPrune") 136 | g_lib = c("SL.mean", "SL.glmnet") 137 | set.seed(1, "L'Ecuyer-CMRG") 138 | (vim = varimpact(Y = Y, data = X, Q.library = Q_lib, g.library = g_lib)) 139 | #> Finished pre-processing variables. 140 | #> 141 | #> Processing results: 142 | #> - Factor variables: 0 143 | #> - Numeric variables: 4 144 | #> 145 | #> No factor variables - skip VIM estimation. 146 | #> 147 | #> Estimating variable importance for 4 numerics. 148 | #> Significant and consistent results: 149 | #> Type Estimate CI95 P-value Adj. p-value Est. RR 150 | #> V3 Ordered 0.56401 (0.326 - 0.802) 1.749015e-06 6.996059e-06 3.644982 151 | #> CI95 RR P-value RR Adj. 
p-value RR 152 | #> V3 (2.34 - 5.69) 6.234554e-09 2.493822e-08 153 | ``` 154 | 155 | ### Example: parallel via multicore 156 | 157 | ``` r 158 | library(future) 159 | plan("multisession") 160 | vim = varimpact(Y = Y, data = X) 161 | #> Finished pre-processing variables. 162 | #> 163 | #> Processing results: 164 | #> - Factor variables: 0 165 | #> - Numeric variables: 4 166 | #> 167 | #> No factor variables - skip VIM estimation. 168 | #> 169 | #> Estimating variable importance for 4 numerics. 170 | ``` 171 | 172 | ### Example: mlbench breast cancer 173 | 174 | ``` r 175 | data(BreastCancer, package = "mlbench") 176 | data = BreastCancer 177 | 178 | # Create a numeric outcome variable. 179 | data$Y = as.integer(data$Class == "malignant") 180 | 181 | # Use multicore parallelization to speed up processing. 182 | plan("multisession") 183 | (vim = varimpact(Y = data$Y, data = subset(data, select = -c(Y, Class, Id)))) 184 | #> Finished pre-processing variables. 185 | #> 186 | #> Processing results: 187 | #> - Factor variables: 9 188 | #> - Numeric variables: 0 189 | #> 190 | #> Estimating variable importance for 9 factors. 191 | #> Significant and consistent results: 192 | #> Type Estimate CI95 P-value Adj. p-value 193 | #> Bare.nuclei Factor 0.6174459 (0.5 - 0.735) 0.000000e+00 0.000000e+00 194 | #> Mitoses Factor 0.4092028 (0.333 - 0.486) 0.000000e+00 0.000000e+00 195 | #> Cl.thickness Factor 0.5245860 (0.382 - 0.667) 3.027578e-13 9.082735e-13 196 | #> Cell.size Factor 0.5650275 (0.395 - 0.735) 3.313050e-11 5.963490e-11 197 | #> Est. RR CI95 RR P-value RR Adj. p-value RR 198 | #> Bare.nuclei 3.682218 (2.21 - 6.14) 0.000000e+00 0.000000e+00 199 | #> Mitoses 2.093929 (1.85 - 2.37) 3.023193e-11 1.360437e-10 200 | #> Cl.thickness 2.952087 (2.13 - 4.08) 2.956850e-07 8.870549e-07 201 | #> Cell.size 3.445132 (1.98 - 6) 4.465977e-06 8.038759e-06 202 | plot_var("Mitoses", vim) 203 | ``` 204 | 205 | ![](images/README-example_5-1.png) 206 | 207 | ## Authors 208 | 209 | Alan E. Hubbard and Chris J. Kennedy, University of California, Berkeley 210 | 211 | ## References 212 | 213 | Benjamini, Y., & Hochberg, Y. (1995). Controlling the false discovery 214 | rate: a practical and powerful approach to multiple testing. Journal of 215 | the royal statistical society. Series B (Methodological), 289-300. 216 | 217 | Gruber, S., & van der Laan, M. J. (2012). tmle: An R Package for 218 | Targeted Maximum Likelihood Estimation. Journal of Statistical Software, 219 | 51(i13). 220 | 221 | Hubbard, A. E., Kennedy, C. J., van der Laan, M. J. (2018). 222 | Data-adaptive target parameters. In M. van der Laan & S. Rose (2018) 223 | Targeted Learning in Data Science. Springer. 224 | 225 | Hubbard, A. E., Kherad-Pajouh, S., & van der Laan, M. J. (2016). 226 | Statistical Inference for Data Adaptive Target Parameters. The 227 | international journal of biostatistics, 12(1), 3-19. 228 | 229 | Hubbard, A., Munoz, I. D., Decker, A., Holcomb, J. B., Schreiber, M. A., 230 | Bulger, E. M., … & Rahbar, M. H. (2013). Time-Dependent Prediction and 231 | Evaluation of Variable Importance Using SuperLearning in High 232 | Dimensional Clinical Data. The journal of trauma and acute care surgery, 233 | 75(1 0 1), S53. 234 | 235 | Hubbard, A. E., & van der Laan, M. J. (2016). Mining with inference: 236 | data-adaptive target parameters (pp. 439-452). In P. Bühlmann et 237 | al. (Ed.), Handbook of Big Data. CRC Press, Taylor & Francis Group, LLC: 238 | Boca Raton, FL. 239 | 240 | Jerez, J. M., Molina, I., García-Laencina, P. 
J., Alba, E., Ribelles, 241 | N., Martín, M., & Franco, L. (2010). Missing data imputation using 242 | statistical and machine learning methods in a real breast cancer 243 | problem. Artificial intelligence in medicine, 50(2), 105-115. 244 | 245 | Rozenholc, Y., Mildenberger, T., & Gather, U. (2010). Combining regular 246 | and irregular histograms by penalized likelihood. Computational 247 | Statistics & Data Analysis, 54(12), 3313-3323. 248 | 249 | Troyanskaya, O., Cantor, M., Sherlock, G., Brown, P., Hastie, T., 250 | Tibshirani, R., Botstein, D., & Altman, R. B. (2001). Missing value 251 | estimation methods for DNA microarrays. Bioinformatics, 17(6), 520-525. 252 | 253 | van der Laan, M. J. (2006). Statistical inference for variable 254 | importance. The International Journal of Biostatistics, 2(1). 255 | 256 | van der Laan, M. J., & Pollard, K. S. (2003). A new algorithm for hybrid 257 | hierarchical clustering with visualization and the bootstrap. Journal of 258 | Statistical Planning and Inference, 117(2), 275-303. 259 | 260 | van der Laan, M. J., Polley, E. C., & Hubbard, A. E. (2007). Super 261 | learner. Statistical applications in genetics and molecular biology, 262 | 6(1). 263 | 264 | van der Laan, M. J., & Rose, S. (2011). Targeted learning: causal 265 | inference for observational and experimental data. Springer Science & 266 | Business Media. 267 | -------------------------------------------------------------------------------- /TODO.txt: -------------------------------------------------------------------------------- 1 | VarImpact TODOs 2 | 3 | Alan priorities: 4 | - Display true bin ranges for continuous variables in results tables. 5 | - Unadjusted difference in means estimate (t-test) to compare to TMLE estimate 6 | - Cell sizes in the bins/levels that it chooses (A = 0 and A = 1) 7 | - Calculate RR treatment effects (see tmle, ltmle, Ben Arnold's package). 8 | * OR doesn't need to be in there, byproduct of parametric regression. 9 | - Test CV-TMLE with non-binary outcomes. E.g. transform theta? 10 | 11 | Mark priorities: 12 | - Support for 10-fold CV-TMLE - IN PROGRESS 13 | 14 | Chris items: 15 | * PRIORITY: for continuous outcomes, need to transform effect size and CIs back to the original scale (multiply by range of the original bounds.) 16 | * PRIORITY: Track and report why variables are removed from analysis 17 | - minCell, minYs, error during estimation, missing values, restrict_by_quantiles(), etc. 18 | * Simulation study 19 | * Investigate penalized histogramming of numeric variables 20 | - Needs more tests and probably its own function. 21 | * Give a warning if data is a matrix rather than dataframe. Causes an error in separate_factors_numerics. 22 | * Methods 23 | - Rename exportLatex to export_latex; support but deprecate old method name. 24 | - Data reduction: switch from HOPACH to a more reliable clustering algorithm. 25 | - Or upgrade HOPACH to support more than 10 (or 15?) variables. 26 | * Arguments 27 | - Argument for p-value cutoff for consistent results. 28 | * Coding style 29 | - Create more subfunctions. 30 | * Output 31 | - Add "Variable" column name to $results_by_fold. 32 | - Results by fold should be divided into two tables: 33 | - by_fold_estimates - psi estimates for each fold 34 | - by_fold_levels - high and low levels for each fold 35 | - Show a_L and a_H in consistent results table - critical for interpretation. 
36 | * Unfortunately this requires per-fold results for numeric variables, due to 37 | the "directionality" definition of consistency. 38 | - Sort consistent results by p-value then estimate size? 39 | - Support summary() 40 | - Return SEs in results tables. 41 | - Fix exportTable() when vim_result is NULL due to lack of results. 42 | * Quality 43 | - Warn/error if there are character columns in the data, as a bonus check. 44 | - Figure out how to reduce the number of potential failures in various steps. 45 | - Automatically handle rows that are all NAs? 46 | - Automatically remove any data column that is 100% equal to the outcome (so it can stay in df). 47 | * Prediction Accuracy 48 | - Save risk estimates for g and Q during CV-TMLE. - IN PROGRESS 49 | - Save and report range of g to examine sparsity. 50 | * Testing 51 | - Need tests with single factors and numerics. Lots of lingering bugs. 52 | - Add other tests. 53 | * Visualization 54 | - Visualize the treatment-specific means for each level of the treatment, with CIs 55 | - Label the high and low level 56 | - Layer on the raw density of the variable 57 | - Plot of BH correction ala figure 10.6 in All of Statistics. 58 | * Performance 59 | - Try Rprofvis 60 | - Investigate Jeremy's method of not wasting SL folds during CV-TMLE. 61 | * Documentation 62 | - Improve varImpact() roxygen description. 63 | - Document the return object 64 | - Vignette 65 | * Clarify consistency definition 66 | * Examples 67 | * Benchmarking 68 | - Benchmark to random forest, lasso, OLS and SL-perturbation algorithms. 69 | * BUGS/ERRORS 70 | - HOPACH can fail twice in a row. Need to figure out why and fix (see factor test 1). 71 | - HOPACH (Minh example): Error in rowMeans(dmat[labels == ordlabels[j], labels == labels[medoid2]]) : 72 | * seem to occur in hopach::mssnextlevel() 73 | * possibly due to call to msssplitcluster() 74 | * may need a ", drop = F" when subsetting a matrix somewhere. 75 | * TODO: fork HOPACH and add a dimension check / debug stuff near there. 76 | 'x' must be an array of at least two dimensions 77 | - Error in predmat[which, seq(nlami)] = preds : replacement has length zero 78 | - Occurs when estimating TMLE on training samples. 79 | - Error in lognet(x, is.sparse, ix, jx, y, weights, offset, alpha, nobs, : 80 | one multinomial or binomial class has 1 or 0 observations; not allowed 81 | -- Can happen with BreastCancer dataset, seems to depend on seed. 82 | -- Happens in parkinson's competition dataset. 83 | -- Seems to occur when a variable has value with a small number of obs. 84 | Then when estimating the treatment propensity score for a given fold 85 | there may be only a single value for all observations. 86 | - Error in y %*% rep(1, nc) : non-conformable arguments 87 | -- When estimating TMLE 88 | - "Missing value(s) in medoidsdata in collap()" 89 | * WARNINGS 90 | - Warning in collap(data, level, d, dmat, newmed) (conducting HOPACH, at least on factors): 91 | Not enough medoids to use newmed='medsil' in collap() - 92 | using newmed='nn' instead 93 | (CK: suppressing warnings currently to avoid this one.) 94 | 95 | From existing documentation: 96 | - Allow reporting of results that randomly do not have estimates for some of validation samples. 97 | 98 | Wishlist 99 | - Support clustering or variable reduction algorithms other than HOPACH. 100 | - Support more than 2 external (CV-TMLE) folds. 
101 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | # DO NOT CHANGE the "init" and "install" sections below 2 | 3 | # Download script file from GitHub 4 | init: 5 | ps: | 6 | $ErrorActionPreference = "Stop" 7 | Invoke-WebRequest http://raw.github.com/krlmlr/r-appveyor/master/scripts/appveyor-tool.ps1 -OutFile "..\appveyor-tool.ps1" 8 | Import-Module '..\appveyor-tool.ps1' 9 | install: 10 | ps: Bootstrap 11 | 12 | cache: 13 | - C:\RLibrary 14 | 15 | build_script: 16 | - git config --global user.name "travis" 17 | - git config --global user.email "travis@example.org" 18 | - travis-tool.sh install_bioc_deps 19 | - travis-tool.sh install_deps 20 | #- travis-tool.sh github_package jimhester/covr 21 | 22 | test_script: 23 | - travis-tool.sh run_tests 24 | 25 | on_failure: 26 | - 7z a failure.zip *.Rcheck\* 27 | - appveyor PushArtifact failure.zip 28 | 29 | #on_success: 30 | # - Rscript -e 'covr::codecov()' 31 | 32 | environment: 33 | global: 34 | CRAN: http://cran.rstudio.com 35 | WARNINGS_ARE_ERRORS: 1 36 | _R_CHECK_FORCE_SUGGESTS_: 0 37 | R_ARCH: x64 38 | USE_RTOOLS: true ## to be able to use Remotes (i.e. packages from non-CRAN sources) 39 | 40 | matrix: 41 | - R_VERSION: release 42 | 43 | artifacts: 44 | - path: '*.Rcheck\**\*.log' 45 | name: Logs 46 | 47 | - path: '*.Rcheck\**\*.out' 48 | name: Logs 49 | 50 | - path: '*.Rcheck\**\*.fail' 51 | name: Logs 52 | 53 | - path: '*.Rcheck\**\*.Rout' 54 | name: Logs 55 | 56 | - path: '\*_*.tar.gz' 57 | name: Bits 58 | 59 | - path: '\*_*.zip' 60 | name: Bits 61 | 62 | -------------------------------------------------------------------------------- /images/README-example_1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ck37/varimpact/b4b78751d944b59f01362f43c5838ffe9255e3b5/images/README-example_1-1.png -------------------------------------------------------------------------------- /images/README-example_5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ck37/varimpact/b4b78751d944b59f01362f43c5838ffe9255e3b5/images/README-example_5-1.png -------------------------------------------------------------------------------- /man/cleanup_latex_files.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cleanup-latex.R 3 | \name{cleanup_latex_files} 4 | \alias{cleanup_latex_files} 5 | \title{Clean up LaTeX files created by exportLatex} 6 | \usage{ 7 | cleanup_latex_files(dir = ".", outname = "", verbose = FALSE) 8 | } 9 | \arguments{ 10 | \item{dir}{Directory where LaTeX files are located (default: current directory)} 11 | 12 | \item{outname}{Prefix for the LaTeX files (default: empty string)} 13 | 14 | \item{verbose}{If TRUE, print messages about which files were removed} 15 | } 16 | \value{ 17 | Invisibly returns a logical vector indicating which files were successfully removed 18 | } 19 | \description{ 20 | This function removes LaTeX files that are typically created by the exportLatex() function. 21 | It's designed to be used after exportLatex() calls to clean up temporary files. 
22 | } 23 | \examples{ 24 | \dontrun{ 25 | # After calling exportLatex() 26 | exportLatex(vim) 27 | cleanup_latex_files() 28 | 29 | # With custom directory and prefix 30 | exportLatex(vim, outname = "myresults_", dir = "output/") 31 | cleanup_latex_files(dir = "output/", outname = "myresults_") 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /man/create_cv_folds.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/create_cv_folds.R 3 | \name{create_cv_folds} 4 | \alias{create_cv_folds} 5 | \title{Stratified CV to insure balance (by one grouping variable, Y)} 6 | \usage{ 7 | create_cv_folds(V, Y, verbose = F) 8 | } 9 | \arguments{ 10 | \item{V}{number of folds} 11 | 12 | \item{Y}{Outcome variable. If binary will be used for stratification.} 13 | 14 | \item{verbose}{If T will display extra output.} 15 | } 16 | \value{ 17 | Vector of fold assignments. 18 | } 19 | \description{ 20 | Stratified CV to insure balance (by one grouping variable, Y) 21 | } 22 | -------------------------------------------------------------------------------- /man/estimate_tmle.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/estimate_tmle.R 3 | \name{estimate_tmle} 4 | \alias{estimate_tmle} 5 | \title{Get TMLE estimate: E[Y | A = 1, W].} 6 | \usage{ 7 | estimate_tmle(Y, A, W, family, delta = NULL, Q.lib, g.lib, verbose = F) 8 | } 9 | \arguments{ 10 | \item{Y}{Outcome variable} 11 | 12 | \item{A}{Treatment indicator} 13 | 14 | \item{W}{Dataframe of adjustment covariates} 15 | 16 | \item{family}{Binomial or gaussian} 17 | 18 | \item{delta}{Indicator of missing outcome or treatment assignment. 1 - observed, 0 - missing.} 19 | 20 | \item{Q.lib}{SuperLearner library for estimating Q (potential outcome)} 21 | 22 | \item{g.lib}{SuperLearner library for estimating g (propensity score)} 23 | 24 | \item{verbose}{If true output extra information during execution.} 25 | } 26 | \description{ 27 | Get TMLE estimate: E[Y | A = 1, W]. 28 | } 29 | -------------------------------------------------------------------------------- /man/estimate_tmle2.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/estimate_tmle2.R 3 | \name{estimate_tmle2} 4 | \alias{estimate_tmle2} 5 | \title{Get TMLE estimate: E[Y | A = 1, W].} 6 | \usage{ 7 | estimate_tmle2( 8 | Y, 9 | A, 10 | W, 11 | family, 12 | delta = rep(1, length(Y)), 13 | Q.lib, 14 | g.lib, 15 | id = 1:length(Y), 16 | Qbounds = NULL, 17 | gbound = 0.025, 18 | alpha = 0.995, 19 | fluctuation = "logistic", 20 | V = 10, 21 | verbose = F 22 | ) 23 | } 24 | \arguments{ 25 | \item{Y}{Outcome variable} 26 | 27 | \item{A}{Treatment indicator} 28 | 29 | \item{W}{Dataframe of adjustment covariates} 30 | 31 | \item{family}{Outcome family - binomial or gaussian} 32 | 33 | \item{delta}{Indicator of missing outcome or treatment assignment. 
1 - observed, 0 - missing.} 34 | 35 | \item{Q.lib}{SuperLearner library for estimating Q (potential outcome)} 36 | 37 | \item{g.lib}{SuperLearner library for estimating g (propensity score)} 38 | 39 | \item{id}{Optional subject-level identifier.} 40 | 41 | \item{Qbounds}{Bounds on Q} 42 | 43 | \item{gbound}{Bounds on G} 44 | 45 | \item{alpha}{TBD, from TMLE package} 46 | 47 | \item{fluctuation}{Only logistic is currently supported.} 48 | 49 | \item{V}{Number of folds for SuperLearner} 50 | 51 | \item{verbose}{If true output extra information during execution.} 52 | } 53 | \description{ 54 | Get TMLE estimate: E[Y | A = 1, W]. 55 | } 56 | -------------------------------------------------------------------------------- /man/exportLatex.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/exportLatex.R 3 | \name{exportLatex} 4 | \alias{exportLatex} 5 | \title{Export varimpact results as Latex tables} 6 | \usage{ 7 | exportLatex(impact_results, outname = "", dir = ".", digits = 4, ...) 8 | } 9 | \arguments{ 10 | \item{impact_results}{Result object from previous varimpact() call.} 11 | 12 | \item{outname}{(Optional) String that is prepended to filenames.} 13 | 14 | \item{dir}{(Optional) Directory to save the results, defaults to current directory.} 15 | 16 | \item{digits}{Digits to round numbers, passed through to xtable.} 17 | 18 | \item{...}{Additional parameters passed to print.xtable().} 19 | } 20 | \description{ 21 | Outputs results from varimpact() into three Latex tables: consistent results, 22 | all results, and per-fold results. 23 | } 24 | \details{ 25 | Creates three Latex table files: 26 | \itemize{ 27 | \item varimpConsistent.tex - the ``consistent'' significant results, meaning those with consistent categories chosen as comparison groups among factors and consistent ordering for numeric variables. 28 | \item varimpAll.tex - the file with cross-validated average variable impacts ordered by statistical significance. 29 | \item varimpByV.tex - the comparison levels used within each validation sample. Either integer ordering of factors or short-hand for percentile cut-off (0-1 is the 10th percentile, 10+ is the 100th percentile) 30 | } 31 | } 32 | \seealso{ 33 | \code{\link[varimpact]{varimpact}} 34 | } 35 | -------------------------------------------------------------------------------- /man/factors_to_indicators.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/factors_to_indicators.R 3 | \name{factors_to_indicators} 4 | \alias{factors_to_indicators} 5 | \title{Convert a dataframe of factor into separate indicators.} 6 | \usage{ 7 | factors_to_indicators(factor_df, miss_name_prefix = "Imiss_", verbose = F) 8 | } 9 | \arguments{ 10 | \item{factor_df}{Dataframe consisting only of factor variables.} 11 | 12 | \item{miss_name_prefix}{Starting name for each missing indicator.} 13 | 14 | \item{verbose}{Set to T for detailed output.} 15 | } 16 | \value{ 17 | A list of results. 
18 | } 19 | \description{ 20 | More details 21 | } 22 | -------------------------------------------------------------------------------- /man/max_sqr.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/max_sqr.R 3 | \name{max_sqr} 4 | \alias{max_sqr} 5 | \title{Find the maximum of the squared values.} 6 | \usage{ 7 | max_sqr(x) 8 | } 9 | \arguments{ 10 | \item{x}{Numeric vector} 11 | } 12 | \value{ 13 | Maximum of the squared values, or -Inf if all elements are NA. 14 | } 15 | \description{ 16 | Find the maximum of the squared values. 17 | } 18 | -------------------------------------------------------------------------------- /man/plot_var.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot-var.R 3 | \name{plot_var} 4 | \alias{plot_var} 5 | \title{Plot the varimpact results for a given variable} 6 | \usage{ 7 | plot_var(var_name, vim, digits = 2L, verbose = FALSE) 8 | } 9 | \arguments{ 10 | \item{var_name}{String name of the variable} 11 | 12 | \item{vim}{Varimpact result object that contains the variable} 13 | 14 | \item{digits}{Number of digits for rounding purposes.} 15 | 16 | \item{verbose}{If true, display extra output.} 17 | } 18 | \description{ 19 | Displays the adjusted treatment-specific means and the impact estimate. 20 | } 21 | -------------------------------------------------------------------------------- /man/print.varimpact.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/print.varimpact.R 3 | \name{print.varimpact} 4 | \alias{print.varimpact} 5 | \title{Custom printing of the varimpact results.} 6 | \usage{ 7 | \method{print}{varimpact}(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{Results object from varimpact.} 11 | 12 | \item{...}{Further arguments passed to or from other methods.} 13 | } 14 | \description{ 15 | Shows the significant and consistent results by default. If there are no 16 | consistent results it shows all results. 17 | } 18 | -------------------------------------------------------------------------------- /man/quantiles_equivalent.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/quantiles_equivalent.R 3 | \name{quantiles_equivalent} 4 | \alias{quantiles_equivalent} 5 | \title{Checks if two quantiles have the same sample value for a given vector. 6 | This indicates that the vector has little variation.} 7 | \usage{ 8 | quantiles_equivalent(x, quantile_probs = c(0.1, 0.9)) 9 | } 10 | \arguments{ 11 | \item{x}{Data vector. If a factor it is converted to numeric using unclass().} 12 | 13 | \item{quantile_probs}{Vector with two probabilities that specify the quantiles.} 14 | } 15 | \value{ 16 | True if the two quantiles are equal, indicating a lack of variation in the sample data. 17 | } 18 | \description{ 19 | Checks if two quantiles have the same sample value for a given vector. 20 | This indicates that the vector has little variation. 
21 | } 22 | \seealso{ 23 | restrict_by_quantiles 24 | } 25 | -------------------------------------------------------------------------------- /man/reduce_dimensions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/reduce_dimensions.R 3 | \name{reduce_dimensions} 4 | \alias{reduce_dimensions} 5 | \title{Reduce variables in a dataframe to a target number of covariates.} 6 | \usage{ 7 | reduce_dimensions(data, newX = NULL, max_variables, verbose = FALSE) 8 | } 9 | \arguments{ 10 | \item{data}{Dataframe} 11 | 12 | \item{newX}{Optional second dataframe to receive the same reduction.} 13 | 14 | \item{max_variables}{Maximum we want to allow, after which dimension reduction 15 | will take place. Cannot be more than 15 due to HOPACH limitations. Set to NULL 16 | to disable any dimension reduction.} 17 | 18 | \item{verbose}{If true will output additional information during execution.} 19 | } 20 | \description{ 21 | Currently uses HOPACH hierarchical clustering but could be generalized. 22 | } 23 | -------------------------------------------------------------------------------- /man/restrict_by_quantiles.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/restrict_by_quantiles.R 3 | \name{restrict_by_quantiles} 4 | \alias{restrict_by_quantiles} 5 | \title{Remove columns from a dataframe if they do not have sufficient variation.} 6 | \usage{ 7 | restrict_by_quantiles(data, quantile_probs = c(0.1, 0.9), verbose = FALSE) 8 | } 9 | \arguments{ 10 | \item{data}{Dataframe or matrix} 11 | 12 | \item{quantile_probs}{The probabilities corresponding to the quantiles that will be compared.} 13 | 14 | \item{verbose}{If TRUE output additional details during execution.} 15 | } 16 | \value{ 17 | New dataframe with the restriction applied. 18 | } 19 | \description{ 20 | Remove columns from a dataframe if they do not have sufficient variation. 21 | } 22 | \seealso{ 23 | quantiles_equivalent 24 | } 25 | -------------------------------------------------------------------------------- /man/results_by_level.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/results-by-level.R 3 | \name{results_by_level} 4 | \alias{results_by_level} 5 | \title{Aggregate the results_by_fold_and_level df into a results_by_level df.} 6 | \usage{ 7 | results_by_level(results_by_fold_and_level, verbose = FALSE) 8 | } 9 | \arguments{ 10 | \item{results_by_fold_and_level}{Dataframe containing the VIM results for 11 | all levels of each variable across all CV folds.} 12 | 13 | \item{verbose}{If true, display extra output.} 14 | } 15 | \description{ 16 | Aggregate the results_by_fold_and_level df into a results_by_level df. 
17 | } 18 | -------------------------------------------------------------------------------- /man/separate_factors_numerics.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/separate_factors_numerics.R 3 | \name{separate_factors_numerics} 4 | \alias{separate_factors_numerics} 5 | \title{Split df into one of only factors and one of only numerics.} 6 | \usage{ 7 | separate_factors_numerics(data, strings_to_factors = TRUE, verbose = FALSE) 8 | } 9 | \arguments{ 10 | \item{data}{A dataframe or matrix.} 11 | 12 | \item{strings_to_factors}{Convert character strings to factors if TRUE, 13 | otherwise ignore character string variables.} 14 | 15 | \item{verbose}{If TRUE output additional details during execution.} 16 | } 17 | \value{ 18 | Results list. 19 | 20 | Character strings are converted to factors as well when strings_to_factors is TRUE. 21 | } 22 | \description{ 23 | Split df into one of only factors and one of only numerics. 24 | } 25 | -------------------------------------------------------------------------------- /man/sum_na.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sum_na.R 3 | \name{sum_na} 4 | \alias{sum_na} 5 | \title{Get missingness for each column} 6 | \usage{ 7 | sum_na(x) 8 | } 9 | \arguments{ 10 | \item{x}{Vector, matrix, or dataframe} 11 | } 12 | \description{ 13 | Function for getting the total number of missing values in a vector 14 | } 15 | -------------------------------------------------------------------------------- /man/tmle_estimate_q.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tmle_estimate_q.R 3 | \name{tmle_estimate_q} 4 | \alias{tmle_estimate_q} 5 | \title{Estimate Q = E(Y | Z, A, W) data-adaptively} 6 | \usage{ 7 | tmle_estimate_q( 8 | Y, 9 | Z = rep(0, length(Y)), 10 | A, 11 | W, 12 | Delta, 13 | Q = NULL, 14 | Qbounds, 15 | Qform = NULL, 16 | maptoYstar, 17 | SL.library, 18 | cvQinit = F, 19 | family, 20 | id = 1:length(Y), 21 | V = 10, 22 | verbose = F 23 | ) 24 | } 25 | \arguments{ 26 | \item{Y}{outcome} 27 | 28 | \item{Z}{intermediate variable between A and Y (default= 0 when no int. 
var.)} 29 | 30 | \item{A}{treatment indicator (1=treatment, 0=control)} 31 | 32 | \item{W}{baseline covariates} 33 | 34 | \item{Delta}{missingness indicator} 35 | 36 | \item{Q}{optional externally estimated values for Q} 37 | 38 | \item{Qbounds}{bounds for predicted values} 39 | 40 | \item{Qform}{optional regression formula to use for glm instead of SuperLearner} 41 | 42 | \item{maptoYstar}{if TRUE, use the logistic fluctuation for bounded, continuous outcomes} 43 | 44 | \item{SL.library}{library of prediction algorithms for SuperLearner} 45 | 46 | \item{cvQinit}{flag; if TRUE, cross-validate the initial SuperLearner fit.} 47 | 48 | \item{family}{regression family} 49 | 50 | \item{id}{subject identifier} 51 | 52 | \item{V}{number of folds for SuperLearner} 53 | 54 | \item{verbose}{Set to TRUE for extra output} 55 | } 56 | \description{ 57 | Estimates Q = E(Y | Z, A, W) data-adaptively using SuperLearner, unless 58 | SuperLearner is not available or the user specifies initial values or a 59 | regression formula. 60 | } 61 | -------------------------------------------------------------------------------- /man/varimpact.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/varimpact.R 3 | \encoding{utf8} 4 | \name{varimpact} 5 | \alias{varimpact} 6 | \title{Variable importance estimation using causal inference (targeted learning)} 7 | \usage{ 8 | varimpact( 9 | Y, 10 | data, 11 | A_names = colnames(data), 12 | V = 2L, 13 | Q.library = c("SL.glm", "SL.mean"), 14 | g.library = c("SL.glm", "SL.mean"), 15 | family = "binomial", 16 | minYs = 15L, 17 | minCell = 0L, 18 | adjust_cutoff = 10L, 19 | corthres = 0.8, 20 | impute = "median", 21 | miss.cut = 0.5, 22 | bins_numeric = 10L, 23 | quantile_probs_factor = c(0.1, 0.9), 24 | quantile_probs_numeric = quantile_probs_factor, 25 | verbose = FALSE, 26 | verbose_tmle = FALSE, 27 | verbose_reduction = FALSE, 28 | parallel = TRUE, 29 | digits = 4L 30 | ) 31 | } 32 | \arguments{ 33 | \item{Y}{outcome of interest (numeric vector)} 34 | 35 | \item{data}{data frame of predictor variables of interest for 36 | which the function returns VIMs (possibly a matrix?)} 37 | 38 | \item{A_names}{Names of the variables for which we want to estimate importance, 39 | a subset of the data argument.} 40 | 41 | \item{V}{Number of cross-validation folds.} 42 | 43 | \item{Q.library}{library used by SuperLearner for the model of outcome 44 | versus predictors} 45 | 46 | \item{g.library}{library used by SuperLearner for the model of the 47 | predictor variable of interest versus the other predictors} 48 | 49 | \item{family}{family ('binomial' or 'gaussian')} 50 | 51 | \item{minYs}{minimum # of obs with the event - if it is < minYs, skip the VIM} 52 | 53 | \item{minCell}{the cut-off for including a category of A in the analysis: the 54 | minimum count per cell in a 2x2 table of the indicator of that level 55 | versus the outcome, separately by training and validation sample.} 56 | 57 | \item{adjust_cutoff}{Maximum number of adjustment variables during TMLE. If 58 | more than this cutoff varimpact will attempt to reduce the dimensions to 59 | that number (using HOPACH hierarchical clustering). Must not be more than 60 | 15 due to HOPACH constraints. Set to NULL to disable any dimension 61 | reduction.} 62 | 63 | \item{corthres}{cut-off correlation with the explanatory 64 | variable for inclusion of an adjustment variable} 65 | 66 | \item{impute}{Type of missing value imputation to conduct. One of: "zero", 67 | "median" (default), or "knn". 
Note: knn results in the covariate data being centered/scaled.} 68 | 69 | \item{miss.cut}{eliminates explanatory (X) variables with a proportion 70 | of missing obs > miss.cut} 71 | 72 | \item{bins_numeric}{Number of bins when discretizing numeric variables.} 73 | 74 | \item{quantile_probs_factor}{Quantiles used to check if factors have 75 | sufficient variation.} 76 | 77 | \item{quantile_probs_numeric}{Quantiles used to check if numerics have 78 | sufficient variation.} 79 | 80 | \item{verbose}{Boolean - if TRUE the method will display more detailed 81 | output.} 82 | 83 | \item{verbose_tmle}{Boolean - if TRUE, will display even more detail on the TMLE 84 | estimation process.} 85 | 86 | \item{verbose_reduction}{Boolean - if TRUE, will display more detail during the 87 | variable reduction step (clustering).} 88 | 89 | \item{parallel}{Use parallel processing if a backend is registered; enabled 90 | by default.} 91 | 92 | \item{digits}{Number of digits to round the value labels.} 93 | } 94 | \value{ 95 | Results object. TODO: add more detail here. 96 | } 97 | \description{ 98 | \code{varimpact} returns variable importance statistics ordered 99 | by statistical significance, using a combination of data-adaptive target 100 | parameters and targeted maximum likelihood estimation (TMLE). 101 | } 102 | \details{ 103 | The function performs the following steps. 104 | \enumerate{ 105 | \item Drops variables missing more than miss.cut proportion of the time (tuneable). 106 | \item Separates covariates into factors and continuous (ordered) variables. 107 | \item Drops variables whose distribution is too uneven - e.g., nearly all one 108 | value - separately for factors and numeric variables (tuneable). (ADD MORE 109 | DETAIL HERE) 110 | \item Makes a dummy variable basis for factors, naming the dummies so that 111 | they are traceable to the original factor variables later. 112 | \item Makes a new ordered variable of integers mapped to intervals defined by 113 | deciles for the ordered numeric variables (automatically makes fewer 114 | categories if the original variable has < 10 values). 115 | \item Creates an associated list of the number of unique values, and the values 116 | themselves, for each variable, for use in the variable importance step. 117 | \item Makes a missing-data indicator basis for both factors and ordered variables. 118 | \item For each variable, after assigning it as A, uses an optimal histogram 119 | function to combine values using the distribution of A | Y=1, to avoid very 120 | small cell sizes in the distribution of Y vs. A (tuneable). (ADD DETAIL) 121 | \item Uses HOPACH* to cluster the associated confounder/missingness 122 | basis W down to the specified maximum number of adjustment variables. 123 | \item Finds the minimum and maximum estimates of E(Ya) with respect to a, after 124 | looping through all values of A* (as processed by the histogram). 125 | \item Returns an estimate of E(Ya(max)-Ya(min)) with its SE using CV-TMLE. 126 | } 127 | *HOPACH is "Hierarchical Ordered Partitioning and Collapsing Hybrid". 128 | } 129 | \section{Authors}{ 130 | 131 | Alan E. Hubbard and Chris J. Kennedy, University of California, Berkeley 132 | } 133 | 134 | \section{References}{ 135 | 136 | Benjamini, Y., & Hochberg, Y. (1995). \emph{Controlling the false discovery 137 | rate: a practical and powerful approach to multiple testing}. Journal of the 138 | royal statistical society. Series B (Methodological), 289-300. 139 | 140 | Gruber, S., & van der Laan, M. J. (2012). \emph{tmle: An R Package for 141 | Targeted Maximum Likelihood Estimation}. Journal of Statistical Software, 142 | 51(i13). 143 | 144 | Hubbard, A. 
E., Kherad-Pajouh, S., & van der Laan, M. J. (2016). 145 | \emph{Statistical Inference for Data Adaptive Target Parameters}. The 146 | international journal of biostatistics, 12(1), 3-19. 147 | 148 | Hubbard, A., Munoz, I. D., Decker, A., Holcomb, J. B., Schreiber, M. A., 149 | Bulger, E. M., ... & Rahbar, M. H. (2013). \emph{Time-Dependent Prediction 150 | and Evaluation of Variable Importance Using SuperLearning in High Dimensional 151 | Clinical Data}. The journal of trauma and acute care surgery, 75(1 0 1), S53. 152 | 153 | Hubbard, A. E., & van der Laan, M. J. (2016). \emph{Mining with inference: 154 | data-adaptive target parameters (pp. 439-452)}. In P. Buhlmann et al. (Ed.), 155 | \emph{Handbook of Big Data}. CRC Press, Taylor & Francis Group, LLC: Boca 156 | Raton, FL. 157 | 158 | van der Laan, M. J. (2006). \emph{Statistical inference for variable 159 | importance}. The International Journal of Biostatistics, 2(1). 160 | 161 | van der Laan, M. J., & Pollard, K. S. (2003). \emph{A new algorithm for 162 | hybrid hierarchical clustering with visualization and the bootstrap}. Journal 163 | of Statistical Planning and Inference, 117(2), 275-303. 164 | 165 | van der Laan, M. J., Polley, E. C., & Hubbard, A. E. (2007). \emph{Super 166 | learner}. Statistical applications in genetics and molecular biology, 6(1). 167 | 168 | van der Laan, M. J., & Rose, S. (2011). \emph{Targeted learning: causal 169 | inference for observational and experimental data}. Springer Science & 170 | Business Media. 171 | } 172 | 173 | \examples{ 174 | #################################### 175 | # Create test dataset. 176 | set.seed(1) 177 | N <- 100 178 | num_normal <- 5 179 | X <- as.data.frame(matrix(rnorm(N * num_normal), N, num_normal)) 180 | Y <- rbinom(N, 1, plogis(.2*X[, 1] + .1*X[, 2] - .2*X[, 3] + .1*X[, 3]*X[, 4] - .2*abs(X[, 4]))) 181 | # Add some missing data to X so we can test imputation. 182 | for (i in 1:10) X[sample(nrow(X), 1), sample(ncol(X), 1)] <- NA 183 | 184 | #################################### 185 | # Basic example 186 | 187 | vim <- varimpact(Y = Y, data = X[, 1:3]) 188 | vim 189 | vim$results_all 190 | exportLatex(vim) 191 | cleanup_latex_files() 192 | 193 | # Impute by knn rather than the default median. 194 | \dontrun{ 195 | vim <- varimpact(Y = Y, data = X[, 1:3], impute = "knn") 196 | } 197 | 198 | #################################### 199 | # Multicore parallel example. 200 | \dontrun{ 201 | # Setup multicore parallelization. 202 | library(future) 203 | plan("multisession", workers = 2) 204 | 205 | vim <- varimpact(Y = Y, data = X[, 1:3]) 206 | } 207 | 208 | #################################### 209 | # Cluster parallel example. 210 | \dontrun{ 211 | cl = parallel::makeCluster(2L) 212 | plan(cluster, workers = cl) 213 | vim <- varimpact(Y = Y, data = X[, 1:3]) 214 | parallel::stopCluster(cl) 215 | } 216 | 217 | #################################### 218 | # mlbench BreastCancer example. 219 | \dontrun{ 220 | data(BreastCancer, package="mlbench") 221 | data <- BreastCancer 222 | 223 | set.seed(1, "L'Ecuyer-CMRG") 224 | data <- data[sample(nrow(data), 100), ] # Reduce to a dataset of 100 observations to speed up testing. 225 | # Create a numeric outcome variable. 226 | data$Y <- as.numeric(data$Class == "malignant") 227 | # Use multicore parallelization to speed up processing. 
228 | future::plan("multisession", workers = 2) 229 | vim <- varimpact(Y = data$Y, data = subset(data, select=-c(Y, Class, Id))) 230 | } 231 | 232 | } 233 | \seealso{ 234 | \code{\link[varimpact]{exportLatex}}, \code{\link[varimpact]{print.varimpact}} 235 | } 236 | -------------------------------------------------------------------------------- /readme.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r init_knitr, echo = FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>", 11 | fig.path = "images/README-", 12 | fig.width = 5, 13 | fig.height = 3.5, 14 | message = FALSE 15 | ) 16 | 17 | # Hide warnings when knitting. 18 | options(warn = -1) 19 | ``` 20 | 21 | # varimpact - variable importance through causal inference 22 | 23 | [![Build Status](https://travis-ci.org/ck37/varimpact.svg?branch=master)](https://travis-ci.org/ck37/varimpact) 24 | [![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/ck37/varimpact?branch=master&svg=true)](https://ci.appveyor.com/project/ck37/varimpact) 25 | [![codecov](https://codecov.io/gh/ck37/varimpact/branch/master/graph/badge.svg)](https://codecov.io/gh/ck37/varimpact) 26 | 27 | ## Summary 28 | 29 | varimpact uses causal inference statistics to generate variable importance estimates for a given dataset and outcome. It answers the question: which of my Xs are most related to my Y? Each variable's influence on the outcome is estimated semiparametrically, without assuming a linear relationship or other functional form, and the covariate list is ranked by order of importance. This can be used for exploratory data analysis, for dimensionality reduction, for experimental design (e.g. to determine blocking and re-randomization), to reduce variance in an estimation procedure, etc. See Hubbard, Kennedy, & van der Laan (2018) for more details, or Hubbard & van der Laan (2016) for an earlier description. 30 | 31 | ## Details 32 | 33 | Each covariate is analyzed using targeted minimum loss-based estimation ([TMLE](https://CRAN.R-project.org/package=tmle)) as though it were a treatment, with all other variables serving as adjustment variables via [SuperLearner](https://github.com/ecpolley/SuperLearner). Then the statistical significance of the estimated treatment effect for each covariate determines the variable importance ranking. This formulation allows the asymptotics of TMLE to provide valid standard errors and p-values, unlike other variable importance algorithms. 34 | 35 | The results provide raw p-values as well as p-values adjusted for false discovery rate using the Benjamini-Hochberg (1995) procedure. Adjustment variables are automatically clustered hierarchically using HOPACH (van der Laan & Pollard 2003) in order to reduce dimensionality. The package supports multi-core and multi-node parallelization, which are detected and used automatically when a parallel backend is registered. Missing values are automatically imputed using K-nearest neighbors (Troyanskaya et al. 2001, Jerez et al. 2010) and missingness indicator variables are incorporated into the analysis. 36 | 37 | varimpact is under active development so please submit any bug reports or feature requests to the [issue queue](https://github.com/ck37/varimpact/issues), or email Alan and/or Chris directly. 
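As a rough illustration of the false discovery rate adjustment mentioned above, the sketch below applies the Benjamini-Hochberg procedure to a handful of hypothetical raw p-values using base R's `p.adjust()`; the values are made up and simply stand in for the per-variable p-values that varimpact reports.

```{r fdr_sketch, eval = FALSE}
# Hypothetical raw p-values, one per covariate (stand-ins for varimpact's output).
raw_p <- c(0.001, 0.02, 0.04, 0.30, 0.77)

# Benjamini-Hochberg (1995) adjustment, as implemented in base R.
p.adjust(raw_p, method = "BH")
```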
38 | 39 | ## Installation 40 | 41 | ### GitHub 42 | 43 | ```{r install, eval=FALSE} 44 | # Install remotes if necessary: 45 | # install.packages("remotes") 46 | remotes::install_github("ck37/varimpact") 47 | ``` 48 | 49 | ### CRAN 50 | 51 | Forthcoming fall 2022 52 | 53 | ## Examples 54 | 55 | ### Example: basic functionality 56 | 57 | ```{r example_1, fig.width = 6.5, fig.height = 3.5} 58 | library(varimpact) 59 | 60 | #################################### 61 | # Create test dataset. 62 | set.seed(1, "L'Ecuyer-CMRG") 63 | N <- 200 64 | num_normal <- 4 65 | X <- as.data.frame(matrix(rnorm(N * num_normal), N, num_normal)) 66 | Y <- rbinom(N, 1, plogis(.2*X[, 1] + .1*X[, 2] - .2*X[, 3] + .1*X[, 3]*X[, 4] - .2*abs(X[, 4]))) 67 | # Add some missing data to X so we can test imputation. 68 | for (i in 1:10) X[sample(nrow(X), 1), sample(ncol(X), 1)] <- NA 69 | 70 | #################################### 71 | # Basic example 72 | vim <- varimpact(Y = Y, data = X) 73 | 74 | # Review consistent and significant results. 75 | vim 76 | 77 | # Look at all results. 78 | vim$results_all 79 | 80 | # Plot the V2 impact. 81 | plot_var("V2", vim) 82 | 83 | # Generate latex tables with results. 84 | exportLatex(vim) 85 | 86 | # Clean up LaTeX files 87 | cleanup_latex_files() 88 | ``` 89 | 90 | ### Example: customize outcome and propensity score estimation 91 | 92 | ```{r example_2} 93 | Q_lib = c("SL.mean", "SL.glmnet", "SL.ranger", "SL.rpartPrune") 94 | g_lib = c("SL.mean", "SL.glmnet") 95 | set.seed(1, "L'Ecuyer-CMRG") 96 | (vim = varimpact(Y = Y, data = X, Q.library = Q_lib, g.library = g_lib)) 97 | ``` 98 | 99 | ### Example: parallel via multicore 100 | 101 | ```{r example_3} 102 | library(future) 103 | plan("multisession") 104 | vim = varimpact(Y = Y, data = X) 105 | ``` 106 | 107 | ### Example: mlbench breast cancer 108 | 109 | ```{r example_5, fig.width = 6, fig.height = 3.5} 110 | data(BreastCancer, package = "mlbench") 111 | data = BreastCancer 112 | 113 | # Create a numeric outcome variable. 114 | data$Y = as.integer(data$Class == "malignant") 115 | 116 | # Use multicore parallelization to speed up processing. 117 | plan("multisession") 118 | (vim = varimpact(Y = data$Y, data = subset(data, select = -c(Y, Class, Id)))) 119 | plot_var("Mitoses", vim) 120 | ``` 121 | 122 | ## Authors 123 | 124 | Alan E. Hubbard and Chris J. Kennedy, University of California, Berkeley 125 | 126 | ## References 127 | 128 | Benjamini, Y., & Hochberg, Y. (1995). Controlling the false discovery rate: a practical and powerful approach to multiple testing. Journal of the royal statistical society. Series B (Methodological), 289-300. 129 | 130 | Gruber, S., & van der Laan, M. J. (2012). tmle: An R Package for Targeted Maximum Likelihood Estimation. Journal of Statistical Software, 51(i13). 131 | 132 | Hubbard, A. E., Kennedy, C. J., van der Laan, M. J. (2018). Data-adaptive target parameters. In M. van der Laan & S. Rose (2018) Targeted Learning in Data Science. Springer. 133 | 134 | Hubbard, A. E., Kherad-Pajouh, S., & van der Laan, M. J. (2016). Statistical Inference for Data Adaptive Target Parameters. The international journal of biostatistics, 12(1), 3-19. 135 | 136 | Hubbard, A., Munoz, I. D., Decker, A., Holcomb, J. B., Schreiber, M. A., Bulger, E. M., ... & Rahbar, M. H. (2013). Time-Dependent Prediction and Evaluation of Variable Importance Using SuperLearning in High Dimensional Clinical Data. The journal of trauma and acute care surgery, 75(1 0 1), S53. 137 | 138 | Hubbard, A. E., & van der Laan, M. J. (2016). 
Mining with inference: data-adaptive target parameters (pp. 439-452). In P. Bühlmann et al. (Ed.), Handbook of Big Data. CRC Press, Taylor & Francis Group, LLC: Boca Raton, FL. 139 | 140 | Jerez, J. M., Molina, I., García-Laencina, P. J., Alba, E., Ribelles, N., Martín, M., & Franco, L. (2010). Missing data imputation using statistical and machine learning methods in a real breast cancer problem. Artificial intelligence in medicine, 50(2), 105-115. 141 | 142 | Rozenholc, Y., Mildenberger, T., & Gather, U. (2010). Combining regular and irregular histograms by penalized likelihood. Computational Statistics & Data Analysis, 54(12), 3313-3323. 143 | 144 | Troyanskaya, O., Cantor, M., Sherlock, G., Brown, P., Hastie, T., Tibshirani, R., Botstein, D., & Altman, R. B. (2001). Missing value estimation methods for DNA microarrays. Bioinformatics, 17(6), 520-525. 145 | 146 | van der Laan, M. J. (2006). Statistical inference for variable importance. The International Journal of Biostatistics, 2(1). 147 | 148 | van der Laan, M. J., & Pollard, K. S. (2003). A new algorithm for hybrid hierarchical clustering with visualization and the bootstrap. Journal of Statistical Planning and Inference, 117(2), 275-303. 149 | 150 | van der Laan, M. J., Polley, E. C., & Hubbard, A. E. (2007). Super learner. Statistical applications in genetics and molecular biology, 6(1). 151 | 152 | van der Laan, M. J., & Rose, S. (2011). Targeted learning: causal inference for observational and experimental data. Springer Science & Business Media. 153 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(varimpact) 2 | 3 | # Only run tests if testthat package is installed. 4 | # This is in compliance with "Writing R Extensions" §1.1.3.1. 5 | if (requireNamespace("testthat", quietly = TRUE)) { 6 | testthat::test_check("varimpact", reporter = "check") 7 | } 8 | -------------------------------------------------------------------------------- /tests/testthat/test-estimate_tmle2.R: -------------------------------------------------------------------------------- 1 | library(varimpact) 2 | library(SuperLearner) 3 | library(tmle) 4 | library(testthat) 5 | 6 | # Create test dataset. 7 | context("Dataset A: continuous variables") 8 | 9 | # Set multicore-compatible seed. 
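# The L'Ecuyer-CMRG generator provides parallel-safe random number streams, so results remain reproducible if a parallel future plan is registered later.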
10 | set.seed(1, "L'Ecuyer-CMRG") 11 | 12 | # Simulation code from PH 295, Fall 2016 -- 13 | # https://github.com/wilsoncai1992/PH295-lab/blob/master/lab3/Lab3_Lecture.Rmd 14 | 15 | # structural equation for W_1 16 | # takes as input a vector U_W1 and returns a vector evaluating 17 | # f_{W,1}(U_W1) 18 | f_W1 <- function(U_W1){ 19 | return(U_W1) 20 | } 21 | 22 | # structural equation for W_2 23 | # takes as input a vector U_W2 and returns a vector evaluating 24 | # f_{W,2}(U_W2) 25 | f_W2 <- function(U_W2){ 26 | return(U_W2) 27 | } 28 | 29 | # structural equation for A 30 | f_A <- function(W_1, W_2, U_A){ 31 | return(as.numeric(plogis(W_1 - W_2 + U_A) > 0.5)) 32 | } 33 | 34 | # structural equation for Y 35 | f_Y <- function(W_1, W_2, A, U_Y){ 36 | return(-W_1 + W_2 + A - U_Y) 37 | } 38 | 39 | # function to draw n observations from an scm 40 | # n = the number of observations to draw 41 | # returns a data.frame with named columns 42 | simObsSCM <- function(n){ 43 | ## first we draw the errors 44 | # draw U_{W,1} 45 | U_W1 <- rbinom(n,1,0.5) 46 | # draw U_{W,2} 47 | U_W2 <- rbinom(n,1,0.5) 48 | # draw U_A 49 | U_A <- rnorm(n,0,1) 50 | # draw U_Y 51 | U_Y <- rnorm(n,0,1) 52 | 53 | ## now we can evaluate the observations sequentially 54 | # evaluate W_1 55 | W_1 <- f_W1(U_W1) 56 | #evaluate W_2 57 | W_2 <- f_W2(U_W2) 58 | # evaluate A 59 | A <- f_A(W_1 = W_1, W_2 = W_2, U_A = U_A) 60 | # evaluate Y 61 | Y <- f_Y(W_1 = W_1, W_2 = W_2, A = A, U_Y = U_Y) 62 | 63 | ## return a data.frame object 64 | out <- data.frame(W_1 = W_1, W_2 = W_2, A = A, Y = Y) 65 | return(out) 66 | } 67 | 68 | data = simObsSCM(100) 69 | summary(data) 70 | 71 | sl_lib = c("SL.glmnet", "SL.glm", "SL.mean") 72 | 73 | # Estimate g and Q 74 | result = varimpact::estimate_tmle2(Y = data$Y, A = data$A, 75 | W = data[, c("W_1", "W_2")], family = "gaussian", 76 | Q.lib = sl_lib, 77 | g.lib = sl_lib, 78 | verbose = T) 79 | -------------------------------------------------------------------------------- /tests/testthat/test-exportLatex.R: -------------------------------------------------------------------------------- 1 | # Test exportLatex functionality 2 | library(testthat) 3 | library(varimpact) 4 | 5 | context("exportLatex() function") 6 | 7 | # Skip exportLatex tests during R CMD check to avoid creating LaTeX files 8 | skip_if(identical(Sys.getenv("_R_CHECK_PACKAGE_NAME_"), "varimpact"), 9 | "Skipping exportLatex tests during R CMD check") 10 | 11 | test_that("exportLatex creates and cleans up LaTeX files", { 12 | # Create test dataset 13 | set.seed(1, "L'Ecuyer-CMRG") 14 | N = 100 15 | X = data.frame( 16 | x1 = rnorm(N), 17 | x2 = rnorm(N), 18 | x3 = rnorm(N) 19 | ) 20 | Y = rbinom(N, 1, plogis(0.2 * X$x1 + 0.1 * X$x2 - 0.2 * X$x3)) 21 | 22 | # Run varimpact with minimal settings for speed 23 | future::plan("sequential") 24 | vim = varimpact(Y = Y, data = X, V = 2L, verbose = FALSE, 25 | Q.library = c("SL.mean", "SL.glm"), 26 | g.library = c("SL.mean", "SL.glm"), 27 | bins_numeric = 3L) 28 | 29 | # Skip test if no results were generated (due to sample size constraints) 30 | skip_if(is.null(vim$results_all), "No varimpact results generated") 31 | 32 | # Define cleanup function to ensure files are always removed 33 | tex_files = c("varimpByFold.tex", "varimpAll.tex", "varimpConsistent.tex") 34 | cleanup_files = function() { 35 | cleanup_latex_files(verbose = FALSE) 36 | } 37 | 38 | # Ensure cleanup happens even if test fails 39 | on.exit(cleanup_files()) 40 | 41 | # Test 1: exportLatex should create files 42 | exportLatex(vim) 
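# By default, exportLatex() writes its .tex files to the current working directory, which is why the check below looks for bare filenames.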
43 | 44 | existing_files = tex_files[file.exists(tex_files)] 45 | 46 | expect_true(length(existing_files) > 0, 47 | info = "exportLatex should create at least some LaTeX files") 48 | expect_true("varimpByFold.tex" %in% existing_files, 49 | info = "varimpByFold.tex should be created") 50 | expect_true("varimpAll.tex" %in% existing_files, 51 | info = "varimpAll.tex should be created") 52 | 53 | # Test 2: Manual cleanup should work 54 | cleanup_files() 55 | 56 | remaining_files = tex_files[file.exists(tex_files)] 57 | expect_equal(length(remaining_files), 0, 58 | info = "Manual cleanup should remove all LaTeX files") 59 | 60 | # Test 3: Manual cleanup after exportLatex should work 61 | exportLatex(vim) 62 | cleanup_files() 63 | 64 | remaining_files_after_cleanup = tex_files[file.exists(tex_files)] 65 | expect_equal(length(remaining_files_after_cleanup), 0, 66 | info = "Manual cleanup after exportLatex should remove LaTeX files") 67 | }) 68 | 69 | test_that("exportLatex handles NULL results gracefully", { 70 | # Create a mock varimpact object with NULL results 71 | mock_vim = list( 72 | results_by_fold = NULL, 73 | results_all = NULL, 74 | results_consistent = data.frame() 75 | ) 76 | 77 | # Should return NULL and give a warning 78 | expect_warning( 79 | result <- exportLatex(mock_vim), 80 | "Cannot export LaTeX: varimpact results are NULL or incomplete" 81 | ) 82 | expect_null(result) 83 | 84 | # Should not create any files 85 | tex_files = c("varimpByFold.tex", "varimpAll.tex", "varimpConsistent.tex") 86 | existing_files = tex_files[file.exists(tex_files)] 87 | expect_equal(length(existing_files), 0, 88 | info = "exportLatex with NULL results should not create files") 89 | }) 90 | 91 | test_that("exportLatex with custom outname and directory", { 92 | # Create test dataset 93 | set.seed(2, "L'Ecuyer-CMRG") 94 | N = 100 95 | X = data.frame(x1 = rnorm(N), x2 = rnorm(N)) 96 | Y = rbinom(N, 1, plogis(0.3 * X$x1)) 97 | 98 | # Run varimpact 99 | future::plan("sequential") 100 | vim = varimpact(Y = Y, data = X, V = 2L, verbose = FALSE, 101 | Q.library = "SL.mean", g.library = "SL.mean", 102 | bins_numeric = 3L) 103 | 104 | # Skip test if no results were generated 105 | skip_if(is.null(vim$results_all), "No varimpact results generated") 106 | 107 | # Create temporary directory 108 | temp_dir = tempdir() 109 | custom_prefix = "test_" 110 | 111 | # Check for files with custom names in custom directory 112 | expected_files = c( 113 | file.path(temp_dir, paste0(custom_prefix, "varimpByFold.tex")), 114 | file.path(temp_dir, paste0(custom_prefix, "varimpAll.tex")), 115 | file.path(temp_dir, paste0(custom_prefix, "varimpConsistent.tex")) 116 | ) 117 | 118 | # Ensure cleanup happens even if test fails 119 | on.exit({ 120 | cleanup_latex_files(dir = temp_dir, outname = custom_prefix, verbose = FALSE) 121 | }) 122 | 123 | # Test with custom outname and directory 124 | exportLatex(vim, outname = custom_prefix, dir = temp_dir) 125 | 126 | existing_custom_files = expected_files[file.exists(expected_files)] 127 | expect_true(length(existing_custom_files) > 0, 128 | info = "Custom named files should be created in custom directory") 129 | 130 | # Test manual cleanup with custom names 131 | # Files should be cleaned up by on.exit() handler 132 | }) -------------------------------------------------------------------------------- /tests/testthat/test-factorsToIndicators.R: -------------------------------------------------------------------------------- 1 | library(varimpact) 2 | library(testthat) 3 | 4 | 
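# factors_to_indicators() expands each factor into indicator (dummy) columns and also returns missing-data indicator columns; these tests check both pieces.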
context("factorsToIndicators") 5 | 6 | # Create test dataset. 7 | 8 | set.seed(1, "L'Ecuyer-CMRG") 9 | N = 200 10 | 11 | num_normal = 7 12 | X = as.data.frame(matrix(rnorm(N * num_normal), N, num_normal)) 13 | 14 | # Add some missing data to X. 15 | miss_num = 10 16 | for (i in 1:miss_num) X[sample(nrow(X), 1), sample(ncol(X), 1)] = NA 17 | 18 | X_fac = data.frame(lapply(1:ncol(X), FUN=function(col_i) as.factor(floor(abs(pmin(pmax(X[, col_i], -1), 1)*3))))) 19 | dim(X_fac) 20 | colnames(X_fac) = paste0("fac_", 1:ncol(X_fac)) 21 | colnames(X_fac) 22 | summary(X_fac) 23 | 24 | ######################### 25 | 26 | table(X_fac[, 1], useNA="ifany") 27 | 28 | # Test a single factor. 29 | # Use column 3 which has 3 missing values based on the seed 30 | results = factors_to_indicators(X_fac[, 3, drop = F], verbose = T) 31 | dim(results$data) 32 | # We should have indicators for 1, 2, 3. 33 | colnames(results$data) 34 | 35 | dim(results$missing_indicators) 36 | colnames(results$missing_indicators) 37 | 38 | # We should have 1 missing data indicator, with 3 0s and the rest 1s. 39 | table(results$missing_indicators[, 1]) 40 | expect_equal(min(table(results$missing_indicators[, 1])), 3) 41 | 42 | expect_gt(ncol(results$missing_indicators), 0) 43 | 44 | # Test multiple factors. 45 | results = factors_to_indicators(X_fac[, 1:2, drop = F], verbose = T) 46 | 47 | dim(results$data) 48 | colnames(results$data) 49 | 50 | dim(results$missing_indicators) 51 | colnames(results$missing_indicators) 52 | 53 | table(results$missing_indicators[, 1]) 54 | 55 | # We expect our missing indicator matrix to have more than 0 columns. 56 | expect_is(results$missing_indicators, "matrix") 57 | 58 | # Test the full factor dataframe. 59 | results = factors_to_indicators(X_fac, verbose = T) 60 | dim(results$data) 61 | colnames(results$data) 62 | 63 | dim(results$missing_indicators) 64 | colnames(results$missing_indicators) 65 | -------------------------------------------------------------------------------- /tests/testthat/test-reduce_dimensions.R: -------------------------------------------------------------------------------- 1 | # TODO: need to test reduce_dimensions function. 2 | -------------------------------------------------------------------------------- /tests/testthat/test-varimpact-breastcancer.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(varimpact) 3 | 4 | ################################# 5 | # mlbench BreastCancer dataset. 6 | 7 | context("BreastCancer dataset") 8 | 9 | data(BreastCancer, package = "mlbench") 10 | data = BreastCancer 11 | names(data) = tolower(names(data)) 12 | 13 | set.seed(3, "L'Ecuyer-CMRG") 14 | 15 | # Reduce to a dataset of 200 observations to speed up testing. 16 | data = data[sample(nrow(data), 200), ] 17 | 18 | # Create a numeric outcome variable. 19 | data$y = as.numeric(data$class == "malignant") 20 | table(data$y) 21 | 22 | x = subset(data, select = -c(y, class, id)) 23 | dim(x) 24 | 25 | # Only run in RStudio so that automated CRAN checks don't give errors. 26 | if (.Platform$GUI == "RStudio") { 27 | # Use multicore parallelization to speed up processing. 
28 | future::plan("multiprocess", workers = 2) 29 | } 30 | 31 | test_that("varimpact runs on BreastCancer dataset", { 32 | # Speed up the test with moderate optimization: 33 | # - Use only 2 variables to reduce computation time 34 | # - Use faster SuperLearner libraries 35 | # - Reduce bins and use minimal cross-validation 36 | vim = varimpact(Y = data$y, x[, 1:2], 37 | Q.library = c("SL.glm", "SL.mean"), 38 | g.library = c("SL.glm", "SL.mean"), 39 | bins_numeric = 2L, 40 | V = 2L, 41 | verbose = FALSE, 42 | verbose_tmle = FALSE) 43 | 44 | # Test that the function completes without error 45 | expect_is(vim, "varimpact") 46 | expect_is(vim$time, "proc_time") 47 | 48 | # Test that some VIMs were actually calculated 49 | # The test should produce meaningful results 50 | if (!is.null(vim$results_all)) { 51 | expect_gte(nrow(vim$results_all), 1) 52 | cat("Successfully calculated", nrow(vim$results_all), "VIMs\n") 53 | } else { 54 | cat("Warning: No VIMs calculated\n") 55 | } 56 | 57 | # Print timing for reference 58 | cat("Test execution time:", vim$time[3], "seconds\n") 59 | }) 60 | 61 | test_that("varimpact works with A_names parameter", { 62 | # Test a subset of columns for A_names (use just 1 variable). 63 | vim = varimpact(Y = data$y, x, 64 | A_names = colnames(x)[1], 65 | Q.library = c("SL.glm", "SL.mean"), 66 | g.library = c("SL.glm", "SL.mean"), 67 | bins_numeric = 2L, 68 | V = 2L, 69 | verbose = FALSE) 70 | 71 | # Test that the function completes without error 72 | expect_is(vim, "varimpact") 73 | expect_is(vim$time, "proc_time") 74 | 75 | # Test that some VIMs were actually calculated 76 | if (!is.null(vim$results_all)) { 77 | expect_gte(nrow(vim$results_all), 1) 78 | cat("Successfully calculated", nrow(vim$results_all), "VIMs with A_names\n") 79 | } else { 80 | cat("Warning: No VIMs calculated with A_names\n") 81 | } 82 | 83 | # Print timing for reference 84 | cat("A_names test execution time:", vim$time[3], "seconds\n") 85 | }) 86 | 87 | # Return to single core usage. 88 | future::plan("sequential") 89 | -------------------------------------------------------------------------------- /tests/testthat/test-varimpact.R: -------------------------------------------------------------------------------- 1 | # Make sure we're using the rebuilt package. Suppress any error if it isn't loaded. 2 | try(detach(package:varimpact), silent = TRUE) 3 | library(varimpact) 4 | library(testthat) 5 | 6 | # Create test dataset. 7 | context("varimpact(). Dataset A: continuous variables") 8 | 9 | # Set multicore-compatible seed. 10 | set.seed(1, "L'Ecuyer-CMRG") 11 | # Can't go below 90 without changing more varimpact default settings. 12 | N = 100 13 | num_normal = 5 14 | X = data.frame(matrix(rnorm(N * num_normal), N, num_normal)) 15 | 16 | # Systematic Y generation. 17 | Y = .2 * X[, 1] + 1 * X[, 2] - 0.8 * X[, 3] + .1 * X[, 3] * X[, 4] - .2 * abs(X[, 4]) 18 | 19 | # Binary distribution via the binomial. 20 | Y_bin = rbinom(N, 1, plogis(Y)) 21 | 22 | # Gaussian distribution. 23 | Y_gaus = Y + rnorm(N, 0, 1) 24 | 25 | # Add some missing data to X. 26 | miss_num = 10 27 | for (i in 1:miss_num) X[sample(nrow(X), 1), sample(ncol(X), 1)] = NA 28 | 29 | # Basic test - binary outcome. 30 | #future::plan("multiprocess") 31 | future::plan("sequential") 32 | vim = varimpact(Y = Y_bin, data = X[, 1:3], V = 3L, 33 | Q.library = c("SL.mean", "SL.glm"), 34 | g.library = c("SL.mean", "SL.glm"), 35 | verbose = TRUE, 36 | verbose_tmle = FALSE, bins_numeric = 3L) 37 | # Takes 25 seconds. 
38 | vim$time 39 | # Be explict about printing for code coverage of tests. 40 | print(vim) 41 | vim$results_all 42 | vim$results_by_fold 43 | # names(vim) 44 | # exportLatex testing moved to test-exportLatex.R 45 | 46 | # And try a gaussian outcome. 47 | vim = varimpact(Y = Y_gaus, data = X[, 1:3], V = 3L, verbose = TRUE, 48 | family = "gaussian") 49 | print(vim) 50 | 51 | # Test imputation 52 | vim = varimpact(Y = Y_bin, data = X[, 1:3], verbose = TRUE, impute = "zero") 53 | vim = varimpact(Y = Y_bin, data = X[, 1:3], verbose = TRUE, impute = "median") 54 | vim = varimpact(Y = Y_bin, data = X[, 1:4], verbose = TRUE, impute = "knn") 55 | 56 | # Test a subset of columns using A_names. 57 | vim = varimpact(Y = Y_bin, data = X, A_names = colnames(X)[1:2], verbose = TRUE) 58 | print(vim) 59 | 60 | # Only run in RStudio so that automated CRAN checks don't give errors. 61 | if (.Platform$GUI == "RStudio") { 62 | # Test parallelization 63 | future::plan("multiprocess", workers = 2) 64 | vim = varimpact(Y = Y_bin, data = X[, 1:3], verbose = TRUE) 65 | print(vim) 66 | } 67 | 68 | # Only run in RStudio so that automated CRAN checks don't give errors. 69 | if (.Platform$GUI == "RStudio") { 70 | # Test parallelization via snow. 71 | cl = snow::makeCluster(2L) 72 | future::plan("cluster", workers = cl) 73 | vim = varimpact(Y = Y_bin, data = X[, 1:4], verbose = TRUE) 74 | vim 75 | snow::stopCluster(cl) 76 | } 77 | 78 | context("varimpact(). Dataset B: factor variables") 79 | 80 | # Set a new multicore-compatible seed. 81 | set.seed(2, "L'Ecuyer-CMRG") 82 | 83 | X_fac = data.frame(lapply(1:ncol(X), 84 | function(col_i) 85 | as.factor(floor(abs(pmin(pmax(X[, col_i], -1), 1) * 3))))) 86 | dim(X_fac) 87 | colnames(X_fac) = paste0("fac_", 1:ncol(X_fac)) 88 | colnames(X_fac) 89 | summary(X_fac) 90 | 91 | # Return to sequential execution for now. 92 | future::plan("sequential") 93 | 94 | # Basic factor test. 95 | vim = varimpact(Y = Y_bin, data = X_fac[, 1:3], V = 2L, verbose = TRUE) 96 | print(vim) 97 | 98 | # And gaussian 99 | vim = varimpact(Y = Y_gaus, data = X_fac[, 1:3], V = 2L, verbose = TRUE, 100 | family = "gaussian") 101 | print(vim) 102 | 103 | # Only run in RStudio so that automated CRAN checks don't give errors. 104 | # Disabled for now - need to review. 105 | if (F && .Platform$GUI == "RStudio") { 106 | # Test parallelization. 107 | future::plan("multiprocess") 108 | 109 | # Try a snow cluster, which does return the output to STDOUT. 110 | if (F) { 111 | # Run manually when debugging. 112 | cores = RhpcBLASctl::get_num_cores() 113 | capture.output({ cl = snow::makeCluster(cores, type="SOCK", outfile = "") }) 114 | doSNOW::registerDoSNOW(cl) 115 | parallel::setDefaultCluster(cl) 116 | foreach::getDoParName() 117 | } 118 | 119 | # Factor variables with parallelization. 120 | # TOFIX: This does not complete currently if fac_4 is included. 121 | # I think it is due to HOPACH never completing. 122 | vim = varimpact(Y = Y_bin, data = X_fac[, 1:3], 123 | #A_names = c(_4", "fac_2"), 124 | verbose = TRUE) 125 | vim 126 | 127 | # Return to single core usage. 128 | 129 | # Run manually when debugging, if the snow cluster was used. 130 | if (F) { 131 | ck37r::stop_cluster(cl) 132 | } 133 | 134 | } 135 | 136 | context("varimpact(). Dataset C: numeric and factor variables") 137 | 138 | ################################# 139 | # Combined numeric and factor test. 140 | X_combined = cbind(X[1:3], X_fac[4:5]) 141 | 142 | # Basic combined test. 
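# Combining numeric (X) and factor (X_fac) covariates exercises both the numeric and factor variable-importance code paths in a single call.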
143 | vim = varimpact(Y = Y_bin, data = X_combined, V = 2, verbose = TRUE) 144 | print(vim) 145 | 146 | # And gaussian 147 | vim = varimpact(Y = Y_gaus, data = X_combined, V = 2, verbose = TRUE, 148 | family = "gaussian") 149 | print(vim) 150 | 151 | context("varimpact(). Dataset D: basic example") 152 | 153 | #################################### 154 | # Create test dataset. 155 | set.seed(1, "L'Ecuyer-CMRG") 156 | 157 | N = 100 158 | num_normal = 7 159 | X = as.data.frame(matrix(rnorm(N * num_normal), N, num_normal)) 160 | Y = rbinom(N, 1, plogis(.2*X[, 1] + .1*X[, 2] - .2*X[, 3] + .1*X[, 3]*X[, 4] - .2*abs(X[, 4]))) 161 | # Add some missing data to X so we can test imputation. 162 | for (i in 1:10) X[sample(nrow(X), 1), sample(ncol(X), 1)] = NA 163 | 164 | #################################### 165 | # Basic example 166 | # TODO: fix warnings here, due to failed folds. 167 | # TOFIX: there is an error here on the numeric variables. 168 | # task 3 failed - "attempt to select less than one element in get1index" 169 | # X_3 seems to be causing the problem - need to investigate why. 170 | vim = varimpact(Y = Y, data = X, A_names = colnames(X)[c(1, 2, 4:7)], 171 | verbose = TRUE, parallel = FALSE) 172 | print(vim) 173 | vim$results_all 174 | vim$results_by_fold 175 | # In this test all variables are significant, which is rare. 176 | # exportLatex testing moved to test-exportLatex.R 177 | --------------------------------------------------------------------------------