├── .Rbuildignore
├── .gitignore
├── .travis.yml
├── DESCRIPTION
├── LICENSE
├── NAMESPACE
├── R
    ├── RcppExports.R
    ├── SL.hdps.R
    ├── assess_recurrence.R
    ├── hdps-package.R
    ├── hdps_screen.R
    ├── identify_covariates.R
    ├── prioritize_covariates.R
    └── utils.R
├── README.md
├── hdps.Rproj
├── man
    ├── SL.hdps.generator.Rd
    ├── assess_recurrence.Rd
    ├── hdps-package.Rd
    ├── hdps_screen.Rd
    ├── identify_covariates.Rd
    ├── predict.SL_hdps.Rd
    ├── predict.hdps_covars.Rd
    ├── prioritize_covariates.Rd
    └── screen.Rd
├── src
    ├── RcppExports.cpp
    ├── calc_rr_cds.cpp
    ├── colPrevScores.cpp
    └── colVars.cpp
└── tests
    ├── testthat.R
    └── testthat
        ├── test_SL.hdps.R
        ├── test_assess_recurrence.R
        ├── test_calc_rr_cds.R
        ├── test_hdps_screen.R
        └── test_predict.R


/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | .travis.yml
4 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | *.o
5 | *.so
6 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: r
2 | warnings_are_errors: true
3 | r_binary_packages:
4 |  - testthat
5 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: hdps
 2 | Type: Package
 3 | Title: High-dimensional propensity score algorithm
 4 | Version: 0.1.6
 5 | Date: 2017-08-16
 6 | Author: Sam Lendle
 7 | Maintainer: Sam Lendle <sam.lendle@gmail.com>
 8 | Description: The high-dimensional propensity score algorithm is a method for
 9 |     high-dimensional proxy adjustment in claims data. This package implements
10 |     the variable transformation and variable selection parts of the
11 |     algorithm.
12 | License: MIT + file LICENSE
13 | Imports:
14 |     Rcpp (>= 0.11.1),
15 |     Matrix,
16 |     glmnet    
17 | LinkingTo: Rcpp
18 | Suggests:
19 |     testthat
20 | RoxygenNote: 6.0.1
21 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2017
2 | COPYRIGHT HOLDER: Sam Lendle


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | S3method(predict,SL_hdps)
 4 | S3method(predict,hdps_covars)
 5 | export(SL.hdps.generator)
 6 | export(assess_recurrence)
 7 | export(hdps_screen)
 8 | export(identify_covariates)
 9 | export(prioritize_covariates)
10 | export(screen.excludenames)
11 | export(screen.names)
12 | importFrom(Matrix,sparse.model.matrix)
13 | importFrom(Rcpp,evalCpp)
14 | importFrom(glmnet,cv.glmnet)
15 | importFrom(glmnet,glmnet)
16 | importFrom(stats,cor)
17 | importFrom(stats,predict)
18 | importFrom(stats,quantile)
19 | useDynLib(hdps)
20 | 


--------------------------------------------------------------------------------
/R/RcppExports.R:
--------------------------------------------------------------------------------
 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand
 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 3 | 
 4 | calc_rr_cds <- function(outcome, covars) {
 5 |     .Call('_hdps_calc_rr_cds', PACKAGE = 'hdps', outcome, covars)
 6 | }
 7 | 
 8 | colPrevScores <- function(x) {
 9 |     .Call('_hdps_colPrevScores', PACKAGE = 'hdps', x)
10 | }
11 | 
12 | colVars <- function(x) {
13 |     .Call('_hdps_colVars', PACKAGE = 'hdps', x)
14 | }
15 | 
16 | 


--------------------------------------------------------------------------------
/R/SL.hdps.R:
--------------------------------------------------------------------------------
  1 | ##' Generates a wrapper for SuperLearner using HDPS
  2 | ##' 
  3 | ##' A HDPS candidate will generate covariates using \code{hdps_screen} from
  4 | ##' codes, and estimate the propensity score with logistic regression on
  5 | ##' generated covariates and predefined covariates.
  6 | ##' 
  7 | ##' To use HDPS in SuperLearner to estimate a propensity score, you need to
  8 | ##' include the outcome variable as a covariate where here outcome means the
  9 | ##' outcome of interest in the causal problem as opposed to the \code{Y}
 10 | ##' variable in SuperLearner. For non-HDPS candidates in SuperLearner, it's
 11 | ##' important to exclude the outcome variable via \code{\link[=screen]{screen.named}} or
 12 | ##' some other screening algorithm in order to avoid adjusting for something
 13 | ##' downstream on the causal pathway.
 14 | ##' @title SL.hdps.generator
 15 | ##' @param out_name Name of the outcome variable.
 16 | ##' @param dimension_names Dimension names of HDPS dimensions. See
 17 | ##'   \code{\link{hdps_screen}}.
 18 | ##' @param predef_covar_names Names of predefined covariates to be included in
 19 | ##'   logistic regression model.
 20 | ##' @param keep_k_total See \code{\link{hdps_screen}}.
 21 | ##' @param ... Other arguments passed to \code{\link{hdps_screen}}.
 22 | ##' @param cvglmnet Use \code{glmnet} or \code{cv.glmnet} for fitting. Defaults to FALSE.
 23 | ##' @param glmnet_args list of arguments to be passed to glmnet or cv.glmnet. If \code{cvglmnet=FALSE}, \code{glmnet_args}
 24 | ##' should be set such that calling \code{predict} on the \code{glmnet} object returns only one vector of predictions. 
 25 | ##' E.g. only one value of \code{lambda} should be set.
 26 | ##' @return A SuperLearner wrapper function
 27 | ##' @author Sam Lendle
 28 | ##' @importFrom Matrix sparse.model.matrix 
 29 | ##' @importFrom glmnet glmnet
 30 | ##' @importFrom glmnet cv.glmnet
 31 | ##' @importFrom stats cor
 32 | ##' @export
 33 | SL.hdps.generator <- function(out_name, dimension_names, predef_covar_names=c(), keep_k_total, ..., 
 34 |                               cvglmnet=FALSE, glmnet_args=if (cvglmnet) list() else list(lambda=0)) {
 35 |   function(Y, X, newX, family, obsWeights, id) {
 36 |     if (missing(newX)) {
 37 |       newX <- X
 38 |     }
 39 |     if(family$family == 'gaussian') {
 40 |       stop("SL.hdps only for binomial")
 41 |     }
 42 |     
 43 |     hdps_fit <- hdps_screen(X[, out_name], Y, X, dimension_names, keep_k_total=keep_k_total, ...)
 44 |     
 45 |     predef_covars <- X[, predef_covar_names]
 46 |     if (keep_k_total > 0) {
 47 |       hdps_covars <- predict(hdps_fit)
 48 |       hdps_keep <- colnames(hdps_covars)[abs(cor(Y, hdps_covars)) <= 0.95]
 49 |       hdps_covars <- hdps_covars[, hdps_keep]
 50 |       df = as.data.frame(cbind(predef_covars, hdps_covars))
 51 |     } else {
 52 |       hdps_keep <- NULL
 53 |       df = as.data.frame(predef_covars)
 54 |     }
 55 |     
 56 |     smm <- sparse.model.matrix(~.-1, df)
 57 |     
 58 |     myglmnet <- function(...) if (cvglmnet) 
 59 |       cv.glmnet(smm, Y, family="binomial") else 
 60 |       glmnet(smm, Y, family="binomial", ...)
 61 |     glmnet_fit <- do.call(myglmnet, glmnet_args)
 62 |     
 63 |     if (identical(X, newX)) {
 64 |       smmnew <- smm
 65 |     } else {
 66 |     
 67 |       new_predef_covars <- newX[, predef_covar_names]
 68 |       if (keep_k_total > 0) {
 69 |         new_hdps_covars <- predict(hdps_fit, newdata=newX)
 70 |         new_hdps_covars <- new_hdps_covars[, hdps_keep]
 71 |         new_df = as.data.frame(cbind(new_predef_covars, new_hdps_covars))
 72 |       } else {
 73 |         new_df = as.data.frame(new_predef_covars)
 74 |       }
 75 |       
 76 |       smmnew <- sparse.model.matrix(~.-1, new_df)
 77 |     }
 78 |     
 79 |     pred <- predict(glmnet_fit, smmnew, type="response")
 80 |     if (ncol(pred) != 1) stop("Check cvglmnet and glmnet_args arguments to insure that predict returns only one column")
 81 |     
 82 |     
 83 |     # fit returns all objects needed for predict.SL.template
 84 |     fit <- list(glmnet_fit = glmnet_fit, hdps_fit = hdps_fit, 
 85 |                 predef_covar_names=predef_covar_names, hdps_keep=hdps_keep, keep_k_total=keep_k_total)
 86 |     # declare class of fit for predict.SL.template
 87 |     class(fit) <- 'SL_hdps'
 88 |     # return a list with pred and fit
 89 |     out <- list(pred = pred, fit = fit)
 90 |     return(out)
 91 |   }
 92 | }
 93 | 
 94 | #' Get predictions from SL_hdps wrapper
 95 | #'
 96 | #' @title Get predictions from SL_hdps wrapper
 97 | #' @param object object of class \code{SL_hdps}
 98 | #' @param newdata a matrix of covariates to predict from
 99 | #' @param ... ignored
100 | #' @return vector of predictions
101 | #' @author Sam Lendle
102 | #' @importFrom stats predict
103 | #' @export
104 | predict.SL_hdps <- function(object, newdata, ...){
105 |   new_predef_covars <- newdata[, object$predef_covar_names]
106 |   new_hdps_covars <- predict(object$hdps_fit, newdata=newdata, keep_k_total=object$keep_k_total)
107 |   new_hdps_covars <- new_hdps_covars[, object$hdps_keep]
108 |   new_df <- cbind(new_predef_covars, new_hdps_covars)
109 |   smmnew <- sparse.model.matrix(~.-1, new_df)
110 |   pred <- predict(object$glmnet_fit, smmnew, type = "response")
111 |   if (ncol(pred) != 1) stop("Check cvglmnet and glmnet_args arguments to insure that predict returns only one column")
112 |   pred
113 | }
114 | 
115 | ##' @name screen
116 | ##' @rdname screen
117 | ##' 
118 | ##' @title SuperLearner screening wrappers
119 | ##' 
120 | ##' @description Functions to set up screening wrappers for SuperLearner
121 | ##'   
122 | ##' @param names Names to be included or excluded
123 | ##' 
124 | ##' These functions generate simple screening wrappers for SuperLearner to 
125 | ##' include or exclude variables based on \code{names}. This is is helpful 
126 | ##' because in order to use HDPS as a candidate in SuperLearner, you need to 
127 | ##' include the study outcome variable as a covariate. But to use a non-HDPS 
128 | ##' algorithm, (say a random forest on some specified set of covariates,) as a
129 | ##' candidate as well, you want to make sure you're not adjusting for the 
130 | ##' outcome which is downstream from treatment on the causal pathway.
131 | ##' 
132 | ##' See documentation for the SuperLearner package for more about screening algorithms.
133 | ##'   
134 | ##' @examples
135 | ##' 
136 | ##' screen.predefined <- screen.names(c("names", "of", "predefined",
137 | ##' "covariates", "that", "definitely", "dont", "include", "the", "outcome"))
138 | ##' 
139 | ##' screen.notoutcome <- screen.excludenames(c("outcome_variable_name",
140 | ##' "and", "other", "covariates", "to", "exclude"))
141 | ##' 
142 | NULL
143 | 
144 | ##' @rdname screen
145 | ##' @export
146 | screen.names <- function (names) {
147 |   function (Y, X, family, obsWeights, id, ...) {
148 |     colnames(X) %in% names
149 |   }
150 | }
151 | 
152 | ##' @rdname screen
153 | ##' @export
154 | screen.excludenames <- function (names) {
155 |   function (Y, X, family, obsWeights, id, ...) {
156 |     !(colnames(X) %in% names)
157 |   }
158 | }
159 | 


--------------------------------------------------------------------------------
/R/assess_recurrence.R:
--------------------------------------------------------------------------------
 1 | #' Expands covarites to up to three binary columns for at least one, sporadic, or frequent occrence of each covariate.
 2 | #'
 3 | #' Each column \code{x} of \code{covars} is expanded to three binary columns. 
 4 | #' The first column indicates that the value of \code{x} is non-zero. 
 5 | #' The second indicates that the value of \code{x} \eqn{\ge} the median of non-zero values of \code{x}.
 6 | #' The third indicates that the value of \code{x} \eqn{\ge} the 75th percentile of non-zero values of \code{x}. 
 7 | #' Non-unique columns per covariate are dropped.
 8 | #' 
 9 | #' Groups of columns of the returned matrix are in the same order of columns in \code{covars}. 
10 | #' 
11 | #' If \code{covars} has column names, the returned matrix will have the same column names with suffexes \code{"_once"},
12 | #' \code{"_sporadic"}, and \code{"_frequent"} for the first, second, and third expanded columns, respectively.
13 | #' 
14 | #' @title assess_recurrence
15 | #' @param covars a matrix or something that can be coerced with \code{\link[base]{as.matrix}} of covariates
16 | #' @param debug Enables some debuging checks which slow things down, but may yield useful warnings or errors.
17 | #' @return Expanded \code{covars} matrix.
18 | #' @author Sam Lendle
19 | #' @references Schneeweiss, S., Rassen, J. A., Glynn, R. J., Avorn, J., Mogun,
20 | #' H., & Brookhart, M. A. (2009). High-dimensional propensity score adjustment
21 | #' in studies of treatment effects using health care claims data. \emph{Epidemiology
22 | #' (Cambridge, Mass.)}, 20(4), 512.
23 | #' @export
24 | assess_recurrence <- function(covars, debug=FALSE) {
25 |   #expands a matrix by replacing it's columns with as.numeric(x > 0), 
26 |   # as.numeric(x > median(x)), as.numeric(x > quantile(x, prob=0.75))
27 |   #only unique columns (per original column) are kept
28 |   
29 |   covars <- as.matrix(covars)
30 |   
31 |   temp <- function(i) {
32 |     column <- covars[,i]
33 |     quants <- get_quantiles(column)
34 |     column_recurrence(column, quants, warndup=debug)
35 |   }
36 |   #mats_quants <- lapply(1:ncol(covars), temp)
37 |   mats_quants <- list()
38 |   
39 |   for (i in 1:ncol(covars))
40 |     mats_quants[[i]] = temp(i)
41 |   
42 |   mats <- lapply(mats_quants, `[[`, "mat")#function(mq) mq[["mat"]])
43 |   quants <- lapply(mats_quants, `[[`, "quants")#function(mq) mq[["quants"]])
44 |   
45 |   cnams <- colnames(covars)  
46 |   if (!is.null(cnams)) {
47 |     for (i in seq_along(mats)) {
48 |       colnames(mats[[i]]) <- paste(cnams[i], colnames(mats[[i]]), sep="")    
49 |       quants[[i]] <- lapply(quants[[i]], function(q) c(varname=cnams[i], q))
50 |     }
51 |   }
52 | 
53 |   mat <- do.call(cbind, mats)
54 |   quants <- do.call(c, quants)
55 |   list(mat=mat, quants=quants)
56 | }
57 | 


--------------------------------------------------------------------------------
/R/hdps-package.R:
--------------------------------------------------------------------------------
 1 | #' High-dimensional propensity score algorithm
 2 | #' 
 3 | #' The high-dimensional propensity score (HDPS) algorithm is a method for
 4 | #' high-dimensional proxy adjustment in claims data. This package implements
 5 | #' the variable transformation and variable selection parts of the
 6 | #' algorithm.
 7 | #' 
 8 | #' \tabular{ll}{
 9 | #' Package: \tab hdps\cr
10 | #' Type: \tab Package\cr
11 | #' Version: \tab 0.1.6\cr
12 | #' Date: \tab 2017-08-16\cr
13 | #' License: \tab MIT \cr 
14 | #' } 
15 | #' 
16 | #' This package implements part of step 2 (\code{\link{identify_covariates}}), 
17 | #' steps 3 (\code{\link{assess_recurrence}}) and 4 (\code{\link{prioritize_covariates}})
18 | #' of the HDPS algorithm (Schneeweiss et al., 2009).
19 | #' 
20 | #' The \code{\link{hdps_screen}} function is a wrapper function for \code{\link{identify_covariates}},
21 | #' \code{\link{assess_recurrence}},  and \code{\link{prioritize_covariates}}.
22 | #' 
23 | #' @name hdps-package
24 | #' @aliases hdps-package hdps
25 | #' @docType package
26 | #' @author Sam Lendle
27 | #' 
28 | #' Maintainer: Sam Lendle <sam.lendle@@gmail.com>
29 | #' 
30 | #' @references Schneeweiss, S., Rassen, J. A., Glynn, R. J., Avorn, J., Mogun,
31 | #' H., & Brookhart, M. A. (2009). High-dimensional propensity score adjustment
32 | #' in studies of treatment effects using health care claims data. \emph{Epidemiology
33 | #' (Cambridge, Mass.)}, 20(4), 512.
34 | #' @keywords package
35 | #' @examples
36 | #' 
37 | #' #~~ simple examples of the most important functions ~~
38 | #' 
39 | #' @useDynLib hdps
40 | #' @importFrom Rcpp evalCpp
41 | NULL
42 | 
43 | 
44 | 
45 | 


--------------------------------------------------------------------------------
/R/hdps_screen.R:
--------------------------------------------------------------------------------
  1 | #' The \code{hdps_screen} function performs part of step 2 (\code{\link{identify_covariates}}), 
  2 | #' steps 3 (\code{\link{assess_recurrence}}) and 4 (\code{\link{prioritize_covariates}})
  3 | #' of the HDPS algorithm (Schneeweiss et al., 2009).
  4 | #'
  5 | #' The \code{hdps_screen} function performs part of step 2 (\code{\link{identify_covariates}}), 
  6 | #' steps 3 (\code{\link{assess_recurrence}}) and 4 (\code{\link{prioritize_covariates}})
  7 | #' of the HDPS algorithm (Schneeweiss et al., 2009).
  8 | #' 
  9 | #' \emph{Step 2.} Columns of \code{covars} are split by data dimension (as defined in Schneeweiss et al. (2009)) and
 10 | #' filtered by \code{\link{identify_covariates}}.
 11 | #'
 12 | #' Dimensions can be specified in two ways.
 13 | #' If \code{dimension_names} is used, the \code{colnames(covars)} is \code{\link[base]{grep}}ed for each value of 
 14 | #' \code{dimension_names}.
 15 | #' If some column names match more than one pattern, an error is thrown.
 16 | #' If some column names are not matched by any pattern, a warning is issued and those columns are ignored.
 17 | #' For example, suppose the column names of \code{covars} are \code{c("drug_1", "drug_2", "proc_1", "proc_2")}.
 18 | #' \code{dimension_names <- c("drug", "proc")} would split \code{covars} into two dimensions,
 19 | #' one for \code{drug}s and one for \code{proc}s.
 20 | #' 
 21 | #' Dimensions can also be specified by \code{dimension_indexes} which should contain a list of either column
 22 | #' indexes or column names for each dimension.
 23 | #' 
 24 | #' If neither \code{dimension_names} nor \code{dimension_indexes} is specified, all covariates are treated as one dimension.
 25 | #' 
 26 | #' \emph{Step 3.} After filtering, remaining covariates are expanded by \code{\link{assess_recurrence}}.
 27 | #' 
 28 | #' If at this point, the number of expanded covariates is less than \code{keep_k_total}, all expanded covariates are returned.
 29 | #' 
 30 | #' \emph{Step 4.} Expanded covariates are ordered with \code{\link{prioritize_covariates}}.
 31 | #' 
 32 | #' \emph{Step 5.} Step 5 can be performed with \code{\link{predict.hdps_covars}}.
 33 | #' 
 34 | #' @title hdps_screen
 35 | #' @param outcome binary vector of outcomes
 36 | #' @param treatment binary vector of treatments
 37 | #' @param covars \code{matrix} or \code{data.frame} of binary covariates. 
 38 | #' @param dimension_names A character vector of patterns to match against the column names of \code{covars} to split columns into dimension groups. See details.
 39 | #' @param dimension_indexes A list of vectors of column indexes corresponding to dimension groups. See details. Cannot be specified with \code{dimension_names}.
 40 | #' @param keep_n_per_dimension The maximum number of covariates to be kept per dimension by \code{\link{identify_covariates}}.
 41 | #' @param keep_k_total Total number of covariates to keep after expanding by \code{\link{assess_recurrence}} and ordering by \code{link{prioritize_covariates}}.
 42 | #' @param verbose Should verbose output be printed?
 43 | #' @param debug Enables some debuging checks which slow things down, but may yield useful warnings or errors.
 44 | #' @return An object of class \code{hdps_covars}
 45 | #' @seealso \code{\link{predict.hdps_covars}}
 46 | #' @references Schneeweiss, S., Rassen, J. A., Glynn, R. J., Avorn, J., Mogun,
 47 | #' H., & Brookhart, M. A. (2009). High-dimensional propensity score adjustment
 48 | #' in studies of treatment effects using health care claims data. \emph{Epidemiology
 49 | #' (Cambridge, Mass.)}, 20(4), 512.
 50 | #' @author Sam Lendle
 51 | #' @examples
 52 | #' set.seed(123)
 53 | #' n <- 1000
 54 | #' p <- 10000
 55 | #' out <- rbinom(n, 1, 0.05)
 56 | #' trt <- rbinom(n, 1, 0.5)
 57 | #' covars <- matrix(rbinom(n*p, 3, 0.05), n)
 58 | #' colnames(covars) <- c(paste("drug", 1:(p/2), sep="_"),
 59 | #'                       paste("proc", 1:(p/2), sep="_"))
 60 | #' 
 61 | #' dimension_names <- c("drug", "proc")
 62 | #' 
 63 | #' screened_covars_fit <- hdps_screen(out, trt, covars, 
 64 | #'                                    dimension_names = dimension_names,
 65 | #'                                    keep_n_per_dimension = 400,
 66 | #'                                    keep_k_total = 200,
 67 | #'                                    verbose=TRUE)
 68 | #'                                    
 69 | #' screened_covars <- predict(screened_covars_fit)
 70 | #' 
 71 | #' @export
 72 | hdps_screen <- function(outcome, treatment, covars,
 73 |                         dimension_names=NULL, dimension_indexes=NULL,
 74 |                         keep_n_per_dimension=200, keep_k_total=500,
 75 |                         verbose=FALSE, debug=FALSE) {
 76 |   
 77 |   check_inputs(outcome, treatment, covars)
 78 |   
 79 |   if (!is.null(dimension_names) && !is.null(dimension_indexes)) {
 80 |     stop("At most, one of dimension_names and dimension_indexes should be specified")
 81 |   }
 82 |   
 83 |   if (!is.null(dimension_names)) {
 84 |     dimension_indexes <- lapply(dimension_names, grep, x = colnames(covars))
 85 |     all_idx <- do.call(c, dimension_indexes)
 86 |     if (anyDuplicated(all_idx)) {
 87 |       stop("Some column names of covars are matched by more than one pattern in dimension_names")
 88 |     }
 89 |     if (!all(all_idx %in% 1:ncol(covars))) {
 90 |       warning("Some column names of covars are not matched by any of the patterns in dimension_names")
 91 |     } 
 92 |   }
 93 |   
 94 |   # Step 2. Identify empirical candidate covariates
 95 |   if (is.null(dimension_indexes)) {
 96 |     if (verbose) message("No dimensions specified...")
 97 |     if (verbose) message("Filtering covariates...")
 98 |     filtered_covars <- identify_covariates(covars, keep_n_covars=keep_n_per_dimension, indexes=FALSE)
 99 |   } else {
100 |     if (verbose) message("Filtering covariates...")
101 |     filtered_covars <- lapply(seq_along(dimension_indexes), function(i) {
102 |       if (verbose) message("\tFiltering dimension ", 
103 |                            if (!is.null(dimension_names)) dimension_names[i] else i,
104 |                            "...")
105 |       identify_covariates(covars[, dimension_indexes[[i]]], keep_n_covars=keep_n_per_dimension, indexes=FALSE)
106 |     })
107 |     if (verbose) message("Combining dimensions...")
108 |     filtered_covars <- do.call(cbind, filtered_covars)
109 |   }
110 |   
111 |   #Step 3. Assess recurrence
112 |   if (verbose) message("Expanding covariates...")
113 |   ar <- assess_recurrence(filtered_covars, debug=debug)
114 |   expanded_covars <- ar[["mat"]]
115 |   quants <- ar[["quants"]]
116 |   
117 |   if (dim(expanded_covars)[2] != length(quants)) stop("something is wrong...")
118 |   
119 |   #Step 4. Prioritize covariates
120 |   if (verbose) message("Prioritizing covariates...")
121 |   ordered_indexes <- prioritize_covariates(outcome, treatment, expanded_covars, keep_NaNs=TRUE)
122 |   
123 |   res <- list(expanded_covars=expanded_covars,
124 |        quants=quants,
125 |        ordered_indexes=ordered_indexes,
126 |        keep_k_total=keep_k_total
127 |        )
128 |   if (verbose) message("...Done!")
129 |   class(res) <- "hdps_covars"
130 |   
131 |   return(res)
132 | 
133 | }
134 | 
135 | #' returns the matix of covariates based on an hdps screening
136 | #'
137 | #' @title Get matrix of hdps selected covariates
138 | #' @param object object of class \code{hdps_covars}
139 | #' @param newdata \code{NULL}, or a matrix who's columns have names corresponding to those selected by hdps in \code{object}. 
140 | #' If \code{NULL} selected covariates from original matrix used in the screening step are returned.
141 | #' @param keep_k_total change \code{keep_k_total} from the original call to \code{\link{hdps_screen}}
142 | #' @param ... ignored
143 | #' @return A matrix of hdps selected covariates
144 | #' @seealso \link{hdps_screen}
145 | #' @author Sam Lendle
146 | #' @export
147 | predict.hdps_covars <- function(object, newdata=NULL, keep_k_total, ...) {
148 |   if (missing(keep_k_total)) keep_k_total <- object$keep_k_total
149 |   
150 |   if (!is.null(newdata)) {
151 |     #could be more efficient here
152 |     # by first filtering the quants to only the keep_k_total needed
153 |     # then by grouping by varname
154 |     mats <- lapply(object$quants, function(quant) {
155 |       x <- newdata[, quant$varname]
156 |       mat <- column_recurrence(x, list(quant))$mat
157 |       colnames(mat) <- paste(quant$varname, colnames(mat), sep="")    
158 |       mat
159 |     })
160 |     expanded_covars <- do.call(cbind, mats)
161 |   } else {
162 |     expanded_covars <- object$expanded_covars
163 |   }
164 |   
165 |   if (ncol(expanded_covars) <= keep_k_total) {
166 |     return(expanded_covars)
167 |   }
168 |   
169 |   ordered_indexes <- object$ordered_indexes
170 |   selected_indexes <- ordered_indexes[1:min(keep_k_total, length(ordered_indexes))]
171 |   selected_covars <- expanded_covars[, sort(selected_indexes)]
172 |   return(selected_covars)
173 | }
174 |   
175 |   
176 | 


--------------------------------------------------------------------------------
/R/identify_covariates.R:
--------------------------------------------------------------------------------
 1 | #' Given a matrix of covarites, \code{identify_covariates} returns the top \code{keep_n_covars} or the indexes of those columns.
 2 | #'
 3 | #' Columns are sorted in descending order of \code{min(prevalence, 1-prevalence)}  where \code{prevalence} is the the proportion of
 4 | #' non-zero values in a given column.
 5 | #' 
 6 | #' If \code{indexes==TRUE}, a vector of the top \code{keep_n_covars} column indexes is returned.
 7 | #' 
 8 | #' If \code{indexes==FALSE}, a matrix of covariates is returned whos columns are the top \code{keep_n_covars} colums of
 9 | #' \code{covars}. Columns are in their original order.
10 | #' If also \code{keep_n_covars >= ncol(covar)}, then the function returns immediately without ranking columns in terms of
11 | #' prevalence as it is unecessary.
12 | #' 
13 | #' \strong{Differences from Schneeweiss et al. (2009):} 
14 | #' \itemize{
15 | #'   \item{Covariates that have fewer than 100 non-zero values are not automatically dropped. 
16 | #'   If typical covariates tend to have more than 100 non-zero values will typically be ranked higher than those with fewer than 100
17 | #'   automatically.}
18 | #' }
19 | #' 
20 | #' @title identify_covariates
21 | #' @param covars a matrix or something that can be coerced with \code{\link[base]{as.matrix}} of covariates
22 | #' @param keep_n_covars number of covariates to keep
23 | #' @param indexes Should indexes be returned? Or a subset of \code{covars}. Defaults to \code{FALSE}.
24 | #' @return Indexes of identified columns or a subset of \code{covars}
25 | #' @references Schneeweiss, S., Rassen, J. A., Glynn, R. J., Avorn, J., Mogun,
26 | #' H., & Brookhart, M. A. (2009). High-dimensional propensity score adjustment
27 | #' in studies of treatment effects using health care claims data. \emph{Epidemiology
28 | #' (Cambridge, Mass.)}, 20(4), 512.
29 | #' @author Sam Lendle
30 | #' @export
31 | identify_covariates <- function(covars, keep_n_covars=200, indexes=FALSE) {
32 |   
33 |   if (!indexes && ncol(covars) <= keep_n_covars) return(covars)
34 |   vars <- colPrevScores(as.matrix(covars))
35 |   var_ords <- order(vars, decreasing=TRUE)[1:min(keep_n_covars, ncol(covars))] 
36 |   
37 |   if (indexes) {
38 |     return(var_ords)
39 |   } else {
40 |   covars[, sort(var_ords)]  
41 |   }
42 | }
43 | 


--------------------------------------------------------------------------------
/R/prioritize_covariates.R:
--------------------------------------------------------------------------------
 1 | #' Returns the indexes of columns \code{covars} ordered by the possible amount of counfounding each column could adjust for.
 2 | #'
 3 | #' For each covariate in \code{covars}, the potential multiplicitive bias is calculated as described in Schneeweiss et al. (2009).
 4 | #' The column indexes are then put in descending order of the absolute value of the log of the multiplicitive bias.
 5 | #' 
 6 | #' If \code{return_bias==TRUE}, the returned vector of indexes includes the multiplicitive biases sorted in the same order and stored in an attribute called \code{"bias_m"}.
 7 | #' 
 8 | #' If \code{outcome} has no variation for a particular value of a covariate, then the multiplicitive bias is calculated as \code{NaN}.
 9 | #' If \code{keep_NaNs==FALSE}, then the column indexes of such covariates are not included in the returned vector.
10 | #' 
11 | #' @title prioritize_covariates
12 | #' @param outcome binary vector of outcomes
13 | #' @param treatment binary vector of treatments
14 | #' @param covars \code{matrix} or \code{data.frame} of binary covariates. 
15 | #' @param return_bias Should the calculated multiplitive biases be returned with the ordered indexes? Defaults to \code{FALSE}
16 | #' @param keep_NaNs If the calculated multiplicitive bias for a covariate is \code{NaN}, should its index be included in the returned vector? Defaults to \code{FALSE}
17 | #' @return Vector of column indexes of \code{covars}.
18 | #' @author Sam Lendle
19 | #' @references Schneeweiss, S., Rassen, J. A., Glynn, R. J., Avorn, J., Mogun,
20 | #' H., & Brookhart, M. A. (2009). High-dimensional propensity score adjustment
21 | #' in studies of treatment effects using health care claims data. \emph{Epidemiology
22 | #' (Cambridge, Mass.)}, 20(4), 512.
23 | #' @export
24 | prioritize_covariates <- function(outcome, treatment, covars, return_bias=FALSE, keep_NaNs=FALSE) {
25 |   check_inputs(outcome, treatment, covars, covars_bin=TRUE)
26 |   
27 |   treatment <- factor(treatment)
28 |   
29 |   covar_prev <- by(covars, treatment, colMeans)
30 |   p_c1 <- covar_prev[[levels(treatment)[1]]]
31 |   p_c0 <- covar_prev[[levels(treatment)[2]]]
32 |   
33 |   #a vector of rr_cd if rr_cd > 1, 1/rr_cd otherwise
34 |   rr_cds <- calc_rr_cds(outcome, covars)
35 |   
36 |   #Infs in rr_cds will result in NaN for some covariates
37 |   bias_mult <- (p_c1 * (rr_cds - 1) + 1) / (p_c0 * (rr_cds - 1) + 1)
38 |   
39 |   abs_log_bias_mult <- abs(log(bias_mult))
40 |   
41 |   na.last <- if (keep_NaNs) TRUE else NA
42 |   ordered_idxs <- order(abs_log_bias_mult, decreasing=TRUE, na.last=na.last)
43 |   if (return_bias) attr(ordered_idxs, "bias_m") <- bias_mult[ordered_idxs]
44 |   ordered_idxs
45 | }
46 | 


--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
 1 | #' @importFrom stats quantile
 2 | get_quantiles <- function(x) {
 3 |   xx0 <- x[x>0]
 4 |   quants <- quantile(xx0, probs=c(0.5, 0.75), names=FALSE, type=2)
 5 |   quants <- list(list(q="_once", count=1),
 6 |                  list(q="_sporadic", count=quants[1]),
 7 |                  list(q="_frequent", count=quants[2]))
 8 |   
 9 |   counts <- sapply(quants, `[[`, "count")
10 |   ux <- unique(xx0)
11 |   cutoffs <- sapply(counts, function(count) min(ux[ux >= count]))  
12 |   dups <- duplicated(cutoffs)
13 |   quants[!dups]
14 | }
15 | 
16 | column_recurrence <- function(x, quants, warndup=FALSE) {
17 |   mat <- matrix(0, length(x), length(quants))    
18 |   colnames(mat) <- sapply(quants, `[[`, "q")
19 |   
20 |   for (i in 1:length(quants)) {
21 |     mat[, i] <- x >= quants[[i]]$count
22 |   }
23 |   
24 |   if (warndup) {
25 |     dups <- duplicated(mat, MARGIN=2)
26 |     if (any(dups)) {
27 |       warning("Duplicate columns in mat. This should not happen when hdps_screen is called, but could when predict is called.")
28 |     }
29 |   }
30 |   
31 |   
32 |   list(mat=mat, quants=quants)
33 | }
34 | 
35 | check_inputs <- function(outcome, treatment, covars, covars_bin=FALSE) {
36 |   n = nrow(covars)
37 |   
38 |   if(!is.vector(outcome)) stop("outcome should be a vector")
39 |   if(!is.vector(treatment)) stop("treatment should be a vector")
40 |   
41 |   if (!length(outcome) == n || !length(treatment) == n)
42 |     stop("outcome and treatment should be the same length, which should be equal to nrow(covars)")
43 |   
44 |   if (!all(outcome %in% c(0,1)))
45 |     stop("outcome should be binary")
46 |   if (!all(treatment %in% c(0,1)))
47 |     stop("treatment should be binary")
48 |   if (covars_bin && !all(covars %in% c(0,1)))
49 |     stop("covars should be binary")
50 | }
51 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | hdps
 2 | ====
 3 | 
 4 | [![Build Status](https://travis-ci.org/lendle/hdps.svg)](https://travis-ci.org/lendle/hdps)
 5 | 
 6 | High dimensional propensity score algorithm
 7 | 
 8 | Install
 9 | -------
10 | This package can be installed using the `devtools` package in R:
11 | ```r
12 | library(devtools)
13 | install_github("lendle/hdps")
14 | # or if that doesn't work, try
15 | install_git("git@github.com:lendle/hdps")
16 | ```
17 | 
18 | Documentation
19 | -------------
20 | 
21 | Wouldn't it be great if you could automatically generate markdown files from `.Rd` files so you could just view the docs directly on github? Well you can't. So open up R and do this instead:
22 | ```r
23 | library(hdps)
24 | ?hdps
25 | ```
26 | 


--------------------------------------------------------------------------------
/hdps.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: No
 4 | SaveWorkspace: No
 5 | AlwaysSaveHistory: No
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: knitr
13 | LaTeX: pdfLaTeX
14 | 
15 | BuildType: Package
16 | PackageInstallArgs: --no-multiarch --with-keep.source
17 | PackageRoxygenize: rd,collate,namespace
18 | 


--------------------------------------------------------------------------------
/man/SL.hdps.generator.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/SL.hdps.R
 3 | \name{SL.hdps.generator}
 4 | \alias{SL.hdps.generator}
 5 | \title{SL.hdps.generator}
 6 | \usage{
 7 | SL.hdps.generator(out_name, dimension_names, predef_covar_names = c(),
 8 |   keep_k_total, ..., cvglmnet = FALSE, glmnet_args = if (cvglmnet) list()
 9 |   else list(lambda = 0))
10 | }
11 | \arguments{
12 | \item{out_name}{Name of the outcome variable.}
13 | 
14 | \item{dimension_names}{Dimension names of HDPS dimensions. See
15 | \code{\link{hdps_screen}}.}
16 | 
17 | \item{predef_covar_names}{Names of predefined covariates to be included in
18 | logistic regression model.}
19 | 
20 | \item{keep_k_total}{See \code{\link{hdps_screen}}.}
21 | 
22 | \item{...}{Other arguments passed to \code{\link{hdps_screen}}.}
23 | 
24 | \item{cvglmnet}{Use \code{glmnet} or \code{cv.glmnet} for fitting. Defaults to FALSE.}
25 | 
26 | \item{glmnet_args}{list of arguments to be passed to glmnet or cv.glmnet. If \code{cvglmnet=FALSE}, \code{glmnet_args}
27 | should be set such that calling \code{predict} on the \code{glmnet} object returns only one vector of predictions. 
28 | E.g. only one value of \code{lambda} should be set.}
29 | }
30 | \value{
31 | A SuperLearner wrapper function
32 | }
33 | \description{
34 | Generates a wrapper for SuperLearner using HDPS
35 | }
36 | \details{
37 | A HDPS candidate will generate covariates using \code{hdps_screen} from
38 | codes, and estimate the propensity score with logistic regression on
39 | generated covariates and predefined covariates.
40 | 
41 | To use HDPS in SuperLearner to estimate a propensity score, you need to
42 | include the outcome variable as a covariate where here outcome means the
43 | outcome of interest in the causal problem as opposed to the \code{Y}
44 | variable in SuperLearner. For non-HDPS candidates in SuperLearner, it's
45 | important to exclude the outcome variable via \code{\link[=screen]{screen.named}} or
46 | some other screening algorithm in order to avoid adjusting for something
47 | downstream on the causal pathway.
48 | }
49 | \author{
50 | Sam Lendle
51 | }
52 | 


--------------------------------------------------------------------------------
/man/assess_recurrence.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/assess_recurrence.R
 3 | \name{assess_recurrence}
 4 | \alias{assess_recurrence}
 5 | \title{assess_recurrence}
 6 | \usage{
 7 | assess_recurrence(covars, debug = FALSE)
 8 | }
 9 | \arguments{
10 | \item{covars}{a matrix or something that can be coerced with \code{\link[base]{as.matrix}} of covariates}
11 | 
12 | \item{debug}{Enables some debuging checks which slow things down, but may yield useful warnings or errors.}
13 | }
14 | \value{
15 | Expanded \code{covars} matrix.
16 | }
17 | \description{
18 | Expands covarites to up to three binary columns for at least one, sporadic, or frequent occrence of each covariate.
19 | }
20 | \details{
21 | Each column \code{x} of \code{covars} is expanded to three binary columns. 
22 | The first column indicates that the value of \code{x} is non-zero. 
23 | The second indicates that the value of \code{x} \eqn{\ge} the median of non-zero values of \code{x}.
24 | The third indicates that the value of \code{x} \eqn{\ge} the 75th percentile of non-zero values of \code{x}. 
25 | Non-unique columns per covariate are dropped.
26 | 
27 | Groups of columns of the returned matrix are in the same order of columns in \code{covars}.
28 | 
29 | If \code{covars} has column names, the returned matrix will have the same column names with suffexes \code{"_once"},
30 | \code{"_sporadic"}, and \code{"_frequent"} for the first, second, and third expanded columns, respectively.
31 | }
32 | \references{
33 | Schneeweiss, S., Rassen, J. A., Glynn, R. J., Avorn, J., Mogun,
34 | H., & Brookhart, M. A. (2009). High-dimensional propensity score adjustment
35 | in studies of treatment effects using health care claims data. \emph{Epidemiology
36 | (Cambridge, Mass.)}, 20(4), 512.
37 | }
38 | \author{
39 | Sam Lendle
40 | }
41 | 


--------------------------------------------------------------------------------
/man/hdps-package.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/hdps-package.R
 3 | \docType{package}
 4 | \name{hdps-package}
 5 | \alias{hdps-package}
 6 | \alias{hdps}
 7 | \title{High-dimensional propensity score algorithm}
 8 | \description{
 9 | The high-dimensional propensity score (HDPS) algorithm is a method for
10 | high-dimensional proxy adjustment in claims data. This package implements
11 | the variable transformation and variable selection parts of the
12 | algorithm.
13 | }
14 | \details{
15 | \tabular{ll}{
16 | Package: \tab hdps\cr
17 | Type: \tab Package\cr
18 | Version: \tab 0.1.6\cr
19 | Date: \tab 2017-08-16\cr
20 | License: \tab MIT \cr 
21 | }
22 | 
23 | This package implements part of step 2 (\code{\link{identify_covariates}}), 
24 | steps 3 (\code{\link{assess_recurrence}}) and 4 (\code{\link{prioritize_covariates}})
25 | of the HDPS algorithm (Schneeweiss et al., 2009).
26 | 
27 | The \code{\link{hdps_screen}} function is a wrapper function for \code{\link{identify_covariates}},
28 | \code{\link{assess_recurrence}},  and \code{\link{prioritize_covariates}}.
29 | }
30 | \examples{
31 | 
32 | #~~ simple examples of the most important functions ~~
33 | 
34 | }
35 | \references{
36 | Schneeweiss, S., Rassen, J. A., Glynn, R. J., Avorn, J., Mogun,
37 | H., & Brookhart, M. A. (2009). High-dimensional propensity score adjustment
38 | in studies of treatment effects using health care claims data. \emph{Epidemiology
39 | (Cambridge, Mass.)}, 20(4), 512.
40 | }
41 | \author{
42 | Sam Lendle
43 | 
44 | Maintainer: Sam Lendle <sam.lendle@gmail.com>
45 | }
46 | \keyword{package}
47 | 


--------------------------------------------------------------------------------
/man/hdps_screen.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/hdps_screen.R
  3 | \name{hdps_screen}
  4 | \alias{hdps_screen}
  5 | \title{hdps_screen}
  6 | \usage{
  7 | hdps_screen(outcome, treatment, covars, dimension_names = NULL,
  8 |   dimension_indexes = NULL, keep_n_per_dimension = 200,
  9 |   keep_k_total = 500, verbose = FALSE, debug = FALSE)
 10 | }
 11 | \arguments{
 12 | \item{outcome}{binary vector of outcomes}
 13 | 
 14 | \item{treatment}{binary vector of treatments}
 15 | 
 16 | \item{covars}{\code{matrix} or \code{data.frame} of binary covariates.}
 17 | 
 18 | \item{dimension_names}{A character vector of patterns to match against the column names of \code{covars} to split columns into dimension groups. See details.}
 19 | 
 20 | \item{dimension_indexes}{A list of vectors of column indexes corresponding to dimension groups. See details. Cannot be specified with \code{dimension_names}.}
 21 | 
 22 | \item{keep_n_per_dimension}{The maximum number of covariates to be kept per dimension by \code{\link{identify_covariates}}.}
 23 | 
 24 | \item{keep_k_total}{Total number of covariates to keep after expanding by \code{\link{assess_recurrence}} and ordering by \code{link{prioritize_covariates}}.}
 25 | 
 26 | \item{verbose}{Should verbose output be printed?}
 27 | 
 28 | \item{debug}{Enables some debuging checks which slow things down, but may yield useful warnings or errors.}
 29 | }
 30 | \value{
 31 | An object of class \code{hdps_covars}
 32 | }
 33 | \description{
 34 | The \code{hdps_screen} function performs part of step 2 (\code{\link{identify_covariates}}), 
 35 | steps 3 (\code{\link{assess_recurrence}}) and 4 (\code{\link{prioritize_covariates}})
 36 | of the HDPS algorithm (Schneeweiss et al., 2009).
 37 | }
 38 | \details{
 39 | The \code{hdps_screen} function performs part of step 2 (\code{\link{identify_covariates}}), 
 40 | steps 3 (\code{\link{assess_recurrence}}) and 4 (\code{\link{prioritize_covariates}})
 41 | of the HDPS algorithm (Schneeweiss et al., 2009).
 42 | 
 43 | \emph{Step 2.} Columns of \code{covars} are split by data dimension (as defined in Schneeweiss et al. (2009)) and
 44 | filtered by \code{\link{identify_covariates}}.
 45 | 
 46 | Dimensions can be specified in two ways.
 47 | If \code{dimension_names} is used, the \code{colnames(covars)} is \code{\link[base]{grep}}ed for each value of 
 48 | \code{dimension_names}.
 49 | If some column names match more than one pattern, an error is thrown.
 50 | If some column names are not matched by any pattern, a warning is issued and those columns are ignored.
 51 | For example, suppose the column names of \code{covars} are \code{c("drug_1", "drug_2", "proc_1", "proc_2")}.
 52 | \code{dimension_names <- c("drug", "proc")} would split \code{covars} into two dimensions,
 53 | one for \code{drug}s and one for \code{proc}s.
 54 | 
 55 | Dimensions can also be specified by \code{dimension_indexes} which should contain a list of either column
 56 | indexes or column names for each dimension.
 57 | 
 58 | If neither \code{dimension_names} nor \code{dimension_indexes} is specified, all covariates are treated as one dimension.
 59 | 
 60 | \emph{Step 3.} After filtering, remaining covariates are expanded by \code{\link{assess_recurrence}}.
 61 | 
 62 | If at this point, the number of expanded covariates is less than \code{keep_k_total}, all expanded covariates are returned.
 63 | 
 64 | \emph{Step 4.} Expanded covariates are ordered with \code{\link{prioritize_covariates}}.
 65 | 
 66 | \emph{Step 5.} Step 5 can be performed with \code{\link{predict.hdps_covars}}.
 67 | }
 68 | \examples{
 69 | set.seed(123)
 70 | n <- 1000
 71 | p <- 10000
 72 | out <- rbinom(n, 1, 0.05)
 73 | trt <- rbinom(n, 1, 0.5)
 74 | covars <- matrix(rbinom(n*p, 3, 0.05), n)
 75 | colnames(covars) <- c(paste("drug", 1:(p/2), sep="_"),
 76 |                       paste("proc", 1:(p/2), sep="_"))
 77 | 
 78 | dimension_names <- c("drug", "proc")
 79 | 
 80 | screened_covars_fit <- hdps_screen(out, trt, covars, 
 81 |                                    dimension_names = dimension_names,
 82 |                                    keep_n_per_dimension = 400,
 83 |                                    keep_k_total = 200,
 84 |                                    verbose=TRUE)
 85 |                                    
 86 | screened_covars <- predict(screened_covars_fit)
 87 | 
 88 | }
 89 | \references{
 90 | Schneeweiss, S., Rassen, J. A., Glynn, R. J., Avorn, J., Mogun,
 91 | H., & Brookhart, M. A. (2009). High-dimensional propensity score adjustment
 92 | in studies of treatment effects using health care claims data. \emph{Epidemiology
 93 | (Cambridge, Mass.)}, 20(4), 512.
 94 | }
 95 | \seealso{
 96 | \code{\link{predict.hdps_covars}}
 97 | }
 98 | \author{
 99 | Sam Lendle
100 | }
101 | 


--------------------------------------------------------------------------------
/man/identify_covariates.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/identify_covariates.R
 3 | \name{identify_covariates}
 4 | \alias{identify_covariates}
 5 | \title{identify_covariates}
 6 | \usage{
 7 | identify_covariates(covars, keep_n_covars = 200, indexes = FALSE)
 8 | }
 9 | \arguments{
10 | \item{covars}{a matrix or something that can be coerced with \code{\link[base]{as.matrix}} of covariates}
11 | 
12 | \item{keep_n_covars}{number of covariates to keep}
13 | 
14 | \item{indexes}{Should indexes be returned? Or a subset of \code{covars}. Defaults to \code{FALSE}.}
15 | }
16 | \value{
17 | Indexes of identified columns or a subset of \code{covars}
18 | }
19 | \description{
20 | Given a matrix of covarites, \code{identify_covariates} returns the top \code{keep_n_covars} or the indexes of those columns.
21 | }
22 | \details{
23 | Columns are sorted in descending order of \code{min(prevalence, 1-prevalence)}  where \code{prevalence} is the the proportion of
24 | non-zero values in a given column.
25 | 
26 | If \code{indexes==TRUE}, a vector of the top \code{keep_n_covars} column indexes is returned.
27 | 
28 | If \code{indexes==FALSE}, a matrix of covariates is returned whos columns are the top \code{keep_n_covars} colums of
29 | \code{covars}. Columns are in their original order.
30 | If also \code{keep_n_covars >= ncol(covar)}, then the function returns immediately without ranking columns in terms of
31 | prevalence as it is unecessary.
32 | 
33 | \strong{Differences from Schneeweiss et al. (2009):} 
34 | \itemize{
35 |   \item{Covariates that have fewer than 100 non-zero values are not automatically dropped. 
36 |   If typical covariates tend to have more than 100 non-zero values will typically be ranked higher than those with fewer than 100
37 |   automatically.}
38 | }
39 | }
40 | \references{
41 | Schneeweiss, S., Rassen, J. A., Glynn, R. J., Avorn, J., Mogun,
42 | H., & Brookhart, M. A. (2009). High-dimensional propensity score adjustment
43 | in studies of treatment effects using health care claims data. \emph{Epidemiology
44 | (Cambridge, Mass.)}, 20(4), 512.
45 | }
46 | \author{
47 | Sam Lendle
48 | }
49 | 


--------------------------------------------------------------------------------
/man/predict.SL_hdps.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/SL.hdps.R
 3 | \name{predict.SL_hdps}
 4 | \alias{predict.SL_hdps}
 5 | \title{Get predictions from SL_hdps wrapper}
 6 | \usage{
 7 | \method{predict}{SL_hdps}(object, newdata, ...)
 8 | }
 9 | \arguments{
10 | \item{object}{object of class \code{SL_hdps}}
11 | 
12 | \item{newdata}{a matrix of covariates to predict from}
13 | 
14 | \item{...}{ignored}
15 | }
16 | \value{
17 | vector of predictions
18 | }
19 | \description{
20 | Get predictions from SL_hdps wrapper
21 | }
22 | \author{
23 | Sam Lendle
24 | }
25 | 


--------------------------------------------------------------------------------
/man/predict.hdps_covars.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/hdps_screen.R
 3 | \name{predict.hdps_covars}
 4 | \alias{predict.hdps_covars}
 5 | \title{Get matrix of hdps selected covariates}
 6 | \usage{
 7 | \method{predict}{hdps_covars}(object, newdata = NULL, keep_k_total, ...)
 8 | }
 9 | \arguments{
10 | \item{object}{object of class \code{hdps_covars}}
11 | 
12 | \item{newdata}{\code{NULL}, or a matrix who's columns have names corresponding to those selected by hdps in \code{object}. 
13 | If \code{NULL} selected covariates from original matrix used in the screening step are returned.}
14 | 
15 | \item{keep_k_total}{change \code{keep_k_total} from the original call to \code{\link{hdps_screen}}}
16 | 
17 | \item{...}{ignored}
18 | }
19 | \value{
20 | A matrix of hdps selected covariates
21 | }
22 | \description{
23 | returns the matix of covariates based on an hdps screening
24 | }
25 | \seealso{
26 | \link{hdps_screen}
27 | }
28 | \author{
29 | Sam Lendle
30 | }
31 | 


--------------------------------------------------------------------------------
/man/prioritize_covariates.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/prioritize_covariates.R
 3 | \name{prioritize_covariates}
 4 | \alias{prioritize_covariates}
 5 | \title{prioritize_covariates}
 6 | \usage{
 7 | prioritize_covariates(outcome, treatment, covars, return_bias = FALSE,
 8 |   keep_NaNs = FALSE)
 9 | }
10 | \arguments{
11 | \item{outcome}{binary vector of outcomes}
12 | 
13 | \item{treatment}{binary vector of treatments}
14 | 
15 | \item{covars}{\code{matrix} or \code{data.frame} of binary covariates.}
16 | 
17 | \item{return_bias}{Should the calculated multiplitive biases be returned with the ordered indexes? Defaults to \code{FALSE}}
18 | 
19 | \item{keep_NaNs}{If the calculated multiplicitive bias for a covariate is \code{NaN}, should its index be included in the returned vector? Defaults to \code{FALSE}}
20 | }
21 | \value{
22 | Vector of column indexes of \code{covars}.
23 | }
24 | \description{
25 | Returns the indexes of columns \code{covars} ordered by the possible amount of counfounding each column could adjust for.
26 | }
27 | \details{
28 | For each covariate in \code{covars}, the potential multiplicitive bias is calculated as described in Schneeweiss et al. (2009).
29 | The column indexes are then put in descending order of the absolute value of the log of the multiplicitive bias.
30 | 
31 | If \code{return_bias==TRUE}, the returned vector of indexes includes the multiplicitive biases sorted in the same order and stored in an attribute called \code{"bias_m"}.
32 | 
33 | If \code{outcome} has no variation for a particular value of a covariate, then the multiplicitive bias is calculated as \code{NaN}.
34 | If \code{keep_NaNs==FALSE}, then the column indexes of such covariates are not included in the returned vector.
35 | }
36 | \references{
37 | Schneeweiss, S., Rassen, J. A., Glynn, R. J., Avorn, J., Mogun,
38 | H., & Brookhart, M. A. (2009). High-dimensional propensity score adjustment
39 | in studies of treatment effects using health care claims data. \emph{Epidemiology
40 | (Cambridge, Mass.)}, 20(4), 512.
41 | }
42 | \author{
43 | Sam Lendle
44 | }
45 | 


--------------------------------------------------------------------------------
/man/screen.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/SL.hdps.R
 3 | \name{screen}
 4 | \alias{screen}
 5 | \alias{screen.names}
 6 | \alias{screen.excludenames}
 7 | \title{SuperLearner screening wrappers}
 8 | \usage{
 9 | screen.names(names)
10 | 
11 | screen.excludenames(names)
12 | }
13 | \arguments{
14 | \item{names}{Names to be included or excluded
15 | 
16 | These functions generate simple screening wrappers for SuperLearner to 
17 | include or exclude variables based on \code{names}. This is is helpful 
18 | because in order to use HDPS as a candidate in SuperLearner, you need to 
19 | include the study outcome variable as a covariate. But to use a non-HDPS 
20 | algorithm, (say a random forest on some specified set of covariates,) as a
21 | candidate as well, you want to make sure you're not adjusting for the 
22 | outcome which is downstream from treatment on the causal pathway.
23 | 
24 | See documentation for the SuperLearner package for more about screening algorithms.}
25 | }
26 | \description{
27 | Functions to set up screening wrappers for SuperLearner
28 | }
29 | \examples{
30 | 
31 | screen.predefined <- screen.names(c("names", "of", "predefined",
32 | "covariates", "that", "definitely", "dont", "include", "the", "outcome"))
33 | 
34 | screen.notoutcome <- screen.excludenames(c("outcome_variable_name",
35 | "and", "other", "covariates", "to", "exclude"))
36 | 
37 | }
38 | 


--------------------------------------------------------------------------------
/src/RcppExports.cpp:
--------------------------------------------------------------------------------
 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand
 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 3 | 
 4 | #include <Rcpp.h>
 5 | 
 6 | using namespace Rcpp;
 7 | 
 8 | // calc_rr_cds
 9 | NumericVector calc_rr_cds(NumericVector outcome, NumericMatrix covars);
10 | RcppExport SEXP _hdps_calc_rr_cds(SEXP outcomeSEXP, SEXP covarsSEXP) {
11 | BEGIN_RCPP
12 |     Rcpp::RObject rcpp_result_gen;
13 |     Rcpp::RNGScope rcpp_rngScope_gen;
14 |     Rcpp::traits::input_parameter< NumericVector >::type outcome(outcomeSEXP);
15 |     Rcpp::traits::input_parameter< NumericMatrix >::type covars(covarsSEXP);
16 |     rcpp_result_gen = Rcpp::wrap(calc_rr_cds(outcome, covars));
17 |     return rcpp_result_gen;
18 | END_RCPP
19 | }
20 | // colPrevScores
21 | NumericVector colPrevScores(NumericMatrix x);
22 | RcppExport SEXP _hdps_colPrevScores(SEXP xSEXP) {
23 | BEGIN_RCPP
24 |     Rcpp::RObject rcpp_result_gen;
25 |     Rcpp::RNGScope rcpp_rngScope_gen;
26 |     Rcpp::traits::input_parameter< NumericMatrix >::type x(xSEXP);
27 |     rcpp_result_gen = Rcpp::wrap(colPrevScores(x));
28 |     return rcpp_result_gen;
29 | END_RCPP
30 | }
31 | // colVars
32 | NumericVector colVars(NumericMatrix x);
33 | RcppExport SEXP _hdps_colVars(SEXP xSEXP) {
34 | BEGIN_RCPP
35 |     Rcpp::RObject rcpp_result_gen;
36 |     Rcpp::RNGScope rcpp_rngScope_gen;
37 |     Rcpp::traits::input_parameter< NumericMatrix >::type x(xSEXP);
38 |     rcpp_result_gen = Rcpp::wrap(colVars(x));
39 |     return rcpp_result_gen;
40 | END_RCPP
41 | }
42 | 
43 | static const R_CallMethodDef CallEntries[] = {
44 |     {"_hdps_calc_rr_cds", (DL_FUNC) &_hdps_calc_rr_cds, 2},
45 |     {"_hdps_colPrevScores", (DL_FUNC) &_hdps_colPrevScores, 1},
46 |     {"_hdps_colVars", (DL_FUNC) &_hdps_colVars, 1},
47 |     {NULL, NULL, 0}
48 | };
49 | 
50 | RcppExport void R_init_hdps(DllInfo *dll) {
51 |     R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
52 |     R_useDynamicSymbols(dll, FALSE);
53 | }
54 | 


--------------------------------------------------------------------------------
/src/calc_rr_cds.cpp:
--------------------------------------------------------------------------------
 1 | #include <Rcpp.h>
 2 | using namespace Rcpp;
 3 | 
 4 | // Below is a simple example of exporting a C++ function to R. You can
 5 | // source this function into an R session using the Rcpp::sourceCpp 
 6 | // function (or via the Source button on the editor toolbar)
 7 | 
 8 | // For more on using Rcpp click the Help button on the editor toolbar
 9 | 
10 | // [[Rcpp::export]]
11 | NumericVector calc_rr_cds(NumericVector outcome, NumericMatrix covars) {
12 |   int nrow = covars.nrow(), ncol = covars.ncol();
13 |   if (outcome.length() != nrow) {
14 |     stop("length of outcome should be the same as the number of rows in covars");
15 |   }
16 |   
17 |   NumericVector out(ncol);
18 |   out.attr("names") = colnames(covars);
19 |   
20 |   for (int j = 0; j < ncol; j++) {
21 |     double outcomes1 = 0;
22 |     double outcomes0 = 0;
23 |     double n1 = 0;
24 |     double n0 = 0;
25 |         
26 |     for (int i = 0; i < nrow; i++) {
27 |       double covar = covars(i,j);
28 |       if (covar == 0.0) {
29 |         n0 += 1;
30 |         outcomes0 += outcome(i);
31 |       } else {
32 |         n1 += 1;
33 |         outcomes1 += outcome(i);
34 |       }
35 |     }
36 |     
37 |     double prev1 = outcomes1/n1;
38 |     double prev0 = outcomes0/n0;
39 |     
40 |     double rr = prev1/prev0;
41 |     out(j) = rr;
42 |   }
43 |   return out;
44 | }


--------------------------------------------------------------------------------
/src/colPrevScores.cpp:
--------------------------------------------------------------------------------
 1 | #include <Rcpp.h>
 2 | using namespace Rcpp;
 3 | 
 4 | // [[Rcpp::export]]
 5 | NumericVector colPrevScores(NumericMatrix x) {
 6 |   int nrow = x.nrow(), ncol = x.ncol();
 7 |   NumericVector out(ncol);
 8 |   
 9 |   for (int j = 0; j < ncol; j++) {
10 |     int num_non_zero = 0;
11 |     
12 |     for (int i = 0; i < nrow; i++) {
13 |       num_non_zero += x(i,j) > 0.0 ? 1 : 0;
14 |     }
15 |     
16 |     double prev = (double) num_non_zero/nrow;
17 |     
18 |     out(j) = std::min(prev, 1.0-prev);
19 |   }
20 |   
21 |   return out;
22 | }
23 | 


--------------------------------------------------------------------------------
/src/colVars.cpp:
--------------------------------------------------------------------------------
 1 | #include <Rcpp.h>
 2 | using namespace Rcpp;
 3 | 
 4 | // [[Rcpp::export]]
 5 | NumericVector colVars(NumericMatrix x) {
 6 |   int nrow = x.nrow(), ncol = x.ncol();
 7 |   NumericVector out(ncol);
 8 |   
 9 |   for (int j = 0; j < ncol; j++) {
10 |     double mean = 0;
11 |     double M2 = 0;
12 |     int n;
13 |     double delta, xx;
14 |     
15 |     for (int i = 0; i < nrow; i++) {
16 |       n = i+1;
17 |       xx = x(i,j);
18 |       delta = xx - mean;
19 |       mean += delta/n;
20 |       M2 = M2 + delta*(xx-mean);
21 |     }
22 |     
23 |     out(j) = M2/(n-1);
24 |   }
25 |   
26 |   return out;
27 | }
28 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(hdps)
3 | 
4 | test_check("hdps")
5 | 


--------------------------------------------------------------------------------
/tests/testthat/test_SL.hdps.R:
--------------------------------------------------------------------------------
 1 | # well I tried testing this, but seems to not work because of the way SL searches for library functions and scoping. 
 2 | # you can run the code inside the test_that block, and it works, but i'm commenting out for now because it won't run inside
 3 | # that block.
 4 | 
 5 | context("SL.hdps")
 6 | 
 7 | test_that("SL.hdps.generator works with glmnet and cv.glmnet", {
 8 |   expect_true(TRUE)
 9 | #   set.seed(123)
10 | #   n <- 50
11 | #   p <- 500
12 | #   out <- rbinom(n, 1, 0.2)
13 | #   trt <- rbinom(n, 1, 0.5)
14 | #   covars <- matrix(rbinom(n*p, 3, 0.05), n)
15 | #   colnames(covars) <- c(paste("drug", 1:(p/2), sep="_"),
16 | #                         paste("proc", 1:(p/2), sep="_"))
17 | #   
18 | #   dimension_names <- c("drug", "proc")
19 | #   
20 | # 
21 | #   SL.library <- c("SL.hdps.50", "SL.hdps.50.cv")
22 | #   
23 | #   library(SuperLearner)
24 | # 
25 | #   slfit <- SuperLearner(trt, X = data.frame(OUTCOME=out, covars), family=binomial, SL.library=c("SL.hdps.50", "SL.hdps.50.cv"), cvControl = list(V=3))
26 | #   
27 | #   expect_is(slfit, "SuperLearner")
28 | #   expect_is(slfit$fitLibrary$SL.hdps.50_All$glmnet_fit, "glmnet")
29 | #   expect_is(slfit$fitLibrary$SL.hdps.50.cv_All$glmnet_fit, "cv.glmnet")
30 | })
31 | 


--------------------------------------------------------------------------------
/tests/testthat/test_assess_recurrence.R:
--------------------------------------------------------------------------------
 1 | context("assess_recurrence")
 2 | 
 3 | test_that("get_quantiles and column_recurrence work when x is binary", {
 4 |   x <- c(0,0,0,0,1,1,1)
 5 |   mat <- matrix(x, length(x), 1)
 6 |   colnames(mat) = c("_once")
 7 |   cr <- column_recurrence(x, get_quantiles(x))
 8 |   expect_equal(cr$mat, mat)
 9 |   expect_equal(colnames(cr$mat), sapply(cr$quants, `[[`, "q"))
10 | })
11 | 
12 | test_that("get_quantiles and column_recurrence work when 50th and 75th %ile are the same", {
13 |   x <- c(0,0,0,0,1,1,1,3,3,3,3,3)
14 |   mat <- cbind(as.numeric(x>0), as.numeric(x >=3))
15 |   colnames(mat) = c("_once", "_sporadic")
16 |   cr <- column_recurrence(x, get_quantiles(x))
17 |   expect_equal(cr$mat, mat)
18 |   expect_equal(dim(cr$mat)[2], length(cr$quants))
19 |   expect_equal(colnames(cr$mat), sapply(cr$quants, `[[`, "q"))
20 | })
21 | 
22 | test_that("get_quantiles and column_recurrence work when 50th %ile is 1", {
23 |   x <- c(0,0,0,0,1,1,1,1,1,3,3,3)
24 |   mat <- cbind(as.numeric(x>0), as.numeric(x >=3))
25 |   colnames(mat) = c("_once", "_frequent")
26 |   cr <- column_recurrence(x, get_quantiles(x))
27 |   expect_equal(cr$mat, mat)
28 |   expect_equal(dim(cr$mat)[2], length(cr$quants))
29 |   expect_equal(colnames(cr$mat), sapply(cr$quants, `[[`, "q"))
30 | })
31 | 
32 | test_that("get_quantiles and column_recurrence work when 50th %ile is 1.5 and 75th is 2", {
33 |   x <- c(0,0,0,0,1,1,1,1,1,2,2,2,2,2)
34 |   mat <- cbind(as.numeric(x>0), as.numeric(x >1))
35 |   colnames(mat) = c("_once", "_sporadic")
36 |   cr <- column_recurrence(x, get_quantiles(x))
37 |   expect_equal(cr$mat, mat)
38 |   expect_equal(dim(cr$mat)[2], length(cr$quants))
39 |   expect_equal(colnames(cr$mat), sapply(cr$quants, `[[`, "q"))
40 | })
41 | 
42 | test_that("get_quantiles and column_recurrence work when min is 50th %ile", {
43 |   x <- c(0,0,0,0,2,2,2,2,2,2,2,2, 3,3)
44 |   mat <- cbind(as.numeric(x>0))
45 |   colnames(mat) = c("_once")
46 |   cr <- column_recurrence(x, get_quantiles(x))
47 |   expect_equal(cr$mat, mat)
48 |   expect_equal(dim(cr$mat)[2], length(cr$quants))
49 |   expect_equal(colnames(cr$mat), sapply(cr$quants, `[[`, "q"))
50 | })
51 | 
52 | test_that("dups are taken care of when quantiles are ints but not in the data", {
53 |   x <- c(0,0, 5, 1, 5, 5, 3, 7, 2, 5, 5, 3, 1, 3)
54 |   mat <- cbind(as.numeric(x>0), x > 4)
55 |   cr <- column_recurrence(x, get_quantiles(x), warndup=TRUE)
56 |   colnames(mat) = c("_once", "_sporadic")
57 |   expect_equal(cr$mat, mat)
58 |   
59 | })
60 | 
61 | 
62 | test_that("assess_recurrence works", {
63 |   dat <- cbind(a=c(0,0,0,0,1,1,1,3,3,3,3,3),
64 |                b=c(0,0,0,0,1,1,1,1,1,3,3,3))
65 |   mat <- cbind(as.numeric(dat[ ,1]>0), as.numeric(dat[ ,1] >=3),
66 |                as.numeric(dat[ ,2]>0), as.numeric(dat[ ,2] >=3))
67 |   colnames(mat) <- c("a_once", "a_sporadic", "b_once", "b_frequent")
68 |   #quants <- list(a=c("_once"=1, "_sporadic"=3), b=c("_once"=1, "_frequent"=3))
69 |   quants <- list(list(varname="a", q="_once", count=1),
70 |                  list(varname="a", q="_sporadic", count=3),
71 |                  list(varname="b", q="_once", count=1),
72 |                  list(varname="b", q="_frequent", count=3))
73 |   ar <- assess_recurrence(dat)
74 |   expect_equal(ar[["mat"]], mat)
75 |   expect_equal(ar[["quants"]], quants)
76 | })
77 | 
78 | 


--------------------------------------------------------------------------------
/tests/testthat/test_calc_rr_cds.R:
--------------------------------------------------------------------------------
 1 | 
 2 | context("calc_rr_cds")
 3 | 
 4 | test_that("calc_rr_cds c++ function works the same way as the all R version", {
 5 |   calc_rr_cd <- function(outcome, covar) {
 6 |     prevs <- by(outcome, covar, mean)
 7 |     (prevs[2])/(prevs[1])
 8 |   }
 9 |   
10 |   set.seed(123)
11 |   n <- 100
12 |   p <- 1000
13 |   out <- rbinom(n, 1, 0.2)
14 |   covars <- matrix(rbinom(n*p, 1, 0.05), n)
15 |   colnames(covars) <- c(paste("drug", 1:(p/2), sep="_"),
16 |                         paste("proc", 1:(p/2), sep="_"))
17 |   
18 |   #make outcome 0 for all obs with covar 1 = 1, so first RR = Inf
19 |   out[covars[, 1]==1] <- 0 
20 |   
21 |   rr_cds <-  calc_rr_cds(out, covars)
22 |   
23 |   expect_equal(rr_cds, apply(covars, 2, calc_rr_cd, outcome=out))
24 |   expect_equal(rr_cds[1], 0, check.names=FALSE)
25 | })
26 | 


--------------------------------------------------------------------------------
/tests/testthat/test_hdps_screen.R:
--------------------------------------------------------------------------------
 1 | context("hdps_screen")
 2 | 
 3 | test_that("works on data.frames", {
 4 |   set.seed(123)
 5 |   n <- 100
 6 |   p <- 1000
 7 |   out <- rbinom(n, 1, 0.2)
 8 |   trt <- rbinom(n, 1, 0.5)
 9 |   covars <- matrix(rbinom(n*p, 3, 0.05), n)
10 |   colnames(covars) <- c(paste("drug", 1:(p/2), sep="_"),
11 |                         paste("proc", 1:(p/2), sep="_"))
12 |   
13 |   dimension_names <- c("drug", "proc")
14 |   
15 |   screened_covars_fit <- hdps_screen(out, trt, as.data.frame(covars), 
16 |                                      dimension_names = dimension_names,
17 |                                      keep_n_per_dimension = 200,
18 |                                      keep_k_total = 100,
19 |                                      verbose=FALSE)
20 |   
21 |   screened_covars <- predict(screened_covars_fit)
22 |   expect_equal(screened_covars, predict(screened_covars_fit, newdata=covars))
23 | })
24 | 


--------------------------------------------------------------------------------
/tests/testthat/test_predict.R:
--------------------------------------------------------------------------------
 1 | context("predict")
 2 | 
 3 | test_that("predict returns the same matrix when newdata is the same as the original data", {
 4 |   set.seed(123)
 5 |   n <- 100
 6 |   p <- 1000
 7 |   out <- rbinom(n, 1, 0.2)
 8 |   trt <- rbinom(n, 1, 0.5)
 9 |   covars <- matrix(rbinom(n*p, 3, 0.05), n)
10 |   colnames(covars) <- c(paste("drug", 1:(p/2), sep="_"),
11 |                        paste("proc", 1:(p/2), sep="_"))
12 |   
13 |   dimension_names <- c("drug", "proc")
14 | 
15 |   screened_covars_fit <- hdps_screen(out, trt, covars, 
16 |                                      dimension_names = dimension_names,
17 |                                      keep_n_per_dimension = 200,
18 |                                      keep_k_total = 100,
19 |                                      verbose=FALSE)
20 |                                      
21 |   screened_covars <- predict(screened_covars_fit)
22 |   expect_equal(screened_covars, predict(screened_covars_fit, newdata=covars))
23 |   expect_equal(screened_covars, predict(screened_covars_fit, newdata=as.data.frame(covars)))
24 | })
25 | 


--------------------------------------------------------------------------------