├── .Rbuildignore ├── .gitignore ├── .travis.yml ├── DESCRIPTION ├── LICENSE ├── NAMESPACE ├── R ├── RcppExports.R ├── SL.hdps.R ├── assess_recurrence.R ├── hdps-package.R ├── hdps_screen.R ├── identify_covariates.R ├── prioritize_covariates.R └── utils.R ├── README.md ├── hdps.Rproj ├── man ├── SL.hdps.generator.Rd ├── assess_recurrence.Rd ├── hdps-package.Rd ├── hdps_screen.Rd ├── identify_covariates.Rd ├── predict.SL_hdps.Rd ├── predict.hdps_covars.Rd ├── prioritize_covariates.Rd └── screen.Rd ├── src ├── RcppExports.cpp ├── calc_rr_cds.cpp ├── colPrevScores.cpp └── colVars.cpp └── tests ├── testthat.R └── testthat ├── test_SL.hdps.R ├── test_assess_recurrence.R ├── test_calc_rr_cds.R ├── test_hdps_screen.R └── test_predict.R /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | .travis.yml 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | *.o 5 | *.so 6 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: r 2 | warnings_are_errors: true 3 | r_binary_packages: 4 | - testthat 5 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: hdps 2 | Type: Package 3 | Title: High-dimensional propensity score algorithm 4 | Version: 0.1.6 5 | Date: 2017-08-16 6 | Author: Sam Lendle 7 | Maintainer: Sam Lendle 8 | Description: The high-dimensional propensity score algorithm is a method for 9 | high-dimensional proxy adjustment in claims data. This package implements 10 | the variable transformation and variable selection parts of the 11 | algorithm. 12 | License: MIT + file LICENSE 13 | Imports: 14 | Rcpp (>= 0.11.1), 15 | Matrix, 16 | glmnet 17 | LinkingTo: Rcpp 18 | Suggests: 19 | testthat 20 | RoxygenNote: 6.0.1 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2017 2 | COPYRIGHT HOLDER: Sam Lendle -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(predict,SL_hdps) 4 | S3method(predict,hdps_covars) 5 | export(SL.hdps.generator) 6 | export(assess_recurrence) 7 | export(hdps_screen) 8 | export(identify_covariates) 9 | export(prioritize_covariates) 10 | export(screen.excludenames) 11 | export(screen.names) 12 | importFrom(Matrix,sparse.model.matrix) 13 | importFrom(Rcpp,evalCpp) 14 | importFrom(glmnet,cv.glmnet) 15 | importFrom(glmnet,glmnet) 16 | importFrom(stats,cor) 17 | importFrom(stats,predict) 18 | importFrom(stats,quantile) 19 | useDynLib(hdps) 20 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | calc_rr_cds <- function(outcome, covars) { 5 | .Call('_hdps_calc_rr_cds', PACKAGE = 'hdps', outcome, covars) 6 | } 7 | 8 | colPrevScores <- function(x) { 9 | .Call('_hdps_colPrevScores', PACKAGE = 'hdps', x) 10 | } 11 | 12 | colVars <- function(x) { 13 | .Call('_hdps_colVars', PACKAGE = 'hdps', x) 14 | } 15 | 16 | -------------------------------------------------------------------------------- /R/SL.hdps.R: -------------------------------------------------------------------------------- 1 | ##' Generates a wrapper for SuperLearner using HDPS 2 | ##' 3 | ##' A HDPS candidate will generate covariates using \code{hdps_screen} from 4 | ##' codes, and estimate the propensity score with logistic regression on 5 | ##' generated covariates and predefined covariates. 6 | ##' 7 | ##' To use HDPS in SuperLearner to estimate a propensity score, you need to 8 | ##' include the outcome variable as a covariate where here outcome means the 9 | ##' outcome of interest in the causal problem as opposed to the \code{Y} 10 | ##' variable in SuperLearner. For non-HDPS candidates in SuperLearner, it's 11 | ##' important to exclude the outcome variable via \code{\link[=screen]{screen.named}} or 12 | ##' some other screening algorithm in order to avoid adjusting for something 13 | ##' downstream on the causal pathway. 14 | ##' @title SL.hdps.generator 15 | ##' @param out_name Name of the outcome variable. 16 | ##' @param dimension_names Dimension names of HDPS dimensions. See 17 | ##' \code{\link{hdps_screen}}. 18 | ##' @param predef_covar_names Names of predefined covariates to be included in 19 | ##' logistic regression model. 20 | ##' @param keep_k_total See \code{\link{hdps_screen}}. 21 | ##' @param ... Other arguments passed to \code{\link{hdps_screen}}. 22 | ##' @param cvglmnet Use \code{glmnet} or \code{cv.glmnet} for fitting. Defaults to FALSE. 23 | ##' @param glmnet_args list of arguments to be passed to glmnet or cv.glmnet. If \code{cvglmnet=FALSE}, \code{glmnet_args} 24 | ##' should be set such that calling \code{predict} on the \code{glmnet} object returns only one vector of predictions. 25 | ##' E.g. only one value of \code{lambda} should be set. 26 | ##' @return A SuperLearner wrapper function 27 | ##' @author Sam Lendle 28 | ##' @importFrom Matrix sparse.model.matrix 29 | ##' @importFrom glmnet glmnet 30 | ##' @importFrom glmnet cv.glmnet 31 | ##' @importFrom stats cor 32 | ##' @export 33 | SL.hdps.generator <- function(out_name, dimension_names, predef_covar_names=c(), keep_k_total, ..., 34 | cvglmnet=FALSE, glmnet_args=if (cvglmnet) list() else list(lambda=0)) { 35 | function(Y, X, newX, family, obsWeights, id) { 36 | if (missing(newX)) { 37 | newX <- X 38 | } 39 | if(family$family == 'gaussian') { 40 | stop("SL.hdps only for binomial") 41 | } 42 | 43 | hdps_fit <- hdps_screen(X[, out_name], Y, X, dimension_names, keep_k_total=keep_k_total, ...) 44 | 45 | predef_covars <- X[, predef_covar_names] 46 | if (keep_k_total > 0) { 47 | hdps_covars <- predict(hdps_fit) 48 | hdps_keep <- colnames(hdps_covars)[abs(cor(Y, hdps_covars)) <= 0.95] 49 | hdps_covars <- hdps_covars[, hdps_keep] 50 | df = as.data.frame(cbind(predef_covars, hdps_covars)) 51 | } else { 52 | hdps_keep <- NULL 53 | df = as.data.frame(predef_covars) 54 | } 55 | 56 | smm <- sparse.model.matrix(~.-1, df) 57 | 58 | myglmnet <- function(...) if (cvglmnet) 59 | cv.glmnet(smm, Y, family="binomial") else 60 | glmnet(smm, Y, family="binomial", ...) 61 | glmnet_fit <- do.call(myglmnet, glmnet_args) 62 | 63 | if (identical(X, newX)) { 64 | smmnew <- smm 65 | } else { 66 | 67 | new_predef_covars <- newX[, predef_covar_names] 68 | if (keep_k_total > 0) { 69 | new_hdps_covars <- predict(hdps_fit, newdata=newX) 70 | new_hdps_covars <- new_hdps_covars[, hdps_keep] 71 | new_df = as.data.frame(cbind(new_predef_covars, new_hdps_covars)) 72 | } else { 73 | new_df = as.data.frame(new_predef_covars) 74 | } 75 | 76 | smmnew <- sparse.model.matrix(~.-1, new_df) 77 | } 78 | 79 | pred <- predict(glmnet_fit, smmnew, type="response") 80 | if (ncol(pred) != 1) stop("Check cvglmnet and glmnet_args arguments to insure that predict returns only one column") 81 | 82 | 83 | # fit returns all objects needed for predict.SL.template 84 | fit <- list(glmnet_fit = glmnet_fit, hdps_fit = hdps_fit, 85 | predef_covar_names=predef_covar_names, hdps_keep=hdps_keep, keep_k_total=keep_k_total) 86 | # declare class of fit for predict.SL.template 87 | class(fit) <- 'SL_hdps' 88 | # return a list with pred and fit 89 | out <- list(pred = pred, fit = fit) 90 | return(out) 91 | } 92 | } 93 | 94 | #' Get predictions from SL_hdps wrapper 95 | #' 96 | #' @title Get predictions from SL_hdps wrapper 97 | #' @param object object of class \code{SL_hdps} 98 | #' @param newdata a matrix of covariates to predict from 99 | #' @param ... ignored 100 | #' @return vector of predictions 101 | #' @author Sam Lendle 102 | #' @importFrom stats predict 103 | #' @export 104 | predict.SL_hdps <- function(object, newdata, ...){ 105 | new_predef_covars <- newdata[, object$predef_covar_names] 106 | new_hdps_covars <- predict(object$hdps_fit, newdata=newdata, keep_k_total=object$keep_k_total) 107 | new_hdps_covars <- new_hdps_covars[, object$hdps_keep] 108 | new_df <- cbind(new_predef_covars, new_hdps_covars) 109 | smmnew <- sparse.model.matrix(~.-1, new_df) 110 | pred <- predict(object$glmnet_fit, smmnew, type = "response") 111 | if (ncol(pred) != 1) stop("Check cvglmnet and glmnet_args arguments to insure that predict returns only one column") 112 | pred 113 | } 114 | 115 | ##' @name screen 116 | ##' @rdname screen 117 | ##' 118 | ##' @title SuperLearner screening wrappers 119 | ##' 120 | ##' @description Functions to set up screening wrappers for SuperLearner 121 | ##' 122 | ##' @param names Names to be included or excluded 123 | ##' 124 | ##' These functions generate simple screening wrappers for SuperLearner to 125 | ##' include or exclude variables based on \code{names}. This is is helpful 126 | ##' because in order to use HDPS as a candidate in SuperLearner, you need to 127 | ##' include the study outcome variable as a covariate. But to use a non-HDPS 128 | ##' algorithm, (say a random forest on some specified set of covariates,) as a 129 | ##' candidate as well, you want to make sure you're not adjusting for the 130 | ##' outcome which is downstream from treatment on the causal pathway. 131 | ##' 132 | ##' See documentation for the SuperLearner package for more about screening algorithms. 133 | ##' 134 | ##' @examples 135 | ##' 136 | ##' screen.predefined <- screen.names(c("names", "of", "predefined", 137 | ##' "covariates", "that", "definitely", "dont", "include", "the", "outcome")) 138 | ##' 139 | ##' screen.notoutcome <- screen.excludenames(c("outcome_variable_name", 140 | ##' "and", "other", "covariates", "to", "exclude")) 141 | ##' 142 | NULL 143 | 144 | ##' @rdname screen 145 | ##' @export 146 | screen.names <- function (names) { 147 | function (Y, X, family, obsWeights, id, ...) { 148 | colnames(X) %in% names 149 | } 150 | } 151 | 152 | ##' @rdname screen 153 | ##' @export 154 | screen.excludenames <- function (names) { 155 | function (Y, X, family, obsWeights, id, ...) { 156 | !(colnames(X) %in% names) 157 | } 158 | } 159 | -------------------------------------------------------------------------------- /R/assess_recurrence.R: -------------------------------------------------------------------------------- 1 | #' Expands covarites to up to three binary columns for at least one, sporadic, or frequent occrence of each covariate. 2 | #' 3 | #' Each column \code{x} of \code{covars} is expanded to three binary columns. 4 | #' The first column indicates that the value of \code{x} is non-zero. 5 | #' The second indicates that the value of \code{x} \eqn{\ge} the median of non-zero values of \code{x}. 6 | #' The third indicates that the value of \code{x} \eqn{\ge} the 75th percentile of non-zero values of \code{x}. 7 | #' Non-unique columns per covariate are dropped. 8 | #' 9 | #' Groups of columns of the returned matrix are in the same order of columns in \code{covars}. 10 | #' 11 | #' If \code{covars} has column names, the returned matrix will have the same column names with suffexes \code{"_once"}, 12 | #' \code{"_sporadic"}, and \code{"_frequent"} for the first, second, and third expanded columns, respectively. 13 | #' 14 | #' @title assess_recurrence 15 | #' @param covars a matrix or something that can be coerced with \code{\link[base]{as.matrix}} of covariates 16 | #' @param debug Enables some debuging checks which slow things down, but may yield useful warnings or errors. 17 | #' @return Expanded \code{covars} matrix. 18 | #' @author Sam Lendle 19 | #' @references Schneeweiss, S., Rassen, J. A., Glynn, R. J., Avorn, J., Mogun, 20 | #' H., & Brookhart, M. A. (2009). High-dimensional propensity score adjustment 21 | #' in studies of treatment effects using health care claims data. \emph{Epidemiology 22 | #' (Cambridge, Mass.)}, 20(4), 512. 23 | #' @export 24 | assess_recurrence <- function(covars, debug=FALSE) { 25 | #expands a matrix by replacing it's columns with as.numeric(x > 0), 26 | # as.numeric(x > median(x)), as.numeric(x > quantile(x, prob=0.75)) 27 | #only unique columns (per original column) are kept 28 | 29 | covars <- as.matrix(covars) 30 | 31 | temp <- function(i) { 32 | column <- covars[,i] 33 | quants <- get_quantiles(column) 34 | column_recurrence(column, quants, warndup=debug) 35 | } 36 | #mats_quants <- lapply(1:ncol(covars), temp) 37 | mats_quants <- list() 38 | 39 | for (i in 1:ncol(covars)) 40 | mats_quants[[i]] = temp(i) 41 | 42 | mats <- lapply(mats_quants, `[[`, "mat")#function(mq) mq[["mat"]]) 43 | quants <- lapply(mats_quants, `[[`, "quants")#function(mq) mq[["quants"]]) 44 | 45 | cnams <- colnames(covars) 46 | if (!is.null(cnams)) { 47 | for (i in seq_along(mats)) { 48 | colnames(mats[[i]]) <- paste(cnams[i], colnames(mats[[i]]), sep="") 49 | quants[[i]] <- lapply(quants[[i]], function(q) c(varname=cnams[i], q)) 50 | } 51 | } 52 | 53 | mat <- do.call(cbind, mats) 54 | quants <- do.call(c, quants) 55 | list(mat=mat, quants=quants) 56 | } 57 | -------------------------------------------------------------------------------- /R/hdps-package.R: -------------------------------------------------------------------------------- 1 | #' High-dimensional propensity score algorithm 2 | #' 3 | #' The high-dimensional propensity score (HDPS) algorithm is a method for 4 | #' high-dimensional proxy adjustment in claims data. This package implements 5 | #' the variable transformation and variable selection parts of the 6 | #' algorithm. 7 | #' 8 | #' \tabular{ll}{ 9 | #' Package: \tab hdps\cr 10 | #' Type: \tab Package\cr 11 | #' Version: \tab 0.1.6\cr 12 | #' Date: \tab 2017-08-16\cr 13 | #' License: \tab MIT \cr 14 | #' } 15 | #' 16 | #' This package implements part of step 2 (\code{\link{identify_covariates}}), 17 | #' steps 3 (\code{\link{assess_recurrence}}) and 4 (\code{\link{prioritize_covariates}}) 18 | #' of the HDPS algorithm (Schneeweiss et al., 2009). 19 | #' 20 | #' The \code{\link{hdps_screen}} function is a wrapper function for \code{\link{identify_covariates}}, 21 | #' \code{\link{assess_recurrence}}, and \code{\link{prioritize_covariates}}. 22 | #' 23 | #' @name hdps-package 24 | #' @aliases hdps-package hdps 25 | #' @docType package 26 | #' @author Sam Lendle 27 | #' 28 | #' Maintainer: Sam Lendle 29 | #' 30 | #' @references Schneeweiss, S., Rassen, J. A., Glynn, R. J., Avorn, J., Mogun, 31 | #' H., & Brookhart, M. A. (2009). High-dimensional propensity score adjustment 32 | #' in studies of treatment effects using health care claims data. \emph{Epidemiology 33 | #' (Cambridge, Mass.)}, 20(4), 512. 34 | #' @keywords package 35 | #' @examples 36 | #' 37 | #' #~~ simple examples of the most important functions ~~ 38 | #' 39 | #' @useDynLib hdps 40 | #' @importFrom Rcpp evalCpp 41 | NULL 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /R/hdps_screen.R: -------------------------------------------------------------------------------- 1 | #' The \code{hdps_screen} function performs part of step 2 (\code{\link{identify_covariates}}), 2 | #' steps 3 (\code{\link{assess_recurrence}}) and 4 (\code{\link{prioritize_covariates}}) 3 | #' of the HDPS algorithm (Schneeweiss et al., 2009). 4 | #' 5 | #' The \code{hdps_screen} function performs part of step 2 (\code{\link{identify_covariates}}), 6 | #' steps 3 (\code{\link{assess_recurrence}}) and 4 (\code{\link{prioritize_covariates}}) 7 | #' of the HDPS algorithm (Schneeweiss et al., 2009). 8 | #' 9 | #' \emph{Step 2.} Columns of \code{covars} are split by data dimension (as defined in Schneeweiss et al. (2009)) and 10 | #' filtered by \code{\link{identify_covariates}}. 11 | #' 12 | #' Dimensions can be specified in two ways. 13 | #' If \code{dimension_names} is used, the \code{colnames(covars)} is \code{\link[base]{grep}}ed for each value of 14 | #' \code{dimension_names}. 15 | #' If some column names match more than one pattern, an error is thrown. 16 | #' If some column names are not matched by any pattern, a warning is issued and those columns are ignored. 17 | #' For example, suppose the column names of \code{covars} are \code{c("drug_1", "drug_2", "proc_1", "proc_2")}. 18 | #' \code{dimension_names <- c("drug", "proc")} would split \code{covars} into two dimensions, 19 | #' one for \code{drug}s and one for \code{proc}s. 20 | #' 21 | #' Dimensions can also be specified by \code{dimension_indexes} which should contain a list of either column 22 | #' indexes or column names for each dimension. 23 | #' 24 | #' If neither \code{dimension_names} nor \code{dimension_indexes} is specified, all covariates are treated as one dimension. 25 | #' 26 | #' \emph{Step 3.} After filtering, remaining covariates are expanded by \code{\link{assess_recurrence}}. 27 | #' 28 | #' If at this point, the number of expanded covariates is less than \code{keep_k_total}, all expanded covariates are returned. 29 | #' 30 | #' \emph{Step 4.} Expanded covariates are ordered with \code{\link{prioritize_covariates}}. 31 | #' 32 | #' \emph{Step 5.} Step 5 can be performed with \code{\link{predict.hdps_covars}}. 33 | #' 34 | #' @title hdps_screen 35 | #' @param outcome binary vector of outcomes 36 | #' @param treatment binary vector of treatments 37 | #' @param covars \code{matrix} or \code{data.frame} of binary covariates. 38 | #' @param dimension_names A character vector of patterns to match against the column names of \code{covars} to split columns into dimension groups. See details. 39 | #' @param dimension_indexes A list of vectors of column indexes corresponding to dimension groups. See details. Cannot be specified with \code{dimension_names}. 40 | #' @param keep_n_per_dimension The maximum number of covariates to be kept per dimension by \code{\link{identify_covariates}}. 41 | #' @param keep_k_total Total number of covariates to keep after expanding by \code{\link{assess_recurrence}} and ordering by \code{link{prioritize_covariates}}. 42 | #' @param verbose Should verbose output be printed? 43 | #' @param debug Enables some debuging checks which slow things down, but may yield useful warnings or errors. 44 | #' @return An object of class \code{hdps_covars} 45 | #' @seealso \code{\link{predict.hdps_covars}} 46 | #' @references Schneeweiss, S., Rassen, J. A., Glynn, R. J., Avorn, J., Mogun, 47 | #' H., & Brookhart, M. A. (2009). High-dimensional propensity score adjustment 48 | #' in studies of treatment effects using health care claims data. \emph{Epidemiology 49 | #' (Cambridge, Mass.)}, 20(4), 512. 50 | #' @author Sam Lendle 51 | #' @examples 52 | #' set.seed(123) 53 | #' n <- 1000 54 | #' p <- 10000 55 | #' out <- rbinom(n, 1, 0.05) 56 | #' trt <- rbinom(n, 1, 0.5) 57 | #' covars <- matrix(rbinom(n*p, 3, 0.05), n) 58 | #' colnames(covars) <- c(paste("drug", 1:(p/2), sep="_"), 59 | #' paste("proc", 1:(p/2), sep="_")) 60 | #' 61 | #' dimension_names <- c("drug", "proc") 62 | #' 63 | #' screened_covars_fit <- hdps_screen(out, trt, covars, 64 | #' dimension_names = dimension_names, 65 | #' keep_n_per_dimension = 400, 66 | #' keep_k_total = 200, 67 | #' verbose=TRUE) 68 | #' 69 | #' screened_covars <- predict(screened_covars_fit) 70 | #' 71 | #' @export 72 | hdps_screen <- function(outcome, treatment, covars, 73 | dimension_names=NULL, dimension_indexes=NULL, 74 | keep_n_per_dimension=200, keep_k_total=500, 75 | verbose=FALSE, debug=FALSE) { 76 | 77 | check_inputs(outcome, treatment, covars) 78 | 79 | if (!is.null(dimension_names) && !is.null(dimension_indexes)) { 80 | stop("At most, one of dimension_names and dimension_indexes should be specified") 81 | } 82 | 83 | if (!is.null(dimension_names)) { 84 | dimension_indexes <- lapply(dimension_names, grep, x = colnames(covars)) 85 | all_idx <- do.call(c, dimension_indexes) 86 | if (anyDuplicated(all_idx)) { 87 | stop("Some column names of covars are matched by more than one pattern in dimension_names") 88 | } 89 | if (!all(all_idx %in% 1:ncol(covars))) { 90 | warning("Some column names of covars are not matched by any of the patterns in dimension_names") 91 | } 92 | } 93 | 94 | # Step 2. Identify empirical candidate covariates 95 | if (is.null(dimension_indexes)) { 96 | if (verbose) message("No dimensions specified...") 97 | if (verbose) message("Filtering covariates...") 98 | filtered_covars <- identify_covariates(covars, keep_n_covars=keep_n_per_dimension, indexes=FALSE) 99 | } else { 100 | if (verbose) message("Filtering covariates...") 101 | filtered_covars <- lapply(seq_along(dimension_indexes), function(i) { 102 | if (verbose) message("\tFiltering dimension ", 103 | if (!is.null(dimension_names)) dimension_names[i] else i, 104 | "...") 105 | identify_covariates(covars[, dimension_indexes[[i]]], keep_n_covars=keep_n_per_dimension, indexes=FALSE) 106 | }) 107 | if (verbose) message("Combining dimensions...") 108 | filtered_covars <- do.call(cbind, filtered_covars) 109 | } 110 | 111 | #Step 3. Assess recurrence 112 | if (verbose) message("Expanding covariates...") 113 | ar <- assess_recurrence(filtered_covars, debug=debug) 114 | expanded_covars <- ar[["mat"]] 115 | quants <- ar[["quants"]] 116 | 117 | if (dim(expanded_covars)[2] != length(quants)) stop("something is wrong...") 118 | 119 | #Step 4. Prioritize covariates 120 | if (verbose) message("Prioritizing covariates...") 121 | ordered_indexes <- prioritize_covariates(outcome, treatment, expanded_covars, keep_NaNs=TRUE) 122 | 123 | res <- list(expanded_covars=expanded_covars, 124 | quants=quants, 125 | ordered_indexes=ordered_indexes, 126 | keep_k_total=keep_k_total 127 | ) 128 | if (verbose) message("...Done!") 129 | class(res) <- "hdps_covars" 130 | 131 | return(res) 132 | 133 | } 134 | 135 | #' returns the matix of covariates based on an hdps screening 136 | #' 137 | #' @title Get matrix of hdps selected covariates 138 | #' @param object object of class \code{hdps_covars} 139 | #' @param newdata \code{NULL}, or a matrix who's columns have names corresponding to those selected by hdps in \code{object}. 140 | #' If \code{NULL} selected covariates from original matrix used in the screening step are returned. 141 | #' @param keep_k_total change \code{keep_k_total} from the original call to \code{\link{hdps_screen}} 142 | #' @param ... ignored 143 | #' @return A matrix of hdps selected covariates 144 | #' @seealso \link{hdps_screen} 145 | #' @author Sam Lendle 146 | #' @export 147 | predict.hdps_covars <- function(object, newdata=NULL, keep_k_total, ...) { 148 | if (missing(keep_k_total)) keep_k_total <- object$keep_k_total 149 | 150 | if (!is.null(newdata)) { 151 | #could be more efficient here 152 | # by first filtering the quants to only the keep_k_total needed 153 | # then by grouping by varname 154 | mats <- lapply(object$quants, function(quant) { 155 | x <- newdata[, quant$varname] 156 | mat <- column_recurrence(x, list(quant))$mat 157 | colnames(mat) <- paste(quant$varname, colnames(mat), sep="") 158 | mat 159 | }) 160 | expanded_covars <- do.call(cbind, mats) 161 | } else { 162 | expanded_covars <- object$expanded_covars 163 | } 164 | 165 | if (ncol(expanded_covars) <= keep_k_total) { 166 | return(expanded_covars) 167 | } 168 | 169 | ordered_indexes <- object$ordered_indexes 170 | selected_indexes <- ordered_indexes[1:min(keep_k_total, length(ordered_indexes))] 171 | selected_covars <- expanded_covars[, sort(selected_indexes)] 172 | return(selected_covars) 173 | } 174 | 175 | 176 | -------------------------------------------------------------------------------- /R/identify_covariates.R: -------------------------------------------------------------------------------- 1 | #' Given a matrix of covarites, \code{identify_covariates} returns the top \code{keep_n_covars} or the indexes of those columns. 2 | #' 3 | #' Columns are sorted in descending order of \code{min(prevalence, 1-prevalence)} where \code{prevalence} is the the proportion of 4 | #' non-zero values in a given column. 5 | #' 6 | #' If \code{indexes==TRUE}, a vector of the top \code{keep_n_covars} column indexes is returned. 7 | #' 8 | #' If \code{indexes==FALSE}, a matrix of covariates is returned whos columns are the top \code{keep_n_covars} colums of 9 | #' \code{covars}. Columns are in their original order. 10 | #' If also \code{keep_n_covars >= ncol(covar)}, then the function returns immediately without ranking columns in terms of 11 | #' prevalence as it is unecessary. 12 | #' 13 | #' \strong{Differences from Schneeweiss et al. (2009):} 14 | #' \itemize{ 15 | #' \item{Covariates that have fewer than 100 non-zero values are not automatically dropped. 16 | #' If typical covariates tend to have more than 100 non-zero values will typically be ranked higher than those with fewer than 100 17 | #' automatically.} 18 | #' } 19 | #' 20 | #' @title identify_covariates 21 | #' @param covars a matrix or something that can be coerced with \code{\link[base]{as.matrix}} of covariates 22 | #' @param keep_n_covars number of covariates to keep 23 | #' @param indexes Should indexes be returned? Or a subset of \code{covars}. Defaults to \code{FALSE}. 24 | #' @return Indexes of identified columns or a subset of \code{covars} 25 | #' @references Schneeweiss, S., Rassen, J. A., Glynn, R. J., Avorn, J., Mogun, 26 | #' H., & Brookhart, M. A. (2009). High-dimensional propensity score adjustment 27 | #' in studies of treatment effects using health care claims data. \emph{Epidemiology 28 | #' (Cambridge, Mass.)}, 20(4), 512. 29 | #' @author Sam Lendle 30 | #' @export 31 | identify_covariates <- function(covars, keep_n_covars=200, indexes=FALSE) { 32 | 33 | if (!indexes && ncol(covars) <= keep_n_covars) return(covars) 34 | vars <- colPrevScores(as.matrix(covars)) 35 | var_ords <- order(vars, decreasing=TRUE)[1:min(keep_n_covars, ncol(covars))] 36 | 37 | if (indexes) { 38 | return(var_ords) 39 | } else { 40 | covars[, sort(var_ords)] 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /R/prioritize_covariates.R: -------------------------------------------------------------------------------- 1 | #' Returns the indexes of columns \code{covars} ordered by the possible amount of counfounding each column could adjust for. 2 | #' 3 | #' For each covariate in \code{covars}, the potential multiplicitive bias is calculated as described in Schneeweiss et al. (2009). 4 | #' The column indexes are then put in descending order of the absolute value of the log of the multiplicitive bias. 5 | #' 6 | #' If \code{return_bias==TRUE}, the returned vector of indexes includes the multiplicitive biases sorted in the same order and stored in an attribute called \code{"bias_m"}. 7 | #' 8 | #' If \code{outcome} has no variation for a particular value of a covariate, then the multiplicitive bias is calculated as \code{NaN}. 9 | #' If \code{keep_NaNs==FALSE}, then the column indexes of such covariates are not included in the returned vector. 10 | #' 11 | #' @title prioritize_covariates 12 | #' @param outcome binary vector of outcomes 13 | #' @param treatment binary vector of treatments 14 | #' @param covars \code{matrix} or \code{data.frame} of binary covariates. 15 | #' @param return_bias Should the calculated multiplitive biases be returned with the ordered indexes? Defaults to \code{FALSE} 16 | #' @param keep_NaNs If the calculated multiplicitive bias for a covariate is \code{NaN}, should its index be included in the returned vector? Defaults to \code{FALSE} 17 | #' @return Vector of column indexes of \code{covars}. 18 | #' @author Sam Lendle 19 | #' @references Schneeweiss, S., Rassen, J. A., Glynn, R. J., Avorn, J., Mogun, 20 | #' H., & Brookhart, M. A. (2009). High-dimensional propensity score adjustment 21 | #' in studies of treatment effects using health care claims data. \emph{Epidemiology 22 | #' (Cambridge, Mass.)}, 20(4), 512. 23 | #' @export 24 | prioritize_covariates <- function(outcome, treatment, covars, return_bias=FALSE, keep_NaNs=FALSE) { 25 | check_inputs(outcome, treatment, covars, covars_bin=TRUE) 26 | 27 | treatment <- factor(treatment) 28 | 29 | covar_prev <- by(covars, treatment, colMeans) 30 | p_c1 <- covar_prev[[levels(treatment)[1]]] 31 | p_c0 <- covar_prev[[levels(treatment)[2]]] 32 | 33 | #a vector of rr_cd if rr_cd > 1, 1/rr_cd otherwise 34 | rr_cds <- calc_rr_cds(outcome, covars) 35 | 36 | #Infs in rr_cds will result in NaN for some covariates 37 | bias_mult <- (p_c1 * (rr_cds - 1) + 1) / (p_c0 * (rr_cds - 1) + 1) 38 | 39 | abs_log_bias_mult <- abs(log(bias_mult)) 40 | 41 | na.last <- if (keep_NaNs) TRUE else NA 42 | ordered_idxs <- order(abs_log_bias_mult, decreasing=TRUE, na.last=na.last) 43 | if (return_bias) attr(ordered_idxs, "bias_m") <- bias_mult[ordered_idxs] 44 | ordered_idxs 45 | } 46 | -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | #' @importFrom stats quantile 2 | get_quantiles <- function(x) { 3 | xx0 <- x[x>0] 4 | quants <- quantile(xx0, probs=c(0.5, 0.75), names=FALSE, type=2) 5 | quants <- list(list(q="_once", count=1), 6 | list(q="_sporadic", count=quants[1]), 7 | list(q="_frequent", count=quants[2])) 8 | 9 | counts <- sapply(quants, `[[`, "count") 10 | ux <- unique(xx0) 11 | cutoffs <- sapply(counts, function(count) min(ux[ux >= count])) 12 | dups <- duplicated(cutoffs) 13 | quants[!dups] 14 | } 15 | 16 | column_recurrence <- function(x, quants, warndup=FALSE) { 17 | mat <- matrix(0, length(x), length(quants)) 18 | colnames(mat) <- sapply(quants, `[[`, "q") 19 | 20 | for (i in 1:length(quants)) { 21 | mat[, i] <- x >= quants[[i]]$count 22 | } 23 | 24 | if (warndup) { 25 | dups <- duplicated(mat, MARGIN=2) 26 | if (any(dups)) { 27 | warning("Duplicate columns in mat. This should not happen when hdps_screen is called, but could when predict is called.") 28 | } 29 | } 30 | 31 | 32 | list(mat=mat, quants=quants) 33 | } 34 | 35 | check_inputs <- function(outcome, treatment, covars, covars_bin=FALSE) { 36 | n = nrow(covars) 37 | 38 | if(!is.vector(outcome)) stop("outcome should be a vector") 39 | if(!is.vector(treatment)) stop("treatment should be a vector") 40 | 41 | if (!length(outcome) == n || !length(treatment) == n) 42 | stop("outcome and treatment should be the same length, which should be equal to nrow(covars)") 43 | 44 | if (!all(outcome %in% c(0,1))) 45 | stop("outcome should be binary") 46 | if (!all(treatment %in% c(0,1))) 47 | stop("treatment should be binary") 48 | if (covars_bin && !all(covars %in% c(0,1))) 49 | stop("covars should be binary") 50 | } 51 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | hdps 2 | ==== 3 | 4 | [![Build Status](https://travis-ci.org/lendle/hdps.svg)](https://travis-ci.org/lendle/hdps) 5 | 6 | High dimensional propensity score algorithm 7 | 8 | Install 9 | ------- 10 | This package can be installed using the `devtools` package in R: 11 | ```r 12 | library(devtools) 13 | install_github("lendle/hdps") 14 | # or if that doesn't work, try 15 | install_git("git@github.com:lendle/hdps") 16 | ``` 17 | 18 | Documentation 19 | ------------- 20 | 21 | Wouldn't it be great if you could automatically generate markdown files from `.Rd` files so you could just view the docs directly on github? Well you can't. So open up R and do this instead: 22 | ```r 23 | library(hdps) 24 | ?hdps 25 | ``` 26 | -------------------------------------------------------------------------------- /hdps.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: No 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: knitr 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageInstallArgs: --no-multiarch --with-keep.source 17 | PackageRoxygenize: rd,collate,namespace 18 | -------------------------------------------------------------------------------- /man/SL.hdps.generator.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/SL.hdps.R 3 | \name{SL.hdps.generator} 4 | \alias{SL.hdps.generator} 5 | \title{SL.hdps.generator} 6 | \usage{ 7 | SL.hdps.generator(out_name, dimension_names, predef_covar_names = c(), 8 | keep_k_total, ..., cvglmnet = FALSE, glmnet_args = if (cvglmnet) list() 9 | else list(lambda = 0)) 10 | } 11 | \arguments{ 12 | \item{out_name}{Name of the outcome variable.} 13 | 14 | \item{dimension_names}{Dimension names of HDPS dimensions. See 15 | \code{\link{hdps_screen}}.} 16 | 17 | \item{predef_covar_names}{Names of predefined covariates to be included in 18 | logistic regression model.} 19 | 20 | \item{keep_k_total}{See \code{\link{hdps_screen}}.} 21 | 22 | \item{...}{Other arguments passed to \code{\link{hdps_screen}}.} 23 | 24 | \item{cvglmnet}{Use \code{glmnet} or \code{cv.glmnet} for fitting. Defaults to FALSE.} 25 | 26 | \item{glmnet_args}{list of arguments to be passed to glmnet or cv.glmnet. If \code{cvglmnet=FALSE}, \code{glmnet_args} 27 | should be set such that calling \code{predict} on the \code{glmnet} object returns only one vector of predictions. 28 | E.g. only one value of \code{lambda} should be set.} 29 | } 30 | \value{ 31 | A SuperLearner wrapper function 32 | } 33 | \description{ 34 | Generates a wrapper for SuperLearner using HDPS 35 | } 36 | \details{ 37 | A HDPS candidate will generate covariates using \code{hdps_screen} from 38 | codes, and estimate the propensity score with logistic regression on 39 | generated covariates and predefined covariates. 40 | 41 | To use HDPS in SuperLearner to estimate a propensity score, you need to 42 | include the outcome variable as a covariate where here outcome means the 43 | outcome of interest in the causal problem as opposed to the \code{Y} 44 | variable in SuperLearner. For non-HDPS candidates in SuperLearner, it's 45 | important to exclude the outcome variable via \code{\link[=screen]{screen.named}} or 46 | some other screening algorithm in order to avoid adjusting for something 47 | downstream on the causal pathway. 48 | } 49 | \author{ 50 | Sam Lendle 51 | } 52 | -------------------------------------------------------------------------------- /man/assess_recurrence.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/assess_recurrence.R 3 | \name{assess_recurrence} 4 | \alias{assess_recurrence} 5 | \title{assess_recurrence} 6 | \usage{ 7 | assess_recurrence(covars, debug = FALSE) 8 | } 9 | \arguments{ 10 | \item{covars}{a matrix or something that can be coerced with \code{\link[base]{as.matrix}} of covariates} 11 | 12 | \item{debug}{Enables some debuging checks which slow things down, but may yield useful warnings or errors.} 13 | } 14 | \value{ 15 | Expanded \code{covars} matrix. 16 | } 17 | \description{ 18 | Expands covarites to up to three binary columns for at least one, sporadic, or frequent occrence of each covariate. 19 | } 20 | \details{ 21 | Each column \code{x} of \code{covars} is expanded to three binary columns. 22 | The first column indicates that the value of \code{x} is non-zero. 23 | The second indicates that the value of \code{x} \eqn{\ge} the median of non-zero values of \code{x}. 24 | The third indicates that the value of \code{x} \eqn{\ge} the 75th percentile of non-zero values of \code{x}. 25 | Non-unique columns per covariate are dropped. 26 | 27 | Groups of columns of the returned matrix are in the same order of columns in \code{covars}. 28 | 29 | If \code{covars} has column names, the returned matrix will have the same column names with suffexes \code{"_once"}, 30 | \code{"_sporadic"}, and \code{"_frequent"} for the first, second, and third expanded columns, respectively. 31 | } 32 | \references{ 33 | Schneeweiss, S., Rassen, J. A., Glynn, R. J., Avorn, J., Mogun, 34 | H., & Brookhart, M. A. (2009). High-dimensional propensity score adjustment 35 | in studies of treatment effects using health care claims data. \emph{Epidemiology 36 | (Cambridge, Mass.)}, 20(4), 512. 37 | } 38 | \author{ 39 | Sam Lendle 40 | } 41 | -------------------------------------------------------------------------------- /man/hdps-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/hdps-package.R 3 | \docType{package} 4 | \name{hdps-package} 5 | \alias{hdps-package} 6 | \alias{hdps} 7 | \title{High-dimensional propensity score algorithm} 8 | \description{ 9 | The high-dimensional propensity score (HDPS) algorithm is a method for 10 | high-dimensional proxy adjustment in claims data. This package implements 11 | the variable transformation and variable selection parts of the 12 | algorithm. 13 | } 14 | \details{ 15 | \tabular{ll}{ 16 | Package: \tab hdps\cr 17 | Type: \tab Package\cr 18 | Version: \tab 0.1.6\cr 19 | Date: \tab 2017-08-16\cr 20 | License: \tab MIT \cr 21 | } 22 | 23 | This package implements part of step 2 (\code{\link{identify_covariates}}), 24 | steps 3 (\code{\link{assess_recurrence}}) and 4 (\code{\link{prioritize_covariates}}) 25 | of the HDPS algorithm (Schneeweiss et al., 2009). 26 | 27 | The \code{\link{hdps_screen}} function is a wrapper function for \code{\link{identify_covariates}}, 28 | \code{\link{assess_recurrence}}, and \code{\link{prioritize_covariates}}. 29 | } 30 | \examples{ 31 | 32 | #~~ simple examples of the most important functions ~~ 33 | 34 | } 35 | \references{ 36 | Schneeweiss, S., Rassen, J. A., Glynn, R. J., Avorn, J., Mogun, 37 | H., & Brookhart, M. A. (2009). High-dimensional propensity score adjustment 38 | in studies of treatment effects using health care claims data. \emph{Epidemiology 39 | (Cambridge, Mass.)}, 20(4), 512. 40 | } 41 | \author{ 42 | Sam Lendle 43 | 44 | Maintainer: Sam Lendle 45 | } 46 | \keyword{package} 47 | -------------------------------------------------------------------------------- /man/hdps_screen.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/hdps_screen.R 3 | \name{hdps_screen} 4 | \alias{hdps_screen} 5 | \title{hdps_screen} 6 | \usage{ 7 | hdps_screen(outcome, treatment, covars, dimension_names = NULL, 8 | dimension_indexes = NULL, keep_n_per_dimension = 200, 9 | keep_k_total = 500, verbose = FALSE, debug = FALSE) 10 | } 11 | \arguments{ 12 | \item{outcome}{binary vector of outcomes} 13 | 14 | \item{treatment}{binary vector of treatments} 15 | 16 | \item{covars}{\code{matrix} or \code{data.frame} of binary covariates.} 17 | 18 | \item{dimension_names}{A character vector of patterns to match against the column names of \code{covars} to split columns into dimension groups. See details.} 19 | 20 | \item{dimension_indexes}{A list of vectors of column indexes corresponding to dimension groups. See details. Cannot be specified with \code{dimension_names}.} 21 | 22 | \item{keep_n_per_dimension}{The maximum number of covariates to be kept per dimension by \code{\link{identify_covariates}}.} 23 | 24 | \item{keep_k_total}{Total number of covariates to keep after expanding by \code{\link{assess_recurrence}} and ordering by \code{link{prioritize_covariates}}.} 25 | 26 | \item{verbose}{Should verbose output be printed?} 27 | 28 | \item{debug}{Enables some debuging checks which slow things down, but may yield useful warnings or errors.} 29 | } 30 | \value{ 31 | An object of class \code{hdps_covars} 32 | } 33 | \description{ 34 | The \code{hdps_screen} function performs part of step 2 (\code{\link{identify_covariates}}), 35 | steps 3 (\code{\link{assess_recurrence}}) and 4 (\code{\link{prioritize_covariates}}) 36 | of the HDPS algorithm (Schneeweiss et al., 2009). 37 | } 38 | \details{ 39 | The \code{hdps_screen} function performs part of step 2 (\code{\link{identify_covariates}}), 40 | steps 3 (\code{\link{assess_recurrence}}) and 4 (\code{\link{prioritize_covariates}}) 41 | of the HDPS algorithm (Schneeweiss et al., 2009). 42 | 43 | \emph{Step 2.} Columns of \code{covars} are split by data dimension (as defined in Schneeweiss et al. (2009)) and 44 | filtered by \code{\link{identify_covariates}}. 45 | 46 | Dimensions can be specified in two ways. 47 | If \code{dimension_names} is used, the \code{colnames(covars)} is \code{\link[base]{grep}}ed for each value of 48 | \code{dimension_names}. 49 | If some column names match more than one pattern, an error is thrown. 50 | If some column names are not matched by any pattern, a warning is issued and those columns are ignored. 51 | For example, suppose the column names of \code{covars} are \code{c("drug_1", "drug_2", "proc_1", "proc_2")}. 52 | \code{dimension_names <- c("drug", "proc")} would split \code{covars} into two dimensions, 53 | one for \code{drug}s and one for \code{proc}s. 54 | 55 | Dimensions can also be specified by \code{dimension_indexes} which should contain a list of either column 56 | indexes or column names for each dimension. 57 | 58 | If neither \code{dimension_names} nor \code{dimension_indexes} is specified, all covariates are treated as one dimension. 59 | 60 | \emph{Step 3.} After filtering, remaining covariates are expanded by \code{\link{assess_recurrence}}. 61 | 62 | If at this point, the number of expanded covariates is less than \code{keep_k_total}, all expanded covariates are returned. 63 | 64 | \emph{Step 4.} Expanded covariates are ordered with \code{\link{prioritize_covariates}}. 65 | 66 | \emph{Step 5.} Step 5 can be performed with \code{\link{predict.hdps_covars}}. 67 | } 68 | \examples{ 69 | set.seed(123) 70 | n <- 1000 71 | p <- 10000 72 | out <- rbinom(n, 1, 0.05) 73 | trt <- rbinom(n, 1, 0.5) 74 | covars <- matrix(rbinom(n*p, 3, 0.05), n) 75 | colnames(covars) <- c(paste("drug", 1:(p/2), sep="_"), 76 | paste("proc", 1:(p/2), sep="_")) 77 | 78 | dimension_names <- c("drug", "proc") 79 | 80 | screened_covars_fit <- hdps_screen(out, trt, covars, 81 | dimension_names = dimension_names, 82 | keep_n_per_dimension = 400, 83 | keep_k_total = 200, 84 | verbose=TRUE) 85 | 86 | screened_covars <- predict(screened_covars_fit) 87 | 88 | } 89 | \references{ 90 | Schneeweiss, S., Rassen, J. A., Glynn, R. J., Avorn, J., Mogun, 91 | H., & Brookhart, M. A. (2009). High-dimensional propensity score adjustment 92 | in studies of treatment effects using health care claims data. \emph{Epidemiology 93 | (Cambridge, Mass.)}, 20(4), 512. 94 | } 95 | \seealso{ 96 | \code{\link{predict.hdps_covars}} 97 | } 98 | \author{ 99 | Sam Lendle 100 | } 101 | -------------------------------------------------------------------------------- /man/identify_covariates.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/identify_covariates.R 3 | \name{identify_covariates} 4 | \alias{identify_covariates} 5 | \title{identify_covariates} 6 | \usage{ 7 | identify_covariates(covars, keep_n_covars = 200, indexes = FALSE) 8 | } 9 | \arguments{ 10 | \item{covars}{a matrix or something that can be coerced with \code{\link[base]{as.matrix}} of covariates} 11 | 12 | \item{keep_n_covars}{number of covariates to keep} 13 | 14 | \item{indexes}{Should indexes be returned? Or a subset of \code{covars}. Defaults to \code{FALSE}.} 15 | } 16 | \value{ 17 | Indexes of identified columns or a subset of \code{covars} 18 | } 19 | \description{ 20 | Given a matrix of covarites, \code{identify_covariates} returns the top \code{keep_n_covars} or the indexes of those columns. 21 | } 22 | \details{ 23 | Columns are sorted in descending order of \code{min(prevalence, 1-prevalence)} where \code{prevalence} is the the proportion of 24 | non-zero values in a given column. 25 | 26 | If \code{indexes==TRUE}, a vector of the top \code{keep_n_covars} column indexes is returned. 27 | 28 | If \code{indexes==FALSE}, a matrix of covariates is returned whos columns are the top \code{keep_n_covars} colums of 29 | \code{covars}. Columns are in their original order. 30 | If also \code{keep_n_covars >= ncol(covar)}, then the function returns immediately without ranking columns in terms of 31 | prevalence as it is unecessary. 32 | 33 | \strong{Differences from Schneeweiss et al. (2009):} 34 | \itemize{ 35 | \item{Covariates that have fewer than 100 non-zero values are not automatically dropped. 36 | If typical covariates tend to have more than 100 non-zero values will typically be ranked higher than those with fewer than 100 37 | automatically.} 38 | } 39 | } 40 | \references{ 41 | Schneeweiss, S., Rassen, J. A., Glynn, R. J., Avorn, J., Mogun, 42 | H., & Brookhart, M. A. (2009). High-dimensional propensity score adjustment 43 | in studies of treatment effects using health care claims data. \emph{Epidemiology 44 | (Cambridge, Mass.)}, 20(4), 512. 45 | } 46 | \author{ 47 | Sam Lendle 48 | } 49 | -------------------------------------------------------------------------------- /man/predict.SL_hdps.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/SL.hdps.R 3 | \name{predict.SL_hdps} 4 | \alias{predict.SL_hdps} 5 | \title{Get predictions from SL_hdps wrapper} 6 | \usage{ 7 | \method{predict}{SL_hdps}(object, newdata, ...) 8 | } 9 | \arguments{ 10 | \item{object}{object of class \code{SL_hdps}} 11 | 12 | \item{newdata}{a matrix of covariates to predict from} 13 | 14 | \item{...}{ignored} 15 | } 16 | \value{ 17 | vector of predictions 18 | } 19 | \description{ 20 | Get predictions from SL_hdps wrapper 21 | } 22 | \author{ 23 | Sam Lendle 24 | } 25 | -------------------------------------------------------------------------------- /man/predict.hdps_covars.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/hdps_screen.R 3 | \name{predict.hdps_covars} 4 | \alias{predict.hdps_covars} 5 | \title{Get matrix of hdps selected covariates} 6 | \usage{ 7 | \method{predict}{hdps_covars}(object, newdata = NULL, keep_k_total, ...) 8 | } 9 | \arguments{ 10 | \item{object}{object of class \code{hdps_covars}} 11 | 12 | \item{newdata}{\code{NULL}, or a matrix who's columns have names corresponding to those selected by hdps in \code{object}. 13 | If \code{NULL} selected covariates from original matrix used in the screening step are returned.} 14 | 15 | \item{keep_k_total}{change \code{keep_k_total} from the original call to \code{\link{hdps_screen}}} 16 | 17 | \item{...}{ignored} 18 | } 19 | \value{ 20 | A matrix of hdps selected covariates 21 | } 22 | \description{ 23 | returns the matix of covariates based on an hdps screening 24 | } 25 | \seealso{ 26 | \link{hdps_screen} 27 | } 28 | \author{ 29 | Sam Lendle 30 | } 31 | -------------------------------------------------------------------------------- /man/prioritize_covariates.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/prioritize_covariates.R 3 | \name{prioritize_covariates} 4 | \alias{prioritize_covariates} 5 | \title{prioritize_covariates} 6 | \usage{ 7 | prioritize_covariates(outcome, treatment, covars, return_bias = FALSE, 8 | keep_NaNs = FALSE) 9 | } 10 | \arguments{ 11 | \item{outcome}{binary vector of outcomes} 12 | 13 | \item{treatment}{binary vector of treatments} 14 | 15 | \item{covars}{\code{matrix} or \code{data.frame} of binary covariates.} 16 | 17 | \item{return_bias}{Should the calculated multiplitive biases be returned with the ordered indexes? Defaults to \code{FALSE}} 18 | 19 | \item{keep_NaNs}{If the calculated multiplicitive bias for a covariate is \code{NaN}, should its index be included in the returned vector? Defaults to \code{FALSE}} 20 | } 21 | \value{ 22 | Vector of column indexes of \code{covars}. 23 | } 24 | \description{ 25 | Returns the indexes of columns \code{covars} ordered by the possible amount of counfounding each column could adjust for. 26 | } 27 | \details{ 28 | For each covariate in \code{covars}, the potential multiplicitive bias is calculated as described in Schneeweiss et al. (2009). 29 | The column indexes are then put in descending order of the absolute value of the log of the multiplicitive bias. 30 | 31 | If \code{return_bias==TRUE}, the returned vector of indexes includes the multiplicitive biases sorted in the same order and stored in an attribute called \code{"bias_m"}. 32 | 33 | If \code{outcome} has no variation for a particular value of a covariate, then the multiplicitive bias is calculated as \code{NaN}. 34 | If \code{keep_NaNs==FALSE}, then the column indexes of such covariates are not included in the returned vector. 35 | } 36 | \references{ 37 | Schneeweiss, S., Rassen, J. A., Glynn, R. J., Avorn, J., Mogun, 38 | H., & Brookhart, M. A. (2009). High-dimensional propensity score adjustment 39 | in studies of treatment effects using health care claims data. \emph{Epidemiology 40 | (Cambridge, Mass.)}, 20(4), 512. 41 | } 42 | \author{ 43 | Sam Lendle 44 | } 45 | -------------------------------------------------------------------------------- /man/screen.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/SL.hdps.R 3 | \name{screen} 4 | \alias{screen} 5 | \alias{screen.names} 6 | \alias{screen.excludenames} 7 | \title{SuperLearner screening wrappers} 8 | \usage{ 9 | screen.names(names) 10 | 11 | screen.excludenames(names) 12 | } 13 | \arguments{ 14 | \item{names}{Names to be included or excluded 15 | 16 | These functions generate simple screening wrappers for SuperLearner to 17 | include or exclude variables based on \code{names}. This is is helpful 18 | because in order to use HDPS as a candidate in SuperLearner, you need to 19 | include the study outcome variable as a covariate. But to use a non-HDPS 20 | algorithm, (say a random forest on some specified set of covariates,) as a 21 | candidate as well, you want to make sure you're not adjusting for the 22 | outcome which is downstream from treatment on the causal pathway. 23 | 24 | See documentation for the SuperLearner package for more about screening algorithms.} 25 | } 26 | \description{ 27 | Functions to set up screening wrappers for SuperLearner 28 | } 29 | \examples{ 30 | 31 | screen.predefined <- screen.names(c("names", "of", "predefined", 32 | "covariates", "that", "definitely", "dont", "include", "the", "outcome")) 33 | 34 | screen.notoutcome <- screen.excludenames(c("outcome_variable_name", 35 | "and", "other", "covariates", "to", "exclude")) 36 | 37 | } 38 | -------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include 5 | 6 | using namespace Rcpp; 7 | 8 | // calc_rr_cds 9 | NumericVector calc_rr_cds(NumericVector outcome, NumericMatrix covars); 10 | RcppExport SEXP _hdps_calc_rr_cds(SEXP outcomeSEXP, SEXP covarsSEXP) { 11 | BEGIN_RCPP 12 | Rcpp::RObject rcpp_result_gen; 13 | Rcpp::RNGScope rcpp_rngScope_gen; 14 | Rcpp::traits::input_parameter< NumericVector >::type outcome(outcomeSEXP); 15 | Rcpp::traits::input_parameter< NumericMatrix >::type covars(covarsSEXP); 16 | rcpp_result_gen = Rcpp::wrap(calc_rr_cds(outcome, covars)); 17 | return rcpp_result_gen; 18 | END_RCPP 19 | } 20 | // colPrevScores 21 | NumericVector colPrevScores(NumericMatrix x); 22 | RcppExport SEXP _hdps_colPrevScores(SEXP xSEXP) { 23 | BEGIN_RCPP 24 | Rcpp::RObject rcpp_result_gen; 25 | Rcpp::RNGScope rcpp_rngScope_gen; 26 | Rcpp::traits::input_parameter< NumericMatrix >::type x(xSEXP); 27 | rcpp_result_gen = Rcpp::wrap(colPrevScores(x)); 28 | return rcpp_result_gen; 29 | END_RCPP 30 | } 31 | // colVars 32 | NumericVector colVars(NumericMatrix x); 33 | RcppExport SEXP _hdps_colVars(SEXP xSEXP) { 34 | BEGIN_RCPP 35 | Rcpp::RObject rcpp_result_gen; 36 | Rcpp::RNGScope rcpp_rngScope_gen; 37 | Rcpp::traits::input_parameter< NumericMatrix >::type x(xSEXP); 38 | rcpp_result_gen = Rcpp::wrap(colVars(x)); 39 | return rcpp_result_gen; 40 | END_RCPP 41 | } 42 | 43 | static const R_CallMethodDef CallEntries[] = { 44 | {"_hdps_calc_rr_cds", (DL_FUNC) &_hdps_calc_rr_cds, 2}, 45 | {"_hdps_colPrevScores", (DL_FUNC) &_hdps_colPrevScores, 1}, 46 | {"_hdps_colVars", (DL_FUNC) &_hdps_colVars, 1}, 47 | {NULL, NULL, 0} 48 | }; 49 | 50 | RcppExport void R_init_hdps(DllInfo *dll) { 51 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 52 | R_useDynamicSymbols(dll, FALSE); 53 | } 54 | -------------------------------------------------------------------------------- /src/calc_rr_cds.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace Rcpp; 3 | 4 | // Below is a simple example of exporting a C++ function to R. You can 5 | // source this function into an R session using the Rcpp::sourceCpp 6 | // function (or via the Source button on the editor toolbar) 7 | 8 | // For more on using Rcpp click the Help button on the editor toolbar 9 | 10 | // [[Rcpp::export]] 11 | NumericVector calc_rr_cds(NumericVector outcome, NumericMatrix covars) { 12 | int nrow = covars.nrow(), ncol = covars.ncol(); 13 | if (outcome.length() != nrow) { 14 | stop("length of outcome should be the same as the number of rows in covars"); 15 | } 16 | 17 | NumericVector out(ncol); 18 | out.attr("names") = colnames(covars); 19 | 20 | for (int j = 0; j < ncol; j++) { 21 | double outcomes1 = 0; 22 | double outcomes0 = 0; 23 | double n1 = 0; 24 | double n0 = 0; 25 | 26 | for (int i = 0; i < nrow; i++) { 27 | double covar = covars(i,j); 28 | if (covar == 0.0) { 29 | n0 += 1; 30 | outcomes0 += outcome(i); 31 | } else { 32 | n1 += 1; 33 | outcomes1 += outcome(i); 34 | } 35 | } 36 | 37 | double prev1 = outcomes1/n1; 38 | double prev0 = outcomes0/n0; 39 | 40 | double rr = prev1/prev0; 41 | out(j) = rr; 42 | } 43 | return out; 44 | } -------------------------------------------------------------------------------- /src/colPrevScores.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace Rcpp; 3 | 4 | // [[Rcpp::export]] 5 | NumericVector colPrevScores(NumericMatrix x) { 6 | int nrow = x.nrow(), ncol = x.ncol(); 7 | NumericVector out(ncol); 8 | 9 | for (int j = 0; j < ncol; j++) { 10 | int num_non_zero = 0; 11 | 12 | for (int i = 0; i < nrow; i++) { 13 | num_non_zero += x(i,j) > 0.0 ? 1 : 0; 14 | } 15 | 16 | double prev = (double) num_non_zero/nrow; 17 | 18 | out(j) = std::min(prev, 1.0-prev); 19 | } 20 | 21 | return out; 22 | } 23 | -------------------------------------------------------------------------------- /src/colVars.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace Rcpp; 3 | 4 | // [[Rcpp::export]] 5 | NumericVector colVars(NumericMatrix x) { 6 | int nrow = x.nrow(), ncol = x.ncol(); 7 | NumericVector out(ncol); 8 | 9 | for (int j = 0; j < ncol; j++) { 10 | double mean = 0; 11 | double M2 = 0; 12 | int n; 13 | double delta, xx; 14 | 15 | for (int i = 0; i < nrow; i++) { 16 | n = i+1; 17 | xx = x(i,j); 18 | delta = xx - mean; 19 | mean += delta/n; 20 | M2 = M2 + delta*(xx-mean); 21 | } 22 | 23 | out(j) = M2/(n-1); 24 | } 25 | 26 | return out; 27 | } 28 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(hdps) 3 | 4 | test_check("hdps") 5 | -------------------------------------------------------------------------------- /tests/testthat/test_SL.hdps.R: -------------------------------------------------------------------------------- 1 | # well I tried testing this, but seems to not work because of the way SL searches for library functions and scoping. 2 | # you can run the code inside the test_that block, and it works, but i'm commenting out for now because it won't run inside 3 | # that block. 4 | 5 | context("SL.hdps") 6 | 7 | test_that("SL.hdps.generator works with glmnet and cv.glmnet", { 8 | expect_true(TRUE) 9 | # set.seed(123) 10 | # n <- 50 11 | # p <- 500 12 | # out <- rbinom(n, 1, 0.2) 13 | # trt <- rbinom(n, 1, 0.5) 14 | # covars <- matrix(rbinom(n*p, 3, 0.05), n) 15 | # colnames(covars) <- c(paste("drug", 1:(p/2), sep="_"), 16 | # paste("proc", 1:(p/2), sep="_")) 17 | # 18 | # dimension_names <- c("drug", "proc") 19 | # 20 | # 21 | # SL.library <- c("SL.hdps.50", "SL.hdps.50.cv") 22 | # 23 | # library(SuperLearner) 24 | # 25 | # slfit <- SuperLearner(trt, X = data.frame(OUTCOME=out, covars), family=binomial, SL.library=c("SL.hdps.50", "SL.hdps.50.cv"), cvControl = list(V=3)) 26 | # 27 | # expect_is(slfit, "SuperLearner") 28 | # expect_is(slfit$fitLibrary$SL.hdps.50_All$glmnet_fit, "glmnet") 29 | # expect_is(slfit$fitLibrary$SL.hdps.50.cv_All$glmnet_fit, "cv.glmnet") 30 | }) 31 | -------------------------------------------------------------------------------- /tests/testthat/test_assess_recurrence.R: -------------------------------------------------------------------------------- 1 | context("assess_recurrence") 2 | 3 | test_that("get_quantiles and column_recurrence work when x is binary", { 4 | x <- c(0,0,0,0,1,1,1) 5 | mat <- matrix(x, length(x), 1) 6 | colnames(mat) = c("_once") 7 | cr <- column_recurrence(x, get_quantiles(x)) 8 | expect_equal(cr$mat, mat) 9 | expect_equal(colnames(cr$mat), sapply(cr$quants, `[[`, "q")) 10 | }) 11 | 12 | test_that("get_quantiles and column_recurrence work when 50th and 75th %ile are the same", { 13 | x <- c(0,0,0,0,1,1,1,3,3,3,3,3) 14 | mat <- cbind(as.numeric(x>0), as.numeric(x >=3)) 15 | colnames(mat) = c("_once", "_sporadic") 16 | cr <- column_recurrence(x, get_quantiles(x)) 17 | expect_equal(cr$mat, mat) 18 | expect_equal(dim(cr$mat)[2], length(cr$quants)) 19 | expect_equal(colnames(cr$mat), sapply(cr$quants, `[[`, "q")) 20 | }) 21 | 22 | test_that("get_quantiles and column_recurrence work when 50th %ile is 1", { 23 | x <- c(0,0,0,0,1,1,1,1,1,3,3,3) 24 | mat <- cbind(as.numeric(x>0), as.numeric(x >=3)) 25 | colnames(mat) = c("_once", "_frequent") 26 | cr <- column_recurrence(x, get_quantiles(x)) 27 | expect_equal(cr$mat, mat) 28 | expect_equal(dim(cr$mat)[2], length(cr$quants)) 29 | expect_equal(colnames(cr$mat), sapply(cr$quants, `[[`, "q")) 30 | }) 31 | 32 | test_that("get_quantiles and column_recurrence work when 50th %ile is 1.5 and 75th is 2", { 33 | x <- c(0,0,0,0,1,1,1,1,1,2,2,2,2,2) 34 | mat <- cbind(as.numeric(x>0), as.numeric(x >1)) 35 | colnames(mat) = c("_once", "_sporadic") 36 | cr <- column_recurrence(x, get_quantiles(x)) 37 | expect_equal(cr$mat, mat) 38 | expect_equal(dim(cr$mat)[2], length(cr$quants)) 39 | expect_equal(colnames(cr$mat), sapply(cr$quants, `[[`, "q")) 40 | }) 41 | 42 | test_that("get_quantiles and column_recurrence work when min is 50th %ile", { 43 | x <- c(0,0,0,0,2,2,2,2,2,2,2,2, 3,3) 44 | mat <- cbind(as.numeric(x>0)) 45 | colnames(mat) = c("_once") 46 | cr <- column_recurrence(x, get_quantiles(x)) 47 | expect_equal(cr$mat, mat) 48 | expect_equal(dim(cr$mat)[2], length(cr$quants)) 49 | expect_equal(colnames(cr$mat), sapply(cr$quants, `[[`, "q")) 50 | }) 51 | 52 | test_that("dups are taken care of when quantiles are ints but not in the data", { 53 | x <- c(0,0, 5, 1, 5, 5, 3, 7, 2, 5, 5, 3, 1, 3) 54 | mat <- cbind(as.numeric(x>0), x > 4) 55 | cr <- column_recurrence(x, get_quantiles(x), warndup=TRUE) 56 | colnames(mat) = c("_once", "_sporadic") 57 | expect_equal(cr$mat, mat) 58 | 59 | }) 60 | 61 | 62 | test_that("assess_recurrence works", { 63 | dat <- cbind(a=c(0,0,0,0,1,1,1,3,3,3,3,3), 64 | b=c(0,0,0,0,1,1,1,1,1,3,3,3)) 65 | mat <- cbind(as.numeric(dat[ ,1]>0), as.numeric(dat[ ,1] >=3), 66 | as.numeric(dat[ ,2]>0), as.numeric(dat[ ,2] >=3)) 67 | colnames(mat) <- c("a_once", "a_sporadic", "b_once", "b_frequent") 68 | #quants <- list(a=c("_once"=1, "_sporadic"=3), b=c("_once"=1, "_frequent"=3)) 69 | quants <- list(list(varname="a", q="_once", count=1), 70 | list(varname="a", q="_sporadic", count=3), 71 | list(varname="b", q="_once", count=1), 72 | list(varname="b", q="_frequent", count=3)) 73 | ar <- assess_recurrence(dat) 74 | expect_equal(ar[["mat"]], mat) 75 | expect_equal(ar[["quants"]], quants) 76 | }) 77 | 78 | -------------------------------------------------------------------------------- /tests/testthat/test_calc_rr_cds.R: -------------------------------------------------------------------------------- 1 | 2 | context("calc_rr_cds") 3 | 4 | test_that("calc_rr_cds c++ function works the same way as the all R version", { 5 | calc_rr_cd <- function(outcome, covar) { 6 | prevs <- by(outcome, covar, mean) 7 | (prevs[2])/(prevs[1]) 8 | } 9 | 10 | set.seed(123) 11 | n <- 100 12 | p <- 1000 13 | out <- rbinom(n, 1, 0.2) 14 | covars <- matrix(rbinom(n*p, 1, 0.05), n) 15 | colnames(covars) <- c(paste("drug", 1:(p/2), sep="_"), 16 | paste("proc", 1:(p/2), sep="_")) 17 | 18 | #make outcome 0 for all obs with covar 1 = 1, so first RR = Inf 19 | out[covars[, 1]==1] <- 0 20 | 21 | rr_cds <- calc_rr_cds(out, covars) 22 | 23 | expect_equal(rr_cds, apply(covars, 2, calc_rr_cd, outcome=out)) 24 | expect_equal(rr_cds[1], 0, check.names=FALSE) 25 | }) 26 | -------------------------------------------------------------------------------- /tests/testthat/test_hdps_screen.R: -------------------------------------------------------------------------------- 1 | context("hdps_screen") 2 | 3 | test_that("works on data.frames", { 4 | set.seed(123) 5 | n <- 100 6 | p <- 1000 7 | out <- rbinom(n, 1, 0.2) 8 | trt <- rbinom(n, 1, 0.5) 9 | covars <- matrix(rbinom(n*p, 3, 0.05), n) 10 | colnames(covars) <- c(paste("drug", 1:(p/2), sep="_"), 11 | paste("proc", 1:(p/2), sep="_")) 12 | 13 | dimension_names <- c("drug", "proc") 14 | 15 | screened_covars_fit <- hdps_screen(out, trt, as.data.frame(covars), 16 | dimension_names = dimension_names, 17 | keep_n_per_dimension = 200, 18 | keep_k_total = 100, 19 | verbose=FALSE) 20 | 21 | screened_covars <- predict(screened_covars_fit) 22 | expect_equal(screened_covars, predict(screened_covars_fit, newdata=covars)) 23 | }) 24 | -------------------------------------------------------------------------------- /tests/testthat/test_predict.R: -------------------------------------------------------------------------------- 1 | context("predict") 2 | 3 | test_that("predict returns the same matrix when newdata is the same as the original data", { 4 | set.seed(123) 5 | n <- 100 6 | p <- 1000 7 | out <- rbinom(n, 1, 0.2) 8 | trt <- rbinom(n, 1, 0.5) 9 | covars <- matrix(rbinom(n*p, 3, 0.05), n) 10 | colnames(covars) <- c(paste("drug", 1:(p/2), sep="_"), 11 | paste("proc", 1:(p/2), sep="_")) 12 | 13 | dimension_names <- c("drug", "proc") 14 | 15 | screened_covars_fit <- hdps_screen(out, trt, covars, 16 | dimension_names = dimension_names, 17 | keep_n_per_dimension = 200, 18 | keep_k_total = 100, 19 | verbose=FALSE) 20 | 21 | screened_covars <- predict(screened_covars_fit) 22 | expect_equal(screened_covars, predict(screened_covars_fit, newdata=covars)) 23 | expect_equal(screened_covars, predict(screened_covars_fit, newdata=as.data.frame(covars))) 24 | }) 25 | --------------------------------------------------------------------------------