├── .Rbuildignore ├── .gitignore ├── DESCRIPTION ├── NAMESPACE ├── R ├── gather.R ├── print.matrix1.R ├── print.pstest.R ├── pstest.R ├── pstest.package.R └── summary.pstest.R ├── README.md └── man ├── pstest-package.Rd └── pstest.Rd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | pstest.Rproj 6 | "pstest (from Vandy).Rproj" 7 | inst/doc 8 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: pstest 2 | Type: Package 3 | Title: Specification Tests for Parametric Propensity Score Models 4 | Version: 0.1.3.900 5 | Author: Pedro H. C. Sant'Anna , Xiaojun Song 6 | 7 | Maintainer: Pedro H. C. Sant'Anna 8 | Description: The propensity score is one of the most widely used tools in 9 | studying the causal effect of a treatment, intervention, or policy. Given that 10 | the propensity score is usually unknown, it has to be estimated, implying that 11 | the reliability of many treatment effect estimators depends on the correct 12 | specification of the (parametric) propensity score. This package implements the 13 | data-driven nonparametric diagnostic tools for detecting propensity score 14 | misspecification proposed by Sant'Anna and Song (2019) . 15 | URL: https://github.com/pedrohcgs/pstest 16 | Depends: 17 | R (>= 3.1) 18 | Imports: 19 | stats, 20 | parallel, 21 | glmx, 22 | MASS, 23 | utils 24 | License: GPL-2 25 | Encoding: UTF-8 26 | LazyData: TRUE 27 | RoxygenNote: 6.1.1.9000 28 | Date: 2019-08-25 10:45 29 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(print,pstest) 4 | S3method(summary,pstest) 5 | export(pstest) 6 | importFrom(MASS,ginv) 7 | importFrom(glmx,hetglm.fit) 8 | importFrom(parallel,makeCluster) 9 | importFrom(parallel,nextRNGStream) 10 | importFrom(parallel,parLapply) 11 | importFrom(parallel,stopCluster) 12 | importFrom(stats,binomial) 13 | importFrom(stats,glm) 14 | importFrom(stats,rbinom) 15 | importFrom(stats,runif) 16 | importFrom(utils,write.table) 17 | -------------------------------------------------------------------------------- /R/gather.R: -------------------------------------------------------------------------------- 1 | # Functions used to guarantee replicability of the bootstrap 2 | # These are based on the harvestr package, but since harvestr is not on CRAN anymore, we just use these functions 3 | # safe version of retrieving the .Random.seed - 4 | get.seed.ps <- function(){ 5 | if(exists(".Random.seed", envir=.GlobalEnv, mode="numeric")) { 6 | seed <- get(".Random.seed", envir=.GlobalEnv, inherits=FALSE) 7 | class(seed) <- c("rng-seed", "integer") 8 | seed 9 | } else { 10 | NULL 11 | } 12 | } 13 | 14 | gather.ps <- function(x, seed=1234){ 15 | set.seed(seed, kind="L'Ecuyer-CMRG", normal.kind="Inversion") 16 | r <- get.seed.ps() 17 | seeds <- vector('list', x) 18 | for(i in seq_len(x)) { 19 | r <- 20 | seeds[[i]] <- 21 | structure( parallel::nextRNGStream(r) 22 | , RNGlevel='stream' 23 | , class=c("rng-seed", "integer") 24 | ) 25 | } 26 | structure(seeds, class=c('rng-seeds', 'list')) 27 | } 28 | -------------------------------------------------------------------------------- /R/print.matrix1.R: -------------------------------------------------------------------------------- 1 | #' @title print.matrix1 2 | #' 3 | #' @description Helper function to print a matrix; used by the print methods 4 | #' 5 | #' @param m Some matrix 6 | #' 7 | #' @noRd 8 | #' @importFrom utils write.table 9 | #' 10 | 11 | # Got this from Brant QTE package 12 | print.matrix1 <- function(m){ 13 | utils::write.table(format(m, justify="right", digits=2, nsmall=2), 14 | row.names=F, col.names=F, quote=F, sep="\t") 15 | } 16 | -------------------------------------------------------------------------------- /R/print.pstest.R: -------------------------------------------------------------------------------- 1 | #' @title Print 2 | #' 3 | #' @description Prints a pstest Object 4 | #' 5 | #' @param x A pstest object 6 | #' @param ... Other params (required as generic function, but not used) 7 | #' 8 | #' @export 9 | #' @noRd 10 | # Define new print function 11 | print.pstest <- function(x, ...){ 12 | #----------------------------------------------------------------------------- 13 | # Preliminaries 14 | # Weight function used 15 | if(x$argu$w == "ind") ww <- "Indicator function: w(q,u) = 1(q<=u)" 16 | if(x$argu$w == "exp") ww <- "Exponential function: w(q,u) = exp(qu)" 17 | if(x$argu$w == "logistic") ww <- "Logistic function: w(q,u) = 1/[1+exp(-qu)]" 18 | if(x$argu$w == "sin") ww <- "Sine function: w(q,u) = sin(qu)" 19 | if(x$argu$w == "sincos") ww <- "Sine and cosine function: w(q,u) = sin(qu)+ cos(qu)" 20 | 21 | distt <- x$argu$dist 22 | if(distt == "Mammen") distt <- "Mammen's" 23 | 24 | #Creat parameters for the Table 25 | header <- c("", "Test statistic", "Bootstrapped P-value") 26 | body <- cbind(c("Kolmogorov-Smirnov", "Cramer-von Mises"), 27 | c(round(x$kstest, digits = 4), round(x$cvmtest, digits = 4)), 28 | c(round(x$pvks, digits = 4), round(x$pvcvm, digits = 4))) 29 | 30 | colnames(body) <- header 31 | #----------------------------------------------------------------------------- 32 | #Output 33 | cat(" Call:\n") 34 | print(x$call) 35 | cat("\n Sant'Anna and Song (2019) specification tests for the propensity score:\n") 36 | print.matrix1(rbind(header, body)) 37 | cat(" -------------------------------------------------------------------------") 38 | cat("\n Weight function:", "\t \t", ww) 39 | cat("\n Number of boostrap draws:", "\t", x$argu$nboot) 40 | cat("\n Boostrap draws from", distt, 'distribution') 41 | 42 | } 43 | -------------------------------------------------------------------------------- /R/pstest.R: -------------------------------------------------------------------------------- 1 | #' pstest: Tests for the Propensity Score 2 | #' 3 | #' \emph{pstest} computes Kolmogorov-Smirnov and Cramer-von Mises type tests 4 | #' for the null hypothesis that a parametric model for the propensity score is 5 | #' is correctly specified. For details of the testing procedure, see 6 | #' Sant'Anna and Song (2019),'Specification Tests for the Propensity Score' . 7 | #' 8 | #'@param d a vector containing the binary treatment indicator. 9 | #'@param pscore a vector containing the estimated propensity scores. 10 | #'@param xpscore a matrix (or data frame) containing the covariates (and their 11 | #' transformations) included in the propensity score 12 | #' estimation. It should also include the constant term. 13 | #'@param model a description of the functional form (link function) used 14 | #' to estimated propensity score. The alternatives are: 15 | #' 'logit' (default), 'probit', and het.probit 16 | #' @param pscore.model in case you you set model="het.probit", pscore.model is the entire hetglm object. 17 | #' Default for pscore.model is NULL. 18 | #'@param w a description of which weight function the projection is based on. 19 | #' The alternatives are 'ind' (default), which set \eqn{w(q,u)=1(q<=u)}, 20 | #' 'exp', which set \eqn{w(q,u)=exp(qu)}, 'logistic', which set 21 | #' \eqn{w(q,u)=1/[1+exp(1-qu)]}, 'sin', which set \eqn{w(q,u)=sin(qu)}, and 22 | #' 'sincos', which set \eqn{w(q,u)=sin(qu)+cos(qu)}. 23 | #'@param dist a description of which distribution to use during the bootstrap. 24 | #' The alternatives are 'Mammen' (default), and 'Rademacher'. 25 | #'@param nboot number of bootstrap replicates to perform. Default is 1,000. 26 | #'@param cores number of cores to use during the bootstrap. Default is 1. 27 | #' If cores is greater than 1, the bootstrap is conducted using 28 | #' parLapply, instead of lapply type call. 29 | #'@param chunk a value that determine the size of each 'tile'. Such argument is used 30 | #' to split the original data into chunks, saving memory. 31 | #' Default value is 1,000. If the \emph{pstest} function throw a 32 | #' memory error, you should choose a smaller value for \emph{chunk}. 33 | #' 34 | #' 35 | #'@return a list containing the Kolmogorov-Smirnov and Cramer-von Mises test 36 | #' statistics for the null hypothesis of correctly specified propensity 37 | #' score model (kstest and cvmtest, respectively), and their associate 38 | #' bootstrapped p-values, pvks and pvcvm, respectively. All inputs are also 39 | #' returned. 40 | #' 41 | #'@references 42 | #' Sant'Anna, Pedro H. C, and Song, Xiaojun (2019), \emph{Specification Tests for the Propensity Score}, 43 | #' Journal of Econometrics, vol. 210 (2), p. 379-404, . 44 | #' 45 | #'@examples 46 | #' # Example based on simulation data 47 | #' # Simulate vector of covariates 48 | #' set.seed(1234) 49 | #' x1 <- runif(100) 50 | #' x2 <- rt(100, 5) 51 | #' x3 <- rpois(100, 3) 52 | #' # generate treatment status score based on Probit Specification 53 | #' treat <- (x1 + x2 + x3 >= rnorm(100, 4, 5)) 54 | #' # estimate correctly specified propensity score based on Probit 55 | #' pscore <- stats::glm(treat ~ x1 + x2 + x3, family = binomial(link = "probit"), 56 | #' x = TRUE) 57 | #' # Test the correct specification of estimated propensity score, using 58 | #' # the weight function 'ind', and bootstrap based on 'Mammen'. 59 | #' pstest(d = pscore$y, pscore = pscore$fit, xpscore = pscore$x, 60 | #' model = "probit", w = "ind", dist = "Mammen") 61 | #' # Alternatively, one can use the 'sin' weight function 62 | #' pstest(d = pscore$y, pscore = pscore$fit, xpscore = pscore$x, 63 | #' model = "probit", w = "sin", dist = "Mammen") 64 | #' 65 | #'@export 66 | #' 67 | #'@importFrom stats binomial rbinom runif glm 68 | #'@importFrom parallel makeCluster parLapply stopCluster nextRNGStream 69 | #'@importFrom glmx hetglm.fit 70 | #'@importFrom MASS ginv 71 | #------------------------------------------------------------------------------- 72 | pstest = function(d, pscore, xpscore, pscore.model = NULL, 73 | model = "logit", 74 | w = "ind", 75 | dist = "Mammen", 76 | nboot = 1000, cores = 1, chunk = 1000) { 77 | #----------------------------------------------------------------------------- 78 | # Define some underlying variables 79 | n <- length(d) 80 | xx <- as.matrix(xpscore) 81 | pscore.fit <- pscore 82 | uhat <- d - pscore.fit 83 | #----------------------------------------------------------------------------- 84 | # Run some tests 85 | if( !is.element(model,c("logit", "probit", "het.probit"))) { 86 | stop("model must be either 'logit', 'probit' or 'het.probit' ") 87 | } 88 | if( !is.element(dist,c("Mammen", "Rademacher"))) { 89 | stop("dist must be either 'Mammen', or 'Rademacher' ") 90 | } 91 | if( !is.element(w,c("ind", "exp", "logistic", "sin", "sincos"))) { 92 | stop("w must be either 'ind', 'exp', 'logistic', 'sin', or 'sincos' ") 93 | } 94 | #----------------------------------------------------------------------------- 95 | # #Define the score variables for the projection 96 | if (model == "logit") { 97 | g <- pscore.fit * (1 - pscore.fit) * xx 98 | } 99 | if (model == "probit") { 100 | beta.x <- stats::qnorm(pscore.fit) 101 | g <- stats::dnorm(beta.x) * xx 102 | rm(beta.x) 103 | } 104 | if (model == "het.probit") { 105 | if(is.null(pscore.model)){ 106 | stop(" You must provide the entire hetglm model if you are using het. probit") 107 | } 108 | if(!class(pscore.model)=="hetglm"){ 109 | stop(" pscore.model must be estimated using the hetglm function. See glmx package") 110 | } 111 | if(is.null(pscore.model$x$scale)){ 112 | stop(" You must include the option x=T in your glmx model") 113 | } 114 | 115 | pp <- pscore.model 116 | index.mean <- as.numeric(pp$x$mean %*% pp$coefficients$mean) 117 | index.scale <- as.numeric(pp$x$scale %*% (pp$coefficients$scale)) 118 | 119 | #beta.x <- stats::qnorm(pscore.fit) 120 | index <- index.mean * exp(-index.scale) 121 | g <- cbind(stats::dnorm(index) * exp(-index.scale) *pp$x$mean, 122 | - stats::dnorm(index)*index.mean*exp(-index.scale)*pp$x$scale) 123 | 124 | 125 | xx <- as.matrix(cbind(pp$x$mean,pp$x$scale)) 126 | #rm(pp,xx.scale,index.mean,index.scale,index ) 127 | } 128 | 129 | gg <- crossprod(g) 130 | #----------------------------------------------------------------------------- 131 | # Define variables to be used in the loop 132 | # Number of covariates 133 | k.dim = dim(xx)[2] 134 | 135 | # unique pscores 136 | #un.pscores <- unique(pscore.fit) 137 | un.pscores <- (pscore.fit) 138 | n.unique <- length(un.pscores) 139 | 140 | # Initialize `beta` matrix (K coefficients for each of n.unique responses) 141 | beta <- matrix(0, k.dim, n.unique) 142 | 143 | # Initialize `Rw` row vector (n.unique dimension) 144 | Rw <- matrix(0, 1, n.unique) 145 | 146 | # We split n columns into l tiles, each with chunk columns 147 | l <- floor(n.unique/chunk) + 1 148 | 149 | # Initialize the bootststrap test statistics vectors 150 | ksb1 <- matrix(0, nboot, 1) 151 | cvmb1 <- matrix(0, nboot, 1) 152 | #----------------------------------------------------------------------------- 153 | # Let's define some parameters for the bootstrap 154 | # Better to define these outside the loop that will follow. 155 | 156 | if (dist == "Mammen"){ 157 | # Use the Mammen(1993) binary V's 158 | k1 <- 0.5 * (1 - 5^0.5) 159 | k2 <- 0.5 * (1 + 5^0.5) 160 | pkappa <- 0.5 * (1 + 5^0.5)/(5^0.5) 161 | } 162 | 163 | if (dist == "Rademacher"){ 164 | # Use the Rademacher V's 165 | k1 <- 1 166 | k2 <- -1 167 | pkappa <- 0.5 168 | } 169 | 170 | # function for the bootstrap 171 | bootapply <- function(nn, n, pkappa, k1, k2, uhat, w1.temp, Seed) { 172 | # to make each run fully reproducible, we set the seed 173 | seed.run <- Seed[nn, ] 174 | set.seed(seed.run, "L'Ecuyer-CMRG") 175 | v <- stats::rbinom(n, 1, pkappa) 176 | v <- ifelse(v == 1, k1, k2) 177 | # Bootstrapped emprirical process 178 | Rwb <- colSums(uhat * v * w1.temp)/n 179 | # KS test 180 | ksb <- sqrt(n) * max(abs(Rwb)) 181 | # Cramer-von Mises test 182 | cvmb <- sum(Rwb^2) 183 | # Return both tests 184 | return(cbind(ksb, cvmb)) 185 | } 186 | #----------------------------------------------------------------------------- 187 | # Define seeds: Guarantee reproducibility 188 | ss <- floor(stats::runif(1) * 10000) 189 | seed.temp <- gather.ps(nboot, seed = ss) 190 | 191 | Seed <- matrix(nrow = nboot, ncol = 6) 192 | for (i in 1:nboot) { 193 | Seed[i, ] <- seed.temp[[i]][2:7] 194 | } 195 | #----------------------------------------------------------------------------- 196 | # If we are going to use paralell coding, initialize the cores 197 | if (cores > 1) { 198 | cl <- parallel::makeCluster(cores) 199 | } 200 | #----------------------------------------------------------------------------- 201 | # Start the loop to compute the tests (this is more memory efficient) 202 | # we do a loop for each weight function, to avoid loss in speed 203 | # indicator weight 204 | if (w == "ind"){ 205 | for (i in 1:l) { 206 | start <- min(chunk * (i - 1) + 1, n.unique) 207 | end <- min(chunk * i, n.unique) 208 | w.temp <- outer(pscore.fit, un.pscores[start:end], "<=") 209 | Gw <- crossprod(g, w.temp) 210 | 211 | beta[, start:end] <- MASS::ginv(crossprod(g)) %*% Gw 212 | #beta[, start:end] <- solve(gg, Gw) 213 | w1.temp <- (w.temp - g %*% beta[, start:end]) 214 | Rw[start:end] <- colSums(uhat * w1.temp)/n 215 | # Now the bootstrapped test in the chunk 216 | if (cores == 1) { 217 | boot.chunk <- lapply(1:nboot, bootapply, n, pkappa, k1, k2, 218 | uhat, w1.temp, Seed) 219 | } 220 | if (cores > 1) { 221 | boot.chunk <- parallel::parLapply(cl, 1:nboot, bootapply, n, 222 | pkappa, k1, k2, uhat, w1.temp, Seed) 223 | } 224 | # Put the Bootstrap resuls in a matrix 225 | boot.chunk <- t(matrix(unlist(boot.chunk), 2, nboot)) 226 | # Compute the KSb and CvMb over chunks 227 | if (1000 * (i - 1) + 1 <= n.unique) { 228 | ksb1 <- pmax(ksb1, boot.chunk[, 1]) 229 | cvmb1 <- cvmb1 + boot.chunk[, 2] 230 | } 231 | } 232 | } 233 | #----------------------------------------------------------------------------- 234 | # exponential weight 235 | if (w == "exp"){ 236 | for (i in 1:l) { 237 | start <- min(chunk * (i - 1) + 1, n.unique) 238 | end <- min(chunk * i, n.unique) 239 | w.temp <- tcrossprod(pscore.fit, un.pscores[start:end]) 240 | w.temp <- exp(w.temp) 241 | Gw <- crossprod(g, w.temp) 242 | beta[, start:end] <- solve(gg, Gw) 243 | w1.temp <- (w.temp - g %*% beta[, start:end]) 244 | Rw[start:end] <- colSums(uhat * w1.temp)/n 245 | # Now the bootstrapped test in the chunk 246 | if (cores == 1) { 247 | boot.chunk <- lapply(1:nboot, bootapply, n, pkappa, k1, k2, 248 | uhat, w1.temp, Seed) 249 | } 250 | if (cores > 1) { 251 | boot.chunk <- parallel::parLapply(cl, 1:nboot, bootapply, n, 252 | pkappa, k1, k2, uhat, w1.temp, Seed) 253 | } 254 | # Put the Bootstrap resuls in a matrix 255 | boot.chunk <- t(matrix(unlist(boot.chunk), 2, nboot)) 256 | # Compute the KSb and CvMb over chunks 257 | if (1000 * (i - 1) + 1 <= n.unique) { 258 | ksb1 <- pmax(ksb1, boot.chunk[, 1]) 259 | cvmb1 <- cvmb1 + boot.chunk[, 2] 260 | } 261 | } 262 | } 263 | #----------------------------------------------------------------------------- 264 | # Logistic weight 265 | if (w == "logistic"){ 266 | for (i in 1:l) { 267 | start <- min(chunk * (i - 1) + 1, n.unique) 268 | end <- min(chunk * i, n.unique) 269 | w.temp <- tcrossprod(pscore.fit, un.pscores[start:end]) 270 | w.temp <- 1/(1+exp(1-w.temp)) 271 | Gw <- crossprod(g, w.temp) 272 | beta[, start:end] <- solve(gg, Gw) 273 | w1.temp <- (w.temp - g %*% beta[, start:end]) 274 | Rw[start:end] <- colSums(uhat * w1.temp)/n 275 | # Now the bootstrapped test in the chunk 276 | if (cores == 1) { 277 | boot.chunk <- lapply(1:nboot, bootapply, n, pkappa, k1, k2, 278 | uhat, w1.temp, Seed) 279 | } 280 | if (cores > 1) { 281 | boot.chunk <- parallel::parLapply(cl, 1:nboot, bootapply, n, 282 | pkappa, k1, k2, uhat, w1.temp, Seed) 283 | } 284 | # Put the Bootstrap resuls in a matrix 285 | boot.chunk <- t(matrix(unlist(boot.chunk), 2, nboot)) 286 | # Compute the KSb and CvMb over chunks 287 | if (1000 * (i - 1) + 1 <= n.unique) { 288 | ksb1 <- pmax(ksb1, boot.chunk[, 1]) 289 | cvmb1 <- cvmb1 + boot.chunk[, 2] 290 | } 291 | } 292 | } 293 | #----------------------------------------------------------------------------- 294 | # Sine weight 295 | if (w == "sin"){ 296 | for (i in 1:l) { 297 | start <- min(chunk * (i - 1) + 1, n.unique) 298 | end <- min(chunk * i, n.unique) 299 | w.temp <- tcrossprod(pscore.fit, un.pscores[start:end]) 300 | w.temp <- sin(w.temp) 301 | Gw <- crossprod(g, w.temp) 302 | beta[, start:end] <- solve(gg, Gw) 303 | w1.temp <- (w.temp - g %*% beta[, start:end]) 304 | Rw[start:end] <- colSums(uhat * w1.temp)/n 305 | # Now the bootstrapped test in the chunk 306 | if (cores == 1) { 307 | boot.chunk <- lapply(1:nboot, bootapply, n, pkappa, k1, k2, 308 | uhat, w1.temp, Seed) 309 | } 310 | if (cores > 1) { 311 | boot.chunk <- parallel::parLapply(cl, 1:nboot, bootapply, n, 312 | pkappa, k1, k2, uhat, w1.temp, Seed) 313 | } 314 | # Put the Bootstrap resuls in a matrix 315 | boot.chunk <- t(matrix(unlist(boot.chunk), 2, nboot)) 316 | # Compute the KSb and CvMb over chunks 317 | if (1000 * (i - 1) + 1 <= n.unique) { 318 | ksb1 <- pmax(ksb1, boot.chunk[, 1]) 319 | cvmb1 <- cvmb1 + boot.chunk[, 2] 320 | } 321 | } 322 | } 323 | #----------------------------------------------------------------------------- 324 | # Sine and cosine weight 325 | if (w == "sincos"){ 326 | for (i in 1:l) { 327 | start <- min(chunk * (i - 1) + 1, n.unique) 328 | end <- min(chunk * i, n.unique) 329 | w.temp <- tcrossprod(pscore.fit, un.pscores[start:end]) 330 | w.temp <- sin(w.temp)+cos(w.temp) 331 | Gw <- crossprod(g, w.temp) 332 | beta[, start:end] <- solve(gg, Gw) 333 | w1.temp <- (w.temp - g %*% beta[, start:end]) 334 | Rw[start:end] <- colSums(uhat * w1.temp)/n 335 | # Now the bootstrapped test in the chunk 336 | if (cores == 1) { 337 | boot.chunk <- lapply(1:nboot, bootapply, n, pkappa, k1, k2, 338 | uhat, w1.temp, Seed) 339 | } 340 | if (cores > 1) { 341 | boot.chunk <- parallel::parLapply(cl, 1:nboot, bootapply, n, 342 | pkappa, k1, k2, uhat, w1.temp, Seed) 343 | } 344 | # Put the Bootstrap resuls in a matrix 345 | boot.chunk <- t(matrix(unlist(boot.chunk), 2, nboot)) 346 | # Compute the KSb and CvMb over chunks 347 | if (1000 * (i - 1) + 1 <= n.unique) { 348 | ksb1 <- pmax(ksb1, boot.chunk[, 1]) 349 | cvmb1 <- cvmb1 + boot.chunk[, 2] 350 | } 351 | } 352 | } 353 | #----------------------------------------------------------------------------- 354 | # close the clusters, if we used paralell 355 | if (cores > 1) { 356 | parallel::stopCluster(cl) 357 | } 358 | #----------------------------------------------------------------------------- 359 | # Compute our test statistics 360 | cvmtest1 <- sum(Rw^2) 361 | kstest1 <- sqrt(n) * max(abs(Rw)) 362 | #----------------------------------------------------------------------------- 363 | # Put the bootstrap tests in a matrix 364 | boottest <- matrix(0, nboot, 2) 365 | boottest[, 1] <- ksb1 366 | boottest[, 2] <- cvmb1 367 | #----------------------------------------------------------------------------- 368 | # Name the Columns 369 | colnames(boottest) <- c("ksb", "cvmb") 370 | #----------------------------------------------------------------------------- 371 | # compute the Bootstrap P-value 372 | pvksb <- sum((boottest[, 1] > kstest1))/nboot 373 | pvcvmb <- sum((boottest[, 2] > cvmtest1))/nboot 374 | #--------------------------------------------------------------------- 375 | # record the call 376 | call.param <- match.call() 377 | # Record all arguments used in the function 378 | argu <- mget(names(formals()),sys.frame(sys.nframe())) 379 | argu <- list(model = argu$model, w = argu$w, dist = argu$dist, nboot = argu$nboot ) 380 | # Return these variables 381 | ret <- list(kstest = kstest1, cvmtest = cvmtest1, pvks = pvksb, pvcvm = pvcvmb, 382 | call.param = call.param, argu = argu) 383 | # Define a new class 384 | class(ret) <- "pstest" 385 | # return the list 386 | return(ret) 387 | } 388 | -------------------------------------------------------------------------------- /R/pstest.package.R: -------------------------------------------------------------------------------- 1 | #' pstest: An R Package for assessing the goodness-of-fit of parametric propensity score models. 2 | #' 3 | #'@description 4 | #' The propensity score is one of the most widely used tools in studying the causal effect 5 | #' of a treatment, intervention, or policy. Given that the propensity score is usually unknown, 6 | #' it has to be estimated, implying that the reliability of many treatment effect estimators depends 7 | #' on the correct specification of the (parametric) propensity score. This package provides 8 | #' data-driven nonparametric diagnostic tools for detecting propensity score misspecification. 9 | #' 10 | #' 11 | #'@details 12 | #' This R package implements the class of specification test for the propensity score 13 | #' proposed in Sant'Anna and Song (2019), `Specification Tests for the Propensity Score', . 14 | #' 15 | #' In short, this package implements Kolmogorov-Smirnov and Cramer-von Mises type tests 16 | #' for parametric propensity score models with either logistic ('logit'), or 17 | #' normal ('probit') link function. Critical values are computed with the assistance of a 18 | #' multiplier bootstrap. 19 | #' 20 | #' The tests are based on the integrated conditional moment approach, where the weight function 21 | #' used is based on an orthogonal projection onto the tangent space of nuisance parameters. 22 | #' As a result, the tests (a) enjoy improved power properties, (b) do not suffer from the 23 | #' 'curse of dimensionality' when the vector of covariates is of high-dimensionality, 24 | #' (c) are fully data-driven, (e) do not require tuning parameters such as bandwidths, and 25 | #' (e) are able to detect a broad class of local alternatives converging to the null at the 26 | #' parametric rate. These appealing features highlight that the tests can be of great use 27 | #' in practice. 28 | #' 29 | #' It is worth stressing that this package implements in a unified manner a large class of 30 | #' specification tests, depending on the chosen weight function \eqn{w(q,u)}: 31 | #' \itemize{ 32 | #' \item{`ind' - the indicator weight function \eqn{w(q,u)=1(q \le u)}. This is the default.} 33 | #' \item{`exp' - the exponential weight function \eqn{w(q,u)=exp(qu)}.} 34 | #' \item{`logistic' - the logistic weight function \eqn{w(q,u)=1/[1+exp(1-qu)]}.} 35 | #' \item{`sin' - the sine weight function \eqn{w(q,u)=sin(qu)}.} 36 | #' \item{`sincos' - the sine and cosine weight function \eqn{w(q,u)=sin(qu)+cos(qu)}.} 37 | #' } 38 | #' 39 | #'Different weight functions \eqn{w(q,u)} have different power properties, and therefore, 40 | #'being able to choose different \eqn{w(q,u)} gives us flexibility to direct power in desired 41 | #'directions. 42 | #' 43 | #' @references 44 | #' Sant'Anna, Pedro H. C, and Song, Xiaojun (2019), \emph{Specification Tests for the Propensity Score}, 45 | #' Journal of Econometrics 210 (2), pp. 379-404, . 46 | #' 47 | #'@docType package 48 | #'@name pstest-package 49 | NULL 50 | -------------------------------------------------------------------------------- /R/summary.pstest.R: -------------------------------------------------------------------------------- 1 | #' @title Summary 2 | #' 3 | #' @description Summary of a pstest object 4 | #' 5 | #' @param object A pstest object 6 | #' @param ... Other params (required as generic function, but not used) 7 | #' 8 | #' @export 9 | #' @noRd 10 | # Define new summary function 11 | summary.pstest <- function(object, ...){ 12 | pstest.obj <- object 13 | print(pstest.obj) 14 | 15 | } 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pstest: An R Package to assess the goodness-of-fit of parametric propensity score models. 2 | 3 | ## Overview 4 | The propensity score is one of the most widely used tools in studying the causal effect 5 | of a treatment, intervention, or policy. Given that the propensity score is usually unknown, 6 | it has to be estimated, implying that the reliability of many treatment effect estimators depends 7 | on the correct specification of the (parametric) propensity score. This package provides 8 | data-driven nonparametric diagnostic tools for detecting propensity score misspecification. 9 | 10 | In short, this package implements the class of specification test for the propensity score 11 | proposed in Sant'Anna and Song (2019), [Specification Tests for the Propensity Score](https://www.sciencedirect.com/science/article/pii/S0304407619300272). The package accommodates Logit and Probit propensity score specifications. Critical values are computed with the assistance of a multiplier bootstrap. 12 | 13 | The tests are based on the integrated conditional moment approach, where the weight function 14 | used is based on an orthogonal projection onto the tangent space of nuisance parameters. 15 | As a result, the tests (a) enjoy improved power properties, (b) do not suffer from the 16 | 'curse of dimensionality' when the vector of covariates is of high-dimensionality, 17 | (c) are fully data-driven, (e) do not require tuning parameters such as bandwidths, and 18 | (e) are able to detect a broad class of local alternatives converging to the null at the 19 | parametric rate. These appealing features highlight that the tests can be of great use 20 | in practice. 21 | 22 | It is worth stressing that the `pstest` package implements in a unified manner a large class of specification tests, depending on the chosen weight function $w(q,u)$. Current choices are: 23 | 24 | * `ind` - the indicator weight function $w(q,u) = 1(q \leq u)$. This is the default. 25 | * `exp` - the exponential weight function $w(q,u) = exp(qu)$. 26 | * `logistic` - the logistic weight function $w(q,u) = \frac{1}{1+exp(1-qu)}$. 27 | * `sin` - the sine weight function $w(q,u) = sin(qu)$. 28 | * `sincos` - the sine and cosine weight function $w(q,u) = sin(qu) + cos(qu)$. 29 | 30 | Different weight functions $w(q,u)$ have different power properties. Thus, being able to choose between different $w(q,u)$ gives one the flexibility to direct power in alternative directions. 31 | 32 | For further details, please see the paper Sant'Anna and Song (2019), [Specification Tests for the Propensity Score](https://www.sciencedirect.com/science/article/pii/S0304407619300272), or the [working paper version](https://papers.ssrn.com/abstract=2872084). 33 | 34 | ## Installing pstest 35 | This github website hosts the source code. The difference between what is here what is in CRAN is that here we always have the most updated version of the package. 36 | 37 | To install the `pstest` package, you have two options: (a) install the CRAN version, or (b) instal the GitHub (most updated - RECOMMENDED) version. 38 | 39 | You can install the package from CRAN with `install.packages("pstest")`. 40 | 41 | Alternatively, you can install the most recent version of the `pstest` package from GitHub (this is what we recommend): 42 | 43 | ```r 44 | #library(remotes) 45 | remotes::install_github("pedrohcgs/pstest") 46 | ``` 47 | 48 | ## Authors 49 | 50 | Pedro H. C. Sant'Anna, Vanderbilt University, Nashville, TN. E-mail: pedro.h.santanna [at] vanderbilt [dot] edu. 51 | 52 | Xiaojun Song, Peking University, Beijing, China. E-mail: sxj [at] gsm [dot] pku [dot] edu [dot] cn. 53 | 54 | In case you have questions, please do not hesitate to contact us. 55 | 56 | -------------------------------------------------------------------------------- /man/pstest-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pstest.package.R 3 | \docType{package} 4 | \name{pstest-package} 5 | \alias{pstest-package} 6 | \title{pstest: An R Package for assessing the goodness-of-fit of parametric propensity score models.} 7 | \description{ 8 | The propensity score is one of the most widely used tools in studying the causal effect 9 | of a treatment, intervention, or policy. Given that the propensity score is usually unknown, 10 | it has to be estimated, implying that the reliability of many treatment effect estimators depends 11 | on the correct specification of the (parametric) propensity score. This package provides 12 | data-driven nonparametric diagnostic tools for detecting propensity score misspecification. 13 | } 14 | \details{ 15 | This R package implements the class of specification test for the propensity score 16 | proposed in Sant'Anna and Song (2016), `Specification Tests for the Propensity Score', 17 | available at Pedro H.C. Sant'Anna webpage, \url{http://sites.google.com/site/pedrohcsantanna/}. 18 | 19 | In short, this package implements Kolmogorov-Smirnov and Cramer-von Mises type tests 20 | for parametric propensity score models with either logistic ('logit'), or 21 | normal ('probit') link function. Critical values are computed with the assistance of a 22 | multiplier bootstrap. 23 | 24 | The tests are based on the integrated conditional moment approach, where the weight function 25 | used is based on an orthogonal projection onto the tangent space of nuisance parameters. 26 | As a result, the tests (a) enjoy improved power properties, (b) do not suffer from the 27 | 'curse of dimensionality' when the vector of covariates is of high-dimensionality, 28 | (c) are fully data-driven, (e) do not require tuning parameters such as bandwidths, and 29 | (e) are able to detect a broad class of local alternatives converging to the null at the 30 | parametric rate. These appealing features highlight that the tests can be of great use 31 | in practice. 32 | 33 | It is worth stressing that this package implements in a unified manner a large class of 34 | specification tests, depending on the chosen weight function \eqn{w(q,u)}: 35 | \itemize{ 36 | \item{`ind' - the indicator weight function \eqn{w(q,u)=1(q \le u)}. This is the default.} 37 | \item{`exp' - the exponential weight function \eqn{w(q,u)=exp(qu)}.} 38 | \item{`logistic' - the logistic weight function \eqn{w(q,u)=1/[1+exp(1-qu)]}.} 39 | \item{`sin' - the sine weight function \eqn{w(q,u)=sin(qu)}.} 40 | \item{`sincos' - the sine and cosine weight function \eqn{w(q,u)=sin(qu)+cos(qu)}.} 41 | } 42 | 43 | Different weight functions \eqn{w(q,u)} have different power properties, and therefore, 44 | being able to choose different \eqn{w(q,u)} gives us flexibility to direct power in desired 45 | directions. 46 | } 47 | \references{ 48 | Sant'Anna, Pedro H. C, and Song, Xiaojun (2016), \emph{Specification Tests for the Propensity Score}, 49 | available at \url{http://sites.google.com/site/pedrohcsantanna/}. 50 | } 51 | -------------------------------------------------------------------------------- /man/pstest.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pstest.R 3 | \name{pstest} 4 | \alias{pstest} 5 | \title{pstest: Tests for the Propensity Score} 6 | \usage{ 7 | pstest(d, pscore, xpscore, pscore.model = NULL, model = "logit", 8 | w = "ind", dist = "Mammen", nboot = 1000, cores = 1, 9 | chunk = 1000) 10 | } 11 | \arguments{ 12 | \item{d}{a vector containing the binary treatment indicator.} 13 | 14 | \item{pscore}{a vector containing the estimated propensity scores.} 15 | 16 | \item{xpscore}{a matrix (or data frame) containing the covariates (and their 17 | transformations) included in the propensity score 18 | estimation. It should also include the constant term.} 19 | 20 | \item{pscore.model}{in case you you set model="het.probit", pscore.model is the entire hetglm object. 21 | Default for pscore.model is NULL.} 22 | 23 | \item{model}{a description of the functional form (link function) used 24 | to estimated propensity score. The alternatives are: 25 | 'logit' (default), 'probit', and het.probit} 26 | 27 | \item{w}{a description of which weight function the projection is based on. 28 | The alternatives are 'ind' (default), which set \eqn{w(q,u)=1(q<=u)}, 29 | 'exp', which set \eqn{w(q,u)=exp(qu)}, 'logistic', which set 30 | \eqn{w(q,u)=1/[1+exp(1-qu)]}, 'sin', which set \eqn{w(q,u)=sin(qu)}, and 31 | 'sincos', which set \eqn{w(q,u)=sin(qu)+cos(qu)}.} 32 | 33 | \item{dist}{a description of which distribution to use during the bootstrap. 34 | The alternatives are 'Mammen' (default), and 'Rademacher'.} 35 | 36 | \item{nboot}{number of bootstrap replicates to perform. Default is 1,000.} 37 | 38 | \item{cores}{number of cores to use during the bootstrap. Default is 1. 39 | If cores is greater than 1, the bootstrap is conducted using 40 | parLapply, instead of lapply type call.} 41 | 42 | \item{chunk}{a value that determine the size of each 'tile'. Such argument is used 43 | to split the original data into chunks, saving memory. 44 | Default value is 1,000. If the \emph{pstest} function throw a 45 | memory error, you should choose a smaller value for \emph{chunk}.} 46 | } 47 | \value{ 48 | a list containing the Kolmogorov-Smirnov and Cramer-von Mises test 49 | statistics for the null hypothesis of correctly specified propensity 50 | score model (kstest and cvmtest, respectively), and their associate 51 | bootstrapped p-values, pvks and pvcvm, respectively. All inputs are also 52 | returned. 53 | } 54 | \description{ 55 | \emph{pstest} computes Kolmogorov-Smirnov and Cramer-von Mises type tests 56 | for the null hypothesis that a parametric model for the propensity score is 57 | is correctly specified. For details of the testing procedure, see 58 | Sant'Anna and Song (2016),'Specification Tests for the Propensity Score'. 59 | } 60 | \examples{ 61 | # Example based on simulation data 62 | # Simulate vector of covariates 63 | set.seed(1234) 64 | x1 <- runif(100) 65 | x2 <- rt(100, 5) 66 | x3 <- rpois(100, 3) 67 | # generate treatment status score based on Probit Specification 68 | treat <- (x1 + x2 + x3 >= rnorm(100, 4, 5)) 69 | # estimate correctly specified propensity score based on Probit 70 | pscore <- stats::glm(treat ~ x1 + x2 + x3, family = binomial(link = "probit"), 71 | x = TRUE) 72 | # Test the correct specification of estimated propensity score, using 73 | # the weight function 'ind', and bootstrap based on 'Mammen'. 74 | pstest(d = pscore$y, pscore = pscore$fit, xpscore = pscore$x, 75 | model = "probit", w = "ind", dist = "Mammen") 76 | # Alternatively, one can use the 'sin' weight function 77 | pstest(d = pscore$y, pscore = pscore$fit, xpscore = pscore$x, 78 | model = "probit", w = "sin", dist = "Mammen") 79 | 80 | } 81 | \references{ 82 | Sant'Anna, Pedro H. C, and Song, Xiaojun (2019), \emph{Specification Tests for the Propensity Score}, 83 | Journal of Econometrics, vol. 210 (2), p. 379-404. 84 | } 85 | --------------------------------------------------------------------------------