├── .DS_Store ├── .Rbuildignore ├── .Rbuildignore~ ├── .gitignore ├── .travis.yml ├── DESCRIPTION ├── NAMESPACE ├── NEWS.md ├── R ├── .DS_Store ├── .Rapp.history ├── RcppExports.R ├── ebalance.R ├── functions.R └── lalondedata.R ├── README.Rmd ├── README.md ├── appveyor.yml ├── data └── lalonde.rda ├── kpop.Rproj ├── man ├── .Rapp.history ├── b_maxvarK.Rd ├── biasbound.Rd ├── dimw.Rd ├── drop_multicollin.Rd ├── ebalance_custom.Rd ├── figures │ └── README-pressure-1.png ├── getdist.Rd ├── getw.Rd ├── kbal.Rd ├── lalonde.Rd ├── makeK.Rd └── one_hot.Rd ├── src ├── .DS_Store ├── RcppExports.cpp ├── RcppExports.o ├── kbal.dll ├── kernel_parallel.cpp └── kernel_parallel.o ├── tests └── testthat │ ├── test_b_maxvarK.R │ ├── test_biasbound.R │ ├── test_dimw.R │ ├── test_drop_multicollin.R │ ├── test_getdist.R │ ├── test_getw.R │ ├── test_kbal.R │ ├── test_makeK.R │ ├── test_one_hot.R │ └── testthat.R └── tools ├── cran-comments.md └── example.R /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chadhazlett/KBAL/6785483f6c2a30ed4f7530ad50e0907912f19c6e/.DS_Store -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^README.md 4 | ^README\.Rmd$ 5 | ^appveyor\.yml$ 6 | ^\.travis\.yml$ 7 | -------------------------------------------------------------------------------- /.Rbuildignore~: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^README.md 4 | 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | -------------------------------------------------------------------------------- /.travis.yml: 
-------------------------------------------------------------------------------- 1 | # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r 2 | 3 | language: R 4 | cache: packages 5 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: kbal 2 | Type: Package 3 | Title: Kernel Balancing 4 | Version: 0.1.1 5 | Date: 2024-09-10 6 | Authors@R: c(person("Chad", "Hazlett", email = "chazlett@ucla.edu", role = c("aut", "cre")), 7 | person("Ciara", "Sterbenz", email = "cster@g.ucla.edu", role = "aut"), 8 | person("Erin", "Hartman", email = "ekhartman@ucla.edu", role = "ctb"), 9 | person("Alex", "Kravetz", email = "alexdkravetz@gmail.com", role = "ctb"), 10 | person("Borna", "Bateni", email = "borna@ucla.edu", role = "aut")) 11 | Description: The "kbal" package provides a weighting approach that employs kernels to make one group have a similar distribution to another group on covariates, not only in terms of means or marginal distributions, but also on higher order transformations implied by the choice of kernel. The package is applicable to both treatment effect estimation and survey reweighting problems. Based on Hazlett, C. (2020) "Kernel Balancing: A flexible non-parametric weighting procedure for estimating causal effects." Statistica Sinica.
12 | URL: https://github.com/chadhazlett/kbal 13 | License: GPL (>=2) 14 | LazyData: TRUE 15 | LazyLoad: yes 16 | Depends: 17 | R (>= 3.5.0) 18 | Imports: 19 | Rcpp (>= 0.11.0), 20 | RcppParallel (>= 4.4.4), 21 | dplyr, 22 | RSpectra 23 | LinkingTo: Rcpp, RcppParallel 24 | Maintainer: Borna Bateni 25 | RoxygenNote: 7.2.3 26 | Encoding: UTF-8 27 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(b_maxvarK) 4 | export(biasbound) 5 | export(dimw) 6 | export(drop_multicollin) 7 | export(ebalance_custom) 8 | export(getdist) 9 | export(getw) 10 | export(kbal) 11 | export(makeK) 12 | export(one_hot) 13 | importFrom(Rcpp,sourceCpp) 14 | importFrom(RcppParallel,RcppParallelLibs) 15 | importFrom(dplyr,'%>%') 16 | importFrom(dplyr,filter) 17 | importFrom(dplyr,group_by) 18 | importFrom(dplyr,n) 19 | importFrom(dplyr,pull) 20 | importFrom(dplyr,summarise) 21 | importFrom(stats,contrasts) 22 | importFrom(stats,cor) 23 | importFrom(stats,model.matrix) 24 | importFrom(stats,na.omit) 25 | importFrom(stats,optimize) 26 | importFrom(stats,sd) 27 | importFrom(stats,var) 28 | useDynLib(kbal) 29 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # kbal Package News 2 | 3 | ## Version 0.1.1 (2024-09-10) 4 | - Initial submission to CRAN. 5 | - Updated dependencies to require R version 3.5.0 or higher. 6 | - Fixed various documentation mismatches and notes. 
7 | -------------------------------------------------------------------------------- /R/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chadhazlett/KBAL/6785483f6c2a30ed4f7530ad50e0907912f19c6e/R/.DS_Store -------------------------------------------------------------------------------- /R/.Rapp.history: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chadhazlett/KBAL/6785483f6c2a30ed4f7530ad50e0907912f19c6e/R/.Rapp.history -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | kernel_parallel <- function(X, b) { 5 | .Call('_kbal_kernel_parallel', PACKAGE = 'kbal', X, b) 6 | } 7 | 8 | kernel_parallel_2 <- function(X, Y, b) { 9 | .Call('_kbal_kernel_parallel_2', PACKAGE = 'kbal', X, Y, b) 10 | } 11 | 12 | kernel_parallel_old <- function(X, Y, b) { 13 | .Call('_kbal_kernel_parallel_old', PACKAGE = 'kbal', X, Y, b) 14 | } 15 | 16 | -------------------------------------------------------------------------------- /R/ebalance.R: -------------------------------------------------------------------------------- 1 | #' Modified version of ebalance (originally from Jens Hainmueller) 2 | #' @description This is a custom version of the \code{ebal} (entropy balancing) package by Jens Hainmueller. Chooses weights on controls to make 3 | #' covariate means equal to those of treated. This version differs from \code{ebal} only in that it handles 4 | #' cases where there is only a single unit, which otherwise causes a problem in the original code. 
5 | #' @param Treatment a numeric vector of length equal to the total number of units where treated (population) units take a value of 1 and control (sampled) units take a value of 0. 6 | #' @param X matrix of data where rows are observations and columns are covariates. 7 | #' @param base.weight an optional numeric vector argument of length equal to the total number of control units to specify the base weight of each control unit within entropy balancing. Default is even weights (1) for all controls. 8 | #' @param norm.constant an optional numeric argument; users should leave unspecified in most cases. 9 | #' @param coefs an optional vector argument of length equal to one more than the number of covariates in \code{X}; users should leave unspecified in most cases. 10 | #' @param max.iterations numeric maximum number of iterations to use when searching for weights 11 | #' @param constraint.tolerance numeric tolerance level. 12 | #' @param print.level a numeric argument to specify the amount of information printed out. 0 is silent, 1 prints convergence status, 2 prints maximum deviance per iteration, 3 prints loss and step length. 13 | #' @importFrom stats var optimize 14 | #' @return \item{target.margins}{Column sums of \code{X} among the treated units.} 15 | #' \item{co.xdata}{Covariate matrix for the controls only built from \code{X} with an additional appended column of ones.} 16 | #' \item{w}{weights found using ebalance. Note that treated units all receive flat weights of 1} 17 | #' \item{maxdiff}{absolute value of the largest component of the gradient in the last iteration.} 18 | #' \item{norm.constant}{norm constant used} 19 | #' \item{constraint.tolerance}{tolerance used to evaluate convergence} 20 | #' \item{max.iterations}{max iterations used} 21 | #' \item{base.weight}{base weights used} 22 | #' \item{print.level}{print level used} 23 | #' \item{converged}{Convergence status. 
If ebalance failed to find weights within the specified \code{constraint.tolerance} after \code{max.iterations} this is \code{FALSE}. Note that even if ebalance does not converge, the last iteration's weights \code{w} are returned.} 24 | #' @export 25 | 26 | ebalance_custom <- 27 | function( 28 | Treatment, 29 | X, 30 | base.weight = NULL, 31 | norm.constant = NULL, 32 | coefs = NULL , 33 | max.iterations = 200, 34 | constraint.tolerance = 1e-3, 35 | print.level=0 36 | ){ 37 | 38 | # Checks 39 | if (sum(Treatment != 1 & Treatment != 0) > 0) { 40 | stop("Treatment indicator ('Treatment') must be a logical variable, TRUE (1) or FALSE (0)") 41 | } 42 | if (var(Treatment) == 0) { 43 | stop("Treatment indicator ('Treatment') must contain both treatment and control observations") 44 | } 45 | 46 | Treatment <- as.numeric(Treatment) 47 | X <- as.matrix(X) 48 | 49 | if (sum(is.na(X))>0){ 50 | stop("X contains missing data") 51 | } 52 | 53 | if (sum(is.na(Treatment))>0){ 54 | stop("Treatment contains missing data") 55 | } 56 | 57 | if (length(Treatment) != nrow(X)) { 58 | stop("length(Treatment) != nrow(X)") 59 | } 60 | 61 | if (length(max.iterations) != 1 ) { 62 | stop("length(max.iterations) != 1") 63 | } 64 | if (length(constraint.tolerance) != 1 ) { 65 | stop("length(constraint.tolerance) != 1") 66 | } 67 | 68 | # set up elements 69 | ntreated <- sum(Treatment==1) 70 | ncontrols <- sum(Treatment==0) 71 | 72 | if (is.null(base.weight)) { 73 | base.weight = rep(1, ncontrols) 74 | } 75 | if ( length(base.weight) != ncontrols) { 76 | stop("length(base.weight) != number of controls sum(Treatment==0)") 77 | } 78 | 79 | co.x <- X[Treatment==0,] 80 | co.x <- cbind(rep(1,ncontrols),co.x) 81 | 82 | if(qr(co.x)$rank != ncol(co.x)){ 83 | stop("collinearity in covariate matrix for controls (remove collinear covariates)") 84 | } 85 | 86 | 87 | tr.total <- apply(as.matrix(X[Treatment==1,,drop=FALSE]),2,sum) 88 | 89 | if (is.null(norm.constant)) { 90 | norm.constant <- ntreated 91 | } 
92 | if (length(norm.constant) != 1) { 93 | stop("length(norm.constant) != 1") 94 | } 95 | 96 | tr.total <- c(norm.constant,tr.total) 97 | 98 | if(is.null(coefs)) { 99 | coefs = c(log(tr.total[1]/sum(base.weight)),rep(0,(ncol(co.x)-1))) 100 | } 101 | 102 | if(length(coefs) != ncol(co.x)) { 103 | stop("coefs needs to have same length as number of covariates plus one") 104 | } 105 | 106 | ## run algo 107 | eb.out <- eb(tr.total=tr.total, 108 | co.x=co.x, 109 | coefs=coefs, 110 | base.weight=base.weight, 111 | max.iterations=max.iterations, 112 | constraint.tolerance=constraint.tolerance, 113 | print.level=print.level 114 | ) 115 | 116 | #redundant because eb() run above will print this out if print.level >=1 117 | # if(eb.out$converged == TRUE & print.level>=1) { 118 | # cat("Converged within tolerance \n") 119 | # } 120 | 121 | z <- list( 122 | target.margins = tr.total, 123 | co.xdata = co.x, 124 | w=eb.out$Weights.ebal, 125 | coefs=eb.out$coefs, 126 | maxdiff=eb.out$maxdiff, 127 | norm.constant = norm.constant, 128 | constraint.tolerance=constraint.tolerance, 129 | max.iterations=max.iterations, 130 | base.weight=base.weight, 131 | print.level=print.level, 132 | converged=eb.out$converged 133 | ) 134 | 135 | class(z) <- "ebalance" 136 | return(z) 137 | 138 | } 139 | 140 | 141 | 142 | eb <- function( 143 | tr.total=tr.total, 144 | co.x=co.x, 145 | coefs=coefs, 146 | base.weight=base.weight, 147 | max.iterations=max.iterations, 148 | constraint.tolerance=constraint.tolerance, 149 | print.level=print.level 150 | ) { 151 | 152 | converged <- FALSE 153 | for(iter in 1:max.iterations) { 154 | weights.temp <- c(exp(co.x %*% coefs)) 155 | weights.ebal <- weights.temp * base.weight 156 | co.x.agg <- c(weights.ebal %*% co.x) 157 | gradient <- co.x.agg - tr.total 158 | if(max(abs(gradient))=2){ cat("Iteration",iter,"maximum deviation is =",format(max(abs(gradient)),digits=4),"\n") } 163 | hessian = t(co.x) %*% (weights.ebal * co.x) 164 | Coefs <- coefs 165 | newton <- 
solve(hessian,gradient) 166 | coefs <- coefs - newton 167 | loss.new <- line.searcher(Base.weight=base.weight,Co.x=co.x,Tr.total=tr.total,coefs=coefs,Newton=newton,ss=1) 168 | loss.old <- line.searcher(Base.weight=base.weight,Co.x=co.x,Tr.total=tr.total,coefs=Coefs,Newton=newton,ss=0) 169 | if(print.level>=3){cat("new loss",loss.new,"old loss=",loss.old,"\n")} 170 | 171 | if (is.na(loss.new)== FALSE && is.na(loss.old)==FALSE) { 172 | if(loss.old <= loss.new){ 173 | ss.out <- suppressWarnings(optimize(line.searcher, 174 | lower=.00001,upper=1,maximum=FALSE, 175 | Base.weight=base.weight,Co.x=co.x, 176 | Tr.total=tr.total,coefs=Coefs,Newton=newton)) 177 | 178 | if(print.level>=3){cat("LS Step Length is ",ss.out$minimum,"\n")} 179 | if(print.level>=3){cat("Loss is",ss.out$objective,"\n")} 180 | coefs = Coefs - ss.out$minimum*solve(hessian,gradient) 181 | } 182 | 183 | } 184 | 185 | 186 | } 187 | if(print.level>=1 && converged){cat("Converged within tolerance \n")} 188 | return( 189 | list( 190 | maxdiff=max(abs(gradient)), 191 | coefs=coefs, 192 | Weights.ebal=weights.ebal, 193 | converged=converged 194 | ) 195 | ) 196 | } 197 | 198 | 199 | 200 | 201 | # function to conduct line search for optimal step length 202 | line.searcher <- function( 203 | Base.weight, 204 | Co.x, 205 | Tr.total, 206 | coefs, 207 | Newton, 208 | ss) 209 | { 210 | weights.temp <- c(exp(Co.x %*% (coefs - (ss * Newton) ))) 211 | #weights.temp[is.infinite(weights.temp)] <- 100 212 | weights.temp <- weights.temp * Base.weight 213 | Co.x.agg <- c(weights.temp %*% Co.x) 214 | maxdiff <- max(abs(Co.x.agg-Tr.total)) 215 | return(maxdiff) 216 | } -------------------------------------------------------------------------------- /R/lalondedata.R: -------------------------------------------------------------------------------- 1 | #' Data from National Supported Work program and Panel Study in Income Dynamics 2 | #' @description 3 | #' Dehejia and Wahba (1999) sample of data from Lalonde (1986). 
4 | #' This data set includes 185 treated units from the National 5 | #' Supported Work (NSW) program, paired with 2490 control units 6 | #' drawn from the Panel Study of Income Dynamics (PSID-1). 7 | #' 8 | #' The treatment variable of interest is \code{nsw}, which indicates that an individual 9 | #' was in the job training program. The main outcome of interest is 10 | #' real earnings in 1978 (\code{re78}). The remaining variables are characteristics 11 | #' of the individuals, to be used as controls. 12 | #' 13 | #' @format A data frame with 2675 rows and 14 columns. 14 | #' \describe{ 15 | #' \item{nsw}{treatment indicator: participation in the National Supported Work program.} 16 | #' \item{re78}{real earnings in 1978 (outcome)} 17 | #' \item{u78}{unemployed in 1978; actually an indicator for zero income in 1978} 18 | #' \item{age}{age in years} 19 | #' \item{black}{indicator for identifying as black} 20 | #' \item{hisp}{indicator for identifying as Hispanic} 21 | #' \item{race_ethnicity}{factor for self-identified race/ethnicity; same information as \code{black} and \code{hisp} in character form.} 22 | #' \item{married}{indicator for being married} 23 | #' \item{re74}{real income in 1974} 24 | #' \item{re75}{real income in 1975} 25 | #' \item{u74}{unemployment in 1974; actually an indicator for zero income in 1974} 26 | #' \item{u75}{unemployment in 1975; actually an indicator for zero income in 1975} 27 | #' \item{educ}{Years of education of the individual} 28 | #' \item{nodegr}{indicator for no high school degree; actually an indicator for years of education less than 12} 29 | #' } 30 | #' @references Dehejia, Rajeev H., and Sadek Wahba. "Causal effects in non-experimental studies: Reevaluating the evaluation of training programs." Journal of the American statistical Association 94.448 (1999): 1053-1062. 31 | #' 32 | #'LaLonde, Robert J. "Evaluating the econometric evaluations of training programs with experimental data." 
The American economic review (1986): 604-620. 33 | "lalonde" 34 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r, include = FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>", 11 | fig.path = "man/figures/README-", 12 | out.width = "100%" 13 | ) 14 | ``` 15 | 16 | # KBAL 17 | 18 | 19 | [![CRAN status](https://www.r-pkg.org/badges/version/KBAL)](https://CRAN.R-project.org/package=KBAL) 20 | 21 | 22 | Kernel Balancing project 23 | 24 | Package for implementation of kernel balancing. 25 | 26 | Investigators often use matching and weighting techniques to adjust for differences between treated and control groups on observed characteristics. These methods, however, require the user to choose what functions of the covariates must be balanced, and do not in general ensure equal multivariate densities of the treated and control groups. Treatment effect estimates made after adjustment by these methods are thus sensitive to specification choices, and are biased if any function of the covariates influencing the outcome has a different mean for the treated and control groups. This paper introduces kernel balancing, a method designed to reduce this bias without relying on specification searches or balance tests. The weights derived by kernel balancing (1) achieve approximate mean balance on a large class of smooth functions of the covariates, and (2) approximately equalize the multivariate densities of the treated and controls, when estimated a certain way. 
In two empirical applications, kernel balancing (1) accurately recovers the experimentally estimated effect of a job training program, and (2) finds that after controlling for observed differences, democracies are less likely to win counterinsurgencies, consistent with theoretical expectation but in contrast to previous findings. 27 | 28 | See www.chadhazlett.com for details and paper. 29 | 30 | ## Installation 31 | 32 | You can install the released version of KBAL from [CRAN](https://CRAN.R-project.org) with: 33 | 34 | ``` r 35 | install.packages("kbal") 36 | ``` 37 | 38 | And the development version from [GitHub](https://github.com/) with: 39 | 40 | ``` r 41 | # install.packages("devtools") 42 | devtools::install_github("chadhazlett/KBAL") 43 | ``` 44 | 45 | ## OS-X speed optimization 46 | Mac users can see a significant speed up (5-10x) by using Apple's native Accelerate BLAS library (vecLib). 47 | Upgrade to the latest version of R and RStudio, then follow the steps outlined [here](https://cran.r-project.org/bin/macosx/RMacOSX-FAQ.html#Which-BLAS-is-used-and-how-can-it-be-changed_003f): 48 | ```bash 49 | cd /Library/Frameworks/R.framework/Resources/lib 50 | 51 | # for vecLib use 52 | ln -sf libRblas.vecLib.dylib libRblas.dylib 53 | ``` 54 | 55 | Details and examples forthcoming. 56 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # KBAL 5 | 6 | 7 | 8 | [![CRAN 9 | status](https://www.r-pkg.org/badges/version/KBAL)](https://CRAN.R-project.org/package=KBAL) 10 | 11 | 12 | Kernel Balancing project 13 | 14 | Package for implementation of kernel balancing. 15 | 16 | Investigators often use matching and weighting techniques to adjust for 17 | differences between treated and control groups on observed 18 | characteristics.
These methods, however, require the user to choose what 19 | functions of the covariates must be balanced, and do not in general 20 | ensure equal multivariate densities of the treated and control groups. 21 | Treatment effect estimates made after adjustment by these methods are 22 | thus sensitive to specification choices, and are biased if any function 23 | of the covariates influencing the outcome has a different mean for the 24 | treated and control groups. This paper introduces kernel balancing, a 25 | method designed to reduce this bias without relying on specification 26 | searches or balance tests. The weights derived by kernel balancing (1) 27 | achieve approximate mean balance on a large class of smooth functions of 28 | the covariates, and (2) approximately equalize the multivariate 29 | densities of the treated and controls, when estimated a certain way. In 30 | two empirical applications, kernel balancing (1) accurately recovers the 31 | experimentally estimated effect of a job training program, and (2) 32 | finds that after controlling for observed differences, democracies are 33 | less likely to win counterinsurgencies, consistent with theoretical 34 | expectation but in contrast to previous findings. 35 | 36 | See www.chadhazlett.com for details and paper. 37 | 38 | ## Installation 39 | 40 | We will update the CRAN version shortly. In the meantime, we recommend 41 | installing the development version, from [GitHub](https://github.com/) with: 42 | 43 | ``` r 44 | # install.packages("devtools") 45 | devtools::install_github("chadhazlett/KBAL") 46 | ``` 47 | 48 | ## OS-X speed optimization 49 | 50 | Mac users can see a significant speed up (5-10x) by using Apple’s native 51 | Accelerate BLAS library (vecLib).
Upgrade to the latest version of R and 52 | RStudio, then follow the steps outlined 53 | [here](https://cran.r-project.org/bin/macosx/RMacOSX-FAQ.html#Which-BLAS-is-used-and-how-can-it-be-changed_003f): 54 | 55 | ``` bash 56 | cd /Library/Frameworks/R.framework/Resources/lib 57 | 58 | # for vecLib use 59 | ln -sf libRblas.vecLib.dylib libRblas.dylib 60 | ``` 61 | 62 | Details and examples forthcoming. 63 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | # DO NOT CHANGE the "init" and "install" sections below 2 | 3 | # Download script file from GitHub 4 | init: 5 | ps: | 6 | $ErrorActionPreference = "Stop" 7 | Invoke-WebRequest http://raw.github.com/krlmlr/r-appveyor/master/scripts/appveyor-tool.ps1 -OutFile "..\appveyor-tool.ps1" 8 | Import-Module '..\appveyor-tool.ps1' 9 | 10 | install: 11 | ps: Bootstrap 12 | 13 | cache: 14 | - C:\RLibrary 15 | 16 | environment: 17 | NOT_CRAN: true 18 | # env vars that may need to be set, at least temporarily, from time to time 19 | # see https://github.com/krlmlr/r-appveyor#readme for details 20 | # USE_RTOOLS: true 21 | # R_REMOTES_STANDALONE: true 22 | 23 | # Adapt as necessary starting from here 24 | 25 | build_script: 26 | - travis-tool.sh install_deps 27 | 28 | test_script: 29 | - travis-tool.sh run_tests 30 | 31 | on_failure: 32 | - 7z a failure.zip *.Rcheck\* 33 | - appveyor PushArtifact failure.zip 34 | 35 | artifacts: 36 | - path: '*.Rcheck\**\*.log' 37 | name: Logs 38 | 39 | - path: '*.Rcheck\**\*.out' 40 | name: Logs 41 | 42 | - path: '*.Rcheck\**\*.fail' 43 | name: Logs 44 | 45 | - path: '*.Rcheck\**\*.Rout' 46 | name: Logs 47 | 48 | - path: '\*_*.tar.gz' 49 | name: Bits 50 | 51 | - path: '\*_*.zip' 52 | name: Bits 53 | -------------------------------------------------------------------------------- /data/lalonde.rda: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/chadhazlett/KBAL/6785483f6c2a30ed4f7530ad50e0907912f19c6e/data/lalonde.rda -------------------------------------------------------------------------------- /kpop.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 4 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --no-multiarch --with-keep.source 18 | -------------------------------------------------------------------------------- /man/.Rapp.history: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chadhazlett/KBAL/6785483f6c2a30ed4f7530ad50e0907912f19c6e/man/.Rapp.history -------------------------------------------------------------------------------- /man/b_maxvarK.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions.R 3 | \name{b_maxvarK} 4 | \alias{b_maxvarK} 5 | \title{Maximum Variance of Gaussian Kernel Matrix} 6 | \usage{ 7 | b_maxvarK(data, useasbases, cat_data = TRUE, maxsearch_b = 2000) 8 | } 9 | \arguments{ 10 | \item{data}{a matrix of data where rows are all units and columns are covariates. Where all covariates are categorical, this matrix should be one-hot encoded (refer to \code{\link{one_hot}} to produce) with \code{cat_data} argument true.} 11 | 12 | \item{useasbases}{binary vector specifying what observations are to be used in forming bases (columns) of the kernel matrix. 
Suggested default is: if the number of observations is under 4000, use all observations; when the number of observations is over 4000, use the sampled (control) units only.} 13 | 14 | \item{cat_data}{logical for whether kernel contains only categorical data or not. Default is \code{TRUE}.} 15 | 16 | \item{maxsearch_b}{the maximum value of \eqn{b}, the denominator of the Gaussian, searched during maximization. Default is \code{2000}.} 17 | } 18 | \value{ 19 | \item{b_maxvar}{numeric \eqn{b} value, the denominator of the Gaussian, which produces the maximum variance of \eqn{K} kernel matrix} 20 | \item{var_K}{numeric maximum variance of \eqn{K} kernel matrix found with \eqn{b} as \code{b_maxvar}} 21 | } 22 | \description{ 23 | Searches for the argmax of the variance of the Kernel matrix. 24 | } 25 | \examples{ 26 | \donttest{ 27 | #lalonde with only categorical data 28 | set.seed(123) 29 | data("lalonde") 30 | # Select a random subset of 500 rows 31 | lalonde_sample <- sample(1:nrow(lalonde), 500, replace = FALSE) 32 | lalonde <- lalonde[lalonde_sample, ] 33 | 34 | cat_vars <- c("black","hisp","married","nodegr","u74","u75") 35 | #Convert to one-hot encoded data matrix: 36 | onehot_lalonde = one_hot(lalonde[, cat_vars]) 37 | colnames(onehot_lalonde) 38 | best_b <- b_maxvarK(data = onehot_lalonde, 39 | useasbases = 1-lalonde$nsw) 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /man/biasbound.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions.R 3 | \name{biasbound} 4 | \alias{biasbound} 5 | \title{Worst-Case Bias Bound due to Incomplete Balance} 6 | \usage{ 7 | biasbound(observed, target, svd.out, w, w.pop = NULL, hilbertnorm = 1) 8 | } 9 | \arguments{ 10 | \item{observed}{a numeric vector of length equal to the total number of units where sampled/control units take a value of 1 and 
population/treated units take a value of 0.} 11 | 12 | \item{target}{a numeric vector of length equal to the total number of units where population/treated units take a value of 1 and sample/control units take a value of 0.} 13 | 14 | \item{svd.out}{the list object output from \code{svd()} performed on the kernel matrix. Requires a list object with left singular vectors in \code{svd.out$u} and singular values in \code{svd.out$d}} 15 | 16 | \item{w}{numeric vector containing the weight for every corresponding unit. Note that these weights should sum to the total number of units, not to one. They are divided by the number of control or sample and treated or population units internally.} 17 | 18 | \item{w.pop}{an optional vector input to specify population weights. Must be of length equal to the total number of units (rows in \code{svd.out}) with all sampled units receiving a weight of 1. The sum of the weights for population units must be either 1 or the number of population units.} 19 | 20 | \item{hilbertnorm}{numeric value of the Hilbert norm. Default is \code{1}.} 21 | } 22 | \value{ 23 | \item{biasbound}{value of worst-case bias bound due to incomplete balance with inputted weights} 24 | } 25 | \description{ 26 | Calculate the upper bound on the bias induced by approximate balance with a given \code{hilbertnorm}. Approximate balance is conducted in \code{kbal()} and uses only the first \code{numdims} dimensions of the singular value decomposition of the kernel matrix to generate weights \code{w} which produce mean balance between control or sampled units and treated or population units. The following function calculates the worse-case bias induced by this approximate balancing with weights \code{w} and a given \code{hilbertnorm}. 
27 | } 28 | \examples{ 29 | \donttest{ 30 | #load and clean data a bit 31 | set.seed(123) 32 | data("lalonde") 33 | # Select a random subset of 500 rows 34 | lalonde_sample <- sample(1:nrow(lalonde), 500, replace = FALSE) 35 | lalonde <- lalonde[lalonde_sample, ] 36 | 37 | xvars=c("age","black","educ","hisp","married","re74","re75","nodegr","u74","u75") 38 | 39 | #need a kernel matrix to run SVD on and pass in so get that first with makeK 40 | #running makeK with the sampled units as the bases 41 | K = makeK(allx = lalonde[,xvars], useasbases = 1-lalonde$nsw) 42 | 43 | #svd on this kernel 44 | svd_pass = svd(K) 45 | #let's use the original weights of 1/number of sampled units, and 1/number of target units 46 | #this is the default if we pass in w as all 1's 47 | biasbound(observed=(1-lalonde$nsw), 48 | target=lalonde$nsw, 49 | svd.out = svd_pass, 50 | w = rep(1,nrow(lalonde)), hilbertnorm=1) 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /man/dimw.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions.R 3 | \name{dimw} 4 | \alias{dimw} 5 | \title{Difference in Means and Difference in Weighted Means} 6 | \usage{ 7 | dimw(X, w, target) 8 | } 9 | \arguments{ 10 | \item{X}{matrix of data where rows are observations and columns are covariates.} 11 | 12 | \item{w}{numeric vector of weights for each observation.} 13 | 14 | \item{target}{numeric vector of length equal to the total number of units where population/treated units take a value of 1 and sample/control units take a value of 0.} 15 | } 16 | \value{ 17 | \item{dim}{the simple, unweighted difference in means.} 18 | \item{dimw}{the weighted difference in means.} 19 | } 20 | \description{ 21 | Calculates the simple difference in means or weighted difference in means between the control or sample population and the treated or target 
population. 22 | } 23 | \examples{ 24 | \donttest{ 25 | #let's say we want to get the unweighted DIM and the weighted DIM using weights from the kbal 26 | #function with the lalonde data: 27 | #load and clean data a bit 28 | set.seed(123) 29 | data("lalonde") 30 | # Select a random subset of 500 rows 31 | lalonde_sample <- sample(1:nrow(lalonde), 500, replace = FALSE) 32 | lalonde <- lalonde[lalonde_sample, ] 33 | 34 | xvars=c("age","black","educ","hisp","married","re74","re75","nodegr","u74","u75") 35 | 36 | #get the kbal weights 37 | kbalout= kbal(allx=lalonde[,xvars], 38 | sampledinpop=FALSE, 39 | treatment=lalonde$nsw) 40 | #now use dimw to get the DIMs 41 | dimw(X = lalonde[,xvars], w = kbalout$w, target = lalonde$nsw)} 42 | } 43 | -------------------------------------------------------------------------------- /man/drop_multicollin.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions.R 3 | \name{drop_multicollin} 4 | \alias{drop_multicollin} 5 | \title{Drop Multicollinear Columns} 6 | \usage{ 7 | drop_multicollin(allx, printprogress = TRUE) 8 | } 9 | \arguments{ 10 | \item{allx}{a matrix of data to check for multicollinearity. All columns must be numeric.} 11 | 12 | \item{printprogress}{logical to indicate if progress should be printed out to the command line. Default is \code{TRUE}.} 13 | } 14 | \value{ 15 | A list containing: 16 | \item{allx_noMC}{resulting data matrix of full rank after multicollinear columns have been dropped.} 17 | \item{dropped_cols}{column names of the dropped columns.} 18 | } 19 | \description{ 20 | Drops multicollinear columns in order of highest correlation using the correlation matrix. 21 | This function uses the \code{cor} function from the \code{stats} package to calculate the correlations 22 | between columns. 
23 | } 24 | \examples{ 25 | \donttest{ 26 | # Create data with multicollinearity 27 | data <- data.frame(x = rnorm(100), 28 | y = sample.int(100, 100), 29 | z = runif(100, 3, 6)) 30 | test = data.frame(mc_1 = data$x, 31 | mc_2 = data$x * 2 + data$y - data$z) 32 | dat = cbind(test, data) 33 | # Run function 34 | mc_check = drop_multicollin(dat) 35 | mc_check$dropped_cols 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- /man/ebalance_custom.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ebalance.R 3 | \name{ebalance_custom} 4 | \alias{ebalance_custom} 5 | \title{Modified version of ebalance (originally from Jens Hainmueller)} 6 | \usage{ 7 | ebalance_custom( 8 | Treatment, 9 | X, 10 | base.weight = NULL, 11 | norm.constant = NULL, 12 | coefs = NULL, 13 | max.iterations = 200, 14 | constraint.tolerance = 0.001, 15 | print.level = 0 16 | ) 17 | } 18 | \arguments{ 19 | \item{Treatment}{a numeric vector of length equal to the total number of units where treated (population) units take a value of 1 and control (sampled) units take a value of 0.} 20 | 21 | \item{X}{matrix of data where rows are observations and columns are covariates.} 22 | 23 | \item{base.weight}{an optional numeric vector argument of length equal to the total number of control units to specify the base weight of each control unit within entropy balancing. 
Default is even weights (1) for all controls.} 24 | 25 | \item{norm.constant}{an optional numeric argument; users should leave unspecified in most cases.} 26 | 27 | \item{coefs}{an optional vector argument of length equal to one more than the number of covariates in \code{X}; users should leave unspecified in most cases.} 28 | 29 | \item{max.iterations}{numeric maximum number of iterations to use when searching for weights} 30 | 31 | \item{constraint.tolerance}{numeric tolerance level.} 32 | 33 | \item{print.level}{a numeric argument to specify the amount of information printed out. 0 is silent, 1 prints convergence status, 2 prints maximum deviance per iteration, 3 prints loss and step length.} 34 | } 35 | \value{ 36 | \item{target.margins}{Column sums of \code{X} among the treated units.} 37 | \item{co.xdata}{Covariate matrix for the controls only built from \code{X} with an additional appended column of ones.} 38 | \item{w}{weights found using ebalance. Note that treated units all receive flat weights of 1} 39 | \item{maxdiff}{absolute value of the largest component of the gradient in the last iteration.} 40 | \item{norm.constant}{norm constant used} 41 | \item{constraint.tolerance}{tolerance used to evaluate convergence} 42 | \item{max.iterations}{max iterations used} 43 | \item{base.weight}{base weights used} 44 | \item{print.level}{print level used} 45 | \item{converged}{Convergence status. If ebalance failed to find weights within the specified \code{constraint.tolerance} after \code{max.iterations} this is \code{FALSE}. Note that even if ebalance does not converge, the last iteration's weights \code{w} are returned.} 46 | } 47 | \description{ 48 | This is a custom version of the \code{ebal} (entropy balancing) package by Jens Hainmueller. Chooses weights on controls to make 49 | covariate means equal to those of treated. 
This version differs from \code{ebal} only in that it handles 50 | cases where there is only a single unit, which otherwise causes a problem in the original code. 51 | } 52 | -------------------------------------------------------------------------------- /man/figures/README-pressure-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chadhazlett/KBAL/6785483f6c2a30ed4f7530ad50e0907912f19c6e/man/figures/README-pressure-1.png -------------------------------------------------------------------------------- /man/getdist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions.R 3 | \name{getdist} 4 | \alias{getdist} 5 | \title{L1 Distance} 6 | \usage{ 7 | getdist( 8 | target, 9 | observed, 10 | K, 11 | w.pop = NULL, 12 | w = NULL, 13 | numdims = NULL, 14 | ebal.tol = 1e-06, 15 | ebal.maxit = 500, 16 | svd.U = NULL 17 | ) 18 | } 19 | \arguments{ 20 | \item{target}{a numeric vector of length equal to the total number of units where population/treated units take a value of 1 and sample/control units take a value of 0.} 21 | 22 | \item{observed}{a numeric vector of length equal to the total number of units where sampled/control units take a value of 1 and population/treated units take a value of 0.} 23 | 24 | \item{K}{the kernel matrix} 25 | 26 | \item{w.pop}{an optional vector input to specify population weights. Must be of length equal to the total number of units (rows in \code{svd.U}) with all sampled units receiving a weight of 1. The sum of the weights for population units must be either 1 or the number of population units.} 27 | 28 | \item{w}{a optional numeric vector of weights for every observation. 
Note that these weights should sum to the total number of units, where treated or population units have a weight of 1 and control or sample units have appropriate weights derived from kernel balancing with mean 1, is consistent with the output of \code{getw()}. If unspecified, these weights are found internally using \code{numdims} dimensions of the SVD of the kernel matrix \code{svd.U} with \code{ebalance_custom()}.} 29 | 30 | \item{numdims}{an optional numeric input specifying the number of columns of the singular value decomposition of the kernel matrix to use when finding weights when \code{w} is not specified.} 31 | 32 | \item{ebal.tol}{an optional numeric input specifying the tolerance level used by custom entropy balancing function \code{ebalance_custom()} in the case that \code{w} is not specified. Default is \code{1e-6}.} 33 | 34 | \item{ebal.maxit}{maximum number of iterations in optimization search used by \code{ebalance_custom} when \code{w} is not specified. Default is \code{500}.} 35 | 36 | \item{svd.U}{an optional matrix of left singular vectors from performing \code{svd()} on the kernel matrix in the case that \code{w} is unspecified. If unspecified when \code{w} also not specified, internally computes the svd of \code{K}.} 37 | } 38 | \value{ 39 | \item{L1}{a numeric giving the L1 distance, the absolute difference between \code{pX_D1} and \code{pX_D0w}} 40 | \item{w}{numeric vector of weights used} 41 | \item{pX_D1}{a numeric vector of length equal to the total number of observations where the nth entry is the sum of the kernel distances from the nth unit to every treated or population unit. 
If population units are specified, this sum is weighted by \code{w.pop} accordingly.} 42 | \item{pX_D0}{a numeric vector of length equal to the total number of observations where the nth entry is the sum of the kernel distances from the nth unit to every control or sampled unit.} 43 | \item{pX_D0w}{a numeric vector of length equal to the total number of observations where the nth entry is the weighted sum of the kernel distances from the nth unit to every control or sampled unit. The weights are given by entropy balancing and produce mean balance on \eqn{\phi(X)}, the expanded features of \eqn{X} using a given kernel \eqn{\phi(.)}, for the control or sample group and treated group or target population.} 44 | } 45 | \description{ 46 | Calculates the L1 distance between the treated or population units and the kernel balanced control or sampled units. 47 | } 48 | \examples{ 49 | \donttest{ 50 | #loading and cleaning lalonde data 51 | set.seed(123) 52 | data("lalonde") 53 | # Select a random subset of 500 rows 54 | lalonde_sample <- sample(1:nrow(lalonde), 500, replace = FALSE) 55 | lalonde <- lalonde[lalonde_sample, ] 56 | 57 | xvars=c("age","black","educ","hisp","married","re74","re75","nodegr","u74","u75") 58 | 59 | #need to first build gaussian kernel matrix 60 | K_pass <- makeK(allx = lalonde[,xvars]) 61 | #also need the SVD of this matrix 62 | svd_pass <- svd(K_pass) 63 | 64 | #running without passing weights in directly, using numdims=33 65 | l1_lalonde <- getdist(target = lalonde$nsw, 66 | observed = 1-lalonde$nsw, 67 | K = K_pass, 68 | svd.U = svd_pass$u, 69 | numdims = 33) 70 | 71 | #alternatively, we can get the weights ourselves and pass them in directly 72 | #using the first 33 dims of svd_pass$u to match the above 73 | w_opt <- getw(target= lalonde$nsw, 74 | observed = 1-lalonde$nsw, 75 | svd.U = svd_pass$u[,1:33])$w 76 | l1_lalonde2 <- getdist(target = lalonde$nsw, 77 | observed = 1-lalonde$nsw, 78 | K = K_pass, 79 | w = w_opt) 80 | } 81 | } 82 | 
-------------------------------------------------------------------------------- /man/getw.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions.R 3 | \name{getw} 4 | \alias{getw} 5 | \title{Find Weights using Entropy Balancing.} 6 | \usage{ 7 | getw(target, observed, svd.U, ebal.tol = 1e-06, ebal.maxit = 500) 8 | } 9 | \arguments{ 10 | \item{target}{a numeric vector of length equal to the total number of units where population/treated units take a value of 1 and sample/control units take a value of 0.} 11 | 12 | \item{observed}{a numeric vector of length equal to the total number of units where sampled/control units take a value of 1 and population/treated units take a value of 0.} 13 | 14 | \item{svd.U}{a matrix of left singular vectors from performing \code{svd()} on the kernel matrix.} 15 | 16 | \item{ebal.tol}{tolerance level used by custom entropy balancing function \code{ebalance_custom}. Default is \code{1e-6}.} 17 | 18 | \item{ebal.maxit}{maximum number of iterations in optimization search used by \code{ebalance_custom}. Default is \code{500}.} 19 | } 20 | \value{ 21 | A list containing: 22 | \item{w}{A numeric vector of weights.} 23 | \item{converged}{boolean indicating if \code{ebalance_custom} converged} 24 | \item{ebal_error}{returns error message if \code{ebalance_custom} encounters an error} 25 | } 26 | \description{ 27 | Uses entropy balancing to find and return the weights that produce mean balance on \eqn{\phi(X_i)}, the expanded features of \eqn{X_i} using a given kernel \eqn{\phi(.)}, for the control or sample group and treated group or target population. 
28 | } 29 | \examples{ 30 | \donttest{ 31 | #load and clean data 32 | set.seed(123) 33 | data("lalonde") 34 | # Select a random subset of 500 rows 35 | lalonde_sample <- sample(1:nrow(lalonde), 500, replace = FALSE) 36 | lalonde <- lalonde[lalonde_sample, ] 37 | 38 | xvars=c("age","black","educ","hisp","married","re74","re75","nodegr","u74","u75") 39 | 40 | #need a kernel matrix to run SVD on then find weights with; so get that first with makeK. 41 | #running makeK with the sampled units as the bases 42 | K = makeK(allx = lalonde[,xvars], useasbases = 1-lalonde$nsw) 43 | 44 | #SVD on this kernel and get matrix with left singular values 45 | U = svd(K)$u 46 | #Use the first 10 dimensions of U. 47 | U2=U[,1:10] 48 | getw.out=getw(target=lalonde$nsw, 49 | observed=1-lalonde$nsw, 50 | svd.U=U2) 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /man/kbal.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions.R 3 | \name{kbal} 4 | \alias{kbal} 5 | \title{Kernel Balancing} 6 | \usage{ 7 | kbal( 8 | allx, 9 | useasbases = NULL, 10 | b = NULL, 11 | sampled = NULL, 12 | sampledinpop = NULL, 13 | treatment = NULL, 14 | population.w = NULL, 15 | K = NULL, 16 | K.svd = NULL, 17 | cat_data = FALSE, 18 | mixed_data = FALSE, 19 | cat_columns = NULL, 20 | cont_scale = NULL, 21 | scale_data = NULL, 22 | drop_MC = NULL, 23 | linkernel = FALSE, 24 | meanfirst = FALSE, 25 | mf_columns = NULL, 26 | constraint = NULL, 27 | scale_constraint = TRUE, 28 | numdims = NULL, 29 | minnumdims = NULL, 30 | maxnumdims = NULL, 31 | fullSVD = FALSE, 32 | incrementby = 1, 33 | ebal.maxit = 500, 34 | ebal.tol = 1e-06, 35 | ebal.convergence = NULL, 36 | maxsearch_b = 2000, 37 | early.stopping = TRUE, 38 | printprogress = TRUE 39 | ) 40 | } 41 | \arguments{ 42 | \item{allx}{a data matrix containing all observations where rows are 
units and columns are covariates. When using only continuous covariates (\code{cat_data = F} and \code{mixed_data = F}), all columns must be numeric. When using categorical data (either \code{cat_data = T} or \code{mixed_data = T}), categorical columns can be characters or numerics which will be treated as factors. Users should not one-hot encode categorical covariates as this transformation occurs internally.} 43 | 44 | \item{useasbases}{optional binary vector to specify what observations are to be used in forming bases (columns) of the kernel matrix to get balance on. If the number of observations is under 4000, the default is to use all observations. When the number of observations is over 4000, the default is to use the sampled (control) units only.} 45 | 46 | \item{b}{scaling factor in the calculation of Gaussian kernel distance equivalent to the entire denominator \eqn{2\sigma^2} of the exponent. Default is to search for the value which maximizes the variance of the kernel matrix.} 47 | 48 | \item{sampled}{a numeric vector of length equal to the total number of units where sampled units take a value of 1 and population units take a value of 0.} 49 | 50 | \item{sampledinpop}{a logical to be used in combination with input \code{sampled} that, when \code{TRUE}, indicates that sampled units should also be included in the target population when searching for optimal weights.} 51 | 52 | \item{treatment}{an alternative input to \code{sampled} and \code{sampledinpop} that is a numeric vector of length equal to the total number of units. Current version supports the ATT estimand. Accordingly, the treated units are the target population, and the control are equivalent to the sampled. Weights play the role of making the control groups (sampled) look like the target population (treated). When specified, \code{sampledinpop} is forced to be \code{FALSE}.} 53 | 54 | \item{population.w}{optional vector of population weights length equal to the number of population units.
Must sum to either 1 or the number of population units.} 55 | 56 | \item{K}{optional matrix input that takes a user-specified kernel matrix and performs SVD on it internally in the search for weights which minimize the bias bound.} 57 | 58 | \item{K.svd}{optional list input that takes a user-specified singular value decomposition of the kernel matrix. This list must include three objects \code{K.svd$u}, a matrix of left-singular vectors, \code{K.svd$v}, a matrix of right-singular vectors, and their corresponding singular values \code{K.svd$d}.} 59 | 60 | \item{cat_data}{logical argument that when true indicates \code{allx} contains only categorical data. When true, the internal construction of the kernel matrix uses a one-hot encoding of \code{allx} (multiplied by a factor of \eqn{\sqrt{0.5}} to compensate for double counting) and the value of \code{b} which maximizes the variance of this kernel matrix. When true, \code{mixed_data}, \code{scale_data}, \code{linkernel}, and \code{drop_MC} should be \code{FALSE}. Default is \code{FALSE}.} 61 | 62 | \item{mixed_data}{logical argument that when true indicates \code{allx} contains a combination of both continuous and categorical data. When true, the internal construction of the kernel matrix uses a one-hot encoding of the categorical variables in \code{allx} as specified by \code{cat_columns} (multiplied by a factor of \eqn{\sqrt{0.5}} to compensate for double counting) concatenated with the remaining continuous variables scaled to have default standard deviation of 1 or that specified in \code{cont_scale}. When both \code{cat_data} and \code{mixed_data} are \code{FALSE}, the kernel matrix assumes all continuous data, does not one-hot encode any part of \code{allx} but still uses the value of \code{b} which produces maximal variance in \code{K}.
Default is \code{FALSE}.} 63 | 64 | \item{cat_columns}{optional character argument that must be specified when \code{mixed_data} is \code{TRUE} and that indicates what columns of \code{allx} contain categorical variables.} 65 | 66 | \item{cont_scale}{optional numeric argument used when \code{mixed_data} is \code{TRUE} which specifies how to scale the standard deviation of continuous variables in \code{allx}. Can be either a single value or a vector with length equal to the number of continuous variables in \code{allx} (columns not specified in \code{cat_columns}) and ordered accordingly.} 67 | 68 | \item{scale_data}{logical when true scales the columns of \code{allx} (demeans and scales variance to 1) before building the kernel matrix internally. This is appropriate when \code{allx} contains only continuous variables with different scales, but is not recommended when \code{allx} contains any categorical data. Default is \code{TRUE} when both \code{cat_data} and \code{mixed_data} are \code{FALSE} and \code{FALSE} otherwise.} 69 | 70 | \item{drop_MC}{logical for whether or not to drop multicollinear columns in \code{allx} before building \code{K}. When either \code{cat_data} or \code{mixed_data} is \code{TRUE}, forced to be \code{FALSE}. Otherwise, with continuous data only, default is \code{TRUE}.} 71 | 72 | \item{linkernel}{logical if true, uses the linear kernel \eqn{K=XX'} which achieves balance on the first moments of \eqn{X} (mean balance). Note that for computational ease, the code employs \eqn{K=X} and adjusts singular values accordingly. Default is \code{FALSE}.} 73 | 74 | \item{meanfirst}{logical if true, internally searches for the optimal number of dimensions of the svd of \code{allx} to append to \code{K} as additional constraints.
This will produce mean balance on as many dimensions of \code{allx} as optimally feasible with specified ebalance convergence and a minimal bias bound on the remaining unbalanced columns of the left singular vectors of \code{K}. Note that any scaling specified on \code{allx} will also be applied in the meanfirst routine. Default is \code{FALSE}.} 75 | 76 | \item{mf_columns}{either character or numeric vector to specify what columns of \code{allx} to perform meanfirst with. If left unspecified, all columns will be used.} 77 | 78 | \item{constraint}{optional matrix argument of additional constraints which are appended to the front of the left singular vectors of \code{K}. When specified, the code conducts a constrained optimization requiring mean balance on the columns of this matrix throughout the search for the minimum bias bound over the dimensions of the left singular vectors of \code{K}.} 79 | 80 | \item{scale_constraint}{logical for whether constraints in \code{constraint} should be scaled before they are appended to the svd of \code{K}. Default is \code{TRUE}.} 81 | 82 | \item{numdims}{optional numeric argument specifying the number of dimensions of the left singular vectors of the kernel matrix to find balance bypassing the optimization search for the number of dimensions which minimize the biasbound.} 83 | 84 | \item{minnumdims}{numeric argument to specify the minimum number of the left singular vectors of the kernel matrix to seek balance on in the search for the number of dimensions which minimize the bias. Default minimum is 1.} 85 | 86 | \item{maxnumdims}{numeric argument to specify the maximum number of the left singular vectors of the kernel matrix to seek balance on in the search for the number of dimensions which minimize the bias. For a Gaussian kernel, the default is the minimum between 500 and the number of bases given by \code{useasbases}.
With a linear kernel, the default is the minimum between 500 and the number of columns in \code{allx}.} 87 | 88 | \item{fullSVD}{logical argument for whether the full SVD should be conducted internally. When \code{FALSE}, the code uses truncated svd methods from the \code{RSpectra} package in the interest of improving run time. When \code{FALSE}, the code computes only the SVD up to either 80 percent of the columns of \code{K} or \code{maxnumdims} singular vectors, whichever is larger. When the number of columns is less than 80 percent of the number of rows, defaults to full svd. Default is \code{FALSE}.} 89 | 90 | \item{incrementby}{numeric argument to specify the number of dimensions to increase by from \code{minnumdims} to \code{maxnumdims} in each iteration of the search for the number of dimensions which minimizes the bias. Default is 1.} 91 | 92 | \item{ebal.maxit}{maximum number of iterations used by \code{ebalance_custom()} in optimization in the search for weights \code{w}. Default is \code{500}.} 93 | 94 | \item{ebal.tol}{tolerance level used by \code{ebalance_custom()}. Default is \code{1e-6}.} 95 | 96 | \item{ebal.convergence}{logical to require ebalance convergence when selecting the optimal \code{numdims} dimensions of \code{K} that minimize the biasbound. When constraints are appended to the left singular vectors of \code{K} via \code{meanfirst=TRUE} or \code{constraints}, forced to be \code{TRUE} and otherwise \code{FALSE}.} 97 | 98 | \item{maxsearch_b}{optional argument to specify the maximum b in search for maximum variance of \code{K} in \code{b_maxvarK()}. Default is \code{2000}.} 99 | 100 | \item{early.stopping}{logical argument indicating whether bias balance optimization should stop twenty rounds after finding a minimum. Default is \code{TRUE}.} 101 | 102 | \item{printprogress}{logical argument to print updates throughout.
Default is \code{TRUE}.} 103 | } 104 | \value{ 105 | \item{w}{a vector of the weights found using entropy balancing on \code{numdims} dimensions of the SVD of the kernel matrix.} 106 | \item{biasbound_opt}{a numeric giving the minimal bias bound found using \code{numdims} as the number of dimensions of the SVD of the kernel matrix. When \code{numdims} is user-specified, the bias bound using this number of dimensions of the kernel matrix.} 107 | \item{biasbound_orig}{a numeric giving the bias bound found when all sampled (control) units have a weight equal to one over the number of sampled (control) units and all target units have a weight equal to one over the number of target units.} 108 | \item{biasbound_ratio}{a numeric giving the ratio of \code{biasbound_orig} to \code{biasbound_opt}. Can be informative when comparing the performance of different \code{b} values.} 109 | \item{dist_record}{a matrix recording the bias bound corresponding to balance on increasing dimensions of the SVD of the kernel matrix starting from \code{minnumdims} increasing by \code{incrementby} to \code{maxnumdims} or until the bias grows to be 1.25 times the minimal bias found.} 110 | \item{numdims}{a numeric giving the optimal number of dimensions of the SVD of the kernel matrix which minimizes the bias bound.} 111 | \item{L1_orig}{a numeric giving the L1 distance found when all sampled (control) units have a weight equal to one over the number of sampled (control) units and all target units have a weight equal to one over the number of target units.} 112 | \item{L1_opt}{a numeric giving the L1 distance at the minimum bias bound found using \code{numdims} as the number of dimensions of the SVD of the kernel matrix.
When \code{numdims} is user-specified, the L1 distance using this number of dimensions of the kernel matrix.} 113 | \item{K}{the kernel matrix} 114 | \item{onehot_dat}{when categorical data is specified, the resulting one-hot encoded categorical data used in the construction of \code{K}. When mixed data is specified, returns concatenated one-hot encoded categorical data and scaled continuous data used to construct \code{K}.} 115 | \item{linkernel}{logical for whether linear kernel was used} 116 | \item{svdK}{a list giving the SVD of the kernel matrix with left singular vectors \code{svdK$u}, right singular vectors \code{svdK$v}, and singular values \code{svdK$d}} 117 | \item{b}{numeric scaling factor used in the the calculation of gaussian kernel equivalent to the denominator \eqn{2\sigma^2} of the exponent.} 118 | \item{maxvar_K}{returns the resulting variance of the kernel matrix when the \code{b} determined internally as the argmax of the variance \code{K}} 119 | \item{bases}{numeric vector indicating what bases (rows in \code{allx}) were used to construct kernel matrix (columns of K)} 120 | \item{truncatedSVD.var}{when truncated SVD methods are used on symmetric kernel matrices, a numeric which gives the proportion of the total variance of \code{K} captured by the first \code{maxnumdims} singular values found by the truncated SVD. 
When the kernel matrix is non-symmetric, this is a worst case approximation of the percent variance explained, assuming the remaining unknown singular values are the same magnitude as the last calculated in the truncated SVD.} 121 | \item{dropped_covariates}{provides a vector of character column names for covariates dropped due to multicollinearity.} 122 | \item{meanfirst_dims}{when \code{meanfirst=TRUE} the optimal number of the singular vectors of \code{allx} selected and appended to the front of the left singular vectors of \code{K}} 123 | \item{meanfirst_cols}{when \code{meanfirst=TRUE} \code{meanfirst_dims} first left singular vectors of \code{allx} selected that are appended to the front of the left singular vectors of \code{K} and balanced on} 124 | \item{ebal_error}{when ebalance is unable to find convergent weights, the associated error message it reports} 125 | } 126 | \description{ 127 | Kernel balancing (\code{kbal}) is non-parametric weighting tool to make two groups have a similar distribution of covariates, not only in terms of means or marginal distributions but also on (i) general smooth functions of the covariates, including on (ii) a smoothing estimator of the joint distribution of the covariates. It was originally designed (Hazlett, 2017) to make control and treated groups look alike, as desired when estimating causal effects under conditional ignorability. This package also facilitates use of this approach for more general distribution-alignment tasks, such as making a sampled group have a similar distribution of covariates as a target population, as in survey reweighting. The examples below provide an introduction to both settings. 128 | 129 | To proceed in the causal effect setting, kbal assumes that the expectation of the non-treatment potential outcome conditional on the covariates falls in a large, flexible space of functions associated with a kernel. 
It then constructs linear bases for this function space and achieves approximate balance on these bases. The approximation is one that minimizes the worst-case bias that could persist due to remaining imbalances. 130 | 131 | The \code{kbal} function implements kernel balancing using a gaussian kernel to expand the features of \eqn{X_i} to infinite dimensions. It finds approximate mean balance for the control or sample group and treated group or target population in this expanded feature space by using the first \code{numdims} dimensions of the singular value decomposition of the gaussian kernel matrix. It employs entropy balancing to find the weights for each unit which produce this approximate balance. When \code{numdims} is not user-specified, it searches through increasing dimensions of the SVD of the kernel matrix to find the number of dimensions which produce weights that minimizes the worst-case bias bound with a given \code{hilbertnorm}. It then returns these optimal weights, along with the minimized bias, the kernel matrix, a record of the number of dimensions used and the corresponding bias, as well as an original bias using naive group size weights for comparison. Note that while kernel balancing goes far beyond simple mean balancing, it may not result in perfect mean balance. Users who wish to require mean balancing can specify \code{meanfirst = T} to require mean balance on as many dimensions of the data as optimally feasible. Alternatively, users can manually specify \code{constraint} to append additional vector constraints to the kernel matrix in the bias bound optimization, requiring mean balance on these columns. Note further that \code{kbal} supports three types of input data: fully categorical, fully continuous, or mixed. 
When data is only categorical, as is common with demographic variables for survey reweighting, users should use argument \code{cat_data = TRUE} and can input their data as factors, numeric, or characters and \code{kbal} will internally transform the data to a more appropriate one-hot encoding and search for the value of \code{b}, the denominator of the exponent in the Gaussian, which maximizes the variance of the kernel matrix. When data is fully continuous, users should use default settings (\code{cat_data = FALSE} and \code{mixed_data = FALSE}), which will scale all columns and again conduct an internal search for the value of \code{b} which maximizes the variance of \code{K}. Note that with continuous data, this search may take considerably more computational time than the categorical case. When data is a mix of continuous and categorical data, users should use argument \code{mixed_data = TRUE}, specify by name what columns are categorical with \code{cat_columns}, and also set the scaling of the continuous variables with \code{cont_scale}. This will result in a one-hot encoding of categorical columns concatenated with the continuous columns scaled in accordance with \code{cont_scale} and again an internal search for the value of \code{b} which maximizes the variance in the kernel matrix. Again note that compared to the categorical case, this search will take more computational time. 132 | } 133 | \examples{ 134 | #---------------------------------------------------------------- 135 | # Example 1: Reweight a control group to a treated to estimate ATT. 136 | # Benchmark using Lalonde et al. 137 | #---------------------------------------------------------------- 138 | #1. Rerun Lalonde example with settings as in Hazlett, C (2017).
Statistica Sinica paper: 139 | set.seed(123) 140 | data("lalonde") 141 | # Select a random subset of 500 rows 142 | lalonde_sample <- sample(1:nrow(lalonde), 500, replace = FALSE) 143 | lalonde <- lalonde[lalonde_sample, ] 144 | 145 | xvars=c("age","black","educ","hisp","married","re74","re75","nodegr","u74","u75") 146 | \donttest{ 147 | 148 | kbalout.full= kbal(allx=lalonde[,xvars], 149 | b=length(xvars), 150 | treatment=lalonde$nsw, 151 | fullSVD = TRUE) 152 | summary(lm(re78~nsw,w=kbalout.full$w, data = lalonde)) 153 | } 154 | 155 | #2. Lalonde with categorical data only: u74, u75, nodegree, race, married 156 | cat_vars=c("race_ethnicity","married","nodegr","u74","u75") 157 | \donttest{ 158 | kbalout_cat_only = kbal(allx=lalonde[,cat_vars], 159 | cat_data = TRUE, 160 | treatment=lalonde$nsw, 161 | fullSVD = TRUE) 162 | kbalout_cat_only$b 163 | summary(lm(re78~nsw,w=kbalout_cat_only$w, data = lalonde)) 164 | } 165 | 166 | #3. Lalonde with mixed categorical and continuous data 167 | cat_vars=c("race_ethnicity", "married") 168 | all_vars= c("age","educ","re74","re75","married", "race_ethnicity") 169 | \donttest{ 170 | kbalout_mixed = kbal(allx=lalonde[,all_vars], 171 | mixed_data = TRUE, 172 | cat_columns = cat_vars, 173 | treatment=lalonde$nsw, 174 | fullSVD = TRUE) 175 | kbalout_mixed$b 176 | summary(lm(re78~nsw,w=kbalout_mixed$w, data = lalonde)) 177 | } 178 | 179 | #---------------------------------------------------------------- 180 | # Example 1B: Reweight a control group to a treated to esimate ATT. 181 | # Benchmark using Lalonde et al. -- but just mean balancing now 182 | # via "linkernel". 183 | #---------------------------------------------------------------- 184 | 185 | # Rerun Lalonde example with settings as in Hazlett, C (2017). 
Statistica Sinica paper: 186 | kbalout.lin= kbal(allx=lalonde[,xvars], 187 | b=length(xvars), 188 | treatment=lalonde$nsw, 189 | linkernel=TRUE, 190 | fullSVD=TRUE) 191 | 192 | # Check balance with and without these weights: 193 | dimw(X=lalonde[,xvars], w=kbalout.lin$w, target=lalonde$nsw) 194 | 195 | summary(lm(re78~nsw,w=kbalout.lin$w, data = lalonde)) 196 | 197 | #---------------------------------------------------------------- 198 | # Example 2: Reweight a sample to a target population. 199 | #---------------------------------------------------------------- 200 | # Suppose a population consists of four groups in equal shares: 201 | # white republican, non-white republican, white non-republicans, 202 | # and non-white non-republicans. A given policy happens to be supported 203 | # by all white republicans, and nobody else. Thus the mean level of 204 | # support in the population should be 25\%. 205 | # 206 | # Further, the sample is surveyed in such a way that was careful 207 | # to quota on party and race, obtaining 50\% republican and 50\% white. 208 | # However, among republicans three-quarters are white and among non-republicans, 209 | # three quarters are non-white. This biases the average level of support 210 | # despite having a sample that matches the population on its marginal distributions. 211 | # We'd like to reweight the sample so it resembles the population not 212 | # just on the margins, but in the joint distribution of characteristics. 
213 | 214 | pop <- data.frame( 215 | republican = c(rep(0,400), rep(1,400)), 216 | white = c(rep(1,200), rep(0,200), rep(1,200), rep(0,200)), 217 | support = c(rep(1,200), rep(0,600))) 218 | 219 | mean(pop$support) # Target value 220 | 221 | # Survey sample: correct margins/means, but wrong joint distribution 222 | samp <- data.frame( republican = c(rep(1, 40), rep(0,40)), 223 | white = c(rep(1,30), rep(0,10), rep(1,10), rep(0,30)), 224 | support = c(rep(1,30), rep(0,50))) 225 | 226 | mean(samp$support) # Appears that support is 37.5\% instead of 25\%. 227 | 228 | # Mean Balancing ----------------------------------------- 229 | # Sample is already mean-balanced to the population on each 230 | # characteristic. However for illustrative purposes, use ebal() 231 | dat <- rbind(pop,samp) 232 | 233 | # Indicate which units are sampled (1) and which are population units(0) 234 | sampled <- c(rep(0,800), rep(1,80)) 235 | 236 | # Run ebal (treatment = population units = 1-sampled) 237 | ebal_out <- ebalance_custom(Treatment = 1-sampled, 238 | X=dat[,1:2], 239 | constraint.tolerance=1e-6, 240 | print.level=-1) 241 | 242 | # We can see everything gets even weights, since already mean balanced. 243 | length(unique(ebal_out$w)) 244 | 245 | # And we end up with the same estimate we started with 246 | weighted.mean(samp[,3], w = ebal_out$w) 247 | 248 | # We see that, because the margins are correct, all weights are equal 249 | unique(cbind(samp, e_bal_weight = ebal_out$w)) 250 | 251 | # Kernel balancing for weighting to a population (i.e. 
kpop) ------- 252 | kbalout = kbal(allx=dat[,1:2], 253 | useasbases=rep(1,nrow(dat)), 254 | sampled = sampled, 255 | b = 1, 256 | sampledinpop = FALSE) 257 | 258 | # The weights now vary: 259 | plot(kbalout$w[sampled ==1], pch=16) 260 | 261 | # And produce correct estimate: 262 | weighted.mean(samp$support, w = kbalout$w[sampled==1]) 263 | 264 | # kbal correctly downweights white republicans and non-white non-republicans 265 | # and upweights the non-white republicans and white non-republicans 266 | unique(round(cbind(samp[,-3], k_bal_weight = kbalout$w[sampled==1]),6)) 267 | } 268 | \references{ 269 | Hazlett, C. (2017), "Kernel Balancing: A flexible non-parametric weighting procedure for estimating causal effects." Forthcoming in Statistica Sinica. https://doi.org/10.5705/ss.202017.0555 270 | } 271 | -------------------------------------------------------------------------------- /man/lalonde.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lalondedata.R 3 | \docType{data} 4 | \name{lalonde} 5 | \alias{lalonde} 6 | \title{Data from National Supported Work program and Panel Study in Income Dynamics} 7 | \format{ 8 | A data frame with 2675 rows and 14 columns. 
9 | \describe{ 10 | \item{nsw}{treatment indicator: participation in the National Supported Work program.} 11 | \item{re78}{real earnings in 1978 (outcome)} 12 | \item{u78}{unemployed in 1978; actually an indicator for zero income in 1978} 13 | \item{age}{age in years} 14 | \item{black}{indicator for identifying as black} 15 | \item{hisp}{indicator for identifying as Hispanic} 16 | \item{race_ethnicity}{factor for self-identified race/ethnicity; same information as \code{black} and \code{hisp} in character form.} 17 | \item{married}{indicator for being married} 18 | \item{re74}{real income in 1974} 19 | \item{re75}{real income in 1975} 20 | \item{u74}{unemployment in 1974; actually an indicator for zero income in 1974} 21 | \item{u75}{unemployment in 1975; actually an indicator for zero income in 1975} 22 | \item{educ}{Years of education of the individual} 23 | \item{nodegr}{indicator for no high school degree; actually an indicator for years of education less than 12} 24 | } 25 | } 26 | \usage{ 27 | lalonde 28 | } 29 | \description{ 30 | Dehejia and Wahba (1999) sample of data from Lalonde (1986). 31 | This data set includes 185 treated units from the National 32 | Supported Work (NSW) program, paired with 2490 control units 33 | drawn from the Panel Study of Income Dynamics (PSID-1). 34 | 35 | The treatment variable of interest is \code{nsw}, which indicates that an individual 36 | was in the job training program. The main outcome of interest is 37 | real earnings in 1978 (\code{re78}). The remaining variables are characteristics 38 | of the individuals, to be used as controls. 39 | } 40 | \references{ 41 | Dehejia, Rajeev H., and Sadek Wahba. "Causal effects in non-experimental studies: Reevaluating the evaluation of training programs." Journal of the American statistical Association 94.448 (1999): 1053-1062. 42 | 43 | LaLonde, Robert J. "Evaluating the econometric evaluations of training programs with experimental data." 
The American economic review (1986): 604-620. 44 | } 45 | \keyword{datasets} 46 | -------------------------------------------------------------------------------- /man/makeK.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions.R 3 | \name{makeK} 4 | \alias{makeK} 5 | \title{Build the Gaussian Kernel Matrix} 6 | \usage{ 7 | makeK(allx, useasbases = NULL, b = NULL, linkernel = FALSE, scale = TRUE) 8 | } 9 | \arguments{ 10 | \item{allx}{a data matrix containing all observations where rows are units and columns are covariates.} 11 | 12 | \item{useasbases}{a binary vector with length equal to the number of observations (rows in \code{allx}) to specify which bases to use when constructing the kernel matrix (columns of \eqn{K}). If not specified, the default is to use all observations.} 13 | 14 | \item{b}{Scaling factor in the calculation of Gaussian kernel distance equivalent to the entire denominator \eqn{2\sigma^2} of the exponent. Default is twice the number of covariates or columns in \code{allx}.} 15 | 16 | \item{linkernel}{a logical value indicating whether to use a linear kernel, \eqn{K=XX'}, which in practice employs \eqn{K=X}. Default is \code{FALSE}.} 17 | 18 | \item{scale}{a logical value indicating whether to standardize \code{allx} (demeaned with sd=1) before constructing the kernel matrix. Default is \code{TRUE}.} 19 | } 20 | \value{ 21 | \item{K}{The kernel matrix} 22 | } 23 | \description{ 24 | Builds the Gaussian kernel matrix using Rcpp. 
25 | } 26 | \examples{ 27 | #load and clean data a bit 28 | \donttest{ 29 | set.seed(123) 30 | data("lalonde") 31 | # Select a random subset of 500 rows 32 | lalonde_sample <- sample(1:nrow(lalonde), 500, replace = FALSE) 33 | lalonde <- lalonde[lalonde_sample, ] 34 | 35 | xvars <- c("age","black","educ","hisp","married","re74","re75","nodegr","u74","u75") 36 | 37 | #note that lalonde$nsw is the treatment vector, so the observed is 1-lalonde$nsw 38 | #running makeK with the sampled/control units as the bases given 39 | #the large size of the data 40 | K <- makeK(allx = lalonde[,xvars], useasbases = 1-lalonde$nsw) 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /man/one_hot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions.R 3 | \name{one_hot} 4 | \alias{one_hot} 5 | \title{One-Hot Encoding for Categorical Data} 6 | \usage{ 7 | one_hot(data) 8 | } 9 | \arguments{ 10 | \item{data}{a dataframe or matrix where columns are string or factor type covariates} 11 | } 12 | \value{ 13 | \item{onehot_data}{a matrix of combined sample and population data with rows corresponding to units and columns one-hot encoded categorical covariates} 14 | } 15 | \description{ 16 | Converts raw categorical string/factor data matrix into numeric one-hot encoded data matrix. Intended to help prepare data to be passed to \code{kbal} argument \code{allx} when categorical data is used. 17 | } 18 | \examples{ 19 | \donttest{ 20 | #Ex 1. Make up some categorical demographic data 21 | dat = data.frame(pid = c(rep("Rep", 20), 22 | rep("Dem", 20), 23 | rep("Ind", 20)), 24 | gender = c(rep("female", 35), 25 | rep("male", 25))) 26 | #Convert to one-hot encoded data matrix: 27 | onehot_dat = one_hot(dat) 28 | } 29 | #Ex 2. 
lalonde data 30 | set.seed(123) 31 | data("lalonde") 32 | # Select a random subset of 500 rows 33 | lalonde_sample <- sample(1:nrow(lalonde), 500, replace = FALSE) 34 | lalonde <- lalonde[lalonde_sample, ] 35 | 36 | cat_vars=c("black","hisp","married","nodegr","u74","u75") 37 | onehot_lalonde = one_hot(lalonde[, cat_vars]) 38 | } 39 | -------------------------------------------------------------------------------- /src/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chadhazlett/KBAL/6785483f6c2a30ed4f7530ad50e0907912f19c6e/src/.DS_Store -------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include 5 | 6 | using namespace Rcpp; 7 | 8 | #ifdef RCPP_USE_GLOBAL_ROSTREAM 9 | Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); 10 | Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); 11 | #endif 12 | 13 | // kernel_parallel 14 | Rcpp::NumericMatrix kernel_parallel(Rcpp::NumericMatrix X, const double b); 15 | RcppExport SEXP _kbal_kernel_parallel(SEXP XSEXP, SEXP bSEXP) { 16 | BEGIN_RCPP 17 | Rcpp::RObject rcpp_result_gen; 18 | Rcpp::RNGScope rcpp_rngScope_gen; 19 | Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type X(XSEXP); 20 | Rcpp::traits::input_parameter< const double >::type b(bSEXP); 21 | rcpp_result_gen = Rcpp::wrap(kernel_parallel(X, b)); 22 | return rcpp_result_gen; 23 | END_RCPP 24 | } 25 | // kernel_parallel_2 26 | Rcpp::NumericMatrix kernel_parallel_2(Rcpp::NumericMatrix X, Rcpp::NumericMatrix Y, const double b); 27 | RcppExport SEXP _kbal_kernel_parallel_2(SEXP XSEXP, SEXP YSEXP, SEXP bSEXP) { 28 | BEGIN_RCPP 29 | Rcpp::RObject rcpp_result_gen; 30 | Rcpp::RNGScope rcpp_rngScope_gen; 31 | 
Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type X(XSEXP); 32 | Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type Y(YSEXP); 33 | Rcpp::traits::input_parameter< const double >::type b(bSEXP); 34 | rcpp_result_gen = Rcpp::wrap(kernel_parallel_2(X, Y, b)); 35 | return rcpp_result_gen; 36 | END_RCPP 37 | } 38 | // kernel_parallel_old 39 | Rcpp::NumericMatrix kernel_parallel_old(Rcpp::NumericMatrix X, Rcpp::NumericMatrix Y, const double b); 40 | RcppExport SEXP _kbal_kernel_parallel_old(SEXP XSEXP, SEXP YSEXP, SEXP bSEXP) { 41 | BEGIN_RCPP 42 | Rcpp::RObject rcpp_result_gen; 43 | Rcpp::RNGScope rcpp_rngScope_gen; 44 | Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type X(XSEXP); 45 | Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type Y(YSEXP); 46 | Rcpp::traits::input_parameter< const double >::type b(bSEXP); 47 | rcpp_result_gen = Rcpp::wrap(kernel_parallel_old(X, Y, b)); 48 | return rcpp_result_gen; 49 | END_RCPP 50 | } 51 | 52 | static const R_CallMethodDef CallEntries[] = { 53 | {"_kbal_kernel_parallel", (DL_FUNC) &_kbal_kernel_parallel, 2}, 54 | {"_kbal_kernel_parallel_2", (DL_FUNC) &_kbal_kernel_parallel_2, 3}, 55 | {"_kbal_kernel_parallel_old", (DL_FUNC) &_kbal_kernel_parallel_old, 3}, 56 | {NULL, NULL, 0} 57 | }; 58 | 59 | RcppExport void R_init_kbal(DllInfo *dll) { 60 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 61 | R_useDynamicSymbols(dll, FALSE); 62 | } 63 | -------------------------------------------------------------------------------- /src/RcppExports.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chadhazlett/KBAL/6785483f6c2a30ed4f7530ad50e0907912f19c6e/src/RcppExports.o -------------------------------------------------------------------------------- /src/kbal.dll: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/chadhazlett/KBAL/6785483f6c2a30ed4f7530ad50e0907912f19c6e/src/kbal.dll -------------------------------------------------------------------------------- /src/kernel_parallel.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | // [[Rcpp::depends(RcppParallel)]] 5 | using namespace Rcpp; 6 | using namespace RcppParallel; 7 | 8 | 9 | // define both_non_NA(a, b) 10 | inline bool both_non_NA(double a, double b) { 11 | return (!ISNAN(a) && !ISNAN(b)); 12 | } 13 | 14 | struct Kernel : public Worker 15 | { 16 | // source matrix 17 | const RMatrix X; 18 | const double b; 19 | 20 | // destination matrix 21 | RMatrix out; 22 | 23 | // initialize with source and destination 24 | Kernel(const Rcpp::NumericMatrix X, 25 | const double b, 26 | Rcpp::NumericMatrix out) 27 | : X(X), b(b), out(out) {} 28 | 29 | // calculate the IBS kernel of the range of elements requested 30 | void operator()(std::size_t begin, std::size_t end) { 31 | int p = X.ncol(); 32 | for (std::size_t i = begin; i < end; i++) { 33 | for (std::size_t j = 0; j < i; j++) { 34 | double dist = 0; 35 | for (int k = 0; k < p; k++) { 36 | double xi = X(i, k), xj = X(j, k); 37 | if (both_non_NA(xi, xj)) { 38 | double diff = xi - xj; 39 | dist += diff*diff; 40 | } 41 | } 42 | out(i, j) = exp(-dist / b); 43 | out(j, i) = out(i, j); 44 | } 45 | out(i, i) = 1; 46 | } 47 | } 48 | }; 49 | 50 | // [[Rcpp::export]] 51 | Rcpp::NumericMatrix kernel_parallel(Rcpp::NumericMatrix X, 52 | const double b) { 53 | 54 | // allocate the output matrix 55 | Rcpp::NumericMatrix out(X.nrow(), X.nrow()); 56 | 57 | // IBSKernel functor (pass input and output matrixes) 58 | Kernel kernel(X, b, out); 59 | 60 | // call parallelFor to do the work 61 | parallelFor(0, X.nrow(), kernel); 62 | 63 | // return the output matrix 64 | return out; 65 | } 66 | 67 | struct Kernel_2 : public Worker 68 | { 69 | // source matrix 70 | const RMatrix X; 71 
| const RMatrix Y; 72 | const double b; 73 | 74 | // destination matrix 75 | RMatrix out; 76 | 77 | // initialize with source and destination 78 | Kernel_2(const Rcpp::NumericMatrix X, 79 | const Rcpp::NumericMatrix Y, 80 | const double b, 81 | Rcpp::NumericMatrix out) 82 | : X(X), Y(Y), b(b), out(out) {} 83 | 84 | // calculate the IBS kernel of the range of elements requested 85 | void operator()(std::size_t begin, std::size_t end) { 86 | int p = X.ncol(); 87 | int m = Y.nrow(); 88 | for (std::size_t i = begin; i < end; i++) { 89 | for (std::size_t j = 0; j < m; j++) { 90 | double dist = 0; 91 | for (int k = 0; k < p; k++) { 92 | double xi = X(i, k), yj = Y(j, k); 93 | double diff = xi - yj; 94 | dist += diff*diff; 95 | } 96 | out(i, j) = exp(-dist / b); 97 | } 98 | } 99 | } 100 | }; 101 | 102 | // [[Rcpp::export]] 103 | Rcpp::NumericMatrix kernel_parallel_2(Rcpp::NumericMatrix X, 104 | Rcpp::NumericMatrix Y, 105 | const double b) { 106 | 107 | // allocate the output matrix 108 | Rcpp::NumericMatrix out(X.nrow(), Y.nrow()); 109 | 110 | // IBSKernel functor (pass input and output matrixes) 111 | Kernel_2 kernel_2(X, Y, b, out); 112 | 113 | // call parallelFor to do the work 114 | parallelFor(0, X.nrow(), kernel_2); 115 | 116 | // return the output matrix 117 | return out; 118 | } 119 | 120 | 121 | 122 | struct Kernel_old : public Worker 123 | { 124 | // source matrix 125 | const RMatrix X; 126 | const RMatrix Y; 127 | const double b; 128 | 129 | // destination matrix 130 | RMatrix out; 131 | 132 | // initialize with source and destination 133 | Kernel_old(const Rcpp::NumericMatrix X, 134 | const Rcpp::NumericMatrix Y, 135 | const double b, 136 | Rcpp::NumericMatrix out) 137 | : X(X), Y(Y), b(b), out(out) {} 138 | 139 | // calculate the IBS kernel of the range of elements requested 140 | 141 | void operator()(std::size_t begin, std::size_t end) { 142 | int p = X.ncol(); 143 | int m = Y.nrow(); 144 | for (std::size_t i = begin; i < end; i++) { 145 | for (std::size_t 
j = 0; j < m; j++) { 146 | double dist = 0; 147 | for (int k = 0; k < p; k++) { 148 | double xi = X(i, k), yj = Y(j, k); 149 | if (both_non_NA(xi, yj)) { 150 | double diff = xi - yj; 151 | dist += diff*diff; 152 | } 153 | } 154 | out(i, j) = exp(-dist / b); 155 | } 156 | } 157 | } 158 | }; 159 | 160 | 161 | // [[Rcpp::export]] 162 | Rcpp::NumericMatrix kernel_parallel_old(Rcpp::NumericMatrix X, 163 | Rcpp::NumericMatrix Y, 164 | const double b) { 165 | 166 | // allocate the output matrix 167 | Rcpp::NumericMatrix out(X.nrow(), Y.nrow()); 168 | 169 | // IBSKernel functor (pass input and output matrixes) 170 | Kernel_old kernel_old(X, Y, b, out); 171 | 172 | // call parallelFor to do the work 173 | parallelFor(0, X.nrow(), kernel_old); 174 | 175 | // return the output matrix 176 | return out; 177 | }; 178 | 179 | -------------------------------------------------------------------------------- /src/kernel_parallel.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chadhazlett/KBAL/6785483f6c2a30ed4f7530ad50e0907912f19c6e/src/kernel_parallel.o -------------------------------------------------------------------------------- /tests/testthat/test_b_maxvarK.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(kbal) 3 | 4 | test_that("b_maxvarK works correctly with valid input", { 5 | n <- 20 6 | data <- matrix(rnorm(n*5), ncol = 5) 7 | useasbases <- sample(c(0, 1), n, replace = TRUE) 8 | 9 | result <- b_maxvarK(data, useasbases) 10 | 11 | expect_true(is.list(result)) 12 | expect_true("b_maxvar" %in% names(result)) 13 | expect_true("var_K" %in% names(result)) 14 | }) 15 | 16 | test_that("b_maxvarK handles non-matrix data input", { 17 | expect_error(b_maxvarK(data = list('a', 2, 3), useasbases = c(1, 0, 1)), "`data` should be able to be converted into a numeric matrix.") 18 | }) 19 | 20 | test_that("b_maxvarK handles invalid useasbases input", { 
21 | n <- 20 22 | data <- matrix(rnorm(n*5), ncol = 5) 23 | expect_error(b_maxvarK(data = data, useasbases = c(1, 0)), "`useasbases` must be a binary vector with the same length as the number of rows in `data`.") 24 | }) 25 | 26 | test_that("b_maxvarK handles invalid cat_data input", { 27 | n <- 20 28 | data <- matrix(rnorm(n*5), ncol = 5) 29 | useasbases <- sample(c(0, 1), n, replace = TRUE) 30 | expect_error(b_maxvarK(data = data, useasbases = useasbases, cat_data = "yes"), "`cat_data` must be a logical value.") 31 | }) 32 | 33 | test_that("b_maxvarK handles invalid maxsearch_b input", { 34 | n <- 20 35 | data <- matrix(rnorm(n*5), ncol = 5) 36 | useasbases <- sample(c(0, 1), n, replace = TRUE) 37 | expect_error(b_maxvarK(data = data, useasbases = useasbases, maxsearch_b = "high"), "`maxsearch_b` must be a single numeric value.") 38 | }) 39 | 40 | 41 | -------------------------------------------------------------------------------- /tests/testthat/test_biasbound.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(kbal) 3 | 4 | # Create example data 5 | set.seed(123) 6 | data <- matrix(rnorm(100), ncol = 5) 7 | K <- makeK(allx = data) 8 | svd.out <- svd(K) 9 | target <- sample(c(0, 1), 20, replace = TRUE) 10 | observed <- 1 - target 11 | weights <- runif(20, 0, 1) 12 | w.pop <- rep(1, 20) 13 | 14 | # Basic functionality tests 15 | test_that("biasbound works correctly with valid input", { 16 | result <- biasbound(observed = observed, target = target, svd.out = svd.out, w = weights, hilbertnorm = 1) 17 | 18 | expect_true(is.numeric(result)) 19 | expect_equal(length(result), 1) 20 | }) 21 | 22 | # Error handling tests 23 | test_that("biasbound handles invalid observed input", { 24 | invalid_observed <- sample(c(-1, 0, 2), 20, replace = TRUE) 25 | expect_error(biasbound(observed = invalid_observed, target = target, svd.out = svd.out, w = weights, hilbertnorm = 1), 26 | regexp = "`observed` must be a 
binary vector", 27 | fixed = TRUE 28 | ) 29 | }) 30 | 31 | test_that("biasbound handles invalid target input", { 32 | invalid_target <- sample(c(-1, 0, 2), 20, replace = TRUE) 33 | expect_error(biasbound(observed = observed, target = invalid_target, svd.out = svd.out, w = weights, hilbertnorm = 1), 34 | regexp = "`target` must be a binary vector", 35 | fixed = TRUE 36 | ) 37 | }) 38 | 39 | test_that("biasbound handles invalid svd.out input", { 40 | expect_error(biasbound(observed = observed, target = target, svd.out = list(u = svd.out$u), w = weights, hilbertnorm = 1), 41 | regexp = "`svd.out` must be a list containing `u`", fixed = TRUE) 42 | }) 43 | 44 | test_that("biasbound handles invalid w input", { 45 | invalid_weights <- c(weights, -0.5) # Negative weight 46 | expect_error(biasbound(observed = observed, target = target, svd.out = svd.out, w = invalid_weights, hilbertnorm = 1), 47 | regexp = "`w` must be a non-negative numeric vector", fixed = TRUE) 48 | }) 49 | 50 | test_that("biasbound handles invalid w.pop input", { 51 | invalid_w_pop <- c(w.pop, -0.5) # Negative value in w.pop 52 | expect_error(biasbound(observed = observed, target = target, svd.out = svd.out, w = weights, w.pop = invalid_w_pop, hilbertnorm = 1), 53 | regexp = "`w.pop` must be a non-negative numeric vector", fixed = TRUE) 54 | }) 55 | 56 | test_that("biasbound handles hilbertnorm correctly", { 57 | expect_error(biasbound(observed = observed, target = target, svd.out = svd.out, w = weights, hilbertnorm = -1), 58 | "`hilbertnorm` must be a positive numeric value.") 59 | }) 60 | 61 | test_that("biasbound handles negative eigenvalues in svd.out$d", { 62 | svd.out_with_neg <- svd.out 63 | svd.out_with_neg$d[1] <- -abs(svd.out_with_neg$d[1]) # Introduce a negative eigenvalue 64 | expect_error(biasbound(observed = observed, target = target, svd.out = svd.out_with_neg, w = weights, hilbertnorm = 1), 65 | "Encountered negative eigenvalues. 
Cannot compute biasbound.") 66 | }) 67 | 68 | # Additional tests for edge cases 69 | test_that("biasbound handles population weights normalization", { 70 | # Case where w.pop sums to 1, should normalize correctly 71 | w.pop_custom <- rep(0.05, 20) 72 | result <- biasbound(observed = observed, target = target, svd.out = svd.out, w = weights, w.pop = w.pop_custom, hilbertnorm = 1) 73 | expect_true(is.numeric(result)) 74 | }) 75 | 76 | test_that("biasbound handles population weights not summing to 1 or number of treated units", { 77 | # w.pop does not sum to either 1 or the number of treated units 78 | invalid_w_pop <- c(rep(0.1, 10), rep(0.2, 10)) 79 | expect_error(biasbound(observed = observed, target = target, svd.out = svd.out, w = weights, w.pop = invalid_w_pop, hilbertnorm = 1), 80 | regexp = "must sum to either 1", fixed = TRUE) 81 | }) 82 | -------------------------------------------------------------------------------- /tests/testthat/test_dimw.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(kbal) 3 | 4 | # Create example data 5 | set.seed(123) 6 | data <- matrix(rnorm(100), ncol = 5) 7 | weights <- runif(20, 0, 1) 8 | target <- sample(c(0, 1), 20, replace = TRUE) 9 | 10 | # Basic functionality tests 11 | test_that("dimw works correctly with valid input", { 12 | result <- dimw(X = data, w = weights, target = target) 13 | 14 | expect_true(is.list(result)) 15 | expect_true("dim" %in% names(result)) 16 | expect_true("dimw" %in% names(result)) 17 | expect_true(is.numeric(result$dim)) 18 | expect_true(is.numeric(result$dimw)) 19 | expect_equal(length(result$dim), ncol(data)) 20 | expect_equal(length(result$dimw), ncol(data)) 21 | }) 22 | 23 | # Error handling tests 24 | test_that("dimw handles non-matrix X input", { 25 | expect_error(dimw(X = list('a', 2, 3), w = weights, target = target), "`X` should be able to be converted into a numeric matrix.") 26 | }) 27 | 28 | test_that("dimw handles 
invalid length of w", { 29 | invalid_w <- rep(1, 19) 30 | expect_error(dimw(X = data, w = invalid_w, target = target), "`w` must be a non-negative numeric vector with the same length as the number of rows in `X`.") 31 | }) 32 | 33 | test_that("dimw handles non-numeric w input", { 34 | invalid_w <- c(rep("a", 20)) 35 | expect_error(dimw(X = data, w = invalid_w, target = target), "`w` must be a non-negative numeric vector with the same length as the number of rows in `X`.") 36 | }) 37 | 38 | test_that("dimw handles negative values in w", { 39 | invalid_w <- c(rep(-1, 20)) 40 | expect_error(dimw(X = data, w = invalid_w, target = target), "`w` must be a non-negative numeric vector with the same length as the number of rows in `X`.") 41 | }) 42 | 43 | test_that("dimw handles invalid target input", { 44 | invalid_target <- sample(c(-1, 0, 2), 20, replace = TRUE) 45 | expect_error(dimw(X = data, w = weights, target = invalid_target), "`target` must be a binary vector containing only 0 and 1 with the same length as the number of rows in `X`.") 46 | }) 47 | 48 | test_that("dimw handles mismatched lengths of target and X", { 49 | mismatched_target <- sample(c(0, 1), 19, replace = TRUE) 50 | expect_error(dimw(X = data, w = weights, target = mismatched_target), "`target` must be a binary vector containing only 0 and 1 with the same length as the number of rows in `X`.") 51 | }) 52 | 53 | test_that("dimw handles non-numeric target input", { 54 | invalid_target <- c(rep("a", 20)) 55 | expect_error(dimw(X = data, w = weights, target = invalid_target), "`target` must be a binary vector containing only 0 and 1 with the same length as the number of rows in `X`.") 56 | }) 57 | -------------------------------------------------------------------------------- /tests/testthat/test_drop_multicollin.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(kbal) 3 | 4 | # Basic functionality tests 5 | test_that("drop_multicollin 
works correctly with valid input", { 6 | set.seed(123) 7 | data <- data.frame(x = rnorm(100), 8 | y = sample.int(100, 100), 9 | z = runif(100, 3, 6)) 10 | test <- data.frame(mc_1 = data$x, 11 | mc_2 = data$x * 2 + data$y - data$z) 12 | dat <- cbind(test, data) 13 | 14 | result <- drop_multicollin(dat) 15 | 16 | expect_true(is.list(result)) 17 | expect_true("allx_noMC" %in% names(result)) 18 | expect_true("dropped_cols" %in% names(result)) 19 | expect_true(is.data.frame(result$allx_noMC) || is.matrix(result$allx_noMC)) 20 | expect_true(all(sapply(result$allx_noMC, is.numeric))) 21 | expect_true(qr(result$allx_noMC)$rank == ncol(result$allx_noMC)) 22 | }) 23 | 24 | # Error handling tests 25 | test_that("drop_multicollin handles non-matrix or non-data frame input", { 26 | expect_error(drop_multicollin(list(a = 'a', b = 2)), "`allx` should be able to be converted into a numeric matrix.") 27 | }) 28 | 29 | test_that("drop_multicollin handles non-numeric columns", { 30 | data <- data.frame(x = rnorm(100), y = (sample(c("a", "b"), 100, replace = TRUE))) 31 | expect_error(drop_multicollin(data), "`allx` should be able to be converted into a numeric matrix.") 32 | }) 33 | 34 | test_that("drop_multicollin handles already full-rank matrices", { 35 | data <- data.frame(x = rnorm(100), y = rnorm(100)) 36 | result <- drop_multicollin(data, printprogress = FALSE) 37 | 38 | expect_equal(result$dropped_cols, NULL) 39 | expect_true(is.data.frame(result$allx_noMC) || is.matrix(result$allx_noMC)) 40 | expect_true(qr(result$allx_noMC)$rank == ncol(result$allx_noMC)) 41 | }) 42 | -------------------------------------------------------------------------------- /tests/testthat/test_getdist.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(kbal) 3 | 4 | # Create example data 5 | set.seed(123) 6 | data <- matrix(rnorm(100), ncol = 5) 7 | K <- makeK(allx = data) 8 | svd.U <- svd(K)$u 9 | target <- sample(c(0, 1), 20, replace 
= TRUE) 10 | observed <- 1 - target 11 | w.pop <- rep(1, 20) 12 | weights <- runif(20, 0, 1) 13 | 14 | # Basic functionality tests 15 | test_that("getdist works correctly with valid input", { 16 | result <- getdist(target = target, observed = observed, K = K, w = weights, svd.U = svd.U) 17 | 18 | expect_true(is.list(result)) 19 | expect_true("L1" %in% names(result)) 20 | expect_true("w" %in% names(result)) 21 | expect_true("pX_D1" %in% names(result)) 22 | expect_true("pX_D0" %in% names(result)) 23 | expect_true("pX_D0w" %in% names(result)) 24 | expect_true(is.numeric(result$L1)) 25 | expect_true(is.numeric(result$w)) 26 | expect_true(length(result$w) == nrow(K)) 27 | }) 28 | 29 | # Error handling tests 30 | test_that("getdist handles invalid target input", { 31 | invalid_target <- sample(c(-1, 0, 2), 20, replace = TRUE) 32 | expect_error(getdist(target = invalid_target, observed = observed, K = K, w = weights, svd.U = svd.U), "`target` must be a binary vector containing only 0 and 1 with the same length as the number of rows in `K`.") 33 | }) 34 | 35 | test_that("getdist handles invalid observed input", { 36 | invalid_observed <- sample(c(-1, 0, 2), 20, replace = TRUE) 37 | expect_error(getdist(target = target, observed = invalid_observed, K = K, w = weights, svd.U = svd.U), "`observed` must be a binary vector containing only 0 and 1 with the same length as the number of rows in `K`.") 38 | }) 39 | 40 | test_that("getdist handles non-matrix K input", { 41 | expect_error(getdist(target = target, observed = observed, K = list('a', 2, 3), w = weights, svd.U = svd.U), "`K` should be able to be converted into a numeric matrix.") 42 | }) 43 | 44 | test_that("getdist handles invalid w.pop input", { 45 | invalid_w_pop <- rep(-1, 20) # Negative values in w.pop 46 | expect_error(getdist(target = target, observed = observed, K = K, w.pop = invalid_w_pop, svd.U = svd.U), "`w.pop` must be a non-negative numeric vector with the same length as the number of rows in `K`.") 47 | }) 
48 | 49 | test_that("getdist handles invalid w input", { 50 | invalid_w <- rep(-1, 20) # Negative values in w 51 | expect_error(getdist(target = target, observed = observed, K = K, w = invalid_w, svd.U = svd.U), "`w` must be a non-negative numeric vector with the same length as the number of rows in `K`.") 52 | }) 53 | 54 | test_that("getdist handles invalid numdims input", { 55 | expect_error(getdist(target = target, observed = observed, K = K, numdims = "ten", svd.U = svd.U), "`numdims` must be a positive integer.") 56 | expect_error(getdist(target = target, observed = observed, K = K, numdims = 1.5, svd.U = svd.U), "`numdims` must be a positive integer.") 57 | }) 58 | 59 | test_that("getdist handles invalid ebal.tol input", { 60 | expect_error(getdist(target = target, observed = observed, K = K, ebal.tol = -1, svd.U = svd.U), "`ebal.tol` must be a positive numeric value.") 61 | }) 62 | 63 | test_that("getdist handles invalid ebal.maxit input", { 64 | expect_error(getdist(target = target, observed = observed, K = K, ebal.maxit = "many", svd.U = svd.U), "`ebal.maxit` must be a positive integer.") 65 | expect_error(getdist(target = target, observed = observed, K = K, ebal.maxit = 500.5, svd.U = svd.U), "`ebal.maxit` must be a positive integer.") 66 | }) 67 | 68 | test_that("getdist handles invalid svd.U input", { 69 | expect_error(getdist(target = target, observed = observed, K = K, svd.U = list(1, 2, 3)), "`svd.U` must be a matrix.") 70 | }) 71 | -------------------------------------------------------------------------------- /tests/testthat/test_getw.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(kbal) 3 | 4 | # Create example data 5 | set.seed(123) 6 | data <- matrix(rnorm(1000), ncol = 5) 7 | K <- makeK(allx = data) 8 | svd.U <- svd(K)$u[,1:10] 9 | target <- sample(c(0, 1), 200, replace = TRUE) 10 | observed <- 1 - target 11 | 12 | # Basic functionality tests 13 | test_that("getw works 
correctly with valid input", { 14 | result <- getw(target = target, observed = observed, svd.U = svd.U) 15 | 16 | expect_true(is.list(result)) 17 | expect_true("w" %in% names(result)) 18 | expect_true("converged" %in% names(result)) 19 | expect_true("ebal_error" %in% names(result)) 20 | expect_true(is.numeric(result$w)) 21 | expect_true(length(result$w) == nrow(svd.U)) 22 | expect_true(is.logical(result$converged)) 23 | }) 24 | 25 | # Error handling tests 26 | test_that("getw handles invalid target input", { 27 | invalid_target <- sample(c(-1, 0, 2), 200, replace = TRUE) 28 | expect_error(getw(target = invalid_target, observed = observed, svd.U = svd.U), "`target` must be a binary vector containing only 0 and 1 with the same length as the number of rows in `svd.U`.") 29 | }) 30 | 31 | test_that("getw handles invalid observed input", { 32 | invalid_observed <- sample(c(-1, 0, 2), 200, replace = TRUE) 33 | expect_error(getw(target = target, observed = invalid_observed, svd.U = svd.U), "`observed` must be a binary vector containing only 0 and 1 with the same length as the number of rows in `svd.U`.") 34 | }) 35 | 36 | test_that("getw handles non-matrix svd.U input", { 37 | expect_error(getw(target = target, observed = observed, svd.U = list(1, 2, 3)), "`svd.U` must be a matrix.") 38 | }) 39 | 40 | test_that("getw handles invalid ebal.tol input", { 41 | expect_error(getw(target = target, observed = observed, svd.U = svd.U, ebal.tol = "small"), "`ebal.tol` must be a positive numeric value.") 42 | expect_error(getw(target = target, observed = observed, svd.U = svd.U, ebal.tol = -1), "`ebal.tol` must be a positive numeric value.") 43 | }) 44 | 45 | test_that("getw handles invalid ebal.maxit input", { 46 | expect_error(getw(target = target, observed = observed, svd.U = svd.U, ebal.maxit = "many"), "`ebal.maxit` must be a positive integer.") 47 | expect_error(getw(target = target, observed = observed, svd.U = svd.U, ebal.maxit = 500.5), "`ebal.maxit` must be a positive
integer.") 48 | expect_error(getw(target = target, observed = observed, svd.U = svd.U, ebal.maxit = -1), "`ebal.maxit` must be a positive integer.") 49 | }) 50 | -------------------------------------------------------------------------------- /tests/testthat/test_kbal.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(kbal) 3 | 4 | # Sample data for testing 5 | set.seed(123) 6 | sample_data <- matrix(rnorm(100), ncol = 5) 7 | sampled <- sample(c(0, 1), 20, replace = TRUE) 8 | treatment <- sample(c(0, 1), 20, replace = TRUE) 9 | 10 | test_that("kbal works correctly with valid continuous input", { 11 | result <- suppressWarnings(kbal(allx = sample_data, sampled = sampled, scale_data = TRUE)) 12 | 13 | expect_type(result, "list") 14 | expect_true(!is.null(result$w)) 15 | expect_true(!is.null(result$biasbound_opt)) 16 | }) 17 | 18 | test_that("kbal works correctly with categorical data", { 19 | cat_data <- data.frame(category = sample(c("A", "B", "C"), 20, replace = TRUE)) 20 | result <- suppressWarnings(kbal(allx = cat_data, sampled = sampled, cat_data = TRUE)) 21 | 22 | expect_type(result, "list") 23 | expect_true(!is.null(result$w)) 24 | expect_true(!is.null(result$biasbound_opt)) 25 | }) 26 | 27 | test_that("kbal works correctly with mixed data", { 28 | mixed_data <- data.frame( 29 | continuous = rnorm(20), 30 | category = sample(c("A", "B", "C"), 20, replace = TRUE) 31 | ) 32 | result <- suppressWarnings(kbal(allx = mixed_data, sampled = sampled, mixed_data = TRUE, cat_columns = 2)) 33 | 34 | expect_type(result, "list") 35 | expect_true(!is.null(result$w)) 36 | expect_true(!is.null(result$biasbound_opt)) 37 | }) 38 | 39 | test_that("kbal handles NA values in `allx` input", { 40 | na_data <- sample_data 41 | na_data[1, 1] <- NA 42 | 43 | expect_error(kbal(allx = na_data), 44 | regexp = "`allx` should be able to be converted into a numeric matrix.") 45 | }) 46 | 47 | test_that("kbal handles NA values 
in `allx` input", { 48 | na_data <- sample_data 49 | na_data[1, 1] <- NA 50 | 51 | expect_error(suppressWarnings(kbal(allx = na_data)), 52 | regexp = "`allx` should be able to be converted into a numeric matrix.") 53 | }) 54 | 55 | test_that("kbal handles invalid `sampled` input", { 56 | invalid_sampled <- sample(c(-1, 0, 2), 20, replace = TRUE) 57 | 58 | expect_error(suppressWarnings(kbal(allx = sample_data, sampled = invalid_sampled)), 59 | regexp = "\"sampled\" contains non-binary elements") 60 | }) 61 | 62 | test_that("kbal handles incompatible `sampled` dimensions", { 63 | incompatible_sampled <- sample(c(0, 1), 15, replace = TRUE) 64 | 65 | expect_error(kbal(allx = sample_data, sampled = incompatible_sampled), 66 | regexp = "Dimensions of \"sampled\" do not match data \"allx\"") 67 | }) 68 | 69 | test_that("kbal handles invalid `treatment` input", { 70 | invalid_treatment <- sample(c(-1, 0, 2), 20, replace = TRUE) 71 | 72 | expect_error(kbal(allx = sample_data, treatment = invalid_treatment), 73 | regexp = "\"treated\" contains non-binary elements") 74 | }) 75 | 76 | test_that("kbal handles incompatible `treatment` dimensions", { 77 | incompatible_treatment <- sample(c(0, 1), 15, replace = TRUE) 78 | 79 | expect_error(kbal(allx = sample_data, treatment = incompatible_treatment), 80 | regexp = "Dimensions of \"treatment\" do not match data \"allx\"") 81 | }) 82 | 83 | test_that("kbal handles specifying both `sampled` and `treatment` inputs", { 84 | expect_error(suppressWarnings(kbal(allx = sample_data, sampled = sampled, treatment = treatment)), 85 | regexp = "\"sampled\" and \"treatment\" arguments cannot be specified simultaneously") 86 | }) 87 | 88 | test_that("kbal handles large kernel dimensions correctly", { 89 | large_data <- matrix(rnorm(5000), ncol = 250) 90 | result <- suppressWarnings(kbal(allx = large_data, sampled = sampled)) 91 | 92 | expect_type(result, "list") 93 | expect_true(!is.null(result$w)) 94 | }) 95 | 96 | test_that("kbal handles 
invalid `useasbases` input", { 97 | invalid_useasbases <- sample(c(-1, 0, 2), 20, replace = TRUE) 98 | 99 | expect_error(suppressWarnings(kbal(allx = sample_data, sampled = sampled, useasbases = invalid_useasbases)), 100 | regexp = "\"useasbases\" contains non-binary elements") 101 | }) 102 | 103 | test_that("kbal handles zero variance in continuous data", { 104 | zero_var_data <- matrix(rep(1, 100), ncol = 5) 105 | 106 | expect_error(suppressWarnings(kbal(allx = zero_var_data, sampled = sampled)), 107 | regexp = "One or more column in \"allx\" has zero variance") 108 | }) 109 | 110 | test_that("kbal handles constraint input correctly", { 111 | constraint_data <- matrix(rnorm(20), ncol = 1) 112 | result <- suppressWarnings(kbal(allx = sample_data, sampled = sampled, constraint = constraint_data)) 113 | 114 | expect_type(result, "list") 115 | expect_true(!is.null(result$w)) 116 | }) 117 | 118 | test_that("kbal handles population weights correctly", { 119 | population_weights <- rep(0.5, sum(sampled)) 120 | 121 | expect_error(suppressWarnings(kbal(allx = sample_data, sampled = sampled, population.w = population_weights)), 122 | regexp = "\"population.w\" must have the same length as the number of population/treated units") 123 | }) 124 | -------------------------------------------------------------------------------- /tests/testthat/test_makeK.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(kbal) 3 | 4 | # Basic functionality tests 5 | test_that("makeK works correctly with valid input", { 6 | data <- matrix(rnorm(100), ncol = 5) 7 | useasbases <- sample(c(0, 1), 20, replace = TRUE) 8 | 9 | result <- makeK(allx = data, useasbases = useasbases) 10 | 11 | expect_true(is.matrix(result)) 12 | expect_equal(ncol(result), sum(useasbases)) 13 | expect_equal(nrow(result), nrow(data)) 14 | }) 15 | 16 | test_that("makeK produces expected results with linear kernel", { 17 | data <- matrix(rnorm(100), ncol = 5) 
18 | useasbases <- sample(c(0, 1), 20, replace = TRUE) 19 | 20 | result <- makeK(allx = data, useasbases = useasbases, linkernel = TRUE, scale = FALSE) 21 | 22 | expect_true(is.matrix(result)) 23 | expect_equal(result, data) 24 | }) 25 | 26 | test_that("makeK handles non-matrix allx input", { 27 | expect_error(makeK(allx = list('a', 2, 3), useasbases = c(1, 0, 1)), "`allx` should be able to be converted into a numeric matrix.") 28 | }) 29 | 30 | test_that("makeK handles invalid useasbases input", { 31 | data <- matrix(rnorm(100), ncol = 5) 32 | expect_error(makeK(allx = data, useasbases = c(1, 0)), "`useasbases` must be a binary vector with the same length as the number of rows in `allx`.") 33 | }) 34 | 35 | test_that("makeK handles invalid b input", { 36 | data <- matrix(rnorm(100), ncol = 5) 37 | useasbases <- sample(c(0, 1), 20, replace = TRUE) 38 | expect_error(makeK(allx = data, useasbases = useasbases, b = "high"), "`b` must be a single numeric value.") 39 | }) 40 | 41 | test_that("makeK handles invalid linkernel input", { 42 | data <- matrix(rnorm(100), ncol = 5) 43 | useasbases <- sample(c(0, 1), 20, replace = TRUE) 44 | expect_error(makeK(allx = data, useasbases = useasbases, linkernel = "yes"), "`linkernel` must be a logical value.") 45 | }) 46 | 47 | test_that("makeK handles invalid scale input", { 48 | data <- matrix(rnorm(100), ncol = 5) 49 | useasbases <- sample(c(0, 1), 20, replace = TRUE) 50 | expect_error(makeK(allx = data, useasbases = useasbases, scale = "yes"), "`scale` must be a logical value.") 51 | }) 52 | 53 | test_that("makeK handles zero useasbases input", { 54 | data <- matrix(rnorm(100), ncol = 5) 55 | useasbases <- rep(0, nrow(data)) # No bases selected 56 | 57 | expect_error(makeK(allx = data, useasbases = useasbases), "`useasbases` must have at least one element set to 1.") 58 | }) 59 | -------------------------------------------------------------------------------- /tests/testthat/test_one_hot.R: 
-------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(kbal) 3 | 4 | # Basic functionality tests 5 | test_that("one_hot works correctly with valid input", { 6 | data <- data.frame(pid = c(rep("Rep", 3), rep("Dem", 3), rep("Ind", 3)), 7 | gender = c("female", "male", "female", "female", "male", "female", "male", "female", "male")) 8 | 9 | result <- one_hot(data) 10 | 11 | expect_true(is.matrix(result)) 12 | expect_equal(nrow(result), nrow(data)) 13 | expect_true(all(colnames(result) %in% c("pidRep", "pidDem", "pidInd", "genderfemale", "gendermale"))) 14 | }) 15 | 16 | # Error handling tests 17 | test_that("one_hot handles non-data frame or non-matrix input", { 18 | expect_error(one_hot(list(a = 1, b = 2)), "`data` must be a data frame or matrix.") 19 | }) 20 | -------------------------------------------------------------------------------- /tests/testthat/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(kbal) 3 | 4 | test_check("kbal") 5 | -------------------------------------------------------------------------------- /tools/cran-comments.md: -------------------------------------------------------------------------------- 1 | ## CRAN Submission Comments for kbal 2 | 3 | ### First Submission 4 | This is the initial submission of the kbal package to CRAN. 5 | 6 | ### R CMD check results 7 | devtools::check(): 8 | There were no ERRORs, WARNINGs, or NOTEs. 9 | devtools::check_win_devel(): 10 | NOTE: Maintainer field differs from that derived from Authors. The maintainer has been set correctly in DESCRIPTION. 11 | 12 | ### Downstream dependencies 13 | There are currently no known downstream dependencies for this package, as this is the first submission to CRAN. 14 | 15 | ### Additional Comments 16 | - All documentation has been reviewed and is consistent with the code.
17 | - The package has been tested thoroughly on multiple platforms. 18 | - Dependencies are up-to-date and compatible with R version 3.5.0 and higher. 19 | 20 | Thank you for considering the kbal package for inclusion on CRAN. -------------------------------------------------------------------------------- /tools/example.R: -------------------------------------------------------------------------------- 1 | rm(list=ls()) 2 | library(kbal) 3 | #Run Lalonde example as in paper: 4 | data(lalonde) 5 | lalonde$nodegr=as.numeric(lalonde$educ<=11) 6 | xvars=c("age","black","educ","hisp","married","re74","re75","nodegr","u74","u75") 7 | attach(lalonde) 8 | 9 | #Raw diff-in-means: way off, -$15205 10 | mean(re78[nsw==1])-mean(re78[nsw==0]) 11 | 12 | #OLS with covariates: 13 | summary(lm(re78~nsw+., data=lalonde[,xvars])) 14 | 15 | #Kbal at new defaults: 16 | kbalout= kbal(allx=lalonde[,xvars],treatment=lalonde$nsw, 17 | ebal.tol=1e-6, printprogress =TRUE) 18 | summary(lm(re78~nsw,w=kbalout$w, data = lalonde)) 19 | 20 | # Examine bias bound due to remaining imbalances; 21 | #note that using KRLS, gamma = c'Kc for the related regression is approximately 55 22 | kbalout$biasbound.orig*sd(re78)*sqrt(55) 23 | kbalout$biasbound.opt*sd(re78)*sqrt(55) 24 | plot(x=kbalout$dist.record[1,],y=sd(re78)*sqrt(55)*kbalout$dist.record[2,],ylab="biasbound", xlab="Num. dims of K balanced", pch=16) 25 | --------------------------------------------------------------------------------