├── data
│   └── recs.rda
├── _pkgdown.yml
├── man
│   ├── figures
│   │   ├── README-unnamed-chunk-10-1.png
│   │   └── README-unnamed-chunk-21-1.png
│   ├── fusionModel-package.Rd
│   ├── importance.Rd
│   ├── impute.Rd
│   ├── read_fsd.Rd
│   ├── recs.Rd
│   ├── monotonic.Rd
│   ├── assemble.Rd
│   ├── prepXY.Rd
│   ├── fuse.Rd
│   ├── validate.Rd
│   ├── plot_valid.Rd
│   ├── fusionOutput.Rd
│   ├── train.Rd
│   ├── analyze.Rd
│   └── analyze_fusionACS.Rd
├── R
│   ├── fusionModel-package.R
│   ├── zzz.R
│   ├── data.R
│   ├── lookup.R
│   ├── importance.R
│   ├── fitLGB.R
│   ├── read_fsd.R
│   ├── monotonic.R
│   ├── impute.R
│   ├── assemble.R
│   ├── validate.R
│   ├── prepXY.R
│   ├── plot_valid.R
│   └── fuseCART.R
├── NAMESPACE
├── DESCRIPTION
├── .github
│   └── workflows
│       └── pkgdown.yaml
├── data-raw
│   └── recs.R
└── README.Rmd

/data/recs.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ummel/fusionModel/HEAD/data/recs.rda
--------------------------------------------------------------------------------
/_pkgdown.yml:
--------------------------------------------------------------------------------
1 | url: https://ummel.github.io/fusionModel/
2 | template:
3 |   bootstrap: 5
4 | 
5 | 
--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-10-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ummel/fusionModel/HEAD/man/figures/README-unnamed-chunk-10-1.png
--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-21-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ummel/fusionModel/HEAD/man/figures/README-unnamed-chunk-21-1.png
--------------------------------------------------------------------------------
/R/fusionModel-package.R:
--------------------------------------------------------------------------------
1 | #' @keywords internal
2 | "_PACKAGE"
3 | 
4 | # The following block is used by usethis to automatically manage
5 | # roxygen namespace tags. Modify with care!
6 | ## usethis namespace: start 7 | ## usethis namespace: end 8 | NULL 9 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(analyze) 4 | export(analyze_fusionACS) 5 | export(assemble) 6 | export(fuse) 7 | export(fuseCART) 8 | export(fusionOutput) 9 | export(importance) 10 | export(impute) 11 | export(monotonic) 12 | export(plot_valid) 13 | export(prepXY) 14 | export(read_fsd) 15 | export(train) 16 | export(trainCART) 17 | export(validate) 18 | import(collapse) 19 | import(data.table, except = c(first, last, between, fdroplevels)) 20 | import(dplyr) 21 | import(stats, except = c(filter, lag, D)) 22 | import(stringr) 23 | -------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | # Specify package imports 2 | #' @import stringr 3 | #' @import collapse 4 | #' @import dplyr 5 | #' @rawNamespace import(stats, except = c(filter, lag, D)) 6 | #' @rawNamespace import(data.table, except = c(first, last, between, fdroplevels)) 7 | NULL 8 | 9 | #----- 10 | 11 | .onLoad <- function (libname, pkgname) { 12 | 13 | # Create default option value for number of cores 14 | options(fusionModel.cores = max(1L, parallel::detectCores() - 1L)) 15 | 16 | # Package startup message 17 | packageStartupMessage("fusionModel v", utils::packageVersion("fusionModel"), " | https://github.com/ummel/fusionModel") 18 | 19 | } 20 | -------------------------------------------------------------------------------- /man/fusionModel-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fusionModel-package.R 3 | \docType{package} 4 | \name{fusionModel-package} 5 | \alias{fusionModel} 6 | \alias{fusionModel-package} 7 | \title{fusionModel: Data fusion and analysis of synthetic data in R} 8 | \description{ 9 | Data fusion and analysis of synthetic data in R. 10 | } 11 | \seealso{ 12 | Useful links: 13 | \itemize{ 14 | \item \url{https://ummel.github.io/fusionModel/} 15 | } 16 | 17 | } 18 | \author{ 19 | \strong{Maintainer}: Kevin Ummel \email{ummel@berkeley.edu} 20 | 21 | Other contributors: 22 | \itemize{ 23 | \item Karthik Akkiraju [contributor] 24 | \item Miguel Poblete Cazenave [contributor] 25 | } 26 | 27 | } 28 | \keyword{internal} 29 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: fusionModel 2 | Title: Data fusion and analysis of synthetic data in R 3 | Version: 2.3.8 4 | Authors@R: c( 5 | person(given = "Kevin", family = "Ummel", role = c("aut", "cre"), email = "ummel@berkeley.edu"), 6 | person(given = "Karthik", family = "Akkiraju", role = "ctb"), 7 | person(given = "Miguel", family = "Poblete Cazenave", role = "ctb")) 8 | Description: Data fusion and analysis of synthetic data in R. 
9 | License: GPL (>= 3) 10 | Depends: 11 | R (>= 3.5.0) 12 | Imports: 13 | data.table, 14 | dplyr, 15 | collapse, 16 | fst, 17 | glmnet, 18 | lightgbm, 19 | Matrix, 20 | matrixStats, 21 | mgcv, 22 | purrr, 23 | RANN, 24 | stats, 25 | stringr, 26 | utils, 27 | zip, 28 | rlang, 29 | tidyr 30 | Suggests: 31 | ggplot2, 32 | scam 33 | Encoding: UTF-8 34 | LazyData: true 35 | Roxygen: list(markdown = TRUE) 36 | RoxygenNote: 7.3.2 37 | URL: https://ummel.github.io/fusionModel/ 38 | -------------------------------------------------------------------------------- /man/importance.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/importance.R 3 | \name{importance} 4 | \alias{importance} 5 | \title{Extract predictor variable importance from a fusion model} 6 | \usage{ 7 | importance(fsn) 8 | } 9 | \arguments{ 10 | \item{fsn}{Character. Path to fusion model file (.fsn) generated by \code{\link{train}}.} 11 | } 12 | \value{ 13 | A named list containing \code{detailed} and \code{summary} importance results. The \code{summary} results are most useful, as they return the average importance of each predictor across potentially multiple underlying LightGBM models; i.e. zero ("z"), mean ("m"), or quantile ("q") models. See Examples for suggested plotting of results. 14 | } 15 | \description{ 16 | Returns predictor variable (feature) importance of underlying LightGBM models stored in a fusion model file (.fsn) on disk. 17 | } 18 | \details{ 19 | Importance metrics are computed via \code{\link[lightgbm]{lgb.importance}}. Three types of measures are returned; "gain" is typically the preferred measure. 20 | } 21 | \examples{ 22 | # Build a fusion model using RECS microdata 23 | # Note that "fusion_model.fsn" will be written to working directory 24 | ?recs 25 | fusion.vars <- c("electricity", "natural_gas", "aircon") 26 | predictor.vars <- names(recs)[2:12] 27 | fsn.path <- train(data = recs, y = fusion.vars, x = predictor.vars) 28 | 29 | # Extract predictor variable importance 30 | ximp <- importance(fsn.path) 31 | 32 | # Plot summary results 33 | library(ggplot2) 34 | ggplot(ximp$summary, aes(x = x, y = gain)) + 35 | geom_bar(stat = "identity") + 36 | facet_grid(~ y) + 37 | coord_flip() 38 | 39 | # View detailed results 40 | View(ximp$detailed) 41 | } 42 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples#build-pkgdown-site 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | # 10/29/24: Set on:workflow_dispatch to allow workflow to be triggered manually: https://docs.github.com/en/actions/managing-workflow-runs-and-deployments/managing-workflow-runs/manually-running-a-workflow 4 | on: 5 | # push: 6 | # branches: [main, master] 7 | # pull_request: 8 | # branches: [main, master] 9 | # release: 10 | # types: [published] 11 | workflow_dispatch: 12 | 13 | name: pkgdown.yaml 14 | 15 | permissions: read-all 16 | 17 | jobs: 18 | pkgdown: 19 | runs-on: ubuntu-latest 20 | # Only restrict concurrency for non-PR jobs 21 | concurrency: 22 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 23 | env: 24 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 25 | permissions: 26 | contents: write 27 | steps: 28 | - uses: actions/checkout@v4 29 | 30 | - uses: r-lib/actions/setup-pandoc@v2 31 | 32 | - uses: r-lib/actions/setup-r@v2 33 | with: 34 | use-public-rspm: true 35 | 36 | - uses: r-lib/actions/setup-r-dependencies@v2 37 | with: 38 | extra-packages: any::pkgdown, local::. 39 | needs: website 40 | 41 | # Manual edit by Kevin Ummel: add 'examples = FALSE' to prevent running of documentation examples 42 | - name: Build site 43 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE, examples = FALSE) 44 | shell: Rscript {0} 45 | 46 | - name: Deploy to GitHub pages 🚀 47 | if: github.event_name != 'pull_request' 48 | uses: JamesIves/github-pages-deploy-action@v4.5.0 49 | with: 50 | clean: false 51 | branch: gh-pages 52 | folder: docs 53 | -------------------------------------------------------------------------------- /man/impute.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/impute.R 3 | \name{impute} 4 | \alias{impute} 5 | \title{Impute missing data via fusion} 6 | \usage{ 7 | impute( 8 | data, 9 | weight = NULL, 10 | ignore = NULL, 11 | cores = parallel::detectCores(logical = FALSE) - 1L 12 | ) 13 | } 14 | \arguments{ 15 | \item{data}{A data frame with missing values.} 16 | 17 | \item{weight}{Optional name of observation weights column in \code{data}.} 18 | 19 | \item{ignore}{Optional names of columns in \code{data} to ignore. These variables are neither imputed nor used as predictors.} 20 | 21 | \item{cores}{Number of physical CPU cores used by \code{\link[lightgbm]{lightgbm}}. LightGBM is parallel-enabled on all platforms if OpenMP is available.} 22 | } 23 | \value{ 24 | A data frame with all missing values imputed. 25 | } 26 | \description{ 27 | A universal missing data imputation tool that wraps successive calls to \code{\link{train}} and \code{\link{fuse}} under the hood. Designed for simplicity and ease of use. 28 | } 29 | \details{ 30 | Variables with missing values are imputed sequentially, beginning with the variable with the fewest missing values. Since LightGBM models accommodate NA values in the predictor set, all available variables are used as potential predictors (excluding \code{ignore} variables). For each call to \code{\link{train}}, 80\% of observations are randomly selected for training and the remaining 20\% are used as a validation set to determine an appropriate number of tree learners. All LightGBM model parameters are kept at the sensible default values in \code{\link{train}}. 
Since \code{\link[lightgbm]{lightgbm}} uses OpenMP multithreading, it is not advisable to use \code{\link{impute}} inside a forked/parallel process when \code{cores > 1}. 31 | } 32 | \examples{ 33 | # Create data frame with random NA values 34 | ?recs 35 | data <- recs[, 2:7] 36 | miss <- replicate(ncol(data), runif(nrow(data)) < runif(1, 0.01, 0.3)) 37 | data[miss] <- NA 38 | colSums(is.na(data)) 39 | 40 | # Impute the missing values 41 | result <- impute(data) 42 | anyNA(result) 43 | 44 | } 45 | -------------------------------------------------------------------------------- /data-raw/recs.R: -------------------------------------------------------------------------------- 1 | recs <- fst::read_fst("~/Documents/Projects/fusionData/survey-processed/RECS/2015/RECS_2015_H_processed.fst") |> 2 | dplyr::select( 3 | weight, 4 | moneypy, 5 | hhage, 6 | householder_race, 7 | education, 8 | employhh, 9 | nhsldmem, 10 | recs_division, 11 | ur12, 12 | recs_iecc_zone, 13 | kownrent, 14 | typehuq, 15 | yearmaderange, 16 | fuelheat, 17 | adqinsul, 18 | cooltype, 19 | agecenac, 20 | tvcolor, 21 | scalee, 22 | totsqft_en, 23 | kwh, 24 | cufeetng, 25 | gallonfo, 26 | gallonlp, 27 | btulp, 28 | dollarlp, 29 | sdescent, 30 | totalbtu, 31 | totalbtusph, 32 | btuelahucol, 33 | btuelcol, 34 | rep_1:rep_96 35 | ) |> 36 | dplyr::rename( 37 | income = moneypy, 38 | age = hhage, 39 | race = householder_race, 40 | employment = employhh, 41 | hh_size = nhsldmem, 42 | division = recs_division, 43 | urban_rural = ur12, 44 | climate = recs_iecc_zone, 45 | renter = kownrent, 46 | home_type = typehuq, 47 | year_built = yearmaderange, 48 | heat_type = fuelheat, 49 | insulation = adqinsul, 50 | aircon = cooltype, 51 | centralac_age = agecenac, 52 | televisions = tvcolor, 53 | disconnect = scalee, 54 | square_feet = totsqft_en, 55 | electricity = kwh, 56 | natural_gas = cufeetng, 57 | fuel_oil = gallonfo, 58 | propane = gallonlp, 59 | propane_btu = btulp, 60 | propane_expend = dollarlp 61 | ) |> 62 | dplyr::mutate( 63 | electricity = as.integer(round(electricity)), # Make integer for testing purposes 64 | heating_share = round(totalbtusph / totalbtu, 3), 65 | cooling_share = round((btuelahucol + btuelcol) / totalbtu, 3), 66 | other_share = 1 - heating_share - cooling_share, 67 | urban_rural = factor(ifelse(urban_rural == "U", "Urban", "Rural")), 68 | renter = renter != "Owned or being bought by someone in your household", 69 | use_ng = natural_gas > 0, 70 | have_ac = aircon != "No air conditioning", 71 | race = ifelse(sdescent == "Yes", "Latino Alone", as.character(race)), 72 | race = factor(trimws(gsub("Alone", "", race))) 73 | ) |> 74 | select(-sdescent, -totalbtu, -totalbtusph, -btuelahucol, -btuelcol) |> 75 | select(-starts_with("rep_"), starts_with("rep_")) 76 | 77 | usethis::use_data(recs, overwrite = TRUE) 78 | -------------------------------------------------------------------------------- /man/read_fsd.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/read_fsd.R 3 | \name{read_fsd} 4 | \alias{read_fsd} 5 | \title{Read fusion output from disk} 6 | \usage{ 7 | read_fsd( 8 | path, 9 | columns = NULL, 10 | M = 1, 11 | df = NULL, 12 | cores = max(1, parallel::detectCores(logical = FALSE) - 1) 13 | ) 14 | } 15 | \arguments{ 16 | \item{path}{Character. Path to a \code{.fsd} (or \code{.fst}) file, typically produced by \code{\link{fuse}}.} 17 | 18 | \item{columns}{Character. Column names to read. 
The default is to return all columns.} 19 | 20 | \item{M}{Integer. The first \code{M} implicates are returned. Set \code{M = Inf} to return all implicates. Ignored if \code{M} column not present in data.} 21 | 22 | \item{df}{Data frame. Data frame used to identify a subset of rows to return. Default is to return all rows.} 23 | 24 | \item{cores}{Integer. Number of cores used by \code{\link[fst]{fst}}.} 25 | } 26 | \value{ 27 | A \code{\link[data.table]{data.table}}; keys are preserved if present in the on-disk data. When \code{path} points to a \code{.fsd} file, it includes an integer column "M" indicating the implicate assignment of each observation (unless explicitly ignored by \code{columns}). 28 | } 29 | \description{ 30 | Efficiently read fusion output that was written to disk, optionally returning a subset of rows and/or columns. Since a \code{.fsd} file is simply a \code{\link[fst]{fst}} file under the hood, this function also works on any \code{.fst} file. 31 | } 32 | \details{ 33 | If \code{df} is provided and the file size on disk is less than 100 MB, then a full read and inner \code{\link[collapse]{join}} is performed. For larger files, a manual read of the required rows is performed, using \code{\link[collapse]{fmatch}} for the matching operation. 34 | } 35 | \examples{ 36 | # Build a fusion model using RECS microdata 37 | # Note that "fusion_model.fsn" will be written to working directory 38 | ?recs 39 | fusion.vars <- c("electricity", "natural_gas", "aircon") 40 | predictor.vars <- names(recs)[2:12] 41 | fsn.path <- train(data = recs, y = fusion.vars, x = predictor.vars) 42 | 43 | # Write fusion output directly to disk 44 | # Note that "results.fsd" will be written to working directory 45 | recipient <- recs[predictor.vars] 46 | sim <- fuse(data = recipient, fsn = fsn.path, M = 5, fsd = "results.fsd") 47 | 48 | # Read the fusion output saved to disk 49 | sim <- read_fsd(sim) 50 | head(sim) 51 | 52 | } 53 | -------------------------------------------------------------------------------- /R/data.R: -------------------------------------------------------------------------------- 1 | #' Data from the 2015 Residential Energy Consumption Survey (RECS) 2 | #' 3 | #' Pre-processed, household-level microdata containing a selection of 31 4 | #' variables derived from the 2015 RECS, plus survey replicate weights. A variety of data types are included. 5 | #' There are no missing values. Variable names have been altered from the original. 
6 | #' 7 | #' @format A tibble with 5,686 rows and 124 variables: 8 | #' \describe{ 9 | #' \item{weight}{Primary sampling weight} 10 | #' \item{income}{Annual gross household income for the last year} 11 | #' \item{age}{Respondent age} 12 | #' \item{race}{Respondent race} 13 | #' \item{education}{Highest education completed by respondent} 14 | #' \item{employment}{Respondent employment status} 15 | #' \item{hh_size}{Number of household members} 16 | #' \item{division}{Census Division} 17 | #' \item{urban_rural}{Census 2010 Urban Type} 18 | #' \item{climate}{IECC Climate Code} 19 | #' \item{renter}{Is household renting the home?} 20 | #' \item{home_type}{Type of housing unit} 21 | #' \item{year_built}{Range when housing unit was built} 22 | #' \item{square_feet}{Total square footage} 23 | #' \item{insulation}{Level of insulation} 24 | #' \item{heat_type}{Main space heating fuel} 25 | #' \item{aircon}{Type of air conditioning equipment used} 26 | #' \item{centralac_age}{Age of central air conditioner} 27 | #' \item{televisions}{Number of televisions used} 28 | #' \item{disconnect}{Frequency of receiving disconnect notice} 29 | #' \item{electricity}{Total annual electricity usage, in kilowatthours} 30 | #' \item{natural_gas}{Total annual natural gas usage, in hundred cubic feet} 31 | #' \item{fuel_oil}{Total annual fuel oil/kerosene usage, in gallons} 32 | #' \item{propane}{Total annual propane usage, in gallons} 33 | #' \item{propane_btu}{Total annual propane usage, in thousand Btu} 34 | #' \item{propane_expend}{Total annual propane expenditure, in dollars} 35 | #' \item{heating_share}{Share of total energy used for space heating} 36 | #' \item{cooling_share}{Share of total energy used for cooling (AC and fans)} 37 | #' \item{other_share}{Share of total energy used for other end-uses} 38 | #' \item{use_ng}{Logical indicating if household uses natural gas} 39 | #' \item{have_ac}{Logical indicating if household has air conditioning} 40 | #' \item{rep_1:rep_96}{Replicate weights for uncertainty estimation} 41 | #' } 42 | #' @source \url{https://www.eia.gov/consumption/residential/data/2015/} 43 | "recs" 44 | -------------------------------------------------------------------------------- /man/recs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{recs} 5 | \alias{recs} 6 | \title{Data from the 2015 Residential Energy Consumption Survey (RECS)} 7 | \format{ 8 | A tibble with 5,686 rows and 124 variables: 9 | \describe{ 10 | \item{weight}{Primary sampling weight} 11 | \item{income}{Annual gross household income for the last year} 12 | \item{age}{Respondent age} 13 | \item{race}{Respondent race} 14 | \item{education}{Highest education completed by respondent} 15 | \item{employment}{Respondent employment status} 16 | \item{hh_size}{Number of household members} 17 | \item{division}{Census Division} 18 | \item{urban_rural}{Census 2010 Urban Type} 19 | \item{climate}{IECC Climate Code} 20 | \item{renter}{Is household renting the home?} 21 | \item{home_type}{Type of housing unit} 22 | \item{year_built}{Range when housing unit was built} 23 | \item{square_feet}{Total square footage} 24 | \item{insulation}{Level of insulation} 25 | \item{heat_type}{Main space heating fuel} 26 | \item{aircon}{Type of air conditioning equipment used} 27 | \item{centralac_age}{Age of central air conditioner} 28 | \item{televisions}{Number of televisions used} 29 |
\item{disconnect}{Frequency of receiving disconnect notice} 30 | \item{electricity}{Total annual electricity usage, in kilowatthours} 31 | \item{natural_gas}{Total annual natural gas usage, in hundred cubic feet} 32 | \item{fuel_oil}{Total annual fuel oil/kerosene usage, in gallons} 33 | \item{propane}{Total annual propane usage, in gallons} 34 | \item{propane_btu}{Total annual propane usage, in thousand Btu} 35 | \item{propane_expend}{Total annual propane expenditure, in dollars} 36 | \item{heating_share}{Share of total energy used for space heating} 37 | \item{cooling_share}{Share of total energy used for cooling (AC and fans)} 38 | \item{other_share}{Share of total energy used for other end-uses} 39 | \item{use_ng}{Logical indicating if household uses natural gas} 40 | \item{have_ac}{Logical indicating if household has air conditioning} 41 | \item{rep_1:rep_96}{Replicate weights for uncertainty estimation} 42 | } 43 | } 44 | \source{ 45 | \url{https://www.eia.gov/consumption/residential/data/2015/} 46 | } 47 | \usage{ 48 | recs 49 | } 50 | \description{ 51 | Pre-processed, household-level microdata containing a selection of 31 52 | variables derived from the 2015 RECS, plus survey replicate weights. A variety of data types are included. 53 | There are no missing values. Variable names have been altered from the original. 54 | } 55 | \keyword{datasets} 56 | -------------------------------------------------------------------------------- /R/lookup.R: -------------------------------------------------------------------------------- 1 | # Lookup details about available variables 2 | 3 | lookup <- function(var = NULL, year = NULL) { 4 | 5 | # Loading fusionData dictionary (not necessarily available) 6 | # data(dictionary) 7 | # names(dictionary) <- tolower(names(dictionary)) 8 | 9 | # Check validity of the working directory path 10 | # Checks if "/fusionData" is part of the path, as this is required 11 | b <- strsplit(full.path(getwd()), .Platform$file.sep, fixed = TRUE)[[1]] 12 | i <- which(b == "fusionData") 13 | if (length(i) == 0) stop("'/fusionData' is not part of the working directory path; this is required.") 14 | dir <- paste(b[1:i], collapse = .Platform$file.sep) 15 | 16 | # Get file path(s) to ACS and fused microdata, based on values in 'year' 17 | fpaths <- lapply(if (is.null(year)) "" else year, function(yr) { 18 | pa <- list.files(file.path(dir, "survey-processed/ACS"), pattern = paste0(yr, "_._processed.fst"), recursive = TRUE, full.names = TRUE) 19 | pc <- list.files(file.path(dir, "survey-processed/ACS"), pattern = paste0(yr, "_._custom.fst"), recursive = TRUE, full.names = TRUE) 20 | pf <- rev(list.files(file.path(dir, "fusion"), pattern = paste0(yr, "_._fused.fsd"), recursive = TRUE, full.names = TRUE)) 21 | c(pa, pc, pf) 22 | }) %>% 23 | unlist() 24 | 25 | # Extract the 'var' present in each file in 'fpaths' 26 | vlist <- if (is.null(var)) { 27 | lapply(fpaths, function(x) fst::metadata_fst(x)$columnNames) 28 | } else { 29 | lapply(fpaths, function(x) intersect(var, fst::metadata_fst(x)$columnNames)) 30 | } 31 | 32 | acs.year <- str_extract(basename(fpaths) , "(\\d{4})(?=_[^_]*_[^_]*$)") 33 | rtype <- str_extract(basename(fpaths), ".(?=_[^_]*$)") 34 | rtype <- case_when(rtype == "H" ~ "Household", rtype == "P" ~ "Person") 35 | source <- basename(fpaths) %>% 36 | str_extract("^[^_]*_[^_]*") %>% 37 | str_split(pattern = "_", n = 2, simplify = TRUE) %>% 38 | as.data.frame() %>% 39 | setNames(c('survey', 'vintage')) %>% 40 | mutate(source = 1:n()) 41 | 42 | out <- vlist %>% 43
| tibble::enframe(name = "source", value = "var") %>% 44 | left_join(source, by = join_by(source)) %>% 45 | mutate(respondent = rtype, 46 | acs_year = acs.year, 47 | path = gsub(dir, "", fpaths, fixed = TRUE), 48 | source = NULL) %>% 49 | tidyr::unnest(var) %>% 50 | filter(!var %in% c('M', 'year', 'hid', 'weight'), !str_detect(var, "^rep_\\d*$")) %>% 51 | #left_join(dictionary, by = join_by(var == variable, survey, vintage, respondent)) %>% 52 | #select(var, acs_year, respondent, survey, vintage, description, type, values, path) 53 | select(var, acs_year, respondent, survey, vintage, path) %>% 54 | arrange(across(everything())) 55 | 56 | return(out) 57 | 58 | } 59 | 60 | -------------------------------------------------------------------------------- /man/monotonic.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/monotonic.R 3 | \name{monotonic} 4 | \alias{monotonic} 5 | \title{Create a monotonic relationship between two variables} 6 | \usage{ 7 | monotonic( 8 | x, 9 | y, 10 | w = NULL, 11 | preserve = TRUE, 12 | expend = TRUE, 13 | fast = TRUE, 14 | nmax = 5000, 15 | plot = FALSE 16 | ) 17 | } 18 | \arguments{ 19 | \item{x}{Numeric.} 20 | 21 | \item{y}{Numeric.} 22 | 23 | \item{w}{Numeric. Optional observation weights.} 24 | 25 | \item{preserve}{Logical. Preserve the original mean of the \code{y} values in the returned values?} 26 | 27 | \item{expend}{Logical. Assume \code{y} is an expenditure variable? If \code{TRUE}, a safety check is implemented to ensure \code{y > 0} when \code{x > 0}.} 28 | 29 | \item{fast}{Logical. If \code{TRUE}, only \code{\link[scam]{supsmu}} is used with coercion of result to monotone.} 30 | 31 | \item{nmax}{Integer. Maximum number of observations to use for smoothing. Set lower for faster computation. \code{nmax = Inf} eliminates sampling.} 32 | 33 | \item{plot}{Logical. Plot the (sampled) data points and derived monotonic relationship?} 34 | } 35 | \value{ 36 | A numeric vector of modified \code{y} values. Optionally, a plot showing the returned monotonic relationship. 37 | } 38 | \description{ 39 | \code{monotonic()} returns modified values of input vector \code{y} that are smoothed, monotonic, and consistent across all values of input \code{x}. It was designed to be used post-fusion when one wants to ensure a plausible relationship between consumption (\code{x}) and expenditure (\code{y}), under the assumption that all consumers face an identical, monotonic pricing structure. By default, the mean of the returned values is forced to equal the original mean of \code{y} (\code{preserve = TRUE}). The direction of monotonicity (increasing or decreasing) is detected automatically, so use cases are not limited to consumption and expenditure variables. 40 | } 41 | \details{ 42 | The initial smoothing is accomplished via \code{\link[scam]{supsmu}} with the result coerced to monotone. If \code{fast = FALSE} and the coercion step modifies the values too much, a second smooth is attempted via a \code{\link[scam]{scam}} model with either a monotone increasing or decreasing constraint. If the SCAM fails to fit, the function falls back to \code{\link[stats]{lm}} with simple linear predictions. If \code{y = 0} when \code{x = 0} (as typical for consumption-expenditure variables), then that outcome is enforced in the result. The input data are randomly sampled to no more than \code{nmax} observations, if necessary, for speed. 
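A rough sketch of the smooth-then-coerce idea described above, using the package's \code{recs} data (illustrative only; this is not the internal implementation of \code{monotonic()}, which also handles direction detection, the SCAM/lm fallbacks, and the zero-consumption case):

fit <- stats::supsmu(recs$propane_btu, recs$propane_expend)         # initial smooth of y on x
mono <- cummax(fit$y)                                               # coerce the smoothed values to be non-decreasing
yhat <- approx(fit$x, mono, xout = recs$propane_btu, rule = 2)$y    # map the monotone curve back to the original x values
yhat <- yhat * mean(recs$propane_expend) / mean(yhat)               # rescale so the original mean of y is preserved (preserve = TRUE)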
43 | } 44 | \examples{ 45 | y <- monotonic(x = recs$propane_btu, y = recs$propane_expend, plot = TRUE) 46 | mean(recs$propane_expend) 47 | mean(y) 48 | } 49 | -------------------------------------------------------------------------------- /man/assemble.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/assemble.R 3 | \name{assemble} 4 | \alias{assemble} 5 | \title{Assemble fusionACS microdata across surveys} 6 | \usage{ 7 | assemble( 8 | year, 9 | var, 10 | respondent = "household", 11 | M = 1, 12 | df = NULL, 13 | cores = 1, 14 | source = "all", 15 | silent = FALSE 16 | ) 17 | } 18 | \arguments{ 19 | \item{year}{Integer. One or more years for which to return results (i.e. the ACS recipient year).} 20 | 21 | \item{var}{Character. Name of one or more variables to return. May contain household- and/or person-level variables. See Details.} 22 | 23 | \item{respondent}{Character. Should \code{"household"}- or \code{"person"}-level microdata be returned?} 24 | 25 | \item{M}{Integer. The first \code{M} implicates are returned for fused variables. Set \code{M = Inf} to return all implicates. Ignored if \code{var} contains only ACS variables (i.e. no implicates)} 26 | 27 | \item{df}{Data frame. Data frame used to identify a subset of rows to return. Default is to return all rows.} 28 | 29 | \item{cores}{Integer. Number of cores used by the \code{\link[fst]{fst-package}} when reading from disk.} 30 | 31 | \item{source}{Character Specifies where to look for \code{var}: all available microdata (\code{source = "all"}); only ACS microdata (\code{source = "ACS"}); or only fused microdata (\code{source = "fused"}). Note that no observation weights are returned if \code{source = "fused"}, since weights are stored in the ACS microdata.} 32 | 33 | \item{silent}{Logical. If \code{FALSE}, a warning is issued if any \code{var} cannot be located in available local files.} 34 | } 35 | \value{ 36 | A keyed data table containing the following columns, in addition to the variables named in \code{var}: 37 | \describe{ 38 | \item{M}{Implicate number. See \code{\link{fuse}}.} 39 | \item{year}{Year of the ACS recipient microdata.} 40 | \item{hid}{ACS household ID using fusionACS convention.} 41 | \item{pid}{ACS person ID using fusionACS convention, if \code{respondent = "person"}.} 42 | \item{weight}{ACS microdata primary sample weight.} 43 | } 44 | } 45 | \description{ 46 | For fusionACS usage only. Provides a safe and efficient way to assemble (merge) fused microdata across surveys to return a single data table with the requested variables. The requested variables can come from any fused (donor) survey and/or the American Community Survey (ACS). The necessary variables are automatically and efficiently read from the appropriate local file and safely merged on household and/or person ID variables, optionally collapsing or expanding records as necessary depending on the \code{respondent} argument. Assumes (and checks for) a local \code{/fusionData} directory with appropriate file structure and conventions. 47 | } 48 | \details{ 49 | The \code{var} argument can contain a mix of household- and/or person-level variables. When \code{respondent = "household"}, the reference person (i.e. head of household) value is returned for any person-level variables. When \code{respondent = "person"}, the values of any household-level variables are replicated for each person in the household. 
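A toy illustration of the merge behavior described above, using hypothetical columns rather than actual fusionACS microdata (\code{tenure} and the tiny example tables are made up for this sketch):

hh  <- data.frame(hid = 1:2, tenure = c("Own", "Rent"))                       # household-level variable
per <- data.frame(hid = c(1, 1, 2), pid = c(1, 2, 1), agep = c(40, 38, 29))   # person-level variable
# respondent = "person": household-level values are replicated for each person in the household
dplyr::left_join(per, hh, by = "hid")
# respondent = "household": person-level values are taken from the reference person (pid == 1)
dplyr::right_join(dplyr::filter(per, pid == 1), hh, by = "hid")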
50 | } 51 | \examples{ 52 | # NOTE: Requires local /fusionData directory containing the necessary ACS and .fsd files 53 | test <- assemble(year = 2018:2019, 54 | var = c("dollarel", "hincp", "agep", "state"), 55 | respondent = "household", 56 | M = 1) 57 | } 58 | -------------------------------------------------------------------------------- /man/prepXY.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/prepXY.R 3 | \name{prepXY} 4 | \alias{prepXY} 5 | \title{Prepare the 'x' and 'y' inputs} 6 | \usage{ 7 | prepXY( 8 | data, 9 | y, 10 | x, 11 | weight = NULL, 12 | cor_thresh = 0.05, 13 | lasso_thresh = 0.95, 14 | xmax = 100, 15 | xforce = NULL, 16 | fraction = 1, 17 | cores = 1 18 | ) 19 | } 20 | \arguments{ 21 | \item{data}{Data frame. Training dataset. All categorical variables should be factors and ordered whenever possible.} 22 | 23 | \item{y}{Character or list. Variables in \code{data} to eventually fuse to a recipient dataset. If \code{y} is a list, each entry is a character vector possibly indicating multiple variables to fuse as a block.} 24 | 25 | \item{x}{Character. Predictor variables in \code{data} common to donor and eventual recipient.} 26 | 27 | \item{weight}{Character. Name of the observation weights column in \code{data}. If NULL (default), uniform weights are assumed.} 28 | 29 | \item{cor_thresh}{Numeric. Predictors that exhibit less than \code{cor_thresh} absolute Spearman (rank) correlation with a \code{y} variable are screened out prior to the LASSO step. Fast exclusion of predictors that the LASSO step probably doesn't need to consider.} 30 | 31 | \item{lasso_thresh}{Numeric. Controls how aggressively the LASSO step screens out predictors. Lower value is more aggressive. \code{lasso_thresh = 0.95}, for example, retains predictors that collectively explain at least 95\% of the deviance explained by a "full" model.} 32 | 33 | \item{xmax}{Integer. Maximum number of predictors returned by LASSO step. Does not strictly control the number of final predictors returned (especially for categorical \code{y} variables), but useful for setting a (very) soft upper bound. Lower \code{xmax} can help control computation time if a large number of \code{x} pass the correlation screen. \code{xmax = Inf} imposes no restriction.} 34 | 35 | \item{xforce}{Character. Subset of \code{x} variables to "force" as included predictors in the results.} 36 | 37 | \item{fraction}{Numeric. Fraction of observations in \code{data} to randomly sample. For larger datasets, sampling often has minimal effect on results but speeds up computation.} 38 | 39 | \item{cores}{Integer. Number of cores used. Only applicable on Unix systems.} 40 | } 41 | \value{ 42 | List with named slots "y" and "x". Each is a list of the same length. Former gives the preferred fusion order. Latter gives the preferred sets of predictor variables. 43 | } 44 | \description{ 45 | Optional-but-useful function to: 1) provide a plausible ordering of the 'y' (fusion) variables and 2) identify the subset of 'x' (predictor) variables likely to be consequential during subsequent model training. Output can be passed directly to \code{\link{train}}. Most useful for large datasets with many and/or highly-correlated predictors. 
Employs an absolute Spearman rank correlation screen and then LASSO models (via \code{\link[glmnet]{glmnet}}) to return a plausible ordering of 'y' and the preferred subset of 'x' variables associated with each. 46 | } 47 | \examples{ 48 | y <- names(recs)[c(14:16, 20:22)] 49 | x <- names(recs)[2:13] 50 | 51 | # Fusion variable "blocks" are respected by prepXY() 52 | y <- c(list(y[1:2]), y[-c(1:2)]) 53 | 54 | # Do the prep work... 55 | prep <- prepXY(data = recs, y = y, x = x) 56 | 57 | # The result can be passed to train() 58 | train(data = recs, y = prep$y, x = prep$x) 59 | 60 | } 61 | -------------------------------------------------------------------------------- /R/importance.R: -------------------------------------------------------------------------------- 1 | #' Extract predictor variable importance from a fusion model 2 | #' 3 | #' @description 4 | #' Returns predictor variable (feature) importance of underlying LightGBM models stored in a fusion model file (.fsn) on disk. 5 | #' @param fsn Character. Path to fusion model file (.fsn) generated by \code{\link{train}}. 6 | #' @details Importance metrics are computed via \code{\link[lightgbm]{lgb.importance}}. Three types of measures are returned; "gain" is typically the preferred measure. 7 | #' @return A named list containing \code{detailed} and \code{summary} importance results. The \code{summary} results are most useful, as they return the average importance of each predictor across potentially multiple underlying LightGBM models; i.e. zero ("z"), mean ("m"), or quantile ("q") models. See Examples for suggested plotting of results. 8 | #' @examples 9 | #' # Build a fusion model using RECS microdata 10 | #' # Note that "fusion_model.fsn" will be written to working directory 11 | #' ?recs 12 | #' fusion.vars <- c("electricity", "natural_gas", "aircon") 13 | #' predictor.vars <- names(recs)[2:12] 14 | #' fsn.path <- train(data = recs, y = fusion.vars, x = predictor.vars) 15 | #' 16 | #' # Extract predictor variable importance 17 | #' ximp <- importance(fsn.path) 18 | #' 19 | #' # Plot summary results 20 | #' library(ggplot2) 21 | #' ggplot(ximp$summary, aes(x = x, y = gain)) + 22 | #' geom_bar(stat = "identity") + 23 | #' facet_grid(~ y) + 24 | #' coord_flip() 25 | #' 26 | #' # View detailed results 27 | #' View(ximp$detailed) 28 | #' @export 29 | 30 | importance <- function(fsn) { 31 | 32 | stopifnot(exprs = { 33 | file.exists(fsn) & endsWith(fsn, ".fsn") 34 | }) 35 | 36 | # Temporary directory to unzip .fsn contents to 37 | td <- tempfile() 38 | dir.create(td) 39 | 40 | # Names of files within the .fsn object 41 | fsn.files <- zip::zip_list(fsn) 42 | pfixes <- sort(unique(dirname(fsn.files$filename))) 43 | pfixes <- pfixes[pfixes != "."] 44 | 45 | # Unzip all files in .fsn to temporary directory 46 | zip::unzip(zipfile = fsn, exdir = td) 47 | 48 | # Extract full-detail variable importance metrics 49 | detail <- lapply(1:length(pfixes), FUN = function(i) { 50 | mods <- grep(pattern = utils::glob2rx(paste0(pfixes[i], "*.txt")), x = fsn.files$filename, value = TRUE) 51 | mods %>% 52 | lapply(FUN = function(m) { 53 | y <- sub("_[^_]+$", "", basename(m)) 54 | n <- sub(".txt", "", sub(paste0(y, "_"), "", basename(m), fixed = TRUE), fixed = TRUE) 55 | mod <- lightgbm::lgb.load(filename = file.path(td, m)) 56 | imp <- lightgbm::lgb.importance(mod) %>% 57 | mutate(y = y, 58 | model = n) 59 | return(imp) 60 | }) %>% 61 | data.table::rbindlist() 62 | }) %>% 63 | data.table::rbindlist() %>% 64 | rename_with(tolower) %>% 65 | rename(x = 
feature) %>% 66 | select(y, model, x, everything()) %>% 67 | as.data.frame() 68 | 69 | # Generate summary of full-detail results 70 | smry <- detail %>% 71 | group_by(y, x) %>% 72 | summarize(across(gain:frequency, mean), .groups = "drop_last") %>% 73 | mutate(across(gain:frequency, ~ .x / sum(.x))) %>% # Ensure percentages equal 1 74 | ungroup() %>% 75 | arrange(y, -gain) 76 | 77 | # Recommended plotting order for 'x' 78 | yvars <- unique(detail$y) 79 | xord <- smry %>% 80 | filter(!x %in% yvars) %>% 81 | group_by(x) %>% 82 | summarize(across(gain, mean)) %>% 83 | arrange(gain) %>% 84 | pull(x) 85 | 86 | # Set factor levels for summary 'x' and 'x' variables for suitable plotting 87 | smry$y <- factor(smry$y, levels = yvars) 88 | smry$x <- factor(smry$x, levels = c(rev(yvars), xord)) 89 | 90 | out <- list(summary = smry, detailed = detail) 91 | return(out) 92 | 93 | } 94 | -------------------------------------------------------------------------------- /man/fuse.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fuse.R 3 | \name{fuse} 4 | \alias{fuse} 5 | \title{Fuse variables to a recipient dataset} 6 | \usage{ 7 | fuse( 8 | data, 9 | fsn, 10 | fsd = NULL, 11 | M = 1, 12 | retain = NULL, 13 | kblock = 10, 14 | margin = 2, 15 | cores = 1 16 | ) 17 | } 18 | \arguments{ 19 | \item{data}{Data frame. Recipient dataset. All categorical variables should be factors and ordered whenever possible. Data types and levels are strictly validated against predictor variables defined in \code{fsn}.} 20 | 21 | \item{fsn}{Character. Path to fusion model file (.fsn) generated by \code{\link{train}}.} 22 | 23 | \item{fsd}{Character. Optional fusion output file to be created ending in \code{.fsd} (i.e. "fused data"). This is a compressed binary file that can be read using the \code{\link[fst]{fst}} package. If \code{fsd = NULL} (the default), the fusion results are returned as a \code{\link[data.table]{data.table}}.} 24 | 25 | \item{M}{Integer. Number of implicates to simulate.} 26 | 27 | \item{retain}{Character. Names of columns in \code{data} that should be retained in the output; i.e. repeated across implicates. Useful for retaining ID or weight variables for use in subsequent analysis of fusion output.} 28 | 29 | \item{kblock}{Integer. Fixed number of nearest neighbors to use when fusing variables in a block. Must be >= 5 and <= 30. Not applicable for variables fused on their own (i.e. no block).} 30 | 31 | \item{margin}{Numeric. Safety margin used when estimating how many implicates can be processed in memory at once. Set higher if \code{fuse()} experiences a memory shortfall. Alternatively, can be set to a negative value to manually specify the number of chunks to use. For example, \code{margin = -3} splits \code{M} implicates into three chunks of approximately equal size.} 32 | 33 | \item{cores}{Integer. Number of cores used. LightGBM prediction is parallel-enabled on all systems if OpenMP is available.} 34 | } 35 | \value{ 36 | If \code{fsd = NULL}, a \code{\link[data.table]{data.table}} with number of rows equal to \code{M * nrow(data)}. Integer column "M" indicates implicate assignment of each observation. Note that the ordering of recipient observations is consistent within implicates, so do not change the row order if using with \code{\link{analyze}}. 37 | 38 | If \code{fsd} is specified, the path to .fsd file where results were written. 
Metadata for column classes and factor levels are stored in the column names. \code{\link{read_fsd}} should be used to load files saved via the \code{fsd} argument. 39 | } 40 | \description{ 41 | Fuse variables to a recipient dataset using a .fsn model produced by \code{\link{train}}. Output can be passed to \code{\link{analyze}} and \code{\link{validate}}. 42 | } 43 | \details{ 44 | TO UPDATE. 45 | } 46 | \examples{ 47 | # Build a fusion model using RECS microdata 48 | # Note that "fusion_model.fsn" will be written to working directory 49 | ?recs 50 | fusion.vars <- c("electricity", "natural_gas", "aircon") 51 | predictor.vars <- names(recs)[2:12] 52 | fsn.path <- train(data = recs, y = fusion.vars, x = predictor.vars) 53 | 54 | # Generate single implicate of synthetic 'fusion.vars', 55 | # using original RECS data as the recipient 56 | recipient <- recs[predictor.vars] 57 | sim <- fuse(data = recipient, fsn = fsn.path) 58 | head(sim) 59 | 60 | # Calling fuse() again produces different results 61 | sim <- fuse(data = recipient, fsn = fsn.path) 62 | head(sim) 63 | 64 | # Generate multiple implicates 65 | sim <- fuse(data = recipient, fsn = fsn.path, M = 5) 66 | head(sim) 67 | table(sim$M) 68 | 69 | # Optionally, write results directly to disk 70 | # Note that "results.fsd" will be written to working directory 71 | sim <- fuse(data = recipient, fsn = fsn.path, M = 5, fsd = "results.fsd") 72 | sim <- read_fsd(sim) 73 | head(sim) 74 | 75 | } 76 | -------------------------------------------------------------------------------- /man/validate.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/validate.R 3 | \name{validate} 4 | \alias{validate} 5 | \title{Validate fusion output} 6 | \usage{ 7 | validate( 8 | observed, 9 | implicates, 10 | subset_vars, 11 | weight = NULL, 12 | min_size = 30, 13 | plot = TRUE, 14 | cores = 1 15 | ) 16 | } 17 | \arguments{ 18 | \item{observed}{Data frame. Observed data against which to validate the \code{simulated} variables. Typically the same dataset used to \code{\link{train}} the fusion model used to generate \code{simulated}.} 19 | 20 | \item{implicates}{Data frame. Implicates of synthetic (fused) variables. Typically generated by \link{fuse}. The implicates should be row-stacked and identified by integer column "M".} 21 | 22 | \item{subset_vars}{Character. Vector of columns in \code{observed} used to define the population subsets across which the fusion variables are validated. The levels of each \code{subset_vars} (including all two-way interactions of \code{subset_vars}) define the population subsets. Continuous \code{subset_vars} are converted to a five-level ordered factor based on a univariate k-means clustering.} 23 | 24 | \item{weight}{Character. Name of the observation weights column in \code{observed}. If NULL (default), uniform weights are assumed.} 25 | 26 | \item{min_size}{Integer. Subsets with less than \code{min_size} observations are excluded. Since subsets with few observations are unlikely to give reliable estimates, it doesn't make sense to consider them for validation purposes.} 27 | 28 | \item{plot}{Logical. If TRUE (default), \code{\link{plot_valid}} is called internally and summary plots are returned along with complete validation results. Requires the \code{\link{ggplot2}} package.} 29 | 30 | \item{cores}{Integer. Number of cores used. 
Only applicable on Unix systems.} 31 | } 32 | \value{ 33 | If \code{plot = FALSE}, a data frame containing complete validation results. If \code{plot = TRUE}, a list containing full results as well as additional plot objects as described in \code{\link{plot_valid}}. 34 | } 35 | \description{ 36 | Performs internal validation analyses on fused microdata to estimate how well the simulated variables reflect patterns in the dataset used to train the underlying fusion model (i.e. observed/donor data). This provides a standard approach to validating fusion output and associated models. See Examples for recommended usage. 37 | } 38 | \details{ 39 | The objective of \code{\link{validate}} is to confirm that the fusion output is sensible and help establish the utility of the synthetic data across myriad analyses. Utility here is based on comparison of point estimates and confidence intervals derived using multiple-implicate synthetic data with those derived using the original donor data. 40 | 41 | The specific analyses tested include variable levels (means and proportions) across population subsets of varying size. This allows estimates of how each of the synthetic variables performs in analyses with real-world relevance, at varying levels of complexity. In effect, \code{validate()} performs a large number of analyses of the kind that the \code{\link{analyze}} function is designed to do on a one-by-one basis. 42 | 43 | Most users will want to use the default setting \code{plot = TRUE} to simultaneously return visualization (plots) of the validation results. Plot creation is detailed in \code{\link{plot_valid}}. 44 | } 45 | \examples{ 46 | # Build a fusion model using RECS microdata 47 | # Note that "fusion_model.fsn" will be written to working directory 48 | fusion.vars <- c("electricity", "natural_gas", "aircon") 49 | predictor.vars <- names(recs)[2:12] 50 | fsn.path <- train(data = recs, 51 | y = fusion.vars, 52 | x = predictor.vars, 53 | weight = "weight") 54 | 55 | # Fuse back onto the donor data (multiple implicates) 56 | sim <- fuse(data = recs, 57 | fsn = fsn.path, 58 | M = 20) 59 | 60 | # Calculate validation results 61 | valid <- validate(observed = recs, 62 | implicates = sim, 63 | subset_vars = c("income", "education", "race", "urban_rural")) 64 | 65 | } 66 | -------------------------------------------------------------------------------- /man/plot_valid.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot_valid.R 3 | \name{plot_valid} 4 | \alias{plot_valid} 5 | \title{Plot validation results} 6 | \usage{ 7 | plot_valid(valid, y = NULL, path = NULL, cores = 1, ...) 8 | } 9 | \arguments{ 10 | \item{valid}{Object returned by \code{\link{validate}}.} 11 | 12 | \item{y}{Character. Fusion variables to use for validation graphics. Useful for plotting partial validation results. Default is to use all fusion variables present in \code{valid}.} 13 | 14 | \item{path}{Character. Path to directory where .png graphics are to be saved. Directory is created if necessary. If NULL (default), no files are saved to disk.} 15 | 16 | \item{cores}{Integer. Number of cores used. Only applicable on Unix systems.} 17 | 18 | \item{...}{Arguments passed to \code{\link[ggplot2]{ggsave}} to control .png graphics saved to disk.} 19 | } 20 | \value{ 21 | A list with "plots", "smooth", and "data" slots.
The "plots" slot contains the following \code{\link[ggplot2]{ggplot}} objects: 22 | \itemize{ 23 | \item est: Comparison of point estimates (median absolute percent error). 24 | \item moe: Comparison of 90\% margin of error (median ratio of simulated-to-observed MOE). 25 | \item Additional named slots (one for each of the fusion variables) contain the plots described above with scatterplot results. 26 | } 27 | "smooth" is a data frame with the plotting values used to produce the smoothed median plots. 28 | "data" is a data frame with the complete validation results as returned by the original call to \code{\link{validate}}. 29 | } 30 | \description{ 31 | Creates and optionally saves to disk representative plots of validation results returned by \code{\link{validate}}. Requires the suggested \code{\link{ggplot2}} package. This function is (by default) called within \code{\link{validate}}. Can be useful on its own to save graphics to disk or generate plots for a subset of fusion variables. 32 | } 33 | \details{ 34 | Validation results are visualized to convey expected, typical (median) performance of the fusion variables. That is, how well do the simulated data match the observed data with respect to point estimates and confidence intervals for population subsets of various size? 35 | 36 | Plausible error metrics are derived from the input validation data for plotting. For comparison of point estimates, the error metric is absolute percent error for continuous variables; in the categorical case it is absolute error scaled such that the maximum possible error is 1. Since these metrics are not strictly comparable, the all-variable plots denote categorical fusion variables with dotted lines. 37 | 38 | For a given fusion variable, the error metric will exhibit variation (often quite skewed) even for subsets of comparable size, due to the fact that each subset looks at a unique partition of the data. In order to convey how expected, typical performance varies with subset size, the smoothed median error conditional on subset size is approximated and plotted. 
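As a rough illustration of the smoothed conditional median idea (simulated error values and a simple running median; this is not the internal implementation used by \code{plot_valid()}):

set.seed(1)
res <- data.frame(size = sort(sample(30:5000, 500, replace = TRUE)))   # subset sizes
res$error <- 100 * abs(rnorm(500, sd = 1 / sqrt(res$size)))            # skewed error metric that shrinks as subsets grow
res$med <- stats::runmed(res$error, k = 31)                            # running median as a simple smoother
plot(res$size, res$error, col = "grey", log = "x", xlab = "Subset size", ylab = "Error metric")
lines(res$size, res$med, lwd = 2)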
39 | } 40 | \examples{ 41 | # Build a fusion model using RECS microdata 42 | # Note that "fusion_model.fsn" will be written to working directory 43 | fusion.vars <- c("electricity", "natural_gas", "aircon") 44 | predictor.vars <- names(recs)[2:12] 45 | fsn.path <- train(data = recs, 46 | y = fusion.vars, 47 | x = predictor.vars, 48 | weight = "weight") 49 | 50 | # Fuse back onto the donor data (multiple implicates) 51 | sim <- fuse(data = recs, 52 | fsn = fsn.path, 53 | M = 30) 54 | 55 | # Calculate validation results but do not generate plots 56 | valid <- validate(observed = recs, 57 | implicates = sim, 58 | subset_vars = c("income", "education", "race", "urban_rural"), 59 | weight = "weight", 60 | plot = FALSE) 61 | 62 | # Create validation plots 63 | valid <- plot_valid(valid) 64 | 65 | # View some of the plots 66 | valid$plots$est 67 | valid$plots$moe 68 | valid$plots$electricity$bias 69 | 70 | # Can also save the plots to disk at creation 71 | # Will save .png files to 'valid_plots' folder in working directory 72 | # Note that it is fine to pass a 'valid' object with existing $plots slot 73 | # In that case, the plots are simply re-generated 74 | vplots <- plot_valid(valid, 75 | path = file.path(getwd(), "valid_plots"), 76 | width = 8, height = 6) 77 | 78 | } 79 | -------------------------------------------------------------------------------- /R/fitLGB.R: -------------------------------------------------------------------------------- 1 | fitLGB <- function(dfull, dtrain = NULL, dvalid = NULL, cv.folds = NULL, hyper.grid, params.obj) { 2 | 3 | # If full cross-validation is requested... 4 | if (is.list(cv.folds)) { 5 | perf <- lapply(hyper.grid, FUN = function(x) { 6 | sink <- utils::capture.output({ 7 | mod <- lightgbm::lgb.cv( 8 | params = c(as.list(x), params.obj), 9 | data = dfull, 10 | folds = cv.folds, 11 | early_stopping_rounds = 2L, 12 | verbose = -1L 13 | ) 14 | }) 15 | data.frame(best_score = mod$best_score, best_iter = mod$best_iter) 16 | }) 17 | } 18 | 19 | # If training/test-set validation is requested... 20 | if (is.logical(cv.folds)) { 21 | perf <- lapply(hyper.grid, FUN = function(x) { 22 | p <- c(as.list(x), params.obj) 23 | sink <- utils::capture.output({ 24 | mod <- lightgbm::lgb.train( 25 | params = p, 26 | data = dtrain, 27 | valids = list(valid = dvalid), 28 | early_stopping_rounds = 2L, 29 | verbose = -1L 30 | ) 31 | }) 32 | data.frame(best_score = mod$best_score, best_iter = mod$best_iter) 33 | }) 34 | } 35 | 36 | # # If no validation is requested; i.e.
over-fitting scenario 37 | # if (is.null(dvalid) & is.null(cv.folds)) { 38 | # perf <- lapply(hyper.grid, FUN = function(x) { 39 | # p <- c(as.list(x), params.obj) 40 | # sink <- utils::capture.output({ 41 | # mod <- lightgbm::lgb.train( 42 | # params = p, 43 | # data = dfull, 44 | # verbose = -1L 45 | # ) 46 | # }) 47 | # # Can't get it to return training loss 48 | # # Return the training log-loss at the maximum number of iterations 49 | # #train.evals <- unlist(mod$record_evals$train) 50 | # #c(min(train.evals), which.min(train.evals)) 51 | # c(NA, NA) 52 | # }) 53 | # } 54 | 55 | # Compare validation set performance across hyper-parameter sets 56 | comp <- bind_rows(perf) 57 | opt <- which.min(comp$best_score) 58 | # This is not ideal -- overfitting process should return training loss and work with multiple hypergrid options (instead of just choosing #1) 59 | if (length(opt) == 0) { 60 | params.opt <- hyper.grid[[1]] 61 | } else { 62 | params.opt <- hyper.grid[[opt]] 63 | params.opt$num_iterations <- as.integer(comp$best_iter[[opt]]) 64 | } 65 | 66 | # Fit final model using full dataset and optimal parameter values 67 | mod <- lightgbm::lgb.train( 68 | params = c(params.opt, params.obj), 69 | data = dfull, 70 | verbose = -1L 71 | ) 72 | 73 | # Add the optimal validation score and number of iterations to the 'mod' object 74 | # These are NA and -1, by default, which doesn't provide any useful information 75 | mod$best_score <- as.numeric(comp$best_score[opt]) 76 | mod$best_iter <- params.opt$num_iterations 77 | 78 | # Storing hyper results in 'record_evals' slot, since adding a custom slot is not allowed 79 | hyper.results <- cbind(bind_rows(hyper.grid), comp, final_model = FALSE) 80 | hyper.results$final_model[opt] <- TRUE 81 | mod$record_evals <- hyper.results 82 | 83 | # Plot the evolution of the loss function 84 | #plot(unlist(mod$record_evals[[2]])) 85 | 86 | return(mod) 87 | 88 | } 89 | 90 | #--- 91 | #!!! NOTE: It appears callbacks are not (yet) exported but that could be coming soon. 92 | # See here: https://github.com/microsoft/LightGBM/pull/5018 93 | # https://github.com/Microsoft/LightGBM/blob/master/R-package/R/callback.R 94 | # https://stackoverflow.com/questions/54027734/adding-callbacks-to-lightgbm-in-r 95 | # IMPORTANT -- this is presumably the way to set the 'min_delta' argument: https://github.com/ummel/fusionModel/issues/24 96 | # IN PROGRESS: https://github.com/microsoft/LightGBM/pull/5018#pullrequestreview-892006092 97 | # Add here: https://github.com/microsoft/LightGBM/pull/5123 ( appears to be merged?!?!?) 98 | # mod.cv <- lightgbm::lgb.cv( 99 | # params = c(params.global, obj, min_data_in_leaf = min.leaf), 100 | # data = dfull, 101 | # nrounds = 500L, # See note at top about min_delta. Could set higher once min_delta is allowed... 102 | # folds = cv.folds, 103 | # early_stopping_rounds = 1L, 104 | # verbose = -1L, 105 | # callbacks = list(cb_early_stop(verbose = FALSE)) # !!! 
EXAMPLE -- may suppress console output 106 | # ) 107 | #--- 108 | 109 | -------------------------------------------------------------------------------- /man/fusionOutput.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fusionOutput.R 3 | \name{fusionOutput} 4 | \alias{fusionOutput} 5 | \title{Generate output files resulting from fusion} 6 | \usage{ 7 | fusionOutput( 8 | donor, 9 | respondent, 10 | acs_year, 11 | fusion_vars, 12 | M = 1, 13 | input_version = "latest", 14 | output_version = "today", 15 | fsn = NULL, 16 | rlocation = NULL, 17 | note = NULL, 18 | test_mode = TRUE, 19 | validation = TRUE, 20 | ncores = 1, 21 | margin = 4, 22 | ... 23 | ) 24 | } 25 | \arguments{ 26 | \item{donor}{Character. Donor survey identifier (e.g. \code{"RECS_2015"}).} 27 | 28 | \item{respondent}{Character. Desired respondent level of microdata. Either \code{"household"} or \code{"person"}.} 29 | 30 | \item{acs_year}{Integer. Year of the ACS microdata to be used as recipient.} 31 | 32 | \item{fusion_vars}{Character. Vector of variables in the donor microdata to be fused to the ACS microdata recipient.} 33 | 34 | \item{M}{Integer. Desired number of fusion implicates. If \code{M = NULL} (default) it is internally set to 40 or, if \code{test_mode = TRUE}, 2 implicates.} 35 | 36 | \item{input_version}{Character. Version (i.e. date stamp) of the desired fusion inputs. Defaults to the latest available version.} 37 | 38 | \item{output_version}{Character. Version (i.e. date stamp) to assign to the outputs. Defaults to today's date.} 39 | 40 | \item{fsn}{Character. Optional path to a .fsn object created by a previous call to \code{fusionOutput()}. If specified, code attempts to use \code{fsn} to bypass the training step.} 41 | 42 | \item{rlocation}{Data frame. Optional data frame to replace the donor input data imputed respondent locations with actual/disclosed locations specified in \code{rlocation}.} 43 | 44 | \item{note}{Character. Optional note supplied by user. Inserted in the log file for reference.} 45 | 46 | \item{test_mode}{Logical. If \code{test_mode = TRUE} (default), the result files are always saved within a "/fusion_" directory in \code{output} (possibly created); faster hyperparameters are used for \code{\link[fusionModel]{train}}; and the internal validation step is skipped by default.} 47 | 48 | \item{validation}{Logical or integer. Controls execution of internal validation (Steps 3 and 4). If \code{validation = 0} or \code{FALSE}, neither step is performed (default when \code{test_mode = TRUE}). If \code{1}, only Step 3. If \code{2} or \code{TRUE}, both Steps 3 and 4.} 49 | 50 | \item{ncores}{Integer. Number of physical CPU cores used for parallel computation.} 51 | 52 | \item{margin}{Numeric. Passed to same argument in \code{\link[fusionModel]{fuse}}.} 53 | 54 | \item{...}{Optional, non-default arguments passed to \code{\link[fusionModel]{train}}. For example, \code{fork = TRUE} to enable forked parallel processing.} 55 | } 56 | \value{ 57 | Saves resulting \code{output} data files to appropriate local directory. Also saves a .txt log file alongside data files that records console output from \code{fusionOutput}. 58 | } 59 | \description{ 60 | Handles all operations needed to "do fusion" using input files generated by a successful call to \code{fusionInput}. 
Trains a fusion model, generates internal validation results, and then simulates multiple implicates for recipient microdata. 61 | } 62 | \details{ 63 | The function checks arguments and determines the file path to the appropriate \code{output} directory (creating it if necessary). The output files are always placed within the appropriate directory hierarchy, based on the donor and recipient information detected in the \code{input} file names. In practice, \code{output} need only be specified if working in an environment where the output files need to be located somewhere different from the input files. 64 | 65 | The function executes the following steps: 66 | \enumerate{ 67 | \item \strong{Load training data inputs}. Loads donor training microdata and results of \code{\link[fusionModel]{prepXY}}. 68 | \item \strong{Run fusionModel::train()}. Calls \code{\link[fusionModel]{train}} using sensible defaults and hyperparameters. If \code{test_mode = TRUE}, the hyperparameters are designed to do a fast/rough-and-ready model training. 69 | \item \strong{Fuse onto training data for internal validation}. Optional step (see \code{validation} argument). Fuses multiple implicates to original donor training data using \code{\link[fusionModel]{fuse}}. Results saved to disk. 70 | \item \strong{Run fusionModel::validate()}. Optional step (see \code{validation} argument). Passes previous step's results to \code{\link[fusionModel]{validate}}. Results saved to disk. 71 | \item \strong{Fuse onto prediction data}. Fuses multiple implicates to supplied input prediction data using \code{\link[fusionModel]{fuse}}. Results saved to disk. 72 | \item \strong{fusionOutput() is finished!} Upon completion, a log file named \code{"outputlog.txt"} is written to \code{output} for reference. 73 | } 74 | } 75 | \examples{ 76 | # Since 'test_mode = TRUE' by default, this will affect files in local '/fusion_' directory 77 | dir <- fusionInput(donor = "RECS_2015", 78 | recipient = "ACS_2015", 79 | respondent = "household", 80 | fuse = c("btung", "btuel", "cooltype"), 81 | force = c("moneypy", "householder_race", "education", "nhsldmem", "kownrent", "recs_division"), 82 | note = "Hello world. Reminder: running in test mode by default.") 83 | 84 | # List files in the /input directory 85 | list.files(dir) 86 | 87 | # Using default settings 88 | out <- fusionOutput(input = dir) 89 | list.files(out) 90 | 91 | } 92 | -------------------------------------------------------------------------------- /R/read_fsd.R: -------------------------------------------------------------------------------- 1 | #' Read fusion output from disk 2 | #' 3 | #' @description Efficiently read fusion output that was written to disk, optionally returning a subset of rows and/or columns. Since a \code{.fsd} file is simply a \code{\link[fst]{fst}} file under the hood, this function also works on any \code{.fst} file. 4 | #' @param path Character. Path to a \code{.fsd} (or \code{.fst}) file, typically produced by \code{\link{fuse}}. 5 | #' @param columns Character. Column names to read. The default is to return all columns. 6 | #' @param M Integer. The first \code{M} implicates are returned. Set \code{M = Inf} to return all implicates. Ignored if an \code{M} column is not present in the data. 7 | #' @param df Data frame. Data frame used to identify a subset of rows to return. Default is to return all rows. 8 | #' @param cores Integer. Number of cores used by \code{\link[fst]{fst}}.
9 | #' @details If \code{df} is provided and the file size on disk is less than 100 MB, then a full read and inner \code{\link[collapse]{join}} is performed. For larger files, a manual read of the required rows is performed, using \code{\link[collapse]{fmatch}} for the matching operation. 10 | #' @return A \code{\link[data.table]{data.table}}; keys are preserved if present in the on-disk data. When \code{path} points to a \code{.fsd} file, it includes an integer column "M" indicating the implicate assignment of each observation (unless explicitly ignored by \code{columns}). 11 | #' @examples 12 | #' # Build a fusion model using RECS microdata 13 | #' # Note that "fusion_model.fsn" will be written to working directory 14 | #' ?recs 15 | #' fusion.vars <- c("electricity", "natural_gas", "aircon") 16 | #' predictor.vars <- names(recs)[2:12] 17 | #' fsn.path <- train(data = recs, y = fusion.vars, x = predictor.vars) 18 | #' 19 | #' # Write fusion output directly to disk 20 | #' # Note that "results.fsd" will be written to working directory 21 | #' recipient <- recs[predictor.vars] 22 | #' sim <- fuse(data = recipient, fsn = fsn.path, M = 5, fsd = "results.fsd") 23 | #' 24 | #' # Read the fusion output saved to disk 25 | #' sim <- read_fsd(sim) 26 | #' head(sim) 27 | #' 28 | #' @export 29 | 30 | #----------- 31 | 32 | # TEST 33 | # library(tidyverse) 34 | # library(collapse) 35 | # library(data.table) 36 | # setwd("/home/kevin/Documents/Projects/fusionData") 37 | # 38 | # test <- read_fsd("fusion/RECS/2020/2019/RECS_2020_2019_H_fused.fsd", 39 | # columns = c("M", "dollarel"), 40 | # M = 5, 41 | # df = df, 42 | # cores = 3) 43 | 44 | #----------- 45 | 46 | read_fsd <- function(path, 47 | columns = NULL, 48 | M = 1, 49 | df = NULL, 50 | cores = max(1, parallel::detectCores(logical = FALSE) - 1)) { 51 | 52 | stopifnot(exprs = { 53 | endsWith(path, ".fsd") | endsWith(path, ".fst") 54 | file.exists(path) 55 | is.null(columns) | is.character(columns) 56 | is.null(df) | is.data.frame(df) 57 | M >= 1 58 | cores > 0 & cores %% 1 == 0 59 | }) 60 | 61 | #require(collapse, quietly = TRUE) 62 | n <- fst::threads_fst() 63 | fst::threads_fst(nr_of_threads = cores) 64 | 65 | meta <- fst::metadata_fst(path) 66 | if (is.null(columns)) { 67 | columns <- meta$columnNames 68 | } else { 69 | columns <- unique(columns) 70 | stopifnot(all(columns %in% meta$columnNames)) 71 | } 72 | 73 | # TURNED OFF FOR TESTING 74 | # Add 'M' column to 'df' to subset on the number of implicates 75 | # if (is.finite(M) & "M" %in% v) { 76 | # df <- if (nrow(df) == 0) { 77 | # data.table(M = 1:M) 78 | # } else { 79 | # df %>% 80 | # select(any_of(v)) %>% 81 | # select(-any_of("M")) %>% 82 | # unique() %>% 83 | # slice(rep(1:nrow(.), M)) %>% 84 | # mutate(M = rep(1:M, each = nrow(.)
/ M)) 85 | # } 86 | # } 87 | 88 | #----- 89 | 90 | # Determine which rows have the requested implicates (M) 91 | # Since the data are assumed to be sorted by M, this should yield consecutive integers 92 | 93 | if ('M' %in% meta$columnNames) { 94 | d <- fst::fst(path) 95 | i <- which(d$M <= M) 96 | stopifnot(!is.unsorted(i)) 97 | } else { 98 | i <- 1L:meta$nrOfRows # Return all rows in 'd', if no implicates column 'M' 99 | } 100 | 101 | #----- 102 | 103 | if (is.null(df)) { 104 | 105 | d <- fst::read_fst(path, columns = columns, from = i[1], to = i[length(i)], as.data.table = TRUE) 106 | 107 | } else { 108 | 109 | # Check for validity of 'df' column names 110 | stopifnot(all(names(df) %in% meta$columnNames)) 111 | df <- collapse::funique(df) 112 | 113 | # If the file size is less than 100 MB, simply read the full data and subset via collapse::join() 114 | if (file.size(path) / 1e6 < 100) { 115 | 116 | d <- fst::read_fst(path, columns = unique(c(columns, names(df))), from = i[1], to = i[length(i)], as.data.table = TRUE) 117 | d <- collapse::join(d, df, how = "inner", verbose = FALSE) 118 | d <- d[, ..columns] 119 | 120 | } else { 121 | 122 | d <- fst::fst(path) 123 | m <- qDT(d[i, names(df)]) 124 | i <- i[m %iin% df] # Uses 'collapse' package equivalent of which(x %in% table) using fmatch() 125 | d <- qDT(d[i, setdiff(columns, names(m)), drop = FALSE]) 126 | j <- intersect(names(m), columns) 127 | if (length(j)) d <- cbind(d, m[i, ..j]) 128 | 129 | } 130 | } 131 | 132 | # Set column order and data.table keys 133 | setcolorder(d, neworder = columns) 134 | suppressWarnings(setkeyv(d, cols = intersect(meta$keys, columns))) 135 | 136 | # Reset number of threads 137 | fst::threads_fst(nr_of_threads = n) 138 | 139 | return(d) 140 | 141 | } 142 | -------------------------------------------------------------------------------- /man/train.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/train.R 3 | \name{train} 4 | \alias{train} 5 | \title{Train a fusion model} 6 | \usage{ 7 | train( 8 | data, 9 | y, 10 | x, 11 | fsn = "fusion_model.fsn", 12 | weight = NULL, 13 | nfolds = 5, 14 | nquantiles = 2, 15 | nclusters = 2000, 16 | krange = c(10, 500), 17 | hyper = NULL, 18 | fork = FALSE, 19 | cores = 1 20 | ) 21 | } 22 | \arguments{ 23 | \item{data}{Data frame. Donor dataset. Categorical variables must be factors and ordered whenever possible.} 24 | 25 | \item{y}{Character or list. Variables in \code{data} to eventually fuse to a recipient dataset. Variables are fused in the order provided. If \code{y} is a list, each entry is a character vector possibly indicating multiple variables to fuse as a block.} 26 | 27 | \item{x}{Character or list. Predictor variables in \code{data} common to donor and eventual recipient. If a list, each slot specifies the \code{x} predictors to use for each \code{y}.} 28 | 29 | \item{fsn}{Character. File path where fusion model will be saved. Must use \code{.fsn} suffix.} 30 | 31 | \item{weight}{Character. Name of the observation weights column in \code{data}. If NULL (default), uniform weights are assumed.} 32 | 33 | \item{nfolds}{Numeric. Number of cross-validation folds used for LightGBM model training. Or, if \code{nfolds < 1}, the fraction of observations to use for training set; remainder used for validation (faster than cross-validation).} 34 | 35 | \item{nquantiles}{Numeric. 
Number of quantile models to train for continuous \code{y} variables, in addition to the conditional mean. \code{nquantiles} evenly-distributed percentiles are used. For example, the default \code{nquantiles = 2} yields quantile models for the 25th and 75th percentiles. Higher values may produce more accurate conditional distributions at the expense of computation time. An even value of \code{nquantiles} is recommended, since the conditional mean tends to capture the central tendency, making a median model superfluous.} 36 | 37 | \item{nclusters}{Numeric. Maximum number of k-means clusters to use. Higher is better but at computational cost. Setting \code{nclusters = 0} or \code{nclusters = Inf} turns off clustering.} 38 | 39 | \item{krange}{Numeric. Minimum and maximum number of nearest neighbors to use for construction of continuous conditional distributions. Higher \code{max(krange)} is better but at computational cost.} 40 | 41 | \item{hyper}{List. LightGBM hyperparameters to be used during model training. If \code{NULL}, default values are used. See Details and Examples.} 42 | 43 | \item{fork}{Logical. Should parallel processing via forking be used, if possible? See Details.} 44 | 45 | \item{cores}{Integer. Number of physical CPU cores used for parallel computation. When \code{fork = FALSE} or on the Windows platform (where forking is not possible), the fusion variables/blocks are processed serially but LightGBM uses \code{cores} for internal multithreading via OpenMP. On a Unix system, if \code{fork = TRUE}, \code{cores > 1}, and \code{cores <= length(y)}, then the fusion variables/blocks are processed in parallel via \code{\link[parallel]{mclapply}}.} 46 | } 47 | \value{ 48 | A fusion model object (.fsn) is saved to \code{fsn}. 49 | } 50 | \description{ 51 | Train a fusion model on "donor" data using sequential \href{https://lightgbm.readthedocs.io/en/latest/}{LightGBM} models to model the conditional distributions. The resulting fusion model (.fsn file) can be used with \code{\link{fuse}} to simulate outcomes for a "recipient" dataset. 52 | } 53 | \details{ 54 | When \code{y} is a list, each slot indicates either a single variable or, alternatively, multiple variables to fuse as a block. Variables within a block are sampled jointly from the original donor data during fusion. See Examples. 55 | 56 | \code{y} variables that exhibit no variance or continuous \code{y} variables with fewer than \code{10 * nfolds} non-zero observations (the minimum required for cross-validation) are automatically removed with a warning. 57 | 58 | The fusion model written to \code{fsn} is a zipped archive created by \code{\link[zip]{zip}} containing models and data required by \code{\link{fuse}}. 59 | 60 | The \code{hyper} argument can be used to specify the LightGBM hyperparameter values over which to perform a "grid search" during model training. \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html}{See here} for the full list of parameters. For each combination of hyperparameters, \code{nfolds}-fold cross-validation is performed using \code{\link[lightgbm]{lgb.cv}} with an early stopping condition. The parameter combination with the lowest loss function value is used to fit the final model via \code{\link[lightgbm]{lgb.train}}. The more candidate parameter values specified in \code{hyper}, the longer the processing time.
If \code{hyper = NULL}, a single set of parameters is used with the following default values: 61 | 62 | \itemize{ 63 | \item boosting = "gbdt" 64 | \item data_sample_strategy = "goss" 65 | \item num_leaves = 31 66 | \item feature_fraction = 0.8 67 | \item max_depth = 5 68 | \item min_data_in_leaf = max(10, round(0.001 * nrow(data))) 69 | \item num_iterations = 2500 70 | \item learning_rate= 0.1 71 | \item max_bin = 255 72 | \item min_data_in_bin = 3 73 | \item max_cat_threshold = 32 74 | } 75 | Typical users will only have reason to modify the hyperparameters listed above. Note that \code{num_iterations} only imposes a ceiling, since early stopping will typically result in models with a lower number of iterations. See Examples. 76 | 77 | Testing with small-to-medium size datasets suggests that forking is typically faster than OpenMP multithreading (the default). However, forking will sometimes "hang" (continue to run with no CPU usage or error message) if an OpenMP process has been previously used in the same session. The issue appears to be related to Intel's OpenMP implementation (\href{https://github.com/Rdatatable/data.table/issues/2418}{see here}). This can be triggered when other operations are called before \code{train()} that use \code{\link[data.table]{data.table}} or \code{\link[fst]{fst}} in multithread mode. If you experience hanged forking, try calling \code{data.table::setDTthreads(1)} and \code{fst::threads_fst(1)} immediately after \code{library(fusionModel)} in a new session. 78 | } 79 | \examples{ 80 | # Build a fusion model using RECS microdata 81 | # Note that "fusion_model.fsn" will be written to working directory 82 | ?recs 83 | fusion.vars <- c("electricity", "natural_gas", "aircon") 84 | predictor.vars <- names(recs)[2:12] 85 | fsn.path <- train(data = recs, y = fusion.vars, x = predictor.vars) 86 | 87 | # When 'y' is a list, it can specify variables to fuse as a block 88 | fusion.vars <- list("electricity", "natural_gas", c("heating_share", "cooling_share", "other_share")) 89 | fusion.vars 90 | train(data = recs, y = fusion.vars, x = predictor.vars) 91 | 92 | # When 'x' is a list, it specifies which predictor variables to use for each 'y' 93 | xlist <- list(predictor.vars[1:4], predictor.vars[2:8], predictor.vars) 94 | xlist 95 | train(data = recs, y = fusion.vars, x = xlist) 96 | 97 | # Specify a single set of LightGBM hyperparameters 98 | # Here we use Random Forests instead of the default Gradient Boosting Decision Trees 99 | train(data = recs, y = fusion.vars, x = predictor.vars, 100 | hyper = list(boosting = "rf", 101 | feature_fraction = 0.6, 102 | max_depth = 10 103 | )) 104 | 105 | # Specify a range of LightGBM hyperparameters to search over 106 | # This takes longer, because there are more models to test 107 | train(data = recs, y = fusion.vars, x = predictor.vars, 108 | hyper = list(max_depth = c(5, 10), 109 | feature_fraction = c(0.7, 0.9) 110 | )) 111 | } 112 | -------------------------------------------------------------------------------- /man/analyze.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/analyze.R 3 | \name{analyze} 4 | \alias{analyze} 5 | \title{Analyze fusion output} 6 | \usage{ 7 | analyze( 8 | x, 9 | implicates, 10 | static = NULL, 11 | weight = NULL, 12 | rep_weights = NULL, 13 | by = NULL, 14 | fun = NULL, 15 | var_scale = 4, 16 | cores = 1 17 | ) 18 | } 19 | \arguments{ 20 | \item{x}{List. 
Named list specifying the desired analysis type(s) and the associated target variable(s). Example: \code{x = list(mean = c("v1", "v2"), median = "v3")} translates as: "Return the mean value of variables v1 and v2 and the median of v3". Supported analysis types include \code{mean}, \code{sum}, and \code{median}. Mean and sum automatically return proportions and counts, respectively, if the target variable is a factor. Target variables must be in \code{implicates}, \code{static}, or a data.frame returned by a custom \code{fun}.} 21 | 22 | \item{implicates}{Data frame. Implicates of synthetic (fused) variables. Typically generated by \link{fuse}. The implicates should be row-stacked and identified by integer column "M".} 23 | 24 | \item{static}{Data frame. Optional static (non-synthetic) variables that do not vary across implicates. Note that \code{nrow(static) = nrow(implicates) / max(implicates$M)} and the row-ordering is assumed to be consistent between \code{static} and \code{implicates}.} 25 | 26 | \item{weight}{Character. Name of the observation weights column in \code{static}. If NULL (default), uniform weights are assumed.} 27 | 28 | \item{rep_weights}{Character. Optional vector of replicate weight columns in \code{static}. If provided, the returned margin of errors reflect additional variance due to uncertainty in sample weights.} 29 | 30 | \item{by}{Character. Optional column name(s) in \code{implicates} or \code{static} (typically factors) that collectively define the set of population subgroups for which each analysis is executed. If \code{NULL}, analysis is done for the whole sample.} 31 | 32 | \item{fun}{Function. Optional function applied to input data prior to executing analyses. Can be used to do non-conventional/custom analyses.} 33 | 34 | \item{var_scale}{Scalar. Factor by which to scale the unadjusted replicate weight variance. This is determined by the survey design. The default (\code{var_scale = 4}) is appropriate for ACS and RECS.} 35 | 36 | \item{cores}{Integer. Number of cores used. Only applicable on Unix systems.} 37 | } 38 | \value{ 39 | A data.table reporting analysis results, possibly across subgroups defined in \code{by}. The returned quantities include: 40 | 41 | \describe{ 42 | \item{N}{Number of observations used for the analysis.} 43 | \item{y}{Target variable.} 44 | \item{level}{Levels of factor target variables.} 45 | \item{type}{Type of estimate returned: mean, proportion, sum, count, or median.} 46 | \item{est}{Point estimate.} 47 | \item{moe}{Margin of error associated with the 90\% confidence interval.} 48 | } 49 | } 50 | \description{ 51 | Calculation of point estimates and associated margin of error for analyses using fused/synthetic microdata. Can calculate means, proportions, sums, counts, and medians, optionally across population subgroups. 52 | } 53 | \details{ 54 | At a minimum, the user must supply synthetic implicates (typically generated by \link{fuse}). Inputs are checked for consistent dimensions. 55 | 56 | If \code{implicates} contains only a single implicate and \code{rep_weights = NULL}, the "typical" standard error is returned with a warning to make sure the user is aware of the situation. 57 | 58 | Estimates and standard errors for the requested analysis are calculated separately for each implicate. The final point estimate is the mean estimate across implicates. The final standard error is the pooled SE across implicates, calculated using Rubin's pooling rules (1987). 
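For reference, a sketch of the pooling rules in their standard form (the notation below is introduced purely for illustration and is not taken from the package internals): with \eqn{m} implicates, per-implicate point estimates \eqn{Q_i}, and within-implicate variances \eqn{W_i},
\deqn{\bar{Q} = \frac{1}{m}\sum_{i=1}^{m} Q_i, \qquad \bar{W} = \frac{1}{m}\sum_{i=1}^{m} W_i, \qquad B = \frac{1}{m-1}\sum_{i=1}^{m} (Q_i - \bar{Q})^2}
\deqn{T = \bar{W} + \left(1 + \frac{1}{m}\right) B, \qquad SE = \sqrt{T}}
where \eqn{B} is the between-implicate variance and \eqn{T} is the total pooled variance whose square root gives the pooled standard error.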
59 | 60 | When replicate weights are provided, the standard errors of each implicate are calculated via the variance of estimates across replicates. Calculations leverage \code{\link[data.table]{data.table}} operations for speed and memory efficiency. The within-implicate variance is calculated around the point estimate (rather than around the mean of the replicates). This is equivalent to \code{mse = TRUE} in \code{\link[survey]{svrepdesign}}. This seems to be the appropriate method for most surveys. 61 | 62 | If replicate weights are NOT provided, the standard errors of each implicate are calculated using variance within the implicate. For means, the ratio variance approximation of Cochran (1977) is used, as this is known to be a good approximation of bootstrapped SE's for weighted means (Gatz and Smith 1995). For proportions, a generalization of the unweighted SE formula is used (\href{https://stats.stackexchange.com/questions/159204/how-to-calculate-the-standard-error-of-a-proportion-using-weighted-data}{see here}). 63 | } 64 | \examples{ 65 | # Build a fusion model using RECS microdata 66 | fusion.vars <- c("electricity", "natural_gas", "aircon") 67 | predictor.vars <- names(recs)[2:12] 68 | fsn.path <- train(data = recs, y = fusion.vars, x = predictor.vars) 69 | 70 | # Generate 30 implicates of the 'fusion.vars' using original RECS as the recipient 71 | sim <- fuse(data = recs, fsn = fsn.path, M = 30) 72 | head(sim) 73 | 74 | #--------- 75 | 76 | # Multiple types of analyses can be done at once 77 | # This calculates estimates using the full sample 78 | result <- analyze(x = list(mean = c("natural_gas", "aircon"), 79 | median = "electricity", 80 | sum = c("electricity", "aircon")), 81 | implicates = sim, 82 | weight = "weight") 83 | 84 | View(result) 85 | 86 | #----- 87 | 88 | # Mean electricity consumption, by climate zone and urban/rural status 89 | result1 <- analyze(x = list(mean = "electricity"), 90 | implicates = sim, 91 | static = recs, 92 | weight = "weight", 93 | by = c("climate", "urban_rural")) 94 | 95 | # Same as above but including sample weight uncertainty 96 | # Note that only the first 30 replicate weights are used internally 97 | result2 <- analyze(x = list(mean = "electricity"), 98 | implicates = sim, 99 | static = recs, 100 | weight = "weight", 101 | rep_weights = paste0("rep_", 1:96), 102 | by = c("climate", "urban_rural")) 103 | 104 | # Helper function for comparison plots 105 | pfun <- function(x, y) {plot(x, y); abline(0, 1, lty = 2)} 106 | 107 | # Inclusion of replicate weights does not affect estimates, but it does 108 | # increase margin of error due to uncertainty in RECS sample weights 109 | pfun(result1$est, result2$est) 110 | pfun(result1$moe, result2$moe) 111 | 112 | # Notice that relative uncertainty declines with subset size 113 | plot(result1$N, result1$moe / result1$est) 114 | 115 | #----- 116 | 117 | # Use a custom function to perform more complex analyses 118 | # Custom function should return a data frame with non-standard target variables 119 | 120 | my_fun <- function(data) { 121 | 122 | # Manipulate 'data' as desired 123 | # All variables in 'implicates' and 'static' are available 124 | 125 | # Construct electricity consumption per square foot 126 | kwh_per_ft2 <- data$electricity / data$square_feet 127 | 128 | # Binary (T/F) indicator if household uses natural gas 129 | use_natural_gas <- data$natural_gas > 0 130 | 131 | # Return data.frame of custom variables to be analyzed 132 | data.frame(kwh_per_ft2, use_natural_gas) 133 | } 134 | 135 | # 
Do analysis using variables produced by custom function 136 | # Can include non-custom target variables as well 137 | result <- analyze(x = list(mean = c("kwh_per_ft2", "use_natural_gas", "electricity")), 138 | implicates = sim, 139 | static = recs, 140 | weight = "weight", 141 | fun = my_fun) 142 | 143 | } 144 | \references{ 145 | Cochran, W. G. (1977). \emph{Sampling Techniques} (3rd Edition). Wiley, New York. 146 | 147 | Gatz, D.F., and Smith, L. (1995). The Standard Error of a Weighted Mean Concentration — I. Bootstrapping vs Other Methods. \emph{Atmospheric Environment}, vol. 29, no. 11, 1185–1193. 148 | 149 | Rubin, D.B. (1987). \emph{Multiple imputation for nonresponse in surveys}. Hoboken, NJ: Wiley. 150 | } 151 | -------------------------------------------------------------------------------- /R/monotonic.R: -------------------------------------------------------------------------------- 1 | #' Create a monotonic relationship between two variables 2 | #' 3 | #' @description 4 | #' \code{monotonic()} returns modified values of input vector \code{y} that are smoothed, monotonic, and consistent across all values of input \code{x}. It was designed to be used post-fusion when one wants to ensure a plausible relationship between consumption (\code{x}) and expenditure (\code{y}), under the assumption that all consumers face an identical, monotonic pricing structure. By default, the mean of the returned values is forced to equal the original mean of \code{y} (\code{preserve = TRUE}). The direction of monotonicity (increasing or decreasing) is detected automatically, so use cases are not limited to consumption and expenditure variables. 5 | #' @param x Numeric. 6 | #' @param y Numeric. 7 | #' @param w Numeric. Optional observation weights. 8 | #' @param preserve Logical. Preserve the original mean of the \code{y} values in the returned values? 9 | #' @param expend Logical. Assume \code{y} is an expenditure variable? If \code{TRUE}, a safety check is implemented to ensure \code{y > 0} when \code{x > 0}. 10 | #' @param fast Logical. If \code{TRUE}, only \code{\link[stats]{supsmu}} is used with coercion of the result to monotone. 11 | #' @param nmax Integer. Maximum number of observations to use for smoothing. Set lower for faster computation. \code{nmax = Inf} eliminates sampling. 12 | #' @param plot Logical. Plot the (sampled) data points and derived monotonic relationship? 13 | #' @details The initial smoothing is accomplished via \code{\link[stats]{supsmu}} with the result coerced to monotone. If \code{fast = FALSE} and the coercion step modifies the values too much, a second smooth is attempted via a \code{\link[scam]{scam}} model with either a monotone increasing or decreasing constraint. If the SCAM fails to fit, the function falls back to \code{\link[stats]{lm}} with simple linear predictions. If \code{y = 0} when \code{x = 0} (as typical for consumption-expenditure variables), then that outcome is enforced in the result. The input data are randomly sampled to no more than \code{nmax} observations, if necessary, for speed. 14 | #' @return A numeric vector of modified \code{y} values. Optionally, a plot showing the returned monotonic relationship.
15 | #' @examples 16 | #' y <- monotonic(x = recs$propane_btu, y = recs$propane_expend, plot = TRUE) 17 | #' mean(recs$propane_expend) 18 | #' mean(y) 19 | #' @export 20 | 21 | #--------- 22 | 23 | # TEST 24 | # library(tidyverse) 25 | # library(data.table) 26 | # 27 | # d <- fusionModel::read_fsd("~/Downloads/RECS_2020_2019_fused_UP.fsd") 28 | # acs <- fst::read_fst("~/Documents/Projects/fusionData/survey-processed/ACS/2019/ACS_2019_H_processed.fst", columns = c('weight', 'state', 'puma10')) 29 | # d <- cbind(d, acs) 30 | # system.time( 31 | # d[, `:=`(dollarel_z = monotonic(x = btuel, y = dollarel, w = weight), 32 | # dollarng_z = monotonic(x = btung, y = dollarng, w = weight), 33 | # dollarlp_z = monotonic(x = btulp, y = dollarlp, w = weight), 34 | # dollarfo_z = monotonic(x = btufo, y = dollarfo, w = weight)), 35 | # by = .(state, puma10)] 36 | # ) 37 | 38 | #--------- 39 | 40 | monotonic <- function(x, 41 | y, 42 | w = NULL, 43 | preserve = TRUE, 44 | expend = TRUE, 45 | fast = TRUE, 46 | nmax = 5000, 47 | plot = FALSE) { 48 | 49 | stopifnot(exprs = { 50 | length(x) == length(y) 51 | is.numeric(x) & !anyNA(x) 52 | is.numeric(y) & !anyNA(y) 53 | is.null(w) | length(w) == length(x) 54 | is.logical(preserve) 55 | is.logical(expend) 56 | is.logical(fast) 57 | nmax > 1 58 | is.logical(plot) 59 | }) 60 | 61 | if (is.null(w)) w <- rep.int(1L, length(x)) 62 | ymean <- weighted.mean(y, w, na.rm = TRUE) 63 | yint <- is.integer(y) 64 | ymin <- if (all(y == 0)) 0 else min(y[y != 0]) 65 | x0 <- x 66 | w0 <- w 67 | 68 | # If 'expend = TRUE', check for violations 69 | # If any issues are detected, a helpful warning is issued 70 | if (expend) { 71 | if (any(x < 0)) stop("'expend = TRUE' but detected negative values in 'x'") 72 | if (any(y < 0)) stop("'expend = TRUE' but detected negative values in 'y'") 73 | i <- x == 0 & y != 0 74 | if (any(i)) { 75 | y[i] <- 0L 76 | warning("Set ", sum(i), " non-zero y-value(s) (", paste0(round(100 * sum(i) / length(y), 2), "%"), ") to zero where x == 0 because 'expend = TRUE'") 77 | } 78 | i <- x > 0 & y == 0 79 | if (any(i)) { 80 | y[i] <- ymin 81 | warning("Set ", sum(i), " zero y-value(s) (", paste0(round(100 * sum(i) / length(y), 2), "%"), ") to observed non-zero minimum where x > 0 because 'expend = TRUE'") 82 | } 83 | } 84 | 85 | # If 'expend = TRUE' OR zeros in 'x' (almost) always produce zeros in 'y', restrict to non-zero observations in 'x' 86 | force.zero <- FALSE 87 | if (expend | (any(x == 0) & sum(y[x == 0] == 0) / sum(x == 0) > 0.995)) { 88 | force.zero <- TRUE 89 | i <- c(match(0, x), which(x != 0)) # Retains first instance of zero in 'x' 90 | x <- x[i] 91 | y <- y[i] 92 | w <- w[i] 93 | } 94 | 95 | # If necessary, sample the data for speed 96 | n <- length(x) 97 | if (n > nmax) { 98 | i <- match(range(x), x) # Retains first instance of min and max 'x' 99 | i <- c(i, sample.int(n = n, size = nmax - 2)) # Downsample to 'nmax' observations 100 | x <- x[i] 101 | y <- y[i] 102 | w <- w[i] 103 | } 104 | 105 | # Initial smooth via stats::supsmu() 106 | # 'span' set based on recommendation in ?supsmu 107 | m <- stats::supsmu(x, y, wt = w, span = ifelse(length(x) < 40, 0.3, "cv")) 108 | xu <- m$x 109 | inc <- suppressWarnings(cor(m$x, m$y) >= 0) 110 | if (is.na(inc)) inc <- TRUE 111 | p <- sort(m$y, decreasing = !inc) # Force monotonic predictions 112 | 113 | # Check if supsmu() smoothed and monotonic output is sufficient 114 | # Ideally, coercion to monotonic via sort() does not cause significant difference between 'p' and m$y 115 | fail <- sum(abs((p - 
m$y) / m$y) > 0.05) / length(p) # Percent of observations with more than 5% absolute error 116 | if (is.na(fail)) fail <- Inf 117 | if (!fast & fail > 0.05 & length(p) >= 100) { 118 | # Attempt to fit SCAM model with monotonic constraint 119 | m <- try(scam::scam(y ~ s(x, bs = ifelse(inc, "mpi", "mpd")), data = data.frame(x, y), weights = w), silent = TRUE) 120 | if (inherits(m, "scam")) { 121 | p <- as.vector(predict(m, newdata = data.frame(x = xu), type = "response", newdata.guaranteed = TRUE)) 122 | } else { 123 | # If SCAM model fails, fall back to OLS model and make simple linear predictions 124 | m <- stats::lm(y ~ x, weights = w) 125 | p <- as.vector(suppressWarnings(predict(m, newdata = data.frame(x = xu)))) 126 | } 127 | } 128 | 129 | # First time: If expend = TRUE, ensure that the output values meet some minimum positive value when x > 0 130 | if (expend) p[xu > 0] <- pmax(p[xu > 0], ymin) 131 | 132 | # If necessary, set values to zero when 'x' is zero 133 | if (force.zero) p[xu == 0] = 0 134 | 135 | # Make 'y' predictions for all original 'x' 136 | yout <- if (length(xu) == 1) rep(p, length(x0)) else approx(xu, p, xout = x0, rule = 2)$y 137 | 138 | # If requested, adjustment factor to ensure mean of transformed 'y' matches original mean value 139 | yadj <- 1 # Defined for use in plotting code, below, if 'preserve = FALSE' 140 | if (preserve) { 141 | yadj <- ymean / weighted.mean(yout, w0) 142 | if (!is.finite(yadj)) yadj <- 1 # Catch divide by zero case (mean not strictly preserved in this case) 143 | yout <- yout * yadj 144 | } 145 | 146 | # If 'y' input is integer, force output to integer 147 | # May cause input mean of 'y' to not be strictly preserved in output 148 | if (yint) yout <- as.integer(round(yout)) 149 | 150 | # May cause input mean of 'y' to NOT be strictly preserved in output 151 | if (expend) yout[x0 > 0] <- pmax(yout[x0 > 0], ymin) 152 | 153 | # Optional plot of transformation 154 | if (plot) { 155 | plot(x, y, col = "#00000033", ylim = range(c(y, p)), xlab = "x", ylab = "y") 156 | lines(xu, p * yadj, col = "red") 157 | } 158 | 159 | # Safety check 160 | if (anyNA(yout)) stop("NA values in result vector") 161 | 162 | return(yout) 163 | 164 | } 165 | -------------------------------------------------------------------------------- /man/analyze_fusionACS.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/analyze_fusionACS.R 3 | \name{analyze_fusionACS} 4 | \alias{analyze_fusionACS} 5 | \title{Analyze fusionACS microdata} 6 | \usage{ 7 | analyze_fusionACS( 8 | analyses, 9 | year, 10 | respondent = "household", 11 | by = NULL, 12 | area = NULL, 13 | fun = NULL, 14 | M = Inf, 15 | R = Inf, 16 | cores = 1, 17 | version_up = 2, 18 | force_up = FALSE 19 | ) 20 | } 21 | \arguments{ 22 | \item{analyses}{List. Specifies the desired analyses. Each analysis is a formula. See Details and Examples.} 23 | 24 | \item{year}{Integer. One or more years for which microdata are pooled to compute \code{analyses} (i.e. ACS recipient year). Currently defaults to \code{year = 2015:2019}, if the \code{by} variables indicate a sub-PUMA analysis requiring UrbanPop weights.} 25 | 26 | \item{respondent}{Character. Should the \code{analyses} be computed using \code{"household"}- or \code{"person"}-level microdata?} 27 | 28 | \item{by}{Character. Optional variable(s) that collectively define the set of population subgroups for which each analysis is computed. 
Can be a mix of geographic (e.g. census tract) and/or socio-demographic microdata variables (e.g. poverty status); the latter may be existing variables on disk or custom variables created on-the-fly via \code{fun()}. If \code{NULL}, analysis is done for the whole (national) sample.} 29 | 30 | \item{area}{Call. Optional unquoted call specifying a geographic area within which to compute the \code{analyses}. Useful for restricting the study area to a manageable size.} 31 | 32 | \item{fun}{Function. Optional function for creating custom microdata variables that cannot be accommodated in \code{analyses}. Must take \code{data} and (optionally) \code{weight} as the only function arguments and must return a \code{data.frame} with number of rows equal to \code{nrow(data)}. See Details and Examples.} 33 | 34 | \item{M}{Integer. The first \code{M} implicates are used. Set \code{M = Inf} to use all available implicates.} 35 | 36 | \item{R}{Integer. The first \code{R} replicate weights are used. Set \code{R = Inf} to use all available replicate weights.} 37 | 38 | \item{cores}{Integer. Number of cores used for multithreading in \code{\link[collapse]{collapse-package}} functions.} 39 | 40 | \item{version_up}{Integer. Use \code{version_up = 2} to access 10-replicate weights for 17 metro areas or \code{version_up = 3} to access 40-replicate weights for 17 metro areas.} 41 | 42 | \item{force_up}{Logical. If \code{TRUE}, force use of UrbanPop weights even if the requested analysis can be done using native ACS weights.} 43 | } 44 | \value{ 45 | A tibble reporting analysis results, possibly across subgroups defined in \code{by}. The returned quantities include: 46 | 47 | \describe{ 48 | \item{lhs}{Optional analysis name; the "left hand side" of the analysis formula.} 49 | \item{rhs}{The "right hand side" of the analysis formula.} 50 | \item{type}{Type of analysis: sum, mean, median, prop(ortion) or count.} 51 | \item{level}{Factor levels for categorical analyses; NA otherwise.} 52 | \item{N}{Mean number of valid microdata observations across all implicates and replicates; i.e. the sample size used to construct the estimate.} 53 | \item{est}{Point estimate; mean estimate across all implicates and replicates.} 54 | \item{moe}{Margin of error associated with the 90\% confidence interval.} 55 | \item{se}{Standard error of the estimate.} 56 | \item{df}{Degrees of freedom used to calculate the margin of error.} 57 | \item{cv}{Coefficient of variation; conventional scale-independent measure of estimate reliability. Calculated as: \code{100 * moe / 1.645 / est}} 58 | } 59 | } 60 | \description{ 61 | For fusionACS internal use only. Calculation of point estimates and associated uncertainty (margin of error) for analyses using ACS and/or fused donor survey variables. 62 | Efficiently computes means, medians, sums, proportions, and counts, optionally across population subgroups. 63 | The use of native ACS weights or ORNL UrbanPop synthetic population weights is automatically determined given the requested geographic resolution. 64 | Requires a local \code{/fusionData} directory in the working directory path with assumed file structure and conventions. 65 | } 66 | \details{ 67 | Allowable geographic units of analysis specified in \code{by} are currently limited to: region, division, state, cbsa10, puma10, county10, cousubfp10 (county subdivision), zcta10 (zip code), tract10 (census tract), and bg10 (block group). 68 | 69 | The final point estimates are the mean estimates across implicates. 
The final margin of error is derived from the pooled standard error across implicates, calculated using Rubin's pooling rules (1987). The within-implicate standard errors are calculated using the replicate weights. 70 | 71 | Each entry in the \code{analyses} list is a \code{\link[stats]{formula}} of the format \code{Z ~ F(E)}, where \code{Z} is an optional, user-friendly name for the analysis, \code{F} is an allowable “outer function”, and \code{E} is an “inner expression” containing one or more microdata variables. For example: 72 | 73 | \code{mysum ~ mean(Var1 + Var2)} 74 | 75 | In this case, the outer function is mean(). Allowable outer functions are: mean(), sum(), median(), sd(), and var(). When the inner expression contains more than one variable, it is first evaluated and then \code{F()} is applied to the result. In this case, an internal variable \code{X = Var1 + Var2} is generated across all observations, and then \code{mean(X)} is computed. 76 | 77 | If no inner expression is desired, the \code{analyses} list can use the following convenient syntax to apply a single outer function to multiple variables: 78 | 79 | \code{mean = c("Var1", "Var2")} 80 | 81 | The inner expression can also utilize any function that takes variable names as arguments and returns a vector with the same length as the inputs. This is useful for defining complex operations in a separate function (e.g. microsimulation). For example: 82 | 83 | \code{myfun = function(Var1, Var2) {Var1 + Var2}} 84 | 85 | \code{mysum ~ mean(myfun(Var1, Var2))} 86 | 87 | The use of sum() or mean() with an inner expression that returns a categorical vector automatically results in category-wise weighted counts and proportions, respectively. For example, the following analysis would fail if evaluated literally, since mean() expects numeric input but the inner expression returns character. Instead, it is interpreted as a request to return weighted proportions for each categorical outcome. 88 | 89 | \code{myprop ~ mean(ifelse(Var1 > 10, 'Yes', 'No'))} 90 | 91 | \code{analyze_fusionACS()} uses "fast" versions of the allowable outer functions, as provided by \code{\link[collapse]{fast-statistical-functions}} in the \code{collapse} package. These functions are highly optimized for weighted, grouped calculations. In addition, outer functions mean(), sum(), and median() enjoy the use of platform-independent multithreading across columns when \code{cores > 1}. Analyses with numerical inner expressions are processed using a series of calls to \code{\link[collapse]{collap}} with unique observation weights. Analyses with categorical inner expressions utilize a series of calls to \code{\link[collapse]{fsum}}.
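As a rough illustration of the categorical case (object names below are hypothetical and not part of the package), a weighted proportion reduces to a grouped sum of observation weights, which is what \code{\link[collapse]{fsum}} provides:

\preformatted{
library(collapse)
# Hypothetical inputs: 'resp' is a categorical outcome, 'w' the observation weights
resp <- sample(c("Yes", "No"), size = 100, replace = TRUE)
w <- runif(100)
counts <- fsum(w, g = resp)     # weighted count of observations per category
props  <- counts / sum(counts)  # weighted proportions across categories
}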
92 | } 93 | \examples{ 94 | # Analysis using ACS native weights for year 2017, by PUMA, in South Atlantic Census Division 95 | # Uses all available implicates and replicate weights 96 | test <- analyze_fusionACS(analyses = list(high_burden ~ mean(dollarel / hincp > 0.05)), 97 | year = 2017, 98 | by = "puma10", 99 | area = division == "South Atlantic") 100 | 101 | # Analysis using UrbanPop 2015-2019 weights, by tract, in Utah (actually Salt Lake City metro given current UrbanPop data) 102 | # Uses 5 (of possible 20) fusion implicates for RECS "dollarel" variable 103 | # Uses 5 (of possible 10) UrbanPop replicate weights 104 | test <- analyze_fusionACS(analyses = list(median_burden ~ median(dollarel / hincp)), 105 | year = 2015:2019, 106 | by = "tract10", 107 | area = state_name == "Utah", 108 | M = 5, 109 | R = 5) 110 | 111 | # User function to create custom variables from microdata 112 | # Variables explicitly referenced in my_fun() are automatically loaded into 'data' within analyze_fusionACS() 113 | # Variables returned by my_fun() may be used in 'by' or inner expressions of 'analyses' 114 | my_fun <- function(data) { 115 | require(tidyverse, quietly = TRUE) 116 | data \%>\% 117 | mutate(elderly = agep >= 65, 118 | energy_expend = dollarel + dollarfo + dollarlp + dollarng, 119 | energy_burden = energy_expend / hincp, 120 | energy_burden = ifelse(hincp < 5000, NA, energy_burden)) \%>\% 121 | select(elderly, energy_burden, energy_expend) 122 | } 123 | 124 | # Analysis using UrbanPop 2015-2019 weights, by zip code and elderly head of household, in Atlanta CBSA 125 | test <- analyze_fusionACS(analyses = list(energy_burden ~ mean(energy_burden), 126 | at_risk ~ mean(energy_burden > 0.075 | acequipm_pub == "No air conditioning")), 127 | year = 2015:2019, 128 | by = c("zcta10", "elderly"), 129 | area = cbsa10 == "12060", 130 | fun = my_fun, 131 | M = 5, 132 | R = 5) 133 | 134 | } 135 | \references{ 136 | Rubin, D.B. (1987). \emph{Multiple imputation for nonresponse in surveys}. Hoboken, NJ: Wiley. 137 | } 138 | -------------------------------------------------------------------------------- /R/impute.R: -------------------------------------------------------------------------------- 1 | #' Impute missing data via fusion 2 | #' 3 | #' @description 4 | #' A universal missing data imputation tool that wraps successive calls to \code{\link{train}} and \code{\link{fuse}} under the hood. Designed for simplicity and ease of use. 5 | #' @param data A data frame with missing values. 6 | #' @param weight Optional name of observation weights column in \code{data}. 7 | #' @param ignore Optional names of columns in \code{data} to ignore. These variables are neither imputed nor used as predictors. 8 | #' @param cores Number of physical CPU cores used by \code{\link[lightgbm]{lightgbm}}. LightGBM is parallel-enabled on all platforms if OpenMP is available. 9 | #' @details Variables with missing values are imputed sequentially, beginning with the variable with the fewest missing values. Since LightGBM models accommodate NA values in the predictor set, all available variables are used as potential predictors (excluding \code{ignore} variables). For each call to \code{\link{train}}, 80% of observations are randomly selected for training and the remaining 20% are used as a validation set to determine an appropriate number of tree learners. All LightGBM model parameters are kept at the sensible default values in \code{\link{train}}. 
Since \code{\link[lightgbm]{lightgbm}} uses OpenMP multithreading, it is not advisable to use \code{\link{impute}} inside a forked/parallel process when \code{cores > 1}. 10 | #' @return A data frame with all missing values imputed. 11 | #' @examples 12 | #' # Create data frame with random NA values 13 | #' ?recs 14 | #' data <- recs[, 2:7] 15 | #' miss <- replicate(ncol(data), runif(nrow(data)) < runif(1, 0.01, 0.3)) 16 | #' data[miss] <- NA 17 | #' colSums(is.na(data)) 18 | #' 19 | #' # Impute the missing values 20 | #' result <- impute(data) 21 | #' anyNA(result) 22 | #' 23 | #' @export 24 | 25 | #--- 26 | 27 | # library(tidyverse) 28 | # library(data.table) 29 | # source("R/utils.R") 30 | # data <- fst::read_fst("~/Documents/Projects/fusionData/test_hus.fst") 31 | # weight = "WGTP" 32 | # cores = 1 33 | # 34 | # #ignore <- names(select(data[1, ], SERIALNO, PUMA, WGTP1:SERIALNO_original)) 35 | # # Specify this in ASC processing script 36 | # ignore <- setdiff(names(data), c(y, "WGTP", "puma_rent", "puma_value", "puma_income", "puma_mortgage", "ST", "NP", "ACR", "BLD", "FS", "HFL", "HHL", "BDSP", "BDS", "RMSP", "RMS", "TEN", "VEH", "YBL", "YRBLT", "HINCP", "FES", "WIF", "R18", "R65")) 37 | # 38 | # test <- impute(data, "WGTP", ignore = ignore) 39 | 40 | #--- 41 | 42 | impute <- function(data, 43 | weight = NULL, 44 | ignore = NULL, 45 | cores = parallel::detectCores(logical = FALSE) - 1L) { 46 | 47 | t0 <- Sys.time() 48 | 49 | fst::threads_fst(nr_of_threads = cores) 50 | setDTthreads(threads = cores) 51 | 52 | stopifnot(exprs = { 53 | is.data.frame(data) 54 | ncol(data) > 1 55 | anyNA(data) 56 | any(is.null(weight), length(weight) == 1 & weight %in% names(data)) 57 | any(is.null(ignore), all(ignore %in% names(data))) 58 | cores > 0 & cores %% 1 == 0 & cores <= parallel::detectCores(logical = FALSE) 59 | }) 60 | 61 | dnames <- names(data) 62 | data.dt <- is.data.table(data) 63 | d <- as.data.table(data) 64 | rm(data) 65 | 66 | if (is.null(weight)) { 67 | weight = "W_.._" 68 | d[, W_.._ := 1L] 69 | } else { 70 | if (anyNA(d[[weight]])) stop("NA values are not allowed in 'weight'") 71 | } 72 | 73 | miss <- sort(colSums(is.na(d))) 74 | y <- names(miss)[miss > 0 & miss < nrow(d)] 75 | y <- setdiff(y, ignore) 76 | if (!length(y)) stop("No un-ignored columns with NA values to impute") 77 | temp.fsn <- paste0(tempfile(), ".fsn") 78 | 79 | #--- 80 | 81 | # Predictor prescreen step if 'd' is sufficiently large 82 | #if (length(x) & (nrow(d) > 10e3 | ncol(d) > 10)) { 83 | if ((nrow(d) > 10e3 | ncol(d) > 20)) { 84 | 85 | d2 <- copy(d) 86 | d2[, (weight) := NULL] 87 | 88 | # Convert 'd2' to plausible ranks for correlation screening 89 | # All output columns should be NA, integer, or logical 90 | # NA's in input are preserved in output 91 | for (i in 1:ncol(d2)) { 92 | z <- d2[[i]] 93 | if (is.numeric(z)) { 94 | # Ties method 'dense' ensures integer output with minimum of 1 and maximum of length(na.omit(z)) 95 | z <- frank(z, ties.method = "dense", na.last = "keep") 96 | } else { 97 | # Converts ordered factors to integer 98 | if (is.ordered(z)) { 99 | z <- as.integer(z) 100 | } else { 101 | if (!is.logical(z)) { 102 | # Converts character and un-ordered factors to TRUE for the most-common (non-NA) value and FALSE otherwise 103 | #zt <- table2(z, na.rm = TRUE) 104 | zt <- collapse::qtable(z, na.exclude = TRUE) 105 | z <- z == names(which.max(zt)) 106 | } 107 | } 108 | } 109 | set(d2, j = i, value = z) 110 | } 111 | 112 | # Randomly down-sample when 'd2' is large 113 | if (nrow(d2) > 100e3) d2 <- 
d2[sample.int(nrow(d2), 100e3), ] 114 | 115 | # Correlation matrix 116 | # Note that Spearman (rank) correlations are used (data pre-ranked) to reduce effect of outliers 117 | ok <- setdiff(names(d2), ignore) 118 | cmat <- suppressWarnings(cor(d2[, ..y], d2[, ..ok], use = "pairwise.complete.obs")) 119 | cmat[is.na(cmat)] <- 0 120 | 121 | # Initial correlation screening, based on absolute correlation value 122 | xlist <- lapply(y, function(v) { 123 | 124 | p <- cmat[v, ] 125 | names(p) <- colnames(cmat) 126 | p <- p[names(p) != v] 127 | p <- sort(abs(p), decreasing = TRUE) 128 | 129 | # Restrict to predictors that meet arbitrary absolute correlation threshold (> 0.025) 130 | # Limit to maximum 30 potential predictors for each model (hard cap) 131 | # Attempt to retain minimum 5 potential predictors regardless of correlation 132 | xv <- names(p[p > 0.025]) 133 | xv <- xv[1:min(30, length(xv))] 134 | if (length(xv) < 5) xv <- names(p)[1:min(5, length(p))] 135 | 136 | return(xv) 137 | 138 | }) 139 | 140 | rm(d2) 141 | 142 | } else { 143 | 144 | xlist <- lapply(y, function(v) setdiff(names(d), c(v, ignore, weight))) 145 | 146 | } 147 | 148 | xlist <- setNames(xlist, y) 149 | 150 | #--- 151 | 152 | # Coerce any character variables in 'y' or 'xlist' to unordered factor 153 | # This is necessary to avoid errors in train() and fuse(), which expect factors 154 | # The 'cconv' columns are converted to character prior to returning final function output 155 | ccols <- names(which(sapply(d, is.character))) 156 | cconv <- intersect(ccols, unique(c(y, unlist(xlist)))) 157 | if (length(cconv) > 0) d[, (cconv) := lapply(.SD, factor), .SDcols = cconv] 158 | 159 | # Check number of factor levels in 'y' variables 160 | nlev <- sapply(d[, ..y], nlevels) 161 | bad <- names(nlev)[nlev >= 200] 162 | if(length(bad)) { 163 | stop("Detected categorical imputation variable(s) with more than 200 levels (not allowed):\n", paste(bad, collapse = "\n")) 164 | } 165 | 166 | #--- 167 | 168 | pb <- txtProgressBar(max = length(y), style = 3) 169 | 170 | for (i in 1:length(y)) { 171 | 172 | # Response and predictor variables 173 | v <- y[i] 174 | xv <- xlist[[i]] 175 | vtrain <- c(weight, v, xv) 176 | 177 | # Observations to impute 178 | imp <- is.na(d[[v]]) 179 | 180 | # Training dataset 181 | dtrain <- d[!imp, ..vtrain] 182 | 183 | # If sample size is large, use a stratified sample of the training data 184 | # If there are few missing values, then fewer training observations are possible (10x the number missing) 185 | # Restricts training sample to no less than 5k and no more than 50k observations 186 | maxn <- min(max(5e3, 10 * sum(imp)), 50e3) 187 | if (nrow(dtrain) > maxn) { 188 | keep <- stratify(y = dtrain[[v]], 189 | ycont = is.numeric(dtrain[[v]]), 190 | tfrac = maxn / nrow(dtrain), 191 | ntiles = 20) 192 | dtrain <- dtrain[keep, ] 193 | } 194 | 195 | # Train fusion model (saved to 'temp.fsn' temporary file) 196 | # Suppress output to the console 197 | invisible(capture.output( 198 | fusionModel::train(data = dtrain, 199 | y = v, 200 | x = xv, 201 | fsn = temp.fsn, 202 | weight = weight, 203 | nfolds = 0.8, 204 | nquantiles = 2, 205 | cores = cores) 206 | )) 207 | 208 | # Perform fusion/imputation 209 | # Suppress output to the console 210 | invisible(capture.output( 211 | p <- fusionModel::fuse(data = d[imp, ], 212 | fsn = temp.fsn, 213 | M = 1, 214 | cores = cores) 215 | )) 216 | 217 | # Update 'd' with imputed values 218 | set(d, i = which(imp), j = v, value = p[[2]]) 219 | 220 | # Update progress bar 221 | 
setTxtProgressBar(pb, value = i) 222 | 223 | } 224 | 225 | # Close progress bar 226 | close(pb) 227 | 228 | # Remove temporary fusion model 229 | unlink(temp.fsn) 230 | 231 | # Check for NA's in output 232 | stopifnot(!anyNA(d[, ..y])) 233 | 234 | # If any character columns were converted to factor, convert back to character 235 | if (length(cconv) > 0) d[, (cconv) := lapply(.SD, as.character), .SDcols = cconv] 236 | 237 | # Ensure output column order and class matches input 'data' 238 | suppressWarnings(set(d, j = "W_.._", value = NULL)) # Removes the placeholder weight variable, if present 239 | setcolorder(d, dnames) 240 | if (!data.dt) d <- as.data.frame(d) 241 | 242 | # Report processing time 243 | tout <- difftime(Sys.time(), t0) 244 | cat("Total processing time:", signif(as.numeric(tout), 3), attr(tout, "units"), "\n", sep = " ") 245 | 246 | return(d) 247 | 248 | } 249 | -------------------------------------------------------------------------------- /R/assemble.R: -------------------------------------------------------------------------------- 1 | #' Assemble fusionACS microdata across surveys 2 | #' 3 | #' @description 4 | #' For fusionACS usage only. Provides a safe and efficient way to assemble (merge) fused microdata across surveys to return a single data table with the requested variables. The requested variables can come from any fused (donor) survey and/or the American Community Survey (ACS). The necessary variables are automatically and efficiently read from the appropriate local file and safely merged on household and/or person ID variables, optionally collapsing or expanding records as necessary depending on the \code{respondent} argument. Assumes (and checks for) a local \code{/fusionData} directory with appropriate file structure and conventions. 5 | #' 6 | #' @param year Integer. One or more years for which to return results (i.e. the ACS recipient year). 7 | #' @param var Character. Name of one or more variables to return. May contain household- and/or person-level variables. See Details. 8 | #' @param respondent Character. Should \code{"household"}- or \code{"person"}-level microdata be returned? 9 | #' @param M Integer. The first \code{M} implicates are returned for fused variables. Set \code{M = Inf} to return all implicates. Ignored if \code{var} contains only ACS variables (i.e. no implicates) 10 | #' @param df Data frame. Data frame used to identify a subset of rows to return. Default is to return all rows. 11 | #' @param cores Integer. Number of cores used by the \code{\link[fst]{fst-package}} when reading from disk. 12 | #' @param source Character Specifies where to look for \code{var}: all available microdata (\code{source = "all"}); only ACS microdata (\code{source = "ACS"}); or only fused microdata (\code{source = "fused"}). Note that no observation weights are returned if \code{source = "fused"}, since weights are stored in the ACS microdata. 13 | #' @param silent Logical. If \code{FALSE}, a warning is issued if any \code{var} cannot be located in available local files. 14 | #' 15 | #' @details The \code{var} argument can contain a mix of household- and/or person-level variables. When \code{respondent = "household"}, the reference person (i.e. head of household) value is returned for any person-level variables. When \code{respondent = "person"}, the values of any household-level variables are replicated for each person in the household. 
16 | #' 17 | #' @return A keyed data table containing the following columns, in addition to the variables named in \code{var}: 18 | #' \describe{ 19 | #' \item{M}{Implicate number. See \code{\link{fuse}}.} 20 | #' \item{year}{Year of the ACS recipient microdata.} 21 | #' \item{hid}{ACS household ID using fusionACS convention.} 22 | #' \item{pid}{ACS person ID using fusionACS convention, if \code{respondent = "person"}.} 23 | #' \item{weight}{ACS microdata primary sample weight.} 24 | #' } 25 | #' 26 | #' @examples 27 | #' # NOTE: Requires local /fusionData directory containing the necessary ACS and .fsd files 28 | #' test <- assemble(year = 2018:2019, 29 | #' var = c("dollarel", "hincp", "agep", "state"), 30 | #' respondent = "household", 31 | #' M = 1) 32 | #' @export 33 | 34 | #------------ 35 | 36 | # library(tidyverse) 37 | # library(data.table) 38 | # source("R/utils.R") 39 | # setwd( "/home/kevin/Documents/Projects/fusionData") 40 | # 41 | # test <- assemble(year = 2015, 42 | # #var = c("dollarel" ,"dollarng", "dollarfo" ,"dollarlp" ,"hincp", "pov_ratio", "agep", "ref_race5", "rac1p", "hisp", "state", "puma10"), 43 | # var = c("hincp", "pov_ratio", "state", "puma10", "agep"), 44 | # respondent = "household", 45 | # M = 1, 46 | # cores = 2) 47 | 48 | #----- 49 | 50 | # year = 2019 51 | # var = c("dollarel") 52 | # respondent = "household" 53 | # M = 2 54 | # df = NULL 55 | # cores = 2 56 | # source = "fused" 57 | # silent= TRUE 58 | 59 | #----- 60 | 61 | assemble <- function(year, 62 | var, 63 | respondent = "household", 64 | M = 1, 65 | df = NULL, 66 | cores = 1, 67 | source = "all", 68 | silent = FALSE) { 69 | 70 | # Check validity of the working directory path 71 | # Checks if "/fusionData" is part of the path, as this is required 72 | b <- strsplit(full.path(getwd()), .Platform$file.sep, fixed = TRUE)[[1]] 73 | i <- which(b == "fusionData") 74 | if (length(i) == 0) stop("'/fusionData' is not part of the working directory path; this is required.") 75 | dir <- paste(b[1:i], collapse = .Platform$file.sep) 76 | 77 | # Capture the function call; added as attribute in the final output 78 | mcall <- match.call() 79 | 80 | # Respondent identifier ("H" or "P") 81 | rtype <- substring(toupper(respondent), 1, 1) 82 | 83 | # Initial argument check 84 | stopifnot(exprs = { 85 | year >= 2005 & year %% 1 == 0 86 | is.character(var) & length(var) > 0 87 | rtype %in% c("H", "P") 88 | is.null(df) | is.data.frame(df) 89 | M > 0 90 | cores > 0 & cores %% 1 == 0 91 | tolower(source) %in% c('all', 'acs', 'fused') 92 | is.logical(silent) 93 | }) 94 | 95 | # Universal variables always returned in output, if possible 96 | uvar <- c('M', 'year', 'hid', 'pid', 'weight') 97 | 98 | fst::threads_fst(nr_of_threads = cores) 99 | setDTthreads(threads = cores) 100 | hh <- rtype == "H" 101 | 102 | # Specified source for each 'var', if provided 103 | vsrc <- ifelse(str_detect(var, ":"), str_extract(var, "^[^:]+"), "") 104 | v <- ifelse(str_detect(var, ":"), str_extract(var, "(?<=:).*"), var) 105 | i <- !v %in% uvar 106 | vsrc <- vsrc[i] 107 | v <- v[i] 108 | 109 | #----- 110 | 111 | result <- lapply(year, function(yr) { 112 | 113 | # Get file paths to ACS microdata for 'yr' 114 | pa <- list.files(file.path(dir, "survey-processed/ACS"), pattern = paste0(yr, "_._processed.fst"), recursive = TRUE, full.names = TRUE) 115 | pa <- sort(pa, decreasing = rtype == "P") # Sort the processed ACS paths, to place either Household or Person microdata first 116 | pc <- list.files(file.path(dir, "survey-processed/ACS"), pattern =
paste0(yr, "_._custom.fst"), recursive = TRUE, full.names = TRUE) 117 | 118 | # Get file path(s) to fused microdata for 'yr' 119 | pf <- rev(list.files(file.path(dir, "fusion"), pattern = paste0(yr, "_._fused.fsd"), recursive = TRUE, full.names = TRUE)) 120 | 121 | # Select paths based on 'source' argument 122 | fpaths <- switch(tolower(source), 123 | all = c(pa, pc, pf), 124 | acs = c(pa, pc), 125 | fused = pf) 126 | 127 | # The survey name (e.g. RECS_2020) associated with each path 128 | survey <- str_extract(basename(fpaths), "^[^_]*_[^_]*") 129 | 130 | #--- 131 | 132 | # Check the donor survey 'fpaths' for duplicate occurrences of requested variables 133 | # Stop with error if conflict(s) detected 134 | # This can occur if there are identically named variables across surveys OR multiple vintages of a donor survey are fused to the same ACS vintage (e.g. RECS 2015 and RECS 2015 fused to ACS 2015-2019) 135 | i <- substring(survey, 1, 4) != "ACS_" 136 | vlist <- lapply(fpaths[i], function(x) intersect(v[vsrc == ""], fst::metadata_fst(x)$columnNames)) 137 | u <- table(unlist(vlist)) 138 | check <- lapply(vlist, function(x) intersect(x, names(u)[u > 1])) 139 | names(vlist) <- names(check) <- survey[i] 140 | if (any(lengths(check) > 0)) { 141 | check <- check[lengths(check) > 0] 142 | error.msg <- paste(capture.output(str(check)), collapse = "\n") 143 | stop("Conflicting 'var' names. The following variables are present in more than one source file:\n", error.msg, "\nRevise 'var' to use a colon to specify the source; e.g. ", paste0("'", names(check)[1], ":", check[[1]][1], "'")) 144 | } 145 | 146 | # Update the 'vsrc' object to assign the source survey for 'v' that are unassigned 147 | temp <- lapply(vlist, function(x) intersect(x, names(u)[u == 1])) 148 | for (i in seq_along(temp)) { 149 | for (j in temp[[i]]) { 150 | vsrc[v == j] <- names(vlist)[i] 151 | } 152 | } 153 | 154 | # Data frame with variable sources 155 | dv <- data.frame(year = yr, var = v, source = vsrc) 156 | 157 | #--- 158 | 159 | d <- data.table() 160 | for (x in fpaths) { 161 | 162 | a <- substring(basename(x), 1, 4) == "ACS_" 163 | r <- rev(strsplit(x, "_")[[1]])[[2]] 164 | xn <- fst::metadata_fst(x)$columnNames 165 | 166 | keep <- if (!all(c('year', 'hid') %in% xn)) { 167 | warning("Skipping file ", basename(x), " due to irregular file structure") 168 | NULL 169 | } else { 170 | i <- vsrc == str_extract(basename(x), "^[^_]*_[^_]*") | vsrc == "" 171 | temp <- intersect(xn, c(v[i], names(df))) 172 | if (a & rtype == r) temp <- c(temp, 'weight') 173 | setdiff(temp, names(d)) # Excludes any variables already in 'd' 174 | } 175 | 176 | # Load requested variables from disk 177 | if (length(keep)) { 178 | dt <- fusionModel::read_fsd(path = x, 179 | columns = intersect(xn, c('M', 'year', 'hid', 'pid', keep)), 180 | M = M, 181 | df = if (any(xn %in% names(df))) select(df, any_of(xn)) else NULL, 182 | cores = cores) 183 | } else { 184 | dt <- data.table() 185 | } 186 | 187 | # If household-level data requested, merge the reference person value for any person-level variables 188 | # TO DO: Allow different summary metric besides just the reference person value 189 | if (hh & rtype != r & nrow(dt) > 0) { 190 | i <- !duplicated(dt$hid) # Retains the reference person data; first entry within each 'hid' 191 | dt <- select(dt[i], -pid) 192 | } 193 | 194 | # NOTE: If person-level data is requested, the merge() below causes any household-level variables to be replicated for each person 195 | # Since it is an left merge, persons not in the 
household data (i.e. group quarter population) will contain NA values 196 | # If no household variables need to be added, then the returned person microdata will include the group quarter individuals 197 | if (nrow(dt) > 0) { 198 | setkeyv(dt, cols = intersect(c('M', 'year', 'hid', 'pid'), names(dt))) 199 | if (nrow(d) == 0) { 200 | d <- dt 201 | } else { 202 | #d <- merge(d, dt, by = intersect(key(d), key(dt)), allow.cartesian = TRUE) 203 | d <- collapse::join(x = d, 204 | y = dt, 205 | on = intersect(key(d), key(dt)), 206 | how = "left", # See NOTE above about left join and household vs. person data 207 | multiple = TRUE, 208 | verbose = FALSE) 209 | setkeyv(d, cols = key(dt)) 210 | } 211 | } 212 | rm(dt) 213 | 214 | } 215 | 216 | #return(d) 217 | return(list(d, dv)) 218 | 219 | }) 220 | 221 | # Extract and rbind the attribute data.frames 222 | attr.df <- result %>% 223 | purrr::map(2) %>% 224 | rbindlist() 225 | 226 | # Extract and rbind the microdata output 227 | result <- result %>% 228 | purrr::map(1) %>% 229 | rbindlist() %>% 230 | setcolorder(neworder = intersect(c(uvar, var), names(.))) %>% 231 | setattr(name = "origin", value = attr.df) %>% 232 | setattr(name = "assemble", value = mcall) 233 | 234 | # Set keys 235 | if (nrow(result) > 0) setkeyv(result, cols = intersect(c('M', 'year', 'hid', 'pid'), names(result))) 236 | 237 | # Any 'var' missing in the output? If so, report as warning. 238 | miss <- setdiff(v, names(result)) 239 | if (length(miss) & !silent) warning("Could not locate the following variable(s): ", paste(miss, collapse = ", ")) 240 | 241 | return(result) 242 | 243 | } 244 | -------------------------------------------------------------------------------- /R/validate.R: -------------------------------------------------------------------------------- 1 | #' Validate fusion output 2 | #' 3 | #' @description 4 | #' Performs internal validation analyses on fused microdata to estimate how well the simulated variables reflect patterns in the dataset used to train the underlying fusion model (i.e. observed/donor data). This provides a standard approach to validating fusion output and associated models. See Examples for recommended usage. 5 | #' 6 | #' @param observed Data frame. Observed data against which to validate the \code{simulated} variables. Typically the same dataset used to \code{\link{train}} the fusion model used to generate \code{simulated}. 7 | #' @param implicates Data frame. Implicates of synthetic (fused) variables. Typically generated by \link{fuse}. The implicates should be row-stacked and identified by integer column "M". 8 | #' @param subset_vars Character. Vector of columns in \code{observed} used to define the population subsets across which the fusion variables are validated. The levels of each \code{subset_vars} (including all two-way interactions of \code{subset_vars}) define the population subsets. Continuous \code{subset_vars} are converted to a five-level ordered factor based on a univariate k-means clustering. 9 | #' @param weight Character. Name of the observation weights column in \code{observed}. If NULL (default), uniform weights are assumed. 10 | #' @param min_size Integer. Subsets with less than \code{min_size} observations are excluded. Since subsets with few observations are unlikely to give reliable estimates, it doesn't make sense to consider them for validation purposes. 11 | #' @param plot Logical. If TRUE (default), \code{\link{plot_valid}} is called internally and summary plots are returned along with complete validation results. 
Requires the \code{\link{ggplot2}} package. 12 | #' @param cores Integer. Number of cores used. Only applicable on Unix systems. 13 | #' 14 | #' @details The objective of \code{\link{validate}} is to confirm that the fusion output is sensible and help establish the utility of the synthetic data across myriad analyses. Utility here is based on comparison of point estimates and confidence intervals derived using multiple-implicate synthetic data with those derived using the original donor data. 15 | #' 16 | #' The specific analyses tested include variable levels (means and proportions) across population subsets of varying size. This allows assessment of how each of the synthetic variables performs in analyses with real-world relevance, at varying levels of complexity. In effect, \code{validate()} performs a large number of analyses of the kind that the \code{\link{analyze}} function is designed to do on a one-by-one basis. 17 | #' 18 | #' Most users will want to use the default setting \code{plot = TRUE} to simultaneously return visualization (plots) of the validation results. Plot creation is detailed in \code{\link{plot_valid}}. 19 | #' 20 | #' @return If \code{plot = FALSE}, a data frame containing complete validation results. If \code{plot = TRUE}, a list containing the complete validation results as well as additional plot objects as described in \code{\link{plot_valid}}. 21 | #' 22 | #' @examples 23 | #' # Build a fusion model using RECS microdata 24 | #' # Note that "fusion_model.fsn" will be written to working directory 25 | #' fusion.vars <- c("electricity", "natural_gas", "aircon") 26 | #' predictor.vars <- names(recs)[2:12] 27 | #' fsn.path <- train(data = recs, 28 | #' y = fusion.vars, 29 | #' x = predictor.vars, 30 | #' weight = "weight") 31 | #' 32 | #' # Fuse back onto the donor data (multiple implicates) 33 | #' sim <- fuse(data = recs, 34 | #' fsn = fsn.path, 35 | #' M = 20) 36 | #' 37 | #' # Calculate validation results 38 | #' valid <- validate(observed = recs, 39 | #' implicates = sim, 40 | #' subset_vars = c("income", "education", "race", "urban_rural")) 41 | #' 42 | #' @export 43 | 44 | #----- 45 | 46 | # library(fusionModel) 47 | # library(dplyr) 48 | # library(data.table) 49 | # library(purrr) 50 | # library(ggplot2) 51 | # library(fst) 52 | # source("R/utils.R") 53 | # source("R/analyze_funs.R") 54 | # observed <- read_fst("~/Documents/Projects/fusionData/fusion/RECS/2015/2015/input/RECS_2015_2015_train.fst") 55 | # implicates <- read_fsd("~/Documents/Projects/fusionData/fusion/RECS/2015/2015/output/RECS_2015_2015_valid.fsd") 56 | # subset_vars = c("moneypy__hincp", "hhage__agep", "householder_race__rac1p", "education__schl", "nhsldmem__np", "kownrent__ten", "loc..recs_division") 57 | # weight = "weight" 58 | # min_size = 30 59 | # plot = TRUE 60 | # cores = 1 61 | 62 | #----- 63 | 64 | validate <- function(observed, 65 | implicates, 66 | subset_vars, 67 | weight = NULL, 68 | min_size = 30, 69 | plot = TRUE, 70 | cores = 1) { 71 | 72 | t0 <- Sys.time() 73 | 74 | obs <- as.data.table(observed) 75 | sim <- as.data.table(implicates) 76 | rm(observed, implicates) 77 | 78 | # Check inputs 79 | stopifnot(exprs = { 80 | all(setdiff(names(sim), "M") %in% names(obs)) 81 | is.character(subset_vars) & all(subset_vars %in% names(obs)) 82 | is.null(weight) | (length(weight) == 1 & weight %in% names(obs)) 83 | min_size > 0 84 | is.logical(plot) 85 | cores > 0 & cores %% 1 == 0 86 | }) 87 | 88 | # Check that input dimensions are consistent with one another 89 | N <- nrow(obs) 90 | Mimp <- max(sim$M) 91 | if
(Mimp == 1) stop("Validation requires 'implicates' to contain more than one implicate") 92 | nM <- sim[, .N, by = M] 93 | stopifnot(N == nrow(sim) / Mimp) 94 | stopifnot(all(nM$M %in% seq_len(Mimp))) 95 | stopifnot(all(nM$N == N)) 96 | 97 | if (is.null(weight)) { 98 | cat("Assuming uniform sample weights\n") 99 | obs$W <- 1L 100 | } else { 101 | setnames(obs, weight, "W") 102 | obs[, W := W / mean(W)] 103 | } 104 | 105 | # Variables for which estimates will be computed 106 | y <- setdiff(names(sim), "M") 107 | yN <- length(y) # Reported to console later 108 | if (any(y %in% subset_vars)) stop("Fused variabes are not allowed as 'subset_vars'") 109 | 110 | # Generate list of combinations of subset variables to test 111 | # Restricted to no more than 2 subset variables per combination (m = 2) 112 | scomb <- as.list(subset_vars) 113 | if (length(subset_vars) > 1) scomb <- c(scomb, combn(subset_vars, m = 2, simplify = FALSE)) 114 | 115 | # Restrict 'obs' to only the necessary variables 116 | v <- c("W", y, subset_vars) 117 | obs <- obs[, ..v] 118 | 119 | #--- 120 | 121 | # Which subset_vars are numeric or ordered factor? 122 | sclus <- names(which(sapply(obs[, ..subset_vars], function(x) is.ordered(x) | is.numeric(x)))) 123 | 124 | # Convert numeric and ordered factor subset variables into clustered, ordered factors with k levels via uniCluster() 125 | for (v in sclus) set(obs, j = v, value = uniCluster(x = obs[[v]], k = 5)) 126 | 127 | #--- 128 | 129 | # Convert logical 'y' variables to factor so they are handled appropriately 130 | ylog <- names(which(sapply(obs[, ..y], is.logical))) 131 | for (v in ylog) { 132 | set(obs, j = v, value = as.factor(obs[[v]])) 133 | set(sim, j = v, value = as.factor(sim[[v]])) 134 | } 135 | 136 | # One-hot encode the y factor variables 137 | yfct <- names(which(sapply(obs[, ..y], is.factor))) 138 | if (length(yfct) > 0) { 139 | cat("One-hot encoding categorical fusion variables\n") 140 | obs <- one_hot(obs, yfct) 141 | sim <- one_hot(sim, yfct) 142 | y <- setdiff(names(sim), "M") 143 | } 144 | 145 | #--- 146 | 147 | # Detect and impute any missing values in 'observed' 148 | # Note that 'simulated' should not have any NA's by design 149 | na.cols <- names(which(sapply(obs, anyNA))) 150 | if (length(na.cols) > 0) { 151 | cat("Missing values imputed for the following variable(s):\n", paste(na.cols, collapse = ", "), "\n") 152 | for (j in na.cols) { 153 | x <- observed[[j]] 154 | ind <- is.na(x) 155 | observed[ind, j] <- imputationValue(x, ind) 156 | } 157 | } 158 | 159 | #--- 160 | 161 | # Combine observed and simulated data 162 | v <- c("W", subset_vars) 163 | sim <- cbind(sim, obs[rep(1:N, Mimp), ..v]) 164 | 165 | # Key the data.tables for speed in 'by' operations below 166 | setkeyv(sim, c("M", subset_vars)) 167 | setkeyv(obs, subset_vars) 168 | 169 | #--- 170 | 171 | # Report the overall correlation between observed and simulated fusion variables 172 | # This is a quick check if the simulated data are simply replicating the original values (over-fitting) 173 | ycor <- sapply(y, function(v) suppressWarnings(cor(rep(obs[[v]], Mimp), sim[, ..v]))) 174 | cat("Correlation between observed and fused values:\n") 175 | print(summary(ycor), digits = 2) 176 | 177 | #--- 178 | 179 | # TEST -- generate random versions of the 'y' variables in 'sim' 180 | # for (v in y) { 181 | # set(sim, j = paste0(v, "__RND"), value = sample(obs[[v]], size = nrow(sim), replace = TRUE)) 182 | # } 183 | # ysim <- c(y, paste0(y, "__RND")) 184 | 185 | #--- 186 | 187 | # !!! 
TO DO: Potentially return correlation coefficients for each subset 188 | # x <- filter(sim, M == 1) 189 | # test <- cor(x[, ..y]) 190 | # f <- function(x) cor(x[, ..y]) # Restrict to desired entries... 191 | # test <- sim[, f(.SD), by = M, .SDcols = y] 192 | 193 | #--- 194 | 195 | # Function to compute confidence interval bounds 196 | # p = 0.95 entails a 90% confidence interval 197 | # calcCI <- function(d, p = 0.95) { 198 | # d %>% 199 | # mutate( 200 | # lwr = est - qt(p, df) * se, # CI lower bound 201 | # upr = est + qt(p, df) * se # CI upper bound 202 | # ) 203 | # } 204 | 205 | # Function to compute margin of error (MOE) 206 | # p = 0.95 entails a 90% confidence interval 207 | calcMOE <- function(d, p = 0.95) { 208 | d %>% 209 | mutate( 210 | moe = se * qt(p, df) 211 | ) 212 | } 213 | 214 | #--- 215 | 216 | # Process a particular combination of subsetting variables 217 | 218 | processSubset <- function(iset) { 219 | 220 | svar <- scomb[[iset]] 221 | 222 | # Calculate the share of observations within each subset, using the observed data 223 | subshr <- obs[, .(share = .N / N), by = svar] 224 | subshr <- subshr[share >= min_size / N] 225 | subshr$id <- paste(iset, 1:nrow(subshr), sep = "_") 226 | 227 | # Restrict 'obs' and 'sim' to subsets with at least 'min_size' observations 228 | obs <- obs[subshr, on = svar] 229 | sim <- sim[subshr, on = svar] 230 | 231 | #--- 232 | 233 | # Calculate outcomes for the observed data 234 | g <- obs[, lapply(.SD, weighted_mean, w1 = W, w2 = W), by = svar, .SDcols = y] 235 | g$metric <- rep(c("estimate1", "estimate2", "variance"), times = nrow(g) / 3) 236 | g <- melt(g, measure.vars = y, variable.name = "y") 237 | g <- dcast(g, ... ~ metric, value.var = "value") 238 | g <- g[subshr, on = svar] # Adds "share" and "id" variables 239 | g <- mutate(g, 240 | est = estimate1, 241 | se = sqrt(variance), 242 | df = as.integer(share * N - 1)) %>% 243 | select(-estimate1, -estimate2, -variance) %>% 244 | calcMOE() 245 | 246 | #--- 247 | 248 | # Calculate outcomes for the simulated data 249 | 250 | # Weighted mean and variance of the mean for each implicate 251 | d <- sim[, lapply(.SD, weighted_mean, w1 = W, w2 = W), by = c("M", svar), .SDcols = y] 252 | #d <- sim[, lapply(.SD, weighted_mean, w1 = W, w2 = W), by = c("M", svar), .SDcols = ysim] 253 | d$metric <- rep(c("estimate1", "estimate2", "variance"), times = nrow(d) / 3) 254 | d <- melt(d, id.vars = c("M", "metric", svar), variable.name = "y") 255 | d <- dcast(d, ... 
~ metric, value.var = "value") 256 | 257 | # Calculate the necessary quantities 258 | # est: Point estimates 259 | # ubar: Mean of the within-implicate variances 260 | # b: Variance of estimates, across the implicates (approximated by variance of mixture of normal distributions) 261 | d <- d[, .(est = mean(estimate1), 262 | ubar = mean(variance), 263 | b = var(estimate1)), 264 | #b = (sum(estimate1 ^ 2 + variance) / Mimp) - mean(estimate1) ^ 2), # Conservative approximation of var(estimate1); does not risk b1 < ubar 265 | by = c(svar, "y")] 266 | 267 | # Adds "share" and "id" variables 268 | d <- d[subshr, on = svar] 269 | 270 | # Calculate margin of error 271 | #maxr <- maxr_fun(Mimp) 272 | d <- mutate(d, 273 | # b = ifelse(ubar / b > maxr, ubar / maxr, b), 274 | # b = ifelse(ubar == 0, 0, b), 275 | 276 | # Rubin 1987 277 | se = sqrt(ubar + (1 + Mimp^(-1)) * b), 278 | r = (1 + Mimp^(-1)) * b / ubar, 279 | df = (Mimp - 1) * (1 + r^(-1)) ^ 2, 280 | df = ifelse(is.infinite(df), as.integer(share * N - 1), df), # Set 'df' to standard value if Inf is returned by Rubin's formula (i.e. when b = 0; no variance across implicates) 281 | r = NULL 282 | 283 | # Reiter and Raghunathan (2007) 284 | # se = sqrt(b * (1 + 1 / Mimp) - ubar), 285 | # df = (Mimp - 1) * (1 - Mimp * ubar / ((Mimp + 1) * b)) ^ 2, 286 | # df = ifelse(se == 0, 1, df) 287 | ) %>% 288 | select(-ubar, -b, -share, -id) %>% 289 | #select(-ubar, -b) %>% 290 | calcMOE() 291 | 292 | # Merge observed and simulated results 293 | comp <- merge(g, d, by = c(svar, "y"), suffixes = c(".obs", ".sim")) %>% 294 | select(-all_of(svar)) 295 | return(comp) 296 | 297 | } 298 | 299 | #--- 300 | 301 | # Process each subset combination in 'scomb' 302 | cat("Processing validation analyses for", yN, "fusion variables\n") 303 | comp <- parallel::mclapply(seq_along(scomb), processSubset, mc.cores = cores) %>% 304 | rbindlist() 305 | 306 | #--- 307 | 308 | # Add the observed mean estimates (est.mean) 309 | avg <- matrixStats::colWeightedMeans(x = as.matrix(obs[, ..y]), w = obs$W) 310 | set(comp, j = "est.mean", value = avg[match(comp$y, names(avg))]) 311 | 312 | #--- 313 | 314 | # Add original factor variable levels, if necessary 315 | if (length(yfct) > 0) { 316 | comp <- merge(x = comp, y = attr(obs, "one_hot_link"), by.x = "y", by.y = "dummy", all.x = TRUE, sort = FALSE) 317 | comp <- comp %>% 318 | mutate(y = ifelse(is.na(level), y, original)) %>% 319 | select(-original) 320 | } else { 321 | comp$level <- NA 322 | } 323 | 324 | #--- 325 | 326 | # Report total unique subsets analyzed 327 | cat("Performed", nrow(comp), "analyses across", uniqueN(comp$id), "subsets\n") 328 | 329 | # Data frame with comparison results 330 | keep <- c("id", "share", "y", "level", "est.obs", "moe.obs", "est.sim", "moe.sim", "est.mean") 331 | result <- comp[, ..keep] 332 | result$y <- as.character(result$y) 333 | class(result) <- c("validate", class(comp)) 334 | 335 | # Clean up 336 | rm(obs, sim, comp) 337 | 338 | #--- 339 | 340 | # Check if ggplot2 is installed; if so, call plot_valid() prior to returning result 341 | if (plot) { 342 | suppressMessages(ok <- require(ggplot2, quietly = TRUE, warn.conflicts = FALSE)) 343 | if (ok) { 344 | result <- plot_valid(result, cores = cores) 345 | class(result) <- c("validate", class(result)) 346 | } else { 347 | cat("Skipping plot generation because 'ggplot2' package is not installed\n") 348 | } 349 | } 350 | 351 | # Report processing time 352 | tout <- difftime(Sys.time(), t0) 353 | cat("Total processing time:", signif(as.numeric(tout), 
3), attr(tout, "units"), "\n", sep = " ") 354 | 355 | return(result) 356 | 357 | } 358 | -------------------------------------------------------------------------------- /R/prepXY.R: -------------------------------------------------------------------------------- 1 | #' Prepare the 'x' and 'y' inputs 2 | #' 3 | #' @description 4 | #' Optional-but-useful function to: 1) provide a plausible ordering of the 'y' (fusion) variables and 2) identify the subset of 'x' (predictor) variables likely to be consequential during subsequent model training. Output can be passed directly to \code{\link{train}}. Most useful for large datasets with many and/or highly-correlated predictors. Employs an absolute Spearman rank correlation screen and then LASSO models (via \code{\link[glmnet]{glmnet}}) to return a plausible ordering of 'y' and the preferred subset of 'x' variables associated with each. 5 | #' 6 | #' @param data Data frame. Training dataset. All categorical variables should be factors and ordered whenever possible. 7 | #' @param y Character or list. Variables in \code{data} to eventually fuse to a recipient dataset. If \code{y} is a list, each entry is a character vector possibly indicating multiple variables to fuse as a block. 8 | #' @param x Character. Predictor variables in \code{data} common to donor and eventual recipient. 9 | #' @param weight Character. Name of the observation weights column in \code{data}. If NULL (default), uniform weights are assumed. 10 | #' @param cor_thresh Numeric. Predictors that exhibit less than \code{cor_thresh} absolute Spearman (rank) correlation with a \code{y} variable are screened out prior to the LASSO step. Fast exclusion of predictors that the LASSO step probably doesn't need to consider. 11 | #' @param lasso_thresh Numeric. Controls how aggressively the LASSO step screens out predictors. Lower value is more aggressive. \code{lasso_thresh = 0.95}, for example, retains predictors that collectively explain at least 95% of the deviance explained by a "full" model. 12 | #' @param xmax Integer. Maximum number of predictors returned by LASSO step. Does not strictly control the number of final predictors returned (especially for categorical \code{y} variables), but useful for setting a (very) soft upper bound. Lower \code{xmax} can help control computation time if a large number of \code{x} pass the correlation screen. \code{xmax = Inf} imposes no restriction. 13 | #' @param xforce Character. Subset of \code{x} variables to "force" as included predictors in the results. 14 | #' @param fraction Numeric. Fraction of observations in \code{data} to randomly sample. For larger datasets, sampling often has minimal effect on results but speeds up computation. 15 | #' @param cores Integer. Number of cores used. Only applicable on Unix systems. 16 | #' 17 | #' @return List with named slots "y" and "x". Each is a list of the same length. Former gives the preferred fusion order. Latter gives the preferred sets of predictor variables. 18 | #' 19 | #' @examples 20 | #' y <- names(recs)[c(14:16, 20:22)] 21 | #' x <- names(recs)[2:13] 22 | #' 23 | #' # Fusion variable "blocks" are respected by prepXY() 24 | #' y <- c(list(y[1:2]), y[-c(1:2)]) 25 | #' 26 | #' # Do the prep work... 
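#' # (For large datasets, prepXY()'s 'fraction' argument subsamples rows to speed this step up)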
27 | #' prep <- prepXY(data = recs, y = y, x = x) 28 | #' 29 | #' # The result can be passed to train() 30 | #' train(data = recs, y = prep$y, x = prep$x) 31 | #' 32 | #' @export 33 | 34 | #----- 35 | 36 | # library(fusionModel) 37 | # library(data.table) 38 | # library(tidyverse) 39 | # source("R/utils.R") 40 | # 41 | # data <- select(recs, -starts_with("rep_")) 42 | # y <- names(recs)[c(13:16, 20:22)] 43 | # x <- setdiff(names(data), c(y, "weight")) 44 | # weight = "weight" 45 | # fraction = 1 46 | # cor_thresh = 0.05 47 | # lasso_thresh = 0.95 48 | # cores = 1 49 | # xmax = 5 50 | # xforce = NULL 51 | # 52 | # # Let y have a block 53 | # y <- c(list(y[1:2]), y[-c(1:2)]) 54 | # 55 | # test <- prepXY(data, y, x, weight = "weight") 56 | 57 | #----- 58 | 59 | prepXY <- function(data, 60 | y, 61 | x, 62 | weight = NULL, 63 | cor_thresh = 0.05, 64 | lasso_thresh = 0.95, 65 | xmax = 100, 66 | xforce = NULL, 67 | fraction = 1, 68 | cores = 1) { 69 | 70 | t0 <- Sys.time() 71 | 72 | # Create correct 'y' and 'ylist', depending on input type 73 | if (is.list(y)) { 74 | ylist <- y 75 | y <- unlist(ylist) 76 | } else { 77 | ylist <- as.list(y) 78 | } 79 | 80 | stopifnot(exprs = { 81 | is.data.frame(data) 82 | all(y %in% names(data)) 83 | all(x %in% names(data)) 84 | length(intersect(y, x)) == 0 85 | all(lengths(ylist) > 0) 86 | is.null(weight) | weight %in% names(data) 87 | is.null(xforce) | all(xforce %in% x) 88 | xmax >= 1 89 | fraction > 0 & fraction <= 1 90 | cor_thresh > 0 & cor_thresh <= 1 91 | lasso_thresh > 0 & lasso_thresh <= 1 92 | cores >= 1 & cores %% 1 == 0 93 | }) 94 | 95 | # TO DO: Make operations data.table for efficiency 96 | if (is.data.table(data)) data <- as.data.frame(data) 97 | 98 | # Check for character-type variables; stop with error if any detected 99 | # Check for no-variance (constant) variables 100 | # Detect and impute any missing values in 'x' variables 101 | data <- checkData(data = data, y = y, x = x, nfolds = NULL, impute = TRUE) 102 | 103 | # In case any variables are removed by checkData(), update 'x', 'y', and 'ylist' 104 | x <- intersect(x, names(data)) 105 | for (i in 1:length(ylist)) ylist[[i]] <- intersect(ylist[[i]], names(data)) 106 | ylist <- purrr::compact(ylist) 107 | y <- unlist(ylist) 108 | 109 | #--- 110 | 111 | # Observation weights vector 112 | W <- if (is.null(weight)) { 113 | rep(1L, nrow(data)) 114 | } else { 115 | data[[weight]] / mean(data[[weight]]) 116 | } 117 | set(data, j = weight, value = NULL) 118 | 119 | #----- 120 | 121 | # Which 'y' variables are zero-inflated? 122 | yinf <- names(which(sapply(data[y], inflated))) 123 | 124 | if (length(yinf)) { 125 | 126 | # Create '*_zero' versions of the zero-inflated variables 127 | dinf <- data[yinf] %>% 128 | mutate_all(~ .x != 0) %>% 129 | setNames(paste0(yinf, "_zero")) # Variable has "_zero" suffix, but it actually indicates when the original 'y' variable is NON-zero.
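    # Illustrative sketch (hypothetical values): a zero-inflated variable with values
    # c(0, 310, 0, 85) gets a "_zero" companion equal to c(FALSE, TRUE, FALSE, TRUE),
    # i.e. TRUE wherever the original value is non-zero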
130 | 131 | # Update the zero-inflated variables in data to have NA instead of zero values 132 | data <- data %>% 133 | cbind(dinf) 134 | 135 | # Update 'ylist' to include '*_zero' versions in block with original zero-inflated variables 136 | ylist <- lapply(ylist, function(v) if (any(v %in% yinf)) c(v, paste0(intersect(v, yinf), "_zero")) else v) 137 | y <- unlist(ylist) 138 | 139 | } 140 | 141 | #----- 142 | 143 | # Sample 'data', if requested 144 | if (fraction < 1) { 145 | samp <- sample.int(n = nrow(data), size = round(nrow(data) * fraction)) 146 | data <- data[samp, ] 147 | W <- W[samp] 148 | } 149 | 150 | #----- 151 | 152 | # Assemble the 'Z' matrix with all required variables 153 | 154 | X <- data[x] %>% 155 | mutate_if(is.ordered, as.integer) %>% 156 | mutate_if(is.logical, as.integer) %>% 157 | mutate_if(is.factor, lumpFactor, nmax = 5) %>% 158 | one_hot(dropUnusedLevels = TRUE) 159 | xlink <- attr(X, "one_hot_link") 160 | xcols <- names(X) 161 | 162 | Y <- data[y] %>% 163 | mutate_if(is.logical, as.integer) %>% 164 | mutate_if(is.factor, lumpFactor, nmax = 5) %>% 165 | one_hot(dropOriginal = TRUE, dropUnusedLevels = TRUE) 166 | ylink <- attr(Y, "one_hot_link") 167 | yfactor <- names(which(sapply(data[y], is.factor))) 168 | ycols <- names(Y) 169 | 170 | xylink <- rbind(xlink, ylink) 171 | rm(data) 172 | Z <- as.matrix(cbind(Y, X)) # Could make sparse? 173 | rm(X, Y) 174 | 175 | #----- 176 | 177 | # TO DO: Switch to using this code instead of 'X' and 'Y' blocks above 178 | # 8/9/24:TEST ALT 179 | # MOVE this could into separate function? It is also in impute() 180 | # d2 <- copy(data) 181 | # 182 | # # Convert 'd2' to plausible ranks for correlation screening 183 | # # All output columns should be NA, integer, or logical 184 | # # NA's in input are preserved in output 185 | # for (i in 1:ncol(d2)) { 186 | # z <- d2[[i]] 187 | # if (is.numeric(z)) { 188 | # # Ties method 'dense' ensures integer output with minimum of 1 and maximum of length(na.omit(z)) 189 | # z <- frank(z, ties.method = "dense", na.last = "keep") 190 | # } else { 191 | # if (is.ordered(z)) { 192 | # z <- as.integer(z) 193 | # } else { 194 | # if (!is.logical(z)) { 195 | # # Converts character and un-ordered factors to TRUE for the most-common (non-NA) value and FALSE otherwise 196 | # zt <- table2(z, na.rm = TRUE) 197 | # z <- z == names(which.max(zt)) 198 | # } 199 | # } 200 | # } 201 | # set(d2, j = i, value = z) 202 | # } 203 | 204 | #----- 205 | 206 | # intersect() call restricts to factor levels present in 'Z' (some levels can be dropped when 'data' is randomly subsampled) 207 | vc <- lapply(y, function(v) { 208 | out <- if (v %in% yfactor) { 209 | vl <- filter(ylink, original == v)$dummy 210 | intersect(vl, colnames(Z)) 211 | } else { 212 | v 213 | } 214 | return(out) 215 | }) %>% 216 | setNames(y) 217 | 218 | #----- 219 | 220 | # Determine the x-predictors that pass absolute correlation threshold for each y 221 | # The 'Zr' matrix contains the ranks, so the correlation threshold refers to Spearman (rank) correlation 222 | cat("Identifying 'x' that pass absolute Spearman correlation threshold\n") 223 | Zr <- matrixStats::colRanks(Z, ties.method = "average", preserveShape = TRUE, useNames = TRUE) 224 | xok <- parallel::mclapply(unlist(vc), function(v) { 225 | 226 | # Initial correlation screening, based on absolute correlation value 227 | p <- abs(suppressWarnings(cor(Zr[, v], Zr[, xcols], use = "pairwise.complete.obs"))) 228 | p[is.na(p)] <- 0 229 | vx <- which(p > cor_thresh) # Arbitrary correlation 
threshold 230 | 231 | # Ensure some minimum number of predictors are passed to glmnet() 232 | # If there are too few predictors, glmnet() may fail 233 | if (length(vx) < 20) vx <- order(p, decreasing = TRUE)[1:min(20, length(p))] 234 | 235 | return(xcols[vx]) 236 | 237 | }, mc.cores = cores) %>% 238 | setNames(unlist(vc)) 239 | rm(Zr) 240 | 241 | #----- 242 | 243 | # Wrapper function for fitting a glmnet LASSO model 244 | # Used repeatedly in looped calls below 245 | gfit <- function(y, x) { 246 | i <- if (y %in% yinf) Z[, y] != 0 else rep(TRUE, nrow(Z)) 247 | suppressWarnings({ 248 | glmnet::glmnet( 249 | x = Z[i, x], 250 | y = Z[i, y], 251 | weights = W[i], 252 | family = "gaussian", 253 | pmax = min(xmax, length(x)), 254 | alpha = 1) 255 | }) 256 | } 257 | 258 | # Testing with 'pmax' argument enabled 259 | # y = "square_feet" 260 | # x = setdiff(xcols, y) 261 | # m <- gfit(y, x) 262 | # i <- which(m$dev.ratio / max(m$dev.ratio) >= ifelse(m$jerr == 0, lasso_thresh, 1))[1] # Preferred lambda index value for each model fit, based on supplied lasso threshold 263 | # cf <- coef(m, s = m$lambda[i]) 264 | # sum(cf[, 1] != 0) 265 | 266 | #----- 267 | 268 | cat("Fitting full models for each 'y'\n") 269 | 270 | # Weights 271 | ywgt <- matrixStats::colWeightedMeans(Z, W, cols = ycols) 272 | 273 | # Fit the "full" models for each fusion variable/block 274 | rmax <- parallel::mclapply(ylist, function(yvar) { 275 | sapply(yvar, function(v) { 276 | V <- vc[[v]] # Column names of dummies in case where 'v' is a factor 277 | fits <- lapply(V, function(yv) gfit(y = yv, x = c(xok[[yv]], setdiff(ycols, c(yvar, V))))) 278 | r2 <- sapply(fits, function(m) max(m$dev.ratio)) 279 | if (length(r2) > 1) r2 <- sum(r2 * ywgt[V]) # Only applies weighting when 'r2' contains multiple values (i.e. unique value for each factor level within a variable) 280 | r2 281 | }) %>% 282 | mean() # Returns mean of R2 in case of multiple 'yvar' 283 | }, mc.cores = cores) %>% 284 | simplify2array() %>% 285 | setNames(ylist) 286 | 287 | #----- 288 | 289 | cat("Iteratively constructing preferred fusion order\n") 290 | 291 | # Start building the preferred fusion order... 292 | ord <- NULL # Vector with preferred fusion variable sequence 293 | xpred <- NULL 294 | 295 | # Print loop progress to console? 296 | for (i in 1:length(ylist)) { 297 | 298 | # Candidate y variables remaining to add to 'ord' 299 | ycand <- setdiff(ylist, ord) 300 | 301 | out <- parallel::mclapply(ycand, function(yvar) { 302 | 303 | # This wrapper is necessary to handle cases of blocked 'yvar' with 2+ fusion variables OR case of zero-inflated fusion variable with a "_zero" version included. 
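      # When 'yvar' is a block of multiple fusion variables, each member is modeled
      # separately below and the block's R-squared is taken as the mean across members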
304 | out2 <- lapply(yvar, function(v) { 305 | V <- vc[[v]] 306 | fits <- lapply(V, function(yv) { 307 | xx <- xok[[yv]] # x-predictors that pass minimum correlation threshold (or best-n to meet minimum number of predictors) 308 | m <- gfit(y = yv, x = c(xx, unlist(vc[unlist(ord)]))) 309 | i <- which(m$dev.ratio / max(m$dev.ratio) >= ifelse(m$jerr == 0, lasso_thresh, 1))[1] # Preferred lambda index value for each model fit, based on supplied lasso threshold 310 | r2 <- m$dev.ratio[i] 311 | cf <- coef(m, s = m$lambda[i]) 312 | xk <- names(which(Matrix::rowSums(cf != 0) > 0)[-1]) # Predictors with non-zero coefficients 313 | xx <- setdiff(xx, xk) # Remaining zero-coefficient x-predictors, in order of correlation preference 314 | if (length(xk) < 20 & length(xx) > 0) xk <- c(xk, xx[1:min(20 - length(xk), length(xx))]) # Adds zero-coefficient predictors to achieve some minimum number 315 | list(r2 = r2, xk = xk) 316 | }) 317 | 318 | # Extract R-squared for each model and calculate weighted mean across factor levels, if necessary 319 | r2 <- sapply(fits, function(m) m$r2) 320 | if (length(r2) > 1) r2 <- sum(r2 * ywgt[V]) 321 | 322 | # Extract full set of useful x-predictors 323 | xk <- lapply(fits, function(m) m$xk) %>% 324 | unlist() %>% 325 | unique() 326 | 327 | # Return R2 and x-predictor results 328 | list(r2 = r2, xk = xk) 329 | 330 | }) 331 | 332 | #--- 333 | 334 | # Handle blocked 'v' case (combine results across individual variables) 335 | out2 <- if (length(out2) > 1) { 336 | list(r2 = mean(sapply(out2, function(x) x$r2)), 337 | xk = unique(unlist(lapply(out2, function(x) x$xk)))) 338 | } else { 339 | out2[[1]] 340 | } 341 | 342 | return(out2) 343 | 344 | }, mc.cores = cores) %>% 345 | setNames(ycand) 346 | 347 | r2 <- sapply(out, function(m) m$r2) 348 | score <- r2 / rmax[names(r2)] 349 | best <- which.max(score) 350 | 351 | # Update 'ord' with best next fusion variable(s) in the chain 352 | ord <- c(ord, ycand[best]) 353 | 354 | # Extract the predictor variables to be used for 'best' fusion variable 355 | keep <- out[[best]]$xk 356 | # if (!is.null(xlink)) { 357 | # i <- keep %in% xlink$dummy 358 | # keep[i] <- filter(xlink, dummy %in% keep[i])$original 359 | if (!is.null(xylink)) { 360 | # i <- keep %in% xylink$dummy 361 | # keep[i] <- filter(xylink, dummy %in% keep[i])$original 362 | i <- xylink$original[match(keep, xylink$dummy)] 363 | i[is.na(i)] <- keep[is.na(i)] 364 | keep <- i 365 | } 366 | keep <- unique(keep) 367 | #keep <- setdiff(keep, ycols) # Remove any fusion variables from the preferred predictor set 368 | xpred <- c(xpred, list(keep)) 369 | 370 | } 371 | 372 | #--- 373 | 374 | # Remove any "*_zero" variables from the 'ord' result or 'xpred' results 375 | ord <- lapply(ord, function(v) setdiff(v, paste0(yinf, "_zero"))) 376 | xpred <- lapply(xpred, function(v) setdiff(v, paste0(yinf, "_zero"))) 377 | 378 | # Force inclusion of 'xforce' predictor variables 379 | xpred <- lapply(xpred, function(v) unique(c(v, xforce))) 380 | 381 | # Nicely name and order the x-predictors list in order of original 'x' 382 | # names(xpred) <- sapply(ord, paste, collapse = " | ") 383 | # xpred <- lapply(xpred, function(v) v[order(match(v, x))]) 384 | 385 | # Results list 386 | result <- list(y = ord, x = xpred) 387 | 388 | # The full set of variables being retained, stored as attribute 389 | pvars <- unique(unlist(xpred)) 390 | #stopifnot(all(pvars %in% x)) 391 | attr(result, "xpredictors") <- intersect(x, pvars) 392 | attr(result, "xforce") <- xforce 393 | attr(result, "xoriginal") <- x # 
Original, full set of potential predictor variables 394 | #cat("Retained", length(pvars), "of", length(x), "potential predictor variables\n") 395 | cat("Retained", length(intersect(x, pvars)), "of", length(x), "potential predictor variables\n") 396 | 397 | # Report processing time 398 | tout <- difftime(Sys.time(), t0) 399 | cat("Total processing time:", signif(as.numeric(tout), 3), attr(tout, "units"), "\n", sep = " ") 400 | 401 | return(result) 402 | 403 | } 404 | -------------------------------------------------------------------------------- /R/plot_valid.R: -------------------------------------------------------------------------------- 1 | #' Plot validation results 2 | #' 3 | #' @description 4 | #' Creates and optionally saves to disk representative plots of validation results returned by \code{\link{validate}}. Requires the suggested \code{\link{ggplot2}} package. This function is (by default) called within \code{\link{validate}}. Can be useful on its own to save graphics to disk or generate plots for a subset of fusion variables. 5 | #' 6 | #' @param valid Object returned by \code{\link{validate}}. 7 | #' @param y Character. Fusion variables to use for validation graphics. Useful for plotting partial validation results. Default is to use all fusion variables present in \code{valid}. 8 | #' @param path Character. Path to directory where .png graphics are to be saved. Directory is created if necessary. If NULL (default), no files are saved to disk. 9 | #' @param cores Integer. Number of cores used. Only applicable on Unix systems. 10 | #' @param ... Arguments passed to \code{\link[ggplot2]{ggsave}} to control .png graphics saved to disk. 11 | #' 12 | #' @details Validation results are visualized to convey expected, typical (median) performance of the fusion variables. That is, how well do the simulated data match the observed data with respect to point estimates and confidence intervals for population subsets of various size? 13 | #' 14 | #' Plausible error metrics are derived from the input validation data for plotting. For comparison of point estimates, the error metric is absolute percent error for continuous variables; in the categorical case it is absolute error scaled such that the maximum possible error is 1. Since these metrics are not strictly comparable, the all-variable plots denote categorical fusion variables with dotted lines. 15 | #' 16 | #' For a given fusion variable, the error metric will exhibit variation (often quite skewed) even for subsets of comparable size, due to the fact that each subset looks at a unique partition of the data. In order to convey how expected, typical performance varies with subset size, the smoothed median error conditional on subset size is approximated and plotted. 17 | #' 18 | #' @return A list with "plots", "perf", "smooth", and "data" slots. The "plots" slot contains the following \code{\link[ggplot2]{ggplot}} objects: 19 | #' \itemize{ 20 | #' \item est: Comparison of point estimates (median absolute percent error). \item vad: Value-added relative to naive estimates (median value-added). 21 | #' \item moe: Comparison of 90% margin of error (median ratio of simulated-to-observed MOE). 22 | #' \item Additional named slots (one for each of the fusion variables) contain the plots described above with scatterplot results. 23 | #' } 24 | #' "perf" is a data frame of average smoothed performance metrics across the range of subset sizes. "smooth" is a data frame with the plotting values used to produce the smoothed median plots. 25 | #' "data" is a data frame with the complete validation results as returned by the original call to \code{\link{validate}}.
26 | #' 27 | #' @examples 28 | #' # Build a fusion model using RECS microdata 29 | #' # Note that "fusion_model.fsn" will be written to working directory 30 | #' fusion.vars <- c("electricity", "natural_gas", "aircon") 31 | #' predictor.vars <- names(recs)[2:12] 32 | #' fsn.path <- train(data = recs, 33 | #' y = fusion.vars, 34 | #' x = predictor.vars, 35 | #' weight = "weight") 36 | #' 37 | #' # Fuse back onto the donor data (multiple implicates) 38 | #' sim <- fuse(data = recs, 39 | #' fsn = fsn.path, 40 | #' M = 30) 41 | #' 42 | #' # Calculate validation results but do not generate plots 43 | #' valid <- validate(observed = recs, 44 | #' implicates = sim, 45 | #' subset_vars = c("income", "education", "race", "urban_rural"), 46 | #' weight = "weight", 47 | #' plot = FALSE) 48 | #' 49 | #' # Create validation plots 50 | #' valid <- plot_valid(valid) 51 | #' 52 | #' # View some of the plots 53 | #' valid$plots$est 54 | #' valid$plots$moe 55 | #' valid$plots$electricity$moe 56 | #' 57 | #' # Can also save the plots to disk at creation 58 | #' # Will save .png files to 'valid_plots' folder in working directory 59 | #' # Note that it is fine to pass a 'valid' object with existing $plots slot 60 | #' # In that case, the plots are simply re-generated 61 | #' vplots <- plot_valid(valid, 62 | #' path = file.path(getwd(), "valid_plots"), 63 | #' width = 8, height = 6) 64 | #' 65 | #' @export 66 | 67 | #----- 68 | 69 | # From ?validate example 70 | # library(fusionModel) 71 | # library(dplyr) 72 | # library(data.table) 73 | # source("R/utils.R") 74 | # 75 | # fusion.vars <- c("electricity", "natural_gas", "aircon") 76 | # predictor.vars <- names(recs)[2:12] 77 | # fsn.path <- train(data = recs, 78 | # y = fusion.vars, 79 | # x = predictor.vars, 80 | # weight = "weight") 81 | # # Fuse back onto the donor data (multiple implicates) 82 | # sim <- fuse(data = recs, 83 | # fsn = fsn.path, 84 | # M = 40) 85 | # 86 | # valid <- validate(observed = recs, 87 | # implicates = sim, 88 | # subset_vars = c("income", "education", "race", "urban_rural"), 89 | # weight = "weight") 90 | 91 | #------------------ 92 | 93 | plot_valid <- function(valid, 94 | y = NULL, 95 | path = NULL, 96 | cores = 1, 97 | ...)
{ 98 | 99 | # Check if ggplot is installed 100 | suppressMessages(ok <- require(ggplot2, quietly = TRUE, warn.conflicts = FALSE)) 101 | if (!ok) stop("package 'ggplot2' must be installed for plot_valid() to work") 102 | 103 | if (inherits(valid, "validate")) { 104 | if (!is.data.frame(valid)) valid <- valid$data # This forces 'valid' to just the data frame with validation results 105 | } else { 106 | stop("'valid' must be an object generated by validate()") 107 | } 108 | 109 | # Check inputs 110 | stopifnot({ 111 | all(y %in% valid$y) 112 | is.null(path) | is.character(path) 113 | }) 114 | 115 | # Create 'directory', if necessary 116 | if (!is.null(path)) { 117 | path <- normalizePath(path, mustWork = FALSE) 118 | if (!dir.exists(path)) dir.create(path, recursive = TRUE) 119 | } 120 | 121 | #--- 122 | 123 | # Restrict fusion (y) variables, if requested 124 | if (!is.null(y)) valid <- filter(valid, y %in% !!y) 125 | 126 | #--- 127 | 128 | # Confidence interval overlap 129 | # See Compare.CI() here: https://github.com/cran/synthpop/blob/master/R/compare.syn.r 130 | # CIoverlap(10, 20, 5, 25) 131 | # CIoverlap <- function(lwr_obs, upr_obs, lwr_sim, upr_sim) { 132 | # L <- pmax(lwr_obs, lwr_sim) 133 | # U <- pmin(upr_obs, upr_sim) 134 | # 0.5 * (((U - L) / (upr_obs - lwr_obs)) + ((U - L) / (upr_sim - lwr_sim))) 135 | # } 136 | 137 | #--- 138 | 139 | errorFun <- function(obs, sim) { 140 | # out <- suppressWarnings({ 141 | # exp(abs(log(sim / obs))) - 1 # Morley 2018 preferred error metric (zeta); will be NA if ratio is negative and Inf if 'obs' is zero; https://agupubs.onlinelibrary.wiley.com/doi/full/10.1002/2017SW001669 142 | # }) 143 | out <- abs((sim - obs) / obs) # Conventional absolute percent error (gives Inf when obs = 0) 144 | #out <- ifelse(!is.finite(out), ape, out) # If Morley's zeta is non-finite, replace with APE 145 | out[!is.finite(out)] <- NA # If non-finite, return NA 146 | out[obs == sim] <- 0 # If 'obs' and 'sim' are identical, return zero error 147 | return(out) 148 | } 149 | 150 | # Calculate the error metrics for plotting 151 | vest <- valid %>% 152 | mutate( 153 | cont = is.na(level), 154 | 155 | # Point estimate error 156 | est = errorFun(obs = est.obs, sim = est.sim), 157 | 158 | # Relative uncertainty ratio (sim / obs) 159 | moe = (moe.sim / est.sim) / (moe.obs / est.obs), 160 | moe = ifelse(moe.sim == moe.obs, 1, moe), 161 | moe = ifelse(is.infinite(moe), NA, moe), 162 | 163 | # Point estimate value-added 164 | vad = 1 - abs(est.sim - est.obs) / abs(est.mean - est.obs), 165 | vad = ifelse(est.mean == est.obs, 0, vad), 166 | vad = pmax(0, vad) 167 | 168 | ) %>% 169 | # group_by(id, share, y, cont) %>% 170 | # summarize_at(c("est", "moe", "vad"), .funs = mean) %>% # Collapses categorical levels to mean of performance metrics 171 | # ungroup() %>% 172 | mutate_if(is.numeric, ~ ifelse(is.finite(.x), .x, NA)) # Sets any non-finite values to NA 173 | 174 | #--- 175 | 176 | # Conditional median of the error metrics 177 | cat("Smoothing validation metrics\n") 178 | y <- unique(vest$y) 179 | qest <- parallel::mclapply(y, function(v) { 180 | 181 | d <- filter(vest, y == v) 182 | 183 | est <- smoothQuantile(x = d$share, y = d$est, qu = 0.5) 184 | #est <- smoothMean(x = d$share, y = d$est, verbose = FALSE) 185 | 186 | moe <- smoothQuantile(x = d$share, y = d$moe, qu = 0.5) 187 | #moe <- smoothMean(x = d$share, y = d$moe, verbose = FALSE) 188 | 189 | vad <- smoothQuantile(x = d$share, y = d$vad, qu = 0.5) 190 | 191 | out <- rbindlist(list(est = est, moe = moe, vad = vad), idcol = 
"metric", fill = TRUE) 192 | out[, CAT := !d$cont[1]] 193 | out[, VAR := v] 194 | setnames(out, "x", "SHR") 195 | return(out) 196 | 197 | }, mc.cores = cores) %>% 198 | data.table::rbindlist() 199 | 200 | # Drop invalid rows -- cases where the returned smoothed value is infeasible 201 | # qest <- qest %>% 202 | # filter(y >= 0 | (y <= 1 & metric == "vad")) 203 | 204 | # qest <- qest %>% 205 | # mutate(y = ifelse(metric %in% c("est", "moe"), pmax(0, y), pmin(1, y))) 206 | 207 | # Ensure feasible smoothed values 208 | 209 | min.max <- vest %>% 210 | group_by(y) %>% 211 | summarize_at(c("est", "vad", "moe"), list(min = min, max = max), na.rm = TRUE) %>% 212 | rename(VAR = y) 213 | 214 | qest <- qest %>% 215 | left_join(min.max, by = "VAR") %>% 216 | mutate(y = ifelse(metric == "est", pmax(pmin(y, est_max), est_min), y), 217 | y = ifelse(metric == "moe", pmax(pmin(y, moe_max), moe_min), y), 218 | y = ifelse(metric == "vad", pmax(pmin(y, vad_max), vad_min), y)) %>% 219 | select(metric:VAR) 220 | 221 | #--- 222 | 223 | # Summary performance metrics (print to console) 224 | 225 | sum.perf <- dcast(qest, ... ~ metric, value.var = "y") %>% 226 | group_by(VAR) %>% 227 | summarize_at(c("est", "vad", "moe"), mean) %>% 228 | rename(y = VAR) 229 | cat("Average smoothed performance metrics across subset range:\n") 230 | print(as.data.frame(sum.perf), digits = 3, print.gap = 2) 231 | 232 | #--- 233 | 234 | # Restrict the smooth results to x-range available for all of the variables 235 | # This ensures that the plots have a same/consistent x-axis for each of the y variables 236 | 237 | xrng <- range(qest$SHR) 238 | # xrng <- qest %>% 239 | # group_by(VAR) %>% 240 | # summarize(min = min(SHR), max = max(SHR), .groups = "drop") %>% 241 | # summarize(min = max(min), max = min(max)) %>% 242 | # unlist() 243 | 244 | qest <- qest %>% 245 | filter(SHR >= xrng[1], SHR <= xrng[2]) 246 | 247 | #--- 248 | 249 | # Create plot objects 250 | cat("Creating ggplot2 graphics", ifelse(is.null(path), "", "and saving .png files to disk"), "\n") 251 | 252 | # X-axis definition 253 | b <- c(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1) # Nice break marks on square root scale 254 | b <- b[max(1, which(b <= xrng[1])[1], na.rm = TRUE) : which(b >= xrng[2])[1]] 255 | pct <- function(x) paste0(as.character(x * 100), "%") 256 | xaxis <- scale_x_continuous(name = "Subset size (percent of total population)", 257 | limits = xrng, breaks = b, trans = "sqrt", labels = pct) 258 | 259 | # ggplot elements to add to all plots 260 | plot.all <- list(xaxis, 261 | theme_bw(), 262 | theme(plot.title = element_text(face = "bold")), 263 | guides(color = guide_legend(title = "Fusion variable"), 264 | linetype = guide_legend(title = "Categorical"))) 265 | 266 | #--- 267 | 268 | # ggplot elements to add to each specific plot 269 | p1.add <- list(scale_y_continuous(name = "Median absolute percent error", n.breaks = 8, limits = c(0, NA), expand = expansion(mult = c(0.01, 0.05)), labels = pct), 270 | labs(subtitle = "Comparison of point estimates")) 271 | 272 | p2.add <- list(scale_y_continuous(name = "Median value-added", n.breaks = 8, limits = c(0, 1), expand = expansion(mult = c(0.01, 0.05))), 273 | geom_hline(yintercept = 1, linetype = 2), 274 | labs(subtitle = "Value-added relative to naive estimates")) 275 | 276 | p3.add <- list(scale_y_continuous(name = "Median ratio of simulated-to-observed uncertainty", n.breaks = 8, limits = c(0, NA), expand = expansion(mult = c(0.01, 0.05))), 277 | geom_hline(yintercept = 1, linetype = 2), 278 | 
labs(subtitle = "Comparison of relative uncertainty (MOE / estimate)")) 279 | 280 | 281 | # p3.add <- list(scale_y_continuous(name = "Median ratio of simulated-to-observed MOE", n.breaks = 8, limits = c(0, NA), expand = expansion(mult = c(0.01, 0.05))), 282 | # geom_hline(yintercept = 1, linetype = 2), 283 | # labs(subtitle = "Bias in 90% margin of error (MOE)")) 284 | 285 | #--- 286 | 287 | # Multi-variable plots 288 | # NOTE: Set 'linetype = CAT' to indicate whether each 'y' is continuous or categorical 289 | p1 <- ggplot(filter(qest, metric == "est"), aes(x = SHR, y = y, color = VAR, linetype = NULL)) + p1.add + plot.all + geom_line() 290 | p2 <- ggplot(filter(qest, metric == "vad"), aes(x = SHR, y = y, color = VAR, linetype = NULL)) + p2.add + plot.all + geom_line() 291 | p3 <- ggplot(filter(qest, metric == "moe"), aes(x = SHR, y = y, color = VAR, linetype = NULL)) + p3.add + plot.all + geom_line() 292 | #p3 <- ggplot(filter(qest, metric == "bias"), aes(x = SHR, y = y, color = VAR, linetype = NULL)) + p3.add + plot.all + geom_line() 293 | 294 | out1 <- list(p1, p2, p3) 295 | names(out1) <- c("est", "vad", "moe") 296 | 297 | if (!is.null(path)) { 298 | for (i in names(out1)) { 299 | suppressMessages(ggsave(filename = paste0("allvars_", i, ".png"), plot = out1[[i]], path = path, ...)) 300 | } 301 | } 302 | 303 | #--- 304 | 305 | # Single variable scatterplots 306 | 307 | out2 <- lapply(y, function(v) { 308 | 309 | fct <- 10 # Controls extent of extreme value inclusion in scatterplots 310 | pdata <- filter(qest, VAR == v) 311 | ed <- filter(vest, y == v, share >= xrng[1], share <= xrng[2]) 312 | 313 | r <- median(ed$est, na.rm = TRUE) + c(-1, 1) * fct * mad(ed$est, na.rm = TRUE) 314 | p1 <- ggplot(filter(pdata, metric == "est"), aes(x = SHR, y = y)) + 315 | geom_point(data = filter(ed, est >= r[1], est <= r[2]), 316 | aes(x = share, y = est), shape = 1) + 317 | #geom_point(data = filter(ed, error_est < max(pdata$est + ofct * pdata$peSE)), aes(x = share, y = error_pe), shape = 1) + 318 | # geom_ribbon(aes(ymin = pmax(pe - peSE * qt(0.975, DFpe), 0), 319 | # ymax = pmin(pe + peSE * qt(0.975, DFpe), ifelse(categorical, 1, Inf))), 320 | # fill = "gray", alpha = 0.25, color = "red", linetype = "dotted") + 321 | p1.add + plot.all + labs(title = v) + geom_line(color = "red") 322 | 323 | r <- c(0, 1) 324 | p2 <- ggplot(filter(pdata, metric == "vad"), aes(x = SHR, y = y)) + 325 | geom_point(data = filter(ed, vad >= r[1], vad <= r[2]), 326 | aes(x = share, y = vad), shape = 1) + 327 | p2.add + plot.all + labs(title = v) + geom_line(color = "red") 328 | 329 | r <- median(ed$moe, na.rm = TRUE) + c(-1, 1) * fct * mad(ed$moe, na.rm = TRUE) 330 | p3 <- ggplot(filter(pdata, metric == "moe"), aes(x = SHR, y = y)) + 331 | geom_point(data = filter(ed, moe >= r[1], moe <= r[2]), 332 | aes(x = share, y = moe), shape = 1) + 333 | p3.add + plot.all + labs(title = v) + geom_line(color = "red") 334 | 335 | # r <- median(ed$bias) + c(-1, 1) * fct * mad(ed$bias) 336 | # p3 <- ggplot(filter(pdata, metric == "bias"), aes(x = SHR, y = y)) + 337 | # geom_point(data = filter(ed, bias >= r[1], bias <= r[2]), 338 | # aes(x = share, y = bias), shape = 1) + 339 | # p3.add + plot.all + labs(title = v) + geom_line(color = "red") 340 | 341 | out <- list(p1, p2, p3) 342 | names(out) <- c("est", "vad", "moe") 343 | 344 | # Save plots to disk 345 | if (!is.null(path)) { 346 | for (i in names(out)) { 347 | suppressMessages(ggsave(filename = paste0(v, "_", i, ".png"), plot = out[[i]], path = path, ...)) 348 | } 349 | } 350 | 351 | 
return(out) 352 | 353 | }) %>% 354 | setNames(y) 355 | 356 | #----- 357 | 358 | # Report location of output plots, if requested 359 | if (!is.null(path)) cat("Plots saved to:", path, "\n") 360 | 361 | # Assemble final result 362 | result <- list(plots = c(out1, out2), 363 | perf = sum.perf, 364 | smooth = qest, 365 | data = valid) 366 | if (!inherits(valid, "validate")) class(result) <- c("validate", class(result)) 367 | return(result) 368 | 369 | } 370 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "fusionModel" 3 | subtitle: "Synthetic data fusion and analysis in R" 4 | author: Kevin Ummel ([ummel@berkeley.edu](mailto:ummel@berkeley.edu)) 5 | output: 6 | github_document: 7 | toc: true 8 | toc_depth: 2 9 | html_preview: true 10 | --- 11 | 12 | ```{r setup, include=FALSE} 13 | knitr::opts_chunk$set( 14 | echo = TRUE, 15 | cache = TRUE, # Useful for testing/editing; will create /README_cache directory 16 | comment = NA, 17 | fig.path = "man/figures/README-" 18 | ) 19 | ``` 20 | 21 | # Overview 22 | 23 | **fusionModel** enables variables unique to a "donor" dataset to be statistically simulated for (i.e. *fused to*) a "recipient" dataset. Variables common to both the donor and recipient are used to model and simulate the fused variables. The package provides a simple and efficient interface for general data fusion in *R*, leveraging state-of-the-art machine learning algorithms from Microsoft's [LightGBM](https://lightgbm.readthedocs.io/en/latest/) framework. It also provides tools for analyzing synthetic/simulated data, calculating uncertainty, and validating fusion output. 24 | 25 | fusionModel was developed to allow statistical integration of microdata from disparate social surveys. It is the data fusion workhorse underpinning the larger fusionACS data platform under development at the [Socio-Spatial Climate Collaborative](https://sc2.berkeley.edu/fusionacs-people/). In this context, fusionModel is used to fuse variables from a range of social surveys onto microdata from the American Community Survey, allowing for analysis and spatial resolution otherwise impossible. 26 | 27 | # Motivation 28 | 29 | The desire to “fuse” or otherwise integrate independent datasets has a long history, dating to at least the early 1970’s ([Ruggles and Ruggles 1974](https://www.nber.org/system/files/chapters/c10115/c10115.pdf); [Alter 1974](https://www.nber.org/system/files/chapters/c10116/c10116.pdf)). Social scientists have long recognized that large amounts of unconnected data are “out there” – usually concerning the characteristics of households and individuals (i.e. microdata) – which we would, ideally, like to integrate and analyze as a whole. This aim falls under the general heading of “Statistical Data Integration” (SDI) ([Lewaa et al. 2021](https://content.iospress.com/articles/statistical-journal-of-the-iaos/sji210835)). 30 | 31 | The most prominent examples of data fusion have involved administrative record linkage. This consists of exact matching or probabilistic linking of independent datasets, using observable information like social security numbers, names, or birth dates of individuals. Record linkage is the gold standard and can yield incredibly important insights and high levels of statistical confidence, as evidenced by the pioneering work of [Raj Chetty](https://opportunityinsights.org/team/raj-chetty/) and colleagues. 
32 | 33 | However, record linkage is rarely feasible for the kinds of microdata that most researchers use day-to-day (nevermind the difficulty of accessing administrative data). While the explosion of online tracking and social network data will undoubtedly offer new lines of analysis, for the time being, at least, social survey microdata remain indispensable. The challenge and promise recognized 50 years ago by Nancy and Richard Ruggles remains true today: 34 | 35 | >Unfortunately, no single microdata set contains all of the different kinds of information required for the problems which the economist wishes to analyze. Different microdata sets contain different kinds of information…A great deal of information is collected on a sample basis. Where two samples are involved the probability of the same individual appearing in both may be very small, so that exact matching is impossible. Other methods of combining the types of information contained in the two different samples into one microdata set will be required. (Ruggles and Ruggles 1974; 353-354) 36 | 37 | Practitioners regularly impute or otherwise predict a variable or two from one dataset on to another. Piecemeal, *ad hoc* data fusion is a common necessity of quantitative research. Proper data fusion, on the other hand, seeks to systematically combine “two different samples into one microdata set”. 38 | 39 | The size and nature of the samples involved and the intended analyses strongly influence the choice of data integration technique and the structure of the output. This has led to the relevant literature being both diverse and convoluted, as practitioners take on different data “setups” and objectives. In the context of fusionACS, we are interested in the following problem: 40 | 41 | We have microdata from two independent surveys, A and B, that sample the same underlying population and time period (e.g. occupied U.S. households nationwide in 2018). We specify that A is the “recipient” dataset and B is the “donor”. The goal is to generate a new dataset, C, that has the original survey responses of A plus a realistic representation of how each respondent in A might have answered the questionnaire of survey B. To do this, we identify a set of common/shared variables X that both surveys solicit. We then attempt to fuse a set of variables unique to B – call them Z, the “fusion variables” – onto the original microdata of A, conditional on X. 42 | 43 | # Methodology 44 | 45 | The fusion strategy implemented in the fusionModel package borrows and expands upon ideas from the statistical matching ([D’Orazio et al. 2006](https://onlinelibrary.wiley.com/doi/book/10.1002/0470023554)), imputation ([Little and Rubin 2019](https://onlinelibrary.wiley.com/doi/book/10.1002/9781119482260)), and data synthesis ([Drechsler 2011](https://link.springer.com/book/10.1007/978-1-4614-0326-5)) literatures to create a flexible data fusion tool. It employs variable-*k*, conditional expectation matching that leverages high-performance gradient boosting algorithms. The package accommodates fusion of many variables, individually or in blocks, and efficient computation when the recipient is large relative to the donor. 46 | 47 | Specifically, the goal was to create a data fusion tool that meets the following requirements: 48 | 49 | 1. Accommodate donor and recipient datasets with divergent sample sizes 50 | 2. Handle continuous, categorical, and semi-continuous (zero-inflated) variable types 51 | 3. Ensure realistic values for fused variables 52 | 4. 
Scale efficiently for larger datasets 53 | 5. Fuse variables “one-by-one” or in “blocks” 54 | 6. Employ a data modeling approach that: 55 | + Makes no distributional assumptions (i.e. non-parametric) 56 | + Automatically detects non-linear and interaction effects 57 | + Automatically selects predictor variables from a potentially large set 58 | + Ability to prevent overfitting (e.g. cross-validation) 59 | 60 | Complete methodological details are available in the fusionACS Guidebook (INSERT LINK). 61 | 62 | # Installation 63 | 64 | ```r 65 | devtools::install_github("ummel/fusionModel") 66 | ``` 67 | 68 | ```{r} 69 | library(fusionModel) 70 | ``` 71 | 72 | # Simple fusion 73 | 74 | ```{r, echo = FALSE, results = 'hide'} 75 | suppressMessages(suppressWarnings(library(tidyverse))) 76 | suppressMessages(suppressWarnings(library(fst))) # Just to avoid messages when it loads within code 77 | ``` 78 | 79 | The package includes example microdata from the [2015 Residential Energy Consumption Survey](https://www.eia.gov/consumption/residential/data/2015/) (see `?recs` for details). For real-world use cases, the donor and recipient data are typically independent and vary in sample size. For illustrative purposes, we will randomly split the `recs` microdata into separate "donor" and "recipient" datasets with an equal number of observations. 80 | 81 | ```{r} 82 | # Rows to use for donor dataset 83 | d <- seq(from = 1, to = nrow(recs), by = 2) 84 | 85 | # Create donor and recipient datasets 86 | donor <- recs[d, c(2:16, 20:22)] 87 | recipient <- recs[-d, 2:14] 88 | 89 | # Specify fusion and shared/common predictor variables 90 | predictor.vars <- names(recipient) 91 | fusion.vars <- setdiff(names(donor), predictor.vars) 92 | ``` 93 | 94 | The `recipient` dataset contains `r ncol(recipient)` variables that are shared with `donor`. These shared "predictor" variables provide a statistical link between the two datasets. fusionModel exploits the information in these shared variables. 95 | 96 | ```{r} 97 | predictor.vars 98 | ``` 99 | 100 | There are `r length(fusion.vars)` "fusion variables" unique to `donor`. These are the variables that will be fused to `recipient`. This includes a mix of continuous and categorical (factor) variables. 101 | 102 | ```{r} 103 | # The variables to be fused 104 | sapply(donor[fusion.vars], class) 105 | ``` 106 | We create a fusion model using the `train()` function. The minimal usage is shown below. See `?train` for additional function arguments and options. By default, this results in a ".fsn" (fusion) object being saved to "fusion_model.fsn" in the current working directory. 107 | 108 | ```{r} 109 | # Train a fusion model 110 | fsn.model <- train(data = donor, 111 | y = fusion.vars, 112 | x = predictor.vars) 113 | ``` 114 | To fuse variables to `recipient`, we simply pass the recipient data and path of the .fsn model to the `fuse()` function. Each variable specified in `fusion.vars` is fused in the order provided. By default, `fuse()` generates a single implicate (version) of synthetic outcomes. Later, we'll work with multiple implicates to perform proper analysis and uncertainty estimation. 115 | 116 | ```{r} 117 | # Fuse 'fusion.vars' to the recipient 118 | sim <- fuse(data = recipient, 119 | fsn = fsn.model) 120 | ``` 121 | Let's look at the recipient dataset's fused/simulated variables. Note that your results will look different, because each call to `fuse()` generates a unique, probabilistic set of outcomes.
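If you need reproducible output, one option is to set R's random seed immediately before calling `fuse()`. This is only a sketch and assumes the simulation draws use R's random number generator; see `?fuse` for any seed-related options the function itself may expose.

```r
# Illustrative only: fix the seed so repeated calls return identical draws
set.seed(123)
sim <- fuse(data = recipient, fsn = fsn.model)
```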
122 | 123 | ```{r} 124 | head(sim) 125 | ``` 126 | 127 | We can do some quick sanity checks to compare the distribution of the fusion variables in `donor` with those in `sim`. This, at least, confirms that the fusion output is not obviously wrong. Later, we'll perform a formal internal validation exercise using multiple implicates. 128 | 129 | ```{r} 130 | sim <- data.frame(sim) 131 | 132 | # Compare means of the continuous variables 133 | cbind(donor = colMeans(donor[fusion.vars[3:5]]), sim = colMeans(sim[fusion.vars[3:5]])) 134 | 135 | # Compare frequencies of categorical variable classes 136 | cbind(donor = table(donor$insulation), sim = table(sim$insulation)) 137 | cbind(donor = table(donor$aircon), sim = table(sim$aircon)) 138 | 139 | ``` 140 | And we can look at kernel density plots of the non-zero values for the continuous variables to see if the univariate distributions in `donor` are generally similar to those in `sim`. 141 | 142 | ```{r, echo = FALSE} 143 | 144 | # Univariate naturally continuous case 145 | # Compare density plots for select continuous variables 146 | 147 | # Create 'pdata' data frame for subsequent plots 148 | #rec <- mutate(cbind(recipient, sim), dataset = "fused") 149 | fsd <- mutate(sim, dataset = "fused") 150 | don <- mutate(donor, dataset = "donor") 151 | pdata <- bind_rows(don, fsd) 152 | 153 | # The "natural" continuous variables (SPECIFY MANUALLY) 154 | ncont <- c("square_feet", "electricity", "natural_gas") 155 | 156 | # Density plots 157 | pdata[c("dataset", ncont)] %>% 158 | pivot_longer(cols = -1L) %>% 159 | filter(value != 0) %>% 160 | ggplot(aes(x = value, color = dataset)) + 161 | geom_density(size = 1) + 162 | theme(legend.position = "top") + 163 | facet_wrap(~ name, scales = "free") 164 | #ggtitle("Distribution of continuous variables (non-zero values)") 165 | 166 | ``` 167 | 168 | # Advanced fusion 169 | 170 | For this call to `train()`, we specify a set of hyperparameters to search over when training each LightGBM gradient boosting model (see `?train` for details). The hyperparameters can be used to tune the underlying GBM models for better cross-validated performance. We also set `nfolds = 10` (default is 5) to indicate the number of cross-validation folds to use. Since this requires additional computation, the `cores` argument is used to enable parallel processing. 171 | 172 | ```{r} 173 | # Train a fusion model with hyperparameter tuning and 10-fold cross-validation 174 | fsn.model <- train(data = donor, 175 | y = fusion.vars, 176 | x = predictor.vars, 177 | nfolds = 10, 178 | hyper = list(boosting = c("gbdt", "goss"), 179 | num_leaves = c(10, 30), 180 | feature_fraction = c(0.7, 0.9)), 181 | cores = 2) 182 | ``` 183 | We generally want to create multiple versions of the simulated fusion variables -- called *implicates* -- in order to reduce bias in point estimates and calculate associated uncertainty. We can do this using the `M` argument within `fuse()`. Here we generate 10 implicates; i.e. 10 unique, probabilistic representations of what the recipient records might look like with respect to the fusion variables. 184 | 185 | ```{r} 186 | # Fuse multiple implicates to the recipient 187 | sim10 <- fuse(data = recipient, 188 | fsn = fsn.model, 189 | M = 10) 190 | ``` 191 | Note that each implicate in `sim10` is identified by the "M" variable/column. 192 | 193 | ```{r} 194 | head(sim10) 195 | table(sim10$M) 196 | ``` 197 | 198 | # Analyzing fused data 199 | 200 | The fused values are inherently probabilistic, reflecting uncertainty in the underlying statistical models. Multiple implicates are needed to calculate unbiased point estimates and associated uncertainty for any particular analysis of the data. In general, more implicates are preferable but require more computation. 201 |
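To make this concrete, the sketch below shows one standard way of combining an estimate across implicates (Rubin-style pooling rules): the pooled point estimate is the mean of the implicate-specific estimates, and the total variance adds the average within-implicate variance to the between-implicate variance inflated by a factor of 1 + 1/M. The vectors `est` and `se` are hypothetical inputs (one estimate and standard error per implicate), and the exact formulas used internally by `analyze()` may differ.

```r
# Minimal sketch of Rubin-style pooling across M implicates (illustrative only)
pool <- function(est, se) {
  M    <- length(est)    # number of implicates
  qbar <- mean(est)      # pooled point estimate
  wbar <- mean(se^2)     # average within-implicate variance
  b    <- var(est)       # between-implicate variance
  data.frame(estimate = qbar, se = sqrt(wbar + (1 + 1/M) * b))
}
```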
202 | Since proper analysis of multiple implicates can be rather cumbersome – both from a coding and mathematical standpoint – the `analyze()` function provides a convenient way to calculate point estimates and associated uncertainty for common analyses. Potential analyses currently include variable means, proportions, sums, counts, and medians, optionally calculated for population subgroups. 203 | 204 | For example, to calculate the mean value of the "electricity" variable across all observations in the recipient dataset, we do the following. 205 | 206 | ```{r} 207 | analyze(x = list(mean = "electricity"), 208 | implicates = sim10) 209 | ``` 210 | When the response variable is categorical, `analyze()` automatically returns the proportions associated with each factor level. 211 | 212 | ```{r} 213 | analyze(x = list(mean = "aircon"), 214 | implicates = sim10) 215 | ``` 216 | 217 | If we want to perform an analysis across subsets of the recipient population -- for example, calculate the mean value of "electricity" by household "income" -- we can use the `by` and `static` arguments. We see that mean electricity consumption increases with household income. 218 | 219 | ```{r} 220 | analyze(x = list(mean = "electricity"), 221 | implicates = sim10, 222 | static = recipient, 223 | by = "income") 224 | ``` 225 | 226 | It is also possible to do multiple kinds of analyses in a single call to `analyze()`. For example, the following call calculates the mean value of "natural_gas" and "square_feet", the median value of "square_feet", and the sum of "electricity" (i.e. total consumption) and "insulation" (i.e. total count of each level). All of these estimates are calculated for each population subgroup defined by the intersection of "race" and "urban_rural" status. 227 | 228 | ```{r} 229 | result <- analyze(x = list(mean = c("natural_gas", "square_feet"), 230 | median = "square_feet", 231 | sum = c("electricity", "insulation")), 232 | implicates = sim10, 233 | static = recipient, 234 | by = c("race", "urban_rural")) 235 | ``` 236 | We can then (for example) isolate the results for white households in rural areas. Notice that the mean estimate of "square_feet" exceeds the median, reflecting the skewed distribution. 237 | 238 | ```{r} 239 | subset(result, race == "White" & urban_rural == "Rural") 240 | ``` 241 | More complicated analyses can be performed using the custom `fun` argument to `analyze()`. See the Examples section of `?analyze`. 242 | 243 | # Validating fusion models 244 | 245 | The `validate()` function provides a convenient way to perform internal validation tests on synthetic variables that have been fused back onto the original donor data. This allows us to assess the quality of the underlying fusion model; it is analogous to assessing model skill by comparing predictions to the observed training data. 246 | 247 | `validate()` compares analytical results derived using the multiple-implicate fusion output with those derived using the original donor microdata. By performing analyses on population subsets of varying size, `validate()` estimates how the synthetic variables perform for analyses of varying difficulty/complexity.
It computes fusion variable means and proportions for subsets of the full sample – separately for both the observed and fused data – and then compares the results. 248 | 249 | First, we fuse multiple implicates of the `fusion.vars` using the original donor data -- *not* the `recipient` data, as we did previously. 250 | 251 | ```{r} 252 | sim <- fuse(data = donor, 253 | fsn = fsn.model, 254 | M = 40) 255 | ``` 256 | 257 | Next, we pass the `sim` results to `validate()`. The argument `subset_vars` specifies that we want the validation exercise to compare observed (donor) and simulated point estimates across population subsets defined by "income", "age", "race", and "education". See `?validate` for more details. 258 | 259 | ```{r} 260 | valid <- validate(observed = donor, 261 | implicates = sim, 262 | subset_vars = c("income", "age", "race", "education")) 263 | ``` 264 | The `validate()` output includes ggplot2 graphics that helpfully summarize the validation results. For example, the plot below shows how the observed and simulated point estimates compare, using median absolute percent error as the performance metric. We see that the synthetic data do a very good job reproducing the point estimates for all fusion variables when the population subset in question is reasonably large. For smaller subsets -- i.e. more difficult analyses due to small sample size -- "square_feet", "natural_gas", and "electricity" remain well modeled, but the error increases more rapidly for "aircon" and "insulation". This information is useful for understanding what kind of reliability we can expect for particular variables and types of analyses, given the underlying fusion model and data. 265 | 266 | ```{r} 267 | valid$plots$est 268 | ``` 269 | 270 | Happy fusing! 271 | -------------------------------------------------------------------------------- /R/fuseCART.R: -------------------------------------------------------------------------------- 1 | #' Fuse variables to a recipient dataset using a CART fusion model 2 | #' 3 | #' @description 4 | #' Fuse variables to a recipient dataset using a fusion model object produced by \code{train()}. \code{fuseM()} provides a convenience wrapper for generating multiple implicates. 5 | #' 6 | #' @param data Data frame. Recipient dataset. All categorical variables should be factors and ordered whenever possible. Data types and levels are strictly validated against predictor variables defined in \code{train.object}. 7 | #' @param train.object Output from a successful call to \link{train}. 8 | #' @param ignore_self Logical. If \code{TRUE}, the simulation step excludes "self matches" (i.e. row 1 in \code{data} cannot match with row 1 in the original donor). Only useful for validation exercises. Do not use otherwise.
9 | #' 10 | #' @export 11 | #' @noRd 12 | 13 | #--------------------- 14 | 15 | # Manual testing 16 | 17 | # library(fusionModel) 18 | # source("R/utils.R") 19 | 20 | # Example inputs 21 | # donor <- recs 22 | # data <- subset(recs, select = c(division, urban_rural, climate, income, age, race)) 23 | # induce = FALSE 24 | # induce.ignore = NULL 25 | # fusion.vars <- setdiff(names(donor), names(recipient)) 26 | # train.object <- train(data = donor, y = fusion.vars) 27 | 28 | # data = readRDS("~/Documents/Projects/fusionData/recs_recipient.rds") 29 | # train.object <- readRDS("~/Documents/Projects/fusionData/recs_fit.rds") 30 | 31 | #--------------------- 32 | 33 | fuseCART <- function(data, 34 | train.object, 35 | ignore_self = FALSE) { 36 | 37 | stopifnot(exprs = { 38 | is.data.frame(data) 39 | is.logical(ignore_self) 40 | #!(!induce & !is.null(induce.ignore)) # Nonsensical input 41 | }) 42 | 43 | verbose <- TRUE 44 | 45 | #----- 46 | 47 | # Coerce 'data' to data.table, if necessary 48 | data <- data.table::as.data.table(data) 49 | 50 | # Check nrow(data) against the size of the training dataset; they must be the same size if ignore_self = TRUE 51 | if (ignore_self & nrow(data) != train.object$nobs) stop("When 'ignore_self = TRUE', recipient 'data' must have same number of rows as the training dataset") 52 | 53 | # Check that predictor variables are present 54 | xclass <- train.object$xclass 55 | xvars <- names(xclass) 56 | xlevels <- train.object$xlevels 57 | miss <- setdiff(xvars, names(data)) 58 | if (length(miss) > 0) stop("The following predictor variables are missing from 'data':\n", paste(miss, collapse = ", ")) 59 | 60 | # Restrict 'data' to the xvars and ensure correct ordering of columns consistent with names(xclass) 61 | data <- subset(data, select = xvars) 62 | 63 | #----- 64 | 65 | # Check for appropriate class/type of predictor variables 66 | 67 | xtest <- lapply(data, class) 68 | miss <- !map2_lgl(xclass, xtest, sameClass) 69 | if (any(miss)) stop("Incompatible data type for the following predictor variables:\n", paste(names(miss)[miss], collapse = ", ")) 70 | 71 | # Check for appropriate levels of factor predictor variables 72 | xtest <- lapply(subset(data, select = names(xlevels)), levels) 73 | miss <- !map2_lgl(xlevels, xtest, identical) 74 | if (any(miss)) stop("Incompatible levels for the following predictor variables\n", paste(names(miss)[miss], collapse = ", ")) 75 | 76 | #----- 77 | 78 | # Names and order of modeled variables to be fused 79 | yord <- c(names(train.object$models), names(train.object$derivative$model)) 80 | 81 | # Identify continuous yvars 82 | ycont <- names(which(sapply(train.object$yclass, function(x) x[1] %in% c("integer", "numeric")))) 83 | 84 | # Create the percentile values associated with the quantile function values 85 | # Note: Only really necessary if using smoothing 86 | if (length(ycont) > 0) { 87 | Qx <- map(train.object$models[ycont], ~ nrow(.x$Q)) %>% compact() 88 | Qx <- if (length(Qx) > 0) { 89 | Qx <- dnorm(seq(-3, 3, length.out = Qx[[1]] - 1)) 90 | Qx <- c(0, cumsum(Qx / sum(Qx))) 91 | } 92 | } 93 | 94 | #----- 95 | 96 | # Set 'induce.vars'; these are the variables for which correlation will be induced 97 | # if (induce) { 98 | # induce.vars <- if (is.null(induce.ignore)) { 99 | # c(xvars, yord) 100 | # } else { 101 | # validNames(induce.ignore, c(xvars, yord), exclude = TRUE) 102 | # } 103 | # stopifnot(length(induce.vars) > 0) 104 | # if (verbose) cat("Will attempt to induce correlation for a total of", length(induce.vars), 
"variables\n") 105 | # } 106 | 107 | #----- 108 | 109 | # Detect and impute any missing values in 'data' 110 | na.cols <- names(which(sapply(data, anyNA))) 111 | if (length(na.cols) > 0) { 112 | if (verbose) cat("Missing values imputed for the following predictors:\n", paste(na.cols, collapse = ", "), "\n") 113 | for (j in na.cols) { 114 | x <- data[[j]] 115 | ind <- is.na(x) 116 | data.table::set(data, i = which(ind), j = j, value = imputationValue(x, na.ind = ind)) 117 | } 118 | } 119 | 120 | #----- 121 | 122 | # Build 'ranks' data.table for the 'xvars', if 'induce = TRUE' 123 | # TO DO: Restrict the variables for which this is applicable? 124 | # This is a memory-efficient implementation using data.table 125 | 126 | # if (induce) { 127 | # 128 | # if (verbose) cat("Building ranks matrix (induce = TRUE)...\n") 129 | # 130 | # # Correlation variables to retain in initial 'ranks' data.table, based on 'induce.vars' argument 131 | # retain <- intersect(induce.vars, xvars) 132 | # 133 | # # Unordered factor variables among retained 'xvars' 134 | # xunordered <- sapply(xclass[retain], function(x) x[1] == "factor") 135 | # 136 | # # Build 'ranks' data.table for 'xvars' that are NOT unordered factors 137 | # ranks <- subset(data, select = names(which(!xunordered))) 138 | # for (v in names(ranks)) data.table::set(ranks, j = v, value = data.table::frank(ranks[[v]], ties.method = "average")) 139 | # 140 | # # Create dummy variable columns in 'ranks' for the 'xvars' that ARE unordered factors 141 | # for (v in names(which(xunordered))) { 142 | # dt <- subset(data, select = v) 143 | # u <- xlevels[[v]] 144 | # newv <- paste0(v, u) 145 | # data.table::set(ranks, j = newv, value = lapply(u, function(x) as.integer(dt == x))) 146 | # } 147 | # 148 | # # Scale all variable ranks for unit variance and zero mean 149 | # # Makes computations simpler in induceCor() 150 | # for (v in names(ranks)) data.table::set(ranks, j = v, value = as.vector(scale(ranks[[v]]))) 151 | # 152 | # # Clean up 153 | # rm(dt) 154 | # gc() 155 | # 156 | # } 157 | 158 | #----- 159 | 160 | # Extract names of the numeric predictor variables for which binary versions are required 161 | 162 | # Variables for which binary versions are used in models 163 | xbin.model <- train.object$models %>% 164 | map("xbin") %>% 165 | unlist(use.names = FALSE) 166 | 167 | # Variables for which binary versions are used in merges 168 | temp <- train.object$derivative$merge %>% 169 | map(names) %>% 170 | unlist(use.names = FALSE) 171 | xbin.merge <- grep("_BINARY_", temp, value = TRUE) 172 | xbin.merge <- gsub("_BINARY_", "", xbin.merge) 173 | 174 | # Full vector of variables for which binary versions are required 175 | xbin <- unique(c(xbin.model, xbin.merge)) 176 | 177 | # Create binary versions of any 'xbin' among the initial predictor variables 178 | vbin <- intersect(xbin, names(data)) 179 | for (v in vbin) data.table::set(data, j = paste0(v, "_BINARY_"), value = data[[v]] == 0) 180 | 181 | #------------------------- 182 | #------------------------- 183 | 184 | if (verbose) cat("Fusing donor variables to recipient...\n") 185 | 186 | # Progress bar printed to console 187 | if (verbose) pb <- pbapply::timerProgressBar(min = 0, max = length(yord), char = "+", width = 50, style = 3) 188 | 189 | for (y in yord) { 190 | 191 | cont <- y %in% ycont 192 | 193 | yclass <- train.object$yclass[[y]] 194 | 195 | m <- if (y %in% names(train.object$models)) train.object$models[[y]] else train.object$derivative$model[[y]] 196 | 197 | #----- 198 | 199 | # Extract the 
'xmerge' object, if available 200 | # This is used to merge known 'y' values up front, prior to probabilistic simulation 201 | # Object 'ind' gives the row numbers for which values are not already known and must be simulated 202 | xmerge <- m$xmerge 203 | if (!is.null(xmerge)) { 204 | data <- merge(data, xmerge, by = names(xmerge)[1], all.x = TRUE, sort = FALSE) 205 | ind <- which(is.na(data[[y]])) 206 | } else { 207 | ind <- 1L:nrow(data) 208 | } 209 | 210 | #----- 211 | 212 | # If 'y' is continuous... 213 | if (cont) { 214 | 215 | if (class(m) == "rpart") { 216 | 217 | smoothed <- is.null(names(m$Q)) 218 | 219 | # Vector of nodes in model 'm' 220 | nodes <- as.integer(c(names(m$Q), colnames(m$Q))) 221 | 222 | # Predicted node for rows in 'data' 223 | pnode <- predictNode(object = m, newdata = data[ind, ]) 224 | gc() 225 | 226 | # Catch and fix/kludge rare case of missing node (unclear why this occurs) 227 | # Note that the 'popsynth' package has a fix for the same issue: https://github.com/cran/synthpop/blob/master/R/functions.syn.r 228 | # Erroneous nodes are re-assigned to the valid node with the closest predicted 'yval' 229 | miss <- setdiff(pnode, nodes) 230 | for (n in miss) pnode[pnode == n] <- nodes[which.min(abs(m$frame[n, "yval"] - m$frame[nodes, "yval"]))] 231 | stopifnot(all(pnode %in% nodes)) 232 | 233 | #--- 234 | 235 | # Placeholder vector for simulated values 236 | S <- vector(mode = "numeric", length = length(pnode)) 237 | 238 | # Fit density to observations in each node 239 | for (n in nodes) { 240 | 241 | # Index identifying rows in 'data' that are in node 'n' 242 | i <- pnode == n 243 | 244 | if (any(i)) { 245 | 246 | if (smoothed) { 247 | 248 | # Extract inputs needed for quantile function and proportion of zeros 249 | Q <- m$Q[, as.character(n)] 250 | 251 | # Randomly simulate values from the conditional distribution 252 | # Note that this is repeated as necessary to ensure 's' does not contain any values already assigned (exclusively) via 'xmerge' 253 | f <- approxfun(x = Qx, y = Q) 254 | s <- f(runif(n = sum(i))) 255 | while (any(s %in% xmerge[[y]])) { 256 | j <- s %in% xmerge[[y]] 257 | s[j] <- f(runif(n = sum(j))) 258 | } 259 | 260 | } else { 261 | 262 | if (ignore_self) { 263 | 264 | # This gives the values, weights, and original row index for each training observation assigned to node 'n' 265 | Q <- m$Q[[as.character(n)]] 266 | 267 | irow <- which(i) 268 | s <- vector(mode = "numeric", length = sum(i)) 269 | fix <- rep(TRUE, length(s)) 270 | sint <- 0 271 | while (sum(fix) > 0 & sint < 20) { 272 | s[fix] <- sample(Q[, 3], size = sum(fix), replace = TRUE, prob = Q[, 2]) 273 | fix <- s == irow # Find any self-references and replace them in next iteration 274 | sint <- sint + 1 275 | } 276 | s <- Q[match(s, Q[, 3]), 1] # Extract actual simulated values 277 | 278 | # TOO SLOW... 
279 | # id <- matrix(rep(Q[, 3], each = sum(i)), nrow = sum(i)) 280 | # self.ref <- which(i) == id 281 | # w <- matrix(rep(Q[, 2], each = sum(i)), nrow = sum(i)) 282 | # w[self.ref] <- 0 # Set any self-references to zero weight 283 | # w <- w / matrixStats::rowSums2(w, na.rm = TRUE) 284 | # w <- matrixStats::rowCumsums(w) 285 | # s <- max.col(runif(n = nrow(w)) < w, ties.method = "first") 286 | # s <- Q[s, 1] # Extract actual simulated values 287 | 288 | } else { 289 | 290 | # Randomly sample the observed values within the node 291 | # Note that this is repeated as necessary to ensure 's' does not contain any values already assigned (exclusively) via 'xmerge' 292 | Q <- m$Q[[as.character(n)]] 293 | s <- sample(x = Q[, 1], size = sum(i), replace = TRUE, prob = Q[, 2]) 294 | while (any(s %in% xmerge[[y]])) { 295 | j <- s %in% xmerge[[y]] 296 | s[j] <- sample(x = Q[, 1], size = sum(j), replace = TRUE, prob = Q[, 2]) 297 | } 298 | 299 | } 300 | 301 | } 302 | 303 | # Assign simulated values to 'S' 304 | S[i] <- s 305 | 306 | } 307 | 308 | } 309 | 310 | } else { 311 | 312 | # Make predictions using linear (biglm) model in 'm' 313 | fobj <- formula(paste("~", as.character(m$terms)[3L])) 314 | newmf <- model.frame(formula = fobj, data[ind, ]) 315 | newmm <- model.matrix(fobj, newmf) 316 | S <- drop(newmm %*% replace_na(coef(m), 0)) 317 | 318 | #--- 319 | 320 | # Adjust simulated values to enforce the "outer.range" constraint 321 | # This isn't strictly necessary with rpart() simulated values, because the constraint is enforced in the quantile values themselves 322 | # It might be relevant, however, when a linear model is used for simulation (? - unclear) 323 | # outer.range <- train.object$youter[[y]] 324 | # S <- pmin(pmax(S, outer.range[1]), outer.range[2]) 325 | 326 | # Adjust simulated values to enforce the "inner.range" constraint 327 | inner.range <- train.object$yinner[[y]] 328 | S[S > inner.range[1] & S < 0] <- inner.range[1] 329 | S[S > 0 & S < inner.range[2]] <- inner.range[2] 330 | 331 | } 332 | 333 | # Ensure simulated column is correct data type 334 | if (yclass == "integer") S <- as.integer(round(S)) 335 | 336 | } 337 | 338 | #----- 339 | 340 | if (!cont) { 341 | 342 | if (class(m) == "rpart") { 343 | 344 | # Add the clustered predictors, if necessary 345 | km <- m$kmeans.xwalk 346 | if (!is.null(km)) { 347 | for (d in km) { 348 | k <- match(data[[names(d)[1]]], d[[1]]) 349 | data.table::set(data, j = names(d)[2], value = d[k, 2]) 350 | } 351 | } 352 | 353 | # Class probabilities 354 | p <- predict(object = m, newdata = data[ind, ]) 355 | gc() 356 | 357 | # Simulated value 358 | ptile <- runif(n = length(ind)) 359 | for (i in 2:ncol(p)) p[, i] <- p[, i - 1] + p[, i] 360 | S <- rowSums(ptile > p) + 1L 361 | S <- colnames(p)[S] 362 | 363 | } else { 364 | 365 | # Make predictions using linear (biglm) model in 'm' 366 | # Note that 'm' predicts the integerized factor/logical values, so it must be converted to the correct label 367 | fobj <- formula(paste(as.character(m$terms)[-2], collapse = "")) 368 | newmf <- model.frame(formula = fobj, data[ind, ]) 369 | newmm <- model.matrix(fobj, newmf) 370 | S <- drop(newmm %*% replace_na(coef(m), 0)) 371 | S <- round(S) # Round prediction to integer 372 | 373 | lev <- if ("factor" %in% yclass) train.object$ylevels[[y]] else c(FALSE, TRUE) # Factor/logical labels 374 | S <- pmin(pmax(S, 1), length(lev)) # Ensure plausible integer values 375 | S <- lev[S] # Convert to factor level 376 | 377 | } 378 | 379 | # Ensure simulated vector is correct data 
type 380 | # 'S' is a character vector by default; must be coerced to factor or logical 381 | if ("factor" %in% yclass) S <- factor(S, levels = train.object$ylevels[[y]], ordered = "ordered" %in% yclass) 382 | if ("logical" %in% yclass) S <- as.logical(S) 383 | 384 | } 385 | 386 | #----- 387 | 388 | # Assign simulated vector to 'data' 389 | data.table::set(data, i = ind, j = y, value = S) 390 | 391 | #----- 392 | 393 | # Proceed to induce correlation and/or update 'ranks' matrix, if requested 394 | # if (induce) { 395 | # 396 | # # Create appropriate column(s) in 'ranks' for simulated 'y' values 397 | # # NOTE that this is only done if 'y' is in 'induce.vars'; otherwise, its rank can be ignored 398 | # if (y %in% induce.vars) { 399 | # if (yclass[1] == "factor") { 400 | # u <- levels(S) 401 | # newv <- paste0(y, u) 402 | # data.table::set(ranks, j = newv, value = lapply(u, function(x) as.integer(S == x))) 403 | # } else { 404 | # data.table::set(ranks, j = y, value = as.vector(scale(data.table::frank(S, ties.method = "average")))) 405 | # } 406 | # } 407 | # 408 | # #----- 409 | # 410 | # # For continuous and ordered factors, adjust initial simulated values to better match known rank correlation with other variables 411 | # if (y %in% names(train.object$ycor)) { 412 | # 413 | # # Target rank correlations 414 | # rho <- train.object$ycor[[y]] 415 | # 416 | # # Restrict target correlation to variables present in 'ranks' 417 | # rho <- rho[names(rho) %in% names(ranks)] 418 | # 419 | # # Restrict correlation correction to non-zero observations in 'y' 420 | # # NOTE: Setting of this option should be consistent with analogous line in train() 421 | # #ind <- which(as.numeric(data[[y]]) != 0) 422 | # 423 | # # No restriction on correlation correction 424 | # ind <- 1:nrow(data) 425 | # 426 | # # Attempt to induce target rank correlations 427 | # Yout <- induceCor(data = data.table::copy(ranks[ind, ]), rho = rho, y = y, scale.data = FALSE) 428 | # 429 | # # Only updated y-values if the correlation adjustment was successful (sigma2 >= 0) 430 | # if (Yout$sigma2 >= 0) { 431 | # 432 | # # Re-order original y data to match ranks in Y (this preserve the original distribution) 433 | # Y <- sort(data[ind, ][[y]])[data.table::frank(Yout$Y, ties.method = "random")] 434 | # 435 | # # NOTE: The confirmation code below has not been updated in some time (probably broken) 436 | # # Before and after rank correlations compared to 'rho' target correlation 437 | # # plot(rho, cor(ranks[, -..y], ranks[[y]])[, 1]) # Before 438 | # # plot(rho, cor(ranks[, -..y], Yout$Y)[, 1]) # After 439 | # # abline(0, 1) 440 | # 441 | # # Confirm that univariate distribution is unchanged 442 | # # hist(data[i, y]) 443 | # # hist(Y) 444 | # 445 | # # Comparing before and after y values, original scale 446 | # # plot(data[i, y], Y) 447 | # # abline(0, 1, col = 2) 448 | # # cor(data[i, y], Y) 449 | # 450 | # # Update original 'y' in 'data' with adjusted simulated values 451 | # data.table::set(data, i = ind, j = y, value = Y) 452 | # 453 | # # Update the 'ranks' matrix with ranks derived from adjusted 'Y' 454 | # if (y %in% names(ranks)) { 455 | # if (yclass[1] == "factor") { 456 | # dt <- subset(data, select = y) 457 | # u <- levels(dt) 458 | # newv <- paste0(y, u) 459 | # data.table::set(ranks, j = newv, value = lapply(u, function(x) as.integer(dt == x))) 460 | # #data.table::set(ranks, j = newv, value = lapply(u, function(x) as.integer(Y == x))) # Safe only when 'ind' is 1:nrow(data) 461 | # } else { 462 | # data.table::set(ranks, j 
= y, value = as.vector(scale(data.table::frank(data[[y]], ties.method = "average")))) 463 | # #data.table::set(ranks, j = y, value = as.vector(scale(data.table::frank(Y, ties.method = "average")))) # Safe only when 'ind' is 1:nrow(data) 464 | # } 465 | # } 466 | # 467 | # } 468 | # 469 | # } 470 | # 471 | # } 472 | 473 | #----- 474 | 475 | # If 'y' is one of the 'xbin', add a binary version of simulated values to 'data' 476 | if (y %in% xbin) { 477 | data.table::set(data, j = paste0(y, "_BINARY_"), value = data[[y]] == 0) 478 | xbin <- setdiff(xbin, y) 479 | } 480 | 481 | #----- 482 | 483 | # Update for() loop progress bar 484 | if (verbose) pbapply::setTimerProgressBar(pb, match(y, yord)) 485 | 486 | } 487 | 488 | # Close progress bar 489 | if (verbose) pbapply::closepb(pb) 490 | 491 | #------------------------- 492 | #------------------------- 493 | 494 | # Add any constant fusion variables 495 | for (v in names(train.object$derivative$constant)) { 496 | set(data, j = v, value = train.object$derivative$constant[[v]]) 497 | } 498 | 499 | # Merge categorical fusion variables that are identified by a 1-to-1 linear relationship with another categorical variable 500 | for (m in train.object$derivative$merge) { 501 | data <- merge(data, m, by = names(m)[1], all.x = TRUE, sort = FALSE) 502 | } 503 | 504 | #----- 505 | 506 | # Simulation complete 507 | # Return only the fusion variables, in the order in which they were added to 'data' 508 | return(data %>% 509 | subset(select = c(yord, setdiff(names(train.object$yclass), yord))) %>% 510 | as.data.frame() 511 | ) 512 | 513 | } 514 | --------------------------------------------------------------------------------