├── data
│   └── recs.rda
├── _pkgdown.yml
├── man
│   ├── figures
│   │   ├── README-unnamed-chunk-10-1.png
│   │   └── README-unnamed-chunk-21-1.png
│   ├── fusionModel-package.Rd
│   ├── importance.Rd
│   ├── impute.Rd
│   ├── read_fsd.Rd
│   ├── recs.Rd
│   ├── monotonic.Rd
│   ├── assemble.Rd
│   ├── prepXY.Rd
│   ├── fuse.Rd
│   ├── validate.Rd
│   ├── plot_valid.Rd
│   ├── fusionOutput.Rd
│   ├── train.Rd
│   ├── analyze.Rd
│   └── analyze_fusionACS.Rd
├── R
│   ├── fusionModel-package.R
│   ├── zzz.R
│   ├── data.R
│   ├── lookup.R
│   ├── importance.R
│   ├── fitLGB.R
│   ├── read_fsd.R
│   ├── monotonic.R
│   ├── impute.R
│   ├── assemble.R
│   ├── validate.R
│   ├── prepXY.R
│   ├── plot_valid.R
│   └── fuseCART.R
├── NAMESPACE
├── DESCRIPTION
├── .github
│   └── workflows
│       └── pkgdown.yaml
├── data-raw
│   └── recs.R
└── README.Rmd

/data/recs.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ummel/fusionModel/HEAD/data/recs.rda
--------------------------------------------------------------------------------
/_pkgdown.yml:
--------------------------------------------------------------------------------
1 | url: https://ummel.github.io/fusionModel/
2 | template:
3 |   bootstrap: 5
4 | 
5 | 
--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-10-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ummel/fusionModel/HEAD/man/figures/README-unnamed-chunk-10-1.png
--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-21-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ummel/fusionModel/HEAD/man/figures/README-unnamed-chunk-21-1.png
--------------------------------------------------------------------------------
/R/fusionModel-package.R:
--------------------------------------------------------------------------------
1 | #' @keywords internal
2 | "_PACKAGE"
3 | 
4 | # The following block is used by usethis to automatically manage
5 | # roxygen namespace tags. Modify with care!
6 | ## usethis namespace: start 7 | ## usethis namespace: end 8 | NULL 9 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(analyze) 4 | export(analyze_fusionACS) 5 | export(assemble) 6 | export(fuse) 7 | export(fuseCART) 8 | export(fusionOutput) 9 | export(importance) 10 | export(impute) 11 | export(monotonic) 12 | export(plot_valid) 13 | export(prepXY) 14 | export(read_fsd) 15 | export(train) 16 | export(trainCART) 17 | export(validate) 18 | import(collapse) 19 | import(data.table, except = c(first, last, between, fdroplevels)) 20 | import(dplyr) 21 | import(stats, except = c(filter, lag, D)) 22 | import(stringr) 23 | -------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | # Specify package imports 2 | #' @import stringr 3 | #' @import collapse 4 | #' @import dplyr 5 | #' @rawNamespace import(stats, except = c(filter, lag, D)) 6 | #' @rawNamespace import(data.table, except = c(first, last, between, fdroplevels)) 7 | NULL 8 | 9 | #----- 10 | 11 | .onLoad <- function (libname, pkgname) { 12 | 13 | # Create default option value for number of cores 14 | options(fusionModel.cores = max(1L, parallel::detectCores() - 1L)) 15 | 16 | # Package startup message 17 | packageStartupMessage("fusionModel v", utils::packageVersion("fusionModel"), " | https://github.com/ummel/fusionModel") 18 | 19 | } 20 | -------------------------------------------------------------------------------- /man/fusionModel-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fusionModel-package.R 3 | \docType{package} 4 | \name{fusionModel-package} 5 | \alias{fusionModel} 6 | \alias{fusionModel-package} 7 | \title{fusionModel: Data fusion and analysis of synthetic data in R} 8 | \description{ 9 | Data fusion and analysis of synthetic data in R. 10 | } 11 | \seealso{ 12 | Useful links: 13 | \itemize{ 14 | \item \url{https://ummel.github.io/fusionModel/} 15 | } 16 | 17 | } 18 | \author{ 19 | \strong{Maintainer}: Kevin Ummel \email{ummel@berkeley.edu} 20 | 21 | Other contributors: 22 | \itemize{ 23 | \item Karthik Akkiraju [contributor] 24 | \item Miguel Poblete Cazenave [contributor] 25 | } 26 | 27 | } 28 | \keyword{internal} 29 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: fusionModel 2 | Title: Data fusion and analysis of synthetic data in R 3 | Version: 2.3.8 4 | Authors@R: c( 5 | person(given = "Kevin", family = "Ummel", role = c("aut", "cre"), email = "ummel@berkeley.edu"), 6 | person(given = "Karthik", family = "Akkiraju", role = "ctb"), 7 | person(given = "Miguel", family = "Poblete Cazenave", role = "ctb")) 8 | Description: Data fusion and analysis of synthetic data in R. 
9 | License: GPL (>= 3) 10 | Depends: 11 | R (>= 3.5.0) 12 | Imports: 13 | data.table, 14 | dplyr, 15 | collapse, 16 | fst, 17 | glmnet, 18 | lightgbm, 19 | Matrix, 20 | matrixStats, 21 | mgcv, 22 | purrr, 23 | RANN, 24 | stats, 25 | stringr, 26 | utils, 27 | zip, 28 | rlang, 29 | tidyr 30 | Suggests: 31 | ggplot2, 32 | scam 33 | Encoding: UTF-8 34 | LazyData: true 35 | Roxygen: list(markdown = TRUE) 36 | RoxygenNote: 7.3.2 37 | URL: https://ummel.github.io/fusionModel/ 38 | -------------------------------------------------------------------------------- /man/importance.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/importance.R 3 | \name{importance} 4 | \alias{importance} 5 | \title{Extract predictor variable importance from a fusion model} 6 | \usage{ 7 | importance(fsn) 8 | } 9 | \arguments{ 10 | \item{fsn}{Character. Path to fusion model file (.fsn) generated by \code{\link{train}}.} 11 | } 12 | \value{ 13 | A named list containing \code{detailed} and \code{summary} importance results. The \code{summary} results are most useful, as they return the average importance of each predictor across potentially multiple underlying LightGBM models; i.e. zero ("z"), mean ("m"), or quantile ("q") models. See Examples for suggested plotting of results. 14 | } 15 | \description{ 16 | Returns predictor variable (feature) importance of underlying LightGBM models stored in a fusion model file (.fsn) on disk. 17 | } 18 | \details{ 19 | Importance metrics are computed via \code{\link[lightgbm]{lgb.importance}}. Three types of measures are returned; "gain" is typically the preferred measure. 20 | } 21 | \examples{ 22 | # Build a fusion model using RECS microdata 23 | # Note that "fusion_model.fsn" will be written to working directory 24 | ?recs 25 | fusion.vars <- c("electricity", "natural_gas", "aircon") 26 | predictor.vars <- names(recs)[2:12] 27 | fsn.path <- train(data = recs, y = fusion.vars, x = predictor.vars) 28 | 29 | # Extract predictor variable importance 30 | ximp <- importance(fsn.path) 31 | 32 | # Plot summary results 33 | library(ggplot2) 34 | ggplot(ximp$summary, aes(x = x, y = gain)) + 35 | geom_bar(stat = "identity") + 36 | facet_grid(~ y) + 37 | coord_flip() 38 | 39 | # View detailed results 40 | View(ximp$detailed) 41 | } 42 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples#build-pkgdown-site 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | # 10/29/24: Set on:workflow_dispatch to allow workflow to be triggered manually: https://docs.github.com/en/actions/managing-workflow-runs-and-deployments/managing-workflow-runs/manually-running-a-workflow 4 | on: 5 | # push: 6 | # branches: [main, master] 7 | # pull_request: 8 | # branches: [main, master] 9 | # release: 10 | # types: [published] 11 | workflow_dispatch: 12 | 13 | name: pkgdown.yaml 14 | 15 | permissions: read-all 16 | 17 | jobs: 18 | pkgdown: 19 | runs-on: ubuntu-latest 20 | # Only restrict concurrency for non-PR jobs 21 | concurrency: 22 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 23 | env: 24 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 25 | permissions: 26 | contents: write 27 | steps: 28 | - uses: actions/checkout@v4 29 | 30 | - uses: r-lib/actions/setup-pandoc@v2 31 | 32 | - uses: r-lib/actions/setup-r@v2 33 | with: 34 | use-public-rspm: true 35 | 36 | - uses: r-lib/actions/setup-r-dependencies@v2 37 | with: 38 | extra-packages: any::pkgdown, local::. 39 | needs: website 40 | 41 | # Manual edit by Kevin Ummel: add 'examples = FALSE' to prevent running of documentation examples 42 | - name: Build site 43 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE, examples = FALSE) 44 | shell: Rscript {0} 45 | 46 | - name: Deploy to GitHub pages 🚀 47 | if: github.event_name != 'pull_request' 48 | uses: JamesIves/github-pages-deploy-action@v4.5.0 49 | with: 50 | clean: false 51 | branch: gh-pages 52 | folder: docs 53 | -------------------------------------------------------------------------------- /man/impute.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/impute.R 3 | \name{impute} 4 | \alias{impute} 5 | \title{Impute missing data via fusion} 6 | \usage{ 7 | impute( 8 | data, 9 | weight = NULL, 10 | ignore = NULL, 11 | cores = parallel::detectCores(logical = FALSE) - 1L 12 | ) 13 | } 14 | \arguments{ 15 | \item{data}{A data frame with missing values.} 16 | 17 | \item{weight}{Optional name of observation weights column in \code{data}.} 18 | 19 | \item{ignore}{Optional names of columns in \code{data} to ignore. These variables are neither imputed nor used as predictors.} 20 | 21 | \item{cores}{Number of physical CPU cores used by \code{\link[lightgbm]{lightgbm}}. LightGBM is parallel-enabled on all platforms if OpenMP is available.} 22 | } 23 | \value{ 24 | A data frame with all missing values imputed. 25 | } 26 | \description{ 27 | A universal missing data imputation tool that wraps successive calls to \code{\link{train}} and \code{\link{fuse}} under the hood. Designed for simplicity and ease of use. 28 | } 29 | \details{ 30 | Variables with missing values are imputed sequentially, beginning with the variable with the fewest missing values. Since LightGBM models accommodate NA values in the predictor set, all available variables are used as potential predictors (excluding \code{ignore} variables). For each call to \code{\link{train}}, 80\% of observations are randomly selected for training and the remaining 20\% are used as a validation set to determine an appropriate number of tree learners. All LightGBM model parameters are kept at the sensible default values in \code{\link{train}}. 
Since \code{\link[lightgbm]{lightgbm}} uses OpenMP multithreading, it is not advisable to use \code{\link{impute}} inside a forked/parallel process when \code{cores > 1}. 31 | } 32 | \examples{ 33 | # Create data frame with random NA values 34 | ?recs 35 | data <- recs[, 2:7] 36 | miss <- replicate(ncol(data), runif(nrow(data)) < runif(1, 0.01, 0.3)) 37 | data[miss] <- NA 38 | colSums(is.na(data)) 39 | 40 | # Impute the missing values 41 | result <- impute(data) 42 | anyNA(result) 43 | 44 | } 45 | -------------------------------------------------------------------------------- /data-raw/recs.R: -------------------------------------------------------------------------------- 1 | recs <- fst::read_fst("~/Documents/Projects/fusionData/survey-processed/RECS/2015/RECS_2015_H_processed.fst") |> 2 | dplyr::select( 3 | weight, 4 | moneypy, 5 | hhage, 6 | householder_race, 7 | education, 8 | employhh, 9 | nhsldmem, 10 | recs_division, 11 | ur12, 12 | recs_iecc_zone, 13 | kownrent, 14 | typehuq, 15 | yearmaderange, 16 | fuelheat, 17 | adqinsul, 18 | cooltype, 19 | agecenac, 20 | tvcolor, 21 | scalee, 22 | totsqft_en, 23 | kwh, 24 | cufeetng, 25 | gallonfo, 26 | gallonlp, 27 | btulp, 28 | dollarlp, 29 | sdescent, 30 | totalbtu, 31 | totalbtusph, 32 | btuelahucol, 33 | btuelcol, 34 | rep_1:rep_96 35 | ) |> 36 | dplyr::rename( 37 | income = moneypy, 38 | age = hhage, 39 | race = householder_race, 40 | employment = employhh, 41 | hh_size = nhsldmem, 42 | division = recs_division, 43 | urban_rural = ur12, 44 | climate = recs_iecc_zone, 45 | renter = kownrent, 46 | home_type = typehuq, 47 | year_built = yearmaderange, 48 | heat_type = fuelheat, 49 | insulation = adqinsul, 50 | aircon = cooltype, 51 | centralac_age = agecenac, 52 | televisions = tvcolor, 53 | disconnect = scalee, 54 | square_feet = totsqft_en, 55 | electricity = kwh, 56 | natural_gas = cufeetng, 57 | fuel_oil = gallonfo, 58 | propane = gallonlp, 59 | propane_btu = btulp, 60 | propane_expend = dollarlp 61 | ) |> 62 | dplyr::mutate( 63 | electricity = as.integer(round(electricity)), # Make integer for testing purposes 64 | heating_share = round(totalbtusph / totalbtu, 3), 65 | cooling_share = round((btuelahucol + btuelcol) / totalbtu, 3), 66 | other_share = 1 - heating_share - cooling_share, 67 | urban_rural = factor(ifelse(urban_rural == "U", "Urban", "Rural")), 68 | renter = renter != "Owned or being bought by someone in your household", 69 | use_ng = natural_gas > 0, 70 | have_ac = aircon != "No air conditioning", 71 | race = ifelse(sdescent == "Yes", "Latino Alone", as.character(race)), 72 | race = factor(trimws(gsub("Alone", "", race))) 73 | ) |> 74 | select(-sdescent, -totalbtu, -totalbtusph, -btuelahucol, -btuelcol) |> 75 | select(-starts_with("rep_"), starts_with("rep_")) 76 | 77 | usethis::use_data(recs, overwrite = TRUE) 78 | -------------------------------------------------------------------------------- /man/read_fsd.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/read_fsd.R 3 | \name{read_fsd} 4 | \alias{read_fsd} 5 | \title{Read fusion output from disk} 6 | \usage{ 7 | read_fsd( 8 | path, 9 | columns = NULL, 10 | M = 1, 11 | df = NULL, 12 | cores = max(1, parallel::detectCores(logical = FALSE) - 1) 13 | ) 14 | } 15 | \arguments{ 16 | \item{path}{Character. Path to a \code{.fsd} (or \code{.fst}) file, typically produced by \code{\link{fuse}}.} 17 | 18 | \item{columns}{Character. Column names to read. 
The default is to return all columns.} 19 | 20 | \item{M}{Integer. The first \code{M} implicates are returned. Set \code{M = Inf} to return all implicates. Ignored if \code{M} column not present in data.} 21 | 22 | \item{df}{Data frame. Data frame used to identify a subset of rows to return. Default is to return all rows.} 23 | 24 | \item{cores}{Integer. Number of cores used by \code{\link[fst]{fst}}.} 25 | } 26 | \value{ 27 | A \code{\link[data.table]{data.table}}; keys are preserved if present in the on-disk data. When \code{path} points to a \code{.fsd} file, it includes an integer column "M" indicating the implicate assignment of each observation (unless explicitly ignored by \code{columns}). 28 | } 29 | \description{ 30 | Efficiently read fusion output that was written to disk, optionally returning a subset of rows and/or columns. Since a \code{.fsd} file is simply a \code{\link[fst]{fst}} file under the hood, this function also works on any \code{.fst} file. 31 | } 32 | \details{ 33 | If \code{df} is provided and the file size on disk is less than 100 MB, then a full read and inner \code{\link[collapse]{join}} is performed. For larger files, a manual read of the required rows is performed, using \code{\link[collapse]{fmatch}} for the matching operation. 34 | } 35 | \examples{ 36 | # Build a fusion model using RECS microdata 37 | # Note that "fusion_model.fsn" will be written to working directory 38 | ?recs 39 | fusion.vars <- c("electricity", "natural_gas", "aircon") 40 | predictor.vars <- names(recs)[2:12] 41 | fsn.path <- train(data = recs, y = fusion.vars, x = predictor.vars) 42 | 43 | # Write fusion output directly to disk 44 | # Note that "results.fsd" will be written to working directory 45 | recipient <- recs[predictor.vars] 46 | sim <- fuse(data = recipient, fsn = fsn.path, M = 5, fsd = "results.fsd") 47 | 48 | # Read the fusion output saved to disk 49 | sim <- read_fsd(sim) 50 | head(sim) 51 | 52 | } 53 | -------------------------------------------------------------------------------- /R/data.R: -------------------------------------------------------------------------------- 1 | #' Data from the 2015 Residential Energy Consumption Survey (RECS) 2 | #' 3 | #' Pre-processed, household-level microdata containing a selection of 31 4 | #' variables derived from the 2015 RECS, plus survey replicate weights. A variety of data types are included. 5 | #' There are no missing values. Variable names have been altered from the original. 
6 | #' 7 | #' @format A tibble with 5,686 rows and 124 variables: 8 | #' \describe{ 9 | #' \item{weight}{Primary sampling weight} 10 | #' \item{income}{Annual gross household income for the last year} 11 | #' \item{age}{Respondent age} 12 | #' \item{race}{Respondent race} 13 | #' \item{education}{Highest education completed by respondent} 14 | #' \item{employment}{Respondent employment status} 15 | #' \item{hh_size}{Number of household members} 16 | #' \item{division}{Census Division} 17 | #' \item{urban_rural}{Census 2010 Urban Type} 18 | #' \item{climate}{IECC Climate Code} 19 | #' \item{renter}{Is household renting the home?} 20 | #' \item{home_type}{Type of housing unit} 21 | #' \item{year_built}{Range when housing unit was built} 22 | #' \item{square_feet}{Total square footage} 23 | #' \item{insulation}{Level of insulation} 24 | #' \item{heat_type}{Main space heating fuel} 25 | #' \item{aircon}{Type of air conditioning equipment used} 26 | #' \item{centralac_age}{Age of central air conditioner} 27 | #' \item{televisions}{Number of televisions used} 28 | #' \item{disconnect}{Frequency of receiving disconnect notice} 29 | #' \item{electricity}{Total annual electricity usage, in kilowatthours} 30 | #' \item{natural_gas}{Total annual natural gas usage, in hundred cubic feet} 31 | #' \item{fuel_oil}{Total annual fuel oil/kerosene usage, in gallons} 32 | #' \item{propane}{Total annual propane usage, in gallons} 33 | #' \item{propane_btu}{Total annual propane usage, in thousand Btu} 34 | #' \item{propane_expend}{Total annual propane expenditure, in dollars} 35 | #' \item{heating_share}{Share of total energy used for space heating} 36 | #' \item{cooling_share}{Share of total energy used for cooling (AC and fans)} 37 | #' \item{other_share}{Share of total energy used for other end-uses} 38 | #' \item{use_ng}{Logical indicating if household uses natural gas} 39 | #' \item{have_ac}{Logical indicating if household has air conditioning} 40 | #' \item{rep_1:rep_96}{Replicate weights for uncertainty estimation} 41 | #' } 42 | #' @source \url{https://www.eia.gov/consumption/residential/data/2015/} 43 | "recs" 44 | -------------------------------------------------------------------------------- /man/recs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{recs} 5 | \alias{recs} 6 | \title{Data from the 2015 Residential Energy Consumption Survey (RECS)} 7 | \format{ 8 | A tibble with 5,686 rows and 124 variables: 9 | \describe{ 10 | \item{weight}{Primary sampling weight} 11 | \item{income}{Annual gross household income for the last year} 12 | \item{age}{Respondent age} 13 | \item{race}{Respondent race} 14 | \item{education}{Highest education completed by respondent} 15 | \item{employment}{Respondent employment status} 16 | \item{hh_size}{Number of household members} 17 | \item{division}{Census Division} 18 | \item{urban_rural}{Census 2010 Urban Type} 19 | \item{climate}{IECC Climate Code} 20 | \item{renter}{Is household renting the home?} 21 | \item{home_type}{Type of housing unit} 22 | \item{year_built}{Range when housing unit was built} 23 | \item{square_feet}{Total square footage} 24 | \item{insulation}{Level of insulation} 25 | \item{heat_type}{Main space heating fuel} 26 | \item{aircon}{Type of air conditioning equipment used} 27 | \item{centralac_age}{Age of central air conditioner} 28 | \item{televisions}{Number of televisions used} 29 |
\item{disconnect}{Frequency of receiving disconnect notice} 30 | \item{electricity}{Total annual electricity usage, in kilowatthours} 31 | \item{natural_gas}{Total annual natural gas usage, in hundred cubic feet} 32 | \item{fuel_oil}{Total annual fuel oil/kerosene usage, in gallons} 33 | \item{propane}{Total annual propane usage, in gallons} 34 | \item{propane_btu}{Total annual propane usage, in thousand Btu} 35 | \item{propane_expend}{Total annual propane expenditure, in dollars} 36 | \item{heating_share}{Share of total energy used for space heating} 37 | \item{cooling_share}{Share of total energy used for cooling (AC and fans)} 38 | \item{other_share}{Share of total energy used for other end-uses} 39 | \item{use_ng}{Logical indicating if household uses natural gas} 40 | \item{have_ac}{Logical indicating if household has air conditioning} 41 | \item{rep_1:rep_96}{Replicate weights for uncertainty estimation} 42 | } 43 | } 44 | \source{ 45 | \url{https://www.eia.gov/consumption/residential/data/2015/} 46 | } 47 | \usage{ 48 | recs 49 | } 50 | \description{ 51 | Pre-processed, household-level microdata containing a selection of 31 52 | variables derived from the 2015 RECS, plus survey replicate weights. A variety of data types are included. 53 | There are no missing values. Variable names have been altered from the original. 54 | } 55 | \keyword{datasets} 56 | -------------------------------------------------------------------------------- /R/lookup.R: -------------------------------------------------------------------------------- 1 | # Lookup details about available variables 2 | 3 | lookup <- function(var = NULL, year = NULL) { 4 | 5 | # Loading fusionData dictionary (not necessarily available) 6 | # data(dictionary) 7 | # names(dictionary) <- tolower(names(dictionary)) 8 | 9 | # Check validity of the working directory path 10 | # Checks if "/fusionData" is part of the path, as this is required 11 | b <- strsplit(full.path(getwd()), .Platform$file.sep, fixed = TRUE)[[1]] 12 | i <- which(b == "fusionData") 13 | if (length(i) == 0) stop("'/fusionData' is not part of the working directory path; this is required.") 14 | dir <- paste(b[1:i], collapse = .Platform$file.sep) 15 | 16 | # Get file path(s) to ACS and fused microdata, based on values in 'year' 17 | fpaths <- lapply(if (is.null(year)) "" else year, function(yr) { 18 | pa <- list.files(file.path(dir, "survey-processed/ACS"), pattern = paste0(yr, "_._processed.fst"), recursive = TRUE, full.names = TRUE) 19 | pc <- list.files(file.path(dir, "survey-processed/ACS"), pattern = paste0(yr, "_._custom.fst"), recursive = TRUE, full.names = TRUE) 20 | pf <- rev(list.files(file.path(dir, "fusion"), pattern = paste0(yr, "_._fused.fsd"), recursive = TRUE, full.names = TRUE)) 21 | c(pa, pc, pf) 22 | }) %>% 23 | unlist() 24 | 25 | # Extract the 'var' present in each file in 'fpaths' 26 | vlist <- if (is.null(var)) { 27 | lapply(fpaths, function(x) fst::metadata_fst(x)$columnNames) 28 | } else { 29 | lapply(fpaths, function(x) intersect(var, fst::metadata_fst(x)$columnNames)) 30 | } 31 | 32 | acs.year <- str_extract(basename(fpaths) , "(\\d{4})(?=_[^_]*_[^_]*$)") 33 | rtype <- str_extract(basename(fpaths), ".(?=_[^_]*$)") 34 | rtype <- case_when(rtype == "H" ~ "Household", rtype == "P" ~ "Person") 35 | source <- basename(fpaths) %>% 36 | str_extract("^[^_]*_[^_]*") %>% 37 | str_split(pattern = "_", n = 2, simplify = TRUE) %>% 38 | as.data.frame() %>% 39 | setNames(c('survey', 'vintage')) %>% 40 | mutate(source = 1:n()) 41 | 42 | out <- vlist %>% 43
| tibble::enframe(name = "source", value = "var") %>% 44 | left_join(source, by = join_by(source)) %>% 45 | mutate(respondent = rtype, 46 | acs_year = acs.year, 47 | path = gsub(dir, "", fpaths, fixed = TRUE), 48 | source = NULL) %>% 49 | tidyr::unnest(var) %>% 50 | filter(!var %in% c('M', 'year', 'hid', 'weight'), !str_detect(var, "^rep_\\d*$")) %>% 51 | #left_join(dictionary, by = join_by(var == variable, survey, vintage, respondent)) %>% 52 | #select(var, acs_year, respondent, survey, vintage, description, type, values, path) 53 | select(var, acs_year, respondent, survey, vintage, path) %>% 54 | arrange(across(everything())) 55 | 56 | return(out) 57 | 58 | } 59 | 60 | -------------------------------------------------------------------------------- /man/monotonic.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/monotonic.R 3 | \name{monotonic} 4 | \alias{monotonic} 5 | \title{Create a monotonic relationship between two variables} 6 | \usage{ 7 | monotonic( 8 | x, 9 | y, 10 | w = NULL, 11 | preserve = TRUE, 12 | expend = TRUE, 13 | fast = TRUE, 14 | nmax = 5000, 15 | plot = FALSE 16 | ) 17 | } 18 | \arguments{ 19 | \item{x}{Numeric.} 20 | 21 | \item{y}{Numeric.} 22 | 23 | \item{w}{Numeric. Optional observation weights.} 24 | 25 | \item{preserve}{Logical. Preserve the original mean of the \code{y} values in the returned values?} 26 | 27 | \item{expend}{Logical. Assume \code{y} is an expenditure variable? If \code{TRUE}, a safety check is implemented to ensure \code{y > 0} when \code{x > 0}.} 28 | 29 | \item{fast}{Logical. If \code{TRUE}, only \code{\link[scam]{supsmu}} is used with coercion of result to monotone.} 30 | 31 | \item{nmax}{Integer. Maximum number of observations to use for smoothing. Set lower for faster computation. \code{nmax = Inf} eliminates sampling.} 32 | 33 | \item{plot}{Logical. Plot the (sampled) data points and derived monotonic relationship?} 34 | } 35 | \value{ 36 | A numeric vector of modified \code{y} values. Optionally, a plot showing the returned monotonic relationship. 37 | } 38 | \description{ 39 | \code{monotonic()} returns modified values of input vector \code{y} that are smoothed, monotonic, and consistent across all values of input \code{x}. It was designed to be used post-fusion when one wants to ensure a plausible relationship between consumption (\code{x}) and expenditure (\code{y}), under the assumption that all consumers face an identical, monotonic pricing structure. By default, the mean of the returned values is forced to equal the original mean of \code{y} (\code{preserve = TRUE}). The direction of monotonicity (increasing or decreasing) is detected automatically, so use cases are not limited to consumption and expenditure variables. 40 | } 41 | \details{ 42 | The initial smoothing is accomplished via \code{\link[scam]{supsmu}} with the result coerced to monotone. If \code{fast = FALSE} and the coercion step modifies the values too much, a second smooth is attempted via a \code{\link[scam]{scam}} model with either a monotone increasing or decreasing constraint. If the SCAM fails to fit, the function falls back to \code{\link[stats]{lm}} with simple linear predictions. If \code{y = 0} when \code{x = 0} (as typical for consumption-expenditure variables), then that outcome is enforced in the result. The input data are randomly sampled to no more than \code{nmax} observations, if necessary, for speed. 
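A rough sketch of the smooth-then-coerce idea described above, using the package's \code{recs} data (illustrative only; this is not the internal implementation of \code{monotonic()}, which also handles direction detection, the SCAM/lm fallbacks, and the zero-consumption case):

fit <- stats::supsmu(recs$propane_btu, recs$propane_expend)         # initial smooth of y on x
mono <- cummax(fit$y)                                               # coerce the smoothed values to be non-decreasing
yhat <- approx(fit$x, mono, xout = recs$propane_btu, rule = 2)$y    # map the monotone curve back to the original x values
yhat <- yhat * mean(recs$propane_expend) / mean(yhat)               # rescale so the original mean of y is preserved (preserve = TRUE)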
43 | } 44 | \examples{ 45 | y <- monotonic(x = recs$propane_btu, y = recs$propane_expend, plot = TRUE) 46 | mean(recs$propane_expend) 47 | mean(y) 48 | } 49 | -------------------------------------------------------------------------------- /man/assemble.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/assemble.R 3 | \name{assemble} 4 | \alias{assemble} 5 | \title{Assemble fusionACS microdata across surveys} 6 | \usage{ 7 | assemble( 8 | year, 9 | var, 10 | respondent = "household", 11 | M = 1, 12 | df = NULL, 13 | cores = 1, 14 | source = "all", 15 | silent = FALSE 16 | ) 17 | } 18 | \arguments{ 19 | \item{year}{Integer. One or more years for which to return results (i.e. the ACS recipient year).} 20 | 21 | \item{var}{Character. Name of one or more variables to return. May contain household- and/or person-level variables. See Details.} 22 | 23 | \item{respondent}{Character. Should \code{"household"}- or \code{"person"}-level microdata be returned?} 24 | 25 | \item{M}{Integer. The first \code{M} implicates are returned for fused variables. Set \code{M = Inf} to return all implicates. Ignored if \code{var} contains only ACS variables (i.e. no implicates)} 26 | 27 | \item{df}{Data frame. Data frame used to identify a subset of rows to return. Default is to return all rows.} 28 | 29 | \item{cores}{Integer. Number of cores used by the \code{\link[fst]{fst-package}} when reading from disk.} 30 | 31 | \item{source}{Character Specifies where to look for \code{var}: all available microdata (\code{source = "all"}); only ACS microdata (\code{source = "ACS"}); or only fused microdata (\code{source = "fused"}). Note that no observation weights are returned if \code{source = "fused"}, since weights are stored in the ACS microdata.} 32 | 33 | \item{silent}{Logical. If \code{FALSE}, a warning is issued if any \code{var} cannot be located in available local files.} 34 | } 35 | \value{ 36 | A keyed data table containing the following columns, in addition to the variables named in \code{var}: 37 | \describe{ 38 | \item{M}{Implicate number. See \code{\link{fuse}}.} 39 | \item{year}{Year of the ACS recipient microdata.} 40 | \item{hid}{ACS household ID using fusionACS convention.} 41 | \item{pid}{ACS person ID using fusionACS convention, if \code{respondent = "person"}.} 42 | \item{weight}{ACS microdata primary sample weight.} 43 | } 44 | } 45 | \description{ 46 | For fusionACS usage only. Provides a safe and efficient way to assemble (merge) fused microdata across surveys to return a single data table with the requested variables. The requested variables can come from any fused (donor) survey and/or the American Community Survey (ACS). The necessary variables are automatically and efficiently read from the appropriate local file and safely merged on household and/or person ID variables, optionally collapsing or expanding records as necessary depending on the \code{respondent} argument. Assumes (and checks for) a local \code{/fusionData} directory with appropriate file structure and conventions. 47 | } 48 | \details{ 49 | The \code{var} argument can contain a mix of household- and/or person-level variables. When \code{respondent = "household"}, the reference person (i.e. head of household) value is returned for any person-level variables. When \code{respondent = "person"}, the values of any household-level variables are replicated for each person in the household. 
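A toy illustration of the merge behavior described above, using hypothetical columns rather than actual fusionACS microdata (\code{tenure} and the tiny example tables are made up for this sketch):

hh  <- data.frame(hid = 1:2, tenure = c("Own", "Rent"))                       # household-level variable
per <- data.frame(hid = c(1, 1, 2), pid = c(1, 2, 1), agep = c(40, 38, 29))   # person-level variable
# respondent = "person": household-level values are replicated for each person in the household
dplyr::left_join(per, hh, by = "hid")
# respondent = "household": person-level values are taken from the reference person (pid == 1)
dplyr::right_join(dplyr::filter(per, pid == 1), hh, by = "hid")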
50 | } 51 | \examples{ 52 | # NOTE: Requires local /fusionData directory containing the necessary ACS and .fsd files 53 | test <- assemble(year = 2018:2019, 54 | var = c("dollarel", "hincp", "agep", "state"), 55 | respondent = "household", 56 | M = 1) 57 | } 58 | -------------------------------------------------------------------------------- /man/prepXY.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/prepXY.R 3 | \name{prepXY} 4 | \alias{prepXY} 5 | \title{Prepare the 'x' and 'y' inputs} 6 | \usage{ 7 | prepXY( 8 | data, 9 | y, 10 | x, 11 | weight = NULL, 12 | cor_thresh = 0.05, 13 | lasso_thresh = 0.95, 14 | xmax = 100, 15 | xforce = NULL, 16 | fraction = 1, 17 | cores = 1 18 | ) 19 | } 20 | \arguments{ 21 | \item{data}{Data frame. Training dataset. All categorical variables should be factors and ordered whenever possible.} 22 | 23 | \item{y}{Character or list. Variables in \code{data} to eventually fuse to a recipient dataset. If \code{y} is a list, each entry is a character vector possibly indicating multiple variables to fuse as a block.} 24 | 25 | \item{x}{Character. Predictor variables in \code{data} common to donor and eventual recipient.} 26 | 27 | \item{weight}{Character. Name of the observation weights column in \code{data}. If NULL (default), uniform weights are assumed.} 28 | 29 | \item{cor_thresh}{Numeric. Predictors that exhibit less than \code{cor_thresh} absolute Spearman (rank) correlation with a \code{y} variable are screened out prior to the LASSO step. Fast exclusion of predictors that the LASSO step probably doesn't need to consider.} 30 | 31 | \item{lasso_thresh}{Numeric. Controls how aggressively the LASSO step screens out predictors. Lower value is more aggressive. \code{lasso_thresh = 0.95}, for example, retains predictors that collectively explain at least 95\% of the deviance explained by a "full" model.} 32 | 33 | \item{xmax}{Integer. Maximum number of predictors returned by LASSO step. Does not strictly control the number of final predictors returned (especially for categorical \code{y} variables), but useful for setting a (very) soft upper bound. Lower \code{xmax} can help control computation time if a large number of \code{x} pass the correlation screen. \code{xmax = Inf} imposes no restriction.} 34 | 35 | \item{xforce}{Character. Subset of \code{x} variables to "force" as included predictors in the results.} 36 | 37 | \item{fraction}{Numeric. Fraction of observations in \code{data} to randomly sample. For larger datasets, sampling often has minimal effect on results but speeds up computation.} 38 | 39 | \item{cores}{Integer. Number of cores used. Only applicable on Unix systems.} 40 | } 41 | \value{ 42 | List with named slots "y" and "x". Each is a list of the same length. Former gives the preferred fusion order. Latter gives the preferred sets of predictor variables. 43 | } 44 | \description{ 45 | Optional-but-useful function to: 1) provide a plausible ordering of the 'y' (fusion) variables and 2) identify the subset of 'x' (predictor) variables likely to be consequential during subsequent model training. Output can be passed directly to \code{\link{train}}. Most useful for large datasets with many and/or highly-correlated predictors. 
Employs an absolute Spearman rank correlation screen and then LASSO models (via \code{\link[glmnet]{glmnet}}) to return a plausible ordering of 'y' and the preferred subset of 'x' variables associated with each. 46 | } 47 | \examples{ 48 | y <- names(recs)[c(14:16, 20:22)] 49 | x <- names(recs)[2:13] 50 | 51 | # Fusion variable "blocks" are respected by prepXY() 52 | y <- c(list(y[1:2]), y[-c(1:2)]) 53 | 54 | # Do the prep work... 55 | prep <- prepXY(data = recs, y = y, x = x) 56 | 57 | # The result can be passed to train() 58 | train(data = recs, y = prep$y, x = prep$x) 59 | 60 | } 61 | -------------------------------------------------------------------------------- /R/importance.R: -------------------------------------------------------------------------------- 1 | #' Extract predictor variable importance from a fusion model 2 | #' 3 | #' @description 4 | #' Returns predictor variable (feature) importance of underlying LightGBM models stored in a fusion model file (.fsn) on disk. 5 | #' @param fsn Character. Path to fusion model file (.fsn) generated by \code{\link{train}}. 6 | #' @details Importance metrics are computed via \code{\link[lightgbm]{lgb.importance}}. Three types of measures are returned; "gain" is typically the preferred measure. 7 | #' @return A named list containing \code{detailed} and \code{summary} importance results. The \code{summary} results are most useful, as they return the average importance of each predictor across potentially multiple underlying LightGBM models; i.e. zero ("z"), mean ("m"), or quantile ("q") models. See Examples for suggested plotting of results. 8 | #' @examples 9 | #' # Build a fusion model using RECS microdata 10 | #' # Note that "fusion_model.fsn" will be written to working directory 11 | #' ?recs 12 | #' fusion.vars <- c("electricity", "natural_gas", "aircon") 13 | #' predictor.vars <- names(recs)[2:12] 14 | #' fsn.path <- train(data = recs, y = fusion.vars, x = predictor.vars) 15 | #' 16 | #' # Extract predictor variable importance 17 | #' ximp <- importance(fsn.path) 18 | #' 19 | #' # Plot summary results 20 | #' library(ggplot2) 21 | #' ggplot(ximp$summary, aes(x = x, y = gain)) + 22 | #' geom_bar(stat = "identity") + 23 | #' facet_grid(~ y) + 24 | #' coord_flip() 25 | #' 26 | #' # View detailed results 27 | #' View(ximp$detailed) 28 | #' @export 29 | 30 | importance <- function(fsn) { 31 | 32 | stopifnot(exprs = { 33 | file.exists(fsn) & endsWith(fsn, ".fsn") 34 | }) 35 | 36 | # Temporary directory to unzip .fsn contents to 37 | td <- tempfile() 38 | dir.create(td) 39 | 40 | # Names of files within the .fsn object 41 | fsn.files <- zip::zip_list(fsn) 42 | pfixes <- sort(unique(dirname(fsn.files$filename))) 43 | pfixes <- pfixes[pfixes != "."] 44 | 45 | # Unzip all files in .fsn to temporary directory 46 | zip::unzip(zipfile = fsn, exdir = td) 47 | 48 | # Extract full-detail variable importance metrics 49 | detail <- lapply(1:length(pfixes), FUN = function(i) { 50 | mods <- grep(pattern = utils::glob2rx(paste0(pfixes[i], "*.txt")), x = fsn.files$filename, value = TRUE) 51 | mods %>% 52 | lapply(FUN = function(m) { 53 | y <- sub("_[^_]+$", "", basename(m)) 54 | n <- sub(".txt", "", sub(paste0(y, "_"), "", basename(m), fixed = TRUE), fixed = TRUE) 55 | mod <- lightgbm::lgb.load(filename = file.path(td, m)) 56 | imp <- lightgbm::lgb.importance(mod) %>% 57 | mutate(y = y, 58 | model = n) 59 | return(imp) 60 | }) %>% 61 | data.table::rbindlist() 62 | }) %>% 63 | data.table::rbindlist() %>% 64 | rename_with(tolower) %>% 65 | rename(x = 
feature) %>% 66 | select(y, model, x, everything()) %>% 67 | as.data.frame() 68 | 69 | # Generate summary of full-detail results 70 | smry <- detail %>% 71 | group_by(y, x) %>% 72 | summarize(across(gain:frequency, mean), .groups = "drop_last") %>% 73 | mutate(across(gain:frequency, ~ .x / sum(.x))) %>% # Ensure percentages equal 1 74 | ungroup() %>% 75 | arrange(y, -gain) 76 | 77 | # Recommended plotting order for 'x' 78 | yvars <- unique(detail$y) 79 | xord <- smry %>% 80 | filter(!x %in% yvars) %>% 81 | group_by(x) %>% 82 | summarize(across(gain, mean)) %>% 83 | arrange(gain) %>% 84 | pull(x) 85 | 86 | # Set factor levels for summary 'x' and 'x' variables for suitable plotting 87 | smry$y <- factor(smry$y, levels = yvars) 88 | smry$x <- factor(smry$x, levels = c(rev(yvars), xord)) 89 | 90 | out <- list(summary = smry, detailed = detail) 91 | return(out) 92 | 93 | } 94 | -------------------------------------------------------------------------------- /man/fuse.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fuse.R 3 | \name{fuse} 4 | \alias{fuse} 5 | \title{Fuse variables to a recipient dataset} 6 | \usage{ 7 | fuse( 8 | data, 9 | fsn, 10 | fsd = NULL, 11 | M = 1, 12 | retain = NULL, 13 | kblock = 10, 14 | margin = 2, 15 | cores = 1 16 | ) 17 | } 18 | \arguments{ 19 | \item{data}{Data frame. Recipient dataset. All categorical variables should be factors and ordered whenever possible. Data types and levels are strictly validated against predictor variables defined in \code{fsn}.} 20 | 21 | \item{fsn}{Character. Path to fusion model file (.fsn) generated by \code{\link{train}}.} 22 | 23 | \item{fsd}{Character. Optional fusion output file to be created ending in \code{.fsd} (i.e. "fused data"). This is a compressed binary file that can be read using the \code{\link[fst]{fst}} package. If \code{fsd = NULL} (the default), the fusion results are returned as a \code{\link[data.table]{data.table}}.} 24 | 25 | \item{M}{Integer. Number of implicates to simulate.} 26 | 27 | \item{retain}{Character. Names of columns in \code{data} that should be retained in the output; i.e. repeated across implicates. Useful for retaining ID or weight variables for use in subsequent analysis of fusion output.} 28 | 29 | \item{kblock}{Integer. Fixed number of nearest neighbors to use when fusing variables in a block. Must be >= 5 and <= 30. Not applicable for variables fused on their own (i.e. no block).} 30 | 31 | \item{margin}{Numeric. Safety margin used when estimating how many implicates can be processed in memory at once. Set higher if \code{fuse()} experiences a memory shortfall. Alternatively, can be set to a negative value to manually specify the number of chunks to use. For example, \code{margin = -3} splits \code{M} implicates into three chunks of approximately equal size.} 32 | 33 | \item{cores}{Integer. Number of cores used. LightGBM prediction is parallel-enabled on all systems if OpenMP is available.} 34 | } 35 | \value{ 36 | If \code{fsd = NULL}, a \code{\link[data.table]{data.table}} with number of rows equal to \code{M * nrow(data)}. Integer column "M" indicates implicate assignment of each observation. Note that the ordering of recipient observations is consistent within implicates, so do not change the row order if using with \code{\link{analyze}}. 37 | 38 | If \code{fsd} is specified, the path to .fsd file where results were written. 
Metadata for column classes and factor levels are stored in the column names. \code{\link{read_fsd}} should be used to load files saved via the \code{fsd} argument. 39 | } 40 | \description{ 41 | Fuse variables to a recipient dataset using a .fsn model produced by \code{\link{train}}. Output can be passed to \code{\link{analyze}} and \code{\link{validate}}. 42 | } 43 | \details{ 44 | TO UPDATE. 45 | } 46 | \examples{ 47 | # Build a fusion model using RECS microdata 48 | # Note that "fusion_model.fsn" will be written to working directory 49 | ?recs 50 | fusion.vars <- c("electricity", "natural_gas", "aircon") 51 | predictor.vars <- names(recs)[2:12] 52 | fsn.path <- train(data = recs, y = fusion.vars, x = predictor.vars) 53 | 54 | # Generate single implicate of synthetic 'fusion.vars', 55 | # using original RECS data as the recipient 56 | recipient <- recs[predictor.vars] 57 | sim <- fuse(data = recipient, fsn = fsn.path) 58 | head(sim) 59 | 60 | # Calling fuse() again produces different results 61 | sim <- fuse(data = recipient, fsn = fsn.path) 62 | head(sim) 63 | 64 | # Generate multiple implicates 65 | sim <- fuse(data = recipient, fsn = fsn.path, M = 5) 66 | head(sim) 67 | table(sim$M) 68 | 69 | # Optionally, write results directly to disk 70 | # Note that "results.fsd" will be written to working directory 71 | sim <- fuse(data = recipient, fsn = fsn.path, M = 5, fsd = "results.fsd") 72 | sim <- read_fsd(sim) 73 | head(sim) 74 | 75 | } 76 | -------------------------------------------------------------------------------- /man/validate.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/validate.R 3 | \name{validate} 4 | \alias{validate} 5 | \title{Validate fusion output} 6 | \usage{ 7 | validate( 8 | observed, 9 | implicates, 10 | subset_vars, 11 | weight = NULL, 12 | min_size = 30, 13 | plot = TRUE, 14 | cores = 1 15 | ) 16 | } 17 | \arguments{ 18 | \item{observed}{Data frame. Observed data against which to validate the \code{simulated} variables. Typically the same dataset used to \code{\link{train}} the fusion model used to generate \code{simulated}.} 19 | 20 | \item{implicates}{Data frame. Implicates of synthetic (fused) variables. Typically generated by \link{fuse}. The implicates should be row-stacked and identified by integer column "M".} 21 | 22 | \item{subset_vars}{Character. Vector of columns in \code{observed} used to define the population subsets across which the fusion variables are validated. The levels of each \code{subset_vars} (including all two-way interactions of \code{subset_vars}) define the population subsets. Continuous \code{subset_vars} are converted to a five-level ordered factor based on a univariate k-means clustering.} 23 | 24 | \item{weight}{Character. Name of the observation weights column in \code{observed}. If NULL (default), uniform weights are assumed.} 25 | 26 | \item{min_size}{Integer. Subsets with less than \code{min_size} observations are excluded. Since subsets with few observations are unlikely to give reliable estimates, it doesn't make sense to consider them for validation purposes.} 27 | 28 | \item{plot}{Logical. If TRUE (default), \code{\link{plot_valid}} is called internally and summary plots are returned along with complete validation results. Requires the \code{\link{ggplot2}} package.} 29 | 30 | \item{cores}{Integer. Number of cores used. 
Only applicable on Unix systems.} 31 | } 32 | \value{ 33 | If \code{plot = FALSE}, a data frame containing complete validation results. If \code{plot = TRUE}, a list containing full results as well as additional plot objects as described in \code{\link{plot_valid}}. 34 | } 35 | \description{ 36 | Performs internal validation analyses on fused microdata to estimate how well the simulated variables reflect patterns in the dataset used to train the underlying fusion model (i.e. observed/donor data). This provides a standard approach to validating fusion output and associated models. See Examples for recommended usage. 37 | } 38 | \details{ 39 | The objective of \code{\link{validate}} is to confirm that the fusion output is sensible and help establish the utility of the synthetic data across myriad analyses. Utility here is based on comparison of point estimates and confidence intervals derived using multiple-implicate synthetic data with those derived using the original donor data. 40 | 41 | The specific analyses tested include variable levels (means and proportions) across population subsets of varying size. This allows estimates of how each of the synthetic variables performs in analyses with real-world relevance, at varying levels of complexity. In effect, \code{validate()} performs a large number of analyses of the kind that the \code{\link{analyze}} function is designed to do on a one-by-one basis. 42 | 43 | Most users will want to use the default setting \code{plot = TRUE} to simultaneously return visualization (plots) of the validation results. Plot creation is detailed in \code{\link{plot_valid}}. 44 | } 45 | \examples{ 46 | # Build a fusion model using RECS microdata 47 | # Note that "fusion_model.fsn" will be written to working directory 48 | fusion.vars <- c("electricity", "natural_gas", "aircon") 49 | predictor.vars <- names(recs)[2:12] 50 | fsn.path <- train(data = recs, 51 | y = fusion.vars, 52 | x = predictor.vars, 53 | weight = "weight") 54 | 55 | # Fuse back onto the donor data (multiple implicates) 56 | sim <- fuse(data = recs, 57 | fsn = fsn.path, 58 | M = 20) 59 | 60 | # Calculate validation results 61 | valid <- validate(observed = recs, 62 | implicates = sim, 63 | subset_vars = c("income", "education", "race", "urban_rural")) 64 | 65 | } 66 | -------------------------------------------------------------------------------- /man/plot_valid.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot_valid.R 3 | \name{plot_valid} 4 | \alias{plot_valid} 5 | \title{Plot validation results} 6 | \usage{ 7 | plot_valid(valid, y = NULL, path = NULL, cores = 1, ...) 8 | } 9 | \arguments{ 10 | \item{valid}{Object returned by \code{\link{validate}}.} 11 | 12 | \item{y}{Character. Fusion variables to use for validation graphics. Useful for plotting partial validation results. Default is to use all fusion variables present in \code{valid}.} 13 | 14 | \item{path}{Character. Path to directory where .png graphics are to be saved. Directory is created if necessary. If NULL (default), no files are saved to disk.} 15 | 16 | \item{cores}{Integer. Number of cores used. Only applicable on Unix systems.} 17 | 18 | \item{...}{Arguments passed to \code{\link[ggplot2]{ggsave}} to control .png graphics saved to disk.} 19 | } 20 | \value{ 21 | A list with "plots", "smooth", and "data" slots.
The "plots" slot contains the following \code{\link[ggplot2]{ggplot}} objects: 22 | \itemize{ 23 | \item est: Comparison of point estimates (median absolute percent error). 24 | \item moe: Comparison of 90\% margin of error (median ratio of simulated-to-observed MOE). 25 | \item Additional named slots (one for each of the fusion variables) contain the plots described above with scatterplot results. 26 | } 27 | "smooth" is a data frame with the plotting values used to produce the smoothed median plots. 28 | "data" is a data frame with the complete validation results as returned by the original call to \code{\link{validate}}. 29 | } 30 | \description{ 31 | Creates and optionally saves to disk representative plots of validation results returned by \code{\link{validate}}. Requires the suggested \code{\link{ggplot2}} package. This function is (by default) called within \code{\link{validate}}. Can be useful on its own to save graphics to disk or generate plots for a subset of fusion variables. 32 | } 33 | \details{ 34 | Validation results are visualized to convey expected, typical (median) performance of the fusion variables. That is, how well do the simulated data match the observed data with respect to point estimates and confidence intervals for population subsets of various size? 35 | 36 | Plausible error metrics are derived from the input validation data for plotting. For comparison of point estimates, the error metric is absolute percent error for continuous variables; in the categorical case it is absolute error scaled such that the maximum possible error is 1. Since these metrics are not strictly comparable, the all-variable plots denote categorical fusion variables with dotted lines. 37 | 38 | For a given fusion variable, the error metric will exhibit variation (often quite skewed) even for subsets of comparable size, due to the fact that each subset looks at a unique partition of the data. In order to convey how expected, typical performance varies with subset size, the smoothed median error conditional on subset size is approximated and plotted. 
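As a rough illustration of the smoothed conditional median idea (simulated error values and a simple running median; this is not the internal implementation used by \code{plot_valid()}):

set.seed(1)
res <- data.frame(size = sort(sample(30:5000, 500, replace = TRUE)))   # subset sizes
res$error <- 100 * abs(rnorm(500, sd = 1 / sqrt(res$size)))            # skewed error metric that shrinks as subsets grow
res$med <- stats::runmed(res$error, k = 31)                            # running median as a simple smoother
plot(res$size, res$error, col = "grey", log = "x", xlab = "Subset size", ylab = "Error metric")
lines(res$size, res$med, lwd = 2)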
39 | } 40 | \examples{ 41 | # Build a fusion model using RECS microdata 42 | # Note that "fusion_model.fsn" will be written to working directory 43 | fusion.vars <- c("electricity", "natural_gas", "aircon") 44 | predictor.vars <- names(recs)[2:12] 45 | fsn.path <- train(data = recs, 46 | y = fusion.vars, 47 | x = predictor.vars, 48 | weight = "weight") 49 | 50 | # Fuse back onto the donor data (multiple implicates) 51 | sim <- fuse(data = recs, 52 | fsn = fsn.path, 53 | M = 30) 54 | 55 | # Calculate validation results but do not generate plots 56 | valid <- validate(observed = recs, 57 | implicates = sim, 58 | subset_vars = c("income", "education", "race", "urban_rural"), 59 | weight = "weight", 60 | plot = FALSE) 61 | 62 | # Create validation plots 63 | valid <- plot_valid(valid) 64 | 65 | # View some of the plots 66 | valid$plots$est 67 | valid$plots$moe 68 | valid$plots$electricity$bias 69 | 70 | # Can also save the plots to disk at creation 71 | # Will save .png files to 'valid_plots' folder in working directory 72 | # Note that it is fine to pass a 'valid' object with existing $plots slot 73 | # In that case, the plots are simply re-generated 74 | vplots <- plot_valid(valid, 75 | path = file.path(getwd(), "valid_plots"), 76 | width = 8, height = 6) 77 | 78 | } 79 | -------------------------------------------------------------------------------- /R/fitLGB.R: -------------------------------------------------------------------------------- 1 | fitLGB <- function(dfull, dtrain = NULL, dvalid = NULL, cv.folds = NULL, hyper.grid, params.obj) { 2 | 3 | # If full cross-validation is requested... 4 | if (is.list(cv.folds)) { 5 | perf <- lapply(hyper.grid, FUN = function(x) { 6 | sink <- utils::capture.output({ 7 | mod <- lightgbm::lgb.cv( 8 | params = c(as.list(x), params.obj), 9 | data = dfull, 10 | folds = cv.folds, 11 | early_stopping_rounds = 2L, 12 | verbose = -1L 13 | ) 14 | }) 15 | data.frame(best_score = mod$best_score, best_iter = mod$best_iter) 16 | }) 17 | } 18 | 19 | # If training/test-set validation is requested... 20 | if (is.logical(cv.folds)) { 21 | perf <- lapply(hyper.grid, FUN = function(x) { 22 | p <- c(as.list(x), params.obj) 23 | sink <- utils::capture.output({ 24 | mod <- lightgbm::lgb.train( 25 | params = p, 26 | data = dtrain, 27 | valids = list(valid = dvalid), 28 | early_stopping_rounds = 2L, 29 | verbose = -1L 30 | ) 31 | }) 32 | data.frame(best_score = mod$best_score, best_iter = mod$best_iter) 33 | }) 34 | } 35 | 36 | # # If no validation is requested; i.e.
over-fitting scenario 37 | # if (is.null(dvalid) & is.null(cv.folds)) { 38 | # perf <- lapply(hyper.grid, FUN = function(x) { 39 | # p <- c(as.list(x), params.obj) 40 | # sink <- utils::capture.output({ 41 | # mod <- lightgbm::lgb.train( 42 | # params = p, 43 | # data = dfull, 44 | # verbose = -1L 45 | # ) 46 | # }) 47 | # # Can't get it to return training loss 48 | # # Return the training log-loss at the maximum number of iterations 49 | # #train.evals <- unlist(mod$record_evals$train) 50 | # #c(min(train.evals), which.min(train.evals)) 51 | # c(NA, NA) 52 | # }) 53 | # } 54 | 55 | # Compare validation set performance across hyper-parameter sets 56 | comp <- bind_rows(perf) 57 | opt <- which.min(comp$best_score) 58 | # This is not ideal -- overfitting process should return training loss and work with multiple hypergrid options (instead of just choosing #1) 59 | if (length(opt) == 0) { 60 | params.opt <- hyper.grid[[1]] 61 | } else { 62 | params.opt <- hyper.grid[[opt]] 63 | params.opt$num_iterations <- as.integer(comp$best_iter[[opt]]) 64 | } 65 | 66 | # Fit final model using full dataset and optimal parameter values 67 | mod <- lightgbm::lgb.train( 68 | params = c(params.opt, params.obj), 69 | data = dfull, 70 | verbose = -1L 71 | ) 72 | 73 | # Add the optimal validation score and number of iterations to the 'mod' object 74 | # These are NA and -1, by default, which doesn't provide any useful information 75 | mod$best_score <- as.numeric(comp$best_score[opt]) 76 | mod$best_iter <- params.opt$num_iterations 77 | 78 | # Storing hyper results in 'record_evals' slot, since adding a custom slot is not allowed 79 | hyper.results <- cbind(bind_rows(hyper.grid), comp, final_model = FALSE) 80 | hyper.results$final_model[opt] <- TRUE 81 | mod$record_evals <- hyper.results 82 | 83 | # Plot the evolution of the loss function 84 | #plot(unlist(mod$record_evals[[2]])) 85 | 86 | return(mod) 87 | 88 | } 89 | 90 | #--- 91 | #!!! NOTE: It appears callbacks are not (yet) exported but that could be coming soon. 92 | # See here: https://github.com/microsoft/LightGBM/pull/5018 93 | # https://github.com/Microsoft/LightGBM/blob/master/R-package/R/callback.R 94 | # https://stackoverflow.com/questions/54027734/adding-callbacks-to-lightgbm-in-r 95 | # IMPORTANT -- this is presumably the way to set the 'min_delta' argument: https://github.com/ummel/fusionModel/issues/24 96 | # IN PROGRESS: https://github.com/microsoft/LightGBM/pull/5018#pullrequestreview-892006092 97 | # Add here: https://github.com/microsoft/LightGBM/pull/5123 ( appears to be merged?!?!?) 98 | # mod.cv <- lightgbm::lgb.cv( 99 | # params = c(params.global, obj, min_data_in_leaf = min.leaf), 100 | # data = dfull, 101 | # nrounds = 500L, # See note at top about min_delta. Could set higher once min_delta is allowed... 102 | # folds = cv.folds, 103 | # early_stopping_rounds = 1L, 104 | # verbose = -1L, 105 | # callbacks = list(cb_early_stop(verbose = FALSE)) # !!! 
EXAMPLE -- may suppress console output 106 | # ) 107 | #--- 108 | 109 | -------------------------------------------------------------------------------- /man/fusionOutput.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fusionOutput.R 3 | \name{fusionOutput} 4 | \alias{fusionOutput} 5 | \title{Generate output files resulting from fusion} 6 | \usage{ 7 | fusionOutput( 8 | donor, 9 | respondent, 10 | acs_year, 11 | fusion_vars, 12 | M = 1, 13 | input_version = "latest", 14 | output_version = "today", 15 | fsn = NULL, 16 | rlocation = NULL, 17 | note = NULL, 18 | test_mode = TRUE, 19 | validation = TRUE, 20 | ncores = 1, 21 | margin = 4, 22 | ... 23 | ) 24 | } 25 | \arguments{ 26 | \item{donor}{Character. Donor survey identifier (e.g. \code{"RECS_2015"}).} 27 | 28 | \item{respondent}{Character. Desired respondent level of microdata. Either \code{"household"} or \code{"person"}.} 29 | 30 | \item{acs_year}{Integer. Year of the ACS microdata to be used as recipient.} 31 | 32 | \item{fusion_vars}{Character. Vector of variables in the donor microdata to be fused to the ACS microdata recipient.} 33 | 34 | \item{M}{Integer. Desired number of fusion implicates. If \code{M = NULL} (default) it is internally set to 40 or, if \code{test_mode = TRUE}, 2 implicates.} 35 | 36 | \item{input_version}{Character. Version (i.e. date stamp) of the desired fusion inputs. Defaults to the latest available version.} 37 | 38 | \item{output_version}{Character. Version (i.e. date stamp) to assign to the outputs. Defaults to today's date.} 39 | 40 | \item{fsn}{Character. Optional path to a .fsn object created by a previous call to \code{fusionOutput()}. If specified, code attempts to use \code{fsn} to bypass the training step.} 41 | 42 | \item{rlocation}{Data frame. Optional data frame to replace the donor input data imputed respondent locations with actual/disclosed locations specified in \code{rlocation}.} 43 | 44 | \item{note}{Character. Optional note supplied by user. Inserted in the log file for reference.} 45 | 46 | \item{test_mode}{Logical. If \code{test_mode = TRUE} (default), the result files are always saved within a "/fusion_" directory in \code{output} (possibly created); faster hyperparameters are used for \code{\link[fusionModel]{train}}; and the internal validation step is skipped by default.} 47 | 48 | \item{validation}{Logical or integer. Controls execution of internal validation (Steps 3 and 4). If \code{validation = 0} or \code{FALSE}, neither step is performed (default when \code{test_mode = TRUE}). If \code{1}, only Step 3. If \code{2} or \code{TRUE}, both Steps 3 and 4.} 49 | 50 | \item{ncores}{Integer. Number of physical CPU cores used for parallel computation.} 51 | 52 | \item{margin}{Numeric. Passed to same argument in \code{\link[fusionModel]{fuse}}.} 53 | 54 | \item{...}{Optional, non-default arguments passed to \code{\link[fusionModel]{train}}. For example, \code{fork = TRUE} to enable forked parallel processing.} 55 | } 56 | \value{ 57 | Saves resulting \code{output} data files to appropriate local directory. Also saves a .txt log file alongside data files that records console output from \code{fusionOutput}. 58 | } 59 | \description{ 60 | Handles all operations needed to "do fusion" using input files generated by a successful call to \code{fusionInput}. 
Trains a fusion model, generates internal validation results, and then simulates multiple implicates for recipient microdata. 61 | } 62 | \details{ 63 | The function checks arguments and determines the file path to the appropriate \code{output} directory (creating it if necessary). The output files are always placed within the appropriate directory hierarchy, based on the donor and recipient information detected in the \code{input} file names. In practice, \code{output} need only be specified if working in an environment where the output files need to be located somewhere different from the input files. 64 | 65 | The function executes the following steps: 66 | \enumerate{ 67 | \item \strong{Load training data inputs}. Loads donor training microdata and results of \code{\link[fusionModel]{prepXY}}. 68 | \item \strong{Run fusionModel::train()}. Calls \code{\link[fusionModel]{train}} using sensible defaults and hyperparameters. If \code{test_mode = TRUE}, the hyperparameters are designed to do a fast/rough-and-ready model training. 69 | \item \strong{Fuse onto training data for internal validation}. Optional step (see \code{validation} argument). Fuses multiple implicates to original donor training data using \code{\link[fusionModel]{fuse}}. Results saved to disk. 70 | \item \strong{Run fusionModel::validate()}. Optional step (see \code{validation} argument). Passes previous step's results to \code{\link[fusionModel]{validate}}. Results saved to disk. 71 | \item \strong{Fuse onto prediction data}. Fuses multiple implicates to supplied input prediction data using \code{\link[fusionModel]{fuse}}. Results saved to disk. 72 | \item \strong{fusionOutput() is finished!} Upon completion, a log file named \code{"outputlog.txt"} is written to \code{output} for reference. 73 | } 74 | } 75 | \examples{ 76 | # Since 'test_mode = TRUE' by default, this will affect files in local '/fusion_' directory 77 | dir <- fusionInput(donor = "RECS_2015", 78 | recipient = "ACS_2015", 79 | respondent = "household", 80 | fuse = c("btung", "btuel", "cooltype"), 81 | force = c("moneypy", "householder_race", "education", "nhsldmem", "kownrent", "recs_division"), 82 | note = "Hello world. Reminder: running in test mode by default.") 83 | 84 | # List files in the /input directory 85 | list.files(dir) 86 | 87 | # Using default settings 88 | out <- fusionOutput(input = dir) 89 | list.files(out) 90 | 91 | } 92 | -------------------------------------------------------------------------------- /R/read_fsd.R: -------------------------------------------------------------------------------- 1 | #' Read fusion output from disk 2 | #' 3 | #' @description Efficiently read fusion output that was written to disk, optionally returning a subset of rows and/or columns. Since a \code{.fsd} file is simply a \code{\link[fst]{fst}} file under the hood, this function also works on any \code{.fst} file. 4 | #' @param path Character. Path to a \code{.fsd} (or \code{.fst}) file, typically produced by \code{\link{fuse}}. 5 | #' @param columns Character. Column names to read. The default is to return all columns. 6 | #' @param M Integer. The first \code{M} implicates are returned. Set \code{M = Inf} to return all implicates. Ignored if an \code{M} column is not present in the data. 7 | #' @param df Data frame. Data frame used to identify a subset of rows to return. Default is to return all rows. 8 | #' @param cores Integer. Number of cores used by \code{\link[fst]{fst}}.
9 | #' @details If \code{df} is provided and the file size on disk is less than 100 MB, then a full read and inner \code{\link[collapse]{join}} is performed. For larger files, a manual read of the required rows is performed, using \code{\link[collapse]{fmatch}} for the matching operation. 10 | #' @return A \code{\link[data.table]{data.table}}; keys are preserved if present in the on-disk data. When \code{path} points to a \code{.fsd} file, it includes an integer column "M" indicating the implicate assignment of each observation (unless explicitly ignored by \code{columns}). 11 | #' @examples 12 | #' # Build a fusion model using RECS microdata 13 | #' # Note that "fusion_model.fsn" will be written to working directory 14 | #' ?recs 15 | #' fusion.vars <- c("electricity", "natural_gas", "aircon") 16 | #' predictor.vars <- names(recs)[2:12] 17 | #' fsn.path <- train(data = recs, y = fusion.vars, x = predictor.vars) 18 | #' 19 | #' # Write fusion output directly to disk 20 | #' # Note that "results.fsd" will be written to working directory 21 | #' recipient <- recs[predictor.vars] 22 | #' sim <- fuse(data = recipient, fsn = fsn.path, M = 5, fsd = "results.fsd") 23 | #' 24 | #' # Read the fusion output saved to disk 25 | #' sim <- read_fsd(sim) 26 | #' head(sim) 27 | #' 28 | #' @export 29 | 30 | #----------- 31 | 32 | # TEST 33 | # library(tidyverse) 34 | # library(collapse) 35 | # library(data.table) 36 | # setwd("/home/kevin/Documents/Projects/fusionData") 37 | # 38 | # test <- read_fsd("fusion/RECS/2020/2019/RECS_2020_2019_H_fused.fsd", 39 | # columns = c("M", "dollarel"), 40 | # M = 5, 41 | # df = df, 42 | # cores = 3) 43 | 44 | #----------- 45 | 46 | read_fsd <- function(path, 47 | columns = NULL, 48 | M = 1, 49 | df = NULL, 50 | cores = max(1, parallel::detectCores(logical = FALSE) - 1)) { 51 | 52 | stopifnot(exprs = { 53 | endsWith(path, ".fsd") | endsWith(path, ".fst") 54 | file.exists(path) 55 | is.null(columns) | is.character(columns) 56 | is.null(df) | is.data.frame(df) 57 | M >= 1 58 | cores > 0 & cores %% 1 == 0 59 | }) 60 | 61 | #require(collapse, quietly = TRUE) 62 | n <- fst::threads_fst() 63 | fst::threads_fst(nr_of_threads = cores) 64 | 65 | meta <- fst::metadata_fst(path) 66 | if (is.null(columns)) { 67 | columns <- meta$columnNames 68 | } else { 69 | columns <- unique(columns) 70 | stopifnot(all(columns %in% meta$columnNames)) 71 | } 72 | 73 | # TURNED OFF FOR TESTING 74 | # Add 'M' column to 'df' to subset on the number of implicates 75 | # if (is.finite(M) & "M" %in% v) { 76 | # df <- if (nrow(df) == 0) { 77 | # data.table(M = 1:M) 78 | # } else { 79 | # df %>% 80 | # select(any_of(v)) %>% 81 | # select(-any_of("M")) %>% 82 | # unique() %>% 83 | # slice(rep(1:nrow(.), M)) %>% 84 | # mutate(M = rep(1:M, each = nrow(.)
/ M)) 85 | # } 86 | # } 87 | 88 | #----- 89 | 90 | # Determine which rows have the requested implicates (M) 91 | # Since the data are assumed to be sorted by M, this should yield consecutive integers 92 | 93 | if ('M' %in% meta$columnNames) { 94 | d <- fst::fst(path) 95 | i <- which(d$M <= M) 96 | stopifnot(!is.unsorted(i)) 97 | } else { 98 | i <- 1L:meta$nrOfRows # Return all rows in 'd', if no implicates column 'M' 99 | } 100 | 101 | #----- 102 | 103 | if (is.null(df)) { 104 | 105 | d <- fst::read_fst(path, columns = columns, from = i[1], to = i[length(i)], as.data.table = TRUE) 106 | 107 | } else { 108 | 109 | # Check for validity of 'df' column names 110 | stopifnot(all(names(df) %in% meta$columnNames)) 111 | df <- collapse::funique(df) 112 | 113 | # If the file size is less than 100 MB, simply read the full data and subset via collapse::join() 114 | if (file.size(path) / 1e6 < 100) { 115 | 116 | d <- fst::read_fst(path, columns = unique(c(columns, names(df))), from = i[1], to = i[length(i)], as.data.table = TRUE) 117 | d <- collapse::join(d, df, how = "inner", verbose = FALSE) 118 | d <- d[, ..columns] 119 | 120 | } else { 121 | 122 | d <- fst::fst(path) 123 | m <- qDT(d[i, names(df)]) 124 | i <- i[m %iin% df] # Uses 'collapse' package equivalent of which(x %in% table) using fmatch() 125 | d <- qDT(d[i, setdiff(columns, names(m)), drop = FALSE]) 126 | j <- intersect(names(m), columns) 127 | if (length(j)) d <- cbind(d, m[i, ..j]) 128 | 129 | } 130 | } 131 | 132 | # Set column order and data.table keys 133 | setcolorder(d, neworder = columns) 134 | suppressWarnings(setkeyv(d, cols = intersect(meta$keys, columns))) 135 | 136 | # Reset number of threads 137 | fst::threads_fst(nr_of_threads = n) 138 | 139 | return(d) 140 | 141 | } 142 | -------------------------------------------------------------------------------- /man/train.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/train.R 3 | \name{train} 4 | \alias{train} 5 | \title{Train a fusion model} 6 | \usage{ 7 | train( 8 | data, 9 | y, 10 | x, 11 | fsn = "fusion_model.fsn", 12 | weight = NULL, 13 | nfolds = 5, 14 | nquantiles = 2, 15 | nclusters = 2000, 16 | krange = c(10, 500), 17 | hyper = NULL, 18 | fork = FALSE, 19 | cores = 1 20 | ) 21 | } 22 | \arguments{ 23 | \item{data}{Data frame. Donor dataset. Categorical variables must be factors and ordered whenever possible.} 24 | 25 | \item{y}{Character or list. Variables in \code{data} to eventually fuse to a recipient dataset. Variables are fused in the order provided. If \code{y} is a list, each entry is a character vector possibly indicating multiple variables to fuse as a block.} 26 | 27 | \item{x}{Character or list. Predictor variables in \code{data} common to donor and eventual recipient. If a list, each slot specifies the \code{x} predictors to use for each \code{y}.} 28 | 29 | \item{fsn}{Character. File path where fusion model will be saved. Must use \code{.fsn} suffix.} 30 | 31 | \item{weight}{Character. Name of the observation weights column in \code{data}. If NULL (default), uniform weights are assumed.} 32 | 33 | \item{nfolds}{Numeric. Number of cross-validation folds used for LightGBM model training. Or, if \code{nfolds < 1}, the fraction of observations to use for training set; remainder used for validation (faster than cross-validation).} 34 | 35 | \item{nquantiles}{Numeric. 
Number of quantile models to train for continuous \code{y} variables, in addition to the conditional mean. \code{nquantiles} evenly-distributed percentiles are used. For example, the default \code{nquantiles = 2} yields quantile models for the 25th and 75th percentiles. Higher values may produce more accurate conditional distributions at the expense of computation time. An even value of \code{nquantiles} is recommended, since the conditional mean tends to capture the central tendency, making a median model superfluous.} 36 | 37 | \item{nclusters}{Numeric. Maximum number of k-means clusters to use. Higher is better but at computational cost. Setting \code{nclusters = 0} or \code{nclusters = Inf} turns off clustering.} 38 | 39 | \item{krange}{Numeric. Minimum and maximum number of nearest neighbors to use for construction of continuous conditional distributions. Higher \code{max(krange)} is better but at computational cost.} 40 | 41 | \item{hyper}{List. LightGBM hyperparameters to be used during model training. If \code{NULL}, default values are used. See Details and Examples.} 42 | 43 | \item{fork}{Logical. Should parallel processing via forking be used, if possible? See Details.} 44 | 45 | \item{cores}{Integer. Number of physical CPU cores used for parallel computation. When \code{fork = FALSE} or on the Windows platform (where forking is not possible), the fusion variables/blocks are processed serially but LightGBM uses \code{cores} for internal multithreading via OpenMP. On a Unix system, if \code{fork = TRUE}, \code{cores > 1}, and \code{cores <= length(y)}, then the fusion variables/blocks are processed in parallel via \code{\link[parallel]{mclapply}}.} 46 | } 47 | \value{ 48 | A fusion model object (.fsn) is saved to \code{fsn}. 49 | } 50 | \description{ 51 | Train a fusion model on "donor" data using sequential \href{https://lightgbm.readthedocs.io/en/latest/}{LightGBM} models to model the conditional distributions. The resulting fusion model (.fsn file) can be used with \code{\link{fuse}} to simulate outcomes for a "recipient" dataset. 52 | } 53 | \details{ 54 | When \code{y} is a list, each slot indicates either a single variable or, alternatively, multiple variables to fuse as a block. Variables within a block are sampled jointly from the original donor data during fusion. See Examples. 55 | 56 | \code{y} variables that exhibit no variance or continuous \code{y} variables with fewer than \code{10 * nfolds} non-zero observations (the minimum required for cross-validation) are automatically removed with a warning. 57 | 58 | The fusion model written to \code{fsn} is a zipped archive created by \code{\link[zip]{zip}} containing models and data required by \code{\link{fuse}}. 59 | 60 | The \code{hyper} argument can be used to specify the LightGBM hyperparameter values over which to perform a "grid search" during model training. \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html}{See here} for the full list of parameters. For each combination of hyperparameters, \code{nfolds}-fold cross-validation is performed using \code{\link[lightgbm]{lgb.cv}} with an early stopping condition. The parameter combination with the lowest loss function value is used to fit the final model via \code{\link[lightgbm]{lgb.train}}. The more candidate parameter values specified in \code{hyper}, the longer the processing time.
If \code{hyper = NULL}, a single set of parameters is used with the following default values: 61 | 62 | \itemize{ 63 | \item boosting = "gbdt" 64 | \item data_sample_strategy = "goss" 65 | \item num_leaves = 31 66 | \item feature_fraction = 0.8 67 | \item max_depth = 5 68 | \item min_data_in_leaf = max(10, round(0.001 * nrow(data))) 69 | \item num_iterations = 2500 70 | \item learning_rate= 0.1 71 | \item max_bin = 255 72 | \item min_data_in_bin = 3 73 | \item max_cat_threshold = 32 74 | } 75 | Typical users will only have reason to modify the hyperparameters listed above. Note that \code{num_iterations} only imposes a ceiling, since early stopping will typically result in models with a lower number of iterations. See Examples. 76 | 77 | Testing with small-to-medium size datasets suggests that forking is typically faster than OpenMP multithreading (the default). However, forking will sometimes "hang" (continue to run with no CPU usage or error message) if an OpenMP process has been previously used in the same session. The issue appears to be related to Intel's OpenMP implementation (\href{https://github.com/Rdatatable/data.table/issues/2418}{see here}). This can be triggered when other operations are called before \code{train()} that use \code{\link[data.table]{data.table}} or \code{\link[fst]{fst}} in multithread mode. If you experience hanged forking, try calling \code{data.table::setDTthreads(1)} and \code{fst::threads_fst(1)} immediately after \code{library(fusionModel)} in a new session. 78 | } 79 | \examples{ 80 | # Build a fusion model using RECS microdata 81 | # Note that "fusion_model.fsn" will be written to working directory 82 | ?recs 83 | fusion.vars <- c("electricity", "natural_gas", "aircon") 84 | predictor.vars <- names(recs)[2:12] 85 | fsn.path <- train(data = recs, y = fusion.vars, x = predictor.vars) 86 | 87 | # When 'y' is a list, it can specify variables to fuse as a block 88 | fusion.vars <- list("electricity", "natural_gas", c("heating_share", "cooling_share", "other_share")) 89 | fusion.vars 90 | train(data = recs, y = fusion.vars, x = predictor.vars) 91 | 92 | # When 'x' is a list, it specifies which predictor variables to use for each 'y' 93 | xlist <- list(predictor.vars[1:4], predictor.vars[2:8], predictor.vars) 94 | xlist 95 | train(data = recs, y = fusion.vars, x = xlist) 96 | 97 | # Specify a single set of LightGBM hyperparameters 98 | # Here we use Random Forests instead of the default Gradient Boosting Decision Trees 99 | train(data = recs, y = fusion.vars, x = predictor.vars, 100 | hyper = list(boosting = "rf", 101 | feature_fraction = 0.6, 102 | max_depth = 10 103 | )) 104 | 105 | # Specify a range of LightGBM hyperparameters to search over 106 | # This takes longer, because there are more models to test 107 | train(data = recs, y = fusion.vars, x = predictor.vars, 108 | hyper = list(max_depth = c(5, 10), 109 | feature_fraction = c(0.7, 0.9) 110 | )) 111 | } 112 | -------------------------------------------------------------------------------- /man/analyze.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/analyze.R 3 | \name{analyze} 4 | \alias{analyze} 5 | \title{Analyze fusion output} 6 | \usage{ 7 | analyze( 8 | x, 9 | implicates, 10 | static = NULL, 11 | weight = NULL, 12 | rep_weights = NULL, 13 | by = NULL, 14 | fun = NULL, 15 | var_scale = 4, 16 | cores = 1 17 | ) 18 | } 19 | \arguments{ 20 | \item{x}{List. 
Named list specifying the desired analysis type(s) and the associated target variable(s). Example: \code{x = list(mean = c("v1", "v2"), median = "v3")} translates as: "Return the mean value of variables v1 and v2 and the median of v3". Supported analysis types include \code{mean}, \code{sum}, and \code{median}. Mean and sum automatically return proportions and counts, respectively, if the target variable is a factor. Target variables must be in \code{implicates}, \code{static}, or a data.frame returned by a custom \code{fun}.} 21 | 22 | \item{implicates}{Data frame. Implicates of synthetic (fused) variables. Typically generated by \link{fuse}. The implicates should be row-stacked and identified by integer column "M".} 23 | 24 | \item{static}{Data frame. Optional static (non-synthetic) variables that do not vary across implicates. Note that \code{nrow(static) = nrow(implicates) / max(implicates$M)} and the row-ordering is assumed to be consistent between \code{static} and \code{implicates}.} 25 | 26 | \item{weight}{Character. Name of the observation weights column in \code{static}. If NULL (default), uniform weights are assumed.} 27 | 28 | \item{rep_weights}{Character. Optional vector of replicate weight columns in \code{static}. If provided, the returned margin of errors reflect additional variance due to uncertainty in sample weights.} 29 | 30 | \item{by}{Character. Optional column name(s) in \code{implicates} or \code{static} (typically factors) that collectively define the set of population subgroups for which each analysis is executed. If \code{NULL}, analysis is done for the whole sample.} 31 | 32 | \item{fun}{Function. Optional function applied to input data prior to executing analyses. Can be used to do non-conventional/custom analyses.} 33 | 34 | \item{var_scale}{Scalar. Factor by which to scale the unadjusted replicate weight variance. This is determined by the survey design. The default (\code{var_scale = 4}) is appropriate for ACS and RECS.} 35 | 36 | \item{cores}{Integer. Number of cores used. Only applicable on Unix systems.} 37 | } 38 | \value{ 39 | A data.table reporting analysis results, possibly across subgroups defined in \code{by}. The returned quantities include: 40 | 41 | \describe{ 42 | \item{N}{Number of observations used for the analysis.} 43 | \item{y}{Target variable.} 44 | \item{level}{Levels of factor target variables.} 45 | \item{type}{Type of estimate returned: mean, proportion, sum, count, or median.} 46 | \item{est}{Point estimate.} 47 | \item{moe}{Margin of error associated with the 90\% confidence interval.} 48 | } 49 | } 50 | \description{ 51 | Calculation of point estimates and associated margin of error for analyses using fused/synthetic microdata. Can calculate means, proportions, sums, counts, and medians, optionally across population subgroups. 52 | } 53 | \details{ 54 | At a minimum, the user must supply synthetic implicates (typically generated by \link{fuse}). Inputs are checked for consistent dimensions. 55 | 56 | If \code{implicates} contains only a single implicate and \code{rep_weights = NULL}, the "typical" standard error is returned with a warning to make sure the user is aware of the situation. 57 | 58 | Estimates and standard errors for the requested analysis are calculated separately for each implicate. The final point estimate is the mean estimate across implicates. The final standard error is the pooled SE across implicates, calculated using Rubin's pooling rules (1987). 
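For reference, a sketch of the pooling rules in their standard form (the notation below is introduced purely for illustration and is not taken from the package internals): with \eqn{m} implicates, per-implicate point estimates \eqn{Q_i}, and within-implicate variances \eqn{W_i},
\deqn{\bar{Q} = \frac{1}{m}\sum_{i=1}^{m} Q_i, \qquad \bar{W} = \frac{1}{m}\sum_{i=1}^{m} W_i, \qquad B = \frac{1}{m-1}\sum_{i=1}^{m} (Q_i - \bar{Q})^2}
\deqn{T = \bar{W} + \left(1 + \frac{1}{m}\right) B, \qquad SE = \sqrt{T}}
where \eqn{B} is the between-implicate variance and \eqn{T} is the total pooled variance whose square root gives the pooled standard error.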
59 | 60 | When replicate weights are provided, the standard errors of each implicate are calculated via the variance of estimates across replicates. Calculations leverage \code{\link[data.table]{data.table}} operations for speed and memory efficiency. The within-implicate variance is calculated around the point estimate (rather than around the mean of the replicates). This is equivalent to \code{mse = TRUE} in \code{\link[survey]{svrepdesign}}. This seems to be the appropriate method for most surveys. 61 | 62 | If replicate weights are NOT provided, the standard errors of each implicate are calculated using variance within the implicate. For means, the ratio variance approximation of Cochran (1977) is used, as this is known to be a good approximation of bootstrapped SE's for weighted means (Gatz and Smith 1995). For proportions, a generalization of the unweighted SE formula is used (\href{https://stats.stackexchange.com/questions/159204/how-to-calculate-the-standard-error-of-a-proportion-using-weighted-data}{see here}). 63 | } 64 | \examples{ 65 | # Build a fusion model using RECS microdata 66 | fusion.vars <- c("electricity", "natural_gas", "aircon") 67 | predictor.vars <- names(recs)[2:12] 68 | fsn.path <- train(data = recs, y = fusion.vars, x = predictor.vars) 69 | 70 | # Generate 30 implicates of the 'fusion.vars' using original RECS as the recipient 71 | sim <- fuse(data = recs, fsn = fsn.path, M = 30) 72 | head(sim) 73 | 74 | #--------- 75 | 76 | # Multiple types of analyses can be done at once 77 | # This calculates estimates using the full sample 78 | result <- analyze(x = list(mean = c("natural_gas", "aircon"), 79 | median = "electricity", 80 | sum = c("electricity", "aircon")), 81 | implicates = sim, 82 | weight = "weight") 83 | 84 | View(result) 85 | 86 | #----- 87 | 88 | # Mean electricity consumption, by climate zone and urban/rural status 89 | result1 <- analyze(x = list(mean = "electricity"), 90 | implicates = sim, 91 | static = recs, 92 | weight = "weight", 93 | by = c("climate", "urban_rural")) 94 | 95 | # Same as above but including sample weight uncertainty 96 | # Note that only the first 30 replicate weights are used internally 97 | result2 <- analyze(x = list(mean = "electricity"), 98 | implicates = sim, 99 | static = recs, 100 | weight = "weight", 101 | rep_weights = paste0("rep_", 1:96), 102 | by = c("climate", "urban_rural")) 103 | 104 | # Helper function for comparison plots 105 | pfun <- function(x, y) {plot(x, y); abline(0, 1, lty = 2)} 106 | 107 | # Inclusion of replicate weights does not affect estimates, but it does 108 | # increase margin of error due to uncertainty in RECS sample weights 109 | pfun(result1$est, result2$est) 110 | pfun(result1$moe, result2$moe) 111 | 112 | # Notice that relative uncertainty declines with subset size 113 | plot(result1$N, result1$moe / result1$est) 114 | 115 | #----- 116 | 117 | # Use a custom function to perform more complex analyses 118 | # Custom function should return a data frame with non-standard target variables 119 | 120 | my_fun <- function(data) { 121 | 122 | # Manipulate 'data' as desired 123 | # All variables in 'implicates' and 'static' are available 124 | 125 | # Construct electricity consumption per square foot 126 | kwh_per_ft2 <- data$electricity / data$square_feet 127 | 128 | # Binary (T/F) indicator if household uses natural gas 129 | use_natural_gas <- data$natural_gas > 0 130 | 131 | # Return data.frame of custom variables to be analyzed 132 | data.frame(kwh_per_ft2, use_natural_gas) 133 | } 134 | 135 | # 
Do analysis using variables produced by custom function 136 | # Can include non-custom target variables as well 137 | result <- analyze(x = list(mean = c("kwh_per_ft2", "use_natural_gas", "electricity")), 138 | implicates = sim, 139 | static = recs, 140 | weight = "weight", 141 | fun = my_fun) 142 | 143 | } 144 | \references{ 145 | Cochran, W. G. (1977). \emph{Sampling Techniques} (3rd Edition). Wiley, New York. 146 | 147 | Gatz, D.F., and Smith, L. (1995). The Standard Error of a Weighted Mean Concentration — I. Bootstrapping vs Other Methods. \emph{Atmospheric Environment}, vol. 29, no. 11, 1185–1193. 148 | 149 | Rubin, D.B. (1987). \emph{Multiple imputation for nonresponse in surveys}. Hoboken, NJ: Wiley. 150 | } 151 | -------------------------------------------------------------------------------- /R/monotonic.R: -------------------------------------------------------------------------------- 1 | #' Create a monotonic relationship between two variables 2 | #' 3 | #' @description 4 | #' \code{monotonic()} returns modified values of input vector \code{y} that are smoothed, monotonic, and consistent across all values of input \code{x}. It was designed to be used post-fusion when one wants to ensure a plausible relationship between consumption (\code{x}) and expenditure (\code{y}), under the assumption that all consumers face an identical, monotonic pricing structure. By default, the mean of the returned values is forced to equal the original mean of \code{y} (\code{preserve = TRUE}). The direction of monotonicity (increasing or decreasing) is detected automatically, so use cases are not limited to consumption and expenditure variables. 5 | #' @param x Numeric. 6 | #' @param y Numeric. 7 | #' @param w Numeric. Optional observation weights. 8 | #' @param preserve Logical. Preserve the original mean of the \code{y} values in the returned values? 9 | #' @param expend Logical. Assume \code{y} is an expenditure variable? If \code{TRUE}, a safety check is implemented to ensure \code{y > 0} when \code{x > 0}. 10 | #' @param fast Logical. If \code{TRUE}, only \code{\link[stats]{supsmu}} is used with coercion of the result to monotone. 11 | #' @param nmax Integer. Maximum number of observations to use for smoothing. Set lower for faster computation. \code{nmax = Inf} eliminates sampling. 12 | #' @param plot Logical. Plot the (sampled) data points and derived monotonic relationship? 13 | #' @details The initial smoothing is accomplished via \code{\link[stats]{supsmu}} with the result coerced to monotone. If \code{fast = FALSE} and the coercion step modifies the values too much, a second smooth is attempted via a \code{\link[scam]{scam}} model with either a monotone increasing or decreasing constraint. If the SCAM fails to fit, the function falls back to \code{\link[stats]{lm}} with simple linear predictions. If \code{y = 0} when \code{x = 0} (as typical for consumption-expenditure variables), then that outcome is enforced in the result. The input data are randomly sampled to no more than \code{nmax} observations, if necessary, for speed. 14 | #' @return A numeric vector of modified \code{y} values. Optionally, a plot showing the returned monotonic relationship.
15 | #' @examples 16 | #' y <- monotonic(x = recs$propane_btu, y = recs$propane_expend, plot = TRUE) 17 | #' mean(recs$propane_expend) 18 | #' mean(y) 19 | #' @export 20 | 21 | #--------- 22 | 23 | # TEST 24 | # library(tidyverse) 25 | # library(data.table) 26 | # 27 | # d <- fusionModel::read_fsd("~/Downloads/RECS_2020_2019_fused_UP.fsd") 28 | # acs <- fst::read_fst("~/Documents/Projects/fusionData/survey-processed/ACS/2019/ACS_2019_H_processed.fst", columns = c('weight', 'state', 'puma10')) 29 | # d <- cbind(d, acs) 30 | # system.time( 31 | # d[, `:=`(dollarel_z = monotonic(x = btuel, y = dollarel, w = weight), 32 | # dollarng_z = monotonic(x = btung, y = dollarng, w = weight), 33 | # dollarlp_z = monotonic(x = btulp, y = dollarlp, w = weight), 34 | # dollarfo_z = monotonic(x = btufo, y = dollarfo, w = weight)), 35 | # by = .(state, puma10)] 36 | # ) 37 | 38 | #--------- 39 | 40 | monotonic <- function(x, 41 | y, 42 | w = NULL, 43 | preserve = TRUE, 44 | expend = TRUE, 45 | fast = TRUE, 46 | nmax = 5000, 47 | plot = FALSE) { 48 | 49 | stopifnot(exprs = { 50 | length(x) == length(y) 51 | is.numeric(x) & !anyNA(x) 52 | is.numeric(y) & !anyNA(y) 53 | is.null(w) | length(w) == length(x) 54 | is.logical(preserve) 55 | is.logical(expend) 56 | is.logical(fast) 57 | nmax > 1 58 | is.logical(plot) 59 | }) 60 | 61 | if (is.null(w)) w <- rep.int(1L, length(x)) 62 | ymean <- weighted.mean(y, w, na.rm = TRUE) 63 | yint <- is.integer(y) 64 | ymin <- if (all(y == 0)) 0 else min(y[y != 0]) 65 | x0 <- x 66 | w0 <- w 67 | 68 | # If 'expend = TRUE', check for violations 69 | # If any issues are detected, a helpful warning is issued 70 | if (expend) { 71 | if (any(x < 0)) stop("'expend = TRUE' but detected negative values in 'x'") 72 | if (any(y < 0)) stop("'expend = TRUE' but detected negative values in 'y'") 73 | i <- x == 0 & y != 0 74 | if (any(i)) { 75 | y[i] <- 0L 76 | warning("Set ", sum(i), " non-zero y-value(s) (", paste0(round(100 * sum(i) / length(y), 2), "%"), ") to zero where x == 0 because 'expend = TRUE'") 77 | } 78 | i <- x > 0 & y == 0 79 | if (any(i)) { 80 | y[i] <- ymin 81 | warning("Set ", sum(i), " zero y-value(s) (", paste0(round(100 * sum(i) / length(y), 2), "%"), ") to observed non-zero minimum where x > 0 because 'expend = TRUE'") 82 | } 83 | } 84 | 85 | # If 'expend = TRUE' OR zeros in 'x' (almost) always produce zeros in 'y', restrict to non-zero observations in 'x' 86 | force.zero <- FALSE 87 | if (expend | (any(x == 0) & sum(y[x == 0] == 0) / sum(x == 0) > 0.995)) { 88 | force.zero <- TRUE 89 | i <- c(match(0, x), which(x != 0)) # Retains first instance of zero in 'x' 90 | x <- x[i] 91 | y <- y[i] 92 | w <- w[i] 93 | } 94 | 95 | # If necessary, sample the data for speed 96 | n <- length(x) 97 | if (n > nmax) { 98 | i <- match(range(x), x) # Retains first instance of min and max 'x' 99 | i <- c(i, sample.int(n = n, size = nmax - 2)) # Downsample to 'nmax' observations 100 | x <- x[i] 101 | y <- y[i] 102 | w <- w[i] 103 | } 104 | 105 | # Initial smooth via stats::supsmu() 106 | # 'span' set based on recommendation in ?supsmu 107 | m <- stats::supsmu(x, y, wt = w, span = ifelse(length(x) < 40, 0.3, "cv")) 108 | xu <- m$x 109 | inc <- suppressWarnings(cor(m$x, m$y) >= 0) 110 | if (is.na(inc)) inc <- TRUE 111 | p <- sort(m$y, decreasing = !inc) # Force monotonic predictions 112 | 113 | # Check if supsmu() smoothed and monotonic output is sufficient 114 | # Ideally, coercion to monotonic via sort() does not cause significant difference between 'p' and m$y 115 | fail <- sum(abs((p - 
m$y) / m$y) > 0.05) / length(p) # Percent of observations with more than 5% absolute error 116 | if (is.na(fail)) fail <- Inf 117 | if (!fast & fail > 0.05 & length(p) >= 100) { 118 | # Attempt to fit SCAM model with monotonic constraint 119 | m <- try(scam::scam(y ~ s(x, bs = ifelse(inc, "mpi", "mpd")), data = data.frame(x, y), weights = w), silent = TRUE) 120 | if (inherits(m, "scam")) { 121 | p <- as.vector(predict(m, newdata = data.frame(x = xu), type = "response", newdata.guaranteed = TRUE)) 122 | } else { 123 | # If SCAM model fails, fall back to OLS model and make simple linear predictions 124 | m <- stats::lm(y ~ x, weights = w) 125 | p <- as.vector(suppressWarnings(predict(m, newdata = data.frame(x = xu)))) 126 | } 127 | } 128 | 129 | # First time: If expend = TRUE, ensure that the output values meet some minimum positive value when x > 0 130 | if (expend) p[xu > 0] <- pmax(p[xu > 0], ymin) 131 | 132 | # If necessary, set values to zero when 'x' is zero 133 | if (force.zero) p[xu == 0] = 0 134 | 135 | # Make 'y' predictions for all original 'x' 136 | yout <- if (length(xu) == 1) rep(p, length(x0)) else approx(xu, p, xout = x0, rule = 2)$y 137 | 138 | # If requested, adjustment factor to ensure mean of transformed 'y' matches original mean value 139 | yadj <- 1 # Defined for use in plotting code, below, if 'preserve = FALSE' 140 | if (preserve) { 141 | yadj <- ymean / weighted.mean(yout, w0) 142 | if (!is.finite(yadj)) yadj <- 1 # Catch divide by zero case (mean not strictly preserved in this case) 143 | yout <- yout * yadj 144 | } 145 | 146 | # If 'y' input is integer, force output to integer 147 | # May cause input mean of 'y' to not be strictly preserved in output 148 | if (yint) yout <- as.integer(round(yout)) 149 | 150 | # May cause input mean of 'y' to NOT be strictly preserved in output 151 | if (expend) yout[x0 > 0] <- pmax(yout[x0 > 0], ymin) 152 | 153 | # Optional plot of transformation 154 | if (plot) { 155 | plot(x, y, col = "#00000033", ylim = range(c(y, p)), xlab = "x", ylab = "y") 156 | lines(xu, p * yadj, col = "red") 157 | } 158 | 159 | # Safety check 160 | if (anyNA(yout)) stop("NA values in result vector") 161 | 162 | return(yout) 163 | 164 | } 165 | -------------------------------------------------------------------------------- /man/analyze_fusionACS.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/analyze_fusionACS.R 3 | \name{analyze_fusionACS} 4 | \alias{analyze_fusionACS} 5 | \title{Analyze fusionACS microdata} 6 | \usage{ 7 | analyze_fusionACS( 8 | analyses, 9 | year, 10 | respondent = "household", 11 | by = NULL, 12 | area = NULL, 13 | fun = NULL, 14 | M = Inf, 15 | R = Inf, 16 | cores = 1, 17 | version_up = 2, 18 | force_up = FALSE 19 | ) 20 | } 21 | \arguments{ 22 | \item{analyses}{List. Specifies the desired analyses. Each analysis is a formula. See Details and Examples.} 23 | 24 | \item{year}{Integer. One or more years for which microdata are pooled to compute \code{analyses} (i.e. ACS recipient year). Currently defaults to \code{year = 2015:2019}, if the \code{by} variables indicate a sub-PUMA analysis requiring UrbanPop weights.} 25 | 26 | \item{respondent}{Character. Should the \code{analyses} be computed using \code{"household"}- or \code{"person"}-level microdata?} 27 | 28 | \item{by}{Character. Optional variable(s) that collectively define the set of population subgroups for which each analysis is computed. 
Can be a mix of geographic (e.g. census tract) and/or socio-demographic microdata variables (e.g. poverty status); the latter may be existing variables on disk or custom variables created on-the-fly via \code{fun()}. If \code{NULL}, analysis is done for the whole (national) sample.} 29 | 30 | \item{area}{Call. Optional unquoted call specifying a geographic area within which to compute the \code{analyses}. Useful for restricting the study area to a manageable size.} 31 | 32 | \item{fun}{Function. Optional function for creating custom microdata variables that cannot be accommodated in \code{analyses}. Must take \code{data} and (optionally) \code{weight} as the only function arguments and must return a \code{data.frame} with number of rows equal to \code{nrow(data)}. See Details and Examples.} 33 | 34 | \item{M}{Integer. The first \code{M} implicates are used. Set \code{M = Inf} to use all available implicates.} 35 | 36 | \item{R}{Integer. The first \code{R} replicate weights are used. Set \code{R = Inf} to use all available replicate weights.} 37 | 38 | \item{cores}{Integer. Number of cores used for multithreading in \code{\link[collapse]{collapse-package}} functions.} 39 | 40 | \item{version_up}{Integer. Use \code{version_up = 2} to access 10-replicate weights for 17 metro areas or \code{version_up = 3} to access 40-replicate weights for 17 metro areas.} 41 | 42 | \item{force_up}{Logical. If \code{TRUE}, force use of UrbanPop weights even if the requested analysis can be done using native ACS weights.} 43 | } 44 | \value{ 45 | A tibble reporting analysis results, possibly across subgroups defined in \code{by}. The returned quantities include: 46 | 47 | \describe{ 48 | \item{lhs}{Optional analysis name; the "left hand side" of the analysis formula.} 49 | \item{rhs}{The "right hand side" of the analysis formula.} 50 | \item{type}{Type of analysis: sum, mean, median, prop(ortion) or count.} 51 | \item{level}{Factor levels for categorical analyses; NA otherwise.} 52 | \item{N}{Mean number of valid microdata observations across all implicates and replicates; i.e. the sample size used to construct the estimate.} 53 | \item{est}{Point estimate; mean estimate across all implicates and replicates.} 54 | \item{moe}{Margin of error associated with the 90\% confidence interval.} 55 | \item{se}{Standard error of the estimate.} 56 | \item{df}{Degrees of freedom used to calculate the margin of error.} 57 | \item{cv}{Coefficient of variation; conventional scale-independent measure of estimate reliability. Calculated as: \code{100 * moe / 1.645 / est}} 58 | } 59 | } 60 | \description{ 61 | For fusionACS internal use only. Calculation of point estimates and associated uncertainty (margin of error) for analyses using ACS and/or fused donor survey variables. 62 | Efficiently computes means, medians, sums, proportions, and counts, optionally across population subgroups. 63 | The use of native ACS weights or ORNL UrbanPop synthetic population weights is automatically determined given the requested geographic resolution. 64 | Requires a local \code{/fusionData} directory in the working directory path with assumed file structure and conventions. 65 | } 66 | \details{ 67 | Allowable geographic units of analysis specified in \code{by} are currently limited to: region, division, state, cbsa10, puma10, county10, cousubfp10 (county subdivision), zcta10 (zip code), tract10 (census tract), and bg10 (block group). 68 | 69 | The final point estimates are the mean estimates across implicates. 
The final margin of error is derived from the pooled standard error across implicates, calculated using Rubin's pooling rules (1987). The within-implicate standard errors are calculated using the replicate weights. 70 | 71 | Each entry in the \code{analyses} list is a \code{\link[stats]{formula}} of the format \code{Z ~ F(E)}, where \code{Z} is an optional, user-friendly name for the analysis, \code{F} is an allowable “outer function”, and \code{E} is an “inner expression” containing one or more microdata variables. For example: 72 | 73 | \code{mysum ~ mean(Var1 + Var2)} 74 | 75 | In this case, the outer function is mean(). Allowable outer functions are: mean(), sum(), median(), sd(), and var(). When the inner expression contains more than one variable, it is first evaluated and then \code{F()} is applied to the result. In this case, an internal variable \code{X = Var1 + Var2} is generated across all observations, and then \code{mean(X)} is computed. 76 | 77 | If no inner expression is desired, the \code{analyses} list can use the following convenient syntax to apply a single outer function to multiple variables: 78 | 79 | \code{mean = c("Var1", "Var2")} 80 | 81 | The inner expression can also utilize any function that takes variable names as arguments and returns a vector with the same length as the inputs. This is useful for defining complex operations in a separate function (e.g. microsimulation). For example: 82 | 83 | \code{myfun = function(Var1, Var2) {Var1 + Var2}} 84 | 85 | \code{mysum ~ mean(myfun(Var1, Var2))} 86 | 87 | The use of sum() or mean() with an inner expression that returns a categorical vector automatically results in category-wise weighted counts and proportions, respectively. For example, the following analysis would fail if evaluated literally, since mean() expects numeric input but the inner expression returns character. Instead, it is interpreted as a request to return weighted proportions for each categorical outcome. 88 | 89 | \code{myprop ~ mean(ifelse(Var1 > 10, 'Yes', 'No'))} 90 | 91 | \code{analyze_fusionACS()} uses "fast" versions of the allowable outer functions, as provided by \code{\link[collapse]{fast-statistical-functions}} in the \code{collapse} package. These functions are highly optimized for weighted, grouped calculations. In addition, outer functions mean(), sum(), and median() enjoy the use of platform-independent multithreading across columns when \code{cores > 1}. Analyses with numerical inner expressions are processed using a series of calls to \code{\link[collapse]{collap}} with unique observation weights. Analyses with categorical inner expressions utilize a series of calls to \code{\link[collapse]{fsum}}.
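As a rough illustration of the categorical case (object names below are hypothetical and not part of the package), a weighted proportion reduces to a grouped sum of observation weights, which is what \code{\link[collapse]{fsum}} provides:

\preformatted{
library(collapse)
# Hypothetical inputs: 'resp' is a categorical outcome, 'w' the observation weights
resp <- sample(c("Yes", "No"), size = 100, replace = TRUE)
w <- runif(100)
counts <- fsum(w, g = resp)     # weighted count of observations per category
props  <- counts / sum(counts)  # weighted proportions across categories
}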
92 | } 93 | \examples{ 94 | # Analysis using ACS native weights for year 2017, by PUMA, in South Atlantic Census Division 95 | # Uses all available implicates and replicate weights 96 | test <- analyze_fusionACS(analyses = list(high_burden ~ mean(dollarel / hincp > 0.05)), 97 | year = 2017, 98 | by = "puma10", 99 | area = division == "South Atlantic") 100 | 101 | # Analysis using UrbanPop 2015-2019 weights, by tract, in Utah (actually Salt Lake City metro given current UrbanPop data) 102 | # Uses 5 (of possible 20) fusion implicates for RECS "dollarel" variable 103 | # Uses 5 (of possible 10) UrbanPop replicate weights 104 | test <- analyze_fusionACS(analyses = list(median_burden ~ median(dollarel / hincp)), 105 | year = 2015:2019, 106 | by = "tract10", 107 | area = state_name == "Utah", 108 | M = 5, 109 | R = 5) 110 | 111 | # User function to create custom variables from microdata 112 | # Variables explicitly referenced in my_fun() are automatically loaded into 'data' within analyze_fusionACS() 113 | # Variables returned by my_fun() may be used in 'by' or inner expressions of 'analyses' 114 | my_fun <- function(data) { 115 | require(tidyverse, quietly = TRUE) 116 | data \%>\% 117 | mutate(elderly = agep >= 65, 118 | energy_expend = dollarel + dollarfo + dollarlp + dollarng, 119 | energy_burden = energy_expend / hincp, 120 | energy_burden = ifelse(hincp < 5000, NA, energy_burden)) \%>\% 121 | select(elderly, energy_burden, energy_expend) 122 | } 123 | 124 | # Analysis using UrbanPop 2015-2019 weights, by zip code and elderly head of household, in Atlanta CBSA 125 | test <- analyze_fusionACS(analyses = list(energy_burden ~ mean(energy_burden), 126 | at_risk ~ mean(energy_burden > 0.075 | acequipm_pub == "No air conditioning")), 127 | year = 2015:2019, 128 | by = c("zcta10", "elderly"), 129 | area = cbsa10 == "12060", 130 | fun = my_fun, 131 | M = 5, 132 | R = 5) 133 | 134 | } 135 | \references{ 136 | Rubin, D.B. (1987). \emph{Multiple imputation for nonresponse in surveys}. Hoboken, NJ: Wiley. 137 | } 138 | -------------------------------------------------------------------------------- /R/impute.R: -------------------------------------------------------------------------------- 1 | #' Impute missing data via fusion 2 | #' 3 | #' @description 4 | #' A universal missing data imputation tool that wraps successive calls to \code{\link{train}} and \code{\link{fuse}} under the hood. Designed for simplicity and ease of use. 5 | #' @param data A data frame with missing values. 6 | #' @param weight Optional name of observation weights column in \code{data}. 7 | #' @param ignore Optional names of columns in \code{data} to ignore. These variables are neither imputed nor used as predictors. 8 | #' @param cores Number of physical CPU cores used by \code{\link[lightgbm]{lightgbm}}. LightGBM is parallel-enabled on all platforms if OpenMP is available. 9 | #' @details Variables with missing values are imputed sequentially, beginning with the variable with the fewest missing values. Since LightGBM models accommodate NA values in the predictor set, all available variables are used as potential predictors (excluding \code{ignore} variables). For each call to \code{\link{train}}, 80% of observations are randomly selected for training and the remaining 20% are used as a validation set to determine an appropriate number of tree learners. All LightGBM model parameters are kept at the sensible default values in \code{\link{train}}. 
Since \code{\link[lightgbm]{lightgbm}} uses OpenMP multithreading, it is not advisable to use \code{\link{impute}} inside a forked/parallel process when \code{cores > 1}. 10 | #' @return A data frame with all missing values imputed. 11 | #' @examples 12 | #' # Create data frame with random NA values 13 | #' ?recs 14 | #' data <- recs[, 2:7] 15 | #' miss <- replicate(ncol(data), runif(nrow(data)) < runif(1, 0.01, 0.3)) 16 | #' data[miss] <- NA 17 | #' colSums(is.na(data)) 18 | #' 19 | #' # Impute the missing values 20 | #' result <- impute(data) 21 | #' anyNA(result) 22 | #' 23 | #' @export 24 | 25 | #--- 26 | 27 | # library(tidyverse) 28 | # library(data.table) 29 | # source("R/utils.R") 30 | # data <- fst::read_fst("~/Documents/Projects/fusionData/test_hus.fst") 31 | # weight = "WGTP" 32 | # cores = 1 33 | # 34 | # #ignore <- names(select(data[1, ], SERIALNO, PUMA, WGTP1:SERIALNO_original)) 35 | # # Specify this in ASC processing script 36 | # ignore <- setdiff(names(data), c(y, "WGTP", "puma_rent", "puma_value", "puma_income", "puma_mortgage", "ST", "NP", "ACR", "BLD", "FS", "HFL", "HHL", "BDSP", "BDS", "RMSP", "RMS", "TEN", "VEH", "YBL", "YRBLT", "HINCP", "FES", "WIF", "R18", "R65")) 37 | # 38 | # test <- impute(data, "WGTP", ignore = ignore) 39 | 40 | #--- 41 | 42 | impute <- function(data, 43 | weight = NULL, 44 | ignore = NULL, 45 | cores = parallel::detectCores(logical = FALSE) - 1L) { 46 | 47 | t0 <- Sys.time() 48 | 49 | fst::threads_fst(nr_of_threads = cores) 50 | setDTthreads(threads = cores) 51 | 52 | stopifnot(exprs = { 53 | is.data.frame(data) 54 | ncol(data) > 1 55 | anyNA(data) 56 | any(is.null(weight), length(weight) == 1 & weight %in% names(data)) 57 | any(is.null(ignore), all(ignore %in% names(data))) 58 | cores > 0 & cores %% 1 == 0 & cores <= parallel::detectCores(logical = FALSE) 59 | }) 60 | 61 | dnames <- names(data) 62 | data.dt <- is.data.table(data) 63 | d <- as.data.table(data) 64 | rm(data) 65 | 66 | if (is.null(weight)) { 67 | weight = "W_.._" 68 | d[, W_.._ := 1L] 69 | } else { 70 | if (anyNA(d[[weight]])) stop("NA values are not allowed in 'weight'") 71 | } 72 | 73 | miss <- sort(colSums(is.na(d))) 74 | y <- names(miss)[miss > 0 & miss < nrow(d)] 75 | y <- setdiff(y, ignore) 76 | if (!length(y)) stop("No un-ignored columns with NA values to impute") 77 | temp.fsn <- paste0(tempfile(), ".fsn") 78 | 79 | #--- 80 | 81 | # Predictor prescreen step if 'd' is sufficiently large 82 | #if (length(x) & (nrow(d) > 10e3 | ncol(d) > 10)) { 83 | if ((nrow(d) > 10e3 | ncol(d) > 20)) { 84 | 85 | d2 <- copy(d) 86 | d2[, (weight) := NULL] 87 | 88 | # Convert 'd2' to plausible ranks for correlation screening 89 | # All output columns should be NA, integer, or logical 90 | # NA's in input are preserved in output 91 | for (i in 1:ncol(d2)) { 92 | z <- d2[[i]] 93 | if (is.numeric(z)) { 94 | # Ties method 'dense' ensures integer output with minimum of 1 and maximum of length(na.omit(z)) 95 | z <- frank(z, ties.method = "dense", na.last = "keep") 96 | } else { 97 | # Converts ordered factors to integer 98 | if (is.ordered(z)) { 99 | z <- as.integer(z) 100 | } else { 101 | if (!is.logical(z)) { 102 | # Converts character and un-ordered factors to TRUE for the most-common (non-NA) value and FALSE otherwise 103 | #zt <- table2(z, na.rm = TRUE) 104 | zt <- collapse::qtable(z, na.exclude = TRUE) 105 | z <- z == names(which.max(zt)) 106 | } 107 | } 108 | } 109 | set(d2, j = i, value = z) 110 | } 111 | 112 | # Randomly down-sample when 'd2' is large 113 | if (nrow(d2) > 100e3) d2 <- 
d2[sample.int(nrow(d2), 100e3), ] 114 | 115 | # Correlation matrix 116 | # Note that Spearman (rank) correlations are used (data pre-ranked) to reduce effect of outliers 117 | ok <- setdiff(names(d2), ignore) 118 | cmat <- suppressWarnings(cor(d2[, ..y], d2[, ..ok], use = "pairwise.complete.obs")) 119 | cmat[is.na(cmat)] <- 0 120 | 121 | # Initial correlation screening, based on absolute correlation value 122 | xlist <- lapply(y, function(v) { 123 | 124 | p <- cmat[v, ] 125 | names(p) <- colnames(cmat) 126 | p <- p[names(p) != v] 127 | p <- sort(abs(p), decreasing = TRUE) 128 | 129 | # Restrict to predictors that meet arbitrary absolute correlation threshold (> 0.025) 130 | # Limit to maximum 30 potential predictors for each model (hard cap) 131 | # Attempt to retain minimum 5 potential predictors regardless of correlation 132 | xv <- names(p[p > 0.025]) 133 | xv <- xv[1:min(30, length(xv))] 134 | if (length(xv) < 5) xv <- names(p)[1:min(5, length(p))] 135 | 136 | return(xv) 137 | 138 | }) 139 | 140 | rm(d2) 141 | 142 | } else { 143 | 144 | xlist <- lapply(y, function(v) setdiff(names(d), c(v, ignore, weight))) 145 | 146 | } 147 | 148 | xlist <- setNames(xlist, y) 149 | 150 | #--- 151 | 152 | # Coerce any character variables in 'y' or 'xlist' to unordered factor 153 | # This is necessary to avoid errors in train() and fuse(), which expect factors 154 | # The 'cconv' columns are converted to character prior to returning final function output 155 | ccols <- names(which(sapply(d, is.character))) 156 | cconv <- intersect(ccols, unique(c(y, unlist(xlist)))) 157 | if (length(cconv) > 0) d[, (cconv) := lapply(.SD, factor), .SDcols = cconv] 158 | 159 | # Check number of factor levels in 'y' variables 160 | nlev <- sapply(d[, ..y], nlevels) 161 | bad <- names(nlev)[nlev >= 200] 162 | if(length(bad)) { 163 | stop("Detected categorical imputation variable(s) with more than 200 levels (not allowed):\n", paste(bad, collapse = "\n")) 164 | } 165 | 166 | #--- 167 | 168 | pb <- txtProgressBar(max = length(y), style = 3) 169 | 170 | for (i in 1:length(y)) { 171 | 172 | # Response and predictor variables 173 | v <- y[i] 174 | xv <- xlist[[i]] 175 | vtrain <- c(weight, v, xv) 176 | 177 | # Observations to impute 178 | imp <- is.na(d[[v]]) 179 | 180 | # Training dataset 181 | dtrain <- d[!imp, ..vtrain] 182 | 183 | # If sample size is large, use a stratified sample of the training data 184 | # If there are few missing values, then fewer training observations are possible (10x the number missing) 185 | # Restricts training sample to no less than 5k and no more than 50k observations 186 | maxn <- min(max(5e3, 10 * sum(imp)), 50e3) 187 | if (nrow(dtrain) > maxn) { 188 | keep <- stratify(y = dtrain[[v]], 189 | ycont = is.numeric(dtrain[[v]]), 190 | tfrac = maxn / nrow(dtrain), 191 | ntiles = 20) 192 | dtrain <- dtrain[keep, ] 193 | } 194 | 195 | # Train fusion model (saved to 'temp.fsn' temporary file) 196 | # Suppress output to the console 197 | invisible(capture.output( 198 | fusionModel::train(data = dtrain, 199 | y = v, 200 | x = xv, 201 | fsn = temp.fsn, 202 | weight = weight, 203 | nfolds = 0.8, 204 | nquantiles = 2, 205 | cores = cores) 206 | )) 207 | 208 | # Perform fusion/imputation 209 | # Suppress output to the console 210 | invisible(capture.output( 211 | p <- fusionModel::fuse(data = d[imp, ], 212 | fsn = temp.fsn, 213 | M = 1, 214 | cores = cores) 215 | )) 216 | 217 | # Update 'd' with imputed values 218 | set(d, i = which(imp), j = v, value = p[[2]]) 219 | 220 | # Update progress bar 221 | 
setTxtProgressBar(pb, value = i) 222 | 223 | } 224 | 225 | # Close progress bar 226 | close(pb) 227 | 228 | # Remove temporary fusion model 229 | unlink(temp.fsn) 230 | 231 | # Check for NA's in output 232 | stopifnot(!anyNA(d[, ..y])) 233 | 234 | # If any character columns were converted to factor, convert back to character 235 | if (length(cconv) > 0) d[, (cconv) := lapply(.SD, as.character), .SDcols = cconv] 236 | 237 | # Ensure output column order and class matches input 'data' 238 | suppressWarnings(set(d, j = "W_.._", value = NULL)) # Removes the placeholder weight variable, if present 239 | setcolorder(d, dnames) 240 | if (!data.dt) d <- as.data.frame(d) 241 | 242 | # Report processing time 243 | tout <- difftime(Sys.time(), t0) 244 | cat("Total processing time:", signif(as.numeric(tout), 3), attr(tout, "units"), "\n", sep = " ") 245 | 246 | return(d) 247 | 248 | } 249 | -------------------------------------------------------------------------------- /R/assemble.R: -------------------------------------------------------------------------------- 1 | #' Assemble fusionACS microdata across surveys 2 | #' 3 | #' @description 4 | #' For fusionACS usage only. Provides a safe and efficient way to assemble (merge) fused microdata across surveys to return a single data table with the requested variables. The requested variables can come from any fused (donor) survey and/or the American Community Survey (ACS). The necessary variables are automatically and efficiently read from the appropriate local file and safely merged on household and/or person ID variables, optionally collapsing or expanding records as necessary depending on the \code{respondent} argument. Assumes (and checks for) a local \code{/fusionData} directory with appropriate file structure and conventions. 5 | #' 6 | #' @param year Integer. One or more years for which to return results (i.e. the ACS recipient year). 7 | #' @param var Character. Name of one or more variables to return. May contain household- and/or person-level variables. See Details. 8 | #' @param respondent Character. Should \code{"household"}- or \code{"person"}-level microdata be returned? 9 | #' @param M Integer. The first \code{M} implicates are returned for fused variables. Set \code{M = Inf} to return all implicates. Ignored if \code{var} contains only ACS variables (i.e. no implicates) 10 | #' @param df Data frame. Data frame used to identify a subset of rows to return. Default is to return all rows. 11 | #' @param cores Integer. Number of cores used by the \code{\link[fst]{fst-package}} when reading from disk. 12 | #' @param source Character Specifies where to look for \code{var}: all available microdata (\code{source = "all"}); only ACS microdata (\code{source = "ACS"}); or only fused microdata (\code{source = "fused"}). Note that no observation weights are returned if \code{source = "fused"}, since weights are stored in the ACS microdata. 13 | #' @param silent Logical. If \code{FALSE}, a warning is issued if any \code{var} cannot be located in available local files. 14 | #' 15 | #' @details The \code{var} argument can contain a mix of household- and/or person-level variables. When \code{respondent = "household"}, the reference person (i.e. head of household) value is returned for any person-level variables. When \code{respondent = "person"}, the values of any household-level variables are replicated for each person in the household. 
16 | #' 17 | #' @return A keyed data table containing the following columns, in addition to the variables named in \code{var}: 18 | #' \describe{ 19 | #' \item{M}{Implicate number. See \code{\link{fuse}}.} 20 | #' \item{year}{Year of the ACS recipient microdata.} 21 | #' \item{hid}{ACS household ID using fusionACS convention.} 22 | #' \item{pid}{ACS person ID using fusionACS convention, if \code{respondent = "person"}.} 23 | #' \item{weight}{ACS microdata primary sample weight.} 24 | #' } 25 | #' 26 | #' @examples 27 | #' # NOTE: Requires local /fusionData directory containing the necessary ACS and .fsd files 28 | #' test <- assemble(year = 2018:2019, 29 | #' var = c("dollarel", "hincp", "agep", "state"), 30 | #' respondent = "household", 31 | #' M = 1) 32 | #' @export 33 | 34 | #------------ 35 | 36 | # library(tidyverse) 37 | # library(data.table) 38 | # source("R/utils.R") 39 | # setwd( "/home/kevin/Documents/Projects/fusionData") 40 | # 41 | # test <- assemble(year = 2015, 42 | # #var = c("dollarel" ,"dollarng", "dollarfo" ,"dollarlp" ,"hincp", "pov_ratio", "agep", "ref_race5", "rac1p", "hisp", "state", "puma10"), 43 | # var = c("hincp", "pov_ratio", "state", "puma10", "agep"), 44 | # respondent = "household", 45 | # M = 1, 46 | # cores = 2) 47 | 48 | #----- 49 | 50 | # year = 2019 51 | # var = c("dollarel") 52 | # respondent = "household" 53 | # M = 2 54 | # df = NULL 55 | # cores = 2 56 | # source = "fused" 57 | # silent= TRUE 58 | 59 | #----- 60 | 61 | assemble <- function(year, 62 | var, 63 | respondent = "household", 64 | M = 1, 65 | df = NULL, 66 | cores = 1, 67 | source = "all", 68 | silent = FALSE) { 69 | 70 | # Check validity of the working directory path 71 | # Checks if "/fusionData" is part of the path, as this is required 72 | b <- strsplit(full.path(getwd()), .Platform$file.sep, fixed = TRUE)[[1]] 73 | i <- which(b == "fusionData") 74 | if (length(i) == 0) stop("'/fusionData' is not part of the working directory path; this is required.") 75 | dir <- paste(b[1:i], collapse = .Platform$file.sep) 76 | 77 | # Capture the function call; added as attribute in the final output 78 | mcall <- match.call() 79 | 80 | # Respondent identifier ("H" or "P") 81 | rtype <- substring(toupper(respondent), 1, 1) 82 | 83 | # Initial argument check 84 | stopifnot(exprs = { 85 | year >= 2005 & year %% 1 == 0 86 | is.character(var) & length(var) > 0 87 | rtype %in% c("H", "P") 88 | is.null(df) | is.data.frame(df) 89 | M > 0 90 | cores > 0 & cores %% 1 == 0 91 | tolower(source) %in% c('all', 'acs', 'fused') 92 | is.logical(silent) 93 | }) 94 | 95 | # Universal variables always returned in output, if possible 96 | uvar <- c('M', 'year', 'hid', 'pid', 'weight') 97 | 98 | fst::threads_fst(nr_of_threads = cores) 99 | setDTthreads(threads = cores) 100 | hh <- rtype == "H" 101 | 102 | # Specified source for each 'var', if provided 103 | vsrc <- ifelse(str_detect(var, ":"), str_extract(var, "^[^:]+"), "") 104 | v <- ifelse(str_detect(var, ":"), str_extract(var, "(?<=:).*"), var) 105 | i <- !v %in% uvar 106 | vsrc <- vsrc[i] 107 | v <- v[i] 108 | 109 | #----- 110 | 111 | result <- lapply(year, function(yr) { 112 | 113 | # Get file paths to ACS microdata for 'yr' 114 | pa <- list.files(file.path(dir, "survey-processed/ACS"), pattern = paste0(yr, "_._processed.fst"), recursive = TRUE, full.names = TRUE) 115 | pa <- sort(pa, decreasing = rtype == "P") # Sort the processed ACS paths, to place either Household or Person microdata first 116 | pc <- list.files(file.path(dir, "survey-processed/ACS"), pattern =
paste0(yr, "_._custom.fst"), recursive = TRUE, full.names = TRUE) 117 | 118 | # Get file path(s) to fused microdata for 'yr' 119 | pf <- rev(list.files(file.path(dir, "fusion"), pattern = paste0(yr, "_._fused.fsd"), recursive = TRUE, full.names = TRUE)) 120 | 121 | # Select paths based on 'source' argument 122 | fpaths <- switch(tolower(source), 123 | all = c(pa, pc, pf), 124 | acs = c(pa, pc), 125 | fused = pf) 126 | 127 | # The survey name (e.g. RECS_2020) associated with each path 128 | survey <- str_extract(basename(fpaths), "^[^_]*_[^_]*") 129 | 130 | #--- 131 | 132 | # Check the donor survey 'fpaths' for duplicate occurrences of requested variables 133 | # Stop with error if conflict(s) detected 134 | # This can occur if there are identically named variables across surveys OR multiple vintages of a donor survey are fused to the same ACS vintage (e.g. RECS 2015 and RECS 2015 fused to ACS 2015-2019) 135 | i <- substring(survey, 1, 4) != "ACS_" 136 | vlist <- lapply(fpaths[i], function(x) intersect(v[vsrc == ""], fst::metadata_fst(x)$columnNames)) 137 | u <- table(unlist(vlist)) 138 | check <- lapply(vlist, function(x) intersect(x, names(u)[u > 1])) 139 | names(vlist) <- names(check) <- survey[i] 140 | if (any(lengths(check) > 0)) { 141 | check <- check[lengths(check) > 0] 142 | error.msg <- paste(capture.output(str(check)), collapse = "\n") 143 | stop("Conflicting 'var' names. The following variables are present in more than one source file:\n", error.msg, "\nRevise 'var' to use a colon to specify the source; e.g. ", paste0("'", names(check)[1], ":", check[[1]][1], "'")) 144 | } 145 | 146 | # Update the 'vsrc' object to assign the source survey for 'v' that are unassigned 147 | temp <- lapply(vlist, function(x) intersect(x, names(u)[u == 1])) 148 | for (i in seq_along(temp)) { 149 | for (j in temp[[i]]) { 150 | vsrc[v == j] <- names(vlist)[i] 151 | } 152 | } 153 | 154 | # Data frame with variable sources 155 | dv <- data.frame(year = yr, var = v, source = vsrc) 156 | 157 | #--- 158 | 159 | d <- data.table() 160 | for (x in fpaths) { 161 | 162 | a <- substring(basename(x), 1, 4) == "ACS_" 163 | r <- rev(strsplit(x, "_")[[1]])[[2]] 164 | xn <- fst::metadata_fst(x)$columnNames 165 | 166 | keep <- if (!all(c('year', 'hid') %in% xn)) { 167 | warning("Skipping file ", basename(x), " due to irregular file structure") 168 | NULL 169 | } else { 170 | i <- vsrc == str_extract(basename(x), "^[^_]*_[^_]*") | vsrc == "" 171 | temp <- intersect(xn, c(v[i], names(df))) 172 | if (a & rtype == r) temp <- c(temp, 'weight') 173 | setdiff(temp, names(d)) # Excludes any variables already in 'd' 174 | } 175 | 176 | # Load requested variables from disk 177 | if (length(keep)) { 178 | dt <- fusionModel::read_fsd(path = x, 179 | columns = intersect(xn, c('M', 'year', 'hid', 'pid', keep)), 180 | M = M, 181 | df = if (any(xn %in% names(df))) select(df, any_of(xn)) else NULL, 182 | cores = cores) 183 | } else { 184 | dt <- data.table() 185 | } 186 | 187 | # If household-level data requested, merge the reference person value for any person-level variables 188 | # TO DO: Allow different summary metric besides just the reference person value 189 | if (hh & rtype != r & nrow(dt) > 0) { 190 | i <- !duplicated(dt$hid) # Retains the reference person data; first entry within each 'hid' 191 | dt <- select(dt[i], -pid) 192 | } 193 | 194 | # NOTE: If person-level data is requested, the merge() below causes any household-level variables to be replicated for each person 195 | # Since it is an left merge, persons not in the 
household data (i.e. group quarter population) will contain NA values 196 | # If no household variables need to be added, then the returned person microdata will include the group quarter individuals 197 | if (nrow(dt) > 0) { 198 | setkeyv(dt, cols = intersect(c('M', 'year', 'hid', 'pid'), names(dt))) 199 | if (nrow(d) == 0) { 200 | d <- dt 201 | } else { 202 | #d <- merge(d, dt, by = intersect(key(d), key(dt)), allow.cartesian = TRUE) 203 | d <- collapse::join(x = d, 204 | y = dt, 205 | on = intersect(key(d), key(dt)), 206 | how = "left", # See NOTE above about left join and household vs. person data 207 | multiple = TRUE, 208 | verbose = FALSE) 209 | setkeyv(d, cols = key(dt)) 210 | } 211 | } 212 | rm(dt) 213 | 214 | } 215 | 216 | #return(d) 217 | return(list(d, dv)) 218 | 219 | }) 220 | 221 | # Extract and rbind the attribute data.frames 222 | attr.df <- result %>% 223 | purrr::map(2) %>% 224 | rbindlist() 225 | 226 | # Extract and rbind the microdata output 227 | result <- result %>% 228 | purrr::map(1) %>% 229 | rbindlist() %>% 230 | setcolorder(neworder = intersect(c(uvar, var), names(.))) %>% 231 | setattr(name = "origin", value = attr.df) %>% 232 | setattr(name = "assemble", value = mcall) 233 | 234 | # Set keys 235 | if (nrow(result) > 0) setkeyv(result, cols = intersect(c('M', 'year', 'hid', 'pid'), names(result))) 236 | 237 | # Any 'var' missing in the output? If so, report as warning. 238 | miss <- setdiff(v, names(result)) 239 | if (length(miss) & !silent) warning("Could not locate the following variable(s): ", paste(miss, collapse = ", ")) 240 | 241 | return(result) 242 | 243 | } 244 | -------------------------------------------------------------------------------- /R/validate.R: -------------------------------------------------------------------------------- 1 | #' Validate fusion output 2 | #' 3 | #' @description 4 | #' Performs internal validation analyses on fused microdata to estimate how well the simulated variables reflect patterns in the dataset used to train the underlying fusion model (i.e. observed/donor data). This provides a standard approach to validating fusion output and associated models. See Examples for recommended usage. 5 | #' 6 | #' @param observed Data frame. Observed data against which to validate the \code{simulated} variables. Typically the same dataset used to \code{\link{train}} the fusion model used to generate \code{simulated}. 7 | #' @param implicates Data frame. Implicates of synthetic (fused) variables. Typically generated by \link{fuse}. The implicates should be row-stacked and identified by integer column "M". 8 | #' @param subset_vars Character. Vector of columns in \code{observed} used to define the population subsets across which the fusion variables are validated. The levels of each \code{subset_vars} (including all two-way interactions of \code{subset_vars}) define the population subsets. Continuous \code{subset_vars} are converted to a five-level ordered factor based on a univariate k-means clustering. 9 | #' @param weight Character. Name of the observation weights column in \code{observed}. If NULL (default), uniform weights are assumed. 10 | #' @param min_size Integer. Subsets with less than \code{min_size} observations are excluded. Since subsets with few observations are unlikely to give reliable estimates, it doesn't make sense to consider them for validation purposes. 11 | #' @param plot Logical. If TRUE (default), \code{\link{plot_valid}} is called internally and summary plots are returned along with complete validation results. 
Requires the \code{\link{ggplot2}} package. 12 | #' @param cores Integer. Number of cores used. Only applicable on Unix systems. 13 | #' 14 | #' @details The objective of \code{\link{validate}} is to confirm that the fusion output is sensible and help establish the utility of the synthetic data across myriad analyses. Utility here is based on comparison of point estimates and confidence intervals derived using multiple-implicate synthetic data with those derived using the original donor data. 15 | #' 16 | #' The specific analyses tested include variable levels (means and proportions) across population subsets of varying size. This allows assessment of how each of the synthetic variables performs in analyses with real-world relevance, at varying levels of complexity. In effect, \code{validate()} performs a large number of analyses of the kind that the \code{\link{analyze}} function is designed to do on a one-by-one basis. 17 | #' 18 | #' Most users will want to use the default setting \code{plot = TRUE} to simultaneously return visualization (plots) of the validation results. Plot creation is detailed in \code{\link{plot_valid}}. 19 | #' 20 | #' @return If \code{plot = FALSE}, a data frame containing complete validation results. If \code{plot = TRUE}, a list containing the complete validation results as well as additional plot objects as described in \code{\link{plot_valid}}. 21 | #' 22 | #' @examples 23 | #' # Build a fusion model using RECS microdata 24 | #' # Note that "fusion_model.fsn" will be written to working directory 25 | #' fusion.vars <- c("electricity", "natural_gas", "aircon") 26 | #' predictor.vars <- names(recs)[2:12] 27 | #' fsn.path <- train(data = recs, 28 | #' y = fusion.vars, 29 | #' x = predictor.vars, 30 | #' weight = "weight") 31 | #' 32 | #' # Fuse back onto the donor data (multiple implicates) 33 | #' sim <- fuse(data = recs, 34 | #' fsn = fsn.path, 35 | #' M = 20) 36 | #' 37 | #' # Calculate validation results 38 | #' valid <- validate(observed = recs, 39 | #' implicates = sim, 40 | #' subset_vars = c("income", "education", "race", "urban_rural")) 41 | #' 42 | #' @export 43 | 44 | #----- 45 | 46 | # library(fusionModel) 47 | # library(dplyr) 48 | # library(data.table) 49 | # library(purrr) 50 | # library(ggplot2) 51 | # library(fst) 52 | # source("R/utils.R") 53 | # source("R/analyze_funs.R") 54 | # observed <- read_fst("~/Documents/Projects/fusionData/fusion/RECS/2015/2015/input/RECS_2015_2015_train.fst") 55 | # implicates <- read_fsd("~/Documents/Projects/fusionData/fusion/RECS/2015/2015/output/RECS_2015_2015_valid.fsd") 56 | # subset_vars = c("moneypy__hincp", "hhage__agep", "householder_race__rac1p", "education__schl", "nhsldmem__np", "kownrent__ten", "loc..recs_division") 57 | # weight = "weight" 58 | # min_size = 30 59 | # plot = TRUE 60 | # cores = 1 61 | 62 | #----- 63 | 64 | validate <- function(observed, 65 | implicates, 66 | subset_vars, 67 | weight = NULL, 68 | min_size = 30, 69 | plot = TRUE, 70 | cores = 1) { 71 | 72 | t0 <- Sys.time() 73 | 74 | obs <- as.data.table(observed) 75 | sim <- as.data.table(implicates) 76 | rm(observed, implicates) 77 | 78 | # Check inputs 79 | stopifnot(exprs = { 80 | all(setdiff(names(sim), "M") %in% names(obs)) 81 | is.character(subset_vars) & all(subset_vars %in% names(obs)) 82 | is.null(weight) | (length(weight) == 1 & weight %in% names(obs)) 83 | min_size > 0 84 | is.logical(plot) 85 | cores > 0 & cores %% 1 == 0 86 | }) 87 | 88 | # Check that input dimensions are consistent with one another 89 | N <- nrow(obs) 90 | Mimp <- max(sim$M) 91 | if
(Mimp == 1) stop("Validation requires 'implicates' to contain more than one implicate") 92 | nM <- sim[, .N, by = M] 93 | stopifnot(N == nrow(sim) / Mimp) 94 | stopifnot(all(nM$M %in% seq_len(Mimp))) 95 | stopifnot(all(nM$N == N)) 96 | 97 | if (is.null(weight)) { 98 | cat("Assuming uniform sample weights\n") 99 | obs$W <- 1L 100 | } else { 101 | setnames(obs, weight, "W") 102 | obs[, W := W / mean(W)] 103 | } 104 | 105 | # Variables for which estimates will be computed 106 | y <- setdiff(names(sim), "M") 107 | yN <- length(y) # Reported to console later 108 | if (any(y %in% subset_vars)) stop("Fused variabes are not allowed as 'subset_vars'") 109 | 110 | # Generate list of combinations of subset variables to test 111 | # Restricted to no more than 2 subset variables per combination (m = 2) 112 | scomb <- as.list(subset_vars) 113 | if (length(subset_vars) > 1) scomb <- c(scomb, combn(subset_vars, m = 2, simplify = FALSE)) 114 | 115 | # Restrict 'obs' to only the necessary variables 116 | v <- c("W", y, subset_vars) 117 | obs <- obs[, ..v] 118 | 119 | #--- 120 | 121 | # Which subset_vars are numeric or ordered factor? 122 | sclus <- names(which(sapply(obs[, ..subset_vars], function(x) is.ordered(x) | is.numeric(x)))) 123 | 124 | # Convert numeric and ordered factor subset variables into clustered, ordered factors with k levels via uniCluster() 125 | for (v in sclus) set(obs, j = v, value = uniCluster(x = obs[[v]], k = 5)) 126 | 127 | #--- 128 | 129 | # Convert logical 'y' variables to factor so they are handled appropriately 130 | ylog <- names(which(sapply(obs[, ..y], is.logical))) 131 | for (v in ylog) { 132 | set(obs, j = v, value = as.factor(obs[[v]])) 133 | set(sim, j = v, value = as.factor(sim[[v]])) 134 | } 135 | 136 | # One-hot encode the y factor variables 137 | yfct <- names(which(sapply(obs[, ..y], is.factor))) 138 | if (length(yfct) > 0) { 139 | cat("One-hot encoding categorical fusion variables\n") 140 | obs <- one_hot(obs, yfct) 141 | sim <- one_hot(sim, yfct) 142 | y <- setdiff(names(sim), "M") 143 | } 144 | 145 | #--- 146 | 147 | # Detect and impute any missing values in 'observed' 148 | # Note that 'simulated' should not have any NA's by design 149 | na.cols <- names(which(sapply(obs, anyNA))) 150 | if (length(na.cols) > 0) { 151 | cat("Missing values imputed for the following variable(s):\n", paste(na.cols, collapse = ", "), "\n") 152 | for (j in na.cols) { 153 | x <- observed[[j]] 154 | ind <- is.na(x) 155 | observed[ind, j] <- imputationValue(x, ind) 156 | } 157 | } 158 | 159 | #--- 160 | 161 | # Combine observed and simulated data 162 | v <- c("W", subset_vars) 163 | sim <- cbind(sim, obs[rep(1:N, Mimp), ..v]) 164 | 165 | # Key the data.tables for speed in 'by' operations below 166 | setkeyv(sim, c("M", subset_vars)) 167 | setkeyv(obs, subset_vars) 168 | 169 | #--- 170 | 171 | # Report the overall correlation between observed and simulated fusion variables 172 | # This is a quick check if the simulated data are simply replicating the original values (over-fitting) 173 | ycor <- sapply(y, function(v) suppressWarnings(cor(rep(obs[[v]], Mimp), sim[, ..v]))) 174 | cat("Correlation between observed and fused values:\n") 175 | print(summary(ycor), digits = 2) 176 | 177 | #--- 178 | 179 | # TEST -- generate random versions of the 'y' variables in 'sim' 180 | # for (v in y) { 181 | # set(sim, j = paste0(v, "__RND"), value = sample(obs[[v]], size = nrow(sim), replace = TRUE)) 182 | # } 183 | # ysim <- c(y, paste0(y, "__RND")) 184 | 185 | #--- 186 | 187 | # !!! 
TO DO: Potentially return correlation coefficients for each subset 188 | # x <- filter(sim, M == 1) 189 | # test <- cor(x[, ..y]) 190 | # f <- function(x) cor(x[, ..y]) # Restrict to desired entries... 191 | # test <- sim[, f(.SD), by = M, .SDcols = y] 192 | 193 | #--- 194 | 195 | # Function to compute confidence interval bounds 196 | # p = 0.95 entails a 90% confidence interval 197 | # calcCI <- function(d, p = 0.95) { 198 | # d %>% 199 | # mutate( 200 | # lwr = est - qt(p, df) * se, # CI lower bound 201 | # upr = est + qt(p, df) * se # CI upper bound 202 | # ) 203 | # } 204 | 205 | # Function to compute margin of error (MOE) 206 | # p = 0.95 entails a 90% confidence interval 207 | calcMOE <- function(d, p = 0.95) { 208 | d %>% 209 | mutate( 210 | moe = se * qt(p, df) 211 | ) 212 | } 213 | 214 | #--- 215 | 216 | # Process a particular combination of subsetting variables 217 | 218 | processSubset <- function(iset) { 219 | 220 | svar <- scomb[[iset]] 221 | 222 | # Calculate the share of observations within each subset, using the observed data 223 | subshr <- obs[, .(share = .N / N), by = svar] 224 | subshr <- subshr[share >= min_size / N] 225 | subshr$id <- paste(iset, 1:nrow(subshr), sep = "_") 226 | 227 | # Restrict 'obs' and 'sim' to subsets with at least 'min_size' observations 228 | obs <- obs[subshr, on = svar] 229 | sim <- sim[subshr, on = svar] 230 | 231 | #--- 232 | 233 | # Calculate outcomes for the observed data 234 | g <- obs[, lapply(.SD, weighted_mean, w1 = W, w2 = W), by = svar, .SDcols = y] 235 | g$metric <- rep(c("estimate1", "estimate2", "variance"), times = nrow(g) / 3) 236 | g <- melt(g, measure.vars = y, variable.name = "y") 237 | g <- dcast(g, ... ~ metric, value.var = "value") 238 | g <- g[subshr, on = svar] # Adds "share" and "id" variables 239 | g <- mutate(g, 240 | est = estimate1, 241 | se = sqrt(variance), 242 | df = as.integer(share * N - 1)) %>% 243 | select(-estimate1, -estimate2, -variance) %>% 244 | calcMOE() 245 | 246 | #--- 247 | 248 | # Calculate outcomes for the simulated data 249 | 250 | # Weighted mean and variance of the mean for each implicate 251 | d <- sim[, lapply(.SD, weighted_mean, w1 = W, w2 = W), by = c("M", svar), .SDcols = y] 252 | #d <- sim[, lapply(.SD, weighted_mean, w1 = W, w2 = W), by = c("M", svar), .SDcols = ysim] 253 | d$metric <- rep(c("estimate1", "estimate2", "variance"), times = nrow(d) / 3) 254 | d <- melt(d, id.vars = c("M", "metric", svar), variable.name = "y") 255 | d <- dcast(d, ... 
~ metric, value.var = "value") 256 | 257 | # Calculate the necessary quantities 258 | # est: Point estimates 259 | # ubar: Mean of the within-implicate variances 260 | # b: Variance of estimates, across the implicates (approximated by variance of mixture of normal distributions) 261 | d <- d[, .(est = mean(estimate1), 262 | ubar = mean(variance), 263 | b = var(estimate1)), 264 | #b = (sum(estimate1 ^ 2 + variance) / Mimp) - mean(estimate1) ^ 2), # Conservative approximation of var(estimate1); does not risk b1 < ubar 265 | by = c(svar, "y")] 266 | 267 | # Adds "share" and "id" variables 268 | d <- d[subshr, on = svar] 269 | 270 | # Calculate margin of error 271 | #maxr <- maxr_fun(Mimp) 272 | d <- mutate(d, 273 | # b = ifelse(ubar / b > maxr, ubar / maxr, b), 274 | # b = ifelse(ubar == 0, 0, b), 275 | 276 | # Rubin 1987 277 | se = sqrt(ubar + (1 + Mimp^(-1)) * b), 278 | r = (1 + Mimp^(-1)) * b / ubar, 279 | df = (Mimp - 1) * (1 + r^(-1)) ^ 2, 280 | df = ifelse(is.infinite(df), as.integer(share * N - 1), df), # Set 'df' to standard value if Inf is returned by Rubin's formula (i.e. when b = 0; no variance across implicates) 281 | r = NULL 282 | 283 | # Reiter and Raghunathan (2007) 284 | # se = sqrt(b * (1 + 1 / Mimp) - ubar), 285 | # df = (Mimp - 1) * (1 - Mimp * ubar / ((Mimp + 1) * b)) ^ 2, 286 | # df = ifelse(se == 0, 1, df) 287 | ) %>% 288 | select(-ubar, -b, -share, -id) %>% 289 | #select(-ubar, -b) %>% 290 | calcMOE() 291 | 292 | # Merge observed and simulated results 293 | comp <- merge(g, d, by = c(svar, "y"), suffixes = c(".obs", ".sim")) %>% 294 | select(-all_of(svar)) 295 | return(comp) 296 | 297 | } 298 | 299 | #--- 300 | 301 | # Process each subset combination in 'scomb' 302 | cat("Processing validation analyses for", yN, "fusion variables\n") 303 | comp <- parallel::mclapply(seq_along(scomb), processSubset, mc.cores = cores) %>% 304 | rbindlist() 305 | 306 | #--- 307 | 308 | # Add the observed mean estimates (est.mean) 309 | avg <- matrixStats::colWeightedMeans(x = as.matrix(obs[, ..y]), w = obs$W) 310 | set(comp, j = "est.mean", value = avg[match(comp$y, names(avg))]) 311 | 312 | #--- 313 | 314 | # Add original factor variable levels, if necessary 315 | if (length(yfct) > 0) { 316 | comp <- merge(x = comp, y = attr(obs, "one_hot_link"), by.x = "y", by.y = "dummy", all.x = TRUE, sort = FALSE) 317 | comp <- comp %>% 318 | mutate(y = ifelse(is.na(level), y, original)) %>% 319 | select(-original) 320 | } else { 321 | comp$level <- NA 322 | } 323 | 324 | #--- 325 | 326 | # Report total unique subsets analyzed 327 | cat("Performed", nrow(comp), "analyses across", uniqueN(comp$id), "subsets\n") 328 | 329 | # Data frame with comparison results 330 | keep <- c("id", "share", "y", "level", "est.obs", "moe.obs", "est.sim", "moe.sim", "est.mean") 331 | result <- comp[, ..keep] 332 | result$y <- as.character(result$y) 333 | class(result) <- c("validate", class(comp)) 334 | 335 | # Clean up 336 | rm(obs, sim, comp) 337 | 338 | #--- 339 | 340 | # Check if ggplot2 is installed; if so, call plot_valid() prior to returning result 341 | if (plot) { 342 | suppressMessages(ok <- require(ggplot2, quietly = TRUE, warn.conflicts = FALSE)) 343 | if (ok) { 344 | result <- plot_valid(result, cores = cores) 345 | class(result) <- c("validate", class(result)) 346 | } else { 347 | cat("Skipping plot generation because 'ggplot2' package is not installed\n") 348 | } 349 | } 350 | 351 | # Report processing time 352 | tout <- difftime(Sys.time(), t0) 353 | cat("Total processing time:", signif(as.numeric(tout), 
3), attr(tout, "units"), "\n", sep = " ") 354 | 355 | return(result) 356 | 357 | } 358 | -------------------------------------------------------------------------------- /R/prepXY.R: -------------------------------------------------------------------------------- 1 | #' Prepare the 'x' and 'y' inputs 2 | #' 3 | #' @description 4 | #' Optional-but-useful function to: 1) provide a plausible ordering of the 'y' (fusion) variables and 2) identify the subset of 'x' (predictor) variables likely to be consequential during subsequent model training. Output can be passed directly to \code{\link{train}}. Most useful for large datasets with many and/or highly-correlated predictors. Employs an absolute Spearman rank correlation screen and then LASSO models (via \code{\link[glmnet]{glmnet}}) to return a plausible ordering of 'y' and the preferred subset of 'x' variables associated with each. 5 | #' 6 | #' @param data Data frame. Training dataset. All categorical variables should be factors and ordered whenever possible. 7 | #' @param y Character or list. Variables in \code{data} to eventually fuse to a recipient dataset. If \code{y} is a list, each entry is a character vector possibly indicating multiple variables to fuse as a block. 8 | #' @param x Character. Predictor variables in \code{data} common to donor and eventual recipient. 9 | #' @param weight Character. Name of the observation weights column in \code{data}. If NULL (default), uniform weights are assumed. 10 | #' @param cor_thresh Numeric. Predictors that exhibit less than \code{cor_thresh} absolute Spearman (rank) correlation with a \code{y} variable are screened out prior to the LASSO step. Fast exclusion of predictors that the LASSO step probably doesn't need to consider. 11 | #' @param lasso_thresh Numeric. Controls how aggressively the LASSO step screens out predictors. Lower value is more aggressive. \code{lasso_thresh = 0.95}, for example, retains predictors that collectively explain at least 95% of the deviance explained by a "full" model. 12 | #' @param xmax Integer. Maximum number of predictors returned by LASSO step. Does not strictly control the number of final predictors returned (especially for categorical \code{y} variables), but useful for setting a (very) soft upper bound. Lower \code{xmax} can help control computation time if a large number of \code{x} pass the correlation screen. \code{xmax = Inf} imposes no restriction. 13 | #' @param xforce Character. Subset of \code{x} variables to "force" as included predictors in the results. 14 | #' @param fraction Numeric. Fraction of observations in \code{data} to randomly sample. For larger datasets, sampling often has minimal effect on results but speeds up computation. 15 | #' @param cores Integer. Number of cores used. Only applicable on Unix systems. 16 | #' 17 | #' @return List with named slots "y" and "x". Each is a list of the same length. Former gives the preferred fusion order. Latter gives the preferred sets of predictor variables. 18 | #' 19 | #' @examples 20 | #' y <- names(recs)[c(14:16, 20:22)] 21 | #' x <- names(recs)[2:13] 22 | #' 23 | #' # Fusion variable "blocks" are respected by prepXY() 24 | #' y <- c(list(y[1:2]), y[-c(1:2)]) 25 | #' 26 | #' # Do the prep work... 
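#' # (For large datasets, prepXY()'s 'fraction' argument subsamples rows to speed this step up)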
27 | #' prep <- prepXY(data = recs, y = y, x = x) 28 | #' 29 | #' # The result can be passed to train() 30 | #' train(data = recs, y = prep$y, x = prep$x) 31 | #' 32 | #' @export 33 | 34 | #----- 35 | 36 | # library(fusionModel) 37 | # library(data.table) 38 | # library(tidyverse) 39 | # source("R/utils.R") 40 | # 41 | # data <- select(recs, -starts_with("rep_")) 42 | # y <- names(recs)[c(13:16, 20:22)] 43 | # x <- setdiff(names(data), c(y, "weight")) 44 | # weight = "weight" 45 | # fraction = 1 46 | # cor_thresh = 0.05 47 | # lasso_thresh = 0.95 48 | # cores = 1 49 | # xmax = 5 50 | # xforce = NULL 51 | # 52 | # # Let y have a block 53 | # y <- c(list(y[1:2]), y[-c(1:2)]) 54 | # 55 | # test <- prepXY(data, y, x, weight = "weight") 56 | 57 | #----- 58 | 59 | prepXY <- function(data, 60 | y, 61 | x, 62 | weight = NULL, 63 | cor_thresh = 0.05, 64 | lasso_thresh = 0.95, 65 | xmax = 100, 66 | xforce = NULL, 67 | fraction = 1, 68 | cores = 1) { 69 | 70 | t0 <- Sys.time() 71 | 72 | # Create correct 'y' and 'ylist', depending on input type 73 | if (is.list(y)) { 74 | ylist <- y 75 | y <- unlist(ylist) 76 | } else { 77 | ylist <- as.list(y) 78 | } 79 | 80 | stopifnot(exprs = { 81 | is.data.frame(data) 82 | all(y %in% names(data)) 83 | all(x %in% names(data)) 84 | length(intersect(y, x)) == 0 85 | all(lengths(ylist) > 0) 86 | is.null(weight) | weight %in% names(data) 87 | is.null(xforce) | all(xforce %in% x) 88 | xmax >= 1 89 | fraction > 0 & fraction <= 1 90 | cor_thresh > 0 & cor_thresh <= 1 91 | lasso_thresh > 0 & lasso_thresh <= 1 92 | cores >= 1 & cores %% 1 == 0 93 | }) 94 | 95 | # TO DO: Make operations data.table for efficiency 96 | if (is.data.table(data)) data <- as.data.frame(data) 97 | 98 | # Check for character-type variables; stop with error if any detected 99 | # Check for no-variance (constant) variables 100 | # Detect and impute any missing values in 'x' variables 101 | data <- checkData(data = data, y = y, x = x, nfolds = NULL, impute = TRUE) 102 | 103 | # In case any variables are removed by checkData(), update 'x', 'y', and 'ylist' 104 | x <- intersect(x, names(data)) 105 | for (i in 1:length(ylist)) ylist[[i]] <- intersect(ylist[[i]], names(data)) 106 | ylist <- purrr::compact(ylist) 107 | y <- unlist(ylist) 108 | 109 | #--- 110 | 111 | # Observation weights vector 112 | W <- if (is.null(weight)) { 113 | rep(1L, nrow(data)) 114 | } else { 115 | data[[weight]] / mean(data[[weight]]) 116 | } 117 | set(data, j = weight, value = NULL) 118 | 119 | #----- 120 | 121 | # Which 'y' variables are zero-inflated? 122 | yinf <- names(which(sapply(data[y], inflated))) 123 | 124 | if (length(yinf)) { 125 | 126 | # Create '*_zero' versions of the zero-inflated variables 127 | dinf <- data[yinf] %>% 128 | mutate_all(~ .x != 0) %>% 129 | setNames(paste0(yinf, "_zero")) # Variable has "_zero" suffix, but it actually indicates when the original 'y' variable is NON-zero.
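    # Illustrative sketch (hypothetical values): a zero-inflated variable with values
    # c(0, 310, 0, 85) gets a "_zero" companion equal to c(FALSE, TRUE, FALSE, TRUE),
    # i.e. TRUE wherever the original value is non-zero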
130 | 131 | # Update the zero-inflated variables in data to have NA instead of zero values 132 | data <- data %>% 133 | cbind(dinf) 134 | 135 | # Update 'ylist' to include '*_zero' versions in block with original zero-inflated variables 136 | ylist <- lapply(ylist, function(v) if (any(v %in% yinf)) c(v, paste0(intersect(v, yinf), "_zero")) else v) 137 | y <- unlist(ylist) 138 | 139 | } 140 | 141 | #----- 142 | 143 | # Sample 'data', if requested 144 | if (fraction < 1) { 145 | samp <- sample.int(n = nrow(data), size = round(nrow(data) * fraction)) 146 | data <- data[samp, ] 147 | W <- W[samp] 148 | } 149 | 150 | #----- 151 | 152 | # Assemble the 'Z' matrix with all required variables 153 | 154 | X <- data[x] %>% 155 | mutate_if(is.ordered, as.integer) %>% 156 | mutate_if(is.logical, as.integer) %>% 157 | mutate_if(is.factor, lumpFactor, nmax = 5) %>% 158 | one_hot(dropUnusedLevels = TRUE) 159 | xlink <- attr(X, "one_hot_link") 160 | xcols <- names(X) 161 | 162 | Y <- data[y] %>% 163 | mutate_if(is.logical, as.integer) %>% 164 | mutate_if(is.factor, lumpFactor, nmax = 5) %>% 165 | one_hot(dropOriginal = TRUE, dropUnusedLevels = TRUE) 166 | ylink <- attr(Y, "one_hot_link") 167 | yfactor <- names(which(sapply(data[y], is.factor))) 168 | ycols <- names(Y) 169 | 170 | xylink <- rbind(xlink, ylink) 171 | rm(data) 172 | Z <- as.matrix(cbind(Y, X)) # Could make sparse? 173 | rm(X, Y) 174 | 175 | #----- 176 | 177 | # TO DO: Switch to using this code instead of 'X' and 'Y' blocks above 178 | # 8/9/24:TEST ALT 179 | # MOVE this could into separate function? It is also in impute() 180 | # d2 <- copy(data) 181 | # 182 | # # Convert 'd2' to plausible ranks for correlation screening 183 | # # All output columns should be NA, integer, or logical 184 | # # NA's in input are preserved in output 185 | # for (i in 1:ncol(d2)) { 186 | # z <- d2[[i]] 187 | # if (is.numeric(z)) { 188 | # # Ties method 'dense' ensures integer output with minimum of 1 and maximum of length(na.omit(z)) 189 | # z <- frank(z, ties.method = "dense", na.last = "keep") 190 | # } else { 191 | # if (is.ordered(z)) { 192 | # z <- as.integer(z) 193 | # } else { 194 | # if (!is.logical(z)) { 195 | # # Converts character and un-ordered factors to TRUE for the most-common (non-NA) value and FALSE otherwise 196 | # zt <- table2(z, na.rm = TRUE) 197 | # z <- z == names(which.max(zt)) 198 | # } 199 | # } 200 | # } 201 | # set(d2, j = i, value = z) 202 | # } 203 | 204 | #----- 205 | 206 | # intersect() call restricts to factor levels present in 'Z' (some levels can be dropped when 'data' is randomly subsampled) 207 | vc <- lapply(y, function(v) { 208 | out <- if (v %in% yfactor) { 209 | vl <- filter(ylink, original == v)$dummy 210 | intersect(vl, colnames(Z)) 211 | } else { 212 | v 213 | } 214 | return(out) 215 | }) %>% 216 | setNames(y) 217 | 218 | #----- 219 | 220 | # Determine the x-predictors that pass absolute correlation threshold for each y 221 | # The 'Zr' matrix contains the ranks, so the correlation threshold refers to Spearman (rank) correlation 222 | cat("Identifying 'x' that pass absolute Spearman correlation threshold\n") 223 | Zr <- matrixStats::colRanks(Z, ties.method = "average", preserveShape = TRUE, useNames = TRUE) 224 | xok <- parallel::mclapply(unlist(vc), function(v) { 225 | 226 | # Initial correlation screening, based on absolute correlation value 227 | p <- abs(suppressWarnings(cor(Zr[, v], Zr[, xcols], use = "pairwise.complete.obs"))) 228 | p[is.na(p)] <- 0 229 | vx <- which(p > cor_thresh) # Arbitrary correlation 
threshold 230 | 231 | # Ensure some minimum number of predictors are passed to glmnet() 232 | # If there are too few predictors, glmnet() may fail 233 | if (length(vx) < 20) vx <- order(p, decreasing = TRUE)[1:min(20, length(p))] 234 | 235 | return(xcols[vx]) 236 | 237 | }, mc.cores = cores) %>% 238 | setNames(unlist(vc)) 239 | rm(Zr) 240 | 241 | #----- 242 | 243 | # Wrapper function for fitting a glmnet LASSO model 244 | # Used repeatedly in looped calls below 245 | gfit <- function(y, x) { 246 | i <- if (y %in% yinf) Z[, y] != 0 else rep(TRUE, nrow(Z)) 247 | suppressWarnings({ 248 | glmnet::glmnet( 249 | x = Z[i, x], 250 | y = Z[i, y], 251 | weights = W[i], 252 | family = "gaussian", 253 | pmax = min(xmax, length(x)), 254 | alpha = 1) 255 | }) 256 | } 257 | 258 | # Testing with 'pmax' argument enabled 259 | # y = "square_feet" 260 | # x = setdiff(xcols, y) 261 | # m <- gfit(y, x) 262 | # i <- which(m$dev.ratio / max(m$dev.ratio) >= ifelse(m$jerr == 0, lasso_thresh, 1))[1] # Preferred lambda index value for each model fit, based on supplied lasso threshold 263 | # cf <- coef(m, s = m$lambda[i]) 264 | # sum(cf[, 1] != 0) 265 | 266 | #----- 267 | 268 | cat("Fitting full models for each 'y'\n") 269 | 270 | # Weights 271 | ywgt <- matrixStats::colWeightedMeans(Z, W, cols = ycols) 272 | 273 | # Fit the "full" models for each fusion variable/block 274 | rmax <- parallel::mclapply(ylist, function(yvar) { 275 | sapply(yvar, function(v) { 276 | V <- vc[[v]] # Column names of dummies in case where 'v' is a factor 277 | fits <- lapply(V, function(yv) gfit(y = yv, x = c(xok[[yv]], setdiff(ycols, c(yvar, V))))) 278 | r2 <- sapply(fits, function(m) max(m$dev.ratio)) 279 | if (length(r2) > 1) r2 <- sum(r2 * ywgt[V]) # Only applies weighting when 'r2' contains multiple values (i.e. unique value for each factor level within a variable) 280 | r2 281 | }) %>% 282 | mean() # Returns mean of R2 in case of multiple 'yvar' 283 | }, mc.cores = cores) %>% 284 | simplify2array() %>% 285 | setNames(ylist) 286 | 287 | #----- 288 | 289 | cat("Iteratively constructing preferred fusion order\n") 290 | 291 | # Start building the preferred fusion order... 292 | ord <- NULL # Vector with preferred fusion variable sequence 293 | xpred <- NULL 294 | 295 | # Print loop progress to console? 296 | for (i in 1:length(ylist)) { 297 | 298 | # Candidate y variables remaining to add to 'ord' 299 | ycand <- setdiff(ylist, ord) 300 | 301 | out <- parallel::mclapply(ycand, function(yvar) { 302 | 303 | # This wrapper is necessary to handle cases of blocked 'yvar' with 2+ fusion variables OR case of zero-inflated fusion variable with a "_zero" version included. 
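      # When 'yvar' is a block of multiple fusion variables, each member is modeled
      # separately below and the block's R-squared is taken as the mean across members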
304 | out2 <- lapply(yvar, function(v) { 305 | V <- vc[[v]] 306 | fits <- lapply(V, function(yv) { 307 | xx <- xok[[yv]] # x-predictors that pass minimum correlation threshold (or best-n to meet minimum number of predictors) 308 | m <- gfit(y = yv, x = c(xx, unlist(vc[unlist(ord)]))) 309 | i <- which(m$dev.ratio / max(m$dev.ratio) >= ifelse(m$jerr == 0, lasso_thresh, 1))[1] # Preferred lambda index value for each model fit, based on supplied lasso threshold 310 | r2 <- m$dev.ratio[i] 311 | cf <- coef(m, s = m$lambda[i]) 312 | xk <- names(which(Matrix::rowSums(cf != 0) > 0)[-1]) # Predictors with non-zero coefficients 313 | xx <- setdiff(xx, xk) # Remaining zero-coefficient x-predictors, in order of correlation preference 314 | if (length(xk) < 20 & length(xx) > 0) xk <- c(xk, xx[1:min(20 - length(xk), length(xx))]) # Adds zero-coefficient predictors to achieve some minimum number 315 | list(r2 = r2, xk = xk) 316 | }) 317 | 318 | # Extract R-squared for each model and calculate weighted mean across factor levels, if necessary 319 | r2 <- sapply(fits, function(m) m$r2) 320 | if (length(r2) > 1) r2 <- sum(r2 * ywgt[V]) 321 | 322 | # Extract full set of useful x-predictors 323 | xk <- lapply(fits, function(m) m$xk) %>% 324 | unlist() %>% 325 | unique() 326 | 327 | # Return R2 and x-predictor results 328 | list(r2 = r2, xk = xk) 329 | 330 | }) 331 | 332 | #--- 333 | 334 | # Handle blocked 'v' case (combine results across individual variables) 335 | out2 <- if (length(out2) > 1) { 336 | list(r2 = mean(sapply(out2, function(x) x$r2)), 337 | xk = unique(unlist(lapply(out2, function(x) x$xk)))) 338 | } else { 339 | out2[[1]] 340 | } 341 | 342 | return(out2) 343 | 344 | }, mc.cores = cores) %>% 345 | setNames(ycand) 346 | 347 | r2 <- sapply(out, function(m) m$r2) 348 | score <- r2 / rmax[names(r2)] 349 | best <- which.max(score) 350 | 351 | # Update 'ord' with best next fusion variable(s) in the chain 352 | ord <- c(ord, ycand[best]) 353 | 354 | # Extract the predictor variables to be used for 'best' fusion variable 355 | keep <- out[[best]]$xk 356 | # if (!is.null(xlink)) { 357 | # i <- keep %in% xlink$dummy 358 | # keep[i] <- filter(xlink, dummy %in% keep[i])$original 359 | if (!is.null(xylink)) { 360 | # i <- keep %in% xylink$dummy 361 | # keep[i] <- filter(xylink, dummy %in% keep[i])$original 362 | i <- xylink$original[match(keep, xylink$dummy)] 363 | i[is.na(i)] <- keep[is.na(i)] 364 | keep <- i 365 | } 366 | keep <- unique(keep) 367 | #keep <- setdiff(keep, ycols) # Remove any fusion variables from the preferred predictor set 368 | xpred <- c(xpred, list(keep)) 369 | 370 | } 371 | 372 | #--- 373 | 374 | # Remove any "*_zero" variables from the 'ord' result or 'xpred' results 375 | ord <- lapply(ord, function(v) setdiff(v, paste0(yinf, "_zero"))) 376 | xpred <- lapply(xpred, function(v) setdiff(v, paste0(yinf, "_zero"))) 377 | 378 | # Force inclusion of 'xforce' predictor variables 379 | xpred <- lapply(xpred, function(v) unique(c(v, xforce))) 380 | 381 | # Nicely name and order the x-predictors list in order of original 'x' 382 | # names(xpred) <- sapply(ord, paste, collapse = " | ") 383 | # xpred <- lapply(xpred, function(v) v[order(match(v, x))]) 384 | 385 | # Results list 386 | result <- list(y = ord, x = xpred) 387 | 388 | # The full set of variables being retained, stored as attribute 389 | pvars <- unique(unlist(xpred)) 390 | #stopifnot(all(pvars %in% x)) 391 | attr(result, "xpredictors") <- intersect(x, pvars) 392 | attr(result, "xforce") <- xforce 393 | attr(result, "xoriginal") <- x # 
Original, full set of potential predictor variables 394 | #cat("Retained", length(pvars), "of", length(x), "potential predictor variables\n") 395 | cat("Retained", length(intersect(x, pvars)), "of", length(x), "potential predictor variables\n") 396 | 397 | # Report processing time 398 | tout <- difftime(Sys.time(), t0) 399 | cat("Total processing time:", signif(as.numeric(tout), 3), attr(tout, "units"), "\n", sep = " ") 400 | 401 | return(result) 402 | 403 | } 404 | -------------------------------------------------------------------------------- /R/plot_valid.R: -------------------------------------------------------------------------------- 1 | #' Plot validation results 2 | #' 3 | #' @description 4 | #' Creates and optionally saves to disk representative plots of validation results returned by \code{\link{validate}}. Requires the suggested \code{\link{ggplot2}} package. This function is (by default) called within \code{\link{validate}}. Can be useful on its own to save graphics to disk or generate plots for a subset of fusion variables. 5 | #' 6 | #' @param valid Object returned by \code{\link{validate}}. 7 | #' @param y Character. Fusion variables to use for validation graphics. Useful for plotting partial validation results. Default is to use all fusion variables present in \code{valid}. 8 | #' @param path Character. Path to directory where .png graphics are to be saved. Directory is created if necessary. If NULL (default), no files are saved to disk. 9 | #' @param cores Integer. Number of cores used. Only applicable on Unix systems. 10 | #' @param ... Arguments passed to \code{\link[ggplot2]{ggsave}} to control .png graphics saved to disk. 11 | #' 12 | #' @details Validation results are visualized to convey expected, typical (median) performance of the fusion variables. That is, how well do the simulated data match the observed data with respect to point estimates and confidence intervals for population subsets of various size? 13 | #' 14 | #' Plausible error metrics are derived from the input validation data for plotting. For comparison of point estimates, the error metric is absolute percent error for continuous variables; in the categorical case it is absolute error scaled such that the maximum possible error is 1. Since these metrics are not strictly comparable, the all-variable plots denote categorical fusion variables with dotted lines. 15 | #' 16 | #' For a given fusion variable, the error metric will exhibit variation (often quite skewed) even for subsets of comparable size, due to the fact that each subset looks at a unique partition of the data. In order to convey how expected, typical performance varies with subset size, the smoothed median error conditional on subset size is approximated and plotted. 17 | #' 18 | #' @return A list with "plots", "perf", "smooth", and "data" slots. The "plots" slot contains the following \code{\link[ggplot2]{ggplot}} objects: 19 | #' \itemize{ 20 | #' \item est: Comparison of point estimates (median absolute percent error). \item vad: Value-added relative to naive estimates (median value-added). 21 | #' \item moe: Comparison of 90% margin of error (median ratio of simulated-to-observed MOE). 22 | #' \item Additional named slots (one for each of the fusion variables) contain the plots described above with scatterplot results. 23 | #' } 24 | #' "perf" is a data frame of average smoothed performance metrics across the range of subset sizes. "smooth" is a data frame with the plotting values used to produce the smoothed median plots. 25 | #' "data" is a data frame with the complete validation results as returned by the original call to \code{\link{validate}}.
26 | #' 27 | #' @examples 28 | #' # Build a fusion model using RECS microdata 29 | #' # Note that "fusion_model.fsn" will be written to working directory 30 | #' fusion.vars <- c("electricity", "natural_gas", "aircon") 31 | #' predictor.vars <- names(recs)[2:12] 32 | #' fsn.path <- train(data = recs, 33 | #' y = fusion.vars, 34 | #' x = predictor.vars, 35 | #' weight = "weight") 36 | #' 37 | #' # Fuse back onto the donor data (multiple implicates) 38 | #' sim <- fuse(data = recs, 39 | #' fsn = fsn.path, 40 | #' M = 30) 41 | #' 42 | #' # Calculate validation results but do not generate plots 43 | #' valid <- validate(observed = recs, 44 | #' implicates = sim, 45 | #' subset_vars = c("income", "education", "race", "urban_rural"), 46 | #' weight = "weight", 47 | #' plot = FALSE) 48 | #' 49 | #' # Create validation plots 50 | #' valid <- plot_valid(valid) 51 | #' 52 | #' # View some of the plots 53 | #' valid$plots$est 54 | #' valid$plots$moe 55 | #' valid$plots$electricity$moe 56 | #' 57 | #' # Can also save the plots to disk at creation 58 | #' # Will save .png files to 'valid_plots' folder in working directory 59 | #' # Note that it is fine to pass a 'valid' object with existing $plots slot 60 | #' # In that case, the plots are simply re-generated 61 | #' vplots <- plot_valid(valid, 62 | #' path = file.path(getwd(), "valid_plots"), 63 | #' width = 8, height = 6) 64 | #' 65 | #' @export 66 | 67 | #----- 68 | 69 | # From ?validate example 70 | # library(fusionModel) 71 | # library(dplyr) 72 | # library(data.table) 73 | # source("R/utils.R") 74 | # 75 | # fusion.vars <- c("electricity", "natural_gas", "aircon") 76 | # predictor.vars <- names(recs)[2:12] 77 | # fsn.path <- train(data = recs, 78 | # y = fusion.vars, 79 | # x = predictor.vars, 80 | # weight = "weight") 81 | # # Fuse back onto the donor data (multiple implicates) 82 | # sim <- fuse(data = recs, 83 | # fsn = fsn.path, 84 | # M = 40) 85 | # 86 | # valid <- validate(observed = recs, 87 | # implicates = sim, 88 | # subset_vars = c("income", "education", "race", "urban_rural"), 89 | # weight = "weight") 90 | 91 | #------------------ 92 | 93 | plot_valid <- function(valid, 94 | y = NULL, 95 | path = NULL, 96 | cores = 1, 97 | ...)
{ 98 | 99 | # Check if ggplot is installed 100 | suppressMessages(ok <- require(ggplot2, quietly = TRUE, warn.conflicts = FALSE)) 101 | if (!ok) stop("package 'ggplot2' must be installed for plot_valid() to work") 102 | 103 | if (inherits(valid, "validate")) { 104 | if (!is.data.frame(valid)) valid <- valid$data # This forces 'valid' to just the data frame with validation results 105 | } else { 106 | stop("'valid' must be an object generated by validate()") 107 | } 108 | 109 | # Check inputs 110 | stopifnot({ 111 | all(y %in% valid$y) 112 | is.null(path) | is.character(path) 113 | }) 114 | 115 | # Create 'directory', if necessary 116 | if (!is.null(path)) { 117 | path <- normalizePath(path, mustWork = FALSE) 118 | if (!dir.exists(path)) dir.create(path, recursive = TRUE) 119 | } 120 | 121 | #--- 122 | 123 | # Restrict fusion (y) variables, if requested 124 | if (!is.null(y)) valid <- filter(valid, y %in% !!y) 125 | 126 | #--- 127 | 128 | # Confidence interval overlap 129 | # See Compare.CI() here: https://github.com/cran/synthpop/blob/master/R/compare.syn.r 130 | # CIoverlap(10, 20, 5, 25) 131 | # CIoverlap <- function(lwr_obs, upr_obs, lwr_sim, upr_sim) { 132 | # L <- pmax(lwr_obs, lwr_sim) 133 | # U <- pmin(upr_obs, upr_sim) 134 | # 0.5 * (((U - L) / (upr_obs - lwr_obs)) + ((U - L) / (upr_sim - lwr_sim))) 135 | # } 136 | 137 | #--- 138 | 139 | errorFun <- function(obs, sim) { 140 | # out <- suppressWarnings({ 141 | # exp(abs(log(sim / obs))) - 1 # Morley 2018 preferred error metric (zeta); will be NA if ratio is negative and Inf if 'obs' is zero; https://agupubs.onlinelibrary.wiley.com/doi/full/10.1002/2017SW001669 142 | # }) 143 | out <- abs((sim - obs) / obs) # Conventional absolute percent error (gives Inf when obs = 0) 144 | #out <- ifelse(!is.finite(out), ape, out) # If Morley's zeta is non-finite, replace with APE 145 | out[!is.finite(out)] <- NA # If non-finite, return NA 146 | out[obs == sim] <- 0 # If 'obs' and 'sim' are identical, return zero error 147 | return(out) 148 | } 149 | 150 | # Calculate the error metrics for plotting 151 | vest <- valid %>% 152 | mutate( 153 | cont = is.na(level), 154 | 155 | # Point estimate error 156 | est = errorFun(obs = est.obs, sim = est.sim), 157 | 158 | # Relative uncertainty ratio (sim / obs) 159 | moe = (moe.sim / est.sim) / (moe.obs / est.obs), 160 | moe = ifelse(moe.sim == moe.obs, 1, moe), 161 | moe = ifelse(is.infinite(moe), NA, moe), 162 | 163 | # Point estimate value-added 164 | vad = 1 - abs(est.sim - est.obs) / abs(est.mean - est.obs), 165 | vad = ifelse(est.mean == est.obs, 0, vad), 166 | vad = pmax(0, vad) 167 | 168 | ) %>% 169 | # group_by(id, share, y, cont) %>% 170 | # summarize_at(c("est", "moe", "vad"), .funs = mean) %>% # Collapses categorical levels to mean of performance metrics 171 | # ungroup() %>% 172 | mutate_if(is.numeric, ~ ifelse(is.finite(.x), .x, NA)) # Sets any non-finite values to NA 173 | 174 | #--- 175 | 176 | # Conditional median of the error metrics 177 | cat("Smoothing validation metrics\n") 178 | y <- unique(vest$y) 179 | qest <- parallel::mclapply(y, function(v) { 180 | 181 | d <- filter(vest, y == v) 182 | 183 | est <- smoothQuantile(x = d$share, y = d$est, qu = 0.5) 184 | #est <- smoothMean(x = d$share, y = d$est, verbose = FALSE) 185 | 186 | moe <- smoothQuantile(x = d$share, y = d$moe, qu = 0.5) 187 | #moe <- smoothMean(x = d$share, y = d$moe, verbose = FALSE) 188 | 189 | vad <- smoothQuantile(x = d$share, y = d$vad, qu = 0.5) 190 | 191 | out <- rbindlist(list(est = est, moe = moe, vad = vad), idcol = 
"metric", fill = TRUE) 192 | out[, CAT := !d$cont[1]] 193 | out[, VAR := v] 194 | setnames(out, "x", "SHR") 195 | return(out) 196 | 197 | }, mc.cores = cores) %>% 198 | data.table::rbindlist() 199 | 200 | # Drop invalid rows -- cases where the returned smoothed value is infeasible 201 | # qest <- qest %>% 202 | # filter(y >= 0 | (y <= 1 & metric == "vad")) 203 | 204 | # qest <- qest %>% 205 | # mutate(y = ifelse(metric %in% c("est", "moe"), pmax(0, y), pmin(1, y))) 206 | 207 | # Ensure feasible smoothed values 208 | 209 | min.max <- vest %>% 210 | group_by(y) %>% 211 | summarize_at(c("est", "vad", "moe"), list(min = min, max = max), na.rm = TRUE) %>% 212 | rename(VAR = y) 213 | 214 | qest <- qest %>% 215 | left_join(min.max, by = "VAR") %>% 216 | mutate(y = ifelse(metric == "est", pmax(pmin(y, est_max), est_min), y), 217 | y = ifelse(metric == "moe", pmax(pmin(y, moe_max), moe_min), y), 218 | y = ifelse(metric == "vad", pmax(pmin(y, vad_max), vad_min), y)) %>% 219 | select(metric:VAR) 220 | 221 | #--- 222 | 223 | # Summary performance metrics (print to console) 224 | 225 | sum.perf <- dcast(qest, ... ~ metric, value.var = "y") %>% 226 | group_by(VAR) %>% 227 | summarize_at(c("est", "vad", "moe"), mean) %>% 228 | rename(y = VAR) 229 | cat("Average smoothed performance metrics across subset range:\n") 230 | print(as.data.frame(sum.perf), digits = 3, print.gap = 2) 231 | 232 | #--- 233 | 234 | # Restrict the smooth results to x-range available for all of the variables 235 | # This ensures that the plots have a same/consistent x-axis for each of the y variables 236 | 237 | xrng <- range(qest$SHR) 238 | # xrng <- qest %>% 239 | # group_by(VAR) %>% 240 | # summarize(min = min(SHR), max = max(SHR), .groups = "drop") %>% 241 | # summarize(min = max(min), max = min(max)) %>% 242 | # unlist() 243 | 244 | qest <- qest %>% 245 | filter(SHR >= xrng[1], SHR <= xrng[2]) 246 | 247 | #--- 248 | 249 | # Create plot objects 250 | cat("Creating ggplot2 graphics", ifelse(is.null(path), "", "and saving .png files to disk"), "\n") 251 | 252 | # X-axis definition 253 | b <- c(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1) # Nice break marks on square root scale 254 | b <- b[max(1, which(b <= xrng[1])[1], na.rm = TRUE) : which(b >= xrng[2])[1]] 255 | pct <- function(x) paste0(as.character(x * 100), "%") 256 | xaxis <- scale_x_continuous(name = "Subset size (percent of total population)", 257 | limits = xrng, breaks = b, trans = "sqrt", labels = pct) 258 | 259 | # ggplot elements to add to all plots 260 | plot.all <- list(xaxis, 261 | theme_bw(), 262 | theme(plot.title = element_text(face = "bold")), 263 | guides(color = guide_legend(title = "Fusion variable"), 264 | linetype = guide_legend(title = "Categorical"))) 265 | 266 | #--- 267 | 268 | # ggplot elements to add to each specific plot 269 | p1.add <- list(scale_y_continuous(name = "Median absolute percent error", n.breaks = 8, limits = c(0, NA), expand = expansion(mult = c(0.01, 0.05)), labels = pct), 270 | labs(subtitle = "Comparison of point estimates")) 271 | 272 | p2.add <- list(scale_y_continuous(name = "Median value-added", n.breaks = 8, limits = c(0, 1), expand = expansion(mult = c(0.01, 0.05))), 273 | geom_hline(yintercept = 1, linetype = 2), 274 | labs(subtitle = "Value-added relative to naive estimates")) 275 | 276 | p3.add <- list(scale_y_continuous(name = "Median ratio of simulated-to-observed uncertainty", n.breaks = 8, limits = c(0, NA), expand = expansion(mult = c(0.01, 0.05))), 277 | geom_hline(yintercept = 1, linetype = 2), 278 | 
labs(subtitle = "Comparison of relative uncertainty (MOE / estimate)")) 279 | 280 | 281 | # p3.add <- list(scale_y_continuous(name = "Median ratio of simulated-to-observed MOE", n.breaks = 8, limits = c(0, NA), expand = expansion(mult = c(0.01, 0.05))), 282 | # geom_hline(yintercept = 1, linetype = 2), 283 | # labs(subtitle = "Bias in 90% margin of error (MOE)")) 284 | 285 | #--- 286 | 287 | # Multi-variable plots 288 | # NOTE: Set 'linetype = CAT' to indicate whether each 'y' is continuous or categorical 289 | p1 <- ggplot(filter(qest, metric == "est"), aes(x = SHR, y = y, color = VAR, linetype = NULL)) + p1.add + plot.all + geom_line() 290 | p2 <- ggplot(filter(qest, metric == "vad"), aes(x = SHR, y = y, color = VAR, linetype = NULL)) + p2.add + plot.all + geom_line() 291 | p3 <- ggplot(filter(qest, metric == "moe"), aes(x = SHR, y = y, color = VAR, linetype = NULL)) + p3.add + plot.all + geom_line() 292 | #p3 <- ggplot(filter(qest, metric == "bias"), aes(x = SHR, y = y, color = VAR, linetype = NULL)) + p3.add + plot.all + geom_line() 293 | 294 | out1 <- list(p1, p2, p3) 295 | names(out1) <- c("est", "vad", "moe") 296 | 297 | if (!is.null(path)) { 298 | for (i in names(out1)) { 299 | suppressMessages(ggsave(filename = paste0("allvars_", i, ".png"), plot = out1[[i]], path = path, ...)) 300 | } 301 | } 302 | 303 | #--- 304 | 305 | # Single variable scatterplots 306 | 307 | out2 <- lapply(y, function(v) { 308 | 309 | fct <- 10 # Controls extent of extreme value inclusion in scatterplots 310 | pdata <- filter(qest, VAR == v) 311 | ed <- filter(vest, y == v, share >= xrng[1], share <= xrng[2]) 312 | 313 | r <- median(ed$est, na.rm = TRUE) + c(-1, 1) * fct * mad(ed$est, na.rm = TRUE) 314 | p1 <- ggplot(filter(pdata, metric == "est"), aes(x = SHR, y = y)) + 315 | geom_point(data = filter(ed, est >= r[1], est <= r[2]), 316 | aes(x = share, y = est), shape = 1) + 317 | #geom_point(data = filter(ed, error_est < max(pdata$est + ofct * pdata$peSE)), aes(x = share, y = error_pe), shape = 1) + 318 | # geom_ribbon(aes(ymin = pmax(pe - peSE * qt(0.975, DFpe), 0), 319 | # ymax = pmin(pe + peSE * qt(0.975, DFpe), ifelse(categorical, 1, Inf))), 320 | # fill = "gray", alpha = 0.25, color = "red", linetype = "dotted") + 321 | p1.add + plot.all + labs(title = v) + geom_line(color = "red") 322 | 323 | r <- c(0, 1) 324 | p2 <- ggplot(filter(pdata, metric == "vad"), aes(x = SHR, y = y)) + 325 | geom_point(data = filter(ed, vad >= r[1], vad <= r[2]), 326 | aes(x = share, y = vad), shape = 1) + 327 | p2.add + plot.all + labs(title = v) + geom_line(color = "red") 328 | 329 | r <- median(ed$moe, na.rm = TRUE) + c(-1, 1) * fct * mad(ed$moe, na.rm = TRUE) 330 | p3 <- ggplot(filter(pdata, metric == "moe"), aes(x = SHR, y = y)) + 331 | geom_point(data = filter(ed, moe >= r[1], moe <= r[2]), 332 | aes(x = share, y = moe), shape = 1) + 333 | p3.add + plot.all + labs(title = v) + geom_line(color = "red") 334 | 335 | # r <- median(ed$bias) + c(-1, 1) * fct * mad(ed$bias) 336 | # p3 <- ggplot(filter(pdata, metric == "bias"), aes(x = SHR, y = y)) + 337 | # geom_point(data = filter(ed, bias >= r[1], bias <= r[2]), 338 | # aes(x = share, y = bias), shape = 1) + 339 | # p3.add + plot.all + labs(title = v) + geom_line(color = "red") 340 | 341 | out <- list(p1, p2, p3) 342 | names(out) <- c("est", "vad", "moe") 343 | 344 | # Save plots to disk 345 | if (!is.null(path)) { 346 | for (i in names(out)) { 347 | suppressMessages(ggsave(filename = paste0(v, "_", i, ".png"), plot = out[[i]], path = path, ...)) 348 | } 349 | } 350 | 351 | 
return(out) 352 | 353 | }) %>% 354 | setNames(y) 355 | 356 | #----- 357 | 358 | # Report location of output plots, if requested 359 | if (!is.null(path)) cat("Plots saved to:", path, "\n") 360 | 361 | # Assemble final result 362 | result <- list(plots = c(out1, out2), 363 | perf = sum.perf, 364 | smooth = qest, 365 | data = valid) 366 | if (!inherits(valid, "validate")) class(result) <- c("validate", class(result)) 367 | return(result) 368 | 369 | } 370 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "fusionModel" 3 | subtitle: "Synthetic data fusion and analysis in R" 4 | author: Kevin Ummel ([ummel@berkeley.edu](mailto:ummel@berkeley.edu)) 5 | output: 6 | github_document: 7 | toc: true 8 | toc_depth: 2 9 | html_preview: true 10 | --- 11 | 12 | ```{r setup, include=FALSE} 13 | knitr::opts_chunk$set( 14 | echo = TRUE, 15 | cache = TRUE, # Useful for testing/editing; will create /README_cache directory 16 | comment = NA, 17 | fig.path = "man/figures/README-" 18 | ) 19 | ``` 20 | 21 | # Overview 22 | 23 | **fusionModel** enables variables unique to a "donor" dataset to be statistically simulated for (i.e. *fused to*) a "recipient" dataset. Variables common to both the donor and recipient are used to model and simulate the fused variables. The package provides a simple and efficient interface for general data fusion in *R*, leveraging state-of-the-art machine learning algorithms from Microsoft's [LightGBM](https://lightgbm.readthedocs.io/en/latest/) framework. It also provides tools for analyzing synthetic/simulated data, calculating uncertainty, and validating fusion output. 24 | 25 | fusionModel was developed to allow statistical integration of microdata from disparate social surveys. It is the data fusion workhorse underpinning the larger fusionACS data platform under development at the [Socio-Spatial Climate Collaborative](https://sc2.berkeley.edu/fusionacs-people/). In this context, fusionModel is used to fuse variables from a range of social surveys onto microdata from the American Community Survey, allowing for analysis and spatial resolution otherwise impossible. 26 | 27 | # Motivation 28 | 29 | The desire to “fuse” or otherwise integrate independent datasets has a long history, dating to at least the early 1970’s ([Ruggles and Ruggles 1974](https://www.nber.org/system/files/chapters/c10115/c10115.pdf); [Alter 1974](https://www.nber.org/system/files/chapters/c10116/c10116.pdf)). Social scientists have long recognized that large amounts of unconnected data are “out there” – usually concerning the characteristics of households and individuals (i.e. microdata) – which we would, ideally, like to integrate and analyze as a whole. This aim falls under the general heading of “Statistical Data Integration” (SDI) ([Lewaa et al. 2021](https://content.iospress.com/articles/statistical-journal-of-the-iaos/sji210835)). 30 | 31 | The most prominent examples of data fusion have involved administrative record linkage. This consists of exact matching or probabilistic linking of independent datasets, using observable information like social security numbers, names, or birth dates of individuals. Record linkage is the gold standard and can yield incredibly important insights and high levels of statistical confidence, as evidenced by the pioneering work of [Raj Chetty](https://opportunityinsights.org/team/raj-chetty/) and colleagues. 
32 | 33 | However, record linkage is rarely feasible for the kinds of microdata that most researchers use day-to-day (nevermind the difficulty of accessing administrative data). While the explosion of online tracking and social network data will undoubtedly offer new lines of analysis, for the time being, at least, social survey microdata remain indispensable. The challenge and promise recognized 50 years ago by Nancy and Richard Ruggles remains true today: 34 | 35 | >Unfortunately, no single microdata set contains all of the different kinds of information required for the problems which the economist wishes to analyze. Different microdata sets contain different kinds of information…A great deal of information is collected on a sample basis. Where two samples are involved the probability of the same individual appearing in both may be very small, so that exact matching is impossible. Other methods of combining the types of information contained in the two different samples into one microdata set will be required. (Ruggles and Ruggles 1974; 353-354) 36 | 37 | Practitioners regularly impute or otherwise predict a variable or two from one dataset on to another. Piecemeal, *ad hoc* data fusion is a common necessity of quantitative research. Proper data fusion, on the other hand, seeks to systematically combine “two different samples into one microdata set”. 38 | 39 | The size and nature of the samples involved and the intended analyses strongly influence the choice of data integration technique and the structure of the output. This has led to the relevant literature being both diverse and convoluted, as practitioners take on different data “setups” and objectives. In the context of fusionACS, we are interested in the following problem: 40 | 41 | We have microdata from two independent surveys, A and B, that sample the same underlying population and time period (e.g. occupied U.S. households nationwide in 2018). We specify that A is the “recipient” dataset and B is the “donor”. The goal is to generate a new dataset, C, that has the original survey responses of A plus a realistic representation of how each respondent in A might have answered the questionnaire of survey B. To do this, we identify a set of common/shared variables X that both surveys solicit. We then attempt to fuse a set of variables unique to B – call them Z, the “fusion variables” – onto the original microdata of A, conditional on X. 42 | 43 | # Methodology 44 | 45 | The fusion strategy implemented in the fusionModel package borrows and expands upon ideas from the statistical matching ([D’Orazio et al. 2006](https://onlinelibrary.wiley.com/doi/book/10.1002/0470023554)), imputation ([Little and Rubin 2019](https://onlinelibrary.wiley.com/doi/book/10.1002/9781119482260)), and data synthesis ([Drechsler 2011](https://link.springer.com/book/10.1007/978-1-4614-0326-5)) literatures to create a flexible data fusion tool. It employs variable-*k*, conditional expectation matching that leverages high-performance gradient boosting algorithms. The package accommodates fusion of many variables, individually or in blocks, and efficient computation when the recipient is large relative to the donor. 46 | 47 | Specifically, the goal was to create a data fusion tool that meets the following requirements: 48 | 49 | 1. Accommodate donor and recipient datasets with divergent sample sizes 50 | 2. Handle continuous, categorical, and semi-continuous (zero-inflated) variable types 51 | 3. Ensure realistic values for fused variables 52 | 4. 
Scale efficiently for larger datasets 53 | 5. Fuse variables “one-by-one” or in “blocks” 54 | 6. Employ a data modeling approach that: 55 | + Makes no distributional assumptions (i.e. non-parametric) 56 | + Automatically detects non-linear and interaction effects 57 | + Automatically selects predictor variables from a potentially large set 58 | + Ability to prevent overfitting (e.g. cross-validation) 59 | 60 | Complete methodological details are available in the fusionACS Guidebook (INSERT LINK). 61 | 62 | # Installation 63 | 64 | ```r 65 | devtools::install_github("ummel/fusionModel") 66 | ``` 67 | 68 | ```{r} 69 | library(fusionModel) 70 | ``` 71 | 72 | # Simple fusion 73 | 74 | ```{r, echo = FALSE, results = 'hide'} 75 | suppressMessages(suppressWarnings(library(tidyverse))) 76 | suppressMessages(suppressWarnings(library(fst))) # Just to avoid messages when it loads within code 77 | ``` 78 | 79 | The package includes example microdata from the [2015 Residential Energy Consumption Survey](https://www.eia.gov/consumption/residential/data/2015/) (see `?recs` for details). For real-world use cases, the donor and recipient data are typically independent and vary in sample size. For illustrative purposes, we will randomly split the `recs` microdata into separate "donor" and "recipient" datasets with an equal number of observations. 80 | 81 | ```{r} 82 | # Rows to use for donor dataset 83 | d <- seq(from = 1, to = nrow(recs), by = 2) 84 | 85 | # Create donor and recipient datasets 86 | donor <- recs[d, c(2:16, 20:22)] 87 | recipient <- recs[-d, 2:14] 88 | 89 | # Specify fusion and shared/common predictor variables 90 | predictor.vars <- names(recipient) 91 | fusion.vars <- setdiff(names(donor), predictor.vars) 92 | ``` 93 | 94 | The `recipient` dataset contains `r ncol(recipient)` variables that are shared with `donor`. These shared "predictor" variables provide a statistical link between the two datasets. fusionModel exploits the information in these shared variables. 95 | 96 | ```{r} 97 | predictor.vars 98 | ``` 99 | 100 | There are `r length(fusion.vars)` "fusion variables" unique to `donor`. These are the variables that will be fused to `recipient`. This includes a mix of continuous and categorical (factor) variables. 101 | 102 | ```{r} 103 | # The variables to be fused 104 | sapply(donor[fusion.vars], class) 105 | ``` 106 | We create a fusion model using the `train()` function. The minimal usage is shown below. See `?train` for additional function arguments and options. By default, this results in a ".fsn" (fusion) object being saved to "fusion_model.fsn" in the current working directory. 107 | 108 | ```{r} 109 | # Train a fusion model 110 | fsn.model <- train(data = donor, 111 | y = fusion.vars, 112 | x = predictor.vars) 113 | ``` 114 | To fuse variables to `recipient`, we simply pass the recipient data and path of the .fsn model to the `fuse()` function. Each variable specified in `fusion.vars` is fused in the order provided. By default, `fuse()` generates a single implicate (version) of synthetic outcomes. Later, we'll work with multiple implicates to perform proper analysis and uncertainty estimation. 115 | 116 | ```{r} 117 | # Fuse 'fusion.vars' to the recipient 118 | sim <- fuse(data = recipient, 119 | fsn = fsn.model) 120 | ``` 121 | Let's look at the recipient dataset's fused/simulated variables. Note that your results will look different, because each call to `fuse()` generates a unique, probabilistic set of outcomes.
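If you need reproducible output, one option is to set R's random seed immediately before calling `fuse()`. This is only a sketch and assumes the simulation draws use R's random number generator; see `?fuse` for any seed-related options the function itself may expose.

```r
# Illustrative only: fix the seed so repeated calls return identical draws
set.seed(123)
sim <- fuse(data = recipient, fsn = fsn.model)
```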
122 | 123 | ```{r} 124 | head(sim) 125 | ``` 126 | 127 | We can do some quick sanity checks to compare the distribution of the fusion variables in `donor` with those in `sim`. This, at least, confirms that the fusion output is not obviously wrong. Later, we'll perform a formal internal validation exercise using multiple implicates. 128 | 129 | ```{r} 130 | sim <- data.frame(sim) 131 | 132 | # Compare means of the continuous variables 133 | cbind(donor = colMeans(donor[fusion.vars[3:5]]), sim = colMeans(sim[fusion.vars[3:5]])) 134 | 135 | # Compare frequencies of categorical variable classes 136 | cbind(donor = table(donor$insulation), sim = table(sim$insulation)) 137 | cbind(donor = table(donor$aircon), sim = table(sim$aircon)) 138 | 139 | ``` 140 | And we can look at kernel density plots of the non-zero values for the continuous variables to see if the univariate distributions in `donor` are generally similar to those in `sim`. 141 | 142 | ```{r, echo = FALSE} 143 | 144 | # Univariate naturally continuous case 145 | # Compare density plots for select continuous variables 146 | 147 | # Create 'pdata' data frame for subsequent plots 148 | #rec <- mutate(cbind(recipient, sim), dataset = "fused") 149 | fsd <- mutate(sim, dataset = "fused") 150 | don <- mutate(donor, dataset = "donor") 151 | pdata <- bind_rows(don, fsd) 152 | 153 | # The "natural" continuous variables (SPECIFY MANUALLY) 154 | ncont <- c("square_feet", "electricity", "natural_gas") 155 | 156 | # Density plots 157 | pdata[c("dataset", ncont)] %>% 158 | pivot_longer(cols = -1L) %>% 159 | filter(value != 0) %>% 160 | ggplot(aes(x = value, color = dataset)) + 161 | geom_density(size = 1) + 162 | theme(legend.position = "top") + 163 | facet_wrap(~ name, scales = "free") 164 | #ggtitle("Distribution of continuous variables (non-zero values)") 165 | 166 | ``` 167 | 168 | # Advanced fusion 169 | 170 | For this call to `train()`, we specify a set of hyperparameters to search over when training each LightGBM gradient boosting model (see `?train` for details). The hyperparameters can be used to tune the underlying GBM models for better cross-validated performance. We also set `nfolds = 10` (default is 5) to indicate the number of cross-validation folds to use. Since this requires additional computation, the `cores` argument is used to enable parallel processing. 171 | 172 | ```{r} 173 | # Train a fusion model with hyperparameter tuning and 10-fold cross-validation 174 | fsn.model <- train(data = donor, 175 | y = fusion.vars, 176 | x = predictor.vars, 177 | nfolds = 10, 178 | hyper = list(boosting = c("gbdt", "goss"), 179 | num_leaves = c(10, 30), 180 | feature_fraction = c(0.7, 0.9)), 181 | cores = 2) 182 | ``` 183 | We generally want to create multiple versions of the simulated fusion variables -- called *implicates* -- in order to reduce bias in point estimates and calculate associated uncertainty. We can do this using the `M` argument within `fuse()`. Here we generate 10 implicates; i.e. 10 unique, probabilistic representations of what the recipient records might look like with respect to the fusion variables. 184 | 185 | ```{r} 186 | # Fuse multiple implicates to the recipient 187 | sim10 <- fuse(data = recipient, 188 | fsn = fsn.model, 189 | M = 10) 190 | ``` 191 | Note that each implicate in `sim10` is identified by the "M" variable/column. 192 | 193 | ```{r} 194 | head(sim10) 195 | table(sim10$M) 196 | ``` 197 | 198 | # Analyzing fused data 199 | 200 | The fused values are inherently probabilistic, reflecting uncertainty in the underlying statistical models. Multiple implicates are needed to calculate unbiased point estimates and associated uncertainty for any particular analysis of the data. In general, more implicates are preferable but require more computation. 201 |
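To make this concrete, the sketch below shows one standard way of combining an estimate across implicates (Rubin-style pooling rules): the pooled point estimate is the mean of the implicate-specific estimates, and the total variance adds the average within-implicate variance to the between-implicate variance inflated by a factor of 1 + 1/M. The vectors `est` and `se` are hypothetical inputs (one estimate and standard error per implicate), and the exact formulas used internally by `analyze()` may differ.

```r
# Minimal sketch of Rubin-style pooling across M implicates (illustrative only)
pool <- function(est, se) {
  M    <- length(est)    # number of implicates
  qbar <- mean(est)      # pooled point estimate
  wbar <- mean(se^2)     # average within-implicate variance
  b    <- var(est)       # between-implicate variance
  data.frame(estimate = qbar, se = sqrt(wbar + (1 + 1/M) * b))
}
```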
202 | Since proper analysis of multiple implicates can be rather cumbersome – both from a coding and mathematical standpoint – the `analyze()` function provides a convenient way to calculate point estimates and associated uncertainty for common analyses. Potential analyses currently include variable means, proportions, sums, counts, and medians, optionally calculated for population subgroups. 203 | 204 | For example, to calculate the mean value of the "electricity" variable across all observations in the recipient dataset, we do the following. 205 | 206 | ```{r} 207 | analyze(x = list(mean = "electricity"), 208 | implicates = sim10) 209 | ``` 210 | When the response variable is categorical, `analyze()` automatically returns the proportions associated with each factor level. 211 | 212 | ```{r} 213 | analyze(x = list(mean = "aircon"), 214 | implicates = sim10) 215 | ``` 216 | 217 | If we want to perform an analysis across subsets of the recipient population -- for example, calculate the mean value of "electricity" by household "income" -- we can use the `by` and `static` arguments. We see that mean electricity consumption increases with household income. 218 | 219 | ```{r} 220 | analyze(x = list(mean = "electricity"), 221 | implicates = sim10, 222 | static = recipient, 223 | by = "income") 224 | ``` 225 | 226 | It is also possible to do multiple kinds of analyses in a single call to `analyze()`. For example, the following call calculates the mean value of "natural_gas" and "square_feet", the median value of "square_feet", and the sum of "electricity" (i.e. total consumption) and "insulation" (i.e. total count of each level). All of these estimates are calculated for each population subgroup defined by the intersection of "race" and "urban_rural" status. 227 | 228 | ```{r} 229 | result <- analyze(x = list(mean = c("natural_gas", "square_feet"), 230 | median = "square_feet", 231 | sum = c("electricity", "insulation")), 232 | implicates = sim10, 233 | static = recipient, 234 | by = c("race", "urban_rural")) 235 | ``` 236 | We can then (for example) isolate the results for white households in rural areas. Notice that the mean estimate of "square_feet" exceeds the median, reflecting the skewed distribution. 237 | 238 | ```{r} 239 | subset(result, race == "White" & urban_rural == "Rural") 240 | ``` 241 | More complicated analyses can be performed using the custom `fun` argument to `analyze()`. See the Examples section of `?analyze`. 242 | 243 | # Validating fusion models 244 | 245 | The `validate()` function provides a convenient way to perform internal validation tests on synthetic variables that have been fused back onto the original donor data. This allows us to assess the quality of the underlying fusion model; it is analogous to assessing model skill by comparing predictions to the observed training data. 246 | 247 | `validate()` compares analytical results derived using the multiple-implicate fusion output with those derived using the original donor microdata. By performing analyses on population subsets of varying size, `validate()` estimates how the synthetic variables perform for analyses of varying difficulty/complexity.
It computes fusion variable means and proportions for subsets of the full sample – separately for both the observed and fused data – and then compares the results. 248 | 249 | First, we fuse multiple implicates of the `fusion.vars` using the original donor data -- *not* the `recipient` data, as we did previously. 250 | 251 | ```{r} 252 | sim <- fuse(data = donor, 253 | fsn = fsn.model, 254 | M = 40) 255 | ``` 256 | 257 | Next, we pass the `sim` results to `validate()`. The argument `subset_vars` specifies that we want the validation exercise to compare observed (donor) and simulated point estimates across population subsets defined by "income", "age", "race", and "education". See `?validate` for more details. 258 | 259 | ```{r} 260 | valid <- validate(observed = donor, 261 | implicates = sim, 262 | subset_vars = c("income", "age", "race", "education")) 263 | ``` 264 | The `validate()` output includes ggplot2 graphics that helpfully summarize the validation results. For example, the plot below shows how the observed and simulated point estimates compare, using median absolute percent error as the performance metric. We see that the synthetic data do a very good job reproducing the point estimates for all fusion variables when the population subset in question is reasonably large. For smaller subsets -- i.e. more difficult analyses due to small sample size -- "square_feet", "natural_gas", and "electricity" remain well modeled, but the error increases more rapidly for "aircon" and "insulation". This information is useful for understanding what kind of reliability we can expect for particular variables and types of analyses, given the underlying fusion model and data. 265 | 266 | ```{r} 267 | valid$plots$est 268 | ``` 269 | 270 | Happy fusing! 271 | -------------------------------------------------------------------------------- /R/fuseCART.R: -------------------------------------------------------------------------------- 1 | #' Fuse variables to a recipient dataset using a CART fusion model 2 | #' 3 | #' @description 4 | #' Fuse variables to a recipient dataset using a fusion model object produced by \code{train()}. \code{fuseM()} provides a convenience wrapper for generating multiple implicates. 5 | #' 6 | #' @param data Data frame. Recipient dataset. All categorical variables should be factors and ordered whenever possible. Data types and levels are strictly validated against predictor variables defined in \code{train.object}. 7 | #' @param train.object Output from a successful call to \link{train}. 8 | #' @param ignore_self Logical. If \code{TRUE}, the simulation step excludes "self matches" (i.e. row 1 in \code{data} cannot match with row 1 in the original donor). Only useful for validation exercises. Do not use otherwise.
9 | #' 10 | #' @export 11 | #' @noRd 12 | 13 | #--------------------- 14 | 15 | # Manual testing 16 | 17 | # library(fusionModel) 18 | # source("R/utils.R") 19 | 20 | # Example inputs 21 | # donor <- recs 22 | # data <- subset(recs, select = c(division, urban_rural, climate, income, age, race)) 23 | # induce = FALSE 24 | # induce.ignore = NULL 25 | # fusion.vars <- setdiff(names(donor), names(recipient)) 26 | # train.object <- train(data = donor, y = fusion.vars) 27 | 28 | # data = readRDS("~/Documents/Projects/fusionData/recs_recipient.rds") 29 | # train.object <- readRDS("~/Documents/Projects/fusionData/recs_fit.rds") 30 | 31 | #--------------------- 32 | 33 | fuseCART <- function(data, 34 | train.object, 35 | ignore_self = FALSE) { 36 | 37 | stopifnot(exprs = { 38 | is.data.frame(data) 39 | is.logical(ignore_self) 40 | #!(!induce & !is.null(induce.ignore)) # Nonsensical input 41 | }) 42 | 43 | verbose <- TRUE 44 | 45 | #----- 46 | 47 | # Coerce 'data' to data.table, if necessary 48 | data <- data.table::as.data.table(data) 49 | 50 | # Check nrow(data) against the size of the training dataset; they must be the same size if ignore_self = TRUE 51 | if (ignore_self & nrow(data) != train.object$nobs) stop("When 'ignore_self = TRUE', recipient 'data' must have same number of rows as the training dataset") 52 | 53 | # Check that predictor variables are present 54 | xclass <- train.object$xclass 55 | xvars <- names(xclass) 56 | xlevels <- train.object$xlevels 57 | miss <- setdiff(xvars, names(data)) 58 | if (length(miss) > 0) stop("The following predictor variables are missing from 'data':\n", paste(miss, collapse = ", ")) 59 | 60 | # Restrict 'data' to the xvars and ensure correct ordering of columns consistent with names(xclass) 61 | data <- subset(data, select = xvars) 62 | 63 | #----- 64 | 65 | # Check for appropriate class/type of predictor variables 66 | 67 | xtest <- lapply(data, class) 68 | miss <- !map2_lgl(xclass, xtest, sameClass) 69 | if (any(miss)) stop("Incompatible data type for the following predictor variables:\n", paste(names(miss)[miss], collapse = ", ")) 70 | 71 | # Check for appropriate levels of factor predictor variables 72 | xtest <- lapply(subset(data, select = names(xlevels)), levels) 73 | miss <- !map2_lgl(xlevels, xtest, identical) 74 | if (any(miss)) stop("Incompatible levels for the following predictor variables\n", paste(names(miss)[miss], collapse = ", ")) 75 | 76 | #----- 77 | 78 | # Names and order of modeled variables to be fused 79 | yord <- c(names(train.object$models), names(train.object$derivative$model)) 80 | 81 | # Identify continuous yvars 82 | ycont <- names(which(sapply(train.object$yclass, function(x) x[1] %in% c("integer", "numeric")))) 83 | 84 | # Create the percentile values associated with the quantile function values 85 | # Note: Only really necessary if using smoothing 86 | if (length(ycont) > 0) { 87 | Qx <- map(train.object$models[ycont], ~ nrow(.x$Q)) %>% compact() 88 | Qx <- if (length(Qx) > 0) { 89 | Qx <- dnorm(seq(-3, 3, length.out = Qx[[1]] - 1)) 90 | Qx <- c(0, cumsum(Qx / sum(Qx))) 91 | } 92 | } 93 | 94 | #----- 95 | 96 | # Set 'induce.vars'; these are the variables for which correlation will be induced 97 | # if (induce) { 98 | # induce.vars <- if (is.null(induce.ignore)) { 99 | # c(xvars, yord) 100 | # } else { 101 | # validNames(induce.ignore, c(xvars, yord), exclude = TRUE) 102 | # } 103 | # stopifnot(length(induce.vars) > 0) 104 | # if (verbose) cat("Will attempt to induce correlation for a total of", length(induce.vars), 
"variables\n") 105 | # } 106 | 107 | #----- 108 | 109 | # Detect and impute any missing values in 'data' 110 | na.cols <- names(which(sapply(data, anyNA))) 111 | if (length(na.cols) > 0) { 112 | if (verbose) cat("Missing values imputed for the following predictors:\n", paste(na.cols, collapse = ", "), "\n") 113 | for (j in na.cols) { 114 | x <- data[[j]] 115 | ind <- is.na(x) 116 | data.table::set(data, i = which(ind), j = j, value = imputationValue(x, na.ind = ind)) 117 | } 118 | } 119 | 120 | #----- 121 | 122 | # Build 'ranks' data.table for the 'xvars', if 'induce = TRUE' 123 | # TO DO: Restrict the variables for which this is applicable? 124 | # This is a memory-efficient implementation using data.table 125 | 126 | # if (induce) { 127 | # 128 | # if (verbose) cat("Building ranks matrix (induce = TRUE)...\n") 129 | # 130 | # # Correlation variables to retain in initial 'ranks' data.table, based on 'induce.vars' argument 131 | # retain <- intersect(induce.vars, xvars) 132 | # 133 | # # Unordered factor variables among retained 'xvars' 134 | # xunordered <- sapply(xclass[retain], function(x) x[1] == "factor") 135 | # 136 | # # Build 'ranks' data.table for 'xvars' that are NOT unordered factors 137 | # ranks <- subset(data, select = names(which(!xunordered))) 138 | # for (v in names(ranks)) data.table::set(ranks, j = v, value = data.table::frank(ranks[[v]], ties.method = "average")) 139 | # 140 | # # Create dummy variable columns in 'ranks' for the 'xvars' that ARE unordered factors 141 | # for (v in names(which(xunordered))) { 142 | # dt <- subset(data, select = v) 143 | # u <- xlevels[[v]] 144 | # newv <- paste0(v, u) 145 | # data.table::set(ranks, j = newv, value = lapply(u, function(x) as.integer(dt == x))) 146 | # } 147 | # 148 | # # Scale all variable ranks for unit variance and zero mean 149 | # # Makes computations simpler in induceCor() 150 | # for (v in names(ranks)) data.table::set(ranks, j = v, value = as.vector(scale(ranks[[v]]))) 151 | # 152 | # # Clean up 153 | # rm(dt) 154 | # gc() 155 | # 156 | # } 157 | 158 | #----- 159 | 160 | # Extract names of the numeric predictor variables for which binary versions are required 161 | 162 | # Variables for which binary versions are used in models 163 | xbin.model <- train.object$models %>% 164 | map("xbin") %>% 165 | unlist(use.names = FALSE) 166 | 167 | # Variables for which binary versions are used in merges 168 | temp <- train.object$derivative$merge %>% 169 | map(names) %>% 170 | unlist(use.names = FALSE) 171 | xbin.merge <- grep("_BINARY_", temp, value = TRUE) 172 | xbin.merge <- gsub("_BINARY_", "", xbin.merge) 173 | 174 | # Full vector of variables for which binary versions are required 175 | xbin <- unique(c(xbin.model, xbin.merge)) 176 | 177 | # Create binary versions of any 'xbin' among the initial predictor variables 178 | vbin <- intersect(xbin, names(data)) 179 | for (v in vbin) data.table::set(data, j = paste0(v, "_BINARY_"), value = data[[v]] == 0) 180 | 181 | #------------------------- 182 | #------------------------- 183 | 184 | if (verbose) cat("Fusing donor variables to recipient...\n") 185 | 186 | # Progress bar printed to console 187 | if (verbose) pb <- pbapply::timerProgressBar(min = 0, max = length(yord), char = "+", width = 50, style = 3) 188 | 189 | for (y in yord) { 190 | 191 | cont <- y %in% ycont 192 | 193 | yclass <- train.object$yclass[[y]] 194 | 195 | m <- if (y %in% names(train.object$models)) train.object$models[[y]] else train.object$derivative$model[[y]] 196 | 197 | #----- 198 | 199 | # Extract the 
'xmerge' object, if available 200 | # This is used to merge known 'y' values up front, prior to probabilistic simulation 201 | # Object 'ind' gives the row numbers for which values are not already known and must be simulated 202 | xmerge <- m$xmerge 203 | if (!is.null(xmerge)) { 204 | data <- merge(data, xmerge, by = names(xmerge)[1], all.x = TRUE, sort = FALSE) 205 | ind <- which(is.na(data[[y]])) 206 | } else { 207 | ind <- 1L:nrow(data) 208 | } 209 | 210 | #----- 211 | 212 | # If 'y' is continuous... 213 | if (cont) { 214 | 215 | if (class(m) == "rpart") { 216 | 217 | smoothed <- is.null(names(m$Q)) 218 | 219 | # Vector of nodes in model 'm' 220 | nodes <- as.integer(c(names(m$Q), colnames(m$Q))) 221 | 222 | # Predicted node for rows in 'data' 223 | pnode <- predictNode(object = m, newdata = data[ind, ]) 224 | gc() 225 | 226 | # Catch and fix/kludge rare case of missing node (unclear why this occurs) 227 | # Note that the 'popsynth' package has a fix for the same issue: https://github.com/cran/synthpop/blob/master/R/functions.syn.r 228 | # Erroneous nodes are re-assigned to the valid node with the closest predicted 'yval' 229 | miss <- setdiff(pnode, nodes) 230 | for (n in miss) pnode[pnode == n] <- nodes[which.min(abs(m$frame[n, "yval"] - m$frame[nodes, "yval"]))] 231 | stopifnot(all(pnode %in% nodes)) 232 | 233 | #--- 234 | 235 | # Placeholder vector for simulated values 236 | S <- vector(mode = "numeric", length = length(pnode)) 237 | 238 | # Fit density to observations in each node 239 | for (n in nodes) { 240 | 241 | # Index identifying rows in 'data' that are in node 'n' 242 | i <- pnode == n 243 | 244 | if (any(i)) { 245 | 246 | if (smoothed) { 247 | 248 | # Extract inputs needed for quantile function and proportion of zeros 249 | Q <- m$Q[, as.character(n)] 250 | 251 | # Randomly simulate values from the conditional distribution 252 | # Note that this is repeated as necessary to ensure 's' does not contain any values already assigned (exclusively) via 'xmerge' 253 | f <- approxfun(x = Qx, y = Q) 254 | s <- f(runif(n = sum(i))) 255 | while (any(s %in% xmerge[[y]])) { 256 | j <- s %in% xmerge[[y]] 257 | s[j] <- f(runif(n = sum(j))) 258 | } 259 | 260 | } else { 261 | 262 | if (ignore_self) { 263 | 264 | # This gives the values, weights, and original row index for each training observation assigned to node 'n' 265 | Q <- m$Q[[as.character(n)]] 266 | 267 | irow <- which(i) 268 | s <- vector(mode = "numeric", length = sum(i)) 269 | fix <- rep(TRUE, length(s)) 270 | sint <- 0 271 | while (sum(fix) > 0 & sint < 20) { 272 | s[fix] <- sample(Q[, 3], size = sum(fix), replace = TRUE, prob = Q[, 2]) 273 | fix <- s == irow # Find any self-references and replace them in next iteration 274 | sint <- sint + 1 275 | } 276 | s <- Q[match(s, Q[, 3]), 1] # Extract actual simulated values 277 | 278 | # TOO SLOW... 
279 | # id <- matrix(rep(Q[, 3], each = sum(i)), nrow = sum(i)) 280 | # self.ref <- which(i) == id 281 | # w <- matrix(rep(Q[, 2], each = sum(i)), nrow = sum(i)) 282 | # w[self.ref] <- 0 # Set any self-references to zero weight 283 | # w <- w / matrixStats::rowSums2(w, na.rm = TRUE) 284 | # w <- matrixStats::rowCumsums(w) 285 | # s <- max.col(runif(n = nrow(w)) < w, ties.method = "first") 286 | # s <- Q[s, 1] # Extract actual simulated values 287 | 288 | } else { 289 | 290 | # Randomly sample the observed values within the node 291 | # Note that this is repeated as necessary to ensure 's' does not contain any values already assigned (exclusively) via 'xmerge' 292 | Q <- m$Q[[as.character(n)]] 293 | s <- sample(x = Q[, 1], size = sum(i), replace = TRUE, prob = Q[, 2]) 294 | while (any(s %in% xmerge[[y]])) { 295 | j <- s %in% xmerge[[y]] 296 | s[j] <- sample(x = Q[, 1], size = sum(j), replace = TRUE, prob = Q[, 2]) 297 | } 298 | 299 | } 300 | 301 | } 302 | 303 | # Assign simulated values to 'S' 304 | S[i] <- s 305 | 306 | } 307 | 308 | } 309 | 310 | } else { 311 | 312 | # Make predictions using linear (biglm) model in 'm' 313 | fobj <- formula(paste("~", as.character(m$terms)[3L])) 314 | newmf <- model.frame(formula = fobj, data[ind, ]) 315 | newmm <- model.matrix(fobj, newmf) 316 | S <- drop(newmm %*% replace_na(coef(m), 0)) 317 | 318 | #--- 319 | 320 | # Adjust simulated values to enforce the "outer.range" constraint 321 | # This isn't strictly necessary with rpart() simulated values, because the constraint is enforced in the quantile values themselves 322 | # It might be relevant, however, when a linear model is used for simulation (? - unclear) 323 | # outer.range <- train.object$youter[[y]] 324 | # S <- pmin(pmax(S, outer.range[1]), outer.range[2]) 325 | 326 | # Adjust simulated values to enforce the "inner.range" constraint 327 | inner.range <- train.object$yinner[[y]] 328 | S[S > inner.range[1] & S < 0] <- inner.range[1] 329 | S[S > 0 & S < inner.range[2]] <- inner.range[2] 330 | 331 | } 332 | 333 | # Ensure simulated column is correct data type 334 | if (yclass == "integer") S <- as.integer(round(S)) 335 | 336 | } 337 | 338 | #----- 339 | 340 | if (!cont) { 341 | 342 | if (class(m) == "rpart") { 343 | 344 | # Add the clustered predictors, if necessary 345 | km <- m$kmeans.xwalk 346 | if (!is.null(km)) { 347 | for (d in km) { 348 | k <- match(data[[names(d)[1]]], d[[1]]) 349 | data.table::set(data, j = names(d)[2], value = d[k, 2]) 350 | } 351 | } 352 | 353 | # Class probabilities 354 | p <- predict(object = m, newdata = data[ind, ]) 355 | gc() 356 | 357 | # Simulated value 358 | ptile <- runif(n = length(ind)) 359 | for (i in 2:ncol(p)) p[, i] <- p[, i - 1] + p[, i] 360 | S <- rowSums(ptile > p) + 1L 361 | S <- colnames(p)[S] 362 | 363 | } else { 364 | 365 | # Make predictions using linear (biglm) model in 'm' 366 | # Note that 'm' predicts the integerized factor/logical values, so it must be converted to the correct label 367 | fobj <- formula(paste(as.character(m$terms)[-2], collapse = "")) 368 | newmf <- model.frame(formula = fobj, data[ind, ]) 369 | newmm <- model.matrix(fobj, newmf) 370 | S <- drop(newmm %*% replace_na(coef(m), 0)) 371 | S <- round(S) # Round prediction to integer 372 | 373 | lev <- if ("factor" %in% yclass) train.object$ylevels[[y]] else c(FALSE, TRUE) # Factor/logical labels 374 | S <- pmin(pmax(S, 1), length(lev)) # Ensure plausible integer values 375 | S <- lev[S] # Convert to factor level 376 | 377 | } 378 | 379 | # Ensure simulated vector is correct data 
type 380 | # 'S' is a character vector by default; must be coerced to factor or logical 381 | if ("factor" %in% yclass) S <- factor(S, levels = train.object$ylevels[[y]], ordered = "ordered" %in% yclass) 382 | if ("logical" %in% yclass) S <- as.logical(S) 383 | 384 | } 385 | 386 | #----- 387 | 388 | # Assign simulated vector to 'data' 389 | data.table::set(data, i = ind, j = y, value = S) 390 | 391 | #----- 392 | 393 | # Proceed to induce correlation and/or update 'ranks' matrix, if requested 394 | # if (induce) { 395 | # 396 | # # Create appropriate column(s) in 'ranks' for simulated 'y' values 397 | # # NOTE that this is only done if 'y' is in 'induce.vars'; otherwise, its rank can be ignored 398 | # if (y %in% induce.vars) { 399 | # if (yclass[1] == "factor") { 400 | # u <- levels(S) 401 | # newv <- paste0(y, u) 402 | # data.table::set(ranks, j = newv, value = lapply(u, function(x) as.integer(S == x))) 403 | # } else { 404 | # data.table::set(ranks, j = y, value = as.vector(scale(data.table::frank(S, ties.method = "average")))) 405 | # } 406 | # } 407 | # 408 | # #----- 409 | # 410 | # # For continuous and ordered factors, adjust initial simulated values to better match known rank correlation with other variables 411 | # if (y %in% names(train.object$ycor)) { 412 | # 413 | # # Target rank correlations 414 | # rho <- train.object$ycor[[y]] 415 | # 416 | # # Restrict target correlation to variables present in 'ranks' 417 | # rho <- rho[names(rho) %in% names(ranks)] 418 | # 419 | # # Restrict correlation correction to non-zero observations in 'y' 420 | # # NOTE: Setting of this option should be consistent with analogous line in train() 421 | # #ind <- which(as.numeric(data[[y]]) != 0) 422 | # 423 | # # No restriction on correlation correction 424 | # ind <- 1:nrow(data) 425 | # 426 | # # Attempt to induce target rank correlations 427 | # Yout <- induceCor(data = data.table::copy(ranks[ind, ]), rho = rho, y = y, scale.data = FALSE) 428 | # 429 | # # Only updated y-values if the correlation adjustment was successful (sigma2 >= 0) 430 | # if (Yout$sigma2 >= 0) { 431 | # 432 | # # Re-order original y data to match ranks in Y (this preserve the original distribution) 433 | # Y <- sort(data[ind, ][[y]])[data.table::frank(Yout$Y, ties.method = "random")] 434 | # 435 | # # NOTE: The confirmation code below has not been updated in some time (probably broken) 436 | # # Before and after rank correlations compared to 'rho' target correlation 437 | # # plot(rho, cor(ranks[, -..y], ranks[[y]])[, 1]) # Before 438 | # # plot(rho, cor(ranks[, -..y], Yout$Y)[, 1]) # After 439 | # # abline(0, 1) 440 | # 441 | # # Confirm that univariate distribution is unchanged 442 | # # hist(data[i, y]) 443 | # # hist(Y) 444 | # 445 | # # Comparing before and after y values, original scale 446 | # # plot(data[i, y], Y) 447 | # # abline(0, 1, col = 2) 448 | # # cor(data[i, y], Y) 449 | # 450 | # # Update original 'y' in 'data' with adjusted simulated values 451 | # data.table::set(data, i = ind, j = y, value = Y) 452 | # 453 | # # Update the 'ranks' matrix with ranks derived from adjusted 'Y' 454 | # if (y %in% names(ranks)) { 455 | # if (yclass[1] == "factor") { 456 | # dt <- subset(data, select = y) 457 | # u <- levels(dt) 458 | # newv <- paste0(y, u) 459 | # data.table::set(ranks, j = newv, value = lapply(u, function(x) as.integer(dt == x))) 460 | # #data.table::set(ranks, j = newv, value = lapply(u, function(x) as.integer(Y == x))) # Safe only when 'ind' is 1:nrow(data) 461 | # } else { 462 | # data.table::set(ranks, j 
= y, value = as.vector(scale(data.table::frank(data[[y]], ties.method = "average")))) 463 | # #data.table::set(ranks, j = y, value = as.vector(scale(data.table::frank(Y, ties.method = "average")))) # Safe only when 'ind' is 1:nrow(data) 464 | # } 465 | # } 466 | # 467 | # } 468 | # 469 | # } 470 | # 471 | # } 472 | 473 | #----- 474 | 475 | # If 'y' is one of the 'xbin', add a binary version of simulated values to 'data' 476 | if (y %in% xbin) { 477 | data.table::set(data, j = paste0(y, "_BINARY_"), value = data[[y]] == 0) 478 | xbin <- setdiff(xbin, y) 479 | } 480 | 481 | #----- 482 | 483 | # Update for() loop progress bar 484 | if (verbose) pbapply::setTimerProgressBar(pb, match(y, yord)) 485 | 486 | } 487 | 488 | # Close progress bar 489 | if (verbose) pbapply::closepb(pb) 490 | 491 | #------------------------- 492 | #------------------------- 493 | 494 | # Add any constant fusion variables 495 | for (v in names(train.object$derivative$constant)) { 496 | set(data, j = v, value = train.object$derivative$constant[[v]]) 497 | } 498 | 499 | # Merge categorical fusion variables that are identified by a 1-to-1 linear relationship with another categorical variable 500 | for (m in train.object$derivative$merge) { 501 | data <- merge(data, m, by = names(m)[1], all.x = TRUE, sort = FALSE) 502 | } 503 | 504 | #----- 505 | 506 | # Simulation complete 507 | # Return only the fusion variables, in the order in which they were added to 'data' 508 | return(data %>% 509 | subset(select = c(yord, setdiff(names(train.object$yclass), yord))) %>% 510 | as.data.frame() 511 | ) 512 | 513 | } 514 | --------------------------------------------------------------------------------