├── .Rbuildignore ├── .gitignore ├── .vscode └── launch.json ├── DESCRIPTION ├── NAMESPACE ├── NEWS.md ├── R ├── absentee_census.R ├── absentee_voting.R ├── apply_stack_weights.r ├── auto_mrp.R ├── best_subset_classifier.R ├── boot_auto_mrp.R ├── build_folds.R ├── census_data.R ├── deep_mrp_classifier.r ├── ebma.R ├── gb_classifier.R ├── get_predictions.r ├── globals.R ├── lasso_classifier.R ├── post_stratification.R ├── run_best_subset.R ├── run_classifiers.R ├── run_deep_bs.r ├── run_deep_pca.r ├── run_gb.R ├── run_lasso.R ├── run_pca.R ├── run_svm.R ├── stacking_weights.r ├── survey_data.R ├── svm_classifier.R ├── taxes_census.R ├── taxes_survey.R ├── taxes_truth.R └── utils.R ├── README.md ├── autoMrP.Rproj ├── data ├── absentee_census.RData ├── absentee_voting.RData ├── census.RData ├── survey_item.RData ├── taxes_census.RData └── taxes_survey.RData ├── man ├── .Rapp.history ├── absentee_census.Rd ├── absentee_voting.Rd ├── auto_MrP.Rd ├── best_subset_classifier.Rd ├── binary_cross_entropy.Rd ├── boot_auto_mrp.Rd ├── census.Rd ├── cv_folding.Rd ├── deep_mrp_classifier.Rd ├── ebma.Rd ├── ebma_folding.Rd ├── ebma_mc_draws.Rd ├── ebma_mc_tol.Rd ├── error_checks.Rd ├── f1_score.Rd ├── gb_classifier.Rd ├── gb_classifier_update.Rd ├── lasso_classifier.Rd ├── log_spaced.Rd ├── loss_function.Rd ├── loss_score_ranking.Rd ├── mean_absolute_error.Rd ├── mean_squared_error.Rd ├── mean_squared_false_error.Rd ├── model_list.Rd ├── model_list_pca.Rd ├── multicore.Rd ├── output_table.Rd ├── plot.autoMrP.Rd ├── post_stratification.Rd ├── predict_glmmLasso.Rd ├── quiet.Rd ├── run_best_subset.Rd ├── run_best_subset_mc.Rd ├── run_classifiers.Rd ├── run_deep_bs.Rd ├── run_deep_pca.Rd ├── run_gb.Rd ├── run_gb_mc.Rd ├── run_lasso.Rd ├── run_lasso_mc_lambda.Rd ├── run_pca.Rd ├── run_svm.Rd ├── run_svm_mc.Rd ├── summary.autoMrP.Rd ├── survey_item.Rd ├── svm_classifier.Rd ├── taxes_census.Rd └── taxes_survey.Rd └── vignettes ├── autoMrP_vignette.pdf └── autoMrP_vignette.pdf.asis /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^renv$ 2 | ^renv\.lock$ 3 | ^packrat/ 4 | ^\.Rprofile$ 5 | ^.*\.Rproj$ 6 | ^\.Rproj\.user$ 7 | ^make-data\.R$ 8 | ^Meta$ 9 | ^doc$ 10 | ^\.vscode$ 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .gitignore 2 | packrat/lib*/ 3 | .Rproj.user 4 | .vscode 5 | .Rhistory 6 | packrat/src/ 7 | testing/ 8 | inst/doc 9 | doc 10 | Meta 11 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | 8 | { 9 | "type": "R-Debugger", 10 | "name": "Launch R-Workspace", 11 | "request": "launch", 12 | "debugMode": "workspace", 13 | "workingDirectory": "${workspaceFolder}" 14 | }, 15 | { 16 | "type": "R-Debugger", 17 | "name": "Debug R-File", 18 | "request": "launch", 19 | "debugMode": "file", 20 | "workingDirectory": "${workspaceFolder}", 21 | "file": "${file}" 22 | }, 23 | { 24 | "type": "R-Debugger", 25 | "name": "Debug R-Function", 26 | "request": "launch", 27 | "debugMode": "function", 28 | "workingDirectory": "${workspaceFolder}", 29 | "file": "${file}", 30 | "mainFunction": "main", 31 | "allowGlobalDebugging": false 32 | }, 33 | { 34 | "type": "R-Debugger", 35 | "name": "Debug R-Package", 36 | "request": "launch", 37 | "debugMode": "workspace", 38 | "workingDirectory": "${workspaceFolder}", 39 | "includePackageScopes": true, 40 | "loadPackages": [ 41 | "." 42 | ] 43 | }, 44 | { 45 | "type": "R-Debugger", 46 | "request": "attach", 47 | "name": "Attach to R process", 48 | "splitOverwrittenOutput": true 49 | } 50 | ] 51 | } -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: autoMrP 2 | Type: Package 3 | Title: Improving MrP with Ensemble Learning 4 | Version: 1.1.1 5 | Authors@R: c( 6 | person( 7 | given = "Reto", 8 | family = "Wüest", 9 | role = c("aut"), 10 | email = "wuest.reto@gmail.com", 11 | comment = c(ORCID = "0000-0002-7502-6489")), 12 | person( 13 | given = "Lucas", 14 | family = "Leemann", 15 | role = c("aut"), 16 | email = "leemann@ipz.uzh.ch", 17 | comment = c(ORCID = "0000-0001-5201-869X")), 18 | person( 19 | given = "Florian", 20 | family = "Schaffner", 21 | role = c("aut"), 22 | email = "schaffner@ipz.uzh.ch", 23 | comment = c(ORCID = "0000-0003-3352-6191")), 24 | person( 25 | given = "Philipp", 26 | family = "Broniecki", 27 | role = c("aut", "cre"), 28 | email = "philippbroniecki@gmail.com", 29 | comment = c(ORCID = "0000-0001-9214-4404")), 30 | person( 31 | given = "Hadley", 32 | family = "Wickham", 33 | role = "ctb", 34 | email = "hadley@rstudio.com")) 35 | Description: A tool that improves the prediction performance of multilevel 36 | regression with post-stratification (MrP) by combining a number of machine 37 | learning methods. For information on the method, please refer to Broniecki, 38 | Wüest, Leemann (2020) ''Improving Multilevel Regression with 39 | Post-Stratification Through Machine Learning (autoMrP)'' in the 40 | 'Journal of Politics'. Final pre-print version: 41 | .
42 | URL: https://github.com/retowuest/autoMrP 43 | BugReports: https://github.com/retowuest/autoMrP/issues 44 | Depends: R (>= 3.6) 45 | Imports: 46 | rlang (>= 0.4.5), dplyr (>= 1.0.2), lme4 (>= 1.1), gbm (>= 2.1.5), 47 | e1071 (>= 1.7-3), tibble (>= 3.0.1), glmmLasso (>= 1.5.1), 48 | EBMAforecast (>= 1.0.0), foreach (>= 1.5.0), doParallel (>= 1.0.15), 49 | doRNG (>= 1.8.2), ggplot2 (>= 3.3.2), knitr (>= 1.29), tidyr (>= 1.1.2), 50 | purrr (>= 0.3.4), forcats (>= 0.5.1), vglmer (>= 1.0.3), stringr (>= 1.5.0), 51 | R.rsp (>= 0.46.0), nloptr (>= 2.1.1), quadprog (>= 1.5-8), cli (>= 3.6.3) 52 | Suggests: R.rsp 53 | License: GPL-3 54 | Encoding: UTF-8 55 | LazyData: true 56 | RoxygenNote: 7.3.2 57 | VignetteBuilder: R.rsp 58 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(plot,autoMrP) 4 | S3method(summary,autoMrP) 5 | export(auto_MrP) 6 | export(plot.autoMrP) 7 | export(summary.autoMrP) 8 | importFrom(doRNG,"%dorng%") 9 | importFrom(dplyr,"%>%") 10 | importFrom(foreach,"%dopar%") 11 | importFrom(rlang,.data) 12 | importFrom(stats,as.formula) 13 | importFrom(stats,binomial) 14 | importFrom(stats,median) 15 | importFrom(stats,predict) 16 | importFrom(stats,sd) 17 | importFrom(stats,setNames) 18 | importFrom(stats,weighted.mean) 19 | importFrom(utils,combn) 20 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # autoMrP 1.1.0 2 | 3 | + implements stacking 4 | 5 | # autoMrP 1.0.6 6 | 7 | + implements Deep MrP by Goplerud as presented in https://doi.org/10.1017/S0003055423000035 8 | + Set argument deep.mrp = TRUE to include Deep MrP in the ensemble 9 | 10 | # autoMrP 1.0.5 11 | 12 | + drops missing values on y, L1.x, L2.x, L2.unit, L2.reg. Missing values on the DV would previously lead to errors in SVM 13 | + works with continuous DV. 14 | 15 | # autoMrP 0.93 16 | 17 | + block sampling in bootstrapping instead of state-stratified sampling 18 | 19 | # autoMrP 0.91 20 | 21 | + bootstrapping returns GB prediction 22 | + predictions do not fail if census data contains more factor levels than training data for SVM and Lasso 23 | + svm post-stratification uses the user-specified formula instead of all information 24 | + lasso post-stratification uses correct user-specified context level variables if L2.x and lasso.L2.x differ 25 | + parallel processing loops are replicable now 26 | -------------------------------------------------------------------------------- /R/absentee_census.R: -------------------------------------------------------------------------------- 1 | #' Quasi census data. 2 | #' 3 | #' The census file is generated from the full 2008 Cooperative Congressional Election Studies 4 | #' item cc419_1 by disaggregating the 64 ideal type combinations of the individual-level variables 5 | #' L1x1, L1x2 and L1x3. A row is an ideal type in a given state. 6 | #' 7 | #' 8 | #' @format A data frame with 2934 rows and 13 variables: 9 | #' \describe{ 10 | #' \item{state}{U.S. state} 11 | #' \item{L2.unit}{U.S. state id} 12 | #' \item{region}{U.S. 
region (four categories: 1 = Northeast; 2 = Midwest; 3 = South; 4 = West)} 13 | #' \item{L1x1}{Age group (four categories)} 14 | #' \item{L1x2}{Education level (four categories)} 15 | #' \item{L1x3}{Gender-race combination (six categories)} 16 | #' \item{proportion}{State-level proportion of respondents of that ideal type in the population} 17 | #' \item{L2.x1}{State-level share of votes for the Republican candidate in the previous presidential election} 18 | #' \item{L2.x2}{State-level percentage of Evangelical Protestant or Mormon respondents} 19 | #' \item{L2.x3}{State-level percentage of the population living in urban areas} 20 | #' \item{L2.x4}{State-level unemployment rate} 21 | #' \item{L2.x5}{State-level share of Hispanics} 22 | #' \item{L2.x6}{State-level share of Whites} 23 | #' } 24 | #' @usage data(absentee_census) 25 | #' @source The data set (excluding L2.x3, L2.x4, L2.x5, L2.x6) is taken from the 26 | #' article: Buttice, Matthew K, and Benjamin Highton. 2013. "How does 27 | #' multilevel regression and poststratification perform with 28 | #' conventional national surveys?" Political Analysis 21(4): 449-467. L2.x3, 29 | #' L2.x4, L2.x5 and L2.x6 are available at 30 | #' \url{https://www.census.gov}. 31 | "absentee_census" 32 | -------------------------------------------------------------------------------- /R/absentee_voting.R: -------------------------------------------------------------------------------- 1 | #' A sample of the absentee voting item from the CCES 2008 2 | #' 3 | #' The Cooperative Congressional Election Studies (CCES) item (cc419_1) asked: 4 | #' "States have tried many new ways to run elections in recent years. Do you 5 | #' support or oppose any of the following ways of voting or conducting elections 6 | #' in your state? Election Reform - Allow absentee voting over the Internet?" 7 | #' The original 2008 CCES item contains 26,934 respondents. This sample mimics a 8 | #' typical national survey. It contains at least 5 respondents from each state 9 | #' but is otherwise a random sample. 10 | #' 11 | #' @format A data frame with 1500 rows and 13 variables: 12 | #' \describe{ 13 | #' \item{YES}{1 if individual supports absentee voting over the Internet; 0 otherwise} 14 | #' \item{L1x1}{Age group (four categories: 1 = 18-29; 2 = 30-44; 3 = 45-64; 4 = 65+)} 15 | #' \item{L1x2}{Education level (four categories: 1 = < high school; 2 = high school graduate; 3 = some college; 4 = college graduate)} 16 | #' \item{L1x3}{Gender-race combination (six categories: 1 = white male; 2 = black male; 3 = hispanic male; 4 = white female; 5 = black female; 6 = hispanic female)} 17 | #' \item{state}{U.S. state} 18 | #' \item{L2.unit}{U.S. state id} 19 | #' \item{region}{U.S. region (four categories: 1 = Northeast; 2 = Midwest; 3 = South; 4 = West)} 20 | #' \item{L2.x1}{State-level share of votes for the Republican candidate in the previous presidential election} 21 | #' \item{L2.x2}{State-level percentage of Evangelical Protestant or Mormon respondents} 22 | #' \item{L2.x3}{State-level percentage of the population living in urban areas} 23 | #' \item{L2.x4}{State-level unemployment rate} 24 | #' \item{L2.x5}{State-level share of Hispanics} 25 | #' \item{L2.x6}{State-level share of Whites} 26 | #' } 27 | #' @usage data(absentee_voting) 28 | #' @source The data set (excluding L2.x3, L2.x4, L2.x5, L2.x6) is taken from the 29 | #' article: Buttice, Matthew K, and Benjamin Highton. 2013. 
"How does 30 | #' multilevel regression and poststrat-stratification perform with 31 | #' conventional national surveys?" Political Analysis 21(4): 449-467. It is a 32 | #' random sample with at least 5 respondents per state. L2.x3, L2.x3, L2.x4, 33 | #' L2.x5 and L2.x6 are available at \url{https://www.census.gov}. 34 | "absentee_voting" 35 | -------------------------------------------------------------------------------- /R/apply_stack_weights.r: -------------------------------------------------------------------------------- 1 | apply_stack_weights <- function(ebma_out, stack_out, L2.unit, preds_all, y) { 2 | 3 | # initial binding of globals 4 | ebma_preds <- NULL 5 | 6 | # check whether stacking weights were calculated 7 | if (all(is.null(stack_out$stack_preds))) { 8 | 9 | # generate an object for individual level predicitions 10 | individual_level_predictions <- preds_all %>% 11 | dplyr::rename(!!rlang::sym(y) := y) 12 | 13 | # output object 14 | ebma_out <- list( 15 | ebma = ebma_out$ebma, 16 | classifiers = ebma_out$classifiers, 17 | weights = ebma_out$weights, 18 | stacking = "Stacking step skipped (only 1 classifier run)", 19 | stacking_weights = "Stacking step skipped (only 1 classifier run)", 20 | individual_level_predictions = individual_level_predictions 21 | ) 22 | 23 | } else { 24 | 25 | # 1) non-negative least squares stack 26 | nnls_stack <- as.matrix(ebma_out$classifiers[, -1]) %*% 27 | stack_out$stack_weights$stack_nnls 28 | 29 | # generate stack predictions 30 | stacked_preds <- tibble::tibble( 31 | !!rlang::sym(L2.unit) := dplyr::pull( 32 | .data = ebma_out$classifiers, var = 1 33 | ), 34 | stack_nnls = as.numeric(nnls_stack) 35 | ) 36 | 37 | # 2) optim with constraints stack 38 | optim_stack <- as.matrix(ebma_out$classifiers[, -1]) %*% 39 | stack_out$stack_weights$stack_optim 40 | 41 | # add optim stack to stack predictions 42 | stacked_preds <- stacked_preds %>% 43 | dplyr::mutate(stack_optim = as.numeric(optim_stack)) 44 | 45 | # 3) quadratic programming stack 46 | qp_stack <- as.matrix(ebma_out$classifiers[, -1]) %*% 47 | stack_out$stack_weights$stack_qp 48 | 49 | # add qp stack to stack predictions 50 | stacked_preds <- stacked_preds %>% 51 | dplyr::mutate(stack_qp = as.numeric(qp_stack)) 52 | 53 | # 4) ornstein stack 54 | ornstein_stack <- as.matrix(ebma_out$classifiers[, -1]) %*% 55 | stack_out$stack_weights$stack_ornstein 56 | 57 | # add ornstein stack to stack predictions 58 | stacked_preds <- stacked_preds %>% 59 | dplyr::mutate(stack_ornstein = as.numeric(ornstein_stack)) 60 | 61 | # 5) stack of stacks 62 | stack_of_stacks <- as.matrix(stacked_preds[, -1]) %*% 63 | stack_out$stack_weights$stack_of_stacks 64 | 65 | # add stack of stacks to stack predictions 66 | stacked_preds <- stacked_preds %>% 67 | dplyr::mutate(stack_of_stacks = as.numeric(stack_of_stacks)) 68 | 69 | # 6) stack of stacks with ebma 70 | stack_of_stacks_ebma <- as.matrix( 71 | cbind(stacked_preds[, "stack_of_stacks"], ebma_out$ebma[, "ebma"]) 72 | ) %*% 73 | stack_out$stack_weights$stack_of_stacks_ebma 74 | 75 | # add stack of stacks with ebma to stack predictions 76 | stacked_preds <- stacked_preds %>% 77 | dplyr::mutate(stack_of_stacks_ebma = as.numeric(stack_of_stacks_ebma)) 78 | 79 | # generate an object for individual level predicitions 80 | individual_level_predictions <- preds_all %>% 81 | dplyr::mutate( 82 | ebma = ebma_out$individual_level_predictions %>% 83 | dplyr::pull(var = ebma_preds) 84 | ) %>% 85 | dplyr::bind_cols( 86 | stack_out$stack_preds %>% 87 | dplyr::select(-id, -y, 
-dplyr::all_of(L2.unit)) 88 | ) %>% 89 | dplyr::rename(!!rlang::sym(y) := y) 90 | 91 | # combine everything 92 | ebma_out <- list( 93 | ebma = ebma_out$ebma, 94 | classifiers = ebma_out$classifiers, 95 | weights = ebma_out$weights, 96 | stacking = stacked_preds, 97 | stacking_weights = stack_out$stack_weights, 98 | individual_level_predictions = individual_level_predictions 99 | ) 100 | } 101 | 102 | return(ebma_out) 103 | } -------------------------------------------------------------------------------- /R/best_subset_classifier.R: -------------------------------------------------------------------------------- 1 | #' Best subset classifier 2 | #' 3 | #' \code{best_subset_classifier} applies best subset classification to a data 4 | #' set. 5 | #' 6 | #' @inheritParams auto_MrP 7 | #' @param model Multilevel model. A model formula describing the multilevel 8 | #' model to be estimated on the basis of the provided training data. 9 | #' @param data.train Training data. A data.frame containing the training data 10 | #' used to train the model. 11 | #' @param model.family Model family. A variable indicating the model family 12 | #' to be used by glmer. Defaults to binomial(link = "probit"). 13 | #' @param model.optimizer Optimization method. A character-valued scalar 14 | #' describing the optimization method to be used by glmer. Defaults to 15 | #' "bobyqa". 16 | #' @param n.iter Iterations. An integer-valued scalar specifying the maximum 17 | #' number of function evaluations tried by the optimization method. 18 | #' @param verbose Verbose output. A logical vector indicating whether or not 19 | #' verbose output should be printed. 20 | #' @return The multilevel model. A \code{\link[lme4]{glmer}} object. 21 | 22 | best_subset_classifier <- function( 23 | model, data.train, model.family, model.optimizer, n.iter, y, 24 | verbose = c(TRUE, FALSE) 25 | ) { 26 | 27 | # Determine type of dependent variable 28 | if ( 29 | data.train %>% 30 | dplyr::pull(!!y) %>% 31 | unique() %>% 32 | length() == 2 33 | ) { 34 | dv_type <- "binary" 35 | } else { 36 | dv_type <- "continuous" 37 | } 38 | 39 | # Train model on training data 40 | if (isTRUE(verbose == TRUE)) { 41 | # DV type 42 | if (dv_type == "continuous") { 43 | out <- lme4::lmer( 44 | formula = model, 45 | data = data.train, 46 | ) 47 | } else { 48 | # optimizer 49 | if (model.optimizer == "bobyqa") { 50 | out <- lme4::glmer( 51 | formula = model, 52 | data = data.train, 53 | family = model.family, 54 | lme4::glmerControl( 55 | optimizer = model.optimizer, 56 | optCtrl = list(maxfun = n.iter) 57 | ) 58 | ) 59 | } else if (model.optimizer == "nloptwrap") { 60 | out <- lme4::glmer( 61 | formula = model, 62 | data = data.train, 63 | family = model.family, 64 | lme4::glmerControl( 65 | calc.derivs = FALSE, 66 | optimizer = model.optimizer, 67 | optCtrl = list( 68 | method = "NLOPT_LN_NELDERMEAD", 69 | starttests = TRUE, kkt = TRUE 70 | ) 71 | ) 72 | ) 73 | } 74 | } 75 | } else { 76 | # DV type 77 | if (dv_type == "continuous") { 78 | out <- suppressMessages(suppressWarnings( 79 | lme4::lmer( 80 | formula = model, 81 | data = data.train, 82 | ) 83 | )) 84 | } else { 85 | # optimizer 86 | if (model.optimizer == "bobyqa") { 87 | out <- suppressMessages(suppressWarnings( 88 | lme4::glmer( 89 | formula = model, 90 | data = data.train, 91 | family = model.family, 92 | lme4::glmerControl( 93 | optimizer = model.optimizer, 94 | optCtrl = list(maxfun = n.iter) 95 | ) 96 | ) 97 | )) 98 | } else if (model.optimizer == "nloptwrap") { 99 | out <- 
suppressMessages(suppressWarnings( 100 | lme4::glmer( 101 | formula = model, 102 | data = data.train, 103 | family = model.family, 104 | lme4::glmerControl( 105 | calc.derivs = FALSE, 106 | optimizer = model.optimizer, 107 | optCtrl = list( 108 | method = "NLOPT_LN_NELDERMEAD", 109 | starttests = TRUE, 110 | kkt = TRUE 111 | ) 112 | ) 113 | ) 114 | )) 115 | } 116 | } 117 | } 118 | 119 | # Function output 120 | return(out) 121 | } 122 | -------------------------------------------------------------------------------- /R/boot_auto_mrp.R: -------------------------------------------------------------------------------- 1 | #' Bootstrapping wrapper for auto_mrp 2 | #' 3 | #' \code{boot_auto_mrp} estimates uncertainty for auto_mrp via bootstrapping. 4 | #' 5 | #' @inheritParams auto_MrP 6 | #' @param pc.names A character vector of the principal component variable names 7 | #' in the data. 8 | 9 | boot_auto_mrp <- function( 10 | y, L1.x, L2.x, mrp.L2.x, L2.unit, L2.reg, L2.x.scale, pcs, folds, 11 | bin.proportion, bin.size, survey, census, ebma.size, k.folds, cv.sampling, 12 | loss.unit, loss.fun, best.subset, lasso, pca, gb, svm, mrp, deep.mrp, 13 | best.subset.L2.x, lasso.L2.x, pca.L2.x, pc.names, gb.L2.x, svm.L2.x, 14 | svm.L2.unit, svm.L2.reg, gb.L2.unit, gb.L2.reg, deep.splines, lasso.lambda, 15 | lasso.n.iter, gb.interaction.depth, gb.shrinkage, gb.n.trees.init, 16 | gb.n.trees.increase, gb.n.trees.max, gb.n.minobsinnode, svm.kernel, svm.gamma, 17 | svm.cost, ebma.tol, boot.iter, cores 18 | ) { 19 | 20 | # Binding for global variables 21 | `%>%` <- dplyr::`%>%` 22 | 23 | # Register cores 24 | cl <- multicore(cores = cores, type = "open", cl = NULL) 25 | 26 | # Bootstrap iterations 27 | boot_out <- foreach::foreach( 28 | idx_boot = 1:boot.iter, .packages = "autoMrP" 29 | ) %dorng% { 30 | 31 | boot_mrp <- boot_fun( 32 | y = y, 33 | L1.x = L1.x, 34 | L2.x = L2.x, 35 | mrp.L2.x = mrp.L2.x, 36 | L2.unit = L2.unit, 37 | L2.reg = L2.reg, 38 | pcs = pcs, 39 | folds = folds, 40 | survey = survey, 41 | census = census, 42 | k.folds = k.folds, 43 | cv.sampling = cv.sampling, 44 | ebma.size = ebma.size, 45 | loss.unit = loss.unit, 46 | loss.fun = loss.fun, 47 | best.subset = best.subset, 48 | lasso = lasso, 49 | pca = pca, 50 | gb = gb, 51 | svm = svm, 52 | mrp = mrp, 53 | deep.mrp = deep.mrp, 54 | best.subset.L2.x = best.subset.L2.x, 55 | lasso.L2.x = lasso.L2.x, 56 | pca.L2.x = pca.L2.x, 57 | pc.names = pc.names, 58 | gb.L2.x = gb.L2.x, 59 | svm.L2.x = svm.L2.x, 60 | svm.L2.unit = svm.L2.unit, 61 | svm.L2.reg = svm.L2.reg, 62 | gb.L2.unit = gb.L2.unit, 63 | gb.L2.reg = gb.L2.reg, 64 | deep.splines = deep.splines, 65 | lasso.lambda = lasso.lambda, 66 | lasso.n.iter = lasso.n.iter, 67 | gb.interaction.depth = gb.interaction.depth, 68 | gb.shrinkage = gb.shrinkage, 69 | gb.n.trees.init = gb.n.trees.init, 70 | gb.n.trees.increase = gb.n.trees.increase, 71 | gb.n.trees.max = gb.n.trees.max, 72 | gb.n.minobsinnode = gb.n.minobsinnode, 73 | svm.kernel = svm.kernel, 74 | svm.gamma = svm.gamma, 75 | svm.cost = svm.cost, 76 | ebma.tol = ebma.tol 77 | ) 78 | } # end of foreach loop 79 | 80 | # Median and standard deviation of EBMA estimates 81 | if (!any( 82 | boot_out[[1]]$ebma == "EBMA step skipped (only 1 classifier run)" 83 | )) { 84 | ebma <- base::do.call( 85 | base::rbind, base::do.call(base::rbind, boot_out)[, "ebma"] 86 | ) 87 | 88 | # weights 89 | weights <- base::do.call( 90 | base::rbind, base::do.call(base::rbind, boot_out)[, "weights"] 91 | ) %>% 92 | dplyr::as_tibble() %>% 93 | dplyr::select( 94 | 
contains("best_subset"), 95 | contains("pca"), 96 | contains("lasso"), 97 | contains("gb"), 98 | contains("svm"), 99 | contains("mrp") 100 | ) 101 | 102 | } else { 103 | ebma <- "EBMA step skipped (only 1 classifier run)" 104 | weights <- NULL 105 | } 106 | 107 | # Median and standard deviations for classifier estimates 108 | classifiers <- base::do.call( 109 | base::rbind, base::do.call(base::rbind, boot_out)[, "classifiers"] 110 | ) %>% 111 | dplyr::select( 112 | one_of(L2.unit), 113 | contains("best_subset"), 114 | contains("pca"), 115 | contains("lasso"), 116 | contains("gb"), 117 | contains("svm"), 118 | contains("mrp") 119 | ) 120 | 121 | if (!is.null(weights)) { 122 | boot_out <- list(ebma = ebma, classifiers = classifiers, weights = weights) 123 | } else { 124 | boot_out <- list(ebma = ebma, classifiers = classifiers) 125 | } 126 | 127 | # De-register cluster 128 | multicore(cores = cores, type = "close", cl = cl) 129 | 130 | return(boot_out) 131 | 132 | } 133 | -------------------------------------------------------------------------------- /R/build_folds.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | build_folds <- function(survey, 4 | L2.unit, 5 | ebma.size = 1/3, 6 | k.folds = 5, 7 | cv.sampling = "L2 units") { 8 | # EBMA hold-out fold 9 | ebma.size <- round(nrow(survey) * ebma.size, digits = 0) 10 | 11 | if(ebma.size > 0) { 12 | ebma_folding_out <- ebma_folding(data = survey, 13 | L2.unit = L2.unit, 14 | ebma.size = ebma.size) 15 | ebma_fold <- ebma_folding_out$ebma_fold 16 | cv_data <- ebma_folding_out$cv_data 17 | } else{ 18 | ebma_fold <- NULL 19 | cv_data <- survey 20 | } 21 | 22 | # K folds for cross-validation 23 | cv_folds <- cv_folding(data = cv_data, 24 | L2.unit = L2.unit, 25 | k.folds = k.folds, 26 | cv.sampling = cv.sampling) 27 | } 28 | -------------------------------------------------------------------------------- /R/census_data.R: -------------------------------------------------------------------------------- 1 | #' Quasi census data. 2 | #' 3 | #' The census file is generated from the full 2008 Cooperative Congressional Election Studies 4 | #' item cc418_1 by dissaggregating the 64 ideal type combinations of the individual level variables 5 | #' L1x1, L2x2 and L1x3. A row is an ideal type in a given state. 6 | #' 7 | #' 8 | #' @format A data frame with 2934 rows and 13 variables: 9 | #' \describe{ 10 | #' \item{state}{U.S. state} 11 | #' \item{L2.unit}{U.S. state id} 12 | #' \item{region}{U.S. region (four categories: 1 = Northeast; 2 = Midwest; 3 = South; 4 = West)} 13 | #' \item{L1x1}{Age group (four categories)} 14 | #' \item{L1x2}{Education level (four categories)} 15 | #' \item{L1x3}{Gender-race combination (six categories)} 16 | #' \item{proportion}{State-level proportion of respondents of that ideal type in the population} 17 | #' \item{L2.x1}{State-level share of votes for the Republican candidate in the previous presidential election} 18 | #' \item{L2.x2}{State-level percentage of Evangelical Protestant or Mormon respondents} 19 | #' \item{L2.x3}{State-level percentage of the population living in urban areas} 20 | #' \item{L2.x4}{State-level unemployment rate} 21 | #' \item{L2.x5}{State-level share of Hispanics} 22 | #' \item{L2.x6}{State-level share of Whites} 23 | #' } 24 | #' @source The data set (excluding L2.x3, L2.x4, L2.x5, L2.x6) is taken from the 25 | #' article: Buttice, Matthew K, and Benjamin Highton. 2013. 
"How does 26 | #' multilevel regression and poststrat-stratification perform with 27 | #' conventional national surveys?" Political Analysis 21(4): 449-467. L2.x3, 28 | #' L2.x3, L2.x4, L2.x5 and L2.x6 are available at 29 | #' \url{https://www.census.gov}. 30 | "census" 31 | -------------------------------------------------------------------------------- /R/deep_mrp_classifier.r: -------------------------------------------------------------------------------- 1 | #' Deep MrP classifier 2 | #' 3 | #' \code{deep_mrp_classifier} applies Deep MrP implemented in the \pkg{vglmer} 4 | #' package to a data set. 5 | #' 6 | #' @inheritParams auto_MrP 7 | #' @param form Model formula. A two-sided linear formula describing 8 | #' the model to be fit, with the outcome on the LHS and the covariates 9 | #' separated by + operators on the RHS. 10 | #' @param data Data. A data.frame containing the data used to train the model. 11 | #' @return A Deep MrP model. A \code{\link[vglmer]{vglmer}} object. 12 | 13 | deep_mrp_classifier <- function(y, form, data, verbose) { 14 | 15 | # Determine type of dependent variable 16 | if ( 17 | data %>% 18 | dplyr::pull(!!y) %>% 19 | unique() %>% 20 | length() == 2 21 | ) { 22 | family <- "binomial" 23 | } else { 24 | family <- "linear" 25 | } 26 | 27 | # run vglmer model 28 | if (verbose) { 29 | out <- vglmer::vglmer( 30 | formula = as.formula(form), 31 | data = data, 32 | family = family 33 | ) 34 | } else { 35 | out <- suppressMessages(suppressWarnings( 36 | vglmer::vglmer( 37 | formula = as.formula(form), 38 | data = data, 39 | family = family 40 | ) 41 | )) 42 | } 43 | return(out) 44 | } -------------------------------------------------------------------------------- /R/gb_classifier.R: -------------------------------------------------------------------------------- 1 | #' GB classifier 2 | #' 3 | #' \code{gb_classifier} applies gradient boosting classification to a data set. 4 | #' 5 | #' @inheritParams auto_MrP 6 | #' @param form Model formula. A two-sided linear formula describing 7 | #' the model to be fit, with the outcome on the LHS and the covariates 8 | #' separated by + operators on the RHS. 9 | #' @param distribution Model distribution. A character string specifying the 10 | #' name of the distribution to be used. 11 | #' @param data.train Training data. A data.frame containing the training data 12 | #' used to train the model. 13 | #' @param n.trees Total number of trees. An integer-valued scalar specifying 14 | #' the total number of trees to be fit. 15 | #' @param interaction.depth Interaction depth. An integer-valued scalar 16 | #' specifying the maximum depth of each tree. 17 | #' @param n.minobsinnode Minimum number of observations in terminal nodes. An 18 | #' integer-valued scalar specifying the minimum number of observations in the 19 | #' terminal nodes of the trees. 20 | #' @param shrinkage Learning rate. A numeric scalar specifying the shrinkage or 21 | #' learning rate applied to each tree in the expansion. 22 | #' @param verbose Verbose output. A logical vector indicating whether or not 23 | #' verbose output should be printed. 24 | #' @return A gradient tree boosting model. A \code{\link[gbm]{gbm}} object. 
25 | 26 | gb_classifier <- function( 27 | y, form, distribution, data.train, 28 | n.trees, interaction.depth, 29 | n.minobsinnode, shrinkage, 30 | verbose = c(TRUE, FALSE) 31 | ) { 32 | 33 | # Determine type of dependent variable 34 | if ( 35 | data.train %>% 36 | dplyr::pull(!!y) %>% 37 | unique() %>% 38 | length() > 2 39 | ) { 40 | # set model family to gaussian 41 | distribution <- "gaussian" 42 | } 43 | 44 | # Train model on training data with number of total trees, interaction depth, 45 | # and learning rate as tuning parameters 46 | if (isTRUE(verbose == TRUE)) { 47 | out <- gbm::gbm( 48 | formula = form, 49 | distribution = distribution, 50 | data = data.train, 51 | n.trees = n.trees, 52 | interaction.depth = interaction.depth, 53 | n.minobsinnode = n.minobsinnode, 54 | shrinkage = shrinkage, 55 | train.fraction = 1, 56 | n.cores = 1 57 | ) 58 | } else { 59 | out <- suppressMessages(suppressWarnings( 60 | gbm::gbm( 61 | formula = form, 62 | distribution = distribution, 63 | data = data.train, n.trees = n.trees, 64 | interaction.depth = interaction.depth, 65 | n.minobsinnode = n.minobsinnode, 66 | shrinkage = shrinkage, 67 | train.fraction = 1, 68 | n.cores = 1 69 | ) 70 | )) 71 | } 72 | 73 | # Function output 74 | return(out) 75 | } 76 | 77 | #' GB classifier update 78 | #' 79 | #' \code{gb_classifier_update()} grows additional trees in gradient tree 80 | #' boosting ensemble. 81 | #' 82 | #' @param object Gradient tree boosting output. A gbm object. 83 | #' @param n.new.trees Number of additional trees to grow. A numeric scalar. 84 | #' @param verbose Verbose output. A logical vector indicating whether or not 85 | #' verbose output should be printed. 86 | #' @return An updated gradient tree boosting model. 87 | #' A \code{\link[gbm]{gbm.more}} object. 88 | 89 | gb_classifier_update <- function( 90 | object, n.new.trees, verbose = c(TRUE, FALSE) 91 | ) { 92 | 93 | # Train model on training data with number of total trees, interaction depth, 94 | # and learning rate as tuning parameters 95 | if (isTRUE(verbose == TRUE)) { 96 | out <- gbm::gbm.more( 97 | object = object, 98 | n.new.trees = n.new.trees 99 | ) 100 | } else { 101 | out <- suppressMessages(suppressWarnings( 102 | gbm::gbm.more( 103 | object = object, 104 | n.new.trees = n.new.trees 105 | ) 106 | )) 107 | } 108 | 109 | # Function output 110 | return(out) 111 | } 112 | -------------------------------------------------------------------------------- /R/globals.R: -------------------------------------------------------------------------------- 1 | globalVariables(c("%>%", 2 | "%dopar%", 3 | "%dorng%", 4 | ".", 5 | ":=", 6 | "ae", 7 | "all_of", 8 | "bce", 9 | "ce", 10 | "contains", 11 | "data", 12 | "depth", 13 | "err", 14 | "err_rates", 15 | "estimates", 16 | "fn", 17 | "fp", 18 | "index", 19 | "lambda", 20 | "lasso_opt", 21 | "lb", 22 | "level", 23 | "mae", 24 | "measure", 25 | "method", 26 | "model", 27 | "mrp", 28 | "mse", 29 | "msfe", 30 | "n", 31 | "n_L2", 32 | "ntrees", 33 | "one_of", 34 | "os", 35 | "pc_names", 36 | "prop", 37 | "pval", 38 | "row_number", 39 | "sqe", 40 | "state", 41 | "tp", 42 | "ub", 43 | "value", 44 | "verbose", 45 | "y_svm", 46 | "deep_mrp", 47 | "gaussian")) 48 | -------------------------------------------------------------------------------- /R/lasso_classifier.R: -------------------------------------------------------------------------------- 1 | #' Lasso classifier 2 | #' 3 | #' \code{lasso_classifier} applies lasso classification to a data set. 
4 | #' 5 | #' @inheritParams auto_MrP 6 | #' @param L2.fix Fixed effects. A two-sided linear formula describing 7 | #' the fixed effects part of the model, with the outcome on the LHS and 8 | #' the fixed effects separated by + operators on the RHS. 9 | #' @param L1.re Random effects. A named list object, with the random effects 10 | #' providing the names of the list elements and ~ 1 being the list elements. 11 | #' @param data.train Training data. A data.frame containing the training data 12 | #' used to train the model. 13 | #' @param lambda Tuning parameter. Lambda is the penalty parameter that controls 14 | #' the shrinkage of fixed effects. 15 | #' @param model.family Model family. A variable indicating the model family 16 | #' to be used by glmmLasso. Defaults to binomial(link = "probit"). 17 | #' @param verbose Verbose output. A logical vector indicating whether or not 18 | #' verbose output should be printed. 19 | #' @return A multilevel lasso model. An \code{\link[glmmLasso]{glmmLasso}} 20 | #' object. 21 | 22 | lasso_classifier <- function( 23 | L2.fix, L1.re, data.train, lambda, model.family, y, 24 | verbose = c(TRUE, FALSE) 25 | ) { 26 | 27 | # Determine type of dependent variable 28 | if ( 29 | data.train %>% 30 | dplyr::pull(!!y) %>% 31 | unique() %>% 32 | length() > 2 33 | ) { 34 | # set model family to gaussian 35 | model.family <- gaussian(link = "identity") 36 | } 37 | 38 | # Train model on training data with lambda as tuning parameter 39 | if (isTRUE(verbose == TRUE)) { 40 | out <- glmmLasso::glmmLasso( 41 | fix = L2.fix, 42 | rnd = L1.re, 43 | data = data.train, 44 | lambda = lambda, 45 | family = model.family, 46 | switch.NR = FALSE, 47 | final.re = TRUE, 48 | control = list( 49 | center = TRUE, 50 | standardize = TRUE 51 | ) 52 | ) 53 | } else { 54 | out <- quiet( 55 | suppressMessages(suppressWarnings( 56 | glmmLasso::glmmLasso( 57 | fix = L2.fix, 58 | rnd = L1.re, 59 | data = data.train, 60 | lambda = lambda, 61 | family = model.family, 62 | switch.NR = FALSE, 63 | final.re = TRUE, 64 | control = list( 65 | center = TRUE, 66 | standardize = TRUE 67 | ) 68 | ) 69 | )) 70 | ) 71 | } 72 | 73 | # Function output 74 | return(out) 75 | } 76 | -------------------------------------------------------------------------------- /R/run_best_subset.R: -------------------------------------------------------------------------------- 1 | #' Apply best subset classifier to MrP. 2 | #' 3 | #' \code{run_best_subset} is a wrapper function that applies the best subset 4 | #' classifier to a list of models provided by the user, evaluates the models' 5 | #' prediction performance, and chooses the best-performing model. 6 | #' 7 | #' @inheritParams auto_MrP 8 | #' @param data Data for cross-validation. A \code{list} of \eqn{k} 9 | #' \code{data.frames}, one for each fold to be used in \eqn{k}-fold 10 | #' cross-validation. 11 | #' @return A model formula of the winning best subset classifier model. 
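#' @examples \dontrun{
#' # Editorial usage sketch, not from the package authors: `cv_folds` is an
#' # assumed placeholder for a list of k data.frames, e.g. as returned by
#' # cv_folding(); variable names follow the packaged absentee_voting data.
#' best_form <- run_best_subset(
#'   y = "YES", L1.x = c("L1x1", "L1x2", "L1x3"),
#'   L2.x = c("L2.x1", "L2.x2"), L2.unit = "state", L2.reg = "region",
#'   loss.unit = "individuals", loss.fun = "MSE",
#'   data = cv_folds, verbose = TRUE, cores = 1
#' )
#' }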
12 | 13 | run_best_subset <- function( 14 | y, L1.x, L2.x, L2.unit, L2.reg, 15 | loss.unit, loss.fun, data, verbose, cores 16 | ) { 17 | 18 | # List of all models to be evaluated 19 | models <- model_list( 20 | y = y, 21 | L1.x = L1.x, 22 | L2.x = L2.x, 23 | L2.unit = L2.unit, 24 | L2.reg = L2.reg 25 | ) 26 | 27 | # Parallel tuning if cores > 1 28 | if (cores > 1) { 29 | 30 | # Train all models in parallel 31 | m_errors <- run_best_subset_mc( 32 | verbose = verbose, 33 | models = models, 34 | data = data, 35 | loss.unit = loss.unit, 36 | loss.fun = loss.fun, 37 | y = y, 38 | L1.x = L1.x, 39 | L2.x = L2.x, 40 | L2.unit = L2.unit, 41 | L2.reg = L2.reg, 42 | cores = cores 43 | ) 44 | } else { 45 | 46 | # Train and evaluate each model 47 | m_errors <- lapply(seq_along(models), function(m) { 48 | # Print model m 49 | if (isTRUE(verbose)) { 50 | M <- length(models) 51 | cat(paste( 52 | "Best subset: Running model ", m, 53 | " out of ", M, " models\n", sep = "" 54 | )) 55 | } 56 | 57 | # Loop over each fold 58 | k_errors <- lapply(seq_along(data), function(k) { 59 | # Split data in training and validation sets 60 | data_train <- dplyr::bind_rows(data[-k]) 61 | data_valid <- dplyr::bind_rows(data[k]) 62 | 63 | # Train mth model on kth training set 64 | model_m <- best_subset_classifier( 65 | model = models[[m]], 66 | y = y, 67 | data.train = data_train, 68 | model.family = binomial(link = "probit"), 69 | model.optimizer = "bobyqa", 70 | n.iter = 1000000, 71 | verbose = verbose 72 | ) 73 | 74 | # Use trained model to make predictions for kth validation set 75 | pred_m <- stats::predict( 76 | model_m, 77 | newdata = data_valid, 78 | type = "response", 79 | allow.new.levels = TRUE 80 | ) 81 | 82 | # Evaluate predictions based on loss function 83 | perform_m <- loss_function( 84 | pred = pred_m, 85 | data.valid = data_valid, 86 | loss.unit = loss.unit, 87 | loss.fun = loss.fun, 88 | y = y, 89 | L2.unit = L2.unit 90 | ) 91 | }) 92 | 93 | # Mean over loss functions 94 | k_errors <- dplyr::bind_rows(k_errors) %>% 95 | dplyr::group_by(measure) %>% 96 | dplyr::summarise(value = mean(value), .groups = "drop") %>% 97 | dplyr::mutate(model = m) 98 | }) 99 | } 100 | 101 | # Extract best tuning parameters 102 | grid_cells <- dplyr::bind_rows(m_errors) 103 | best_params <- dplyr::slice( 104 | loss_score_ranking( 105 | score = grid_cells, 106 | loss.fun = loss.fun), 1) 107 | 108 | # Choose best-performing model 109 | out <- models[[dplyr::pull(.data = best_params, var = model)]] 110 | 111 | # Function output 112 | return(out) 113 | 114 | } 115 | 116 | ################################################################################ 117 | # Multicore tuning for best subset # 118 | ################################################################################ 119 | #' Best subset multicore tuning. 120 | #' 121 | #' \code{run_best_subset_mc} is called from within \code{run_best_subset}. It 122 | #' tunes using multiple cores. 123 | #' 124 | #' @param y Outcome variable. A character scalar containing the column name of 125 | #' the outcome variable in \code{survey}. 126 | #' @param L1.x Individual-level covariates. A character vector containing the 127 | #' column names of the individual-level variables in \code{survey} and 128 | #' \code{census} used to predict outcome \code{y}. Note that geographic unit 129 | #' is specified in argument \code{L2.unit}. 130 | #' @param L2.x Context-level covariates. 
A character vector containing the 131 | #' column names of the context-level variables in \code{survey} and 132 | #' \code{census} used to predict outcome \code{y}. 133 | #' @param L2.unit Geographic unit. A character scalar containing the column 134 | #' name of the geographic unit in \code{survey} and \code{census} at which 135 | #' outcomes should be aggregated. 136 | #' @param L2.reg Geographic region. A character scalar containing the column 137 | #' name of the geographic region in \code{survey} and \code{census} by which 138 | #' geographic units are grouped (\code{L2.unit} must be nested within 139 | #' \code{L2.reg}). Default is \code{NULL}. 140 | #' @param loss.unit Loss function unit. A character-valued scalar indicating 141 | #' whether performance loss should be evaluated at the level of individual 142 | #' respondents (\code{individuals}) or geographic units (\code{L2 units}). 143 | #' Default is \code{individuals}. 144 | #' @param loss.fun Loss function. A character-valued scalar indicating whether 145 | #' prediction loss should be measured by the mean squared error (\code{MSE}) 146 | #' or the mean absolute error (\code{MAE}). Default is \code{MSE}. 147 | #' @param data Data for cross-validation. A \code{list} of \eqn{k} 148 | #' \code{data.frames}, one for each fold to be used in \eqn{k}-fold 149 | #' cross-validation. 150 | #' @param cores The number of cores to be used. An integer indicating the number 151 | #' of processor cores used for parallel computing. Default is 1. 152 | #' @param models The models to perform best subset selection on. A list of model 153 | #' formulas. 154 | #' @param verbose Verbose output. A logical argument indicating whether or not 155 | #' verbose output should be printed. Default is \code{TRUE}. 156 | #' @return The cross-validation errors for all models. A list. 
157 | #' @examples \dontrun{ 158 | #' # not yet 159 | #' } 160 | 161 | run_best_subset_mc <- function( 162 | y, L1.x, L2.x, L2.unit, L2.reg, loss.unit, loss.fun, 163 | data, cores, models, verbose 164 | ) { 165 | 166 | # Binding for global variables 167 | m <- NULL 168 | 169 | # Register cores 170 | cl <- multicore(cores = cores, type = "open", cl = NULL) 171 | 172 | # Train and evaluate each model 173 | m_errors <- foreach::foreach( 174 | m = seq_along(models), .packages = "autoMrP" 175 | ) %dorng% { 176 | 177 | # Loop over each fold 178 | k_errors <- lapply(seq_along(data), function(k) { 179 | # Split data in training and validation sets 180 | data_train <- dplyr::bind_rows(data[-k]) 181 | data_valid <- dplyr::bind_rows(data[k]) 182 | 183 | # Train mth model on kth training set 184 | model_m <- best_subset_classifier( 185 | model = models[[m]], 186 | y = y, 187 | data.train = data_train, 188 | model.family = binomial(link = "probit"), 189 | model.optimizer = "bobyqa", 190 | n.iter = 1000000, 191 | verbose = verbose 192 | ) 193 | 194 | # Use trained model to make predictions for kth validation set 195 | pred_m <- stats::predict( 196 | model_m, newdata = data_valid, 197 | type = "response", allow.new.levels = TRUE 198 | ) 199 | 200 | # Evaluate predictions based on loss function 201 | perform_m <- loss_function( 202 | pred = pred_m, 203 | data.valid = data_valid, 204 | loss.unit = loss.unit, 205 | loss.fun = loss.fun, 206 | y = y, 207 | L2.unit = L2.unit 208 | ) 209 | }) 210 | 211 | # Mean over loss functions 212 | k_errors <- dplyr::bind_rows(k_errors) %>% 213 | dplyr::group_by(measure) %>% 214 | dplyr::summarise(value = mean(value), .groups = "drop") %>% 215 | dplyr::mutate(model = m) 216 | } 217 | 218 | # De-register cluster 219 | multicore(cores = cores, type = "close", cl = cl) 220 | 221 | # Function output 222 | return(m_errors) 223 | } 224 | -------------------------------------------------------------------------------- /R/run_deep_bs.r: -------------------------------------------------------------------------------- 1 | #' Apply Deep MrP with the best subset classifier to MrP. 2 | #' 3 | #' \code{run_deep_bs} is a wrapper function that applies the best subset 4 | #' classifier to a list of models provided by the user, evaluates the models' 5 | #' prediction performance, and chooses the best-performing model. It differs 6 | #' from \code{run_best_subset} in that it includes L1.x interactions. 7 | #' 8 | #' @inheritParams auto_MrP 9 | #' @param data Data for cross-validation. A \code{list} of \eqn{k} 10 | #' \code{data.frames}, one for each fold to be used in \eqn{k}-fold 11 | #' cross-validation. 12 | #' @return A model formula of the winning best subset classifier model. 
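#' @examples \dontrun{
#' # Editorial usage sketch, not from the package authors: as in
#' # run_best_subset(), `cv_folds` stands in for a list of k data.frames;
#' # all argument values are illustrative assumptions.
#' deep_form <- run_deep_bs(
#'   y = "YES", L1.x = c("L1x1", "L1x2", "L1x3"),
#'   L2.x = c("L2.x1", "L2.x2"), L2.unit = "state", L2.reg = "region",
#'   loss.unit = "individuals", loss.fun = "MSE", deep.splines = TRUE,
#'   data = cv_folds, k.folds = 5, verbose = TRUE, cores = 1
#' )
#' }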
13 | 14 | run_deep_bs <- function( 15 | y, L1.x, L2.x, L2.unit, L2.reg, loss.unit, loss.fun, deep.splines, data, 16 | k.folds, verbose, cores 17 | ) { 18 | 19 | # Determine type of dependent variable 20 | if ( 21 | data[[1]] %>% 22 | dplyr::pull(!!y) %>% 23 | unique() %>% 24 | length() == 2 25 | ) { 26 | dv_type <- "binary" 27 | } else { 28 | dv_type <- "linear" 29 | } 30 | 31 | # List of all models to be evaluated 32 | models <- model_list( 33 | y = y, 34 | L1.x = L1.x, 35 | L2.x = L2.x, 36 | L2.unit = L2.unit, 37 | L2.reg = L2.reg 38 | ) 39 | 40 | # no nesting with deep interactions 41 | if (!is.null(L2.reg) && !is.null(L2.unit)) { 42 | models <- lapply(models, function(x) { 43 | # model formula to character 44 | m_form <- as.character(x) 45 | # replace (1 | region/state) with (1 | region) + (1 | state) 46 | m_form <- stringr::str_replace_all( 47 | string = m_form, 48 | pattern = "\\(1 \\| region/state\\)", 49 | replacement = "\\(1 | state\\) + \\(1 | region\\)" 50 | ) 51 | # character to formula 52 | m_form <- as.formula(sprintf("%s%s%s", m_form[2], m_form[1], m_form[3])) 53 | }) 54 | } 55 | 56 | # add interactions to the models 57 | models <- lapply(models, function(x) { 58 | 59 | # get all level 1 variables in the current model 60 | c_l1_x <- x %>% 61 | as.character() %>% 62 | .[3] %>% 63 | stringr::str_extract_all(pattern = "L1x\\d+") %>% 64 | unlist() 65 | 66 | # generate all interactions of L1.x 67 | l1_comb <- unlist(lapply(2:length(c_l1_x), function(x) { 68 | apply(combn(L1.x, x), 2, paste, collapse = ".") 69 | })) 70 | 71 | # generate all interactions of L1.x with L2.unit 72 | l1_state <- paste(L1.x, L2.unit, sep = ".") 73 | 74 | # generate all interactions of L1.x with L2.reg 75 | if (!is.null(L2.reg)) { 76 | l1_region <- paste(L1.x, L2.reg, sep = ".") 77 | } else { 78 | l1_region <- NULL 79 | } 80 | 81 | # interactions 82 | add_interactions <- paste0( 83 | # interactions of L1x 84 | paste("(1 | ", l1_comb, ")", collapse = " + "), " + ", 85 | # interactions of L1x with L2.unit 86 | paste("(1 | ", l1_state, ")", collapse = " + "), " + ", 87 | # interactions of L1x with L2.reg 88 | if (any(!is.null(l1_region))) { 89 | paste("(1 | ", l1_region, ")", collapse = " + ") 90 | } 91 | ) 92 | 93 | # remove trailing " + " from interactions 94 | add_interactions <- stringr::str_extract( 95 | string = add_interactions, 96 | pattern = "^.*\\)" 97 | ) 98 | 99 | # character to formula 100 | add_interactions <- as.formula(paste("~ . 
+", add_interactions)) 101 | 102 | # update formula with interactions 103 | x <- update(x, add_interactions) 104 | 105 | # add splines to context level variables 106 | if (deep.splines) { 107 | 108 | # formula to character 109 | char_form <- as.character(x) 110 | char_form <- sprintf("%s %s %s", char_form[2], char_form[1], char_form[3]) 111 | 112 | # get all context level variables in the current model 113 | c_l2_x <- char_form %>% 114 | stringr::str_extract_all(pattern = "L2\\.x\\d+") %>% 115 | unlist() 116 | 117 | # replace in string 118 | for (i in seq_along(c_l2_x)) { 119 | char_form <- stringr::str_replace( 120 | string = char_form, 121 | pattern = c_l2_x[i], 122 | replacement = sprintf("v_s(%s)", c_l2_x[i]) 123 | ) 124 | } 125 | 126 | # character to formula 127 | x <- as.formula(char_form) 128 | 129 | } 130 | 131 | return(x) 132 | }) 133 | 134 | # Register cores 135 | cl <- multicore(cores = cores, type = "open", cl = NULL) 136 | 137 | # loop over models 138 | m_errors <- foreach::foreach( 139 | m = seq_along(models), .packages = "autoMrP" 140 | ) %dorng% { 141 | 142 | `%>%` <- magrittr::`%>%` 143 | 144 | k_errors <- lapply(seq_len(k.folds), function(k) { 145 | 146 | # Split data in training and validation sets 147 | data_train <- dplyr::bind_rows(data[-k]) 148 | data_valid <- dplyr::bind_rows(data[k]) 149 | 150 | # Train mth model on kth training set 151 | model_m <- deep_mrp_classifier( 152 | form = models[[m]], 153 | y = y, 154 | data = data_train, 155 | verbose = TRUE 156 | ) 157 | 158 | # predictions based on DV type (binary or continuous) 159 | if (dv_type == "binary") { 160 | # use trained model to make predictions for kth validation set 161 | pred_m <- vglmer::predict_MAVB( 162 | samples = 1000, 163 | model_m, 164 | newdata = data_valid, 165 | allow_missing_levels = TRUE 166 | )[["mean"]] 167 | 168 | # convert to response probabilities 169 | pred_m <- stats::plogis(pred_m) 170 | 171 | } else if (dv_type == "linear") { 172 | # Use trained model to make predictions for kth validation set 173 | pred_m <- predict( 174 | samples = 1000, 175 | object = model_m, 176 | newdata = data_valid, 177 | allow_missing_levels = TRUE 178 | )[["mean"]] 179 | } 180 | 181 | # evaluate predictions based on loss function 182 | perform_m <- loss_function( 183 | pred = pred_m, 184 | data.valid = data_valid, 185 | loss.unit = loss.unit, 186 | loss.fun = loss.fun, 187 | y = y, 188 | L2.unit = L2.unit 189 | ) 190 | 191 | return(perform_m) 192 | }) 193 | 194 | # Mean over loss functions 195 | k_errors <- dplyr::bind_rows(k_errors) %>% 196 | dplyr::group_by(measure) %>% 197 | dplyr::summarise(value = mean(value), .groups = "drop") %>% 198 | dplyr::mutate(model = m) 199 | 200 | return(k_errors) 201 | } 202 | 203 | # De-register cluster 204 | multicore(cores = cores, type = "close", cl = cl) 205 | 206 | # Extract best tuning parameters 207 | grid_cells <- dplyr::bind_rows(m_errors) 208 | best_params <- dplyr::slice( 209 | loss_score_ranking( 210 | score = grid_cells, 211 | loss.fun = loss.fun 212 | ), 1 213 | ) 214 | 215 | # Choose best-performing model 216 | out <- models[[dplyr::pull(.data = best_params, var = model)]] 217 | 218 | 219 | # Function output 220 | return(out) 221 | 222 | } -------------------------------------------------------------------------------- /R/run_deep_pca.r: -------------------------------------------------------------------------------- 1 | #' Apply PCA classifier to MrP. 
2 | #' 3 | #' \code{run_deep_pca} is a wrapper function that applies the PCA classifier to 4 | #' data provided by the user, evaluates prediction performance, and chooses the 5 | #' best-performing model. It differs from \code{run_pca} in that it 6 | #' includes L1.x interactions. 7 | #' 8 | #' @inheritParams auto_MrP 9 | #' @param data Data for cross-validation. A \code{list} of \eqn{k} 10 | #' \code{data.frames}, one for each fold to be used in \eqn{k}-fold 11 | #' cross-validation. 12 | #' 13 | #' @return A model formula of the winning PCA classifier model. 14 | 15 | run_deep_pca <- function( 16 | y, L1.x, L2.x, L2.unit, L2.reg, loss.unit, loss.fun, deep.splines, data, 17 | cores, verbose 18 | ) { 19 | 20 | # Determine type of dependent variable 21 | if ( 22 | data[[1]] %>% 23 | dplyr::pull(!!y) %>% 24 | unique() %>% 25 | length() == 2 26 | ) { 27 | dv_type <- "binary" 28 | } else { 29 | dv_type <- "linear" 30 | } 31 | 32 | # List of all models to be evaluated 33 | models <- model_list_pca( 34 | y = y, 35 | L1.x = L1.x, 36 | L2.x = L2.x, 37 | L2.unit = L2.unit, 38 | L2.reg = L2.reg 39 | ) 40 | 41 | # no nesting with deep interactions 42 | if (!is.null(L2.reg)) { 43 | models <- lapply(models, function(x) { 44 | # model formula to character 45 | m_form <- as.character(x) 46 | # replace (1 | region/state) with (1 | region) + (1 | state) 47 | m_form <- stringr::str_replace_all( 48 | string = m_form, 49 | pattern = "\\(1 \\| region/state\\)", 50 | replacement = "\\(1 | state\\) + \\(1 | region\\)" 51 | ) 52 | # character to formula 53 | m_form <- as.formula(sprintf("%s%s%s", m_form[2], m_form[1], m_form[3])) 54 | }) 55 | } 56 | 57 | # add interactions to the models 58 | models <- lapply(models, function(x) { 59 | 60 | # get all level 1 variables in the current model 61 | c_l1_x <- x %>% 62 | as.character() %>% 63 | .[3] %>% 64 | stringr::str_extract_all(pattern = "L1x\\d+") %>% 65 | unlist() 66 | 67 | # generate all interactions of L1.x 68 | l1_comb <- unlist(lapply(2:length(c_l1_x), function(x) { 69 | apply(combn(L1.x, x), 2, paste, collapse = ".") 70 | })) 71 | 72 | # generate all interactions of L1.x with L2.unit 73 | l1_state <- paste(L1.x, L2.unit, sep = ".") 74 | 75 | # generate all interactions of L1.x with L2.reg 76 | if (!is.null(L2.reg)) { 77 | l1_region <- paste(L1.x, L2.reg, sep = ".") 78 | } else { 79 | l1_region <- NULL 80 | } 81 | 82 | # interactions 83 | add_interactions <- paste0( 84 | # interactions of L1x 85 | paste("(1 | ", l1_comb, ")", collapse = " + "), " + ", 86 | # interactions of L1x with L2.unit 87 | paste("(1 | ", l1_state, ")", collapse = " + "), " + ", 88 | # interactions of L1x with L2.reg 89 | if (any(!is.null(l1_region))) { 90 | paste("(1 | ", l1_region, ")", collapse = " + ") 91 | } 92 | ) 93 | 94 | # remove trailing " + " from interactions 95 | add_interactions <- stringr::str_extract( 96 | string = add_interactions, 97 | pattern = "^.*\\)" 98 | ) 99 | 100 | # character to formula 101 | add_interactions <- as.formula(paste("~ . 
+", add_interactions)) 102 | 103 | # update formula with interactions 104 | x <- update(x, add_interactions) 105 | 106 | # add splines to context level variables 107 | if (deep.splines) { 108 | 109 | # formula to character 110 | char_form <- as.character(x) 111 | char_form <- sprintf("%s %s %s", char_form[2], char_form[1], char_form[3]) 112 | 113 | # get all context level variables in the current model 114 | c_l2_x <- char_form %>% 115 | stringr::str_extract_all(pattern = "L2\\.x\\d+") %>% 116 | unlist() 117 | 118 | # replace in string 119 | for (i in seq_along(c_l2_x)) { 120 | char_form <- stringr::str_replace( 121 | string = char_form, 122 | pattern = c_l2_x[i], 123 | replacement = sprintf("v_s(%s)", c_l2_x[i]) 124 | ) 125 | } 126 | 127 | # character to formula 128 | x <- as.formula(char_form) 129 | 130 | } 131 | 132 | return(x) 133 | }) 134 | 135 | # Register cores 136 | cl <- multicore(cores = cores, type = "open", cl = NULL) 137 | 138 | # Train and evaluate each model 139 | m_errors <- foreach::foreach( 140 | m = seq_along(models), .packages = "autoMrP", 141 | .export = c("deep_mrp_classifier", "loss_function") 142 | ) %dorng% { 143 | 144 | `%>%` <- magrittr::`%>%` 145 | 146 | # Loop over each fold 147 | k_errors <- lapply(seq_along(data), function(k) { 148 | 149 | # Split data in training and validation sets 150 | data_train <- dplyr::bind_rows(data[-k]) 151 | data_valid <- dplyr::bind_rows(data[k]) 152 | 153 | # Train mth model on kth training set 154 | model_m <- deep_mrp_classifier( 155 | form = models[[m]], 156 | y = y, 157 | data = data_train, 158 | verbose = TRUE 159 | ) 160 | 161 | # predictions based on DV type (binary or continuous) 162 | if (dv_type == "binary") { 163 | # use trained model to make predictions for kth validation set 164 | pred_m <- vglmer::predict_MAVB( 165 | samples = 1000, 166 | model_m, 167 | newdata = data_valid, 168 | allow_missing_levels = TRUE 169 | )[["mean"]] 170 | 171 | # convert to response probabilities 172 | pred_m <- stats::plogis(pred_m) 173 | 174 | } else if (dv_type == "linear") { 175 | # Use trained model to make predictions for kth validation set 176 | pred_m <- predict( 177 | samples = 1000, 178 | object = model_m, 179 | newdata = data_valid, 180 | allow_missing_levels = TRUE 181 | )[["mean"]] 182 | } 183 | 184 | # evaluate predictions based on loss function 185 | perform_m <- loss_function( 186 | pred = pred_m, 187 | data.valid = data_valid, 188 | loss.unit = loss.unit, 189 | loss.fun = loss.fun, 190 | y = y, 191 | L2.unit = L2.unit 192 | ) 193 | }) 194 | 195 | # Mean over loss functions 196 | k_errors <- dplyr::bind_rows(k_errors) %>% 197 | dplyr::group_by(measure) %>% 198 | dplyr::summarise(value = mean(value), .groups = "drop") %>% 199 | dplyr::mutate(model = m) 200 | } 201 | 202 | # De-register cluster 203 | multicore(cores = cores, type = "close", cl = cl) 204 | 205 | # Extract best tuning parameters 206 | grid_cells <- dplyr::bind_rows(m_errors) 207 | best_params <- dplyr::slice( 208 | loss_score_ranking(score = grid_cells, loss.fun = loss.fun), 1 209 | ) 210 | 211 | # Choose best-performing model 212 | out <- models[[dplyr::pull(.data = best_params, var = model)]] 213 | 214 | # Function output 215 | return(out) 216 | } 217 | -------------------------------------------------------------------------------- /R/run_lasso.R: -------------------------------------------------------------------------------- 1 | #' Apply lasso classifier to MrP. 
2 | #' 3 | #' \code{run_lasso} is a wrapper function that applies the lasso classifier to 4 | #' data provided by the user, evaluates prediction performance, and chooses the 5 | #' best-performing model. 6 | #' 7 | #' @inheritParams auto_MrP 8 | #' @param lambda Lasso penalty parameter. A numeric \code{vector} of 9 | #' non-negative values. The penalty parameter controls the shrinkage of the 10 | #' context-level variables in the lasso model. Default is a sequence with 11 | #' minimum 0.1 and maximum 250 that is equally spaced on the log-scale. The 12 | #' number of values is controlled by the \code{lasso.n.iter} parameter. 13 | #' @param n.iter Lasso number of lambda values. An integer-valued scalar 14 | #' specifying the number of lambda values to search over. Default is 15 | #' \eqn{100}. 16 | #' \emph{Note:} Is ignored if a vector of \code{lasso.lambda} values is 17 | #' provided. 18 | #' @param data Data for cross-validation. A \code{list} of \eqn{k} 19 | #' \code{data.frames}, one for each fold to be used in \eqn{k}-fold 20 | #' cross-validation. 21 | #' 22 | #' @return The tuned lambda value. A numeric scalar. 23 | 24 | run_lasso <- function( 25 | y, L1.x, L2.x, L2.unit, L2.reg, n.iter, loss.unit, loss.fun, 26 | lambda, data, verbose, cores 27 | ) { 28 | 29 | # Lasso search grid 30 | if (is.null(lambda)) { 31 | lambda <- log_spaced(min = 0.1, max = 250, n = n.iter) 32 | } 33 | 34 | # Context-level fixed effects 35 | L2_fe <- paste(L2.x, collapse = " + ") 36 | if (L2_fe == "") { 37 | L2_fe_form <- as.formula(paste(y, " ~ 1", sep = "")) 38 | L2.x <- NULL 39 | } else { 40 | L2_fe_form <- as.formula(paste(y, " ~ ", L2_fe, sep = "")) 41 | } 42 | 43 | # Individual-level random effects as named list 44 | L1_re <- setNames( 45 | as.list(rep(c(~ 1), times = length(c(L1.x, L2.unit, L2.reg)))), 46 | c(L1.x, L2.unit, L2.reg) 47 | ) 48 | 49 | # Parallel processing 50 | if (cores > 1) { 51 | lambda_errors <- run_lasso_mc_lambda( 52 | y = y, L1.x = L1.x, L2.x = L2.x, L2.unit = L2.unit, L2.reg = L2.reg, 53 | loss.unit = loss.unit, loss.fun = loss.fun, data = data, 54 | cores = cores, L2.fe.form = L2_fe_form, L1.re = L1_re, 55 | lambda = lambda 56 | ) 57 | } else { 58 | 59 | # Train and evaluate each model 60 | lambda_errors <- lapply(seq_along(lambda), function(l) { 61 | 62 | # Print lambda value 63 | if (isTRUE(verbose)) { 64 | L <- length(lambda) 65 | cat(paste( 66 | "Lasso: Running lambda w/ value ", lambda[l], 67 | " (lambda ", l, " out of max. 
", 68 | L, " lambdas)\n", sep = "" 69 | )) 70 | } 71 | 72 | # Loop over each fold 73 | k_errors <- lapply(seq_along(data), function(k) { 74 | # Split data in training and validation sets 75 | data_train <- dplyr::bind_rows(data[-k]) 76 | data_valid <- dplyr::bind_rows(data[k]) 77 | 78 | # Convert individual-level, geographic unit, and geographic region 79 | # covariates to factor variables in training and validation sets 80 | data_train <- data_train %>% 81 | dplyr::mutate_at(.vars = c(L1.x, L2.unit, L2.reg), as.factor) %>% 82 | dplyr::select(dplyr::all_of(c(y, L1.x, L2.x, L2.unit, L2.reg))) %>% 83 | tidyr::drop_na() 84 | 85 | data_valid <- data_valid %>% 86 | dplyr::mutate_at(.vars = c(L1.x, L2.unit, L2.reg), as.factor) %>% 87 | dplyr::select(dplyr::all_of(c(y, L1.x, L2.x, L2.unit, L2.reg))) %>% 88 | tidyr::drop_na() 89 | 90 | # Train model using lambda value on kth training set 91 | model_l <- lasso_classifier( 92 | y = y, 93 | L2.fix = L2_fe_form, 94 | L1.re = L1_re, 95 | data.train = data_train, 96 | lambda = lambda[l], 97 | model.family = binomial(link = "probit"), 98 | verbose = verbose 99 | ) 100 | 101 | # Use trained model to make predictions for kth validation set 102 | pred_l <- stats::predict(model_l, newdata = data.frame(data_valid)) 103 | 104 | # Evaluate predictions based on loss function 105 | perform_l <- loss_function( 106 | pred = pred_l, 107 | data.valid = data_valid, 108 | loss.unit = loss.unit, 109 | loss.fun = loss.fun, 110 | y = y, 111 | L2.unit = L2.unit 112 | ) 113 | }) 114 | 115 | # Mean over loss functions 116 | k_errors <- dplyr::bind_rows(k_errors) %>% 117 | dplyr::group_by(measure) %>% 118 | dplyr::summarise(value = mean(value), .groups = "drop") %>% 119 | dplyr::mutate(lambda = lambda[l]) 120 | }) 121 | } 122 | # Extract best tuning parameters 123 | grid_cells <- dplyr::bind_rows(lambda_errors) 124 | best_params <- dplyr::slice( 125 | loss_score_ranking(score = grid_cells, loss.fun = loss.fun), 1 126 | ) 127 | 128 | # Choose best-performing model 129 | out <- dplyr::pull(.data = best_params, var = lambda) 130 | 131 | return(out) 132 | 133 | } 134 | 135 | 136 | ################################################################################ 137 | # Multicore tuning for lasso parallel across lambda values # 138 | ################################################################################ 139 | #' Lasso multicore tuning. 140 | #' 141 | #' \code{run_lasso_mc_lambda} is called from within \code{run_lasso}. It 142 | #' tunes using multiple cores. 143 | #' 144 | #' @inheritParams auto_MrP 145 | #' @inheritParams run_lasso 146 | #' @param L2.fe.form The fixed effects part of the Lasso classifier formula. The 147 | #' formula is inherited from \code{run_lasso}. 148 | #' @param L1.re A list of random effects for the Lasso classifier formula. The 149 | #' formula is inherited from \code{run_lasso}. 150 | #' @return The cross-validation errors for all models. A list. 
151 | 152 | run_lasso_mc_lambda <- function( 153 | y, L1.x, L2.x, L2.unit, L2.reg, loss.unit, loss.fun, data, 154 | cores, L2.fe.form, L1.re, lambda 155 | ) { 156 | 157 | # Binding for global variables 158 | `%>%` <- dplyr::`%>%` 159 | l <- NULL 160 | 161 | # Register cores 162 | cl <- multicore(cores = cores, type = "open", cl = NULL) 163 | 164 | # Loop over each lambda value 165 | lambda_errors <- foreach::foreach(l = seq_along(lambda)) %dorng% { 166 | 167 | # Set lambda value for the current iteration 168 | lambda_value <- lambda[l] 169 | 170 | # Loop over each fold 171 | k_errors <- lapply(seq_along(data), function(k) { 172 | # Split data in training and validation sets 173 | data_train <- dplyr::bind_rows(data[-k]) 174 | data_valid <- dplyr::bind_rows(data[k]) 175 | 176 | # Convert individual-level, geographic unit, and geographic region 177 | # covariates to factor variables in training and validation sets 178 | data_train <- data_train %>% 179 | dplyr::mutate_at(.vars = c(L1.x, L2.unit, L2.reg), as.factor) %>% 180 | dplyr::select(dplyr::all_of(c(y, L1.x, L2.x, L2.unit, L2.reg))) %>% 181 | tidyr::drop_na() 182 | 183 | data_valid <- data_valid %>% 184 | dplyr::mutate_at(.vars = c(L1.x, L2.unit, L2.reg), as.factor) %>% 185 | dplyr::select(dplyr::all_of(c(y, L1.x, L2.x, L2.unit, L2.reg))) %>% 186 | tidyr::drop_na() 187 | 188 | # Train model using lambda value on kth training set 189 | model_l <- lasso_classifier( 190 | y = y, 191 | L2.fix = L2.fe.form, 192 | L1.re = L1.re, 193 | data.train = data_train, 194 | lambda = lambda_value, 195 | model.family = binomial(link = "probit"), 196 | verbose = FALSE 197 | ) 198 | 199 | # Use trained model to make predictions for kth validation set 200 | pred_l <- stats::predict(model_l, newdata = data.frame(data_valid)) 201 | 202 | # Evaluate predictions based on loss function 203 | perform_l <- loss_function( 204 | pred = pred_l, 205 | data.valid = data_valid, 206 | loss.unit = loss.unit, 207 | loss.fun = loss.fun, 208 | y = y, L2.unit = L2.unit 209 | ) 210 | }) 211 | 212 | # Mean over loss functions 213 | k_errors <- dplyr::bind_rows(k_errors) %>% 214 | dplyr::group_by(measure) %>% 215 | dplyr::summarise(value = mean(value), .groups = "drop") %>% 216 | dplyr::mutate(lambda = lambda[l]) 217 | 218 | } 219 | 220 | # De-register cluster 221 | multicore(cores = cores, type = "close", cl = cl) 222 | 223 | # Function output 224 | return(lambda_errors) 225 | 226 | } 227 | -------------------------------------------------------------------------------- /R/run_pca.R: -------------------------------------------------------------------------------- 1 | #' Apply PCA classifier to MrP. 2 | #' 3 | #' \code{run_pca} is a wrapper function that applies the PCA classifier to data 4 | #' provided by the user, evaluates prediction performance, and chooses the 5 | #' best-performing model. 6 | #' 7 | #' @inheritParams auto_MrP 8 | #' @param data Data for cross-validation. A \code{list} of \eqn{k} 9 | #' \code{data.frames}, one for each fold to be used in \eqn{k}-fold 10 | #' cross-validation. 11 | #' 12 | #' @return A model formula of the winning PCA classifier model.
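# ---------------------------------------------------------------------------
# Illustrative sketch (not package code): run_pca() evaluates best-subset
# style models in which principal components of the context-level covariates
# stand in for the raw L2.x variables (the model list is built internally by
# model_list_pca()). A hand-rolled version of such component scores on the
# bundled survey data could look as follows; the use of prcomp() with
# centering and scaling is an assumption for illustration, not necessarily
# how the package computes its components.
# ---------------------------------------------------------------------------
data(survey_item, package = "autoMrP")
pca_fit <- stats::prcomp(
  survey_item[, paste0("L2.x", 1:6)],
  center = TRUE, scale. = TRUE
)
head(pca_fit$x) # component scores standing in for the L2.x predictors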
13 | 14 | run_pca <- function( 15 | y, L1.x, L2.x, L2.unit, L2.reg, loss.unit, loss.fun, data, cores, 16 | verbose 17 | ) { 18 | 19 | # List of all models to be evaluated 20 | models <- model_list_pca( 21 | y = y, 22 | L1.x = L1.x, 23 | L2.x = L2.x, 24 | L2.unit = L2.unit, 25 | L2.reg = L2.reg 26 | ) 27 | 28 | # parallel tuning if cores > 1 29 | if (cores > 1) { 30 | 31 | # Train all models in parallel 32 | m_errors <- run_best_subset_mc( 33 | verbose = verbose, 34 | models = models, 35 | data = data, 36 | loss.unit = loss.unit, 37 | loss.fun = loss.fun, 38 | y = y, 39 | L1.x = L1.x, 40 | L2.x = L2.x, 41 | L2.unit = L2.unit, 42 | L2.reg = L2.reg, 43 | cores = cores 44 | ) 45 | } else { 46 | # Train and evaluate each model 47 | m_errors <- lapply(seq_along(models), function(m) { 48 | # Print model m 49 | if (isTRUE(verbose)) { 50 | M <- length(models) 51 | message( 52 | "PCA: Running model ", m, 53 | " out of ", M, " models\n") 54 | } 55 | 56 | # Loop over each fold 57 | k_errors <- lapply(seq_along(data), function(k) { 58 | # Split data in training and validation sets 59 | data_train <- dplyr::bind_rows(data[-k]) 60 | data_valid <- dplyr::bind_rows(data[k]) 61 | 62 | # Train mth model on kth training set 63 | model_m <- best_subset_classifier( 64 | y = y, 65 | model = models[[m]], 66 | data.train = data_train, 67 | model.family = binomial(link = "probit"), 68 | model.optimizer = "bobyqa", 69 | n.iter = 1000000, 70 | verbose = verbose 71 | ) 72 | 73 | # Use trained model to make predictions for kth validation set 74 | pred_m <- stats::predict( 75 | model_m, newdata = data_valid, 76 | type = "response", allow.new.levels = TRUE 77 | ) 78 | 79 | # Evaluate predictions based on loss function 80 | perform_m <- loss_function( 81 | pred = pred_m, 82 | data.valid = data_valid, 83 | loss.unit = loss.unit, 84 | loss.fun = loss.fun, 85 | y = y, 86 | L2.unit = L2.unit 87 | ) 88 | }) 89 | 90 | # Mean over loss functions 91 | k_errors <- dplyr::bind_rows(k_errors) %>% 92 | dplyr::group_by(measure) %>% 93 | dplyr::summarise(value = mean(value), .groups = "drop") %>% 94 | dplyr::mutate(model = m) 95 | }) 96 | } 97 | 98 | # Extract best tuning parameters 99 | grid_cells <- dplyr::bind_rows(m_errors) 100 | best_params <- dplyr::slice( 101 | loss_score_ranking(score = grid_cells, loss.fun = loss.fun), 1 102 | ) 103 | 104 | # Choose best-performing model 105 | out <- models[[dplyr::pull(.data = best_params, var = model)]] 106 | 107 | # Function output 108 | return(out) 109 | 110 | } 111 | -------------------------------------------------------------------------------- /R/run_svm.R: -------------------------------------------------------------------------------- 1 | #' Apply support vector machine classifier to MrP. 2 | #' 3 | #' \code{run_svm} is a wrapper function that applies the support vector machine 4 | #' classifier to data provided by the user, evaluates prediction performance, 5 | #' and chooses the best-performing model. 6 | #' 7 | #' @inheritParams auto_MrP 8 | #' @param L2.eval.unit Geographic unit for the loss function. A character scalar 9 | #' containing the column name of the geographic unit in \code{survey} and 10 | #' \code{census}. 11 | #' @param L2.reg Geographic region. A character scalar containing the column 12 | #' name of the geographic region in \code{survey} and \code{census} by which 13 | #' geographic units are grouped (\code{L2.unit} must be nested within 14 | #' \code{L2.reg}). Default is \code{NULL}. 15 | #' @param loss.fun Loss function.
A character-valued scalar indicating whether 16 | #' prediction loss should be measured by the mean squared error (\code{MSE}) 17 | #' or the mean absolute error (\code{MAE}). Default is \code{MSE}. 18 | #' @param kernel SVM kernel. A character-valued scalar specifying the kernel to 19 | #' be used by SVM. The possible values are \code{linear}, \code{polynomial}, 20 | #' \code{radial}, and \code{sigmoid}. Default is \code{radial}. 21 | #' @param gamma SVM kernel parameter. A numeric vector whose values specify the 22 | #' gamma parameter in the SVM kernel. This parameter is needed for all kernel 23 | #' types except linear. Default is a sequence with minimum = 1e-5, maximum = 24 | #' 1e-1, and length = 20 that is equally spaced on the log-scale. 25 | #' @param cost SVM cost parameter. A numeric vector whose values specify the 26 | #' cost of constraints violation in SVM. Default is a sequence with minimum = 27 | #' 0.5, maximum = 10, and length = 5 that is equally spaced on the log-scale. 28 | #' @param data Data for cross-validation. A \code{list} of \eqn{k} 29 | #' \code{data.frames}, one for each fold to be used in \eqn{k}-fold 30 | #' cross-validation. 31 | #' 32 | #' @return The support vector machine tuned parameters. A list. 33 | 34 | run_svm <- function( 35 | y, L1.x, L2.x, L2.eval.unit, L2.unit, L2.reg, 36 | kernel = "radial", loss.fun, loss.unit, gamma, 37 | cost, data, verbose, cores 38 | ) { 39 | 40 | # Create model formula 41 | x <- paste(c(L1.x, L2.x, L2.unit, L2.reg), collapse = " + ") 42 | form <- as.formula(paste(y, " ~ ", x, sep = "")) 43 | 44 | # Default Gamma values 45 | if (is.null(gamma)) { 46 | # SVM Gamma values 47 | gamma <- log_spaced(min = 1e-5, 1e-1, n = 20) 48 | } 49 | 50 | # Default Cost values 51 | if (is.null(cost)) { 52 | cost <- log_spaced(min = 0.5, max = 10, n = 5) 53 | } 54 | 55 | # tuning parameter grid 56 | svm_grid <- expand.grid(gamma, cost, kernel) 57 | names(svm_grid) <- c("gamma", "cost", "kernel") 58 | 59 | # parallel tuning if cores > 1 60 | if (cores > 1) { 61 | 62 | # Train all models in parallel 63 | grid_cells <- run_svm_mc( 64 | verbose = verbose, 65 | svm.grid = svm_grid, 66 | data = data, 67 | L2.eval.unit = L2.eval.unit, 68 | loss.unit = loss.unit, 69 | loss.fun = loss.fun, 70 | y = y, 71 | L1.x = L1.x, 72 | L2.x = L2.x, 73 | L2.unit = L2.unit, 74 | L2.reg = L2.reg, 75 | form = form, 76 | cores = cores 77 | ) 78 | 79 | # Train all models sequentially 80 | } else { 81 | # loop over tuning grid 82 | grid_cells <- apply(svm_grid, 1, function(g) { 83 | 84 | # Set tuning parameters 85 | gamma_value <- as.numeric(g["gamma"]) 86 | cost_value <- as.numeric(g["cost"]) 87 | kernel_value <- as.character(g[["kernel"]]) 88 | 89 | # Loop over each fold 90 | k_errors <- lapply(seq_along(data), function(k) { 91 | 92 | # Split data in training and validation sets and factorize DV 93 | data_train <- dplyr::bind_rows(data[-k]) %>% 94 | dplyr::mutate_at(.vars = y, as.factor) %>% 95 | dplyr::select(dplyr::all_of( 96 | c(y, L1.x, L2.x, L2.eval.unit, L2.reg) 97 | )) %>% 98 | tidyr::drop_na() 99 | 100 | data_valid <- dplyr::bind_rows(data[k]) %>% 101 | dplyr::mutate_at(.vars = y, as.factor) %>% 102 | dplyr::select(dplyr::all_of( 103 | c(y, L1.x, L2.x, L2.eval.unit, L2.reg) 104 | )) %>% 105 | tidyr::drop_na() 106 | 107 | # SVM classifier 108 | model_l <- svm_classifier( 109 | y = y, 110 | form = form, 111 | data = data_train, 112 | kernel = kernel_value, 113 | type = "C-classification", 114 | probability = TRUE, 115 | svm.gamma = gamma_value, 116 | svm.cost =
cost_value, 117 | verbose = verbose 118 | ) 119 | 120 | # Use trained model to make predictions for kth validation set 121 | pred_l <- predict( 122 | model_l, newdata = data.frame(data_valid), 123 | probability = TRUE 124 | ) 125 | if (!is.null(attr(pred_l, "probabilities"))) { 126 | pred_l <- as.numeric(attr(pred_l, "probabilities")[, "1"]) 127 | } 128 | 129 | # Transform factor DV to numeric for loss function 130 | data_valid <- data_valid %>% 131 | dplyr::mutate_at(.vars = y, function(x) as.numeric(levels(x))[x]) 132 | 133 | # Evaluate predictions based on loss function 134 | perform_l <- loss_function( 135 | pred = pred_l, data.valid = data_valid, 136 | loss.unit = loss.unit, 137 | loss.fun = loss.fun, 138 | y = y, L2.unit = L2.eval.unit 139 | ) 140 | }) 141 | 142 | # Mean over loss functions 143 | k_errors <- dplyr::bind_rows(k_errors) %>% 144 | dplyr::group_by(measure) %>% 145 | dplyr::summarise(value = mean(value), .groups = "drop") %>% 146 | dplyr::mutate( 147 | gamma = gamma_value, 148 | cost = cost_value, 149 | kernel = kernel_value 150 | ) 151 | 152 | }) 153 | } 154 | 155 | # Extract best tuning parameters 156 | grid_cells <- dplyr::bind_rows(grid_cells) 157 | best_params <- dplyr::slice( 158 | loss_score_ranking(score = grid_cells, loss.fun = loss.fun), 1 159 | ) 160 | 161 | out <- list( 162 | gamma = dplyr::pull(.data = best_params, var = gamma), 163 | cost = dplyr::pull(.data = best_params, var = cost), 164 | kernel = dplyr::pull(.data = best_params, var = kernel) 165 | ) 166 | 167 | # Function output 168 | return(out) 169 | 170 | } 171 | 172 | ################################################################################ 173 | # Multicore tuning for svm # 174 | ################################################################################ 175 | #' SVM multicore tuning. 176 | #' 177 | #' \code{run_svm_mc} is called from within \code{run_svm}. It tunes using 178 | #' multiple cores. 179 | #' 180 | #' @inheritParams run_svm 181 | #' @param form The model formula. A formula object. 182 | #' @param svm.grid The hyper-parameter search grid. A data.frame of all 183 | #' hyper-parameter combinations. 184 | #' @return The cross-validation errors for all models. A list.
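# ---------------------------------------------------------------------------
# Illustrative sketch (not package code): the default SVM tuning grid that
# run_svm() constructs above -- 20 log-spaced gamma values crossed with 5
# log-spaced cost values and the kernel, i.e. 100 grid cells, each of which
# is scored by k-fold cross-validation. log_spaced() is an unexported
# helper, hence the :::.
# ---------------------------------------------------------------------------
svm_grid <- expand.grid(
  gamma = autoMrP:::log_spaced(min = 1e-5, max = 1e-1, n = 20),
  cost = autoMrP:::log_spaced(min = 0.5, max = 10, n = 5),
  kernel = "radial"
)
nrow(svm_grid) # 100 candidate gamma/cost combinations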
185 | 186 | run_svm_mc <- function( 187 | y, L1.x, L2.x, L2.eval.unit, L2.unit, L2.reg, form, 188 | loss.unit, loss.fun, data, cores, svm.grid, verbose 189 | ) { 190 | 191 | # Binding for global variables 192 | g <- NULL 193 | `%>%` <- dplyr::`%>%` 194 | 195 | # Register cores 196 | cl <- multicore(cores = cores, type = "open", cl = NULL) 197 | 198 | # Train and evaluate each model 199 | grid_cells <- foreach::foreach( 200 | g = seq_len(nrow(svm.grid)), .packages = "autoMrP" 201 | ) %dorng% { 202 | 203 | # Set tuning parameters 204 | gamma_value <- as.numeric(svm.grid[g, "gamma"]) 205 | cost_value <- as.numeric(svm.grid[g, "cost"]) 206 | kernel_value <- svm.grid[g, "kernel"] 207 | 208 | # Loop over each fold 209 | k_errors <- lapply(seq_along(data), function(k) { 210 | 211 | # Split data in training and validation sets and factorize DV 212 | data_train <- dplyr::bind_rows(data[-k]) %>% 213 | dplyr::mutate_at(.vars = y, as.factor) %>% 214 | dplyr::select(dplyr::all_of( 215 | c(y, L1.x, L2.x, L2.eval.unit, L2.reg) 216 | )) %>% 217 | tidyr::drop_na() 218 | 219 | data_valid <- dplyr::bind_rows(data[k]) %>% 220 | dplyr::mutate_at(.vars = y, as.factor) %>% 221 | dplyr::select(dplyr::all_of( 222 | c(y, L1.x, L2.x, L2.eval.unit, L2.reg) 223 | )) %>% 224 | tidyr::drop_na() 225 | 226 | # SVM classifier 227 | model_l <- svm_classifier( 228 | y = y, 229 | form = form, 230 | data = data_train, 231 | kernel = kernel_value, 232 | type = "C-classification", 233 | probability = TRUE, 234 | svm.gamma = gamma_value, 235 | svm.cost = cost_value, 236 | verbose = verbose 237 | ) 238 | 239 | # Use trained model to make predictions for kth validation set 240 | pred_l <- predict( 241 | model_l, newdata = data.frame(data_valid), 242 | probability = TRUE 243 | ) 244 | if (!is.null(attr(pred_l, "probabilities"))) { 245 | pred_l <- as.numeric(attr(pred_l, "probabilities")[, "1"]) 246 | } 247 | 248 | # Transform factor DV to numeric for loss function 249 | data_valid <- data_valid %>% 250 | dplyr::mutate_at(.vars = y, function(x) as.numeric(levels(x))[x]) 251 | 252 | # Evaluate predictions based on loss function 253 | perform_l <- loss_function( 254 | pred = pred_l, 255 | data.valid = data_valid, 256 | loss.unit = loss.unit, 257 | loss.fun = loss.fun, 258 | y = y, 259 | L2.unit = L2.eval.unit 260 | ) 261 | 262 | return(perform_l) 263 | }) 264 | 265 | # Mean over loss functions 266 | k_errors <- dplyr::bind_rows(k_errors) %>% 267 | dplyr::group_by(measure) %>% 268 | dplyr::summarise(value = mean(value), .groups = "drop") %>% 269 | dplyr::mutate( 270 | gamma = gamma_value, 271 | cost = cost_value, 272 | kernel = kernel_value 273 | ) 274 | } 275 | 276 | # De-register cluster 277 | multicore(cores = cores, type = "close", cl = cl) 278 | 279 | # Function output 280 | return(grid_cells) 281 | } 282 | -------------------------------------------------------------------------------- /R/survey_data.R: -------------------------------------------------------------------------------- 1 | #' A sample of a survey item from the CCES 2008 2 | #' 3 | #' The Cooperative Congressional Election Studies (CCES) item (cc418_1) asked: 4 | #' "Would you approve of the use of U.S. military troops in order to ensure the 5 | #' supply of oil?" The original 2008 CCES item contains 36,832 respondents. This 6 | #' sample mimics a typical national survey. It contains at least 5 respondents 7 | #' from each state but is otherwise a random sample.
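#' @examples
#' # Illustrative usage sketch: inspect the bundled item before passing it
#' # to auto_MrP() as the `survey` argument.
#' data(survey_item)
#' table(survey_item$YES)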
8 | #' 9 | #' @format A data frame with 1500 rows and 13 variables: 10 | #' \describe{ 11 | #' \item{YES}{1 if individual supports use of troops; 0 otherwise} 12 | #' \item{L1x1}{Age group (four categories: 1 = 18-29; 2 = 30-44; 3 = 45-64; 4 = 65+)} 13 | #' \item{L1x2}{Education level (four categories: 1 = < high school; 2 = high school graduate; 3 = some college; 4 = college graduate)} 14 | #' \item{L1x3}{Gender-race combination (six categories: 1 = white male; 2 = black male; 3 = hispanic male; 4 = white female; 5 = black female; 6 = hispanic female)} 15 | #' \item{state}{U.S. state} 16 | #' \item{L2.unit}{U.S. state id} 17 | #' \item{region}{U.S. region (four categories: 1 = Northeast; 2 = Midwest; 3 = South; 4 = West)} 18 | #' \item{L2.x1}{Normalized state-level share of votes for the Republican candidate in the previous presidential election} 19 | #' \item{L2.x2}{Normalized state-level percentage of Evangelical Protestant or Mormon respondents} 20 | #' \item{L2.x3}{Normalized state-level percentage of the population living in urban areas} 21 | #' \item{L2.x4}{Normalized state-level unemployment rate} 22 | #' \item{L2.x5}{Normalized state-level share of Hispanics} 23 | #' \item{L2.x6}{Normalized state-level share of Whites} 24 | #' } 25 | #' @source The data set (excluding L2.x3, L2.x4, L2.x5, L2.x6) is taken from the 26 | #' article: Buttice, Matthew K, and Benjamin Highton. 2013. "How does 27 | #' multilevel regression and poststratification perform with 28 | #' conventional national surveys?" Political Analysis 21(4): 449-467. It is a 29 | #' random sample with at least 5 respondents per state. L2.x3, L2.x4, 30 | #' L2.x5 and L2.x6 are available at \url{https://www.census.gov}. 31 | "survey_item" 32 | -------------------------------------------------------------------------------- /R/svm_classifier.R: -------------------------------------------------------------------------------- 1 | #' SVM classifier 2 | #' 3 | #' \code{svm_classifier} applies support vector machine classification to a 4 | #' data set. 5 | #' 6 | #' @inheritParams auto_MrP 7 | #' @param form Model formula. A two-sided linear formula describing 8 | #' the model to be fit, with the outcome on the LHS and the covariates 9 | #' separated by + operators on the RHS. 10 | #' @param data Data. A data.frame containing the cross-validation data used to 11 | #' train and evaluate the model. 12 | #' @param kernel Kernel for SVM. A character string specifying the kernel to 13 | #' be used for SVM. The possible types are linear, polynomial, radial, and 14 | #' sigmoid. Default is radial. 15 | #' @param type svm can be used as a classification machine, as a regression 16 | #' machine, or for novelty detection. Depending on whether y is a factor or 17 | #' not, the default setting for type is C-classification or eps-regression, 18 | #' respectively, but may be overwritten by setting an explicit value. Valid 19 | #' options are: \enumerate{ 20 | #' \item C-classification 21 | #' \item nu-classification 22 | #' \item one-classification (for novelty detection) 23 | #' \item eps-regression 24 | #' \item nu-regression 25 | #' } 26 | #' @param probability Probability predictions. A logical argument indicating 27 | #' whether the model should allow for probability predictions. 28 | #' @param svm.gamma Gamma parameter for SVM. This parameter is needed for all 29 | #' kernels except linear. 30 | #' @param svm.cost Cost parameter for SVM. This parameter specifies the cost of 31 | #' constraints violation.
32 | #' @param verbose Verbose output. A logical vector indicating whether or not 33 | #' verbose output should be printed. 34 | #' @return The support vector machine model. An \code{\link[e1071]{svm}} object. 35 | 36 | svm_classifier <- function( 37 | y, form, data, kernel, type, probability, svm.gamma, 38 | svm.cost, verbose = c(TRUE, FALSE) 39 | ) { 40 | 41 | # Determine type of dependent variable 42 | if ( 43 | data %>% 44 | dplyr::pull(!!y) %>% 45 | unique() %>% 46 | length() > 2 47 | ) { 48 | # set type 49 | type <- "eps-regression" 50 | # numeric dv 51 | data <- data %>% 52 | dplyr::mutate_at(.vars = y, function(x) as.numeric(levels(x))[x]) 53 | } 54 | 55 | # Train and evaluate model using the supplied set of tuning parameters 56 | if (isTRUE(verbose == TRUE)) { 57 | out <- e1071::svm( 58 | formula = form, 59 | data = data, 60 | type = type, 61 | kernel = kernel, 62 | gamma = svm.gamma, 63 | cost = svm.cost, 64 | probability = probability 65 | ) 66 | } else { 67 | out <- suppressMessages(suppressWarnings( 68 | e1071::svm( 69 | formula = form, 70 | data = data, 71 | type = type, 72 | kernel = kernel, 73 | gamma = svm.gamma, 74 | cost = svm.cost, 75 | probability = probability 76 | ) 77 | )) 78 | } 79 | 80 | # Function output 81 | return(out) 82 | } 83 | -------------------------------------------------------------------------------- /R/taxes_census.R: -------------------------------------------------------------------------------- 1 | #' Quasi census data. 2 | #' 3 | #' The census file is generated from the full 2008 National Annenberg Election 4 | #' Studies item CBb01 by disaggregating the 64 ideal type combinations of the 5 | #' individual level variables L1x1, L1x2 and L1x3. A row is an ideal type in a 6 | #' given state. 7 | #' 8 | #' 9 | #' @format A data frame with 2934 rows and 14 variables: 10 | #' \describe{ 11 | #' \item{state}{U.S. state} 12 | #' \item{L2.unit}{U.S. state id} 13 | #' \item{region}{U.S. region (four categories: 1 = Northeast; 2 = Midwest; 3 = South; 4 = West)} 14 | #' \item{L1x1}{Age group (four categories)} 15 | #' \item{L1x2}{Education level (four categories)} 16 | #' \item{L1x3}{Gender-race combination (six categories)} 17 | #' \item{freq}{State-level frequency of ideal type} 18 | #' \item{proportion}{State-level proportion of respondents of that ideal type in the population} 19 | #' \item{L2.x1}{State-level share of votes for the Republican candidate in the previous presidential election} 20 | #' \item{L2.x2}{State-level percentage of Evangelical Protestant or Mormon respondents} 21 | #' \item{L2.x3}{State-level percentage of the population living in urban areas} 22 | #' \item{L2.x4}{State-level unemployment rate} 23 | #' \item{L2.x5}{State-level share of Hispanics} 24 | #' \item{L2.x6}{State-level share of Whites} 25 | #' } 26 | #' @usage data(taxes_census) 27 | #' @source The data set (excluding L2.x3, L2.x4, L2.x5, L2.x6) is taken from the 28 | #' article: Buttice, Matthew K, and Benjamin Highton. 2013. "How does 29 | #' multilevel regression and poststratification perform with 30 | #' conventional national surveys?" Political Analysis 21(4): 449-467. L2.x3, 31 | #' L2.x4, L2.x5 and L2.x6 are available at 32 | #' \url{https://www.census.gov}. 33 | "taxes_census" 34 | -------------------------------------------------------------------------------- /R/taxes_survey.R: -------------------------------------------------------------------------------- 1 | #' Sample on raising taxes from the 2008 National Annenberg Election Studies.
2 | #' 3 | #' The 2008 National Annenberg Election Studies (NAES) item (CBb01) asked: "I'm 4 | #' going to read you some options about federal income taxes. Please tell me 5 | #' which one comes closest to your view on what we should be doing about federal 6 | #' income taxes: (1) Cut taxes; (2) Keep taxes as they are; (3) Raise taxes if 7 | #' necessary; (4) None of these; (998) Don't know; (999) No answer." Category (3) 8 | #' was turned into a 'raise taxes response,' categories (1) and (2) were 9 | #' combined into a 'do not raise taxes' response. The original item from the 10 | #' phone and online surveys contains 50,483 respondents. This sample mimics a 11 | #' typical national survey. It contains at least 5 respondents from each state 12 | #' but is otherwise a random sample. 13 | #' 14 | #' 15 | #' @format A data frame with 1500 rows and 13 variables: 16 | #' \describe{ 17 | #' \item{YES}{1 if individual supports raising taxes; 0 otherwise} 18 | #' \item{L1x1}{Age group (four categories: 1 = 18-29; 2 = 30-44; 3 = 45-64; 4 = 65+)} 19 | #' \item{L1x2}{Education level (four categories: 1 = < high school; 2 = high school graduate; 3 = some college; 4 = college graduate)} 20 | #' \item{L1x3}{Gender-race combination (six categories: 1 = white male; 2 = black male; 3 = hispanic male; 4 = white female; 5 = black female; 6 = hispanic female)} 21 | #' \item{state}{U.S. state} 22 | #' \item{L2.unit}{U.S. state id} 23 | #' \item{region}{U.S. region (four categories: 1 = Northeast; 2 = Midwest; 3 = South; 4 = West)} 24 | #' \item{L2.x1}{State-level share of votes for the Republican candidate in the previous presidential election} 25 | #' \item{L2.x2}{State-level percentage of Evangelical Protestant or Mormon respondents} 26 | #' \item{L2.x3}{State-level percentage of the population living in urban areas} 27 | #' \item{L2.x4}{State-level unemployment rate} 28 | #' \item{L2.x5}{State-level share of Hispanics} 29 | #' \item{L2.x6}{State-level share of Whites} 30 | #' } 31 | #' @usage data(taxes_survey) 32 | #' @source The data set (excluding L2.x3, L2.x4, L2.x5, L2.x6) is taken from the 33 | #' article: Buttice, Matthew K, and Benjamin Highton. 2013. "How does 34 | #' multilevel regression and poststratification perform with 35 | #' conventional national surveys?" Political Analysis 21(4): 449-467. It is a 36 | #' random sample with at least 5 respondents per state. L2.x3, L2.x4, 37 | #' L2.x5 and L2.x6 are available at \url{https://www.census.gov}. 38 | "taxes_survey" 39 | -------------------------------------------------------------------------------- /R/taxes_truth.R: -------------------------------------------------------------------------------- 1 | #' Sample of tax rates item from the 2008 National Annenberg Election Studies. 2 | #' 3 | #' The 2008 National Annenberg Election Studies (NAES) item (CBb01) asked: "I'm 4 | #' going to read you some options about federal income taxes. Please tell me 5 | #' which one comes closest to your view on what we should be doing about federal 6 | #' income taxes: (1) Cut taxes; (2) Keep taxes as they are; (3) Raise taxes if 7 | #' necessary; (4) None of these; (998) Don't know; (999) No answer." Category (3) 8 | #' was turned into a 'raise taxes response,' categories (1) and (2) were 9 | #' combined into a 'do not raise taxes' response. The original item from the 10 | #' phone and online surveys contains 50,483 respondents. This sample mimics a 11 | #' typical national survey.
It contains at least 5 respondents from each state 12 | #' but is otherwise a random sample. 13 | #' 14 | #' 15 | #' @format A data frame with 1500 rows and 13 variables: 16 | #' \describe{ 17 | #' \item{YES}{1 if individual supports raising taxes; 0 otherwise} 18 | #' \item{L1x1}{Age group (four categories: 1 = 18-29; 2 = 30-44; 3 = 45-64; 4 = 65+)} 19 | #' \item{L1x2}{Education level (four categories: 1 = < high school; 2 = high school graduate; 3 = some college; 4 = college graduate)} 20 | #' \item{L1x3}{Gender-race combination (six categories: 1 = white male; 2 = black male; 3 = hispanic male; 4 = white female; 5 = black female; 6 = hispanic female)} 21 | #' \item{state}{U.S. state} 22 | #' \item{L2.unit}{U.S. state id} 23 | #' \item{region}{U.S. region (four categories: 1 = Northeast; 2 = Midwest; 3 = South; 4 = West)} 24 | #' \item{L2.x1}{State-level share of votes for the Republican candidate in the previous presidential election} 25 | #' \item{L2.x2}{State-level percentage of Evangelical Protestant or Mormon respondents} 26 | #' \item{L2.x3}{State-level percentage of the population living in urban areas} 27 | #' \item{L2.x4}{State-level unemployment rate} 28 | #' \item{L2.x5}{State-level share of Hispanics} 29 | #' \item{L2.x6}{State-level share of Whites} 30 | #' } 31 | #' @usage data(taxes_survey) 32 | #' @source The data set (excluding L2.x3, L2.x4, L2.x5, L2.x6) is taken from the 33 | #' article: Buttice, Matthew K, and Benjamin Highton. 2013. "How does 34 | #' multilevel regression and poststratification perform with 35 | #' conventional national surveys?" Political Analysis 21(4): 449-467. It is a 36 | #' random sample with at least 5 respondents per state. L2.x3, L2.x4, 37 | #' L2.x5 and L2.x6 are available at \url{https://www.census.gov}. 38 | "taxes_survey" 39 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # autoMrP 2 | 3 | autoMrP improves the prediction performance of multilevel regression with post-stratification (MrP) by combining a number of machine learning methods through ensemble Bayesian model averaging (EBMA). For more information, see: Broniecki, Leemann, and Wüest. 2022. "Improving Multilevel Regression with Post-Stratification Through Machine Learning (autoMrP)", published in the *Journal of Politics*: https://doi.org/10.1086/714777. 4 | 5 | ## Installation 6 | 7 | To install autoMrP from GitHub, run: 8 | 9 | ```R 10 | devtools::install_github("retowuest/autoMrP", build_vignettes = TRUE) 11 | ``` 12 | 13 | Please refer to the vignette for a detailed introduction to autoMrP.
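A minimal call, sketched below with the bundled NAES taxes data, gives a quick impression of the interface. This is an illustration rather than a quoted example from the vignette; the `bin.proportion` argument (the census column holding each ideal type's population share) and the choice of `"state"` as the geographic unit follow the package documentation but are assumptions in this sketch.

```R
library(autoMrP)

# Tune the classifiers on the survey sample, post-stratify on the quasi
# census, and combine the classifiers via EBMA (illustrative sketch only)
fit <- auto_MrP(
  y = "YES",
  L1.x = c("L1x1", "L1x2", "L1x3"),
  L2.x = c("L2.x1", "L2.x2", "L2.x3", "L2.x4", "L2.x5", "L2.x6"),
  L2.unit = "state",
  L2.reg = "region",
  bin.proportion = "proportion",
  survey = taxes_survey,
  census = taxes_census
)
summary(fit)
```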
Access the vignette via: 14 | 15 | ```R 16 | utils::browseVignettes("autoMrP") 17 | ``` 18 | -------------------------------------------------------------------------------- /autoMrP.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageCheckArgs: --as-cran 22 | PackageRoxygenize: rd,namespace,vignette 23 | -------------------------------------------------------------------------------- /data/absentee_census.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retowuest/autoMrP/1da422652719263acb41d4d3ca04e83d1bcf32f9/data/absentee_census.RData -------------------------------------------------------------------------------- /data/absentee_voting.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retowuest/autoMrP/1da422652719263acb41d4d3ca04e83d1bcf32f9/data/absentee_voting.RData -------------------------------------------------------------------------------- /data/census.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retowuest/autoMrP/1da422652719263acb41d4d3ca04e83d1bcf32f9/data/census.RData -------------------------------------------------------------------------------- /data/survey_item.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retowuest/autoMrP/1da422652719263acb41d4d3ca04e83d1bcf32f9/data/survey_item.RData -------------------------------------------------------------------------------- /data/taxes_census.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retowuest/autoMrP/1da422652719263acb41d4d3ca04e83d1bcf32f9/data/taxes_census.RData -------------------------------------------------------------------------------- /data/taxes_survey.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retowuest/autoMrP/1da422652719263acb41d4d3ca04e83d1bcf32f9/data/taxes_survey.RData -------------------------------------------------------------------------------- /man/.Rapp.history: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retowuest/autoMrP/1da422652719263acb41d4d3ca04e83d1bcf32f9/man/.Rapp.history -------------------------------------------------------------------------------- /man/absentee_census.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/absentee_census.R 3 | \docType{data} 4 | \name{absentee_census} 5 | \alias{absentee_census} 6 | \title{Quasi census data.} 7 | \format{ 8 | A data frame with 2934 rows and 13 variables: 9 | \describe{ 10 | \item{state}{U.S. state} 11 | \item{L2.unit}{U.S. state id} 12 | \item{region}{U.S. 
region (four categories: 1 = Northeast; 2 = Midwest; 3 = South; 4 = West)} 13 | \item{L1x1}{Age group (four categories)} 14 | \item{L1x2}{Education level (four categories)} 15 | \item{L1x3}{Gender-race combination (six categories)} 16 | \item{proportion}{State-level proportion of respondents of that ideal type in the population} 17 | \item{L2.x1}{State-level share of votes for the Republican candidate in the previous presidential election} 18 | \item{L2.x2}{State-level percentage of Evangelical Protestant or Mormon respondents} 19 | \item{L2.x3}{State-level percentage of the population living in urban areas} 20 | \item{L2.x4}{State-level unemployment rate} 21 | \item{L2.x5}{State-level share of Hispanics} 22 | \item{L2.x6}{State-level share of Whites} 23 | } 24 | } 25 | \source{ 26 | The data set (excluding L2.x3, L2.x4, L2.x5, L2.x6) is taken from the 27 | article: Buttice, Matthew K, and Benjamin Highton. 2013. "How does 28 | multilevel regression and poststratification perform with 29 | conventional national surveys?" Political Analysis 21(4): 449-467. L2.x3, 30 | L2.x4, L2.x5 and L2.x6 are available at 31 | \url{https://www.census.gov}. 32 | } 33 | \usage{ 34 | data(absentee_census) 35 | } 36 | \description{ 37 | The census file is generated from the full 2008 Cooperative Congressional Election Studies 38 | item cc419_1 by disaggregating the 64 ideal type combinations of the individual level variables 39 | L1x1, L1x2 and L1x3. A row is an ideal type in a given state. 40 | } 41 | \keyword{datasets} 42 | -------------------------------------------------------------------------------- /man/absentee_voting.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/absentee_voting.R 3 | \docType{data} 4 | \name{absentee_voting} 5 | \alias{absentee_voting} 6 | \title{A sample of the absentee voting item from the CCES 2008} 7 | \format{ 8 | A data frame with 1500 rows and 13 variables: 9 | \describe{ 10 | \item{YES}{1 if individual supports absentee voting over the Internet; 0 otherwise} 11 | \item{L1x1}{Age group (four categories: 1 = 18-29; 2 = 30-44; 3 = 45-64; 4 = 65+)} 12 | \item{L1x2}{Education level (four categories: 1 = < high school; 2 = high school graduate; 3 = some college; 4 = college graduate)} 13 | \item{L1x3}{Gender-race combination (six categories: 1 = white male; 2 = black male; 3 = hispanic male; 4 = white female; 5 = black female; 6 = hispanic female)} 14 | \item{state}{U.S. state} 15 | \item{L2.unit}{U.S. state id} 16 | \item{region}{U.S. region (four categories: 1 = Northeast; 2 = Midwest; 3 = South; 4 = West)} 17 | \item{L2.x1}{State-level share of votes for the Republican candidate in the previous presidential election} 18 | \item{L2.x2}{State-level percentage of Evangelical Protestant or Mormon respondents} 19 | \item{L2.x3}{State-level percentage of the population living in urban areas} 20 | \item{L2.x4}{State-level unemployment rate} 21 | \item{L2.x5}{State-level share of Hispanics} 22 | \item{L2.x6}{State-level share of Whites} 23 | } 24 | } 25 | \source{ 26 | The data set (excluding L2.x3, L2.x4, L2.x5, L2.x6) is taken from the 27 | article: Buttice, Matthew K, and Benjamin Highton. 2013. "How does 28 | multilevel regression and poststratification perform with 29 | conventional national surveys?" Political Analysis 21(4): 449-467. It is a 30 | random sample with at least 5 respondents per state.
L2.x3, L2.x4, 31 | L2.x5 and L2.x6 are available at \url{https://www.census.gov}. 32 | } 33 | \usage{ 34 | data(absentee_voting) 35 | } 36 | \description{ 37 | The Cooperative Congressional Election Studies (CCES) item (cc419_1) asked: 38 | "States have tried many new ways to run elections in recent years. Do you 39 | support or oppose any of the following ways of voting or conducting elections 40 | in your state? Election Reform - Allow absentee voting over the Internet?" 41 | The original 2008 CCES item contains 26,934 respondents. This sample mimics a 42 | typical national survey. It contains at least 5 respondents from each state 43 | but is otherwise a random sample. 44 | } 45 | \keyword{datasets} 46 | -------------------------------------------------------------------------------- /man/best_subset_classifier.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/best_subset_classifier.R 3 | \name{best_subset_classifier} 4 | \alias{best_subset_classifier} 5 | \title{Best subset classifier} 6 | \usage{ 7 | best_subset_classifier( 8 | model, 9 | data.train, 10 | model.family, 11 | model.optimizer, 12 | n.iter, 13 | y, 14 | verbose = c(TRUE, FALSE) 15 | ) 16 | } 17 | \arguments{ 18 | \item{model}{Multilevel model. A model formula describing the multilevel 19 | model to be estimated on the basis of the provided training data.} 20 | 21 | \item{data.train}{Training data. A data.frame containing the training data 22 | used to train the model.} 23 | 24 | \item{model.family}{Model family. A variable indicating the model family 25 | to be used by glmer. Defaults to binomial(link = "probit").} 26 | 27 | \item{model.optimizer}{Optimization method. A character-valued scalar 28 | describing the optimization method to be used by glmer. Defaults to 29 | "bobyqa".} 30 | 31 | \item{n.iter}{Iterations. An integer-valued scalar specifying the maximum 32 | number of function evaluations tried by the optimization method.} 33 | 34 | \item{y}{Outcome variable. A character vector containing the column names of 35 | the outcome variable. A character scalar containing the column name of 36 | the outcome variable in \code{survey}.} 37 | 38 | \item{verbose}{Verbose output. A logical vector indicating whether or not 39 | verbose output should be printed.} 40 | } 41 | \value{ 42 | The multilevel model. An \code{\link[lme4]{glmer}} object. 43 | } 44 | \description{ 45 | \code{best_subset_classifier} applies best subset classification to a data 46 | set. 47 | } 48 | -------------------------------------------------------------------------------- /man/binary_cross_entropy.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{binary_cross_entropy} 4 | \alias{binary_cross_entropy} 5 | \title{Estimates the inverse binary cross-entropy, i.e. 0 is the best score and 1 6 | the worst.} 7 | \usage{ 8 | binary_cross_entropy( 9 | pred, 10 | data.valid, 11 | loss.unit = c("individuals", "L2 units"), 12 | y, 13 | L2.unit 14 | ) 15 | } 16 | \arguments{ 17 | \item{pred}{Predictions of outcome. A numeric vector of outcome predictions.} 18 | 19 | \item{data.valid}{Test data set. A tibble of data that was not used for 20 | prediction.} 21 | 22 | \item{loss.unit}{Loss function unit.
A character-valued scalar indicating 23 | whether performance loss should be evaluated at the level of individual 24 | respondents (\code{individuals}) or geographic units (\code{L2 units}). 25 | Default is \code{individuals}.} 26 | 27 | \item{y}{Outcome variable. A character vector containing the column names of 28 | the outcome variable.} 29 | 30 | \item{L2.unit}{Geographic unit. A character scalar containing the column name 31 | of the geographic unit in \code{survey} and \code{census} at which outcomes 32 | should be aggregated.} 33 | } 34 | \value{ 35 | Returns a tibble containing two binary cross-entropy prediction 36 | errors. The first is measured at the level of individuals and the second is 37 | measured at the context level. The tibble dimensions are 2x3 with 38 | variables: measure, value and level. 39 | } 40 | \description{ 41 | \code{binary_cross_entropy()} estimates the inverse binary cross-entropy at 42 | the individual and state level. 43 | } 44 | -------------------------------------------------------------------------------- /man/census.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/census_data.R 3 | \docType{data} 4 | \name{census} 5 | \alias{census} 6 | \title{Quasi census data.} 7 | \format{ 8 | A data frame with 2934 rows and 13 variables: 9 | \describe{ 10 | \item{state}{U.S. state} 11 | \item{L2.unit}{U.S. state id} 12 | \item{region}{U.S. region (four categories: 1 = Northeast; 2 = Midwest; 3 = South; 4 = West)} 13 | \item{L1x1}{Age group (four categories)} 14 | \item{L1x2}{Education level (four categories)} 15 | \item{L1x3}{Gender-race combination (six categories)} 16 | \item{proportion}{State-level proportion of respondents of that ideal type in the population} 17 | \item{L2.x1}{State-level share of votes for the Republican candidate in the previous presidential election} 18 | \item{L2.x2}{State-level percentage of Evangelical Protestant or Mormon respondents} 19 | \item{L2.x3}{State-level percentage of the population living in urban areas} 20 | \item{L2.x4}{State-level unemployment rate} 21 | \item{L2.x5}{State-level share of Hispanics} 22 | \item{L2.x6}{State-level share of Whites} 23 | } 24 | } 25 | \source{ 26 | The data set (excluding L2.x3, L2.x4, L2.x5, L2.x6) is taken from the 27 | article: Buttice, Matthew K, and Benjamin Highton. 2013. "How does 28 | multilevel regression and poststratification perform with 29 | conventional national surveys?" Political Analysis 21(4): 449-467. L2.x3, 30 | L2.x4, L2.x5 and L2.x6 are available at 31 | \url{https://www.census.gov}. 32 | } 33 | \usage{ 34 | census 35 | } 36 | \description{ 37 | The census file is generated from the full 2008 Cooperative Congressional Election Studies 38 | item cc418_1 by disaggregating the 64 ideal type combinations of the individual level variables 39 | L1x1, L1x2 and L1x3. A row is an ideal type in a given state.
40 | } 41 | \keyword{datasets} 42 | -------------------------------------------------------------------------------- /man/cv_folding.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{cv_folding} 4 | \alias{cv_folding} 5 | \title{Generates folds for cross-validation} 6 | \usage{ 7 | cv_folding(data, L2.unit, k.folds, cv.sampling = c("individuals", "L2 units")) 8 | } 9 | \arguments{ 10 | \item{data}{The survey data; must be a tibble.} 11 | 12 | \item{L2.unit}{The column name of the factor variable identifying the 13 | context-level unit} 14 | 15 | \item{k.folds}{An integer value indicating the number of folds to be 16 | generated.} 17 | 18 | \item{cv.sampling}{Cross-validation sampling method. A character-valued 19 | scalar indicating whether cross-validation folds should be created by 20 | sampling individual respondents (\code{individuals}) or geographic units 21 | (\code{L2 units}). Default is \code{L2 units}. \emph{Note:} ignored if 22 | \code{folds} is provided, but must be specified otherwise.} 23 | } 24 | \value{ 25 | Returns a list with length specified by \code{k.folds} argument. Each 26 | element is a tibble with a fold used in k-fold cross-validation. 27 | } 28 | \description{ 29 | \code{cv_folding} creates folds used in classifier training within the survey 30 | data. 31 | } 32 | -------------------------------------------------------------------------------- /man/deep_mrp_classifier.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deep_mrp_classifier.r 3 | \name{deep_mrp_classifier} 4 | \alias{deep_mrp_classifier} 5 | \title{Deep MrP classifier} 6 | \usage{ 7 | deep_mrp_classifier(y, form, data, verbose) 8 | } 9 | \arguments{ 10 | \item{y}{Outcome variable. A character vector containing the column names of 11 | the outcome variable. A character scalar containing the column name of 12 | the outcome variable in \code{survey}.} 13 | 14 | \item{form}{Model formula. A two-sided linear formula describing 15 | the model to be fit, with the outcome on the LHS and the covariates 16 | separated by + operators on the RHS.} 17 | 18 | \item{data}{Data. A data.frame containing the data used to train the model.} 19 | 20 | \item{verbose}{Verbose output. A logical argument indicating whether or not 21 | verbose output should be printed. Default is \code{FALSE}.} 22 | } 23 | \value{ 24 | A Deep MrP model. A \code{\link[vglmer]{vglmer}} object. 25 | } 26 | \description{ 27 | \code{deep_mrp_classifier} applies Deep MrP implemented in the \pkg{vglmer} 28 | package to a data set. 29 | } 30 | -------------------------------------------------------------------------------- /man/ebma.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ebma.R 3 | \name{ebma} 4 | \alias{ebma} 5 | \title{Bayesian Ensemble Model Averaging EBMA} 6 | \usage{ 7 | ebma( 8 | ebma.fold, 9 | y, 10 | L1.x, 11 | L2.x, 12 | L2.unit, 13 | L2.reg, 14 | pc.names, 15 | post.strat, 16 | n.draws, 17 | tol, 18 | best.subset.opt, 19 | pca.opt, 20 | lasso.opt, 21 | gb.opt, 22 | svm.opt, 23 | deep.mrp, 24 | verbose, 25 | cores, 26 | preds_all 27 | ) 28 | } 29 | \arguments{ 30 | \item{ebma.fold}{New data for EBMA tuning. 
A list containing the data 31 | that must not have been used in classifier training.} 32 | 33 | \item{y}{Outcome variable. A character vector containing the column names of 34 | the outcome variable. A character scalar containing the column name of 35 | the outcome variable in \code{survey}.} 36 | 37 | \item{L1.x}{Individual-level covariates. A character vector containing the 38 | column names of the individual-level variables in \code{survey} and 39 | \code{census} used to predict outcome \code{y}. Note that geographic unit 40 | is specified in argument \code{L2.unit}.} 41 | 42 | \item{L2.x}{Context-level covariates. A character vector containing the 43 | column names of the context-level variables in \code{survey} and 44 | \code{census} used to predict outcome \code{y}. To exclude context-level 45 | variables, set \code{L2.x = NULL}.} 46 | 47 | \item{L2.unit}{Geographic unit. A character scalar containing the column 48 | name of the geographic unit in \code{survey} and \code{census} at which 49 | outcomes should be aggregated.} 50 | 51 | \item{L2.reg}{Geographic region. A character scalar containing the column 52 | name of the geographic region in \code{survey} and \code{census} by which 53 | geographic units are grouped (\code{L2.unit} must be nested within 54 | \code{L2.reg}). Default is \code{NULL}.} 55 | 56 | \item{pc.names}{Principal Component Variable names. A character vector 57 | containing the names of the context-level principal components variables.} 58 | 59 | \item{post.strat}{Post-stratification results. A list containing the best 60 | models for each of the tuned classifiers, the individual-level predictions 61 | on the classifier training data and the post-stratified context-level 62 | predictions.} 63 | 64 | \item{n.draws}{EBMA number of samples. An integer-valued scalar specifying 65 | the number of bootstrapped samples to be drawn from the EBMA fold and used 66 | for tuning EBMA. Default is \eqn{100}. Passed on from \code{ebma.n.draws}.} 67 | 68 | \item{tol}{EBMA tolerance. A numeric vector containing the tolerance values 69 | for improvements in the log-likelihood before the EM algorithm stops 70 | optimization. Values should range at least from \eqn{0.01} to \eqn{0.001}. 71 | Default is \code{c(0.01, 0.005, 0.001, 0.0005, 0.0001, 0.00005, 0.00001)}. 72 | Passed on from \code{ebma.tol}.} 73 | 74 | \item{best.subset.opt}{Tuned best subset parameters. A list returned from 75 | \code{run_best_subset()}.} 76 | 77 | \item{pca.opt}{Tuned best subset with principal components parameters. A list 78 | returned from \code{run_pca()}.} 79 | 80 | \item{lasso.opt}{Tuned lasso parameters. A list returned from 81 | \code{run_lasso()}.} 82 | 83 | \item{gb.opt}{Tuned gradient tree boosting parameters. A list returned from 84 | \code{run_gb()}.} 85 | 86 | \item{svm.opt}{Tuned support vector machine parameters. A list returned from 87 | \code{run_svm()}.} 88 | 89 | \item{deep.mrp}{Deep MRP classifier. A logical argument indicating whether 90 | the deep MRP classifier should be used for best subset prediction. Setting 91 | \code{deep.mrp = TRUE} will include all interactions of L1.x in the best 92 | subset classifier. Default is \code{FALSE}.} 93 | 94 | \item{verbose}{Verbose output. A logical argument indicating whether or not 95 | verbose output should be printed. Default is \code{FALSE}.} 96 | 97 | \item{cores}{The number of cores to be used. An integer indicating the number 98 | of processor cores used for parallel computing.
Default is 1.} 99 | } 100 | \description{ 101 | \code{ebma} tunes EBMA and generates weights for classifier averaging. 102 | } 103 | -------------------------------------------------------------------------------- /man/ebma_folding.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{ebma_folding} 4 | \alias{ebma_folding} 5 | \title{Generates data fold to be used for EBMA tuning} 6 | \usage{ 7 | ebma_folding(data, L2.unit, ebma.size) 8 | } 9 | \arguments{ 10 | \item{data}{The full survey data. A tibble.} 11 | 12 | \item{L2.unit}{Geographic unit. A character scalar containing the column name 13 | of the geographic unit in \code{survey} and \code{census} at which outcomes 14 | should be aggregated.} 15 | 16 | \item{ebma.size}{EBMA fold size. A number in the open unit interval 17 | indicating the proportion of respondents to be allocated to the EBMA fold. 18 | Default is \eqn{1/3}.} 19 | } 20 | \value{ 21 | Returns a list with two elements which are both tibbles. List element 22 | one is named \code{ebma_fold} and contains the tibble used in Ensemble 23 | Bayesian Model Averaging tuning. List element two is named \code{cv_data} 24 | and contains the tibble used for classifier tuning. 25 | } 26 | \description{ 27 | \code{ebma_folding()} generates a data fold that will not be used in 28 | classifier tuning. It is the data needed to determine the optimal 29 | tolerance for EBMA. 30 | } 31 | -------------------------------------------------------------------------------- /man/ebma_mc_draws.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ebma.R 3 | \name{ebma_mc_draws} 4 | \alias{ebma_mc_draws} 5 | \title{EBMA multicore tuning - parallelises over draws.} 6 | \usage{ 7 | ebma_mc_draws( 8 | train.preds, 9 | train.y, 10 | ebma.fold, 11 | y, 12 | L1.x, 13 | L2.x, 14 | L2.unit, 15 | L2.reg, 16 | pc.names, 17 | model.bs, 18 | model.pca, 19 | model.lasso, 20 | model.gb, 21 | model.svm, 22 | model.mrp, 23 | tol, 24 | n.draws, 25 | cores, 26 | preds_all, 27 | post.strat, 28 | dv_type, 29 | deep.mrp 30 | ) 31 | } 32 | \arguments{ 33 | \item{train.preds}{Predictions of classifiers on the classifier training 34 | data. A tibble.} 35 | 36 | \item{train.y}{Outcome variable of the classifier training data. A numeric 37 | vector.} 38 | 39 | \item{ebma.fold}{New data for EBMA tuning. A list containing the data 40 | that must not have been used in classifier training.} 41 | 42 | \item{y}{Outcome variable. A character vector containing the column names of 43 | the outcome variable. A character scalar containing the column name of 44 | the outcome variable in \code{survey}.} 45 | 46 | \item{L1.x}{Individual-level covariates. A character vector containing the 47 | column names of the individual-level variables in \code{survey} and 48 | \code{census} used to predict outcome \code{y}. Note that geographic unit 49 | is specified in argument \code{L2.unit}.} 50 | 51 | \item{L2.x}{Context-level covariates. A character vector containing the 52 | column names of the context-level variables in \code{survey} and 53 | \code{census} used to predict outcome \code{y}. To exclude context-level 54 | variables, set \code{L2.x = NULL}.} 55 | 56 | \item{L2.unit}{Geographic unit.
A character scalar containing the column 57 | name of the geographic unit in \code{survey} and \code{census} at which 58 | outcomes should be aggregated.} 59 | 60 | \item{L2.reg}{Geographic region. A character scalar containing the column 61 | name of the geographic region in \code{survey} and \code{census} by which 62 | geographic units are grouped (\code{L2.unit} must be nested within 63 | \code{L2.reg}). Default is \code{NULL}.} 64 | 65 | \item{pc.names}{Principal Component Variable names. A character vector 66 | containing the names of the context-level principal components variables.} 67 | 68 | \item{model.bs}{The tuned model from the multilevel regression with best 69 | subset selection classifier. An \code{\link[lme4]{glmer}} object.} 70 | 71 | \item{model.pca}{The tuned model from the multilevel regression with 72 | principal components as context-level predictors classifier. An 73 | \code{\link[lme4]{glmer}} object.} 74 | 75 | \item{model.lasso}{The tuned model from the multilevel regression with L1 76 | regularization classifier. A \code{\link[glmmLasso]{glmmLasso}} object.} 77 | 78 | \item{model.gb}{The tuned model from the gradient boosting classifier. A 79 | \code{\link[gbm]{gbm}} object.} 80 | 81 | \item{model.svm}{The tuned model from the support vector machine classifier. 82 | An \code{\link[e1071]{svm}} object.} 83 | 84 | \item{model.mrp}{The standard MrP model. An \code{\link[lme4]{glmer}} object} 85 | 86 | \item{tol}{EBMA tolerance. A numeric vector containing the tolerance values 87 | for improvements in the log-likelihood before the EM algorithm stops 88 | optimization. Values should range at least from \eqn{0.01} to \eqn{0.001}. 89 | Default is \code{c(0.01, 0.005, 0.001, 0.0005, 0.0001, 0.00005, 0.00001)}. 90 | Passed on from \code{ebma.tol}.} 91 | 92 | \item{n.draws}{EBMA number of samples. An integer-valued scalar specifying 93 | the number of bootstrapped samples to be drawn from the EBMA fold and used 94 | for tuning EBMA. Default is \eqn{100}. Passed on from \code{ebma.n.draws}.} 95 | 96 | \item{cores}{The number of cores to be used. An integer indicating the number 97 | of processor cores used for parallel computing. Default is 1.} 98 | 99 | \item{post.strat}{Post-stratification results. A list containing the best 100 | models for each of the tuned classifiers, the individual-level predictions 101 | on the classifier training data and the post-stratified context-level 102 | predictions.} 103 | 104 | \item{dv_type}{The type of the dependent variable. A character string. 105 | Either "binary" or "linear".} 106 | 107 | \item{deep.mrp}{Deep MRP classifier. A logical argument indicating whether 108 | the deep MRP classifier should be used for best subset prediction. Setting 109 | \code{deep.mrp = TRUE} will include all interactions of L1.x in the best 110 | subset classifier. Default is \code{FALSE}.} 111 | } 112 | \value{ 113 | The classifier weights. A numeric vector. 114 | } 115 | \description{ 116 | \code{ebma_mc_draws} is called from within \code{ebma}. It tunes using 117 | multiple cores.
118 | } 119 | -------------------------------------------------------------------------------- /man/ebma_mc_tol.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ebma.R 3 | \name{ebma_mc_tol} 4 | \alias{ebma_mc_tol} 5 | \title{EBMA multicore tuning - parallelises over tolerance values.} 6 | \usage{ 7 | ebma_mc_tol( 8 | train.preds, 9 | train.y, 10 | ebma.fold, 11 | y, 12 | L1.x, 13 | L2.x, 14 | L2.unit, 15 | L2.reg, 16 | pc.names, 17 | model.bs, 18 | model.pca, 19 | model.lasso, 20 | model.gb, 21 | model.svm, 22 | model.mrp, 23 | tol, 24 | n.draws, 25 | cores, 26 | preds_all, 27 | post.strat, 28 | dv_type, 29 | deep.mrp 30 | ) 31 | } 32 | \arguments{ 33 | \item{train.preds}{Predictions of classifiers on the classifier training 34 | data. A tibble.} 35 | 36 | \item{train.y}{Outcome variable of the classifier training data. A numeric 37 | vector.} 38 | 39 | \item{ebma.fold}{The data used for EBMA tuning. A tibble.} 40 | 41 | \item{y}{Outcome variable. A character vector containing the column names of 42 | the outcome variable. A character scalar containing the column name of 43 | the outcome variable in \code{survey}.} 44 | 45 | \item{L1.x}{Individual-level covariates. A character vector containing the 46 | column names of the individual-level variables in \code{survey} and 47 | \code{census} used to predict outcome \code{y}. Note that geographic unit 48 | is specified in argument \code{L2.unit}.} 49 | 50 | \item{L2.x}{Context-level covariates. A character vector containing the 51 | column names of the context-level variables in \code{survey} and 52 | \code{census} used to predict outcome \code{y}. To exclude context-level 53 | variables, set \code{L2.x = NULL}.} 54 | 55 | \item{L2.unit}{Geographic unit. A character scalar containing the column 56 | name of the geographic unit in \code{survey} and \code{census} at which 57 | outcomes should be aggregated.} 58 | 59 | \item{L2.reg}{Geographic region. A character scalar containing the column 60 | name of the geographic region in \code{survey} and \code{census} by which 61 | geographic units are grouped (\code{L2.unit} must be nested within 62 | \code{L2.reg}). Default is \code{NULL}.} 63 | 64 | \item{pc.names}{Principal Component Variable names. A character vector 65 | containing the names of the context-level principal components variables.} 66 | 67 | \item{model.bs}{The tuned model from the multilevel regression with best 68 | subset selection classifier. An \code{\link[lme4]{glmer}} object.} 69 | 70 | \item{model.pca}{The tuned model from the multilevel regression with 71 | principal components as context-level predictors classifier. An 72 | \code{\link[lme4]{glmer}} object.} 73 | 74 | \item{model.lasso}{The tuned model from the multilevel regression with L1 75 | regularization classifier. A \code{\link[glmmLasso]{glmmLasso}} object.} 76 | 77 | \item{model.gb}{The tuned model from the gradient boosting classifier. A 78 | \code{\link[gbm]{gbm}} object.} 79 | 80 | \item{model.svm}{The tuned model from the support vector machine classifier. 81 | An \code{\link[e1071]{svm}} object.} 82 | 83 | \item{model.mrp}{The standard MrP model. An \code{\link[lme4]{glmer}} object} 84 | 85 | \item{tol}{The tolerance values used for EBMA. A numeric vector.} 86 | 87 | \item{n.draws}{EBMA number of samples. An integer-valued scalar specifying 88 | the number of bootstrapped samples to be drawn from the EBMA fold and used 89 | for tuning EBMA. 
Default is \eqn{100}. Passed on from \code{ebma.n.draws}.} 90 | 91 | \item{cores}{The number of cores to be used. An integer indicating the number 92 | of processor cores used for parallel computing. Default is 1.} 93 | 94 | \item{post.strat}{Post-stratification results. A list containing the best 95 | models for each of the tuned classifiers, the individual-level predictions 96 | on the classifier training data, and the post-stratified context-level 97 | predictions.} 98 | 99 | \item{dv_type}{The type of the dependent variable. A character string. 100 | Either "binary" or "linear".} 101 | 102 | \item{deep.mrp}{Deep MRP classifier. A logical argument indicating whether 103 | the deep MRP classifier should be used for best subset prediction. Setting 104 | \code{deep.mrp = TRUE} will include all interactions of L1.x in the best 105 | subset classifier. Default is \code{FALSE}.} 106 | } 107 | \value{ 108 | The classifier weights. A numeric vector. 109 | } 110 | \description{ 111 | \code{ebma_mc_tol} is called from within \code{ebma}. It tunes using 112 | multiple cores. 113 | } 114 | \examples{ 115 | \dontrun{ 116 | # not yet 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /man/error_checks.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{error_checks} 4 | \alias{error_checks} 5 | \title{Catches user input errors} 6 | \usage{ 7 | error_checks( 8 | y, 9 | L1.x, 10 | L2.x, 11 | L2.unit, 12 | L2.reg, 13 | L2.x.scale, 14 | pcs, 15 | folds, 16 | bin.proportion, 17 | bin.size, 18 | survey, 19 | census, 20 | ebma.size, 21 | k.folds, 22 | cv.sampling, 23 | loss.unit, 24 | loss.fun, 25 | best.subset, 26 | lasso, 27 | pca, 28 | gb, 29 | svm, 30 | mrp, 31 | best.subset.L2.x, 32 | lasso.L2.x, 33 | deep.mrp, 34 | gb.L2.x, 35 | svm.L2.x, 36 | mrp.L2.x, 37 | gb.L2.unit, 38 | gb.L2.reg, 39 | lasso.lambda, 40 | lasso.n.iter, 41 | deep.splines, 42 | uncertainty, 43 | boot.iter 44 | ) 45 | } 46 | \arguments{ 47 | \item{y}{Outcome variable. A character vector containing the column names of 48 | the outcome variable. A character scalar containing the column name of 49 | the outcome variable in \code{survey}.} 50 | 51 | \item{L1.x}{Individual-level covariates. A character vector containing the 52 | column names of the individual-level variables in \code{survey} and 53 | \code{census} used to predict outcome \code{y}. Note that geographic unit 54 | is specified in argument \code{L2.unit}.} 55 | 56 | \item{L2.x}{Context-level covariates. A character vector containing the 57 | column names of the context-level variables in \code{survey} and 58 | \code{census} used to predict outcome \code{y}. To exclude context-level 59 | variables, set \code{L2.x = NULL}.} 60 | 61 | \item{L2.unit}{Geographic unit. A character scalar containing the column 62 | name of the geographic unit in \code{survey} and \code{census} at which 63 | outcomes should be aggregated.} 64 | 65 | \item{L2.reg}{Geographic region. A character scalar containing the column 66 | name of the geographic region in \code{survey} and \code{census} by which 67 | geographic units are grouped (\code{L2.unit} must be nested within 68 | \code{L2.reg}). Default is \code{NULL}.} 69 | 70 | \item{L2.x.scale}{Scale context-level covariates. A logical argument 71 | indicating whether the context-level covariates should be normalized. 72 | Default is \code{TRUE}.
Note that if set to \code{FALSE}, then the 73 | context-level covariates should be normalized prior to calling 74 | \code{auto_MrP()}.} 75 | 76 | \item{pcs}{Principal components. A character vector containing the column 77 | names of the principal components of the context-level variables in 78 | \code{survey} and \code{census}. Default is \code{NULL}.} 79 | 80 | \item{folds}{EBMA and cross-validation folds. A character scalar containing 81 | the column name of the variable in \code{survey} that specifies the fold 82 | to which an observation is allocated. The variable should contain integers 83 | running from \eqn{1} to \eqn{k + 1}, where \eqn{k} is the number of 84 | cross-validation folds. Value \eqn{k + 1} refers to the EBMA fold. Default 85 | is \code{NULL}. \emph{Note:} if \code{folds} is \code{NULL}, then 86 | \code{ebma.size}, \code{k.folds}, and \code{cv.sampling} must be specified.} 87 | 88 | \item{bin.proportion}{Proportion of ideal types. A character scalar 89 | containing the column name of the variable in \code{census} that indicates 90 | the proportion of individuals by ideal type and geographic unit. Default is 91 | \code{NULL}. \emph{Note:} if \code{bin.proportion} is \code{NULL}, then 92 | \code{bin.size} must be specified.} 93 | 94 | \item{bin.size}{Bin size of ideal types. A character scalar containing the 95 | column name of the variable in \code{census} that indicates the bin size of 96 | ideal types by geographic unit. Default is \code{NULL}. \emph{Note:} 97 | ignored if \code{bin.proportion} is provided, but must be specified 98 | otherwise.} 99 | 100 | \item{survey}{Survey data. A \code{data.frame} whose column names include 101 | \code{y}, \code{L1.x}, \code{L2.x}, \code{L2.unit}, and, if specified, 102 | \code{L2.reg}, \code{pcs}, and \code{folds}.} 103 | 104 | \item{census}{Census data. A \code{data.frame} whose column names include 105 | \code{L1.x}, \code{L2.x}, \code{L2.unit}, if specified, \code{L2.reg} and 106 | \code{pcs}, and either \code{bin.proportion} or \code{bin.size}.} 107 | 108 | \item{ebma.size}{EBMA fold size. A number in the open unit interval 109 | indicating the proportion of respondents to be allocated to the EBMA fold. 110 | Default is \eqn{1/3}. \emph{Note:} ignored if \code{folds} is provided, but 111 | must be specified otherwise.} 112 | 113 | \item{k.folds}{Number of cross-validation folds. An integer-valued scalar 114 | indicating the number of folds to be used in cross-validation. Default is 115 | \eqn{5}. \emph{Note:} ignored if \code{folds} is provided, but must be 116 | specified otherwise.} 117 | 118 | \item{cv.sampling}{Cross-validation sampling method. A character-valued 119 | scalar indicating whether cross-validation folds should be created by 120 | sampling individual respondents (\code{individuals}) or geographic units 121 | (\code{L2 units}). Default is \code{L2 units}. \emph{Note:} ignored if 122 | \code{folds} is provided, but must be specified otherwise.} 123 | 124 | \item{loss.unit}{Loss function unit. A character-valued scalar indicating 125 | whether performance loss should be evaluated at the level of individual 126 | respondents (\code{individuals}), geographic units (\code{L2 units}) or at 127 | both levels. Default is \code{c("individuals", "L2 units")}. With multiple 128 | loss units, parameters are ranked for each loss unit and the loss unit with 129 | the lowest rank sum is chosen. Ties are broken according to the order in 130 | the search grid.} 131 | 132 | \item{loss.fun}{Loss function. 
A character-valued scalar indicating whether 133 | prediction loss should be measured by the mean squared error (\code{MSE}), 134 | the mean absolute error (\code{MAE}), binary cross-entropy 135 | (\code{cross-entropy}), mean squared false error (\code{msfe}), the f1 136 | score (\code{f1}), or a combination thereof. Default is \code{c("MSE", 137 | "cross-entropy","msfe", "f1")}. With multiple loss functions, parameters 138 | are ranked for each loss function and the parameter combination with the 139 | lowest rank sum is chosen. Ties are broken according to the order in the 140 | search grid.} 141 | 142 | \item{best.subset}{Best subset classifier. A logical argument indicating 143 | whether the best subset classifier should be used for predicting outcome 144 | \code{y}. Default is \code{TRUE}.} 145 | 146 | \item{lasso}{Lasso classifier. A logical argument indicating whether the 147 | lasso classifier should be used for predicting outcome \code{y}. Default is 148 | \code{TRUE}.} 149 | 150 | \item{pca}{PCA classifier. A logical argument indicating whether the PCA 151 | classifier should be used for predicting outcome \code{y}. Default is 152 | \code{TRUE}.} 153 | 154 | \item{gb}{GB classifier. A logical argument indicating whether the GB 155 | classifier should be used for predicting outcome \code{y}. Default is 156 | \code{TRUE}.} 157 | 158 | \item{svm}{SVM classifier. A logical argument indicating whether the SVM 159 | classifier should be used for predicting outcome \code{y}. Default is 160 | \code{TRUE}.} 161 | 162 | \item{mrp}{MRP classifier. A logical argument indicating whether the standard 163 | MRP classifier should be used for predicting outcome \code{y}. Default is 164 | \code{FALSE}.} 165 | 166 | \item{best.subset.L2.x}{Best subset context-level covariates. A character 167 | vector containing the column names of the context-level variables in 168 | \code{survey} and \code{census} to be used by the best subset classifier. 169 | If \code{NULL} and \code{best.subset} is set to \code{TRUE}, then best 170 | subset uses the variables specified in \code{L2.x}. Default is \code{NULL}.} 171 | 172 | \item{lasso.L2.x}{Lasso context-level covariates. A character vector 173 | containing the column names of the context-level variables in 174 | \code{survey} and \code{census} to be used by the lasso classifier. If 175 | \code{NULL} and \code{lasso} is set to \code{TRUE}, then lasso uses the 176 | variables specified in \code{L2.x}. Default is \code{NULL}.} 177 | 178 | \item{deep.mrp}{Deep MRP classifier. A logical argument indicating whether 179 | the deep MRP classifier should be used for best subset prediction. Setting 180 | \code{deep.mrp = TRUE} will include all interactions of L1.x in the best 181 | subset classifier. Default is \code{FALSE}.} 182 | 183 | \item{gb.L2.x}{GB context-level covariates. A character vector containing the 184 | column names of the context-level variables in \code{survey} and 185 | \code{census} to be used by the GB classifier. If \code{NULL} and \code{gb} 186 | is set to \code{TRUE}, then GB uses the variables specified in \code{L2.x}. 187 | Default is \code{NULL}.} 188 | 189 | \item{svm.L2.x}{SVM context-level covariates. A character vector containing 190 | the column names of the context-level variables in \code{survey} and 191 | \code{census} to be used by the SVM classifier. If \code{NULL} and 192 | \code{svm} is set to \code{TRUE}, then SVM uses the variables specified in 193 | \code{L2.x}. 
Default is \code{NULL}.} 194 | 195 | \item{mrp.L2.x}{MRP context-level covariates. A character vector containing 196 | the column names of the context-level variables in \code{survey} and 197 | \code{census} to be used by the MRP classifier. The character vector should 198 | be \emph{empty} if no context-level variables should be used by the MRP 199 | classifier. If \code{NULL} and \code{mrp} is set to \code{TRUE}, then MRP 200 | uses the variables specified in \code{L2.x}. Default is \code{NULL}. Note: 201 | For the empty MrP model, set \code{L2.x = NULL} and \code{mrp.L2.x = ""}.} 202 | 203 | \item{gb.L2.unit}{GB L2.unit. A logical argument indicating whether 204 | \code{L2.unit} should be included in the GB classifier. Default is 205 | \code{FALSE}.} 206 | 207 | \item{gb.L2.reg}{GB L2.reg. A logical argument indicating whether 208 | \code{L2.reg} should be included in the GB classifier. Default is 209 | \code{FALSE}.} 210 | 211 | \item{lasso.lambda}{Lasso penalty parameter. A numeric \code{vector} of 212 | non-negative values. The penalty parameter controls the shrinkage of the 213 | context-level variables in the lasso model. Default is a sequence with 214 | minimum 0.1 and maximum 250 that is equally spaced on the log-scale. The 215 | number of values is controlled by the \code{lasso.n.iter} parameter.} 216 | 217 | \item{lasso.n.iter}{Lasso number of lambda values. An integer-valued scalar 218 | specifying the number of lambda values to search over. Default is 219 | \eqn{100}. \emph{Note:} Ignored if a vector of \code{lasso.lambda} 220 | values is provided.} 221 | 222 | \item{deep.splines}{Deep MRP splines. A logical argument indicating whether 223 | splines should be used in the deep MRP classifier. Default is \code{TRUE}.} 224 | 225 | \item{uncertainty}{Uncertainty estimates. A logical argument indicating 226 | whether uncertainty estimates should be computed. Default is \code{FALSE}.} 227 | 228 | \item{boot.iter}{Number of bootstrap iterations. An integer argument 229 | indicating the number of bootstrap iterations to be computed. Will be 230 | ignored unless \code{uncertainty = TRUE}. Default is \code{200} if 231 | \code{uncertainty = TRUE} and \code{NULL} if \code{uncertainty = FALSE}.} 232 | } 233 | \value{ 234 | No return value, called for detection of errors in the \code{autoMrP()} call. 235 | } 236 | \description{ 237 | \code{error_checks()} checks for incorrect data entry in the \code{autoMrP()} 238 | call. 239 | } 240 | -------------------------------------------------------------------------------- /man/f1_score.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{f1_score} 4 | \alias{f1_score} 5 | \title{Estimates the inverse f1 score, i.e. 0 is the best score and 1 the worst.} 6 | \usage{ 7 | f1_score(pred, data.valid, y, L2.unit) 8 | } 9 | \arguments{ 10 | \item{pred}{Predictions of outcome. A numeric vector of outcome predictions.} 11 | 12 | \item{data.valid}{Test data set. A tibble of data that was not used for 13 | prediction.} 14 | 15 | \item{y}{Outcome variable. A character vector containing the column names of 16 | the outcome variable.} 17 | 18 | \item{L2.unit}{Geographic unit. A character scalar containing the column name 19 | of the geographic unit in \code{survey} and \code{census} at which outcomes 20 | should be aggregated.} 21 | } 22 | \value{ 23 | Returns a tibble containing two f1 prediction errors.
The first is 24 | measured at the level of individuals and the second is measured at the 25 | context level. The tibble dimensions are 2x3 with variables: measure, value 26 | and level. 27 | } 28 | \description{ 29 | \code{f1_score()} estimates the inverse f1 scores on the individual and state 30 | levels. 31 | } 32 | -------------------------------------------------------------------------------- /man/gb_classifier.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gb_classifier.R 3 | \name{gb_classifier} 4 | \alias{gb_classifier} 5 | \title{GB classifier} 6 | \usage{ 7 | gb_classifier( 8 | y, 9 | form, 10 | distribution, 11 | data.train, 12 | n.trees, 13 | interaction.depth, 14 | n.minobsinnode, 15 | shrinkage, 16 | verbose = c(TRUE, FALSE) 17 | ) 18 | } 19 | \arguments{ 20 | \item{y}{Outcome variable. A character vector containing the column names of 21 | the outcome variable. A character scalar containing the column name of 22 | the outcome variable in \code{survey}.} 23 | 24 | \item{form}{Model formula. A two-sided linear formula describing 25 | the model to be fit, with the outcome on the LHS and the covariates 26 | separated by + operators on the RHS.} 27 | 28 | \item{distribution}{Model distribution. A character string specifying the 29 | name of the distribution to be used.} 30 | 31 | \item{data.train}{Training data. A data.frame containing the training data 32 | used to train the model.} 33 | 34 | \item{n.trees}{Total number of trees. An integer-valued scalar specifying 35 | the total number of trees to be fit.} 36 | 37 | \item{interaction.depth}{Interaction depth. An integer-valued scalar 38 | specifying the maximum depth of each tree.} 39 | 40 | \item{n.minobsinnode}{Minimum number of observations in terminal nodes. An 41 | integer-valued scalar specifying the minimum number of observations in the 42 | terminal nodes of the trees.} 43 | 44 | \item{shrinkage}{Learning rate. A numeric scalar specifying the shrinkage or 45 | learning rate applied to each tree in the expansion.} 46 | 47 | \item{verbose}{Verbose output. A logical vector indicating whether or not 48 | verbose output should be printed.} 49 | } 50 | \value{ 51 | A gradient tree boosting model. A \code{\link[gbm]{gbm}} object. 52 | } 53 | \description{ 54 | \code{gb_classifier} applies gradient boosting classification to a data set. 55 | } 56 | -------------------------------------------------------------------------------- /man/gb_classifier_update.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gb_classifier.R 3 | \name{gb_classifier_update} 4 | \alias{gb_classifier_update} 5 | \title{GB classifier update} 6 | \usage{ 7 | gb_classifier_update(object, n.new.trees, verbose = c(TRUE, FALSE)) 8 | } 9 | \arguments{ 10 | \item{object}{Gradient tree boosting output. A gbm object.} 11 | 12 | \item{n.new.trees}{Number of additional trees to grow. A numeric scalar.} 13 | 14 | \item{verbose}{Verbose output. A logical vector indicating whether or not 15 | verbose output should be printed.} 16 | } 17 | \value{ 18 | An updated gradient tree boosting model. 19 | A \code{\link[gbm]{gbm.more}} object. 20 | } 21 | \description{ 22 | \code{gb_classifier_update()} grows additional trees in a gradient tree 23 | boosting ensemble.
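Since the return value is a \code{\link[gbm]{gbm.more}} object, the underlying pattern can be sketched with the gbm package directly. The data set \code{dat} below is a hypothetical example, not package data:

    # Sketch: grow a small gbm ensemble, then extend it with more trees,
    # which is the pattern gb_classifier_update() wraps via gbm.more().
    library(gbm)
    set.seed(42)
    dat <- data.frame(y = rbinom(200, 1, 0.5), x1 = rnorm(200), x2 = rnorm(200))
    fit <- gbm(y ~ x1 + x2, distribution = "bernoulli", data = dat,
               n.trees = 50, interaction.depth = 2, n.minobsinnode = 10,
               shrinkage = 0.1, verbose = FALSE)
    fit <- gbm.more(fit, n.new.trees = 50)  # grow 50 additional trees
    fit$n.trees                             # now 100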
24 | } 25 | -------------------------------------------------------------------------------- /man/lasso_classifier.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lasso_classifier.R 3 | \name{lasso_classifier} 4 | \alias{lasso_classifier} 5 | \title{Lasso classifier} 6 | \usage{ 7 | lasso_classifier( 8 | L2.fix, 9 | L1.re, 10 | data.train, 11 | lambda, 12 | model.family, 13 | y, 14 | verbose = c(TRUE, FALSE) 15 | ) 16 | } 17 | \arguments{ 18 | \item{L2.fix}{Fixed effects. A two-sided linear formula describing 19 | the fixed effects part of the model, with the outcome on the LHS and 20 | the fixed effects separated by + operators on the RHS.} 21 | 22 | \item{L1.re}{Random effects. A named list object in which the names of the 23 | list elements are the random effects and each list element is \code{~ 1}.} 24 | 25 | \item{data.train}{Training data. A data.frame containing the training data 26 | used to train the model.} 27 | 28 | \item{lambda}{Tuning parameter. Lambda is the penalty parameter that controls 29 | the shrinkage of fixed effects.} 30 | 31 | \item{model.family}{Model family. A variable indicating the model family 32 | to be used by glmmLasso. Defaults to binomial(link = "probit").} 33 | 34 | \item{y}{Outcome variable. A character vector containing the column names of 35 | the outcome variable. A character scalar containing the column name of 36 | the outcome variable in \code{survey}.} 37 | 38 | \item{verbose}{Verbose output. A logical vector indicating whether or not 39 | verbose output should be printed.} 40 | } 41 | \value{ 42 | A multilevel lasso model. A \code{\link[glmmLasso]{glmmLasso}} 43 | object. 44 | } 45 | \description{ 46 | \code{lasso_classifier} applies lasso classification to a data set. 47 | } 48 | -------------------------------------------------------------------------------- /man/log_spaced.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{log_spaced} 4 | \alias{log_spaced} 5 | \title{Sequence that is equally spaced on the log scale} 6 | \usage{ 7 | log_spaced(min, max, n) 8 | } 9 | \arguments{ 10 | \item{min}{The minimum value of the sequence. A positive numeric scalar (min 11 | > 0).} 12 | 13 | \item{max}{The maximum value of the sequence. A positive numeric scalar (max 14 | > 0).} 15 | 16 | \item{n}{The length of the sequence. An integer-valued scalar.} 17 | } 18 | \value{ 19 | Returns a numeric vector with length specified in argument \code{n}. 20 | The vector elements are equally spaced on the log-scale. 21 | } 22 | \description{ 23 | Sequence that is equally spaced on the log scale 24 | } 25 | -------------------------------------------------------------------------------- /man/loss_function.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{loss_function} 4 | \alias{loss_function} 5 | \title{Estimates loss value.} 6 | \usage{ 7 | loss_function( 8 | pred, 9 | data.valid, 10 | loss.unit = c("individuals", "L2 units"), 11 | loss.fun = c("MSE", "MAE", "cross-entropy"), 12 | y, 13 | L2.unit 14 | ) 15 | } 16 | \arguments{ 17 | \item{pred}{Predictions of outcome. A numeric vector of outcome predictions.} 18 | 19 | \item{data.valid}{Test data set.
A tibble of data that was not used for 20 | prediction.} 21 | 22 | \item{loss.unit}{Loss function unit. A character-valued scalar indicating 23 | whether performance loss should be evaluated at the level of individual 24 | respondents (\code{individuals}) or geographic units (\code{L2 units}). 25 | Default is \code{individuals}.} 26 | 27 | \item{loss.fun}{Loss function. A character-valued scalar indicating whether 28 | prediction loss should be measured by the mean squared error (\code{MSE}) 29 | or the mean absolute error (\code{MAE}). Default is \code{MSE}.} 30 | 31 | \item{y}{Outcome variable. A character vector containing the column names of 32 | the outcome variable.} 33 | 34 | \item{L2.unit}{Geographic unit. A character scalar containing the column name 35 | of the geographic unit in \code{survey} and \code{census} at which outcomes 36 | should be aggregated.} 37 | } 38 | \value{ 39 | Returns a tibble with the number of rows equal to the number of loss 40 | functions tested (defaults to 4: cross-entropy, f1, MSE, and msfe). The 41 | tibble has two columns: \code{measure}, containing the names of the loss 42 | functions, and \code{value}, containing the 43 | loss-function scores. 44 | } 45 | \description{ 46 | \code{loss_function()} estimates the loss based on a loss function. 47 | } 48 | -------------------------------------------------------------------------------- /man/loss_score_ranking.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{loss_score_ranking} 4 | \alias{loss_score_ranking} 5 | \title{Ranks tuning parameters according to loss functions} 6 | \usage{ 7 | loss_score_ranking(score, loss.fun) 8 | } 9 | \arguments{ 10 | \item{score}{A data set containing loss function names, the loss function 11 | values, and the tuning parameter values.} 12 | 13 | \item{loss.fun}{Loss function. A character-valued scalar indicating whether 14 | prediction loss should be measured by the mean squared error (\code{MSE}) 15 | or the mean absolute error (\code{MAE}). Default is \code{MSE}.} 16 | } 17 | \value{ 18 | Returns a tibble containing the parameter grid as well as a rank 19 | column that corresponds to the cross-validation rank of a parameter 20 | combination across all loss function scores. 21 | } 22 | \description{ 23 | \code{loss_score_ranking()} ranks tuning parameters according to the scores 24 | received in multiple loss functions. 25 | } 26 | -------------------------------------------------------------------------------- /man/mean_absolute_error.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{mean_absolute_error} 4 | \alias{mean_absolute_error} 5 | \title{Estimates the mean absolute prediction error.} 6 | \usage{ 7 | mean_absolute_error(pred, data.valid, y, L2.unit) 8 | } 9 | \arguments{ 10 | \item{pred}{Predictions of outcome. A numeric vector of outcome predictions.} 11 | 12 | \item{data.valid}{Test data set. A tibble of data that was not used for 13 | prediction.} 14 | 15 | \item{y}{Outcome variable. A character vector containing the column names of 16 | the outcome variable.} 17 | 18 | \item{L2.unit}{Geographic unit.
A character scalar containing the column name 19 | of the geographic unit in \code{survey} and \code{census} at which outcomes 20 | should be aggregated.} 21 | } 22 | \value{ 23 | Returns a tibble containing two mean absolute prediction errors. The 24 | first is measured at the level of individuals and the second is measured at 25 | the context level. The tibble dimensions are 2x3 with variables: measure, 26 | value and level. 27 | } 28 | \description{ 29 | \code{mean_absolute_error()} estimates the mean absolute error for the 30 | desired loss unit. 31 | } 32 | -------------------------------------------------------------------------------- /man/mean_squared_error.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{mean_squared_error} 4 | \alias{mean_squared_error} 5 | \title{Estimates the mean squared prediction error.} 6 | \usage{ 7 | mean_squared_error(pred, data.valid, y, L2.unit) 8 | } 9 | \arguments{ 10 | \item{pred}{Predictions of outcome. A numeric vector of outcome predictions.} 11 | 12 | \item{data.valid}{Test data set. A tibble of data that was not used for 13 | prediction.} 14 | 15 | \item{y}{Outcome variable. A character vector containing the column names of 16 | the outcome variable.} 17 | 18 | \item{L2.unit}{Geographic unit. A character scalar containing the column name 19 | of the geographic unit in \code{survey} and \code{census} at which outcomes 20 | should be aggregated.} 21 | } 22 | \value{ 23 | Returns a tibble containing two mean squared prediction errors. The 24 | first is measured at the level of individuals and the second is measured at 25 | the context level. The tibble dimensions are 2x3 with variables: measure, 26 | value and level. 27 | } 28 | \description{ 29 | \code{mean_squared_error()} estimates the mean squared error for the desired 30 | loss unit. 31 | } 32 | -------------------------------------------------------------------------------- /man/mean_squared_false_error.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{mean_squared_false_error} 4 | \alias{mean_squared_false_error} 5 | \title{Estimates the mean squared false error.} 6 | \usage{ 7 | mean_squared_false_error(pred, data.valid, y, L2.unit) 8 | } 9 | \arguments{ 10 | \item{pred}{Predictions of outcome. A numeric vector of outcome predictions.} 11 | 12 | \item{data.valid}{Test data set. A tibble of data that was not used for 13 | prediction.} 14 | 15 | \item{y}{Outcome variable. A character vector containing the column names of 16 | the outcome variable.} 17 | 18 | \item{L2.unit}{Geographic unit. A character scalar containing the column name 19 | of the geographic unit in \code{survey} and \code{census} at which outcomes 20 | should be aggregated.} 21 | } 22 | \value{ 23 | Returns a tibble containing two mean squared false prediction errors. 24 | The first is measured at the level of individuals and the second is 25 | measured at the context level. The tibble dimensions are 2x3 with 26 | variables: measure, value and level. 27 | } 28 | \description{ 29 | \code{mean_squared_false_error()} estimates the mean squared false error on 30 | the individual and state levels.
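The two evaluation levels shared by these error functions can be illustrated with squared errors in base R. Here \code{y}, \code{pred}, and \code{state} are hypothetical example vectors, and the package's internal implementation may differ in detail:

    # Illustration of the two loss levels: error over individual responses
    # versus error over state-level (context-level) averages.
    y     <- c(1, 0, 1, 1, 0, 0)
    pred  <- c(0.8, 0.3, 0.6, 0.7, 0.4, 0.2)
    state <- c("A", "A", "A", "B", "B", "B")
    err_individual <- mean((y - pred)^2)
    err_context <- mean((tapply(y, state, mean) - tapply(pred, state, mean))^2)
    c(individual = err_individual, context = err_context)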
31 | } 32 | -------------------------------------------------------------------------------- /man/model_list.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{model_list} 4 | \alias{model_list} 5 | \title{A list of models for the best subset selection.} 6 | \usage{ 7 | model_list(y, L1.x, L2.x, L2.unit, L2.reg = NULL) 8 | } 9 | \arguments{ 10 | \item{y}{Outcome variable. A character vector containing the column names of 11 | the outcome variable.} 12 | 13 | \item{L1.x}{Individual-level covariates. A character vector containing the 14 | column names of the individual-level variables in \code{survey} and 15 | \code{census} used to predict outcome \code{y}. Note that geographic unit 16 | is specified in argument \code{L2.unit}.} 17 | 18 | \item{L2.x}{Context-level covariates. A character vector containing the 19 | column names of the context-level variables in \code{survey} and 20 | \code{census} used to predict outcome \code{y}.} 21 | 22 | \item{L2.unit}{Geographic unit. A character scalar containing the column name 23 | of the geographic unit in \code{survey} and \code{census} at which outcomes 24 | should be aggregated.} 25 | 26 | \item{L2.reg}{Geographic region. A character scalar containing the column 27 | name of the geographic region in \code{survey} and \code{census} by which 28 | geographic units are grouped (\code{L2.unit} must be nested within 29 | \code{L2.reg}). Default is \code{NULL}.} 30 | } 31 | \value{ 32 | Returns a list with the number of elements equal to 2^k where k is 33 | the number of context-level variables. Each element is of class formula. 34 | } 35 | \description{ 36 | \code{model_list()} generates an exhaustive list of lme4 model formulas from 37 | the individual-level and context-level variables as well as geographic unit 38 | variables to be iterated over in best subset selection. 39 | } 40 | -------------------------------------------------------------------------------- /man/model_list_pca.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{model_list_pca} 4 | \alias{model_list_pca} 5 | \title{A list of models for the best subset selection with PCA.} 6 | \usage{ 7 | model_list_pca(y, L1.x, L2.x, L2.unit, L2.reg = NULL) 8 | } 9 | \arguments{ 10 | \item{y}{Outcome variable. A character vector containing the column names of 11 | the outcome variable.} 12 | 13 | \item{L1.x}{Individual-level covariates. A character vector containing the 14 | column names of the individual-level variables in \code{survey} and 15 | \code{census} used to predict outcome \code{y}. Note that geographic unit 16 | is specified in argument \code{L2.unit}.} 17 | 18 | \item{L2.x}{Context-level covariates. A character vector containing the 19 | column names of the context-level variables in \code{survey} and 20 | \code{census} used to predict outcome \code{y}.} 21 | 22 | \item{L2.unit}{Geographic unit. A character scalar containing the column name 23 | of the geographic unit in \code{survey} and \code{census} at which outcomes 24 | should be aggregated.} 25 | 26 | \item{L2.reg}{Geographic region. A character scalar containing the column 27 | name of the geographic region in \code{survey} and \code{census} by which 28 | geographic units are grouped (\code{L2.unit} must be nested within 29 | \code{L2.reg}).
Default is \code{NULL}.} 30 | } 31 | \value{ 32 | Returns a list with k+1 elements, where k is the number 33 | of context-level variables. Each element is of class formula. The first 34 | element is a model without context-level variables and the following models 35 | iteratively add the principal components as context-level variables. 36 | } 37 | \description{ 38 | \code{model_list_pca()} generates an exhaustive list of lme4 model formulas 39 | from the individual-level and context-level principal components as well as 40 | geographic unit variables to be iterated over in best subset selection with 41 | principal components. 42 | } 43 | -------------------------------------------------------------------------------- /man/multicore.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{multicore} 4 | \alias{multicore} 5 | \title{Register cores for multicore computing} 6 | \usage{ 7 | multicore(cores = 1, type, cl = NULL) 8 | } 9 | \arguments{ 10 | \item{cores}{Number of cores to be used. An integer. Default is \code{1}.} 11 | 12 | \item{type}{Whether to start or end parallel processing. A character string. 13 | The possible values are \code{open}, \code{close}.} 14 | 15 | \item{cl}{The registered cluster. Default is \code{NULL}.} 16 | } 17 | \value{ 18 | No return value, called to register or un-register clusters for 19 | parallel processing. 20 | } 21 | \description{ 22 | \code{multicore()} registers cores for parallel processing. 23 | } 24 | -------------------------------------------------------------------------------- /man/output_table.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{output_table} 4 | \alias{output_table} 5 | \title{A table for the summary function} 6 | \usage{ 7 | output_table(object, col.names, format, digits) 8 | } 9 | \arguments{ 10 | \item{object}{An \code{autoMrP()} object for which a summary is desired.} 11 | 12 | \item{col.names}{The column names of the table. A character vector.} 13 | 14 | \item{format}{The table format. A character string passed to 15 | \code{\link[knitr]{kable}}. Default is \code{simple}.} 16 | 17 | \item{digits}{The number of digits to be displayed. An integer scalar. 18 | Default is \code{4}.} 19 | } 20 | \value{ 21 | No return value, prints a table to the console. 22 | } 23 | \description{ 24 | \code{output_table()} generates a table for the summary function. 25 | } 26 | -------------------------------------------------------------------------------- /man/plot.autoMrP.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{plot.autoMrP} 4 | \alias{plot.autoMrP} 5 | \title{A plot method for autoMrP objects. Plots unit-level preference estimates.} 6 | \usage{ 7 | \method{plot}{autoMrP}(x, algorithm = "ebma", ci.lvl = 0.95, ...) 8 | } 9 | \arguments{ 10 | \item{x}{An \code{autoMrP()} object.} 11 | 12 | \item{algorithm}{The algorithm/classifier for which preference estimates are 13 | desired. A character-valued scalar indicating either \code{ebma} or the 14 | classifier to be used. Allowed choices are: "ebma", "best_subset", "lasso", 15 | "pca", "gb", "svm", and "mrp". Default is \code{ebma}.} 16 | 17 | \item{ci.lvl}{The level of the confidence intervals. A proportion.
Default is 18 | \code{0.95}. Confidence intervals are based on bootstrapped estimates and 19 | will not be printed if bootstrapping was not carried out.} 20 | 21 | \item{...}{Additional arguments affecting the summary produced.} 22 | } 23 | \value{ 24 | Returns a \code{ggplot2} object of the preference estimates for the 25 | selected classifier. 26 | } 27 | \description{ 28 | \code{plot.autoMrP()} plots unit-level preference estimates and error bars. 29 | } 30 | -------------------------------------------------------------------------------- /man/post_stratification.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/post_stratification.R 3 | \name{post_stratification} 4 | \alias{post_stratification} 5 | \title{Apply post-stratification to classifiers.} 6 | \usage{ 7 | post_stratification( 8 | y, 9 | L1.x, 10 | L2.x, 11 | L2.unit, 12 | L2.reg, 13 | best.subset.opt, 14 | lasso.opt, 15 | lasso.L2.x, 16 | pca.opt, 17 | gb.opt, 18 | svm.opt, 19 | svm.L2.reg, 20 | svm.L2.unit, 21 | svm.L2.x, 22 | mrp.include, 23 | n.minobsinnode, 24 | L2.unit.include, 25 | L2.reg.include, 26 | kernel, 27 | mrp.L2.x, 28 | data, 29 | ebma.fold, 30 | census, 31 | verbose, 32 | deep.mrp, 33 | deep.splines 34 | ) 35 | } 36 | \arguments{ 37 | \item{y}{Outcome variable. A character vector containing the column names of 38 | the outcome variable. A character scalar containing the column name of 39 | the outcome variable in \code{survey}.} 40 | 41 | \item{L1.x}{Individual-level covariates. A character vector containing the 42 | column names of the individual-level variables in \code{survey} and 43 | \code{census} used to predict outcome \code{y}. Note that geographic unit 44 | is specified in argument \code{L2.unit}.} 45 | 46 | \item{L2.x}{Context-level covariates. A character vector containing the 47 | column names of the context-level variables in \code{survey} and 48 | \code{census} used to predict outcome \code{y}. To exclude context-level 49 | variables, set \code{L2.x = NULL}.} 50 | 51 | \item{L2.unit}{Geographic unit. A character scalar containing the column 52 | name of the geographic unit in \code{survey} and \code{census} at which 53 | outcomes should be aggregated.} 54 | 55 | \item{L2.reg}{Geographic region. A character scalar containing the column 56 | name of the geographic region in \code{survey} and \code{census} by which 57 | geographic units are grouped (\code{L2.unit} must be nested within 58 | \code{L2.reg}). Default is \code{NULL}.} 59 | 60 | \item{best.subset.opt}{Optimal tuning parameters from the best subset selection 61 | classifier. A list returned by \code{run_best_subset()}.} 62 | 63 | \item{lasso.opt}{Optimal tuning parameters from the lasso classifier. A list 64 | returned by \code{run_lasso()}.} 65 | 66 | \item{lasso.L2.x}{Lasso context-level covariates. A character vector 67 | containing the column names of the context-level variables in 68 | \code{survey} and \code{census} to be used by the lasso classifier. If 69 | \code{NULL} and \code{lasso} is set to \code{TRUE}, then lasso uses the 70 | variables specified in \code{L2.x}.
Default is \code{NULL}.} 71 | 72 | \item{pca.opt}{Optimal tuning parameters from the best subset selection with 73 | principal components classifier. A list returned by \code{run_pca()}.} 74 | 75 | \item{gb.opt}{Optimal tuning parameters from the gradient tree boosting 76 | classifier. A list returned by \code{run_gb()}.} 77 | 78 | \item{svm.opt}{Optimal tuning parameters from the support vector machine 79 | classifier. A list returned by \code{run_svm()}.} 80 | 81 | \item{svm.L2.reg}{SVM L2.reg. A logical argument indicating whether 82 | \code{L2.reg} should be included in the SVM classifier. Default is 83 | \code{FALSE}.} 84 | 85 | \item{svm.L2.unit}{SVM L2.unit. A logical argument indicating whether 86 | \code{L2.unit} should be included in the SVM classifier. Default is 87 | \code{FALSE}.} 88 | 89 | \item{svm.L2.x}{SVM context-level covariates. A character vector containing 90 | the column names of the context-level variables in \code{survey} and 91 | \code{census} to be used by the SVM classifier. If \code{NULL} and 92 | \code{svm} is set to \code{TRUE}, then SVM uses the variables specified in 93 | \code{L2.x}. Default is \code{NULL}.} 94 | 95 | \item{mrp.include}{Whether to run the MRP classifier. A logical argument 96 | indicating whether the standard MRP classifier should be used for 97 | predicting outcome \code{y}. Passed from \code{autoMrP()} argument 98 | \code{mrp}.} 99 | 100 | \item{n.minobsinnode}{GB minimum number of observations in the terminal 101 | nodes. An integer-valued scalar specifying the minimum number of 102 | observations that each terminal node of the trees must contain. Passed from 103 | \code{autoMrP()} argument \code{gb.n.minobsinnode}.} 104 | 105 | \item{L2.unit.include}{GB L2.unit. A logical argument indicating whether 106 | \code{L2.unit} should be included in the GB classifier. Passed from 107 | \code{autoMrP()} argument \code{gb.L2.unit}.} 108 | 109 | \item{L2.reg.include}{A logical argument indicating whether \code{L2.reg} 110 | should be included in the GB classifier. Passed from \code{autoMrP()} 111 | argument \code{gb.L2.reg}.} 112 | 113 | \item{kernel}{SVM kernel. A character-valued scalar specifying the kernel to 114 | be used by SVM. The possible values are \code{linear}, \code{polynomial}, 115 | \code{radial}, and \code{sigmoid}. Passed from \code{autoMrP()} argument 116 | \code{svm.kernel}.} 117 | 118 | \item{mrp.L2.x}{MRP context-level covariates. A character vector containing 119 | the column names of the context-level variables in \code{survey} and 120 | \code{census} to be used by the MRP classifier. The character vector should 121 | be \emph{empty} if no context-level variables should be used by the MRP 122 | classifier. If \code{NULL} and \code{mrp} is set to \code{TRUE}, then MRP 123 | uses the variables specified in \code{L2.x}. Default is \code{NULL}. Note: 124 | For the empty MrP model, set \code{L2.x = NULL} and \code{mrp.L2.x = ""}.} 125 | 126 | \item{data}{A data.frame containing the survey data used in classifier 127 | training.} 128 | 129 | \item{ebma.fold}{A data.frame containing the data not used in classifier 130 | training.} 131 | 132 | \item{census}{Census data. A \code{data.frame} whose column names include 133 | \code{L1.x}, \code{L2.x}, \code{L2.unit}, if specified, \code{L2.reg} and 134 | \code{pcs}, and either \code{bin.proportion} or \code{bin.size}.} 135 | 136 | \item{verbose}{Verbose output. A logical argument indicating whether or not 137 | verbose output should be printed.
Default is \code{FALSE}.} 138 | 139 | \item{deep.mrp}{Deep MRP classifier. A logical argument indicating whether 140 | the deep MRP classifier should be used for best subset prediction. Setting 141 | \code{deep.mrp = TRUE} will include all interactions of L1.x in the best 142 | subset classifier. Default is \code{FALSE}.} 143 | 144 | \item{deep.splines}{Deep MRP splines. A logical argument indicating whether 145 | splines should be used in the deep MRP classifier. Default is \code{TRUE}.} 146 | } 147 | \description{ 148 | Apply post-stratification to classifiers. 149 | } 150 | -------------------------------------------------------------------------------- /man/predict_glmmLasso.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{predict_glmmLasso} 4 | \alias{predict_glmmLasso} 5 | \title{Predicts on newdata from glmmLasso objects} 6 | \usage{ 7 | predict_glmmLasso( 8 | census, 9 | m, 10 | L1.x, 11 | lasso.L2.x, 12 | L2.unit, 13 | L2.reg, 14 | type = "response" 15 | ) 16 | } 17 | \arguments{ 18 | \item{census}{Census data. A \code{data.frame} whose column names include 19 | \code{L1.x}, \code{L2.x}, \code{L2.unit}, if specified, \code{L2.reg} and 20 | \code{pcs}, and either \code{bin.proportion} or \code{bin.size}.} 21 | 22 | \item{m}{A \code{glmmLasso()} object.} 23 | 24 | \item{L1.x}{Individual-level covariates. A character vector containing the 25 | column names of the individual-level variables in \code{survey} and 26 | \code{census} used to predict outcome \code{y}. Note that geographic unit 27 | is specified in argument \code{L2.unit}.} 28 | 29 | \item{lasso.L2.x}{Lasso context-level covariates. A character vector 30 | containing the column names of the context-level variables in 31 | \code{survey} and \code{census} to be used by the lasso classifier. If 32 | \code{NULL} and \code{lasso} is set to \code{TRUE}, then lasso uses the 33 | variables specified in \code{L2.x}. Default is \code{NULL}.} 34 | 35 | \item{L2.unit}{Geographic unit. A character scalar containing the column 36 | name of the geographic unit in \code{survey} and \code{census} at which 37 | outcomes should be aggregated.} 38 | 39 | \item{L2.reg}{Geographic region. A character scalar containing the column 40 | name of the geographic region in \code{survey} and \code{census} by which 41 | geographic units are grouped (\code{L2.unit} must be nested within 42 | \code{L2.reg}). Default is \code{NULL}.} 43 | } 44 | \value{ 45 | Returns a numeric vector of predictions from a \code{glmmLasso()} 46 | object. 47 | } 48 | \description{ 49 | \code{predict_glmmLasso()} predicts on new data from a \code{glmmLasso} object. 50 | } 51 | -------------------------------------------------------------------------------- /man/quiet.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{quiet} 4 | \alias{quiet} 5 | \title{Suppress cat in external package} 6 | \usage{ 7 | quiet(x) 8 | } 9 | \arguments{ 10 | \item{x}{Input. It can be of any kind.} 11 | } 12 | \description{ 13 | \code{quiet()} suppresses cat output.
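One common way to implement such a helper is to divert console output to a temporary file; the sketch below makes that assumption (the package's actual implementation may differ) and uses the hypothetical name \code{quiet_sketch}:

    # Sketch of a cat-suppressing helper in base R.
    quiet_sketch <- function(x) {
      sink(tempfile())     # redirect console output to a throwaway file
      on.exit(sink())      # restore normal output, even on error
      invisible(force(x))  # evaluate the expression while the sink is active
    }
    quiet_sketch(cat("this message is suppressed\n"))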
14 | } 15 | -------------------------------------------------------------------------------- /man/run_best_subset.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/run_best_subset.R 3 | \name{run_best_subset} 4 | \alias{run_best_subset} 5 | \title{Apply best subset classifier to MrP.} 6 | \usage{ 7 | run_best_subset( 8 | y, 9 | L1.x, 10 | L2.x, 11 | L2.unit, 12 | L2.reg, 13 | loss.unit, 14 | loss.fun, 15 | data, 16 | verbose, 17 | cores 18 | ) 19 | } 20 | \arguments{ 21 | \item{y}{Outcome variable. A character vector containing the column names of 22 | the outcome variable. A character scalar containing the column name of 23 | the outcome variable in \code{survey}.} 24 | 25 | \item{L1.x}{Individual-level covariates. A character vector containing the 26 | column names of the individual-level variables in \code{survey} and 27 | \code{census} used to predict outcome \code{y}. Note that geographic unit 28 | is specified in argument \code{L2.unit}.} 29 | 30 | \item{L2.x}{Context-level covariates. A character vector containing the 31 | column names of the context-level variables in \code{survey} and 32 | \code{census} used to predict outcome \code{y}. To exclude context-level 33 | variables, set \code{L2.x = NULL}.} 34 | 35 | \item{L2.unit}{Geographic unit. A character scalar containing the column 36 | name of the geographic unit in \code{survey} and \code{census} at which 37 | outcomes should be aggregated.} 38 | 39 | \item{L2.reg}{Geographic region. A character scalar containing the column 40 | name of the geographic region in \code{survey} and \code{census} by which 41 | geographic units are grouped (\code{L2.unit} must be nested within 42 | \code{L2.reg}). Default is \code{NULL}.} 43 | 44 | \item{loss.unit}{Loss function unit. A character-valued scalar indicating 45 | whether performance loss should be evaluated at the level of individual 46 | respondents (\code{individuals}), geographic units (\code{L2 units}) or at 47 | both levels. Default is \code{c("individuals", "L2 units")}. With multiple 48 | loss units, parameters are ranked for each loss unit and the loss unit with 49 | the lowest rank sum is chosen. Ties are broken according to the order in 50 | the search grid.} 51 | 52 | \item{loss.fun}{Loss function. A character-valued scalar indicating whether 53 | prediction loss should be measured by the mean squared error (\code{MSE}), 54 | the mean absolute error (\code{MAE}), binary cross-entropy 55 | (\code{cross-entropy}), mean squared false error (\code{msfe}), the f1 56 | score (\code{f1}), or a combination thereof. Default is \code{c("MSE", 57 | "cross-entropy","msfe", "f1")}. With multiple loss functions, parameters 58 | are ranked for each loss function and the parameter combination with the 59 | lowest rank sum is chosen. Ties are broken according to the order in the 60 | search grid.} 61 | 62 | \item{data}{Data for cross-validation. A \code{list} of \eqn{k} 63 | \code{data.frames}, one for each fold to be used in \eqn{k}-fold 64 | cross-validation.} 65 | 66 | \item{verbose}{Verbose output. A logical argument indicating whether or not 67 | verbose output should be printed. Default is \code{FALSE}.} 68 | 69 | \item{cores}{The number of cores to be used. An integer indicating the number 70 | of processor cores used for parallel computing. Default is 1.} 71 | } 72 | \value{ 73 | A model formula of the winning best subset classifier model. 
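For intuition, the candidate set from which this winning formula is chosen contains one formula per subset of the context-level variables (see \code{model_list()}); a sketch with two hypothetical variables, giving 2^2 = 4 candidates:

    # Illustration of a best-subset candidate list; the variable names
    # (age, state, urban, income) are hypothetical.
    l2_sets <- list(character(0), "urban", "income", c("urban", "income"))
    models <- lapply(l2_sets, function(l2) {
      rhs <- paste(c("(1 | age)", "(1 | state)", l2), collapse = " + ")
      as.formula(paste("y ~", rhs))
    })
    models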
74 | } 75 | \description{ 76 | \code{run_best_subset} is a wrapper function that applies the best subset 77 | classifier to a list of models provided by the user, evaluates the models' 78 | prediction performance, and chooses the best-performing model. 79 | } 80 | -------------------------------------------------------------------------------- /man/run_best_subset_mc.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/run_best_subset.R 3 | \name{run_best_subset_mc} 4 | \alias{run_best_subset_mc} 5 | \title{Best subset multicore tuning.} 6 | \usage{ 7 | run_best_subset_mc( 8 | y, 9 | L1.x, 10 | L2.x, 11 | L2.unit, 12 | L2.reg, 13 | loss.unit, 14 | loss.fun, 15 | data, 16 | cores, 17 | models, 18 | verbose 19 | ) 20 | } 21 | \arguments{ 22 | \item{y}{Outcome variable. A character scalar containing the column name of 23 | the outcome variable in \code{survey}.} 24 | 25 | \item{L1.x}{Individual-level covariates. A character vector containing the 26 | column names of the individual-level variables in \code{survey} and 27 | \code{census} used to predict outcome \code{y}. Note that geographic unit 28 | is specified in argument \code{L2.unit}.} 29 | 30 | \item{L2.x}{Context-level covariates. A character vector containing the 31 | column names of the context-level variables in \code{survey} and 32 | \code{census} used to predict outcome \code{y}.} 33 | 34 | \item{L2.unit}{Geographic unit. A character scalar containing the column 35 | name of the geographic unit in \code{survey} and \code{census} at which 36 | outcomes should be aggregated.} 37 | 38 | \item{L2.reg}{Geographic region. A character scalar containing the column 39 | name of the geographic region in \code{survey} and \code{census} by which 40 | geographic units are grouped (\code{L2.unit} must be nested within 41 | \code{L2.reg}). Default is \code{NULL}.} 42 | 43 | \item{loss.unit}{Loss function unit. A character-valued scalar indicating 44 | whether performance loss should be evaluated at the level of individual 45 | respondents (\code{individuals}) or geographic units (\code{L2 units}). 46 | Default is \code{individuals}.} 47 | 48 | \item{loss.fun}{Loss function. A character-valued scalar indicating whether 49 | prediction loss should be measured by the mean squared error (\code{MSE}) 50 | or the mean absolute error (\code{MAE}). Default is \code{MSE}.} 51 | 52 | \item{data}{Data for cross-validation. A \code{list} of \eqn{k} 53 | \code{data.frames}, one for each fold to be used in \eqn{k}-fold 54 | cross-validation.} 55 | 56 | \item{cores}{The number of cores to be used. An integer indicating the number 57 | of processor cores used for parallel computing. Default is 1.} 58 | 59 | \item{models}{The models to perform best subset selection on. A list of model 60 | formulas.} 61 | 62 | \item{verbose}{Verbose output. A logical argument indicating whether or not 63 | verbose output should be printed. Default is \code{TRUE}.} 64 | } 65 | \value{ 66 | The cross-validation errors for all models. A list. 67 | } 68 | \description{ 69 | \code{run_best_subset_mc} is called from within \code{run_best_subset}. It 70 | tunes using multiple cores. 
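A minimal sketch of the parallel pattern these multicore helpers rely on, with a placeholder loop body standing in for the per-model cross-validation work (the cluster size is illustrative):

    # Register a backend, iterate with foreach, then release the cluster.
    library(foreach)
    library(doParallel)
    cl <- parallel::makeCluster(2)      # illustrative core count
    doParallel::registerDoParallel(cl)
    cv_errors <- foreach(i = 1:4, .combine = c) %dopar% {
      i^2  # placeholder for the CV error of model i
    }
    parallel::stopCluster(cl)
    cv_errors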
71 | } 72 | \examples{ 73 | \dontrun{ 74 | # not yet 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /man/run_deep_bs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/run_deep_bs.r 3 | \name{run_deep_bs} 4 | \alias{run_deep_bs} 5 | \title{Apply deep MrP with the best subset classifier to MrP.} 6 | \usage{ 7 | run_deep_bs( 8 | y, 9 | L1.x, 10 | L2.x, 11 | L2.unit, 12 | L2.reg, 13 | loss.unit, 14 | loss.fun, 15 | deep.splines, 16 | data, 17 | k.folds, 18 | verbose, 19 | cores 20 | ) 21 | } 22 | \arguments{ 23 | \item{y}{Outcome variable. A character vector containing the column names of 24 | the outcome variable. A character scalar containing the column name of 25 | the outcome variable in \code{survey}.} 26 | 27 | \item{L1.x}{Individual-level covariates. A character vector containing the 28 | column names of the individual-level variables in \code{survey} and 29 | \code{census} used to predict outcome \code{y}. Note that geographic unit 30 | is specified in argument \code{L2.unit}.} 31 | 32 | \item{L2.x}{Context-level covariates. A character vector containing the 33 | column names of the context-level variables in \code{survey} and 34 | \code{census} used to predict outcome \code{y}. To exclude context-level 35 | variables, set \code{L2.x = NULL}.} 36 | 37 | \item{L2.unit}{Geographic unit. A character scalar containing the column 38 | name of the geographic unit in \code{survey} and \code{census} at which 39 | outcomes should be aggregated.} 40 | 41 | \item{L2.reg}{Geographic region. A character scalar containing the column 42 | name of the geographic region in \code{survey} and \code{census} by which 43 | geographic units are grouped (\code{L2.unit} must be nested within 44 | \code{L2.reg}). Default is \code{NULL}.} 45 | 46 | \item{loss.unit}{Loss function unit. A character-valued scalar indicating 47 | whether performance loss should be evaluated at the level of individual 48 | respondents (\code{individuals}), geographic units (\code{L2 units}) or at 49 | both levels. Default is \code{c("individuals", "L2 units")}. With multiple 50 | loss units, parameters are ranked for each loss unit and the loss unit with 51 | the lowest rank sum is chosen. Ties are broken according to the order in 52 | the search grid.} 53 | 54 | \item{loss.fun}{Loss function. A character-valued scalar indicating whether 55 | prediction loss should be measured by the mean squared error (\code{MSE}), 56 | the mean absolute error (\code{MAE}), binary cross-entropy 57 | (\code{cross-entropy}), mean squared false error (\code{msfe}), the f1 58 | score (\code{f1}), or a combination thereof. Default is \code{c("MSE", 59 | "cross-entropy","msfe", "f1")}. With multiple loss functions, parameters 60 | are ranked for each loss function and the parameter combination with the 61 | lowest rank sum is chosen. Ties are broken according to the order in the 62 | search grid.} 63 | 64 | \item{deep.splines}{Deep MRP splines. A logical argument indicating whether 65 | splines should be used in the deep MRP classifier. Default is \code{TRUE}.} 66 | 67 | \item{data}{Data for cross-validation. A \code{list} of \eqn{k} 68 | \code{data.frames}, one for each fold to be used in \eqn{k}-fold 69 | cross-validation.} 70 | 71 | \item{k.folds}{Number of cross-validation folds. An integer-valued scalar 72 | indicating the number of folds to be used in cross-validation. Default is
Default is
73 | \eqn{5}. \emph{Note:} ignored if \code{folds} is provided, but must be
74 | specified otherwise.}
75 |
76 | \item{verbose}{Verbose output. A logical argument indicating whether or not
77 | verbose output should be printed. Default is \code{FALSE}.}
78 |
79 | \item{cores}{The number of cores to be used. An integer indicating the number
80 | of processor cores used for parallel computing. Default is 1.}
81 | }
82 | \value{
83 | A model formula of the winning best subset classifier model.
84 | }
85 | \description{
86 | \code{run_deep_bs} is a wrapper function that applies the best subset
87 | classifier to a list of models provided by the user, evaluates the models'
88 | prediction performance, and chooses the best-performing model. It differs
89 | from \code{run_best_subset} in that it includes L1.x interactions.
90 | }
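91 | \examples{
92 | \dontrun{
93 | # Illustrative call (assumption: `cv_data` is a list of k-fold
94 | # data.frames); column names follow the example data in autoMrP
95 | deep_form <- run_deep_bs(
96 |   y = "YES", L1.x = c("L1x1", "L1x2", "L1x3"),
97 |   L2.x = c("L2.x1", "L2.x2"), L2.unit = "L2.unit", L2.reg = "region",
98 |   loss.unit = "individuals", loss.fun = "MSE", deep.splines = TRUE,
99 |   data = cv_data, k.folds = 5, verbose = FALSE, cores = 1
100 | )
101 | }
102 | }
103 |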
Default is \code{c("MSE", 58 | "cross-entropy","msfe", "f1")}. With multiple loss functions, parameters 59 | are ranked for each loss function and the parameter combination with the 60 | lowest rank sum is chosen. Ties are broken according to the order in the 61 | search grid.} 62 | 63 | \item{deep.splines}{Deep MRP splines. A logical argument indicating whether 64 | splines should be used in the deep MRP classifier. Default is \code{TRUE}.} 65 | 66 | \item{data}{Data for cross-validation. A \code{list} of \eqn{k} 67 | \code{data.frames}, one for each fold to be used in \eqn{k}-fold 68 | cross-validation.} 69 | 70 | \item{cores}{The number of cores to be used. An integer indicating the number 71 | of processor cores used for parallel computing. Default is 1.} 72 | 73 | \item{verbose}{Verbose output. A logical argument indicating whether or not 74 | verbose output should be printed. Default is \code{FALSE}.} 75 | } 76 | \value{ 77 | A model formula of the winning best subset classifier model. 78 | } 79 | \description{ 80 | \code{run_deep_pca} is a wrapper function that applies the PCA classifier to 81 | data provided by the user, evaluates prediction performance, and chooses the 82 | best-performing model. It differs from \code{run_best_subset} in that it 83 | includes L1.x interactions. 84 | } 85 | -------------------------------------------------------------------------------- /man/run_gb.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/run_gb.R 3 | \name{run_gb} 4 | \alias{run_gb} 5 | \title{Apply gradient boosting classifier to MrP.} 6 | \usage{ 7 | run_gb( 8 | y, 9 | L1.x, 10 | L2.x, 11 | L2.eval.unit, 12 | L2.unit, 13 | L2.reg, 14 | loss.unit, 15 | loss.fun, 16 | interaction.depth, 17 | shrinkage, 18 | n.trees.init, 19 | n.trees.increase, 20 | n.trees.max, 21 | cores = cores, 22 | n.minobsinnode, 23 | data, 24 | verbose 25 | ) 26 | } 27 | \arguments{ 28 | \item{y}{Outcome variable. A character vector containing the column names of 29 | the outcome variable. A character scalar containing the column name of 30 | the outcome variable in \code{survey}.} 31 | 32 | \item{L1.x}{Individual-level covariates. A character vector containing the 33 | column names of the individual-level variables in \code{survey} and 34 | \code{census} used to predict outcome \code{y}. Note that geographic unit 35 | is specified in argument \code{L2.unit}.} 36 | 37 | \item{L2.x}{Context-level covariates. A character vector containing the 38 | column names of the context-level variables in \code{survey} and 39 | \code{census} used to predict outcome \code{y}. To exclude context-level 40 | variables, set \code{L2.x = NULL}.} 41 | 42 | \item{L2.eval.unit}{Geographic unit for the loss function. A character scalar 43 | containing the column name of the geographic unit in \code{survey} and 44 | \code{census}.} 45 | 46 | \item{L2.unit}{Geographic unit. A character scalar containing the column 47 | name of the geographic unit in \code{survey} and \code{census} at which 48 | outcomes should be aggregated.} 49 | 50 | \item{L2.reg}{Geographic region. A character scalar containing the column 51 | name of the geographic region in \code{survey} and \code{census} by which 52 | geographic units are grouped (\code{L2.unit} must be nested within 53 | \code{L2.reg}). Default is \code{NULL}.} 54 | 55 | \item{loss.unit}{Loss function unit. 
A character-valued scalar indicating 56 | whether performance loss should be evaluated at the level of individual 57 | respondents (\code{individuals}) or geographic units (\code{L2 units}). 58 | Default is \code{individuals}.} 59 | 60 | \item{loss.fun}{Loss function. A character-valued scalar indicating whether 61 | prediction loss should be measured by the mean squared error (\code{MSE}) 62 | or the mean absolute error (\code{MAE}). Default is \code{MSE}.} 63 | 64 | \item{interaction.depth}{GB interaction depth. An integer-valued vector 65 | whose values specify the interaction depth of GB. The interaction depth 66 | defines the maximum depth of each tree grown (i.e., the maximum level of 67 | variable interactions). Default is \code{c(1, 2, 3)}.} 68 | 69 | \item{shrinkage}{GB learning rate. A numeric vector whose values specify the 70 | learning rate or step-size reduction of GB. Values between \eqn{0.001} 71 | and \eqn{0.1} usually work, but a smaller learning rate typically requires 72 | more trees. Default is \code{c(0.04, 0.01, 0.008, 0.005, 0.001)}.} 73 | 74 | \item{n.trees.init}{GB initial total number of trees. An integer-valued 75 | scalar specifying the initial number of total trees to fit by GB. Default 76 | is \eqn{50}.} 77 | 78 | \item{n.trees.increase}{GB increase in total number of trees. An 79 | integer-valued scalar specifying by how many trees the total number of 80 | trees to fit should be increased (until \code{n.trees.max} is reached) 81 | or an integer-valued vector of length \code{length(shrinkage)} with each 82 | of its values being associated with a learning rate in \code{shrinkage}. 83 | Default is \eqn{50}.} 84 | 85 | \item{n.trees.max}{GB maximum number of trees. An integer-valued scalar 86 | specifying the maximum number of trees to fit by GB or an integer-valued 87 | vector of length \code{length(shrinkage)} with each of its values being 88 | associated with a learning rate and an increase in the total number of 89 | trees. Default is \eqn{1000}.} 90 | 91 | \item{cores}{The number of cores to be used. An integer indicating the number 92 | of processor cores used for parallel computing. Default is 1.} 93 | 94 | \item{n.minobsinnode}{GB minimum number of observations in the terminal 95 | nodes. An integer-valued scalar specifying the minimum number of 96 | observations that each terminal node of the trees must contain. Default is 97 | \eqn{5}.} 98 | 99 | \item{data}{Data for cross-validation. A \code{list} of \eqn{k} 100 | \code{data.frames}, one for each fold to be used in \eqn{k}-fold 101 | cross-validation.} 102 | 103 | \item{verbose}{Verbose output. A logical argument indicating whether or not 104 | verbose output should be printed. Default is \code{TRUE}.} 105 | } 106 | \value{ 107 | The tuned gradient boosting parameters. A list with three elements: 108 | \code{interaction_depth} contains the interaction depth parameter, 109 | \code{shrinkage} contains the learning rate, \code{n_trees} the number of 110 | trees to be grown. 111 | } 112 | \description{ 113 | \code{run_gb} is a wrapper function that applies the gradient boosting 114 | classifier to data provided by the user, evaluates prediction performance, 115 | and chooses the best-performing model. 
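116 | }
117 | \examples{
118 | \dontrun{
119 | # Illustrative call (assumption: `cv_data` is a list of k-fold
120 | # data.frames); tuning grids mirror the documented defaults
121 | gb_params <- run_gb(
122 |   y = "YES", L1.x = c("L1x1", "L1x2", "L1x3"),
123 |   L2.x = c("L2.x1", "L2.x2"), L2.eval.unit = "L2.unit",
124 |   L2.unit = "L2.unit", L2.reg = "region",
125 |   loss.unit = "individuals", loss.fun = "MSE",
126 |   interaction.depth = c(1, 2, 3),
127 |   shrinkage = c(0.04, 0.01, 0.008, 0.005, 0.001),
128 |   n.trees.init = 50, n.trees.increase = 50, n.trees.max = 1000,
129 |   cores = 1, n.minobsinnode = 5, data = cv_data, verbose = TRUE
130 | )
131 | }
132 | }
133 |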
-------------------------------------------------------------------------------- /man/run_gb_mc.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/run_gb.R
3 | \name{run_gb_mc}
4 | \alias{run_gb_mc}
5 | \title{GB multicore tuning.}
6 | \usage{
7 | run_gb_mc(
8 |   y,
9 |   L1.x,
10 |   L2.eval.unit,
11 |   L2.unit,
12 |   L2.reg,
13 |   form,
14 |   gb.grid,
15 |   n.minobsinnode,
16 |   loss.unit,
17 |   loss.fun,
18 |   data,
19 |   cores
20 | )
21 | }
22 | \arguments{
23 | \item{y}{Outcome variable. A character scalar
24 | containing the column name of the outcome
25 | variable in \code{survey}.}
26 |
27 | \item{L1.x}{Individual-level covariates. A character vector containing the
28 | column names of the individual-level variables in \code{survey} and
29 | \code{census} used to predict outcome \code{y}. Note that geographic unit
30 | is specified in argument \code{L2.unit}.}
31 |
32 | \item{L2.eval.unit}{Geographic unit for the loss function. A character scalar
33 | containing the column name of the geographic unit in \code{survey} and
34 | \code{census}.}
35 |
36 | \item{L2.unit}{Geographic unit. A character scalar containing the column
37 | name of the geographic unit in \code{survey} and \code{census} at which
38 | outcomes should be aggregated.}
39 |
40 | \item{L2.reg}{Geographic region. A character scalar containing the column
41 | name of the geographic region in \code{survey} and \code{census} by which
42 | geographic units are grouped (\code{L2.unit} must be nested within
43 | \code{L2.reg}). Default is \code{NULL}.}
44 |
45 | \item{form}{The model formula. A formula object.}
46 |
47 | \item{gb.grid}{The hyper-parameter search grid. A matrix of all
48 | hyper-parameter combinations.}
49 |
50 | \item{n.minobsinnode}{GB minimum number of observations in the terminal
51 | nodes. An integer-valued scalar specifying the minimum number of
52 | observations that each terminal node of the trees must contain. Default is
53 | \eqn{5}.}
54 |
55 | \item{loss.unit}{Loss function unit. A character-valued scalar indicating
56 | whether performance loss should be evaluated at the level of individual
57 | respondents (\code{individuals}) or geographic units (\code{L2 units}).
58 | Default is \code{individuals}.}
59 |
60 | \item{loss.fun}{Loss function. A character-valued scalar indicating whether
61 | prediction loss should be measured by the mean squared error (\code{MSE})
62 | or the mean absolute error (\code{MAE}). Default is \code{MSE}.}
63 |
64 | \item{data}{Data for cross-validation. A \code{list} of \eqn{k}
65 | \code{data.frames}, one for each fold to be used in \eqn{k}-fold
66 | cross-validation.}
67 |
68 | \item{cores}{The number of cores to be used. An integer indicating the number
69 | of processor cores used for parallel computing. Default is 1.}
70 | }
71 | \value{
72 | The tuning parameter combinations and their associated loss function
73 | scores. A list.
74 | }
75 | \description{
76 | \code{run_gb_mc} is called from within \code{run_gb}. It tunes using
77 | multiple cores.
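78 | }
79 | \examples{
80 | \dontrun{
81 | # Illustrative call (assumption: `form`, `gb_grid`, and `cv_data` are
82 | # prepared by the caller, as done inside run_gb)
83 | cv_errors <- run_gb_mc(
84 |   y = "YES", L1.x = c("L1x1", "L1x2", "L1x3"),
85 |   L2.eval.unit = "L2.unit", L2.unit = "L2.unit", L2.reg = "region",
86 |   form = form, gb.grid = gb_grid, n.minobsinnode = 5,
87 |   loss.unit = "individuals", loss.fun = "MSE", data = cv_data, cores = 2
88 | )
89 | }
90 | }
91 |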
-------------------------------------------------------------------------------- /man/run_lasso.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/run_lasso.R
3 | \name{run_lasso}
4 | \alias{run_lasso}
5 | \title{Apply lasso classifier to MrP.}
6 | \usage{
7 | run_lasso(
8 |   y,
9 |   L1.x,
10 |   L2.x,
11 |   L2.unit,
12 |   L2.reg,
13 |   n.iter,
14 |   loss.unit,
15 |   loss.fun,
16 |   lambda,
17 |   data,
18 |   verbose,
19 |   cores
20 | )
21 | }
22 | \arguments{
23 | \item{y}{Outcome variable. A character scalar
24 | containing the column name of the outcome
25 | variable in \code{survey}.}
26 |
27 | \item{L1.x}{Individual-level covariates. A character vector containing the
28 | column names of the individual-level variables in \code{survey} and
29 | \code{census} used to predict outcome \code{y}. Note that geographic unit
30 | is specified in argument \code{L2.unit}.}
31 |
32 | \item{L2.x}{Context-level covariates. A character vector containing the
33 | column names of the context-level variables in \code{survey} and
34 | \code{census} used to predict outcome \code{y}. To exclude context-level
35 | variables, set \code{L2.x = NULL}.}
36 |
37 | \item{L2.unit}{Geographic unit. A character scalar containing the column
38 | name of the geographic unit in \code{survey} and \code{census} at which
39 | outcomes should be aggregated.}
40 |
41 | \item{L2.reg}{Geographic region. A character scalar containing the column
42 | name of the geographic region in \code{survey} and \code{census} by which
43 | geographic units are grouped (\code{L2.unit} must be nested within
44 | \code{L2.reg}). Default is \code{NULL}.}
45 |
46 | \item{n.iter}{Lasso number of lambda values. An integer-valued scalar
47 | specifying the number of lambda values to search over. Default is
48 | \eqn{100}.
49 | \emph{Note:} ignored if a vector of \code{lambda} values is
50 | provided.}
51 |
52 | \item{loss.unit}{Loss function unit. A character-valued scalar indicating
53 | whether performance loss should be evaluated at the level of individual
54 | respondents (\code{individuals}), geographic units (\code{L2 units}) or at
55 | both levels. Default is \code{c("individuals", "L2 units")}. With multiple
56 | loss units, parameters are ranked for each loss unit and the loss unit with
57 | the lowest rank sum is chosen. Ties are broken according to the order in
58 | the search grid.}
59 |
60 | \item{loss.fun}{Loss function. A character-valued scalar indicating whether
61 | prediction loss should be measured by the mean squared error (\code{MSE}),
62 | the mean absolute error (\code{MAE}), binary cross-entropy
63 | (\code{cross-entropy}), mean squared false error (\code{msfe}), the f1
64 | score (\code{f1}), or a combination thereof. Default is \code{c("MSE",
65 | "cross-entropy","msfe", "f1")}. With multiple loss functions, parameters
66 | are ranked for each loss function and the parameter combination with the
67 | lowest rank sum is chosen. Ties are broken according to the order in the
68 | search grid.}
69 |
70 | \item{lambda}{Lasso penalty parameter. A numeric \code{vector} of
71 | non-negative values. The penalty parameter controls the shrinkage of the
72 | context-level variables in the lasso model. Default is a sequence with
73 | minimum 0.1 and maximum 250 that is equally spaced on the log-scale. The
74 | number of values is controlled by the \code{n.iter} parameter.}
75 |
76 | \item{data}{Data for cross-validation. A \code{list} of \eqn{k}
77 | \code{data.frames}, one for each fold to be used in \eqn{k}-fold
78 | cross-validation.}
79 |
80 | \item{verbose}{Verbose output. A logical argument indicating whether or not
81 | verbose output should be printed. Default is \code{FALSE}.}
82 |
83 | \item{cores}{The number of cores to be used. An integer indicating the number
84 | of processor cores used for parallel computing. Default is 1.}
85 | }
86 | \value{
87 | The tuned lambda value. A numeric scalar.
88 | }
89 | \description{
90 | \code{run_lasso} is a wrapper function that applies the lasso classifier to
91 | data provided by the user, evaluates prediction performance, and chooses the
92 | best-performing model.
93 | }
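94 | \examples{
95 | \dontrun{
96 | # Illustrative call (assumption: `cv_data` is a list of k-fold
97 | # data.frames); lambda mirrors the documented default search sequence
98 | lambda_star <- run_lasso(
99 |   y = "YES", L1.x = c("L1x1", "L1x2", "L1x3"),
100 |   L2.x = c("L2.x1", "L2.x2"), L2.unit = "L2.unit", L2.reg = "region",
101 |   n.iter = 100, loss.unit = "individuals", loss.fun = "MSE",
102 |   lambda = exp(seq(log(0.1), log(250), length.out = 100)),
103 |   data = cv_data, verbose = FALSE, cores = 1
104 | )
105 | }
106 | }
107 |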
-------------------------------------------------------------------------------- /man/run_lasso_mc_lambda.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/run_lasso.R
3 | \name{run_lasso_mc_lambda}
4 | \alias{run_lasso_mc_lambda}
5 | \title{Lasso multicore tuning.}
6 | \usage{
7 | run_lasso_mc_lambda(
8 |   y,
9 |   L1.x,
10 |   L2.x,
11 |   L2.unit,
12 |   L2.reg,
13 |   loss.unit,
14 |   loss.fun,
15 |   data,
16 |   cores,
17 |   L2.fe.form,
18 |   L1.re,
19 |   lambda
20 | )
21 | }
22 | \arguments{
23 | \item{y}{Outcome variable. A character scalar
24 | containing the column name of the outcome
25 | variable in \code{survey}.}
26 |
27 | \item{L1.x}{Individual-level covariates. A character vector containing the
28 | column names of the individual-level variables in \code{survey} and
29 | \code{census} used to predict outcome \code{y}. Note that geographic unit
30 | is specified in argument \code{L2.unit}.}
31 |
32 | \item{L2.x}{Context-level covariates. A character vector containing the
33 | column names of the context-level variables in \code{survey} and
34 | \code{census} used to predict outcome \code{y}. To exclude context-level
35 | variables, set \code{L2.x = NULL}.}
36 |
37 | \item{L2.unit}{Geographic unit. A character scalar containing the column
38 | name of the geographic unit in \code{survey} and \code{census} at which
39 | outcomes should be aggregated.}
40 |
41 | \item{L2.reg}{Geographic region. A character scalar containing the column
42 | name of the geographic region in \code{survey} and \code{census} by which
43 | geographic units are grouped (\code{L2.unit} must be nested within
44 | \code{L2.reg}). Default is \code{NULL}.}
45 |
46 | \item{loss.unit}{Loss function unit. A character-valued scalar indicating
47 | whether performance loss should be evaluated at the level of individual
48 | respondents (\code{individuals}), geographic units (\code{L2 units}) or at
49 | both levels. Default is \code{c("individuals", "L2 units")}. With multiple
50 | loss units, parameters are ranked for each loss unit and the loss unit with
51 | the lowest rank sum is chosen. Ties are broken according to the order in
52 | the search grid.}
53 |
54 | \item{loss.fun}{Loss function. A character-valued scalar indicating whether
55 | prediction loss should be measured by the mean squared error (\code{MSE}),
56 | the mean absolute error (\code{MAE}), binary cross-entropy
57 | (\code{cross-entropy}), mean squared false error (\code{msfe}), the f1
58 | score (\code{f1}), or a combination thereof.
Default is \code{c("MSE", 59 | "cross-entropy","msfe", "f1")}. With multiple loss functions, parameters 60 | are ranked for each loss function and the parameter combination with the 61 | lowest rank sum is chosen. Ties are broken according to the order in the 62 | search grid.} 63 | 64 | \item{data}{Data for cross-validation. A \code{list} of \eqn{k} 65 | \code{data.frames}, one for each fold to be used in \eqn{k}-fold 66 | cross-validation.} 67 | 68 | \item{cores}{The number of cores to be used. An integer indicating the number 69 | of processor cores used for parallel computing. Default is 1.} 70 | 71 | \item{L2.fe.form}{The fixed effects part of the Lasso classifier formula. The 72 | formula is inherited from \code{run_lasso}.} 73 | 74 | \item{L1.re}{A list of random effects for the Lasso classifier formula. The 75 | formula is inherited from \code{run_lasso}.} 76 | 77 | \item{lambda}{Lasso penalty parameter. A numeric \code{vector} of 78 | non-negative values. The penalty parameter controls the shrinkage of the 79 | context-level variables in the lasso model. Default is a sequence with 80 | minimum 0.1 and maximum 250 that is equally spaced on the log-scale. The 81 | number of values is controlled by the \code{lasso.n.iter} parameter.} 82 | } 83 | \value{ 84 | The cross-validation errors for all models. A list. 85 | } 86 | \description{ 87 | \code{run_lasso_mc_lambda} is called from within \code{run_lasso}. It 88 | tunes using multiple cores. 89 | } 90 | -------------------------------------------------------------------------------- /man/run_pca.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/run_pca.R 3 | \name{run_pca} 4 | \alias{run_pca} 5 | \title{Apply PCA classifier to MrP.} 6 | \usage{ 7 | run_pca( 8 | y, 9 | L1.x, 10 | L2.x, 11 | L2.unit, 12 | L2.reg, 13 | loss.unit, 14 | loss.fun, 15 | data, 16 | cores, 17 | verbose 18 | ) 19 | } 20 | \arguments{ 21 | \item{y}{Outcome variable. A character vector containing the column names of 22 | the outcome variable. A character scalar containing the column name of 23 | the outcome variable in \code{survey}.} 24 | 25 | \item{L1.x}{Individual-level covariates. A character vector containing the 26 | column names of the individual-level variables in \code{survey} and 27 | \code{census} used to predict outcome \code{y}. Note that geographic unit 28 | is specified in argument \code{L2.unit}.} 29 | 30 | \item{L2.x}{Context-level covariates. A character vector containing the 31 | column names of the context-level variables in \code{survey} and 32 | \code{census} used to predict outcome \code{y}. To exclude context-level 33 | variables, set \code{L2.x = NULL}.} 34 | 35 | \item{L2.unit}{Geographic unit. A character scalar containing the column 36 | name of the geographic unit in \code{survey} and \code{census} at which 37 | outcomes should be aggregated.} 38 | 39 | \item{L2.reg}{Geographic region. A character scalar containing the column 40 | name of the geographic region in \code{survey} and \code{census} by which 41 | geographic units are grouped (\code{L2.unit} must be nested within 42 | \code{L2.reg}). Default is \code{NULL}.} 43 | 44 | \item{loss.unit}{Loss function unit. A character-valued scalar indicating 45 | whether performance loss should be evaluated at the level of individual 46 | respondents (\code{individuals}), geographic units (\code{L2 units}) or at 47 | both levels. 
Default is \code{c("individuals", "L2 units")}. With multiple 48 | loss units, parameters are ranked for each loss unit and the loss unit with 49 | the lowest rank sum is chosen. Ties are broken according to the order in 50 | the search grid.} 51 | 52 | \item{loss.fun}{Loss function. A character-valued scalar indicating whether 53 | prediction loss should be measured by the mean squared error (\code{MSE}), 54 | the mean absolute error (\code{MAE}), binary cross-entropy 55 | (\code{cross-entropy}), mean squared false error (\code{msfe}), the f1 56 | score (\code{f1}), or a combination thereof. Default is \code{c("MSE", 57 | "cross-entropy","msfe", "f1")}. With multiple loss functions, parameters 58 | are ranked for each loss function and the parameter combination with the 59 | lowest rank sum is chosen. Ties are broken according to the order in the 60 | search grid.} 61 | 62 | \item{data}{Data for cross-validation. A \code{list} of \eqn{k} 63 | \code{data.frames}, one for each fold to be used in \eqn{k}-fold 64 | cross-validation.} 65 | 66 | \item{cores}{The number of cores to be used. An integer indicating the number 67 | of processor cores used for parallel computing. Default is 1.} 68 | 69 | \item{verbose}{Verbose output. A logical argument indicating whether or not 70 | verbose output should be printed. Default is \code{FALSE}.} 71 | } 72 | \value{ 73 | A model formula of the winning best subset classifier model. 74 | } 75 | \description{ 76 | \code{run_pca} is a wrapper function that applies the PCA classifier to data 77 | provided by the user, evaluates prediction performance, and chooses the 78 | best-performing model. 79 | } 80 | -------------------------------------------------------------------------------- /man/run_svm.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/run_svm.R 3 | \name{run_svm} 4 | \alias{run_svm} 5 | \title{Apply support vector machine classifier to MrP.} 6 | \usage{ 7 | run_svm( 8 | y, 9 | L1.x, 10 | L2.x, 11 | L2.eval.unit, 12 | L2.unit, 13 | L2.reg, 14 | kernel = "radial", 15 | loss.fun, 16 | loss.unit, 17 | gamma, 18 | cost, 19 | data, 20 | verbose, 21 | cores 22 | ) 23 | } 24 | \arguments{ 25 | \item{y}{Outcome variable. A character vector containing the column names of 26 | the outcome variable. A character scalar containing the column name of 27 | the outcome variable in \code{survey}.} 28 | 29 | \item{L1.x}{Individual-level covariates. A character vector containing the 30 | column names of the individual-level variables in \code{survey} and 31 | \code{census} used to predict outcome \code{y}. Note that geographic unit 32 | is specified in argument \code{L2.unit}.} 33 | 34 | \item{L2.x}{Context-level covariates. A character vector containing the 35 | column names of the context-level variables in \code{survey} and 36 | \code{census} used to predict outcome \code{y}. To exclude context-level 37 | variables, set \code{L2.x = NULL}.} 38 | 39 | \item{L2.eval.unit}{Geographic unit for the loss function. A character scalar 40 | containing the column name of the geographic unit in \code{survey} and 41 | \code{census}.} 42 | 43 | \item{L2.unit}{Geographic unit. A character scalar containing the column 44 | name of the geographic unit in \code{survey} and \code{census} at which 45 | outcomes should be aggregated.} 46 | 47 | \item{L2.reg}{Geographic region. 
-------------------------------------------------------------------------------- /man/run_svm.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/run_svm.R
3 | \name{run_svm}
4 | \alias{run_svm}
5 | \title{Apply support vector machine classifier to MrP.}
6 | \usage{
7 | run_svm(
8 |   y,
9 |   L1.x,
10 |   L2.x,
11 |   L2.eval.unit,
12 |   L2.unit,
13 |   L2.reg,
14 |   kernel = "radial",
15 |   loss.fun,
16 |   loss.unit,
17 |   gamma,
18 |   cost,
19 |   data,
20 |   verbose,
21 |   cores
22 | )
23 | }
24 | \arguments{
25 | \item{y}{Outcome variable. A character scalar
26 | containing the column name of the outcome
27 | variable in \code{survey}.}
28 |
29 | \item{L1.x}{Individual-level covariates. A character vector containing the
30 | column names of the individual-level variables in \code{survey} and
31 | \code{census} used to predict outcome \code{y}. Note that geographic unit
32 | is specified in argument \code{L2.unit}.}
33 |
34 | \item{L2.x}{Context-level covariates. A character vector containing the
35 | column names of the context-level variables in \code{survey} and
36 | \code{census} used to predict outcome \code{y}. To exclude context-level
37 | variables, set \code{L2.x = NULL}.}
38 |
39 | \item{L2.eval.unit}{Geographic unit for the loss function. A character scalar
40 | containing the column name of the geographic unit in \code{survey} and
41 | \code{census}.}
42 |
43 | \item{L2.unit}{Geographic unit. A character scalar containing the column
44 | name of the geographic unit in \code{survey} and \code{census} at which
45 | outcomes should be aggregated.}
46 |
47 | \item{L2.reg}{Geographic region. A character scalar containing the column
48 | name of the geographic region in \code{survey} and \code{census} by which
49 | geographic units are grouped (\code{L2.unit} must be nested within
50 | \code{L2.reg}). Default is \code{NULL}.}
51 |
52 | \item{kernel}{SVM kernel. A character-valued scalar specifying the kernel to
53 | be used by SVM. The possible values are \code{linear}, \code{polynomial},
54 | \code{radial}, and \code{sigmoid}. Default is \code{radial}.}
55 |
56 | \item{loss.fun}{Loss function. A character-valued scalar indicating whether
57 | prediction loss should be measured by the mean squared error (\code{MSE})
58 | or the mean absolute error (\code{MAE}). Default is \code{MSE}.}
59 |
60 | \item{loss.unit}{Loss function unit. A character-valued scalar indicating
61 | whether performance loss should be evaluated at the level of individual
62 | respondents (\code{individuals}), geographic units (\code{L2 units}) or at
63 | both levels. Default is \code{c("individuals", "L2 units")}. With multiple
64 | loss units, parameters are ranked for each loss unit and the loss unit with
65 | the lowest rank sum is chosen. Ties are broken according to the order in
66 | the search grid.}
67 |
68 | \item{gamma}{SVM kernel parameter. A numeric vector whose values specify the
69 | gamma parameter in the SVM kernel. This parameter is needed for all kernel
70 | types except linear. Default is a sequence with minimum = 1e-5, maximum =
71 | 1e-1, and length = 20 that is equally spaced on the log-scale.}
72 |
73 | \item{cost}{SVM cost parameter. A numeric vector whose values specify the
74 | cost of constraints violation in SVM. Default is a sequence with minimum =
75 | 0.5, maximum = 10, and length = 5 that is equally spaced on the log-scale.}
76 |
77 | \item{data}{Data for cross-validation. A \code{list} of \eqn{k}
78 | \code{data.frames}, one for each fold to be used in \eqn{k}-fold
79 | cross-validation.}
80 |
81 | \item{verbose}{Verbose output. A logical argument indicating whether or not
82 | verbose output should be printed. Default is \code{FALSE}.}
83 |
84 | \item{cores}{The number of cores to be used. An integer indicating the number
85 | of processor cores used for parallel computing. Default is 1.}
86 | }
87 | \value{
88 | The tuned support vector machine parameters. A list.
89 | }
90 | \description{
91 | \code{run_svm} is a wrapper function that applies the support vector machine
92 | classifier to data provided by the user, evaluates prediction performance,
93 | and chooses the best-performing model.
94 | }
95 |
-------------------------------------------------------------------------------- /man/run_svm_mc.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/run_svm.R
3 | \name{run_svm_mc}
4 | \alias{run_svm_mc}
5 | \title{SVM multicore tuning.}
6 | \usage{
7 | run_svm_mc(
8 |   y,
9 |   L1.x,
10 |   L2.x,
11 |   L2.eval.unit,
12 |   L2.unit,
13 |   L2.reg,
14 |   form,
15 |   loss.unit,
16 |   loss.fun,
17 |   data,
18 |   cores,
19 |   svm.grid,
20 |   verbose
21 | )
22 | }
23 | \arguments{
24 | \item{y}{Outcome variable. A character scalar
25 | containing the column name of the outcome
26 | variable in \code{survey}.}
27 |
28 | \item{L1.x}{Individual-level covariates.
A character vector containing the 29 | column names of the individual-level variables in \code{survey} and 30 | \code{census} used to predict outcome \code{y}. Note that geographic unit 31 | is specified in argument \code{L2.unit}.} 32 | 33 | \item{L2.x}{Context-level covariates. A character vector containing the 34 | column names of the context-level variables in \code{survey} and 35 | \code{census} used to predict outcome \code{y}. To exclude context-level 36 | variables, set \code{L2.x = NULL}.} 37 | 38 | \item{L2.eval.unit}{Geographic unit for the loss function. A character scalar 39 | containing the column name of the geographic unit in \code{survey} and 40 | \code{census}.} 41 | 42 | \item{L2.unit}{Geographic unit. A character scalar containing the column 43 | name of the geographic unit in \code{survey} and \code{census} at which 44 | outcomes should be aggregated.} 45 | 46 | \item{L2.reg}{Geographic region. A character scalar containing the column 47 | name of the geographic region in \code{survey} and \code{census} by which 48 | geographic units are grouped (\code{L2.unit} must be nested within 49 | \code{L2.reg}). Default is \code{NULL}.} 50 | 51 | \item{form}{The model formula. A formula object.} 52 | 53 | \item{loss.unit}{Loss function unit. A character-valued scalar indicating 54 | whether performance loss should be evaluated at the level of individual 55 | respondents (\code{individuals}), geographic units (\code{L2 units}) or at 56 | both levels. Default is \code{c("individuals", "L2 units")}. With multiple 57 | loss units, parameters are ranked for each loss unit and the loss unit with 58 | the lowest rank sum is chosen. Ties are broken according to the order in 59 | the search grid.} 60 | 61 | \item{loss.fun}{Loss function. A character-valued scalar indicating whether 62 | prediction loss should be measured by the mean squared error (\code{MSE}) 63 | or the mean absolute error (\code{MAE}). Default is \code{MSE}.} 64 | 65 | \item{data}{Data for cross-validation. A \code{list} of \eqn{k} 66 | \code{data.frames}, one for each fold to be used in \eqn{k}-fold 67 | cross-validation.} 68 | 69 | \item{cores}{The number of cores to be used. An integer indicating the number 70 | of processor cores used for parallel computing. Default is 1.} 71 | 72 | \item{svm.grid}{The hyper-parameter search grid. A matrix of all 73 | hyper-parameter combinations.} 74 | 75 | \item{verbose}{Verbose output. A logical argument indicating whether or not 76 | verbose output should be printed. Default is \code{FALSE}.} 77 | } 78 | \value{ 79 | The cross-validation errors for all models. A list. 80 | } 81 | \description{ 82 | \code{run_svm_mc} is called from within \code{run_svm}. It tunes using 83 | multiple cores. 84 | } 85 | -------------------------------------------------------------------------------- /man/summary.autoMrP.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{summary.autoMrP} 4 | \alias{summary.autoMrP} 5 | \title{A summary method for autoMrP objects.} 6 | \usage{ 7 | \method{summary}{autoMrP}( 8 | object, 9 | ci.lvl = 0.95, 10 | digits = 4, 11 | format = "simple", 12 | classifiers = NULL, 13 | n = 10, 14 | ... 15 | ) 16 | } 17 | \arguments{ 18 | \item{object}{An \code{autoMrP()} object for which a summary is desired.} 19 | 20 | \item{ci.lvl}{The level of the confidence intervals. A proportion. Default is 21 | \code{0.95}. 
Confidence intervals are based on bootstrapped estimates and
22 | will not be printed if bootstrapping was not carried out.}
23 |
24 | \item{digits}{The number of digits to be displayed. An integer scalar.
25 | Default is \code{4}.}
26 |
27 | \item{format}{The table format. A character string passed to
28 | \code{\link[knitr]{kable}}. Default is \code{simple}.}
29 |
30 | \item{classifiers}{Summarize a single classifier. A character string. Must be
31 | one of \code{best_subset}, \code{lasso}, \code{pca}, \code{gb}, \code{svm},
32 | or \code{mrp}. Default is \code{NULL}.}
33 |
34 | \item{n}{Number of rows to be printed. An integer scalar. Default is
35 | \code{10}.}
36 |
37 | \item{...}{Additional arguments affecting the summary produced.}
38 | }
39 | \value{
40 | No return value, prints a summary of the context-level preference
41 | estimates to the console.
42 | }
43 | \description{
44 | \code{summary.autoMrP()} prints a summary of the context-level preference
45 | estimates contained in an \code{autoMrP} object.
46 | }
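47 | \examples{
48 | \dontrun{
49 | # Illustrative workflow (assumption: arguments follow the example data
50 | # shipped with autoMrP); fit autoMrP, then summarize the estimates
51 | fit <- auto_MrP(
52 |   y = "YES", L1.x = c("L1x1", "L1x2", "L1x3"),
53 |   L2.x = c("L2.x1", "L2.x2"), L2.unit = "L2.unit",
54 |   survey = taxes_survey, census = taxes_census
55 | )
56 | summary(fit, ci.lvl = 0.95, digits = 4, n = 10)
57 | }
58 | }
59 |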
-------------------------------------------------------------------------------- /man/survey_item.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/survey_data.R
3 | \docType{data}
4 | \name{survey_item}
5 | \alias{survey_item}
6 | \title{A sample of a survey item from the CCES 2008}
7 | \format{
8 | A data frame with 1500 rows and 13 variables:
9 | \describe{
10 | \item{YES}{1 if individual supports use of troops; 0 otherwise}
11 | \item{L1x1}{Age group (four categories: 1 = 18-29; 2 = 30-44; 3 = 45-64; 4 = 65+)}
12 | \item{L1x2}{Education level (four categories: 1 = < high school; 2 = high school graduate; 3 = some college; 4 = college graduate)}
13 | \item{L1x3}{Gender-race combination (six categories: 1 = white male; 2 = black male; 3 = hispanic male; 4 = white female; 5 = black female; 6 = hispanic female)}
14 | \item{state}{U.S. state}
15 | \item{L2.unit}{U.S. state id}
16 | \item{region}{U.S. region (four categories: 1 = Northeast; 2 = Midwest; 3 = South; 4 = West)}
17 | \item{L2.x1}{Normalized state-level share of votes for the Republican candidate in the previous presidential election}
18 | \item{L2.x2}{Normalized state-level percentage of Evangelical Protestant or Mormon respondents}
19 | \item{L2.x3}{Normalized state-level percentage of the population living in urban areas}
20 | \item{L2.x4}{Normalized state-level unemployment rate}
21 | \item{L2.x5}{Normalized state-level share of Hispanics}
22 | \item{L2.x6}{Normalized state-level share of Whites}
23 | }
24 | }
25 | \source{
26 | The data set (excluding L2.x3, L2.x4, L2.x5, L2.x6) is taken from the
27 | article: Buttice, Matthew K, and Benjamin Highton. 2013. "How does
28 | multilevel regression and poststratification perform with
29 | conventional national surveys?" Political Analysis 21(4): 449-467. It is a
30 | random sample with at least 5 respondents per state. L2.x3, L2.x4,
31 | L2.x5 and L2.x6 are available at \url{https://www.census.gov}.
32 | }
33 | \usage{
34 | survey_item
35 | }
36 | \description{
37 | The Cooperative Congressional Election Studies (CCES) item (cc418_1) asked:
38 | "Would you approve of the use of U.S. military troops in order to ensure the
39 | supply of oil?" The original 2008 CCES item contains 36,832 respondents. This
40 | sample mimics a typical national survey. It contains at least 5 respondents
41 | from each state but is otherwise a random sample.
42 | }
43 | \keyword{datasets}
44 |
-------------------------------------------------------------------------------- /man/svm_classifier.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/svm_classifier.R
3 | \name{svm_classifier}
4 | \alias{svm_classifier}
5 | \title{SVM classifier}
6 | \usage{
7 | svm_classifier(
8 |   y,
9 |   form,
10 |   data,
11 |   kernel,
12 |   type,
13 |   probability,
14 |   svm.gamma,
15 |   svm.cost,
16 |   verbose = c(TRUE, FALSE)
17 | )
18 | }
19 | \arguments{
20 | \item{y}{Outcome variable. A character scalar
21 | containing the column name of the outcome
22 | variable in \code{survey}.}
23 |
24 | \item{form}{Model formula. A two-sided linear formula describing
25 | the model to be fit, with the outcome on the LHS and the covariates
26 | separated by + operators on the RHS.}
27 |
28 | \item{data}{Data. A data.frame containing the cross-validation data used to
29 | train and evaluate the model.}
30 |
31 | \item{kernel}{Kernel for SVM. A character string specifying the kernel to
32 | be used for SVM. The possible types are linear, polynomial, radial, and
33 | sigmoid. Default is radial.}
34 |
35 | \item{type}{svm can be used as a classification machine, as a regression
36 | machine, or for novelty detection. Depending on whether y is a factor or
37 | not, the default setting for type is C-classification or eps-regression,
38 | respectively, but may be overwritten by setting an explicit value. Valid
39 | options are: \enumerate{
40 | \item C-classification
41 | \item nu-classification
42 | \item one-classification (for novelty detection)
43 | \item eps-regression
44 | \item nu-regression
45 | }}
46 |
47 | \item{probability}{Probability predictions. A logical argument indicating
48 | whether the model should allow for probability predictions.}
49 |
50 | \item{svm.gamma}{Gamma parameter for SVM. This parameter is needed for all
51 | kernels except linear.}
52 |
53 | \item{svm.cost}{Cost parameter for SVM. This parameter specifies the cost of
54 | constraints violation.}
55 |
56 | \item{verbose}{Verbose output. A logical argument indicating whether or not
57 | verbose output should be printed.}
58 | }
59 | \value{
60 | The support vector machine model. An \code{\link[e1071]{svm}} object.
61 | }
62 | \description{
63 | \code{svm_classifier} applies support vector machine classification to a
64 | data set.
65 | }
66 |
-------------------------------------------------------------------------------- /man/taxes_census.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/taxes_census.R
3 | \docType{data}
4 | \name{taxes_census}
5 | \alias{taxes_census}
6 | \title{Quasi census data.}
7 | \format{
8 | A data frame with 2934 rows and 13 variables:
9 | \describe{
10 | \item{state}{U.S. state}
11 | \item{L2.unit}{U.S. state id}
12 | \item{region}{U.S.
region (four categories: 1 = Northeast; 2 = Midwest; 3 = South; 4 = West)}
13 | \item{L1x1}{Age group (four categories)}
14 | \item{L1x2}{Education level (four categories)}
15 | \item{L1x3}{Gender-race combination (six categories)}
16 | \item{freq}{State-level frequency of ideal type}
17 | \item{proportion}{State-level proportion of respondents of that ideal type in the population}
18 | \item{L2.x1}{State-level share of votes for the Republican candidate in the previous presidential election}
19 | \item{L2.x2}{State-level percentage of Evangelical Protestant or Mormon respondents}
20 | \item{L2.x3}{State-level percentage of the population living in urban areas}
21 | \item{L2.x4}{State-level unemployment rate}
22 | \item{L2.x5}{State-level share of Hispanics}
23 | \item{L2.x6}{State-level share of Whites}
24 | }
25 | }
26 | \source{
27 | The data set (excluding L2.x3, L2.x4, L2.x5, L2.x6) is taken from the
28 | article: Buttice, Matthew K, and Benjamin Highton. 2013. "How does
29 | multilevel regression and poststratification perform with
30 | conventional national surveys?" Political Analysis 21(4): 449-467. L2.x3,
31 | L2.x4, L2.x5 and L2.x6 are available at
32 | \url{https://www.census.gov}.
33 | }
34 | \usage{
35 | data(taxes_census)
36 | }
37 | \description{
38 | The census file is generated from the full 2008 National Annenberg Election
39 | Studies item CBb01 by disaggregating the 96 ideal type combinations of the
40 | individual-level variables L1x1, L1x2 and L1x3. A row is an ideal type in a
41 | given state.
42 | }
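43 | \examples{
44 | # Inspect the quasi census data (illustrative)
45 | data(taxes_census)
46 | str(taxes_census)
47 | }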
48 | \keyword{datasets}
49 |
-------------------------------------------------------------------------------- /man/taxes_survey.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/taxes_survey.R, R/taxes_truth.R
3 | \docType{data}
4 | \name{taxes_survey}
5 | \alias{taxes_survey}
6 | \title{Sample on raising taxes from the 2008 National Annenberg Election Studies.}
7 | \format{
8 | A data frame with 1500 rows and 13 variables:
9 | \describe{
10 | \item{YES}{1 if individual supports raising taxes; 0 otherwise}
11 | \item{L1x1}{Age group (four categories: 1 = 18-29; 2 = 30-44; 3 = 45-64; 4 = 65+)}
12 | \item{L1x2}{Education level (four categories: 1 = < high school; 2 = high school graduate; 3 = some college; 4 = college graduate)}
13 | \item{L1x3}{Gender-race combination (six categories: 1 = white male; 2 = black male; 3 = hispanic male; 4 = white female; 5 = black female; 6 = hispanic female)}
14 | \item{state}{U.S. state}
15 | \item{L2.unit}{U.S. state id}
16 | \item{region}{U.S. region (four categories: 1 = Northeast; 2 = Midwest; 3 = South; 4 = West)}
17 | \item{L2.x1}{State-level share of votes for the Republican candidate in the previous presidential election}
18 | \item{L2.x2}{State-level percentage of Evangelical Protestant or Mormon respondents}
19 | \item{L2.x3}{State-level percentage of the population living in urban areas}
20 | \item{L2.x4}{State-level unemployment rate}
21 | \item{L2.x5}{State-level share of Hispanics}
22 | \item{L2.x6}{State-level share of Whites}
23 | }
24 | }
25 | \source{
26 | The data set (excluding L2.x3, L2.x4, L2.x5, L2.x6) is taken from the
27 | article: Buttice, Matthew K, and Benjamin Highton. 2013. "How does
28 | multilevel regression and poststratification perform with
29 | conventional national surveys?" Political Analysis 21(4): 449-467. It is a
30 | random sample with at least 5 respondents per state. L2.x3, L2.x4,
31 | L2.x5 and L2.x6 are available at \url{https://www.census.gov}.
32 | }
33 | \usage{
34 | data(taxes_survey)
35 | }
36 | \description{
37 | The 2008 National Annenberg Election Studies (NAES) item (CBb01) asked: "I'm
38 | going to read you some options about federal income taxes. Please tell me
39 | which one comes closest to your view on what we should be doing about federal
40 | income taxes: (1) Cut taxes; (2) Keep taxes as they are; (3) Raise taxes if
41 | necessary; (4) None of these; (998) Don't know; (999) No answer." Category (3)
42 | was turned into a 'raise taxes' response, categories (1) and (2) were
43 | combined into a 'do not raise taxes' response. The original item from the
44 | phone and online surveys contains 50,483 respondents. This sample mimics a
45 | typical national survey. It contains at least 5 respondents from each state
46 | but is otherwise a random sample.
47 | }
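48 | \examples{
49 | # Inspect the sample and the overall share of support (illustrative)
50 | data(taxes_survey)
51 | mean(taxes_survey$YES)
52 | }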
53 | \keyword{datasets}
54 |
-------------------------------------------------------------------------------- /vignettes/autoMrP_vignette.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retowuest/autoMrP/1da422652719263acb41d4d3ca04e83d1bcf32f9/vignettes/autoMrP_vignette.pdf
-------------------------------------------------------------------------------- /vignettes/autoMrP_vignette.pdf.asis: --------------------------------------------------------------------------------
1 | %\VignetteIndexEntry{autoMrP: Multilevel Models and Post-Stratification (MrP) Combined with Machine Learning in R}
2 | %\VignetteEngine{R.rsp::asis}
--------------------------------------------------------------------------------