├── .Rbuildignore ├── .gitignore ├── .vscode └── launch.json ├── DESCRIPTION ├── NAMESPACE ├── NEWS.md ├── R ├── absentee_census.R ├── absentee_voting.R ├── apply_stack_weights.r ├── auto_mrp.R ├── best_subset_classifier.R ├── boot_auto_mrp.R ├── build_folds.R ├── census_data.R ├── deep_mrp_classifier.r ├── ebma.R ├── gb_classifier.R ├── get_predictions.r ├── globals.R ├── lasso_classifier.R ├── post_stratification.R ├── run_best_subset.R ├── run_classifiers.R ├── run_deep_bs.r ├── run_deep_pca.r ├── run_gb.R ├── run_lasso.R ├── run_pca.R ├── run_svm.R ├── stacking_weights.r ├── survey_data.R ├── svm_classifier.R ├── taxes_census.R ├── taxes_survey.R ├── taxes_truth.R └── utils.R ├── README.md ├── autoMrP.Rproj ├── data ├── absentee_census.RData ├── absentee_voting.RData ├── census.RData ├── survey_item.RData ├── taxes_census.RData └── taxes_survey.RData ├── man ├── .Rapp.history ├── absentee_census.Rd ├── absentee_voting.Rd ├── auto_MrP.Rd ├── best_subset_classifier.Rd ├── binary_cross_entropy.Rd ├── boot_auto_mrp.Rd ├── census.Rd ├── cv_folding.Rd ├── deep_mrp_classifier.Rd ├── ebma.Rd ├── ebma_folding.Rd ├── ebma_mc_draws.Rd ├── ebma_mc_tol.Rd ├── error_checks.Rd ├── f1_score.Rd ├── gb_classifier.Rd ├── gb_classifier_update.Rd ├── lasso_classifier.Rd ├── log_spaced.Rd ├── loss_function.Rd ├── loss_score_ranking.Rd ├── mean_absolute_error.Rd ├── mean_squared_error.Rd ├── mean_squared_false_error.Rd ├── model_list.Rd ├── model_list_pca.Rd ├── multicore.Rd ├── output_table.Rd ├── plot.autoMrP.Rd ├── post_stratification.Rd ├── predict_glmmLasso.Rd ├── quiet.Rd ├── run_best_subset.Rd ├── run_best_subset_mc.Rd ├── run_classifiers.Rd ├── run_deep_bs.Rd ├── run_deep_pca.Rd ├── run_gb.Rd ├── run_gb_mc.Rd ├── run_lasso.Rd ├── run_lasso_mc_lambda.Rd ├── run_pca.Rd ├── run_svm.Rd ├── run_svm_mc.Rd ├── summary.autoMrP.Rd ├── survey_item.Rd ├── svm_classifier.Rd ├── taxes_census.Rd └── taxes_survey.Rd └── vignettes ├── autoMrP_vignette.pdf └── autoMrP_vignette.pdf.asis /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^renv$ 2 | ^renv\.lock$ 3 | ^packrat/ 4 | ^\.Rprofile$ 5 | ^.*\.Rproj$ 6 | ^\.Rproj\.user$ 7 | ^make-data\.R$ 8 | ^Meta$ 9 | ^doc$ 10 | ^\.vscode$ 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .gitignore 2 | packrat/lib*/ 3 | .Rproj.user 4 | .vscode 5 | .Rhistory 6 | packrat/src/ 7 | testing/ 8 | inst/doc 9 | doc 10 | Meta 11 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | 8 | { 9 | "type": "R-Debugger", 10 | "name": "Launch R-Workspace", 11 | "request": "launch", 12 | "debugMode": "workspace", 13 | "workingDirectory": "${workspaceFolder}" 14 | }, 15 | { 16 | "type": "R-Debugger", 17 | "name": "Debug R-File", 18 | "request": "launch", 19 | "debugMode": "file", 20 | "workingDirectory": "${workspaceFolder}", 21 | "file": "${file}" 22 | }, 23 | { 24 | "type": "R-Debugger", 25 | "name": "Debug R-Function", 26 | "request": "launch", 27 | "debugMode": "function", 28 | "workingDirectory": "${workspaceFolder}", 29 | "file": "${file}", 30 | "mainFunction": "main", 31 | "allowGlobalDebugging": false 32 | }, 33 | { 34 | "type": "R-Debugger", 35 | "name": "Debug R-Package", 36 | "request": "launch", 37 | "debugMode": "workspace", 38 | "workingDirectory": "${workspaceFolder}", 39 | "includePackageScopes": true, 40 | "loadPackages": [ 41 | "." 42 | ] 43 | }, 44 | { 45 | "type": "R-Debugger", 46 | "request": "attach", 47 | "name": "Attach to R process", 48 | "splitOverwrittenOutput": true 49 | } 50 | ] 51 | } -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: autoMrP 2 | Type: Package 3 | Title: Improving MrP with Ensemble Learning 4 | Version: 1.1.1 5 | Authors@R: c( 6 | person( 7 | given = "Reto", 8 | family = "Wüest", 9 | role = c("aut"), 10 | email = "wuest.reto@gmail.com", 11 | comment = c(ORCID = "0000-0002-7502-6489")), 12 | person( 13 | given = "Lucas", 14 | family = "Leemann", 15 | role = c("aut"), 16 | email = "leemann@ipz.uzh.ch", 17 | comment = c(ORCID = "0000-0001-5201-869X")), 18 | person( 19 | given = "Florian", 20 | family = "Schaffner", 21 | role = c("aut"), 22 | email = "schaffner@ipz.uzh.ch", 23 | comment = c(ORCID = "0000-0003-3352-6191")), 24 | person( 25 | given = "Philipp", 26 | family = "Broniecki", 27 | role = c("aut", "cre"), 28 | email = "philippbroniecki@gmail.com", 29 | comment = c(ORCID = "0000-0001-9214-4404")), 30 | person( 31 | given = "Hadley", 32 | family = "Wickham", 33 | role = "ctb", 34 | email = "hadley@rstudio.com")) 35 | Description: A tool that improves the prediction performance of multilevel 36 | regression with post-stratification (MrP) by combining a number of machine 37 | learning methods. For information on the method, please refer to Broniecki, 38 | Wüest, Leemann (2020) ''Improving Multilevel Regression with 39 | Post-Stratification Through Machine Learning (autoMrP)'' in the 40 | 'Journal of Politics'. Final pre-print version: 41 | .
42 | URL: https://github.com/retowuest/autoMrP 43 | BugReports: https://github.com/retowuest/autoMrP/issues 44 | Depends: R (>= 3.6) 45 | Imports: 46 | rlang (>= 0.4.5), dplyr (>= 1.0.2), lme4 (>= 1.1), gbm (>= 2.1.5), 47 | e1071 (>= 1.7-3), tibble (>= 3.0.1), glmmLasso (>= 1.5.1), 48 | EBMAforecast (>= 1.0.0), foreach (>= 1.5.0), doParallel (>= 1.0.15), 49 | doRNG (>= 1.8.2), ggplot2 (>= 3.3.2), knitr (>= 1.29), tidyr (>= 1.1.2), 50 | purrr (>= 0.3.4), forcats (>= 0.5.1), vglmer (>= 1.0.3), stringr (>= 1.5.0), 51 | R.rsp (>= 0.46.0), nloptr (>= 2.1.1), quadprog (>= 1.5-8), cli (>= 3.6.3) 52 | Suggests: R.rsp 53 | License: GPL-3 54 | Encoding: UTF-8 55 | LazyData: true 56 | RoxygenNote: 7.3.2 57 | VignetteBuilder: R.rsp 58 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(plot,autoMrP) 4 | S3method(summary,autoMrP) 5 | export(auto_MrP) 6 | export(plot.autoMrP) 7 | export(summary.autoMrP) 8 | importFrom(doRNG,"%dorng%") 9 | importFrom(dplyr,"%>%") 10 | importFrom(foreach,"%dopar%") 11 | importFrom(rlang,.data) 12 | importFrom(stats,as.formula) 13 | importFrom(stats,binomial) 14 | importFrom(stats,median) 15 | importFrom(stats,predict) 16 | importFrom(stats,sd) 17 | importFrom(stats,setNames) 18 | importFrom(stats,weighted.mean) 19 | importFrom(utils,combn) 20 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # autoMrP 1.1.0 2 | 3 | + implements stacking 4 | 5 | # autoMrP 1.0.6 6 | 7 | + implements Deep MrP by Goplerud as presented in https://doi.org/10.1017/S0003055423000035 8 | + Set argument deep.mrp = TRUE to include Deep MrP in the ensemble 9 | 10 | # autoMrP 1.0.5 11 | 12 | + drops missing values on y, L1.x, L2.x, L2.unit, L2.reg. Missing values on the DV would previously lead to errors in SVM 13 | + works with continuous DV. 14 | 15 | # autoMrP 0.93 16 | 17 | + block sampling in bootstrapping instead of state-stratified sampling 18 | 19 | # autoMrP 0.91 20 | 21 | + bootstrapping returns GB prediction 22 | + predictions do not fail if census data contains more factor levels than training data for SVM and Lasso 23 | + svm post-stratification uses the user-specified formula instead of all information 24 | + lasso post-stratification uses correct user-specified context level variables if L2.x and lasso.L2.x differ 25 | + parallel processing loops are replicable now 26 | -------------------------------------------------------------------------------- /R/absentee_census.R: -------------------------------------------------------------------------------- 1 | #' Quasi census data. 2 | #' 3 | #' The census file is generated from the full 2008 Cooperative Congressional Election Studies 4 | #' item cc419_1 by disaggregating the 64 ideal type combinations of the individual-level variables 5 | #' L1x1, L1x2 and L1x3. A row is an ideal type in a given state. 6 | #' 7 | #' 8 | #' @format A data frame with 2934 rows and 13 variables: 9 | #' \describe{ 10 | #' \item{state}{U.S. state} 11 | #' \item{L2.unit}{U.S. state id} 12 | #' \item{region}{U.S. 
region (four categories: 1 = Northeast; 2 = Midwest; 3 = South; 4 = West)} 13 | #' \item{L1x1}{Age group (four categories)} 14 | #' \item{L1x2}{Education level (four categories)} 15 | #' \item{L1x3}{Gender-race combination (six categories)} 16 | #' \item{proportion}{State-level proportion of respondents of that ideal type in the population} 17 | #' \item{L2.x1}{State-level share of votes for the Republican candidate in the previous presidential election} 18 | #' \item{L2.x2}{State-level percentage of Evangelical Protestant or Mormon respondents} 19 | #' \item{L2.x3}{State-level percentage of the population living in urban areas} 20 | #' \item{L2.x4}{State-level unemployment rate} 21 | #' \item{L2.x5}{State-level share of Hispanics} 22 | #' \item{L2.x6}{State-level share of Whites} 23 | #' } 24 | #' @usage data(absentee_census) 25 | #' @source The data set (excluding L2.x3, L2.x4, L2.x5, L2.x6) is taken from the 26 | #' article: Buttice, Matthew K, and Benjamin Highton. 2013. "How does 27 | #' multilevel regression and poststratification perform with 28 | #' conventional national surveys?" Political Analysis 21(4): 449-467. L2.x3, 29 | #' L2.x4, L2.x5 and L2.x6 are available at 30 | #' \url{https://www.census.gov}. 31 | "absentee_census" 32 | -------------------------------------------------------------------------------- /R/absentee_voting.R: -------------------------------------------------------------------------------- 1 | #' A sample of the absentee voting item from the CCES 2008 2 | #' 3 | #' The Cooperative Congressional Election Studies (CCES) item (cc419_1) asked: 4 | #' "States have tried many new ways to run elections in recent years. Do you 5 | #' support or oppose any of the following ways of voting or conducting elections 6 | #' in your state? Election Reform - Allow absentee voting over the Internet?" 7 | #' The original 2008 CCES item contains 26,934 respondents. This sample mimics a 8 | #' typical national survey. It contains at least 5 respondents from each state 9 | #' but is otherwise a random sample. 10 | #' 11 | #' @format A data frame with 1500 rows and 13 variables: 12 | #' \describe{ 13 | #' \item{YES}{1 if individual supports absentee voting over the Internet; 0 otherwise} 14 | #' \item{L1x1}{Age group (four categories: 1 = 18-29; 2 = 30-44; 3 = 45-64; 4 = 65+)} 15 | #' \item{L1x2}{Education level (four categories: 1 = < high school; 2 = high school graduate; 3 = some college; 4 = college graduate)} 16 | #' \item{L1x3}{Gender-race combination (six categories: 1 = white male; 2 = black male; 3 = hispanic male; 4 = white female; 5 = black female; 6 = hispanic female)} 17 | #' \item{state}{U.S. state} 18 | #' \item{L2.unit}{U.S. state id} 19 | #' \item{region}{U.S. region (four categories: 1 = Northeast; 2 = Midwest; 3 = South; 4 = West)} 20 | #' \item{L2.x1}{State-level share of votes for the Republican candidate in the previous presidential election} 21 | #' \item{L2.x2}{State-level percentage of Evangelical Protestant or Mormon respondents} 22 | #' \item{L2.x3}{State-level percentage of the population living in urban areas} 23 | #' \item{L2.x4}{State-level unemployment rate} 24 | #' \item{L2.x5}{State-level share of Hispanics} 25 | #' \item{L2.x6}{State-level share of Whites} 26 | #' } 27 | #' @usage data(absentee_voting) 28 | #' @source The data set (excluding L2.x3, L2.x4, L2.x5, L2.x6) is taken from the 29 | #' article: Buttice, Matthew K, and Benjamin Highton. 2013. 
"How does 30 | #' multilevel regression and poststrat-stratification perform with 31 | #' conventional national surveys?" Political Analysis 21(4): 449-467. It is a 32 | #' random sample with at least 5 respondents per state. L2.x3, L2.x3, L2.x4, 33 | #' L2.x5 and L2.x6 are available at \url{https://www.census.gov}. 34 | "absentee_voting" 35 | -------------------------------------------------------------------------------- /R/apply_stack_weights.r: -------------------------------------------------------------------------------- 1 | apply_stack_weights <- function(ebma_out, stack_out, L2.unit, preds_all, y) { 2 | 3 | # initial binding of globals 4 | ebma_preds <- NULL 5 | 6 | # check whether stacking weights were calculated 7 | if (all(is.null(stack_out$stack_preds))) { 8 | 9 | # generate an object for individual level predicitions 10 | individual_level_predictions <- preds_all %>% 11 | dplyr::rename(!!rlang::sym(y) := y) 12 | 13 | # output object 14 | ebma_out <- list( 15 | ebma = ebma_out$ebma, 16 | classifiers = ebma_out$classifiers, 17 | weights = ebma_out$weights, 18 | stacking = "Stacking step skipped (only 1 classifier run)", 19 | stacking_weights = "Stacking step skipped (only 1 classifier run)", 20 | individual_level_predictions = individual_level_predictions 21 | ) 22 | 23 | } else { 24 | 25 | # 1) non-negative least squares stack 26 | nnls_stack <- as.matrix(ebma_out$classifiers[, -1]) %*% 27 | stack_out$stack_weights$stack_nnls 28 | 29 | # generate stack predictions 30 | stacked_preds <- tibble::tibble( 31 | !!rlang::sym(L2.unit) := dplyr::pull( 32 | .data = ebma_out$classifiers, var = 1 33 | ), 34 | stack_nnls = as.numeric(nnls_stack) 35 | ) 36 | 37 | # 2) optim with constraints stack 38 | optim_stack <- as.matrix(ebma_out$classifiers[, -1]) %*% 39 | stack_out$stack_weights$stack_optim 40 | 41 | # add optim stack to stack predictions 42 | stacked_preds <- stacked_preds %>% 43 | dplyr::mutate(stack_optim = as.numeric(optim_stack)) 44 | 45 | # 3) quadratic programming stack 46 | qp_stack <- as.matrix(ebma_out$classifiers[, -1]) %*% 47 | stack_out$stack_weights$stack_qp 48 | 49 | # add qp stack to stack predictions 50 | stacked_preds <- stacked_preds %>% 51 | dplyr::mutate(stack_qp = as.numeric(qp_stack)) 52 | 53 | # 4) ornstein stack 54 | ornstein_stack <- as.matrix(ebma_out$classifiers[, -1]) %*% 55 | stack_out$stack_weights$stack_ornstein 56 | 57 | # add ornstein stack to stack predictions 58 | stacked_preds <- stacked_preds %>% 59 | dplyr::mutate(stack_ornstein = as.numeric(ornstein_stack)) 60 | 61 | # 5) stack of stacks 62 | stack_of_stacks <- as.matrix(stacked_preds[, -1]) %*% 63 | stack_out$stack_weights$stack_of_stacks 64 | 65 | # add stack of stacks to stack predictions 66 | stacked_preds <- stacked_preds %>% 67 | dplyr::mutate(stack_of_stacks = as.numeric(stack_of_stacks)) 68 | 69 | # 6) stack of stacks with ebma 70 | stack_of_stacks_ebma <- as.matrix( 71 | cbind(stacked_preds[, "stack_of_stacks"], ebma_out$ebma[, "ebma"]) 72 | ) %*% 73 | stack_out$stack_weights$stack_of_stacks_ebma 74 | 75 | # add stack of stacks with ebma to stack predictions 76 | stacked_preds <- stacked_preds %>% 77 | dplyr::mutate(stack_of_stacks_ebma = as.numeric(stack_of_stacks_ebma)) 78 | 79 | # generate an object for individual level predicitions 80 | individual_level_predictions <- preds_all %>% 81 | dplyr::mutate( 82 | ebma = ebma_out$individual_level_predictions %>% 83 | dplyr::pull(var = ebma_preds) 84 | ) %>% 85 | dplyr::bind_cols( 86 | stack_out$stack_preds %>% 87 | dplyr::select(-id, -y, 
-dplyr::all_of(L2.unit)) 88 | ) %>% 89 | dplyr::rename(!!rlang::sym(y) := y) 90 | 91 | # combine everything 92 | ebma_out <- list( 93 | ebma = ebma_out$ebma, 94 | classifiers = ebma_out$classifiers, 95 | weights = ebma_out$weights, 96 | stacking = stacked_preds, 97 | stacking_weights = stack_out$stack_weights, 98 | individual_level_predictions = individual_level_predictions 99 | ) 100 | } 101 | 102 | return(ebma_out) 103 | } -------------------------------------------------------------------------------- /R/best_subset_classifier.R: -------------------------------------------------------------------------------- 1 | #' Best subset classifier 2 | #' 3 | #' \code{best_subset_classifier} applies best subset classification to a data 4 | #' set. 5 | #' 6 | #' @inheritParams auto_MrP 7 | #' @param model Multilevel model. A model formula describing the multilevel 8 | #' model to be estimated on the basis of the provided training data. 9 | #' @param data.train Training data. A data.frame containing the training data 10 | #' used to train the model. 11 | #' @param model.family Model family. A variable indicating the model family 12 | #' to be used by glmer. Defaults to binomial(link = "probit"). 13 | #' @param model.optimizer Optimization method. A character-valued scalar 14 | #' describing the optimization method to be used by glmer. Defaults to 15 | #' "bobyqa". 16 | #' @param n.iter Iterations. An integer-valued scalar specifying the maximum 17 | #' number of function evaluations tried by the optimization method. 18 | #' @param verbose Verbose output. A logical vector indicating whether or not 19 | #' verbose output should be printed. 20 | #' @return The multilevel model. A \code{\link[lme4]{glmer}} object. 21 | 22 | best_subset_classifier <- function( 23 | model, data.train, model.family, model.optimizer, n.iter, y, 24 | verbose = c(TRUE, FALSE) 25 | ) { 26 | 27 | # Determine type of dependent variable 28 | if ( 29 | data.train %>% 30 | dplyr::pull(!!y) %>% 31 | unique() %>% 32 | length() == 2 33 | ) { 34 | dv_type <- "binary" 35 | } else { 36 | dv_type <- "continuous" 37 | } 38 | 39 | # Train model on training data 40 | if (isTRUE(verbose == TRUE)) { 41 | # DV type 42 | if (dv_type == "continuous") { 43 | out <- lme4::lmer( 44 | formula = model, 45 | data = data.train, 46 | ) 47 | } else { 48 | # optimizer 49 | if (model.optimizer == "bobyqa") { 50 | out <- lme4::glmer( 51 | formula = model, 52 | data = data.train, 53 | family = model.family, 54 | lme4::glmerControl( 55 | optimizer = model.optimizer, 56 | optCtrl = list(maxfun = n.iter) 57 | ) 58 | ) 59 | } else if (model.optimizer == "nloptwrap") { 60 | out <- lme4::glmer( 61 | formula = model, 62 | data = data.train, 63 | family = model.family, 64 | lme4::glmerControl( 65 | calc.derivs = FALSE, 66 | optimizer = model.optimizer, 67 | optCtrl = list( 68 | method = "NLOPT_LN_NELDERMEAD", 69 | starttests = TRUE, kkt = TRUE 70 | ) 71 | ) 72 | ) 73 | } 74 | } 75 | } else { 76 | # DV type 77 | if (dv_type == "continuous") { 78 | out <- suppressMessages(suppressWarnings( 79 | lme4::lmer( 80 | formula = model, 81 | data = data.train, 82 | ) 83 | )) 84 | } else { 85 | # optimizer 86 | if (model.optimizer == "bobyqa") { 87 | out <- suppressMessages(suppressWarnings( 88 | lme4::glmer( 89 | formula = model, 90 | data = data.train, 91 | family = model.family, 92 | lme4::glmerControl( 93 | optimizer = model.optimizer, 94 | optCtrl = list(maxfun = n.iter) 95 | ) 96 | ) 97 | )) 98 | } else if (model.optimizer == "nloptwrap") { 99 | out <- 
suppressMessages(suppressWarnings( 100 | lme4::glmer( 101 | formula = model, 102 | data = data.train, 103 | family = model.family, 104 | lme4::glmerControl( 105 | calc.derivs = FALSE, 106 | optimizer = model.optimizer, 107 | optCtrl = list( 108 | method = "NLOPT_LN_NELDERMEAD", 109 | starttests = TRUE, 110 | kkt = TRUE 111 | ) 112 | ) 113 | ) 114 | )) 115 | } 116 | } 117 | } 118 | 119 | # Function output 120 | return(out) 121 | } 122 | -------------------------------------------------------------------------------- /R/boot_auto_mrp.R: -------------------------------------------------------------------------------- 1 | #' Bootstrapping wrapper for auto_mrp 2 | #' 3 | #' \code{boot_auto_mrp} estimates uncertainty for auto_mrp via bootstrapping. 4 | #' 5 | #' @inheritParams auto_MrP 6 | #' @param pc.names A character vector of the principal component variable names 7 | #' in the data. 8 | 9 | boot_auto_mrp <- function( 10 | y, L1.x, L2.x, mrp.L2.x, L2.unit, L2.reg, L2.x.scale, pcs, folds, 11 | bin.proportion, bin.size, survey, census, ebma.size, k.folds, cv.sampling, 12 | loss.unit, loss.fun, best.subset, lasso, pca, gb, svm, mrp, deep.mrp, 13 | best.subset.L2.x, lasso.L2.x, pca.L2.x, pc.names, gb.L2.x, svm.L2.x, 14 | svm.L2.unit, svm.L2.reg, gb.L2.unit, gb.L2.reg, deep.splines, lasso.lambda, 15 | lasso.n.iter, gb.interaction.depth, gb.shrinkage, gb.n.trees.init, 16 | gb.n.trees.increase, gb.n.trees.max, gb.n.minobsinnode, svm.kernel, svm.gamma, 17 | svm.cost, ebma.tol, boot.iter, cores 18 | ) { 19 | 20 | # Binding for global variables 21 | `%>%` <- dplyr::`%>%` 22 | 23 | # Register cores 24 | cl <- multicore(cores = cores, type = "open", cl = NULL) 25 | 26 | # Bootstrap iterations 27 | boot_out <- foreach::foreach( 28 | idx_boot = 1:boot.iter, .packages = "autoMrP" 29 | ) %dorng% { 30 | 31 | boot_mrp <- boot_fun( 32 | y = y, 33 | L1.x = L1.x, 34 | L2.x = L2.x, 35 | mrp.L2.x = mrp.L2.x, 36 | L2.unit = L2.unit, 37 | L2.reg = L2.reg, 38 | pcs = pcs, 39 | folds = folds, 40 | survey = survey, 41 | census = census, 42 | k.folds = k.folds, 43 | cv.sampling = cv.sampling, 44 | ebma.size = ebma.size, 45 | loss.unit = loss.unit, 46 | loss.fun = loss.fun, 47 | best.subset = best.subset, 48 | lasso = lasso, 49 | pca = pca, 50 | gb = gb, 51 | svm = svm, 52 | mrp = mrp, 53 | deep.mrp = deep.mrp, 54 | best.subset.L2.x = best.subset.L2.x, 55 | lasso.L2.x = lasso.L2.x, 56 | pca.L2.x = pca.L2.x, 57 | pc.names = pc.names, 58 | gb.L2.x = gb.L2.x, 59 | svm.L2.x = svm.L2.x, 60 | svm.L2.unit = svm.L2.unit, 61 | svm.L2.reg = svm.L2.reg, 62 | gb.L2.unit = gb.L2.unit, 63 | gb.L2.reg = gb.L2.reg, 64 | deep.splines = deep.splines, 65 | lasso.lambda = lasso.lambda, 66 | lasso.n.iter = lasso.n.iter, 67 | gb.interaction.depth = gb.interaction.depth, 68 | gb.shrinkage = gb.shrinkage, 69 | gb.n.trees.init = gb.n.trees.init, 70 | gb.n.trees.increase = gb.n.trees.increase, 71 | gb.n.trees.max = gb.n.trees.max, 72 | gb.n.minobsinnode = gb.n.minobsinnode, 73 | svm.kernel = svm.kernel, 74 | svm.gamma = svm.gamma, 75 | svm.cost = svm.cost, 76 | ebma.tol = ebma.tol 77 | ) 78 | } # end of foreach loop 79 | 80 | # Median and standard deviation of EBMA estimates 81 | if (!any( 82 | boot_out[[1]]$ebma == "EBMA step skipped (only 1 classifier run)" 83 | )) { 84 | ebma <- base::do.call( 85 | base::rbind, base::do.call(base::rbind, boot_out)[, "ebma"] 86 | ) 87 | 88 | # weights 89 | weights <- base::do.call( 90 | base::rbind, base::do.call(base::rbind, boot_out)[, "weights"] 91 | ) %>% 92 | dplyr::as_tibble() %>% 93 | dplyr::select( 94 | 
contains("best_subset"), 95 | contains("pca"), 96 | contains("lasso"), 97 | contains("gb"), 98 | contains("svm"), 99 | contains("mrp") 100 | ) 101 | 102 | } else { 103 | ebma <- "EBMA step skipped (only 1 classifier run)" 104 | weights <- NULL 105 | } 106 | 107 | # Median and standard deviations for classifier estimates 108 | classifiers <- base::do.call( 109 | base::rbind, base::do.call(base::rbind, boot_out)[, "classifiers"] 110 | ) %>% 111 | dplyr::select( 112 | one_of(L2.unit), 113 | contains("best_subset"), 114 | contains("pca"), 115 | contains("lasso"), 116 | contains("gb"), 117 | contains("svm"), 118 | contains("mrp") 119 | ) 120 | 121 | if (!is.null(weights)) { 122 | boot_out <- list(ebma = ebma, classifiers = classifiers, weights = weights) 123 | } else { 124 | boot_out <- list(ebma = ebma, classifiers = classifiers) 125 | } 126 | 127 | # De-register cluster 128 | multicore(cores = cores, type = "close", cl = cl) 129 | 130 | return(boot_out) 131 | 132 | } 133 | -------------------------------------------------------------------------------- /R/build_folds.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | build_folds <- function(survey, 4 | L2.unit, 5 | ebma.size = 1/3, 6 | k.folds = 5, 7 | cv.sampling = "L2 units") { 8 | # EBMA hold-out fold 9 | ebma.size <- round(nrow(survey) * ebma.size, digits = 0) 10 | 11 | if(ebma.size > 0) { 12 | ebma_folding_out <- ebma_folding(data = survey, 13 | L2.unit = L2.unit, 14 | ebma.size = ebma.size) 15 | ebma_fold <- ebma_folding_out$ebma_fold 16 | cv_data <- ebma_folding_out$cv_data 17 | } else{ 18 | ebma_fold <- NULL 19 | cv_data <- survey 20 | } 21 | 22 | # K folds for cross-validation 23 | cv_folds <- cv_folding(data = cv_data, 24 | L2.unit = L2.unit, 25 | k.folds = k.folds, 26 | cv.sampling = cv.sampling) 27 | } 28 | -------------------------------------------------------------------------------- /R/census_data.R: -------------------------------------------------------------------------------- 1 | #' Quasi census data. 2 | #' 3 | #' The census file is generated from the full 2008 Cooperative Congressional Election Studies 4 | #' item cc418_1 by dissaggregating the 64 ideal type combinations of the individual level variables 5 | #' L1x1, L2x2 and L1x3. A row is an ideal type in a given state. 6 | #' 7 | #' 8 | #' @format A data frame with 2934 rows and 13 variables: 9 | #' \describe{ 10 | #' \item{state}{U.S. state} 11 | #' \item{L2.unit}{U.S. state id} 12 | #' \item{region}{U.S. region (four categories: 1 = Northeast; 2 = Midwest; 3 = South; 4 = West)} 13 | #' \item{L1x1}{Age group (four categories)} 14 | #' \item{L1x2}{Education level (four categories)} 15 | #' \item{L1x3}{Gender-race combination (six categories)} 16 | #' \item{proportion}{State-level proportion of respondents of that ideal type in the population} 17 | #' \item{L2.x1}{State-level share of votes for the Republican candidate in the previous presidential election} 18 | #' \item{L2.x2}{State-level percentage of Evangelical Protestant or Mormon respondents} 19 | #' \item{L2.x3}{State-level percentage of the population living in urban areas} 20 | #' \item{L2.x4}{State-level unemployment rate} 21 | #' \item{L2.x5}{State-level share of Hispanics} 22 | #' \item{L2.x6}{State-level share of Whites} 23 | #' } 24 | #' @source The data set (excluding L2.x3, L2.x4, L2.x5, L2.x6) is taken from the 25 | #' article: Buttice, Matthew K, and Benjamin Highton. 2013. 
"How does 26 | #' multilevel regression and poststrat-stratification perform with 27 | #' conventional national surveys?" Political Analysis 21(4): 449-467. L2.x3, 28 | #' L2.x3, L2.x4, L2.x5 and L2.x6 are available at 29 | #' \url{https://www.census.gov}. 30 | "census" 31 | -------------------------------------------------------------------------------- /R/deep_mrp_classifier.r: -------------------------------------------------------------------------------- 1 | #' Deep MrP classifier 2 | #' 3 | #' \code{deep_mrp_classifier} applies Deep MrP implemented in the \pkg{vglmer} 4 | #' package to a data set. 5 | #' 6 | #' @inheritParams auto_MrP 7 | #' @param form Model formula. A two-sided linear formula describing 8 | #' the model to be fit, with the outcome on the LHS and the covariates 9 | #' separated by + operators on the RHS. 10 | #' @param data Data. A data.frame containing the data used to train the model. 11 | #' @return A Deep MrP model. A \code{\link[vglmer]{vglmer}} object. 12 | 13 | deep_mrp_classifier <- function(y, form, data, verbose) { 14 | 15 | # Determine type of dependent variable 16 | if ( 17 | data %>% 18 | dplyr::pull(!!y) %>% 19 | unique() %>% 20 | length() == 2 21 | ) { 22 | family <- "binomial" 23 | } else { 24 | family <- "linear" 25 | } 26 | 27 | # run vglmer model 28 | if (verbose) { 29 | out <- vglmer::vglmer( 30 | formula = as.formula(form), 31 | data = data, 32 | family = family 33 | ) 34 | } else { 35 | out <- suppressMessages(suppressWarnings( 36 | vglmer::vglmer( 37 | formula = as.formula(form), 38 | data = data, 39 | family = family 40 | ) 41 | )) 42 | } 43 | return(out) 44 | } -------------------------------------------------------------------------------- /R/gb_classifier.R: -------------------------------------------------------------------------------- 1 | #' GB classifier 2 | #' 3 | #' \code{gb_classifier} applies gradient boosting classification to a data set. 4 | #' 5 | #' @inheritParams auto_MrP 6 | #' @param form Model formula. A two-sided linear formula describing 7 | #' the model to be fit, with the outcome on the LHS and the covariates 8 | #' separated by + operators on the RHS. 9 | #' @param distribution Model distribution. A character string specifying the 10 | #' name of the distribution to be used. 11 | #' @param data.train Training data. A data.frame containing the training data 12 | #' used to train the model. 13 | #' @param n.trees Total number of trees. An integer-valued scalar specifying 14 | #' the total number of trees to be fit. 15 | #' @param interaction.depth Interaction depth. An integer-valued scalar 16 | #' specifying the maximum depth of each tree. 17 | #' @param n.minobsinnode Minimum number of observations in terminal nodes. An 18 | #' integer-valued scalar specifying the minimum number of observations in the 19 | #' terminal nodes of the trees. 20 | #' @param shrinkage Learning rate. A numeric scalar specifying the shrinkage or 21 | #' learning rate applied to each tree in the expansion. 22 | #' @param verbose Verbose output. A logical vector indicating whether or not 23 | #' verbose output should be printed. 24 | #' @return A gradient tree boosting model. A \code{\link[gbm]{gbm}} object. 
25 | 26 | gb_classifier <- function( 27 | y, form, distribution, data.train, 28 | n.trees, interaction.depth, 29 | n.minobsinnode, shrinkage, 30 | verbose = c(TRUE, FALSE) 31 | ) { 32 | 33 | # Determine type of dependent variable 34 | if ( 35 | data.train %>% 36 | dplyr::pull(!!y) %>% 37 | unique() %>% 38 | length() > 2 39 | ) { 40 | # set model family to gaussian 41 | distribution <- "gaussian" 42 | } 43 | 44 | # Train model on training data with number of total trees, interaction depth, 45 | # and learning rate as tuning parameters 46 | if (isTRUE(verbose == TRUE)) { 47 | out <- gbm::gbm( 48 | formula = form, 49 | distribution = distribution, 50 | data = data.train, 51 | n.trees = n.trees, 52 | interaction.depth = interaction.depth, 53 | n.minobsinnode = n.minobsinnode, 54 | shrinkage = shrinkage, 55 | train.fraction = 1, 56 | n.cores = 1 57 | ) 58 | } else { 59 | out <- suppressMessages(suppressWarnings( 60 | gbm::gbm( 61 | formula = form, 62 | distribution = distribution, 63 | data = data.train, n.trees = n.trees, 64 | interaction.depth = interaction.depth, 65 | n.minobsinnode = n.minobsinnode, 66 | shrinkage = shrinkage, 67 | train.fraction = 1, 68 | n.cores = 1 69 | ) 70 | )) 71 | } 72 | 73 | # Function output 74 | return(out) 75 | } 76 | 77 | #' GB classifier update 78 | #' 79 | #' \code{gb_classifier_update()} grows additional trees in gradient tree 80 | #' boosting ensemble. 81 | #' 82 | #' @param object Gradient tree boosting output. A gbm object. 83 | #' @param n.new.trees Number of additional trees to grow. A numeric scalar. 84 | #' @param verbose Verbose output. A logical vector indicating whether or not 85 | #' verbose output should be printed. 86 | #' @return An updated gradient tree boosting model. 87 | #' A \code{\link[gbm]{gbm.more}} object. 88 | 89 | gb_classifier_update <- function( 90 | object, n.new.trees, verbose = c(TRUE, FALSE) 91 | ) { 92 | 93 | # Train model on training data with number of total trees, interaction depth, 94 | # and learning rate as tuning parameters 95 | if (isTRUE(verbose == TRUE)) { 96 | out <- gbm::gbm.more( 97 | object = object, 98 | n.new.trees = n.new.trees 99 | ) 100 | } else { 101 | out <- suppressMessages(suppressWarnings( 102 | gbm::gbm.more( 103 | object = object, 104 | n.new.trees = n.new.trees 105 | ) 106 | )) 107 | } 108 | 109 | # Function output 110 | return(out) 111 | } 112 | -------------------------------------------------------------------------------- /R/globals.R: -------------------------------------------------------------------------------- 1 | globalVariables(c("%>%", 2 | "%dopar%", 3 | "%dorng%", 4 | ".", 5 | ":=", 6 | "ae", 7 | "all_of", 8 | "bce", 9 | "ce", 10 | "contains", 11 | "data", 12 | "depth", 13 | "err", 14 | "err_rates", 15 | "estimates", 16 | "fn", 17 | "fp", 18 | "index", 19 | "lambda", 20 | "lasso_opt", 21 | "lb", 22 | "level", 23 | "mae", 24 | "measure", 25 | "method", 26 | "model", 27 | "mrp", 28 | "mse", 29 | "msfe", 30 | "n", 31 | "n_L2", 32 | "ntrees", 33 | "one_of", 34 | "os", 35 | "pc_names", 36 | "prop", 37 | "pval", 38 | "row_number", 39 | "sqe", 40 | "state", 41 | "tp", 42 | "ub", 43 | "value", 44 | "verbose", 45 | "y_svm", 46 | "deep_mrp", 47 | "gaussian")) 48 | -------------------------------------------------------------------------------- /R/lasso_classifier.R: -------------------------------------------------------------------------------- 1 | #' Lasso classifier 2 | #' 3 | #' \code{lasso_classifier} applies lasso classification to a data set. 
4 | #' 5 | #' @inheritParams auto_MrP 6 | #' @param L2.fix Fixed effects. A two-sided linear formula describing 7 | #' the fixed effects part of the model, with the outcome on the LHS and 8 | #' the fixed effects separated by + operators on the RHS. 9 | #' @param L1.re Random effects. A named list object, with the random effects 10 | #' providing the names of the list elements and ~ 1 being the list elements. 11 | #' @param data.train Training data. A data.frame containing the training data 12 | #' used to train the model. 13 | #' @param lambda Tuning parameter. Lambda is the penalty parameter that controls 14 | #' the shrinkage of fixed effects. 15 | #' @param model.family Model family. A variable indicating the model family 16 | #' to be used by glmmLasso. Defaults to binomial(link = "probit"). 17 | #' @param verbose Verbose output. A logical vector indicating whether or not 18 | #' verbose output should be printed. 19 | #' @return A multilevel lasso model. An \code{\link[glmmLasso]{glmmLasso}} 20 | #' object. 21 | 22 | lasso_classifier <- function( 23 | L2.fix, L1.re, data.train, lambda, model.family, y, 24 | verbose = c(TRUE, FALSE) 25 | ) { 26 | 27 | # Determine type of dependent variable 28 | if ( 29 | data.train %>% 30 | dplyr::pull(!!y) %>% 31 | unique() %>% 32 | length() > 2 33 | ) { 34 | # set model family to gaussian 35 | model.family <- gaussian(link = "identity") 36 | } 37 | 38 | # Train model on training data with lambda as tuning parameter 39 | if (isTRUE(verbose == TRUE)) { 40 | out <- glmmLasso::glmmLasso( 41 | fix = L2.fix, 42 | rnd = L1.re, 43 | data = data.train, 44 | lambda = lambda, 45 | family = model.family, 46 | switch.NR = FALSE, 47 | final.re = TRUE, 48 | control = list( 49 | center = TRUE, 50 | standardize = TRUE 51 | ) 52 | ) 53 | } else { 54 | out <- quiet( 55 | suppressMessages(suppressWarnings( 56 | glmmLasso::glmmLasso( 57 | fix = L2.fix, 58 | rnd = L1.re, 59 | data = data.train, 60 | lambda = lambda, 61 | family = model.family, 62 | switch.NR = FALSE, 63 | final.re = TRUE, 64 | control = list( 65 | center = TRUE, 66 | standardize = TRUE 67 | ) 68 | ) 69 | )) 70 | ) 71 | } 72 | 73 | # Function output 74 | return(out) 75 | } 76 | -------------------------------------------------------------------------------- /R/run_best_subset.R: -------------------------------------------------------------------------------- 1 | #' Apply best subset classifier to MrP. 2 | #' 3 | #' \code{run_best_subset} is a wrapper function that applies the best subset 4 | #' classifier to a list of models provided by the user, evaluates the models' 5 | #' prediction performance, and chooses the best-performing model. 6 | #' 7 | #' @inheritParams auto_MrP 8 | #' @param data Data for cross-validation. A \code{list} of \eqn{k} 9 | #' \code{data.frames}, one for each fold to be used in \eqn{k}-fold 10 | #' cross-validation. 11 | #' @return A model formula of the winning best subset classifier model. 
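#' @examples \dontrun{
#' # Editorial usage sketch, not from the package authors: `cv_folds` is an
#' # assumed placeholder for a list of k data.frames, e.g. as returned by
#' # cv_folding(); variable names follow the packaged absentee_voting data.
#' best_form <- run_best_subset(
#'   y = "YES", L1.x = c("L1x1", "L1x2", "L1x3"),
#'   L2.x = c("L2.x1", "L2.x2"), L2.unit = "state", L2.reg = "region",
#'   loss.unit = "individuals", loss.fun = "MSE",
#'   data = cv_folds, verbose = TRUE, cores = 1
#' )
#' }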
12 | 13 | run_best_subset <- function( 14 | y, L1.x, L2.x, L2.unit, L2.reg, 15 | loss.unit, loss.fun, data, verbose, cores 16 | ) { 17 | 18 | # List of all models to be evaluated 19 | models <- model_list( 20 | y = y, 21 | L1.x = L1.x, 22 | L2.x = L2.x, 23 | L2.unit = L2.unit, 24 | L2.reg = L2.reg 25 | ) 26 | 27 | # Parallel tuning if cores > 1 28 | if (cores > 1) { 29 | 30 | # Train all models in parallel 31 | m_errors <- run_best_subset_mc( 32 | verbose = verbose, 33 | models = models, 34 | data = data, 35 | loss.unit = loss.unit, 36 | loss.fun = loss.fun, 37 | y = y, 38 | L1.x = L1.x, 39 | L2.x = L2.x, 40 | L2.unit = L2.unit, 41 | L2.reg = L2.reg, 42 | cores = cores 43 | ) 44 | } else { 45 | 46 | # Train and evaluate each model 47 | m_errors <- lapply(seq_along(models), function(m) { 48 | # Print model m 49 | if (isTRUE(verbose)) { 50 | M <- length(models) 51 | cat(paste( 52 | "Best subset: Running model ", m, 53 | " out of ", M, " models\n", sep = "" 54 | )) 55 | } 56 | 57 | # Loop over each fold 58 | k_errors <- lapply(seq_along(data), function(k) { 59 | # Split data in training and validation sets 60 | data_train <- dplyr::bind_rows(data[-k]) 61 | data_valid <- dplyr::bind_rows(data[k]) 62 | 63 | # Train mth model on kth training set 64 | model_m <- best_subset_classifier( 65 | model = models[[m]], 66 | y = y, 67 | data.train = data_train, 68 | model.family = binomial(link = "probit"), 69 | model.optimizer = "bobyqa", 70 | n.iter = 1000000, 71 | verbose = verbose 72 | ) 73 | 74 | # Use trained model to make predictions for kth validation set 75 | pred_m <- stats::predict( 76 | model_m, 77 | newdata = data_valid, 78 | type = "response", 79 | allow.new.levels = TRUE 80 | ) 81 | 82 | # Evaluate predictions based on loss function 83 | perform_m <- loss_function( 84 | pred = pred_m, 85 | data.valid = data_valid, 86 | loss.unit = loss.unit, 87 | loss.fun = loss.fun, 88 | y = y, 89 | L2.unit = L2.unit 90 | ) 91 | }) 92 | 93 | # Mean over loss functions 94 | k_errors <- dplyr::bind_rows(k_errors) %>% 95 | dplyr::group_by(measure) %>% 96 | dplyr::summarise(value = mean(value), .groups = "drop") %>% 97 | dplyr::mutate(model = m) 98 | }) 99 | } 100 | 101 | # Extract best tuning parameters 102 | grid_cells <- dplyr::bind_rows(m_errors) 103 | best_params <- dplyr::slice( 104 | loss_score_ranking( 105 | score = grid_cells, 106 | loss.fun = loss.fun), 1) 107 | 108 | # Choose best-performing model 109 | out <- models[[dplyr::pull(.data = best_params, var = model)]] 110 | 111 | # Function output 112 | return(out) 113 | 114 | } 115 | 116 | ################################################################################ 117 | # Multicore tuning for best subset # 118 | ################################################################################ 119 | #' Best subset multicore tuning. 120 | #' 121 | #' \code{run_best_subset_mc} is called from within \code{run_best_subset}. It 122 | #' tunes using multiple cores. 123 | #' 124 | #' @param y Outcome variable. A character scalar containing the column name of 125 | #' the outcome variable in \code{survey}. 126 | #' @param L1.x Individual-level covariates. A character vector containing the 127 | #' column names of the individual-level variables in \code{survey} and 128 | #' \code{census} used to predict outcome \code{y}. Note that geographic unit 129 | #' is specified in argument \code{L2.unit}. 130 | #' @param L2.x Context-level covariates. 
A character vector containing the 131 | #' column names of the context-level variables in \code{survey} and 132 | #' \code{census} used to predict outcome \code{y}. 133 | #' @param L2.unit Geographic unit. A character scalar containing the column 134 | #' name of the geographic unit in \code{survey} and \code{census} at which 135 | #' outcomes should be aggregated. 136 | #' @param L2.reg Geographic region. A character scalar containing the column 137 | #' name of the geographic region in \code{survey} and \code{census} by which 138 | #' geographic units are grouped (\code{L2.unit} must be nested within 139 | #' \code{L2.reg}). Default is \code{NULL}. 140 | #' @param loss.unit Loss function unit. A character-valued scalar indicating 141 | #' whether performance loss should be evaluated at the level of individual 142 | #' respondents (\code{individuals}) or geographic units (\code{L2 units}). 143 | #' Default is \code{individuals}. 144 | #' @param loss.fun Loss function. A character-valued scalar indicating whether 145 | #' prediction loss should be measured by the mean squared error (\code{MSE}) 146 | #' or the mean absolute error (\code{MAE}). Default is \code{MSE}. 147 | #' @param data Data for cross-validation. A \code{list} of \eqn{k} 148 | #' \code{data.frames}, one for each fold to be used in \eqn{k}-fold 149 | #' cross-validation. 150 | #' @param cores The number of cores to be used. An integer indicating the number 151 | #' of processor cores used for parallel computing. Default is 1. 152 | #' @param models The models to perform best subset selection on. A list of model 153 | #' formulas. 154 | #' @param verbose Verbose output. A logical argument indicating whether or not 155 | #' verbose output should be printed. Default is \code{TRUE}. 156 | #' @return The cross-validation errors for all models. A list. 
157 | #' @examples \dontrun{ 158 | #' # not yet 159 | #' } 160 | 161 | run_best_subset_mc <- function( 162 | y, L1.x, L2.x, L2.unit, L2.reg, loss.unit, loss.fun, 163 | data, cores, models, verbose 164 | ) { 165 | 166 | # Binding for global variables 167 | m <- NULL 168 | 169 | # Register cores 170 | cl <- multicore(cores = cores, type = "open", cl = NULL) 171 | 172 | # Train and evaluate each model 173 | m_errors <- foreach::foreach( 174 | m = seq_along(models), .packages = "autoMrP" 175 | ) %dorng% { 176 | 177 | # Loop over each fold 178 | k_errors <- lapply(seq_along(data), function(k) { 179 | # Split data in training and validation sets 180 | data_train <- dplyr::bind_rows(data[-k]) 181 | data_valid <- dplyr::bind_rows(data[k]) 182 | 183 | # Train mth model on kth training set 184 | model_m <- best_subset_classifier( 185 | model = models[[m]], 186 | y = y, 187 | data.train = data_train, 188 | model.family = binomial(link = "probit"), 189 | model.optimizer = "bobyqa", 190 | n.iter = 1000000, 191 | verbose = verbose 192 | ) 193 | 194 | # Use trained model to make predictions for kth validation set 195 | pred_m <- stats::predict( 196 | model_m, newdata = data_valid, 197 | type = "response", allow.new.levels = TRUE 198 | ) 199 | 200 | # Evaluate predictions based on loss function 201 | perform_m <- loss_function( 202 | pred = pred_m, 203 | data.valid = data_valid, 204 | loss.unit = loss.unit, 205 | loss.fun = loss.fun, 206 | y = y, 207 | L2.unit = L2.unit 208 | ) 209 | }) 210 | 211 | # Mean over loss functions 212 | k_errors <- dplyr::bind_rows(k_errors) %>% 213 | dplyr::group_by(measure) %>% 214 | dplyr::summarise(value = mean(value), .groups = "drop") %>% 215 | dplyr::mutate(model = m) 216 | } 217 | 218 | # De-register cluster 219 | multicore(cores = cores, type = "close", cl = cl) 220 | 221 | # Function output 222 | return(m_errors) 223 | } 224 | -------------------------------------------------------------------------------- /R/run_deep_bs.r: -------------------------------------------------------------------------------- 1 | #' Apply Deep MrP with the best subset classifier to MrP. 2 | #' 3 | #' \code{run_deep_bs} is a wrapper function that applies the best subset 4 | #' classifier to a list of models provided by the user, evaluates the models' 5 | #' prediction performance, and chooses the best-performing model. It differs 6 | #' from \code{run_best_subset} in that it includes L1.x interactions. 7 | #' 8 | #' @inheritParams auto_MrP 9 | #' @param data Data for cross-validation. A \code{list} of \eqn{k} 10 | #' \code{data.frames}, one for each fold to be used in \eqn{k}-fold 11 | #' cross-validation. 12 | #' @return A model formula of the winning best subset classifier model. 
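#' @examples \dontrun{
#' # Editorial usage sketch, not from the package authors: as in
#' # run_best_subset(), `cv_folds` stands in for a list of k data.frames;
#' # all argument values are illustrative assumptions.
#' deep_form <- run_deep_bs(
#'   y = "YES", L1.x = c("L1x1", "L1x2", "L1x3"),
#'   L2.x = c("L2.x1", "L2.x2"), L2.unit = "state", L2.reg = "region",
#'   loss.unit = "individuals", loss.fun = "MSE", deep.splines = TRUE,
#'   data = cv_folds, k.folds = 5, verbose = TRUE, cores = 1
#' )
#' }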
13 | 14 | run_deep_bs <- function( 15 | y, L1.x, L2.x, L2.unit, L2.reg, loss.unit, loss.fun, deep.splines, data, 16 | k.folds, verbose, cores 17 | ) { 18 | 19 | # Determine type of dependent variable 20 | if ( 21 | data[[1]] %>% 22 | dplyr::pull(!!y) %>% 23 | unique() %>% 24 | length() == 2 25 | ) { 26 | dv_type <- "binary" 27 | } else { 28 | dv_type <- "linear" 29 | } 30 | 31 | # List of all models to be evaluated 32 | models <- model_list( 33 | y = y, 34 | L1.x = L1.x, 35 | L2.x = L2.x, 36 | L2.unit = L2.unit, 37 | L2.reg = L2.reg 38 | ) 39 | 40 | # no nesting with deep interactions 41 | if (!is.null(L2.reg) && !is.null(L2.unit)) { 42 | models <- lapply(models, function(x) { 43 | # model formula to character 44 | m_form <- as.character(x) 45 | # replace (1 | region/state) with (1 | region) + (1 | state) 46 | m_form <- stringr::str_replace_all( 47 | string = m_form, 48 | pattern = "\\(1 \\| region/state\\)", 49 | replacement = "\\(1 | state\\) + \\(1 | region\\)" 50 | ) 51 | # character to formula 52 | m_form <- as.formula(sprintf("%s%s%s", m_form[2], m_form[1], m_form[3])) 53 | }) 54 | } 55 | 56 | # add interactions to the models 57 | models <- lapply(models, function(x) { 58 | 59 | # get all level 1 variables in the current model 60 | c_l1_x <- x %>% 61 | as.character() %>% 62 | .[3] %>% 63 | stringr::str_extract_all(pattern = "L1x\\d+") %>% 64 | unlist() 65 | 66 | # generate all interactions of L1.x 67 | l1_comb <- unlist(lapply(2:length(c_l1_x), function(x) { 68 | apply(combn(L1.x, x), 2, paste, collapse = ".") 69 | })) 70 | 71 | # generate all interactions of L1.x with L2.unit 72 | l1_state <- paste(L1.x, L2.unit, sep = ".") 73 | 74 | # generate all interactions of L1.x with L2.reg 75 | if (!is.null(L2.reg)) { 76 | l1_region <- paste(L1.x, L2.reg, sep = ".") 77 | } else { 78 | l1_region <- NULL 79 | } 80 | 81 | # interactions 82 | add_interactions <- paste0( 83 | # interactions of L1x 84 | paste("(1 | ", l1_comb, ")", collapse = " + "), " + ", 85 | # interactions of L1x with L2.unit 86 | paste("(1 | ", l1_state, ")", collapse = " + "), " + ", 87 | # interactions of L1x with L2.reg 88 | if (any(!is.null(l1_region))) { 89 | paste("(1 | ", l1_region, ")", collapse = " + ") 90 | } 91 | ) 92 | 93 | # remove trailing " + " from interactions 94 | add_interactions <- stringr::str_extract( 95 | string = add_interactions, 96 | pattern = "^.*\\)" 97 | ) 98 | 99 | # character to formula 100 | add_interactions <- as.formula(paste("~ . 
+", add_interactions)) 101 | 102 | # update formula with interactions 103 | x <- update(x, add_interactions) 104 | 105 | # add splines to context level variables 106 | if (deep.splines) { 107 | 108 | # formula to character 109 | char_form <- as.character(x) 110 | char_form <- sprintf("%s %s %s", char_form[2], char_form[1], char_form[3]) 111 | 112 | # get all context level variables in the current model 113 | c_l2_x <- char_form %>% 114 | stringr::str_extract_all(pattern = "L2\\.x\\d+") %>% 115 | unlist() 116 | 117 | # replace in string 118 | for (i in seq_along(c_l2_x)) { 119 | char_form <- stringr::str_replace( 120 | string = char_form, 121 | pattern = c_l2_x[i], 122 | replacement = sprintf("v_s(%s)", c_l2_x[i]) 123 | ) 124 | } 125 | 126 | # character to formula 127 | x <- as.formula(char_form) 128 | 129 | } 130 | 131 | return(x) 132 | }) 133 | 134 | # Register cores 135 | cl <- multicore(cores = cores, type = "open", cl = NULL) 136 | 137 | # loop over models 138 | m_errors <- foreach::foreach( 139 | m = seq_along(models), .packages = "autoMrP" 140 | ) %dorng% { 141 | 142 | `%>%` <- magrittr::`%>%` 143 | 144 | k_errors <- lapply(seq_len(k.folds), function(k) { 145 | 146 | # Split data in training and validation sets 147 | data_train <- dplyr::bind_rows(data[-k]) 148 | data_valid <- dplyr::bind_rows(data[k]) 149 | 150 | # Train mth model on kth training set 151 | model_m <- deep_mrp_classifier( 152 | form = models[[m]], 153 | y = y, 154 | data = data_train, 155 | verbose = TRUE 156 | ) 157 | 158 | # predictions based on DV type (binary or continuous) 159 | if (dv_type == "binary") { 160 | # use trained model to make predictions for kth validation set 161 | pred_m <- vglmer::predict_MAVB( 162 | samples = 1000, 163 | model_m, 164 | newdata = data_valid, 165 | allow_missing_levels = TRUE 166 | )[["mean"]] 167 | 168 | # convert to response probabilities 169 | pred_m <- stats::plogis(pred_m) 170 | 171 | } else if (dv_type == "linear") { 172 | # Use trained model to make predictions for kth validation set 173 | pred_m <- predict( 174 | samples = 1000, 175 | object = model_m, 176 | newdata = data_valid, 177 | allow_missing_levels = TRUE 178 | )[["mean"]] 179 | } 180 | 181 | # evaluate predictions based on loss function 182 | perform_m <- loss_function( 183 | pred = pred_m, 184 | data.valid = data_valid, 185 | loss.unit = loss.unit, 186 | loss.fun = loss.fun, 187 | y = y, 188 | L2.unit = L2.unit 189 | ) 190 | 191 | return(perform_m) 192 | }) 193 | 194 | # Mean over loss functions 195 | k_errors <- dplyr::bind_rows(k_errors) %>% 196 | dplyr::group_by(measure) %>% 197 | dplyr::summarise(value = mean(value), .groups = "drop") %>% 198 | dplyr::mutate(model = m) 199 | 200 | return(k_errors) 201 | } 202 | 203 | # De-register cluster 204 | multicore(cores = cores, type = "close", cl = cl) 205 | 206 | # Extract best tuning parameters 207 | grid_cells <- dplyr::bind_rows(m_errors) 208 | best_params <- dplyr::slice( 209 | loss_score_ranking( 210 | score = grid_cells, 211 | loss.fun = loss.fun 212 | ), 1 213 | ) 214 | 215 | # Choose best-performing model 216 | out <- models[[dplyr::pull(.data = best_params, var = model)]] 217 | 218 | 219 | # Function output 220 | return(out) 221 | 222 | } -------------------------------------------------------------------------------- /R/run_deep_pca.r: -------------------------------------------------------------------------------- 1 | #' Apply PCA classifier to MrP. 
2 | #' 3 | #' \code{run_deep_pca} is a wrapper function that applies the PCA classifier to 4 | #' data provided by the user, evaluates prediction performance, and chooses the 5 | #' best-performing model. It differs from \code{run_pca} in that it 6 | #' includes L1.x interactions. 7 | #' 8 | #' @inheritParams auto_MrP 9 | #' @param data Data for cross-validation. A \code{list} of \eqn{k} 10 | #' \code{data.frames}, one for each fold to be used in \eqn{k}-fold 11 | #' cross-validation. 12 | #' 13 | #' @return A model formula of the winning PCA classifier model. 14 | 15 | run_deep_pca <- function( 16 | y, L1.x, L2.x, L2.unit, L2.reg, loss.unit, loss.fun, deep.splines, data, 17 | cores, verbose 18 | ) { 19 | 20 | # Determine type of dependent variable 21 | if ( 22 | data[[1]] %>% 23 | dplyr::pull(!!y) %>% 24 | unique() %>% 25 | length() == 2 26 | ) { 27 | dv_type <- "binary" 28 | } else { 29 | dv_type <- "linear" 30 | } 31 | 32 | # List of all models to be evaluated 33 | models <- model_list_pca( 34 | y = y, 35 | L1.x = L1.x, 36 | L2.x = L2.x, 37 | L2.unit = L2.unit, 38 | L2.reg = L2.reg 39 | ) 40 | 41 | # no nesting with deep interactions 42 | if (!is.null(L2.reg)) { 43 | models <- lapply(models, function(x) { 44 | # model formula to character 45 | m_form <- as.character(x) 46 | # replace (1 | region/state) with (1 | region) + (1 | state) 47 | m_form <- stringr::str_replace_all( 48 | string = m_form, 49 | pattern = "\\(1 \\| region/state\\)", 50 | replacement = "\\(1 | state\\) + \\(1 | region\\)" 51 | ) 52 | # character to formula 53 | m_form <- as.formula(sprintf("%s%s%s", m_form[2], m_form[1], m_form[3])) 54 | }) 55 | } 56 | 57 | # add interactions to the models 58 | models <- lapply(models, function(x) { 59 | 60 | # get all level 1 variables in the current model 61 | c_l1_x <- x %>% 62 | as.character() %>% 63 | .[3] %>% 64 | stringr::str_extract_all(pattern = "L1x\\d+") %>% 65 | unlist() 66 | 67 | # generate all interactions of L1.x 68 | l1_comb <- unlist(lapply(2:length(c_l1_x), function(x) { 69 | apply(combn(L1.x, x), 2, paste, collapse = ".") 70 | })) 71 | 72 | # generate all interactions of L1.x with L2.unit 73 | l1_state <- paste(L1.x, L2.unit, sep = ".") 74 | 75 | # generate all interactions of L1.x with L2.reg 76 | if (!is.null(L2.reg)) { 77 | l1_region <- paste(L1.x, L2.reg, sep = ".") 78 | } else { 79 | l1_region <- NULL 80 | } 81 | 82 | # interactions 83 | add_interactions <- paste0( 84 | # interactions of L1x 85 | paste("(1 | ", l1_comb, ")", collapse = " + "), " + ", 86 | # interactions of L1x with L2.unit 87 | paste("(1 | ", l1_state, ")", collapse = " + "), " + ", 88 | # interactions of L1x with L2.reg 89 | if (any(!is.null(l1_region))) { 90 | paste("(1 | ", l1_region, ")", collapse = " + ") 91 | } 92 | ) 93 | 94 | # remove trailing " + " from interactions 95 | add_interactions <- stringr::str_extract( 96 | string = add_interactions, 97 | pattern = "^.*\\)" 98 | ) 99 | 100 | # character to formula 101 | add_interactions <- as.formula(paste("~ . 
+", add_interactions)) 102 | 103 | # update formula with interactions 104 | x <- update(x, add_interactions) 105 | 106 | # add splines to context level variables 107 | if (deep.splines) { 108 | 109 | # formula to character 110 | char_form <- as.character(x) 111 | char_form <- sprintf("%s %s %s", char_form[2], char_form[1], char_form[3]) 112 | 113 | # get all context level variables in the current model 114 | c_l2_x <- char_form %>% 115 | stringr::str_extract_all(pattern = "L2\\.x\\d+") %>% 116 | unlist() 117 | 118 | # replace in string 119 | for (i in seq_along(c_l2_x)) { 120 | char_form <- stringr::str_replace( 121 | string = char_form, 122 | pattern = c_l2_x[i], 123 | replacement = sprintf("v_s(%s)", c_l2_x[i]) 124 | ) 125 | } 126 | 127 | # character to formula 128 | x <- as.formula(char_form) 129 | 130 | } 131 | 132 | return(x) 133 | }) 134 | 135 | # Register cores 136 | cl <- multicore(cores = cores, type = "open", cl = NULL) 137 | 138 | # Train and evaluate each model 139 | m_errors <- foreach::foreach( 140 | m = seq_along(models), .packages = "autoMrP", 141 | .export = c("deep_mrp_classifier", "loss_function") 142 | ) %dorng% { 143 | 144 | `%>%` <- magrittr::`%>%` 145 | 146 | # Loop over each fold 147 | k_errors <- lapply(seq_along(data), function(k) { 148 | 149 | # Split data in training and validation sets 150 | data_train <- dplyr::bind_rows(data[-k]) 151 | data_valid <- dplyr::bind_rows(data[k]) 152 | 153 | # Train mth model on kth training set 154 | model_m <- deep_mrp_classifier( 155 | form = models[[m]], 156 | y = y, 157 | data = data_train, 158 | verbose = TRUE 159 | ) 160 | 161 | # predictions based on DV type (binary or continuous) 162 | if (dv_type == "binary") { 163 | # use trained model to make predictions for kth validation set 164 | pred_m <- vglmer::predict_MAVB( 165 | samples = 1000, 166 | model_m, 167 | newdata = data_valid, 168 | allow_missing_levels = TRUE 169 | )[["mean"]] 170 | 171 | # convert to response probabilities 172 | pred_m <- stats::plogis(pred_m) 173 | 174 | } else if (dv_type == "linear") { 175 | # Use trained model to make predictions for kth validation set 176 | pred_m <- predict( 177 | samples = 1000, 178 | object = model_m, 179 | newdata = data_valid, 180 | allow_missing_levels = TRUE 181 | )[["mean"]] 182 | } 183 | 184 | # evaluate predictions based on loss function 185 | perform_m <- loss_function( 186 | pred = pred_m, 187 | data.valid = data_valid, 188 | loss.unit = loss.unit, 189 | loss.fun = loss.fun, 190 | y = y, 191 | L2.unit = L2.unit 192 | ) 193 | }) 194 | 195 | # Mean over loss functions 196 | k_errors <- dplyr::bind_rows(k_errors) %>% 197 | dplyr::group_by(measure) %>% 198 | dplyr::summarise(value = mean(value), .groups = "drop") %>% 199 | dplyr::mutate(model = m) 200 | } 201 | 202 | # De-register cluster 203 | multicore(cores = cores, type = "close", cl = cl) 204 | 205 | # Extract best tuning parameters 206 | grid_cells <- dplyr::bind_rows(m_errors) 207 | best_params <- dplyr::slice( 208 | loss_score_ranking(score = grid_cells, loss.fun = loss.fun), 1 209 | ) 210 | 211 | # Choose best-performing model 212 | out <- models[[dplyr::pull(.data = best_params, var = model)]] 213 | 214 | # Function output 215 | return(out) 216 | } 217 | -------------------------------------------------------------------------------- /R/run_lasso.R: -------------------------------------------------------------------------------- 1 | #' Apply lasso classifier to MrP. 
2 | #' 3 | #' \code{run_lasso} is a wrapper function that applies the lasso classifier to 4 | #' data provided by the user, evaluates prediction performance, and chooses the 5 | #' best-performing model. 6 | #' 7 | #' @inheritParams auto_MrP 8 | #' @param lambda Lasso penalty parameter. A numeric \code{vector} of 9 | #' non-negative values. The penalty parameter controls the shrinkage of the 10 | #' context-level variables in the lasso model. Default is a sequence with 11 | #' minimum 0.1 and maximum 250 that is equally spaced on the log-scale. The 12 | #' number of values is controlled by the \code{lasso.n.iter} parameter. 13 | #' @param n.iter Lasso number of lambda values. An integer-valued scalar 14 | #' specifying the number of lambda values to search over. Default is 15 | #' \eqn{100}. 16 | #' \emph{Note:} Is ignored if a vector of \code{lasso.lambda} values is 17 | #' provided. 18 | #' @param data Data for cross-validation. A \code{list} of \eqn{k} 19 | #' \code{data.frames}, one for each fold to be used in \eqn{k}-fold 20 | #' cross-validation. 21 | #' 22 | #' @return The tuned lambda value. A numeric scalar. 23 | 24 | run_lasso <- function( 25 | y, L1.x, L2.x, L2.unit, L2.reg, n.iter, loss.unit, loss.fun, 26 | lambda, data, verbose, cores 27 | ) { 28 | 29 | # Lasso search grid 30 | if (is.null(lambda)) { 31 | lambda <- log_spaced(min = 0.1, max = 250, n = n.iter) 32 | } 33 | 34 | # Context-level fixed effects 35 | L2_fe <- paste(L2.x, collapse = " + ") 36 | if (L2_fe == "") { 37 | L2_fe_form <- as.formula(paste(y, " ~ 1", sep = "")) 38 | L2.x <- NULL 39 | } else { 40 | L2_fe_form <- as.formula(paste(y, " ~ ", L2_fe, sep = "")) 41 | } 42 | 43 | # Individual-level random effects as named list 44 | L1_re <- setNames( 45 | as.list(rep(c(~ 1), times = length(c(L1.x, L2.unit, L2.reg)))), 46 | c(L1.x, L2.unit, L2.reg) 47 | ) 48 | 49 | # Parallel processing 50 | if (cores > 1) { 51 | lambda_errors <- run_lasso_mc_lambda( 52 | y = y, L1.x = L1.x, L2.x = L2.x, L2.unit = L2.unit, L2.reg = L2.reg, 53 | loss.unit = loss.unit, loss.fun = loss.fun, data = data, 54 | cores = cores, L2.fe.form = L2_fe_form, L1.re = L1_re, 55 | lambda = lambda 56 | ) 57 | } else { 58 | 59 | # Train and evaluate each model 60 | lambda_errors <- lapply(seq_along(lambda), function(l) { 61 | 62 | # Print lambda value 63 | if (isTRUE(verbose)) { 64 | L <- length(lambda) 65 | cat(paste( 66 | "Lasso: Running lambda w/ value ", lambda[l], 67 | " (lambda ", l, " out of max. 
", 68 | L, " lambdas)\n", sep = "" 69 | )) 70 | } 71 | 72 | # Loop over each fold 73 | k_errors <- lapply(seq_along(data), function(k) { 74 | # Split data in training and validation sets 75 | data_train <- dplyr::bind_rows(data[-k]) 76 | data_valid <- dplyr::bind_rows(data[k]) 77 | 78 | # Convert individual-level, geographic unit, and geographic region 79 | # covariates to factor variables in training and validation sets 80 | data_train <- data_train %>% 81 | dplyr::mutate_at(.vars = c(L1.x, L2.unit, L2.reg), as.factor) %>% 82 | dplyr::select(dplyr::all_of(c(y, L1.x, L2.x, L2.unit, L2.reg))) %>% 83 | tidyr::drop_na() 84 | 85 | data_valid <- data_valid %>% 86 | dplyr::mutate_at(.vars = c(L1.x, L2.unit, L2.reg), as.factor) %>% 87 | dplyr::select(dplyr::all_of(c(y, L1.x, L2.x, L2.unit, L2.reg))) %>% 88 | tidyr::drop_na() 89 | 90 | # Train model using lambda value on kth training set 91 | model_l <- lasso_classifier( 92 | y = y, 93 | L2.fix = L2_fe_form, 94 | L1.re = L1_re, 95 | data.train = data_train, 96 | lambda = lambda[l], 97 | model.family = binomial(link = "probit"), 98 | verbose = verbose 99 | ) 100 | 101 | # Use trained model to make predictions for kth validation set 102 | pred_l <- stats::predict(model_l, newdata = data.frame(data_valid)) 103 | 104 | # Evaluate predictions based on loss function 105 | perform_l <- loss_function( 106 | pred = pred_l, 107 | data.valid = data_valid, 108 | loss.unit = loss.unit, 109 | loss.fun = loss.fun, 110 | y = y, 111 | L2.unit = L2.unit 112 | ) 113 | }) 114 | 115 | # Mean over loss functions 116 | k_errors <- dplyr::bind_rows(k_errors) %>% 117 | dplyr::group_by(measure) %>% 118 | dplyr::summarise(value = mean(value), .groups = "drop") %>% 119 | dplyr::mutate(lambda = lambda[l]) 120 | }) 121 | } 122 | # Extract best tuning parameters 123 | grid_cells <- dplyr::bind_rows(lambda_errors) 124 | best_params <- dplyr::slice( 125 | loss_score_ranking(score = grid_cells, loss.fun = loss.fun), 1 126 | ) 127 | 128 | # Choose best-performing model 129 | out <- dplyr::pull(.data = best_params, var = lambda) 130 | 131 | return(out) 132 | 133 | } 134 | 135 | 136 | ################################################################################ 137 | # Multicore tuning for lasso parallel across lambda values # 138 | ################################################################################ 139 | #' Lasso multicore tuning. 140 | #' 141 | #' \code{run_lasso_mc_lambda} is called from within \code{run_lasso}. It 142 | #' tunes using multiple cores. 143 | #' 144 | #' @inheritParams auto_MrP 145 | #' @inheritParams run_lasso 146 | #' @param L2.fe.form The fixed effects part of the Lasso classifier formula. The 147 | #' formula is inherited from \code{run_lasso}. 148 | #' @param L1.re A list of random effects for the Lasso classifier formula. The 149 | #' formula is inherited from \code{run_lasso}. 150 | #' @return The cross-validation errors for all models. A list. 
151 | 152 | run_lasso_mc_lambda <- function( 153 | y, L1.x, L2.x, L2.unit, L2.reg, loss.unit, loss.fun, data, 154 | cores, L2.fe.form, L1.re, lambda 155 | ) { 156 | 157 | # Binding for global variables 158 | `%>%` <- dplyr::`%>%` 159 | l <- NULL 160 | 161 | # Register cores 162 | cl <- multicore(cores = cores, type = "open", cl = NULL) 163 | 164 | # Loop over each lambda value 165 | lambda_errors <- foreach::foreach(l = seq_along(lambda)) %dorng% { 166 | 167 | # Set lambda value for the current iteration 168 | lambda_value <- lambda[l] 169 | 170 | # Loop over each fold 171 | k_errors <- lapply(seq_along(data), function(k) { 172 | # Split data in training and validation sets 173 | data_train <- dplyr::bind_rows(data[-k]) 174 | data_valid <- dplyr::bind_rows(data[k]) 175 | 176 | # Convert individual-level, geographic unit, and geographic region 177 | # covariates to factor variables in training and validation sets 178 | data_train <- data_train %>% 179 | dplyr::mutate_at(.vars = c(L1.x, L2.unit, L2.reg), as.factor) %>% 180 | dplyr::select(dplyr::all_of(c(y, L1.x, L2.x, L2.unit, L2.reg))) %>% 181 | tidyr::drop_na() 182 | 183 | data_valid <- data_valid %>% 184 | dplyr::mutate_at(.vars = c(L1.x, L2.unit, L2.reg), as.factor) %>% 185 | dplyr::select(dplyr::all_of(c(y, L1.x, L2.x, L2.unit, L2.reg))) %>% 186 | tidyr::drop_na() 187 | 188 | # Train model using lambda value on kth training set 189 | model_l <- lasso_classifier( 190 | y = y, 191 | L2.fix = L2.fe.form, 192 | L1.re = L1.re, 193 | data.train = data_train, 194 | lambda = lambda_value, 195 | model.family = binomial(link = "probit"), 196 | verbose = FALSE 197 | ) 198 | 199 | # Use trained model to make predictions for kth validation set 200 | pred_l <- stats::predict(model_l, newdata = data.frame(data_valid)) 201 | 202 | # Evaluate predictions based on loss function 203 | perform_l <- loss_function( 204 | pred = pred_l, 205 | data.valid = data_valid, 206 | loss.unit = loss.unit, 207 | loss.fun = loss.fun, 208 | y = y, L2.unit = L2.unit 209 | ) 210 | }) 211 | 212 | # Mean over loss functions 213 | k_errors <- dplyr::bind_rows(k_errors) %>% 214 | dplyr::group_by(measure) %>% 215 | dplyr::summarise(value = mean(value), .groups = "drop") %>% 216 | dplyr::mutate(lambda = lambda[l]) 217 | 218 | } 219 | 220 | # De-register cluster 221 | multicore(cores = cores, type = "close", cl = cl) 222 | 223 | # Function output 224 | return(lambda_errors) 225 | 226 | } 227 | -------------------------------------------------------------------------------- /R/run_pca.R: -------------------------------------------------------------------------------- 1 | #' Apply PCA classifier to MrP. 2 | #' 3 | #' \code{run_pca} is a wrapper function that applies the PCA classifier to data 4 | #' provided by the user, evaluates prediction performance, and chooses the 5 | #' best-performing model. 6 | #' 7 | #' @inheritParams auto_MrP 8 | #' @param data Data for cross-validation. A \code{list} of \eqn{k} 9 | #' \code{data.frames}, one for each fold to be used in \eqn{k}-fold 10 | #' cross-validation. 11 | #' 12 | #' @return A model formula of the winning PCA classifier model.
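# ---------------------------------------------------------------------------
# Illustrative sketch (not package code): run_pca() evaluates best-subset
# style models in which principal components of the context-level covariates
# stand in for the raw L2.x variables (the model list is built internally by
# model_list_pca()). A hand-rolled version of such component scores on the
# bundled survey data could look as follows; the use of prcomp() with
# centering and scaling is an assumption for illustration, not necessarily
# how the package computes its components.
# ---------------------------------------------------------------------------
data(survey_item, package = "autoMrP")
pca_fit <- stats::prcomp(
  survey_item[, paste0("L2.x", 1:6)],
  center = TRUE, scale. = TRUE
)
head(pca_fit$x) # component scores standing in for the L2.x predictors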
13 | 14 | run_pca <- function( 15 | y, L1.x, L2.x, L2.unit, L2.reg, loss.unit, loss.fun, data, cores, 16 | verbose 17 | ) { 18 | 19 | # List of all models to be evaluated 20 | models <- model_list_pca( 21 | y = y, 22 | L1.x = L1.x, 23 | L2.x = L2.x, 24 | L2.unit = L2.unit, 25 | L2.reg = L2.reg 26 | ) 27 | 28 | # parallel tuning if cores > 1 29 | if (cores > 1) { 30 | 31 | # Train all models in parallel 32 | m_errors <- run_best_subset_mc( 33 | verbose = verbose, 34 | models = models, 35 | data = data, 36 | loss.unit = loss.unit, 37 | loss.fun = loss.fun, 38 | y = y, 39 | L1.x = L1.x, 40 | L2.x = L2.x, 41 | L2.unit = L2.unit, 42 | L2.reg = L2.reg, 43 | cores = cores 44 | ) 45 | } else { 46 | # Train and evaluate each model 47 | m_errors <- lapply(seq_along(models), function(m) { 48 | # Print model m 49 | if (isTRUE(verbose)) { 50 | M <- length(models) 51 | message( 52 | "PCA: Running model ", m, 53 | " out of ", M, " models\n") 54 | } 55 | 56 | # Loop over each fold 57 | k_errors <- lapply(seq_along(data), function(k) { 58 | # Split data in training and validation sets 59 | data_train <- dplyr::bind_rows(data[-k]) 60 | data_valid <- dplyr::bind_rows(data[k]) 61 | 62 | # Train mth model on kth training set 63 | model_m <- best_subset_classifier( 64 | y = y, 65 | model = models[[m]], 66 | data.train = data_train, 67 | model.family = binomial(link = "probit"), 68 | model.optimizer = "bobyqa", 69 | n.iter = 1000000, 70 | verbose = verbose 71 | ) 72 | 73 | # Use trained model to make predictions for kth validation set 74 | pred_m <- stats::predict( 75 | model_m, newdata = data_valid, 76 | type = "response", allow.new.levels = TRUE 77 | ) 78 | 79 | # Evaluate predictions based on loss function 80 | perform_m <- loss_function( 81 | pred = pred_m, 82 | data.valid = data_valid, 83 | loss.unit = loss.unit, 84 | loss.fun = loss.fun, 85 | y = y, 86 | L2.unit = L2.unit 87 | ) 88 | }) 89 | 90 | # Mean over loss functions 91 | k_errors <- dplyr::bind_rows(k_errors) %>% 92 | dplyr::group_by(measure) %>% 93 | dplyr::summarise(value = mean(value), .groups = "drop") %>% 94 | dplyr::mutate(model = m) 95 | }) 96 | } 97 | 98 | # Extract best tuning parameters 99 | grid_cells <- dplyr::bind_rows(m_errors) 100 | best_params <- dplyr::slice( 101 | loss_score_ranking(score = grid_cells, loss.fun = loss.fun), 1 102 | ) 103 | 104 | # Choose best-performing model 105 | out <- models[[dplyr::pull(.data = best_params, var = model)]] 106 | 107 | # Function output 108 | return(out) 109 | 110 | } 111 | -------------------------------------------------------------------------------- /R/run_svm.R: -------------------------------------------------------------------------------- 1 | #' Apply support vector machine classifier to MrP. 2 | #' 3 | #' \code{run_svm} is a wrapper function that applies the support vector machine 4 | #' classifier to data provided by the user, evaluates prediction performance, 5 | #' and chooses the best-performing model. 6 | #' 7 | #' @inheritParams auto_MrP 8 | #' @param L2.eval.unit Geographic unit for the loss function. A character scalar 9 | #' containing the column name of the geographic unit in \code{survey} and 10 | #' \code{census}. 11 | #' @param L2.reg Geographic region. A character scalar containing the column 12 | #' name of the geographic region in \code{survey} and \code{census} by which 13 | #' geographic units are grouped (\code{L2.unit} must be nested within 14 | #' \code{L2.reg}). Default is \code{NULL}. 15 | #' @param loss.fun Loss function.
A character-valued scalar indicating whether 16 | #' prediction loss should be measured by the mean squared error (\code{MSE}) 17 | #' or the mean absolute error (\code{MAE}). Default is \code{MSE}. 18 | #' @param kernel SVM kernel. A character-valued scalar specifying the kernel to 19 | #' be used by SVM. The possible values are \code{linear}, \code{polynomial}, 20 | #' \code{radial}, and \code{sigmoid}. Default is \code{radial}. 21 | #' @param gamma SVM kernel parameter. A numeric vector whose values specify the 22 | #' gamma parameter in the SVM kernel. This parameter is needed for all kernel 23 | #' types except linear. Default is a sequence with minimum = 1e-5, maximum = 24 | #' 1e-1, and length = 20 that is equally spaced on the log-scale. 25 | #' @param cost SVM cost parameter. A numeric vector whose values specify the 26 | #' cost of constraints violation in SVM. Default is a sequence with minimum = 27 | #' 0.5, maximum = 10, and length = 5 that is equally spaced on the log-scale. 28 | #' @param data Data for cross-validation. A \code{list} of \eqn{k} 29 | #' \code{data.frames}, one for each fold to be used in \eqn{k}-fold 30 | #' cross-validation. 31 | #' 32 | #' @return The support vector machine tuned parameters. A list. 33 | 34 | run_svm <- function( 35 | y, L1.x, L2.x, L2.eval.unit, L2.unit, L2.reg, 36 | kernel = "radial", loss.fun, loss.unit, gamma, 37 | cost, data, verbose, cores 38 | ) { 39 | 40 | # Create model formula 41 | x <- paste(c(L1.x, L2.x, L2.unit, L2.reg), collapse = " + ") 42 | form <- as.formula(paste(y, " ~ ", x, sep = "")) 43 | 44 | # Default Gamma values 45 | if (is.null(gamma)) { 46 | # SVM Gamma values 47 | gamma <- log_spaced(min = 1e-5, 1e-1, n = 20) 48 | } 49 | 50 | # Default Cost values 51 | if (is.null(cost)) { 52 | cost <- log_spaced(min = 0.5, max = 10, n = 5) 53 | } 54 | 55 | # tuning parameter grid 56 | svm_grid <- expand.grid(gamma, cost, kernel) 57 | names(svm_grid) <- c("gamma", "cost", "kernel") 58 | 59 | # parallel tuning if cores > 1 60 | if (cores > 1) { 61 | 62 | # Train all models in parallel 63 | grid_cells <- run_svm_mc( 64 | verbose = verbose, 65 | svm.grid = svm_grid, 66 | data = data, 67 | L2.eval.unit = L2.eval.unit, 68 | loss.unit = loss.unit, 69 | loss.fun = loss.fun, 70 | y = y, 71 | L1.x = L1.x, 72 | L2.x = L2.x, 73 | L2.unit = L2.unit, 74 | L2.reg = L2.reg, 75 | form = form, 76 | cores = cores 77 | ) 78 | 79 | # Train all models sequentially 80 | } else { 81 | # loop over tuning grid 82 | grid_cells <- apply(svm_grid, 1, function(g) { 83 | 84 | # Set tuning parameters 85 | gamma_value <- as.numeric(g["gamma"]) 86 | cost_value <- as.numeric(g["cost"]) 87 | kernel_value <- as.character(g[["kernel"]]) 88 | 89 | # Loop over each fold 90 | k_errors <- lapply(seq_along(data), function(k) { 91 | 92 | # Split data in training and validation sets and factorize DV 93 | data_train <- dplyr::bind_rows(data[-k]) %>% 94 | dplyr::mutate_at(.vars = y, as.factor) %>% 95 | dplyr::select(dplyr::all_of( 96 | c(y, L1.x, L2.x, L2.eval.unit, L2.reg) 97 | )) %>% 98 | tidyr::drop_na() 99 | 100 | data_valid <- dplyr::bind_rows(data[k]) %>% 101 | dplyr::mutate_at(.vars = y, as.factor) %>% 102 | dplyr::select(dplyr::all_of( 103 | c(y, L1.x, L2.x, L2.eval.unit, L2.reg) 104 | )) %>% 105 | tidyr::drop_na() 106 | 107 | # SVM classifier 108 | model_l <- svm_classifier( 109 | y = y, 110 | form = form, 111 | data = data_train, 112 | kernel = kernel_value, 113 | type = "C-classification", 114 | probability = TRUE, 115 | svm.gamma = gamma_value, 116 | svm.cost =
cost_value, 117 | verbose = verbose 118 | ) 119 | 120 | # Use trained model to make predictions for kth validation set 121 | pred_l <- predict( 122 | model_l, newdata = data.frame(data_valid), 123 | probability = TRUE 124 | ) 125 | if (!is.null(attr(pred_l, "probabilities"))) { 126 | pred_l <- as.numeric(attr(pred_l, "probabilities")[, "1"]) 127 | } 128 | 129 | # Transform factor DV to numeric for loss function 130 | data_valid <- data_valid %>% 131 | dplyr::mutate_at(.vars = y, function(x) as.numeric(levels(x))[x]) 132 | 133 | # Evaluate predictions based on loss function 134 | perform_l <- loss_function( 135 | pred = pred_l, data.valid = data_valid, 136 | loss.unit = loss.unit, 137 | loss.fun = loss.fun, 138 | y = y, L2.unit = L2.eval.unit 139 | ) 140 | }) 141 | 142 | # Mean over loss functions 143 | k_errors <- dplyr::bind_rows(k_errors) %>% 144 | dplyr::group_by(measure) %>% 145 | dplyr::summarise(value = mean(value), .groups = "drop") %>% 146 | dplyr::mutate( 147 | gamma = gamma_value, 148 | cost = cost_value, 149 | kernel = kernel_value 150 | ) 151 | 152 | }) 153 | } 154 | 155 | # Extract best tuning parameters 156 | grid_cells <- dplyr::bind_rows(grid_cells) 157 | best_params <- dplyr::slice( 158 | loss_score_ranking(score = grid_cells, loss.fun = loss.fun), 1 159 | ) 160 | 161 | out <- list( 162 | gamma = dplyr::pull(.data = best_params, var = gamma), 163 | cost = dplyr::pull(.data = best_params, var = cost), 164 | kernel = dplyr::pull(.data = best_params, var = kernel) 165 | ) 166 | 167 | # Function output 168 | return(out) 169 | 170 | } 171 | 172 | ################################################################################ 173 | # Multicore tuning for svm # 174 | ################################################################################ 175 | #' SVM multicore tuning. 176 | #' 177 | #' \code{run_svm_mc} is called from within \code{run_svm}. It tunes using 178 | #' multiple cores. 179 | #' 180 | #' @inheritParams run_svm 181 | #' @param form The model formula. A formula object. 182 | #' @param svm.grid The hyper-parameter search grid. A data.frame of all 183 | #' hyper-parameter combinations. 184 | #' @return The cross-validation errors for all models. A list.
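# ---------------------------------------------------------------------------
# Illustrative sketch (not package code): the default SVM tuning grid that
# run_svm() constructs above -- 20 log-spaced gamma values crossed with 5
# log-spaced cost values and the kernel, i.e. 100 grid cells, each of which
# is scored by k-fold cross-validation. log_spaced() is an unexported
# helper, hence the :::.
# ---------------------------------------------------------------------------
svm_grid <- expand.grid(
  gamma = autoMrP:::log_spaced(min = 1e-5, max = 1e-1, n = 20),
  cost = autoMrP:::log_spaced(min = 0.5, max = 10, n = 5),
  kernel = "radial"
)
nrow(svm_grid) # 100 candidate gamma/cost combinations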
185 | 186 | run_svm_mc <- function( 187 | y, L1.x, L2.x, L2.eval.unit, L2.unit, L2.reg, form, 188 | loss.unit, loss.fun, data, cores, svm.grid, verbose 189 | ) { 190 | 191 | # Binding for global variables 192 | g <- NULL 193 | `%>%` <- dplyr::`%>%` 194 | 195 | # Register cores 196 | cl <- multicore(cores = cores, type = "open", cl = NULL) 197 | 198 | # Train and evaluate each model 199 | grid_cells <- foreach::foreach( 200 | g = seq_len(nrow(svm.grid)), .packages = "autoMrP" 201 | ) %dorng% { 202 | 203 | # Set tuning parameters 204 | gamma_value <- as.numeric(svm.grid[g, "gamma"]) 205 | cost_value <- as.numeric(svm.grid[g, "cost"]) 206 | kernel_value <- svm.grid[g, "kernel"] 207 | 208 | # Loop over each fold 209 | k_errors <- lapply(seq_along(data), function(k) { 210 | 211 | # Split data in training and validation sets and factorize DV 212 | data_train <- dplyr::bind_rows(data[-k]) %>% 213 | dplyr::mutate_at(.vars = y, as.factor) %>% 214 | dplyr::select(dplyr::all_of( 215 | c(y, L1.x, L2.x, L2.eval.unit, L2.reg) 216 | )) %>% 217 | tidyr::drop_na() 218 | 219 | data_valid <- dplyr::bind_rows(data[k]) %>% 220 | dplyr::mutate_at(.vars = y, as.factor) %>% 221 | dplyr::select(dplyr::all_of( 222 | c(y, L1.x, L2.x, L2.eval.unit, L2.reg) 223 | )) %>% 224 | tidyr::drop_na() 225 | 226 | # SVM classifier 227 | model_l <- svm_classifier( 228 | y = y, 229 | form = form, 230 | data = data_train, 231 | kernel = kernel_value, 232 | type = "C-classification", 233 | probability = TRUE, 234 | svm.gamma = gamma_value, 235 | svm.cost = cost_value, 236 | verbose = verbose 237 | ) 238 | 239 | # Use trained model to make predictions for kth validation set 240 | pred_l <- predict( 241 | model_l, newdata = data.frame(data_valid), 242 | probability = TRUE 243 | ) 244 | if (!is.null(attr(pred_l, "probabilities"))) { 245 | pred_l <- as.numeric(attr(pred_l, "probabilities")[, "1"]) 246 | } 247 | 248 | # Transform factor DV to numeric for loss function 249 | data_valid <- data_valid %>% 250 | dplyr::mutate_at(.vars = y, function(x) as.numeric(levels(x))[x]) 251 | 252 | # Evaluate predictions based on loss function 253 | perform_l <- loss_function( 254 | pred = pred_l, 255 | data.valid = data_valid, 256 | loss.unit = loss.unit, 257 | loss.fun = loss.fun, 258 | y = y, 259 | L2.unit = L2.eval.unit 260 | ) 261 | 262 | return(perform_l) 263 | }) 264 | 265 | # Mean over loss functions 266 | k_errors <- dplyr::bind_rows(k_errors) %>% 267 | dplyr::group_by(measure) %>% 268 | dplyr::summarise(value = mean(value), .groups = "drop") %>% 269 | dplyr::mutate( 270 | gamma = gamma_value, 271 | cost = cost_value, 272 | kernel = kernel_value 273 | ) 274 | } 275 | 276 | # De-register cluster 277 | multicore(cores = cores, type = "close", cl = cl) 278 | 279 | # Function output 280 | return(grid_cells) 281 | } 282 | -------------------------------------------------------------------------------- /R/survey_data.R: -------------------------------------------------------------------------------- 1 | #' A sample of a survey item from the CCES 2008 2 | #' 3 | #' The Cooperative Congressional Election Studies (CCES) item (cc418_1) asked: 4 | #' "Would you approve of the use of U.S. military troops in order to ensure the 5 | #' supply of oil?" The original 2008 CCES item contains 36,832 respondents. This 6 | #' sample mimics a typical national survey. It contains at least 5 respondents 7 | #' from each state but is otherwise a random sample.
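#' @examples
#' # Illustrative usage sketch: inspect the bundled item before passing it
#' # to auto_MrP() as the `survey` argument.
#' data(survey_item)
#' table(survey_item$YES)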
8 | #' 9 | #' @format A data frame with 1500 rows and 13 variables: 10 | #' \describe{ 11 | #' \item{YES}{1 if individual supports use of troops; 0 otherwise} 12 | #' \item{L1x1}{Age group (four categories: 1 = 18-29; 2 = 30-44; 3 = 45-64; 4 = 65+)} 13 | #' \item{L1x2}{Education level (four categories: 1 = < high school; 2 = high school graduate; 3 = some college; 4 = college graduate)} 14 | #' \item{L1x3}{Gender-race combination (six categories: 1 = white male; 2 = black male; 3 = hispanic male; 4 = white female; 5 = black female; 6 = hispanic female)} 15 | #' \item{state}{U.S. state} 16 | #' \item{L2.unit}{U.S. state id} 17 | #' \item{region}{U.S. region (four categories: 1 = Northeast; 2 = Midwest; 3 = South; 4 = West)} 18 | #' \item{L2.x1}{Normalized state-level share of votes for the Republican candidate in the previous presidential election} 19 | #' \item{L2.x2}{Normalized state-level percentage of Evangelical Protestant or Mormon respondents} 20 | #' \item{L2.x3}{Normalized state-level percentage of the population living in urban areas} 21 | #' \item{L2.x4}{Normalized state-level unemployment rate} 22 | #' \item{L2.x5}{Normalized state-level share of Hispanics} 23 | #' \item{L2.x6}{Normalized state-level share of Whites} 24 | #' } 25 | #' @source The data set (excluding L2.x3, L2.x4, L2.x5, L2.x6) is taken from the 26 | #' article: Buttice, Matthew K, and Benjamin Highton. 2013. "How does 27 | #' multilevel regression and poststratification perform with 28 | #' conventional national surveys?" Political Analysis 21(4): 449-467. It is a 29 | #' random sample with at least 5 respondents per state. L2.x3, L2.x4, 30 | #' L2.x5 and L2.x6 are available at \url{https://www.census.gov}. 31 | "survey_item" 32 | -------------------------------------------------------------------------------- /R/svm_classifier.R: -------------------------------------------------------------------------------- 1 | #' SVM classifier 2 | #' 3 | #' \code{svm_classifier} applies support vector machine classification to a 4 | #' data set. 5 | #' 6 | #' @inheritParams auto_MrP 7 | #' @param form Model formula. A two-sided linear formula describing 8 | #' the model to be fit, with the outcome on the LHS and the covariates 9 | #' separated by + operators on the RHS. 10 | #' @param data Data. A data.frame containing the cross-validation data used to 11 | #' train and evaluate the model. 12 | #' @param kernel Kernel for SVM. A character string specifying the kernel to 13 | #' be used for SVM. The possible types are linear, polynomial, radial, and 14 | #' sigmoid. Default is radial. 15 | #' @param type svm can be used as a classification machine, as a regression 16 | #' machine, or for novelty detection. Depending on whether y is a factor or 17 | #' not, the default setting for type is C-classification or eps-regression, 18 | #' respectively, but may be overwritten by setting an explicit value. Valid 19 | #' options are: \enumerate{ 20 | #' \item C-classification 21 | #' \item nu-classification 22 | #' \item one-classification (for novelty detection) 23 | #' \item eps-regression 24 | #' \item nu-regression 25 | #' } 26 | #' @param probability Probability predictions. A logical argument indicating 27 | #' whether the model should allow for probability predictions. 28 | #' @param svm.gamma Gamma parameter for SVM. This parameter is needed for all 29 | #' kernels except linear. 30 | #' @param svm.cost Cost parameter for SVM. This parameter specifies the cost of 31 | #' constraints violation.
32 | #' @param verbose Verbose output. A logical vector indicating whether or not 33 | #' verbose output should be printed. 34 | #' @return The support vector machine model. An \code{\link[e1071]{svm}} object. 35 | 36 | svm_classifier <- function( 37 | y, form, data, kernel, type, probability, svm.gamma, 38 | svm.cost, verbose = c(TRUE, FALSE) 39 | ) { 40 | 41 | # Determine type of dependent variable 42 | if ( 43 | data %>% 44 | dplyr::pull(!!y) %>% 45 | unique() %>% 46 | length() > 2 47 | ) { 48 | # set type 49 | type <- "eps-regression" 50 | # numeric dv 51 | data <- data %>% 52 | dplyr::mutate_at(.vars = y, function(x) as.numeric(levels(x))[x]) 53 | } 54 | 55 | # Train and evaluate model using the supplied set of tuning parameters 56 | if (isTRUE(verbose == TRUE)) { 57 | out <- e1071::svm( 58 | formula = form, 59 | data = data, 60 | type = type, 61 | kernel = kernel, 62 | gamma = svm.gamma, 63 | cost = svm.cost, 64 | probability = probability 65 | ) 66 | } else { 67 | out <- suppressMessages(suppressWarnings( 68 | e1071::svm( 69 | formula = form, 70 | data = data, 71 | type = type, 72 | kernel = kernel, 73 | gamma = svm.gamma, 74 | cost = svm.cost, 75 | probability = probability 76 | ) 77 | )) 78 | } 79 | 80 | # Function output 81 | return(out) 82 | } 83 | -------------------------------------------------------------------------------- /R/taxes_census.R: -------------------------------------------------------------------------------- 1 | #' Quasi census data. 2 | #' 3 | #' The census file is generated from the full 2008 National Annenberg Election 4 | #' Studies item CBb01 by disaggregating the 64 ideal type combinations of the 5 | #' individual level variables L1x1, L1x2 and L1x3. A row is an ideal type in a 6 | #' given state. 7 | #' 8 | #' 9 | #' @format A data frame with 2934 rows and 14 variables: 10 | #' \describe{ 11 | #' \item{state}{U.S. state} 12 | #' \item{L2.unit}{U.S. state id} 13 | #' \item{region}{U.S. region (four categories: 1 = Northeast; 2 = Midwest; 3 = South; 4 = West)} 14 | #' \item{L1x1}{Age group (four categories)} 15 | #' \item{L1x2}{Education level (four categories)} 16 | #' \item{L1x3}{Gender-race combination (six categories)} 17 | #' \item{freq}{State-level frequency of ideal type} 18 | #' \item{proportion}{State-level proportion of respondents of that ideal type in the population} 19 | #' \item{L2.x1}{State-level share of votes for the Republican candidate in the previous presidential election} 20 | #' \item{L2.x2}{State-level percentage of Evangelical Protestant or Mormon respondents} 21 | #' \item{L2.x3}{State-level percentage of the population living in urban areas} 22 | #' \item{L2.x4}{State-level unemployment rate} 23 | #' \item{L2.x5}{State-level share of Hispanics} 24 | #' \item{L2.x6}{State-level share of Whites} 25 | #' } 26 | #' @usage data(taxes_census) 27 | #' @source The data set (excluding L2.x3, L2.x4, L2.x5, L2.x6) is taken from the 28 | #' article: Buttice, Matthew K, and Benjamin Highton. 2013. "How does 29 | #' multilevel regression and poststratification perform with 30 | #' conventional national surveys?" Political Analysis 21(4): 449-467. L2.x3, 31 | #' L2.x4, L2.x5 and L2.x6 are available at 32 | #' \url{https://www.census.gov}. 33 | "taxes_census" 34 | -------------------------------------------------------------------------------- /R/taxes_survey.R: -------------------------------------------------------------------------------- 1 | #' Sample on raising taxes from the 2008 National Annenberg Election Studies.
2 | #' 3 | #' The 2008 National Annenberg Election Studies (NAES) item (CBb01) asked: "I'm 4 | #' going to read you some options about federal income taxes. Please tell me 5 | #' which one comes closest to your view on what we should be doing about federal 6 | #' income taxes: (1) Cut taxes; (2) Keep taxes as they are; (3) Raise taxes if 7 | #' necessary; (4) None of these; (998) Don't know; (999) No answer." Category (3) 8 | #' was turned into a 'raise taxes response,' categories (1) and (2) were 9 | #' combined into a 'do not raise taxes' response. The original item from the 10 | #' phone and online surveys contains 50,483 respondents. This sample mimics a 11 | #' typical national survey. It contains at least 5 respondents from each state 12 | #' but is otherwise a random sample. 13 | #' 14 | #' 15 | #' @format A data frame with 1500 rows and 13 variables: 16 | #' \describe{ 17 | #' \item{YES}{1 if individual supports raising taxes; 0 otherwise} 18 | #' \item{L1x1}{Age group (four categories: 1 = 18-29; 2 = 30-44; 3 = 45-64; 4 = 65+)} 19 | #' \item{L1x2}{Education level (four categories: 1 = < high school; 2 = high school graduate; 3 = some college; 4 = college graduate)} 20 | #' \item{L1x3}{Gender-race combination (six categories: 1 = white male; 2 = black male; 3 = hispanic male; 4 = white female; 5 = black female; 6 = hispanic female)} 21 | #' \item{state}{U.S. state} 22 | #' \item{L2.unit}{U.S. state id} 23 | #' \item{region}{U.S. region (four categories: 1 = Northeast; 2 = Midwest; 3 = South; 4 = West)} 24 | #' \item{L2.x1}{State-level share of votes for the Republican candidate in the previous presidential election} 25 | #' \item{L2.x2}{State-level percentage of Evangelical Protestant or Mormon respondents} 26 | #' \item{L2.x3}{State-level percentage of the population living in urban areas} 27 | #' \item{L2.x4}{State-level unemployment rate} 28 | #' \item{L2.x5}{State-level share of Hispanics} 29 | #' \item{L2.x6}{State-level share of Whites} 30 | #' } 31 | #' @usage data(taxes_survey) 32 | #' @source The data set (excluding L2.x3, L2.x4, L2.x5, L2.x6) is taken from the 33 | #' article: Buttice, Matthew K, and Benjamin Highton. 2013. "How does 34 | #' multilevel regression and poststratification perform with 35 | #' conventional national surveys?" Political Analysis 21(4): 449-467. It is a 36 | #' random sample with at least 5 respondents per state. L2.x3, L2.x4, 37 | #' L2.x5 and L2.x6 are available at \url{https://www.census.gov}. 38 | "taxes_survey" 39 | -------------------------------------------------------------------------------- /R/taxes_truth.R: -------------------------------------------------------------------------------- 1 | #' Sample of tax rates item from the 2008 National Annenberg Election Studies. 2 | #' 3 | #' The 2008 National Annenberg Election Studies (NAES) item (CBb01) asked: "I'm 4 | #' going to read you some options about federal income taxes. Please tell me 5 | #' which one comes closest to your view on what we should be doing about federal 6 | #' income taxes: (1) Cut taxes; (2) Keep taxes as they are; (3) Raise taxes if 7 | #' necessary; (4) None of these; (998) Don't know; (999) No answer." Category (3) 8 | #' was turned into a 'raise taxes response,' categories (1) and (2) were 9 | #' combined into a 'do not raise taxes' response. The original item from the 10 | #' phone and online surveys contains 50,483 respondents. This sample mimics a 11 | #' typical national survey.
It contains at least 5 respondents from each state 12 | #' but is otherwise a random sample. 13 | #' 14 | #' 15 | #' @format A data frame with 1500 rows and 13 variables: 16 | #' \describe{ 17 | #' \item{YES}{1 if individual supports raising taxes; 0 otherwise} 18 | #' \item{L1x1}{Age group (four categories: 1 = 18-29; 2 = 30-44; 3 = 45-64; 4 = 65+)} 19 | #' \item{L1x2}{Education level (four categories: 1 = < high school; 2 = high school graduate; 3 = some college; 4 = college graduate)} 20 | #' \item{L1x3}{Gender-race combination (six categories: 1 = white male; 2 = black male; 3 = hispanic male; 4 = white female; 5 = black female; 6 = hispanic female)} 21 | #' \item{state}{U.S. state} 22 | #' \item{L2.unit}{U.S. state id} 23 | #' \item{region}{U.S. region (four categories: 1 = Northeast; 2 = Midwest; 3 = South; 4 = West)} 24 | #' \item{L2.x1}{State-level share of votes for the Republican candidate in the previous presidential election} 25 | #' \item{L2.x2}{State-level percentage of Evangelical Protestant or Mormon respondents} 26 | #' \item{L2.x3}{State-level percentage of the population living in urban areas} 27 | #' \item{L2.x4}{State-level unemployment rate} 28 | #' \item{L2.x5}{State-level share of Hispanics} 29 | #' \item{L2.x6}{State-level share of Whites} 30 | #' } 31 | #' @usage data(taxes_survey) 32 | #' @source The data set (excluding L2.x3, L2.x4, L2.x5, L2.x6) is taken from the 33 | #' article: Buttice, Matthew K, and Benjamin Highton. 2013. "How does 34 | #' multilevel regression and poststratification perform with 35 | #' conventional national surveys?" Political Analysis 21(4): 449-467. It is a 36 | #' random sample with at least 5 respondents per state. L2.x3, L2.x4, 37 | #' L2.x5 and L2.x6 are available at \url{https://www.census.gov}. 38 | "taxes_survey" 39 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # autoMrP 2 | 3 | autoMrP improves the prediction performance of multilevel regression with post-stratification (MrP) by combining a number of machine learning methods through ensemble Bayesian model averaging (EBMA). For more information, see: Broniecki, Leemann, and Wüest. 2022. "Improving Multilevel Regression with Post-Stratification Through Machine Learning (autoMrP)", published in the *Journal of Politics*: https://doi.org/10.1086/714777. 4 | 5 | ## Installation 6 | 7 | To install autoMrP from GitHub, run: 8 | 9 | ```R 10 | devtools::install_github("retowuest/autoMrP", build_vignettes = TRUE) 11 | ``` 12 | 13 | Please refer to the vignette for a detailed introduction to autoMrP.
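A minimal call, sketched below with the bundled NAES taxes data, gives a quick impression of the interface. This is an illustration rather than a quoted example from the vignette; the `bin.proportion` argument (the census column holding each ideal type's population share) and the choice of `"state"` as the geographic unit follow the package documentation but are assumptions in this sketch.

```R
library(autoMrP)

# Tune the classifiers on the survey sample, post-stratify on the quasi
# census, and combine the classifiers via EBMA (illustrative sketch only)
fit <- auto_MrP(
  y = "YES",
  L1.x = c("L1x1", "L1x2", "L1x3"),
  L2.x = c("L2.x1", "L2.x2", "L2.x3", "L2.x4", "L2.x5", "L2.x6"),
  L2.unit = "state",
  L2.reg = "region",
  bin.proportion = "proportion",
  survey = taxes_survey,
  census = taxes_census
)
summary(fit)
```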
Access the vignette via: 14 | 15 | ```R 16 | utils::browseVignettes("autoMrP") 17 | ``` 18 | -------------------------------------------------------------------------------- /autoMrP.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageCheckArgs: --as-cran 22 | PackageRoxygenize: rd,namespace,vignette 23 | -------------------------------------------------------------------------------- /data/absentee_census.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retowuest/autoMrP/1da422652719263acb41d4d3ca04e83d1bcf32f9/data/absentee_census.RData -------------------------------------------------------------------------------- /data/absentee_voting.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retowuest/autoMrP/1da422652719263acb41d4d3ca04e83d1bcf32f9/data/absentee_voting.RData -------------------------------------------------------------------------------- /data/census.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retowuest/autoMrP/1da422652719263acb41d4d3ca04e83d1bcf32f9/data/census.RData -------------------------------------------------------------------------------- /data/survey_item.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retowuest/autoMrP/1da422652719263acb41d4d3ca04e83d1bcf32f9/data/survey_item.RData -------------------------------------------------------------------------------- /data/taxes_census.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retowuest/autoMrP/1da422652719263acb41d4d3ca04e83d1bcf32f9/data/taxes_census.RData -------------------------------------------------------------------------------- /data/taxes_survey.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retowuest/autoMrP/1da422652719263acb41d4d3ca04e83d1bcf32f9/data/taxes_survey.RData -------------------------------------------------------------------------------- /man/.Rapp.history: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retowuest/autoMrP/1da422652719263acb41d4d3ca04e83d1bcf32f9/man/.Rapp.history -------------------------------------------------------------------------------- /man/absentee_census.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/absentee_census.R 3 | \docType{data} 4 | \name{absentee_census} 5 | \alias{absentee_census} 6 | \title{Quasi census data.} 7 | \format{ 8 | A data frame with 2934 rows and 13 variables: 9 | \describe{ 10 | \item{state}{U.S. state} 11 | \item{L2.unit}{U.S. state id} 12 | \item{region}{U.S. 
region (four categories: 1 = Northeast; 2 = Midwest; 3 = South; 4 = West)} 13 | \item{L1x1}{Age group (four categories)} 14 | \item{L1x2}{Education level (four categories)} 15 | \item{L1x3}{Gender-race combination (six categories)} 16 | \item{proportion}{State-level proportion of respondents of that ideal type in the population} 17 | \item{L2.x1}{State-level share of votes for the Republican candidate in the previous presidential election} 18 | \item{L2.x2}{State-level percentage of Evangelical Protestant or Mormon respondents} 19 | \item{L2.x3}{State-level percentage of the population living in urban areas} 20 | \item{L2.x4}{State-level unemployment rate} 21 | \item{L2.x5}{State-level share of Hispanics} 22 | \item{L2.x6}{State-level share of Whites} 23 | } 24 | } 25 | \source{ 26 | The data set (excluding L2.x3, L2.x4, L2.x5, L2.x6) is taken from the 27 | article: Buttice, Matthew K, and Benjamin Highton. 2013. "How does 28 | multilevel regression and poststratification perform with 29 | conventional national surveys?" Political Analysis 21(4): 449-467. L2.x3, 30 | L2.x4, L2.x5 and L2.x6 are available at 31 | \url{https://www.census.gov}. 32 | } 33 | \usage{ 34 | data(absentee_census) 35 | } 36 | \description{ 37 | The census file is generated from the full 2008 Cooperative Congressional Election Studies 38 | item cc419_1 by disaggregating the 64 ideal type combinations of the individual level variables 39 | L1x1, L1x2 and L1x3. A row is an ideal type in a given state. 40 | } 41 | \keyword{datasets} 42 | -------------------------------------------------------------------------------- /man/absentee_voting.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/absentee_voting.R 3 | \docType{data} 4 | \name{absentee_voting} 5 | \alias{absentee_voting} 6 | \title{A sample of the absentee voting item from the CCES 2008} 7 | \format{ 8 | A data frame with 1500 rows and 13 variables: 9 | \describe{ 10 | \item{YES}{1 if individual supports absentee voting over the Internet; 0 otherwise} 11 | \item{L1x1}{Age group (four categories: 1 = 18-29; 2 = 30-44; 3 = 45-64; 4 = 65+)} 12 | \item{L1x2}{Education level (four categories: 1 = < high school; 2 = high school graduate; 3 = some college; 4 = college graduate)} 13 | \item{L1x3}{Gender-race combination (six categories: 1 = white male; 2 = black male; 3 = hispanic male; 4 = white female; 5 = black female; 6 = hispanic female)} 14 | \item{state}{U.S. state} 15 | \item{L2.unit}{U.S. state id} 16 | \item{region}{U.S. region (four categories: 1 = Northeast; 2 = Midwest; 3 = South; 4 = West)} 17 | \item{L2.x1}{State-level share of votes for the Republican candidate in the previous presidential election} 18 | \item{L2.x2}{State-level percentage of Evangelical Protestant or Mormon respondents} 19 | \item{L2.x3}{State-level percentage of the population living in urban areas} 20 | \item{L2.x4}{State-level unemployment rate} 21 | \item{L2.x5}{State-level share of Hispanics} 22 | \item{L2.x6}{State-level share of Whites} 23 | } 24 | } 25 | \source{ 26 | The data set (excluding L2.x3, L2.x4, L2.x5, L2.x6) is taken from the 27 | article: Buttice, Matthew K, and Benjamin Highton. 2013. "How does 28 | multilevel regression and poststratification perform with 29 | conventional national surveys?" Political Analysis 21(4): 449-467. It is a 30 | random sample with at least 5 respondents per state.
L2.x3, L2.x4, 31 | L2.x5 and L2.x6 are available at \url{https://www.census.gov}. 32 | } 33 | \usage{ 34 | data(absentee_voting) 35 | } 36 | \description{ 37 | The Cooperative Congressional Election Studies (CCES) item (cc419_1) asked: 38 | "States have tried many new ways to run elections in recent years. Do you 39 | support or oppose any of the following ways of voting or conducting elections 40 | in your state? Election Reform - Allow absentee voting over the Internet?" 41 | The original 2008 CCES item contains 26,934 respondents. This sample mimics a 42 | typical national survey. It contains at least 5 respondents from each state 43 | but is otherwise a random sample. 44 | } 45 | \keyword{datasets} 46 | -------------------------------------------------------------------------------- /man/best_subset_classifier.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/best_subset_classifier.R 3 | \name{best_subset_classifier} 4 | \alias{best_subset_classifier} 5 | \title{Best subset classifier} 6 | \usage{ 7 | best_subset_classifier( 8 | model, 9 | data.train, 10 | model.family, 11 | model.optimizer, 12 | n.iter, 13 | y, 14 | verbose = c(TRUE, FALSE) 15 | ) 16 | } 17 | \arguments{ 18 | \item{model}{Multilevel model. A model formula describing the multilevel 19 | model to be estimated on the basis of the provided training data.} 20 | 21 | \item{data.train}{Training data. A data.frame containing the training data 22 | used to train the model.} 23 | 24 | \item{model.family}{Model family. A variable indicating the model family 25 | to be used by glmer. Defaults to binomial(link = "probit").} 26 | 27 | \item{model.optimizer}{Optimization method. A character-valued scalar 28 | describing the optimization method to be used by glmer. Defaults to 29 | "bobyqa".} 30 | 31 | \item{n.iter}{Iterations. An integer-valued scalar specifying the maximum 32 | number of function evaluations tried by the optimization method.} 33 | 34 | \item{y}{Outcome variable. A character vector containing the column names of 35 | the outcome variable. A character scalar containing the column name of 36 | the outcome variable in \code{survey}.} 37 | 38 | \item{verbose}{Verbose output. A logical vector indicating whether or not 39 | verbose output should be printed.} 40 | } 41 | \value{ 42 | The multilevel model. An \code{\link[lme4]{glmer}} object. 43 | } 44 | \description{ 45 | \code{best_subset_classifier} applies best subset classification to a data 46 | set. 47 | } 48 | -------------------------------------------------------------------------------- /man/binary_cross_entropy.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{binary_cross_entropy} 4 | \alias{binary_cross_entropy} 5 | \title{Estimates the inverse binary cross-entropy, i.e. 0 is the best score and 1 6 | the worst.} 7 | \usage{ 8 | binary_cross_entropy( 9 | pred, 10 | data.valid, 11 | loss.unit = c("individuals", "L2 units"), 12 | y, 13 | L2.unit 14 | ) 15 | } 16 | \arguments{ 17 | \item{pred}{Predictions of outcome. A numeric vector of outcome predictions.} 18 | 19 | \item{data.valid}{Test data set. A tibble of data that was not used for 20 | prediction.} 21 | 22 | \item{loss.unit}{Loss function unit.
A character-valued scalar indicating 23 | whether performance loss should be evaluated at the level of individual 24 | respondents (\code{individuals}) or geographic units (\code{L2 units}). 25 | Default is \code{individuals}.} 26 | 27 | \item{y}{Outcome variable. A character vector containing the column names of 28 | the outcome variable.} 29 | 30 | \item{L2.unit}{Geographic unit. A character scalar containing the column name 31 | of the geographic unit in \code{survey} and \code{census} at which outcomes 32 | should be aggregated.} 33 | } 34 | \value{ 35 | Returns a tibble containing two binary cross-entropy prediction 36 | errors. The first is measured at the level of individuals and the second is 37 | measured at the context level. The tibble dimensions are 2x3 with 38 | variables: measure, value and level. 39 | } 40 | \description{ 41 | \code{binary_cross_entropy()} estimates the inverse binary cross-entropy at 42 | the individual and state level. 43 | } 44 | -------------------------------------------------------------------------------- /man/census.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/census_data.R 3 | \docType{data} 4 | \name{census} 5 | \alias{census} 6 | \title{Quasi census data.} 7 | \format{ 8 | A data frame with 2934 rows and 13 variables: 9 | \describe{ 10 | \item{state}{U.S. state} 11 | \item{L2.unit}{U.S. state id} 12 | \item{region}{U.S. region (four categories: 1 = Northeast; 2 = Midwest; 3 = South; 4 = West)} 13 | \item{L1x1}{Age group (four categories)} 14 | \item{L1x2}{Education level (four categories)} 15 | \item{L1x3}{Gender-race combination (six categories)} 16 | \item{proportion}{State-level proportion of respondents of that ideal type in the population} 17 | \item{L2.x1}{State-level share of votes for the Republican candidate in the previous presidential election} 18 | \item{L2.x2}{State-level percentage of Evangelical Protestant or Mormon respondents} 19 | \item{L2.x3}{State-level percentage of the population living in urban areas} 20 | \item{L2.x4}{State-level unemployment rate} 21 | \item{L2.x5}{State-level share of Hispanics} 22 | \item{L2.x6}{State-level share of Whites} 23 | } 24 | } 25 | \source{ 26 | The data set (excluding L2.x3, L2.x4, L2.x5, L2.x6) is taken from the 27 | article: Buttice, Matthew K, and Benjamin Highton. 2013. "How does 28 | multilevel regression and poststratification perform with 29 | conventional national surveys?" Political Analysis 21(4): 449-467. L2.x3, 30 | L2.x4, L2.x5 and L2.x6 are available at 31 | \url{https://www.census.gov}. 32 | } 33 | \usage{ 34 | census 35 | } 36 | \description{ 37 | The census file is generated from the full 2008 Cooperative Congressional Election Studies 38 | item cc418_1 by disaggregating the 64 ideal type combinations of the individual level variables 39 | L1x1, L1x2 and L1x3. A row is an ideal type in a given state.
40 | } 41 | \keyword{datasets} 42 | -------------------------------------------------------------------------------- /man/cv_folding.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{cv_folding} 4 | \alias{cv_folding} 5 | \title{Generates folds for cross-validation} 6 | \usage{ 7 | cv_folding(data, L2.unit, k.folds, cv.sampling = c("individuals", "L2 units")) 8 | } 9 | \arguments{ 10 | \item{data}{The survey data; must be a tibble.} 11 | 12 | \item{L2.unit}{The column name of the factor variable identifying the 13 | context-level unit} 14 | 15 | \item{k.folds}{An integer value indicating the number of folds to be 16 | generated.} 17 | 18 | \item{cv.sampling}{Cross-validation sampling method. A character-valued 19 | scalar indicating whether cross-validation folds should be created by 20 | sampling individual respondents (\code{individuals}) or geographic units 21 | (\code{L2 units}). Default is \code{L2 units}. \emph{Note:} ignored if 22 | \code{folds} is provided, but must be specified otherwise.} 23 | } 24 | \value{ 25 | Returns a list with length specified by \code{k.folds} argument. Each 26 | element is a tibble with a fold used in k-fold cross-validation. 27 | } 28 | \description{ 29 | \code{cv_folding} creates folds used in classifier training within the survey 30 | data. 31 | } 32 | -------------------------------------------------------------------------------- /man/deep_mrp_classifier.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deep_mrp_classifier.r 3 | \name{deep_mrp_classifier} 4 | \alias{deep_mrp_classifier} 5 | \title{Deep MrP classifier} 6 | \usage{ 7 | deep_mrp_classifier(y, form, data, verbose) 8 | } 9 | \arguments{ 10 | \item{y}{Outcome variable. A character vector containing the column names of 11 | the outcome variable. A character scalar containing the column name of 12 | the outcome variable in \code{survey}.} 13 | 14 | \item{form}{Model formula. A two-sided linear formula describing 15 | the model to be fit, with the outcome on the LHS and the covariates 16 | separated by + operators on the RHS.} 17 | 18 | \item{data}{Data. A data.frame containing the data used to train the model.} 19 | 20 | \item{verbose}{Verbose output. A logical argument indicating whether or not 21 | verbose output should be printed. Default is \code{FALSE}.} 22 | } 23 | \value{ 24 | A Deep MrP model. A \code{\link[vglmer]{vglmer}} object. 25 | } 26 | \description{ 27 | \code{deep_mrp_classifier} applies Deep MrP implemented in the \pkg{vglmer} 28 | package to a data set. 29 | } 30 | -------------------------------------------------------------------------------- /man/ebma.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ebma.R 3 | \name{ebma} 4 | \alias{ebma} 5 | \title{Bayesian Ensemble Model Averaging EBMA} 6 | \usage{ 7 | ebma( 8 | ebma.fold, 9 | y, 10 | L1.x, 11 | L2.x, 12 | L2.unit, 13 | L2.reg, 14 | pc.names, 15 | post.strat, 16 | n.draws, 17 | tol, 18 | best.subset.opt, 19 | pca.opt, 20 | lasso.opt, 21 | gb.opt, 22 | svm.opt, 23 | deep.mrp, 24 | verbose, 25 | cores, 26 | preds_all 27 | ) 28 | } 29 | \arguments{ 30 | \item{ebma.fold}{New data for EBMA tuning. 
A list containing the data 31 | that must not have been used in classifier training.} 32 | 33 | \item{y}{Outcome variable. A character vector containing the column names of 34 | the outcome variable. A character scalar containing the column name of 35 | the outcome variable in \code{survey}.} 36 | 37 | \item{L1.x}{Individual-level covariates. A character vector containing the 38 | column names of the individual-level variables in \code{survey} and 39 | \code{census} used to predict outcome \code{y}. Note that geographic unit 40 | is specified in argument \code{L2.unit}.} 41 | 42 | \item{L2.x}{Context-level covariates. A character vector containing the 43 | column names of the context-level variables in \code{survey} and 44 | \code{census} used to predict outcome \code{y}. To exclude context-level 45 | variables, set \code{L2.x = NULL}.} 46 | 47 | \item{L2.unit}{Geographic unit. A character scalar containing the column 48 | name of the geographic unit in \code{survey} and \code{census} at which 49 | outcomes should be aggregated.} 50 | 51 | \item{L2.reg}{Geographic region. A character scalar containing the column 52 | name of the geographic region in \code{survey} and \code{census} by which 53 | geographic units are grouped (\code{L2.unit} must be nested within 54 | \code{L2.reg}). Default is \code{NULL}.} 55 | 56 | \item{pc.names}{Principal Component Variable names. A character vector 57 | containing the names of the context-level principal components variables.} 58 | 59 | \item{post.strat}{Post-stratification results. A list containing the best 60 | models for each of the tuned classifiers, the individual-level predictions 61 | on the classifier training data and the post-stratified context-level 62 | predictions.} 63 | 64 | \item{n.draws}{EBMA number of samples. An integer-valued scalar specifying 65 | the number of bootstrapped samples to be drawn from the EBMA fold and used 66 | for tuning EBMA. Default is \eqn{100}. Passed on from \code{ebma.n.draws}.} 67 | 68 | \item{tol}{EBMA tolerance. A numeric vector containing the tolerance values 69 | for improvements in the log-likelihood before the EM algorithm stops 70 | optimization. Values should range at least from \eqn{0.01} to \eqn{0.001}. 71 | Default is \code{c(0.01, 0.005, 0.001, 0.0005, 0.0001, 0.00005, 0.00001)}. 72 | Passed on from \code{ebma.tol}.} 73 | 74 | \item{best.subset.opt}{Tuned best subset parameters. A list returned from 75 | \code{run_best_subset()}.} 76 | 77 | \item{pca.opt}{Tuned best subset with principal components parameters. A list 78 | returned from \code{run_pca()}.} 79 | 80 | \item{lasso.opt}{Tuned lasso parameters. A list returned from 81 | \code{run_lasso()}.} 82 | 83 | \item{gb.opt}{Tuned gradient tree boosting parameters. A list returned from 84 | \code{run_gb()}.} 85 | 86 | \item{svm.opt}{Tuned support vector machine parameters. A list returned from 87 | \code{run_svm()}.} 88 | 89 | \item{deep.mrp}{Deep MRP classifier. A logical argument indicating whether 90 | the deep MRP classifier should be used for best subset prediction. Setting 91 | \code{deep.mrp = TRUE} will include all interactions of L1.x in the best 92 | subset classifier. Default is \code{FALSE}.} 93 | 94 | \item{verbose}{Verbose output. A logical argument indicating whether or not 95 | verbose output should be printed. Default is \code{FALSE}.} 96 | 97 | \item{cores}{The number of cores to be used. An integer indicating the number 98 | of processor cores used for parallel computing.
Default is 1.} 99 | } 100 | \description{ 101 | \code{ebma} tunes EBMA and generates weights for classifier averaging. 102 | } 103 | -------------------------------------------------------------------------------- /man/ebma_folding.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{ebma_folding} 4 | \alias{ebma_folding} 5 | \title{Generates data fold to be used for EBMA tuning} 6 | \usage{ 7 | ebma_folding(data, L2.unit, ebma.size) 8 | } 9 | \arguments{ 10 | \item{data}{The full survey data. A tibble.} 11 | 12 | \item{L2.unit}{Geographic unit. A character scalar containing the column name 13 | of the geographic unit in \code{survey} and \code{census} at which outcomes 14 | should be aggregated.} 15 | 16 | \item{ebma.size}{EBMA fold size. A number in the open unit interval 17 | indicating the proportion of respondents to be allocated to the EBMA fold. 18 | Default is \eqn{1/3}.} 19 | } 20 | \value{ 21 | Returns a list with two elements which are both tibbles. List element 22 | one is named \code{ebma_fold} and contains the tibble used in Ensemble 23 | Bayesian Model Averaging tuning. List element two is named \code{cv_data} 24 | and contains the tibble used for classifier tuning. 25 | } 26 | \description{ 27 | \code{ebma_folding()} generates a data fold that will not be used in 28 | classifier tuning. It is the data needed to determine the optimal 29 | tolerance for EBMA. 30 | } 31 | -------------------------------------------------------------------------------- /man/ebma_mc_draws.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ebma.R 3 | \name{ebma_mc_draws} 4 | \alias{ebma_mc_draws} 5 | \title{EBMA multicore tuning - parallelises over draws.} 6 | \usage{ 7 | ebma_mc_draws( 8 | train.preds, 9 | train.y, 10 | ebma.fold, 11 | y, 12 | L1.x, 13 | L2.x, 14 | L2.unit, 15 | L2.reg, 16 | pc.names, 17 | model.bs, 18 | model.pca, 19 | model.lasso, 20 | model.gb, 21 | model.svm, 22 | model.mrp, 23 | tol, 24 | n.draws, 25 | cores, 26 | preds_all, 27 | post.strat, 28 | dv_type, 29 | deep.mrp 30 | ) 31 | } 32 | \arguments{ 33 | \item{train.preds}{Predictions of classifiers on the classifier training 34 | data. A tibble.} 35 | 36 | \item{train.y}{Outcome variable of the classifier training data. A numeric 37 | vector.} 38 | 39 | \item{ebma.fold}{New data for EBMA tuning. A list containing the data 40 | that must not have been used in classifier training.} 41 | 42 | \item{y}{Outcome variable. A character vector containing the column names of 43 | the outcome variable. A character scalar containing the column name of 44 | the outcome variable in \code{survey}.} 45 | 46 | \item{L1.x}{Individual-level covariates. A character vector containing the 47 | column names of the individual-level variables in \code{survey} and 48 | \code{census} used to predict outcome \code{y}. Note that geographic unit 49 | is specified in argument \code{L2.unit}.} 50 | 51 | \item{L2.x}{Context-level covariates. A character vector containing the 52 | column names of the context-level variables in \code{survey} and 53 | \code{census} used to predict outcome \code{y}. To exclude context-level 54 | variables, set \code{L2.x = NULL}.} 55 | 56 | \item{L2.unit}{Geographic unit.
A character scalar containing the column 57 | name of the geographic unit in \code{survey} and \code{census} at which 58 | outcomes should be aggregated.} 59 | 60 | \item{L2.reg}{Geographic region. A character scalar containing the column 61 | name of the geographic region in \code{survey} and \code{census} by which 62 | geographic units are grouped (\code{L2.unit} must be nested within 63 | \code{L2.reg}). Default is \code{NULL}.} 64 | 65 | \item{pc.names}{Principal Component Variable names. A character vector 66 | containing the names of the context-level principal components variables.} 67 | 68 | \item{model.bs}{The tuned model from the multilevel regression with best 69 | subset selection classifier. An \code{\link[lme4]{glmer}} object.} 70 | 71 | \item{model.pca}{The tuned model from the multilevel regression with 72 | principal components as context-level predictors classifier. An 73 | \code{\link[lme4]{glmer}} object.} 74 | 75 | \item{model.lasso}{The tuned model from the multilevel regression with L1 76 | regularization classifier. A \code{\link[glmmLasso]{glmmLasso}} object.} 77 | 78 | \item{model.gb}{The tuned model from the gradient boosting classifier. A 79 | \code{\link[gbm]{gbm}} object.} 80 | 81 | \item{model.svm}{The tuned model from the support vector machine classifier. 82 | An \code{\link[e1071]{svm}} object.} 83 | 84 | \item{model.mrp}{The standard MrP model. An \code{\link[lme4]{glmer}} object} 85 | 86 | \item{tol}{EBMA tolerance. A numeric vector containing the tolerance values 87 | for improvements in the log-likelihood before the EM algorithm stops 88 | optimization. Values should range at least from \eqn{0.01} to \eqn{0.001}. 89 | Default is \code{c(0.01, 0.005, 0.001, 0.0005, 0.0001, 0.00005, 0.00001)}. 90 | Passed on from \code{ebma.tol}.} 91 | 92 | \item{n.draws}{EBMA number of samples. An integer-valued scalar specifying 93 | the number of bootstrapped samples to be drawn from the EBMA fold and used 94 | for tuning EBMA. Default is \eqn{100}. Passed on from \code{ebma.n.draws}.} 95 | 96 | \item{cores}{The number of cores to be used. An integer indicating the number 97 | of processor cores used for parallel computing. Default is 1.} 98 | 99 | \item{post.strat}{Post-stratification results. A list containing the best 100 | models for each of the tuned classifiers, the individual-level predictions 101 | on the classifier training data and the post-stratified context-level 102 | predictions.} 103 | 104 | \item{dv_type}{The type of the dependent variable. A character string. 105 | Either "binary" or "linear".} 106 | 107 | \item{deep.mrp}{Deep MRP classifier. A logical argument indicating whether 108 | the deep MRP classifier should be used for best subset prediction. Setting 109 | \code{deep.mrp = TRUE} will include all interactions of L1.x in the best 110 | subset classifier. Default is \code{FALSE}.} 111 | } 112 | \value{ 113 | The classifier weights. A numeric vector. 114 | } 115 | \description{ 116 | \code{ebma_mc_draws} is called from within \code{ebma}. It tunes using 117 | multiple cores.
118 | } 119 | -------------------------------------------------------------------------------- /man/ebma_mc_tol.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ebma.R 3 | \name{ebma_mc_tol} 4 | \alias{ebma_mc_tol} 5 | \title{EBMA multicore tuning - parallelises over tolerance values.} 6 | \usage{ 7 | ebma_mc_tol( 8 | train.preds, 9 | train.y, 10 | ebma.fold, 11 | y, 12 | L1.x, 13 | L2.x, 14 | L2.unit, 15 | L2.reg, 16 | pc.names, 17 | model.bs, 18 | model.pca, 19 | model.lasso, 20 | model.gb, 21 | model.svm, 22 | model.mrp, 23 | tol, 24 | n.draws, 25 | cores, 26 | preds_all, 27 | post.strat, 28 | dv_type, 29 | deep.mrp 30 | ) 31 | } 32 | \arguments{ 33 | \item{train.preds}{Predictions of classifiers on the classifier training 34 | data. A tibble.} 35 | 36 | \item{train.y}{Outcome variable of the classifier training data. A numeric 37 | vector.} 38 | 39 | \item{ebma.fold}{The data used for EBMA tuning. A tibble.} 40 | 41 | \item{y}{Outcome variable. A character vector containing the column names of 42 | the outcome variable. A character scalar containing the column name of 43 | the outcome variable in \code{survey}.} 44 | 45 | \item{L1.x}{Individual-level covariates. A character vector containing the 46 | column names of the individual-level variables in \code{survey} and 47 | \code{census} used to predict outcome \code{y}. Note that geographic unit 48 | is specified in argument \code{L2.unit}.} 49 | 50 | \item{L2.x}{Context-level covariates. A character vector containing the 51 | column names of the context-level variables in \code{survey} and 52 | \code{census} used to predict outcome \code{y}. To exclude context-level 53 | variables, set \code{L2.x = NULL}.} 54 | 55 | \item{L2.unit}{Geographic unit. A character scalar containing the column 56 | name of the geographic unit in \code{survey} and \code{census} at which 57 | outcomes should be aggregated.} 58 | 59 | \item{L2.reg}{Geographic region. A character scalar containing the column 60 | name of the geographic region in \code{survey} and \code{census} by which 61 | geographic units are grouped (\code{L2.unit} must be nested within 62 | \code{L2.reg}). Default is \code{NULL}.} 63 | 64 | \item{pc.names}{Principal Component Variable names. A character vector 65 | containing the names of the context-level principal components variables.} 66 | 67 | \item{model.bs}{The tuned model from the multilevel regression with best 68 | subset selection classifier. An \code{\link[lme4]{glmer}} object.} 69 | 70 | \item{model.pca}{The tuned model from the multilevel regression with 71 | principal components as context-level predictors classifier. An 72 | \code{\link[lme4]{glmer}} object.} 73 | 74 | \item{model.lasso}{The tuned model from the multilevel regression with L1 75 | regularization classifier. A \code{\link[glmmLasso]{glmmLasso}} object.} 76 | 77 | \item{model.gb}{The tuned model from the gradient boosting classifier. A 78 | \code{\link[gbm]{gbm}} object.} 79 | 80 | \item{model.svm}{The tuned model from the support vector machine classifier. 81 | An \code{\link[e1071]{svm}} object.} 82 | 83 | \item{model.mrp}{The standard MrP model. An \code{\link[lme4]{glmer}} object} 84 | 85 | \item{tol}{The tolerance values used for EBMA. A numeric vector.} 86 | 87 | \item{n.draws}{EBMA number of samples. An integer-valued scalar specifying 88 | the number of bootstrapped samples to be drawn from the EBMA fold and used 89 | for tuning EBMA. 
Default is \eqn{100}. Passed on from \code{ebma.n.draws}.} 90 | 91 | \item{cores}{The number of cores to be used. An integer indicating the number 92 | of processor cores used for parallel computing. Default is 1.} 93 | 94 | \item{post.strat}{Post-stratification results. A list containing the best 95 | models for each of the tuned classifiers, the individual-level predictions 96 | on the classifier training data, and the post-stratified context-level 97 | predictions.} 98 | 99 | \item{dv_type}{The type of the dependent variable. A character string. 100 | Either "binary" or "linear".} 101 | 102 | \item{deep.mrp}{Deep MRP classifier. A logical argument indicating whether 103 | the deep MRP classifier should be used for best subset prediction. Setting 104 | \code{deep.mrp = TRUE} will include all interactions of L1.x in the best 105 | subset classifier. Default is \code{FALSE}.} 106 | } 107 | \value{ 108 | The classifier weights. A numeric vector. 109 | } 110 | \description{ 111 | \code{ebma_mc_tol} is called from within \code{ebma}. It tunes using 112 | multiple cores. 113 | } 114 | \examples{ 115 | \dontrun{ 116 | # not yet 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /man/error_checks.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{error_checks} 4 | \alias{error_checks} 5 | \title{Catches user input errors} 6 | \usage{ 7 | error_checks( 8 | y, 9 | L1.x, 10 | L2.x, 11 | L2.unit, 12 | L2.reg, 13 | L2.x.scale, 14 | pcs, 15 | folds, 16 | bin.proportion, 17 | bin.size, 18 | survey, 19 | census, 20 | ebma.size, 21 | k.folds, 22 | cv.sampling, 23 | loss.unit, 24 | loss.fun, 25 | best.subset, 26 | lasso, 27 | pca, 28 | gb, 29 | svm, 30 | mrp, 31 | best.subset.L2.x, 32 | lasso.L2.x, 33 | deep.mrp, 34 | gb.L2.x, 35 | svm.L2.x, 36 | mrp.L2.x, 37 | gb.L2.unit, 38 | gb.L2.reg, 39 | lasso.lambda, 40 | lasso.n.iter, 41 | deep.splines, 42 | uncertainty, 43 | boot.iter 44 | ) 45 | } 46 | \arguments{ 47 | \item{y}{Outcome variable. A character vector containing the column names of 48 | the outcome variable. A character scalar containing the column name of 49 | the outcome variable in \code{survey}.} 50 | 51 | \item{L1.x}{Individual-level covariates. A character vector containing the 52 | column names of the individual-level variables in \code{survey} and 53 | \code{census} used to predict outcome \code{y}. Note that geographic unit 54 | is specified in argument \code{L2.unit}.} 55 | 56 | \item{L2.x}{Context-level covariates. A character vector containing the 57 | column names of the context-level variables in \code{survey} and 58 | \code{census} used to predict outcome \code{y}. To exclude context-level 59 | variables, set \code{L2.x = NULL}.} 60 | 61 | \item{L2.unit}{Geographic unit. A character scalar containing the column 62 | name of the geographic unit in \code{survey} and \code{census} at which 63 | outcomes should be aggregated.} 64 | 65 | \item{L2.reg}{Geographic region. A character scalar containing the column 66 | name of the geographic region in \code{survey} and \code{census} by which 67 | geographic units are grouped (\code{L2.unit} must be nested within 68 | \code{L2.reg}). Default is \code{NULL}.} 69 | 70 | \item{L2.x.scale}{Scale context-level covariates. A logical argument 71 | indicating whether the context-level covariates should be normalized. 72 | Default is \code{TRUE}.
Note that if set to \code{FALSE}, then the 73 | context-level covariates should be normalized prior to calling 74 | \code{auto_MrP()}.} 75 | 76 | \item{pcs}{Principal components. A character vector containing the column 77 | names of the principal components of the context-level variables in 78 | \code{survey} and \code{census}. Default is \code{NULL}.} 79 | 80 | \item{folds}{EBMA and cross-validation folds. A character scalar containing 81 | the column name of the variable in \code{survey} that specifies the fold 82 | to which an observation is allocated. The variable should contain integers 83 | running from \eqn{1} to \eqn{k + 1}, where \eqn{k} is the number of 84 | cross-validation folds. Value \eqn{k + 1} refers to the EBMA fold. Default 85 | is \code{NULL}. \emph{Note:} if \code{folds} is \code{NULL}, then 86 | \code{ebma.size}, \code{k.folds}, and \code{cv.sampling} must be specified.} 87 | 88 | \item{bin.proportion}{Proportion of ideal types. A character scalar 89 | containing the column name of the variable in \code{census} that indicates 90 | the proportion of individuals by ideal type and geographic unit. Default is 91 | \code{NULL}. \emph{Note:} if \code{bin.proportion} is \code{NULL}, then 92 | \code{bin.size} must be specified.} 93 | 94 | \item{bin.size}{Bin size of ideal types. A character scalar containing the 95 | column name of the variable in \code{census} that indicates the bin size of 96 | ideal types by geographic unit. Default is \code{NULL}. \emph{Note:} 97 | ignored if \code{bin.proportion} is provided, but must be specified 98 | otherwise.} 99 | 100 | \item{survey}{Survey data. A \code{data.frame} whose column names include 101 | \code{y}, \code{L1.x}, \code{L2.x}, \code{L2.unit}, and, if specified, 102 | \code{L2.reg}, \code{pcs}, and \code{folds}.} 103 | 104 | \item{census}{Census data. A \code{data.frame} whose column names include 105 | \code{L1.x}, \code{L2.x}, \code{L2.unit}, if specified, \code{L2.reg} and 106 | \code{pcs}, and either \code{bin.proportion} or \code{bin.size}.} 107 | 108 | \item{ebma.size}{EBMA fold size. A number in the open unit interval 109 | indicating the proportion of respondents to be allocated to the EBMA fold. 110 | Default is \eqn{1/3}. \emph{Note:} ignored if \code{folds} is provided, but 111 | must be specified otherwise.} 112 | 113 | \item{k.folds}{Number of cross-validation folds. An integer-valued scalar 114 | indicating the number of folds to be used in cross-validation. Default is 115 | \eqn{5}. \emph{Note:} ignored if \code{folds} is provided, but must be 116 | specified otherwise.} 117 | 118 | \item{cv.sampling}{Cross-validation sampling method. A character-valued 119 | scalar indicating whether cross-validation folds should be created by 120 | sampling individual respondents (\code{individuals}) or geographic units 121 | (\code{L2 units}). Default is \code{L2 units}. \emph{Note:} ignored if 122 | \code{folds} is provided, but must be specified otherwise.} 123 | 124 | \item{loss.unit}{Loss function unit. A character-valued scalar indicating 125 | whether performance loss should be evaluated at the level of individual 126 | respondents (\code{individuals}), geographic units (\code{L2 units}) or at 127 | both levels. Default is \code{c("individuals", "L2 units")}. With multiple 128 | loss units, parameters are ranked for each loss unit and the loss unit with 129 | the lowest rank sum is chosen. Ties are broken according to the order in 130 | the search grid.} 131 | 132 | \item{loss.fun}{Loss function. 
A character-valued scalar indicating whether 133 | prediction loss should be measured by the mean squared error (\code{MSE}), 134 | the mean absolute error (\code{MAE}), binary cross-entropy 135 | (\code{cross-entropy}), mean squared false error (\code{msfe}), the f1 136 | score (\code{f1}), or a combination thereof. Default is \code{c("MSE", 137 | "cross-entropy","msfe", "f1")}. With multiple loss functions, parameters 138 | are ranked for each loss function and the parameter combination with the 139 | lowest rank sum is chosen. Ties are broken according to the order in the 140 | search grid.} 141 | 142 | \item{best.subset}{Best subset classifier. A logical argument indicating 143 | whether the best subset classifier should be used for predicting outcome 144 | \code{y}. Default is \code{TRUE}.} 145 | 146 | \item{lasso}{Lasso classifier. A logical argument indicating whether the 147 | lasso classifier should be used for predicting outcome \code{y}. Default is 148 | \code{TRUE}.} 149 | 150 | \item{pca}{PCA classifier. A logical argument indicating whether the PCA 151 | classifier should be used for predicting outcome \code{y}. Default is 152 | \code{TRUE}.} 153 | 154 | \item{gb}{GB classifier. A logical argument indicating whether the GB 155 | classifier should be used for predicting outcome \code{y}. Default is 156 | \code{TRUE}.} 157 | 158 | \item{svm}{SVM classifier. A logical argument indicating whether the SVM 159 | classifier should be used for predicting outcome \code{y}. Default is 160 | \code{TRUE}.} 161 | 162 | \item{mrp}{MRP classifier. A logical argument indicating whether the standard 163 | MRP classifier should be used for predicting outcome \code{y}. Default is 164 | \code{FALSE}.} 165 | 166 | \item{best.subset.L2.x}{Best subset context-level covariates. A character 167 | vector containing the column names of the context-level variables in 168 | \code{survey} and \code{census} to be used by the best subset classifier. 169 | If \code{NULL} and \code{best.subset} is set to \code{TRUE}, then best 170 | subset uses the variables specified in \code{L2.x}. Default is \code{NULL}.} 171 | 172 | \item{lasso.L2.x}{Lasso context-level covariates. A character vector 173 | containing the column names of the context-level variables in 174 | \code{survey} and \code{census} to be used by the lasso classifier. If 175 | \code{NULL} and \code{lasso} is set to \code{TRUE}, then lasso uses the 176 | variables specified in \code{L2.x}. Default is \code{NULL}.} 177 | 178 | \item{deep.mrp}{Deep MRP classifier. A logical argument indicating whether 179 | the deep MRP classifier should be used for best subset prediction. Setting 180 | \code{deep.mrp = TRUE} will include all interactions of L1.x in the best 181 | subset classifier. Default is \code{FALSE}.} 182 | 183 | \item{gb.L2.x}{GB context-level covariates. A character vector containing the 184 | column names of the context-level variables in \code{survey} and 185 | \code{census} to be used by the GB classifier. If \code{NULL} and \code{gb} 186 | is set to \code{TRUE}, then GB uses the variables specified in \code{L2.x}. 187 | Default is \code{NULL}.} 188 | 189 | \item{svm.L2.x}{SVM context-level covariates. A character vector containing 190 | the column names of the context-level variables in \code{survey} and 191 | \code{census} to be used by the SVM classifier. If \code{NULL} and 192 | \code{svm} is set to \code{TRUE}, then SVM uses the variables specified in 193 | \code{L2.x}. 
Default is \code{NULL}.} 194 | 195 | \item{mrp.L2.x}{MRP context-level covariates. A character vector containing 196 | the column names of the context-level variables in \code{survey} and 197 | \code{census} to be used by the MRP classifier. The character vector should 198 | be \emph{empty} if no context-level variables should be used by the MRP 199 | classifier. If \code{NULL} and \code{mrp} is set to \code{TRUE}, then MRP 200 | uses the variables specified in \code{L2.x}. Default is \code{NULL}. Note: 201 | For the empty MrP model, set \code{L2.x = NULL} and \code{mrp.L2.x = ""}.} 202 | 203 | \item{gb.L2.unit}{GB L2.unit. A logical argument indicating whether 204 | \code{L2.unit} should be included in the GB classifier. Default is 205 | \code{FALSE}.} 206 | 207 | \item{gb.L2.reg}{GB L2.reg. A logical argument indicating whether 208 | \code{L2.reg} should be included in the GB classifier. Default is 209 | \code{FALSE}.} 210 | 211 | \item{lasso.lambda}{Lasso penalty parameter. A numeric \code{vector} of 212 | non-negative values. The penalty parameter controls the shrinkage of the 213 | context-level variables in the lasso model. Default is a sequence with 214 | minimum 0.1 and maximum 250 that is equally spaced on the log-scale. The 215 | number of values is controlled by the \code{lasso.n.iter} parameter.} 216 | 217 | \item{lasso.n.iter}{Lasso number of lambda values. An integer-valued scalar 218 | specifying the number of lambda values to search over. Default is 219 | \eqn{100}. \emph{Note:} Ignored if a vector of \code{lasso.lambda} 220 | values is provided.} 221 | 222 | \item{deep.splines}{Deep MRP splines. A logical argument indicating whether 223 | splines should be used in the deep MRP classifier. Default is \code{TRUE}.} 224 | 225 | \item{uncertainty}{Uncertainty estimates. A logical argument indicating 226 | whether uncertainty estimates should be computed. Default is \code{FALSE}.} 227 | 228 | \item{boot.iter}{Number of bootstrap iterations. An integer argument 229 | indicating the number of bootstrap iterations to be computed. Will be 230 | ignored unless \code{uncertainty = TRUE}. Default is \code{200} if 231 | \code{uncertainty = TRUE} and \code{NULL} if \code{uncertainty = FALSE}.} 232 | } 233 | \value{ 234 | No return value, called for detection of errors in the \code{autoMrP()} call. 235 | } 236 | \description{ 237 | \code{error_checks()} checks for incorrect data entry in the \code{autoMrP()} 238 | call. 239 | } 240 | -------------------------------------------------------------------------------- /man/f1_score.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{f1_score} 4 | \alias{f1_score} 5 | \title{Estimates the inverse f1 score, i.e. 0 is the best score and 1 the worst.} 6 | \usage{ 7 | f1_score(pred, data.valid, y, L2.unit) 8 | } 9 | \arguments{ 10 | \item{pred}{Predictions of outcome. A numeric vector of outcome predictions.} 11 | 12 | \item{data.valid}{Test data set. A tibble of data that was not used for 13 | prediction.} 14 | 15 | \item{y}{Outcome variable. A character vector containing the column names of 16 | the outcome variable.} 17 | 18 | \item{L2.unit}{Geographic unit. A character scalar containing the column name 19 | of the geographic unit in \code{survey} and \code{census} at which outcomes 20 | should be aggregated.} 21 | } 22 | \value{ 23 | Returns a tibble containing two f1 prediction errors.
The first is 24 | measured at the level of individuals and the second is measured at the 25 | context level. The tibble dimensions are 2x3 with variables: measure, value 26 | and level. 27 | } 28 | \description{ 29 | \code{f1_score()} estimates the inverse f1 scores on the individual and state 30 | levels. 31 | } 32 | -------------------------------------------------------------------------------- /man/gb_classifier.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gb_classifier.R 3 | \name{gb_classifier} 4 | \alias{gb_classifier} 5 | \title{GB classifier} 6 | \usage{ 7 | gb_classifier( 8 | y, 9 | form, 10 | distribution, 11 | data.train, 12 | n.trees, 13 | interaction.depth, 14 | n.minobsinnode, 15 | shrinkage, 16 | verbose = c(TRUE, FALSE) 17 | ) 18 | } 19 | \arguments{ 20 | \item{y}{Outcome variable. A character vector containing the column names of 21 | the outcome variable. A character scalar containing the column name of 22 | the outcome variable in \code{survey}.} 23 | 24 | \item{form}{Model formula. A two-sided linear formula describing 25 | the model to be fit, with the outcome on the LHS and the covariates 26 | separated by + operators on the RHS.} 27 | 28 | \item{distribution}{Model distribution. A character string specifying the 29 | name of the distribution to be used.} 30 | 31 | \item{data.train}{Training data. A data.frame containing the training data 32 | used to train the model.} 33 | 34 | \item{n.trees}{Total number of trees. An integer-valued scalar specifying 35 | the total number of trees to be fit.} 36 | 37 | \item{interaction.depth}{Interaction depth. An integer-valued scalar 38 | specifying the maximum depth of each tree.} 39 | 40 | \item{n.minobsinnode}{Minimum number of observations in terminal nodes. An 41 | integer-valued scalar specifying the minimum number of observations in the 42 | terminal nodes of the trees.} 43 | 44 | \item{shrinkage}{Learning rate. A numeric scalar specifying the shrinkage or 45 | learning rate applied to each tree in the expansion.} 46 | 47 | \item{verbose}{Verbose output. A logical vector indicating whether or not 48 | verbose output should be printed.} 49 | } 50 | \value{ 51 | A gradient tree boosting model. A \code{\link[gbm]{gbm}} object. 52 | } 53 | \description{ 54 | \code{gb_classifier} applies gradient boosting classification to a data set. 55 | } 56 | -------------------------------------------------------------------------------- /man/gb_classifier_update.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gb_classifier.R 3 | \name{gb_classifier_update} 4 | \alias{gb_classifier_update} 5 | \title{GB classifier update} 6 | \usage{ 7 | gb_classifier_update(object, n.new.trees, verbose = c(TRUE, FALSE)) 8 | } 9 | \arguments{ 10 | \item{object}{Gradient tree boosting output. A gbm object.} 11 | 12 | \item{n.new.trees}{Number of additional trees to grow. A numeric scalar.} 13 | 14 | \item{verbose}{Verbose output. A logical vector indicating whether or not 15 | verbose output should be printed.} 16 | } 17 | \value{ 18 | An updated gradient tree boosting model. 19 | A \code{\link[gbm]{gbm.more}} object. 20 | } 21 | \description{ 22 | \code{gb_classifier_update()} grows additional trees in a gradient tree 23 | boosting ensemble.
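Since the return value is a \code{\link[gbm]{gbm.more}} object, the underlying pattern can be sketched with the gbm package directly. The data set \code{dat} below is a hypothetical example, not package data:

    # Sketch: grow a small gbm ensemble, then extend it with more trees,
    # which is the pattern gb_classifier_update() wraps via gbm.more().
    library(gbm)
    set.seed(42)
    dat <- data.frame(y = rbinom(200, 1, 0.5), x1 = rnorm(200), x2 = rnorm(200))
    fit <- gbm(y ~ x1 + x2, distribution = "bernoulli", data = dat,
               n.trees = 50, interaction.depth = 2, n.minobsinnode = 10,
               shrinkage = 0.1, verbose = FALSE)
    fit <- gbm.more(fit, n.new.trees = 50)  # grow 50 additional trees
    fit$n.trees                             # now 100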
24 | } 25 | -------------------------------------------------------------------------------- /man/lasso_classifier.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lasso_classifier.R 3 | \name{lasso_classifier} 4 | \alias{lasso_classifier} 5 | \title{Lasso classifier} 6 | \usage{ 7 | lasso_classifier( 8 | L2.fix, 9 | L1.re, 10 | data.train, 11 | lambda, 12 | model.family, 13 | y, 14 | verbose = c(TRUE, FALSE) 15 | ) 16 | } 17 | \arguments{ 18 | \item{L2.fix}{Fixed effects. A two-sided linear formula describing 19 | the fixed effects part of the model, with the outcome on the LHS and 20 | the fixed effects separated by + operators on the RHS.} 21 | 22 | \item{L1.re}{Random effects. A named list object in which the names of the 23 | list elements are the random effects and each list element is \code{~ 1}.} 24 | 25 | \item{data.train}{Training data. A data.frame containing the training data 26 | used to train the model.} 27 | 28 | \item{lambda}{Tuning parameter. Lambda is the penalty parameter that controls 29 | the shrinkage of fixed effects.} 30 | 31 | \item{model.family}{Model family. A variable indicating the model family 32 | to be used by glmmLasso. Defaults to binomial(link = "probit").} 33 | 34 | \item{y}{Outcome variable. A character vector containing the column names of 35 | the outcome variable. A character scalar containing the column name of 36 | the outcome variable in \code{survey}.} 37 | 38 | \item{verbose}{Verbose output. A logical vector indicating whether or not 39 | verbose output should be printed.} 40 | } 41 | \value{ 42 | A multilevel lasso model. A \code{\link[glmmLasso]{glmmLasso}} 43 | object. 44 | } 45 | \description{ 46 | \code{lasso_classifier} applies lasso classification to a data set. 47 | } 48 | -------------------------------------------------------------------------------- /man/log_spaced.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{log_spaced} 4 | \alias{log_spaced} 5 | \title{Sequence that is equally spaced on the log scale} 6 | \usage{ 7 | log_spaced(min, max, n) 8 | } 9 | \arguments{ 10 | \item{min}{The minimum value of the sequence. A positive numeric scalar (min 11 | > 0).} 12 | 13 | \item{max}{The maximum value of the sequence. A positive numeric scalar (max 14 | > 0).} 15 | 16 | \item{n}{The length of the sequence. An integer-valued scalar.} 17 | } 18 | \value{ 19 | Returns a numeric vector with length specified in argument \code{n}. 20 | The vector elements are equally spaced on the log-scale. 21 | } 22 | \description{ 23 | Sequence that is equally spaced on the log scale 24 | } 25 | -------------------------------------------------------------------------------- /man/loss_function.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{loss_function} 4 | \alias{loss_function} 5 | \title{Estimates loss value.} 6 | \usage{ 7 | loss_function( 8 | pred, 9 | data.valid, 10 | loss.unit = c("individuals", "L2 units"), 11 | loss.fun = c("MSE", "MAE", "cross-entropy"), 12 | y, 13 | L2.unit 14 | ) 15 | } 16 | \arguments{ 17 | \item{pred}{Predictions of outcome. A numeric vector of outcome predictions.} 18 | 19 | \item{data.valid}{Test data set.
A tibble of data that was not used for 20 | prediction.} 21 | 22 | \item{loss.unit}{Loss function unit. A character-valued scalar indicating 23 | whether performance loss should be evaluated at the level of individual 24 | respondents (\code{individuals}) or geographic units (\code{L2 units}). 25 | Default is \code{individuals}.} 26 | 27 | \item{loss.fun}{Loss function. A character-valued scalar indicating whether 28 | prediction loss should be measured by the mean squared error (\code{MSE}) 29 | or the mean absolute error (\code{MAE}). Default is \code{MSE}.} 30 | 31 | \item{y}{Outcome variable. A character vector containing the column names of 32 | the outcome variable.} 33 | 34 | \item{L2.unit}{Geographic unit. A character scalar containing the column name 35 | of the geographic unit in \code{survey} and \code{census} at which outcomes 36 | should be aggregated.} 37 | } 38 | \value{ 39 | Returns a tibble with the number of rows equal to the number of loss 40 | functions tested (defaults to 4: cross-entropy, f1, MSE, and msfe). The 41 | tibble has two columns: \code{measure}, containing the names of the loss 42 | functions, and \code{value}, containing the 43 | loss-function scores. 44 | } 45 | \description{ 46 | \code{loss_function()} estimates the loss based on a loss function. 47 | } 48 | -------------------------------------------------------------------------------- /man/loss_score_ranking.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{loss_score_ranking} 4 | \alias{loss_score_ranking} 5 | \title{Ranks tuning parameters according to loss functions} 6 | \usage{ 7 | loss_score_ranking(score, loss.fun) 8 | } 9 | \arguments{ 10 | \item{score}{A data set containing loss function names, the loss function 11 | values, and the tuning parameter values.} 12 | 13 | \item{loss.fun}{Loss function. A character-valued scalar indicating whether 14 | prediction loss should be measured by the mean squared error (\code{MSE}) 15 | or the mean absolute error (\code{MAE}). Default is \code{MSE}.} 16 | } 17 | \value{ 18 | Returns a tibble containing the parameter grid as well as a rank 19 | column that corresponds to the cross-validation rank of a parameter 20 | combination across all loss function scores. 21 | } 22 | \description{ 23 | \code{loss_score_ranking()} ranks tuning parameters according to the scores 24 | received in multiple loss functions. 25 | } 26 | -------------------------------------------------------------------------------- /man/mean_absolute_error.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{mean_absolute_error} 4 | \alias{mean_absolute_error} 5 | \title{Estimates the mean absolute prediction error.} 6 | \usage{ 7 | mean_absolute_error(pred, data.valid, y, L2.unit) 8 | } 9 | \arguments{ 10 | \item{pred}{Predictions of outcome. A numeric vector of outcome predictions.} 11 | 12 | \item{data.valid}{Test data set. A tibble of data that was not used for 13 | prediction.} 14 | 15 | \item{y}{Outcome variable. A character vector containing the column names of 16 | the outcome variable.} 17 | 18 | \item{L2.unit}{Geographic unit.
A character scalar containing the column name 19 | of the geographic unit in \code{survey} and \code{census} at which outcomes 20 | should be aggregated.} 21 | } 22 | \value{ 23 | Returns a tibble containing two mean absolute prediction errors. The 24 | first is measured at the level of individuals and the second is measured at 25 | the context level. The tibble dimensions are 2x3 with variables: measure, 26 | value and level. 27 | } 28 | \description{ 29 | \code{mean_absolute_error()} estimates the mean absolute error for the 30 | desired loss unit. 31 | } 32 | -------------------------------------------------------------------------------- /man/mean_squared_error.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{mean_squared_error} 4 | \alias{mean_squared_error} 5 | \title{Estimates the mean squared prediction error.} 6 | \usage{ 7 | mean_squared_error(pred, data.valid, y, L2.unit) 8 | } 9 | \arguments{ 10 | \item{pred}{Predictions of outcome. A numeric vector of outcome predictions.} 11 | 12 | \item{data.valid}{Test data set. A tibble of data that was not used for 13 | prediction.} 14 | 15 | \item{y}{Outcome variable. A character vector containing the column names of 16 | the outcome variable.} 17 | 18 | \item{L2.unit}{Geographic unit. A character scalar containing the column name 19 | of the geographic unit in \code{survey} and \code{census} at which outcomes 20 | should be aggregated.} 21 | } 22 | \value{ 23 | Returns a tibble containing two mean squared prediction errors. The 24 | first is measured at the level of individuals and the second is measured at 25 | the context level. The tibble dimensions are 2x3 with variables: measure, 26 | value and level. 27 | } 28 | \description{ 29 | \code{mean_squared_error()} estimates the mean squared error for the desired 30 | loss unit. 31 | } 32 | -------------------------------------------------------------------------------- /man/mean_squared_false_error.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{mean_squared_false_error} 4 | \alias{mean_squared_false_error} 5 | \title{Estimates the mean squared false error.} 6 | \usage{ 7 | mean_squared_false_error(pred, data.valid, y, L2.unit) 8 | } 9 | \arguments{ 10 | \item{pred}{Predictions of outcome. A numeric vector of outcome predictions.} 11 | 12 | \item{data.valid}{Test data set. A tibble of data that was not used for 13 | prediction.} 14 | 15 | \item{y}{Outcome variable. A character vector containing the column names of 16 | the outcome variable.} 17 | 18 | \item{L2.unit}{Geographic unit. A character scalar containing the column name 19 | of the geographic unit in \code{survey} and \code{census} at which outcomes 20 | should be aggregated.} 21 | } 22 | \value{ 23 | Returns a tibble containing two mean squared false prediction errors. 24 | The first is measured at the level of individuals and the second is 25 | measured at the context level. The tibble dimensions are 2x3 with 26 | variables: measure, value and level. 27 | } 28 | \description{ 29 | \code{mean_squared_false_error()} estimates the mean squared false error on 30 | the individual and state levels.
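The two evaluation levels shared by these error functions can be illustrated with squared errors in base R. Here \code{y}, \code{pred}, and \code{state} are hypothetical example vectors, and the package's internal implementation may differ in detail:

    # Illustration of the two loss levels: error over individual responses
    # versus error over state-level (context-level) averages.
    y     <- c(1, 0, 1, 1, 0, 0)
    pred  <- c(0.8, 0.3, 0.6, 0.7, 0.4, 0.2)
    state <- c("A", "A", "A", "B", "B", "B")
    err_individual <- mean((y - pred)^2)
    err_context <- mean((tapply(y, state, mean) - tapply(pred, state, mean))^2)
    c(individual = err_individual, context = err_context)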
31 | } 32 | -------------------------------------------------------------------------------- /man/model_list.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{model_list} 4 | \alias{model_list} 5 | \title{A list of models for the best subset selection.} 6 | \usage{ 7 | model_list(y, L1.x, L2.x, L2.unit, L2.reg = NULL) 8 | } 9 | \arguments{ 10 | \item{y}{Outcome variable. A character vector containing the column names of 11 | the outcome variable.} 12 | 13 | \item{L1.x}{Individual-level covariates. A character vector containing the 14 | column names of the individual-level variables in \code{survey} and 15 | \code{census} used to predict outcome \code{y}. Note that geographic unit 16 | is specified in argument \code{L2.unit}.} 17 | 18 | \item{L2.x}{Context-level covariates. A character vector containing the 19 | column names of the context-level variables in \code{survey} and 20 | \code{census} used to predict outcome \code{y}.} 21 | 22 | \item{L2.unit}{Geographic unit. A character scalar containing the column name 23 | of the geographic unit in \code{survey} and \code{census} at which outcomes 24 | should be aggregated.} 25 | 26 | \item{L2.reg}{Geographic region. A character scalar containing the column 27 | name of the geographic region in \code{survey} and \code{census} by which 28 | geographic units are grouped (\code{L2.unit} must be nested within 29 | \code{L2.reg}). Default is \code{NULL}.} 30 | } 31 | \value{ 32 | Returns a list with the number of elements equal to 2^k where k is 33 | the number of context-level variables. Each element is of class formula. 34 | } 35 | \description{ 36 | \code{model_list()} generates an exhaustive list of lme4 model formulas from 37 | the individual-level and context-level variables as well as geographic unit 38 | variables to be iterated over in best subset selection. 39 | } 40 | -------------------------------------------------------------------------------- /man/model_list_pca.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{model_list_pca} 4 | \alias{model_list_pca} 5 | \title{A list of models for the best subset selection with PCA.} 6 | \usage{ 7 | model_list_pca(y, L1.x, L2.x, L2.unit, L2.reg = NULL) 8 | } 9 | \arguments{ 10 | \item{y}{Outcome variable. A character vector containing the column names of 11 | the outcome variable.} 12 | 13 | \item{L1.x}{Individual-level covariates. A character vector containing the 14 | column names of the individual-level variables in \code{survey} and 15 | \code{census} used to predict outcome \code{y}. Note that geographic unit 16 | is specified in argument \code{L2.unit}.} 17 | 18 | \item{L2.x}{Context-level covariates. A character vector containing the 19 | column names of the context-level variables in \code{survey} and 20 | \code{census} used to predict outcome \code{y}.} 21 | 22 | \item{L2.unit}{Geographic unit. A character scalar containing the column name 23 | of the geographic unit in \code{survey} and \code{census} at which outcomes 24 | should be aggregated.} 25 | 26 | \item{L2.reg}{Geographic region. A character scalar containing the column 27 | name of the geographic region in \code{survey} and \code{census} by which 28 | geographic units are grouped (\code{L2.unit} must be nested within 29 | \code{L2.reg}).
Default is \code{NULL}.} 30 | } 31 | \value{ 32 | Returns a list with k+1 elements, where k is the number 33 | of context-level variables. Each element is of class formula. The first 34 | element is a model without context-level variables and the following models 35 | iteratively add the principal components as context-level variables. 36 | } 37 | \description{ 38 | \code{model_list_pca()} generates an exhaustive list of lme4 model formulas 39 | from the individual-level and context-level principal components as well as 40 | geographic unit variables to be iterated over in best subset selection with 41 | principal components. 42 | } 43 | -------------------------------------------------------------------------------- /man/multicore.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{multicore} 4 | \alias{multicore} 5 | \title{Register cores for multicore computing} 6 | \usage{ 7 | multicore(cores = 1, type, cl = NULL) 8 | } 9 | \arguments{ 10 | \item{cores}{Number of cores to be used. An integer. Default is \code{1}.} 11 | 12 | \item{type}{Whether to start or end parallel processing. A character string. 13 | The possible values are \code{open}, \code{close}.} 14 | 15 | \item{cl}{The registered cluster. Default is \code{NULL}.} 16 | } 17 | \value{ 18 | No return value, called to register or un-register clusters for 19 | parallel processing. 20 | } 21 | \description{ 22 | \code{multicore()} registers cores for parallel processing. 23 | } 24 | -------------------------------------------------------------------------------- /man/output_table.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{output_table} 4 | \alias{output_table} 5 | \title{A table for the summary function} 6 | \usage{ 7 | output_table(object, col.names, format, digits) 8 | } 9 | \arguments{ 10 | \item{object}{An \code{autoMrP()} object for which a summary is desired.} 11 | 12 | \item{col.names}{The column names of the table. A character vector.} 13 | 14 | \item{format}{The table format. A character string passed to 15 | \code{\link[knitr]{kable}}. Default is \code{simple}.} 16 | 17 | \item{digits}{The number of digits to be displayed. An integer scalar. 18 | Default is \code{4}.} 19 | } 20 | \value{ 21 | No return value, prints a table to the console. 22 | } 23 | \description{ 24 | \code{output_table()} generates a table for the summary function. 25 | } 26 | -------------------------------------------------------------------------------- /man/plot.autoMrP.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{plot.autoMrP} 4 | \alias{plot.autoMrP} 5 | \title{A plot method for autoMrP objects. Plots unit-level preference estimates.} 6 | \usage{ 7 | \method{plot}{autoMrP}(x, algorithm = "ebma", ci.lvl = 0.95, ...) 8 | } 9 | \arguments{ 10 | \item{x}{An \code{autoMrP()} object.} 11 | 12 | \item{algorithm}{The algorithm/classifier for which preference estimates are 13 | desired. A character-valued scalar indicating either \code{ebma} or the 14 | classifier to be used. Allowed choices are: "ebma", "best_subset", "lasso", 15 | "pca", "gb", "svm", and "mrp". Default is \code{ebma}.} 16 | 17 | \item{ci.lvl}{The level of the confidence intervals. A proportion.
Default is 18 | \code{0.95}. Confidence intervals are based on bootstrapped estimates and 19 | will not be printed if bootstrapping was not carried out.} 20 | 21 | \item{...}{Additional arguments affecting the summary produced.} 22 | } 23 | \value{ 24 | Returns a \code{ggplot2} object of the preference estimates for the 25 | selected classifier. 26 | } 27 | \description{ 28 | \code{plot.autoMrP()} plots unit-level preference estimates and error bars. 29 | } 30 | -------------------------------------------------------------------------------- /man/post_stratification.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/post_stratification.R 3 | \name{post_stratification} 4 | \alias{post_stratification} 5 | \title{Apply post-stratification to classifiers.} 6 | \usage{ 7 | post_stratification( 8 | y, 9 | L1.x, 10 | L2.x, 11 | L2.unit, 12 | L2.reg, 13 | best.subset.opt, 14 | lasso.opt, 15 | lasso.L2.x, 16 | pca.opt, 17 | gb.opt, 18 | svm.opt, 19 | svm.L2.reg, 20 | svm.L2.unit, 21 | svm.L2.x, 22 | mrp.include, 23 | n.minobsinnode, 24 | L2.unit.include, 25 | L2.reg.include, 26 | kernel, 27 | mrp.L2.x, 28 | data, 29 | ebma.fold, 30 | census, 31 | verbose, 32 | deep.mrp, 33 | deep.splines 34 | ) 35 | } 36 | \arguments{ 37 | \item{y}{Outcome variable. A character vector containing the column names of 38 | the outcome variable. A character scalar containing the column name of 39 | the outcome variable in \code{survey}.} 40 | 41 | \item{L1.x}{Individual-level covariates. A character vector containing the 42 | column names of the individual-level variables in \code{survey} and 43 | \code{census} used to predict outcome \code{y}. Note that geographic unit 44 | is specified in argument \code{L2.unit}.} 45 | 46 | \item{L2.x}{Context-level covariates. A character vector containing the 47 | column names of the context-level variables in \code{survey} and 48 | \code{census} used to predict outcome \code{y}. To exclude context-level 49 | variables, set \code{L2.x = NULL}.} 50 | 51 | \item{L2.unit}{Geographic unit. A character scalar containing the column 52 | name of the geographic unit in \code{survey} and \code{census} at which 53 | outcomes should be aggregated.} 54 | 55 | \item{L2.reg}{Geographic region. A character scalar containing the column 56 | name of the geographic region in \code{survey} and \code{census} by which 57 | geographic units are grouped (\code{L2.unit} must be nested within 58 | \code{L2.reg}). Default is \code{NULL}.} 59 | 60 | \item{best.subset.opt}{Optimal tuning parameters from the best subset selection 61 | classifier. A list returned by \code{run_best_subset()}.} 62 | 63 | \item{lasso.opt}{Optimal tuning parameters from the lasso classifier. A list 64 | returned by \code{run_lasso()}.} 65 | 66 | \item{lasso.L2.x}{Lasso context-level covariates. A character vector 67 | containing the column names of the context-level variables in 68 | \code{survey} and \code{census} to be used by the lasso classifier. If 69 | \code{NULL} and \code{lasso} is set to \code{TRUE}, then lasso uses the 70 | variables specified in \code{L2.x}.
Default is \code{NULL}.} 71 | 72 | \item{pca.opt}{Optimal tuning parameters from the best subset selection with 73 | principal components classifier. A list returned by \code{run_pca()}.} 74 | 75 | \item{gb.opt}{Optimal tuning parameters from the gradient tree boosting 76 | classifier. A list returned by \code{run_gb()}.} 77 | 78 | \item{svm.opt}{Optimal tuning parameters from the support vector machine 79 | classifier. A list returned by \code{run_svm()}.} 80 | 81 | \item{svm.L2.reg}{SVM L2.reg. A logical argument indicating whether 82 | \code{L2.reg} should be included in the SVM classifier. Default is 83 | \code{FALSE}.} 84 | 85 | \item{svm.L2.unit}{SVM L2.unit. A logical argument indicating whether 86 | \code{L2.unit} should be included in the SVM classifier. Default is 87 | \code{FALSE}.} 88 | 89 | \item{svm.L2.x}{SVM context-level covariates. A character vector containing 90 | the column names of the context-level variables in \code{survey} and 91 | \code{census} to be used by the SVM classifier. If \code{NULL} and 92 | \code{svm} is set to \code{TRUE}, then SVM uses the variables specified in 93 | \code{L2.x}. Default is \code{NULL}.} 94 | 95 | \item{mrp.include}{Whether to run the MRP classifier. A logical argument 96 | indicating whether the standard MRP classifier should be used for 97 | predicting outcome \code{y}. Passed from \code{autoMrP()} argument 98 | \code{mrp}.} 99 | 100 | \item{n.minobsinnode}{GB minimum number of observations in the terminal 101 | nodes. An integer-valued scalar specifying the minimum number of 102 | observations that each terminal node of the trees must contain. Passed from 103 | \code{autoMrP()} argument \code{gb.n.minobsinnode}.} 104 | 105 | \item{L2.unit.include}{GB L2.unit. A logical argument indicating whether 106 | \code{L2.unit} should be included in the GB classifier. Passed from 107 | \code{autoMrP()} argument \code{gb.L2.unit}.} 108 | 109 | \item{L2.reg.include}{A logical argument indicating whether \code{L2.reg} 110 | should be included in the GB classifier. Passed from \code{autoMrP()} 111 | argument \code{gb.L2.reg}.} 112 | 113 | \item{kernel}{SVM kernel. A character-valued scalar specifying the kernel to 114 | be used by SVM. The possible values are \code{linear}, \code{polynomial}, 115 | \code{radial}, and \code{sigmoid}. Passed from \code{autoMrP()} argument 116 | \code{svm.kernel}.} 117 | 118 | \item{mrp.L2.x}{MRP context-level covariates. A character vector containing 119 | the column names of the context-level variables in \code{survey} and 120 | \code{census} to be used by the MRP classifier. The character vector should 121 | be \emph{empty} if no context-level variables should be used by the MRP 122 | classifier. If \code{NULL} and \code{mrp} is set to \code{TRUE}, then MRP 123 | uses the variables specified in \code{L2.x}. Default is \code{NULL}. Note: 124 | For the empty MrP model, set \code{L2.x = NULL} and \code{mrp.L2.x = ""}.} 125 | 126 | \item{data}{A data.frame containing the survey data used in classifier 127 | training.} 128 | 129 | \item{ebma.fold}{A data.frame containing the data not used in classifier 130 | training.} 131 | 132 | \item{census}{Census data. A \code{data.frame} whose column names include 133 | \code{L1.x}, \code{L2.x}, \code{L2.unit}, if specified, \code{L2.reg} and 134 | \code{pcs}, and either \code{bin.proportion} or \code{bin.size}.} 135 | 136 | \item{verbose}{Verbose output. A logical argument indicating whether or not 137 | verbose output should be printed.
Default is \code{FALSE}.} 138 | 139 | \item{deep.mrp}{Deep MRP classifier. A logical argument indicating whether 140 | the deep MRP classifier should be used for best subset prediction. Setting 141 | \code{deep.mrp = TRUE} will include all interactions of L1.x in the best 142 | subset classifier. Default is \code{FALSE}.} 143 | 144 | \item{deep.splines}{Deep MRP splines. A logical argument indicating whether 145 | splines should be used in the deep MRP classifier. Default is \code{TRUE}.} 146 | } 147 | \description{ 148 | Apply post-stratification to classifiers. 149 | } 150 | -------------------------------------------------------------------------------- /man/predict_glmmLasso.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{predict_glmmLasso} 4 | \alias{predict_glmmLasso} 5 | \title{Predicts on newdata from glmmLasso objects} 6 | \usage{ 7 | predict_glmmLasso( 8 | census, 9 | m, 10 | L1.x, 11 | lasso.L2.x, 12 | L2.unit, 13 | L2.reg, 14 | type = "response" 15 | ) 16 | } 17 | \arguments{ 18 | \item{census}{Census data. A \code{data.frame} whose column names include 19 | \code{L1.x}, \code{L2.x}, \code{L2.unit}, if specified, \code{L2.reg} and 20 | \code{pcs}, and either \code{bin.proportion} or \code{bin.size}.} 21 | 22 | \item{m}{A \code{glmmLasso()} object.} 23 | 24 | \item{L1.x}{Individual-level covariates. A character vector containing the 25 | column names of the individual-level variables in \code{survey} and 26 | \code{census} used to predict outcome \code{y}. Note that geographic unit 27 | is specified in argument \code{L2.unit}.} 28 | 29 | \item{lasso.L2.x}{Lasso context-level covariates. A character vector 30 | containing the column names of the context-level variables in 31 | \code{survey} and \code{census} to be used by the lasso classifier. If 32 | \code{NULL} and \code{lasso} is set to \code{TRUE}, then lasso uses the 33 | variables specified in \code{L2.x}. Default is \code{NULL}.} 34 | 35 | \item{L2.unit}{Geographic unit. A character scalar containing the column 36 | name of the geographic unit in \code{survey} and \code{census} at which 37 | outcomes should be aggregated.} 38 | 39 | \item{L2.reg}{Geographic region. A character scalar containing the column 40 | name of the geographic region in \code{survey} and \code{census} by which 41 | geographic units are grouped (\code{L2.unit} must be nested within 42 | \code{L2.reg}). Default is \code{NULL}.} 43 | } 44 | \value{ 45 | Returns a numeric vector of predictions from a \code{glmmLasso()} 46 | object. 47 | } 48 | \description{ 49 | \code{predict_glmmLasso()} predicts on new data from a \code{glmmLasso} object. 50 | } 51 | -------------------------------------------------------------------------------- /man/quiet.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{quiet} 4 | \alias{quiet} 5 | \title{Suppress cat in external package} 6 | \usage{ 7 | quiet(x) 8 | } 9 | \arguments{ 10 | \item{x}{Input. It can be of any kind.} 11 | } 12 | \description{ 13 | \code{quiet()} suppresses cat output.
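One common way to implement such a helper is to divert console output to a temporary file; the sketch below makes that assumption (the package's actual implementation may differ) and uses the hypothetical name \code{quiet_sketch}:

    # Sketch of a cat-suppressing helper in base R.
    quiet_sketch <- function(x) {
      sink(tempfile())     # redirect console output to a throwaway file
      on.exit(sink())      # restore normal output, even on error
      invisible(force(x))  # evaluate the expression while the sink is active
    }
    quiet_sketch(cat("this message is suppressed\n"))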
14 | } 15 | -------------------------------------------------------------------------------- /man/run_best_subset.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/run_best_subset.R 3 | \name{run_best_subset} 4 | \alias{run_best_subset} 5 | \title{Apply best subset classifier to MrP.} 6 | \usage{ 7 | run_best_subset( 8 | y, 9 | L1.x, 10 | L2.x, 11 | L2.unit, 12 | L2.reg, 13 | loss.unit, 14 | loss.fun, 15 | data, 16 | verbose, 17 | cores 18 | ) 19 | } 20 | \arguments{ 21 | \item{y}{Outcome variable. A character vector containing the column names of 22 | the outcome variable. A character scalar containing the column name of 23 | the outcome variable in \code{survey}.} 24 | 25 | \item{L1.x}{Individual-level covariates. A character vector containing the 26 | column names of the individual-level variables in \code{survey} and 27 | \code{census} used to predict outcome \code{y}. Note that geographic unit 28 | is specified in argument \code{L2.unit}.} 29 | 30 | \item{L2.x}{Context-level covariates. A character vector containing the 31 | column names of the context-level variables in \code{survey} and 32 | \code{census} used to predict outcome \code{y}. To exclude context-level 33 | variables, set \code{L2.x = NULL}.} 34 | 35 | \item{L2.unit}{Geographic unit. A character scalar containing the column 36 | name of the geographic unit in \code{survey} and \code{census} at which 37 | outcomes should be aggregated.} 38 | 39 | \item{L2.reg}{Geographic region. A character scalar containing the column 40 | name of the geographic region in \code{survey} and \code{census} by which 41 | geographic units are grouped (\code{L2.unit} must be nested within 42 | \code{L2.reg}). Default is \code{NULL}.} 43 | 44 | \item{loss.unit}{Loss function unit. A character-valued scalar indicating 45 | whether performance loss should be evaluated at the level of individual 46 | respondents (\code{individuals}), geographic units (\code{L2 units}) or at 47 | both levels. Default is \code{c("individuals", "L2 units")}. With multiple 48 | loss units, parameters are ranked for each loss unit and the loss unit with 49 | the lowest rank sum is chosen. Ties are broken according to the order in 50 | the search grid.} 51 | 52 | \item{loss.fun}{Loss function. A character-valued scalar indicating whether 53 | prediction loss should be measured by the mean squared error (\code{MSE}), 54 | the mean absolute error (\code{MAE}), binary cross-entropy 55 | (\code{cross-entropy}), mean squared false error (\code{msfe}), the f1 56 | score (\code{f1}), or a combination thereof. Default is \code{c("MSE", 57 | "cross-entropy","msfe", "f1")}. With multiple loss functions, parameters 58 | are ranked for each loss function and the parameter combination with the 59 | lowest rank sum is chosen. Ties are broken according to the order in the 60 | search grid.} 61 | 62 | \item{data}{Data for cross-validation. A \code{list} of \eqn{k} 63 | \code{data.frames}, one for each fold to be used in \eqn{k}-fold 64 | cross-validation.} 65 | 66 | \item{verbose}{Verbose output. A logical argument indicating whether or not 67 | verbose output should be printed. Default is \code{FALSE}.} 68 | 69 | \item{cores}{The number of cores to be used. An integer indicating the number 70 | of processor cores used for parallel computing. Default is 1.} 71 | } 72 | \value{ 73 | A model formula of the winning best subset classifier model. 
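For intuition, the candidate set from which this winning formula is chosen contains one formula per subset of the context-level variables (see \code{model_list()}); a sketch with two hypothetical variables, giving 2^2 = 4 candidates:

    # Illustration of a best-subset candidate list; the variable names
    # (age, state, urban, income) are hypothetical.
    l2_sets <- list(character(0), "urban", "income", c("urban", "income"))
    models <- lapply(l2_sets, function(l2) {
      rhs <- paste(c("(1 | age)", "(1 | state)", l2), collapse = " + ")
      as.formula(paste("y ~", rhs))
    })
    models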
74 | } 75 | \description{ 76 | \code{run_best_subset} is a wrapper function that applies the best subset 77 | classifier to a list of models provided by the user, evaluates the models' 78 | prediction performance, and chooses the best-performing model. 79 | } 80 | -------------------------------------------------------------------------------- /man/run_best_subset_mc.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/run_best_subset.R 3 | \name{run_best_subset_mc} 4 | \alias{run_best_subset_mc} 5 | \title{Best subset multicore tuning.} 6 | \usage{ 7 | run_best_subset_mc( 8 | y, 9 | L1.x, 10 | L2.x, 11 | L2.unit, 12 | L2.reg, 13 | loss.unit, 14 | loss.fun, 15 | data, 16 | cores, 17 | models, 18 | verbose 19 | ) 20 | } 21 | \arguments{ 22 | \item{y}{Outcome variable. A character scalar containing the column name of 23 | the outcome variable in \code{survey}.} 24 | 25 | \item{L1.x}{Individual-level covariates. A character vector containing the 26 | column names of the individual-level variables in \code{survey} and 27 | \code{census} used to predict outcome \code{y}. Note that geographic unit 28 | is specified in argument \code{L2.unit}.} 29 | 30 | \item{L2.x}{Context-level covariates. A character vector containing the 31 | column names of the context-level variables in \code{survey} and 32 | \code{census} used to predict outcome \code{y}.} 33 | 34 | \item{L2.unit}{Geographic unit. A character scalar containing the column 35 | name of the geographic unit in \code{survey} and \code{census} at which 36 | outcomes should be aggregated.} 37 | 38 | \item{L2.reg}{Geographic region. A character scalar containing the column 39 | name of the geographic region in \code{survey} and \code{census} by which 40 | geographic units are grouped (\code{L2.unit} must be nested within 41 | \code{L2.reg}). Default is \code{NULL}.} 42 | 43 | \item{loss.unit}{Loss function unit. A character-valued scalar indicating 44 | whether performance loss should be evaluated at the level of individual 45 | respondents (\code{individuals}) or geographic units (\code{L2 units}). 46 | Default is \code{individuals}.} 47 | 48 | \item{loss.fun}{Loss function. A character-valued scalar indicating whether 49 | prediction loss should be measured by the mean squared error (\code{MSE}) 50 | or the mean absolute error (\code{MAE}). Default is \code{MSE}.} 51 | 52 | \item{data}{Data for cross-validation. A \code{list} of \eqn{k} 53 | \code{data.frames}, one for each fold to be used in \eqn{k}-fold 54 | cross-validation.} 55 | 56 | \item{cores}{The number of cores to be used. An integer indicating the number 57 | of processor cores used for parallel computing. Default is 1.} 58 | 59 | \item{models}{The models to perform best subset selection on. A list of model 60 | formulas.} 61 | 62 | \item{verbose}{Verbose output. A logical argument indicating whether or not 63 | verbose output should be printed. Default is \code{TRUE}.} 64 | } 65 | \value{ 66 | The cross-validation errors for all models. A list. 67 | } 68 | \description{ 69 | \code{run_best_subset_mc} is called from within \code{run_best_subset}. It 70 | tunes using multiple cores. 
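A minimal sketch of the parallel pattern these multicore helpers rely on, with a placeholder loop body standing in for the per-model cross-validation work (the cluster size is illustrative):

    # Register a backend, iterate with foreach, then release the cluster.
    library(foreach)
    library(doParallel)
    cl <- parallel::makeCluster(2)      # illustrative core count
    doParallel::registerDoParallel(cl)
    cv_errors <- foreach(i = 1:4, .combine = c) %dopar% {
      i^2  # placeholder for the CV error of model i
    }
    parallel::stopCluster(cl)
    cv_errors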
71 | } 72 | \examples{ 73 | \dontrun{ 74 | # not yet 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /man/run_deep_bs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/run_deep_bs.r 3 | \name{run_deep_bs} 4 | \alias{run_deep_bs} 5 | \title{Apply deep MrP with the best subset classifier to MrP.} 6 | \usage{ 7 | run_deep_bs( 8 | y, 9 | L1.x, 10 | L2.x, 11 | L2.unit, 12 | L2.reg, 13 | loss.unit, 14 | loss.fun, 15 | deep.splines, 16 | data, 17 | k.folds, 18 | verbose, 19 | cores 20 | ) 21 | } 22 | \arguments{ 23 | \item{y}{Outcome variable. A character vector containing the column names of 24 | the outcome variable. A character scalar containing the column name of 25 | the outcome variable in \code{survey}.} 26 | 27 | \item{L1.x}{Individual-level covariates. A character vector containing the 28 | column names of the individual-level variables in \code{survey} and 29 | \code{census} used to predict outcome \code{y}. Note that geographic unit 30 | is specified in argument \code{L2.unit}.} 31 | 32 | \item{L2.x}{Context-level covariates. A character vector containing the 33 | column names of the context-level variables in \code{survey} and 34 | \code{census} used to predict outcome \code{y}. To exclude context-level 35 | variables, set \code{L2.x = NULL}.} 36 | 37 | \item{L2.unit}{Geographic unit. A character scalar containing the column 38 | name of the geographic unit in \code{survey} and \code{census} at which 39 | outcomes should be aggregated.} 40 | 41 | \item{L2.reg}{Geographic region. A character scalar containing the column 42 | name of the geographic region in \code{survey} and \code{census} by which 43 | geographic units are grouped (\code{L2.unit} must be nested within 44 | \code{L2.reg}). Default is \code{NULL}.} 45 | 46 | \item{loss.unit}{Loss function unit. A character-valued scalar indicating 47 | whether performance loss should be evaluated at the level of individual 48 | respondents (\code{individuals}), geographic units (\code{L2 units}) or at 49 | both levels. Default is \code{c("individuals", "L2 units")}. With multiple 50 | loss units, parameters are ranked for each loss unit and the loss unit with 51 | the lowest rank sum is chosen. Ties are broken according to the order in 52 | the search grid.} 53 | 54 | \item{loss.fun}{Loss function. A character-valued scalar indicating whether 55 | prediction loss should be measured by the mean squared error (\code{MSE}), 56 | the mean absolute error (\code{MAE}), binary cross-entropy 57 | (\code{cross-entropy}), mean squared false error (\code{msfe}), the f1 58 | score (\code{f1}), or a combination thereof. Default is \code{c("MSE", 59 | "cross-entropy","msfe", "f1")}. With multiple loss functions, parameters 60 | are ranked for each loss function and the parameter combination with the 61 | lowest rank sum is chosen. Ties are broken according to the order in the 62 | search grid.} 63 | 64 | \item{deep.splines}{Deep MRP splines. A logical argument indicating whether 65 | splines should be used in the deep MRP classifier. Default is \code{TRUE}.} 66 | 67 | \item{data}{Data for cross-validation. A \code{list} of \eqn{k} 68 | \code{data.frames}, one for each fold to be used in \eqn{k}-fold 69 | cross-validation.} 70 | 71 | \item{k.folds}{Number of cross-validation folds. An integer-valued scalar 72 | indicating the number of folds to be used in cross-validation. Default is
Default is
73 | \eqn{5}. \emph{Note:} ignored if \code{folds} is provided, but must be
74 | specified otherwise.}
75 |
76 | \item{verbose}{Verbose output. A logical argument indicating whether or not
77 | verbose output should be printed. Default is \code{FALSE}.}
78 |
79 | \item{cores}{The number of cores to be used. An integer indicating the number
80 | of processor cores used for parallel computing. Default is 1.}
81 | }
82 | \value{
83 | A model formula of the winning best subset classifier model.
84 | }
85 | \description{
86 | \code{run_deep_bs} is a wrapper function that applies the best subset
87 | classifier to a list of models provided by the user, evaluates the models'
88 | prediction performance, and chooses the best-performing model. It differs
89 | from \code{run_best_subset} in that it includes L1.x interactions.
90 | }
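91 | \examples{
92 | \dontrun{
93 | # Illustrative call (assumption: `cv_data` is a list of k-fold
94 | # data.frames); column names follow the example data in autoMrP
95 | deep_form <- run_deep_bs(
96 |   y = "YES", L1.x = c("L1x1", "L1x2", "L1x3"),
97 |   L2.x = c("L2.x1", "L2.x2"), L2.unit = "L2.unit", L2.reg = "region",
98 |   loss.unit = "individuals", loss.fun = "MSE", deep.splines = TRUE,
99 |   data = cv_data, k.folds = 5, verbose = FALSE, cores = 1
100 | )
101 | }
102 | }
103 |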
Default is \code{c("MSE", 58 | "cross-entropy","msfe", "f1")}. With multiple loss functions, parameters 59 | are ranked for each loss function and the parameter combination with the 60 | lowest rank sum is chosen. Ties are broken according to the order in the 61 | search grid.} 62 | 63 | \item{deep.splines}{Deep MRP splines. A logical argument indicating whether 64 | splines should be used in the deep MRP classifier. Default is \code{TRUE}.} 65 | 66 | \item{data}{Data for cross-validation. A \code{list} of \eqn{k} 67 | \code{data.frames}, one for each fold to be used in \eqn{k}-fold 68 | cross-validation.} 69 | 70 | \item{cores}{The number of cores to be used. An integer indicating the number 71 | of processor cores used for parallel computing. Default is 1.} 72 | 73 | \item{verbose}{Verbose output. A logical argument indicating whether or not 74 | verbose output should be printed. Default is \code{FALSE}.} 75 | } 76 | \value{ 77 | A model formula of the winning best subset classifier model. 78 | } 79 | \description{ 80 | \code{run_deep_pca} is a wrapper function that applies the PCA classifier to 81 | data provided by the user, evaluates prediction performance, and chooses the 82 | best-performing model. It differs from \code{run_best_subset} in that it 83 | includes L1.x interactions. 84 | } 85 | -------------------------------------------------------------------------------- /man/run_gb.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/run_gb.R 3 | \name{run_gb} 4 | \alias{run_gb} 5 | \title{Apply gradient boosting classifier to MrP.} 6 | \usage{ 7 | run_gb( 8 | y, 9 | L1.x, 10 | L2.x, 11 | L2.eval.unit, 12 | L2.unit, 13 | L2.reg, 14 | loss.unit, 15 | loss.fun, 16 | interaction.depth, 17 | shrinkage, 18 | n.trees.init, 19 | n.trees.increase, 20 | n.trees.max, 21 | cores = cores, 22 | n.minobsinnode, 23 | data, 24 | verbose 25 | ) 26 | } 27 | \arguments{ 28 | \item{y}{Outcome variable. A character vector containing the column names of 29 | the outcome variable. A character scalar containing the column name of 30 | the outcome variable in \code{survey}.} 31 | 32 | \item{L1.x}{Individual-level covariates. A character vector containing the 33 | column names of the individual-level variables in \code{survey} and 34 | \code{census} used to predict outcome \code{y}. Note that geographic unit 35 | is specified in argument \code{L2.unit}.} 36 | 37 | \item{L2.x}{Context-level covariates. A character vector containing the 38 | column names of the context-level variables in \code{survey} and 39 | \code{census} used to predict outcome \code{y}. To exclude context-level 40 | variables, set \code{L2.x = NULL}.} 41 | 42 | \item{L2.eval.unit}{Geographic unit for the loss function. A character scalar 43 | containing the column name of the geographic unit in \code{survey} and 44 | \code{census}.} 45 | 46 | \item{L2.unit}{Geographic unit. A character scalar containing the column 47 | name of the geographic unit in \code{survey} and \code{census} at which 48 | outcomes should be aggregated.} 49 | 50 | \item{L2.reg}{Geographic region. A character scalar containing the column 51 | name of the geographic region in \code{survey} and \code{census} by which 52 | geographic units are grouped (\code{L2.unit} must be nested within 53 | \code{L2.reg}). Default is \code{NULL}.} 54 | 55 | \item{loss.unit}{Loss function unit. 
A character-valued scalar indicating 56 | whether performance loss should be evaluated at the level of individual 57 | respondents (\code{individuals}) or geographic units (\code{L2 units}). 58 | Default is \code{individuals}.} 59 | 60 | \item{loss.fun}{Loss function. A character-valued scalar indicating whether 61 | prediction loss should be measured by the mean squared error (\code{MSE}) 62 | or the mean absolute error (\code{MAE}). Default is \code{MSE}.} 63 | 64 | \item{interaction.depth}{GB interaction depth. An integer-valued vector 65 | whose values specify the interaction depth of GB. The interaction depth 66 | defines the maximum depth of each tree grown (i.e., the maximum level of 67 | variable interactions). Default is \code{c(1, 2, 3)}.} 68 | 69 | \item{shrinkage}{GB learning rate. A numeric vector whose values specify the 70 | learning rate or step-size reduction of GB. Values between \eqn{0.001} 71 | and \eqn{0.1} usually work, but a smaller learning rate typically requires 72 | more trees. Default is \code{c(0.04, 0.01, 0.008, 0.005, 0.001)}.} 73 | 74 | \item{n.trees.init}{GB initial total number of trees. An integer-valued 75 | scalar specifying the initial number of total trees to fit by GB. Default 76 | is \eqn{50}.} 77 | 78 | \item{n.trees.increase}{GB increase in total number of trees. An 79 | integer-valued scalar specifying by how many trees the total number of 80 | trees to fit should be increased (until \code{n.trees.max} is reached) 81 | or an integer-valued vector of length \code{length(shrinkage)} with each 82 | of its values being associated with a learning rate in \code{shrinkage}. 83 | Default is \eqn{50}.} 84 | 85 | \item{n.trees.max}{GB maximum number of trees. An integer-valued scalar 86 | specifying the maximum number of trees to fit by GB or an integer-valued 87 | vector of length \code{length(shrinkage)} with each of its values being 88 | associated with a learning rate and an increase in the total number of 89 | trees. Default is \eqn{1000}.} 90 | 91 | \item{cores}{The number of cores to be used. An integer indicating the number 92 | of processor cores used for parallel computing. Default is 1.} 93 | 94 | \item{n.minobsinnode}{GB minimum number of observations in the terminal 95 | nodes. An integer-valued scalar specifying the minimum number of 96 | observations that each terminal node of the trees must contain. Default is 97 | \eqn{5}.} 98 | 99 | \item{data}{Data for cross-validation. A \code{list} of \eqn{k} 100 | \code{data.frames}, one for each fold to be used in \eqn{k}-fold 101 | cross-validation.} 102 | 103 | \item{verbose}{Verbose output. A logical argument indicating whether or not 104 | verbose output should be printed. Default is \code{TRUE}.} 105 | } 106 | \value{ 107 | The tuned gradient boosting parameters. A list with three elements: 108 | \code{interaction_depth} contains the interaction depth parameter, 109 | \code{shrinkage} contains the learning rate, \code{n_trees} the number of 110 | trees to be grown. 111 | } 112 | \description{ 113 | \code{run_gb} is a wrapper function that applies the gradient boosting 114 | classifier to data provided by the user, evaluates prediction performance, 115 | and chooses the best-performing model. 
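116 | }
117 | \examples{
118 | \dontrun{
119 | # Illustrative call (assumption: `cv_data` is a list of k-fold
120 | # data.frames); tuning grids mirror the documented defaults
121 | gb_params <- run_gb(
122 |   y = "YES", L1.x = c("L1x1", "L1x2", "L1x3"),
123 |   L2.x = c("L2.x1", "L2.x2"), L2.eval.unit = "L2.unit",
124 |   L2.unit = "L2.unit", L2.reg = "region",
125 |   loss.unit = "individuals", loss.fun = "MSE",
126 |   interaction.depth = c(1, 2, 3),
127 |   shrinkage = c(0.04, 0.01, 0.008, 0.005, 0.001),
128 |   n.trees.init = 50, n.trees.increase = 50, n.trees.max = 1000,
129 |   cores = 1, n.minobsinnode = 5, data = cv_data, verbose = TRUE
130 | )
131 | }
132 | }
133 |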
-------------------------------------------------------------------------------- /man/run_gb_mc.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/run_gb.R
3 | \name{run_gb_mc}
4 | \alias{run_gb_mc}
5 | \title{GB multicore tuning.}
6 | \usage{
7 | run_gb_mc(
8 |   y,
9 |   L1.x,
10 |   L2.eval.unit,
11 |   L2.unit,
12 |   L2.reg,
13 |   form,
14 |   gb.grid,
15 |   n.minobsinnode,
16 |   loss.unit,
17 |   loss.fun,
18 |   data,
19 |   cores
20 | )
21 | }
22 | \arguments{
23 | \item{y}{Outcome variable. A character scalar
24 | containing the column name of the outcome
25 | variable in \code{survey}.}
26 |
27 | \item{L1.x}{Individual-level covariates. A character vector containing the
28 | column names of the individual-level variables in \code{survey} and
29 | \code{census} used to predict outcome \code{y}. Note that geographic unit
30 | is specified in argument \code{L2.unit}.}
31 |
32 | \item{L2.eval.unit}{Geographic unit for the loss function. A character scalar
33 | containing the column name of the geographic unit in \code{survey} and
34 | \code{census}.}
35 |
36 | \item{L2.unit}{Geographic unit. A character scalar containing the column
37 | name of the geographic unit in \code{survey} and \code{census} at which
38 | outcomes should be aggregated.}
39 |
40 | \item{L2.reg}{Geographic region. A character scalar containing the column
41 | name of the geographic region in \code{survey} and \code{census} by which
42 | geographic units are grouped (\code{L2.unit} must be nested within
43 | \code{L2.reg}). Default is \code{NULL}.}
44 |
45 | \item{form}{The model formula. A formula object.}
46 |
47 | \item{gb.grid}{The hyper-parameter search grid. A matrix of all
48 | hyper-parameter combinations.}
49 |
50 | \item{n.minobsinnode}{GB minimum number of observations in the terminal
51 | nodes. An integer-valued scalar specifying the minimum number of
52 | observations that each terminal node of the trees must contain. Default is
53 | \eqn{5}.}
54 |
55 | \item{loss.unit}{Loss function unit. A character-valued scalar indicating
56 | whether performance loss should be evaluated at the level of individual
57 | respondents (\code{individuals}) or geographic units (\code{L2 units}).
58 | Default is \code{individuals}.}
59 |
60 | \item{loss.fun}{Loss function. A character-valued scalar indicating whether
61 | prediction loss should be measured by the mean squared error (\code{MSE})
62 | or the mean absolute error (\code{MAE}). Default is \code{MSE}.}
63 |
64 | \item{data}{Data for cross-validation. A \code{list} of \eqn{k}
65 | \code{data.frames}, one for each fold to be used in \eqn{k}-fold
66 | cross-validation.}
67 |
68 | \item{cores}{The number of cores to be used. An integer indicating the number
69 | of processor cores used for parallel computing. Default is 1.}
70 | }
71 | \value{
72 | The tuning parameter combinations and their associated loss function
73 | scores. A list.
74 | }
75 | \description{
76 | \code{run_gb_mc} is called from within \code{run_gb}. It tunes using
77 | multiple cores.
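78 | }
79 | \examples{
80 | \dontrun{
81 | # Illustrative call (assumption: `form`, `gb_grid`, and `cv_data` are
82 | # prepared by the caller, as done inside run_gb)
83 | cv_errors <- run_gb_mc(
84 |   y = "YES", L1.x = c("L1x1", "L1x2", "L1x3"),
85 |   L2.eval.unit = "L2.unit", L2.unit = "L2.unit", L2.reg = "region",
86 |   form = form, gb.grid = gb_grid, n.minobsinnode = 5,
87 |   loss.unit = "individuals", loss.fun = "MSE", data = cv_data, cores = 2
88 | )
89 | }
90 | }
91 |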
-------------------------------------------------------------------------------- /man/run_lasso.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/run_lasso.R
3 | \name{run_lasso}
4 | \alias{run_lasso}
5 | \title{Apply lasso classifier to MrP.}
6 | \usage{
7 | run_lasso(
8 |   y,
9 |   L1.x,
10 |   L2.x,
11 |   L2.unit,
12 |   L2.reg,
13 |   n.iter,
14 |   loss.unit,
15 |   loss.fun,
16 |   lambda,
17 |   data,
18 |   verbose,
19 |   cores
20 | )
21 | }
22 | \arguments{
23 | \item{y}{Outcome variable. A character scalar
24 | containing the column name of the outcome
25 | variable in \code{survey}.}
26 |
27 | \item{L1.x}{Individual-level covariates. A character vector containing the
28 | column names of the individual-level variables in \code{survey} and
29 | \code{census} used to predict outcome \code{y}. Note that geographic unit
30 | is specified in argument \code{L2.unit}.}
31 |
32 | \item{L2.x}{Context-level covariates. A character vector containing the
33 | column names of the context-level variables in \code{survey} and
34 | \code{census} used to predict outcome \code{y}. To exclude context-level
35 | variables, set \code{L2.x = NULL}.}
36 |
37 | \item{L2.unit}{Geographic unit. A character scalar containing the column
38 | name of the geographic unit in \code{survey} and \code{census} at which
39 | outcomes should be aggregated.}
40 |
41 | \item{L2.reg}{Geographic region. A character scalar containing the column
42 | name of the geographic region in \code{survey} and \code{census} by which
43 | geographic units are grouped (\code{L2.unit} must be nested within
44 | \code{L2.reg}). Default is \code{NULL}.}
45 |
46 | \item{n.iter}{Lasso number of lambda values. An integer-valued scalar
47 | specifying the number of lambda values to search over. Default is
48 | \eqn{100}.
49 | \emph{Note:} ignored if a vector of \code{lambda} values is
50 | provided.}
51 |
52 | \item{loss.unit}{Loss function unit. A character-valued scalar indicating
53 | whether performance loss should be evaluated at the level of individual
54 | respondents (\code{individuals}), geographic units (\code{L2 units}) or at
55 | both levels. Default is \code{c("individuals", "L2 units")}. With multiple
56 | loss units, parameters are ranked for each loss unit and the loss unit with
57 | the lowest rank sum is chosen. Ties are broken according to the order in
58 | the search grid.}
59 |
60 | \item{loss.fun}{Loss function. A character-valued scalar indicating whether
61 | prediction loss should be measured by the mean squared error (\code{MSE}),
62 | the mean absolute error (\code{MAE}), binary cross-entropy
63 | (\code{cross-entropy}), mean squared false error (\code{msfe}), the f1
64 | score (\code{f1}), or a combination thereof. Default is \code{c("MSE",
65 | "cross-entropy","msfe", "f1")}. With multiple loss functions, parameters
66 | are ranked for each loss function and the parameter combination with the
67 | lowest rank sum is chosen. Ties are broken according to the order in the
68 | search grid.}
69 |
70 | \item{lambda}{Lasso penalty parameter. A numeric \code{vector} of
71 | non-negative values. The penalty parameter controls the shrinkage of the
72 | context-level variables in the lasso model. Default is a sequence with
73 | minimum 0.1 and maximum 250 that is equally spaced on the log-scale. The
74 | number of values is controlled by the \code{n.iter} parameter.}
75 |
76 | \item{data}{Data for cross-validation. A \code{list} of \eqn{k}
77 | \code{data.frames}, one for each fold to be used in \eqn{k}-fold
78 | cross-validation.}
79 |
80 | \item{verbose}{Verbose output. A logical argument indicating whether or not
81 | verbose output should be printed. Default is \code{FALSE}.}
82 |
83 | \item{cores}{The number of cores to be used. An integer indicating the number
84 | of processor cores used for parallel computing. Default is 1.}
85 | }
86 | \value{
87 | The tuned lambda value. A numeric scalar.
88 | }
89 | \description{
90 | \code{run_lasso} is a wrapper function that applies the lasso classifier to
91 | data provided by the user, evaluates prediction performance, and chooses the
92 | best-performing model.
93 | }
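94 | \examples{
95 | \dontrun{
96 | # Illustrative call (assumption: `cv_data` is a list of k-fold
97 | # data.frames); lambda mirrors the documented default search sequence
98 | lambda_star <- run_lasso(
99 |   y = "YES", L1.x = c("L1x1", "L1x2", "L1x3"),
100 |   L2.x = c("L2.x1", "L2.x2"), L2.unit = "L2.unit", L2.reg = "region",
101 |   n.iter = 100, loss.unit = "individuals", loss.fun = "MSE",
102 |   lambda = exp(seq(log(0.1), log(250), length.out = 100)),
103 |   data = cv_data, verbose = FALSE, cores = 1
104 | )
105 | }
106 | }
107 |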
-------------------------------------------------------------------------------- /man/run_lasso_mc_lambda.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/run_lasso.R
3 | \name{run_lasso_mc_lambda}
4 | \alias{run_lasso_mc_lambda}
5 | \title{Lasso multicore tuning.}
6 | \usage{
7 | run_lasso_mc_lambda(
8 |   y,
9 |   L1.x,
10 |   L2.x,
11 |   L2.unit,
12 |   L2.reg,
13 |   loss.unit,
14 |   loss.fun,
15 |   data,
16 |   cores,
17 |   L2.fe.form,
18 |   L1.re,
19 |   lambda
20 | )
21 | }
22 | \arguments{
23 | \item{y}{Outcome variable. A character scalar
24 | containing the column name of the outcome
25 | variable in \code{survey}.}
26 |
27 | \item{L1.x}{Individual-level covariates. A character vector containing the
28 | column names of the individual-level variables in \code{survey} and
29 | \code{census} used to predict outcome \code{y}. Note that geographic unit
30 | is specified in argument \code{L2.unit}.}
31 |
32 | \item{L2.x}{Context-level covariates. A character vector containing the
33 | column names of the context-level variables in \code{survey} and
34 | \code{census} used to predict outcome \code{y}. To exclude context-level
35 | variables, set \code{L2.x = NULL}.}
36 |
37 | \item{L2.unit}{Geographic unit. A character scalar containing the column
38 | name of the geographic unit in \code{survey} and \code{census} at which
39 | outcomes should be aggregated.}
40 |
41 | \item{L2.reg}{Geographic region. A character scalar containing the column
42 | name of the geographic region in \code{survey} and \code{census} by which
43 | geographic units are grouped (\code{L2.unit} must be nested within
44 | \code{L2.reg}). Default is \code{NULL}.}
45 |
46 | \item{loss.unit}{Loss function unit. A character-valued scalar indicating
47 | whether performance loss should be evaluated at the level of individual
48 | respondents (\code{individuals}), geographic units (\code{L2 units}) or at
49 | both levels. Default is \code{c("individuals", "L2 units")}. With multiple
50 | loss units, parameters are ranked for each loss unit and the loss unit with
51 | the lowest rank sum is chosen. Ties are broken according to the order in
52 | the search grid.}
53 |
54 | \item{loss.fun}{Loss function. A character-valued scalar indicating whether
55 | prediction loss should be measured by the mean squared error (\code{MSE}),
56 | the mean absolute error (\code{MAE}), binary cross-entropy
57 | (\code{cross-entropy}), mean squared false error (\code{msfe}), the f1
58 | score (\code{f1}), or a combination thereof.
Default is \code{c("MSE", 59 | "cross-entropy","msfe", "f1")}. With multiple loss functions, parameters 60 | are ranked for each loss function and the parameter combination with the 61 | lowest rank sum is chosen. Ties are broken according to the order in the 62 | search grid.} 63 | 64 | \item{data}{Data for cross-validation. A \code{list} of \eqn{k} 65 | \code{data.frames}, one for each fold to be used in \eqn{k}-fold 66 | cross-validation.} 67 | 68 | \item{cores}{The number of cores to be used. An integer indicating the number 69 | of processor cores used for parallel computing. Default is 1.} 70 | 71 | \item{L2.fe.form}{The fixed effects part of the Lasso classifier formula. The 72 | formula is inherited from \code{run_lasso}.} 73 | 74 | \item{L1.re}{A list of random effects for the Lasso classifier formula. The 75 | formula is inherited from \code{run_lasso}.} 76 | 77 | \item{lambda}{Lasso penalty parameter. A numeric \code{vector} of 78 | non-negative values. The penalty parameter controls the shrinkage of the 79 | context-level variables in the lasso model. Default is a sequence with 80 | minimum 0.1 and maximum 250 that is equally spaced on the log-scale. The 81 | number of values is controlled by the \code{lasso.n.iter} parameter.} 82 | } 83 | \value{ 84 | The cross-validation errors for all models. A list. 85 | } 86 | \description{ 87 | \code{run_lasso_mc_lambda} is called from within \code{run_lasso}. It 88 | tunes using multiple cores. 89 | } 90 | -------------------------------------------------------------------------------- /man/run_pca.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/run_pca.R 3 | \name{run_pca} 4 | \alias{run_pca} 5 | \title{Apply PCA classifier to MrP.} 6 | \usage{ 7 | run_pca( 8 | y, 9 | L1.x, 10 | L2.x, 11 | L2.unit, 12 | L2.reg, 13 | loss.unit, 14 | loss.fun, 15 | data, 16 | cores, 17 | verbose 18 | ) 19 | } 20 | \arguments{ 21 | \item{y}{Outcome variable. A character vector containing the column names of 22 | the outcome variable. A character scalar containing the column name of 23 | the outcome variable in \code{survey}.} 24 | 25 | \item{L1.x}{Individual-level covariates. A character vector containing the 26 | column names of the individual-level variables in \code{survey} and 27 | \code{census} used to predict outcome \code{y}. Note that geographic unit 28 | is specified in argument \code{L2.unit}.} 29 | 30 | \item{L2.x}{Context-level covariates. A character vector containing the 31 | column names of the context-level variables in \code{survey} and 32 | \code{census} used to predict outcome \code{y}. To exclude context-level 33 | variables, set \code{L2.x = NULL}.} 34 | 35 | \item{L2.unit}{Geographic unit. A character scalar containing the column 36 | name of the geographic unit in \code{survey} and \code{census} at which 37 | outcomes should be aggregated.} 38 | 39 | \item{L2.reg}{Geographic region. A character scalar containing the column 40 | name of the geographic region in \code{survey} and \code{census} by which 41 | geographic units are grouped (\code{L2.unit} must be nested within 42 | \code{L2.reg}). Default is \code{NULL}.} 43 | 44 | \item{loss.unit}{Loss function unit. A character-valued scalar indicating 45 | whether performance loss should be evaluated at the level of individual 46 | respondents (\code{individuals}), geographic units (\code{L2 units}) or at 47 | both levels. 
Default is \code{c("individuals", "L2 units")}. With multiple 48 | loss units, parameters are ranked for each loss unit and the loss unit with 49 | the lowest rank sum is chosen. Ties are broken according to the order in 50 | the search grid.} 51 | 52 | \item{loss.fun}{Loss function. A character-valued scalar indicating whether 53 | prediction loss should be measured by the mean squared error (\code{MSE}), 54 | the mean absolute error (\code{MAE}), binary cross-entropy 55 | (\code{cross-entropy}), mean squared false error (\code{msfe}), the f1 56 | score (\code{f1}), or a combination thereof. Default is \code{c("MSE", 57 | "cross-entropy","msfe", "f1")}. With multiple loss functions, parameters 58 | are ranked for each loss function and the parameter combination with the 59 | lowest rank sum is chosen. Ties are broken according to the order in the 60 | search grid.} 61 | 62 | \item{data}{Data for cross-validation. A \code{list} of \eqn{k} 63 | \code{data.frames}, one for each fold to be used in \eqn{k}-fold 64 | cross-validation.} 65 | 66 | \item{cores}{The number of cores to be used. An integer indicating the number 67 | of processor cores used for parallel computing. Default is 1.} 68 | 69 | \item{verbose}{Verbose output. A logical argument indicating whether or not 70 | verbose output should be printed. Default is \code{FALSE}.} 71 | } 72 | \value{ 73 | A model formula of the winning best subset classifier model. 74 | } 75 | \description{ 76 | \code{run_pca} is a wrapper function that applies the PCA classifier to data 77 | provided by the user, evaluates prediction performance, and chooses the 78 | best-performing model. 79 | } 80 | -------------------------------------------------------------------------------- /man/run_svm.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/run_svm.R 3 | \name{run_svm} 4 | \alias{run_svm} 5 | \title{Apply support vector machine classifier to MrP.} 6 | \usage{ 7 | run_svm( 8 | y, 9 | L1.x, 10 | L2.x, 11 | L2.eval.unit, 12 | L2.unit, 13 | L2.reg, 14 | kernel = "radial", 15 | loss.fun, 16 | loss.unit, 17 | gamma, 18 | cost, 19 | data, 20 | verbose, 21 | cores 22 | ) 23 | } 24 | \arguments{ 25 | \item{y}{Outcome variable. A character vector containing the column names of 26 | the outcome variable. A character scalar containing the column name of 27 | the outcome variable in \code{survey}.} 28 | 29 | \item{L1.x}{Individual-level covariates. A character vector containing the 30 | column names of the individual-level variables in \code{survey} and 31 | \code{census} used to predict outcome \code{y}. Note that geographic unit 32 | is specified in argument \code{L2.unit}.} 33 | 34 | \item{L2.x}{Context-level covariates. A character vector containing the 35 | column names of the context-level variables in \code{survey} and 36 | \code{census} used to predict outcome \code{y}. To exclude context-level 37 | variables, set \code{L2.x = NULL}.} 38 | 39 | \item{L2.eval.unit}{Geographic unit for the loss function. A character scalar 40 | containing the column name of the geographic unit in \code{survey} and 41 | \code{census}.} 42 | 43 | \item{L2.unit}{Geographic unit. A character scalar containing the column 44 | name of the geographic unit in \code{survey} and \code{census} at which 45 | outcomes should be aggregated.} 46 | 47 | \item{L2.reg}{Geographic region. 
-------------------------------------------------------------------------------- /man/run_svm.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/run_svm.R
3 | \name{run_svm}
4 | \alias{run_svm}
5 | \title{Apply support vector machine classifier to MrP.}
6 | \usage{
7 | run_svm(
8 |   y,
9 |   L1.x,
10 |   L2.x,
11 |   L2.eval.unit,
12 |   L2.unit,
13 |   L2.reg,
14 |   kernel = "radial",
15 |   loss.fun,
16 |   loss.unit,
17 |   gamma,
18 |   cost,
19 |   data,
20 |   verbose,
21 |   cores
22 | )
23 | }
24 | \arguments{
25 | \item{y}{Outcome variable. A character scalar
26 | containing the column name of the outcome
27 | variable in \code{survey}.}
28 |
29 | \item{L1.x}{Individual-level covariates. A character vector containing the
30 | column names of the individual-level variables in \code{survey} and
31 | \code{census} used to predict outcome \code{y}. Note that geographic unit
32 | is specified in argument \code{L2.unit}.}
33 |
34 | \item{L2.x}{Context-level covariates. A character vector containing the
35 | column names of the context-level variables in \code{survey} and
36 | \code{census} used to predict outcome \code{y}. To exclude context-level
37 | variables, set \code{L2.x = NULL}.}
38 |
39 | \item{L2.eval.unit}{Geographic unit for the loss function. A character scalar
40 | containing the column name of the geographic unit in \code{survey} and
41 | \code{census}.}
42 |
43 | \item{L2.unit}{Geographic unit. A character scalar containing the column
44 | name of the geographic unit in \code{survey} and \code{census} at which
45 | outcomes should be aggregated.}
46 |
47 | \item{L2.reg}{Geographic region. A character scalar containing the column
48 | name of the geographic region in \code{survey} and \code{census} by which
49 | geographic units are grouped (\code{L2.unit} must be nested within
50 | \code{L2.reg}). Default is \code{NULL}.}
51 |
52 | \item{kernel}{SVM kernel. A character-valued scalar specifying the kernel to
53 | be used by SVM. The possible values are \code{linear}, \code{polynomial},
54 | \code{radial}, and \code{sigmoid}. Default is \code{radial}.}
55 |
56 | \item{loss.fun}{Loss function. A character-valued scalar indicating whether
57 | prediction loss should be measured by the mean squared error (\code{MSE})
58 | or the mean absolute error (\code{MAE}). Default is \code{MSE}.}
59 |
60 | \item{loss.unit}{Loss function unit. A character-valued scalar indicating
61 | whether performance loss should be evaluated at the level of individual
62 | respondents (\code{individuals}), geographic units (\code{L2 units}) or at
63 | both levels. Default is \code{c("individuals", "L2 units")}. With multiple
64 | loss units, parameters are ranked for each loss unit and the loss unit with
65 | the lowest rank sum is chosen. Ties are broken according to the order in
66 | the search grid.}
67 |
68 | \item{gamma}{SVM kernel parameter. A numeric vector whose values specify the
69 | gamma parameter in the SVM kernel. This parameter is needed for all kernel
70 | types except linear. Default is a sequence with minimum = 1e-5, maximum =
71 | 1e-1, and length = 20 that is equally spaced on the log-scale.}
72 |
73 | \item{cost}{SVM cost parameter. A numeric vector whose values specify the
74 | cost of constraints violation in SVM. Default is a sequence with minimum =
75 | 0.5, maximum = 10, and length = 5 that is equally spaced on the log-scale.}
76 |
77 | \item{data}{Data for cross-validation. A \code{list} of \eqn{k}
78 | \code{data.frames}, one for each fold to be used in \eqn{k}-fold
79 | cross-validation.}
80 |
81 | \item{verbose}{Verbose output. A logical argument indicating whether or not
82 | verbose output should be printed. Default is \code{FALSE}.}
83 |
84 | \item{cores}{The number of cores to be used. An integer indicating the number
85 | of processor cores used for parallel computing. Default is 1.}
86 | }
87 | \value{
88 | The tuned support vector machine parameters. A list.
89 | }
90 | \description{
91 | \code{run_svm} is a wrapper function that applies the support vector machine
92 | classifier to data provided by the user, evaluates prediction performance,
93 | and chooses the best-performing model.
94 | }
95 |
-------------------------------------------------------------------------------- /man/run_svm_mc.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/run_svm.R
3 | \name{run_svm_mc}
4 | \alias{run_svm_mc}
5 | \title{SVM multicore tuning.}
6 | \usage{
7 | run_svm_mc(
8 |   y,
9 |   L1.x,
10 |   L2.x,
11 |   L2.eval.unit,
12 |   L2.unit,
13 |   L2.reg,
14 |   form,
15 |   loss.unit,
16 |   loss.fun,
17 |   data,
18 |   cores,
19 |   svm.grid,
20 |   verbose
21 | )
22 | }
23 | \arguments{
24 | \item{y}{Outcome variable. A character scalar
25 | containing the column name of the outcome
26 | variable in \code{survey}.}
27 |
28 | \item{L1.x}{Individual-level covariates.
A character vector containing the 29 | column names of the individual-level variables in \code{survey} and 30 | \code{census} used to predict outcome \code{y}. Note that geographic unit 31 | is specified in argument \code{L2.unit}.} 32 | 33 | \item{L2.x}{Context-level covariates. A character vector containing the 34 | column names of the context-level variables in \code{survey} and 35 | \code{census} used to predict outcome \code{y}. To exclude context-level 36 | variables, set \code{L2.x = NULL}.} 37 | 38 | \item{L2.eval.unit}{Geographic unit for the loss function. A character scalar 39 | containing the column name of the geographic unit in \code{survey} and 40 | \code{census}.} 41 | 42 | \item{L2.unit}{Geographic unit. A character scalar containing the column 43 | name of the geographic unit in \code{survey} and \code{census} at which 44 | outcomes should be aggregated.} 45 | 46 | \item{L2.reg}{Geographic region. A character scalar containing the column 47 | name of the geographic region in \code{survey} and \code{census} by which 48 | geographic units are grouped (\code{L2.unit} must be nested within 49 | \code{L2.reg}). Default is \code{NULL}.} 50 | 51 | \item{form}{The model formula. A formula object.} 52 | 53 | \item{loss.unit}{Loss function unit. A character-valued scalar indicating 54 | whether performance loss should be evaluated at the level of individual 55 | respondents (\code{individuals}), geographic units (\code{L2 units}) or at 56 | both levels. Default is \code{c("individuals", "L2 units")}. With multiple 57 | loss units, parameters are ranked for each loss unit and the loss unit with 58 | the lowest rank sum is chosen. Ties are broken according to the order in 59 | the search grid.} 60 | 61 | \item{loss.fun}{Loss function. A character-valued scalar indicating whether 62 | prediction loss should be measured by the mean squared error (\code{MSE}) 63 | or the mean absolute error (\code{MAE}). Default is \code{MSE}.} 64 | 65 | \item{data}{Data for cross-validation. A \code{list} of \eqn{k} 66 | \code{data.frames}, one for each fold to be used in \eqn{k}-fold 67 | cross-validation.} 68 | 69 | \item{cores}{The number of cores to be used. An integer indicating the number 70 | of processor cores used for parallel computing. Default is 1.} 71 | 72 | \item{svm.grid}{The hyper-parameter search grid. A matrix of all 73 | hyper-parameter combinations.} 74 | 75 | \item{verbose}{Verbose output. A logical argument indicating whether or not 76 | verbose output should be printed. Default is \code{FALSE}.} 77 | } 78 | \value{ 79 | The cross-validation errors for all models. A list. 80 | } 81 | \description{ 82 | \code{run_svm_mc} is called from within \code{run_svm}. It tunes using 83 | multiple cores. 84 | } 85 | -------------------------------------------------------------------------------- /man/summary.autoMrP.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{summary.autoMrP} 4 | \alias{summary.autoMrP} 5 | \title{A summary method for autoMrP objects.} 6 | \usage{ 7 | \method{summary}{autoMrP}( 8 | object, 9 | ci.lvl = 0.95, 10 | digits = 4, 11 | format = "simple", 12 | classifiers = NULL, 13 | n = 10, 14 | ... 15 | ) 16 | } 17 | \arguments{ 18 | \item{object}{An \code{autoMrP()} object for which a summary is desired.} 19 | 20 | \item{ci.lvl}{The level of the confidence intervals. A proportion. Default is 21 | \code{0.95}. 
Confidence intervals are based on bootstrapped estimates and
22 | will not be printed if bootstrapping was not carried out.}
23 |
24 | \item{digits}{The number of digits to be displayed. An integer scalar.
25 | Default is \code{4}.}
26 |
27 | \item{format}{The table format. A character string passed to
28 | \code{\link[knitr]{kable}}. Default is \code{simple}.}
29 |
30 | \item{classifiers}{Summarize a single classifier. A character string. Must be
31 | one of \code{best_subset}, \code{lasso}, \code{pca}, \code{gb}, \code{svm},
32 | or \code{mrp}. Default is \code{NULL}.}
33 |
34 | \item{n}{Number of rows to be printed. An integer scalar. Default is
35 | \code{10}.}
36 |
37 | \item{...}{Additional arguments affecting the summary produced.}
38 | }
39 | \value{
40 | No return value, prints a summary of the context-level preference
41 | estimates to the console.
42 | }
43 | \description{
44 | \code{summary.autoMrP()} prints a summary of the context-level preference
45 | estimates contained in an \code{autoMrP} object.
46 | }
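47 | \examples{
48 | \dontrun{
49 | # Illustrative workflow (assumption: arguments follow the example data
50 | # shipped with autoMrP); fit autoMrP, then summarize the estimates
51 | fit <- auto_MrP(
52 |   y = "YES", L1.x = c("L1x1", "L1x2", "L1x3"),
53 |   L2.x = c("L2.x1", "L2.x2"), L2.unit = "L2.unit",
54 |   survey = taxes_survey, census = taxes_census
55 | )
56 | summary(fit, ci.lvl = 0.95, digits = 4, n = 10)
57 | }
58 | }
59 |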
-------------------------------------------------------------------------------- /man/survey_item.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/survey_data.R
3 | \docType{data}
4 | \name{survey_item}
5 | \alias{survey_item}
6 | \title{A sample of a survey item from the CCES 2008}
7 | \format{
8 | A data frame with 1500 rows and 13 variables:
9 | \describe{
10 | \item{YES}{1 if individual supports use of troops; 0 otherwise}
11 | \item{L1x1}{Age group (four categories: 1 = 18-29; 2 = 30-44; 3 = 45-64; 4 = 65+)}
12 | \item{L1x2}{Education level (four categories: 1 = < high school; 2 = high school graduate; 3 = some college; 4 = college graduate)}
13 | \item{L1x3}{Gender-race combination (six categories: 1 = white male; 2 = black male; 3 = hispanic male; 4 = white female; 5 = black female; 6 = hispanic female)}
14 | \item{state}{U.S. state}
15 | \item{L2.unit}{U.S. state id}
16 | \item{region}{U.S. region (four categories: 1 = Northeast; 2 = Midwest; 3 = South; 4 = West)}
17 | \item{L2.x1}{Normalized state-level share of votes for the Republican candidate in the previous presidential election}
18 | \item{L2.x2}{Normalized state-level percentage of Evangelical Protestant or Mormon respondents}
19 | \item{L2.x3}{Normalized state-level percentage of the population living in urban areas}
20 | \item{L2.x4}{Normalized state-level unemployment rate}
21 | \item{L2.x5}{Normalized state-level share of Hispanics}
22 | \item{L2.x6}{Normalized state-level share of Whites}
23 | }
24 | }
25 | \source{
26 | The data set (excluding L2.x3, L2.x4, L2.x5, L2.x6) is taken from the
27 | article: Buttice, Matthew K, and Benjamin Highton. 2013. "How does
28 | multilevel regression and poststratification perform with
29 | conventional national surveys?" Political Analysis 21(4): 449-467. It is a
30 | random sample with at least 5 respondents per state. L2.x3, L2.x4,
31 | L2.x5 and L2.x6 are available at \url{https://www.census.gov}.
32 | }
33 | \usage{
34 | survey_item
35 | }
36 | \description{
37 | The Cooperative Congressional Election Studies (CCES) item (cc418_1) asked:
38 | "Would you approve of the use of U.S. military troops in order to ensure the
39 | supply of oil?" The original 2008 CCES item contains 36,832 respondents. This
40 | sample mimics a typical national survey. It contains at least 5 respondents
41 | from each state but is otherwise a random sample.
42 | }
43 | \keyword{datasets}
44 |
-------------------------------------------------------------------------------- /man/svm_classifier.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/svm_classifier.R
3 | \name{svm_classifier}
4 | \alias{svm_classifier}
5 | \title{SVM classifier}
6 | \usage{
7 | svm_classifier(
8 |   y,
9 |   form,
10 |   data,
11 |   kernel,
12 |   type,
13 |   probability,
14 |   svm.gamma,
15 |   svm.cost,
16 |   verbose = c(TRUE, FALSE)
17 | )
18 | }
19 | \arguments{
20 | \item{y}{Outcome variable. A character scalar
21 | containing the column name of the outcome
22 | variable in \code{survey}.}
23 |
24 | \item{form}{Model formula. A two-sided linear formula describing
25 | the model to be fit, with the outcome on the LHS and the covariates
26 | separated by + operators on the RHS.}
27 |
28 | \item{data}{Data. A data.frame containing the cross-validation data used to
29 | train and evaluate the model.}
30 |
31 | \item{kernel}{Kernel for SVM. A character string specifying the kernel to
32 | be used for SVM. The possible types are linear, polynomial, radial, and
33 | sigmoid. Default is radial.}
34 |
35 | \item{type}{svm can be used as a classification machine, as a regression
36 | machine, or for novelty detection. Depending on whether y is a factor or
37 | not, the default setting for type is C-classification or eps-regression,
38 | respectively, but may be overwritten by setting an explicit value. Valid
39 | options are: \enumerate{
40 | \item C-classification
41 | \item nu-classification
42 | \item one-classification (for novelty detection)
43 | \item eps-regression
44 | \item nu-regression
45 | }}
46 |
47 | \item{probability}{Probability predictions. A logical argument indicating
48 | whether the model should allow for probability predictions.}
49 |
50 | \item{svm.gamma}{Gamma parameter for SVM. This parameter is needed for all
51 | kernels except linear.}
52 |
53 | \item{svm.cost}{Cost parameter for SVM. This parameter specifies the cost of
54 | constraints violation.}
55 |
56 | \item{verbose}{Verbose output. A logical argument indicating whether or not
57 | verbose output should be printed.}
58 | }
59 | \value{
60 | The support vector machine model. An \code{\link[e1071]{svm}} object.
61 | }
62 | \description{
63 | \code{svm_classifier} applies support vector machine classification to a
64 | data set.
65 | }
66 |
-------------------------------------------------------------------------------- /man/taxes_census.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/taxes_census.R
3 | \docType{data}
4 | \name{taxes_census}
5 | \alias{taxes_census}
6 | \title{Quasi census data.}
7 | \format{
8 | A data frame with 2934 rows and 13 variables:
9 | \describe{
10 | \item{state}{U.S. state}
11 | \item{L2.unit}{U.S. state id}
12 | \item{region}{U.S.
region (four categories: 1 = Northeast; 2 = Midwest; 3 = South; 4 = West)}
13 | \item{L1x1}{Age group (four categories)}
14 | \item{L1x2}{Education level (four categories)}
15 | \item{L1x3}{Gender-race combination (six categories)}
16 | \item{freq}{State-level frequency of ideal type}
17 | \item{proportion}{State-level proportion of respondents of that ideal type in the population}
18 | \item{L2.x1}{State-level share of votes for the Republican candidate in the previous presidential election}
19 | \item{L2.x2}{State-level percentage of Evangelical Protestant or Mormon respondents}
20 | \item{L2.x3}{State-level percentage of the population living in urban areas}
21 | \item{L2.x4}{State-level unemployment rate}
22 | \item{L2.x5}{State-level share of Hispanics}
23 | \item{L2.x6}{State-level share of Whites}
24 | }
25 | }
26 | \source{
27 | The data set (excluding L2.x3, L2.x4, L2.x5, L2.x6) is taken from the
28 | article: Buttice, Matthew K, and Benjamin Highton. 2013. "How does
29 | multilevel regression and poststratification perform with
30 | conventional national surveys?" Political Analysis 21(4): 449-467. L2.x3,
31 | L2.x4, L2.x5 and L2.x6 are available at
32 | \url{https://www.census.gov}.
33 | }
34 | \usage{
35 | data(taxes_census)
36 | }
37 | \description{
38 | The census file is generated from the full 2008 National Annenberg Election
39 | Studies item CBb01 by disaggregating the 96 ideal type combinations of the
40 | individual-level variables L1x1, L1x2 and L1x3. A row is an ideal type in a
41 | given state.
42 | }
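43 | \examples{
44 | # Inspect the quasi census data (illustrative)
45 | data(taxes_census)
46 | str(taxes_census)
47 | }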
48 | \keyword{datasets}
49 |
-------------------------------------------------------------------------------- /man/taxes_survey.Rd: --------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/taxes_survey.R, R/taxes_truth.R
3 | \docType{data}
4 | \name{taxes_survey}
5 | \alias{taxes_survey}
6 | \title{Sample on raising taxes from the 2008 National Annenberg Election Studies.}
7 | \format{
8 | A data frame with 1500 rows and 13 variables:
9 | \describe{
10 | \item{YES}{1 if individual supports raising taxes; 0 otherwise}
11 | \item{L1x1}{Age group (four categories: 1 = 18-29; 2 = 30-44; 3 = 45-64; 4 = 65+)}
12 | \item{L1x2}{Education level (four categories: 1 = < high school; 2 = high school graduate; 3 = some college; 4 = college graduate)}
13 | \item{L1x3}{Gender-race combination (six categories: 1 = white male; 2 = black male; 3 = hispanic male; 4 = white female; 5 = black female; 6 = hispanic female)}
14 | \item{state}{U.S. state}
15 | \item{L2.unit}{U.S. state id}
16 | \item{region}{U.S. region (four categories: 1 = Northeast; 2 = Midwest; 3 = South; 4 = West)}
17 | \item{L2.x1}{State-level share of votes for the Republican candidate in the previous presidential election}
18 | \item{L2.x2}{State-level percentage of Evangelical Protestant or Mormon respondents}
19 | \item{L2.x3}{State-level percentage of the population living in urban areas}
20 | \item{L2.x4}{State-level unemployment rate}
21 | \item{L2.x5}{State-level share of Hispanics}
22 | \item{L2.x6}{State-level share of Whites}
23 | }
24 | }
25 | \source{
26 | The data set (excluding L2.x3, L2.x4, L2.x5, L2.x6) is taken from the
27 | article: Buttice, Matthew K, and Benjamin Highton. 2013. "How does
28 | multilevel regression and poststratification perform with
29 | conventional national surveys?" Political Analysis 21(4): 449-467. It is a
30 | random sample with at least 5 respondents per state. L2.x3, L2.x4,
31 | L2.x5 and L2.x6 are available at \url{https://www.census.gov}.
32 | }
33 | \usage{
34 | data(taxes_survey)
35 | }
36 | \description{
37 | The 2008 National Annenberg Election Studies (NAES) item (CBb01) asked: "I'm
38 | going to read you some options about federal income taxes. Please tell me
39 | which one comes closest to your view on what we should be doing about federal
40 | income taxes: (1) Cut taxes; (2) Keep taxes as they are; (3) Raise taxes if
41 | necessary; (4) None of these; (998) Don't know; (999) No answer." Category (3)
42 | was turned into a 'raise taxes' response, categories (1) and (2) were
43 | combined into a 'do not raise taxes' response. The original item from the
44 | phone and online surveys contains 50,483 respondents. This sample mimics a
45 | typical national survey. It contains at least 5 respondents from each state
46 | but is otherwise a random sample.
47 | }
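48 | \examples{
49 | # Inspect the sample and the overall share of support (illustrative)
50 | data(taxes_survey)
51 | mean(taxes_survey$YES)
52 | }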
53 | \keyword{datasets}
54 |
-------------------------------------------------------------------------------- /vignettes/autoMrP_vignette.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/retowuest/autoMrP/1da422652719263acb41d4d3ca04e83d1bcf32f9/vignettes/autoMrP_vignette.pdf
-------------------------------------------------------------------------------- /vignettes/autoMrP_vignette.pdf.asis: --------------------------------------------------------------------------------
1 | %\VignetteIndexEntry{autoMrP: Multilevel Models and Post-Stratification (MrP) Combined with Machine Learning in R}
2 | %\VignetteEngine{R.rsp::asis}
--------------------------------------------------------------------------------