├── .Rbuildignore ├── .github ├── .gitignore └── workflows │ └── R-CMD-check.yaml ├── .gitignore ├── DESCRIPTION ├── MNLpred.Rproj ├── NAMESPACE ├── NEWS.md ├── R ├── gles-data.R ├── mnl_fd2_ova.R ├── mnl_fd_ova.R └── mnl_pred_ova.R ├── README.Rmd ├── README.md ├── cran-comments.md ├── data └── gles.RData ├── inst └── CITATION ├── man ├── figures │ ├── README-first_differences_plot-1.png │ ├── README-prediction_plot1-1.png │ ├── README-prediction_plot2-1.png │ └── README-static_fd_plot-1.png ├── gles.Rd ├── mnl_fd2_ova.Rd ├── mnl_fd_ova.Rd └── mnl_pred_ova.Rd ├── tests ├── testthat.R └── testthat │ └── test_inputvariants.R └── vignettes ├── .gitignore ├── OVA_Predictions_For_MNL.Rmd └── bibliography.bib /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^MNLpred\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^README\.Rmd$ 4 | ^cran-comments\.md$ 5 | feedback 6 | issues 7 | README_files 8 | PDF 9 | ^CRAN-RELEASE$ 10 | release 11 | ^\.github$ 12 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # For help debugging build failures open an issue on the RStudio community with the 'github-actions' tag. 
2 | # https://community.rstudio.com/new-topic?category=Package%20development&tags=github-actions 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - master 8 | pull_request: 9 | branches: 10 | - main 11 | - master 12 | 13 | name: R-CMD-check 14 | 15 | jobs: 16 | R-CMD-check: 17 | runs-on: ${{ matrix.config.os }} 18 | 19 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 20 | 21 | strategy: 22 | fail-fast: false 23 | matrix: 24 | config: 25 | - {os: windows-latest, r: 'release'} 26 | - {os: macOS-latest, r: 'release'} 27 | - {os: ubuntu-20.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} 28 | - {os: ubuntu-20.04, r: 'devel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} 29 | 30 | env: 31 | R_REMOTES_NO_ERRORS_FROM_WARNINGS: true 32 | RSPM: ${{ matrix.config.rspm }} 33 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 34 | 35 | steps: 36 | - uses: actions/checkout@v2 37 | 38 | - uses: r-lib/actions/setup-r@v1 39 | with: 40 | r-version: ${{ matrix.config.r }} 41 | 42 | - uses: r-lib/actions/setup-pandoc@v1 43 | 44 | - name: Query dependencies 45 | run: | 46 | install.packages('remotes') 47 | saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) 48 | writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") 49 | shell: Rscript {0} 50 | 51 | - name: Cache R packages 52 | if: runner.os != 'Windows' 53 | uses: actions/cache@v2 54 | with: 55 | path: ${{ env.R_LIBS_USER }} 56 | key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} 57 | restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- 58 | 59 | - name: Install system dependencies 60 | if: runner.os == 'Linux' 61 | run: | 62 | while read -r cmd 63 | do 64 | eval sudo $cmd 65 | done < <(Rscript -e 'writeLines(remotes::system_requirements("ubuntu", "20.04"))') 66 | 67 | - name: Install dependencies 68 | run: | 69 | 
remotes::install_deps(dependencies = TRUE) 70 | remotes::install_cran("rcmdcheck") 71 | shell: Rscript {0} 72 | 73 | - name: Check 74 | env: 75 | _R_CHECK_CRAN_INCOMING_REMOTE_: false 76 | run: rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check") 77 | shell: Rscript {0} 78 | 79 | - name: Upload check results 80 | if: failure() 81 | uses: actions/upload-artifact@main 82 | with: 83 | name: ${{ runner.os }}-r${{ matrix.config.r }}-results 84 | path: check 85 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | inst/doc 3 | feedback 4 | release 5 | 6 | issues/ 7 | 8 | .Rhistory 9 | 10 | PDF 11 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: MNLpred 2 | Title: Simulated Predicted Probabilities for Multinomial Logit Models 3 | Version: 0.0.8 4 | Authors@R: 5 | person(given = "Manuel", 6 | family = "Neumann", 7 | role = c("aut", "cre"), 8 | email = "manuel.neumann@mzes.uni-mannheim.de", 9 | comment = c(ORCID = "0000-0002-7953-3939")) 10 | Depends: R (>= 3.5.0) 11 | Description: Functions to easily return simulated predicted probabilities and 12 | first differences for multinomial logit models. It takes a specified 13 | scenario and a multinomial model to predict probabilities with a set of 14 | coefficients, drawn from a simulated sampling distribution. The simulated 15 | predictions allow for meaningful plots with means and confidence intervals. 16 | The methodological approach is based on the principles laid out by King, 17 | Tomz, and Wittenberg (2000) and Hanmer and Ozan Kalkan 18 | (2016) . 
19 | License: GPL-3 20 | Encoding: UTF-8 21 | LazyData: true 22 | RoxygenNote: 7.1.1 23 | Suggests: 24 | knitr, 25 | rmarkdown, 26 | testthat, 27 | nnet, 28 | magrittr, 29 | ggplot2, 30 | scales 31 | VignetteBuilder: knitr 32 | Imports: 33 | MASS, 34 | stats 35 | -------------------------------------------------------------------------------- /MNLpred.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageCheckArgs: --as-cran 22 | PackageRoxygenize: rd,collate,namespace,vignette 23 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(mnl_fd2_ova) 4 | export(mnl_fd_ova) 5 | export(mnl_pred_ova) 6 | importFrom(MASS,mvrnorm) 7 | importFrom(stats,coef) 8 | importFrom(stats,na.omit) 9 | importFrom(stats,quantile) 10 | importFrom(utils,setTxtProgressBar) 11 | importFrom(utils,txtProgressBar) 12 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # MNLpred 0.0.8 2 | 3 | - Skipping a package test that causes issues on CRAN. 4 | 5 | # MNLpred 0.0.7 6 | 7 | - Fixing an erroneously triggered error message that did not correctly 8 | evaluate the class of the IV when only one IV is supplied in the model. 9 | 10 | # MNLpred 0.0.6 11 | 12 | - Fixes a bug so that the function also works with just one IV. 
13 | 14 | # MNLpred 0.0.5 15 | 16 | - Arguments 'xvari', 'scenname', 'scenvalue', and 'scenvalues' are deprecated 17 | in favor of 'x', 'z', 'z_value', and 'z_values' respectively. 18 | - New error messages. 19 | - Major speed enhancement. 20 | 21 | # MNLpred 0.0.4 22 | 23 | - Error message for factor variables in data. 24 | 25 | # MNLpred 0.0.3 26 | 27 | - Package includes sample data for examples, vignettes, etc. 28 | - Functions return more descriptive error messages. 29 | - Functions return progress bars. 30 | 31 | 32 | # MNLpred 0.0.2 33 | Package continues to work with the new major tibble release. 34 | 35 | # MNLpred 0.0.1 36 | -------------------------------------------------------------------------------- /R/gles-data.R: -------------------------------------------------------------------------------- 1 | #' German Longitudinal Election Study 2 | #' 3 | #' A sample of 1,000 respondents in the Rolling Cross Sectional 4 | #' study in the German Longitudinal Election Study in 2017. 5 | #' 6 | #' @docType data 7 | #' 8 | #' @usage data(gles) 9 | #' 10 | #' @format A data frame with 1,000 observations and 6 variables: 11 | #' \describe{ 12 | #' \item{vote}{Voting decision for party} 13 | #' \item{egoposition_immigration}{Ego-position toward immigration (0 = very open to 10 = very restrictive)} 14 | #' \item{ostwest}{Dummy for respondents from Eastern Germany (= 1)} 15 | #' \item{political_interest}{Measurement for political interest (0 = low, 4 = high)} 16 | #' \item{income}{Self-reported income satisfaction (0 = low, 4 = high)} 17 | #' \item{gender}{Self-reported gender (binary coding with 1 = female)} 18 | #' } 19 | #' 20 | #' 21 | #' @keywords datasets 22 | #' 23 | #' @references Roßteutscher, Sigrid et al. 2019. 24 | #' “Rolling Cross-Section-Wahlkampfstudie mit Nachwahl-Panelwelle (GLES 2017).” 25 | #' ZA6803 Datenfile Version 4.0.1. 26 | #' (\href{https://www.doi.org/10.4232/1.13213}{GESIS Datenarchiv}).
27 | #' 28 | #' @source \href{https://www.doi.org/10.4232/1.13213}{GESIS Datenarchiv} 29 | #' 30 | #' @examples 31 | #' data(gles) 32 | #' table(gles$vote) 33 | "gles" 34 | -------------------------------------------------------------------------------- /R/mnl_fd2_ova.R: -------------------------------------------------------------------------------- 1 | #' Multinomial First Differences Predictions For Two Values (Observed Value Approach) 2 | #' 3 | #' @param model the multinomial model, from a \code{\link{multinom}}()-function call (see the \code{\link{nnet}} package) 4 | #' @param data the data with which the model was estimated 5 | #' @param x the name of the variable that should be varied 6 | #' @param value1 first value for the difference 7 | #' @param value2 second value for the difference 8 | #' @param xvari former argument for \code{x} (deprecated). 9 | #' @param nsim numbers of simulations 10 | #' @param seed set a seed for replication purposes. 11 | #' @param probs a vector with two numbers, defining the significance levels. Default to 5\% significance level: \code{c(0.025, 0.975)} 12 | #' 13 | #' @return The function returns a list with several elements. Most importantly the list includes the simulated draws `S`, the simulated predictions `P`, the first differences of the predictions `P_fd`, a data set for plotting `plotdata` the predicted probabilities, and one for the first differences `plotdata_fd`. 
#' @export
#'
#' @examples
#' library(nnet)
#' library(MASS)
#'
#' dataset <- data.frame(y = c(rep("a", 10), rep("b", 10), rep("c", 10)),
#'                       x1 = rnorm(30),
#'                       x2 = rnorm(30, mean = 1),
#'                       x3 = sample(1:10, 30, replace = TRUE))
#'
#' mod <- multinom(y ~ x1 + x2 + x3, data = dataset, Hess = TRUE)
#'
#' fdi1 <- mnl_fd2_ova(model = mod, data = dataset,
#'                     x = "x1",
#'                     value1 = min(dataset$x1),
#'                     value2 = max(dataset$x1),
#'                     nsim = 10)
#'
#' @importFrom stats coef na.omit quantile
#' @importFrom utils setTxtProgressBar txtProgressBar
#' @importFrom MASS mvrnorm


mnl_fd2_ova <- function(model,
                        data,
                        x,
                        value1,
                        value2,
                        xvari,
                        nsim = 1000,
                        seed = "random",
                        probs = c(0.025, 0.975)){

  # List that collects all results and is returned in the end.
  output <- list()

  # Warnings for deprecated arguments
  if (!missing(xvari)) {
    warning("The argument 'xvari' is deprecated; please use 'x' instead.\n\n",
            call. = FALSE)
    x <- xvari
  }

  # Errors:
  if (is.null(model) == TRUE) {
    stop("Please supply a model")
  }

  if (sum(grepl("multinom", model$call)) == 0) {
    stop("Please supply a multinom()-model")
  }

  if (is.null(data) == TRUE) {
    stop("Please supply a data set")
  }

  if (is.null(x) == TRUE | is.character(x) == FALSE) {
    stop("Please supply a character of your x-variable of interest")
  }

  if (is.null(value1) == TRUE | is.null(value2) == TRUE) {
    stop("Please supply values to compute differences")
  }

  if (is.null(model$Hessian) == TRUE) {
    stop("There is no Hessian matrix. Please specify Hess = TRUE in your multinom() call.")
  }

  # Names of variables in model (without the "list" character in the vector)
  variables <- as.character(attr(model$terms, "variables"))[-1]

  if (!(x %in% variables) == TRUE){
    stop("x-variable is not an independent variable in the model. There might be a typo.")
  }

  # > Handling the IVs ---------------------------------------------------------
  # Names of the independent variables (everything after the DV)
  iv <- variables[2:length(variables)]
  output[["IV"]] <- iv

  # Variables have to be numeric
  if (length(iv) > 1) {
    if (sum(apply(data[, iv], 2, class) %in% c("numeric", "integer")) < ncol(data[, iv])) {
      stop("Please supply data that consists of numeric values. The package can not handle factor or character variables, yet. For workarounds, please take a look at the github issues (https://github.com/ManuelNeumann/MNLpred/issues/1). The problem will hopefully be fixed with the 0.1.0 release.")
    }
  } else {
    # data[[iv]] is the direct (and safer) equivalent of the former
    # eval(parse(text = paste0("data$", iv))) construction.
    if (class(data[[iv]]) %in% c("numeric", "integer") == FALSE) {
      stop("Please supply data that consists of numeric values. The package can not handle factor or character variables, yet. For workarounds, please take a look at the github issues (https://github.com/ManuelNeumann/MNLpred/issues/1). The problem will hopefully be fixed with the 0.1.0 release.")
    }
  }

  # > Handling the DV ----------------------------------------------------------
  # Name of the dependent variable (first entry of the terms)
  dv <- variables[1]
  output[["DV"]] <- dv

  # > Full observations (listwise deletion) ------------------------------------
  data_redux <- na.omit(data[, c(dv, iv)])

  # Number of full observations (computed once; the original computed it twice)
  obs <- nrow(data_redux)
  output[["Observations"]] <- obs

  # > Working with the model ---------------------------------------------------
  # Get matrix of coefficients out of the model
  coefmatrix <- coef(model)

  # Number of coefficients per choice category
  ncoef <- ncol(coefmatrix)

  # Model coefficients as a vector (row-wise, matching the Hessian ordering)
  mu <- as.vector(t(coefmatrix))

  # Variance-covariance matrix of estimates (inverse of the Hessian)
  varcov <- solve(model$Hessian)

  # Set seed if needed:
  if (seed != "random") {
    set.seed(seed = seed)
  }

  # Simulate a sampling distribution of the coefficients
  S <- mvrnorm(nsim, mu, varcov)
  output[["S"]] <- S

  # Artificial variation of the independent variable of interest:
  # exactly the two scenario values the first difference is built from.
  variation <- c(value1, value2)
  output[["ScenarioValues"]] <- variation
  nseq <- length(variation)

  # Choice categories of the dependent variable
  categories <- sort(unique(data[[dv]]))
  J <- length(categories)
  output[["ChoiceCategories"]] <- categories
  output[["nChoices"]] <- J

  # Number of interaction terms (detected via ":" in the coefficient names)
  ninteraction <- sum(grepl(":", model$coefnames))

  # Matrix of observations
  X <- matrix(NA, ncol = ncoef, nrow = obs)
  colnames(X) <- model$coefnames
  # 1 for the Intercept
  X[, 1] <- 1
  # Values of the independent variables
  X[, 2:(length(iv)+1)] <- as.matrix(data_redux[, iv])

  # Prepare array and fill each scenario slice with the observed values
  ovacases <- array(NA, c(dim(X), nseq))
  ovacases[,,] <- X

  # Column position of the variable which should vary
  # (x was already validated above, so no NULL check is needed here):
  varidim <- which(colnames(X) == x)

  # Artificially alter the variable in each dimension according to
  # the two scenario values:
  for (i in 1:nseq) {
    ovacases[, varidim, i] <- variation[i]
  }

  # Compute interactions:
  if (ninteraction != 0) {

    # Positions of the interaction terms
    interactionterms <- which(grepl(":", model$coefnames) == TRUE)

    # Recompute each interaction column from its two constituent columns
    # so that the artificially altered x propagates into the interactions:
    for (i in interactionterms) {
      # First variable name of the interaction:
      firstint <- gsub(":.*", "", model$coefnames[i])
      # Second variable name of the interaction:
      secondint <- gsub(".*:", "", model$coefnames[i])

      # Get position in matrix:
      intdim1 <- which(colnames(X) == firstint)
      intdim2 <- which(colnames(X) == secondint)

      # Compute interaction term:
      for (j in 1:nseq) {
        ovacases[, i, j] <- ovacases[, intdim1, j] * ovacases[, intdim2, j]
      }
    }
  }

  # Prepare array of linear predictors: obs x nsim x scenario x choice
  ovaV <- array(NA, c(obs, nsim, nseq, J))

  # Add progress bar
  pb_multiplication <- txtProgressBar(min = 0, max = nseq, initial = 0)

  # Loop over all scenarios
  cat("Multiplying values with simulated estimates:\n")

  for (i in 1:nseq) {
    # The baseline category's linear predictor is identically zero, so it can
    # be assigned directly (the original multiplied with a zero matrix, which
    # produced the same all-zero result at O(obs * ncoef * nsim) cost).
    ovaV[, , i, 1] <- 0

    # For each other choice, multiply the cases with the simulated estimates.
    # Direct slicing replaces the former eval(parse(...)) construction;
    # drop = FALSE keeps the slice a matrix even for a single coefficient.
    for (k in 2:J) {
      coefstart <- (k-2)*ncoef + 1
      coefend <- (k-1)*ncoef
      ovaV[, , i, k] <- apply(S[, coefstart:coefend, drop = FALSE], 1,
                              function(s) ovacases[, , i] %*% s)
    }

    # Progress bar:
    setTxtProgressBar(pb_multiplication, i)
  }

  # Multinomial link function:

  pb_link <- txtProgressBar(min = 0, max = nseq, initial = 0)
  cat("\nApplying link function:\n")

  # 1. Part: Sum of exponentiated linear predictors over all choices
  Sexp <- rowSums(exp(ovaV), dims = 3L)

  # Create P (array with predictions: nsim x choice x scenario)
  P <- array(NA, c(nsim, J, nseq))

  # 2. Part: take the exponent and divide through the sum of all (Sexp),
  # then average over the observed cases (observed value approach)
  for (l in 1:nseq) {
    for (m in 1:J) {
      P[, m, l] <- colMeans(exp(ovaV[, , l, m]) / Sexp[, , l])
      if (sum(is.na(P[, m, l])) != 0) {
        stop("Some of the log-odds are very large and the exponent cannot be computed. Please check your model specification for any problems, such as perfectly separated variables.")
      }
    }

    setTxtProgressBar(pb_link, l)
  }

  output[["P"]] <- P

  # Aggregate the simulations into a data frame for plotting
  plotdat <- data.frame(iv = rep(variation, J),
                        categories = rep(categories, each = length(variation)),
                        mean = NA,
                        lower = NA,
                        upper = NA)

  # Aggregate: mean and quantile bounds per choice category
  start <- 1
  for (i in 1:J) {
    end <- i*length(variation)
    plotdat[c(start:end), "mean"] <- colMeans(P[, i, ])
    plotdat[c(start:end), "lower"] <- apply(P[, i, ], 2, quantile, probs = probs[1])
    plotdat[c(start:end), "upper"] <- apply(P[, i, ], 2, quantile, probs = probs[2])
    start <- end + 1
  }

  # Rename the variables in the plot data
  colnames(plotdat)[1:2] <- c(x, dv)

  # Put the data in the output
  output[["plotdata"]] <- plotdat

  # First differences: scenario 2 minus scenario 1, per simulation draw
  P_fd <- array(NA, dim = c(nsim, J))

  for (i in 1:J) {
    P_fd[, i] <- P[, i, 2] - P[, i, 1]
  }

  output[["P_fd"]] <- P_fd

  # Plot data for the first differences (one row per choice category)
  plotdat_fd <- data.frame(categories = categories,
                           mean = NA,
                           lower = NA,
                           upper = NA)

  for (i in 1:J) {
    plotdat_fd[i, "mean"] <- mean(P_fd[, i])
    plotdat_fd[i, "lower"] <- quantile(P_fd[, i], probs = probs[1])
    plotdat_fd[i, "upper"] <- quantile(P_fd[, i], probs = probs[2])
  }

  output[["plotdata_fd"]] <- plotdat_fd

  cat("\nDone!\n\n")
  return(output)
}
--------------------------------------------------------------------------------
/R/mnl_fd_ova.R:
--------------------------------------------------------------------------------
#' Multinomial First Differences Prediction (Observed Value Approach)
#'
#' This function predicts values for two different scenarios over a range of
#' values. It then takes the differences between the different simulations to
#' return first differences for each value.
#'
#' The function uses the \code{\link{mnl_pred_ova}} function for each scenario.
#' The results of these predictions are also returned and can therefore be
#' easily accessed. If you need predictions for multiple scenarios, you can use
#' this function to both plot the predictions for each scenario and the
#' differences between them.
#'
#' @param model the multinomial model, from a \code{\link{multinom}}()-function call (see the \code{\link{nnet}} package)
#' @param data the data with which the model was estimated
#' @param x the name of the variable that should be varied (the x-axis variable in prediction plots)
#' @param z define the variable for which you want to compute the difference.
#' @param z_values determine the two values at which value you want to fix the scenario (\code{z}). The first differences will be computed by subtracting the values of the first supplied scenario from the second one.
#' @param xvari former argument for \code{x} (deprecated).
#' @param scenname former argument for \code{z} (deprecated).
#' @param scenvalues former argument for \code{z_values} (deprecated).
#' @param by define the steps of \code{x}.
#' @param nsim numbers of simulations
#' @param seed set a seed for replication purposes.
#' @param probs a vector with two numbers, defining the significance levels. Default to 5\% significance level: \code{c(0.025, 0.975)}
#'
#' @return The function returns a list with several elements. Most importantly the list includes the simulated draws `S`, the simulated predictions `P`, and a data set for plotting `plotdata`.
#' @export
#'
#' @examples
#' library(nnet)
#' library(MASS)
#'
#' dataset <- data.frame(y = c(rep("a", 10), rep("b", 10), rep("c", 10)),
#'                       x1 = rnorm(30),
#'                       x2 = rnorm(30, mean = 1),
#'                       x3 = sample(1:10, 30, replace = TRUE))
#'
#' mod <- multinom(y ~ x1 + x2 + x3, data = dataset, Hess = TRUE)
#'
#' fdif <- mnl_fd_ova(model = mod, data = dataset,
#'                    x = "x1", z = "x3",
#'                    z_values = c(min(dataset$x3), max(dataset$x3)),
#'                    nsim = 10)
#'

mnl_fd_ova <- function(model,
                       data,
                       x,
                       z,
                       z_values,
                       xvari,
                       scenname,
                       scenvalues,
                       by = NULL,
                       nsim = 1000,
                       seed = "random",
                       probs = c(0.025, 0.975)){

  # List that collects all results and is returned in the end.
  output <- list()

  # Warnings for deprecated arguments
  if (!missing(xvari)) {
    warning("The argument 'xvari' is deprecated; please use 'x' instead.\n\n",
            call. = FALSE)
    x <- xvari
  }

  if (!missing(scenname)) {
    warning("The argument 'scenname' is deprecated; please use 'z' instead.\n\n",
            call. = FALSE)
    z <- scenname
  }

  if (!missing(scenvalues)) {
    warning("The argument 'scenvalues' is deprecated; please use 'z_values' instead.\n\n",
            call. = FALSE)
    z_values <- scenvalues
  }

  # Errors:
  if (is.null(model) == TRUE) {
    stop("Please supply a model")
  }

  if (sum(grepl("multinom", model$call)) == 0) {
    stop("Please supply a multinom()-model")
  }

  if (is.null(data) == TRUE) {
    stop("Please supply a data set")
  }

  if (is.null(x) == TRUE | is.character(x) == FALSE) {
    stop("Please supply a character of your x-variable of interest")
  }

  if (is.null(z) == TRUE | is.character(z) == FALSE) {
    stop("Please supply a character of the scenario variable of interest")
  }

  if (is.null(z_values) == TRUE |
      length(z_values) != 2 |
      is.vector(z_values, mode = "numeric") == FALSE) {
    # (Fixed wording: the original message read "Please two numeric values")
    stop("Please supply two numeric values that are used for the different scenarios")
  }

  # Draw one common seed so that BOTH scenarios are simulated with the same
  # coefficient draws; otherwise the first differences would mix simulation
  # noise with the scenario effect.
  if (seed == "random") {
    seed <- sample(1:10000, 1)
  }

  # Predictions for first scenario
  cat("First scenario:\n")

  pred1 <- mnl_pred_ova(model = model,
                        data = data,
                        x = x,
                        z = z,
                        z_value = z_values[1],
                        by = by, nsim = nsim, seed = seed,
                        probs = probs)
  output[["Prediction1"]] <- pred1

  # Predictions for second scenario
  cat("Second scenario:\n")

  pred2 <- mnl_pred_ova(model = model,
                        data = data,
                        x = x,
                        z = z,
                        z_value = z_values[2],
                        by = by, nsim = nsim, seed = seed,
                        probs = probs)
  output[["Prediction2"]] <- pred2

  # Stack both scenarios' plot data for combined prediction plots
  plotdat <- rbind(pred1$plotdata,
                   pred2$plotdata)

  output[["plotdata"]] <- plotdat

  # First differences: second scenario minus first scenario, per draw
  P_fd <- array(NA, dim = c(nsim, pred1$nChoices, pred1$nVariation))

  for (i in 1:pred1$nChoices) {
    P_fd[, i, ] <- pred2$P[, i, ] - pred1$P[, i, ]
  }

  output[["P"]] <- P_fd

  # Plot data for the first differences, built on scenario 1's frame
  plotdata_fd <- pred1$plotdata[, c(1:2, 4:6)]
  plotdata_fd[, c("mean", "lower", "upper")] <- NA

  start <- 1
  for (i in 1:pred1$nChoices) {
    end <- i * pred1$nVariation
    plotdata_fd[c(start:end), "mean"] <- colMeans(P_fd[, i, ])
    plotdata_fd[c(start:end), "lower"] <- apply(P_fd[, i, ], 2, quantile, probs = probs[1])
    plotdata_fd[c(start:end), "upper"] <- apply(P_fd[, i, ], 2, quantile, probs = probs[2])
    start <- end + 1
  }

  output[["plotdata_fd"]] <- plotdata_fd

  return(output)

}
--------------------------------------------------------------------------------
/R/mnl_pred_ova.R:
--------------------------------------------------------------------------------
#' Multinomial Prediction Function (Observed Value Approach)
#'
#' This function predicts probabilities for all choices of a multinomial logit
#' model over a specified span of values.
#'
#' @param model the multinomial model, from a \code{\link{multinom}}()-function call (see the \code{\link{nnet}} package)
#' @param data the data with which the model was estimated
#' @param x the name of the variable that should be varied (the x-axis variable in prediction plots)
#' @param z if you want to hold a specific variable stable over all scenarios, you can name it here (optional).
#' @param z_value determine at which value you want to fix the \code{z}.
#' @param xvari former argument for \code{x} (deprecated).
#' @param scenname former argument for \code{z} (deprecated).
#' @param scenvalue former argument for \code{z_value} (deprecated).
#' @param by define the steps of \code{x}.
#' @param nsim numbers of simulations
#' @param seed set a seed for replication purposes.
#' @param probs a vector with two numbers, defining the significance levels. Default to 5\% significance level: \code{c(0.025, 0.975)}
#'
#' @return The function returns a list with several elements.
Most importantly the list includes the simulated draws `S`, the simulated predictions `P`, and a data set for plotting `plotdata`. 20 | #' @export 21 | #' 22 | #' @examples 23 | #' library(nnet) 24 | #' library(MASS) 25 | #' 26 | #' dataset <- data.frame(y = c(rep("a", 10), rep("b", 10), rep("c", 10)), 27 | #' x1 = rnorm(30), 28 | #' x2 = rnorm(30, mean = 1), 29 | #' x3 = sample(1:10, 30, replace = TRUE)) 30 | #' 31 | #' mod <- multinom(y ~ x1 + x2 + x3, data = dataset, Hess = TRUE) 32 | #' 33 | #' pred <- mnl_pred_ova(model = mod, data = dataset, 34 | #' x = "x1", 35 | #' nsim = 10) 36 | #' 37 | #' 38 | #' @importFrom stats coef na.omit quantile 39 | #' @importFrom utils setTxtProgressBar txtProgressBar 40 | #' @importFrom MASS mvrnorm 41 | 42 | 43 | mnl_pred_ova <- function(model, 44 | data, 45 | x, 46 | by = NULL, 47 | z = NULL, 48 | z_value = NULL, 49 | xvari, 50 | scenname, 51 | scenvalue, 52 | nsim = 1000, 53 | seed = "random", 54 | probs = c(0.025, 0.975)){ 55 | 56 | # Create list that is returned in the end. 57 | output <- list() 58 | 59 | # Warnings for deprecated arguments 60 | if (!missing(xvari)) { 61 | warning("The argument 'xvari' is deprecated; please use 'x' instead.\n\n", 62 | call. = FALSE) 63 | x <- xvari 64 | } 65 | 66 | if (!missing(scenname)) { 67 | warning("The argument 'scenname' is deprecated; please use 'z' instead.\n\n", 68 | call. = FALSE) 69 | z <- scenname 70 | } 71 | 72 | if (!missing(scenvalue)) { 73 | warning("The argument 'scenvalue' is deprecated; please use 'z_value' instead.\n\n", 74 | call. 
= FALSE) 75 | z_value <- scenvalue 76 | } 77 | 78 | 79 | # Errors: 80 | if (is.null(model) == TRUE) { 81 | stop("Please supply a model") 82 | } 83 | 84 | if (sum(grepl("multinom", model$call)) == 0) { 85 | stop("Please supply a multinom()-model") 86 | } 87 | 88 | if (is.null(data) == TRUE) { 89 | stop("Please supply a data set") 90 | } 91 | 92 | if (is.null(x) == TRUE | is.character(x) == FALSE) { 93 | stop("Please supply a character string of your x-variable of interest") 94 | } 95 | 96 | if (is.null(model$Hessian) == TRUE) { 97 | stop("There is no Hessian matrix. Please specify Hess = TRUE in your multinom() call.") 98 | } 99 | 100 | # Names of variables in model (without the "list" character in the vector) 101 | variables <- as.character(attr(model$terms, "variables"))[-1] 102 | 103 | if(!(x %in% variables) == TRUE){ 104 | stop("x-variable is not an independent variable in the model. There might be a typo.") 105 | } 106 | 107 | # Check if scenario is supplied correctly 108 | if (is.null(z) == FALSE & is.character(z) == FALSE) { 109 | stop("Please supply a character string of your scenario of interest") 110 | } 111 | 112 | if(is.null(z) == FALSE){ 113 | if (!(z %in% variables) == TRUE) { 114 | stop("The scenario variable is not an independent variable in the model. There might be a typo.") 115 | } 116 | } 117 | 118 | # > Handeling the IVs -------------------------------------------------------- 119 | # Name of independent variables 120 | iv <- variables[2:length(variables)] 121 | output[["IV"]] <- iv 122 | 123 | # Variables have to be numeric 124 | if (length(iv) > 1) { 125 | if (sum(apply(data[, iv], 2, class) %in% c("numeric", "integer")) < ncol(data[, iv])) { 126 | stop("Please supply data that consists of numeric values. The package can not handle factor or character variables, yet. For workarounds, please take a look at the github issues (https://github.com/ManuelNeumann/MNLpred/issues/1). 
The problem will hopefully be fixed with the 0.1.0 release.") 127 | } 128 | } else { 129 | if (class(eval(parse(text = paste0("data$", iv)))) %in% c("numeric", "integer") == FALSE) { 130 | stop("Please supply data that consists of numeric values. The package can not handle factor or character variables, yet. For workarounds, please take a look at the github issues (https://github.com/ManuelNeumann/MNLpred/issues/1). The problem will hopefully be fixed with the 0.1.0 release.") 131 | } 132 | } 133 | 134 | # > Handeling the DVs -------------------------------------------------------- 135 | # Name of dependent variable 136 | dv <- variables[1] 137 | output[["DV"]] <- dv 138 | 139 | 140 | # > Full observations (listwise deletion) ------------------------------------ 141 | data_redux <- na.omit(data[, c(dv, iv)]) 142 | 143 | # Number of full observations 144 | obs <- nrow(data_redux) 145 | output[["Observations"]] <- obs 146 | 147 | # > Working with the model --------------------------------------------------- 148 | # Get matrix of coefficients out of the model 149 | coefmatrix <- coef(model) 150 | 151 | # Number of coefficients 152 | ncoef <- ncol(coefmatrix) 153 | 154 | # Model coefficients as a vector 155 | mu <- as.vector(t(coef(model))) 156 | 157 | # Variance-covariance matrix of estimates 158 | varcov <- solve(model$Hessian) 159 | 160 | # Set seed if needed: 161 | if (seed != "random") { 162 | set.seed(seed = seed) 163 | } 164 | 165 | # Simulate a sampling distribution 166 | S <- mvrnorm(nsim, mu, varcov) 167 | output[["S"]] <- S 168 | 169 | # Artificial variation of independent variable of interest (x) 170 | if (is.null(by) == TRUE) { 171 | by <- abs(min(eval(parse(text = paste0("data$", x))), na.rm = TRUE) - 172 | max(eval(parse(text = paste0("data$", x))), na.rm = TRUE)) 173 | } 174 | 175 | variation <- seq(from = min(eval(parse(text = paste0("data$", x))), 176 | na.rm = TRUE), 177 | to = max(eval(parse(text = paste0("data$", x))), 178 | na.rm = TRUE), 179 | by 
= by) 180 | 181 | output[["Variation"]] <- variation 182 | 183 | # Length of sequence 184 | nseq <- length(variation) 185 | 186 | if (nseq == 1) { 187 | stop("Please supply a dataset or a x-variable with variation") 188 | } 189 | 190 | output[["nVariation"]] <- nseq 191 | 192 | # Choice categories of the dependent variable 193 | categories <- sort(unique(eval(parse(text = paste0("data$", dv))))) 194 | J <- length(categories) 195 | 196 | if (J < 3) { 197 | stop("Please supply a dataset with a dependent variable that has a sufficient number of outcomes (> 2)") 198 | } 199 | 200 | output[["ChoiceCategories"]] <- categories 201 | output[["nChoices"]] <- J 202 | 203 | # Numbers of interactions 204 | ninteraction <- sum(grepl(":", model$coefnames)) 205 | 206 | # Matrix of observations 207 | X <- matrix(NA, ncol = ncoef, nrow = obs) 208 | colnames(X) <- model$coefnames 209 | # 1 for the Intercept 210 | X[, 1] <- 1 211 | # Values of the independent variables 212 | X[, 2:(length(iv)+1)] <- as.matrix(data_redux[, iv]) 213 | 214 | 215 | # Prepare array to fill in the matrix with the observed values 216 | ovacases <- array(NA, c(dim(X), nseq)) 217 | # Fill in the matrices: 218 | ovacases[,,] <- X 219 | 220 | # Select the position of the variable which should vary: 221 | varidim <- which(colnames(X) == x) 222 | 223 | 224 | # Artificially alter the variable in each dimension according to 225 | # the preferred sequence: 226 | for (i in 1:nseq) { 227 | ovacases[, varidim, i] <- variation[i] 228 | } 229 | 230 | # Hold a second variable steady (if need be) 231 | if(is.null(z) == FALSE) { 232 | scendim <- which(colnames(X) == z) 233 | 234 | for (i in 1:nseq) { 235 | ovacases[, scendim, i] <- z_value 236 | } 237 | } 238 | 239 | # Compute interactions: 240 | if (ninteraction != 0) { 241 | 242 | # Get position of interaction terms 243 | interactionterms <- which(grepl(":", model$coefnames) == TRUE) 244 | 245 | # Compute the terms: 246 | for (i in c(interactionterms)) { 247 | # First 
variable name of the interaction: 248 | firstint <- gsub(":.*", "", model$coefnames[i]) 249 | # Second variable name of the interaction: 250 | secondint <- gsub(".*:", "", model$coefnames[i]) 251 | 252 | # Get position in matrix: 253 | intdim1 <- which(colnames(X) == firstint) 254 | intdim2 <- which(colnames(X) == secondint) 255 | 256 | # Compute interaction term: 257 | for(j in 1:nseq) { 258 | ovacases[, i, j] <- ovacases[, intdim1, j]*ovacases[, intdim2, j] 259 | } 260 | } 261 | } 262 | 263 | # Prepare array of observed values: 264 | ovaV <- array(NA, c(obs, nsim, nseq, J)) 265 | 266 | # Add progress bar 267 | pb_multiplication <- txtProgressBar(min = 0, max = nseq, initial = 0) 268 | 269 | # Loop over all scenarios 270 | cat("Multiplying values with simulated estimates:\n") 271 | 272 | # Loop over all scenarios 273 | for(i in 1:nseq){ 274 | ovaV[, , i, 1] <- apply(matrix(0, 275 | nrow = nsim, 276 | ncol = ncol(X)), 277 | 1, 278 | function(s) ovacases[, , i] %*% s) 279 | # ^ This will be zero because of the baseline category ^ 280 | 281 | # For each choice, the cases will now be multiplied with the simulated estimates 282 | for (k in 2:J) { 283 | coefstart <- (k-2)*ncoef+1 284 | coefend <- (k-1)*ncoef 285 | element <- parse(text = paste0("ovaV[,, i,", k, "] <- apply(S[, ", 286 | coefstart, 287 | ":", 288 | coefend, 289 | "], 1, function(s) ovacases[,, i] %*% s)")) 290 | eval(element) 291 | } 292 | 293 | # Progress bar: 294 | setTxtProgressBar(pb_multiplication, i) 295 | } 296 | 297 | # Multinomial link function: 298 | 299 | # 1. Part: Sum over cases 300 | Sexp <- rowSums(exp(ovaV), dims = 3L) 301 | 302 | # Create P (array with predictions) 303 | P <- array(NA, c(nsim, J, nseq)) 304 | 305 | # 2. 
Part: take the exponent and divide through the sum of all (Sexp) 306 | # Add progress bar 307 | pb_link <- txtProgressBar(min = 0, max = nseq, initial = 0) 308 | 309 | # Loop over all scenarios 310 | cat("\nApplying link function:\n") 311 | 312 | for (l in 1:nseq) { 313 | for (m in 1:J) { 314 | P[, m, l] <- colMeans(exp(ovaV[, , l, m]) / Sexp[, , l]) 315 | if (sum(is.na(P[, m, l])) != 0) { 316 | stop("Stop") 317 | } 318 | } 319 | 320 | # Progress bar: 321 | setTxtProgressBar(pb_link, l) 322 | } 323 | 324 | 325 | output[["P"]] <- P 326 | 327 | # Aggregate the simulations 328 | # Create tibble for plot 329 | # if (is.null(z_value) == TRUE) { 330 | # plotdat <- tibble::tibble(iv = rep(variation, J), 331 | # categories = rep(categories, each = length(variation)), 332 | # mean = NA, 333 | # lower = NA, 334 | # upper = NA) 335 | # } else { 336 | # plotdat <- tibble::tibble(iv = rep(variation, J), 337 | # categories = rep(categories, each = length(variation)), 338 | # scen = rep(z_value, each = length(categories)), 339 | # mean = NA, 340 | # lower = NA, 341 | # upper = NA) 342 | # } 343 | 344 | if (is.null(z_value) == TRUE) { 345 | plotdat <- data.frame(iv = rep(variation, J), 346 | categories = rep(categories, each = length(variation)), 347 | mean = NA, 348 | lower = NA, 349 | upper = NA) 350 | } else { 351 | plotdat <- data.frame(iv = rep(variation, J), 352 | categories = rep(categories, each = length(variation)), 353 | scen = rep(z_value, each = length(variation)), 354 | mean = NA, 355 | lower = NA, 356 | upper = NA) 357 | } 358 | 359 | 360 | 361 | # Aggregate 362 | start <- 1 363 | 364 | for (i in 1:J) { 365 | end <- i*length(variation) 366 | plotdat[c(start:end), "mean"] <- colMeans(P[, i,]) 367 | plotdat[c(start:end), "lower"] <- apply(P[, i,], 2, quantile, probs = probs[1]) 368 | plotdat[c(start:end), "upper"] <- apply(P[, i,], 2, quantile, probs = probs[2]) 369 | start <- end+1 370 | } 371 | 372 | # Rename the variables in the plot data 373 | if (is.null(z) == 
TRUE) { 374 | colnames(plotdat)[1:2] <- c(x, dv) 375 | } else { 376 | colnames(plotdat)[1:3] <- c(x, dv, z) 377 | } 378 | 379 | 380 | # Put the data in the output 381 | output[["plotdata"]] <- plotdat 382 | 383 | cat("\nDone!\n\n") 384 | 385 | return(output) 386 | } 387 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | bibliography: vignettes/bibliography.bib 4 | --- 5 | 6 | 7 | 8 | 9 | 10 | ```{r, include = FALSE} 11 | knitr::opts_chunk$set( 12 | collapse = TRUE, 13 | comment = "#>", 14 | fig.path = "man/figures/README-" 15 | ) 16 | ``` 17 | 18 | # MNLpred - Simulated Predictions From Multinomial Logistic Models 19 | 20 | 21 | [![GPLv3 license](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://perso.crans.org/besson/LICENSE.html) 22 | [![CRAN_Status_Badge](https://www.r-pkg.org/badges/version/MNLpred)](https://cran.r-project.org/package=MNLpred) 23 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.4525342.svg)](https://doi.org/10.5281/zenodo.4525342) 24 | [![downloads](https://cranlogs.r-pkg.org/badges/MNLpred)](https://www.r-pkg.org/badges/version/MNLpred) 25 | [![total_downloads](https://cranlogs.r-pkg.org/badges/grand-total/MNLpred)](https://www.r-pkg.org/badges/version/MNLpred) 26 | [![R build status](https://github.com/ManuelNeumann/MNLpred/workflows/R-CMD-check/badge.svg)](https://github.com/ManuelNeumann/MNLpred/actions) 27 | 28 | 29 | This package provides functions that make it easy to get plottable predictions from multinomial logit models. The predictions are based on simulated draws of regression estimates from their respective sampling distribution. 30 | 31 | At first I will present the theoretical and statistical background, before using sample data to demonstrate the functions of the package. 
32 | 33 | ## The Multinomial Logit Model 34 | 35 | For the statistical and theoretical background of the multinomial logit regression please refer to the vignette or sources like [these lecture notes by Germán Rodríguez](https://data.princeton.edu/wws509/notes/c6s2). 36 | 37 | Due to the inconvenience of integrating math equations in the README file, this is not the place to write comprehensively about it. 38 | 39 | These are the important characteristics of the model: 40 | 41 | - The multinomial logit regression is used to model nominal outcomes. It provides the opportunity to assign specific choices a probability, based on a set of independent variables. 42 | - The model needs an assigned baseline category to be identifiable. All other choices are evaluated in contrast to this reference. 43 | - The model returns a set of coefficients for each choice category. 44 | - Like all logit models, the multinomial logit model returns log-odds which are difficult to interpret in terms of effect sizes and uncertainties. 45 | 46 | This package helps to interpret the model in meaningful ways. 47 | 48 | ## Using the Package 49 | 50 | ### Installing 51 | 52 | The package can be both installed from CRAN or the github repository: 53 | 54 | ```{r} 55 | # Uncomment if necessary: 56 | 57 | # install.packages("MNLpred") 58 | # devtools::install_github("ManuelNeumann/MNLpred") 59 | ``` 60 | 61 | 62 | ### How Does the Function Work? 63 | 64 | As we have seen above, the multinomial logit can be used to get an insight into the probabilities to choose one option out of a set of alternatives. We have also seen that we need a baseline category to identify the model. This is mathematically necessary, but does not come in handy for purposes of interpretation. 65 | 66 | It is far more helpful and easier to understand to come up with predicted probabilities and first differences for values of interest [see e.g., @king2000 for approaches in social sciences]. 
Based on simulations, this package helps to easily predict probabilities and their uncertainty in the form of confidence intervals for each choice category over a specified scenario. The functions use the observed values to compute the predicted probabilities, as is recommended by @hanmer2013. 67 | 68 | The procedure follows the following steps: 69 | 70 | 1. Estimate a multinomial model and save the coefficients and the variance covariance matrix (based on the Hessian-matrix of the model). 71 | 1. To simulate uncertainty, make n draws of coefficients from a simulated sampling distribution based on the coefficients and the variance covariance matrix. 72 | 1. Predict probabilities by multiplying the drawn coefficients with a specified scenario (the observed values). 73 | 1. Take the mean and the quantiles of the simulated predicted probabilities. 74 | 75 | The presented functions follow these steps. Additionally, they use the so-called observed value approach. This means that the "scenario" uses all observed values that informed the model. Therefore the function takes these more detailed steps: 76 | 77 | 1. For all (complete) cases n predictions are computed based on their observed independent values and the n sets of coefficients. 78 | 1. Next, the predicted values of all observations for each simulation are averaged. 79 | 1. Take the mean and the quantiles of the simulated predicted probabilities (same as above). 80 | 81 | For first differences, the simulated predictions are subtracted from each other. 82 | 83 | To showcase these steps, I present a reproducible example of how the functions can be used. 84 | 85 | ### Example 86 | 87 | The example uses data from the German Longitudinal Election Study (GLES, @rosteutscher2019). 88 | 89 | The data contains 1,000 respondents' characteristics and their vote choice. 
90 | 91 | For this task, we need the following packages: 92 | 93 | ```{r setup} 94 | # Required packages 95 | library(magrittr) # for pipes 96 | library(nnet) # for the multinom()-function 97 | library(MASS) # for the multivariate normal distribution 98 | 99 | # The package 100 | library(MNLpred) 101 | 102 | # Plotting the predicted probabilities: 103 | library(ggplot2) 104 | library(scales) 105 | ``` 106 | 107 | Now we load the data: 108 | 109 | ```{r data, echo=TRUE} 110 | # The data: 111 | data("gles") 112 | ``` 113 | 114 | The next step is to compute the actual model. The function of the `MNLpred` package is based on models that were estimated with the `multinom()`-function of the `nnet` package. The `multinom()` function is convenient because it does not need transformed datasets. The syntax is very easy and resembles the ordinary regression functions. It is important that the Hessian matrix is returned with `Hess = TRUE`. The matrix is needed to simulate the sampling distribution. 115 | 116 | As we have seen above, we need a baseline or reference category for the model to work. Therefore, be aware of what your baseline category is. If you use a dependent variable that is of type `character`, the categories will be ordered in alphabetical order. If you have a `factor` at hand, you can define your baseline category, for example with the `relevel()` function. 117 | 118 | Now, let's estimate the model: 119 | 120 | ```{r model} 121 | # Multinomial logit model: 122 | mod1 <- multinom(vote ~ egoposition_immigration + 123 |                    political_interest + 124 |                    income + gender + ostwest, 125 |                  data = gles, 126 |                  Hess = TRUE) 127 | ``` 128 | The results show the coefficients and standard errors. As we can see, there are five sets of coefficients. They describe the relationship between the reference category (`AfD`) and the vote choices for the parties `CDU/CSU`, `FDP`, `Gruene`, `LINKE`, and `SPD`. 
129 | 130 | ```{r results} 131 | summary(mod1) 132 | ``` 133 | 134 | A first rough review of the coefficients shows that a more restrictive ego-position toward immigration leads to a lower probability of the voters to choose any other party than the AfD. It is hard to evaluate whether the effect is statistically significant and what the probabilities for each choice look like. For this it is helpful to predict the probabilities for certain scenarios and plot the means and confidence intervals for visual analysis. 135 | 136 | Let's say we are interested in the relationship between the ego-position toward immigration and the probability to choose any of the parties. It would be helpful to plot the predicted probabilities for the span of the positions. 137 | 138 | ```{r math} 139 | summary(gles$egoposition_immigration) 140 | ``` 141 | 142 | As we can see, the ego positions were recorded on a scale from 0 to 10. Higher numbers represent more restrictive positions. 143 | We pick this score as the x-variable (`x`) and use the `mnl_pred_ova()` function to get predicted probabilities for each position in this range. 144 | 145 | The function needs a multinomial logit model (`model`), data (`data`), the variable of interest `x`, the steps for which the probabilities should be predicted (`by`). Additionally, a `seed` can be defined for replication purposes, the numbers of simulations can be defined (`nsim`), and the confidence intervals (`probs`). 146 | 147 | If we want to hold another variable stable, we can specify so with `z` and `z_value`. See also the `mnl_fd_ova()` function below. 148 | 149 | ```{r mnl_pred_ova} 150 | pred1 <- mnl_pred_ova(model = mod1, 151 |                       data = gles, 152 |                       x = "egoposition_immigration", 153 |                       by = 1, 154 |                       seed = 68159, 155 |                       nsim = 100, # faster 156 |                       probs = c(0.025, 0.975)) # default 157 | ``` 158 | 159 | The function returns a list with several elements. 
Most importantly, it returns a `plotdata` data set: 160 | 161 | ```{r return} 162 | pred1$plotdata %>% head() 163 | ``` 164 | 165 | As we can see, it includes the range of the x variable, a mean, a lower, and an upper bound of the confidence interval. Concerning the choice category, the data is in a long format. This makes it easy to plot it with the `ggplot` syntax. The choice category can now easily be used to differentiate the lines in the plot by using `linetype = vote` in the `aes()`. Another option is to use `facet_wrap()` or `facet_grid()` to differentiate the predictions: 166 | 167 | ```{r prediction_plot1} 168 | ggplot(data = pred1$plotdata, aes(x = egoposition_immigration, 169 | y = mean, 170 | ymin = lower, ymax = upper)) + 171 | geom_ribbon(alpha = 0.1) + # Confidence intervals 172 | geom_line() + # Mean 173 | facet_wrap(.~ vote, scales = "free_y", ncol = 2) + 174 | scale_y_continuous(labels = percent_format(accuracy = 1)) + # % labels 175 | scale_x_continuous(breaks = c(0:10), 176 | minor_breaks = FALSE) + 177 | theme_bw() + 178 | labs(y = "Predicted probabilities", 179 | x = "Ego-position toward immigration") # Always label your axes ;) 180 | ``` 181 | 182 | If we want first differences between two scenarios, we can use the function `mnl_fd2_ova()`. The function takes similar arguments as the function above, but now the values for the scenarios of interest have to be supplied. Imagine we want to know what difference it makes to position oneself on the most tolerant or most restrictive end of the `egoposition_immigration` scale. This can be done as follows: 183 | 184 | ```{r static_fd} 185 | fdif1 <- mnl_fd2_ova(model = mod1, 186 | data = gles, 187 | x = "egoposition_immigration", 188 | value1 = min(gles$egoposition_immigration), 189 | value2 = max(gles$egoposition_immigration), 190 | seed = 68159, 191 | nsim = 100) 192 | ``` 193 | 194 | The first differences can then be depicted in a graph. 
195 | 196 | ```{r static_fd_plot} 197 | ggplot(fdif1$plotdata_fd, aes(x = categories, 198 | y = mean, 199 | ymin = lower, ymax = upper)) + 200 | geom_pointrange() + 201 | geom_hline(yintercept = 0) + 202 | scale_y_continuous(labels = percent_format()) + 203 | theme_bw() + 204 | labs(y = "Predicted probabilities", 205 | x = "Party vote") 206 | ``` 207 | 208 | 209 | We are often not only interested in the static difference, but the difference across a span of values, given a difference in a second variable. This is especially helpful when we look at dummy variables. For example, we could be interested in the effect of `gender` on the vote decision over the different ego-positions. With the `mnl_fd_ova()` function, we can predict the probabilities for two scenarios and subtract them. The function returns the differences and the confidence intervals of the differences. The different scenarios can be held stable with `z` and the `z_values`. `z_values` takes a vector of two numeric values. These values are held stable for the variable that is named in `z`. 210 | 211 | ```{r first_diffferences_prediction} 212 | fdif2 <- mnl_fd_ova(model = mod1, 213 | data = gles, 214 | x = "egoposition_immigration", 215 | by = 1, 216 | z = "gender", 217 | z_values = c(0,1), 218 | seed = 68159, 219 | nsim = 100) 220 | ``` 221 | As before, the function returns a list including a data set that can be used to plot the differences. 222 | 223 | ```{r fd_return} 224 | fdif2$plotdata_fd %>% head() 225 | ``` 226 | 227 | Since the function calls the `mnl_pred_ova()` function internally, it also returns the output of the two predictions in the list element `Prediction1` and `Prediction2`. The plot data for the predictions is already bound together row wise to easily plot the predicted probabilities. 
228 | 229 | ```{r prediction_plot2} 230 | ggplot(data = fdif2$plotdata, aes(x = egoposition_immigration, 231 | y = mean, 232 | ymin = lower, ymax = upper, 233 | group = as.factor(gender), 234 | linetype = as.factor(gender))) + 235 | geom_ribbon(alpha = 0.1) + 236 | geom_line() + 237 | facet_wrap(. ~ vote, scales = "free_y", ncol = 2) + 238 | scale_y_continuous(labels = percent_format(accuracy = 1)) + # % labels 239 | scale_x_continuous(breaks = c(0:10), 240 | minor_breaks = FALSE) + 241 | scale_linetype_discrete(name = "Gender", 242 | breaks = c(0, 1), 243 | labels = c("Male", "Female")) + 244 | theme_bw() + 245 | labs(y = "Predicted probabilities", 246 | x = "Ego-position toward immigration") # Always label your axes ;) 247 | ``` 248 | 249 | As we can see, the differences between `female` and `male` differ, depending on the party and ego-position. So let's take a look at the differences: 250 | 251 | ```{r first_differences_plot} 252 | ggplot(data = fdif2$plotdata_fd, aes(x = egoposition_immigration, 253 | y = mean, 254 | ymin = lower, ymax = upper)) + 255 | geom_ribbon(alpha = 0.1) + 256 | geom_line() + 257 | geom_hline(yintercept = 0) + 258 | facet_wrap(. ~ vote, ncol = 3) + 259 | scale_y_continuous(labels = percent_format(accuracy = 1)) + # % labels 260 | scale_x_continuous(breaks = c(0:10), 261 | minor_breaks = FALSE) + 262 | theme_bw() + 263 | labs(y = "Predicted probabilities", 264 | x = "Ego-position toward immigration") # Always label your axes ;) 265 | ``` 266 | 267 | We can see that the differences are for some parties at no point statistically significant from 0. 268 | 269 | 270 | ## Conclusion 271 | Multinomial logit models are important to model nominal choices. They are, however, restricted by being in need of a baseline category. Additionally, the log-character of the estimates makes it difficult to interpret them in meaningful ways. Predicting probabilities for all choices for scenarios, based on the observed data provides much more insight. 
The functions of this package provide easy to use functions that return data that can be used to plot predicted probabilities. The function uses a model from the `multinom()` function and uses the observed value approach and a supplied scenario to predict values over the range of fitting values. The functions simulate sampling distributions and therefore provide meaningful confidence intervals. `mnl_pred_ova()` can be used to predict probabilities for a certain scenario. `mnl_fd_ova()` can be used to predict probabilities for two scenarios and their first differences. 272 | 273 | ## Acknowledgment 274 | 275 | My code is inspired by the method courses in the [Political Science master's program at the University of Mannheim](https://www.sowi.uni-mannheim.de/en/academics/prospective-students/ma-in-political-science/)(cool place, check it out!). The skeleton of the code is based on a tutorial taught by [Marcel Neunhoeffer](https://www.marcel-neunhoeffer.com/) (lecture: "Advanced Quantitative Methods" by [Thomas Gschwend](https://www.sowi.uni-mannheim.de/gschwend/)). 
276 | 277 | ## DOI 278 | 279 | General DOI (always links to most recent version): [10.5281/zenodo.4525342](https://doi.org/10.5281/zenodo.4525342) 280 | 281 | DOIs for different versions: 282 | 283 | - Version 0.0.6: [10.5281/zenodo.4580427](https://doi.org/10.5281/zenodo.4580427) 284 | - Version 0.0.5: [10.5281/zenodo.4525343](https://doi.org/10.5281/zenodo.4525343) 285 | 286 | ## References 287 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # MNLpred - Simulated Predictions From Multinomial Logistic Models 5 | 6 | 7 | 8 | [![GPLv3 9 | license](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://perso.crans.org/besson/LICENSE.html) 10 | [![CRAN\_Status\_Badge](https://www.r-pkg.org/badges/version/MNLpred)](https://cran.r-project.org/package=MNLpred) 11 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.4525342.svg)](https://doi.org/10.5281/zenodo.4525342) 12 | [![downloads](https://cranlogs.r-pkg.org/badges/MNLpred)](https://www.r-pkg.org/badges/version/MNLpred) 13 | [![total\_downloads](https://cranlogs.r-pkg.org/badges/grand-total/MNLpred)](https://www.r-pkg.org/badges/version/MNLpred) 14 | [![R build 15 | status](https://github.com/ManuelNeumann/MNLpred/workflows/R-CMD-check/badge.svg)](https://github.com/ManuelNeumann/MNLpred/actions) 16 | 17 | 18 | This package provides functions that make it easy to get plottable 19 | predictions from multinomial logit models. The predictions are based on 20 | simulated draws of regression estimates from their respective sampling 21 | distribution. 22 | 23 | At first I will present the theoretical and statistical background, 24 | before using sample data to demonstrate the functions of the package. 
25 | 26 | ## The Multinomial Logit Model 27 | 28 | For the statistical and theoretical background of the multinomial logit 29 | regression please refer to the vignette or sources like [these lecture 30 | notes by Germán 31 | Rodríguez](https://data.princeton.edu/wws509/notes/c6s2). 32 | 33 | Due to the inconvenience of integrating math equations in the README 34 | file, this is not the place to write comprehensively about it. 35 | 36 | These are the important characteristics of the model: 37 | 38 | - The multinomial logit regression is used to model nominal outcomes. 39 | It provides the opportunity to assign specific choices a 40 | probability, based on a set of independent variables. 41 | - The model needs an assigned baseline category to be identifiable. 42 | All other choices are evaluated in contrast to this reference. 43 | - The model returns a set of coefficients for each choice category. 44 | - Like all logit models, the multinomial logit model returns log-odds 45 | which are difficult to interpret in terms of effect sizes and 46 | uncertainties. 47 | 48 | This package helps to interpret the model in meaningful ways. 49 | 50 | ## Using the Package 51 | 52 | ### Installing 53 | 54 | The package can be both installed from CRAN or the github repository: 55 | 56 | ``` r 57 | # Uncomment if necessary: 58 | 59 | # install.packages("MNLpred") 60 | # devtools::install_github("ManuelNeumann/MNLpred") 61 | ``` 62 | 63 | ### How Does the Function Work? 64 | 65 | As we have seen above, the multinomial logit can be used to get an 66 | insight into the probabilities to choose one option out of a set of 67 | alternatives. We have also seen that we need a baseline category to 68 | identify the model. This is mathematically necessary, but does not come 69 | in handy for purposes of interpretation. 
70 | 71 | It is far more helpful and easier to understand to come up with 72 | predicted probabilities and first differences for values of interest 73 | (see e.g., King, Tomz, and Wittenberg 2000 for approaches in social 74 | sciences). Based on simulations, this package helps to easily predict 75 | probabilities and their uncertainty in forms of confidence intervals for 76 | each choice category over a specified scenario. The functions use the 77 | observed values to compute the predicted probabilities, as is 78 | recommended by Hanmer and Ozan Kalkan (2013). 79 | 80 | The procedure follows the following steps: 81 | 82 | 1. Estimate a multinomial model and save the coefficients and the 83 | variance covariance matrix (based on the Hessian-matrix of the 84 | model). 85 | 2. To simulate uncertainty, make n draws of coefficients from a 86 | simulated sampling distribution based on the coefficients and the 87 | variance covariance matrix. 88 | 3. Predict probabilities by multiplying the drawn coefficients with a 89 | specified scenario (the observed values). 90 | 4. Take the mean and the quantiles of the simulated predicted 91 | probabilities. 92 | 93 | The presented functions follow these steps. Additionally, they use the 94 | so called observed value approach. This means that the “scenario” uses 95 | all observed values that informed the model. Therefore the function 96 | takes these more detailed steps: 97 | 98 | 1. For all (complete) cases n predictions are computed based on their 99 | observed independent values and the n sets of coefficients. 100 | 2. Next, the predicted values of all observations for each simulation 101 | are averaged. 102 | 3. Take the mean and the quantiles of the simulated predicted 103 | probabilities (same as above). 104 | 105 | For first differences, the simulated predictions are subtracted from 106 | each other. 107 | 108 | To showcase these steps, I present a reproducible example of how the 109 | functions can be used. 
110 | 111 | ### Example 112 | 113 | The example uses data from the German Longitudinal Election Study (GLES, 114 | Roßteutscher et al. (2019)). 115 | 116 | The contains 1,000 respondents characteristics and their vote choice. 117 | 118 | For this task, we need the following packages: 119 | 120 | ``` r 121 | # Required packages 122 | library(magrittr) # for pipes 123 | library(nnet) # for the multinom()-function 124 | library(MASS) # for the multivariate normal distribution 125 | 126 | # The package 127 | library(MNLpred) 128 | 129 | # Plotting the predicted probabilities: 130 | library(ggplot2) 131 | library(scales) 132 | ``` 133 | 134 | Now we load the data: 135 | 136 | ``` r 137 | # The data: 138 | data("gles") 139 | ``` 140 | 141 | The next step is to compute the actual model. The function of the 142 | `MNLpred` package is based on models that were estimated with the 143 | `multinom()`-function of the `nnet` package. The `multinom()` function 144 | is convenient because it does not need transformed datasets. The syntax 145 | is very easy and resembles the ordinary regression functions. Important 146 | is that the Hessian matrix is returned with `Hess = TRUE`. The matrix is 147 | needed to simulate the sampling distribution. 148 | 149 | As we have seen above, we need a baseline or reference category for the 150 | model to work. Therefore, be aware what your baseline category is. If 151 | you use a dependent variable that is of type `character`, the categories 152 | will be ordered in alphabetical order. If you have a `factor`at hand, 153 | you can define your baseline category, for example with the 154 | `relevel()`function. 
155 | 156 | Now, let’s estimate the model: 157 | 158 | ``` r 159 | # Multinomial logit model: 160 | mod1 <- multinom(vote ~ egoposition_immigration + 161 | political_interest + 162 | income + gender + ostwest, 163 | data = gles, 164 | Hess = TRUE) 165 | #> # weights: 42 (30 variable) 166 | #> initial value 1791.759469 167 | #> iter 10 value 1644.501289 168 | #> iter 20 value 1553.803188 169 | #> iter 30 value 1538.792079 170 | #> final value 1537.906674 171 | #> converged 172 | ``` 173 | 174 | The results show the coefficients and standard errors. As we can see, 175 | there are five sets of coefficients. They describe the relationship 176 | between the reference category (`AfD`) and the vote choices for the 177 | parties `CDU/CSU`, `FDP`, `Gruene`, `LINKE`, and `SPD`. 178 | 179 | ``` r 180 | summary(mod1) 181 | #> Call: 182 | #> multinom(formula = vote ~ egoposition_immigration + political_interest + 183 | #> income + gender + ostwest, data = gles, Hess = TRUE) 184 | #> 185 | #> Coefficients: 186 | #> (Intercept) egoposition_immigration political_interest income 187 | #> CDU/CSU 3.101201 -0.4419104 -0.29177070 0.33114348 188 | #> FDP 2.070618 -0.4106626 -0.19044703 0.18496691 189 | #> Gruene 3.232074 -0.8482213 -0.03023454 0.24330589 190 | #> LINKE 4.990008 -0.7477359 -0.04503371 -0.24206850 191 | #> SPD 3.799394 -0.6425427 -0.03514426 0.08211066 192 | #> gender ostwest 193 | #> CDU/CSU 1.296949 0.79760035 194 | #> FDP 1.252112 1.01378955 195 | #> Gruene 1.831714 0.76299897 196 | #> LINKE 1.368591 -0.02428322 197 | #> SPD 1.497019 0.74026388 198 | #> 199 | #> Std. 
Errors: 200 | #> (Intercept) egoposition_immigration political_interest income 201 | #> CDU/CSU 0.8568928 0.06504100 0.1694270 0.1872533 202 | #> FDP 0.9508589 0.07083385 0.1882974 0.2063590 203 | #> Gruene 0.9854950 0.07887427 0.1969704 0.2149368 204 | #> LINKE 0.9505656 0.07755359 0.1962954 0.2058036 205 | #> SPD 0.8880256 0.06912678 0.1779570 0.1924002 206 | #> gender ostwest 207 | #> CDU/CSU 0.3530225 0.3120178 208 | #> FDP 0.3794295 0.3615071 209 | #> Gruene 0.3875116 0.3671397 210 | #> LINKE 0.3885663 0.3493941 211 | #> SPD 0.3625652 0.3269007 212 | #> 213 | #> Residual Deviance: 3075.813 214 | #> AIC: 3135.813 215 | ``` 216 | 217 | A first rough review of the coefficients shows that a more restrictive 218 | ego-position toward immigration leads to a lower probability of the 219 | voters to choose any other party than the AfD. It is hard to evaluate 220 | whether the effect is statistically significant and how the 221 | probabilities for each choice look like. For this it is helpful to 222 | predict the probabilities for certain scenarios and plot the means and 223 | confidence intervals for visual analysis. 224 | 225 | Let’s say we are interested in the relationship between the ego-position 226 | toward immigration and the probability to choose any of the parties. It 227 | would be helpful to plot the predicted probabilities for the span of the 228 | positions. 229 | 230 | ``` r 231 | summary(gles$egoposition_immigration) 232 | #> Min. 1st Qu. Median Mean 3rd Qu. Max. 233 | #> 0.000 3.000 4.000 4.361 6.000 10.000 234 | ``` 235 | 236 | As we can see, the ego positions were recorded on a scale from 0 to 10. 237 | Higher numbers represent more restrictive positions. We pick this score 238 | as the x-variable (`x`) and use the `mnl_pred_ova()` function to get 239 | predicted probabilities for each position in this range. 
240 | 241 | The function needs a multinomial logit model (`model`), data (`data`), 242 | the variable of interest `x`, the steps for which the probabilities 243 | should be predicted (`by`). Additionally, a `seed` can be defined for 244 | replication purposes, the numbers of simulations can be defined 245 | (`nsim`), and the confidence intervals (`probs`). 246 | 247 | If we want to hold another variable stable, we can specify so with 248 | `z`and `z_value`. See also the `mnl_fd_ova()` function below. 249 | 250 | ``` r 251 | pred1 <- mnl_pred_ova(model = mod1, 252 | data = gles, 253 | x = "egoposition_immigration", 254 | by = 1, 255 | seed = 68159, 256 | nsim = 100, # faster 257 | probs = c(0.025, 0.975)) # default 258 | #> Multiplying values with simulated estimates: 259 | #> ================================================================================ 260 | #> Applying link function: 261 | #> ================================================================================ 262 | #> Done! 263 | ``` 264 | 265 | The function returns a list with several elements. Most importantly, it 266 | returns a `plotdata` data set: 267 | 268 | ``` r 269 | pred1$plotdata %>% head() 270 | #> egoposition_immigration vote mean lower upper 271 | #> 1 0 AfD 0.002419192 0.001025942 0.004913258 272 | #> 2 1 AfD 0.004625108 0.002172854 0.008685788 273 | #> 3 2 AfD 0.008653845 0.004472502 0.014698651 274 | #> 4 3 AfD 0.015796304 0.008923480 0.025148142 275 | #> 5 4 AfD 0.028022769 0.017756232 0.040861062 276 | #> 6 5 AfD 0.048081830 0.033241425 0.063794110 277 | ``` 278 | 279 | As we can see, it includes the range of the x variable, a mean, a lower, 280 | and an upper bound of the confidence interval. Concerning the choice 281 | category, the data is in a long format. This makes it easy to plot it 282 | with the `ggplot` syntax. The choice category can now easily be used to 283 | differentiate the lines in the plot by using `linetype = vote` in the 284 | `aes()`. 
Another option is to use `facet_wrap()` or `facet_grid()` to 285 | differentiate the predictions: 286 | 287 | ``` r 288 | ggplot(data = pred1$plotdata, aes(x = egoposition_immigration, 289 | y = mean, 290 | ymin = lower, ymax = upper)) + 291 | geom_ribbon(alpha = 0.1) + # Confidence intervals 292 | geom_line() + # Mean 293 | facet_wrap(.~ vote, scales = "free_y", ncol = 2) + 294 | scale_y_continuous(labels = percent_format(accuracy = 1)) + # % labels 295 | scale_x_continuous(breaks = c(0:10), 296 | minor_breaks = FALSE) + 297 | theme_bw() + 298 | labs(y = "Predicted probabilities", 299 | x = "Ego-position toward immigration") # Always label your axes ;) 300 | ``` 301 | 302 | ![](man/figures/README-prediction_plot1-1.png) 303 | 304 | If we want first differences between two scenarios, we can use the 305 | function `mnl_fd2_ova()`. The function takes similar arguments as the 306 | function above, but now the values for the scenarios of interest have to 307 | be supplied. Imagine we want to know what difference it makes to 308 | position oneself on the most tolerant or most restrictive end of the 309 | `egoposition_immigration` scale. This can be done as follows: 310 | 311 | ``` r 312 | fdif1 <- mnl_fd2_ova(model = mod1, 313 | data = gles, 314 | x = "egoposition_immigration", 315 | value1 = min(gles$egoposition_immigration), 316 | value2 = max(gles$egoposition_immigration), 317 | seed = 68159, 318 | nsim = 100) 319 | #> Multiplying values with simulated estimates: 320 | #> ================================================================================ 321 | #> Applying link function: 322 | #> ================================================================================ 323 | #> Done! 324 | ``` 325 | 326 | The first differences can then be depicted in a graph. 
327 | 328 | ``` r 329 | ggplot(fdif1$plotdata_fd, aes(x = categories, 330 | y = mean, 331 | ymin = lower, ymax = upper)) + 332 | geom_pointrange() + 333 | geom_hline(yintercept = 0) + 334 | scale_y_continuous(labels = percent_format()) + 335 | theme_bw() + 336 | labs(y = "Predicted probabilities", 337 | x = "Party vote") 338 | ``` 339 | 340 | ![](man/figures/README-static_fd_plot-1.png) 341 | 342 | We are often not only interested in the static difference, but the 343 | difference across a span of values, given a difference in a second 344 | variable. This is especially helpful when we look at dummy variables. 345 | For example, we could be interested in the effect of `gender` on the 346 | vote decision over the different ego-positions. With the `mnl_fd_ova()` 347 | function, we can predict the probabilities for two scenarios and 348 | subtract them. The function returns the differences and the confidence 349 | intervals of the differences. The different scenarios can be held stable 350 | with `z` and the `z_values`. `z_values` takes a vector of two numeric 351 | values. These values are held stable for the variable that is named in 352 | `z`. 353 | 354 | ``` r 355 | fdif2 <- mnl_fd_ova(model = mod1, 356 | data = gles, 357 | x = "egoposition_immigration", 358 | by = 1, 359 | z = "gender", 360 | z_values = c(0,1), 361 | seed = 68159, 362 | nsim = 100) 363 | #> First scenario: 364 | #> Multiplying values with simulated estimates: 365 | #> ================================================================================ 366 | #> Applying link function: 367 | #> ================================================================================ 368 | #> Done! 369 | #> 370 | #> Second scenario: 371 | #> Multiplying values with simulated estimates: 372 | #> ================================================================================ 373 | #> Applying link function: 374 | #> ================================================================================ 375 | #> Done! 
376 | ``` 377 | 378 | As before, the function returns a list including a data set that can be 379 | used to plot the differences. 380 | 381 | ``` r 382 | fdif2$plotdata_fd %>% head() 383 | #> egoposition_immigration vote mean lower upper 384 | #> 1 0 AfD -0.002861681 -0.006172786 -0.001206796 385 | #> 2 1 AfD -0.005396410 -0.010531302 -0.002537904 386 | #> 3 2 AfD -0.009942124 -0.017921934 -0.005038667 387 | #> 4 3 AfD -0.017827751 -0.029548569 -0.009657235 388 | #> 5 4 AfD -0.030957366 -0.046960384 -0.017810853 389 | #> 6 5 AfD -0.051691516 -0.071903589 -0.031438906 390 | ``` 391 | 392 | Since the function calls the `mnl_pred_ova()` function internally, it 393 | also returns the output of the two predictions in the list element 394 | `Prediction1` and `Prediction2`. The plot data for the predictions is 395 | already bound together row wise to easily plot the predicted 396 | probabilities. 397 | 398 | ``` r 399 | ggplot(data = fdif2$plotdata, aes(x = egoposition_immigration, 400 | y = mean, 401 | ymin = lower, ymax = upper, 402 | group = as.factor(gender), 403 | linetype = as.factor(gender))) + 404 | geom_ribbon(alpha = 0.1) + 405 | geom_line() + 406 | facet_wrap(. ~ vote, scales = "free_y", ncol = 2) + 407 | scale_y_continuous(labels = percent_format(accuracy = 1)) + # % labels 408 | scale_x_continuous(breaks = c(0:10), 409 | minor_breaks = FALSE) + 410 | scale_linetype_discrete(name = "Gender", 411 | breaks = c(0, 1), 412 | labels = c("Male", "Female")) + 413 | theme_bw() + 414 | labs(y = "Predicted probabilities", 415 | x = "Ego-position toward immigration") # Always label your axes ;) 416 | ``` 417 | 418 | ![](man/figures/README-prediction_plot2-1.png) 419 | 420 | As we can see, the differences between `female` and `male` differ, 421 | depending on the party and ego-position. 
So let’s take a look at the 422 | differences: 423 | 424 | ``` r 425 | ggplot(data = fdif2$plotdata_fd, aes(x = egoposition_immigration, 426 | y = mean, 427 | ymin = lower, ymax = upper)) + 428 | geom_ribbon(alpha = 0.1) + 429 | geom_line() + 430 | geom_hline(yintercept = 0) + 431 | facet_wrap(. ~ vote, ncol = 3) + 432 | scale_y_continuous(labels = percent_format(accuracy = 1)) + # % labels 433 | scale_x_continuous(breaks = c(0:10), 434 | minor_breaks = FALSE) + 435 | theme_bw() + 436 | labs(y = "Predicted probabilities", 437 | x = "Ego-position toward immigration") # Always label your axes ;) 438 | ``` 439 | 440 | ![](man/figures/README-first_differences_plot-1.png) 441 | 442 | We can see that the differences are for some parties at no point 443 | statistically significant from 0. 444 | 445 | ## Conclusion 446 | 447 | Multinomial logit models are important to model nominal choices. They 448 | are, however, restricted by being in need of a baseline category. 449 | Additionally, the log-character of the estimates makes it difficult to 450 | interpret them in meaningful ways. Predicting probabilities for all 451 | choices for scenarios, based on the observed data provides much more 452 | insight. The functions of this package provide easy to use functions 453 | that return data that can be used to plot predicted probabilities. The 454 | function uses a model from the `multinom()` function and uses the 455 | observed value approach and a supplied scenario to predict values over 456 | the range of fitting values. The functions simulate sampling 457 | distributions and therefore provide meaningful confidence intervals. 458 | `mnl_pred_ova()` can be used to predict probabilities for a certain 459 | scenario. `mnl_fd_ova()` can be used to predict probabilities for two 460 | scenarios and their first differences. 
461 | 462 | ## Acknowledgment 463 | 464 | My code is inspired by the method courses in the [Political Science 465 | master’s program at the University of 466 | Mannheim](https://www.sowi.uni-mannheim.de/en/academics/prospective-students/ma-in-political-science/)(cool 467 | place, check it out!). The skeleton of the code is based on a tutorial 468 | taught by [Marcel Neunhoeffer](https://www.marcel-neunhoeffer.com/) 469 | (lecture: “Advanced Quantitative Methods” by [Thomas 470 | Gschwend](https://www.sowi.uni-mannheim.de/gschwend/)). 471 | 472 | ## DOI 473 | 474 | General DOI (always links to most recent version): 475 | [10.5281/zenodo.4525342](https://doi.org/10.5281/zenodo.4525342) 476 | 477 | DOIs for different versions: 478 | 479 | - Version 0.0.6: 480 | [10.5281/zenodo.4580427](https://doi.org/10.5281/zenodo.4580427) 481 | - Version 0.0.5: 482 | [10.5281/zenodo.4525343](https://doi.org/10.5281/zenodo.4525343) 483 | 484 | ## References 485 | 486 |
487 | 488 |
489 | 490 | Hanmer, Michael J., and Kerem Ozan Kalkan. 2013. “Behind the Curve: 491 | Clarifying the Best Approach to Calculating Predicted Probabilities and 492 | Marginal Effects from Limited Dependent Variable Models.” *American 493 | Journal of Political Science* 57 (1): 263–77. 494 | . 495 | 496 |
497 | 498 |
499 | 500 | King, Gary, Michael Tomz, and Jason Wittenberg. 2000. “Making the Most 501 | of Statistical Analyses: Improving Interpretation and Presentation.” 502 | *American Journal of Political Science* 44 (2): 341–55. 503 | . 504 | 505 |
506 | 507 |
508 | 509 | Roßteutscher, Sigrid, Harald Schoen, Rüdiger Schmitt-Beck, Christof 510 | Wolf, and Alexander Staudt. 2019. “Rolling Cross-Section-Wahlkampfstudie 511 | Mit Nachwahl-Panelwelle (GLES 2017).” GESIS Datenarchiv. 512 | . 513 | 514 |
515 | 516 |
517 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## Submission 2 | 3 | This is a submission for a new patch release (0.0.8). 4 | 5 | The 0.0.8 release fixes a bug where the functions stopped because of an erroneously 6 | triggered error message. 7 | 8 | The resubmission is necessary because a test failed in a 9 | r-devel-linux-x86_64-fedora-gcc environment. This test is now skipped because 10 | the error is not replicable on a local fedora-gcc-devel container. 11 | 12 | 13 | ## Test environments 14 | 15 | * Windows Server 2008 R2 SP1, R-devel, 32/64 bit (R-hub) 16 | * Windows x86_64-w64-mingw32 (64-bit), R-devel (win-builder) 17 | * windows latest, release (github actions) 18 | * Ubuntu 20.04, release (github actions) 19 | * Ubuntu 20.04, devel (github actions) 20 | * Fedora Linux, R-devel, GCC (R-hub) 21 | * macOS latest, release (github actions) 22 | 23 | 24 | ## R CMD check results 25 | There were no ERRORs or WARNINGs. 26 | 27 | There was one NOTE: 28 | 29 | Maintainer: 'Manuel Neumann ' 30 | 31 | Days since last update: 3 32 | 33 | Found the following (possibly) invalid URLs: 34 | URL: https://doi.org/10.2307/2669316 35 | From: inst/doc/OVA_Predictions_For_MNL.html 36 | README.md 37 | Status: 403 38 | Message: Forbidden 39 | URL: https://doi.org/10.4232/1.13213 40 | From: inst/doc/OVA_Predictions_For_MNL.html 41 | README.md 42 | Status: Error 43 | Message: SSL certificate problem: unable to get local issuer certificate 44 | URL: https://www.doi.org/10.4232/1.13213 45 | From: man/gles.Rd 46 | Status: Error 47 | Message: SSL certificate problem: unable to get local issuer certificate 48 | 49 | Found the following (possibly) invalid DOIs: 50 | DOI: 10.2307/2669316 51 | From: DESCRIPTION 52 | Status: Forbidden 53 | Message: 403 54 | 55 | * All DOIs and URLs were manually checked and are valid. 
56 | 57 | 58 | ## Downstream dependencies 59 | There are currently no downstream dependencies for this package. 60 | -------------------------------------------------------------------------------- /data/gles.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ManuelNeumann/MNLpred/119672dcffea9b9a0e0caf187894b6f9fac8bb33/data/gles.RData -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | citHeader("To cite MNLpred in publications please use:") 2 | 3 | citEntry(entry = "Manual", 4 | title = "MNLpred - Simulated Predicted Probabilities for Multinomial Logit Models", 5 | author = as.person("Manuel Neumann"), 6 | year = "2021", 7 | url = "https://CRAN.R-project.org/package=MNLpred", 8 | note = "Version 0.0.8", 9 | doi = "10.5281/zenodo.4525342", 10 | textVersion = 11 | paste("Neumann, Manuel (2021).", 12 | "MNLpred - Simulated Predicted Probabilities for Multinomial Logit Models", 13 | "(Version 0.0.8).", 14 | "DOI 10.5281/zenodo.4525342.", 15 | "URL https://CRAN.R-project.org/package=MNLpred.") 16 | ) 17 | -------------------------------------------------------------------------------- /man/figures/README-first_differences_plot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ManuelNeumann/MNLpred/119672dcffea9b9a0e0caf187894b6f9fac8bb33/man/figures/README-first_differences_plot-1.png -------------------------------------------------------------------------------- /man/figures/README-prediction_plot1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ManuelNeumann/MNLpred/119672dcffea9b9a0e0caf187894b6f9fac8bb33/man/figures/README-prediction_plot1-1.png
-------------------------------------------------------------------------------- /man/figures/README-prediction_plot2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ManuelNeumann/MNLpred/119672dcffea9b9a0e0caf187894b6f9fac8bb33/man/figures/README-prediction_plot2-1.png -------------------------------------------------------------------------------- /man/figures/README-static_fd_plot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ManuelNeumann/MNLpred/119672dcffea9b9a0e0caf187894b6f9fac8bb33/man/figures/README-static_fd_plot-1.png -------------------------------------------------------------------------------- /man/gles.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gles-data.R 3 | \docType{data} 4 | \name{gles} 5 | \alias{gles} 6 | \title{German Longitudinal Election Study} 7 | \format{ 8 | A data frame with 1,000 observations and 6 variables: 9 | \describe{ 10 | \item{vote}{Voting decision for party} 11 | \item{egoposition_immigration}{Ego-position toward immigration (0 = very open to 10 = very restrictive )} 12 | \item{ostwest}{Dummy for respondents from Eastern Germany (= 1)} 13 | \item{political_interest}{Measurement for political interest (0 = low, 4 = high)} 14 | \item{income}{Self-reported income satisfaction (0 = low, 4 = high)} 15 | \item{gender}{Self-reported gender (binary coding with 1 = female)} 16 | } 17 | } 18 | \source{ 19 | \href{https://www.doi.org/10.4232/1.13213}{GESIS Datenarchiv} 20 | } 21 | \usage{ 22 | data(gles) 23 | } 24 | \description{ 25 | A sample of 1,000 respondents in the Rolling Cross Sectional 26 | study in the German Longitudinal Election Study in 2017.
27 | } 28 | \examples{ 29 | data(gles) 30 | table(gles$vote) 31 | } 32 | \references{ 33 | Roßteutscher, Sigrid et al. 2019. 34 | “Rolling Cross-Section-Wahlkampfstudie mit Nachwahl-Panelwelle (GLES 2017).” 35 | ZA6803 Datenfile Version 4.0.1. 36 | (\href{https://www.doi.org/10.4232/1.13213}{GESIS Datenarchiv}). 37 | } 38 | \keyword{datasets} 39 | -------------------------------------------------------------------------------- /man/mnl_fd2_ova.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/mnl_fd2_ova.R 3 | \name{mnl_fd2_ova} 4 | \alias{mnl_fd2_ova} 5 | \title{Multinomial First Differences Predictions For Two Values (Observed Value Approach)} 6 | \usage{ 7 | mnl_fd2_ova( 8 | model, 9 | data, 10 | x, 11 | value1, 12 | value2, 13 | xvari, 14 | nsim = 1000, 15 | seed = "random", 16 | probs = c(0.025, 0.975) 17 | ) 18 | } 19 | \arguments{ 20 | \item{model}{the multinomial model, from a \code{\link{multinom}}()-function call (see the \code{\link{nnet}} package)} 21 | 22 | \item{data}{the data with which the model was estimated} 23 | 24 | \item{x}{the name of the variable that should be varied} 25 | 26 | \item{value1}{first value for the difference} 27 | 28 | \item{value2}{second value for the difference} 29 | 30 | \item{xvari}{former argument for \code{x} (deprecated).} 31 | 32 | \item{nsim}{numbers of simulations} 33 | 34 | \item{seed}{set a seed for replication purposes.} 35 | 36 | \item{probs}{a vector with two numbers, defining the significance levels. Default to 5\% significance level: \code{c(0.025, 0.975)}} 37 | } 38 | \value{ 39 | The function returns a list with several elements. Most importantly the list includes the simulated draws `S`, the simulated predictions `P`, the first differences of the predictions `P_fd`, a data set for plotting `plotdata` the predicted probabilities, and one for the first differences `plotdata_fd`. 
40 | } 41 | \description{ 42 | Multinomial First Differences Predictions For Two Values (Observed Value Approach) 43 | } 44 | \examples{ 45 | library(nnet) 46 | library(MASS) 47 | 48 | dataset <- data.frame(y = c(rep("a", 10), rep("b", 10), rep("c", 10)), 49 | x1 = rnorm(30), 50 | x2 = rnorm(30, mean = 1), 51 | x3 = sample(1:10, 30, replace = TRUE)) 52 | 53 | mod <- multinom(y ~ x1 + x2 + x3, data = dataset, Hess = TRUE) 54 | 55 | fdi1 <- mnl_fd2_ova(model = mod, data = dataset, 56 | x = "x1", 57 | value1 = min(dataset$x1), 58 | value2 = max(dataset$x1), 59 | nsim = 10) 60 | 61 | 62 | 63 | } 64 | -------------------------------------------------------------------------------- /man/mnl_fd_ova.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/mnl_fd_ova.R 3 | \name{mnl_fd_ova} 4 | \alias{mnl_fd_ova} 5 | \title{Multinomial First Differences Prediction (Observed Value Approach)} 6 | \usage{ 7 | mnl_fd_ova( 8 | model, 9 | data, 10 | x, 11 | z, 12 | z_values, 13 | xvari, 14 | scenname, 15 | scenvalues, 16 | by = NULL, 17 | nsim = 1000, 18 | seed = "random", 19 | probs = c(0.025, 0.975) 20 | ) 21 | } 22 | \arguments{ 23 | \item{model}{the multinomial model, from a \code{\link{multinom}}()-function call (see the \code{\link{nnet}} package)} 24 | 25 | \item{data}{the data with which the model was estimated} 26 | 27 | \item{x}{the name of the variable that should be varied (the x-axis variable in prediction plots)} 28 | 29 | \item{z}{define the variable for which you want to compute the difference.} 30 | 31 | \item{z_values}{determine the two values at which value you want to fix the scenario (\code{z}). 
The first differences will be computed by subtracting the values of the first supplied scenario from the second one.} 32 | 33 | \item{xvari}{former argument for \code{x} (deprecated).} 34 | 35 | \item{scenname}{former argument for \code{z} (deprecated).} 36 | 37 | \item{scenvalues}{former argument for \code{z_values} (deprecated).} 38 | 39 | \item{by}{define the steps of \code{x}.} 40 | 41 | \item{nsim}{numbers of simulations} 42 | 43 | \item{seed}{set a seed for replication purposes.} 44 | 45 | \item{probs}{a vector with two numbers, defining the significance levels. Default to 5\% significance level: \code{c(0.025, 0.975)}} 46 | } 47 | \value{ 48 | The function returns a list with several elements. Most importantly the list includes the simulated draws `S`, the simulated predictions `P`, and a data set for plotting `plotdata`. 49 | } 50 | \description{ 51 | This function predicts values for two different scenarios over a range of 52 | values. It then takes the differences between the different simulations to 53 | return first differences for each value. 54 | } 55 | \details{ 56 | The function uses the \code{\link{mnl_pred_ova}} function for each scenario. 57 | The results of these predictions are also returned and can therefore be 58 | easily accessed. If you need predictions for multiple scenarios, you can use 59 | this function to both plot the predictions for each scenario and the 60 | differences between them. 
61 | } 62 | \examples{ 63 | library(nnet) 64 | library(MASS) 65 | 66 | dataset <- data.frame(y = c(rep("a", 10), rep("b", 10), rep("c", 10)), 67 | x1 = rnorm(30), 68 | x2 = rnorm(30, mean = 1), 69 | x3 = sample(1:10, 30, replace = TRUE)) 70 | 71 | mod <- multinom(y ~ x1 + x2 + x3, data = dataset, Hess = TRUE) 72 | 73 | fdif <- mnl_fd_ova(model = mod, data = dataset, 74 | x = "x1", z = "x3", 75 | z_values = c(min(dataset$x3), max(dataset$x3)), 76 | nsim = 10) 77 | 78 | } 79 | -------------------------------------------------------------------------------- /man/mnl_pred_ova.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/mnl_pred_ova.R 3 | \name{mnl_pred_ova} 4 | \alias{mnl_pred_ova} 5 | \title{Multinomial Prediction Function (Observed Value Approach)} 6 | \usage{ 7 | mnl_pred_ova( 8 | model, 9 | data, 10 | x, 11 | by = NULL, 12 | z = NULL, 13 | z_value = NULL, 14 | xvari, 15 | scenname, 16 | scenvalue, 17 | nsim = 1000, 18 | seed = "random", 19 | probs = c(0.025, 0.975) 20 | ) 21 | } 22 | \arguments{ 23 | \item{model}{the multinomial model, from a \code{\link{multinom}}()-function call (see the \code{\link{nnet}} package)} 24 | 25 | \item{data}{the data with which the model was estimated} 26 | 27 | \item{x}{the name of the variable that should be varied (the x-axis variable in prediction plots)} 28 | 29 | \item{by}{define the steps of \code{x}.} 30 | 31 | \item{z}{if you want to hold a specific variable stable over all scenarios, you can name it here (optional).} 32 | 33 | \item{z_value}{determine at which value you want to fix the \code{z}.} 34 | 35 | \item{xvari}{former argument for \code{x} (deprecated).} 36 | 37 | \item{scenname}{former argument for \code{z} (deprecated).} 38 | 39 | \item{scenvalue}{former argument for \code{z_value} (deprecated).} 40 | 41 | \item{nsim}{numbers of simulations} 42 | 43 | \item{seed}{set a seed for replication 
purposes.} 44 | 45 | \item{probs}{a vector with two numbers, defining the significance levels. Default to 5\% significance level: \code{c(0.025, 0.975)}} 46 | } 47 | \value{ 48 | The function returns a list with several elements. Most importantly the list includes the simulated draws `S`, the simulated predictions `P`, and a data set for plotting `plotdata`. 49 | } 50 | \description{ 51 | This function predicts probabilities for all choices of a multinomial logit 52 | model over a specified span of values. 53 | } 54 | \examples{ 55 | library(nnet) 56 | library(MASS) 57 | 58 | dataset <- data.frame(y = c(rep("a", 10), rep("b", 10), rep("c", 10)), 59 | x1 = rnorm(30), 60 | x2 = rnorm(30, mean = 1), 61 | x3 = sample(1:10, 30, replace = TRUE)) 62 | 63 | mod <- multinom(y ~ x1 + x2 + x3, data = dataset, Hess = TRUE) 64 | 65 | pred <- mnl_pred_ova(model = mod, data = dataset, 66 | x = "x1", 67 | nsim = 10) 68 | 69 | 70 | } 71 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(MNLpred) 3 | 4 | test_check("MNLpred") 5 | -------------------------------------------------------------------------------- /tests/testthat/test_inputvariants.R: -------------------------------------------------------------------------------- 1 | context("Variations in the input") 2 | library(MNLpred) 3 | library(nnet) 4 | library(MASS) 5 | 6 | mod1 <- multinom(vote ~ egoposition_immigration + 7 | political_interest + 8 | income + gender + ostwest, 9 | data = gles, 10 | Hess = TRUE) 11 | 12 | mod2 <- multinom(vote ~ egoposition_immigration + 13 | political_interest + 14 | income + gender + ostwest, 15 | data = gles) 16 | 17 | # Add factors: 18 | gles$factor <- as.factor(sample(c("A", "B", "C"), nrow(gles), replace = T)) 19 | 20 | mod3 <- multinom(vote ~ egoposition_immigration + 21 | political_interest + 22 | income + gender + ostwest + 
23 | factor, 24 | data = gles, 25 | Hess = TRUE) 26 | 27 | # Just one IV 28 | mod4 <- multinom(vote ~ egoposition_immigration, 29 | data = gles, 30 | Hess = TRUE) 31 | 32 | data_1iv <- data.frame(x1 = c(1:4), 33 | x2 = rep(c("m", "n"), 2), 34 | y = factor(c("a", "b", "c", "d"))) 35 | 36 | mod5a_1iv <- multinom(y ~ x1, 37 | data = data_1iv, 38 | Hess = TRUE) 39 | 40 | mod5b_1iv <- multinom(y ~ x2, 41 | data = data_1iv, 42 | Hess = TRUE) 43 | 44 | 45 | # Tests 46 | test_that("mnl_pred_ova() returns two predictions when by = NULL", { 47 | 48 | expect_equal(mnl_pred_ova(model = mod1, 49 | data = gles, 50 | x = "egoposition_immigration", 51 | by = NULL, 52 | seed = "random", # default 53 | nsim = 2, # faster 54 | probs = c(0.025, 0.975))$nVariation, 2) 55 | }) 56 | 57 | test_that("mnl_pred_ova() returns dataframes with correct number of rows", { 58 | 59 | pred1 <- mnl_pred_ova(model = mod1, 60 | data = gles, 61 | x = "egoposition_immigration", 62 | by = 1, 63 | seed = "random", # default 64 | nsim = 2, # faster 65 | probs = c(0.025, 0.975)) 66 | pred2 <- mnl_pred_ova(model = mod1, 67 | data = gles, 68 | x = "egoposition_immigration", 69 | z = "gender", 70 | z_value = 1, 71 | by = 1, 72 | seed = "random", # default 73 | nsim = 2, # faster 74 | probs = c(0.025, 0.975)) 75 | expect_equal(nrow(pred1$plotdata), length(unique(gles$vote)) * length(seq(min(gles$egoposition_immigration), 76 | max(gles$egoposition_immigration), 77 | by = 1))) 78 | }) 79 | 80 | test_that("mnl_pred_ova() returns error message when variables contain typos", { 81 | 82 | expect_error(mnl_pred_ova(model = mod1, 83 | data = gles, 84 | x = "immigration", 85 | by = 1, 86 | seed = "random", # default 87 | nsim = 2, # faster 88 | probs = c(0.025, 0.975)), 89 | regexp = "There might be a typo.") 90 | }) 91 | 92 | test_that("mnl_pred_ova() returns error message when variables contain typos", { 93 | 94 | expect_error(mnl_pred_ova(model = mod1, 95 | data = gles, 96 | x = "egoposition_immigration", 97 | by = 1, 
98 | z = "gndr", 99 | z_value = 1, 100 | seed = "random", # default 101 | nsim = 2, # faster 102 | probs = c(0.025, 0.975)), 103 | regexp = "There might be a typo.") 104 | }) 105 | 106 | test_that("mnl_pred_ova() returns error message when there is no Hessian matrix", { 107 | 108 | expect_error(mnl_pred_ova(model = mod2, 109 | data = gles, 110 | x = "egoposition_immigration", 111 | by = 1, 112 | seed = "random", # default 113 | nsim = 2, # faster 114 | probs = c(0.025, 0.975)), 115 | regexp = "Hess = TRUE") 116 | }) 117 | 118 | test_that("mnl_pred_ova() stops if non-numeric variables are supplied with the data", { 119 | 120 | expect_error(mnl_pred_ova(model = mod3, 121 | data = gles, 122 | x = "egoposition_immigration", 123 | nsim = 2), 124 | regexp = "Please supply data that consists of numeric values.") 125 | 126 | }) 127 | 128 | 129 | # Fixing bug with apply() and one IV (v0.0.6) 130 | test_that("mnl_pred_ov() works with just one iv", { 131 | 132 | expect_type(mnl_pred_ova(model = mod4, 133 | data = gles, 134 | x = "egoposition_immigration", 135 | nsim = 2), "list") 136 | }) 137 | 138 | test_that("mnl_pred_ov() does correctly evaluate the class of one iv (negative example)",{ 139 | 140 | expect_error(mnl_pred_ova(model = mod5b_1iv, 141 | data = data_1iv, 142 | x = "x2", 143 | nsim = 2), 144 | "Please supply data that consists of numeric values. 
The package can not handle factor or character variables, yet.*") 145 | }) 146 | 147 | test_that("mnl_pred_ov() does correctly evaluate the class of one iv", { 148 | skip_on_cran() 149 | expect_length(mnl_pred_ova(model = mod5a_1iv, 150 | data = data_1iv, 151 | x = "x1", 152 | nsim = 2)$IV, 153 | n = 1) 154 | }) 155 | -------------------------------------------------------------------------------- /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /vignettes/OVA_Predictions_For_MNL.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Observed Value Predictions for Multinomial Logit Models" 3 | author: "Manuel Neumann" 4 | output: rmarkdown::html_vignette 5 | bibliography: bibliography.bib 6 | vignette: > 7 | %\VignetteIndexEntry{Observed Value Predictions for Multinomial Logit Models} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | ```{r, include = FALSE} 13 | knitr::opts_chunk$set( 14 | collapse = TRUE, 15 | comment = "#>" 16 | ) 17 | ``` 18 | 19 | This package provides functions that make it easy to get plottable predictions from multinomial logit models. The predictions are based on simulated draws of regression estimates from their respective sampling distribution. 20 | 21 | At first I will present the theoretical and statistical background, before using sample data to demonstrate the functions of the package. 22 | 23 | ## The multinomial logit model 24 | 25 | This is a short introduction in the theoretical and statistical background of the multinomial logit. 26 | 27 | Dependent variables can not necessarily be ordered. In political science, for example, the variable of interest is often the individual's vote choice, based on the set of parties that are presented. 
Of interest is then how somebody comes up with their choice. 28 | 29 | More generally spoken, many questions deal with a nominal outcome variable and we want to test assumptions about the function that may lead to a respective outcome. 30 | 31 | For these questions, the multinomial logit model is often a fitting option. Similar to an ordinary logit model, the multinomial logit model assumes that the probability to choose one over the other outcomes can be modeled with a linear function and a fitting logit link function. The difference of the multinomial logit is that it models the choice of *each* category as a function of the characteristics of the observation. 32 | 33 | In formal terms, we assume $$\Pr(y_i = j|X_i)$$ is a linear combination of $$X_i\beta_j$$, whereby $$\beta_j$$ is a choice specific vector. This means we are interested in the probability that the observed choice of the individual $$y_i$$ is the choice category $$j$$ dependent on characteristics of the observation's characteristics $$X_i$$. Therefore we estimate a choice specific vector $\beta_j$. Since the probability is restricted to be between $$0$$ and $$1$$, we use $$exp(X_i\beta_j)$$ as a fitting link function. Additionally, we bring the exponents into relationship with each other and normalize them by dividing through the sum of them. 34 | 35 | Since we cannot compare all choices against each other, the model is not identified so far. Instead, we have to choose a baseline category and fix it to $$0$$. Therefore we estimate the probability of all choices to be chosen in comparison to the baseline choice. 36 | 37 | Eventually, we end up with the following probability function: 38 | 39 | $$\Pr(y_i|X_i)= \frac{exp(X_i\beta_j)}{\sum^{J}_{m=1}exp(X_i \beta_m)}$$, whereby $$\beta_1 = 0$$ 40 | This is the link function that is used for estimation. 
41 | 42 | For a more detailed insight into the multinomial model refer to sources like [these lecture notes by Germán Rodríguez](https://data.princeton.edu/wws509/notes/c6s2). 43 | 44 | ## Using the package 45 | 46 | ### How does the function work? 47 | 48 | As we have seen above, the multinomial logit can be used to get an insight into the probabilities to choose one option out of a set of alternatives. We have also seen that we need a baseline category to identify the model. This is mathematically necessary, but does not come in handy for purposes of interpretation. 49 | 50 | It is far more helpful and easier to understand to come up with predicted probabilities and first differences for values of interest [see e.g., @king2000 for approaches in social sciences]. Based on simulations, this package helps to easily predict probabilities and confidence intervals for each choice category over a specified scenario. The functions use the observed values to compute the predicted probabilities, as is recommended by @hanmer2013. 51 | 52 | The procedure follows the following steps: 53 | 54 | 1. Estimate a multinomial model and save the coefficients and the variance covariance matrix (based on the Hessian-matrix of the model). 55 | 1. To simulate uncertainty, make $n$ draws of coefficients from a simulated sampling distribution based on the coefficients and the variance covariance matrix. 56 | 1. Predict probabilities by multiplying the drawn coefficients with a specified scenario (so far these are the observed values). 57 | 1. Take the mean and the quantiles of the simulated predicted probabilities. 58 | 59 | The presented functions follow these steps. Additionally, they use the so called observed value approach. This means that the "scenario" uses all observed values that informed the model. Therefore the function takes these more detailed steps: 60 | 61 | 1. 
For all (complete) cases $n$ predictions are computed based on their observed independent values and the $n$ sets of coefficients. 62 | 1. Next the predicted values of all observations for each simulation are averaged. 63 | 1. Take the mean and the quantiles of the simulated predicted probabilities (same as above). 64 | 65 | For first differences, the simulated predictions are subtracted from each other. 66 | 67 | To showcase these steps, I present a reproducible example of how the functions can be used. 68 | 69 | ### Example 70 | 71 | The example uses data from the German Longitudinal Election Study (GLES, @rosteutscher2019). 72 | 73 | The data contains 1,000 respondents' characteristics and their vote choice. 74 | 75 | For this task, we need the following packages: 76 | 77 | ```{r setup} 78 | # Required packages 79 | library(magrittr) # for pipes 80 | library(nnet) # for the multinom()-function 81 | library(MASS) # for the multivariate normal distribution 82 | 83 | # The package 84 | library(MNLpred) 85 | 86 | # Plotting the predicted probabilities: 87 | library(ggplot2) 88 | library(scales) 89 | ``` 90 | 91 | Now we load the data: 92 | 93 | ```{r data, echo=TRUE} 94 | # The data: 95 | data("gles") 96 | ``` 97 | 98 | The next step is to compute the actual model. The function of the `MNLpred` package is based on models that were estimated with the `multinom()`-function of the `nnet` package. The `multinom()` function is convenient because it does not need transformed datasets. The syntax is very easy and resembles the ordinary regression functions. It is important that the Hessian matrix is returned with `Hess = TRUE`. The matrix is needed to simulate the sampling distribution. 99 | 100 | As we have seen above, we need a baseline or reference category for the model to work. Therefore, be aware what your baseline category is. If you use a dependent variable that is of type `character`, the categories will be ordered in alphabetical order. 
If you have a `factor` at hand, you can define your baseline category, for example with the `relevel()` function. 101 | 102 | Now, let's estimate the model: 103 | 104 | ```{r model} 105 | # Multinomial logit model: 106 | mod1 <- multinom(vote ~ egoposition_immigration + 107 | political_interest + 108 | income + gender + ostwest, 109 | data = gles, 110 | Hess = TRUE) 111 | ``` 112 | The results show the coefficients and standard errors. As we can see, there are five sets of coefficients. They describe the relationship between the reference category (`AfD`) and the vote choices for the parties `CDU/CSU`, `FDP`, `Gruene`, `LINKE`, and `SPD`. 113 | 114 | ```{r results} 115 | summary(mod1) 116 | ``` 117 | 118 | A first rough review of the coefficients shows that a more restrictive ego-position toward immigration leads to a lower probability of the voters to choose any other party than the AfD. It is hard to evaluate whether the effect is statistically significant and what the probabilities for each choice look like. For this it is helpful to predict the probabilities for certain scenarios and plot the means and confidence intervals for visual analysis. 119 | 120 | Let's say we are interested in the relationship between the ego-position toward immigration and the probability to choose any of the parties. It would be helpful to plot the predicted probabilities for the span of the positions. 121 | 122 | ```{r math} 123 | summary(gles$egoposition_immigration) 124 | ``` 125 | 126 | As we can see, the ego positions were recorded on a scale from 0 to 10. Higher numbers represent more restrictive positions. 127 | We pick this score as the x-variable (`x`) and use the `mnl_pred_ova()` function to get predicted probabilities for each position in this range. 128 | 129 | The function needs a multinomial logit model (`model`), data (`data`), the variable of interest `x`, the steps for which the probabilities should be predicted (`by`). 
Additionally, a `seed` can be defined for replication purposes, the number of simulations can be defined (`nsim`), and the confidence intervals (`probs`). 130 | 131 | If we want to hold another variable stable, we can specify so with `z` and `z_value`. See also the `mnl_fd_ova()` function below. 132 | 133 | ```{r mnl_pred_ova} 134 | pred1 <- mnl_pred_ova(model = mod1, 135 | data = gles, 136 | x = "egoposition_immigration", 137 | by = 1, 138 | seed = "random", # default 139 | nsim = 100, # faster 140 | probs = c(0.025, 0.975)) # default 141 | ``` 142 | 143 | The function returns a list with several elements. Most importantly, it returns a `plotdata` data set: 144 | 145 | ```{r return} 146 | pred1$plotdata %>% head() 147 | ``` 148 | 149 | As we can see, it includes the range of the x variable, a mean, a lower, and an upper bound of the confidence interval. Concerning the choice category, the data is in a long format. This makes it easy to plot it with the `ggplot` syntax. The choice category can now easily be used to differentiate the lines in the plot by using `linetype = vote` in the `aes()`. Another option is to use `facet_wrap()` or `facet_grid()` to differentiate the predictions: 150 | 151 | ```{r prediction_plot1} 152 | ggplot(data = pred1$plotdata, aes(x = egoposition_immigration, 153 | y = mean, 154 | ymin = lower, ymax = upper)) + 155 | geom_ribbon(alpha = 0.1) + # Confidence intervals 156 | geom_line() + # Mean 157 | facet_wrap(.~ vote, scales = "free_y", ncol = 2) + 158 | scale_y_continuous(labels = percent_format(accuracy = 1)) + # % labels 159 | scale_x_continuous(breaks = c(0:10)) + 160 | theme_bw() + 161 | labs(y = "Predicted probabilities", 162 | x = "Ego-position toward immigration") # Always label your axes ;) 163 | ``` 164 | 165 | If we want first differences between two scenarios, we can use the function `mnl_fd2_ova()`. 
The function takes similar arguments as the function above, but now the values for the scenarios of interest have to be supplied. Imagine we want to know what difference it makes to position oneself on the most tolerant or most restrictive end of the `egoposition_immigration` scale. This can be done as follows: 166 | 167 | ```{r static_fd} 168 | fdif1 <- mnl_fd2_ova(model = mod1, 169 | data = gles, 170 | x = "egoposition_immigration", 171 | value1 = min(gles$egoposition_immigration), 172 | value2 = max(gles$egoposition_immigration), 173 | nsim = 100) 174 | ``` 175 | 176 | The first differences can then be depicted in a graph. 177 | 178 | ```{r static_fd_plot} 179 | ggplot(fdif1$plotdata_fd, aes(x = categories, 180 | y = mean, 181 | ymin = lower, ymax = upper)) + 182 | geom_pointrange() + 183 | geom_hline(yintercept = 0) + 184 | scale_y_continuous(labels = percent_format()) + 185 | theme_bw() + 186 | labs(y = "Predicted probabilities", 187 | x = "Party vote") 188 | ``` 189 | 190 | 191 | We are often not only interested in the static difference, but the difference across a span of values, given a difference in a second variable. This is especially helpful when we look at dummy variables. For example, we could be interested in the effect of `gender` on the vote decision over the different ego-positions. With the `mnl_fd_ova()` function, we can predict the probabilities for two scenarios and subtract them. The function returns the differences and the confidence intervals of the differences. The different scenarios can be held stable with `z` and the `z_values`. `z_values` takes a vector of two numeric values. These values are held stable for the variable that is named in `z`. 
192 | 193 | ```{r first_diffferences_prediction} 194 | fdif2 <- mnl_fd_ova(model = mod1, 195 | data = gles, 196 | x = "egoposition_immigration", 197 | by = 1, 198 | z = "gender", 199 | z_values = c(0,1), 200 | nsim = 100) 201 | ``` 202 | As before, the function returns a list including a data set that can be used to plot the differences. 203 | 204 | ```{r fd_return} 205 | fdif2$plotdata_fd %>% head() 206 | ``` 207 | 208 | Since the function calls the `mnl_pred_ova()` function internally, it also returns the output of the two predictions in the list element `Prediction1` and `Prediction2`. The plot data for the predictions is already bound together row wise to easily plot the predicted probabilities. 209 | 210 | ```{r prediction_plot2} 211 | ggplot(data = fdif2$plotdata, aes(x = egoposition_immigration, 212 | y = mean, 213 | ymin = lower, ymax = upper, 214 | group = as.factor(gender), 215 | linetype = as.factor(gender))) + 216 | geom_ribbon(alpha = 0.1) + 217 | geom_line() + 218 | facet_wrap(. ~ vote, scales = "free_y", ncol = 2) + 219 | scale_y_continuous(labels = percent_format(accuracy = 1)) + # % labels 220 | scale_x_continuous(breaks = c(0:10)) + 221 | scale_linetype_discrete(name = "Gender", 222 | breaks = c(0, 1), 223 | labels = c("Male", "Female")) + 224 | theme_bw() + 225 | labs(y = "Predicted probabilities", 226 | x = "Ego-position toward immigration") # Always label your axes ;) 227 | ``` 228 | 229 | As we can see, the differences between `female` and `male` differ, depending on the party and ego-position. So let's take a look at the differences: 230 | 231 | ```{r first_differences_plot} 232 | ggplot(data = fdif2$plotdata_fd, aes(x = egoposition_immigration, 233 | y = mean, 234 | ymin = lower, ymax = upper)) + 235 | geom_ribbon(alpha = 0.1) + 236 | geom_line() + 237 | geom_hline(yintercept = 0) + 238 | facet_wrap(. 
~ vote, ncol = 3) + 239 | scale_y_continuous(labels = percent_format(accuracy = 1)) + # % labels 240 | scale_x_continuous(breaks = c(0:10)) + 241 | theme_bw() + 242 | labs(y = "Predicted probabilities", 243 | x = "Ego-position toward immigration") # Always label your axes ;) 244 | ``` 245 | 246 | We can see that, for some parties, the differences are at no point statistically significantly different from 0. 247 | 248 | 249 | ## Conclusion 250 | Multinomial logit models are important to model nominal choices. They are, however, restricted by being in need of a baseline category. Additionally, the log-character of the estimates makes it difficult to interpret them in meaningful ways. Predicting probabilities for all choices for scenarios, based on the observed data provides much more insight. The functions of this package provide easy to use functions that return data that can be used to plot predicted probabilities. The function uses a model from the `multinom()` function and uses the observed value approach and a supplied scenario to predict values over the range of fitting values. The functions simulate sampling distributions and therefore provide meaningful confidence intervals. `mnl_pred_ova()` can be used to predict probabilities for a certain scenario. `mnl_fd_ova()` can be used to predict probabilities for two scenarios and their first differences. 251 | 252 | ## Acknowledgment 253 | 254 | My code is inspired by the method courses in the [Political Science master's program at the University of Mannheim](https://www.sowi.uni-mannheim.de/en/academics/prospective-students/ma-in-political-science/) (cool place, check it out!). The skeleton of the code is based on a tutorial taught by [Marcel Neunhoeffer](https://www.marcel-neunhoeffer.com/) (lecture: "Advanced Quantitative Methods" by [Thomas Gschwend](https://www.sowi.uni-mannheim.de/gschwend/)). 
255 | 256 | ## References 257 | -------------------------------------------------------------------------------- /vignettes/bibliography.bib: -------------------------------------------------------------------------------- 1 | # Bibliography for vignette 2 | 3 | @article{hanmer2013, 4 | title = {Behind the {Curve}: {Clarifying} the {Best} {Approach} to {Calculating} {Predicted} {Probabilities} and {Marginal} {Effects} from {Limited} {Dependent} {Variable} {Models}}, 5 | volume = {57}, 6 | issn = {00925853}, 7 | shorttitle = {Behind the {Curve}}, 8 | doi = {10.1111/j.1540-5907.2012.00602.x}, 9 | number = {1}, 10 | journal = {American Journal of Political Science}, 11 | author = {Hanmer, Michael J. and Ozan Kalkan, Kerem}, 12 | month = jan, 13 | year = {2013}, 14 | pages = {263--277} 15 | } 16 | 17 | 18 | @article{king2000, 19 | title = {Making the {Most} of {Statistical} {Analyses}: {Improving} {Interpretation} and {Presentation}}, 20 | volume = {44}, 21 | doi = {10.2307/2669316}, 22 | number = {2}, 23 | journal = {American Journal of Political Science}, 24 | author = {King, Gary and Tomz, Michael and Wittenberg, Jason}, 25 | year = {2000}, 26 | pages = {341--355} 27 | } 28 | 29 | 30 | 31 | @misc{rosteutscher2019, 32 | title = {Rolling {Cross}-{Section}-{Wahlkampfstudie} mit {Nachwahl}-{Panelwelle} ({GLES} 2017)}, 33 | publisher = {GESIS Datenarchiv}, 34 | author = {Roßteutscher, Sigrid and Schoen, Harald and Schmitt-Beck, Rüdiger and Wolf, Christof and Staudt, Alexander}, 35 | year = {2019}, 36 | doi = {10.4232/1.13213}, 37 | note = {version: ZA6803 Datenfile Version 4.0.1} 38 | } 39 | --------------------------------------------------------------------------------