├── .Rbuildignore ├── .github ├── .gitignore └── workflows │ ├── check-standard.yaml │ └── test-coverage.yaml ├── .gitignore ├── DESCRIPTION ├── LICENSE ├── LICENSE.md ├── NAMESPACE ├── NEWS.md ├── R ├── compare.R ├── contrast.R ├── count.R ├── summarise.R ├── tabbycat-package.R ├── vcount.R └── zzz.R ├── cran-comments.md ├── man ├── cat_compare.Rd ├── cat_contrast.Rd ├── cat_count.Rd ├── cat_summarise.Rd ├── cat_vcount.Rd ├── safe_max.Rd ├── safe_mean.Rd ├── safe_min.Rd └── tabbycat.Rd ├── readme.md ├── tabbycat.Rproj └── tests ├── testthat.R └── testthat ├── test_compare.R ├── test_contrast.R ├── test_count.R ├── test_summarise.R └── test_vcount.R /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^\.github$ 4 | ^readme\.md$ 5 | ^cran-comments\.md$ 6 | ^LICENSE\.md$ 7 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/check-standard.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: R-CMD-check 10 | 11 | jobs: 12 | R-CMD-check: 13 | runs-on: ${{ matrix.config.os }} 14 | 15 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 16 | 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | config: 21 | - {os: macos-latest, r: 'release'} 22 | - {os: windows-latest, r: 'release'} 23 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 24 | - {os: ubuntu-latest, r: 'release'} 25 | - {os: ubuntu-latest, r: 'oldrel-1'} 26 | 27 | env: 28 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 29 | R_KEEP_PKG_SOURCE: yes 30 | 31 | steps: 32 | - uses: actions/checkout@v3 33 | 34 | - uses: r-lib/actions/setup-pandoc@v2 35 | 36 | - uses: r-lib/actions/setup-r@v2 37 | with: 38 | r-version: ${{ matrix.config.r }} 39 | http-user-agent: ${{ matrix.config.http-user-agent }} 40 | use-public-rspm: true 41 | 42 | - uses: r-lib/actions/setup-r-dependencies@v2 43 | with: 44 | extra-packages: any::rcmdcheck 45 | needs: check 46 | 47 | - uses: r-lib/actions/check-r-package@v2 48 | with: 49 | upload-snapshots: true 50 | -------------------------------------------------------------------------------- /.github/workflows/test-coverage.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: test-coverage 10 | 11 | jobs: 12 | test-coverage: 13 | runs-on: ubuntu-latest 14 | env: 15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | 20 | - uses: r-lib/actions/setup-r@v2 21 | with: 22 | use-public-rspm: true 23 | 24 | - uses: r-lib/actions/setup-r-dependencies@v2 25 | with: 26 | extra-packages: any::covr 27 | needs: coverage 28 | 29 | - name: Test coverage 30 | run: | 31 | covr::codecov( 32 | quiet = FALSE, 33 | clean = FALSE, 34 | install_path = file.path(Sys.getenv("RUNNER_TEMP"), "package") 35 | ) 36 | shell: Rscript {0} 37 | 38 | - name: Show testthat output 39 | if: always() 40 | run: | 41 | ## -------------------------------------------------------------------- 42 | find ${{ runner.temp }}/package -name 'testthat.Rout*' -exec cat '{}' \; || true 43 | shell: bash 44 | 45 | - name: Upload test results 46 | if: failure() 47 | uses: actions/upload-artifact@v3 48 | with: 49 | name: coverage-test-failures 50 | path: ${{ runner.temp }}/package 51 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | 5 | # Session Data files 6 | .RData 7 | 8 | # Example code in package build process 9 | *-Ex.R 10 | 11 | # Output files from R CMD build 12 | /*.tar.gz 13 | 14 | # Output files from R CMD check 15 | /*.Rcheck/ 16 | 17 | # RStudio files 18 | .Rproj.user/ 19 | 20 | # produced vignettes 21 | vignettes/*.html 22 | vignettes/*.pdf 23 | 24 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 25 | .httr-oauth 26 | 27 | # knitr and R markdown default cache directories 28 | /*_cache/ 29 | /cache/ 30 | 31 | # Temporary files created by R markdown 32 | *.utf8.md 33 | *.knit.md 34 | 35 | # Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html 36 | rsconnect/ 37 | .Rproj.user 38 | 39 | # Mac files 40 | .DS_Store 41 | 42 | 43 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: tabbycat 2 | Type: Package 3 | Title: Tabulate and Summarise Categorical Data 4 | Version: 0.18.0 5 | Authors@R: 6 | person( 7 | given = "Oliver", 8 | family = "Hawkins", 9 | role = c("aut", "cre"), 10 | email = "oli@olihawkins.com") 11 | Maintainer: Oliver Hawkins 12 | Description: Functions for tabulating and summarising categorical variables. 13 | Most functions are designed to work with dataframes, and use the 'tidyverse' 14 | idiom of taking the dataframe as the first argument so they work within 15 | pipelines. Equivalent functions that operate directly on vectors are also 16 | provided where it makes sense. This package aims to make exploratory data 17 | analysis involving categorical variables quicker, simpler and more robust. 18 | License: MIT + file LICENSE 19 | Depends: 20 | R (>= 3.4.0) 21 | Imports: 22 | dplyr (>= 1.0.0), 23 | janitor, 24 | magrittr, 25 | purrr, 26 | rlang, 27 | stringr, 28 | tibble, 29 | tidyr 30 | Encoding: UTF-8 31 | RoxygenNote: 7.2.3 32 | Suggests: 33 | testthat (>= 3.0.0) 34 | Config/testthat/edition: 3 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2021 2 | COPYRIGHT HOLDER: Oliver Hawkins 3 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # The MIT License 2 | 3 | Copyright (c) 2021 Oliver Hawkins 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(cat_compare) 4 | export(cat_contrast) 5 | export(cat_count) 6 | export(cat_summarise) 7 | export(cat_summarize) 8 | export(cat_vcount) 9 | importFrom(magrittr,"%>%") 10 | importFrom(rlang,.data) 11 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # tabbycat 0.18.0 2 | 3 | * No breaking changes 4 | * No new functions 5 | * Existing internal code has been updated to remove deprecated uses of third-party packages 6 | 7 | # tabbycat 0.17.0 8 | 9 | * Initial CRAN release 10 | * The stable API comprises five functions: 11 | * `cat_count` 12 | * `cat_vcount` 13 | * `cat_compare` 14 | * `cat_contrast` 15 | * `cat_summarise` 16 | * Unit test coverage is 97% and all tests pass 17 | * Added GitHub Actions, NEWS.md, and cran-comments.md for CRAN submission 18 | -------------------------------------------------------------------------------- /R/compare.R: -------------------------------------------------------------------------------- 1 | #' Calculate the frequency of discrete values in one categorical variable for 2 | #' each group within another categorical variable 3 | #' 4 | #' This function crosstabulates the frequencies of one categorical variable 5 | #' within the groups of another. The results are sorted on the values of the 6 | #' variable whose distribution is shown in each column i.e. the variable 7 | #' specified with \code{row_cat}. If this variable is a character vector it 8 | #' will be sorted alphabetically. If it is a factor it will be sorted in the 9 | #' order of its levels. 10 | #' 11 | #' @param data A dataframe containing the two variables of interest. 12 | #' @param row_cat The column name of a categorical variable whose distribution 13 | #' will be calculated for each group in \code{col_cat}. 14 | #' @param col_cat The column name of a categorical variable which will be 15 | #' split into groups and the distrubtion of \code{row_cat} calulated 16 | #' for each group. 17 | #' @param na.rm.row A boolean indicating whether to exclude NAs from the row 18 | #' results. The default is FALSE. 19 | #' @param na.rm.col A boolean indicating whether to exclude NAs from the column 20 | #' results. The default is FALSE. 21 | #' @param na.rm A boolean indicating whether to exclude NAs from both row and 22 | #' column results. This argument is provided as a convenience. It allows you 23 | #' to set \code{na.rm.row} and \code{na.rm.col} to the same value without 24 | #' having to specify them separately. If the value of \code{na.rm} is NULL, 25 | #' the argument is ignored. If it is not NULL it takes precendence. 26 | #' default is NULL. 27 | #' @param only A string indicating that only one set of frequency columns 28 | #' should be returned in the results. If \code{only} is either "n" or 29 | #' "number", only the number columns are returned. If \code{only} is either 30 | #' "p" or "percent", only the percent columns are returned. If \code{only} is 31 | #' any other value, both sets of columns are shown. The default value is an 32 | #' empty string, which means both sets of columns are shown. 33 | #' @param clean_names A boolean indicating whether the column names of the 34 | #' results tibble should be cleaned, so that any column names produced from 35 | #' data are converted to snake_case. The default is TRUE, but this can be 36 | #' changed with \code{options(tabbycat.clean_names = FALSE)}. 37 | #' @param na_label A string indicating the label to use for the columns that 38 | #' contain data for missing values. The default value is "na", but use this 39 | #' argument to set a different value if the default value collides with data 40 | #' in your dataset. 41 | #' @return A tibble showing the distribution of \code{row_cat} within each 42 | #' group in \code{col_cat}. 43 | #' @export 44 | 45 | cat_compare <- function( 46 | data, 47 | row_cat, 48 | col_cat, 49 | na.rm.row = FALSE, 50 | na.rm.col = FALSE, 51 | na.rm = NULL, 52 | only = "", 53 | clean_names = getOption("tabbycat.clean_names"), 54 | na_label = getOption("tabbycat.na_label")) { 55 | 56 | # Check the data argument is not null and is a dataframe 57 | if (is.null(data) || ! is.data.frame(data)) { 58 | stop("The \"data\" argument is not a dataframe.") 59 | } 60 | 61 | # Check that data has rows 62 | if (nrow(data) == 0) { 63 | stop("The \"data\" argument is empty.") 64 | } 65 | 66 | # Check the row_cat argument is a character vector of length one 67 | if (! is.character(row_cat) || length(row_cat) != 1) { 68 | stop("Invalid \"row_cat\" argument. Must be a character vector of length one.") 69 | } 70 | 71 | # Check the row_cat argument is a column in data 72 | if (! row_cat %in% colnames(data)) { 73 | stop(stringr::str_c("'", row_cat, 74 | "' is not a column in the dataframe.")) 75 | } 76 | 77 | # Check the col_cat argument is a character vector of length one 78 | if (! is.character(col_cat) || length(col_cat) != 1) { 79 | stop("Invalid \"col_cat\" argument. Must be a character vector of length one.") 80 | } 81 | 82 | # Check the col_cat argument is a column in data 83 | if (! col_cat %in% colnames(data)) { 84 | stop(stringr::str_c("'", col_cat, 85 | "' is not a column in the dataframe.")) 86 | } 87 | 88 | # Check the na.rm.row argument is valid 89 | if (length(na.rm.row) != 1 || is.na(na.rm.row) || ! is.logical(na.rm.row)) { 90 | stop("Invalid \"na.rm.row\" argument. Must be either TRUE or FALSE.") 91 | } 92 | 93 | # Check the na.rm.col argument is valid 94 | if (length(na.rm.col) != 1 || is.na(na.rm.col) || ! is.logical(na.rm.col)) { 95 | stop("Invalid \"na.rm.col\" argument. Must be either TRUE or FALSE.") 96 | } 97 | 98 | # Check the na.rm argument is valid 99 | if (length(na.rm) > 1 || 100 | (! is.null(na.rm) && ! is.logical(na.rm)) || 101 | (! is.null(na.rm) && is.na(na.rm))) { 102 | stop("Invalid \"na.rm\" argument. Must be either NULL, TRUE or FALSE.") 103 | } 104 | 105 | # Check the only argument is valid 106 | if (length(only) != 1 || is.na(only) || ! is.character(only)) { 107 | stop("Invalid \"only\" argument. Must be a character vector of length one.") 108 | } 109 | 110 | # Check the clean_names argument is valid 111 | if (length(clean_names) != 1 || is.na(clean_names) || ! is.logical(clean_names)) { 112 | stop("Invalid \"clean_names\" argument. Must be either TRUE or FALSE.") 113 | } 114 | 115 | # Check the na_label argument is valid 116 | if (length(na_label) != 1 || is.na(na_label) || ! is.character(na_label)) { 117 | stop("Invalid \"na_label\" argument. Must be a character vector of length one.") 118 | } 119 | 120 | # Set both na.rm.row and na.rm.col to the value of na.rm if specified 121 | if (! is.null(na.rm)) { 122 | na.rm.row <- na.rm 123 | na.rm.col <- na.rm 124 | } 125 | 126 | # Remove rows with NAs if na.rm.row is TRUE 127 | if (na.rm.row == TRUE) { 128 | data <- data %>% dplyr::filter(! is.na(.data[[row_cat]])) 129 | } 130 | 131 | group_names <- sort(unique(data[[col_cat]])) 132 | 133 | comparison_data <- purrr::map_dfr(group_names, function(group_name) { 134 | 135 | data %>% 136 | dplyr::filter(.data[[col_cat]] == group_name) %>% 137 | dplyr::group_by(.data[[row_cat]]) %>% 138 | dplyr::summarise(n = dplyr::n(), .groups = "drop") %>% 139 | dplyr::mutate(p = .data$n / sum(.data$n)) %>% 140 | dplyr::mutate(group = as.character(group_name)) %>% 141 | dplyr::select( 142 | "group", 143 | dplyr::all_of(row_cat), 144 | "n", 145 | "p") 146 | }) 147 | 148 | if (na.rm.col == FALSE) { 149 | 150 | na_data <- data %>% 151 | dplyr::filter(is.na(.data[[col_cat]])) %>% 152 | dplyr::group_by(.data[[row_cat]]) %>% 153 | dplyr::summarise(n = dplyr::n(), .groups = "drop") %>% 154 | dplyr::mutate(p = .data$n / sum(.data$n)) %>% 155 | dplyr::mutate(group = na_label) %>% 156 | dplyr::select( 157 | "group", 158 | dplyr::all_of(row_cat), 159 | "n", 160 | "p") 161 | 162 | } else { 163 | 164 | na_data <- tibble::tibble() 165 | } 166 | 167 | comparison <- dplyr::bind_rows( 168 | comparison_data, 169 | na_data) %>% 170 | tidyr::pivot_wider( 171 | id_cols = dplyr::all_of(row_cat), 172 | names_from = "group", 173 | values_from = c("n", "p")) %>% 174 | dplyr::mutate(dplyr::across(-1, ~tidyr::replace_na(.x, 0))) %>% 175 | dplyr::arrange(.data[[row_cat]]) 176 | 177 | # Clean names if clean_names is TRUE 178 | if (clean_names == TRUE) { 179 | comparison <- comparison %>% janitor::clean_names() 180 | } 181 | 182 | # Remove columns based on only argument 183 | if (stringr::str_trim(only) %in% c("n", "number")) { 184 | comparison <- comparison %>% dplyr::select(-dplyr::starts_with("p_")) 185 | } 186 | 187 | if (stringr::str_trim(only) %in% c("p", "percent")) { 188 | comparison <- comparison %>% dplyr::select(-dplyr::starts_with("n_")) 189 | } 190 | 191 | comparison 192 | } 193 | -------------------------------------------------------------------------------- /R/contrast.R: -------------------------------------------------------------------------------- 1 | #' Calculate the frequency of discrete values in one categorical variable for 2 | #' each of two mutually exclusive groups within another categorical variable 3 | #' 4 | #' This function shows the distrbution of values within given a categorical 5 | #' variable for one group within another categorical variable, and compares it 6 | #' with the distribution among all observations not in that group. Its purpose 7 | #' is to let you see quickly whether the distribution within that group differs 8 | #' from the distribution for the rest of the observations. The results are 9 | #' sorted in descending order of frequency for the named group i.e. the group 10 | #' named in \code{col_group}. 11 | #' 12 | #' @param data A dataframe containing the two variables of interest. 13 | #' @param row_cat The column name of a categorical variable whose distribution 14 | #' should be calculated for each exclusive group in \code{col_cat}. 15 | #' @param col_cat The column name of a categorical variable that will be 16 | #' split into two exclusive groups, one containing observations with a 17 | #' particular value of that variable, and another containing all other 18 | #' observations. 19 | #' @param col_group The name of the group within \code{col_cat} that is 20 | #' used to split the observations into two exclusive groups: those that are 21 | #' in the group and those that are not in the group. 22 | #' @param na.rm.row A boolean indicating whether to exclude NAs from the row 23 | #' results. The default is FALSE. 24 | #' @param na.rm.col A boolean indicating whether to exclude NAs from the column 25 | #' results. The default is FALSE. 26 | #' @param na.rm A boolean indicating whether to exclude NAs from both row and 27 | #' column results. This argument is provided as a convenience. It allows you 28 | #' to set \code{na.rm.row} and \code{na.rm.col} to the same value without 29 | #' having to specify them separately. If the value of \code{na.rm} is NULL, 30 | #' the argument is ignored. If it is not NULL it takes precendence. 31 | #' default is NULL. 32 | #' @param clean_names A boolean indicating whether the column names of the 33 | #' results tibble should be cleaned, so that any column names produced from 34 | #' data are converted to snake_case. The default is TRUE, but this can be 35 | #' changed with \code{options(tabbycat.clean_names = FALSE)}. 36 | #' @param only A string indicating that only one set of frequency columns 37 | #' should be returned in the results. If \code{only} is either "n" or 38 | #' "number", only the number columns are returned. If \code{only} is either 39 | #' "p" or "percent", only the percent columns are returned. If \code{only} is 40 | #' any other value, both sets of columns are shown. The default value is an 41 | #' empty string, which means both sets of columns are shown. 42 | #' @param na_label A string indicating the label to use for the columns that 43 | #' contain data for missing values. The default value is "na", but use this 44 | #' argument to set a different value if the default value collides with data 45 | #' in your dataset. 46 | #' @param other_label A string indicating the label to use for the columns that 47 | #' contain data for observations not in the named group. The default value is 48 | #' "other", but use this argument to set a different value if the default 49 | #' value collides with data in your dataset. 50 | #' @return A tibble showing the distribution of \code{row_cat} within each of 51 | #' the two exclusive groups in \code{col_cat}. 52 | #' @export 53 | 54 | cat_contrast <- function( 55 | data, 56 | row_cat, 57 | col_cat, 58 | col_group, 59 | na.rm.row = FALSE, 60 | na.rm.col = FALSE, 61 | na.rm = NULL, 62 | only = "", 63 | clean_names = getOption("tabbycat.clean_names"), 64 | na_label = getOption("tabbycat.na_label"), 65 | other_label = getOption("tabbycat.other_label")) { 66 | 67 | # Check the data argument is not null and is a dataframe 68 | if (is.null(data) || ! is.data.frame(data)) { 69 | stop("The \"data\" argument is not a dataframe.") 70 | } 71 | 72 | # Check that data has rows 73 | if (nrow(data) == 0) { 74 | stop("The \"data\" argument is empty.") 75 | } 76 | 77 | # Check the row_cat argument is a character vector of length one 78 | if (! is.character(row_cat) || length(row_cat) != 1) { 79 | stop("Invalid \"row_cat\" argument. Must be a character vector of length one.") 80 | } 81 | 82 | # Check the row_cat argument is a column in data 83 | if (! row_cat %in% colnames(data)) { 84 | stop(stringr::str_c("'", row_cat, 85 | "' is not a column in the dataframe.")) 86 | } 87 | 88 | # Check the col_cat argument is a character vector of length one 89 | if (! is.character(col_cat) || length(col_cat) != 1) { 90 | stop("Invalid \"col_cat\" argument. Must be a character vector of length one.") 91 | } 92 | 93 | # Check the col_cat argument is a column in data 94 | if (! col_cat %in% colnames(data)) { 95 | stop(stringr::str_c("'", col_cat, 96 | "' is not a column in the dataframe.")) 97 | } 98 | 99 | # Check the col_group argument is a valid vector of length one 100 | if (length(col_group) != 1 || is.na(col_group) || ! is.atomic(col_group)) { 101 | stop("Invalid \"col_group\" argument. Must be a character vector of length one.") 102 | } 103 | 104 | # Check that col_group exits in col_cat 105 | if (! col_group %in% data[[col_cat]]) { 106 | stop(stringr::str_c( 107 | "The \"col_group\" '", col_group, 108 | "' does not exist in the \"col_cat\" '", col_cat, "'.")) 109 | } 110 | 111 | # Check the na.rm.row argument is valid 112 | if (length(na.rm.row) != 1 || is.na(na.rm.row) || ! is.logical(na.rm.row)) { 113 | stop("Invalid \"na.rm.row\" argument. Must be either TRUE or FALSE.") 114 | } 115 | 116 | # Check the na.rm.col argument is valid 117 | if (length(na.rm.col) != 1 || is.na(na.rm.col) || ! is.logical(na.rm.col)) { 118 | stop("Invalid \"na.rm.col\" argument. Must be either TRUE or FALSE.") 119 | } 120 | 121 | # Check the na.rm argument is valid 122 | if (length(na.rm) > 1 || 123 | (! is.null(na.rm) && ! is.logical(na.rm)) || 124 | (! is.null(na.rm) && is.na(na.rm))) { 125 | stop("Invalid \"na.rm\" argument. Must be either NULL, TRUE or FALSE.") 126 | } 127 | 128 | # Check the only argument is valid 129 | if (length(only) != 1 || is.na(only) || ! is.character(only)) { 130 | stop("Invalid \"only\" argument. Must be a character vector of length one.") 131 | } 132 | 133 | # Check the clean_names argument is valid 134 | if (length(clean_names) != 1 || is.na(clean_names) || ! is.logical(clean_names)) { 135 | stop("Invalid \"clean_names\" argument. Must be either TRUE or FALSE.") 136 | } 137 | 138 | # Check the other_label argument is a character vector of length one 139 | if (! is.character(other_label) || length(other_label) != 1) { 140 | stop("Invalid \"other_label\" argument. Must be a character vector of length one.") 141 | } 142 | 143 | # Check the na_label argument is a character vector of length one 144 | if (! is.character(na_label) || length(na_label) != 1) { 145 | stop("Invalid \"na_label\" argument. Must be a character vector of length one.") 146 | } 147 | 148 | # Set both na.rm.row and na.rm.col to the value of na.rm if specified 149 | if (! is.null(na.rm)) { 150 | na.rm.row <- na.rm 151 | na.rm.col <- na.rm 152 | } 153 | 154 | # Remove rows with NAs if na.rm.row is TRUE 155 | if (na.rm.row == TRUE) { 156 | data <- data %>% dplyr::filter(! is.na(.data[[row_cat]])) 157 | } 158 | 159 | in_group_data <- data %>% 160 | dplyr::filter(.data[[col_cat]] == col_group) %>% 161 | dplyr::group_by(.data[[row_cat]]) %>% 162 | dplyr::summarise(n = dplyr::n(), .groups = "drop") %>% 163 | dplyr::mutate(p = .data$n / sum(.data$n)) %>% 164 | dplyr::mutate(group = as.character(col_group)) %>% 165 | dplyr::select( 166 | "group", 167 | dplyr::all_of(row_cat), 168 | "n", 169 | "p") 170 | 171 | out_group_data <- data %>% 172 | dplyr::filter(.data[[col_cat]] != col_group) %>% 173 | dplyr::group_by(.data[[row_cat]]) %>% 174 | dplyr::summarise(n = dplyr::n(), .groups = "drop") %>% 175 | dplyr::mutate(p = .data$n / sum(.data$n)) %>% 176 | dplyr::mutate(group = other_label) %>% 177 | dplyr::select( 178 | "group", 179 | dplyr::all_of(row_cat), 180 | "n", 181 | "p") 182 | 183 | if (na.rm.col == FALSE) { 184 | 185 | na_data <- data %>% 186 | dplyr::filter(is.na(.data[[col_cat]])) %>% 187 | dplyr::group_by(.data[[row_cat]]) %>% 188 | dplyr::summarise(n = dplyr::n(), .groups = "drop") %>% 189 | dplyr::mutate(p = .data$n / sum(.data$n)) %>% 190 | dplyr::mutate(group = na_label) %>% 191 | dplyr::select( 192 | "group", 193 | dplyr::all_of(row_cat), 194 | "n", 195 | "p") 196 | 197 | } else { 198 | 199 | na_data <- tibble::tibble() 200 | } 201 | 202 | comparison <- dplyr::bind_rows( 203 | in_group_data, 204 | out_group_data, 205 | na_data) %>% 206 | tidyr::pivot_wider( 207 | id_cols = dplyr::all_of(row_cat), 208 | names_from = "group", 209 | values_from = c("n", "p")) %>% 210 | dplyr::mutate(dplyr::across(-1, ~tidyr::replace_na(.x, 0))) %>% 211 | dplyr::arrange(dplyr::desc(.data[[stringr::str_c("n_", col_group)]])) 212 | 213 | # Clean names if clean_names is TRUE 214 | if (clean_names == TRUE) { 215 | comparison <- comparison %>% janitor::clean_names() 216 | } 217 | 218 | # Remove columns based on only argument 219 | if (stringr::str_trim(only) %in% c("n", "number")) { 220 | comparison <- comparison %>% dplyr::select(-dplyr::starts_with("p_")) 221 | } 222 | 223 | if (stringr::str_trim(only) %in% c("p", "percent")) { 224 | comparison <- comparison %>% dplyr::select(-dplyr::starts_with("n_")) 225 | } 226 | 227 | comparison 228 | } 229 | -------------------------------------------------------------------------------- /R/count.R: -------------------------------------------------------------------------------- 1 | #' Count the frequency of discrete values in the column of a dataframe 2 | #' 3 | #' This function differs from \code{cat_vcount} in that it operates on columns 4 | #' in dataframes rather than directly on vectors, which means it is more useful 5 | #' in pipelines but handles a narrower range of inputs. The results are sorted 6 | #' in descending order of frequency. 7 | #' 8 | #' @param data A dataframe containing a categorical vector for which 9 | #' frequencies will be calculated. 10 | #' @param cat The column name of the categorical variable for which frequencies 11 | #' will be calculated. 12 | #' @param na.rm A boolean indicating whether to exclude NAs from the results. 13 | #' The default is FALSE. 14 | #' @param only A string indicating that only one of the frequency columns 15 | #' should be returned in the results. If \code{only} is either "n" or 16 | #' "number", only the number column is returned. If \code{only} is either 17 | #' "p" or "percent", only the percent column is returned. If \code{only} is 18 | #' any other value, both columns are shown. The default value is an empty 19 | #' string, which means both columns are shown. 20 | #' @param clean_names A boolean indicating whether the column names of the 21 | #' results tibble should be cleaned, so that any column names produced from 22 | #' data are converted to snake_case. The default is TRUE, but this can be 23 | #' changed with \code{options(tabbycat.clean_names = FALSE)}. 24 | #' @return A tibble showing the frequency of each value in \code{cat}. 25 | #' @export 26 | 27 | cat_count <- function( 28 | data, 29 | cat, 30 | na.rm = FALSE, 31 | only = "", 32 | clean_names = getOption("tabbycat.clean_names")) { 33 | 34 | # Check the data argument is not null and is a dataframe 35 | if (is.null(data) || ! is.data.frame(data)) { 36 | stop("The \"data\" argument is not a dataframe.") 37 | } 38 | 39 | # Check that data has rows 40 | if (nrow(data) == 0) { 41 | stop("The \"data\" argument is empty.") 42 | } 43 | 44 | # Check the cat argument is a character vector of length one 45 | if (! is.character(cat) || length(cat) != 1) { 46 | stop("Invalid \"cat\" argument. Must be a character vector of length one.") 47 | } 48 | 49 | # Check the cat argument is a column in data 50 | if (! cat %in% colnames(data)) { 51 | stop(stringr::str_c("'", cat, "' is not a column in the dataframe.")) 52 | } 53 | 54 | # Check the na.rm argument is valid 55 | if (length(na.rm) != 1 || is.na(na.rm) || ! is.logical(na.rm)) { 56 | stop("Invalid \"na.rm\" argument. Must be either TRUE or FALSE.") 57 | } 58 | 59 | # Check the only argument is valid 60 | if (length(only) != 1 || is.na(only) || ! is.character(only)) { 61 | stop("Invalid \"only\" argument. Must be a character vector of length one.") 62 | } 63 | 64 | # Check the clean_names argument is valid 65 | if (length(clean_names) != 1 || is.na(clean_names) || ! is.logical(clean_names)) { 66 | stop("Invalid \"clean_names\" argument. Must be either TRUE or FALSE.") 67 | } 68 | 69 | # Remove rows with NAs if na.rm is TRUE 70 | if (na.rm == TRUE) { 71 | data <- data %>% dplyr::filter(! is.na(.data[[cat]])) 72 | } 73 | 74 | # Create table 75 | count <- data %>% 76 | dplyr::group_by(.data[[cat]]) %>% 77 | dplyr::summarise(number = dplyr::n(), .groups = "drop") %>% 78 | dplyr::mutate(percent = .data$number / sum(.data$number)) %>% 79 | dplyr::arrange(dplyr::desc(.data$number)) 80 | 81 | # Clean names if clean_names is TRUE 82 | if (clean_names == TRUE) { 83 | count <- count %>% janitor::clean_names() 84 | } 85 | 86 | # Remove columns based on only argument 87 | if (stringr::str_trim(only) %in% c("n", "number")) { 88 | count <- count %>% dplyr::select(-c("percent")) 89 | } 90 | 91 | if (stringr::str_trim(only) %in% c("p", "percent")) { 92 | count <- count %>% dplyr::select(-c("number")) 93 | } 94 | 95 | count 96 | } 97 | -------------------------------------------------------------------------------- /R/summarise.R: -------------------------------------------------------------------------------- 1 | #' Summarise the values of a numerical variable for each group within a 2 | #' categorical variable 3 | #' 4 | #' The results are sorted on the values of the categorical variable i.e. 5 | #' the variable specified with \code{cat}. If this variable is a character 6 | #' vector it will be sorted alphabetically. If it is a factor it will be 7 | #' sorted in the order of its levels. This function can be called as either 8 | #' \code{cat_summarise} or \code{cat_summarize}. 9 | #' 10 | #' @param data A dataframe containing a categorical variable and numerical 11 | #' variable to summarise. 12 | #' @param cat The name of a column in \code{data} which is a categorical vector 13 | #' of discrete values for which summaries will be calculated. 14 | #' @param num The name of a column in \code{data} which is a numerical vector 15 | #' that will be summarised for each group. 16 | #' @param na.rm A boolean indicating whether to exclude NAs from the row 17 | #' results. Note that NAs are **always** ignored in calculating the summary 18 | #' statistics for \code{num} shown in each row, and the number of NAs that 19 | #' exist in \code{num} for each group in \code{cat} is shown in the 20 | #' \code{na} column of the results table. This argument controls whether a 21 | #' row of summary statistics is shown for observations that are NA in 22 | #' \code{cat}. The default is FALSE. 23 | #' @param clean_names A boolean indicating whether the column names of the 24 | #' results tibble should be cleaned, so that any column names produced from 25 | #' data are converted to snake_case. The default is TRUE, but this can be 26 | #' changed with \code{options(tabbycat.clean_names = FALSE)}. 27 | #' @return A tibble showing summary statistics for \code{num} for each group 28 | #' in \code{cat}. 29 | #' @export 30 | 31 | cat_summarise <- function( 32 | data, 33 | cat, 34 | num, 35 | na.rm = FALSE, 36 | clean_names = getOption("tabbycat.clean_names")) { 37 | 38 | # Check the data argument is not null and is a dataframe 39 | if (is.null(data) || ! is.data.frame(data)) { 40 | stop("The \"data\" argument is not a dataframe.") 41 | } 42 | 43 | # Check that data has rows 44 | if (nrow(data) == 0) { 45 | stop("The \"data\" argument is empty.") 46 | } 47 | 48 | # Check the cat argument is a character vector of length one 49 | if (! is.character(cat) || length(cat) != 1) { 50 | stop("Invalid \"cat\" argument. Must be a character vector of length one.") 51 | } 52 | 53 | # Check the cat argument is a column in data 54 | if (! cat %in% colnames(data)) { 55 | stop(stringr::str_c("'", cat, "' is not a column in the dataframe.")) 56 | } 57 | 58 | # Check the num argument is a character vector of length one 59 | if (! is.character(num) || length(num) != 1) { 60 | stop("Invalid \"num\" argument. Must be a character vector of length one.") 61 | } 62 | 63 | # Check the num argument is a column in data 64 | if (! num %in% colnames(data)) { 65 | stop(stringr::str_c("'", num, "' is not a column in the dataframe.")) 66 | } 67 | 68 | # Check the num argument is numeric 69 | if (! is.numeric(data[[num]])) { 70 | stop(stringr::str_c("The num argument is not a numeric column.")) 71 | } 72 | 73 | # Check the na.rm argument is valid 74 | if (length(na.rm) != 1 || is.na(na.rm) || ! is.logical(na.rm)) { 75 | stop("Invalid \"na.rm\" argument. Must be either TRUE or FALSE.") 76 | } 77 | 78 | # Check the clean_names argument is valid 79 | if (length(clean_names) != 1 || is.na(clean_names) || ! is.logical(clean_names)) { 80 | stop("Invalid \"clean_names\" argument. Must be either TRUE or FALSE.") 81 | } 82 | 83 | # Remove rows with NAs if na.rm is TRUE 84 | if (na.rm == TRUE) { 85 | data <- data %>% dplyr::filter(! is.na(.data[[cat]])) 86 | } 87 | 88 | # Create table 89 | summary <- data %>% 90 | dplyr::group_by(.data[[cat]]) %>% 91 | dplyr::summarise( 92 | n = dplyr::n(), 93 | na = sum(is.na(.data[[num]])), 94 | mean = safe_mean(.data[[num]], na.rm = TRUE), 95 | sd = stats::sd(.data[[num]], na.rm = TRUE), 96 | min = safe_min(.data[[num]], na.rm = TRUE), 97 | lq = unname(stats::quantile(.data[[num]], 0.25, na.rm = TRUE)), 98 | med = stats::median(.data[[num]], na.rm = TRUE), 99 | uq = unname(stats::quantile(.data[[num]], 0.75, na.rm = TRUE)), 100 | max = safe_max(.data[[num]], na.rm = TRUE), 101 | .groups = "drop") %>% 102 | dplyr::arrange(.data[[cat]]) 103 | 104 | # Clean names if clean_names is TRUE 105 | if (clean_names == TRUE) { 106 | summary <- summary %>% janitor::clean_names() 107 | } 108 | 109 | summary 110 | } 111 | 112 | #' @rdname cat_summarise 113 | #' @export 114 | #' 115 | cat_summarize <- cat_summarise 116 | 117 | #' Calculate \code{mean} but return NA rather than NaN when values are missing 118 | #' 119 | #' This function is a drop-in replacement for \code{mean}, which is used in 120 | #' \code{cat_summarise}. It returns NA rather than NaN when all values are NA. 121 | #' 122 | #' @param x A numerical vector. 123 | #' @param na.rm A boolean indicating whether to remove NAs. 124 | #' @return The mean of \code{x} or NA when values are missing. 125 | #' @keywords internal 126 | 127 | safe_mean <- function(x, na.rm = FALSE) { 128 | result <- mean(x, na.rm = na.rm) 129 | if (is.na(result)) return(NA) 130 | result 131 | } 132 | 133 | #' Calculate \code{min} but suppress the warning when all values are missing 134 | #' 135 | #' This function is a drop-in replacement for \code{min}, which is used in 136 | #' \code{cat_summarise}. It suppresses the warning when all values are NA and 137 | #' na.rm is TRUE, and returns NA instead of Inf. 138 | #' 139 | #' @param x A numerical vector. 140 | #' @param na.rm A boolean indicating whether to remove NAs. 141 | #' @return The min of \code{x} or NA when values are missing. 142 | #' @keywords internal 143 | 144 | safe_min <- function(x, na.rm = FALSE) { 145 | result <- NA 146 | tryCatch( 147 | result <- min(x, na.rm = na.rm), 148 | warning = function(c) c) 149 | result 150 | } 151 | 152 | 153 | #' Calculate \code{max} but suppress the warning when all values are missing 154 | #' 155 | #' This function is a drop-in replacement for \code{max}, which is used in 156 | #' \code{cat_summarise}. It suppresses the warning when all values are NA and 157 | #' na.rm is TRUE, and returns NA instead of -Inf. 158 | #' 159 | #' @param x A numerical vector. 160 | #' @param na.rm A boolean indicating whether to remove NAs. 161 | #' @return The max of \code{x} or NA when values are missing. 162 | #' @keywords internal 163 | 164 | safe_max <- function(x, na.rm = FALSE) { 165 | result <- NA 166 | tryCatch( 167 | result <- max(x, na.rm = na.rm), 168 | warning = function(c) c) 169 | result 170 | } 171 | -------------------------------------------------------------------------------- /R/tabbycat-package.R: -------------------------------------------------------------------------------- 1 | #' tabbycat: Tabulate and summarise categorical data 2 | #' 3 | #' Functions for tabulating and summarising categorical variables. 4 | #' 5 | #' @name tabbycat 6 | #' @importFrom rlang .data 7 | #' @importFrom magrittr %>% 8 | #' @keywords internal 9 | "_PACKAGE" 10 | 11 | # Tell R CMD check about new operators 12 | if(getRversion() >= "2.15.1") { 13 | utils::globalVariables(c(".", ":=")) 14 | } 15 | -------------------------------------------------------------------------------- /R/vcount.R: -------------------------------------------------------------------------------- 1 | #' Count the frequency of discrete values in a categorical vector 2 | #' 3 | #' This function differs from \code{cat_count} in that it operates directly on 4 | #' vectors, rather than on columns in dataframes, which means it is less useful 5 | #' in pipelines but can handle a wider range of inputs. The results are sorted 6 | #' in descending order of frequency. 7 | #' 8 | #' @param cat A categorical vector for which frequencies will be calculated. 9 | #' @param na.rm A boolean indicating whether to exclude NAs from the results. 10 | #' The default is FALSE. 11 | #' @param only A string indicating that only one of the frequency columns 12 | #' should be returned in the results. If \code{only} is either "n" or 13 | #' "number", only the number column is returned. If \code{only} is either 14 | #' "p" or "percent", only the percent column is returned. If \code{only} is 15 | #' any other value, both columns are shown. The default value is an empty 16 | #' string, which means both columns are shown. 17 | #' @param clean_names A boolean indicating whether the column names of the 18 | #' results tibble should be cleaned, so that any column names produced from 19 | #' data are converted to snake_case. The default is TRUE, but this can be 20 | #' changed with \code{options(tabbycat.clean_names = FALSE)}. 21 | #' @return A tibble showing the frequency of each value in \code{cat}. 22 | #' @export 23 | 24 | cat_vcount <- function( 25 | cat, 26 | na.rm = FALSE, 27 | only = "", 28 | clean_names = getOption("tabbycat.clean_names")) { 29 | 30 | # Check the cat argument is not null and is a vector 31 | if (is.null(cat) || ! is.atomic(cat)) { 32 | stop("The \"cat\" argument is not a vector.") 33 | } 34 | 35 | # Check the cat argument is not an empty factor 36 | if (length(cat) == 0) { 37 | stop("The \"cat\" argument is empty.") 38 | } 39 | 40 | # Check the na.rm argument is valid 41 | if (length(na.rm) != 1 || is.na(na.rm) || ! is.logical(na.rm)) { 42 | stop("Invalid \"na.rm\" argument. Must be either TRUE or FALSE.") 43 | } 44 | 45 | # Check the only argument is valid 46 | if (length(only) != 1 || is.na(only) || ! is.character(only)) { 47 | stop("Invalid \"only\" argument. Must be a character vector of length one.") 48 | } 49 | 50 | # Check the clean_names argument is valid 51 | if (length(clean_names) != 1 || is.na(clean_names) || ! is.logical(clean_names)) { 52 | stop("Invalid \"clean_names\" argument. Must be either TRUE or FALSE.") 53 | } 54 | 55 | # Get the variable name of the cat argument 56 | obj_name <- deparse(substitute(cat)) 57 | 58 | # If the name is a dataframe and a column extract the column name 59 | if(stringr::str_detect(obj_name, "\\$")) { 60 | obj_name <- stringr::str_split_fixed(obj_name, "\\$", 2)[1, 2] 61 | } 62 | 63 | # Set the name 64 | name <- obj_name 65 | 66 | # Set option for handling NAs 67 | use_na <- ifelse(na.rm, "no", "ifany") 68 | 69 | # Create the results dataframe and return 70 | count <- tibble::as_tibble( 71 | table( 72 | cat, 73 | useNA = use_na), 74 | .name_repair = ~ c(name, "number")) %>% 75 | dplyr::mutate(percent = .data$number / sum(.data$number)) %>% 76 | dplyr::arrange(dplyr::desc(.data$number)) 77 | 78 | # Clean names if clean_names is TRUE 79 | if (clean_names == TRUE) { 80 | count <- count %>% janitor::clean_names() 81 | } 82 | 83 | # Remove columns based on only argument 84 | if (stringr::str_trim(only) %in% c("n", "number")) { 85 | count <- count %>% dplyr::select(-c("percent")) 86 | } 87 | 88 | if (stringr::str_trim(only) %in% c("p", "percent")) { 89 | count <- count %>% dplyr::select(-c("number")) 90 | } 91 | 92 | count 93 | } 94 | -------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | .onLoad <- function(libname, pkgname) { 2 | 3 | # Set default options if options have not already been set 4 | op <- options() 5 | 6 | op_tabbycat <- list( 7 | tabbycat.clean_names = TRUE, 8 | tabbycat.na_label = "na", 9 | tabbycat.other_label = "other") 10 | 11 | to_set <- !(names(op_tabbycat) %in% names(op)) 12 | if (any(to_set)) options(op_tabbycat[to_set]) 13 | invisible() 14 | } 15 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | # CRAN comments 2 | 3 | This is a new version of an existing package. It updates the package from version 0.17.0 to 0.18.0. 4 | 5 | ## Changes in version 0.18.0 6 | 7 | None of the changes in version 0.18.0 alter the functionality of the package. They simply remove deprecated behaviour. In particular: 8 | 9 | * The file R/tabbycat.R has been replaced with the file R/tabbycat-package.R. 10 | * The @docType annotation has been removed from tabbycat-package.R. 11 | * Brief package-level documentation has been created by documenting the _PACKAGE sentinel in tabbycat-package.R. 12 | * Package functions no longer use the .data variable in tidy selections in order to comply with its deprecation in tidyselect. 13 | 14 | ## R CMD check results 15 | 16 | There were 0 errors, 0 warnings, 0 notes in all test environments. 17 | 18 | ## Test environments 19 | 20 | * Local R installation 21 | * MacOS 13.4.1, R 4.3.1 22 | 23 | * GitHub actions 24 | * macOS Monterey 12.6.7, R version 4.3.1 25 | * Windows Server 2022 x64 (build 20348), R version 4.3.1 26 | * Ubuntu 22.04.3, LTS-R version 4.4.0 27 | * Ubuntu 22.04.3, LTS-R version 4.3.1 28 | * Ubuntu 22.04.3, LTS-R version 4.2.3 29 | -------------------------------------------------------------------------------- /man/cat_compare.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/compare.R 3 | \name{cat_compare} 4 | \alias{cat_compare} 5 | \title{Calculate the frequency of discrete values in one categorical variable for 6 | each group within another categorical variable} 7 | \usage{ 8 | cat_compare( 9 | data, 10 | row_cat, 11 | col_cat, 12 | na.rm.row = FALSE, 13 | na.rm.col = FALSE, 14 | na.rm = NULL, 15 | only = "", 16 | clean_names = getOption("tabbycat.clean_names"), 17 | na_label = getOption("tabbycat.na_label") 18 | ) 19 | } 20 | \arguments{ 21 | \item{data}{A dataframe containing the two variables of interest.} 22 | 23 | \item{row_cat}{The column name of a categorical variable whose distribution 24 | will be calculated for each group in \code{col_cat}.} 25 | 26 | \item{col_cat}{The column name of a categorical variable which will be 27 | split into groups and the distrubtion of \code{row_cat} calulated 28 | for each group.} 29 | 30 | \item{na.rm.row}{A boolean indicating whether to exclude NAs from the row 31 | results. The default is FALSE.} 32 | 33 | \item{na.rm.col}{A boolean indicating whether to exclude NAs from the column 34 | results. The default is FALSE.} 35 | 36 | \item{na.rm}{A boolean indicating whether to exclude NAs from both row and 37 | column results. This argument is provided as a convenience. It allows you 38 | to set \code{na.rm.row} and \code{na.rm.col} to the same value without 39 | having to specify them separately. If the value of \code{na.rm} is NULL, 40 | the argument is ignored. If it is not NULL it takes precendence. 41 | default is NULL.} 42 | 43 | \item{only}{A string indicating that only one set of frequency columns 44 | should be returned in the results. If \code{only} is either "n" or 45 | "number", only the number columns are returned. If \code{only} is either 46 | "p" or "percent", only the percent columns are returned. If \code{only} is 47 | any other value, both sets of columns are shown. The default value is an 48 | empty string, which means both sets of columns are shown.} 49 | 50 | \item{clean_names}{A boolean indicating whether the column names of the 51 | results tibble should be cleaned, so that any column names produced from 52 | data are converted to snake_case. The default is TRUE, but this can be 53 | changed with \code{options(tabbycat.clean_names = FALSE)}.} 54 | 55 | \item{na_label}{A string indicating the label to use for the columns that 56 | contain data for missing values. The default value is "na", but use this 57 | argument to set a different value if the default value collides with data 58 | in your dataset.} 59 | } 60 | \value{ 61 | A tibble showing the distribution of \code{row_cat} within each 62 | group in \code{col_cat}. 63 | } 64 | \description{ 65 | This function crosstabulates the frequencies of one categorical variable 66 | within the groups of another. The results are sorted on the values of the 67 | variable whose distribution is shown in each column i.e. the variable 68 | specified with \code{row_cat}. If this variable is a character vector it 69 | will be sorted alphabetically. If it is a factor it will be sorted in the 70 | order of its levels. 71 | } 72 | -------------------------------------------------------------------------------- /man/cat_contrast.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/contrast.R 3 | \name{cat_contrast} 4 | \alias{cat_contrast} 5 | \title{Calculate the frequency of discrete values in one categorical variable for 6 | each of two mutually exclusive groups within another categorical variable} 7 | \usage{ 8 | cat_contrast( 9 | data, 10 | row_cat, 11 | col_cat, 12 | col_group, 13 | na.rm.row = FALSE, 14 | na.rm.col = FALSE, 15 | na.rm = NULL, 16 | only = "", 17 | clean_names = getOption("tabbycat.clean_names"), 18 | na_label = getOption("tabbycat.na_label"), 19 | other_label = getOption("tabbycat.other_label") 20 | ) 21 | } 22 | \arguments{ 23 | \item{data}{A dataframe containing the two variables of interest.} 24 | 25 | \item{row_cat}{The column name of a categorical variable whose distribution 26 | should be calculated for each exclusive group in \code{col_cat}.} 27 | 28 | \item{col_cat}{The column name of a categorical variable that will be 29 | split into two exclusive groups, one containing observations with a 30 | particular value of that variable, and another containing all other 31 | observations.} 32 | 33 | \item{col_group}{The name of the group within \code{col_cat} that is 34 | used to split the observations into two exclusive groups: those that are 35 | in the group and those that are not in the group.} 36 | 37 | \item{na.rm.row}{A boolean indicating whether to exclude NAs from the row 38 | results. The default is FALSE.} 39 | 40 | \item{na.rm.col}{A boolean indicating whether to exclude NAs from the column 41 | results. The default is FALSE.} 42 | 43 | \item{na.rm}{A boolean indicating whether to exclude NAs from both row and 44 | column results. This argument is provided as a convenience. It allows you 45 | to set \code{na.rm.row} and \code{na.rm.col} to the same value without 46 | having to specify them separately. If the value of \code{na.rm} is NULL, 47 | the argument is ignored. If it is not NULL it takes precendence. 48 | default is NULL.} 49 | 50 | \item{only}{A string indicating that only one set of frequency columns 51 | should be returned in the results. If \code{only} is either "n" or 52 | "number", only the number columns are returned. If \code{only} is either 53 | "p" or "percent", only the percent columns are returned. If \code{only} is 54 | any other value, both sets of columns are shown. The default value is an 55 | empty string, which means both sets of columns are shown.} 56 | 57 | \item{clean_names}{A boolean indicating whether the column names of the 58 | results tibble should be cleaned, so that any column names produced from 59 | data are converted to snake_case. The default is TRUE, but this can be 60 | changed with \code{options(tabbycat.clean_names = FALSE)}.} 61 | 62 | \item{na_label}{A string indicating the label to use for the columns that 63 | contain data for missing values. The default value is "na", but use this 64 | argument to set a different value if the default value collides with data 65 | in your dataset.} 66 | 67 | \item{other_label}{A string indicating the label to use for the columns that 68 | contain data for observations not in the named group. The default value is 69 | "other", but use this argument to set a different value if the default 70 | value collides with data in your dataset.} 71 | } 72 | \value{ 73 | A tibble showing the distribution of \code{row_cat} within each of 74 | the two exclusive groups in \code{col_cat}. 75 | } 76 | \description{ 77 | This function shows the distrbution of values within given a categorical 78 | variable for one group within another categorical variable, and compares it 79 | with the distribution among all observations not in that group. Its purpose 80 | is to let you see quickly whether the distribution within that group differs 81 | from the distribution for the rest of the observations. The results are 82 | sorted in descending order of frequency for the named group i.e. the group 83 | named in \code{col_group}. 84 | } 85 | -------------------------------------------------------------------------------- /man/cat_count.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/count.R 3 | \name{cat_count} 4 | \alias{cat_count} 5 | \title{Count the frequency of discrete values in the column of a dataframe} 6 | \usage{ 7 | cat_count( 8 | data, 9 | cat, 10 | na.rm = FALSE, 11 | only = "", 12 | clean_names = getOption("tabbycat.clean_names") 13 | ) 14 | } 15 | \arguments{ 16 | \item{data}{A dataframe containing a categorical vector for which 17 | frequencies will be calculated.} 18 | 19 | \item{cat}{The column name of the categorical variable for which frequencies 20 | will be calculated.} 21 | 22 | \item{na.rm}{A boolean indicating whether to exclude NAs from the results. 23 | The default is FALSE.} 24 | 25 | \item{only}{A string indicating that only one of the frequency columns 26 | should be returned in the results. If \code{only} is either "n" or 27 | "number", only the number column is returned. If \code{only} is either 28 | "p" or "percent", only the percent column is returned. If \code{only} is 29 | any other value, both columns are shown. The default value is an empty 30 | string, which means both columns are shown.} 31 | 32 | \item{clean_names}{A boolean indicating whether the column names of the 33 | results tibble should be cleaned, so that any column names produced from 34 | data are converted to snake_case. The default is TRUE, but this can be 35 | changed with \code{options(tabbycat.clean_names = FALSE)}.} 36 | } 37 | \value{ 38 | A tibble showing the frequency of each value in \code{cat}. 39 | } 40 | \description{ 41 | This function differs from \code{cat_vcount} in that it operates on columns 42 | in dataframes rather than directly on vectors, which means it is more useful 43 | in pipelines but handles a narrower range of inputs. The results are sorted 44 | in descending order of frequency. 45 | } 46 | -------------------------------------------------------------------------------- /man/cat_summarise.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/summarise.R 3 | \name{cat_summarise} 4 | \alias{cat_summarise} 5 | \alias{cat_summarize} 6 | \title{Summarise the values of a numerical variable for each group within a 7 | categorical variable} 8 | \usage{ 9 | cat_summarise( 10 | data, 11 | cat, 12 | num, 13 | na.rm = FALSE, 14 | clean_names = getOption("tabbycat.clean_names") 15 | ) 16 | 17 | cat_summarize( 18 | data, 19 | cat, 20 | num, 21 | na.rm = FALSE, 22 | clean_names = getOption("tabbycat.clean_names") 23 | ) 24 | } 25 | \arguments{ 26 | \item{data}{A dataframe containing a categorical variable and numerical 27 | variable to summarise.} 28 | 29 | \item{cat}{The name of a column in \code{data} which is a categorical vector 30 | of discrete values for which summaries will be calculated.} 31 | 32 | \item{num}{The name of a column in \code{data} which is a numerical vector 33 | that will be summarised for each group.} 34 | 35 | \item{na.rm}{A boolean indicating whether to exclude NAs from the row 36 | results. Note that NAs are **always** ignored in calculating the summary 37 | statistics for \code{num} shown in each row, and the number of NAs that 38 | exist in \code{num} for each group in \code{cat} is shown in the 39 | \code{na} column of the results table. This argument controls whether a 40 | row of summary statistics is shown for observations that are NA in 41 | \code{cat}. The default is FALSE.} 42 | 43 | \item{clean_names}{A boolean indicating whether the column names of the 44 | results tibble should be cleaned, so that any column names produced from 45 | data are converted to snake_case. The default is TRUE, but this can be 46 | changed with \code{options(tabbycat.clean_names = FALSE)}.} 47 | } 48 | \value{ 49 | A tibble showing summary statistics for \code{num} for each group 50 | in \code{cat}. 51 | } 52 | \description{ 53 | The results are sorted on the values of the categorical variable i.e. 54 | the variable specified with \code{cat}. If this variable is a character 55 | vector it will be sorted alphabetically. If it is a factor it will be 56 | sorted in the order of its levels. This function can be called as either 57 | \code{cat_summarise} or \code{cat_summarize}. 58 | } 59 | -------------------------------------------------------------------------------- /man/cat_vcount.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/vcount.R 3 | \name{cat_vcount} 4 | \alias{cat_vcount} 5 | \title{Count the frequency of discrete values in a categorical vector} 6 | \usage{ 7 | cat_vcount( 8 | cat, 9 | na.rm = FALSE, 10 | only = "", 11 | clean_names = getOption("tabbycat.clean_names") 12 | ) 13 | } 14 | \arguments{ 15 | \item{cat}{A categorical vector for which frequencies will be calculated.} 16 | 17 | \item{na.rm}{A boolean indicating whether to exclude NAs from the results. 18 | The default is FALSE.} 19 | 20 | \item{only}{A string indicating that only one of the frequency columns 21 | should be returned in the results. If \code{only} is either "n" or 22 | "number", only the number column is returned. If \code{only} is either 23 | "p" or "percent", only the percent column is returned. If \code{only} is 24 | any other value, both columns are shown. The default value is an empty 25 | string, which means both columns are shown.} 26 | 27 | \item{clean_names}{A boolean indicating whether the column names of the 28 | results tibble should be cleaned, so that any column names produced from 29 | data are converted to snake_case. The default is TRUE, but this can be 30 | changed with \code{options(tabbycat.clean_names = FALSE)}.} 31 | } 32 | \value{ 33 | A tibble showing the frequency of each value in \code{cat}. 34 | } 35 | \description{ 36 | This function differs from \code{cat_count} in that it operates directly on 37 | vectors, rather than on columns in dataframes, which means it is less useful 38 | in pipelines but can handle a wider range of inputs. The results are sorted 39 | in descending order of frequency. 40 | } 41 | -------------------------------------------------------------------------------- /man/safe_max.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/summarise.R 3 | \name{safe_max} 4 | \alias{safe_max} 5 | \title{Calculate \code{max} but suppress the warning when all values are missing} 6 | \usage{ 7 | safe_max(x, na.rm = FALSE) 8 | } 9 | \arguments{ 10 | \item{x}{A numerical vector.} 11 | 12 | \item{na.rm}{A boolean indicating whether to remove NAs.} 13 | } 14 | \value{ 15 | The max of \code{x} or NA when values are missing. 16 | } 17 | \description{ 18 | This function is a drop-in replacement for \code{max}, which is used in 19 | \code{cat_summarise}. It suppresses the warning when all values are NA and 20 | na.rm is TRUE, and returns NA instead of -Inf. 21 | } 22 | \keyword{internal} 23 | -------------------------------------------------------------------------------- /man/safe_mean.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/summarise.R 3 | \name{safe_mean} 4 | \alias{safe_mean} 5 | \title{Calculate \code{mean} but return NA rather than NaN when values are missing} 6 | \usage{ 7 | safe_mean(x, na.rm = FALSE) 8 | } 9 | \arguments{ 10 | \item{x}{A numerical vector.} 11 | 12 | \item{na.rm}{A boolean indicating whether to remove NAs.} 13 | } 14 | \value{ 15 | The mean of \code{x} or NA when values are missing. 16 | } 17 | \description{ 18 | This function is a drop-in replacement for \code{mean}, which is used in 19 | \code{cat_summarise}. It returns NA rather than NaN when all values are NA. 20 | } 21 | \keyword{internal} 22 | -------------------------------------------------------------------------------- /man/safe_min.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/summarise.R 3 | \name{safe_min} 4 | \alias{safe_min} 5 | \title{Calculate \code{min} but suppress the warning when all values are missing} 6 | \usage{ 7 | safe_min(x, na.rm = FALSE) 8 | } 9 | \arguments{ 10 | \item{x}{A numerical vector.} 11 | 12 | \item{na.rm}{A boolean indicating whether to remove NAs.} 13 | } 14 | \value{ 15 | The min of \code{x} or NA when values are missing. 16 | } 17 | \description{ 18 | This function is a drop-in replacement for \code{min}, which is used in 19 | \code{cat_summarise}. It suppresses the warning when all values are NA and 20 | na.rm is TRUE, and returns NA instead of Inf. 21 | } 22 | \keyword{internal} 23 | -------------------------------------------------------------------------------- /man/tabbycat.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tabbycat-package.R 3 | \docType{package} 4 | \name{tabbycat} 5 | \alias{tabbycat} 6 | \alias{tabbycat-package} 7 | \title{tabbycat: Tabulate and summarise categorical data} 8 | \description{ 9 | Functions for tabulating and summarising categorical variables. 10 | } 11 | \author{ 12 | \strong{Maintainer}: Oliver Hawkins \email{oli@olihawkins.com} 13 | 14 | } 15 | \keyword{internal} 16 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # tabbycat 2 | 3 | `tabbycat` is an R package for tabulating and summarising categorical variables. Most functions are designed to work with dataframes, and use the tidyverse idiom of taking the dataframe as the first argument so they work within pipelines. Equivalent functions that operate directly on vectors are also provided where it makes sense. This package aims to make exploratory data analysis involving categorical variables quicker, simpler and more robust. 4 | 5 | ## Status 6 | 7 | This package is fully functional. Please let me know if you run into any issues. 8 | 9 | 10 | [![CRAN status](https://www.r-pkg.org/badges/version/tabbycat)](https://CRAN.R-project.org/package=tabbycat) 11 | [![R-CMD-check](https://github.com/olihawkins/tabbycat/workflows/R-CMD-check/badge.svg)](https://github.com/olihawkins/tabbycat/actions) 12 | [![codecov](https://codecov.io/gh/olihawkins/tabbycat/branch/main/graph/badge.svg?token=W177A4T66K)](https://codecov.io/gh/olihawkins/tabbycat) 13 | 14 | 15 | ## Contents 16 | 17 | * [Installation](#1-installation) 18 | * [Counting functions](#2-counting-functions) 19 | * [Comparison functions](#3-comparison-functions) 20 | * [Summarising functions](#4-summarising-functions) 21 | * [Other API features](#5-other-api-features) 22 | 23 | ## List of functions 24 | 25 | * [`cat_count`](#cat_count) 26 | * [`cat_vcount`](#cat_vcount) 27 | * [`cat_compare`](#cat_compare) 28 | * [`cat_contrast`](#cat_contrast) 29 | * [`cat_summarise`](#cat_summarise) 30 | 31 | ## 1. Installation 32 | 33 | Install the latest release from CRAN. 34 | 35 | ```r 36 | install.packages("tabbycat") 37 | ``` 38 | 39 | Or install the development version from GitHub. 40 | 41 | ```r 42 | install.packages("remotes") 43 | remotes::install_github("olihawkins/tabbycat") 44 | ``` 45 | 46 | ## 2. Counting functions 47 | 48 | ### `cat_count` 49 | 50 | `cat_count` calculates the frequency of discrete values in the column of a dataframe and returns the counts and percentages as a tibble. This function operates on columns in dataframes, but an equivalent function called `cat_vcount` provides the same functionality for vectors. Call the function with a dataframe and the name of the column to count. 51 | 52 | ```r 53 | # Load tabbycat and the mpg dataset 54 | library(tabbycat) 55 | mpg <- ggplot2::mpg 56 | 57 | cat_count(mpg, "class") 58 | 59 | # A tibble: 7 × 3 60 | # class number percent 61 | # 62 | # 1 suv 62 0.265 63 | # 2 compact 47 0.201 64 | # 3 midsize 41 0.175 65 | # 4 subcompact 35 0.150 66 | # 5 pickup 33 0.141 67 | # 6 minivan 11 0.0470 68 | # 7 2seater 5 0.0214 69 | ``` 70 | 71 | ### `cat_vcount` 72 | 73 | `cat_vcount` is equivalent to `cat_count` but works directly on vectors: it calculates the frequency of discrete values in a vector and returns the counts and percentages as a tibble. `cat_vcount` can handle a wider range of inputs than `cat_count` but it does not fit as easily into pipelines. Call the function with a vector to count. 74 | 75 | ```r 76 | # Load tabbycat and the mpg dataset 77 | library(tabbycat) 78 | mpg <- ggplot2::mpg 79 | 80 | cat_vcount(mpg$class) 81 | 82 | # A tibble: 7 × 3 83 | # class number percent 84 | # 85 | # 1 suv 62 0.265 86 | # 2 compact 47 0.201 87 | # 3 midsize 41 0.175 88 | # 4 subcompact 35 0.150 89 | # 5 pickup 33 0.141 90 | # 6 minivan 11 0.0470 91 | # 7 2seater 5 0.0214 92 | ``` 93 | 94 | ### 2.1. NA handling for counting functions 95 | 96 | By default, if any NAs exist in the data their frequency is included in the results, but you can remove this by setting the `na.rm` argument to `TRUE`. This means the percentages are caclulated excluding NAs (i.e. based on the counts shown in the table). 97 | 98 | ```r 99 | # Set the class of the first observation to NA 100 | mpg[1, ]$class <- NA 101 | 102 | # Call cat_count with defaults 103 | cat_count(mpg, "class") 104 | 105 | # A tibble: 8 × 3 106 | # class number percent 107 | # 108 | # 1 suv 62 0.265 109 | # 2 compact 46 0.197 110 | # 3 midsize 41 0.175 111 | # 4 subcompact 35 0.150 112 | # 5 pickup 33 0.141 113 | # 6 minivan 11 0.0470 114 | # 7 2seater 5 0.0214 115 | # 8 NA 1 0.00427 116 | 117 | # Call cat_count with na.rm set to TRUE 118 | cat_count(mpg, "class", na.rm = TRUE) 119 | 120 | # A tibble: 7 × 3 121 | # class number percent 122 | # 123 | # 1 suv 62 0.266 124 | # 2 compact 46 0.197 125 | # 3 midsize 41 0.176 126 | # 4 subcompact 35 0.150 127 | # 5 pickup 33 0.142 128 | # 6 minivan 11 0.0472 129 | # 7 2seater 5 0.0215 130 | ``` 131 | 132 | ## 3. Comparison functions 133 | 134 | ### `cat_compare` 135 | 136 | `cat_compare` calculates the distribution of one categorical variable within the groups of another categorical variable and returns the counts and percentages as a tibble. It is essentially a cross tabulation of the two variables with column-wise percentages. Call the function with a dataframe and provide: 137 | 138 | 1. `row_cat` -- the variable to distribute down the rows 139 | 2. `col_cat` -- the variable to split into groups along the columns 140 | 141 | ```r 142 | # Load tabbycat and the mpg dataset 143 | library(tabbycat) 144 | mpg <- ggplot2::mpg 145 | 146 | cat_compare(mpg, "class", "cyl") 147 | 148 | # A tibble: 7 × 9 149 | # class n_4 n_5 n_6 n_8 p_4 p_5 p_6 p_8 150 | # 151 | # 1 2seater 0 0 0 5 0 0 0 0.0714 152 | # 2 compact 32 2 13 0 0.395 0.5 0.165 0 153 | # 3 midsize 16 0 23 2 0.198 0 0.291 0.0286 154 | # 4 minivan 1 0 10 0 0.0123 0 0.127 0 155 | # 5 pickup 3 0 10 20 0.0370 0 0.127 0.286 156 | # 6 subcompact 21 2 7 5 0.259 0.5 0.0886 0.0714 157 | # 7 suv 8 0 16 38 0.0988 0 0.203 0.543 158 | ``` 159 | 160 | ### `cat_contrast` 161 | 162 | `cat_contrast` caculates the frequency of discrete values in one categorical variable for each of two mutually exclusive groups within another categorical variable and returns the counts and percentages as a tibble. This lets you see if the distribution of a variable within a particular group differs from the distribution in the rest of the dataset. Call the function with a dataframe and provide: 163 | 164 | 1. `row_cat` -- the variable to distribute down the rows 165 | 2. `col_cat` -- the variable to split into two exclusive groups along the columns 166 | 3. `col_group` -- the name of the group in `col_cat` to contrast against the rest of the dataset 167 | 168 | ```r 169 | # Load tabbycat and the mpg dataset 170 | library(tabbycat) 171 | mpg <- ggplot2::mpg 172 | 173 | cat_contrast(mpg, "class", "manufacturer", "toyota") 174 | 175 | # # A tibble: 7 × 5 176 | # class n_toyota n_other p_toyota p_other 177 | # 178 | # 1 compact 12 35 0.353 0.175 179 | # 2 suv 8 54 0.235 0.27 180 | # 3 midsize 7 34 0.206 0.17 181 | # 4 pickup 7 26 0.206 0.13 182 | # 5 2seater 0 5 0 0.025 183 | # 6 minivan 0 11 0 0.055 184 | # 7 subcompact 0 35 0 0.175 185 | ``` 186 | 187 | ### 3.1. NA handling for comparison functions 188 | 189 | By default, if any NAs exist in the data their frequency is included in both the row and column results. So there is a row for observations containing NAs in `row_cat`, and columns showing the number and percentage of NAs found in `col_cat` for each group in `row_cat`. 190 | 191 | 192 | ```r 193 | # Set the class of the first observation to NA 194 | mpg[1, ]$class <- NA 195 | 196 | # Set the manufacturer of the second observation to NA 197 | mpg[2, ]$manufacturer <- NA 198 | 199 | # Call cat_contrast with defaults 200 | cat_contrast(mpg, "class", "manufacturer", "toyota") 201 | 202 | # A tibble: 8 × 7 203 | # class n_toyota n_other n_na p_toyota p_other p_na 204 | # 205 | # 1 compact 12 33 1 0.353 0.166 1 206 | # 2 suv 8 54 0 0.235 0.271 0 207 | # 3 midsize 7 34 0 0.206 0.171 0 208 | # 4 pickup 7 26 0 0.206 0.131 0 209 | # 5 2seater 0 5 0 0 0.0251 0 210 | # 6 minivan 0 11 0 0 0.0553 0 211 | # 7 subcompact 0 35 0 0 0.176 0 212 | # 8 NA 0 1 0 0 0.00503 0 213 | ``` 214 | 215 | This default behaviour can be changed through three boolean arguments: `na.rm.row`, `na.rm.col`, and `na.rm`. Setting each of these arguments to TRUE has the following effects: 216 | 217 | - `na.rm.row` -- removes the row for NAs from the row results 218 | - `na.rm.col` -- removes the columns for NAs from the column results 219 | - `na.rm` -- removes both the rows and columns of NAs from the results 220 | 221 | ```r 222 | # Call cat_contrast with na.rm.row set to TRUE 223 | cat_contrast(mpg, "class", "manufacturer", "toyota", na.rm.row = TRUE) 224 | 225 | # A tibble: 7 × 7 226 | # class n_toyota n_other n_na p_toyota p_other p_na 227 | # 228 | # 1 compact 12 33 1 0.353 0.167 1 229 | # 2 suv 8 54 0 0.235 0.273 0 230 | # 3 midsize 7 34 0 0.206 0.172 0 231 | # 4 pickup 7 26 0 0.206 0.131 0 232 | # 5 2seater 0 5 0 0 0.0253 0 233 | # 6 minivan 0 11 0 0 0.0556 0 234 | # 7 subcompact 0 35 0 0 0.177 0 235 | 236 | # Call cat_contrast with na.rm.col set to TRUE 237 | cat_contrast(mpg, "class", "manufacturer", "toyota", na.rm.col = TRUE) 238 | 239 | # A tibble: 8 × 5 240 | # class n_toyota n_other p_toyota p_other 241 | # 242 | # 1 compact 12 33 0.353 0.166 243 | # 2 suv 8 54 0.235 0.271 244 | # 3 midsize 7 34 0.206 0.171 245 | # 4 pickup 7 26 0.206 0.131 246 | # 5 2seater 0 5 0 0.0251 247 | # 6 minivan 0 11 0 0.0553 248 | # 7 subcompact 0 35 0 0.176 249 | # 8 NA 0 1 0 0.00503 250 | 251 | # Call cat_contrast with na.rm set to TRUE 252 | cat_contrast(mpg, "class", "manufacturer", "toyota", na.rm = TRUE) 253 | 254 | # A tibble: 7 × 5 255 | # class n_toyota n_other p_toyota p_other 256 | # 257 | # 1 compact 12 33 0.353 0.167 258 | # 2 suv 8 54 0.235 0.273 259 | # 3 midsize 7 34 0.206 0.172 260 | # 4 pickup 7 26 0.206 0.131 261 | # 5 2seater 0 5 0 0.0253 262 | # 6 minivan 0 11 0 0.0556 263 | # 7 subcompact 0 35 0 0.177 264 | ``` 265 | 266 | Note that while removing the columns for NAs from the column results simply changes which columns are shown in the results table, removing the row for NAs from the row results affects the data in the table, because the percentage frequencies are calculated based on the rows shown. In other words, `na.rm.row` lets you calculate the percentage frequencies with or without NAs. This is consistent with the behaviour of `cat_count` and `cat_vcount`. 267 | 268 | The `na.rm` argument is a convenience which simply sets `na.rm.row` and `na.rm.col` to the same value. If it is set, it takes priority over both of those arguments, otherwise it is ignored. 269 | 270 | ## 4. Summarising functions 271 | 272 | ### `cat_summarise` 273 | 274 | `cat_summarise` (or `cat_summarize`) calculates summary statistics for a numerical variable for each group within a categorical variable. Call the function with a dataframe and provide: 275 | 276 | 1. `cat` -- the categorical variable for which summaries will be calculated 277 | 2. `num` -- the numerical variable to summarise 278 | 279 | ```r 280 | # Load tabbycat and the mpg dataset 281 | library(tabbycat) 282 | mpg <- ggplot2::mpg 283 | 284 | cat_summarise(mpg, "class", "hwy") 285 | 286 | # A tibble: 7 × 10 287 | # class n na mean sd min lq med uq max 288 | # 289 | # 1 2seater 5 0 24.8 1.30 23 24 25 26 26 290 | # 2 compact 47 0 28.3 3.78 23 26 27 29 44 291 | # 3 midsize 41 0 27.3 2.14 23 26 27 29 32 292 | # 4 minivan 11 0 22.4 2.06 17 22 23 24 24 293 | # 5 pickup 33 0 16.9 2.27 12 16 17 18 22 294 | # 6 subcompact 35 0 28.1 5.38 20 24.5 26 30.5 44 295 | # 7 suv 62 0 18.1 2.98 12 17 17.5 19 27 296 | ``` 297 | 298 | ### 4.1. NA handling for `cat_summarise` 299 | 300 | In `cat_summarise`, NAs are **always** ignored in calculating the summary statistics for each group in `cat`. But the number of NAs in each group is shown in a column in the table so you can see the potential impact of NAs on the calculation of these statistics. By default, a row showing summary statistics for observations that are NA in `cat` is included in the table, but this can be turned off by setting `na.rm` to `TRUE`. You can see these behaviours in the following example. 301 | 302 | ```r 303 | # Set the class of the first three observations to NA 304 | mpg[1:3, ]$class <- NA 305 | 306 | # Set the hwy (miles per gallon) of the fourth observation to NA 307 | mpg[4, ]$hwy <- NA 308 | 309 | # Call cat_summarise with defaults 310 | cat_summarise(mpg, "class", "hwy") 311 | 312 | # A tibble: 8 × 10 313 | # class n na mean sd min lq med uq max 314 | # 315 | # 1 2seater 5 0 24.8 1.30 23 24 25 26 26 316 | # 2 compact 44 1 28.2 3.92 23 26 27 29 44 317 | # 3 midsize 41 0 27.3 2.14 23 26 27 29 32 318 | # 4 minivan 11 0 22.4 2.06 17 22 23 24 24 319 | # 5 pickup 33 0 16.9 2.27 12 16 17 18 22 320 | # 6 subcompact 35 0 28.1 5.38 20 24.5 26 30.5 44 321 | # 7 suv 62 0 18.1 2.98 12 17 17.5 19 27 322 | # 8 NA 3 0 29.7 1.15 29 29 29 30 31 323 | 324 | # Call cat_summarise with na.rm set to TRUE 325 | cat_summarise(mpg, "class", "hwy", na.rm = TRUE) 326 | 327 | # A tibble: 7 × 10 328 | # class n na mean sd min lq med uq max 329 | # 330 | # 1 2seater 5 0 24.8 1.30 23 24 25 26 26 331 | # 2 compact 44 1 28.2 3.92 23 26 27 29 44 332 | # 3 midsize 41 0 27.3 2.14 23 26 27 29 32 333 | # 4 minivan 11 0 22.4 2.06 17 22 23 24 24 334 | # 5 pickup 33 0 16.9 2.27 12 16 17 18 22 335 | # 6 subcompact 35 0 28.1 5.38 20 24.5 26 30.5 44 336 | # 7 suv 62 0 18.1 2.98 12 17 17.5 19 27 337 | ``` 338 | 339 | ## 5. Other API features 340 | 341 | There are some arguments that are found in most, if not all, the package functions. 342 | 343 | ### 5.1. `clean_names` 344 | 345 | All functions in the package take a boolean argument called `clean_names`. This argument controls whether column names derived from values in the data should be cleaned with `janitor::clean_names`, which converts them to snake case. 346 | 347 | The default value of this argument is `TRUE`. This is in order to produce more readable results tables and to avoid the creation of columns with spaces in the names, which are harder to use interactively. 348 | 349 | If you prefer not to have this behaviour, you can disable it on each function call by setting `clean_names` to `FALSE`, or globally using the package options. 350 | 351 | ```r 352 | options(tabbycat.clean_names = FALSE) 353 | ``` 354 | 355 | ### 5.2. `only` 356 | 357 | Counting and comparison functions take a string argument called `only`. This is used to return just the number or percentage columns for frequencies in the results. Valid values for `only` are : 358 | 359 | - `"n"` or `"number"` -- to return just the number columns 360 | - `"p"` or `"percent"` -- to return just the percentage columns 361 | - *any other string* -- to return both the number and percentage columns 362 | 363 | The defalut value is an empty string, meaning all columns are returned by default. 364 | 365 | ### 5.3. Labelling arguments for comparison functions 366 | 367 | The comparison functions need names to use as labels for the NA columns, and in the case of `cat_contrast`, for the columns showing frequencies for the observations that are not in the target group. 368 | 369 | These labels are controlled with the arguments `na_label` and `other_label`. The default values are `"na"` and `"other"` respectively, but you can change them if they colllide with data in your dataset. You can use the following package options if it makes more sense to change them globally when working with a particular dataset. 370 | 371 | ```r 372 | options(tabbycat.na_label = "missing") 373 | options(tabbycat.other_label = "remainder") 374 | ``` 375 | -------------------------------------------------------------------------------- /tabbycat.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 4 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace 22 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(tabbycat) 3 | 4 | test_check("tabbycat") 5 | -------------------------------------------------------------------------------- /tests/testthat/test_compare.R: -------------------------------------------------------------------------------- 1 | # Test compare.R 2 | 3 | # Setup ---------------------------------------------------------------------- 4 | 5 | data <- mtcars %>% 6 | tibble::rownames_to_column("model") %>% 7 | tibble::as_tibble() %>% 8 | dplyr::mutate( 9 | manufacturer = stringr::str_split(model, " ", simplify = TRUE)[, 1]) 10 | 11 | data$cyl[1] <- NA 12 | data$vs[2] <- NA 13 | 14 | # Tests: cat_compare -------------------------------------------------------- 15 | 16 | test_that("cat_compare rejects a data argument that is not a dataframe", { 17 | 18 | msg <- "The \"data\" argument is not a dataframe." 19 | expect_error(cat_compare(NULL, "cyl", "vs"), msg) 20 | expect_error(cat_compare(NA, "cyl", "vs"), msg) 21 | expect_error(cat_compare(1:10, "cyl", "vs"), msg) 22 | expect_error(cat_compare(LETTERS[1:10], "cyl", "vs"), msg) 23 | expect_error(cat_compare(c(TRUE, FALSE), "cyl", "vs"), msg) 24 | expect_error(cat_compare(list(), "cyl", "vs"), msg) 25 | }) 26 | 27 | test_that("cat_compare rejects a data argument that has no rows", { 28 | 29 | msg <- "The \"data\" argument is empty." 30 | expect_error(cat_compare(data.frame(), "cyl", "vs"), msg) 31 | }) 32 | 33 | test_that("cat_compare rejects invalid row_cat arguments", { 34 | 35 | msg <- "Invalid \"row_cat\" argument. Must be a character vector of length one." 36 | expect_error(cat_compare(data, NULL, "vs"), msg) 37 | expect_error(cat_compare(data, NA, "vs"), msg) 38 | expect_error(cat_compare(data, 1:10, "vs"), msg) 39 | expect_error(cat_compare(data, LETTERS[1:10], "vs"), msg) 40 | expect_error(cat_compare(data, c(TRUE, FALSE), "vs"), msg) 41 | expect_error(cat_compare(data, list(), "vs"), msg) 42 | }) 43 | 44 | test_that("cat_compare rejects a row_cat argument that is not a column in the data", { 45 | 46 | msg <- "'notacolumn' is not a column in the dataframe." 47 | expect_error(cat_compare(data, "notacolumn", "vs"), msg) 48 | }) 49 | 50 | test_that("cat_compare rejects invalid col_cat arguments", { 51 | 52 | msg <- "Invalid \"col_cat\" argument. Must be a character vector of length one." 53 | expect_error(cat_compare(data, "cyl", NULL), msg) 54 | expect_error(cat_compare(data, "cyl", NA), msg) 55 | expect_error(cat_compare(data, "cyl", 1:10), msg) 56 | expect_error(cat_compare(data, "cyl", LETTERS[1:10]), msg) 57 | expect_error(cat_compare(data, "cyl", c(TRUE, FALSE)), msg) 58 | expect_error(cat_compare(data, "cyl", list()), msg) 59 | }) 60 | 61 | test_that("cat_compare rejects a col_cat argument that is not a column in the data", { 62 | 63 | msg <- "'notacolumn' is not a column in the dataframe." 64 | expect_error(cat_compare(data, "cyl", "notacolumn"), msg) 65 | }) 66 | 67 | test_that("cat_compare rejects invalid na.rm.row arguments", { 68 | 69 | msg <- "Invalid \"na.rm.row\" argument. Must be either TRUE or FALSE." 70 | expect_error(cat_compare(data, "cyl", "vs", na.rm.row = NULL), msg) 71 | expect_error(cat_compare(data, "cyl", "vs", na.rm.row = NA), msg) 72 | expect_error(cat_compare(data, "cyl", "vs", na.rm.row = 1), msg) 73 | expect_error(cat_compare(data, "cyl", "vs", na.rm.row = ""), msg) 74 | expect_error(cat_compare(data, "cyl", "vs", na.rm.row = 1:10), msg) 75 | expect_error(cat_compare(data, "cyl", "vs", na.rm.row = LETTERS[1:10]), msg) 76 | expect_error(cat_compare(data, "cyl", "vs", na.rm.row = c(TRUE, FALSE)), msg) 77 | expect_error(cat_compare(data, "cyl", "vs", na.rm.row = list()), msg) 78 | }) 79 | 80 | test_that("cat_compare rejects invalid na.rm.col arguments", { 81 | 82 | msg <- "Invalid \"na.rm.col\" argument. Must be either TRUE or FALSE." 83 | expect_error(cat_compare(data, "cyl", "vs", na.rm.col = NULL), msg) 84 | expect_error(cat_compare(data, "cyl", "vs", na.rm.col = NA), msg) 85 | expect_error(cat_compare(data, "cyl", "vs", na.rm.col = 1), msg) 86 | expect_error(cat_compare(data, "cyl", "vs", na.rm.col = ""), msg) 87 | expect_error(cat_compare(data, "cyl", "vs", na.rm.col = 1:10), msg) 88 | expect_error(cat_compare(data, "cyl", "vs", na.rm.col = LETTERS[1:10]), msg) 89 | expect_error(cat_compare(data, "cyl", "vs", na.rm.col = c(TRUE, FALSE)), msg) 90 | expect_error(cat_compare(data, "cyl", "vs", na.rm.col = list()), msg) 91 | }) 92 | 93 | test_that("cat_compare rejects invalid na.rm arguments", { 94 | 95 | msg <- "Invalid \"na.rm\" argument. Must be either NULL, TRUE or FALSE." 96 | expect_error(cat_compare(data, "cyl", "vs", na.rm = NA), msg) 97 | expect_error(cat_compare(data, "cyl", "vs", na.rm = 1), msg) 98 | expect_error(cat_compare(data, "cyl", "vs", na.rm = ""), msg) 99 | expect_error(cat_compare(data, "cyl", "vs", na.rm = 1:10), msg) 100 | expect_error(cat_compare(data, "cyl", "vs", na.rm = LETTERS[1:10]), msg) 101 | expect_error(cat_compare(data, "cyl", "vs", na.rm = c(TRUE, FALSE)), msg) 102 | expect_error(cat_compare(data, "cyl", "vs", na.rm = list()), msg) 103 | }) 104 | 105 | test_that("cat_compare rejects invalid clean_names arguments", { 106 | 107 | msg <- "Invalid \"clean_names\" argument. Must be either TRUE or FALSE." 108 | expect_error(cat_compare(data, "cyl", "vs", clean_names = NULL), msg) 109 | expect_error(cat_compare(data, "cyl", "vs", clean_names = NA), msg) 110 | expect_error(cat_compare(data, "cyl", "vs", clean_names = 1), msg) 111 | expect_error(cat_compare(data, "cyl", "vs", clean_names = ""), msg) 112 | expect_error(cat_compare(data, "cyl", "vs", clean_names = 1:10), msg) 113 | expect_error(cat_compare(data, "cyl", "vs", clean_names = LETTERS[1:10]), msg) 114 | expect_error(cat_compare(data, "cyl", "vs", clean_names = c(TRUE, FALSE)), msg) 115 | expect_error(cat_compare(data, "cyl", "vs", clean_names = list()), msg) 116 | }) 117 | 118 | test_that("cat_compare rejects invalid only arguments", { 119 | 120 | msg <- "Invalid \"only\" argument. Must be a character vector of length one." 121 | expect_error(cat_compare(data, "cyl", "vs", only = NULL), msg) 122 | expect_error(cat_compare(data, "cyl", "vs", only = NA), msg) 123 | expect_error(cat_compare(data, "cyl", "vs", only = 1), msg) 124 | expect_error(cat_compare(data, "cyl", "vs", only = TRUE), msg) 125 | expect_error(cat_compare(data, "cyl", "vs", only = 1:10), msg) 126 | expect_error(cat_compare(data, "cyl", "vs", only = LETTERS[1:10]), msg) 127 | expect_error(cat_compare(data, "cyl", "vs", only = c(TRUE, FALSE)), msg) 128 | expect_error(cat_compare(data, "cyl", "vs", only = list()), msg) 129 | }) 130 | 131 | test_that("cat_compare rejects invalid na_label arguments", { 132 | 133 | msg <- "Invalid \"na_label\" argument. Must be a character vector of length one." 134 | expect_error(cat_compare(data, "cyl", "vs", na_label = NULL), msg) 135 | expect_error(cat_compare(data, "cyl", "vs", na_label = NA), msg) 136 | expect_error(cat_compare(data, "cyl", "vs", na_label = 1), msg) 137 | expect_error(cat_compare(data, "cyl", "vs", na_label = TRUE), msg) 138 | expect_error(cat_compare(data, "cyl", "vs", na_label = 1:10), msg) 139 | expect_error(cat_compare(data, "cyl", "vs", na_label = LETTERS[1:10]), msg) 140 | expect_error(cat_compare(data, "cyl", "vs", na_label = c(TRUE, FALSE)), msg) 141 | expect_error(cat_compare(data, "cyl", "vs", na_label = list()), msg) 142 | }) 143 | 144 | test_that("cat_compare returns correct data with defaults", { 145 | 146 | expected <- tibble::tibble( 147 | cyl = c(4, 6, 8, NA), 148 | n_0 = c(1, 1, 14, 1), 149 | n_1 = c(10, 4, 0, 0), 150 | n_na = c(0, 1, 0, 0), 151 | p_0 = c(0.05882353, 0.05882353, 0.82352941, 0.05882353), 152 | p_1 = c(.71428571, 0.28571429, 0.0, 0.0), 153 | p_na = c(0, 1, 0, 0)) 154 | observed <- cat_compare(data, "cyl", "vs") 155 | expect_equal(observed, expected) 156 | }) 157 | 158 | test_that("cat_compare returns correct data with a valid na.rm.row argument", { 159 | 160 | expected <- tibble::tibble( 161 | cyl = c(4, 6, 8, NA), 162 | n_0 = c(1, 1, 14, 1), 163 | n_1 = c(10, 4, 0, 0), 164 | n_na = c(0, 1, 0, 0), 165 | p_0 = c(0.05882353, 0.05882353, 0.82352941, 0.05882353), 166 | p_1 = c(.71428571, 0.28571429, 0.0, 0.0), 167 | p_na = c(0, 1, 0, 0)) 168 | observed <- cat_compare(data, "cyl", "vs", na.rm.row = FALSE) 169 | expect_equal(observed, expected) 170 | 171 | expected <- tibble::tibble( 172 | cyl = c(4, 6, 8), 173 | n_0 = c(1, 1, 14), 174 | n_1 = c(10, 4, 0), 175 | n_na = c(0, 1, 0), 176 | p_0 = c(0.0625, 0.0625, 0.8750), 177 | p_1 = c(.71428571, 0.28571429, 0.0), 178 | p_na = c(0, 1, 0)) 179 | observed <- cat_compare(data, "cyl", "vs", na.rm.row = TRUE) 180 | expect_equal(observed, expected) 181 | }) 182 | 183 | test_that("cat_compare returns correct data with a valid na.rm.col argument", { 184 | 185 | expected <- tibble::tibble( 186 | cyl = c(4, 6, 8, NA), 187 | n_0 = c(1, 1, 14, 1), 188 | n_1 = c(10, 4, 0, 0), 189 | n_na = c(0, 1, 0, 0), 190 | p_0 = c(0.05882353, 0.05882353, 0.82352941, 0.05882353), 191 | p_1 = c(.71428571, 0.28571429, 0.0, 0.0), 192 | p_na = c(0, 1, 0, 0)) 193 | observed <- cat_compare(data, "cyl", "vs", na.rm.col = FALSE) 194 | expect_equal(observed, expected) 195 | 196 | expected <- tibble::tibble( 197 | cyl = c(4, 6, 8, NA), 198 | n_0 = c(1, 1, 14, 1), 199 | n_1 = c(10, 4, 0, 0), 200 | p_0 = c(0.05882353, 0.05882353, 0.82352941, 0.05882353), 201 | p_1 = c(.71428571, 0.28571429, 0.0, 0.0)) 202 | observed <- cat_compare(data, "cyl", "vs", na.rm.col = TRUE) 203 | expect_equal(observed, expected) 204 | }) 205 | 206 | test_that("cat_compare returns correct data with a valid na.rm argument", { 207 | 208 | expected <- tibble::tibble( 209 | cyl = c(4, 6, 8, NA), 210 | n_0 = c(1, 1, 14, 1), 211 | n_1 = c(10, 4, 0, 0), 212 | n_na = c(0, 1, 0, 0), 213 | p_0 = c(0.05882353, 0.05882353, 0.82352941, 0.05882353), 214 | p_1 = c(.71428571, 0.28571429, 0.0, 0.0), 215 | p_na = c(0, 1, 0, 0)) 216 | observed <- cat_compare(data, "cyl", "vs", na.rm = FALSE) 217 | expect_equal(observed, expected) 218 | 219 | expected <- tibble::tibble( 220 | cyl = c(4, 6, 8), 221 | n_0 = c(1, 1, 14), 222 | n_1 = c(10, 4, 0), 223 | p_0 = c(0.0625, 0.0625, 0.8750), 224 | p_1 = c(.71428571, 0.28571429, 0.0)) 225 | observed <- cat_compare(data, "cyl", "vs", na.rm = TRUE) 226 | expect_equal(observed, expected) 227 | }) 228 | 229 | test_that("cat_compare returns correct data with a valid only argument", { 230 | 231 | expected <- tibble::tibble( 232 | cyl = c(4, 6, 8, NA), 233 | n_0 = c(1, 1, 14, 1), 234 | n_1 = c(10, 4, 0, 0), 235 | n_na = c(0, 1, 0, 0), 236 | p_0 = c(0.05882353, 0.05882353, 0.82352941, 0.05882353), 237 | p_1 = c(.71428571, 0.28571429, 0.0, 0.0), 238 | p_na = c(0, 1, 0, 0)) 239 | observed <- cat_compare(data, "cyl", "vs", only = "ignore") 240 | expect_equal(observed, expected) 241 | 242 | expected_number <- tibble::tibble( 243 | cyl = c(4, 6, 8, NA), 244 | n_0 = c(1, 1, 14, 1), 245 | n_1 = c(10, 4, 0, 0), 246 | n_na = c(0, 1, 0, 0)) 247 | 248 | observed <- cat_compare(data, "cyl", "vs", only = "n") 249 | expect_equal(observed, expected_number) 250 | 251 | observed <- cat_compare(data, "cyl", "vs", only = "number") 252 | expect_equal(observed, expected_number) 253 | 254 | observed <- cat_compare(data, "cyl", "vs", only = " number ") 255 | expect_equal(observed, expected_number) 256 | 257 | expected_percent <- tibble::tibble( 258 | cyl = c(4, 6, 8, NA), 259 | p_0 = c(0.05882353, 0.05882353, 0.82352941, 0.05882353), 260 | p_1 = c(.71428571, 0.28571429, 0.0, 0.0), 261 | p_na = c(0, 1, 0, 0)) 262 | 263 | observed <- cat_compare(data, "cyl", "vs", only = "p") 264 | expect_equal(observed, expected_percent) 265 | 266 | observed <- cat_compare(data, "cyl", "vs", only = "percent") 267 | expect_equal(observed, expected_percent) 268 | 269 | observed <- cat_compare(data, "cyl", "vs", only = " percent ") 270 | expect_equal(observed, expected_percent) 271 | }) 272 | 273 | test_that("cat_compare returns correct data with a valid clean_names argument", { 274 | 275 | data$Cyl <- data$cyl 276 | 277 | expected <- tibble::tibble( 278 | cyl = c(4, 6, 8, NA), 279 | n_0 = c(1, 1, 14, 1), 280 | n_1 = c(10, 4, 0, 0), 281 | n_na = c(0, 1, 0, 0), 282 | p_0 = c(0.05882353, 0.05882353, 0.82352941, 0.05882353), 283 | p_1 = c(.71428571, 0.28571429, 0.0, 0.0), 284 | p_na = c(0, 1, 0, 0)) 285 | observed <- cat_compare(data, "Cyl", "vs", clean_names = TRUE) 286 | expect_equal(observed, expected) 287 | 288 | expected <- tibble::tibble( 289 | Cyl = c(4, 6, 8, NA), 290 | n_0 = c(1, 1, 14, 1), 291 | n_1 = c(10, 4, 0, 0), 292 | n_na = c(0, 1, 0, 0), 293 | p_0 = c(0.05882353, 0.05882353, 0.82352941, 0.05882353), 294 | p_1 = c(.71428571, 0.28571429, 0.0, 0.0), 295 | p_na = c(0, 1, 0, 0)) 296 | observed <- cat_compare(data, "Cyl", "vs", clean_names = FALSE) 297 | expect_equal(observed, expected) 298 | }) 299 | 300 | test_that("cat_compare uses option for default clean_names argument", { 301 | 302 | data$Cyl <- data$cyl 303 | restore_option <- getOption("tabbycat.clean_names") 304 | 305 | options(tabbycat.clean_names = TRUE) 306 | expected <- tibble::tibble( 307 | cyl = c(4, 6, 8, NA), 308 | n_0 = c(1, 1, 14, 1), 309 | n_1 = c(10, 4, 0, 0), 310 | n_na = c(0, 1, 0, 0), 311 | p_0 = c(0.05882353, 0.05882353, 0.82352941, 0.05882353), 312 | p_1 = c(.71428571, 0.28571429, 0.0, 0.0), 313 | p_na = c(0, 1, 0, 0)) 314 | observed <- cat_compare(data, "Cyl", "vs") 315 | expect_equal(observed, expected) 316 | 317 | options(tabbycat.clean_names = FALSE) 318 | expected <- tibble::tibble( 319 | Cyl = c(4, 6, 8, NA), 320 | n_0 = c(1, 1, 14, 1), 321 | n_1 = c(10, 4, 0, 0), 322 | n_na = c(0, 1, 0, 0), 323 | p_0 = c(0.05882353, 0.05882353, 0.82352941, 0.05882353), 324 | p_1 = c(.71428571, 0.28571429, 0.0, 0.0), 325 | p_na = c(0, 1, 0, 0)) 326 | observed <- cat_compare(data, "Cyl", "vs") 327 | expect_equal(observed, expected) 328 | 329 | options(tabbycat.clean_names = restore_option) 330 | }) 331 | 332 | test_that("cat_contrast returns correct data with a valid na_label argument", { 333 | 334 | expected <- tibble::tibble( 335 | cyl = c(4, 6, 8, NA), 336 | n_0 = c(1, 1, 14, 1), 337 | n_1 = c(10, 4, 0, 0), 338 | n_missing = c(0, 1, 0, 0), 339 | p_0 = c(0.05882353, 0.05882353, 0.82352941, 0.05882353), 340 | p_1 = c(.71428571, 0.28571429, 0.0, 0.0), 341 | p_missing = c(0, 1, 0, 0)) 342 | observed <- cat_compare(data, "cyl", "vs", na_label = "missing") 343 | expect_equal(observed, expected) 344 | }) 345 | 346 | test_that("cat_compare uses option for default na_label argument", { 347 | 348 | restore_option <- getOption("tabbycat.na_label") 349 | 350 | options(tabbycat.na_label = "missing") 351 | expected <- tibble::tibble( 352 | cyl = c(4, 6, 8, NA), 353 | n_0 = c(1, 1, 14, 1), 354 | n_1 = c(10, 4, 0, 0), 355 | n_missing = c(0, 1, 0, 0), 356 | p_0 = c(0.05882353, 0.05882353, 0.82352941, 0.05882353), 357 | p_1 = c(.71428571, 0.28571429, 0.0, 0.0), 358 | p_missing = c(0, 1, 0, 0)) 359 | observed <- cat_compare(data, "cyl", "vs") 360 | expect_equal(observed, expected) 361 | 362 | options(tabbycat.na_label = restore_option) 363 | }) 364 | -------------------------------------------------------------------------------- /tests/testthat/test_contrast.R: -------------------------------------------------------------------------------- 1 | # Test contrast.R 2 | 3 | # Setup ---------------------------------------------------------------------- 4 | 5 | data <- mtcars %>% 6 | tibble::rownames_to_column("model") %>% 7 | tibble::as_tibble() %>% 8 | dplyr::mutate( 9 | manufacturer = stringr::str_split(model, " ", simplify = TRUE)[, 1]) 10 | 11 | data$cyl[1] <- NA 12 | data$manufacturer[2] <- NA 13 | 14 | # Tests: cat_contrast -------------------------------------------------------- 15 | 16 | test_that("cat_contrast rejects a data argument that is not a dataframe", { 17 | 18 | msg <- "The \"data\" argument is not a dataframe." 19 | expect_error(cat_contrast(NULL, "cyl", "manufacturer", "Merc"), msg) 20 | expect_error(cat_contrast(NA, "cyl", "manufacturer", "Merc"), msg) 21 | expect_error(cat_contrast(1:10, "cyl", "manufacturer", "Merc"), msg) 22 | expect_error(cat_contrast(LETTERS[1:10], "cyl", "manufacturer", "Merc"), msg) 23 | expect_error(cat_contrast(c(TRUE, FALSE), "cyl", "manufacturer", "Merc"), msg) 24 | expect_error(cat_contrast(list(), "cyl", "manufacturer", "Merc"), msg) 25 | }) 26 | 27 | test_that("cat_contrast rejects a data argument that has no rows", { 28 | 29 | msg <- "The \"data\" argument is empty." 30 | expect_error(cat_contrast(data.frame(), "cyl", "manufacturer", "Merc"), msg) 31 | }) 32 | 33 | test_that("cat_contrast rejects invalid row_cat arguments", { 34 | 35 | msg <- "Invalid \"row_cat\" argument. Must be a character vector of length one." 36 | expect_error(cat_contrast(data, NULL, "manufacturer", "Merc"), msg) 37 | expect_error(cat_contrast(data, NA, "manufacturer", "Merc"), msg) 38 | expect_error(cat_contrast(data, 1:10, "manufacturer", "Merc"), msg) 39 | expect_error(cat_contrast(data, LETTERS[1:10], "manufacturer", "Merc"), msg) 40 | expect_error(cat_contrast(data, c(TRUE, FALSE), "manufacturer", "Merc"), msg) 41 | expect_error(cat_contrast(data, list(), "manufacturer", "Merc"), msg) 42 | }) 43 | 44 | test_that("cat_contrast rejects a row_cat argument that is not a column in the data", { 45 | 46 | msg <- "'notacolumn' is not a column in the dataframe." 47 | expect_error(cat_contrast(data, "notacolumn", "manufacturer", "Merc"), msg) 48 | }) 49 | 50 | test_that("cat_contrast rejects invalid col_cat arguments", { 51 | 52 | msg <- "Invalid \"col_cat\" argument. Must be a character vector of length one." 53 | expect_error(cat_contrast(data, "cyl", NULL, "Merc"), msg) 54 | expect_error(cat_contrast(data, "cyl", NA, "Merc"), msg) 55 | expect_error(cat_contrast(data, "cyl", 1:10, "Merc"), msg) 56 | expect_error(cat_contrast(data, "cyl", LETTERS[1:10], "Merc"), msg) 57 | expect_error(cat_contrast(data, "cyl", c(TRUE, FALSE), "Merc"), msg) 58 | expect_error(cat_contrast(data, "cyl", list(), "Merc"), msg) 59 | }) 60 | 61 | test_that("cat_contrast rejects a col_cat argument that is not a column in the data", { 62 | 63 | msg <- "'notacolumn' is not a column in the dataframe." 64 | expect_error(cat_contrast(data, "cyl", "notacolumn", "Merc"), msg) 65 | }) 66 | 67 | test_that("cat_contrast rejects invalid col_group arguments", { 68 | 69 | msg <- "Invalid \"col_group\" argument. Must be a character vector of length one." 70 | expect_error(cat_contrast(data, "cyl", "manufacturer", NULL), msg) 71 | expect_error(cat_contrast(data, "cyl", "manufacturer", NA), msg) 72 | expect_error(cat_contrast(data, "cyl", "manufacturer", 1:10), msg) 73 | expect_error(cat_contrast(data, "cyl", "manufacturer", LETTERS[1:10]), msg) 74 | expect_error(cat_contrast(data, "cyl", "manufacturer", c(TRUE, FALSE)), msg) 75 | expect_error(cat_contrast(data, "cyl", "manufacturer", list()), msg) 76 | }) 77 | 78 | test_that("cat_contrast rejects a col_group argument that does not exist in col_cat", { 79 | 80 | msg <- "The \"col_group\" 'notagroup' does not exist in the \"col_cat\" 'manufacturer'." 81 | expect_error(cat_contrast(data, "cyl", "manufacturer", "notagroup"), msg) 82 | }) 83 | 84 | test_that("cat_contrast rejects invalid na.rm.row arguments", { 85 | 86 | msg <- "Invalid \"na.rm.row\" argument. Must be either TRUE or FALSE." 87 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm.row = NULL), msg) 88 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm.row = NA), msg) 89 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm.row = 1), msg) 90 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm.row = ""), msg) 91 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm.row = 1:10), msg) 92 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm.row = LETTERS[1:10]), msg) 93 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm.row = c(TRUE, FALSE)), msg) 94 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm.row = list()), msg) 95 | }) 96 | 97 | test_that("cat_contrast rejects invalid na.rm.col arguments", { 98 | 99 | msg <- "Invalid \"na.rm.col\" argument. Must be either TRUE or FALSE." 100 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm.col = NULL), msg) 101 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm.col = NA), msg) 102 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm.col = 1), msg) 103 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm.col = ""), msg) 104 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm.col = 1:10), msg) 105 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm.col = LETTERS[1:10]), msg) 106 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm.col = c(TRUE, FALSE)), msg) 107 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm.col = list()), msg) 108 | }) 109 | 110 | test_that("cat_contrast rejects invalid na.rm arguments", { 111 | 112 | msg <- "Invalid \"na.rm\" argument. Must be either NULL, TRUE or FALSE." 113 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm = NA), msg) 114 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm = 1), msg) 115 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm = ""), msg) 116 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm = 1:10), msg) 117 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm = LETTERS[1:10]), msg) 118 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm = c(TRUE, FALSE)), msg) 119 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm = list()), msg) 120 | }) 121 | 122 | test_that("cat_contrast rejects invalid clean_names arguments", { 123 | 124 | msg <- "Invalid \"clean_names\" argument. Must be either TRUE or FALSE." 125 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", clean_names = NULL), msg) 126 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", clean_names = NA), msg) 127 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", clean_names = 1), msg) 128 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", clean_names = ""), msg) 129 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", clean_names = 1:10), msg) 130 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", clean_names = LETTERS[1:10]), msg) 131 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", clean_names = c(TRUE, FALSE)), msg) 132 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", clean_names = list()), msg) 133 | }) 134 | 135 | test_that("cat_contrast rejects invalid only arguments", { 136 | 137 | msg <- "Invalid \"only\" argument. Must be a character vector of length one." 138 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", only = NULL), msg) 139 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", only = NA), msg) 140 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", only = 1), msg) 141 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", only = TRUE), msg) 142 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", only = 1:10), msg) 143 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", only = LETTERS[1:10]), msg) 144 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", only = c(TRUE, FALSE)), msg) 145 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", only = list()), msg) 146 | }) 147 | 148 | test_that("cat_contrast rejects invalid other_label arguments", { 149 | 150 | msg <- "Invalid \"other_label\" argument. Must be a character vector of length one." 151 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", other_label = NULL), msg) 152 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", other_label = NA), msg) 153 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", other_label = 1), msg) 154 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", other_label = TRUE), msg) 155 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", other_label = 1:10), msg) 156 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", other_label = LETTERS[1:10]), msg) 157 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", other_label = c(TRUE, FALSE)), msg) 158 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", other_label = list()), msg) 159 | }) 160 | 161 | test_that("cat_contrast rejects invalid na_label arguments", { 162 | 163 | msg <- "Invalid \"na_label\" argument. Must be a character vector of length one." 164 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na_label = NULL), msg) 165 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na_label = NA), msg) 166 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na_label = 1), msg) 167 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na_label = TRUE), msg) 168 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na_label = 1:10), msg) 169 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na_label = LETTERS[1:10]), msg) 170 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na_label = c(TRUE, FALSE)), msg) 171 | expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na_label = list()), msg) 172 | }) 173 | 174 | test_that("cat_contrast returns correct data with defaults", { 175 | 176 | expected <- tibble::tibble( 177 | cyl = c(8, 4, 6, NA), 178 | n_merc = c(3, 2, 2, 0), 179 | n_other = c(11, 9, 3, 1), 180 | n_na = c(0, 0, 1, 0), 181 | p_merc = c(0.42857143, 0.28571429, 0.28571429, 0.0), 182 | p_other = c(0.45833333, 0.37500000, 0.12500000, 0.04166667), 183 | p_na = c(0, 0, 1, 0)) 184 | observed <- cat_contrast(data, "cyl", "manufacturer", "Merc") 185 | expect_equal(observed, expected) 186 | }) 187 | 188 | test_that("cat_contrast returns correct data with a valid na.rm.row argument", { 189 | 190 | expected <- tibble::tibble( 191 | cyl = c(8, 4, 6, NA), 192 | n_merc = c(3, 2, 2, 0), 193 | n_other = c(11, 9, 3, 1), 194 | n_na = c(0, 0, 1, 0), 195 | p_merc = c(0.42857143, 0.28571429, 0.28571429, 0.0), 196 | p_other = c(0.45833333, 0.37500000, 0.12500000, 0.04166667), 197 | p_na = c(0, 0, 1, 0)) 198 | observed <- cat_contrast( 199 | data, 200 | "cyl", 201 | "manufacturer", 202 | "Merc", 203 | na.rm.row = FALSE) 204 | expect_equal(observed, expected) 205 | 206 | expected <- tibble::tibble( 207 | cyl = c(8, 4, 6), 208 | n_merc = c(3, 2, 2), 209 | n_other = c(11, 9, 3), 210 | n_na = c(0, 0, 1), 211 | p_merc = c( 0.42857143, 0.28571429, 0.28571429), 212 | p_other = c(0.47826087, 0.39130435, 0.13043478), 213 | p_na = c(0, 0, 1)) 214 | observed <- cat_contrast( 215 | data, 216 | "cyl", 217 | "manufacturer", 218 | "Merc", 219 | na.rm.row = TRUE) 220 | expect_equal(observed, expected) 221 | }) 222 | 223 | test_that("cat_contrast returns correct data with a valid na.rm.col argument", { 224 | 225 | expected <- tibble::tibble( 226 | cyl = c(8, 4, 6, NA), 227 | n_merc = c(3, 2, 2, 0), 228 | n_other = c(11, 9, 3, 1), 229 | n_na = c(0, 0, 1, 0), 230 | p_merc = c(0.42857143, 0.28571429, 0.28571429, 0.0), 231 | p_other = c(0.45833333, 0.37500000, 0.12500000, 0.04166667), 232 | p_na = c(0, 0, 1, 0)) 233 | observed <- cat_contrast( 234 | data, 235 | "cyl", 236 | "manufacturer", 237 | "Merc", 238 | na.rm.col = FALSE) 239 | expect_equal(observed, expected) 240 | 241 | expected <- tibble::tibble( 242 | cyl = c(8, 4, 6, NA), 243 | n_merc = c(3, 2, 2, 0), 244 | n_other = c(11, 9, 3, 1), 245 | p_merc = c(0.42857143, 0.28571429, 0.28571429, 0.0), 246 | p_other = c(0.45833333, 0.37500000, 0.12500000, 0.04166667)) 247 | observed <- cat_contrast( 248 | data, 249 | "cyl", 250 | "manufacturer", 251 | "Merc", 252 | na.rm.col = TRUE) 253 | expect_equal(observed, expected) 254 | }) 255 | 256 | test_that("cat_contrast returns correct data with a valid na.rm argument", { 257 | 258 | expected <- tibble::tibble( 259 | cyl = c(8, 4, 6, NA), 260 | n_merc = c(3, 2, 2, 0), 261 | n_other = c(11, 9, 3, 1), 262 | n_na = c(0, 0, 1, 0), 263 | p_merc = c(0.42857143, 0.28571429, 0.28571429, 0.0), 264 | p_other = c(0.45833333, 0.37500000, 0.12500000, 0.04166667), 265 | p_na = c(0, 0, 1, 0)) 266 | observed <- cat_contrast( 267 | data, 268 | "cyl", 269 | "manufacturer", 270 | "Merc", 271 | na.rm = FALSE) 272 | expect_equal(observed, expected) 273 | 274 | expected <- tibble::tibble( 275 | cyl = c(8, 4, 6), 276 | n_merc = c(3, 2, 2), 277 | n_other = c(11, 9, 3), 278 | p_merc = c( 0.42857143, 0.28571429, 0.28571429), 279 | p_other = c(0.47826087, 0.39130435, 0.13043478)) 280 | observed <- cat_contrast( 281 | data, 282 | "cyl", 283 | "manufacturer", 284 | "Merc", 285 | na.rm = TRUE) 286 | expect_equal(observed, expected) 287 | }) 288 | 289 | test_that("cat_contrast returns correct data with a valid only argument", { 290 | 291 | expected <- tibble::tibble( 292 | cyl = c(8, 4, 6, NA), 293 | n_merc = c(3, 2, 2, 0), 294 | n_other = c(11, 9, 3, 1), 295 | n_na = c(0, 0, 1, 0), 296 | p_merc = c(0.42857143, 0.28571429, 0.28571429, 0.0), 297 | p_other = c(0.45833333, 0.37500000, 0.12500000, 0.04166667), 298 | p_na = c(0, 0, 1, 0)) 299 | observed <- cat_contrast( 300 | data, 301 | "cyl", 302 | "manufacturer", 303 | "Merc", 304 | only = "ignore") 305 | expect_equal(observed, expected) 306 | 307 | expected_number <- tibble::tibble( 308 | cyl = c(8, 4, 6, NA), 309 | n_merc = c(3, 2, 2, 0), 310 | n_other = c(11, 9, 3, 1), 311 | n_na = c(0, 0, 1, 0)) 312 | 313 | observed <- cat_contrast( 314 | data, 315 | "cyl", 316 | "manufacturer", 317 | "Merc", 318 | only = "n") 319 | expect_equal(observed, expected_number) 320 | 321 | observed <- cat_contrast( 322 | data, 323 | "cyl", 324 | "manufacturer", 325 | "Merc", 326 | only = "number") 327 | expect_equal(observed, expected_number) 328 | 329 | observed <- cat_contrast( 330 | data, 331 | "cyl", 332 | "manufacturer", 333 | "Merc", 334 | only = " number ") 335 | expect_equal(observed, expected_number) 336 | 337 | expected_percent <- tibble::tibble( 338 | cyl = c(8, 4, 6, NA), 339 | p_merc = c(0.42857143, 0.28571429, 0.28571429, 0.0), 340 | p_other = c(0.45833333, 0.37500000, 0.12500000, 0.04166667), 341 | p_na = c(0, 0, 1, 0)) 342 | 343 | observed <- cat_contrast( 344 | data, 345 | "cyl", 346 | "manufacturer", 347 | "Merc", 348 | only = "p") 349 | expect_equal(observed, expected_percent) 350 | 351 | observed <- cat_contrast( 352 | data, 353 | "cyl", 354 | "manufacturer", 355 | "Merc", 356 | only = "percent") 357 | expect_equal(observed, expected_percent) 358 | 359 | observed <- cat_contrast( 360 | data, 361 | "cyl", 362 | "manufacturer", 363 | "Merc", 364 | only = " percent ") 365 | expect_equal(observed, expected_percent) 366 | }) 367 | 368 | test_that("cat_contrast returns correct data with a valid clean_names argument", { 369 | 370 | data$Cyl <- data$cyl 371 | 372 | expected <- tibble::tibble( 373 | cyl = c(8, 4, 6, NA), 374 | n_merc = c(3, 2, 2, 0), 375 | n_other = c(11, 9, 3, 1), 376 | n_na = c(0, 0, 1, 0), 377 | p_merc = c(0.42857143, 0.28571429, 0.28571429, 0.0), 378 | p_other = c(0.45833333, 0.37500000, 0.12500000, 0.04166667), 379 | p_na = c(0, 0, 1, 0)) 380 | observed <- cat_contrast( 381 | data, 382 | "Cyl", 383 | "manufacturer", 384 | "Merc", 385 | clean_names = TRUE) 386 | expect_equal(observed, expected) 387 | 388 | expected <- tibble::tibble( 389 | Cyl = c(8, 4, 6, NA), 390 | n_Merc = c(3, 2, 2, 0), 391 | n_other = c(11, 9, 3, 1), 392 | n_na = c(0, 0, 1, 0), 393 | p_Merc = c(0.42857143, 0.28571429, 0.28571429, 0.0), 394 | p_other = c(0.45833333, 0.37500000, 0.12500000, 0.04166667), 395 | p_na = c(0, 0, 1, 0)) 396 | observed <- cat_contrast( 397 | data, 398 | "Cyl", 399 | "manufacturer", 400 | "Merc", 401 | clean_names = FALSE) 402 | expect_equal(observed, expected) 403 | }) 404 | 405 | test_that("cat_contrast uses option for default clean_names argument", { 406 | 407 | data$Cyl <- data$cyl 408 | restore_option <- getOption("tabbycat.clean_names") 409 | 410 | options(tabbycat.clean_names = TRUE) 411 | expected <- tibble::tibble( 412 | cyl = c(8, 4, 6, NA), 413 | n_merc = c(3, 2, 2, 0), 414 | n_other = c(11, 9, 3, 1), 415 | n_na = c(0, 0, 1, 0), 416 | p_merc = c(0.42857143, 0.28571429, 0.28571429, 0.0), 417 | p_other = c(0.45833333, 0.37500000, 0.12500000, 0.04166667), 418 | p_na = c(0, 0, 1, 0)) 419 | observed <- cat_contrast( 420 | data, 421 | "Cyl", 422 | "manufacturer", 423 | "Merc") 424 | expect_equal(observed, expected) 425 | 426 | options(tabbycat.clean_names = FALSE) 427 | expected <- tibble::tibble( 428 | Cyl = c(8, 4, 6, NA), 429 | n_Merc = c(3, 2, 2, 0), 430 | n_other = c(11, 9, 3, 1), 431 | n_na = c(0, 0, 1, 0), 432 | p_Merc = c(0.42857143, 0.28571429, 0.28571429, 0.0), 433 | p_other = c(0.45833333, 0.37500000, 0.12500000, 0.04166667), 434 | p_na = c(0, 0, 1, 0)) 435 | observed <- cat_contrast( 436 | data, 437 | "Cyl", 438 | "manufacturer", 439 | "Merc") 440 | expect_equal(observed, expected) 441 | 442 | options(tabbycat.clean_names = restore_option) 443 | }) 444 | 445 | test_that("cat_contrast returns correct data with a valid na_label argument", { 446 | 447 | expected <- tibble::tibble( 448 | cyl = c(8, 4, 6, NA), 449 | n_merc = c(3, 2, 2, 0), 450 | n_other = c(11, 9, 3, 1), 451 | n_missing = c(0, 0, 1, 0), 452 | p_merc = c(0.42857143, 0.28571429, 0.28571429, 0.0), 453 | p_other = c(0.45833333, 0.37500000, 0.12500000, 0.04166667), 454 | p_missing = c(0, 0, 1, 0)) 455 | observed <- cat_contrast( 456 | data, 457 | "cyl", 458 | "manufacturer", 459 | "Merc", 460 | na_label = "missing") 461 | expect_equal(observed, expected) 462 | }) 463 | 464 | test_that("cat_contrast uses option for default na_label argument", { 465 | 466 | restore_option <- getOption("tabbycat.na_label") 467 | 468 | options(tabbycat.na_label = "missing") 469 | expected <- tibble::tibble( 470 | cyl = c(8, 4, 6, NA), 471 | n_merc = c(3, 2, 2, 0), 472 | n_other = c(11, 9, 3, 1), 473 | n_missing = c(0, 0, 1, 0), 474 | p_merc = c(0.42857143, 0.28571429, 0.28571429, 0.0), 475 | p_other = c(0.45833333, 0.37500000, 0.12500000, 0.04166667), 476 | p_missing = c(0, 0, 1, 0)) 477 | observed <- cat_contrast( 478 | data, 479 | "cyl", 480 | "manufacturer", 481 | "Merc") 482 | expect_equal(observed, expected) 483 | 484 | options(tabbycat.na_label = restore_option) 485 | 486 | }) 487 | 488 | test_that("cat_contrast returns correct data with a valid other_label argument", { 489 | 490 | expected <- tibble::tibble( 491 | cyl = c(8, 4, 6, NA), 492 | n_merc = c(3, 2, 2, 0), 493 | n_remainder = c(11, 9, 3, 1), 494 | n_na = c(0, 0, 1, 0), 495 | p_merc = c(0.42857143, 0.28571429, 0.28571429, 0.0), 496 | p_remainder = c(0.45833333, 0.37500000, 0.12500000, 0.04166667), 497 | p_na = c(0, 0, 1, 0)) 498 | observed <- cat_contrast( 499 | data, 500 | "cyl", 501 | "manufacturer", 502 | "Merc", 503 | other_label = "remainder") 504 | expect_equal(observed, expected) 505 | }) 506 | 507 | test_that("cat_contrast uses option for default other_label argument", { 508 | 509 | restore_option <- getOption("tabbycat.other_label") 510 | 511 | options(tabbycat.other_label = "remainder") 512 | expected <- tibble::tibble( 513 | cyl = c(8, 4, 6, NA), 514 | n_merc = c(3, 2, 2, 0), 515 | n_remainder = c(11, 9, 3, 1), 516 | n_na = c(0, 0, 1, 0), 517 | p_merc = c(0.42857143, 0.28571429, 0.28571429, 0.0), 518 | p_remainder = c(0.45833333, 0.37500000, 0.12500000, 0.04166667), 519 | p_na = c(0, 0, 1, 0)) 520 | observed <- cat_contrast( 521 | data, 522 | "cyl", 523 | "manufacturer", 524 | "Merc") 525 | expect_equal(observed, expected) 526 | 527 | options(tabbycat.other_label = restore_option) 528 | }) 529 | -------------------------------------------------------------------------------- /tests/testthat/test_count.R: -------------------------------------------------------------------------------- 1 | # Test count.R 2 | 3 | # Setup ----------------------------------------------------------------------- 4 | 5 | data <- mtcars %>% 6 | tibble::rownames_to_column("model") %>% 7 | tibble::as_tibble() %>% 8 | dplyr::mutate( 9 | manufacturer = stringr::str_split(model, " ", simplify = TRUE)[, 1]) 10 | 11 | data$cyl[1] <- NA 12 | 13 | # Tests: cat_vcount ----------------------------------------------------------- 14 | 15 | test_that("cat_count rejects a data argument that is not a dataframe", { 16 | 17 | msg <- "The \"data\" argument is not a dataframe." 18 | expect_error(cat_count(NULL, "cyl"), msg) 19 | expect_error(cat_count(NA, "cyl"), msg) 20 | expect_error(cat_count(1:10, "cyl"), msg) 21 | expect_error(cat_count(LETTERS[1:10], "cyl"), msg) 22 | expect_error(cat_count(c(TRUE, FALSE), "cyl"), msg) 23 | expect_error(cat_count(list(), "cyl"), msg) 24 | }) 25 | 26 | test_that("cat_count rejects a data argument that has no rows", { 27 | 28 | msg <- "The \"data\" argument is empty." 29 | expect_error(cat_count(data.frame(), "cyl"), msg) 30 | }) 31 | 32 | test_that("cat_count rejects invalid cat arguments", { 33 | 34 | msg <- "Invalid \"cat\" argument. Must be a character vector of length one." 35 | expect_error(cat_count(data, NULL), msg) 36 | expect_error(cat_count(data, NA), msg) 37 | expect_error(cat_count(data, 1:10), msg) 38 | expect_error(cat_count(data, LETTERS[1:10]), msg) 39 | expect_error(cat_count(data, c(TRUE, FALSE)), msg) 40 | expect_error(cat_count(data, list()), msg) 41 | }) 42 | 43 | test_that("cat_count rejects a cat argument that is not a column in the data", { 44 | 45 | msg <- "'notacolumn' is not a column in the dataframe." 46 | expect_error(cat_count(data, "notacolumn"), msg) 47 | }) 48 | 49 | test_that("cat_count rejects invalid na.rm arguments", { 50 | 51 | msg <- "Invalid \"na.rm\" argument. Must be either TRUE or FALSE." 52 | expect_error(cat_count(data, "cyl", na.rm = NULL), msg) 53 | expect_error(cat_count(data, "cyl", na.rm = NA), msg) 54 | expect_error(cat_count(data, "cyl", na.rm = 1), msg) 55 | expect_error(cat_count(data, "cyl", na.rm = ""), msg) 56 | expect_error(cat_count(data, "cyl", na.rm = 1:10), msg) 57 | expect_error(cat_count(data, "cyl", na.rm = LETTERS[1:10]), msg) 58 | expect_error(cat_count(data, "cyl", na.rm = c(TRUE, FALSE)), msg) 59 | expect_error(cat_count(data, "cyl", na.rm = list()), msg) 60 | }) 61 | 62 | test_that("cat_count rejects invalid clean_names arguments", { 63 | 64 | msg <- "Invalid \"clean_names\" argument. Must be either TRUE or FALSE." 65 | expect_error(cat_count(data, "cyl", clean_names = NULL), msg) 66 | expect_error(cat_count(data, "cyl", clean_names = NA), msg) 67 | expect_error(cat_count(data, "cyl", clean_names = 1), msg) 68 | expect_error(cat_count(data, "cyl", clean_names = ""), msg) 69 | expect_error(cat_count(data, "cyl", clean_names = 1:10), msg) 70 | expect_error(cat_count(data, "cyl", clean_names = LETTERS[1:10]), msg) 71 | expect_error(cat_count(data, "cyl", clean_names = c(TRUE, FALSE)), msg) 72 | expect_error(cat_count(data, "cyl", clean_names = list()), msg) 73 | }) 74 | 75 | test_that("cat_count rejects invalid only arguments", { 76 | 77 | msg <- "Invalid \"only\" argument. Must be a character vector of length one." 78 | expect_error(cat_count(data, "cyl", only = NULL), msg) 79 | expect_error(cat_count(data, "cyl", only = NA), msg) 80 | expect_error(cat_count(data, "cyl", only = 1), msg) 81 | expect_error(cat_count(data, "cyl", only = TRUE), msg) 82 | expect_error(cat_count(data, "cyl", only = 1:10), msg) 83 | expect_error(cat_count(data, "cyl", only = LETTERS[1:10]), msg) 84 | expect_error(cat_count(data, "cyl", only = c(TRUE, FALSE)), msg) 85 | expect_error(cat_count(data, "cyl", only = list()), msg) 86 | }) 87 | 88 | test_that("cat_count returns correct data with defaults", { 89 | 90 | expected <- tibble::tibble( 91 | cyl = c(8, 4, 6, NA), 92 | number = c(14, 11, 6, 1), 93 | percent = c(0.43750, 0.34375, 0.18750, 0.03125)) 94 | observed <- cat_count(data, "cyl") 95 | expect_equal(observed, expected) 96 | }) 97 | 98 | test_that("cat_count returns correct data with a valid na.rm argument", { 99 | 100 | expected <- tibble::tibble( 101 | cyl = c(8, 4, 6, NA), 102 | number = c(14, 11, 6, 1), 103 | percent = c(0.43750, 0.34375, 0.18750, 0.03125)) 104 | observed <- cat_count(data, "cyl", na.rm = FALSE) 105 | expect_equal(observed, expected) 106 | 107 | expected <- tibble::tibble( 108 | cyl = c(8, 4, 6), 109 | number = c(14, 11, 6), 110 | percent = c(0.4516129032, 0.3548387097, 0.1935483871)) 111 | observed <- cat_count(data, "cyl", na.rm = TRUE) 112 | expect_equal(observed, expected) 113 | }) 114 | 115 | test_that("cat_count returns correct data with a valid only argument", { 116 | 117 | expected <- tibble::tibble( 118 | cyl = c(8, 4, 6, NA), 119 | number = c(14, 11, 6, 1), 120 | percent = c(0.43750, 0.34375, 0.18750, 0.03125)) 121 | observed <- cat_count(data, "cyl", only = "ignore") 122 | expect_equal(observed, expected) 123 | 124 | expected_number <- tibble::tibble( 125 | cyl = c(8, 4, 6, NA), 126 | number = c(14, 11, 6, 1)) 127 | 128 | observed <- cat_count(data, "cyl", only = "n") 129 | expect_equal(observed, expected_number) 130 | 131 | observed <- cat_count(data, "cyl", only = "number") 132 | expect_equal(observed, expected_number) 133 | 134 | observed <- cat_count(data, "cyl", only = " number ") 135 | expect_equal(observed, expected_number) 136 | 137 | expected_percent <- tibble::tibble( 138 | cyl = c(8, 4, 6, NA), 139 | percent = c(0.43750, 0.34375, 0.18750, 0.03125)) 140 | 141 | observed <- cat_count(data, "cyl", only = "p") 142 | expect_equal(observed, expected_percent) 143 | 144 | observed <- cat_count(data, "cyl", only = "percent") 145 | expect_equal(observed, expected_percent) 146 | 147 | observed <- cat_count(data, "cyl", only = " percent ") 148 | expect_equal(observed, expected_percent) 149 | }) 150 | 151 | test_that("cat_count returns correct data with a valid clean_names argument", { 152 | 153 | data$Cyl <- data$cyl 154 | 155 | expected <- tibble::tibble( 156 | cyl = c(8, 4, 6, NA), 157 | number = c(14, 11, 6, 1), 158 | percent = c(0.43750, 0.34375, 0.18750, 0.03125)) 159 | observed <- cat_count(data, "Cyl", clean_names = TRUE) 160 | expect_equal(observed, expected) 161 | 162 | expected <- tibble::tibble( 163 | Cyl = c(8, 4, 6, NA), 164 | number = c(14, 11, 6, 1), 165 | percent = c(0.43750, 0.34375, 0.18750, 0.03125)) 166 | observed <- cat_count(data, "Cyl", clean_names = FALSE) 167 | expect_equal(observed, expected) 168 | }) 169 | 170 | test_that("cat_count uses option for default clean_names argument", { 171 | 172 | data$Cyl <- data$cyl 173 | restore_option <- getOption("tabbycat.clean_names") 174 | 175 | options(tabbycat.clean_names = TRUE) 176 | expected <- tibble::tibble( 177 | cyl = c(8, 4, 6, NA), 178 | number = c(14, 11, 6, 1), 179 | percent = c(0.43750, 0.34375, 0.18750, 0.03125)) 180 | observed <- cat_count(data, "Cyl") 181 | expect_equal(observed, expected) 182 | 183 | options(tabbycat.clean_names = FALSE) 184 | expected <- tibble::tibble( 185 | Cyl = c(8, 4, 6, NA), 186 | number = c(14, 11, 6, 1), 187 | percent = c(0.43750, 0.34375, 0.18750, 0.03125)) 188 | observed <- cat_count(data, "Cyl") 189 | expect_equal(observed, expected) 190 | 191 | options(tabbycat.clean_names = restore_option) 192 | }) 193 | -------------------------------------------------------------------------------- /tests/testthat/test_summarise.R: -------------------------------------------------------------------------------- 1 | # Test summarise.R 2 | 3 | # Setup ----------------------------------------------------------------------- 4 | 5 | data <- mtcars %>% 6 | tibble::rownames_to_column("model") %>% 7 | tibble::as_tibble() %>% 8 | dplyr::mutate( 9 | manufacturer = stringr::str_split(model, " ", simplify = TRUE)[, 1]) 10 | 11 | data$cyl[1] <- NA 12 | 13 | # Tests: cat_vcount ----------------------------------------------------------- 14 | 15 | test_that("cat_summarise rejects a data argument that is not a dataframe", { 16 | 17 | msg <- "The \"data\" argument is not a dataframe." 18 | expect_error(cat_summarise(NULL, "cyl", "mpg"), msg) 19 | expect_error(cat_summarise(NA, "cyl", "mpg"), msg) 20 | expect_error(cat_summarise(1:10, "cyl", "mpg"), msg) 21 | expect_error(cat_summarise(LETTERS[1:10], "cyl", "mpg"), msg) 22 | expect_error(cat_summarise(c(TRUE, FALSE), "cyl", "mpg"), msg) 23 | expect_error(cat_summarise(list(), "cyl", "mpg"), msg) 24 | }) 25 | 26 | test_that("cat_summarise rejects a data argument that has no rows", { 27 | 28 | msg <- "The \"data\" argument is empty." 29 | expect_error(cat_summarise(data.frame(), "cyl", "mpg"), msg) 30 | }) 31 | 32 | test_that("cat_summarise rejects invalid cat arguments", { 33 | 34 | msg <- "Invalid \"cat\" argument. Must be a character vector of length one." 35 | expect_error(cat_summarise(data, NULL, "mpg"), msg) 36 | expect_error(cat_summarise(data, NA, "mpg"), msg) 37 | expect_error(cat_summarise(data, 1:10, "mpg"), msg) 38 | expect_error(cat_summarise(data, LETTERS[1:10], "mpg"), msg) 39 | expect_error(cat_summarise(data, c(TRUE, FALSE), "mpg"), msg) 40 | expect_error(cat_summarise(data, list(), "mpg"), msg) 41 | }) 42 | 43 | test_that("cat_summarise rejects a cat argument that is not a column in the data", { 44 | 45 | msg <- "'notacolumn' is not a column in the dataframe." 46 | expect_error(cat_summarise(data, "notacolumn", "mpg"), msg) 47 | }) 48 | 49 | test_that("cat_summarise rejects invalid num arguments", { 50 | 51 | msg <- "Invalid \"num\" argument. Must be a character vector of length one." 52 | expect_error(cat_summarise(data, "cyl", NULL), msg) 53 | expect_error(cat_summarise(data, "cyl", NA), msg) 54 | expect_error(cat_summarise(data, "cyl", 1:10), msg) 55 | expect_error(cat_summarise(data, "cyl", LETTERS[1:10]), msg) 56 | expect_error(cat_summarise(data, "cyl", c(TRUE, FALSE)), msg) 57 | expect_error(cat_summarise(data, "cyl", list()), msg) 58 | }) 59 | 60 | test_that("cat_summarise rejects a num argument that is not a column in the data", { 61 | 62 | msg <- "'notacolumn' is not a column in the dataframe." 63 | expect_error(cat_summarise(data, "cyl", "notacolumn"), msg) 64 | }) 65 | 66 | test_that("cat_summarise rejects a num argument that is not a numeric column", { 67 | 68 | msg <- "The num argument is not a numeric column." 69 | expect_error(cat_summarise(data, "cyl", "manufacturer"), msg) 70 | }) 71 | 72 | test_that("cat_summarise rejects invalid na.rm arguments", { 73 | 74 | msg <- "Invalid \"na.rm\" argument. Must be either TRUE or FALSE." 75 | expect_error(cat_summarise(data, "cyl", "mpg", na.rm = NULL), msg) 76 | expect_error(cat_summarise(data, "cyl", "mpg", na.rm = NA), msg) 77 | expect_error(cat_summarise(data, "cyl", "mpg", na.rm = 1), msg) 78 | expect_error(cat_summarise(data, "cyl", "mpg", na.rm = ""), msg) 79 | expect_error(cat_summarise(data, "cyl", "mpg", na.rm = 1:10), msg) 80 | expect_error(cat_summarise(data, "cyl", "mpg", na.rm = LETTERS[1:10]), msg) 81 | expect_error(cat_summarise(data, "cyl", "mpg", na.rm = c(TRUE, FALSE)), msg) 82 | expect_error(cat_summarise(data, "cyl", "mpg", na.rm = list()), msg) 83 | }) 84 | 85 | test_that("cat_summarise rejects invalid clean_names arguments", { 86 | 87 | msg <- "Invalid \"clean_names\" argument. Must be either TRUE or FALSE." 88 | expect_error(cat_summarise(data, "cyl", "mpg", clean_names = NULL), msg) 89 | expect_error(cat_summarise(data, "cyl", "mpg", clean_names = NA), msg) 90 | expect_error(cat_summarise(data, "cyl", "mpg", clean_names = 1), msg) 91 | expect_error(cat_summarise(data, "cyl", "mpg", clean_names = ""), msg) 92 | expect_error(cat_summarise(data, "cyl", "mpg", clean_names = 1:10), msg) 93 | expect_error(cat_summarise(data, "cyl", "mpg", clean_names = LETTERS[1:10]), msg) 94 | expect_error(cat_summarise(data, "cyl", "mpg", clean_names = c(TRUE, FALSE)), msg) 95 | expect_error(cat_summarise(data, "cyl", "mpg", clean_names = list()), msg) 96 | }) 97 | 98 | test_that("cat_summarise returns correct data with defaults", { 99 | 100 | expected <- tibble::tibble( 101 | cyl = c(4, 6, 8, NA), 102 | n = c(11, 6, 14, 1), 103 | na = c(0, 0, 0, 0), 104 | mean = c(26.663636, 19.5333333, 15.10000, 21.00000), 105 | sd = c(4.50982765, 1.47196014, 2.56004808, NA), 106 | min = c(21.4, 17.8, 10.4, 21.0), 107 | lq = c(22.800, 18.375, 14.400, 21.000), 108 | med = c(26.00, 19.45, 15.20, 21.00), 109 | uq = c(30.400, 20.675, 16.250, 21.000), 110 | max = c(33.9, 21.4, 19.2, 21.0)) 111 | observed <- cat_summarise(data, "cyl", "mpg") 112 | expect_equal(observed, expected) 113 | 114 | data_na_all_na <- data 115 | data_na_all_na$mpg[1] <- NA 116 | 117 | expected <- tibble::tibble( 118 | cyl = c(4, 6, 8, NA), 119 | n = c(11, 6, 14, 1), 120 | na = c(0, 0, 0, 1), 121 | mean = c(26.663636, 19.5333333, 15.10000, NA), 122 | sd = c(4.50982765, 1.47196014, 2.56004808, NA), 123 | min = c(21.4, 17.8, 10.4, NA), 124 | lq = c(22.800, 18.375, 14.400, NA), 125 | med = c(26.00, 19.45, 15.20, NA), 126 | uq = c(30.400, 20.675, 16.250, NA), 127 | max = c(33.9, 21.4, 19.2, NA)) 128 | observed <- cat_summarise(data_na_all_na, "cyl", "mpg") 129 | expect_equal(observed, expected) 130 | 131 | data_na_summary <- data 132 | data_na_summary$cyl[3] <- NA 133 | 134 | expected <- tibble::tibble( 135 | cyl = c(4, 6, 8, NA), 136 | n = c(10, 6, 14, 2), 137 | na = c(0, 0, 0, 0), 138 | mean = c(27.05000, 19.5333333, 15.10000, 21.90000), 139 | sd = c(4.557838182, 1.4719601444, 2.5600480765, 1.272792061), 140 | min = c(21.4, 17.8, 10.4, 21.0), 141 | lq = c(23.200, 18.375, 14.400, 21.450), 142 | med = c(26.65, 19.45, 15.20, 21.90), 143 | uq = c(30.400, 20.675, 16.250, 22.350), 144 | max = c(33.9, 21.4, 19.2, 22.8)) 145 | observed <- cat_summarise(data_na_summary, "cyl", "mpg") 146 | expect_equal(observed, expected) 147 | 148 | }) 149 | 150 | test_that("cat_summarise returns correct data with a valid na.rm argument", { 151 | 152 | expected <- tibble::tibble( 153 | cyl = c(4, 6, 8, NA), 154 | n = c(11, 6, 14, 1), 155 | na = c(0, 0, 0, 0), 156 | mean = c(26.663636, 19.5333333, 15.10000, 21.00000), 157 | sd = c(4.50982765, 1.47196014, 2.56004808, NA), 158 | min = c(21.4, 17.8, 10.4, 21.0), 159 | lq = c(22.800, 18.375, 14.400, 21.000), 160 | med = c(26.00, 19.45, 15.20, 21.00), 161 | uq = c(30.400, 20.675, 16.250, 21.000), 162 | max = c(33.9, 21.4, 19.2, 21.0)) 163 | observed <- cat_summarise(data, "cyl", "mpg", na.rm = FALSE) 164 | expect_equal(observed, expected) 165 | 166 | expected <- tibble::tibble( 167 | cyl = c(4, 6, 8), 168 | n = c(11, 6, 14), 169 | na = c(0, 0, 0), 170 | mean = c(26.663636, 19.5333333, 15.10000), 171 | sd = c(4.50982765, 1.47196014, 2.56004808), 172 | min = c(21.4, 17.8, 10.4), 173 | lq = c(22.800, 18.375, 14.400), 174 | med = c(26.00, 19.45, 15.20), 175 | uq = c(30.400, 20.675, 16.250), 176 | max = c(33.9, 21.4, 19.2)) 177 | observed <- cat_summarise(data, "cyl", "mpg", na.rm = TRUE) 178 | expect_equal(observed, expected) 179 | }) 180 | 181 | test_that("cat_summarise returns correct data with a valid clean_names argument", { 182 | 183 | data$Cyl <- data$cyl 184 | 185 | expected <- tibble::tibble( 186 | cyl = c(4, 6, 8, NA), 187 | n = c(11, 6, 14, 1), 188 | na = c(0, 0, 0, 0), 189 | mean = c(26.663636, 19.5333333, 15.10000, 21.00000), 190 | sd = c(4.50982765, 1.47196014, 2.56004808, NA), 191 | min = c(21.4, 17.8, 10.4, 21.0), 192 | lq = c(22.800, 18.375, 14.400, 21.000), 193 | med = c(26.00, 19.45, 15.20, 21.00), 194 | uq = c(30.400, 20.675, 16.250, 21.000), 195 | max = c(33.9, 21.4, 19.2, 21.0)) 196 | observed <- cat_summarise(data, "Cyl", "mpg", clean_names = TRUE) 197 | expect_equal(observed, expected) 198 | 199 | expected <- tibble::tibble( 200 | Cyl = c(4, 6, 8, NA), 201 | n = c(11, 6, 14, 1), 202 | na = c(0, 0, 0, 0), 203 | mean = c(26.663636, 19.5333333, 15.10000, 21.00000), 204 | sd = c(4.50982765, 1.47196014, 2.56004808, NA), 205 | min = c(21.4, 17.8, 10.4, 21.0), 206 | lq = c(22.800, 18.375, 14.400, 21.000), 207 | med = c(26.00, 19.45, 15.20, 21.00), 208 | uq = c(30.400, 20.675, 16.250, 21.000), 209 | max = c(33.9, 21.4, 19.2, 21.0)) 210 | observed <- cat_summarise(data, "Cyl", "mpg", clean_names = FALSE) 211 | expect_equal(observed, expected) 212 | }) 213 | 214 | test_that("cat_summarise uses option for default clean_names argument", { 215 | 216 | data$Cyl <- data$cyl 217 | restore_option <- getOption("tabbycat.clean_names") 218 | 219 | options(tabbycat.clean_names = FALSE) 220 | expected <- tibble::tibble( 221 | Cyl = c(4, 6, 8, NA), 222 | n = c(11, 6, 14, 1), 223 | na = c(0, 0, 0, 0), 224 | mean = c(26.663636, 19.5333333, 15.10000, 21.00000), 225 | sd = c(4.50982765, 1.47196014, 2.56004808, NA), 226 | min = c(21.4, 17.8, 10.4, 21.0), 227 | lq = c(22.800, 18.375, 14.400, 21.000), 228 | med = c(26.00, 19.45, 15.20, 21.00), 229 | uq = c(30.400, 20.675, 16.250, 21.000), 230 | max = c(33.9, 21.4, 19.2, 21.0)) 231 | observed <- cat_summarise(data, "Cyl", "mpg") 232 | expect_equal(observed, expected) 233 | 234 | options(tabbycat.clean_names = TRUE) 235 | expected <- tibble::tibble( 236 | cyl = c(4, 6, 8, NA), 237 | n = c(11, 6, 14, 1), 238 | na = c(0, 0, 0, 0), 239 | mean = c(26.663636, 19.5333333, 15.10000, 21.00000), 240 | sd = c(4.50982765, 1.47196014, 2.56004808, NA), 241 | min = c(21.4, 17.8, 10.4, 21.0), 242 | lq = c(22.800, 18.375, 14.400, 21.000), 243 | med = c(26.00, 19.45, 15.20, 21.00), 244 | uq = c(30.400, 20.675, 16.250, 21.000), 245 | max = c(33.9, 21.4, 19.2, 21.0)) 246 | observed <- cat_summarise(data, "Cyl", "mpg") 247 | expect_equal(observed, expected) 248 | 249 | options(tabbycat.clean_names = restore_option) 250 | }) 251 | 252 | test_that("cat_summarize is an alias for cat_summarise", { 253 | 254 | summarise <- cat_summarise(data, "cyl", "mpg") 255 | summarize <- cat_summarize(data, "cyl", "mpg") 256 | expect_equal(summarise, summarize) 257 | }) 258 | -------------------------------------------------------------------------------- /tests/testthat/test_vcount.R: -------------------------------------------------------------------------------- 1 | # Test vcount.R 2 | 3 | # Setup ----------------------------------------------------------------------- 4 | 5 | cat <- c("a", "b", "b", "c", "c", "c", "d", "d", "d", "d") 6 | 7 | # Tests: cat_vcount ----------------------------------------------------------- 8 | 9 | test_that("cat_vcount rejects a cat argument that is not a vector", { 10 | 11 | msg <- "The \"cat\" argument is not a vector." 12 | expect_error(cat_vcount(NULL), msg) 13 | expect_error(cat_vcount(list()), msg) 14 | expect_error(cat_vcount(data.frame()), msg) 15 | }) 16 | 17 | test_that("cat_vcount rejects a cat argument that is an empty factor", { 18 | 19 | msg <- "The \"cat\" argument is empty." 20 | expect_error(cat_vcount(factor()), msg) 21 | }) 22 | 23 | test_that("cat_vcount rejects invalid na.rm arguments", { 24 | 25 | msg <- "Invalid \"na.rm\" argument. Must be either TRUE or FALSE." 26 | expect_error(cat_vcount(cat, na.rm = NULL), msg) 27 | expect_error(cat_vcount(cat, na.rm = NA), msg) 28 | expect_error(cat_vcount(cat, na.rm = 1), msg) 29 | expect_error(cat_vcount(cat, na.rm = ""), msg) 30 | expect_error(cat_vcount(cat, na.rm = 1:10), msg) 31 | expect_error(cat_vcount(cat, na.rm = LETTERS[1:10]), msg) 32 | expect_error(cat_vcount(cat, na.rm = c(TRUE, FALSE)), msg) 33 | expect_error(cat_vcount(cat, na.rm = list()), msg) 34 | }) 35 | 36 | test_that("cat_vcount rejects invalid clean_names arguments", { 37 | 38 | msg <- "Invalid \"clean_names\" argument. Must be either TRUE or FALSE." 39 | expect_error(cat_vcount(cat, clean_names = NULL), msg) 40 | expect_error(cat_vcount(cat, clean_names = NA), msg) 41 | expect_error(cat_vcount(cat, clean_names = 1), msg) 42 | expect_error(cat_vcount(cat, clean_names = ""), msg) 43 | expect_error(cat_vcount(cat, clean_names = 1:10), msg) 44 | expect_error(cat_vcount(cat, clean_names = LETTERS[1:10]), msg) 45 | expect_error(cat_vcount(cat, clean_names = c(TRUE, FALSE)), msg) 46 | expect_error(cat_vcount(cat, clean_names = list()), msg) 47 | }) 48 | 49 | test_that("cat_vcount rejects invalid only arguments", { 50 | 51 | msg <- "Invalid \"only\" argument. Must be a character vector of length one." 52 | expect_error(cat_vcount(cat, only = NULL), msg) 53 | expect_error(cat_vcount(cat, only = NA), msg) 54 | expect_error(cat_vcount(cat, only = 1), msg) 55 | expect_error(cat_vcount(cat, only = TRUE), msg) 56 | expect_error(cat_vcount(cat, only = 1:10), msg) 57 | expect_error(cat_vcount(cat, only = LETTERS[1:10]), msg) 58 | expect_error(cat_vcount(cat, only = c(TRUE, FALSE)), msg) 59 | expect_error(cat_vcount(cat, only = list()), msg) 60 | }) 61 | 62 | test_that("cat_vcount returns expected data with defaults", { 63 | 64 | expected <- tibble::tibble( 65 | cat = letters[4:1], 66 | number = 4:1, 67 | percent = number / sum(number)) 68 | observed <- cat_vcount(cat) 69 | expect_equal(observed, expected) 70 | }) 71 | 72 | test_that("cat_vcount returns expected data with a valid na.rm argument", { 73 | 74 | cat <- c(cat, NA) 75 | 76 | expected <- tibble::tibble( 77 | cat = c(letters[4:1], NA), 78 | number = as.integer(c(4:1, 1)), 79 | percent = number / sum(number)) 80 | observed <- cat_vcount(cat) 81 | expect_equal(observed[2:3], expected[2:3]) 82 | expect_equal(observed[1:4, ], expected[1:4, ]) 83 | expect_equal(is.na(observed[[5, 1]]), TRUE) 84 | 85 | expected <- tibble::tibble( 86 | cat = c(letters[4:1], NA), 87 | number = as.integer(c(4:1, 1)), 88 | percent = number / sum(number)) 89 | observed <- cat_vcount(cat, na.rm = FALSE) 90 | expect_equal(observed[2:3], expected[2:3]) 91 | expect_equal(observed[1:4, ], expected[1:4, ]) 92 | expect_equal(is.na(observed[[5, 1]]), TRUE) 93 | 94 | expected <- tibble::tibble( 95 | cat = letters[4:1], 96 | number = 4:1, 97 | percent = number / sum(number)) 98 | observed <- cat_vcount(cat, na.rm = TRUE) 99 | expect_equal(observed, expected) 100 | }) 101 | 102 | test_that("cat_vcount returns expected data with a valid only argument", { 103 | 104 | expected <- tibble::tibble( 105 | cat = letters[4:1], 106 | number = 4:1, 107 | percent = number / sum(number)) 108 | observed <- cat_vcount(cat, only = "ignore") 109 | expect_equal(observed, expected) 110 | 111 | expected_number <- tibble::tibble( 112 | cat = letters[4:1], 113 | number = 4:1) 114 | observed <- cat_vcount(cat, only = "n") 115 | expect_equal(observed, expected_number) 116 | 117 | observed <- cat_vcount(cat, only = "number") 118 | expect_equal(observed, expected_number) 119 | 120 | observed <- cat_vcount(cat, only = " number ") 121 | expect_equal(observed, expected_number) 122 | 123 | expected_percent <- tibble::tibble( 124 | cat = letters[4:1], 125 | percent = 4:1 / sum(4:1)) 126 | 127 | observed <- cat_vcount(cat, only = "p") 128 | expect_equal(observed, expected_percent) 129 | 130 | observed <- cat_vcount(cat, only = "percent") 131 | expect_equal(observed, expected_percent) 132 | 133 | observed <- cat_vcount(cat, only = " percent ") 134 | expect_equal(observed, expected_percent) 135 | }) 136 | 137 | test_that("cat_count returns correct data with a valid clean_names argument", { 138 | 139 | Cat <- cat 140 | 141 | expected <- tibble::tibble( 142 | cat = letters[4:1], 143 | number = 4:1, 144 | percent = number / sum(number)) 145 | observed <- cat_vcount(Cat, clean_names = TRUE) 146 | expect_equal(observed, expected) 147 | 148 | expected <- tibble::tibble( 149 | Cat = letters[4:1], 150 | number = 4:1, 151 | percent = number / sum(number)) 152 | observed <- cat_vcount(Cat, clean_names = FALSE) 153 | expect_equal(observed, expected) 154 | }) 155 | 156 | test_that("cat_vcount uses option for default clean_names argument", { 157 | 158 | Cat <- cat 159 | restore_option <- getOption("tabbycat.clean_names") 160 | 161 | options(tabbycat.clean_names = TRUE) 162 | expected <- tibble::tibble( 163 | cat = letters[4:1], 164 | number = 4:1, 165 | percent = number / sum(number)) 166 | observed <- cat_vcount(Cat) 167 | expect_equal(observed, expected) 168 | 169 | options(tabbycat.clean_names = FALSE) 170 | expected <- tibble::tibble( 171 | Cat = letters[4:1], 172 | number = 4:1, 173 | percent = number / sum(number)) 174 | observed <- cat_vcount(Cat) 175 | expect_equal(observed, expected) 176 | 177 | options(tabbycat.clean_names = restore_option) 178 | }) 179 | 180 | --------------------------------------------------------------------------------