├── .Rbuildignore
├── .github
    ├── .gitignore
    └── workflows
    │   ├── check-standard.yaml
    │   └── test-coverage.yaml
├── .gitignore
├── DESCRIPTION
├── LICENSE
├── LICENSE.md
├── NAMESPACE
├── NEWS.md
├── R
    ├── compare.R
    ├── contrast.R
    ├── count.R
    ├── summarise.R
    ├── tabbycat-package.R
    ├── vcount.R
    └── zzz.R
├── cran-comments.md
├── man
    ├── cat_compare.Rd
    ├── cat_contrast.Rd
    ├── cat_count.Rd
    ├── cat_summarise.Rd
    ├── cat_vcount.Rd
    ├── safe_max.Rd
    ├── safe_mean.Rd
    ├── safe_min.Rd
    └── tabbycat.Rd
├── readme.md
├── tabbycat.Rproj
└── tests
    ├── testthat.R
    └── testthat
        ├── test_compare.R
        ├── test_contrast.R
        ├── test_count.R
        ├── test_summarise.R
        └── test_vcount.R


/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | ^\.github$
4 | ^readme\.md$
5 | ^cran-comments\.md$
6 | ^LICENSE\.md$
7 | 


--------------------------------------------------------------------------------
/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | 


--------------------------------------------------------------------------------
/.github/workflows/check-standard.yaml:
--------------------------------------------------------------------------------
 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 3 | on:
 4 |   push:
 5 |     branches: [main, master]
 6 |   pull_request:
 7 |     branches: [main, master]
 8 | 
 9 | name: R-CMD-check
10 | 
11 | jobs:
12 |   R-CMD-check:
13 |     runs-on: ${{ matrix.config.os }}
14 | 
15 |     name: ${{ matrix.config.os }} (${{ matrix.config.r }})
16 | 
17 |     strategy:
18 |       fail-fast: false
19 |       matrix:
20 |         config:
21 |           - {os: macos-latest,   r: 'release'}
22 |           - {os: windows-latest, r: 'release'}
23 |           - {os: ubuntu-latest,   r: 'devel', http-user-agent: 'release'}
24 |           - {os: ubuntu-latest,   r: 'release'}
25 |           - {os: ubuntu-latest,   r: 'oldrel-1'}
26 | 
27 |     env:
28 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
29 |       R_KEEP_PKG_SOURCE: yes
30 | 
31 |     steps:
32 |       - uses: actions/checkout@v3
33 | 
34 |       - uses: r-lib/actions/setup-pandoc@v2
35 | 
36 |       - uses: r-lib/actions/setup-r@v2
37 |         with:
38 |           r-version: ${{ matrix.config.r }}
39 |           http-user-agent: ${{ matrix.config.http-user-agent }}
40 |           use-public-rspm: true
41 | 
42 |       - uses: r-lib/actions/setup-r-dependencies@v2
43 |         with:
44 |           extra-packages: any::rcmdcheck
45 |           needs: check
46 | 
47 |       - uses: r-lib/actions/check-r-package@v2
48 |         with:
49 |           upload-snapshots: true
50 | 


--------------------------------------------------------------------------------
/.github/workflows/test-coverage.yaml:
--------------------------------------------------------------------------------
 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 3 | on:
 4 |   push:
 5 |     branches: [main, master]
 6 |   pull_request:
 7 |     branches: [main, master]
 8 | 
 9 | name: test-coverage
10 | 
11 | jobs:
12 |   test-coverage:
13 |     runs-on: ubuntu-latest
14 |     env:
15 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
16 | 
17 |     steps:
18 |       - uses: actions/checkout@v3
19 | 
20 |       - uses: r-lib/actions/setup-r@v2
21 |         with:
22 |           use-public-rspm: true
23 | 
24 |       - uses: r-lib/actions/setup-r-dependencies@v2
25 |         with:
26 |           extra-packages: any::covr
27 |           needs: coverage
28 | 
29 |       - name: Test coverage
30 |         run: |
31 |           covr::codecov(
32 |             quiet = FALSE,
33 |             clean = FALSE,
34 |             install_path = file.path(Sys.getenv("RUNNER_TEMP"), "package")
35 |           )
36 |         shell: Rscript {0}
37 | 
38 |       - name: Show testthat output
39 |         if: always()
40 |         run: |
41 |           ## --------------------------------------------------------------------
42 |           find ${{ runner.temp }}/package -name 'testthat.Rout*' -exec cat '{}' \; || true
43 |         shell: bash
44 | 
45 |       - name: Upload test results
46 |         if: failure()
47 |         uses: actions/upload-artifact@v3
48 |         with:
49 |           name: coverage-test-failures
50 |           path: ${{ runner.temp }}/package
51 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # History files
 2 | .Rhistory
 3 | .Rapp.history
 4 | 
 5 | # Session Data files
 6 | .RData
 7 | 
 8 | # Example code in package build process
 9 | *-Ex.R
10 | 
11 | # Output files from R CMD build
12 | /*.tar.gz
13 | 
14 | # Output files from R CMD check
15 | /*.Rcheck/
16 | 
17 | # RStudio files
18 | .Rproj.user/
19 | 
20 | # produced vignettes
21 | vignettes/*.html
22 | vignettes/*.pdf
23 | 
24 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
25 | .httr-oauth
26 | 
27 | # knitr and R markdown default cache directories
28 | /*_cache/
29 | /cache/
30 | 
31 | # Temporary files created by R markdown
32 | *.utf8.md
33 | *.knit.md
34 | 
35 | # Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html
36 | rsconnect/
37 | .Rproj.user
38 | 
39 | # Mac files
40 | .DS_Store
41 | 
42 | 
43 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: tabbycat
 2 | Type: Package
 3 | Title: Tabulate and Summarise Categorical Data
 4 | Version: 0.18.0
 5 | Authors@R: 
 6 |     person(
 7 |         given = "Oliver",
 8 |         family = "Hawkins",
 9 |         role = c("aut", "cre"),
10 |         email = "oli@olihawkins.com")
11 | Maintainer: Oliver Hawkins <oli@olihawkins.com>
12 | Description: Functions for tabulating and summarising categorical variables. 
13 |     Most functions are designed to work with dataframes, and use the 'tidyverse' 
14 |     idiom of taking the dataframe as the first argument so they work within 
15 |     pipelines. Equivalent functions that operate directly on vectors are also 
16 |     provided where it makes sense. This package aims to make exploratory data 
17 |     analysis involving categorical variables quicker, simpler and more robust.
18 | License: MIT + file LICENSE
19 | Depends: 
20 |     R (>= 3.4.0)
21 | Imports:
22 |     dplyr (>= 1.0.0),
23 |     janitor,
24 |     magrittr,
25 |     purrr,
26 |     rlang,
27 |     stringr,
28 |     tibble,
29 |     tidyr
30 | Encoding: UTF-8
31 | RoxygenNote: 7.2.3
32 | Suggests: 
33 |     testthat (>= 3.0.0)
34 | Config/testthat/edition: 3
35 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2021
2 | COPYRIGHT HOLDER: Oliver Hawkins
3 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | # The MIT License
 2 | 
 3 | Copyright (c) 2021 Oliver Hawkins
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining
 6 | a copy of this software and associated documentation files (the
 7 | "Software"), to deal in the Software without restriction, including
 8 | without limitation the rights to use, copy, modify, merge, publish,
 9 | distribute, sublicense, and/or sell copies of the Software, and to
10 | permit persons to whom the Software is furnished to do so, subject to
11 | the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be
14 | included in all copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(cat_compare)
 4 | export(cat_contrast)
 5 | export(cat_count)
 6 | export(cat_summarise)
 7 | export(cat_summarize)
 8 | export(cat_vcount)
 9 | importFrom(magrittr,"%>%")
10 | importFrom(rlang,.data)
11 | 


--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
 1 | # tabbycat 0.18.0
 2 | 
 3 | * No breaking changes
 4 | * No new functions
 5 | * Existing internal code has been updated to remove deprecated uses of third-party packages
 6 | 
 7 | # tabbycat 0.17.0
 8 | 
 9 | * Initial CRAN release
10 | * The stable API comprises five functions:
11 |     * `cat_count`
12 |     * `cat_vcount`
13 |     * `cat_compare`
14 |     * `cat_contrast`
15 |     * `cat_summarise`
16 | * Unit test coverage is 97% and all tests pass
17 | * Added GitHub Actions, NEWS.md, and cran-comments.md for CRAN submission
18 | 


--------------------------------------------------------------------------------
/R/compare.R:
--------------------------------------------------------------------------------
  1 | #' Calculate the frequency of discrete values in one categorical variable for
  2 | #' each group within another categorical variable
  3 | #'
  4 | #' This function crosstabulates the frequencies of one categorical variable
  5 | #' within the groups of another. The results are sorted on the values of the
  6 | #' variable whose distribution is shown in each column i.e. the variable
  7 | #' specified with \code{row_cat}. If this variable is a character vector it
  8 | #' will be sorted alphabetically. If it is a factor it will be sorted in the
  9 | #' order of its levels.
 10 | #'
 11 | #' @param data A dataframe containing the two variables of interest.
 12 | #' @param row_cat The column name of a categorical variable whose distribution
 13 | #'   will be calculated for each group in \code{col_cat}.
 14 | #' @param col_cat The column name of a categorical variable which will be
 15 | #'   split into groups and the distrubtion of \code{row_cat} calulated
 16 | #'   for each group.
 17 | #' @param na.rm.row A boolean indicating whether to exclude NAs from the row
 18 | #'   results. The default is FALSE.
 19 | #' @param na.rm.col A boolean indicating whether to exclude NAs from the column
 20 | #'   results. The default is FALSE.
 21 | #' @param na.rm A boolean indicating whether to exclude NAs from both row and
 22 | #'   column results. This argument is provided as a convenience. It allows you
 23 | #'   to set \code{na.rm.row} and \code{na.rm.col} to the same value without
 24 | #'   having to specify them separately. If the value of \code{na.rm} is NULL,
 25 | #'   the argument is ignored. If it is not NULL it takes precendence.
 26 | #'   default is NULL.
 27 | #' @param only A string indicating that only one set of frequency columns
 28 | #'   should be returned in the results. If \code{only} is either "n" or
 29 | #'   "number", only the number columns are returned. If \code{only} is either
 30 | #'   "p" or "percent", only the percent columns are returned. If \code{only} is
 31 | #'   any other value, both sets of columns are shown. The default value is an
 32 | #'   empty string, which means both sets of columns are shown.
 33 | #' @param clean_names A boolean indicating whether the column names of the
 34 | #'   results tibble should be cleaned, so that any column names produced from
 35 | #'   data are converted to snake_case. The default is TRUE, but this can be
 36 | #'   changed with \code{options(tabbycat.clean_names = FALSE)}.
 37 | #' @param na_label A string indicating the label to use for the columns that
 38 | #'   contain data for missing values. The default value is "na", but use this
 39 | #'   argument to set a different value if the default value collides with data
 40 | #'   in your dataset.
 41 | #' @return A tibble showing the distribution of \code{row_cat} within each
 42 | #'   group in \code{col_cat}.
 43 | #' @export
 44 | 
 45 | cat_compare <- function(
 46 |     data,
 47 |     row_cat,
 48 |     col_cat,
 49 |     na.rm.row = FALSE,
 50 |     na.rm.col = FALSE,
 51 |     na.rm = NULL,
 52 |     only = "",
 53 |     clean_names = getOption("tabbycat.clean_names"),
 54 |     na_label = getOption("tabbycat.na_label")) {
 55 | 
 56 |     # Check the data argument is not null and is a dataframe
 57 |     if (is.null(data) || ! is.data.frame(data)) {
 58 |         stop("The \"data\" argument is not a dataframe.")
 59 |     }
 60 | 
 61 |     # Check that data has rows
 62 |     if (nrow(data) == 0) {
 63 |         stop("The \"data\" argument is empty.")
 64 |     }
 65 | 
 66 |     # Check the row_cat argument is a character vector of length one
 67 |     if (! is.character(row_cat) || length(row_cat) != 1) {
 68 |         stop("Invalid \"row_cat\" argument. Must be a character vector of length one.")
 69 |     }
 70 | 
 71 |     # Check the row_cat argument is a column in data
 72 |     if (! row_cat %in% colnames(data)) {
 73 |         stop(stringr::str_c("'", row_cat,
 74 |             "' is not a column in the dataframe."))
 75 |     }
 76 | 
 77 |     # Check the col_cat argument is a character vector of length one
 78 |     if (! is.character(col_cat) || length(col_cat) != 1) {
 79 |         stop("Invalid \"col_cat\" argument. Must be a character vector of length one.")
 80 |     }
 81 | 
 82 |     # Check the col_cat argument is a column in data
 83 |     if (! col_cat %in% colnames(data)) {
 84 |         stop(stringr::str_c("'", col_cat,
 85 |             "' is not a column in the dataframe."))
 86 |     }
 87 | 
 88 |     # Check the na.rm.row argument is valid
 89 |     if (length(na.rm.row) != 1 || is.na(na.rm.row) || ! is.logical(na.rm.row)) {
 90 |         stop("Invalid \"na.rm.row\" argument. Must be either TRUE or FALSE.")
 91 |     }
 92 | 
 93 |     # Check the na.rm.col argument is valid
 94 |     if (length(na.rm.col) != 1 || is.na(na.rm.col) || ! is.logical(na.rm.col)) {
 95 |         stop("Invalid \"na.rm.col\" argument. Must be either TRUE or FALSE.")
 96 |     }
 97 | 
 98 |     # Check the na.rm argument is valid
 99 |     if (length(na.rm) > 1 ||
100 |             (! is.null(na.rm) && ! is.logical(na.rm)) ||
101 |             (! is.null(na.rm) && is.na(na.rm))) {
102 |         stop("Invalid \"na.rm\" argument. Must be either NULL, TRUE or FALSE.")
103 |     }
104 | 
105 |     # Check the only argument is valid
106 |     if (length(only) != 1 || is.na(only) || ! is.character(only)) {
107 |         stop("Invalid \"only\" argument. Must be a character vector of length one.")
108 |     }
109 | 
110 |     # Check the clean_names argument is valid
111 |     if (length(clean_names) != 1 || is.na(clean_names) || ! is.logical(clean_names)) {
112 |         stop("Invalid \"clean_names\" argument. Must be either TRUE or FALSE.")
113 |     }
114 | 
115 |     # Check the na_label argument is valid
116 |     if (length(na_label) != 1 || is.na(na_label) || ! is.character(na_label)) {
117 |         stop("Invalid \"na_label\" argument. Must be a character vector of length one.")
118 |     }
119 | 
120 |     # Set both na.rm.row and na.rm.col to the value of na.rm if specified
121 |     if (! is.null(na.rm)) {
122 |         na.rm.row <- na.rm
123 |         na.rm.col <- na.rm
124 |     }
125 | 
126 |     # Remove rows with NAs if na.rm.row is TRUE
127 |     if (na.rm.row == TRUE) {
128 |         data <- data %>% dplyr::filter(! is.na(.data[[row_cat]]))
129 |     }
130 | 
131 |     group_names <- sort(unique(data[[col_cat]]))
132 | 
133 |     comparison_data <- purrr::map_dfr(group_names, function(group_name) {
134 | 
135 |         data %>%
136 |             dplyr::filter(.data[[col_cat]] == group_name) %>%
137 |             dplyr::group_by(.data[[row_cat]]) %>%
138 |             dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
139 |             dplyr::mutate(p = .data$n / sum(.data$n)) %>%
140 |             dplyr::mutate(group = as.character(group_name)) %>%
141 |             dplyr::select(
142 |                 "group",
143 |                 dplyr::all_of(row_cat),
144 |                 "n",
145 |                 "p")
146 |     })
147 | 
148 |     if (na.rm.col == FALSE) {
149 | 
150 |         na_data <- data %>%
151 |             dplyr::filter(is.na(.data[[col_cat]])) %>%
152 |             dplyr::group_by(.data[[row_cat]]) %>%
153 |             dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
154 |             dplyr::mutate(p = .data$n / sum(.data$n)) %>%
155 |             dplyr::mutate(group = na_label) %>%
156 |             dplyr::select(
157 |                 "group",
158 |                 dplyr::all_of(row_cat),
159 |                 "n",
160 |                 "p")
161 | 
162 |     } else {
163 | 
164 |         na_data <- tibble::tibble()
165 |     }
166 | 
167 |     comparison <- dplyr::bind_rows(
168 |             comparison_data,
169 |             na_data) %>%
170 |         tidyr::pivot_wider(
171 |             id_cols = dplyr::all_of(row_cat),
172 |             names_from = "group",
173 |             values_from = c("n", "p")) %>%
174 |         dplyr::mutate(dplyr::across(-1, ~tidyr::replace_na(.x, 0))) %>%
175 |         dplyr::arrange(.data[[row_cat]])
176 | 
177 |     # Clean names if clean_names is TRUE
178 |     if (clean_names == TRUE) {
179 |         comparison <- comparison %>% janitor::clean_names()
180 |     }
181 | 
182 |     # Remove columns based on only argument
183 |     if (stringr::str_trim(only) %in% c("n", "number")) {
184 |         comparison <- comparison %>% dplyr::select(-dplyr::starts_with("p_"))
185 |     }
186 | 
187 |     if (stringr::str_trim(only) %in% c("p", "percent")) {
188 |         comparison <- comparison %>% dplyr::select(-dplyr::starts_with("n_"))
189 |     }
190 | 
191 |     comparison
192 | }
193 | 


--------------------------------------------------------------------------------
/R/contrast.R:
--------------------------------------------------------------------------------
  1 | #' Calculate the frequency of discrete values in one categorical variable for
  2 | #' each of two mutually exclusive groups within another categorical variable
  3 | #'
  4 | #' This function shows the distrbution of values within given a categorical
  5 | #' variable for one group within another categorical variable, and compares it
  6 | #' with the distribution among all observations not in that group. Its purpose
  7 | #' is to let you see quickly whether the distribution within that group differs
  8 | #' from the distribution for the rest of the observations. The results are
  9 | #' sorted in descending order of frequency for the named group i.e. the group
 10 | #' named in \code{col_group}.
 11 | #'
 12 | #' @param data A dataframe containing the two variables of interest.
 13 | #' @param row_cat The column name of a categorical variable whose distribution
 14 | #'   should be calculated for each exclusive group in \code{col_cat}.
 15 | #' @param col_cat The column name of a categorical variable that will be
 16 | #'   split into two exclusive groups, one containing observations with a
 17 | #'   particular value of that variable, and another containing all other
 18 | #'   observations.
 19 | #' @param col_group The name of the group within \code{col_cat} that is
 20 | #'   used to split the observations into two exclusive groups: those that are
 21 | #'   in the group and those that are not in the group.
 22 | #' @param na.rm.row A boolean indicating whether to exclude NAs from the row
 23 | #'   results. The default is FALSE.
 24 | #' @param na.rm.col A boolean indicating whether to exclude NAs from the column
 25 | #'   results. The default is FALSE.
 26 | #' @param na.rm A boolean indicating whether to exclude NAs from both row and
 27 | #'   column results. This argument is provided as a convenience. It allows you
 28 | #'   to set \code{na.rm.row} and \code{na.rm.col} to the same value without
 29 | #'   having to specify them separately. If the value of \code{na.rm} is NULL,
 30 | #'   the argument is ignored. If it is not NULL it takes precendence.
 31 | #'   default is NULL.
 32 | #' @param clean_names A boolean indicating whether the column names of the
 33 | #'   results tibble should be cleaned, so that any column names produced from
 34 | #'   data are converted to snake_case. The default is TRUE, but this can be
 35 | #'   changed with \code{options(tabbycat.clean_names = FALSE)}.
 36 | #' @param only A string indicating that only one set of frequency columns
 37 | #'   should be returned in the results. If \code{only} is either "n" or
 38 | #'   "number", only the number columns are returned. If \code{only} is either
 39 | #'   "p" or "percent", only the percent columns are returned. If \code{only} is
 40 | #'   any other value, both sets of columns are shown. The default value is an
 41 | #'   empty string, which means both sets of columns are shown.
 42 | #' @param na_label A string indicating the label to use for the columns that
 43 | #'   contain data for missing values. The default value is "na", but use this
 44 | #'   argument to set a different value if the default value collides with data
 45 | #'   in your dataset.
 46 | #' @param other_label A string indicating the label to use for the columns that
 47 | #'   contain data for observations not in the named group. The default value is
 48 | #'   "other", but use this argument to set a different value if the default
 49 | #'   value collides with data in your dataset.
 50 | #' @return A tibble showing the distribution of \code{row_cat} within each of
 51 | #'   the two exclusive groups in \code{col_cat}.
 52 | #' @export
 53 | 
 54 | cat_contrast <- function(
 55 |     data,
 56 |     row_cat,
 57 |     col_cat,
 58 |     col_group,
 59 |     na.rm.row = FALSE,
 60 |     na.rm.col = FALSE,
 61 |     na.rm = NULL,
 62 |     only = "",
 63 |     clean_names = getOption("tabbycat.clean_names"),
 64 |     na_label = getOption("tabbycat.na_label"),
 65 |     other_label = getOption("tabbycat.other_label")) {
 66 | 
 67 |     # Check the data argument is not null and is a dataframe
 68 |     if (is.null(data) || ! is.data.frame(data)) {
 69 |         stop("The \"data\" argument is not a dataframe.")
 70 |     }
 71 | 
 72 |     # Check that data has rows
 73 |     if (nrow(data) == 0) {
 74 |         stop("The \"data\" argument is empty.")
 75 |     }
 76 | 
 77 |     # Check the row_cat argument is a character vector of length one
 78 |     if (! is.character(row_cat) || length(row_cat) != 1) {
 79 |         stop("Invalid \"row_cat\" argument. Must be a character vector of length one.")
 80 |     }
 81 | 
 82 |     # Check the row_cat argument is a column in data
 83 |     if (! row_cat %in% colnames(data)) {
 84 |         stop(stringr::str_c("'", row_cat,
 85 |             "' is not a column in the dataframe."))
 86 |     }
 87 | 
 88 |     # Check the col_cat argument is a character vector of length one
 89 |     if (! is.character(col_cat) || length(col_cat) != 1) {
 90 |         stop("Invalid \"col_cat\" argument. Must be a character vector of length one.")
 91 |     }
 92 | 
 93 |     # Check the col_cat argument is a column in data
 94 |     if (! col_cat %in% colnames(data)) {
 95 |         stop(stringr::str_c("'", col_cat,
 96 |             "' is not a column in the dataframe."))
 97 |     }
 98 | 
 99 |     # Check the col_group argument is a valid vector of length one
100 |     if (length(col_group) != 1 || is.na(col_group) || ! is.atomic(col_group)) {
101 |         stop("Invalid \"col_group\" argument. Must be a character vector of length one.")
102 |     }
103 | 
104 |     # Check that col_group exits in col_cat
105 |     if (! col_group %in% data[[col_cat]]) {
106 |         stop(stringr::str_c(
107 |             "The \"col_group\" '", col_group,
108 |             "' does not exist in the \"col_cat\" '", col_cat, "'."))
109 |     }
110 | 
111 |     # Check the na.rm.row argument is valid
112 |     if (length(na.rm.row) != 1 || is.na(na.rm.row) || ! is.logical(na.rm.row)) {
113 |         stop("Invalid \"na.rm.row\" argument. Must be either TRUE or FALSE.")
114 |     }
115 | 
116 |     # Check the na.rm.col argument is valid
117 |     if (length(na.rm.col) != 1 || is.na(na.rm.col) || ! is.logical(na.rm.col)) {
118 |         stop("Invalid \"na.rm.col\" argument. Must be either TRUE or FALSE.")
119 |     }
120 | 
121 |     # Check the na.rm argument is valid
122 |     if (length(na.rm) > 1 ||
123 |         (! is.null(na.rm) && ! is.logical(na.rm)) ||
124 |         (! is.null(na.rm) && is.na(na.rm))) {
125 |         stop("Invalid \"na.rm\" argument. Must be either NULL, TRUE or FALSE.")
126 |     }
127 | 
128 |     # Check the only argument is valid
129 |     if (length(only) != 1 || is.na(only) || ! is.character(only)) {
130 |         stop("Invalid \"only\" argument. Must be a character vector of length one.")
131 |     }
132 | 
133 |     # Check the clean_names argument is valid
134 |     if (length(clean_names) != 1 || is.na(clean_names) || ! is.logical(clean_names)) {
135 |         stop("Invalid \"clean_names\" argument. Must be either TRUE or FALSE.")
136 |     }
137 | 
138 |     # Check the other_label argument is a character vector of length one
139 |     if (! is.character(other_label) || length(other_label) != 1) {
140 |         stop("Invalid \"other_label\" argument. Must be a character vector of length one.")
141 |     }
142 | 
143 |     # Check the na_label argument is a character vector of length one
144 |     if (! is.character(na_label) || length(na_label) != 1) {
145 |         stop("Invalid \"na_label\" argument. Must be a character vector of length one.")
146 |     }
147 | 
148 |     # Set both na.rm.row and na.rm.col to the value of na.rm if specified
149 |     if (! is.null(na.rm)) {
150 |         na.rm.row <- na.rm
151 |         na.rm.col <- na.rm
152 |     }
153 | 
154 |     # Remove rows with NAs if na.rm.row is TRUE
155 |     if (na.rm.row == TRUE) {
156 |         data <- data %>% dplyr::filter(! is.na(.data[[row_cat]]))
157 |     }
158 | 
159 |     in_group_data <- data %>%
160 |         dplyr::filter(.data[[col_cat]] == col_group) %>%
161 |         dplyr::group_by(.data[[row_cat]]) %>%
162 |         dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
163 |         dplyr::mutate(p = .data$n / sum(.data$n)) %>%
164 |         dplyr::mutate(group = as.character(col_group)) %>%
165 |         dplyr::select(
166 |             "group",
167 |             dplyr::all_of(row_cat),
168 |             "n",
169 |             "p")
170 | 
171 |     out_group_data <- data %>%
172 |         dplyr::filter(.data[[col_cat]] != col_group) %>%
173 |         dplyr::group_by(.data[[row_cat]]) %>%
174 |         dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
175 |         dplyr::mutate(p = .data$n / sum(.data$n)) %>%
176 |         dplyr::mutate(group = other_label) %>%
177 |         dplyr::select(
178 |             "group",
179 |             dplyr::all_of(row_cat),
180 |             "n",
181 |             "p")
182 | 
183 |     if (na.rm.col == FALSE) {
184 | 
185 |         na_data <- data %>%
186 |             dplyr::filter(is.na(.data[[col_cat]])) %>%
187 |             dplyr::group_by(.data[[row_cat]]) %>%
188 |             dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
189 |             dplyr::mutate(p = .data$n / sum(.data$n)) %>%
190 |             dplyr::mutate(group = na_label) %>%
191 |             dplyr::select(
192 |                 "group",
193 |                 dplyr::all_of(row_cat),
194 |                 "n",
195 |                 "p")
196 | 
197 |     } else {
198 | 
199 |         na_data <- tibble::tibble()
200 |     }
201 | 
202 |     comparison <- dplyr::bind_rows(
203 |             in_group_data,
204 |             out_group_data,
205 |             na_data) %>%
206 |         tidyr::pivot_wider(
207 |             id_cols = dplyr::all_of(row_cat),
208 |             names_from = "group",
209 |             values_from = c("n", "p")) %>%
210 |         dplyr::mutate(dplyr::across(-1, ~tidyr::replace_na(.x, 0))) %>%
211 |         dplyr::arrange(dplyr::desc(.data[[stringr::str_c("n_", col_group)]]))
212 | 
213 |     # Clean names if clean_names is TRUE
214 |     if (clean_names == TRUE) {
215 |         comparison <- comparison %>% janitor::clean_names()
216 |     }
217 | 
218 |     # Remove columns based on only argument
219 |     if (stringr::str_trim(only) %in% c("n", "number")) {
220 |         comparison <- comparison %>% dplyr::select(-dplyr::starts_with("p_"))
221 |     }
222 | 
223 |     if (stringr::str_trim(only) %in% c("p", "percent")) {
224 |         comparison <- comparison %>% dplyr::select(-dplyr::starts_with("n_"))
225 |     }
226 | 
227 |     comparison
228 | }
229 | 


--------------------------------------------------------------------------------
/R/count.R:
--------------------------------------------------------------------------------
 1 | #' Count the frequency of discrete values in the column of a dataframe
 2 | #'
 3 | #' This function differs from \code{cat_vcount} in that it operates on columns
 4 | #' in dataframes rather than directly on vectors, which means it is more useful
 5 | #' in pipelines but handles a narrower range of inputs. The results are sorted
 6 | #' in descending order of frequency.
 7 | #'
 8 | #' @param data A dataframe containing a categorical vector for which
 9 | #'   frequencies will be calculated.
10 | #' @param cat The column name of the categorical variable for which frequencies
11 | #'   will be calculated.
12 | #' @param na.rm A boolean indicating whether to exclude NAs from the results.
13 | #'   The default is FALSE.
14 | #' @param only A string indicating that only one of the frequency columns
15 | #'   should be returned in the results. If \code{only} is either "n" or
16 | #'   "number", only the number column is returned. If \code{only} is either
17 | #'   "p" or "percent", only the percent column is returned. If \code{only} is
18 | #'   any other value, both columns are shown. The default value is an empty
19 | #'   string, which means both columns are shown.
20 | #' @param clean_names A boolean indicating whether the column names of the
21 | #'   results tibble should be cleaned, so that any column names produced from
22 | #'   data are converted to snake_case. The default is TRUE, but this can be
23 | #'   changed with \code{options(tabbycat.clean_names = FALSE)}.
24 | #' @return A tibble showing the frequency of each value in \code{cat}.
25 | #' @export
26 | 
27 | cat_count <- function(
28 |     data,
29 |     cat,
30 |     na.rm = FALSE,
31 |     only = "",
32 |     clean_names = getOption("tabbycat.clean_names")) {
33 | 
34 |     # Check the data argument is not null and is a dataframe
35 |     if (is.null(data) || ! is.data.frame(data)) {
36 |         stop("The \"data\" argument is not a dataframe.")
37 |     }
38 | 
39 |     # Check that data has rows
40 |     if (nrow(data) == 0) {
41 |         stop("The \"data\" argument is empty.")
42 |     }
43 | 
44 |     # Check the cat argument is a character vector of length one
45 |     if (! is.character(cat) || length(cat) != 1) {
46 |         stop("Invalid \"cat\" argument. Must be a character vector of length one.")
47 |     }
48 | 
49 |     # Check the cat argument is a column in data
50 |     if (! cat %in% colnames(data)) {
51 |         stop(stringr::str_c("'", cat, "' is not a column in the dataframe."))
52 |     }
53 | 
54 |     # Check the na.rm argument is valid
55 |     if (length(na.rm) != 1 || is.na(na.rm) || ! is.logical(na.rm)) {
56 |         stop("Invalid \"na.rm\" argument. Must be either TRUE or FALSE.")
57 |     }
58 | 
59 |     # Check the only argument is valid
60 |     if (length(only) != 1 || is.na(only) || ! is.character(only)) {
61 |         stop("Invalid \"only\" argument. Must be a character vector of length one.")
62 |     }
63 | 
64 |     # Check the clean_names argument is valid
65 |     if (length(clean_names) != 1 || is.na(clean_names) || ! is.logical(clean_names)) {
66 |         stop("Invalid \"clean_names\" argument. Must be either TRUE or FALSE.")
67 |     }
68 | 
69 |     # Remove rows with NAs if na.rm is TRUE
70 |     if (na.rm == TRUE) {
71 |         data <- data %>% dplyr::filter(! is.na(.data[[cat]]))
72 |     }
73 | 
74 |     # Create table
75 |     count <- data %>%
76 |         dplyr::group_by(.data[[cat]]) %>%
77 |         dplyr::summarise(number = dplyr::n(), .groups = "drop") %>%
78 |         dplyr::mutate(percent = .data$number / sum(.data$number)) %>%
79 |         dplyr::arrange(dplyr::desc(.data$number))
80 | 
81 |     # Clean names if clean_names is TRUE
82 |     if (clean_names == TRUE) {
83 |         count <- count %>% janitor::clean_names()
84 |     }
85 | 
86 |     # Remove columns based on only argument
87 |     if (stringr::str_trim(only) %in% c("n", "number")) {
88 |         count <- count %>% dplyr::select(-c("percent"))
89 |     }
90 | 
91 |     if (stringr::str_trim(only) %in% c("p", "percent")) {
92 |         count <- count %>% dplyr::select(-c("number"))
93 |     }
94 | 
95 |     count
96 | }
97 | 


--------------------------------------------------------------------------------
/R/summarise.R:
--------------------------------------------------------------------------------
  1 | #' Summarise the values of a numerical variable for each group within a
  2 | #' categorical variable
  3 | #'
  4 | #' The results are sorted on the values of the categorical variable i.e.
  5 | #' the variable specified with \code{cat}. If this variable is a character
  6 | #' vector it will be sorted alphabetically. If it is a factor it will be
  7 | #' sorted in the order of its levels. This function can be called as either
  8 | #' \code{cat_summarise} or \code{cat_summarize}.
  9 | #'
 10 | #' @param data A dataframe containing a categorical variable and numerical
 11 | #'   variable to summarise.
 12 | #' @param cat The name of a column in \code{data} which is a categorical vector
 13 | #'   of discrete values for which summaries will be calculated.
 14 | #' @param num The name of a column in \code{data} which is a numerical vector
 15 | #'   that will be summarised for each group.
 16 | #' @param na.rm A boolean indicating whether to exclude NAs from the row
 17 | #'   results. Note that NAs are **always** ignored in calculating the summary
 18 | #'   statistics for \code{num} shown in each row, and the number of NAs that
 19 | #'   exist in \code{num} for each group in \code{cat} is shown in the
 20 | #'   \code{na} column of the results table. This argument controls whether a
 21 | #'   row of summary statistics is shown for observations that are NA in
 22 | #'   \code{cat}. The default is FALSE.
 23 | #' @param clean_names A boolean indicating whether the column names of the
 24 | #'   results tibble should be cleaned, so that any column names produced from
 25 | #'   data are converted to snake_case. The default is TRUE, but this can be
 26 | #'   changed with \code{options(tabbycat.clean_names = FALSE)}.
 27 | #' @return A tibble showing summary statistics for \code{num} for each group
 28 | #'   in \code{cat}.
 29 | #' @export
 30 | 
 31 | cat_summarise <- function(
 32 |     data,
 33 |     cat,
 34 |     num,
 35 |     na.rm = FALSE,
 36 |     clean_names = getOption("tabbycat.clean_names")) {
 37 | 
 38 |     # Check the data argument is not null and is a dataframe
 39 |     if (is.null(data) || ! is.data.frame(data)) {
 40 |         stop("The \"data\" argument is not a dataframe.")
 41 |     }
 42 | 
 43 |     # Check that data has rows
 44 |     if (nrow(data) == 0) {
 45 |         stop("The \"data\" argument is empty.")
 46 |     }
 47 | 
 48 |     # Check the cat argument is a character vector of length one
 49 |     if (! is.character(cat) || length(cat) != 1) {
 50 |         stop("Invalid \"cat\" argument. Must be a character vector of length one.")
 51 |     }
 52 | 
 53 |     # Check the cat argument is a column in data
 54 |     if (! cat %in% colnames(data)) {
 55 |         stop(stringr::str_c("'", cat, "' is not a column in the dataframe."))
 56 |     }
 57 | 
 58 |     # Check the num argument is a character vector of length one
 59 |     if (! is.character(num) || length(num) != 1) {
 60 |         stop("Invalid \"num\" argument. Must be a character vector of length one.")
 61 |     }
 62 | 
 63 |     # Check the num argument is a column in data
 64 |     if (! num %in% colnames(data)) {
 65 |         stop(stringr::str_c("'", num, "' is not a column in the dataframe."))
 66 |     }
 67 | 
 68 |     # Check the num argument is numeric
 69 |     if (! is.numeric(data[[num]])) {
 70 |         stop(stringr::str_c("The num argument is not a numeric column."))
 71 |     }
 72 | 
 73 |     # Check the na.rm argument is valid
 74 |     if (length(na.rm) != 1 || is.na(na.rm) || ! is.logical(na.rm)) {
 75 |         stop("Invalid \"na.rm\" argument. Must be either TRUE or FALSE.")
 76 |     }
 77 | 
 78 |     # Check the clean_names argument is valid
 79 |     if (length(clean_names) != 1 || is.na(clean_names) || ! is.logical(clean_names)) {
 80 |         stop("Invalid \"clean_names\" argument. Must be either TRUE or FALSE.")
 81 |     }
 82 | 
 83 |     # Remove rows with NAs if na.rm is TRUE
 84 |     if (na.rm == TRUE) {
 85 |         data <- data %>% dplyr::filter(! is.na(.data[[cat]]))
 86 |     }
 87 | 
 88 |     # Create table
 89 |     summary <- data %>%
 90 |         dplyr::group_by(.data[[cat]]) %>%
 91 |         dplyr::summarise(
 92 |             n = dplyr::n(),
 93 |             na = sum(is.na(.data[[num]])),
 94 |             mean = safe_mean(.data[[num]], na.rm = TRUE),
 95 |             sd = stats::sd(.data[[num]], na.rm = TRUE),
 96 |             min = safe_min(.data[[num]], na.rm = TRUE),
 97 |             lq = unname(stats::quantile(.data[[num]], 0.25, na.rm = TRUE)),
 98 |             med = stats::median(.data[[num]], na.rm = TRUE),
 99 |             uq = unname(stats::quantile(.data[[num]], 0.75, na.rm = TRUE)),
100 |             max = safe_max(.data[[num]], na.rm = TRUE),
101 |             .groups = "drop") %>%
102 |         dplyr::arrange(.data[[cat]])
103 | 
104 |     # Clean names if clean_names is TRUE
105 |     if (clean_names == TRUE) {
106 |         summary <- summary %>% janitor::clean_names()
107 |     }
108 | 
109 |     summary
110 | }
111 | 
112 | #' @rdname cat_summarise
113 | #' @export
114 | #'
115 | cat_summarize <- cat_summarise
116 | 
117 | #' Calculate \code{mean} but return NA rather than NaN when values are missing
118 | #'
119 | #' This function is a drop-in replacement for \code{mean}, which is used in
120 | #' \code{cat_summarise}. It returns NA rather than NaN when all values are NA.
121 | #'
122 | #' @param x A numerical vector.
123 | #' @param na.rm A boolean indicating whether to remove NAs.
124 | #' @return The mean of \code{x} or NA when values are missing.
125 | #' @keywords internal
126 | 
127 | safe_mean <- function(x, na.rm = FALSE) {
128 |     result <- mean(x, na.rm = na.rm)
129 |     if (is.na(result)) return(NA)
130 |     result
131 | }
132 | 
133 | #' Calculate \code{min} but suppress the warning when all values are missing
134 | #'
135 | #' This function is a drop-in replacement for \code{min}, which is used in
136 | #' \code{cat_summarise}. It suppresses the warning when all values are NA and
137 | #' na.rm is TRUE, and returns NA instead of Inf.
138 | #'
139 | #' @param x A numerical vector.
140 | #' @param na.rm A boolean indicating whether to remove NAs.
141 | #' @return The min of \code{x} or NA when values are missing.
142 | #' @keywords internal
143 | 
144 | safe_min <- function(x, na.rm = FALSE) {
145 |     result <- NA
146 |     tryCatch(
147 |         result <- min(x, na.rm = na.rm),
148 |         warning = function(c) c)
149 |     result
150 | }
151 | 
152 | 
153 | #' Calculate \code{max} but suppress the warning when all values are missing
154 | #'
155 | #' This function is a drop-in replacement for \code{max}, which is used in
156 | #' \code{cat_summarise}. It suppresses the warning when all values are NA and
157 | #' na.rm is TRUE, and returns NA instead of -Inf.
158 | #'
159 | #' @param x A numerical vector.
160 | #' @param na.rm A boolean indicating whether to remove NAs.
161 | #' @return The max of \code{x} or NA when values are missing.
162 | #' @keywords internal
163 | 
164 | safe_max <- function(x, na.rm = FALSE) {
165 |     result <- NA
166 |     tryCatch(
167 |         result <- max(x, na.rm = na.rm),
168 |         warning = function(c) c)
169 |     result
170 | }
171 | 


--------------------------------------------------------------------------------
/R/tabbycat-package.R:
--------------------------------------------------------------------------------
 1 | #' tabbycat: Tabulate and summarise categorical data
 2 | #'
 3 | #' Functions for tabulating and summarising categorical variables.
 4 | #'
 5 | #' @name tabbycat
 6 | #' @importFrom rlang .data
 7 | #' @importFrom magrittr %>%
 8 | #' @keywords internal
 9 | "_PACKAGE"
10 | 
11 | # Tell R CMD check about new operators
12 | if(getRversion() >= "2.15.1") {
13 |     utils::globalVariables(c(".", ":="))
14 | }
15 | 


--------------------------------------------------------------------------------
/R/vcount.R:
--------------------------------------------------------------------------------
 1 | #' Count the frequency of discrete values in a categorical vector
 2 | #'
 3 | #' This function differs from \code{cat_count} in that it operates directly on
 4 | #' vectors, rather than on columns in dataframes, which means it is less useful
 5 | #' in pipelines but can handle a wider range of inputs. The results are sorted
 6 | #' in descending order of frequency.
 7 | #'
 8 | #' @param cat A categorical vector for which frequencies will be calculated.
 9 | #' @param na.rm A boolean indicating whether to exclude NAs from the results.
10 | #'   The default is FALSE.
11 | #' @param only A string indicating that only one of the frequency columns
12 | #'   should be returned in the results. If \code{only} is either "n" or
13 | #'   "number", only the number column is returned. If \code{only} is either
14 | #'   "p" or "percent", only the percent column is returned. If \code{only} is
15 | #'   any other value, both columns are shown. The default value is an empty
16 | #'   string, which means both columns are shown.
17 | #' @param clean_names A boolean indicating whether the column names of the
18 | #'   results tibble should be cleaned, so that any column names produced from
19 | #'   data are converted to snake_case. The default is TRUE, but this can be
20 | #'   changed with \code{options(tabbycat.clean_names = FALSE)}.
21 | #' @return A tibble showing the frequency of each value in \code{cat}.
22 | #' @export
23 | 
24 | cat_vcount <- function(
25 |     cat,
26 |     na.rm = FALSE,
27 |     only = "",
28 |     clean_names = getOption("tabbycat.clean_names")) {
29 | 
30 |     # Check the cat argument is not null and is a vector
31 |     if (is.null(cat) || ! is.atomic(cat)) {
32 |         stop("The \"cat\" argument is not a vector.")
33 |     }
34 | 
35 |     # Check the cat argument is not an empty factor
36 |     if (length(cat) == 0) {
37 |         stop("The \"cat\" argument is empty.")
38 |     }
39 | 
40 |     # Check the na.rm argument is valid
41 |     if (length(na.rm) != 1 || is.na(na.rm) || ! is.logical(na.rm)) {
42 |         stop("Invalid \"na.rm\" argument. Must be either TRUE or FALSE.")
43 |     }
44 | 
45 |     # Check the only argument is valid
46 |     if (length(only) != 1 || is.na(only) || ! is.character(only)) {
47 |         stop("Invalid \"only\" argument. Must be a character vector of length one.")
48 |     }
49 | 
50 |     # Check the clean_names argument is valid
51 |     if (length(clean_names) != 1 || is.na(clean_names) || ! is.logical(clean_names)) {
52 |         stop("Invalid \"clean_names\" argument. Must be either TRUE or FALSE.")
53 |     }
54 | 
55 |     # Get the variable name of the cat argument
56 |     obj_name <- deparse(substitute(cat))
57 | 
58 |     # If the name is a dataframe and a column extract the column name
59 |     if(stringr::str_detect(obj_name, "\\$")) {
60 |         obj_name <- stringr::str_split_fixed(obj_name, "\\$", 2)[1, 2]
61 |     }
62 | 
63 |     # Set the name
64 |     name <- obj_name
65 | 
66 |     # Set option for handling NAs
67 |     use_na <- ifelse(na.rm, "no", "ifany")
68 | 
69 |     # Create the results dataframe and return
70 |     count <- tibble::as_tibble(
71 |         table(
72 |             cat,
73 |             useNA = use_na),
74 |             .name_repair = ~ c(name, "number")) %>%
75 |         dplyr::mutate(percent = .data$number / sum(.data$number)) %>%
76 |         dplyr::arrange(dplyr::desc(.data$number))
77 | 
78 |     # Clean names if clean_names is TRUE
79 |     if (clean_names == TRUE) {
80 |         count <- count %>% janitor::clean_names()
81 |     }
82 | 
83 |     # Remove columns based on only argument
84 |     if (stringr::str_trim(only) %in% c("n", "number")) {
85 |         count <- count %>% dplyr::select(-c("percent"))
86 |     }
87 | 
88 |     if (stringr::str_trim(only) %in% c("p", "percent")) {
89 |         count <- count %>% dplyr::select(-c("number"))
90 |     }
91 | 
92 |     count
93 | }
94 | 


--------------------------------------------------------------------------------
/R/zzz.R:
--------------------------------------------------------------------------------
 1 | .onLoad <- function(libname, pkgname) {
 2 | 
 3 |     # Set default options if options have not already been set
 4 |     op <- options()
 5 | 
 6 |     op_tabbycat <- list(
 7 |         tabbycat.clean_names = TRUE,
 8 |         tabbycat.na_label = "na",
 9 |         tabbycat.other_label = "other")
10 | 
11 |     to_set <- !(names(op_tabbycat) %in% names(op))
12 |     if (any(to_set)) options(op_tabbycat[to_set])
13 |     invisible()
14 | }
15 | 


--------------------------------------------------------------------------------
/cran-comments.md:
--------------------------------------------------------------------------------
 1 | # CRAN comments
 2 | 
 3 | This is a new version of an existing package. It updates the package from version 0.17.0 to 0.18.0.
 4 | 
 5 | ## Changes in version 0.18.0
 6 | 
 7 | None of the changes in version 0.18.0 alter the functionality of the package. They simply remove deprecated behaviour. In particular:
 8 | 
 9 | * The file R/tabbycat.R has been replaced with the file R/tabbycat-package.R. 
10 | * The @docType annotation has been removed from tabbycat-package.R.
11 | * Brief package-level documentation has been created by documenting the _PACKAGE sentinel in tabbycat-package.R.
12 | * Package functions no longer use the .data variable in tidy selections in order to comply with its deprecation in tidyselect.
13 | 
14 | ## R CMD check results
15 | 
16 | There were 0 errors, 0 warnings, 0 notes in all test environments.
17 | 
18 | ## Test environments
19 | 
20 | * Local R installation
21 |     * MacOS 13.4.1, R 4.3.1
22 | 
23 | * GitHub actions
24 |     * macOS Monterey 12.6.7, R version 4.3.1
25 |     * Windows Server 2022 x64 (build 20348), R version 4.3.1
26 |     * Ubuntu 22.04.3, LTS-R version 4.4.0
27 |     * Ubuntu 22.04.3, LTS-R version 4.3.1
28 |     * Ubuntu 22.04.3, LTS-R version 4.2.3
29 | 


--------------------------------------------------------------------------------
/man/cat_compare.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/compare.R
 3 | \name{cat_compare}
 4 | \alias{cat_compare}
 5 | \title{Calculate the frequency of discrete values in one categorical variable for
 6 | each group within another categorical variable}
 7 | \usage{
 8 | cat_compare(
 9 |   data,
10 |   row_cat,
11 |   col_cat,
12 |   na.rm.row = FALSE,
13 |   na.rm.col = FALSE,
14 |   na.rm = NULL,
15 |   only = "",
16 |   clean_names = getOption("tabbycat.clean_names"),
17 |   na_label = getOption("tabbycat.na_label")
18 | )
19 | }
20 | \arguments{
21 | \item{data}{A dataframe containing the two variables of interest.}
22 | 
23 | \item{row_cat}{The column name of a categorical variable whose distribution
24 | will be calculated for each group in \code{col_cat}.}
25 | 
26 | \item{col_cat}{The column name of a categorical variable which will be
27 | split into groups and the distrubtion of \code{row_cat} calulated
28 | for each group.}
29 | 
30 | \item{na.rm.row}{A boolean indicating whether to exclude NAs from the row
31 | results. The default is FALSE.}
32 | 
33 | \item{na.rm.col}{A boolean indicating whether to exclude NAs from the column
34 | results. The default is FALSE.}
35 | 
36 | \item{na.rm}{A boolean indicating whether to exclude NAs from both row and
37 | column results. This argument is provided as a convenience. It allows you
38 | to set \code{na.rm.row} and \code{na.rm.col} to the same value without
39 | having to specify them separately. If the value of \code{na.rm} is NULL,
40 | the argument is ignored. If it is not NULL it takes precendence.
41 | default is NULL.}
42 | 
43 | \item{only}{A string indicating that only one set of frequency columns
44 | should be returned in the results. If \code{only} is either "n" or
45 | "number", only the number columns are returned. If \code{only} is either
46 | "p" or "percent", only the percent columns are returned. If \code{only} is
47 | any other value, both sets of columns are shown. The default value is an
48 | empty string, which means both sets of columns are shown.}
49 | 
50 | \item{clean_names}{A boolean indicating whether the column names of the
51 | results tibble should be cleaned, so that any column names produced from
52 | data are converted to snake_case. The default is TRUE, but this can be
53 | changed with \code{options(tabbycat.clean_names = FALSE)}.}
54 | 
55 | \item{na_label}{A string indicating the label to use for the columns that
56 | contain data for missing values. The default value is "na", but use this
57 | argument to set a different value if the default value collides with data
58 | in your dataset.}
59 | }
60 | \value{
61 | A tibble showing the distribution of \code{row_cat} within each
62 |   group in \code{col_cat}.
63 | }
64 | \description{
65 | This function crosstabulates the frequencies of one categorical variable
66 | within the groups of another. The results are sorted on the values of the
67 | variable whose distribution is shown in each column i.e. the variable
68 | specified with \code{row_cat}. If this variable is a character vector it
69 | will be sorted alphabetically. If it is a factor it will be sorted in the
70 | order of its levels.
71 | }
72 | 


--------------------------------------------------------------------------------
/man/cat_contrast.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/contrast.R
 3 | \name{cat_contrast}
 4 | \alias{cat_contrast}
 5 | \title{Calculate the frequency of discrete values in one categorical variable for
 6 | each of two mutually exclusive groups within another categorical variable}
 7 | \usage{
 8 | cat_contrast(
 9 |   data,
10 |   row_cat,
11 |   col_cat,
12 |   col_group,
13 |   na.rm.row = FALSE,
14 |   na.rm.col = FALSE,
15 |   na.rm = NULL,
16 |   only = "",
17 |   clean_names = getOption("tabbycat.clean_names"),
18 |   na_label = getOption("tabbycat.na_label"),
19 |   other_label = getOption("tabbycat.other_label")
20 | )
21 | }
22 | \arguments{
23 | \item{data}{A dataframe containing the two variables of interest.}
24 | 
25 | \item{row_cat}{The column name of a categorical variable whose distribution
26 | should be calculated for each exclusive group in \code{col_cat}.}
27 | 
28 | \item{col_cat}{The column name of a categorical variable that will be
29 | split into two exclusive groups, one containing observations with a
30 | particular value of that variable, and another containing all other
31 | observations.}
32 | 
33 | \item{col_group}{The name of the group within \code{col_cat} that is
34 | used to split the observations into two exclusive groups: those that are
35 | in the group and those that are not in the group.}
36 | 
37 | \item{na.rm.row}{A boolean indicating whether to exclude NAs from the row
38 | results. The default is FALSE.}
39 | 
40 | \item{na.rm.col}{A boolean indicating whether to exclude NAs from the column
41 | results. The default is FALSE.}
42 | 
43 | \item{na.rm}{A boolean indicating whether to exclude NAs from both row and
44 | column results. This argument is provided as a convenience. It allows you
45 | to set \code{na.rm.row} and \code{na.rm.col} to the same value without
46 | having to specify them separately. If the value of \code{na.rm} is NULL,
47 | the argument is ignored. If it is not NULL it takes precendence.
48 | default is NULL.}
49 | 
50 | \item{only}{A string indicating that only one set of frequency columns
51 | should be returned in the results. If \code{only} is either "n" or
52 | "number", only the number columns are returned. If \code{only} is either
53 | "p" or "percent", only the percent columns are returned. If \code{only} is
54 | any other value, both sets of columns are shown. The default value is an
55 | empty string, which means both sets of columns are shown.}
56 | 
57 | \item{clean_names}{A boolean indicating whether the column names of the
58 | results tibble should be cleaned, so that any column names produced from
59 | data are converted to snake_case. The default is TRUE, but this can be
60 | changed with \code{options(tabbycat.clean_names = FALSE)}.}
61 | 
62 | \item{na_label}{A string indicating the label to use for the columns that
63 | contain data for missing values. The default value is "na", but use this
64 | argument to set a different value if the default value collides with data
65 | in your dataset.}
66 | 
67 | \item{other_label}{A string indicating the label to use for the columns that
68 | contain data for observations not in the named group. The default value is
69 | "other", but use this argument to set a different value if the default
70 | value collides with data in your dataset.}
71 | }
72 | \value{
73 | A tibble showing the distribution of \code{row_cat} within each of
74 |   the two exclusive groups in \code{col_cat}.
75 | }
76 | \description{
77 | This function shows the distrbution of values within given a categorical
78 | variable for one group within another categorical variable, and compares it
79 | with the distribution among all observations not in that group. Its purpose
80 | is to let you see quickly whether the distribution within that group differs
81 | from the distribution for the rest of the observations. The results are
82 | sorted in descending order of frequency for the named group i.e. the group
83 | named in \code{col_group}.
84 | }
85 | 


--------------------------------------------------------------------------------
/man/cat_count.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/count.R
 3 | \name{cat_count}
 4 | \alias{cat_count}
 5 | \title{Count the frequency of discrete values in the column of a dataframe}
 6 | \usage{
 7 | cat_count(
 8 |   data,
 9 |   cat,
10 |   na.rm = FALSE,
11 |   only = "",
12 |   clean_names = getOption("tabbycat.clean_names")
13 | )
14 | }
15 | \arguments{
16 | \item{data}{A dataframe containing a categorical vector for which
17 | frequencies will be calculated.}
18 | 
19 | \item{cat}{The column name of the categorical variable for which frequencies
20 | will be calculated.}
21 | 
22 | \item{na.rm}{A boolean indicating whether to exclude NAs from the results.
23 | The default is FALSE.}
24 | 
25 | \item{only}{A string indicating that only one of the frequency columns
26 | should be returned in the results. If \code{only} is either "n" or
27 | "number", only the number column is returned. If \code{only} is either
28 | "p" or "percent", only the percent column is returned. If \code{only} is
29 | any other value, both columns are shown. The default value is an empty
30 | string, which means both columns are shown.}
31 | 
32 | \item{clean_names}{A boolean indicating whether the column names of the
33 | results tibble should be cleaned, so that any column names produced from
34 | data are converted to snake_case. The default is TRUE, but this can be
35 | changed with \code{options(tabbycat.clean_names = FALSE)}.}
36 | }
37 | \value{
38 | A tibble showing the frequency of each value in \code{cat}.
39 | }
40 | \description{
41 | This function differs from \code{cat_vcount} in that it operates on columns
42 | in dataframes rather than directly on vectors, which means it is more useful
43 | in pipelines but handles a narrower range of inputs. The results are sorted
44 | in descending order of frequency.
45 | }
46 | 


--------------------------------------------------------------------------------
/man/cat_summarise.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/summarise.R
 3 | \name{cat_summarise}
 4 | \alias{cat_summarise}
 5 | \alias{cat_summarize}
 6 | \title{Summarise the values of a numerical variable for each group within a
 7 | categorical variable}
 8 | \usage{
 9 | cat_summarise(
10 |   data,
11 |   cat,
12 |   num,
13 |   na.rm = FALSE,
14 |   clean_names = getOption("tabbycat.clean_names")
15 | )
16 | 
17 | cat_summarize(
18 |   data,
19 |   cat,
20 |   num,
21 |   na.rm = FALSE,
22 |   clean_names = getOption("tabbycat.clean_names")
23 | )
24 | }
25 | \arguments{
26 | \item{data}{A dataframe containing a categorical variable and numerical
27 | variable to summarise.}
28 | 
29 | \item{cat}{The name of a column in \code{data} which is a categorical vector
30 | of discrete values for which summaries will be calculated.}
31 | 
32 | \item{num}{The name of a column in \code{data} which is a numerical vector
33 | that will be summarised for each group.}
34 | 
35 | \item{na.rm}{A boolean indicating whether to exclude NAs from the row
36 | results. Note that NAs are **always** ignored in calculating the summary
37 | statistics for \code{num} shown in each row, and the number of NAs that
38 | exist in \code{num} for each group in \code{cat} is shown in the
39 | \code{na} column of the results table. This argument controls whether a
40 | row of summary statistics is shown for observations that are NA in
41 | \code{cat}. The default is FALSE.}
42 | 
43 | \item{clean_names}{A boolean indicating whether the column names of the
44 | results tibble should be cleaned, so that any column names produced from
45 | data are converted to snake_case. The default is TRUE, but this can be
46 | changed with \code{options(tabbycat.clean_names = FALSE)}.}
47 | }
48 | \value{
49 | A tibble showing summary statistics for \code{num} for each group
50 |   in \code{cat}.
51 | }
52 | \description{
53 | The results are sorted on the values of the categorical variable i.e.
54 | the variable specified with \code{cat}. If this variable is a character
55 | vector it will be sorted alphabetically. If it is a factor it will be
56 | sorted in the order of its levels. This function can be called as either
57 | \code{cat_summarise} or \code{cat_summarize}.
58 | }
59 | 


--------------------------------------------------------------------------------
/man/cat_vcount.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/vcount.R
 3 | \name{cat_vcount}
 4 | \alias{cat_vcount}
 5 | \title{Count the frequency of discrete values in a categorical vector}
 6 | \usage{
 7 | cat_vcount(
 8 |   cat,
 9 |   na.rm = FALSE,
10 |   only = "",
11 |   clean_names = getOption("tabbycat.clean_names")
12 | )
13 | }
14 | \arguments{
15 | \item{cat}{A categorical vector for which frequencies will be calculated.}
16 | 
17 | \item{na.rm}{A boolean indicating whether to exclude NAs from the results.
18 | The default is FALSE.}
19 | 
20 | \item{only}{A string indicating that only one of the frequency columns
21 | should be returned in the results. If \code{only} is either "n" or
22 | "number", only the number column is returned. If \code{only} is either
23 | "p" or "percent", only the percent column is returned. If \code{only} is
24 | any other value, both columns are shown. The default value is an empty
25 | string, which means both columns are shown.}
26 | 
27 | \item{clean_names}{A boolean indicating whether the column names of the
28 | results tibble should be cleaned, so that any column names produced from
29 | data are converted to snake_case. The default is TRUE, but this can be
30 | changed with \code{options(tabbycat.clean_names = FALSE)}.}
31 | }
32 | \value{
33 | A tibble showing the frequency of each value in \code{cat}.
34 | }
35 | \description{
36 | This function differs from \code{cat_count} in that it operates directly on
37 | vectors, rather than on columns in dataframes, which means it is less useful
38 | in pipelines but can handle a wider range of inputs. The results are sorted
39 | in descending order of frequency.
40 | }
41 | 


--------------------------------------------------------------------------------
/man/safe_max.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/summarise.R
 3 | \name{safe_max}
 4 | \alias{safe_max}
 5 | \title{Calculate \code{max} but suppress the warning when all values are missing}
 6 | \usage{
 7 | safe_max(x, na.rm = FALSE)
 8 | }
 9 | \arguments{
10 | \item{x}{A numerical vector.}
11 | 
12 | \item{na.rm}{A boolean indicating whether to remove NAs.}
13 | }
14 | \value{
15 | The max of \code{x} or NA when values are missing.
16 | }
17 | \description{
18 | This function is a drop-in replacement for \code{max}, which is used in
19 | \code{cat_summarise}. It suppresses the warning when all values are NA and
20 | na.rm is TRUE, and returns NA instead of -Inf.
21 | }
22 | \keyword{internal}
23 | 


--------------------------------------------------------------------------------
/man/safe_mean.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/summarise.R
 3 | \name{safe_mean}
 4 | \alias{safe_mean}
 5 | \title{Calculate \code{mean} but return NA rather than NaN when values are missing}
 6 | \usage{
 7 | safe_mean(x, na.rm = FALSE)
 8 | }
 9 | \arguments{
10 | \item{x}{A numerical vector.}
11 | 
12 | \item{na.rm}{A boolean indicating whether to remove NAs.}
13 | }
14 | \value{
15 | The mean of \code{x} or NA when values are missing.
16 | }
17 | \description{
18 | This function is a drop-in replacement for \code{mean}, which is used in
19 | \code{cat_summarise}. It returns NA rather than NaN when all values are NA.
20 | }
21 | \keyword{internal}
22 | 


--------------------------------------------------------------------------------
/man/safe_min.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/summarise.R
 3 | \name{safe_min}
 4 | \alias{safe_min}
 5 | \title{Calculate \code{min} but suppress the warning when all values are missing}
 6 | \usage{
 7 | safe_min(x, na.rm = FALSE)
 8 | }
 9 | \arguments{
10 | \item{x}{A numerical vector.}
11 | 
12 | \item{na.rm}{A boolean indicating whether to remove NAs.}
13 | }
14 | \value{
15 | The min of \code{x} or NA when values are missing.
16 | }
17 | \description{
18 | This function is a drop-in replacement for \code{min}, which is used in
19 | \code{cat_summarise}. It suppresses the warning when all values are NA and
20 | na.rm is TRUE, and returns NA instead of Inf.
21 | }
22 | \keyword{internal}
23 | 


--------------------------------------------------------------------------------
/man/tabbycat.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/tabbycat-package.R
 3 | \docType{package}
 4 | \name{tabbycat}
 5 | \alias{tabbycat}
 6 | \alias{tabbycat-package}
 7 | \title{tabbycat: Tabulate and summarise categorical data}
 8 | \description{
 9 | Functions for tabulating and summarising categorical variables.
10 | }
11 | \author{
12 | \strong{Maintainer}: Oliver Hawkins \email{oli@olihawkins.com}
13 | 
14 | }
15 | \keyword{internal}
16 | 


--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
  1 | # tabbycat
  2 | 
  3 | `tabbycat` is an R package for tabulating and summarising categorical variables. Most functions are designed to work with dataframes, and use the tidyverse idiom of taking the dataframe as the first argument so they work within pipelines. Equivalent functions that operate directly on vectors are also provided where it makes sense. This package aims to make exploratory data analysis involving categorical variables quicker, simpler and more robust.
  4 | 
  5 | ## Status
  6 | 
  7 | This package is fully functional. Please let me know if you run into any issues.
  8 | 
  9 | <!-- badges: start -->
 10 | [![CRAN status](https://www.r-pkg.org/badges/version/tabbycat)](https://CRAN.R-project.org/package=tabbycat)
 11 | [![R-CMD-check](https://github.com/olihawkins/tabbycat/workflows/R-CMD-check/badge.svg)](https://github.com/olihawkins/tabbycat/actions)
 12 | [![codecov](https://codecov.io/gh/olihawkins/tabbycat/branch/main/graph/badge.svg?token=W177A4T66K)](https://codecov.io/gh/olihawkins/tabbycat)
 13 | <!-- badges: end -->
 14 | 
 15 | ## Contents
 16 | 
 17 | * [Installation](#1-installation)
 18 | * [Counting functions](#2-counting-functions)
 19 | * [Comparison functions](#3-comparison-functions)
 20 | * [Summarising functions](#4-summarising-functions)
 21 | * [Other API features](#5-other-api-features)
 22 | 
 23 | ## List of functions
 24 | 
 25 | * [`cat_count`](#cat_count)
 26 | * [`cat_vcount`](#cat_vcount)
 27 | * [`cat_compare`](#cat_compare)
 28 | * [`cat_contrast`](#cat_contrast)
 29 | * [`cat_summarise`](#cat_summarise)
 30 | 
 31 | ## 1. Installation
 32 | 
 33 | Install the latest release from CRAN.
 34 | 
 35 | ```r
 36 | install.packages("tabbycat")
 37 | ```
 38 | 
 39 | Or install the development version from GitHub.
 40 | 
 41 | ```r
 42 | install.packages("remotes")
 43 | remotes::install_github("olihawkins/tabbycat")
 44 | ```
 45 | 
 46 | ## 2. Counting functions
 47 | 
 48 | ### `cat_count`
 49 | 
 50 | `cat_count` calculates the frequency of discrete values in the column of a dataframe and returns the counts and percentages as a tibble. This function operates on columns in dataframes, but an equivalent function called `cat_vcount` provides the same functionality for vectors. Call the function with a dataframe and the name of the column to count.
 51 | 
 52 | ```r
 53 | # Load tabbycat and the mpg dataset
 54 | library(tabbycat)
 55 | mpg <- ggplot2::mpg
 56 | 
 57 | cat_count(mpg, "class")
 58 | 
 59 | #   A tibble: 7 × 3
 60 | #   class      number percent
 61 | #   <chr>       <int>   <dbl>
 62 | # 1 suv            62  0.265 
 63 | # 2 compact        47  0.201 
 64 | # 3 midsize        41  0.175 
 65 | # 4 subcompact     35  0.150 
 66 | # 5 pickup         33  0.141 
 67 | # 6 minivan        11  0.0470
 68 | # 7 2seater         5  0.0214
 69 | ```
 70 | 
 71 | ### `cat_vcount`
 72 | 
 73 | `cat_vcount` is equivalent to `cat_count` but works directly on vectors: it calculates the frequency of discrete values in a vector and returns the counts and percentages as a tibble. `cat_vcount` can handle a wider range of inputs than `cat_count` but it does not fit as easily into pipelines. Call the function with a vector to count.
 74 | 
 75 | ```r
 76 | # Load tabbycat and the mpg dataset
 77 | library(tabbycat)
 78 | mpg <- ggplot2::mpg
 79 | 
 80 | cat_vcount(mpg$class)
 81 | 
 82 | #   A tibble: 7 × 3
 83 | #   class      number percent
 84 | #   <chr>       <int>   <dbl>
 85 | # 1 suv            62  0.265 
 86 | # 2 compact        47  0.201 
 87 | # 3 midsize        41  0.175 
 88 | # 4 subcompact     35  0.150 
 89 | # 5 pickup         33  0.141 
 90 | # 6 minivan        11  0.0470
 91 | # 7 2seater         5  0.0214
 92 | ```
 93 | 
 94 | ### 2.1. NA handling for counting functions
 95 | 
 96 | By default, if any NAs exist in the data their frequency is included in the results, but you can remove this by setting the `na.rm` argument to `TRUE`. This means the percentages are caclulated excluding NAs (i.e. based on the counts shown in the table).
 97 | 
 98 | ```r
 99 | # Set the class of the first observation to NA
100 | mpg[1, ]$class <- NA
101 | 
102 | # Call cat_count with defaults
103 | cat_count(mpg, "class")
104 | 
105 | #   A tibble: 8 × 3
106 | #   class      number percent
107 | #   <chr>       <int>   <dbl>
108 | # 1 suv            62 0.265  
109 | # 2 compact        46 0.197  
110 | # 3 midsize        41 0.175  
111 | # 4 subcompact     35 0.150  
112 | # 5 pickup         33 0.141  
113 | # 6 minivan        11 0.0470 
114 | # 7 2seater         5 0.0214 
115 | # 8 NA              1 0.00427
116 | 
117 | # Call cat_count with na.rm set to TRUE
118 | cat_count(mpg, "class", na.rm = TRUE)
119 | 
120 | #   A tibble: 7 × 3
121 | #   class      number percent
122 | #   <chr>       <int>   <dbl>
123 | # 1 suv            62  0.266 
124 | # 2 compact        46  0.197 
125 | # 3 midsize        41  0.176 
126 | # 4 subcompact     35  0.150 
127 | # 5 pickup         33  0.142 
128 | # 6 minivan        11  0.0472
129 | # 7 2seater         5  0.0215
130 | ```
131 | 
132 | ## 3. Comparison functions
133 | 
134 | ### `cat_compare`
135 | 
136 | `cat_compare` calculates the distribution of one categorical variable within the groups of another categorical variable and returns the counts and percentages as a tibble. It is essentially a cross tabulation of the two variables with column-wise percentages. Call the function with a dataframe and provide:
137 | 
138 | 1. `row_cat` -- the variable to distribute down the rows
139 | 2. `col_cat` -- the variable to split into groups along the columns
140 | 
141 | ```r
142 | # Load tabbycat and the mpg dataset
143 | library(tabbycat)
144 | mpg <- ggplot2::mpg
145 | 
146 | cat_compare(mpg, "class", "cyl")
147 | 
148 | #  A tibble: 7 × 9
149 | #   class        n_4   n_5   n_6   n_8    p_4   p_5    p_6    p_8
150 | #   <chr>      <dbl> <dbl> <dbl> <dbl>  <dbl> <dbl>  <dbl>  <dbl>
151 | # 1 2seater        0     0     0     5 0        0   0      0.0714
152 | # 2 compact       32     2    13     0 0.395    0.5 0.165  0     
153 | # 3 midsize       16     0    23     2 0.198    0   0.291  0.0286
154 | # 4 minivan        1     0    10     0 0.0123   0   0.127  0     
155 | # 5 pickup         3     0    10    20 0.0370   0   0.127  0.286 
156 | # 6 subcompact    21     2     7     5 0.259    0.5 0.0886 0.0714
157 | # 7 suv            8     0    16    38 0.0988   0   0.203  0.543
158 | ```
159 | 
160 | ### `cat_contrast`
161 | 
162 | `cat_contrast` caculates the frequency of discrete values in one categorical variable for each of two mutually exclusive groups within another categorical variable and returns the counts and percentages as a tibble. This lets you see if the distribution of a variable within a particular group differs from the distribution in the rest of the dataset. Call the function with a dataframe and provide:
163 | 
164 | 1. `row_cat` -- the variable to distribute down the rows 
165 | 2. `col_cat` -- the variable to split into two exclusive groups along the columns
166 | 3. `col_group` -- the name of the group in `col_cat` to contrast against the rest of the dataset
167 | 
168 | ```r
169 | # Load tabbycat and the mpg dataset
170 | library(tabbycat)
171 | mpg <- ggplot2::mpg
172 | 
173 | cat_contrast(mpg, "class", "manufacturer", "toyota")
174 | 
175 | # # A tibble: 7 × 5
176 | #   class      n_toyota n_other p_toyota p_other
177 | #   <chr>         <dbl>   <dbl>    <dbl>   <dbl>
178 | # 1 compact          12      35    0.353   0.175
179 | # 2 suv               8      54    0.235   0.27 
180 | # 3 midsize           7      34    0.206   0.17 
181 | # 4 pickup            7      26    0.206   0.13 
182 | # 5 2seater           0       5    0       0.025
183 | # 6 minivan           0      11    0       0.055
184 | # 7 subcompact        0      35    0       0.175
185 | ```
186 | 
187 | ### 3.1. NA handling for comparison functions
188 | 
189 | By default, if any NAs exist in the data their frequency is included in both the row and column results. So there is a row for observations containing NAs in `row_cat`, and columns showing the number and percentage of NAs found in `col_cat` for each group in `row_cat`.
190 | 
191 | 
192 | ```r
193 | # Set the class of the first observation to NA
194 | mpg[1, ]$class <- NA
195 | 
196 | # Set the manufacturer of the second observation to NA
197 | mpg[2, ]$manufacturer <- NA
198 | 
199 | # Call cat_contrast with defaults
200 | cat_contrast(mpg, "class", "manufacturer", "toyota")
201 | 
202 | #   A tibble: 8 × 7
203 | #   class      n_toyota n_other  n_na p_toyota p_other  p_na
204 | #   <chr>         <dbl>   <dbl> <dbl>    <dbl>   <dbl> <dbl>
205 | # 1 compact          12      33     1    0.353 0.166       1
206 | # 2 suv               8      54     0    0.235 0.271       0
207 | # 3 midsize           7      34     0    0.206 0.171       0
208 | # 4 pickup            7      26     0    0.206 0.131       0
209 | # 5 2seater           0       5     0    0     0.0251      0
210 | # 6 minivan           0      11     0    0     0.0553      0
211 | # 7 subcompact        0      35     0    0     0.176       0
212 | # 8 NA                0       1     0    0     0.00503     0
213 | ```
214 | 
215 | This default behaviour can be changed through three boolean arguments: `na.rm.row`, `na.rm.col`, and `na.rm`. Setting each of these arguments to TRUE has the following effects:
216 | 
217 | - `na.rm.row` -- removes the row for NAs from the row results
218 | - `na.rm.col` -- removes the columns for NAs from the column results
219 | - `na.rm` -- removes both the rows and columns of NAs from the results
220 | 
221 | ```r
222 | # Call cat_contrast with na.rm.row set to TRUE
223 | cat_contrast(mpg, "class", "manufacturer", "toyota", na.rm.row = TRUE)
224 | 
225 | #   A tibble: 7 × 7
226 | #   class      n_toyota n_other  n_na p_toyota p_other  p_na
227 | #   <chr>         <dbl>   <dbl> <dbl>    <dbl>   <dbl> <dbl>
228 | # 1 compact          12      33     1    0.353  0.167      1
229 | # 2 suv               8      54     0    0.235  0.273      0
230 | # 3 midsize           7      34     0    0.206  0.172      0
231 | # 4 pickup            7      26     0    0.206  0.131      0
232 | # 5 2seater           0       5     0    0      0.0253     0
233 | # 6 minivan           0      11     0    0      0.0556     0
234 | # 7 subcompact        0      35     0    0      0.177      0
235 | 
236 | # Call cat_contrast with na.rm.col set to TRUE
237 | cat_contrast(mpg, "class", "manufacturer", "toyota", na.rm.col = TRUE)
238 | 
239 | #   A tibble: 8 × 5
240 | #   class      n_toyota n_other p_toyota p_other
241 | #   <chr>         <dbl>   <dbl>    <dbl>   <dbl>
242 | # 1 compact          12      33    0.353 0.166  
243 | # 2 suv               8      54    0.235 0.271  
244 | # 3 midsize           7      34    0.206 0.171  
245 | # 4 pickup            7      26    0.206 0.131  
246 | # 5 2seater           0       5    0     0.0251 
247 | # 6 minivan           0      11    0     0.0553 
248 | # 7 subcompact        0      35    0     0.176  
249 | # 8 NA                0       1    0     0.00503
250 | 
251 | # Call cat_contrast with na.rm set to TRUE
252 | cat_contrast(mpg, "class", "manufacturer", "toyota", na.rm = TRUE)
253 | 
254 | #   A tibble: 7 × 5
255 | #   class      n_toyota n_other p_toyota p_other
256 | #   <chr>         <dbl>   <dbl>    <dbl>   <dbl>
257 | # 1 compact          12      33    0.353  0.167 
258 | # 2 suv               8      54    0.235  0.273 
259 | # 3 midsize           7      34    0.206  0.172 
260 | # 4 pickup            7      26    0.206  0.131 
261 | # 5 2seater           0       5    0      0.0253
262 | # 6 minivan           0      11    0      0.0556
263 | # 7 subcompact        0      35    0      0.177 
264 | ```
265 | 
266 | Note that while removing the columns for NAs from the column results simply changes which columns are shown in the results table, removing the row for NAs from the row results affects the data in the table, because the percentage frequencies are calculated based on the rows shown. In other words, `na.rm.row` lets you calculate the percentage frequencies with or without NAs. This is consistent with the behaviour of `cat_count` and `cat_vcount`. 
267 | 
268 | The `na.rm` argument is a convenience which simply sets `na.rm.row` and `na.rm.col` to the same value. If it is set, it takes priority over both of those arguments, otherwise it is ignored.
269 | 
270 | ## 4. Summarising functions
271 | 
272 | ### `cat_summarise`
273 | 
274 | `cat_summarise` (or `cat_summarize`) calculates summary statistics for a numerical variable for each group within a categorical variable. Call the function with a dataframe and provide:
275 | 
276 | 1. `cat` -- the categorical variable for which summaries will be calculated
277 | 2. `num` -- the numerical variable to summarise
278 | 
279 | ```r
280 | # Load tabbycat and the mpg dataset
281 | library(tabbycat)
282 | mpg <- ggplot2::mpg
283 | 
284 | cat_summarise(mpg, "class", "hwy")
285 | 
286 | #   A tibble: 7 × 10
287 | #   class          n    na  mean    sd   min    lq   med    uq   max
288 | #   <chr>      <int> <int> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <int>
289 | # 1 2seater        5     0  24.8  1.30    23  24    25    26      26
290 | # 2 compact       47     0  28.3  3.78    23  26    27    29      44
291 | # 3 midsize       41     0  27.3  2.14    23  26    27    29      32
292 | # 4 minivan       11     0  22.4  2.06    17  22    23    24      24
293 | # 5 pickup        33     0  16.9  2.27    12  16    17    18      22
294 | # 6 subcompact    35     0  28.1  5.38    20  24.5  26    30.5    44
295 | # 7 suv           62     0  18.1  2.98    12  17    17.5  19      27
296 | ```
297 | 
298 | ### 4.1. NA handling for `cat_summarise`
299 | 
300 | In `cat_summarise`, NAs are **always** ignored in calculating the summary statistics for each group in `cat`. But the number of NAs in each group is shown in a column in the table so you can see the potential impact of NAs on the calculation of these statistics. By default, a row showing summary statistics for observations that are NA in `cat` is included in the table, but this can be turned off by setting `na.rm` to `TRUE`. You can see these behaviours in the following example.
301 | 
302 | ```r
303 | # Set the class of the first three observations to NA
304 | mpg[1:3, ]$class <- NA
305 | 
306 | # Set the hwy (miles per gallon) of the fourth observation to NA
307 | mpg[4, ]$hwy <- NA
308 | 
309 | # Call cat_summarise with defaults
310 | cat_summarise(mpg, "class", "hwy")
311 | 
312 | #   A tibble: 8 × 10
313 | #   class          n    na  mean    sd   min    lq   med    uq   max
314 | #   <chr>      <int> <int> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <int>
315 | # 1 2seater        5     0  24.8  1.30    23  24    25    26      26
316 | # 2 compact       44     1  28.2  3.92    23  26    27    29      44
317 | # 3 midsize       41     0  27.3  2.14    23  26    27    29      32
318 | # 4 minivan       11     0  22.4  2.06    17  22    23    24      24
319 | # 5 pickup        33     0  16.9  2.27    12  16    17    18      22
320 | # 6 subcompact    35     0  28.1  5.38    20  24.5  26    30.5    44
321 | # 7 suv           62     0  18.1  2.98    12  17    17.5  19      27
322 | # 8 NA             3     0  29.7  1.15    29  29    29    30      31
323 | 
324 | # Call cat_summarise with na.rm set to TRUE
325 | cat_summarise(mpg, "class", "hwy", na.rm = TRUE)
326 | 
327 | #   A tibble: 7 × 10
328 | #   class          n    na  mean    sd   min    lq   med    uq   max
329 | #   <chr>      <int> <int> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <int>
330 | # 1 2seater        5     0  24.8  1.30    23  24    25    26      26
331 | # 2 compact       44     1  28.2  3.92    23  26    27    29      44
332 | # 3 midsize       41     0  27.3  2.14    23  26    27    29      32
333 | # 4 minivan       11     0  22.4  2.06    17  22    23    24      24
334 | # 5 pickup        33     0  16.9  2.27    12  16    17    18      22
335 | # 6 subcompact    35     0  28.1  5.38    20  24.5  26    30.5    44
336 | # 7 suv           62     0  18.1  2.98    12  17    17.5  19      27
337 | ```
338 | 
339 | ## 5. Other API features
340 | 
341 | There are some arguments that are found in most, if not all, the package functions.
342 | 
343 | ### 5.1. `clean_names`
344 | 
345 | All functions in the package take a boolean argument called `clean_names`. This argument controls whether column names derived from values in the data should be cleaned with `janitor::clean_names`, which converts them to snake case. 
346 | 
347 | The default value of this argument is `TRUE`. This is in order to produce more readable results tables and to avoid the creation of columns with spaces in the names, which are harder to use interactively. 
348 | 
349 | If you prefer not to have this behaviour, you can disable it on each function call by setting `clean_names` to `FALSE`, or globally using the package options.
350 | 
351 | ```r
352 | options(tabbycat.clean_names = FALSE)
353 | ```
354 | 
355 | ### 5.2. `only`
356 | 
357 | Counting and comparison functions take a string argument called `only`. This is used to return just the number or percentage columns for frequencies in the results. Valid values for `only` are :
358 | 
359 | - `"n"` or `"number"` -- to return just the number columns
360 | - `"p"` or `"percent"` -- to return just the percentage columns
361 | - *any other string* -- to return both the number and percentage columns
362 | 
363 | The defalut value is an empty string, meaning all columns are returned by default.
364 | 
365 | ### 5.3. Labelling arguments for comparison functions
366 | 
367 | The comparison functions need names to use as labels for the NA columns, and in the case of `cat_contrast`, for the columns showing frequencies for the observations that are not in the target group. 
368 | 
369 | These labels are controlled with the arguments `na_label` and `other_label`. The default values are `"na"` and `"other"` respectively, but you can change them if they colllide with data in your dataset. You can use the following package options if it makes more sense to change them globally when working with a particular dataset.
370 | 
371 | ```r
372 | options(tabbycat.na_label = "missing")
373 | options(tabbycat.other_label = "remainder")
374 | ```
375 | 


--------------------------------------------------------------------------------
/tabbycat.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 4
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | 
18 | BuildType: Package
19 | PackageUseDevtools: Yes
20 | PackageInstallArgs: --no-multiarch --with-keep.source
21 | PackageRoxygenize: rd,collate,namespace
22 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(tabbycat)
3 | 
4 | test_check("tabbycat")
5 | 


--------------------------------------------------------------------------------
/tests/testthat/test_compare.R:
--------------------------------------------------------------------------------
  1 | # Test compare.R
  2 | 
  3 | # Setup ----------------------------------------------------------------------
  4 | 
  5 | data <- mtcars %>%
  6 |     tibble::rownames_to_column("model") %>%
  7 |     tibble::as_tibble() %>%
  8 |     dplyr::mutate(
  9 |         manufacturer = stringr::str_split(model, " ", simplify = TRUE)[, 1])
 10 | 
 11 | data$cyl[1] <- NA
 12 | data$vs[2] <- NA
 13 | 
 14 | # Tests: cat_compare --------------------------------------------------------
 15 | 
 16 | test_that("cat_compare rejects a data argument that is not a dataframe", {
 17 | 
 18 |     msg <- "The \"data\" argument is not a dataframe."
 19 |     expect_error(cat_compare(NULL, "cyl", "vs"), msg)
 20 |     expect_error(cat_compare(NA, "cyl", "vs"), msg)
 21 |     expect_error(cat_compare(1:10, "cyl", "vs"), msg)
 22 |     expect_error(cat_compare(LETTERS[1:10], "cyl", "vs"), msg)
 23 |     expect_error(cat_compare(c(TRUE, FALSE), "cyl", "vs"), msg)
 24 |     expect_error(cat_compare(list(), "cyl", "vs"), msg)
 25 | })
 26 | 
 27 | test_that("cat_compare rejects a data argument that has no rows", {
 28 | 
 29 |     msg <- "The \"data\" argument is empty."
 30 |     expect_error(cat_compare(data.frame(), "cyl", "vs"), msg)
 31 | })
 32 | 
 33 | test_that("cat_compare rejects invalid row_cat arguments", {
 34 | 
 35 |     msg <- "Invalid \"row_cat\" argument. Must be a character vector of length one."
 36 |     expect_error(cat_compare(data, NULL, "vs"), msg)
 37 |     expect_error(cat_compare(data, NA, "vs"), msg)
 38 |     expect_error(cat_compare(data, 1:10, "vs"), msg)
 39 |     expect_error(cat_compare(data, LETTERS[1:10], "vs"), msg)
 40 |     expect_error(cat_compare(data, c(TRUE, FALSE), "vs"), msg)
 41 |     expect_error(cat_compare(data, list(), "vs"), msg)
 42 | })
 43 | 
 44 | test_that("cat_compare rejects a row_cat argument that is not a column in the data", {
 45 | 
 46 |     msg <- "'notacolumn' is not a column in the dataframe."
 47 |     expect_error(cat_compare(data, "notacolumn", "vs"), msg)
 48 | })
 49 | 
 50 | test_that("cat_compare rejects invalid col_cat arguments", {
 51 | 
 52 |     msg <- "Invalid \"col_cat\" argument. Must be a character vector of length one."
 53 |     expect_error(cat_compare(data, "cyl", NULL), msg)
 54 |     expect_error(cat_compare(data, "cyl", NA), msg)
 55 |     expect_error(cat_compare(data, "cyl", 1:10), msg)
 56 |     expect_error(cat_compare(data, "cyl", LETTERS[1:10]), msg)
 57 |     expect_error(cat_compare(data, "cyl", c(TRUE, FALSE)), msg)
 58 |     expect_error(cat_compare(data, "cyl", list()), msg)
 59 | })
 60 | 
 61 | test_that("cat_compare rejects a col_cat argument  that is not a column in the data", {
 62 | 
 63 |     msg <- "'notacolumn' is not a column in the dataframe."
 64 |     expect_error(cat_compare(data, "cyl", "notacolumn"), msg)
 65 | })
 66 | 
 67 | test_that("cat_compare rejects invalid na.rm.row arguments", {
 68 | 
 69 |     msg <- "Invalid \"na.rm.row\" argument. Must be either TRUE or FALSE."
 70 |     expect_error(cat_compare(data, "cyl", "vs", na.rm.row = NULL), msg)
 71 |     expect_error(cat_compare(data, "cyl", "vs", na.rm.row = NA), msg)
 72 |     expect_error(cat_compare(data, "cyl", "vs", na.rm.row = 1), msg)
 73 |     expect_error(cat_compare(data, "cyl", "vs", na.rm.row = ""), msg)
 74 |     expect_error(cat_compare(data, "cyl", "vs", na.rm.row = 1:10), msg)
 75 |     expect_error(cat_compare(data, "cyl", "vs", na.rm.row = LETTERS[1:10]), msg)
 76 |     expect_error(cat_compare(data, "cyl", "vs", na.rm.row = c(TRUE, FALSE)), msg)
 77 |     expect_error(cat_compare(data, "cyl", "vs", na.rm.row = list()), msg)
 78 | })
 79 | 
 80 | test_that("cat_compare rejects invalid na.rm.col arguments", {
 81 | 
 82 |     msg <- "Invalid \"na.rm.col\" argument. Must be either TRUE or FALSE."
 83 |     expect_error(cat_compare(data, "cyl", "vs", na.rm.col = NULL), msg)
 84 |     expect_error(cat_compare(data, "cyl", "vs", na.rm.col = NA), msg)
 85 |     expect_error(cat_compare(data, "cyl", "vs", na.rm.col = 1), msg)
 86 |     expect_error(cat_compare(data, "cyl", "vs", na.rm.col = ""), msg)
 87 |     expect_error(cat_compare(data, "cyl", "vs", na.rm.col = 1:10), msg)
 88 |     expect_error(cat_compare(data, "cyl", "vs", na.rm.col = LETTERS[1:10]), msg)
 89 |     expect_error(cat_compare(data, "cyl", "vs", na.rm.col = c(TRUE, FALSE)), msg)
 90 |     expect_error(cat_compare(data, "cyl", "vs", na.rm.col = list()), msg)
 91 | })
 92 | 
 93 | test_that("cat_compare rejects invalid na.rm arguments", {
 94 | 
 95 |     msg <- "Invalid \"na.rm\" argument. Must be either NULL, TRUE or FALSE."
 96 |     expect_error(cat_compare(data, "cyl", "vs", na.rm = NA), msg)
 97 |     expect_error(cat_compare(data, "cyl", "vs", na.rm = 1), msg)
 98 |     expect_error(cat_compare(data, "cyl", "vs", na.rm = ""), msg)
 99 |     expect_error(cat_compare(data, "cyl", "vs", na.rm = 1:10), msg)
100 |     expect_error(cat_compare(data, "cyl", "vs", na.rm = LETTERS[1:10]), msg)
101 |     expect_error(cat_compare(data, "cyl", "vs", na.rm = c(TRUE, FALSE)), msg)
102 |     expect_error(cat_compare(data, "cyl", "vs", na.rm = list()), msg)
103 | })
104 | 
105 | test_that("cat_compare rejects invalid clean_names arguments", {
106 | 
107 |     msg <- "Invalid \"clean_names\" argument. Must be either TRUE or FALSE."
108 |     expect_error(cat_compare(data, "cyl", "vs", clean_names = NULL), msg)
109 |     expect_error(cat_compare(data, "cyl", "vs", clean_names = NA), msg)
110 |     expect_error(cat_compare(data, "cyl", "vs", clean_names = 1), msg)
111 |     expect_error(cat_compare(data, "cyl", "vs", clean_names = ""), msg)
112 |     expect_error(cat_compare(data, "cyl", "vs", clean_names = 1:10), msg)
113 |     expect_error(cat_compare(data, "cyl", "vs", clean_names = LETTERS[1:10]), msg)
114 |     expect_error(cat_compare(data, "cyl", "vs", clean_names = c(TRUE, FALSE)), msg)
115 |     expect_error(cat_compare(data, "cyl", "vs", clean_names = list()), msg)
116 | })
117 | 
118 | test_that("cat_compare rejects invalid only arguments", {
119 | 
120 |     msg <- "Invalid \"only\" argument. Must be a character vector of length one."
121 |     expect_error(cat_compare(data, "cyl", "vs", only = NULL), msg)
122 |     expect_error(cat_compare(data, "cyl", "vs", only = NA), msg)
123 |     expect_error(cat_compare(data, "cyl", "vs", only = 1), msg)
124 |     expect_error(cat_compare(data, "cyl", "vs", only = TRUE), msg)
125 |     expect_error(cat_compare(data, "cyl", "vs", only = 1:10), msg)
126 |     expect_error(cat_compare(data, "cyl", "vs", only = LETTERS[1:10]), msg)
127 |     expect_error(cat_compare(data, "cyl", "vs", only = c(TRUE, FALSE)), msg)
128 |     expect_error(cat_compare(data, "cyl", "vs", only = list()), msg)
129 | })
130 | 
131 | test_that("cat_compare rejects invalid na_label arguments", {
132 | 
133 |     msg <- "Invalid \"na_label\" argument. Must be a character vector of length one."
134 |     expect_error(cat_compare(data, "cyl", "vs", na_label = NULL), msg)
135 |     expect_error(cat_compare(data, "cyl", "vs", na_label = NA), msg)
136 |     expect_error(cat_compare(data, "cyl", "vs", na_label = 1), msg)
137 |     expect_error(cat_compare(data, "cyl", "vs", na_label = TRUE), msg)
138 |     expect_error(cat_compare(data, "cyl", "vs", na_label = 1:10), msg)
139 |     expect_error(cat_compare(data, "cyl", "vs", na_label = LETTERS[1:10]), msg)
140 |     expect_error(cat_compare(data, "cyl", "vs", na_label = c(TRUE, FALSE)), msg)
141 |     expect_error(cat_compare(data, "cyl", "vs", na_label = list()), msg)
142 | })
143 | 
144 | test_that("cat_compare returns correct data with defaults", {
145 | 
146 |     expected <- tibble::tibble(
147 |         cyl = c(4, 6, 8, NA),
148 |         n_0 = c(1, 1, 14, 1),
149 |         n_1 = c(10, 4, 0, 0),
150 |         n_na = c(0, 1, 0, 0),
151 |         p_0 = c(0.05882353, 0.05882353, 0.82352941, 0.05882353),
152 |         p_1 = c(.71428571, 0.28571429, 0.0, 0.0),
153 |         p_na = c(0, 1, 0, 0))
154 |     observed <- cat_compare(data, "cyl", "vs")
155 |     expect_equal(observed, expected)
156 | })
157 | 
158 | test_that("cat_compare returns correct data with a valid na.rm.row argument", {
159 | 
160 |     expected <- tibble::tibble(
161 |         cyl = c(4, 6, 8, NA),
162 |         n_0 = c(1, 1, 14, 1),
163 |         n_1 = c(10, 4, 0, 0),
164 |         n_na = c(0, 1, 0, 0),
165 |         p_0 = c(0.05882353, 0.05882353, 0.82352941, 0.05882353),
166 |         p_1 = c(.71428571, 0.28571429, 0.0, 0.0),
167 |         p_na = c(0, 1, 0, 0))
168 |     observed <- cat_compare(data, "cyl", "vs", na.rm.row = FALSE)
169 |     expect_equal(observed, expected)
170 | 
171 |     expected <- tibble::tibble(
172 |         cyl = c(4, 6, 8),
173 |         n_0 = c(1, 1, 14),
174 |         n_1 = c(10, 4, 0),
175 |         n_na = c(0, 1, 0),
176 |         p_0 = c(0.0625, 0.0625, 0.8750),
177 |         p_1 = c(.71428571, 0.28571429, 0.0),
178 |         p_na = c(0, 1, 0))
179 |     observed <- cat_compare(data, "cyl", "vs", na.rm.row = TRUE)
180 |     expect_equal(observed, expected)
181 | })
182 | 
183 | test_that("cat_compare returns correct data with a valid na.rm.col argument", {
184 | 
185 |     expected <- tibble::tibble(
186 |         cyl = c(4, 6, 8, NA),
187 |         n_0 = c(1, 1, 14, 1),
188 |         n_1 = c(10, 4, 0, 0),
189 |         n_na = c(0, 1, 0, 0),
190 |         p_0 = c(0.05882353, 0.05882353, 0.82352941, 0.05882353),
191 |         p_1 = c(.71428571, 0.28571429, 0.0, 0.0),
192 |         p_na = c(0, 1, 0, 0))
193 |     observed <- cat_compare(data, "cyl", "vs", na.rm.col = FALSE)
194 |     expect_equal(observed, expected)
195 | 
196 |     expected <- tibble::tibble(
197 |         cyl = c(4, 6, 8, NA),
198 |         n_0 = c(1, 1, 14, 1),
199 |         n_1 = c(10, 4, 0, 0),
200 |         p_0 = c(0.05882353, 0.05882353, 0.82352941, 0.05882353),
201 |         p_1 = c(.71428571, 0.28571429, 0.0, 0.0))
202 |     observed <- cat_compare(data, "cyl", "vs", na.rm.col = TRUE)
203 |     expect_equal(observed, expected)
204 | })
205 | 
206 | test_that("cat_compare returns correct data with a valid na.rm argument", {
207 | 
208 |     expected <- tibble::tibble(
209 |         cyl = c(4, 6, 8, NA),
210 |         n_0 = c(1, 1, 14, 1),
211 |         n_1 = c(10, 4, 0, 0),
212 |         n_na = c(0, 1, 0, 0),
213 |         p_0 = c(0.05882353, 0.05882353, 0.82352941, 0.05882353),
214 |         p_1 = c(.71428571, 0.28571429, 0.0, 0.0),
215 |         p_na = c(0, 1, 0, 0))
216 |     observed <- cat_compare(data, "cyl", "vs", na.rm = FALSE)
217 |     expect_equal(observed, expected)
218 | 
219 |     expected <- tibble::tibble(
220 |         cyl = c(4, 6, 8),
221 |         n_0 = c(1, 1, 14),
222 |         n_1 = c(10, 4, 0),
223 |         p_0 = c(0.0625, 0.0625, 0.8750),
224 |         p_1 = c(.71428571, 0.28571429, 0.0))
225 |     observed <- cat_compare(data, "cyl", "vs", na.rm = TRUE)
226 |     expect_equal(observed, expected)
227 | })
228 | 
229 | test_that("cat_compare returns correct data with a valid only argument", {
230 | 
231 |     expected <- tibble::tibble(
232 |         cyl = c(4, 6, 8, NA),
233 |         n_0 = c(1, 1, 14, 1),
234 |         n_1 = c(10, 4, 0, 0),
235 |         n_na = c(0, 1, 0, 0),
236 |         p_0 = c(0.05882353, 0.05882353, 0.82352941, 0.05882353),
237 |         p_1 = c(.71428571, 0.28571429, 0.0, 0.0),
238 |         p_na = c(0, 1, 0, 0))
239 |     observed <- cat_compare(data, "cyl", "vs", only = "ignore")
240 |     expect_equal(observed, expected)
241 | 
242 |     expected_number <- tibble::tibble(
243 |         cyl = c(4, 6, 8, NA),
244 |         n_0 = c(1, 1, 14, 1),
245 |         n_1 = c(10, 4, 0, 0),
246 |         n_na = c(0, 1, 0, 0))
247 | 
248 |     observed <- cat_compare(data, "cyl", "vs", only = "n")
249 |     expect_equal(observed, expected_number)
250 | 
251 |     observed <- cat_compare(data, "cyl", "vs", only = "number")
252 |     expect_equal(observed, expected_number)
253 | 
254 |     observed <- cat_compare(data, "cyl", "vs", only = " number ")
255 |     expect_equal(observed, expected_number)
256 | 
257 |     expected_percent <- tibble::tibble(
258 |         cyl = c(4, 6, 8, NA),
259 |         p_0 = c(0.05882353, 0.05882353, 0.82352941, 0.05882353),
260 |         p_1 = c(.71428571, 0.28571429, 0.0, 0.0),
261 |         p_na = c(0, 1, 0, 0))
262 | 
263 |     observed <- cat_compare(data, "cyl", "vs", only = "p")
264 |     expect_equal(observed, expected_percent)
265 | 
266 |     observed <- cat_compare(data, "cyl", "vs", only = "percent")
267 |     expect_equal(observed, expected_percent)
268 | 
269 |     observed <- cat_compare(data, "cyl", "vs", only = " percent ")
270 |     expect_equal(observed, expected_percent)
271 | })
272 | 
273 | test_that("cat_compare returns correct data with a valid clean_names argument", {
274 | 
275 |     data$Cyl <- data$cyl
276 | 
277 |     expected <- tibble::tibble(
278 |         cyl = c(4, 6, 8, NA),
279 |         n_0 = c(1, 1, 14, 1),
280 |         n_1 = c(10, 4, 0, 0),
281 |         n_na = c(0, 1, 0, 0),
282 |         p_0 = c(0.05882353, 0.05882353, 0.82352941, 0.05882353),
283 |         p_1 = c(.71428571, 0.28571429, 0.0, 0.0),
284 |         p_na = c(0, 1, 0, 0))
285 |     observed <- cat_compare(data, "Cyl", "vs", clean_names = TRUE)
286 |     expect_equal(observed, expected)
287 | 
288 |     expected <- tibble::tibble(
289 |         Cyl = c(4, 6, 8, NA),
290 |         n_0 = c(1, 1, 14, 1),
291 |         n_1 = c(10, 4, 0, 0),
292 |         n_na = c(0, 1, 0, 0),
293 |         p_0 = c(0.05882353, 0.05882353, 0.82352941, 0.05882353),
294 |         p_1 = c(.71428571, 0.28571429, 0.0, 0.0),
295 |         p_na = c(0, 1, 0, 0))
296 |     observed <- cat_compare(data, "Cyl", "vs", clean_names = FALSE)
297 |     expect_equal(observed, expected)
298 | })
299 | 
300 | test_that("cat_compare uses option for default clean_names argument", {
301 | 
302 |     data$Cyl <- data$cyl
303 |     restore_option <- getOption("tabbycat.clean_names")
304 | 
305 |     options(tabbycat.clean_names = TRUE)
306 |     expected <- tibble::tibble(
307 |         cyl = c(4, 6, 8, NA),
308 |         n_0 = c(1, 1, 14, 1),
309 |         n_1 = c(10, 4, 0, 0),
310 |         n_na = c(0, 1, 0, 0),
311 |         p_0 = c(0.05882353, 0.05882353, 0.82352941, 0.05882353),
312 |         p_1 = c(.71428571, 0.28571429, 0.0, 0.0),
313 |         p_na = c(0, 1, 0, 0))
314 |     observed <- cat_compare(data, "Cyl", "vs")
315 |     expect_equal(observed, expected)
316 | 
317 |     options(tabbycat.clean_names = FALSE)
318 |     expected <- tibble::tibble(
319 |         Cyl = c(4, 6, 8, NA),
320 |         n_0 = c(1, 1, 14, 1),
321 |         n_1 = c(10, 4, 0, 0),
322 |         n_na = c(0, 1, 0, 0),
323 |         p_0 = c(0.05882353, 0.05882353, 0.82352941, 0.05882353),
324 |         p_1 = c(.71428571, 0.28571429, 0.0, 0.0),
325 |         p_na = c(0, 1, 0, 0))
326 |     observed <- cat_compare(data, "Cyl", "vs")
327 |     expect_equal(observed, expected)
328 | 
329 |     options(tabbycat.clean_names = restore_option)
330 | })
331 | 
332 | test_that("cat_contrast returns correct data with a valid na_label argument", {
333 | 
334 |     expected <- tibble::tibble(
335 |         cyl = c(4, 6, 8, NA),
336 |         n_0 = c(1, 1, 14, 1),
337 |         n_1 = c(10, 4, 0, 0),
338 |         n_missing = c(0, 1, 0, 0),
339 |         p_0 = c(0.05882353, 0.05882353, 0.82352941, 0.05882353),
340 |         p_1 = c(.71428571, 0.28571429, 0.0, 0.0),
341 |         p_missing = c(0, 1, 0, 0))
342 |     observed <- cat_compare(data, "cyl", "vs", na_label = "missing")
343 |     expect_equal(observed, expected)
344 | })
345 | 
346 | test_that("cat_compare uses option for default na_label argument", {
347 | 
348 |     restore_option <- getOption("tabbycat.na_label")
349 | 
350 |     options(tabbycat.na_label = "missing")
351 |     expected <- tibble::tibble(
352 |         cyl = c(4, 6, 8, NA),
353 |         n_0 = c(1, 1, 14, 1),
354 |         n_1 = c(10, 4, 0, 0),
355 |         n_missing = c(0, 1, 0, 0),
356 |         p_0 = c(0.05882353, 0.05882353, 0.82352941, 0.05882353),
357 |         p_1 = c(.71428571, 0.28571429, 0.0, 0.0),
358 |         p_missing = c(0, 1, 0, 0))
359 |     observed <- cat_compare(data, "cyl", "vs")
360 |     expect_equal(observed, expected)
361 | 
362 |     options(tabbycat.na_label = restore_option)
363 | })
364 | 


--------------------------------------------------------------------------------
/tests/testthat/test_contrast.R:
--------------------------------------------------------------------------------
  1 | # Test contrast.R
  2 | 
  3 | # Setup ----------------------------------------------------------------------
  4 | 
  5 | data <- mtcars %>%
  6 |     tibble::rownames_to_column("model") %>%
  7 |     tibble::as_tibble() %>%
  8 |     dplyr::mutate(
  9 |         manufacturer = stringr::str_split(model, " ", simplify = TRUE)[, 1])
 10 | 
 11 | data$cyl[1] <- NA
 12 | data$manufacturer[2] <- NA
 13 | 
 14 | # Tests: cat_contrast --------------------------------------------------------
 15 | 
 16 | test_that("cat_contrast rejects a data argument that is not a dataframe", {
 17 | 
 18 |     msg <- "The \"data\" argument is not a dataframe."
 19 |     expect_error(cat_contrast(NULL, "cyl", "manufacturer", "Merc"), msg)
 20 |     expect_error(cat_contrast(NA, "cyl", "manufacturer", "Merc"), msg)
 21 |     expect_error(cat_contrast(1:10, "cyl", "manufacturer", "Merc"), msg)
 22 |     expect_error(cat_contrast(LETTERS[1:10], "cyl", "manufacturer", "Merc"), msg)
 23 |     expect_error(cat_contrast(c(TRUE, FALSE), "cyl", "manufacturer", "Merc"), msg)
 24 |     expect_error(cat_contrast(list(), "cyl", "manufacturer", "Merc"), msg)
 25 | })
 26 | 
 27 | test_that("cat_contrast rejects a data argument that has no rows", {
 28 | 
 29 |     msg <- "The \"data\" argument is empty."
 30 |     expect_error(cat_contrast(data.frame(), "cyl", "manufacturer", "Merc"), msg)
 31 | })
 32 | 
 33 | test_that("cat_contrast rejects invalid row_cat arguments", {
 34 | 
 35 |     msg <- "Invalid \"row_cat\" argument. Must be a character vector of length one."
 36 |     expect_error(cat_contrast(data, NULL, "manufacturer", "Merc"), msg)
 37 |     expect_error(cat_contrast(data, NA, "manufacturer", "Merc"), msg)
 38 |     expect_error(cat_contrast(data, 1:10, "manufacturer", "Merc"), msg)
 39 |     expect_error(cat_contrast(data, LETTERS[1:10], "manufacturer", "Merc"), msg)
 40 |     expect_error(cat_contrast(data, c(TRUE, FALSE), "manufacturer", "Merc"), msg)
 41 |     expect_error(cat_contrast(data, list(), "manufacturer", "Merc"), msg)
 42 | })
 43 | 
 44 | test_that("cat_contrast rejects a row_cat argument that is not a column in the data", {
 45 | 
 46 |     msg <- "'notacolumn' is not a column in the dataframe."
 47 |     expect_error(cat_contrast(data, "notacolumn", "manufacturer", "Merc"), msg)
 48 | })
 49 | 
 50 | test_that("cat_contrast rejects invalid col_cat arguments", {
 51 | 
 52 |     msg <- "Invalid \"col_cat\" argument. Must be a character vector of length one."
 53 |     expect_error(cat_contrast(data, "cyl", NULL, "Merc"), msg)
 54 |     expect_error(cat_contrast(data, "cyl", NA, "Merc"), msg)
 55 |     expect_error(cat_contrast(data, "cyl", 1:10, "Merc"), msg)
 56 |     expect_error(cat_contrast(data, "cyl", LETTERS[1:10], "Merc"), msg)
 57 |     expect_error(cat_contrast(data, "cyl", c(TRUE, FALSE), "Merc"), msg)
 58 |     expect_error(cat_contrast(data, "cyl", list(), "Merc"), msg)
 59 | })
 60 | 
 61 | test_that("cat_contrast rejects a col_cat argument that is not a column in the data", {
 62 | 
 63 |     msg <- "'notacolumn' is not a column in the dataframe."
 64 |     expect_error(cat_contrast(data, "cyl", "notacolumn", "Merc"), msg)
 65 | })
 66 | 
 67 | test_that("cat_contrast rejects invalid col_group arguments", {
 68 | 
 69 |     msg <- "Invalid \"col_group\" argument. Must be a character vector of length one."
 70 |     expect_error(cat_contrast(data, "cyl", "manufacturer", NULL), msg)
 71 |     expect_error(cat_contrast(data, "cyl", "manufacturer", NA), msg)
 72 |     expect_error(cat_contrast(data, "cyl", "manufacturer", 1:10), msg)
 73 |     expect_error(cat_contrast(data, "cyl", "manufacturer", LETTERS[1:10]), msg)
 74 |     expect_error(cat_contrast(data, "cyl", "manufacturer", c(TRUE, FALSE)), msg)
 75 |     expect_error(cat_contrast(data, "cyl", "manufacturer", list()), msg)
 76 | })
 77 | 
 78 | test_that("cat_contrast rejects a col_group argument that does not exist in col_cat", {
 79 | 
 80 |     msg <- "The \"col_group\" 'notagroup' does not exist in the \"col_cat\" 'manufacturer'."
 81 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "notagroup"), msg)
 82 | })
 83 | 
 84 | test_that("cat_contrast rejects invalid na.rm.row arguments", {
 85 | 
 86 |     msg <- "Invalid \"na.rm.row\" argument. Must be either TRUE or FALSE."
 87 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm.row = NULL), msg)
 88 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm.row = NA), msg)
 89 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm.row = 1), msg)
 90 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm.row = ""), msg)
 91 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm.row = 1:10), msg)
 92 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm.row = LETTERS[1:10]), msg)
 93 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm.row = c(TRUE, FALSE)), msg)
 94 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm.row = list()), msg)
 95 | })
 96 | 
 97 | test_that("cat_contrast rejects invalid na.rm.col arguments", {
 98 | 
 99 |     msg <- "Invalid \"na.rm.col\" argument. Must be either TRUE or FALSE."
100 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm.col = NULL), msg)
101 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm.col = NA), msg)
102 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm.col = 1), msg)
103 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm.col = ""), msg)
104 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm.col = 1:10), msg)
105 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm.col = LETTERS[1:10]), msg)
106 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm.col = c(TRUE, FALSE)), msg)
107 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm.col = list()), msg)
108 | })
109 | 
110 | test_that("cat_contrast rejects invalid na.rm arguments", {
111 | 
112 |     msg <- "Invalid \"na.rm\" argument. Must be either NULL, TRUE or FALSE."
113 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm = NA), msg)
114 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm = 1), msg)
115 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm = ""), msg)
116 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm = 1:10), msg)
117 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm = LETTERS[1:10]), msg)
118 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm = c(TRUE, FALSE)), msg)
119 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na.rm = list()), msg)
120 | })
121 | 
122 | test_that("cat_contrast rejects invalid clean_names arguments", {
123 | 
124 |     msg <- "Invalid \"clean_names\" argument. Must be either TRUE or FALSE."
125 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", clean_names = NULL), msg)
126 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", clean_names = NA), msg)
127 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", clean_names = 1), msg)
128 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", clean_names = ""), msg)
129 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", clean_names = 1:10), msg)
130 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", clean_names = LETTERS[1:10]), msg)
131 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", clean_names = c(TRUE, FALSE)), msg)
132 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", clean_names = list()), msg)
133 | })
134 | 
135 | test_that("cat_contrast rejects invalid only arguments", {
136 | 
137 |     msg <- "Invalid \"only\" argument. Must be a character vector of length one."
138 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", only = NULL), msg)
139 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", only = NA), msg)
140 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", only = 1), msg)
141 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", only = TRUE), msg)
142 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", only = 1:10), msg)
143 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", only = LETTERS[1:10]), msg)
144 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", only = c(TRUE, FALSE)), msg)
145 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", only = list()), msg)
146 | })
147 | 
148 | test_that("cat_contrast rejects invalid other_label arguments", {
149 | 
150 |     msg <- "Invalid \"other_label\" argument. Must be a character vector of length one."
151 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", other_label = NULL), msg)
152 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", other_label = NA), msg)
153 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", other_label = 1), msg)
154 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", other_label = TRUE), msg)
155 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", other_label = 1:10), msg)
156 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", other_label = LETTERS[1:10]), msg)
157 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", other_label = c(TRUE, FALSE)), msg)
158 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", other_label = list()), msg)
159 | })
160 | 
161 | test_that("cat_contrast rejects invalid na_label arguments", {
162 | 
163 |     msg <- "Invalid \"na_label\" argument. Must be a character vector of length one."
164 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na_label = NULL), msg)
165 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na_label = NA), msg)
166 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na_label = 1), msg)
167 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na_label = TRUE), msg)
168 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na_label = 1:10), msg)
169 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na_label = LETTERS[1:10]), msg)
170 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na_label = c(TRUE, FALSE)), msg)
171 |     expect_error(cat_contrast(data, "cyl", "manufacturer", "Merc", na_label = list()), msg)
172 | })
173 | 
174 | test_that("cat_contrast returns correct data with defaults", {
175 | 
176 |     expected <- tibble::tibble(
177 |         cyl = c(8, 4, 6, NA),
178 |         n_merc = c(3, 2, 2, 0),
179 |         n_other = c(11, 9, 3, 1),
180 |         n_na = c(0, 0, 1, 0),
181 |         p_merc = c(0.42857143, 0.28571429, 0.28571429, 0.0),
182 |         p_other = c(0.45833333, 0.37500000, 0.12500000, 0.04166667),
183 |         p_na = c(0, 0, 1, 0))
184 |     observed <- cat_contrast(data, "cyl", "manufacturer", "Merc")
185 |     expect_equal(observed, expected)
186 | })
187 | 
188 | test_that("cat_contrast returns correct data with a valid na.rm.row argument", {
189 | 
190 |     expected <- tibble::tibble(
191 |         cyl = c(8, 4, 6, NA),
192 |         n_merc = c(3, 2, 2, 0),
193 |         n_other = c(11, 9, 3, 1),
194 |         n_na = c(0, 0, 1, 0),
195 |         p_merc = c(0.42857143, 0.28571429, 0.28571429, 0.0),
196 |         p_other = c(0.45833333, 0.37500000, 0.12500000, 0.04166667),
197 |         p_na = c(0, 0, 1, 0))
198 |     observed <- cat_contrast(
199 |         data,
200 |         "cyl",
201 |         "manufacturer",
202 |         "Merc",
203 |         na.rm.row = FALSE)
204 |     expect_equal(observed, expected)
205 | 
206 |     expected <- tibble::tibble(
207 |         cyl = c(8, 4, 6),
208 |         n_merc = c(3, 2, 2),
209 |         n_other = c(11, 9, 3),
210 |         n_na = c(0, 0, 1),
211 |         p_merc = c( 0.42857143, 0.28571429, 0.28571429),
212 |         p_other = c(0.47826087, 0.39130435, 0.13043478),
213 |         p_na = c(0, 0, 1))
214 |     observed <- cat_contrast(
215 |         data,
216 |         "cyl",
217 |         "manufacturer",
218 |         "Merc",
219 |         na.rm.row = TRUE)
220 |     expect_equal(observed, expected)
221 | })
222 | 
223 | test_that("cat_contrast returns correct data with a valid na.rm.col argument", {
224 | 
225 |     expected <- tibble::tibble(
226 |         cyl = c(8, 4, 6, NA),
227 |         n_merc = c(3, 2, 2, 0),
228 |         n_other = c(11, 9, 3, 1),
229 |         n_na = c(0, 0, 1, 0),
230 |         p_merc = c(0.42857143, 0.28571429, 0.28571429, 0.0),
231 |         p_other = c(0.45833333, 0.37500000, 0.12500000, 0.04166667),
232 |         p_na = c(0, 0, 1, 0))
233 |     observed <- cat_contrast(
234 |         data,
235 |         "cyl",
236 |         "manufacturer",
237 |         "Merc",
238 |         na.rm.col = FALSE)
239 |     expect_equal(observed, expected)
240 | 
241 |     expected <- tibble::tibble(
242 |         cyl = c(8, 4, 6, NA),
243 |         n_merc = c(3, 2, 2, 0),
244 |         n_other = c(11, 9, 3, 1),
245 |         p_merc = c(0.42857143, 0.28571429, 0.28571429, 0.0),
246 |         p_other = c(0.45833333, 0.37500000, 0.12500000, 0.04166667))
247 |     observed <- cat_contrast(
248 |         data,
249 |         "cyl",
250 |         "manufacturer",
251 |         "Merc",
252 |         na.rm.col = TRUE)
253 |     expect_equal(observed, expected)
254 | })
255 | 
256 | test_that("cat_contrast returns correct data with a valid na.rm argument", {
257 | 
258 |     expected <- tibble::tibble(
259 |         cyl = c(8, 4, 6, NA),
260 |         n_merc = c(3, 2, 2, 0),
261 |         n_other = c(11, 9, 3, 1),
262 |         n_na = c(0, 0, 1, 0),
263 |         p_merc = c(0.42857143, 0.28571429, 0.28571429, 0.0),
264 |         p_other = c(0.45833333, 0.37500000, 0.12500000, 0.04166667),
265 |         p_na = c(0, 0, 1, 0))
266 |     observed <- cat_contrast(
267 |         data,
268 |         "cyl",
269 |         "manufacturer",
270 |         "Merc",
271 |         na.rm = FALSE)
272 |     expect_equal(observed, expected)
273 | 
274 |     expected <- tibble::tibble(
275 |         cyl = c(8, 4, 6),
276 |         n_merc = c(3, 2, 2),
277 |         n_other = c(11, 9, 3),
278 |         p_merc = c( 0.42857143, 0.28571429, 0.28571429),
279 |         p_other = c(0.47826087, 0.39130435, 0.13043478))
280 |     observed <- cat_contrast(
281 |         data,
282 |         "cyl",
283 |         "manufacturer",
284 |         "Merc",
285 |         na.rm = TRUE)
286 |     expect_equal(observed, expected)
287 | })
288 | 
289 | test_that("cat_contrast returns correct data with a valid only argument", {
290 | 
291 |     expected <- tibble::tibble(
292 |         cyl = c(8, 4, 6, NA),
293 |         n_merc = c(3, 2, 2, 0),
294 |         n_other = c(11, 9, 3, 1),
295 |         n_na = c(0, 0, 1, 0),
296 |         p_merc = c(0.42857143, 0.28571429, 0.28571429, 0.0),
297 |         p_other = c(0.45833333, 0.37500000, 0.12500000, 0.04166667),
298 |         p_na = c(0, 0, 1, 0))
299 |     observed <- cat_contrast(
300 |         data,
301 |         "cyl",
302 |         "manufacturer",
303 |         "Merc",
304 |         only = "ignore")
305 |     expect_equal(observed, expected)
306 | 
307 |     expected_number <- tibble::tibble(
308 |         cyl = c(8, 4, 6, NA),
309 |         n_merc = c(3, 2, 2, 0),
310 |         n_other = c(11, 9, 3, 1),
311 |         n_na = c(0, 0, 1, 0))
312 | 
313 |     observed <- cat_contrast(
314 |         data,
315 |         "cyl",
316 |         "manufacturer",
317 |         "Merc",
318 |         only = "n")
319 |     expect_equal(observed, expected_number)
320 | 
321 |     observed <- cat_contrast(
322 |         data,
323 |         "cyl",
324 |         "manufacturer",
325 |         "Merc",
326 |         only = "number")
327 |     expect_equal(observed, expected_number)
328 | 
329 |     observed <- cat_contrast(
330 |         data,
331 |         "cyl",
332 |         "manufacturer",
333 |         "Merc",
334 |         only = " number ")
335 |     expect_equal(observed, expected_number)
336 | 
337 |     expected_percent <- tibble::tibble(
338 |         cyl = c(8, 4, 6, NA),
339 |         p_merc = c(0.42857143, 0.28571429, 0.28571429, 0.0),
340 |         p_other = c(0.45833333, 0.37500000, 0.12500000, 0.04166667),
341 |         p_na = c(0, 0, 1, 0))
342 | 
343 |     observed <- cat_contrast(
344 |         data,
345 |         "cyl",
346 |         "manufacturer",
347 |         "Merc",
348 |         only = "p")
349 |     expect_equal(observed, expected_percent)
350 | 
351 |     observed <- cat_contrast(
352 |         data,
353 |         "cyl",
354 |         "manufacturer",
355 |         "Merc",
356 |         only = "percent")
357 |     expect_equal(observed, expected_percent)
358 | 
359 |     observed <- cat_contrast(
360 |         data,
361 |         "cyl",
362 |         "manufacturer",
363 |         "Merc",
364 |         only = " percent ")
365 |     expect_equal(observed, expected_percent)
366 | })
367 | 
368 | test_that("cat_contrast returns correct data with a valid clean_names argument", {
369 | 
370 |     data$Cyl <- data$cyl
371 | 
372 |     expected <- tibble::tibble(
373 |         cyl = c(8, 4, 6, NA),
374 |         n_merc = c(3, 2, 2, 0),
375 |         n_other = c(11, 9, 3, 1),
376 |         n_na = c(0, 0, 1, 0),
377 |         p_merc = c(0.42857143, 0.28571429, 0.28571429, 0.0),
378 |         p_other = c(0.45833333, 0.37500000, 0.12500000, 0.04166667),
379 |         p_na = c(0, 0, 1, 0))
380 |     observed <- cat_contrast(
381 |         data,
382 |         "Cyl",
383 |         "manufacturer",
384 |         "Merc",
385 |         clean_names = TRUE)
386 |     expect_equal(observed, expected)
387 | 
388 |     expected <- tibble::tibble(
389 |         Cyl = c(8, 4, 6, NA),
390 |         n_Merc = c(3, 2, 2, 0),
391 |         n_other = c(11, 9, 3, 1),
392 |         n_na = c(0, 0, 1, 0),
393 |         p_Merc = c(0.42857143, 0.28571429, 0.28571429, 0.0),
394 |         p_other = c(0.45833333, 0.37500000, 0.12500000, 0.04166667),
395 |         p_na = c(0, 0, 1, 0))
396 |     observed <- cat_contrast(
397 |         data,
398 |         "Cyl",
399 |         "manufacturer",
400 |         "Merc",
401 |         clean_names = FALSE)
402 |     expect_equal(observed, expected)
403 | })
404 | 
405 | test_that("cat_contrast uses option for default clean_names argument", {
406 | 
407 |     data$Cyl <- data$cyl
408 |     restore_option <- getOption("tabbycat.clean_names")
409 | 
410 |     options(tabbycat.clean_names = TRUE)
411 |     expected <- tibble::tibble(
412 |         cyl = c(8, 4, 6, NA),
413 |         n_merc = c(3, 2, 2, 0),
414 |         n_other = c(11, 9, 3, 1),
415 |         n_na = c(0, 0, 1, 0),
416 |         p_merc = c(0.42857143, 0.28571429, 0.28571429, 0.0),
417 |         p_other = c(0.45833333, 0.37500000, 0.12500000, 0.04166667),
418 |         p_na = c(0, 0, 1, 0))
419 |     observed <- cat_contrast(
420 |         data,
421 |         "Cyl",
422 |         "manufacturer",
423 |         "Merc")
424 |     expect_equal(observed, expected)
425 | 
426 |     options(tabbycat.clean_names = FALSE)
427 |     expected <- tibble::tibble(
428 |         Cyl = c(8, 4, 6, NA),
429 |         n_Merc = c(3, 2, 2, 0),
430 |         n_other = c(11, 9, 3, 1),
431 |         n_na = c(0, 0, 1, 0),
432 |         p_Merc = c(0.42857143, 0.28571429, 0.28571429, 0.0),
433 |         p_other = c(0.45833333, 0.37500000, 0.12500000, 0.04166667),
434 |         p_na = c(0, 0, 1, 0))
435 |     observed <- cat_contrast(
436 |         data,
437 |         "Cyl",
438 |         "manufacturer",
439 |         "Merc")
440 |     expect_equal(observed, expected)
441 | 
442 |     options(tabbycat.clean_names = restore_option)
443 | })
444 | 
445 | test_that("cat_contrast returns correct data with a valid na_label argument", {
446 | 
447 |     expected <- tibble::tibble(
448 |         cyl = c(8, 4, 6, NA),
449 |         n_merc = c(3, 2, 2, 0),
450 |         n_other = c(11, 9, 3, 1),
451 |         n_missing = c(0, 0, 1, 0),
452 |         p_merc = c(0.42857143, 0.28571429, 0.28571429, 0.0),
453 |         p_other = c(0.45833333, 0.37500000, 0.12500000, 0.04166667),
454 |         p_missing = c(0, 0, 1, 0))
455 |     observed <- cat_contrast(
456 |         data,
457 |         "cyl",
458 |         "manufacturer",
459 |         "Merc",
460 |         na_label = "missing")
461 |     expect_equal(observed, expected)
462 | })
463 | 
464 | test_that("cat_contrast uses option for default na_label argument", {
465 | 
466 |     restore_option <- getOption("tabbycat.na_label")
467 | 
468 |     options(tabbycat.na_label = "missing")
469 |     expected <- tibble::tibble(
470 |         cyl = c(8, 4, 6, NA),
471 |         n_merc = c(3, 2, 2, 0),
472 |         n_other = c(11, 9, 3, 1),
473 |         n_missing = c(0, 0, 1, 0),
474 |         p_merc = c(0.42857143, 0.28571429, 0.28571429, 0.0),
475 |         p_other = c(0.45833333, 0.37500000, 0.12500000, 0.04166667),
476 |         p_missing = c(0, 0, 1, 0))
477 |     observed <- cat_contrast(
478 |         data,
479 |         "cyl",
480 |         "manufacturer",
481 |         "Merc")
482 |     expect_equal(observed, expected)
483 | 
484 |     options(tabbycat.na_label = restore_option)
485 | 
486 | })
487 | 
488 | test_that("cat_contrast returns correct data with a valid other_label argument", {
489 | 
490 |     expected <- tibble::tibble(
491 |         cyl = c(8, 4, 6, NA),
492 |         n_merc = c(3, 2, 2, 0),
493 |         n_remainder = c(11, 9, 3, 1),
494 |         n_na = c(0, 0, 1, 0),
495 |         p_merc = c(0.42857143, 0.28571429, 0.28571429, 0.0),
496 |         p_remainder = c(0.45833333, 0.37500000, 0.12500000, 0.04166667),
497 |         p_na = c(0, 0, 1, 0))
498 |     observed <- cat_contrast(
499 |         data,
500 |         "cyl",
501 |         "manufacturer",
502 |         "Merc",
503 |         other_label = "remainder")
504 |     expect_equal(observed, expected)
505 | })
506 | 
507 | test_that("cat_contrast uses option for default other_label argument", {
508 | 
509 |     restore_option <- getOption("tabbycat.other_label")
510 | 
511 |     options(tabbycat.other_label = "remainder")
512 |     expected <- tibble::tibble(
513 |         cyl = c(8, 4, 6, NA),
514 |         n_merc = c(3, 2, 2, 0),
515 |         n_remainder = c(11, 9, 3, 1),
516 |         n_na = c(0, 0, 1, 0),
517 |         p_merc = c(0.42857143, 0.28571429, 0.28571429, 0.0),
518 |         p_remainder = c(0.45833333, 0.37500000, 0.12500000, 0.04166667),
519 |         p_na = c(0, 0, 1, 0))
520 |     observed <- cat_contrast(
521 |         data,
522 |         "cyl",
523 |         "manufacturer",
524 |         "Merc")
525 |     expect_equal(observed, expected)
526 | 
527 |     options(tabbycat.other_label = restore_option)
528 | })
529 | 


--------------------------------------------------------------------------------
/tests/testthat/test_count.R:
--------------------------------------------------------------------------------
  1 | # Test count.R
  2 | 
  3 | # Setup -----------------------------------------------------------------------
  4 | 
  5 | data <- mtcars %>%
  6 |     tibble::rownames_to_column("model") %>%
  7 |     tibble::as_tibble() %>%
  8 |     dplyr::mutate(
  9 |         manufacturer = stringr::str_split(model, " ", simplify = TRUE)[, 1])
 10 | 
 11 | data$cyl[1] <- NA
 12 | 
 13 | # Tests: cat_vcount -----------------------------------------------------------
 14 | 
 15 | test_that("cat_count rejects a data argument that is not a dataframe", {
 16 | 
 17 |     msg <- "The \"data\" argument is not a dataframe."
 18 |     expect_error(cat_count(NULL, "cyl"), msg)
 19 |     expect_error(cat_count(NA, "cyl"), msg)
 20 |     expect_error(cat_count(1:10, "cyl"), msg)
 21 |     expect_error(cat_count(LETTERS[1:10], "cyl"), msg)
 22 |     expect_error(cat_count(c(TRUE, FALSE), "cyl"), msg)
 23 |     expect_error(cat_count(list(), "cyl"), msg)
 24 | })
 25 | 
 26 | test_that("cat_count rejects a data argument that has no rows", {
 27 | 
 28 |     msg <- "The \"data\" argument is empty."
 29 |     expect_error(cat_count(data.frame(), "cyl"), msg)
 30 | })
 31 | 
 32 | test_that("cat_count rejects invalid cat arguments", {
 33 | 
 34 |     msg <- "Invalid \"cat\" argument. Must be a character vector of length one."
 35 |     expect_error(cat_count(data, NULL), msg)
 36 |     expect_error(cat_count(data, NA), msg)
 37 |     expect_error(cat_count(data, 1:10), msg)
 38 |     expect_error(cat_count(data, LETTERS[1:10]), msg)
 39 |     expect_error(cat_count(data, c(TRUE, FALSE)), msg)
 40 |     expect_error(cat_count(data, list()), msg)
 41 | })
 42 | 
 43 | test_that("cat_count rejects a cat argument that is not a column in the data", {
 44 | 
 45 |     msg <- "'notacolumn' is not a column in the dataframe."
 46 |     expect_error(cat_count(data, "notacolumn"), msg)
 47 | })
 48 | 
 49 | test_that("cat_count rejects invalid na.rm arguments", {
 50 | 
 51 |     msg <- "Invalid \"na.rm\" argument. Must be either TRUE or FALSE."
 52 |     expect_error(cat_count(data, "cyl", na.rm = NULL), msg)
 53 |     expect_error(cat_count(data, "cyl", na.rm = NA), msg)
 54 |     expect_error(cat_count(data, "cyl", na.rm = 1), msg)
 55 |     expect_error(cat_count(data, "cyl", na.rm = ""), msg)
 56 |     expect_error(cat_count(data, "cyl", na.rm = 1:10), msg)
 57 |     expect_error(cat_count(data, "cyl", na.rm = LETTERS[1:10]), msg)
 58 |     expect_error(cat_count(data, "cyl", na.rm = c(TRUE, FALSE)), msg)
 59 |     expect_error(cat_count(data, "cyl", na.rm = list()), msg)
 60 | })
 61 | 
 62 | test_that("cat_count rejects invalid clean_names arguments", {
 63 | 
 64 |     msg <- "Invalid \"clean_names\" argument. Must be either TRUE or FALSE."
 65 |     expect_error(cat_count(data, "cyl", clean_names = NULL), msg)
 66 |     expect_error(cat_count(data, "cyl", clean_names = NA), msg)
 67 |     expect_error(cat_count(data, "cyl", clean_names = 1), msg)
 68 |     expect_error(cat_count(data, "cyl", clean_names = ""), msg)
 69 |     expect_error(cat_count(data, "cyl", clean_names = 1:10), msg)
 70 |     expect_error(cat_count(data, "cyl", clean_names = LETTERS[1:10]), msg)
 71 |     expect_error(cat_count(data, "cyl", clean_names = c(TRUE, FALSE)), msg)
 72 |     expect_error(cat_count(data, "cyl", clean_names = list()), msg)
 73 | })
 74 | 
 75 | test_that("cat_count rejects invalid only arguments", {
 76 | 
 77 |     msg <- "Invalid \"only\" argument. Must be a character vector of length one."
 78 |     expect_error(cat_count(data, "cyl", only = NULL), msg)
 79 |     expect_error(cat_count(data, "cyl", only = NA), msg)
 80 |     expect_error(cat_count(data, "cyl", only = 1), msg)
 81 |     expect_error(cat_count(data, "cyl", only = TRUE), msg)
 82 |     expect_error(cat_count(data, "cyl", only = 1:10), msg)
 83 |     expect_error(cat_count(data, "cyl", only = LETTERS[1:10]), msg)
 84 |     expect_error(cat_count(data, "cyl", only = c(TRUE, FALSE)), msg)
 85 |     expect_error(cat_count(data, "cyl", only = list()), msg)
 86 | })
 87 | 
 88 | test_that("cat_count returns correct data with defaults", {
 89 | 
 90 |     expected <- tibble::tibble(
 91 |         cyl = c(8, 4, 6, NA),
 92 |         number = c(14, 11, 6, 1),
 93 |         percent = c(0.43750, 0.34375, 0.18750, 0.03125))
 94 |     observed <- cat_count(data, "cyl")
 95 |     expect_equal(observed, expected)
 96 | })
 97 | 
 98 | test_that("cat_count returns correct data with a valid na.rm argument", {
 99 | 
100 |     expected <- tibble::tibble(
101 |         cyl = c(8, 4, 6, NA),
102 |         number = c(14, 11, 6, 1),
103 |         percent = c(0.43750, 0.34375, 0.18750, 0.03125))
104 |     observed <- cat_count(data, "cyl", na.rm = FALSE)
105 |     expect_equal(observed, expected)
106 | 
107 |     expected <- tibble::tibble(
108 |         cyl = c(8, 4, 6),
109 |         number = c(14, 11, 6),
110 |         percent = c(0.4516129032, 0.3548387097, 0.1935483871))
111 |     observed <- cat_count(data, "cyl", na.rm = TRUE)
112 |     expect_equal(observed, expected)
113 | })
114 | 
115 | test_that("cat_count returns correct data with a valid only argument", {
116 | 
117 |     expected <- tibble::tibble(
118 |         cyl = c(8, 4, 6, NA),
119 |         number = c(14, 11, 6, 1),
120 |         percent = c(0.43750, 0.34375, 0.18750, 0.03125))
121 |     observed <- cat_count(data, "cyl", only = "ignore")
122 |     expect_equal(observed, expected)
123 | 
124 |     expected_number <- tibble::tibble(
125 |         cyl = c(8, 4, 6, NA),
126 |         number = c(14, 11, 6, 1))
127 | 
128 |     observed <- cat_count(data, "cyl", only = "n")
129 |     expect_equal(observed, expected_number)
130 | 
131 |     observed <- cat_count(data, "cyl", only = "number")
132 |     expect_equal(observed, expected_number)
133 | 
134 |     observed <- cat_count(data, "cyl", only = " number ")
135 |     expect_equal(observed, expected_number)
136 | 
137 |     expected_percent <- tibble::tibble(
138 |         cyl = c(8, 4, 6, NA),
139 |         percent = c(0.43750, 0.34375, 0.18750, 0.03125))
140 | 
141 |     observed <- cat_count(data, "cyl", only = "p")
142 |     expect_equal(observed, expected_percent)
143 | 
144 |     observed <- cat_count(data, "cyl", only = "percent")
145 |     expect_equal(observed, expected_percent)
146 | 
147 |     observed <- cat_count(data, "cyl", only = " percent ")
148 |     expect_equal(observed, expected_percent)
149 | })
150 | 
151 | test_that("cat_count returns correct data with a valid clean_names argument", {
152 | 
153 |     data$Cyl <- data$cyl
154 | 
155 |     expected <- tibble::tibble(
156 |         cyl = c(8, 4, 6, NA),
157 |         number = c(14, 11, 6, 1),
158 |         percent = c(0.43750, 0.34375, 0.18750, 0.03125))
159 |     observed <- cat_count(data, "Cyl", clean_names = TRUE)
160 |     expect_equal(observed, expected)
161 | 
162 |     expected <- tibble::tibble(
163 |         Cyl = c(8, 4, 6, NA),
164 |         number = c(14, 11, 6, 1),
165 |         percent = c(0.43750, 0.34375, 0.18750, 0.03125))
166 |     observed <- cat_count(data, "Cyl", clean_names = FALSE)
167 |     expect_equal(observed, expected)
168 | })
169 | 
170 | test_that("cat_count uses option for default clean_names argument", {
171 | 
172 |     data$Cyl <- data$cyl
173 |     restore_option <- getOption("tabbycat.clean_names")
174 | 
175 |     options(tabbycat.clean_names = TRUE)
176 |     expected <- tibble::tibble(
177 |         cyl = c(8, 4, 6, NA),
178 |         number = c(14, 11, 6, 1),
179 |         percent = c(0.43750, 0.34375, 0.18750, 0.03125))
180 |     observed <- cat_count(data, "Cyl")
181 |     expect_equal(observed, expected)
182 | 
183 |     options(tabbycat.clean_names = FALSE)
184 |     expected <- tibble::tibble(
185 |         Cyl = c(8, 4, 6, NA),
186 |         number = c(14, 11, 6, 1),
187 |         percent = c(0.43750, 0.34375, 0.18750, 0.03125))
188 |     observed <- cat_count(data, "Cyl")
189 |     expect_equal(observed, expected)
190 | 
191 |     options(tabbycat.clean_names = restore_option)
192 | })
193 | 


--------------------------------------------------------------------------------
/tests/testthat/test_summarise.R:
--------------------------------------------------------------------------------
  1 | # Test summarise.R
  2 | 
  3 | # Setup -----------------------------------------------------------------------
  4 | 
  5 | data <- mtcars %>%
  6 |     tibble::rownames_to_column("model") %>%
  7 |     tibble::as_tibble() %>%
  8 |     dplyr::mutate(
  9 |         manufacturer = stringr::str_split(model, " ", simplify = TRUE)[, 1])
 10 | 
 11 | data$cyl[1] <- NA
 12 | 
 13 | # Tests: cat_vcount -----------------------------------------------------------
 14 | 
 15 | test_that("cat_summarise rejects a data argument that is not a dataframe", {
 16 | 
 17 |     msg <- "The \"data\" argument is not a dataframe."
 18 |     expect_error(cat_summarise(NULL, "cyl", "mpg"), msg)
 19 |     expect_error(cat_summarise(NA, "cyl", "mpg"), msg)
 20 |     expect_error(cat_summarise(1:10, "cyl", "mpg"), msg)
 21 |     expect_error(cat_summarise(LETTERS[1:10], "cyl", "mpg"), msg)
 22 |     expect_error(cat_summarise(c(TRUE, FALSE), "cyl", "mpg"), msg)
 23 |     expect_error(cat_summarise(list(), "cyl", "mpg"), msg)
 24 | })
 25 | 
 26 | test_that("cat_summarise rejects a data argument that has no rows", {
 27 | 
 28 |     msg <- "The \"data\" argument is empty."
 29 |     expect_error(cat_summarise(data.frame(), "cyl", "mpg"), msg)
 30 | })
 31 | 
 32 | test_that("cat_summarise rejects invalid cat arguments", {
 33 | 
 34 |     msg <- "Invalid \"cat\" argument. Must be a character vector of length one."
 35 |     expect_error(cat_summarise(data, NULL, "mpg"), msg)
 36 |     expect_error(cat_summarise(data, NA, "mpg"), msg)
 37 |     expect_error(cat_summarise(data, 1:10, "mpg"), msg)
 38 |     expect_error(cat_summarise(data, LETTERS[1:10], "mpg"), msg)
 39 |     expect_error(cat_summarise(data, c(TRUE, FALSE), "mpg"), msg)
 40 |     expect_error(cat_summarise(data, list(), "mpg"), msg)
 41 | })
 42 | 
 43 | test_that("cat_summarise rejects a cat argument that is not a column in the data", {
 44 | 
 45 |     msg <- "'notacolumn' is not a column in the dataframe."
 46 |     expect_error(cat_summarise(data, "notacolumn", "mpg"), msg)
 47 | })
 48 | 
 49 | test_that("cat_summarise rejects invalid num arguments", {
 50 | 
 51 |     msg <- "Invalid \"num\" argument. Must be a character vector of length one."
 52 |     expect_error(cat_summarise(data, "cyl", NULL), msg)
 53 |     expect_error(cat_summarise(data, "cyl", NA), msg)
 54 |     expect_error(cat_summarise(data, "cyl", 1:10), msg)
 55 |     expect_error(cat_summarise(data, "cyl", LETTERS[1:10]), msg)
 56 |     expect_error(cat_summarise(data, "cyl", c(TRUE, FALSE)), msg)
 57 |     expect_error(cat_summarise(data, "cyl", list()), msg)
 58 | })
 59 | 
 60 | test_that("cat_summarise rejects a num argument that is not a column in the data", {
 61 | 
 62 |     msg <- "'notacolumn' is not a column in the dataframe."
 63 |     expect_error(cat_summarise(data, "cyl", "notacolumn"), msg)
 64 | })
 65 | 
 66 | test_that("cat_summarise rejects a num argument that is not a numeric column", {
 67 | 
 68 |     msg <- "The num argument is not a numeric column."
 69 |     expect_error(cat_summarise(data, "cyl", "manufacturer"), msg)
 70 | })
 71 | 
 72 | test_that("cat_summarise rejects invalid na.rm arguments", {
 73 | 
 74 |     msg <- "Invalid \"na.rm\" argument. Must be either TRUE or FALSE."
 75 |     expect_error(cat_summarise(data, "cyl", "mpg", na.rm = NULL), msg)
 76 |     expect_error(cat_summarise(data, "cyl", "mpg", na.rm = NA), msg)
 77 |     expect_error(cat_summarise(data, "cyl", "mpg", na.rm = 1), msg)
 78 |     expect_error(cat_summarise(data, "cyl", "mpg", na.rm = ""), msg)
 79 |     expect_error(cat_summarise(data, "cyl", "mpg", na.rm = 1:10), msg)
 80 |     expect_error(cat_summarise(data, "cyl", "mpg", na.rm = LETTERS[1:10]), msg)
 81 |     expect_error(cat_summarise(data, "cyl", "mpg", na.rm = c(TRUE, FALSE)), msg)
 82 |     expect_error(cat_summarise(data, "cyl", "mpg", na.rm = list()), msg)
 83 | })
 84 | 
 85 | test_that("cat_summarise rejects invalid clean_names arguments", {
 86 | 
 87 |     msg <- "Invalid \"clean_names\" argument. Must be either TRUE or FALSE."
 88 |     expect_error(cat_summarise(data, "cyl", "mpg", clean_names = NULL), msg)
 89 |     expect_error(cat_summarise(data, "cyl", "mpg", clean_names = NA), msg)
 90 |     expect_error(cat_summarise(data, "cyl", "mpg", clean_names = 1), msg)
 91 |     expect_error(cat_summarise(data, "cyl", "mpg", clean_names = ""), msg)
 92 |     expect_error(cat_summarise(data, "cyl", "mpg", clean_names = 1:10), msg)
 93 |     expect_error(cat_summarise(data, "cyl", "mpg", clean_names = LETTERS[1:10]), msg)
 94 |     expect_error(cat_summarise(data, "cyl", "mpg", clean_names = c(TRUE, FALSE)), msg)
 95 |     expect_error(cat_summarise(data, "cyl", "mpg", clean_names = list()), msg)
 96 | })
 97 | 
 98 | test_that("cat_summarise returns correct data with defaults", {
 99 | 
100 |     expected <- tibble::tibble(
101 |         cyl = c(4, 6, 8, NA),
102 |         n = c(11, 6, 14, 1),
103 |         na = c(0, 0, 0, 0),
104 |         mean = c(26.663636, 19.5333333, 15.10000, 21.00000),
105 |         sd = c(4.50982765, 1.47196014, 2.56004808, NA),
106 |         min = c(21.4, 17.8, 10.4, 21.0),
107 |         lq = c(22.800, 18.375, 14.400, 21.000),
108 |         med = c(26.00, 19.45, 15.20, 21.00),
109 |         uq = c(30.400, 20.675, 16.250, 21.000),
110 |         max = c(33.9, 21.4, 19.2, 21.0))
111 |     observed <- cat_summarise(data, "cyl", "mpg")
112 |     expect_equal(observed, expected)
113 | 
114 |     data_na_all_na <- data
115 |     data_na_all_na$mpg[1] <- NA
116 | 
117 |     expected <- tibble::tibble(
118 |         cyl = c(4, 6, 8, NA),
119 |         n = c(11, 6, 14, 1),
120 |         na = c(0, 0, 0, 1),
121 |         mean = c(26.663636, 19.5333333, 15.10000, NA),
122 |         sd = c(4.50982765, 1.47196014, 2.56004808, NA),
123 |         min = c(21.4, 17.8, 10.4, NA),
124 |         lq = c(22.800, 18.375, 14.400, NA),
125 |         med = c(26.00, 19.45, 15.20, NA),
126 |         uq = c(30.400, 20.675, 16.250, NA),
127 |         max = c(33.9, 21.4, 19.2, NA))
128 |     observed <- cat_summarise(data_na_all_na, "cyl", "mpg")
129 |     expect_equal(observed, expected)
130 | 
131 |     data_na_summary <- data
132 |     data_na_summary$cyl[3] <- NA
133 | 
134 |     expected <- tibble::tibble(
135 |         cyl = c(4, 6, 8, NA),
136 |         n = c(10, 6, 14, 2),
137 |         na = c(0, 0, 0, 0),
138 |         mean = c(27.05000, 19.5333333, 15.10000, 21.90000),
139 |         sd = c(4.557838182, 1.4719601444, 2.5600480765, 1.272792061),
140 |         min = c(21.4, 17.8, 10.4, 21.0),
141 |         lq = c(23.200, 18.375, 14.400, 21.450),
142 |         med = c(26.65, 19.45, 15.20, 21.90),
143 |         uq = c(30.400, 20.675, 16.250, 22.350),
144 |         max = c(33.9, 21.4, 19.2, 22.8))
145 |     observed <- cat_summarise(data_na_summary, "cyl", "mpg")
146 |     expect_equal(observed, expected)
147 | 
148 | })
149 | 
150 | test_that("cat_summarise returns correct data with a valid na.rm argument", {
151 | 
152 |     expected <- tibble::tibble(
153 |         cyl = c(4, 6, 8, NA),
154 |         n = c(11, 6, 14, 1),
155 |         na = c(0, 0, 0, 0),
156 |         mean = c(26.663636, 19.5333333, 15.10000, 21.00000),
157 |         sd = c(4.50982765, 1.47196014, 2.56004808, NA),
158 |         min = c(21.4, 17.8, 10.4, 21.0),
159 |         lq = c(22.800, 18.375, 14.400, 21.000),
160 |         med = c(26.00, 19.45, 15.20, 21.00),
161 |         uq = c(30.400, 20.675, 16.250, 21.000),
162 |         max = c(33.9, 21.4, 19.2, 21.0))
163 |     observed <- cat_summarise(data, "cyl", "mpg", na.rm = FALSE)
164 |     expect_equal(observed, expected)
165 | 
166 |     expected <- tibble::tibble(
167 |         cyl = c(4, 6, 8),
168 |         n = c(11, 6, 14),
169 |         na = c(0, 0, 0),
170 |         mean = c(26.663636, 19.5333333, 15.10000),
171 |         sd = c(4.50982765, 1.47196014, 2.56004808),
172 |         min = c(21.4, 17.8, 10.4),
173 |         lq = c(22.800, 18.375, 14.400),
174 |         med = c(26.00, 19.45, 15.20),
175 |         uq = c(30.400, 20.675, 16.250),
176 |         max = c(33.9, 21.4, 19.2))
177 |     observed <- cat_summarise(data, "cyl", "mpg", na.rm = TRUE)
178 |     expect_equal(observed, expected)
179 | })
180 | 
181 | test_that("cat_summarise returns correct data with a valid clean_names argument", {
182 | 
183 |     data$Cyl <- data$cyl
184 | 
185 |     expected <- tibble::tibble(
186 |         cyl = c(4, 6, 8, NA),
187 |         n = c(11, 6, 14, 1),
188 |         na = c(0, 0, 0, 0),
189 |         mean = c(26.663636, 19.5333333, 15.10000, 21.00000),
190 |         sd = c(4.50982765, 1.47196014, 2.56004808, NA),
191 |         min = c(21.4, 17.8, 10.4, 21.0),
192 |         lq = c(22.800, 18.375, 14.400, 21.000),
193 |         med = c(26.00, 19.45, 15.20, 21.00),
194 |         uq = c(30.400, 20.675, 16.250, 21.000),
195 |         max = c(33.9, 21.4, 19.2, 21.0))
196 |     observed <- cat_summarise(data, "Cyl", "mpg", clean_names = TRUE)
197 |     expect_equal(observed, expected)
198 | 
199 |     expected <- tibble::tibble(
200 |         Cyl = c(4, 6, 8, NA),
201 |         n = c(11, 6, 14, 1),
202 |         na = c(0, 0, 0, 0),
203 |         mean = c(26.663636, 19.5333333, 15.10000, 21.00000),
204 |         sd = c(4.50982765, 1.47196014, 2.56004808, NA),
205 |         min = c(21.4, 17.8, 10.4, 21.0),
206 |         lq = c(22.800, 18.375, 14.400, 21.000),
207 |         med = c(26.00, 19.45, 15.20, 21.00),
208 |         uq = c(30.400, 20.675, 16.250, 21.000),
209 |         max = c(33.9, 21.4, 19.2, 21.0))
210 |     observed <- cat_summarise(data, "Cyl", "mpg", clean_names = FALSE)
211 |     expect_equal(observed, expected)
212 | })
213 | 
214 | test_that("cat_summarise uses option for default clean_names argument", {
215 | 
216 |     data$Cyl <- data$cyl
217 |     restore_option <- getOption("tabbycat.clean_names")
218 | 
219 |     options(tabbycat.clean_names = FALSE)
220 |     expected <- tibble::tibble(
221 |         Cyl = c(4, 6, 8, NA),
222 |         n = c(11, 6, 14, 1),
223 |         na = c(0, 0, 0, 0),
224 |         mean = c(26.663636, 19.5333333, 15.10000, 21.00000),
225 |         sd = c(4.50982765, 1.47196014, 2.56004808, NA),
226 |         min = c(21.4, 17.8, 10.4, 21.0),
227 |         lq = c(22.800, 18.375, 14.400, 21.000),
228 |         med = c(26.00, 19.45, 15.20, 21.00),
229 |         uq = c(30.400, 20.675, 16.250, 21.000),
230 |         max = c(33.9, 21.4, 19.2, 21.0))
231 |     observed <- cat_summarise(data, "Cyl", "mpg")
232 |     expect_equal(observed, expected)
233 | 
234 |     options(tabbycat.clean_names = TRUE)
235 |     expected <- tibble::tibble(
236 |         cyl = c(4, 6, 8, NA),
237 |         n = c(11, 6, 14, 1),
238 |         na = c(0, 0, 0, 0),
239 |         mean = c(26.663636, 19.5333333, 15.10000, 21.00000),
240 |         sd = c(4.50982765, 1.47196014, 2.56004808, NA),
241 |         min = c(21.4, 17.8, 10.4, 21.0),
242 |         lq = c(22.800, 18.375, 14.400, 21.000),
243 |         med = c(26.00, 19.45, 15.20, 21.00),
244 |         uq = c(30.400, 20.675, 16.250, 21.000),
245 |         max = c(33.9, 21.4, 19.2, 21.0))
246 |     observed <- cat_summarise(data, "Cyl", "mpg")
247 |     expect_equal(observed, expected)
248 | 
249 |     options(tabbycat.clean_names = restore_option)
250 | })
251 | 
252 | test_that("cat_summarize is an alias for cat_summarise", {
253 | 
254 |     summarise <- cat_summarise(data, "cyl", "mpg")
255 |     summarize <- cat_summarize(data, "cyl", "mpg")
256 |     expect_equal(summarise, summarize)
257 | })
258 | 


--------------------------------------------------------------------------------
/tests/testthat/test_vcount.R:
--------------------------------------------------------------------------------
  1 | # Test vcount.R
  2 | 
  3 | # Setup -----------------------------------------------------------------------
  4 | 
  5 | cat <- c("a", "b", "b", "c", "c", "c", "d", "d", "d", "d")
  6 | 
  7 | # Tests: cat_vcount -----------------------------------------------------------
  8 | 
  9 | test_that("cat_vcount rejects a cat argument that is not a vector", {
 10 | 
 11 |     msg <- "The \"cat\" argument is not a vector."
 12 |     expect_error(cat_vcount(NULL), msg)
 13 |     expect_error(cat_vcount(list()), msg)
 14 |     expect_error(cat_vcount(data.frame()), msg)
 15 | })
 16 | 
 17 | test_that("cat_vcount rejects a cat argument that is an empty factor", {
 18 | 
 19 |     msg <- "The \"cat\" argument is empty."
 20 |     expect_error(cat_vcount(factor()), msg)
 21 | })
 22 | 
 23 | test_that("cat_vcount rejects invalid na.rm arguments", {
 24 | 
 25 |     msg <- "Invalid \"na.rm\" argument. Must be either TRUE or FALSE."
 26 |     expect_error(cat_vcount(cat, na.rm = NULL), msg)
 27 |     expect_error(cat_vcount(cat, na.rm = NA), msg)
 28 |     expect_error(cat_vcount(cat, na.rm = 1), msg)
 29 |     expect_error(cat_vcount(cat, na.rm = ""), msg)
 30 |     expect_error(cat_vcount(cat, na.rm = 1:10), msg)
 31 |     expect_error(cat_vcount(cat, na.rm = LETTERS[1:10]), msg)
 32 |     expect_error(cat_vcount(cat, na.rm = c(TRUE, FALSE)), msg)
 33 |     expect_error(cat_vcount(cat, na.rm = list()), msg)
 34 | })
 35 | 
 36 | test_that("cat_vcount rejects invalid clean_names arguments", {
 37 | 
 38 |     msg <- "Invalid \"clean_names\" argument. Must be either TRUE or FALSE."
 39 |     expect_error(cat_vcount(cat, clean_names = NULL), msg)
 40 |     expect_error(cat_vcount(cat, clean_names = NA), msg)
 41 |     expect_error(cat_vcount(cat, clean_names = 1), msg)
 42 |     expect_error(cat_vcount(cat, clean_names = ""), msg)
 43 |     expect_error(cat_vcount(cat, clean_names = 1:10), msg)
 44 |     expect_error(cat_vcount(cat, clean_names = LETTERS[1:10]), msg)
 45 |     expect_error(cat_vcount(cat, clean_names = c(TRUE, FALSE)), msg)
 46 |     expect_error(cat_vcount(cat, clean_names = list()), msg)
 47 | })
 48 | 
 49 | test_that("cat_vcount rejects invalid only arguments", {
 50 | 
 51 |     msg <- "Invalid \"only\" argument. Must be a character vector of length one."
 52 |     expect_error(cat_vcount(cat, only = NULL), msg)
 53 |     expect_error(cat_vcount(cat, only = NA), msg)
 54 |     expect_error(cat_vcount(cat, only = 1), msg)
 55 |     expect_error(cat_vcount(cat, only = TRUE), msg)
 56 |     expect_error(cat_vcount(cat, only = 1:10), msg)
 57 |     expect_error(cat_vcount(cat, only = LETTERS[1:10]), msg)
 58 |     expect_error(cat_vcount(cat, only = c(TRUE, FALSE)), msg)
 59 |     expect_error(cat_vcount(cat, only = list()), msg)
 60 | })
 61 | 
 62 | test_that("cat_vcount returns expected data with defaults", {
 63 | 
 64 |     expected <- tibble::tibble(
 65 |         cat = letters[4:1],
 66 |         number = 4:1,
 67 |         percent = number / sum(number))
 68 |     observed <- cat_vcount(cat)
 69 |     expect_equal(observed, expected)
 70 | })
 71 | 
 72 | test_that("cat_vcount returns expected data with a valid na.rm argument", {
 73 | 
 74 |     cat <- c(cat, NA)
 75 | 
 76 |     expected <- tibble::tibble(
 77 |         cat = c(letters[4:1], NA),
 78 |         number = as.integer(c(4:1, 1)),
 79 |         percent = number / sum(number))
 80 |     observed <- cat_vcount(cat)
 81 |     expect_equal(observed[2:3], expected[2:3])
 82 |     expect_equal(observed[1:4, ], expected[1:4, ])
 83 |     expect_equal(is.na(observed[[5, 1]]), TRUE)
 84 | 
 85 |     expected <- tibble::tibble(
 86 |         cat = c(letters[4:1], NA),
 87 |         number = as.integer(c(4:1, 1)),
 88 |         percent = number / sum(number))
 89 |     observed <- cat_vcount(cat, na.rm = FALSE)
 90 |     expect_equal(observed[2:3], expected[2:3])
 91 |     expect_equal(observed[1:4, ], expected[1:4, ])
 92 |     expect_equal(is.na(observed[[5, 1]]), TRUE)
 93 | 
 94 |     expected <- tibble::tibble(
 95 |         cat = letters[4:1],
 96 |         number = 4:1,
 97 |         percent = number / sum(number))
 98 |     observed <- cat_vcount(cat, na.rm = TRUE)
 99 |     expect_equal(observed, expected)
100 | })
101 | 
102 | test_that("cat_vcount returns expected data with a valid only argument", {
103 | 
104 |     expected <- tibble::tibble(
105 |         cat = letters[4:1],
106 |         number = 4:1,
107 |         percent = number / sum(number))
108 |     observed <- cat_vcount(cat, only = "ignore")
109 |     expect_equal(observed, expected)
110 | 
111 |     expected_number <- tibble::tibble(
112 |         cat = letters[4:1],
113 |         number = 4:1)
114 |     observed <- cat_vcount(cat, only = "n")
115 |     expect_equal(observed, expected_number)
116 | 
117 |     observed <- cat_vcount(cat, only = "number")
118 |     expect_equal(observed, expected_number)
119 | 
120 |     observed <- cat_vcount(cat, only = " number ")
121 |     expect_equal(observed, expected_number)
122 | 
123 |     expected_percent <- tibble::tibble(
124 |         cat = letters[4:1],
125 |         percent = 4:1 / sum(4:1))
126 | 
127 |     observed <- cat_vcount(cat, only = "p")
128 |     expect_equal(observed, expected_percent)
129 | 
130 |     observed <- cat_vcount(cat, only = "percent")
131 |     expect_equal(observed, expected_percent)
132 | 
133 |     observed <- cat_vcount(cat, only = " percent ")
134 |     expect_equal(observed, expected_percent)
135 | })
136 | 
137 | test_that("cat_count returns correct data with a valid clean_names argument", {
138 | 
139 |     Cat <- cat
140 | 
141 |     expected <- tibble::tibble(
142 |         cat = letters[4:1],
143 |         number = 4:1,
144 |         percent = number / sum(number))
145 |     observed <- cat_vcount(Cat, clean_names = TRUE)
146 |     expect_equal(observed, expected)
147 | 
148 |     expected <- tibble::tibble(
149 |         Cat = letters[4:1],
150 |         number = 4:1,
151 |         percent = number / sum(number))
152 |     observed <- cat_vcount(Cat, clean_names = FALSE)
153 |     expect_equal(observed, expected)
154 | })
155 | 
156 | test_that("cat_vcount uses option for default clean_names argument", {
157 | 
158 |     Cat <- cat
159 |     restore_option <- getOption("tabbycat.clean_names")
160 | 
161 |     options(tabbycat.clean_names = TRUE)
162 |     expected <- tibble::tibble(
163 |         cat = letters[4:1],
164 |         number = 4:1,
165 |         percent = number / sum(number))
166 |     observed <- cat_vcount(Cat)
167 |     expect_equal(observed, expected)
168 | 
169 |     options(tabbycat.clean_names = FALSE)
170 |     expected <- tibble::tibble(
171 |         Cat = letters[4:1],
172 |         number = 4:1,
173 |         percent = number / sum(number))
174 |     observed <- cat_vcount(Cat)
175 |     expect_equal(observed, expected)
176 | 
177 |     options(tabbycat.clean_names = restore_option)
178 | })
179 | 
180 | 


--------------------------------------------------------------------------------