├── .Rbuildignore ├── .github ├── .gitignore └── workflows │ ├── R-CMD-check.yaml │ └── test-coverage.yaml ├── .gitignore ├── DESCRIPTION ├── LICENSE ├── LICENSE.md ├── NAMESPACE ├── NEWS.md ├── R ├── create_dictionary.R ├── summarise_variable.R └── utils.R ├── README.Rmd ├── README.md ├── codecov.yml ├── cran-comments.md ├── datadictionary.Rproj ├── man ├── create_dictionary.Rd ├── mode_stat.Rd └── summarise_variable.Rd ├── revdep ├── .gitignore ├── README.md ├── cran.md ├── email.yml ├── failures.md └── problems.md └── tests ├── testthat.R └── testthat ├── test-create_dictionary.R ├── test-summarise_variable.R └── testdata ├── tester.rds └── tester_no_error.rds /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^README\.Rmd$ 4 | ^LICENSE\.md$ 5 | ^\.github$ 6 | ^cran-comments\.md$ 7 | ^CRAN-SUBMISSION$ 8 | ^codecov\.yml$ 9 | ^revdep$ 10 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: R-CMD-check 10 | 11 | jobs: 12 | R-CMD-check: 13 | runs-on: ${{ matrix.config.os }} 14 | 15 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 16 | 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | config: 21 | - {os: macOS-latest, r: 'release'} 22 | - {os: windows-latest, r: 'release'} 23 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 24 | - {os: ubuntu-latest, r: 'release'} 25 | - {os: ubuntu-latest, r: 'oldrel-1'} 26 | 27 | env: 28 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 29 | R_KEEP_PKG_SOURCE: yes 30 | 31 | steps: 32 | - uses: actions/checkout@v2 33 | 34 | - uses: r-lib/actions/setup-pandoc@v2 35 | 36 | - uses: r-lib/actions/setup-r@v2 37 | with: 38 | r-version: ${{ matrix.config.r }} 39 | http-user-agent: ${{ matrix.config.http-user-agent }} 40 | use-public-rspm: true 41 | 42 | - uses: r-lib/actions/setup-r-dependencies@v2 43 | with: 44 | extra-packages: any::rcmdcheck 45 | needs: check 46 | 47 | - uses: r-lib/actions/check-r-package@v2 48 | with: 49 | upload-snapshots: true 50 | -------------------------------------------------------------------------------- /.github/workflows/test-coverage.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: test-coverage 10 | 11 | jobs: 12 | test-coverage: 13 | runs-on: ubuntu-latest 14 | env: 15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 16 | 17 | steps: 18 | - uses: actions/checkout@v4 19 | 20 | - uses: r-lib/actions/setup-r@v2 21 | with: 22 | use-public-rspm: true 23 | 24 | - uses: r-lib/actions/setup-r-dependencies@v2 25 | with: 26 | extra-packages: any::covr 27 | needs: coverage 28 | 29 | - name: Test coverage 30 | run: | 31 | covr::codecov( 32 | quiet = FALSE, 33 | clean = FALSE, 34 | install_path = file.path(Sys.getenv("RUNNER_TEMP"), "package") 35 | ) 36 | shell: Rscript {0} 37 | 38 | - name: Show testthat output 39 | if: always() 40 | run: | 41 | ## -------------------------------------------------------------------- 42 | find ${{ runner.temp }}/package -name 'testthat.Rout*' -exec cat '{}' \; || true 43 | shell: bash 44 | 45 | - name: Upload test results 46 | if: failure() 47 | uses: actions/upload-artifact@v4 48 | with: 49 | name: coverage-test-failures 50 | path: ${{ runner.temp }}/package 51 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | 5 | # Session Data files 6 | .RData 7 | 8 | # User-specific files 9 | .Ruserdata 10 | 11 | # Example code in package build process 12 | *-Ex.R 13 | 14 | # Output files from R CMD build 15 | /*.tar.gz 16 | 17 | # Output files from R CMD check 18 | /*.Rcheck/ 19 | 20 | # RStudio files 21 | .Rproj.user/ 22 | 23 | # produced vignettes 24 | vignettes/*.html 25 | vignettes/*.pdf 26 | 27 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 28 | .httr-oauth 29 | 30 | # knitr and R markdown default cache directories 31 | *_cache/ 32 | /cache/ 33 | 34 | # Temporary 
files created by R markdown 35 | *.utf8.md 36 | *.knit.md 37 | 38 | # R Environment Variables 39 | .Renviron 40 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: datadictionary 2 | Title: Create a Data Dictionary 3 | Version: 1.0.1.9000 4 | Authors@R: 5 | person(given = "Bethany", 6 | family = "Jones", 7 | role = c("aut", "cre"), 8 | email = "r.data.nerd@gmail.com") 9 | Description: Creates a data dictionary from any dataframe or tibble in your R environment. 10 | You can opt to add variable labels. You can write the object directly to Excel. 11 | License: MIT + file LICENSE 12 | Encoding: UTF-8 13 | Language: en-GB 14 | Roxygen: list(markdown = TRUE) 15 | RoxygenNote: 7.3.2 16 | Imports: 17 | chron, 18 | dplyr, 19 | haven, 20 | labelled, 21 | lubridate, 22 | openxlsx, 23 | stats, 24 | tibble, 25 | tidyr, 26 | tidyselect, 27 | Suggests: 28 | covr, 29 | testthat (>= 3.0.0) 30 | Depends: 31 | R (>= 4.1.0) 32 | Config/testthat/edition: 3 33 | URL: https://github.com/DoctorBJones/datadictionary 34 | BugReports: https://github.com/DoctorBJones/datadictionary/issues 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2022 2 | COPYRIGHT HOLDER: Bethany Jones 3 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2022 Bethany Jones 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, 
and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(create_dictionary) 4 | export(summarise_variable) 5 | importFrom(chron,'as.times') 6 | importFrom(dplyr,'bind_rows') 7 | importFrom(dplyr,'mutate') 8 | importFrom(haven,'as_factor') 9 | importFrom(labelled,'var_label') 10 | importFrom(lubridate,'date') 11 | importFrom(openxlsx,'write.xlsx') 12 | importFrom(stats,'median') 13 | importFrom(tibble,'rownames_to_column') 14 | importFrom(tidyr,'pivot_longer') 15 | importFrom(tidyselect,'everything') 16 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # datadictionary (development version) 2 | 3 | # datadictionary 1.0.1 4 | 5 | * Moves code from magrittr to base pipe 6 | * Corrects error that occurred when summarising partially labelled data 7 | * Adds URLs to DESCRIPTION 8 | 9 | # datadictionary 1.0.0 10 | 11 | * Improves handling of vectors of class 'Date' 12 | * Improves 
# NEWS.md, continued from the previous line of this dump. The text shares
# physical lines with the R sources rewritten below, so it is kept here
# (commented out) rather than dropped:
#
#   handling of vectors of class 'difftime'
#   * Improves output for vectors of class 'times'
#   * Corrects occasional error with labelling
#   * Improves handling of vectors with multiple classes
#
#   # datadictionary 0.1.0
#
#   * Added a `NEWS.md` file to track changes to the package.
#   * Package renamed from `dd` to `datadictionary`

# ---- /R/create_dictionary.R ----

#' Create a data dictionary from any dataset
#'
#' Produces one block of rows per variable (plus an overall row/column count),
#' laid out in the columns `item`, `label`, `class`, `summary` and `value`.
#'
#' @param dataset The dataset you wish to summarise
#' @param id_var A variable/vector of variables that are identifiers (optional)
#' @param file The file path to write an Excel spreadsheet (optional)
#' @param var_labels A named vector of variable labels (optional)
#'
#' @return Either an Excel spreadsheet or a dataframe. When `file` is `NULL`
#'   the dictionary is returned as a dataframe; otherwise it is written to
#'   `file` with [openxlsx::write.xlsx()].
#'
#' @importFrom haven 'as_factor'
#' @importFrom openxlsx 'write.xlsx'
#' @importFrom labelled 'var_label'
#' @importFrom dplyr 'bind_rows'
#'
#' @examples
#'
#' # A simple dictionary printed to console
#' create_dictionary(esoph)
#'
#' # You can specify id variable/s
#' mtcars$id <- seq_len(nrow(mtcars))
#' create_dictionary(mtcars, id_var = "id")
#'
#' # You can also specify labels with a named vector
#' iris.labels <- c(Sepal.Length = "Sepal length in mm",
#'     Sepal.Width = "Sepal width in mm",
#'     Petal.Length = "Petal length in mm",
#'     Petal.Width = "Petal width in mm",
#'     Species = "Species of iris")
#' create_dictionary(iris, var_labels = iris.labels)
#'
#' @export
create_dictionary <- function(dataset,
                              id_var = NULL,
                              file = NULL,
                              var_labels = NULL) {

  # validate the arguments before doing any work
  if (!inherits(dataset, "data.frame")) {
    stop("You can only make a dictionary for a dataframe or tibble")
  }

  # the dot is part of the extension: "reportxlsx" is not an Excel file name
  if (!is.null(file) && !grepl("\\.xlsx$", file)) {
    stop("You can only write to Excel files with extension `.xlsx`")
  }

  if (!is.null(var_labels)) {
    labelled::var_label(dataset) <- var_labels
  }

  # identifier variables get the short id summary ...
  id_parts <- lapply(id_var, function(id) id_summary(dataset, id))

  # ... and are left out of the per-variable pass so they are not summarised
  # twice. drop = FALSE keeps a data frame even when a single column remains;
  # without it that last column was silently skipped.
  remaining <- dataset[, !names(dataset) %in% id_var, drop = FALSE]

  var_parts <- lapply(
    colnames(remaining),
    function(col) summarise_variable(remaining, col)
  )

  # overall summary first, then identifiers, then every other variable
  out <- dplyr::bind_rows(
    c(list(dataset_summary(dataset)), id_parts, var_parts)
  )

  if (is.null(file)) {
    return(out)
  }

  openxlsx::write.xlsx(out, file = file)
}

# ---- /R/summarise_variable.R ----

#' Summarise a single variable
#'
#' Chooses a summary based on the class of the column: all-missing columns,
#' factors, labelled data, dates/date-times, times, durations, numerics and
#' logicals each have their own layout; anything else is summarised as
#' character data.
#'
#' @param dataset The dataset with the variable you wish to summarise
#' @param column The column you wish to summarise as a quoted string
#'
#' @return A dataframe with a summary of the variable
#'
#' @examples
#' summarise_variable(mtcars, "mpg")
#'
#' summarise_variable(iris, "Species")
#'
#' @export
summarise_variable <- function(dataset, column) {

  var <- dataset[[column]]
  var_class <- class(var)

  # TRUE when the variable carries at least one of the given classes
  is_any <- function(...) any(c(...) %in% var_class)

  # the order of the checks matters: the first match wins
  if (all(is.na(var))) {
    allna_summary(dataset, column)
  } else if (is_any("factor")) {
    factor_summary(dataset, column)
  } else if (is_any("haven_labelled")) {
    label_summary(dataset, column)
  } else if (is_any("POSIXt", "Date")) {
    datetime_summary(dataset, column)
  } else if (is_any("times")) {
    times_summary(dataset, column)
  } else if (is_any("difftime", "hms", "ms", "hm")) {
    difftimes_summary(dataset, column)
  } else if (is_any("numeric", "integer", "double")) {
    numeric_summary(dataset, column)
  } else if (is_any("logical", "boolean")) {
    logical_summary(dataset, column)
  } else {
    character_summary(dataset, column)
  }
}

# ---- /R/utils.R ----

#' @importFrom tidyr 'pivot_longer'
#' @importFrom tidyselect 'everything'
#' @importFrom stats 'median'
#' @importFrom lubridate 'date'
#' @importFrom dplyr 'mutate'
#' @importFrom tibble 'rownames_to_column'
#' @importFrom chron 'as.times'


# Summarise a factor: one row per level, "<level> (<level number>)", with the
# count of observations in `value`, followed by a "missing" row.
# Warns when the factor has more than 10 levels.
factor_summary <- function(dataset, column) {
  var <- dataset[[column]]

  a <- as.data.frame(table(var))
  names(a) <- c("summary", "value")

  # throw a warning in case it should be numeric or character
  if (nrow(a) > 10) {
    msg <- paste0(column, " has more than 10 levels, did you want a character variable?")
    warning(msg)
  }

  # this creates the factor level with its value in parentheses
  # e.g. Strongly disagree (5)
  a <- a |>
    dplyr::mutate(summary = paste0(summary, " (", as.numeric(summary), ")"))

  # item, class and label are only shown on the first row of the variable
  a$item <- ""
  a$item[1] <- gsub('"', "", deparse(column))

  a$class <- ""
  a$class[1] <- paste(class(var), collapse = " ")

  var_label <- attr(var, "label")
  a$label <- ""
  a$label[1] <- if (is.null(var_label)) "No label" else var_label[1]

  a <- a[, c("item", "label", "class", "summary", "value")]
  a[nrow(a) + 1, ] <- c("", "", "", "missing", sum(is.na(var)))

  a$value <- as.character(a$value)
  a
}

# Summarise a numeric variable: mean, median, min and max (all rounded to two
# decimal places) plus the count of missing values.
numeric_summary <- function(dataset, column) {

  var <- dataset[[column]]

  # `digits` belongs to round(); it used to be passed to as.data.frame() and
  # as.numeric(), which ignored it, so mean and median lost their decimals.
  a <- as.data.frame(round(mean(var, na.rm = TRUE), digits = 2))
  names(a)[1] <- "mean"

  a$median <- as.numeric(round(stats::median(var, na.rm = TRUE), digits = 2))
  a$min <- round(min(var, na.rm = TRUE), digits = 2)
  a$max <- round(max(var, na.rm = TRUE), digits = 2)
  a$missing <- sum(is.na(var))

  a <- a |>
    tidyr::pivot_longer(cols = tidyselect::everything(),
                        names_to = "summary",
                        values_to = "value",
                        values_transform = list(value = as.character))

  # pivot_longer creates a tibble which actually messes with output
  a <- as.data.frame(a) # so coerce to df

  a$item <- ""
  a$item[1] <- gsub('"', "", deparse(column))

  a$class <- ""
  a$class[1] <- paste(class(var), collapse = " ")

  var_label <- attr(var, "label")
  a$label <- ""
  a$label[1] <- if (is.null(var_label)) "No label" else var_label[1]

  a <- a[, c("item", "label", "class", "summary", "value")]

  a$value <- as.character(a$value)
  a
}

# Summarise a character (or otherwise unhandled) variable: the number of
# unique responses and the count of missing values.
# Warns when there are fewer than 10 unique values.
character_summary <- function(dataset, column) {
  var <- dataset[[column]]

  a <- as.data.frame(length(unique(var)))
  names(a)[1] <- "unique responses"

  a$missing <- sum(is.na(var))

  a <- a |>
    tidyr::pivot_longer(cols = tidyselect::everything(), names_to = "summary")

  # first row holds the number of unique responses
  if (a$value[1] < 10) {
    msg <- paste0(column, " has fewer than 10 unique values, did you want a factor?")
    warning(msg)
  }

  a <- as.data.frame(a)

  a$item <- ""
  a$item[1] <- gsub('"', "", deparse(column))

  a$class <- ""
  a$class[1] <- paste(class(var), collapse = " ")

  var_label <- attr(var, "label")
  a$label <- ""
  a$label[1] <- if (is.null(var_label)) "No label" else var_label[1]

  a <- a[, c("item", "label", "class", "summary", "value")]

  a$value <- as.character(a$value)
  a
}


# Summarise a logical variable: one row per observed value with its count,
# followed by a "missing" row.
logical_summary <- function(dataset, column) {
  var <- dataset[[column]]

  a <- as.data.frame(table(var))
  names(a) <- c("summary", "value")

  a$item <- ""
  a$item[1] <- gsub('"', "", deparse(column))

  a$class <- ""
  a$class[1] <- paste(class(var), collapse = " ")

  var_label <- attr(var, "label")
  a$label <- ""
  a$label[1] <- if (is.null(var_label)) "No label" else var_label[1]

  a <- a[, c("item", "label", "class", "summary", "value")]

  # table() yields a factor; it must be character before "missing" is added
  a$summary <- as.character(a$summary)
  a[nrow(a) + 1, ] <- c("", "", "", "missing", sum(is.na(var)))

  a$value <- as.character(a$value)
  a
}


# Summarise a date or date-time variable on its date part: mean, mode, min,
# max and the count of missing values.
datetime_summary <- function(dataset, column) {
  var <- lubridate::date(dataset[[column]])

  a <- as.data.frame(as.character(mean(var, na.rm = TRUE)))
  names(a)[1] <- "mean"

  # mode_stat() hands back plain numbers, so turn them into dates again;
  # several modes are shown separated by a space
  date_mode <- as.Date(mode_stat(var), origin = "1970-01-01")
  a$mode <- paste(date_mode, collapse = " ")
  a$min <- as.character(min(var, na.rm = TRUE))
  a$max <- as.character(max(var, na.rm = TRUE))
  a$missing <- as.character(sum(is.na(dataset[[column]])))

  a <- a |>
    tidyr::pivot_longer(cols = tidyselect::everything(), names_to = "summary")
  a <- as.data.frame(a)

  a$item <- ""
  a$item[1] <- gsub('"', "", deparse(column))

  a$class <- ""
  a$class[1] <- paste(class(dataset[[column]]), collapse = " ")

  var_label <- attr(dataset[[column]], "label")
  a$label <- ""
  a$label[1] <- if (is.null(var_label)) "No label" else var_label[1]

  a <- a[, c("item", "label", "class", "summary", "value")]

  a$value <- as.character(a$value)
  a
}

# Summarise a variable of class 'times': mean, median, min, max and the count
# of missing values, each converted to character.
times_summary <- function(dataset, column) {
  var <- dataset[[column]]

  a <- as.data.frame(as.character(mean(var, na.rm = TRUE)))
  names(a)[1] <- "mean"

  a$median <- as.character(stats::median(var, na.rm = TRUE))
  a$min <- as.character(min(var, na.rm = TRUE))
  a$max <- as.character(max(var, na.rm = TRUE))
  a$missing <- as.character(sum(is.na(var)))

  a <- a |>
    tidyr::pivot_longer(cols = tidyselect::everything(), names_to = "summary")
  a <- as.data.frame(a)

  a$item <- ""
  a$item[1] <- gsub('"', "", deparse(column))

  a$class <- ""
  a$class[1] <- paste(class(var), collapse = " ")

  var_label <- attr(var, "label")
  a$label <- ""
  a$label[1] <- if (is.null(var_label)) "No label" else var_label[1]

  a <- a[, c("item", "label", "class", "summary", "value")]

  a$value <- as.character(a$value)
  a
}

# Summarise a labelled (haven_labelled) variable: one row per value shown as
# "<label> (<value>)" with its count, followed by a "missing" row. When the
# number of distinct values differs from the number of value labels, a
# warning is raised and the variable is summarised as numeric instead.
label_summary <- function(dataset, column) {
  var <- dataset[[column]]

  # NOTE(review): unique() counts NA as a value, so a fully labelled column
  # that also has missing values appears to take the numeric branch - confirm
  # that this is intended.
  if (length(unique(var)) != length(attr(var, "labels"))) {
    msg <- paste0(column, " has different numbers of labels and levels. It has been treated as numeric")
    warning(msg)

    return(numeric_summary(dataset = dataset, column = column))
  }

  # value labels as a lookup table: label text, value, "label (value)"
  label_values <-
    as.data.frame(attributes(var)$labels) |>
    tibble::rownames_to_column()

  names(label_values)[1] <- "label"
  names(label_values)[2] <- "value"

  label_values$summary <-
    paste0(label_values$label, " (", label_values$value, ")")

  # counts per observed value, matched to their labels
  a <- as.data.frame(table(var))
  names(a) <- c("num_val", "value")

  a <- merge(a, label_values, by.x = "num_val", by.y = "value")

  a$item <- ""
  a$item[1] <- gsub('"', "", deparse(column))

  a$class <- ""
  a$class[1] <- paste(class(var), collapse = " ")

  # from here on `label` holds the variable label, not the value labels
  var_label <- attr(var, "label")
  a$label <- ""
  a$label[1] <- if (is.null(var_label)) "No label" else var_label[1]

  a <- a[, c("item", "label", "class", "summary", "value")]
  a[nrow(a) + 1, ] <- c("", "", "", "missing", sum(is.na(var)))
  a$value <- as.character(a$value)

  a
}

# Summarise a duration-like variable (difftime, hms, ...): mean (rounded
# down), median, min, max and the count of missing values.
difftimes_summary <- function(dataset, column) {

  var <- dataset[[column]]

  a <- as.data.frame(floor(mean(var, na.rm = TRUE)))
  names(a)[1] <- "mean"

  a$median <- median(var, na.rm = TRUE)
  a$min <- min(var, na.rm = TRUE)
  a$max <- max(var, na.rm = TRUE)
  a$missing <- sum(is.na(var))

  a <- a |>
    pivot_longer(cols = everything(),
                 names_to = "summary",
                 values_to = "value",
                 values_transform = list(value = as.character))

  # pivot_longer creates a tibble which actually messes with output
  a <- as.data.frame(a) # so coerce to df

  a$item <- ""
  a$item[1] <- gsub('"', "", deparse(column))

  a$class <- ""
  a$class[1] <- paste(class(var), collapse = " ")

  var_label <- attr(var, "label")
  a$label <- ""
  a$label[1] <- if (is.null(var_label)) "No label" else var_label[1]

  a <- a[, c("item", "label", "class", "summary", "value")]

  a$value <- as.character(a$value)
  a
}


# Summarise an identifier variable: the number of unique values and the count
# of missing values. The label is always "Unique identifier".
id_summary <- function(dataset, column) {
  var <- dataset[[column]]

  a <- data.frame(
    item = gsub('"', "", deparse(column)),
    label = "Unique identifier",
    class = "",
    summary = "unique values",
    value = length(unique(var))
  )
  a[nrow(a) + 1, ] <- c("", "", "", "missing", sum(is.na(var)))

  a$value <- as.character(a$value)
  a
}

# Summarise a variable whose values are all missing: a single "missing" row
# whose value is the length of the variable.
allna_summary <- function(dataset, column) {
  var <- dataset[[column]]
  var_label <- attr(var, "label")

  # The data frame is the last expression on purpose: ending on an assignment
  # returned it invisibly, so nothing was printed at the console.
  data.frame(
    item = gsub('"', "", deparse(column)),
    label = if (is.null(var_label)) "No label" else var_label[1],
    class = paste(class(var), collapse = " "),
    summary = "missing",
    value = as.character(length(var))
  )
}

# Overall summary placed at the top of the dictionary: the number of rows and
# the number of columns in the dataset.
dataset_summary <- function(dataset) {
  a <- cbind(
    as.data.frame(nrow(dataset)),
    as.data.frame(ncol(dataset))
  )
  names(a) <- c("Rows in dataset", "Columns in dataset")

  a <- a |>
    pivot_longer(cols = everything(), names_to = "summary")
  a <- as.data.frame(a)

  a$item <- ""
  a$class <- ""
  a$label <- ""

  a <- a[, c("item", "label", "class", "summary", "value")]

  a$value <- as.character(a$value)
  a
}

#' Get the mode of a vector
#'
#' Missing values are ignored. When several values are tied for the highest
#' frequency, all of them are returned.
#'
#' @param x A vector
#' @param freq Boolean when TRUE returns the frequency of the mode
#' @return The most frequent value(s); with `freq = TRUE`, a data frame with
#'   the columns `freq` and `value`.
#' @keywords internal
mode_stat <- function(x, freq = FALSE) {
  # run lengths of the sorted values are the frequencies of each value
  runs <- rle(sort(as.vector(x)))
  counts <- data.frame(freq = runs$lengths, value = runs$values)

  cols <- if (freq) 1:2 else 2

  counts[which(counts$freq == max(counts$freq)), cols]
}

# ---- /README.Rmd ----
# Opening lines of README.Rmd. They share a physical line of this dump with
# the R code above, so they are kept here (commented out) rather than dropped:
#
#   ---
#   output: github_document
#   ---
#
#   ```{r, include = FALSE}
#   knitr::opts_chunk$set(
#     collapse = TRUE,
#     comment = "#>",
#     fig.path = "man/figures/README-",
#     out.width = "100%"
#   )
#   ```
#
#
#   [![CRAN status](https://www.r-pkg.org/badges/version/datadictionary)](https://cran.r-project.org/package=datadictionary)
[![R-CMD-check](https://github.com/DoctorBJones/datadictionary/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/DoctorBJones/datadictionary/actions/workflows/R-CMD-check.yaml) 17 | [![Codecov test coverage](https://codecov.io/gh/DoctorBJones/datadictionary/branch/main/graph/badge.svg)](https://app.codecov.io/gh/DoctorBJones/datadictionary?branch=main) 18 | 19 | 20 | # datadictionary 21 | 22 | The goal of `datadictionary` is to create a data dictionary from any dataframe or tibble in your R environment. While other packages exist I found they were complicated to use and/or the output wasn't what I was after. This package attempts to solve those problems by presenting tabular summaries of the dataset in a format that fits easily in a pane or screen, using a single line of code. 23 | 24 | It includes an overall summary of the dataset and at-a-glance summaries of each variable. All variables have a count of missing included, and different summaries are provided based on the data class. 25 | 26 | For factors, labelled data and logicals the summary will include the name of each level with the level number in parentheses where appropriate. A value for the count of units in each level is included. 27 | 28 | For dates, integers and other numeric types of data the summary includes statistical summaries such as mean, median, mode, minimum and maximum. A value for each is included in the table. 29 | 30 | Character variables include only a count of unique values and missing values. This is the default so if you include a class of data that isn't yet implemented you should get this output. 31 | 32 | You can nominate one or more identifier variables, for example individuals and clusters, so you only get a count of unique and missing values rather than nonsense numeric summaries. 33 | 34 | You can also include a vector to add labels if you want descriptions included in the document. Lastly, you can opt for the output to write directly to Excel. 
35 | 36 | 37 | ## Installation 38 | 39 | You can install the current version of `datadictionary` from CRAN using: 40 | 41 | ``` r 42 | install.packages("datadictionary") 43 | ``` 44 | 45 | You can install the development version of `datadictionary` from [GitHub](https://github.com/) with: 46 | 47 | ``` r 48 | # install.packages("devtools") 49 | devtools::install_github("DoctorBJones/datadictionary") 50 | ``` 51 | 52 | ## Example 53 | 54 | You can print a basic data dictionary directly to your console or assign it to an object in your environment: 55 | 56 | ```{r} 57 | library(datadictionary) 58 | 59 | create_dictionary(esoph) 60 | 61 | esoph_dictionary <- create_dictionary(esoph) 62 | ``` 63 | 64 | 65 | You specify one or more identifier variables by passing a quoted string or vector of quoted strings to `id_var`. This is useful if you have hierarchical data with identifiers for, for example, individuals, clusters or blocks. 66 | 67 | ```{r} 68 | 69 | # create fake id variables 70 | mtcars$id1 <- 1:nrow(mtcars) 71 | mtcars$id2 <- mtcars$id1*10 72 | 73 | create_dictionary(mtcars, id_var = c("id1", "id2")) 74 | 75 | ``` 76 | You can also optionally add labels for unlabelled variables. You need to pass a named vector to `var_labels` where the names 77 | correspond to columns in your dataset. The vector must contain one label for each column in your dataset. 78 | 79 | ```{r} 80 | 81 | # Create labels as a named vector. 82 | iris.labels <- c(Sepal.Length = "Sepal length in mm", 83 | Sepal.Width = "Sepal width in mm", 84 | Petal.Length = "Petal length in mm", 85 | Petal.Width = "Petal width in mm", 86 | Species = "Species of iris") 87 | 88 | create_dictionary(iris, var_labels = iris.labels) 89 | ``` 90 | 91 | You can also write directly to Excel from the `create_dictionary` function if you pass a file path and name as a quoted string to the `file` parameter. There is no visible output for this use. 
92 | 93 | ```{r, eval = FALSE} 94 | 95 | create_dictionary(ChickWeight, file = "chickweight_dictionary.xlsx") 96 | 97 | ``` 98 | 99 | The package also includes a function to create a summary of a single variable in your dataset. There are no other arguments to this function. 100 | ```{r} 101 | 102 | summarise_variable(iris, "Sepal.Length") 103 | 104 | summarise_variable(ChickWeight, "Diet") 105 | ``` 106 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | [![CRAN status](https://www.r-pkg.org/badges/version/datadictionary)](https://cran.r-project.org/package=datadictionary) 4 | [![R-CMD-check](https://github.com/DoctorBJones/datadictionary/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/DoctorBJones/datadictionary/actions/workflows/R-CMD-check.yaml) 5 | [![Codecov test 6 | coverage](https://codecov.io/gh/DoctorBJones/datadictionary/branch/main/graph/badge.svg)](https://app.codecov.io/gh/DoctorBJones/datadictionary?branch=main) 7 | 8 | 9 | # datadictionary 10 | 11 | The goal of `datadictionary` is to create a data dictionary from any 12 | dataframe or tibble in your R environment. While other packages exist I 13 | found they were complicated to use and/or the output wasn’t what I was 14 | after. This package attempts to solve those problems by presenting 15 | tabular summaries of the dataset in a format that fits easily in a pane 16 | or screen, using a single line of code. 17 | 18 | It includes an overall summary of the dataset and at-a-glance summaries 19 | of each variable. All variables have a count of missing included, and 20 | different summaries are provided based on the data class. 21 | 22 | For factors, labelled data and logicals the summary will include the 23 | name of each level with the level number in parentheses where 24 | appropriate. A value for the count of units in each level is included. 
25 | 26 | For dates, integers and other numeric types of data the summary includes 27 | statistical summaries such as mean, median, mode, minimum and maximum. A 28 | value for each is included in the table. 29 | 30 | Character variables include only a count of unique values and missing 31 | values. This is the default so if you include a class of data that isn’t 32 | yet implemented you should get this output. 33 | 34 | You can nominate one or more identifier variables, for example 35 | individuals and clusters, so you only get a count of unique and missing 36 | values rather than nonsense numeric summaries. 37 | 38 | You can also include a vector to add labels if you want descriptions 39 | included in the document. Lastly, you can opt for the output to write 40 | directly to Excel. 41 | 42 | ## Installation 43 | 44 | You can install the current version of `datadictionary` from CRAN using: 45 | 46 | ``` r 47 | install.packages("datadictionary") 48 | ``` 49 | 50 | You can install the development version of `datadictionary` from 51 | [GitHub](https://github.com/) with: 52 | 53 | ``` r 54 | # install.packages("devtools") 55 | devtools::install_github("DoctorBJones/datadictionary") 56 | ``` 57 | 58 | ## Example 59 | 60 | You can print a basic data dictionary directly to your console or assign 61 | it to an object in your environment: 62 | 63 | ``` r 64 | library(datadictionary) 65 | 66 | create_dictionary(esoph) 67 | #> item label class summary value 68 | #> 1 Rows in dataset 88 69 | #> 2 Columns in dataset 5 70 | #> 3 agegp No label ordered factor 25-34 (1) 15 71 | #> 4 35-44 (2) 15 72 | #> 5 45-54 (3) 16 73 | #> 6 55-64 (4) 16 74 | #> 7 65-74 (5) 15 75 | #> 8 75+ (6) 11 76 | #> 9 missing 0 77 | #> 10 alcgp No label ordered factor 0-39g/day (1) 23 78 | #> 11 40-79 (2) 23 79 | #> 12 80-119 (3) 21 80 | #> 13 120+ (4) 21 81 | #> 14 missing 0 82 | #> 15 tobgp No label ordered factor 0-9g/day (1) 24 83 | #> 16 10-19 (2) 24 84 | #> 17 20-29 (3) 20 85 | #> 18 30+ (4) 20 86 | 
#> 19 missing 0 87 | #> 20 ncases No label numeric mean 2 88 | #> 21 median 1 89 | #> 22 min 0 90 | #> 23 max 17 91 | #> 24 missing 0 92 | #> 25 ncontrols No label numeric mean 9 93 | #> 26 median 4 94 | #> 27 min 0 95 | #> 28 max 60 96 | #> 29 missing 0 97 | 98 | esoph_dictionary <- create_dictionary(esoph) 99 | ``` 100 | 101 | You specify one or more identifier variables by passing a quoted string 102 | or vector of quoted strings to `id_var`. This is useful if you have 103 | hierarchical data, for example, and have identifiers for individuals, 104 | clusters or blocks. 105 | 106 | ``` r 107 | 108 | # create fake id variables 109 | mtcars$id1 <- 1:nrow(mtcars) 110 | mtcars$id2 <- mtcars$id1*10 111 | 112 | create_dictionary(mtcars, id_var = c("id1", "id2")) 113 | #> item label class summary value 114 | #> 1 Rows in dataset 32 115 | #> 2 Columns in dataset 13 116 | #> 3 id1 Unique identifier unique values 32 117 | #> 4 missing 0 118 | #> 5 id2 Unique identifier unique values 32 119 | #> 6 missing 0 120 | #> 7 mpg No label numeric mean 20 121 | #> 8 median 19 122 | #> 9 min 10.4 123 | #> 10 max 33.9 124 | #> 11 missing 0 125 | #> 12 cyl No label numeric mean 6 126 | #> 13 median 6 127 | #> 14 min 4 128 | #> 15 max 8 129 | #> 16 missing 0 130 | #> 17 disp No label numeric mean 231 131 | #> 18 median 196 132 | #> 19 min 71.1 133 | #> 20 max 472 134 | #> 21 missing 0 135 | #> 22 hp No label numeric mean 147 136 | #> 23 median 123 137 | #> 24 min 52 138 | #> 25 max 335 139 | #> 26 missing 0 140 | #> 27 drat No label numeric mean 4 141 | #> 28 median 4 142 | #> 29 min 2.76 143 | #> 30 max 4.93 144 | #> 31 missing 0 145 | #> 32 wt No label numeric mean 3 146 | #> 33 median 3 147 | #> 34 min 1.51 148 | #> 35 max 5.42 149 | #> 36 missing 0 150 | #> 37 qsec No label numeric mean 18 151 | #> 38 median 18 152 | #> 39 min 14.5 153 | #> 40 max 22.9 154 | #> 41 missing 0 155 | #> 42 vs No label numeric mean 0 156 | #> 43 median 0 157 | #> 44 min 0 158 | #> 45 max 1 159 | #> 46 
missing 0 160 | #> 47 am No label numeric mean 0 161 | #> 48 median 0 162 | #> 49 min 0 163 | #> 50 max 1 164 | #> 51 missing 0 165 | #> 52 gear No label numeric mean 4 166 | #> 53 median 4 167 | #> 54 min 3 168 | #> 55 max 5 169 | #> 56 missing 0 170 | #> 57 carb No label numeric mean 3 171 | #> 58 median 2 172 | #> 59 min 1 173 | #> 60 max 8 174 | #> 61 missing 0 175 | ``` 176 | 177 | You can also optionally add labels for unlabelled variables. You need to 178 | pass a named vector to `var_labels` where the names correspond to 179 | columns in your dataset. The vector must contain one label for each column of your 180 | dataset. 181 | 182 | ``` r 183 | 184 | # Create labels as a named vector. 185 | iris.labels <- c(Sepal.Length = "Sepal length in mm", 186 | Sepal.Width = "Sepal width in mm", 187 | Petal.Length = "Petal length in mm", 188 | Petal.Width = "Petal width in mm", 189 | Species = "Species of iris") 190 | 191 | create_dictionary(iris, var_labels = iris.labels) 192 | #> item label class summary value 193 | #> 1 Rows in dataset 150 194 | #> 2 Columns in dataset 5 195 | #> 3 Sepal.Length Sepal length in mm numeric mean 6 196 | #> 4 median 6 197 | #> 5 min 4.3 198 | #> 6 max 7.9 199 | #> 7 missing 0 200 | #> 8 Sepal.Width Sepal width in mm numeric mean 3 201 | #> 9 median 3 202 | #> 10 min 2 203 | #> 11 max 4.4 204 | #> 12 missing 0 205 | #> 13 Petal.Length Petal length in mm numeric mean 4 206 | #> 14 median 4 207 | #> 15 min 1 208 | #> 16 max 6.9 209 | #> 17 missing 0 210 | #> 18 Petal.Width Petal width in mm numeric mean 1 211 | #> 19 median 1 212 | #> 20 min 0.1 213 | #> 21 max 2.5 214 | #> 22 missing 0 215 | #> 23 Species Species of iris factor setosa (1) 50 216 | #> 24 versicolor (2) 50 217 | #> 25 virginica (3) 50 218 | #> 26 missing 0 219 | ``` 220 | 221 | You can also write directly to Excel from the `create_dictionary` 222 | function if you pass a file path and name as a quoted string to the 223 | `file` parameter. There is no visible output for this use. 
224 | 225 | ``` r 226 | 227 | create_dictionary(ChickWeight, file = "chickweight_dictionary.xlsx") 228 | ``` 229 | 230 | The package also includes a function to create a summary of a single 231 | variable in your dataset. There are no other arguments to this function. 232 | 233 | ``` r 234 | 235 | summarise_variable(iris, "Sepal.Length") 236 | #> item label class summary value 237 | #> 1 Sepal.Length No label numeric mean 6 238 | #> 2 median 6 239 | #> 3 min 4.3 240 | #> 4 max 7.9 241 | #> 5 missing 0 242 | 243 | summarise_variable(ChickWeight, "Diet") 244 | #> item label class summary value 245 | #> 1 Diet No label factor 1 (1) 220 246 | #> 2 2 (2) 120 247 | #> 3 3 (3) 120 248 | #> 4 4 (4) 118 249 | #> 5 missing 0 250 | ``` 251 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | 3 | coverage: 4 | status: 5 | project: 6 | default: 7 | target: auto 8 | threshold: 1% 9 | informational: true 10 | patch: 11 | default: 12 | target: auto 13 | threshold: 1% 14 | informational: true 15 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## R CMD check results 2 | 3 | 0 errors | 0 warnings | 0 notes 4 | 5 | 6 | ## revdepcheck results 7 | 8 | There are currently no downstream dependencies for this package 9 | -------------------------------------------------------------------------------- /datadictionary.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: 
Yes 17 | LineEndingConversion: Posix 18 | 19 | BuildType: Package 20 | PackageUseDevtools: Yes 21 | PackageInstallArgs: --no-multiarch --with-keep.source 22 | PackageRoxygenize: rd,collate,namespace 23 | -------------------------------------------------------------------------------- /man/create_dictionary.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/create_dictionary.R 3 | \name{create_dictionary} 4 | \alias{create_dictionary} 5 | \title{Create a data dictionary from any dataset} 6 | \usage{ 7 | create_dictionary(dataset, id_var = NULL, file = NULL, var_labels = NULL) 8 | } 9 | \arguments{ 10 | \item{dataset}{The dataset you wish to summarise} 11 | 12 | \item{id_var}{A variable/vector of variables that are identifiers (optional)} 13 | 14 | \item{file}{The file path to write an Excel spreadsheet (optional)} 15 | 16 | \item{var_labels}{A named vector of variable labels (optional)} 17 | } 18 | \value{ 19 | Either an Excel spreadsheet or a dataframe 20 | } 21 | \description{ 22 | Create a data dictionary from any dataset 23 | } 24 | \examples{ 25 | 26 | # A simple dictionary printed to console 27 | create_dictionary(esoph) 28 | 29 | # You can specify id variable/s 30 | mtcars$id <- 1:nrow(mtcars) 31 | create_dictionary(mtcars, id_var = "id") 32 | 33 | # You can also specify labels with a named vector 34 | iris.labels <- c(Sepal.Length = "Sepal length in mm", 35 | Sepal.Width = "Sepal width in mm", 36 | Petal.Length = "Petal length in mm", 37 | Petal.Width = "Petal width in mm", 38 | Species = "Species of iris") 39 | create_dictionary(iris, var_labels = iris.labels) 40 | 41 | } 42 | -------------------------------------------------------------------------------- /man/mode_stat.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in 
R/utils.R 3 | \name{mode_stat} 4 | \alias{mode_stat} 5 | \title{Get the mode of a vector} 6 | \usage{ 7 | mode_stat(x, freq = FALSE) 8 | } 9 | \arguments{ 10 | \item{x}{A vector} 11 | 12 | \item{freq}{Boolean when TRUE returns the frequency of the mode} 13 | } 14 | \description{ 15 | Get the mode of a vector 16 | } 17 | \keyword{internal} 18 | -------------------------------------------------------------------------------- /man/summarise_variable.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/summarise_variable.R 3 | \name{summarise_variable} 4 | \alias{summarise_variable} 5 | \title{Summarise a single variable} 6 | \usage{ 7 | summarise_variable(dataset, column) 8 | } 9 | \arguments{ 10 | \item{dataset}{The dataset with the variable you wish to summarise} 11 | 12 | \item{column}{The column you wish to summarise as a quoted string} 13 | } 14 | \value{ 15 | A dataframe with a summary of the variable 16 | } 17 | \description{ 18 | Summarise a single variable 19 | } 20 | \examples{ 21 | summarise_variable(mtcars, "mpg") 22 | 23 | summarise_variable(iris, "Species") 24 | } 25 | -------------------------------------------------------------------------------- /revdep/.gitignore: -------------------------------------------------------------------------------- 1 | checks 2 | library 3 | checks.noindex 4 | library.noindex 5 | cloud.noindex 6 | data.sqlite 7 | *.html 8 | -------------------------------------------------------------------------------- /revdep/README.md: -------------------------------------------------------------------------------- 1 | # Platform 2 | 3 | |field |value | 4 | |:--------|:-----------------------------------------| 5 | |version |R version 4.2.1 (2022-06-23 ucrt) | 6 | |os |Windows 10 x64 (build 22621) | 7 | |system |x86_64, mingw32 | 8 | |ui |RStudio | 9 | |language |(EN) | 10 | |collate |English_Australia.utf8 | 11 | 
|ctype |English_Australia.utf8 | 12 | |tz |Australia/Sydney | 13 | |date |2023-03-12 | 14 | |rstudio |2022.07.1+554 Spotted Wakerobin (desktop) | 15 | |pandoc |NA | 16 | 17 | # Dependencies 18 | 19 | |package |old |new |Δ | 20 | |:--------------|:-------|:----------|:--| 21 | |datadictionary |0.1.0 |0.1.0.9000 |* | 22 | |backports |1.4.1 |NA |* | 23 | |base64enc |0.1-3 |NA |* | 24 | |bit |4.0.5 |4.0.5 | | 25 | |bit64 |4.0.5 |4.0.5 | | 26 | |bslib |0.4.2 |NA |* | 27 | |cachem |1.0.7 |NA |* | 28 | |checkmate |2.1.0 |NA |* | 29 | |chron |2.3-60 |2.3-60 | | 30 | |cli |3.6.0 |3.6.0 | | 31 | |clipr |0.8.0 |0.8.0 | | 32 | |colorspace |2.1-0 |NA |* | 33 | |cpp11 |0.4.3 |0.4.3 | | 34 | |crayon |1.5.2 |1.5.2 | | 35 | |data.table |1.14.8 |1.14.8 | | 36 | |digest |0.6.31 |NA |* | 37 | |dplyr |1.1.0 |1.1.0 | | 38 | |ellipsis |0.3.2 |0.3.2 | | 39 | |evaluate |0.20 |NA |* | 40 | |fansi |1.0.4 |1.0.4 | | 41 | |farver |2.1.1 |NA |* | 42 | |fastmap |1.1.1 |NA |* | 43 | |forcats |1.0.0 |1.0.0 | | 44 | |Formula |1.2-5 |NA |* | 45 | |fs |1.6.1 |NA |* | 46 | |generics |0.1.3 |0.1.3 | | 47 | |ggplot2 |3.4.1 |NA |* | 48 | |glue |1.6.2 |1.6.2 | | 49 | |gridExtra |2.3 |NA |* | 50 | |gtable |0.3.1 |NA |* | 51 | |haven |2.5.2 |2.5.2 | | 52 | |highr |0.10 |NA |* | 53 | |Hmisc |5.0-1 |NA |* | 54 | |hms |1.1.2 |1.1.2 | | 55 | |htmlTable |2.4.1 |NA |* | 56 | |htmltools |0.5.4 |NA |* | 57 | |htmlwidgets |1.6.1 |NA |* | 58 | |isoband |0.2.7 |NA |* | 59 | |jquerylib |0.1.4 |NA |* | 60 | |jsonlite |1.8.4 |NA |* | 61 | |knitr |1.42 |NA |* | 62 | |labeling |0.4.2 |NA |* | 63 | |labelled |NA |2.10.0 |* | 64 | |lifecycle |1.0.3 |1.0.3 | | 65 | |lubridate |1.9.2 |1.9.2 | | 66 | |magrittr |2.0.3 |2.0.3 | | 67 | |memoise |2.0.1 |NA |* | 68 | |mime |0.12 |NA |* | 69 | |munsell |0.5.0 |NA |* | 70 | |openxlsx |4.2.5.2 |4.2.5.2 | | 71 | |pillar |1.8.1 |1.8.1 | | 72 | |pkgconfig |2.0.3 |2.0.3 | | 73 | |prettyunits |1.1.1 |1.1.1 | | 74 | |progress |1.2.2 |1.2.2 | | 75 | |purrr |1.0.1 |1.0.1 | | 76 | |R6 |2.5.1 
|2.5.1 | | 77 | |rappdirs |0.3.3 |NA |* | 78 | |RColorBrewer |1.1-3 |NA |* | 79 | |Rcpp |1.0.10 |1.0.10 | | 80 | |readr |2.1.4 |2.1.4 | | 81 | |rlang |1.0.6 |1.0.6 | | 82 | |rmarkdown |2.20 |NA |* | 83 | |rstudioapi |0.14 |NA |* | 84 | |sass |0.4.5 |NA |* | 85 | |scales |1.2.1 |NA |* | 86 | |stringi |1.7.12 |1.7.12 | | 87 | |stringr |1.5.0 |1.5.0 | | 88 | |tibble |3.2.0 |3.2.0 | | 89 | |tidyr |1.3.0 |1.3.0 | | 90 | |tidyselect |1.2.0 |1.2.0 | | 91 | |timechange |0.2.0 |0.2.0 | | 92 | |tinytex |0.44 |NA |* | 93 | |tzdb |0.3.0 |0.3.0 | | 94 | |utf8 |1.2.3 |1.2.3 | | 95 | |vctrs |0.5.2 |0.5.2 | | 96 | |viridis |0.6.2 |NA |* | 97 | |viridisLite |0.4.1 |NA |* | 98 | |vroom |1.6.1 |1.6.1 | | 99 | |withr |2.5.0 |2.5.0 | | 100 | |xfun |0.37 |NA |* | 101 | |yaml |2.3.7 |NA |* | 102 | |zip |2.2.2 |2.2.2 | | 103 | 104 | # Revdeps 105 | 106 | -------------------------------------------------------------------------------- /revdep/cran.md: -------------------------------------------------------------------------------- 1 | ## revdepcheck results 2 | 3 | We checked 0 reverse dependencies, comparing R CMD check results across CRAN and dev versions of this package. 4 | 5 | * We saw 0 new problems 6 | * We failed to check 0 packages 7 | 8 | -------------------------------------------------------------------------------- /revdep/email.yml: -------------------------------------------------------------------------------- 1 | release_date: ??? 2 | rel_release_date: ??? 3 | my_news_url: ??? 4 | release_version: ??? 5 | release_details: ??? 6 | -------------------------------------------------------------------------------- /revdep/failures.md: -------------------------------------------------------------------------------- 1 | *Wow, no problems at all. :)* -------------------------------------------------------------------------------- /revdep/problems.md: -------------------------------------------------------------------------------- 1 | *Wow, no problems at all. 
:)* -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | # This file is part of the standard setup for testthat. 2 | # It is recommended that you do not modify it. 3 | # 4 | # Where should you do additional test configuration? 5 | # Learn more about the roles of various files in: 6 | # * https://r-pkgs.org/tests.html 7 | # * https://testthat.r-lib.org/reference/test_package.html#special-files 8 | 9 | library(testthat) 10 | library(datadictionary) 11 | 12 | test_check("datadictionary") 13 | -------------------------------------------------------------------------------- /tests/testthat/test-create_dictionary.R: -------------------------------------------------------------------------------- 1 | 2 | testthat::test_that("errors", { 3 | 4 | testthat::expect_error( 5 | create_dictionary(c(1,2,3,4)), 6 | "You can only make a dictionary for a dataframe or tibble" 7 | ) 8 | 9 | testthat::expect_error( 10 | create_dictionary(iris, file = "test.csv"), 11 | "You can only write to Excel files with extension `.xlsx`" 12 | ) 13 | 14 | }) 15 | 16 | testthat::test_that("dictionary",{ 17 | 18 | # overall summary 19 | over <- create_dictionary( 20 | readRDS(file = testthat::test_path("testdata", 'tester_no_error.rds')), 21 | id_var = "id") 22 | 23 | testthat::expect_equal(over$summary[1], 24 | "Rows in dataset" 25 | ) 26 | 27 | testthat::expect_equal(over$value[2], 28 | "15" 29 | ) 30 | 31 | # dimensions of object 32 | len <- create_dictionary( 33 | readRDS(file = testthat::test_path("testdata", 'tester_no_error.rds')), 34 | id_var = "id") 35 | 36 | testthat::expect_equal(nrow(len), 37 | 67 38 | ) 39 | 40 | # id var properly summarised 41 | testthat::expect_equal( 42 | len$label[3], 43 | "Unique identifier" 44 | ) 45 | 46 | # labelling working correctly 47 | 48 | test_labels <- c( 49 | id = "ID", 50 | start_date = "Start date", 51 | end_date = "End 
date", 52 | gender = "Gender", 53 | age = "Age", 54 | state = "State", 55 | duration = "Time taken to complete survey", 56 | likert = "Agreement", 57 | speed = "How fast", 58 | suggestions = "Policy suggestions", 59 | lab_location = "Location", 60 | effective_date = "Date recorded", 61 | all_missing = "Missing data", 62 | time_recorded = "Time recorded", 63 | labelled_data = "Labelled" 64 | ) 65 | 66 | lab <- create_dictionary( 67 | readRDS(file = testthat::test_path("testdata", 'tester_no_error.rds')), 68 | id_var = "id", var_labels = test_labels) 69 | 70 | testthat::expect_equal( 71 | lab$label[5], "Start date" 72 | ) 73 | 74 | # writing to Excel 75 | xl <- create_dictionary( 76 | readRDS(file = testthat::test_path("testdata", 'tester_no_error.rds')), 77 | file = "test.xlsx") 78 | 79 | testthat::test_path( 80 | "~/test.xlsx" 81 | ) 82 | 83 | }) 84 | 85 | -------------------------------------------------------------------------------- /tests/testthat/test-summarise_variable.R: -------------------------------------------------------------------------------- 1 | testthat::test_that("length", { 2 | 3 | testthat::expect_length( 4 | summarise_variable( 5 | readRDS(file = testthat::test_path("testdata", 'tester.rds')), 6 | "start_date"), 7 | 5 8 | ) 9 | 10 | testthat::expect_equal( 11 | nrow(summarise_variable( 12 | readRDS(file = testthat::test_path("testdata", 'tester.rds')), 13 | "lab_location")), 14 | 3 15 | ) 16 | 17 | }) 18 | 19 | # Errors 20 | 21 | testthat::test_that("error", { 22 | 23 | testthat::expect_warning( 24 | summarise_variable( 25 | readRDS(file = testthat::test_path("testdata", 'tester.rds')), 26 | "bad_factor"), 27 | "bad_factor has more than 10 levels, did you want a character variable?" 28 | ) 29 | 30 | testthat::expect_warning( 31 | summarise_variable( 32 | readRDS(file = testthat::test_path("testdata", 'tester.rds')), 33 | "bad_labels"), 34 | "bad_labels has different numbers of labels and levels. 
It has been treated as numeric" 35 | ) 36 | 37 | }) 38 | 39 | # test each data class 40 | testthat::test_that("classes", { 41 | 42 | #'Date' class 43 | sd <- summarise_variable( 44 | readRDS(file = testthat::test_path("testdata",'tester.rds')), 45 | "start_date") 46 | 47 | testthat::expect_equal( 48 | sd$value[1], 49 | "2022-02-17" 50 | ) 51 | 52 | # POSIX date class 53 | ed <- summarise_variable( 54 | readRDS(file = testthat::test_path("testdata", 'tester.rds')), 55 | "end_date") 56 | 57 | testthat::expect_equal( 58 | ed$value[2], 59 | "2022-01-20 2022-04-18 2022-04-22" 60 | ) 61 | 62 | # factor class 63 | g <- summarise_variable( 64 | readRDS(file = testthat::test_path("testdata", 'tester.rds')), 65 | "gender") 66 | 67 | testthat::expect_equal( 68 | g$summary[1], 69 | "Female (1)" 70 | ) 71 | 72 | # integer 73 | a <- summarise_variable( 74 | readRDS(file = testthat::test_path("testdata", 'tester.rds')), 75 | "age") 76 | 77 | testthat::expect_equal( 78 | a$value[4], 79 | "49" 80 | ) 81 | 82 | # haven labelled 83 | s <- summarise_variable( 84 | readRDS(file = testthat::test_path("testdata", 'tester.rds')), 85 | "state") 86 | 87 | testthat::expect_equal( 88 | s$summary[3], 89 | "Qld (3)" 90 | ) 91 | 92 | # haven partially labelled 93 | p <- summarise_variable( 94 | readRDS(file = testthat::test_path("testdata", 'tester.rds')), 95 | "bad_labels") 96 | 97 | testthat::expect_equal( 98 | p$summary[1], 99 | "mean" 100 | ) 101 | 102 | # difftime 103 | d <- summarise_variable( 104 | readRDS(file = testthat::test_path("testdata", 'tester.rds')), 105 | "duration") 106 | 107 | testthat::expect_equal( 108 | d$value[2], 109 | "20" 110 | ) 111 | 112 | # ordered factor 113 | l <- summarise_variable( 114 | readRDS(file = testthat::test_path("testdata", 'tester.rds')), 115 | "likert") 116 | 117 | testthat::expect_equal( 118 | l$summary[3], 119 | "Disagree (3)" 120 | ) 121 | 122 | # double 123 | d <- summarise_variable( 124 | readRDS(file = testthat::test_path("testdata", 
'tester.rds')), 125 | "speed") 126 | 127 | testthat::expect_equal( 128 | d$value[3], 129 | "3.83" 130 | ) 131 | 132 | # character 133 | 134 | testthat::expect_warning( 135 | summarise_variable(readRDS(file = testthat::test_path("testdata",'tester.rds')), "comments"), 136 | "comments has fewer than 10 unique values, did you want a factor?" 137 | ) 138 | 139 | c <- summarise_variable( 140 | readRDS(file = testthat::test_path("testdata", 'tester.rds')), 141 | "suggestions") 142 | testthat::expect_equal( 143 | c$summary, 144 | c("unique responses", "missing") 145 | ) 146 | 147 | # logical 148 | log <- summarise_variable( 149 | readRDS(file = testthat::test_path("testdata", 'tester.rds')), 150 | "lab_location") 151 | 152 | testthat::expect_equal( 153 | log$summary[2], 154 | "TRUE" 155 | ) 156 | 157 | # datetime 158 | dttm <- summarise_variable( 159 | readRDS(file = testthat::test_path("testdata", 'tester.rds')), 160 | "effective_date") 161 | 162 | testthat::expect_equal( 163 | dttm$value[1], 164 | "2022-04-13" 165 | ) 166 | 167 | # all missing values 168 | nas <- summarise_variable( 169 | readRDS(file = testthat::test_path("testdata", 'tester.rds')), 170 | "all_missing") 171 | 172 | testthat::expect_equal( 173 | nas$value[1], 174 | "11" 175 | ) 176 | 177 | # times 178 | time <- summarise_variable( 179 | readRDS(file = testthat::test_path("testdata", 'tester.rds')), 180 | "time_recorded") 181 | 182 | testthat::expect_equal( 183 | time$value[4], 184 | "21:36:52" 185 | ) 186 | 187 | }) 188 | 189 | testthat::test_that("NA and mode", { 190 | 191 | testthat::expect_equal( 192 | nrow(summarise_variable( 193 | readRDS(file = testthat::test_path("testdata", 'tester.rds')), 194 | "all_missing")), 195 | 1 196 | ) 197 | 198 | tester <- readRDS(file = testthat::test_path("testdata", 'tester.rds')) 199 | 200 | testthat::expect_equal( 201 | mode_stat(tester$start_date), 202 | c(18996, 19084) 203 | ) 204 | 205 | }) 206 | 
-------------------------------------------------------------------------------- /tests/testthat/testdata/tester.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DoctorBJones/datadictionary/16a1a2dd4754e4d6e1c35759de43a03caa7974f2/tests/testthat/testdata/tester.rds -------------------------------------------------------------------------------- /tests/testthat/testdata/tester_no_error.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DoctorBJones/datadictionary/16a1a2dd4754e4d6e1c35759de43a03caa7974f2/tests/testthat/testdata/tester_no_error.rds --------------------------------------------------------------------------------