├── .Rbuildignore
├── .github
    ├── .gitignore
    └── workflows
    │   ├── R-CMD-check.yaml
    │   └── test-coverage.yaml
├── .gitignore
├── DESCRIPTION
├── LICENSE
├── LICENSE.md
├── NAMESPACE
├── NEWS.md
├── R
    ├── create_dictionary.R
    ├── summarise_variable.R
    └── utils.R
├── README.Rmd
├── README.md
├── codecov.yml
├── cran-comments.md
├── datadictionary.Rproj
├── man
    ├── create_dictionary.Rd
    ├── mode_stat.Rd
    └── summarise_variable.Rd
├── revdep
    ├── .gitignore
    ├── README.md
    ├── cran.md
    ├── email.yml
    ├── failures.md
    └── problems.md
└── tests
    ├── testthat.R
    └── testthat
        ├── test-create_dictionary.R
        ├── test-summarise_variable.R
        └── testdata
            ├── tester.rds
            └── tester_no_error.rds


/.Rbuildignore:
--------------------------------------------------------------------------------
 1 | ^.*\.Rproj$
 2 | ^\.Rproj\.user$
 3 | ^README\.Rmd$
 4 | ^LICENSE\.md$
 5 | ^\.github$
 6 | ^cran-comments\.md$
 7 | ^CRAN-SUBMISSION$
 8 | ^codecov\.yml$
 9 | ^revdep$
10 | 


--------------------------------------------------------------------------------
/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | 


--------------------------------------------------------------------------------
/.github/workflows/R-CMD-check.yaml:
--------------------------------------------------------------------------------
 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 3 | on:
 4 |   push:
 5 |     branches: [main, master]
 6 |   pull_request:
 7 |     branches: [main, master]
 8 | 
 9 | name: R-CMD-check
10 | 
11 | jobs:
12 |   R-CMD-check:
13 |     runs-on: ${{ matrix.config.os }}
14 | 
15 |     name: ${{ matrix.config.os }} (${{ matrix.config.r }})
16 | 
17 |     strategy:
18 |       fail-fast: false
19 |       matrix:
20 |         config:
21 |           - {os: macOS-latest,   r: 'release'}
22 |           - {os: windows-latest, r: 'release'}
23 |           - {os: ubuntu-latest,   r: 'devel', http-user-agent: 'release'}
24 |           - {os: ubuntu-latest,   r: 'release'}
25 |           - {os: ubuntu-latest,   r: 'oldrel-1'}
26 | 
27 |     env:
28 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
29 |       R_KEEP_PKG_SOURCE: yes
30 | 
31 |     steps:
32 |       - uses: actions/checkout@v2
33 | 
34 |       - uses: r-lib/actions/setup-pandoc@v2
35 | 
36 |       - uses: r-lib/actions/setup-r@v2
37 |         with:
38 |           r-version: ${{ matrix.config.r }}
39 |           http-user-agent: ${{ matrix.config.http-user-agent }}
40 |           use-public-rspm: true
41 | 
42 |       - uses: r-lib/actions/setup-r-dependencies@v2
43 |         with:
44 |           extra-packages: any::rcmdcheck
45 |           needs: check
46 | 
47 |       - uses: r-lib/actions/check-r-package@v2
48 |         with:
49 |           upload-snapshots: true
50 | 


--------------------------------------------------------------------------------
/.github/workflows/test-coverage.yaml:
--------------------------------------------------------------------------------
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
#
# Runs covr::codecov() on every push / pull request targeting main or master.
on:
  push:
    branches: [main, master]
  pull_request:
    branches: [main, master]

name: test-coverage

jobs:
  test-coverage:
    runs-on: ubuntu-latest
    env:
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}

    steps:
      - uses: actions/checkout@v4

      - uses: r-lib/actions/setup-r@v2
        with:
          use-public-rspm: true

      # covr is requested in addition to the package's own dependencies
      - uses: r-lib/actions/setup-r-dependencies@v2
        with:
          extra-packages: any::covr
          needs: coverage

      # The install path under RUNNER_TEMP is the directory the two steps
      # below read from.
      - name: Test coverage
        run: |
          covr::codecov(
            quiet = FALSE,
            clean = FALSE,
            install_path = file.path(Sys.getenv("RUNNER_TEMP"), "package")
          )
        shell: Rscript {0}

      # Runs regardless of earlier step results; `|| true` keeps this step
      # from failing the job.
      - name: Show testthat output
        if: always()
        run: |
          ## --------------------------------------------------------------------
          find ${{ runner.temp }}/package -name 'testthat.Rout*' -exec cat '{}' \; || true
        shell: bash

      # Only runs when a previous step has failed
      - name: Upload test results
        if: failure()
        uses: actions/upload-artifact@v4
        with:
          name: coverage-test-failures
          path: ${{ runner.temp }}/package
51 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # History files
 2 | .Rhistory
 3 | .Rapp.history
 4 | 
 5 | # Session Data files
 6 | .RData
 7 | 
 8 | # User-specific files
 9 | .Ruserdata
10 | 
11 | # Example code in package build process
12 | *-Ex.R
13 | 
14 | # Output files from R CMD build
15 | /*.tar.gz
16 | 
17 | # Output files from R CMD check
18 | /*.Rcheck/
19 | 
20 | # RStudio files
21 | .Rproj.user/
22 | 
23 | # produced vignettes
24 | vignettes/*.html
25 | vignettes/*.pdf
26 | 
27 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
28 | .httr-oauth
29 | 
30 | # knitr and R markdown default cache directories
31 | *_cache/
32 | /cache/
33 | 
34 | # Temporary files created by R markdown
35 | *.utf8.md
36 | *.knit.md
37 | 
38 | # R Environment Variables
39 | .Renviron
40 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: datadictionary
 2 | Title: Create a Data Dictionary
 3 | Version: 1.0.1.9000
 4 | Authors@R: 
 5 |     person(given = "Bethany", 
 6 |            family = "Jones",
 7 |            role = c("aut", "cre"),
 8 |            email = "r.data.nerd@gmail.com")
 9 | Description: Creates a data dictionary from any dataframe or tibble in your R environment. 
10 |     You can opt to add variable labels. You can write the object directly to Excel.
11 | License: MIT + file LICENSE
12 | Encoding: UTF-8
13 | Language: en-GB
14 | Roxygen: list(markdown = TRUE)
15 | RoxygenNote: 7.3.2
16 | Imports:
17 |     chron,
18 |     dplyr,
19 |     haven,
20 |     labelled,
21 |     lubridate,
22 |     openxlsx,
23 |     stats,
24 |     tibble,
25 |     tidyr,
26 |     tidyselect,
27 | Suggests: 
28 |     covr,
29 |     testthat (>= 3.0.0)
30 | Depends:
31 |     R (>= 4.1.0)
32 | Config/testthat/edition: 3
33 | URL: https://github.com/DoctorBJones/datadictionary
34 | BugReports: https://github.com/DoctorBJones/datadictionary/issues
35 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2022
2 | COPYRIGHT HOLDER: Bethany Jones
3 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | Copyright (c) 2022 Bethany Jones
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(create_dictionary)
 4 | export(summarise_variable)
 5 | importFrom(chron,'as.times')
 6 | importFrom(dplyr,'bind_rows')
 7 | importFrom(dplyr,'mutate')
 8 | importFrom(haven,'as_factor')
 9 | importFrom(labelled,'var_label')
10 | importFrom(lubridate,'date')
11 | importFrom(openxlsx,'write.xlsx')
12 | importFrom(stats,'median')
13 | importFrom(tibble,'rownames_to_column')
14 | importFrom(tidyr,'pivot_longer')
15 | importFrom(tidyselect,'everything')
16 | 


--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
 1 | # datadictionary (development version)
 2 | 
 3 | # datadictionary 1.0.1
 4 | 
 5 | * Moves code from magrittr to base pipe
 6 | * Corrects error that occurred when summarising partially labelled data
 7 | * Adds URLs to DESCRIPTION
 8 | 
 9 | # datadictionary 1.0.0
10 | 
11 | * Improves handling of vectors of class 'Date'
12 | * Improves handling of vectors of class 'difftime'
13 | * Improves output for vectors of class 'times'
14 | * Corrects occasional error with labelling
15 | * Improves handling of vectors with multiple classes
16 | 
17 | # datadictionary 0.1.0
18 | 
19 | * Added a `NEWS.md` file to track changes to the package.
20 | * Package renamed from `dd` to `datadictionary`
21 | 


--------------------------------------------------------------------------------
/R/create_dictionary.R:
--------------------------------------------------------------------------------
  1 | 
  2 | #' Create a data dictionary from any dataset
  3 | #'
  4 | #'
  5 | #' @param dataset The dataset you wish to summarise
  6 | #' @param file The file path to write an Excel spreadsheet (optional)
  7 | #' @param var_labels A named vector of variable labels (optional)
  8 | #' @param id_var A variable/vector of variables that are identifiers (optional)
  9 | 
 10 | #' @return Either an Excel spreadsheet or a dataframe
 11 | #'
 12 | #' @importFrom haven 'as_factor'
 13 | #' @importFrom openxlsx 'write.xlsx'
 14 | #' @importFrom labelled 'var_label'
 15 | #' @importFrom dplyr 'bind_rows'
 16 | #'
 17 | #' @examples
 18 | #'
 19 | #'  # A simple dictionary printed to console
 20 | #'  create_dictionary(esoph)
 21 | #'
 22 | #'  # You can specify id variable/s
 23 | #'  mtcars$id <- 1:nrow(mtcars)
 24 | #'  create_dictionary(mtcars, id_var = "id")
 25 | #'
 26 | #'  # You can also specify labels with a named vector
 27 | #'  iris.labels <- c(Sepal.Length = "Sepal length in mm",
 28 | #'      Sepal.Width = "Sepal width in mm",
 29 | #'      Petal.Length = "Petal length in mm",
 30 | #'      Petal.Width = "Petal width in mm",
 31 | #'      Species = "Species of iris")
 32 | #'  create_dictionary(iris, var_labels = iris.labels)
 33 | #'
 34 | #' @export
 35 | create_dictionary <- function(dataset,
 36 |                               id_var = NULL,
 37 |                               file = NULL,
 38 |                               var_labels = NULL) {
 39 | 
 40 |   # first check that the argument is correct class
 41 |   dataset_class <- class(dataset)
 42 | 
 43 |   if (! "data.frame" %in% dataset_class)
 44 |     stop("You can only make a dictionary for a dataframe or tibble")
 45 | 
 46 |   if (! is.null(file)) {
 47 |     if (grepl("xlsx$", file) == FALSE) {
 48 |       stop("You can only write to Excel files with extension `.xlsx`")
 49 |     }
 50 |   }
 51 | 
 52 |   if (is.null(file)) {
 53 |     output = TRUE
 54 |   } else {
 55 |     output = FALSE
 56 |   }
 57 | 
 58 |   if (! is.null(var_labels)) {
 59 |     labelled::var_label(dataset) <- var_labels
 60 |   }
 61 | 
 62 |   # initialise output dataframe with overall summary
 63 |   out <- dataset_summary(dataset)
 64 | 
 65 |   # create internal variable for the dataset
 66 |   df <- dataset
 67 | 
 68 |   # Use the id summary function for id var/s
 69 |   # remove the id vars from internal version of the data
 70 |   # once the summary is done so it doesn't get replicated
 71 |   if (! is.null(id_var)) {
 72 |     vec <- id_var
 73 | 
 74 |     for (i in vec) {
 75 |       f <- id_summary(dataset, i)
 76 | 
 77 |       out <- dplyr::bind_rows(out, f)
 78 | 
 79 |       df <- df[, ! names(df) == i]
 80 |       }
 81 |   }
 82 | 
 83 |   # find the names of the internal dataframe to iterate over
 84 |   df_col <- colnames(df)
 85 | 
 86 |   # summarise each column and append the summary to the output
 87 |   for (col in df_col) {
 88 | 
 89 |     x <- summarise_variable(df, col)
 90 | 
 91 |     out <- dplyr::bind_rows(out, x)
 92 |   }
 93 | 
 94 |   if (output == FALSE) {
 95 | 
 96 |     openxlsx::write.xlsx(out, file = file)
 97 | 
 98 |   } else {
 99 | 
100 |     return(out)
101 | 
102 |   }
103 | 
104 | }
105 | 


--------------------------------------------------------------------------------
/R/summarise_variable.R:
--------------------------------------------------------------------------------
 1 | 
 2 | #' Summarise a single variable
 3 | #'
 4 | #' @param dataset The dataset with the variable you wish to summarise
 5 | #' @param column The column you wish to summarise as a quoted string
 6 | #'
 7 | #' @return A dataframe with a summary of the variable
 8 | #'
 9 | #' @examples
10 | #'  summarise_variable(mtcars, "mpg")
11 | #'
12 | #'  summarise_variable(iris, "Species")
13 | 
14 | #' @export
15 | summarise_variable <- function(dataset, column) {
16 | 
17 |   x <- class(dataset[[column]])
18 | 
19 |   if (sum(is.na(dataset[[column]])) == length(dataset[[column]])) {
20 |     allna_summary(dataset, column)
21 |   } else if ("factor" %in% x) {
22 |     factor_summary(dataset, column)
23 |   } else if ("haven_labelled" %in% x) {
24 |     label_summary(dataset, column)
25 |   } else if ("POSIXt" %in% x | "Date" %in% x) {
26 |     datetime_summary(dataset, column)
27 |   } else if ("times" %in% x) {
28 |     times_summary(dataset, column)
29 |   } else if ("difftime" %in% x |
30 |              "hms" %in% x |
31 |              "ms" %in% x |
32 |              "hm" %in% x) {
33 |     difftimes_summary(dataset, column)
34 |   } else if ("numeric" %in% x ||
35 |              "integer" %in% x ||
36 |              "double" %in% x) {
37 |     numeric_summary(dataset, column)
38 |   } else if ("logical" %in% x |
39 |              "boolean" %in% x ) {
40 |     logical_summary(dataset, column)
41 |   } else {
42 |     character_summary(dataset, column)
43 |   }
44 | 
45 | }
46 | 


--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
  1 | 
  2 | #' @importFrom tidyr 'pivot_longer'
  3 | #' @importFrom tidyselect 'everything'
  4 | #' @importFrom stats 'median'
  5 | #' @importFrom lubridate 'date'
  6 | #' @importFrom dplyr 'mutate'
  7 | #' @importFrom tibble 'rownames_to_column'
  8 | #' @importFrom chron 'as.times'
  9 | 
 10 | 
 11 | factor_summary <- function(dataset, column) {
 12 |   a <- as.data.frame(table(dataset[[column]]))
 13 |   names(a)[1] <- "summary"
 14 | 
 15 |   # throw a warning in case it should be numeric or character
 16 |   if (nrow(a) > 10) {
 17 |     msg <- paste0(column, " has more than 10 levels, did you want a character variable?")
 18 |     warning(msg)
 19 |   }
 20 | 
 21 |   # this creates the factor level with it's value in parentheses
 22 |   # e.g. Strongly disagree (5)
 23 | 
 24 |   a <- a |>
 25 |     dplyr::mutate(summary = paste(summary,
 26 |                                   " (",
 27 |                                   as.numeric(summary),
 28 |                                   ")", sep = ""))
 29 | 
 30 |   names(a)[2] <- "value"
 31 | 
 32 |   a$item <- ""
 33 |   a$item[1] <- gsub('"', '', deparse(column))
 34 | 
 35 |   a$class <- ""
 36 |   a$class[1] <-
 37 |     paste(class(dataset[[column]]), sep = " ", collapse = " ")
 38 | 
 39 |   a$label <- ""
 40 |   a$label[1] <- ifelse(is.null(attr(dataset[[column]], "label")),
 41 |                        "No label", attr(dataset[[column]], "label"))
 42 |   vars <- c("item", "label", "class", "summary", "value")
 43 |   a <- a[, vars]
 44 |   a[nrow(a) + 1, ] <-
 45 |     c("", "", "", "missing", sum(is.na(dataset[[column]])))
 46 | 
 47 |   a$value <- as.character(a$value)
 48 |   return(a)
 49 | }
 50 | 
 51 | numeric_summary <- function(dataset, column) {
 52 | 
 53 |   var <- dataset[[column]]
 54 | 
 55 |   a <- as.data.frame(round(mean(var, na.rm = TRUE)), digits = 2)
 56 |   names(a)[1] <- "mean"
 57 | 
 58 |   a$median = as.numeric(round(median(var, na.rm = TRUE)), digits = 2)
 59 |   a$min = round(min(var, na.rm = TRUE), digits = 2)
 60 |   a$max = round(max(var, na.rm = TRUE), digits = 2)
 61 |   a$missing = sum(is.na(dataset[[column]]))
 62 | 
 63 |   a <- a |>
 64 |     pivot_longer(cols = everything(),
 65 |                  names_to = "summary",
 66 |                  values_to = "value",
 67 |                  values_transform = list(value = as.character))
 68 | 
 69 |   # pivot_longer creates a tibble which actually messes with output
 70 |   a <- as.data.frame(a) # so coerce to df
 71 | 
 72 |   a$item <- ""
 73 |   a$item[1] <- gsub('"','', deparse(column))
 74 | 
 75 |   a$class <- ""
 76 |   a$class[1] <- paste(class(dataset[[column]]), sep = " ", collapse = " ")
 77 | 
 78 |   a$label <- ""
 79 |   a$label[1] <- ifelse(
 80 |     is.null(attr(dataset[[column]], "label")),
 81 |     "No label", attr(dataset[[column]], "label"))
 82 | 
 83 |   vars <- c("item", "label", "class", "summary", "value")
 84 | 
 85 |   a <- a[, vars]
 86 | 
 87 |   a$value <- as.character(a$value)
 88 | 
 89 |   return(a)
 90 | }
 91 | 
 92 | character_summary <- function(dataset, column) {
 93 |   var <- dataset[[column]]
 94 | 
 95 | 
 96 |   a <- as.data.frame(length(unique(var)))
 97 |   names(a)[1] <- "unique responses"
 98 | 
 99 |   a$missing <- sum(is.na(var))
100 | 
101 |   a <- a |>
102 |     pivot_longer(cols = everything(), names_to = "summary")
103 | 
104 |   if (a$value[1] < 10) {
105 |     msg <- paste0(column, " has fewer than 10 unique values, did you want a factor?")
106 |     warning(msg)
107 |   }
108 | 
109 |   a <- as.data.frame(a)
110 | 
111 |   a$item <- ""
112 |   a$item[1] <- gsub('"', '', deparse(column))
113 | 
114 |   a$class <- ""
115 |   a$class[1] <-
116 |     paste(class(dataset[[column]]), sep = " ", collapse = " ")
117 | 
118 |   a$label <- ""
119 |   a$label[1] <- ifelse(is.null(attr(dataset[[column]], "label")),
120 |                        "No label", attr(dataset[[column]], "label"))
121 | 
122 |   vars <- c("item", "label", "class", "summary", "value")
123 |   a <- a[, vars]
124 | 
125 |   a$value <- as.character(a$value)
126 | 
127 |   return(a)
128 | }
129 | 
130 | 
# Summarise a logical column: one row per observed value with its count,
# followed by a row with the number of missing values.
logical_summary <- function(dataset, column) {

  values <- dataset[[column]]
  tally <- as.data.frame(table(values))

  out <- data.frame(
    item = "",
    label = "",
    class = "",
    summary = as.character(tally[[1]]),
    value = tally[[2]]
  )

  # item name, label and class appear on the first row only
  out$item[1] <- gsub('"', '', deparse(column))
  out$class[1] <- paste(class(values), collapse = " ")

  var_label <- attr(values, "label")
  out$label[1] <- if (is.null(var_label)) "No label" else var_label[1]

  # count of missing values goes on the last row
  out[nrow(out) + 1, ] <- c("", "", "", "missing", sum(is.na(values)))

  out$value <- as.character(out$value)
  out
}
160 | 
161 | 
# Summarise a date or date-time column on its date part: mean, mode (several
# modes are separated by a space), minimum, maximum and the number of missing
# values.
datetime_summary <- function(dataset, column) {
  raw_values <- dataset[[column]]
  dates <- lubridate::date(raw_values)

  mean_date <- as.character(mean(dates, na.rm = TRUE))

  # mode_stat() hands back bare numbers, so turn them into dates again
  modal_dates <- as.Date(mode_stat(dates), origin = '1970-01-01')

  out <- data.frame(
    item = "",
    label = "",
    class = "",
    summary = c("mean", "mode", "min", "max", "missing"),
    value = c(
      mean_date,
      paste(modal_dates, collapse = " "),
      as.character(min(dates, na.rm = TRUE)),
      as.character(max(dates, na.rm = TRUE)),
      as.character(sum(is.na(raw_values)))
    )
  )

  # item name, label and class appear on the first row only
  out$item[1] <- gsub('"', '', deparse(column))
  out$class[1] <- paste(class(raw_values), collapse = " ")

  var_label <- attr(raw_values, "label")
  out$label[1] <- if (is.null(var_label)) "No label" else var_label[1]

  out
}
196 | 
# Summarise a column of class 'times': mean, median, minimum, maximum and the
# number of missing values, all stored as text.
times_summary <- function(dataset, column) {

  values <- dataset[[column]]

  out <- data.frame(
    item = "",
    label = "",
    class = "",
    summary = c("mean", "median", "min", "max", "missing"),
    value = c(
      as.character(mean(values, na.rm = TRUE)),
      as.character(median(values, na.rm = TRUE)),
      as.character(min(values, na.rm = TRUE)),
      as.character(max(values, na.rm = TRUE)),
      as.character(sum(is.na(values)))
    )
  )

  # item name, label and class appear on the first row only
  out$item[1] <- gsub('"', '', deparse(column))
  out$class[1] <- paste(class(values), collapse = " ")

  var_label <- attr(values, "label")
  out$label[1] <- if (is.null(var_label)) "No label" else var_label[1]

  out
}
230 | 
231 | 
# Summarise a labelled column (one carrying a "labels" attribute).
# When the number of unique values equals the number of value labels, the
# result has one row per value, shown as "label (value)" with its count,
# followed by a row with the number of missing values. Otherwise a warning is
# raised and the column is summarised by numeric_summary().
label_summary <- function(dataset, column) {

  # NOTE(review): unique() also counts NA as a value, so a fully labelled
  # column that contains NA presumably ends up in the numeric branch; confirm
  # this is intended.
  if (length(unique(dataset[[column]])) ==
      length(attr(dataset[[column]], "labels"))) {

    # value labels as a data frame: the label names become the first column,
    # the labelled values the second
    label_values <-
      as.data.frame(attributes(dataset[[column]])$labels) |>
      tibble::rownames_to_column()

    names(label_values)[1] <- "label"
    names(label_values)[2] <- "value"

    # text shown in the output, e.g. Strongly disagree (5)
    label_values$summary <-
      paste(label_values$label, " (", label_values$value, ")",
            sep = "")

    # frequency of each observed value
    a <- as.data.frame(table(dataset[[column]]))
    names(a)[1] <- "num_val"
    names(a)[2] <- "value"

    # attach the display text to the counts by matching on the value
    a <- merge(a, label_values, by.x = "num_val", by.y = "value")

    # item name, class and label appear on the first row only
    a$item <- ""
    a$item[1] <- gsub('"', '', deparse(column))

    a$class <- ""
    a$class[1] <-
      paste(class(dataset[[column]]), sep = " ", collapse = " ")

    # this overwrites the value-label column brought in by the merge with the
    # variable label
    a$label <- ""
    a$label[1] <- ifelse(is.null(attr(dataset[[column]], "label")),
                         "No label", attr(dataset[[column]], "label"))

    vars <- c("item", "label", "class", "summary", "value")
    a <- a[, vars]
    # count of missing values goes on the last row
    a[nrow(a) + 1, ] <-
      c("", "", "", "missing", sum(is.na(dataset[[column]])))
    a$value <- as.character(a$value)

    return(a)

  } else {

    msg <- paste0(column, " has different numbers of labels and levels. It has been treated as numeric")
    warning(msg)

    numeric_summary(dataset = dataset, column = column)

  }

}
283 | 
# Summarise a duration column: mean (rounded down), median, minimum, maximum
# and the number of missing values, all converted to text.
difftimes_summary <- function(dataset, column) {

  durations <- dataset[[column]]

  # one-row frame holding one statistic per column
  wide <- data.frame(
    mean = floor(mean(durations, na.rm = TRUE)),
    median = median(durations, na.rm = TRUE),
    min = min(durations, na.rm = TRUE),
    max = max(durations, na.rm = TRUE),
    missing = sum(is.na(durations))
  )

  # reshape to one row per statistic; pivot_longer gives a tibble, so it is
  # converted back to a plain data frame
  out <- wide |>
    pivot_longer(
      cols = everything(),
      names_to = "summary",
      values_to = "value",
      values_transform = list(value = as.character)
    ) |>
    as.data.frame()

  # item name, label and class appear on the first row only
  out$item <- ""
  out$item[1] <- gsub('"', '', deparse(column))

  out$class <- ""
  out$class[1] <- paste(class(durations), collapse = " ")

  var_label <- attr(durations, "label")
  out$label <- ""
  out$label[1] <- if (is.null(var_label)) "No label" else var_label[1]

  out <- out[, c("item", "label", "class", "summary", "value")]
  out$value <- as.character(out$value)

  out
}
324 | 
325 | 
# Summarise an identifier column: the number of unique values on the first
# row and the number of missing values on the second.
id_summary <- function(dataset, column) {
  ids <- dataset[[column]]

  out <- data.frame(
    item = gsub('"', '', deparse(column)),
    label = "Unique identifier",
    class = "",
    summary = "unique values",
    value = length(unique(ids))
  )

  # append the count of missing values as a new last row
  out[nrow(out) + 1, ] <- c("", "", "", "missing", sum(is.na(ids)))

  out$value <- as.character(out$value)
  out
}
343 | 
# Summarise a column that holds nothing but missing values: a single row
# whose value is the length of the column.
allna_summary <- function(dataset, column) {
  values <- dataset[[column]]
  var_label <- attr(values, "label")

  # The data frame is the final expression rather than an assignment, so it
  # is returned visibly and prints when called at the console.
  data.frame(
    item = gsub('"', '', deparse(column)),
    label = if (is.null(var_label)) "No label" else var_label[1],
    class = paste(class(values), collapse = " "),
    summary = "missing",
    value = as.character(length(values))
  )
}
354 | 
# Overall summary of the dataset: its number of rows and number of columns,
# laid out in the same five columns as the per-variable summaries.
dataset_summary <- function(dataset) {
  dims <- c(nrow(dataset), ncol(dataset))

  out <- data.frame(
    item = "",
    label = "",
    class = "",
    summary = c("Rows in dataset", "Columns in dataset"),
    value = as.character(dims)
  )

  return(out)
}
380 | 
#' Get the mode of a vector
#'
#' The values are sorted and run-length encoded; every value whose run is the
#' longest is returned, so ties give more than one mode.
#'
#' @param x A vector
#' @param freq Boolean when TRUE returns the frequency of the mode
#' @keywords internal
mode_stat <- function(x, freq = FALSE) {
  runs <- rle(sort(as.vector(x)))

  tally <- data.frame(freq = runs$lengths, value = runs$values)

  # column 2 holds the values, column 1 their frequencies
  keep <- if (freq) 1:2 else 2

  tally[which(tally$freq == max(tally$freq)), keep]
}
405 | 


--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | output: github_document
  3 | ---
  4 | 
  5 | ```{r, include = FALSE}
  6 | knitr::opts_chunk$set(
  7 |   collapse = TRUE,
  8 |   comment = "#>",
  9 |   fig.path = "man/figures/README-",
 10 |   out.width = "100%"
 11 | )
 12 | ```
 13 | 
 14 |  <!-- badges: start -->
 15 |  [![CRAN status](https://www.r-pkg.org/badges/version/datadictionary)](https://cran.r-project.org/package=datadictionary)
 16 |  [![R-CMD-check](https://github.com/DoctorBJones/datadictionary/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/DoctorBJones/datadictionary/actions/workflows/R-CMD-check.yaml)
 17 |  [![Codecov test coverage](https://codecov.io/gh/DoctorBJones/datadictionary/branch/main/graph/badge.svg)](https://app.codecov.io/gh/DoctorBJones/datadictionary?branch=main)
 18 |   <!-- badges: end -->
 19 | 
 20 | # datadictionary
 21 | 
 22 | The goal of `datadictionary` is to create a data dictionary from any dataframe or tibble in your R environment. While other packages exist I found they were complicated to use and/or the output wasn't what I was after. This package attempts to solve those problems by presenting tabular summaries of the dataset in a format that fits easily in a pane or screen, using a single line of code. 
 23 | 
 24 | It includes an overall summary of the dataset and at-a-glance summaries of each variable. All variables have a count of missing included, and different summaries are provided based on the data class.
 25 | 
 26 | For factors, labelled data and logicals the summary will include the name of each level with the level number in parentheses where appropriate. A value for the count of units in each level is included. 
 27 | 
 28 | For dates, integers and other numeric types of data the summary includes statistical summaries such as mean, median, mode, minimum and maximum. A value for each is included in the table. 
 29 | 
 30 | Character variables include only a count of unique values and missing values. This is the default so if you include a class of data that isn't yet implemented you should get this output.
 31 | 
 32 | You can nominate one or more identifier variables, for example individuals and clusters, so you only get a count of unique and missing values rather than nonsense numeric summaries. 
 33 | 
 34 | You can also include a vector to add labels if you want descriptions included in the document. Lastly, you can opt for the output to write directly to Excel.
 35 | 
 36 | 
 37 | ## Installation
 38 | 
 39 | You can install the current version of `datadictionary` from CRAN using:
 40 | 
 41 | ``` r
 42 | install.packages("datadictionary")
 43 | ```
 44 | 
 45 | You can install the development version of `datadictionary` from [GitHub](https://github.com/) with:
 46 | 
 47 | ``` r
 48 | # install.packages("devtools")
 49 | devtools::install_github("DoctorBJones/datadictionary")
 50 | ```
 51 | 
 52 | ## Example
 53 | 
 54 | You can print a basic data dictionary directly to your console or assign it to an object in your environment:
 55 | 
 56 | ```{r}
 57 | library(datadictionary)
 58 | 
 59 | create_dictionary(esoph)
 60 | 
 61 | esoph_dictionary <- create_dictionary(esoph)
 62 | ```
 63 | 
 64 | 
You can specify one or more identifier variables by passing a quoted string or vector of quoted strings to `id_var`. This is useful if you have hierarchical data with identifiers for, for example, individuals, clusters or blocks.
 66 | 
 67 | ```{r}
 68 | 
 69 | # create fake id variables
 70 | mtcars$id1 <- 1:nrow(mtcars)
 71 | mtcars$id2 <- mtcars$id1*10
 72 | 
 73 | create_dictionary(mtcars, id_var = c("id1", "id2"))
 74 | 
 75 | ```
You can also optionally add labels for unlabelled variables. You need to pass a named vector to `var_labels` where the names
correspond to columns in your dataset. The vector must be of the same length as your dataset (that is, one label per column).
 78 | 
 79 | ```{r}
 80 | 
 81 | # Create labels as a named vector. 
 82 | iris.labels <- c(Sepal.Length = "Sepal length in mm",
 83 |                  Sepal.Width = "Sepal width in mm",
 84 |                  Petal.Length = "Petal length in mm",
 85 |                  Petal.Width = "Petal width in mm",
 86 |                  Species = "Species of iris")
 87 | 
 88 | create_dictionary(iris, var_labels = iris.labels)
 89 | ```
 90 | 
 91 | You can also write directly to Excel from the `create_dictionary` function if you pass a file path and name as a quoted string to the `file` parameter. There is no visible output for this use.
 92 | 
 93 | ```{r, eval = FALSE}
 94 | 
 95 | create_dictionary(ChickWeight, file = "chickweight_dictionary.xlsx")
 96 | 
 97 | ```
 98 | 
 99 | The package also includes a function to create a summary of a single variable in your dataset. It takes only the dataset and a quoted column name.
100 | ```{r}
101 | 
102 | summarise_variable(iris, "Sepal.Length")
103 | 
104 | summarise_variable(ChickWeight, "Diet")
105 | ```
106 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | <!-- badges: start --> 
  3 | [![CRAN status](https://www.r-pkg.org/badges/version/datadictionary)](https://cran.r-project.org/package=datadictionary)
  4 | [![R-CMD-check](https://github.com/DoctorBJones/datadictionary/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/DoctorBJones/datadictionary/actions/workflows/R-CMD-check.yaml)
  5 | [![Codecov test
  6 | coverage](https://codecov.io/gh/DoctorBJones/datadictionary/branch/main/graph/badge.svg)](https://app.codecov.io/gh/DoctorBJones/datadictionary?branch=main)
  7 | <!-- badges: end -->
  8 | 
  9 | # datadictionary
 10 | 
 11 | The goal of `datadictionary` is to create a data dictionary from any
 12 | dataframe or tibble in your R environment. While other packages exist I
 13 | found they were complicated to use and/or the output wasn’t what I was
 14 | after. This package attempts to solve those problems by presenting
 15 | tabular summaries of the dataset in a format that fits easily in a pane
 16 | or screen, using a single line of code.
 17 | 
 18 | It includes an overall summary of the dataset and at-a-glance summaries
 19 | of each variable. All variables have a count of missing included, and
 20 | different summaries are provided based on the data class.
 21 | 
 22 | For factors, labelled data and logicals the summary will include the
 23 | name of each level with the level number in parentheses where
 24 | appropriate. A value for the count of units in each level is included.
 25 | 
 26 | For dates, integers and other numeric types of data the summary includes
 27 | statistical summaries such as mean, median, mode, minimum and maximum. A
 28 | value for each is included in the table.
 29 | 
 30 | Character variables include only a count of unique values and missing
 31 | values. This is the default so if you include a class of data that isn’t
 32 | yet implemented you should get this output.
 33 | 
 34 | You can nominate one or more identifier variables, for example
 35 | individuals and clusters, so you only get a count of unique and missing
 36 | values rather than nonsense numeric summaries.
 37 | 
 38 | You can also include a vector to add labels if you want descriptions
 39 | included in the document. Lastly, you can opt for the output to write
 40 | directly to Excel.
 41 | 
 42 | ## Installation
 43 | 
 44 | You can install the current version of `datadictionary` from CRAN using:
 45 | 
 46 | ``` r
 47 | install.packages("datadictionary")
 48 | ```
 49 | 
 50 | You can install the development version of `datadictionary` from
 51 | [GitHub](https://github.com/) with:
 52 | 
 53 | ``` r
 54 | # install.packages("devtools")
 55 | devtools::install_github("DoctorBJones/datadictionary")
 56 | ```
 57 | 
 58 | ## Example
 59 | 
 60 | You can print a basic data dictionary directly to your console or assign
 61 | it to an object in your environment:
 62 | 
 63 | ``` r
 64 | library(datadictionary)
 65 | 
 66 | create_dictionary(esoph)
 67 | #>         item    label          class            summary value
 68 | #> 1                                       Rows in dataset    88
 69 | #> 2                                    Columns in dataset     5
 70 | #> 3      agegp No label ordered factor          25-34 (1)    15
 71 | #> 4                                             35-44 (2)    15
 72 | #> 5                                             45-54 (3)    16
 73 | #> 6                                             55-64 (4)    16
 74 | #> 7                                             65-74 (5)    15
 75 | #> 8                                               75+ (6)    11
 76 | #> 9                                               missing     0
 77 | #> 10     alcgp No label ordered factor      0-39g/day (1)    23
 78 | #> 11                                            40-79 (2)    23
 79 | #> 12                                           80-119 (3)    21
 80 | #> 13                                             120+ (4)    21
 81 | #> 14                                              missing     0
 82 | #> 15     tobgp No label ordered factor       0-9g/day (1)    24
 83 | #> 16                                            10-19 (2)    24
 84 | #> 17                                            20-29 (3)    20
 85 | #> 18                                              30+ (4)    20
 86 | #> 19                                              missing     0
 87 | #> 20    ncases No label        numeric               mean     2
 88 | #> 21                                               median     1
 89 | #> 22                                                  min     0
 90 | #> 23                                                  max    17
 91 | #> 24                                              missing     0
 92 | #> 25 ncontrols No label        numeric               mean     9
 93 | #> 26                                               median     4
 94 | #> 27                                                  min     0
 95 | #> 28                                                  max    60
 96 | #> 29                                              missing     0
 97 | 
 98 | esoph_dictionary <- create_dictionary(esoph)
 99 | ```
100 | 
101 | You specify one or more identifier variables by passing a quoted string
102 | or vector of quoted strings to `id_var`. This is useful if you have
103 | hierarchical data, for example, and have identifiers for individuals,
104 | clusters or blocks.
105 | 
106 | ``` r
107 | 
108 | # create fake id variables
109 | mtcars$id1 <- 1:nrow(mtcars)
110 | mtcars$id2 <- mtcars$id1*10
111 | 
112 | create_dictionary(mtcars, id_var = c("id1", "id2"))
113 | #>    item             label   class            summary value
114 | #> 1                                    Rows in dataset    32
115 | #> 2                                 Columns in dataset    13
116 | #> 3   id1 Unique identifier              unique values    32
117 | #> 4                                            missing     0
118 | #> 5   id2 Unique identifier              unique values    32
119 | #> 6                                            missing     0
120 | #> 7   mpg          No label numeric               mean    20
121 | #> 8                                             median    19
122 | #> 9                                                min  10.4
123 | #> 10                                               max  33.9
124 | #> 11                                           missing     0
125 | #> 12  cyl          No label numeric               mean     6
126 | #> 13                                            median     6
127 | #> 14                                               min     4
128 | #> 15                                               max     8
129 | #> 16                                           missing     0
130 | #> 17 disp          No label numeric               mean   231
131 | #> 18                                            median   196
132 | #> 19                                               min  71.1
133 | #> 20                                               max   472
134 | #> 21                                           missing     0
135 | #> 22   hp          No label numeric               mean   147
136 | #> 23                                            median   123
137 | #> 24                                               min    52
138 | #> 25                                               max   335
139 | #> 26                                           missing     0
140 | #> 27 drat          No label numeric               mean     4
141 | #> 28                                            median     4
142 | #> 29                                               min  2.76
143 | #> 30                                               max  4.93
144 | #> 31                                           missing     0
145 | #> 32   wt          No label numeric               mean     3
146 | #> 33                                            median     3
147 | #> 34                                               min  1.51
148 | #> 35                                               max  5.42
149 | #> 36                                           missing     0
150 | #> 37 qsec          No label numeric               mean    18
151 | #> 38                                            median    18
152 | #> 39                                               min  14.5
153 | #> 40                                               max  22.9
154 | #> 41                                           missing     0
155 | #> 42   vs          No label numeric               mean     0
156 | #> 43                                            median     0
157 | #> 44                                               min     0
158 | #> 45                                               max     1
159 | #> 46                                           missing     0
160 | #> 47   am          No label numeric               mean     0
161 | #> 48                                            median     0
162 | #> 49                                               min     0
163 | #> 50                                               max     1
164 | #> 51                                           missing     0
165 | #> 52 gear          No label numeric               mean     4
166 | #> 53                                            median     4
167 | #> 54                                               min     3
168 | #> 55                                               max     5
169 | #> 56                                           missing     0
170 | #> 57 carb          No label numeric               mean     3
171 | #> 58                                            median     2
172 | #> 59                                               min     1
173 | #> 60                                               max     8
174 | #> 61                                           missing     0
175 | ```
176 | 
177 | You can also optionally add labels for unlabelled variables. You need to
178 | pass a named vector to `var_labels` where the names correspond to
179 | columns in your dataset. The vector must contain one label for each
180 | column in your dataset.
181 | 
182 | ``` r
183 | 
184 | # Create labels as a named vector. 
185 | iris.labels <- c(Sepal.Length = "Sepal length in mm",
186 |                  Sepal.Width = "Sepal width in mm",
187 |                  Petal.Length = "Petal length in mm",
188 |                  Petal.Width = "Petal width in mm",
189 |                  Species = "Species of iris")
190 | 
191 | create_dictionary(iris, var_labels = iris.labels)
192 | #>            item              label   class            summary value
193 | #> 1                                             Rows in dataset   150
194 | #> 2                                          Columns in dataset     5
195 | #> 3  Sepal.Length Sepal length in mm numeric               mean     6
196 | #> 4                                                      median     6
197 | #> 5                                                         min   4.3
198 | #> 6                                                         max   7.9
199 | #> 7                                                     missing     0
200 | #> 8   Sepal.Width  Sepal width in mm numeric               mean     3
201 | #> 9                                                      median     3
202 | #> 10                                                        min     2
203 | #> 11                                                        max   4.4
204 | #> 12                                                    missing     0
205 | #> 13 Petal.Length Petal length in mm numeric               mean     4
206 | #> 14                                                     median     4
207 | #> 15                                                        min     1
208 | #> 16                                                        max   6.9
209 | #> 17                                                    missing     0
210 | #> 18  Petal.Width  Petal width in mm numeric               mean     1
211 | #> 19                                                     median     1
212 | #> 20                                                        min   0.1
213 | #> 21                                                        max   2.5
214 | #> 22                                                    missing     0
215 | #> 23      Species    Species of iris  factor         setosa (1)    50
216 | #> 24                                             versicolor (2)    50
217 | #> 25                                              virginica (3)    50
218 | #> 26                                                    missing     0
219 | ```
220 | 
221 | You can also write directly to Excel from the `create_dictionary`
222 | function if you pass a file path and name as a quoted string to the
223 | `file` parameter. There is no visible output for this use.
224 | 
225 | ``` r
226 | 
227 | create_dictionary(ChickWeight, file = "chickweight_dictionary.xlsx")
228 | ```
229 | 
230 | The package also includes a function to create a summary of a single
231 | variable in your dataset. It takes only the dataset and a quoted column name.
232 | 
233 | ``` r
234 | 
235 | summarise_variable(iris, "Sepal.Length")
236 | #>           item    label   class summary value
237 | #> 1 Sepal.Length No label numeric    mean     6
238 | #> 2                                median     6
239 | #> 3                                   min   4.3
240 | #> 4                                   max   7.9
241 | #> 5                               missing     0
242 | 
243 | summarise_variable(ChickWeight, "Diet")
244 | #>   item    label  class summary value
245 | #> 1 Diet No label factor   1 (1)   220
246 | #> 2                        2 (2)   120
247 | #> 3                        3 (3)   120
248 | #> 4                        4 (4)   118
249 | #> 5                      missing     0
250 | ```
251 | 


--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
 1 | comment: false
 2 | 
 3 | coverage:
 4 |   status:
 5 |     project:
 6 |       default:
 7 |         target: auto
 8 |         threshold: 1%
 9 |         informational: true
10 |     patch:
11 |       default:
12 |         target: auto
13 |         threshold: 1%
14 |         informational: true
15 | 


--------------------------------------------------------------------------------
/cran-comments.md:
--------------------------------------------------------------------------------
1 | ## R CMD check results
2 | 
3 | 0 errors | 0 warnings | 0 notes
4 | 
5 | 
6 | ## revdepcheck results
7 | 
8 | There are currently no downstream dependencies for this package
9 | 


--------------------------------------------------------------------------------
/datadictionary.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: No
 4 | SaveWorkspace: No
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | LineEndingConversion: Posix
18 | 
19 | BuildType: Package
20 | PackageUseDevtools: Yes
21 | PackageInstallArgs: --no-multiarch --with-keep.source
22 | PackageRoxygenize: rd,collate,namespace
23 | 


--------------------------------------------------------------------------------
/man/create_dictionary.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/create_dictionary.R
 3 | \name{create_dictionary}
 4 | \alias{create_dictionary}
 5 | \title{Create a data dictionary from any dataset}
 6 | \usage{
 7 | create_dictionary(dataset, id_var = NULL, file = NULL, var_labels = NULL)
 8 | }
 9 | \arguments{
10 | \item{dataset}{The dataset you wish to summarise}
11 | 
12 | \item{id_var}{A variable/vector of variables that are identifiers (optional)}
13 | 
14 | \item{file}{The file path to write an Excel spreadsheet (optional)}
15 | 
16 | \item{var_labels}{A named vector of variable labels (optional)}
17 | }
18 | \value{
19 | Either an Excel spreadsheet or a dataframe
20 | }
21 | \description{
22 | Create a data dictionary from any dataset
23 | }
24 | \examples{
25 | 
26 |  # A simple dictionary printed to console
27 |  create_dictionary(esoph)
28 | 
29 |  # You can specify id variable/s
30 |  mtcars$id <- 1:nrow(mtcars)
31 |  create_dictionary(mtcars, id_var = "id")
32 | 
33 |  # You can also specify labels with a named vector
34 |  iris.labels <- c(Sepal.Length = "Sepal length in mm",
35 |      Sepal.Width = "Sepal width in mm",
36 |      Petal.Length = "Petal length in mm",
37 |      Petal.Width = "Petal width in mm",
38 |      Species = "Species of iris")
39 |  create_dictionary(iris, var_labels = iris.labels)
40 | 
41 | }
42 | 


--------------------------------------------------------------------------------
/man/mode_stat.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/utils.R
 3 | \name{mode_stat}
 4 | \alias{mode_stat}
 5 | \title{Get the mode of a vector}
 6 | \usage{
 7 | mode_stat(x, freq = FALSE)
 8 | }
 9 | \arguments{
10 | \item{x}{A vector}
11 | 
12 | \item{freq}{Boolean when TRUE returns the frequency of the mode}
13 | }
14 | \description{
15 | Get the mode of a vector
16 | }
17 | \keyword{internal}
18 | 


--------------------------------------------------------------------------------
/man/summarise_variable.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/summarise_variable.R
 3 | \name{summarise_variable}
 4 | \alias{summarise_variable}
 5 | \title{Summarise a single variable}
 6 | \usage{
 7 | summarise_variable(dataset, column)
 8 | }
 9 | \arguments{
10 | \item{dataset}{The dataset with the variable you wish to summarise}
11 | 
12 | \item{column}{The column you wish to summarise as a quoted string}
13 | }
14 | \value{
15 | A dataframe with a summary of the variable
16 | }
17 | \description{
18 | Summarise a single variable
19 | }
20 | \examples{
21 |  summarise_variable(mtcars, "mpg")
22 | 
23 |  summarise_variable(iris, "Species")
24 | }
25 | 


--------------------------------------------------------------------------------
/revdep/.gitignore:
--------------------------------------------------------------------------------
1 | checks
2 | library
3 | checks.noindex
4 | library.noindex
5 | cloud.noindex
6 | data.sqlite
7 | *.html
8 | 


--------------------------------------------------------------------------------
/revdep/README.md:
--------------------------------------------------------------------------------
  1 | # Platform
  2 | 
  3 | |field    |value                                     |
  4 | |:--------|:-----------------------------------------|
  5 | |version  |R version 4.2.1 (2022-06-23 ucrt)         |
  6 | |os       |Windows 10 x64 (build 22621)              |
  7 | |system   |x86_64, mingw32                           |
  8 | |ui       |RStudio                                   |
  9 | |language |(EN)                                      |
 10 | |collate  |English_Australia.utf8                    |
 11 | |ctype    |English_Australia.utf8                    |
 12 | |tz       |Australia/Sydney                          |
 13 | |date     |2023-03-12                                |
 14 | |rstudio  |2022.07.1+554 Spotted Wakerobin (desktop) |
 15 | |pandoc   |NA                                        |
 16 | 
 17 | # Dependencies
 18 | 
 19 | |package        |old     |new        |Δ  |
 20 | |:--------------|:-------|:----------|:--|
 21 | |datadictionary |0.1.0   |0.1.0.9000 |*  |
 22 | |backports      |1.4.1   |NA         |*  |
 23 | |base64enc      |0.1-3   |NA         |*  |
 24 | |bit            |4.0.5   |4.0.5      |   |
 25 | |bit64          |4.0.5   |4.0.5      |   |
 26 | |bslib          |0.4.2   |NA         |*  |
 27 | |cachem         |1.0.7   |NA         |*  |
 28 | |checkmate      |2.1.0   |NA         |*  |
 29 | |chron          |2.3-60  |2.3-60     |   |
 30 | |cli            |3.6.0   |3.6.0      |   |
 31 | |clipr          |0.8.0   |0.8.0      |   |
 32 | |colorspace     |2.1-0   |NA         |*  |
 33 | |cpp11          |0.4.3   |0.4.3      |   |
 34 | |crayon         |1.5.2   |1.5.2      |   |
 35 | |data.table     |1.14.8  |1.14.8     |   |
 36 | |digest         |0.6.31  |NA         |*  |
 37 | |dplyr          |1.1.0   |1.1.0      |   |
 38 | |ellipsis       |0.3.2   |0.3.2      |   |
 39 | |evaluate       |0.20    |NA         |*  |
 40 | |fansi          |1.0.4   |1.0.4      |   |
 41 | |farver         |2.1.1   |NA         |*  |
 42 | |fastmap        |1.1.1   |NA         |*  |
 43 | |forcats        |1.0.0   |1.0.0      |   |
 44 | |Formula        |1.2-5   |NA         |*  |
 45 | |fs             |1.6.1   |NA         |*  |
 46 | |generics       |0.1.3   |0.1.3      |   |
 47 | |ggplot2        |3.4.1   |NA         |*  |
 48 | |glue           |1.6.2   |1.6.2      |   |
 49 | |gridExtra      |2.3     |NA         |*  |
 50 | |gtable         |0.3.1   |NA         |*  |
 51 | |haven          |2.5.2   |2.5.2      |   |
 52 | |highr          |0.10    |NA         |*  |
 53 | |Hmisc          |5.0-1   |NA         |*  |
 54 | |hms            |1.1.2   |1.1.2      |   |
 55 | |htmlTable      |2.4.1   |NA         |*  |
 56 | |htmltools      |0.5.4   |NA         |*  |
 57 | |htmlwidgets    |1.6.1   |NA         |*  |
 58 | |isoband        |0.2.7   |NA         |*  |
 59 | |jquerylib      |0.1.4   |NA         |*  |
 60 | |jsonlite       |1.8.4   |NA         |*  |
 61 | |knitr          |1.42    |NA         |*  |
 62 | |labeling       |0.4.2   |NA         |*  |
 63 | |labelled       |NA      |2.10.0     |*  |
 64 | |lifecycle      |1.0.3   |1.0.3      |   |
 65 | |lubridate      |1.9.2   |1.9.2      |   |
 66 | |magrittr       |2.0.3   |2.0.3      |   |
 67 | |memoise        |2.0.1   |NA         |*  |
 68 | |mime           |0.12    |NA         |*  |
 69 | |munsell        |0.5.0   |NA         |*  |
 70 | |openxlsx       |4.2.5.2 |4.2.5.2    |   |
 71 | |pillar         |1.8.1   |1.8.1      |   |
 72 | |pkgconfig      |2.0.3   |2.0.3      |   |
 73 | |prettyunits    |1.1.1   |1.1.1      |   |
 74 | |progress       |1.2.2   |1.2.2      |   |
 75 | |purrr          |1.0.1   |1.0.1      |   |
 76 | |R6             |2.5.1   |2.5.1      |   |
 77 | |rappdirs       |0.3.3   |NA         |*  |
 78 | |RColorBrewer   |1.1-3   |NA         |*  |
 79 | |Rcpp           |1.0.10  |1.0.10     |   |
 80 | |readr          |2.1.4   |2.1.4      |   |
 81 | |rlang          |1.0.6   |1.0.6      |   |
 82 | |rmarkdown      |2.20    |NA         |*  |
 83 | |rstudioapi     |0.14    |NA         |*  |
 84 | |sass           |0.4.5   |NA         |*  |
 85 | |scales         |1.2.1   |NA         |*  |
 86 | |stringi        |1.7.12  |1.7.12     |   |
 87 | |stringr        |1.5.0   |1.5.0      |   |
 88 | |tibble         |3.2.0   |3.2.0      |   |
 89 | |tidyr          |1.3.0   |1.3.0      |   |
 90 | |tidyselect     |1.2.0   |1.2.0      |   |
 91 | |timechange     |0.2.0   |0.2.0      |   |
 92 | |tinytex        |0.44    |NA         |*  |
 93 | |tzdb           |0.3.0   |0.3.0      |   |
 94 | |utf8           |1.2.3   |1.2.3      |   |
 95 | |vctrs          |0.5.2   |0.5.2      |   |
 96 | |viridis        |0.6.2   |NA         |*  |
 97 | |viridisLite    |0.4.1   |NA         |*  |
 98 | |vroom          |1.6.1   |1.6.1      |   |
 99 | |withr          |2.5.0   |2.5.0      |   |
100 | |xfun           |0.37    |NA         |*  |
101 | |yaml           |2.3.7   |NA         |*  |
102 | |zip            |2.2.2   |2.2.2      |   |
103 | 
104 | # Revdeps
105 | 
106 | 


--------------------------------------------------------------------------------
/revdep/cran.md:
--------------------------------------------------------------------------------
1 | ## revdepcheck results
2 | 
3 | We checked 0 reverse dependencies, comparing R CMD check results across CRAN and dev versions of this package.
4 | 
5 |  * We saw 0 new problems
6 |  * We failed to check 0 packages
7 | 
8 | 


--------------------------------------------------------------------------------
/revdep/email.yml:
--------------------------------------------------------------------------------
1 | release_date: ???
2 | rel_release_date: ???
3 | my_news_url: ???
4 | release_version: ???
5 | release_details: ???
6 | 


--------------------------------------------------------------------------------
/revdep/failures.md:
--------------------------------------------------------------------------------
1 | *Wow, no problems at all. :)*


--------------------------------------------------------------------------------
/revdep/problems.md:
--------------------------------------------------------------------------------
1 | *Wow, no problems at all. :)*


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
 1 | # This file is part of the standard setup for testthat.
 2 | # It is recommended that you do not modify it.
 3 | #
 4 | # Where should you do additional test configuration?
 5 | # Learn more about the roles of various files in:
 6 | # * https://r-pkgs.org/tests.html
 7 | # * https://testthat.r-lib.org/reference/test_package.html#special-files
 8 | 
 9 | library(testthat)
10 | library(datadictionary)
11 | 
12 | test_check("datadictionary")
13 | 


--------------------------------------------------------------------------------
/tests/testthat/test-create_dictionary.R:
--------------------------------------------------------------------------------
 1 | 
 2 | testthat::test_that("errors", {
 3 |   # a bare numeric vector is not a dataframe or tibble, so this must error
 4 |   testthat::expect_error(
 5 |     create_dictionary(c(1,2,3,4)),
 6 |     "You can only make a dictionary for a dataframe or tibble"
 7 |   )
 8 |   # a `file` path that does not end in .xlsx must be rejected
 9 |   testthat::expect_error(
10 |     create_dictionary(iris, file = "test.csv"),
11 |     "You can only write to Excel files with extension `.xlsx`"
12 |   )
13 | 
14 | })
15 | 
16 | testthat::test_that("dictionary",{
17 |   # Exercises create_dictionary() on the tester_no_error.rds fixture
18 |   # overall summary: first two rows describe the dataset itself
19 |   over <- create_dictionary(
20 |     readRDS(file = testthat::test_path("testdata", 'tester_no_error.rds')),
21 |     id_var = "id")
22 | 
23 |   testthat::expect_equal(over$summary[1],
24 |                          "Rows in dataset"
25 |   )
26 | 
27 |   testthat::expect_equal(over$value[2],
28 |                          "15"
29 |   )
30 | 
31 |   # dimensions of object
32 |   len <- create_dictionary(
33 |     readRDS(file = testthat::test_path("testdata", 'tester_no_error.rds')),
34 |     id_var = "id")
35 | 
36 |   testthat::expect_equal(nrow(len),
37 |     67
38 |   )
39 | 
40 |   # id var properly summarised
41 |   testthat::expect_equal(
42 |     len$label[3],
43 |     "Unique identifier"
44 |   )
45 | 
46 |   # labelling working correctly
47 |   # one label per column of the fixture, named by column
48 |   test_labels <- c(
49 |     id = "ID",
50 |     start_date = "Start date",
51 |     end_date = "End date",
52 |     gender = "Gender",
53 |     age = "Age",
54 |     state = "State",
55 |     duration = "Time taken to complete survey",
56 |     likert = "Agreement",
57 |     speed = "How fast",
58 |     suggestions = "Policy suggestions",
59 |     lab_location = "Location",
60 |     effective_date = "Date recorded",
61 |     all_missing = "Missing data",
62 |     time_recorded = "Time recorded",
63 |     labelled_data = "Labelled"
64 |                    )
65 | 
66 |   lab <- create_dictionary(
67 |     readRDS(file = testthat::test_path("testdata", 'tester_no_error.rds')),
68 |     id_var = "id", var_labels = test_labels)
69 | 
70 |   testthat::expect_equal(
71 |     lab$label[5], "Start date"
72 |   )
73 | 
74 |   # writing to Excel: use a temporary file so nothing is left on disk
75 |   xlsx_path <- tempfile(fileext = ".xlsx")
76 |   create_dictionary(
77 |     readRDS(file = testthat::test_path("testdata", 'tester_no_error.rds')),
78 |     file = xlsx_path)
79 | 
80 |   # the spreadsheet must actually have been written
81 |   testthat::expect_true(file.exists(xlsx_path))
82 |   unlink(xlsx_path)
83 | })
84 | 
85 | 


--------------------------------------------------------------------------------
/tests/testthat/test-summarise_variable.R:
--------------------------------------------------------------------------------
  1 | testthat::test_that("length", {
  2 |   # expect_length() on the returned dataframe counts its columns
  3 |   testthat::expect_length(
  4 |     summarise_variable(
  5 |       readRDS(file = testthat::test_path("testdata", 'tester.rds')),
  6 |       "start_date"),
  7 |     5
  8 |   )
  9 |   # the summary of lab_location is expected to have three rows
 10 |   testthat::expect_equal(
 11 |     nrow(summarise_variable(
 12 |       readRDS(file = testthat::test_path("testdata", 'tester.rds')),
 13 |       "lab_location")),
 14 |     3
 15 |   )
 16 | 
 17 | })
 18 | 
 19 | # Errors
 20 | 
 21 | testthat::test_that("error", {
 22 |   # a factor with more than 10 levels should raise a warning
 23 |   testthat::expect_warning(
 24 |     summarise_variable(
 25 |       readRDS(file = testthat::test_path("testdata", 'tester.rds')),
 26 |       "bad_factor"),
 27 |     "bad_factor has more than 10 levels, did you want a character variable?"
 28 |   )
 29 |   # mismatched numbers of labels and levels should raise a warning
 30 |   testthat::expect_warning(
 31 |     summarise_variable(
 32 |       readRDS(file = testthat::test_path("testdata", 'tester.rds')),
 33 |       "bad_labels"),
 34 |     "bad_labels has different numbers of labels and levels. It has been treated as numeric"
 35 |     )
 36 | 
 37 | })
 38 | 
 39 | # test each data class
 40 | testthat::test_that("classes", {
 41 | 
 42 |   # Exercises summarise_variable() once per column class present in the
 43 |   # fixture, pinning a single cell of each result.
 44 |   #
 45 |   # The fixture is read once and reused for every call instead of being
 46 |   # re-read from disk for each column.
 47 |   fixture <- readRDS(file = testthat::test_path("testdata", "tester.rds"))
 48 | 
 49 |   # 'Date' class
 50 |   start_summary <- summarise_variable(
 51 |     fixture, "start_date"
 52 |   )
 53 |   testthat::expect_equal(
 54 |     start_summary$value[1],
 55 |     "2022-02-17"
 56 |   )
 57 | 
 58 |   # POSIX date class
 59 |   # (the expected cell holds three dates in one space-separated string)
 60 |   end_summary <- summarise_variable(
 61 |     fixture, "end_date"
 62 |   )
 63 |   testthat::expect_equal(
 64 |     end_summary$value[2],
 65 |     "2022-01-20 2022-04-18 2022-04-22"
 66 |   )
 67 | 
 68 |   # factor class
 69 |   gender_summary <- summarise_variable(
 70 |     fixture, "gender"
 71 |   )
 72 |   testthat::expect_equal(
 73 |     gender_summary$summary[1],
 74 |     "Female (1)"
 75 |   )
 76 | 
 77 |   # integer
 78 |   age_summary <- summarise_variable(
 79 |     fixture, "age"
 80 |   )
 81 |   testthat::expect_equal(
 82 |     age_summary$value[4],
 83 |     "49"
 84 |   )
 85 | 
 86 |   # haven labelled
 87 |   state_summary <- summarise_variable(
 88 |     fixture, "state"
 89 |   )
 90 |   testthat::expect_equal(
 91 |     state_summary$summary[3],
 92 |     "Qld (3)"
 93 |   )
 94 | 
 95 |   # haven partially labelled
 96 |   # This column is known to warn (labels and levels differ in number), so the
 97 |   # warning is asserted here instead of leaking into the test report.
 98 |   testthat::expect_warning(
 99 |     partial_summary <- summarise_variable(fixture, "bad_labels"),
100 |     regexp = "bad_labels has different numbers of labels and levels. It has been treated as numeric"
101 |   )
102 | 
103 |   testthat::expect_equal(
104 |     partial_summary$summary[1],
105 |     "mean"
106 |   )
107 | 
108 |   # difftime
109 |   duration_summary <- summarise_variable(
110 |     fixture, "duration"
111 |   )
112 |   testthat::expect_equal(
113 |     duration_summary$value[2],
114 |     "20"
115 |   )
116 | 
117 |   # ordered factor
118 |   likert_summary <- summarise_variable(
119 |     fixture, "likert"
120 |   )
121 |   testthat::expect_equal(
122 |     likert_summary$summary[3],
123 |     "Disagree (3)"
124 |   )
125 | 
126 |   # double
127 |   speed_summary <- summarise_variable(
128 |     fixture, "speed"
129 |   )
130 |   testthat::expect_equal(
131 |     speed_summary$value[3],
132 |     "3.83"
133 |   )
134 | 
135 |   # character: a column with few distinct values warns and suggests a factor
136 |   testthat::expect_warning(
137 |     summarise_variable(fixture, "comments"),
138 |     regexp = "comments has fewer than 10 unique values, did you want a factor?"
139 |   )
140 | 
141 |   # character: the summary column is expected to be exactly these two labels
142 |   suggestions_summary <- summarise_variable(
143 |     fixture, "suggestions"
144 |   )
145 |   testthat::expect_equal(
146 |     suggestions_summary$summary,
147 |     c("unique responses", "missing")
148 |   )
149 | 
150 |   # logical
151 |   location_summary <- summarise_variable(
152 |     fixture, "lab_location"
153 |   )
154 |   testthat::expect_equal(
155 |     location_summary$summary[2],
156 |     "TRUE"
157 |   )
158 | 
159 |   # datetime
160 |   effective_summary <- summarise_variable(
161 |     fixture, "effective_date"
162 |   )
163 |   testthat::expect_equal(
164 |     effective_summary$value[1],
165 |     "2022-04-13"
166 |   )
167 | 
168 |   # all missing values
169 |   # (expected "11" is presumably the count of missing rows; TODO confirm)
170 |   missing_summary <- summarise_variable(
171 |     fixture, "all_missing"
172 |   )
173 |   testthat::expect_equal(
174 |     missing_summary$value[1],
175 |     "11"
176 |   )
177 | 
178 |   # times
179 |   time_summary <- summarise_variable(
180 |     fixture, "time_recorded"
181 |   )
182 |   testthat::expect_equal(
183 |     time_summary$value[4],
184 |     "21:36:52"
185 |   )
186 | 
187 | })
188 | 
189 | testthat::test_that("NA and mode", {
190 | 
191 |   # Shared fixture for both expectations.
192 |   fixture <- readRDS(file = testthat::test_path("testdata", "tester.rds"))
193 | 
194 |   # The all_missing column is expected to yield exactly one summary row.
195 |   missing_summary <- summarise_variable(fixture, "all_missing")
196 |   testthat::expect_equal(nrow(missing_summary), 1)
197 | 
198 |   # mode_stat() is expected to return two values for start_date (presumably
199 |   # tied modes, expressed as day counts for the Date column; TODO confirm).
200 |   testthat::expect_equal(
201 |     mode_stat(fixture$start_date),
202 |     c(18996, 19084)
203 |   )
204 | 
205 | })
206 | 


--------------------------------------------------------------------------------
/tests/testthat/testdata/tester.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DoctorBJones/datadictionary/16a1a2dd4754e4d6e1c35759de43a03caa7974f2/tests/testthat/testdata/tester.rds


--------------------------------------------------------------------------------
/tests/testthat/testdata/tester_no_error.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DoctorBJones/datadictionary/16a1a2dd4754e4d6e1c35759de43a03caa7974f2/tests/testthat/testdata/tester_no_error.rds


--------------------------------------------------------------------------------