├── .Rhistory ├── .travis.yml ├── DESCRIPTION ├── NAMESPACE ├── NEWS.md ├── R ├── aggregateForBarplot.R ├── aggregateForHistogram.R ├── allCheckFunctions.R ├── allClasses.R ├── allSummaryFunctions.R ├── allVisualFunctions.R ├── allXFunctions.R ├── basicVisual.R ├── centralValue.R ├── check.R ├── checkFunction.R ├── checkResult.R ├── classes.R ├── countMissing.R ├── dataReporter-package.R ├── dataReporter_as_factor.R ├── description.R ├── identifyCaseIssues.R ├── identifyLoners.R ├── identifyMissing.R ├── identifyNums.R ├── identifyOutliers.R ├── identifyOutliersTBStyle.R ├── identifyWhitespace.R ├── isCPR.R ├── isEmpty.R ├── isKey.R ├── isSingular.R ├── isSupported.R ├── makeCodebook.R ├── makeDataReport.R ├── makeXFunction.R ├── messageGenerator.R ├── minMax.R ├── misc.R ├── quartiles.R ├── refCat.R ├── render.R ├── setChecks.R ├── setSummaries.R ├── setVisuals.R ├── smartNum.R ├── standardVisual.R ├── summarize.R ├── summaryFunction.R ├── summaryResult.R ├── tableVisual.R ├── uniqueValues.R ├── unpackLabelled.R ├── utility.R ├── variableType.R ├── visualFunction.R └── visualize.R ├── README.html ├── README.md ├── data ├── artData.rda ├── bigPresidentData.rda ├── exampleData.RData ├── presidentData.rda ├── testData.RData └── toyData.RData ├── inst └── CITATION ├── man ├── allCheckFunctions.Rd ├── allClasses.Rd ├── allSummaryFunctions.Rd ├── allVisualFunctions.Rd ├── artData.Rd ├── basicVisual.Rd ├── basicVisualCFLB.Rd ├── bigPresidentData.Rd ├── centralValue.Rd ├── check.Rd ├── checkFunction.Rd ├── checkResult.Rd ├── classes.Rd ├── countMissing.Rd ├── defaultCharacterChecks.Rd ├── defaultCharacterSummaries.Rd ├── defaultDateChecks.Rd ├── defaultDateSummaries.Rd ├── defaultFactorChecks.Rd ├── defaultFactorSummaries.Rd ├── defaultHavenlabelledChecks.Rd ├── defaultHavenlabelledSummaries.Rd ├── defaultIntegerChecks.Rd ├── defaultIntegerSummaries.Rd ├── defaultLabelledChecks.Rd ├── defaultLabelledSummaries.Rd ├── defaultLogicalChecks.Rd ├── defaultLogicalSummaries.Rd 
├── defaultNumericChecks.Rd ├── defaultNumericSummaries.Rd ├── description.Rd ├── exampleData.Rd ├── figures │ └── logo.png ├── identifyCaseIssues.Rd ├── identifyLoners.Rd ├── identifyMissing.Rd ├── identifyNums.Rd ├── identifyOutliers.Rd ├── identifyOutliersTBStyle.Rd ├── identifyWhitespace.Rd ├── isCPR.Rd ├── isKey.Rd ├── isSingular.Rd ├── isSupported.Rd ├── makeCodebook.Rd ├── makeDataReport.Rd ├── messageGenerator.Rd ├── minMax.Rd ├── presidentData.Rd ├── quartiles.Rd ├── refCat.Rd ├── render.Rd ├── setChecks.Rd ├── setSummaries.Rd ├── setVisuals.Rd ├── smartNum.Rd ├── standardVisual.Rd ├── summarize.Rd ├── summaryFunction.Rd ├── summaryResult.Rd ├── tableVisual.Rd ├── testData.Rd ├── toyData.Rd ├── uniqueValues.Rd ├── variableType.Rd ├── visualFunction.Rd ├── visualize.Rd └── whoami_available.Rd ├── tests ├── testthat.R └── testthat │ ├── atomic.R │ ├── spss_labelled.rda │ ├── testcheck.R │ ├── testisLoner.R │ ├── testmakeReport.R │ ├── testminMax.R │ ├── testsummarize.R │ └── testvariableType.R └── vignettes ├── .gitignore └── extending_dataReporter.Rmd /.Rhistory: -------------------------------------------------------------------------------- 1 | library(devtools) 2 | install_github("ekstroem/reporteR") 3 | library(reporteR) 4 | data(toydata) 5 | data(toyData) 6 | makeDataReport(toyData) 7 | ?makeDataReport 8 | makeCodebook(toyData) 9 | runUrl("https://github.com/ekstroem/reporteR/raw/master/app/app.zip") 10 | library(shiny) 11 | runUrl("https://github.com/ekstroem/reporteR/raw/master/app/app.zip") 12 | vignette("exending_reporteR") 13 | document(build_vignettes = TRUE 14 | ) 15 | ?document 16 | build() 17 | vignette("extending_reporteR") 18 | vigentte() 19 | vignette(package = "reporteR") 20 | build() 21 | install() 22 | vignette("extending_reporteR") 23 | vignette() 24 | vignette(package = "reporteR") 25 | document() 26 | ?install 27 | install(build_vignettes = TRUE) 28 | library(reporteR) 29 | vignette("extending_reporteR") 30 | library(devtools) 31 | 
install(build_vignettes = TRUE) 32 | library(dataReporter) 33 | data(toyData) 34 | toyData 35 | summarize(toyData) 36 | makeDataReport(toyData) 37 | vignette("extending_dataReporter") 38 | library(tidyverse) 39 | library(dataReporter) 40 | library(labelled) 41 | codebook_data <- rio::import("https://osf.io/s87kd/download", "csv") 42 | val_labels(codebook_data$gender) <- c("male" = 1, "female" = 2) 43 | val_labels(codebook_data$education) <- c("in high school" = 1, 44 | "finished high school" = 2, 45 | "some college" = 3, 46 | "college graduate" = 4, 47 | "graduate degree" = 5) 48 | codebook_data <- codebook_data %>% 49 | select(gender, education, age) 50 | codebook_data 51 | class(codebook_data$gender) 52 | class(codebook_data$education) 53 | variableType(codebook_data$gender) 54 | defaultHavenlabelledSummaries() 55 | variableType(codebook_data$countMissing) 56 | countMissing(codebook_data$gender) 57 | makeCodebook(codebook_data, replace=TRUE, output = "html") 58 | summarize(codebook_data$gender 59 | summarize(codebook_data$gender) 60 | summarize(codebook_data$gender) 61 | summarize(codebook_data$education) 62 | check(codebook_data$gender 63 | check(codebook_data$gender) 64 | check(codebook_data$gender) 65 | check(codebook_data$education) 66 | last_error() 67 | rlang::last_error() 68 | debugonce(check) 69 | check(codebook_data$education) 70 | debugonce(check) 71 | check(codebook_data$education) 72 | defaultHavenlabelledChecks() 73 | debugonce("identifyMissing") 74 | check(codebook_data$education) 75 | reporteR_as_factor(v) 76 | reporteR_as_factor() 77 | reporteR_as_factor 78 | dataReporter_as_factor(codebook_data$education) 79 | dataReporter:::dataReporter_as_factor(codebook_data$education) 80 | devtools::install_github("https://github.com/ekstroem/dataReporter") 81 | dataReporter:::dataReporter_as_factor(codebook_data$education) 82 | library(dataReporter) 83 | dataReporter:::dataReporter_as_factor(codebook_data$education) 84 | dataReporter_as_factor'' 85 | 
dataReporter_as_factor 86 | dataReporter:::dataReporter_as_factor 87 | devtools::check() 88 | install() 89 | devtools::install() 90 | dataReporter:::dataReporter_as_factor(codebook_data$education) 91 | devtools::install() 92 | dataReporter:::dataReporter_as_factor(codebook_data$education) 93 | traceback() 94 | debugonce(dataReporter:::dataReporter_haven_replace_with) 95 | dataReporter:::dataReporter_as_factor(codebook_data$education) 96 | out 97 | out 98 | tagged 99 | debugonce(dataReporter:::dataReporter_haven_replace_with) 100 | dataReporter:::dataReporter_as_factor(codebook_data$education) 101 | !any(tagged) 102 | return(out) 103 | out 104 | debugonce(dataReporter:::dataReporter_as_factor) 105 | dataReporter:::dataReporter_as_factor(codebook_data$education) 106 | levs 107 | debugonce(dataReporter:::dataReporter_as_factor) 108 | dataReporter:::dataReporter_as_factor(codebook_data$education) 109 | stats::setNames(vals, levs) 110 | sort(c(stats::setNames(vals, levs), labels), 111 | na.last = TRUE) 112 | labels 113 | stats::setNames(vals, levs) 114 | c(stats::setNames(vals, levs), labels) 115 | A 116 | codebook_data$gender 117 | codebook_data$education 118 | codebook_data$gender 119 | summarize(codebook_data$gender) 120 | summarize(codebook_data$education) 121 | as_factor 122 | ?as_factor 123 | haven::as_factor 124 | devtools::install() 125 | summarize(codebook_data$education) 126 | makeCodebook(codebook_data, replace=TRUE, output = "html") 127 | getwd() 128 | makeDataReport(codebook_data, codebook = TRUE, replace=TRUE, output = "html") 129 | devtools::install_github("ekstroem/dataReporter") 130 | makeDataReport(codebook_data, replace=TRUE, output = "html") 131 | dataReporter:::doCheckLabs() 132 | dataReporter:::doCheckLabs 133 | attr(codebook_data$education) 134 | ?attr 135 | attr(codebook_data$education, "label") 136 | attr(codebook_data$education, "labels") 137 | attributes(codebook_data$education) 138 | ?attr 139 | ?attributes 140 | 
-------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r 2 | 3 | language: r 4 | pandoc_version: 1.17 5 | sudo: false 6 | cache: packages 7 | 8 | 9 | notifications: 10 | email: 11 | recipients: 12 | - ahpe@sund.ku.dk 13 | - github@ekstroem.com 14 | on_success: never # default: change 15 | on_failure: always # default: always -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: dataReporter 2 | Type: Package 3 | Title: Reproducible Data Screening Checks and Report of Possible Errors 4 | Version: 1.0.5 5 | Date: 2025-04-01 6 | Authors@R: c(person(given="Anne Helby", family="Petersen", email="ahpe@sund.ku.dk", role=c("aut")), 7 | person(given="Claus Thorn", family="Ekstrøm", email="ekstrom@sund.ku.dk", role=c("aut", "cre"))) 8 | Description: Data screening is an important first step of any statistical 9 | analysis. 'dataReporter' auto generates a customizable data report with a thorough 10 | summary of the checks and the results that a human can use to identify possible 11 | errors. It provides an extendable suite of tests for common potential 12 | errors in a dataset. See Petersen AH, Ekstrøm CT (2019). "dataMaid: Your Assistant for Documenting Supervised Data Quality Screening in R." _Journal of Statistical Software_, *90*(6), 1-38 for more information. 
13 | URL: https://github.com/ekstroem/dataReporter 14 | BugReports: https://github.com/ekstroem/dataReporter/issues 15 | Imports: 16 | ggplot2, 17 | gridExtra, 18 | haven, 19 | htmltools, 20 | magrittr, 21 | methods, 22 | pander, 23 | rmarkdown (>= 1.10), 24 | robustbase (>= 0.93-2), 25 | stringi, 26 | whoami, 27 | rlang 28 | Suggests: 29 | knitr, 30 | testthat 31 | Depends: 32 | R (>= 3.5.0) 33 | VignetteBuilder: knitr 34 | SystemRequirements: pandoc (>= 2.0; https://pandoc.org), git, whoami 35 | Encoding: UTF-8 36 | LazyData: true 37 | ByteCompile: true 38 | License: GPL-2 39 | RoxygenNote: 7.3.1 40 | Collate: 41 | 'aggregateForBarplot.R' 42 | 'aggregateForHistogram.R' 43 | 'allCheckFunctions.R' 44 | 'allClasses.R' 45 | 'allSummaryFunctions.R' 46 | 'allVisualFunctions.R' 47 | 'allXFunctions.R' 48 | 'makeXFunction.R' 49 | 'visualFunction.R' 50 | 'basicVisual.R' 51 | 'summaryFunction.R' 52 | 'centralValue.R' 53 | 'check.R' 54 | 'checkResult.R' 55 | 'messageGenerator.R' 56 | 'checkFunction.R' 57 | 'identifyMissing.R' 58 | 'minMax.R' 59 | 'classes.R' 60 | 'countMissing.R' 61 | 'dataReporter-package.R' 62 | 'dataReporter_as_factor.R' 63 | 'description.R' 64 | 'identifyCaseIssues.R' 65 | 'identifyLoners.R' 66 | 'identifyNums.R' 67 | 'identifyOutliers.R' 68 | 'identifyOutliersTBStyle.R' 69 | 'identifyWhitespace.R' 70 | 'isCPR.R' 71 | 'isSingular.R' 72 | 'isEmpty.R' 73 | 'isKey.R' 74 | 'isSupported.R' 75 | 'makeCodebook.R' 76 | 'makeDataReport.R' 77 | 'misc.R' 78 | 'quartiles.R' 79 | 'refCat.R' 80 | 'render.R' 81 | 'setChecks.R' 82 | 'setSummaries.R' 83 | 'setVisuals.R' 84 | 'smartNum.R' 85 | 'standardVisual.R' 86 | 'summarize.R' 87 | 'summaryResult.R' 88 | 'tableVisual.R' 89 | 'uniqueValues.R' 90 | 'unpackLabelled.R' 91 | 'utility.R' 92 | 'variableType.R' 93 | 'visualize.R' 94 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # dataReporter 
# Tabulates v so that a bar plot can be drawn from one row per distinct
# value instead of one row per observation: the amount of plotting code
# then depends on the number of levels, not on the length of v.
# Returns a data.frame with columns x (the distinct values) and y (how
# often each one occurs).
aggregateForBarplot <- function(v) {
  counts <- data.frame(table(v))
  colnames(counts) <- c("x", "y")
  counts
}
# Bins v with hist() and returns the bins as rectangle coordinates, one
# row per bin (columns xmin, xmax, ymin, ymax), so that the amount of
# plotting code depends on the number of bins, not on the length of v.
# `breaks` is handed straight to hist(); "Sturges" is hist()'s own default
# and only needs to be overridden for Date variables.
aggregateForHistogram <- function(v, breaks = "Sturges") {
  binned <- hist(v, plot = FALSE, breaks = breaks)
  edges <- binned$breaks
  data.frame(xmin = edges[-length(edges)],  # left edge of every bin
             xmax = edges[-1],              # right edge of every bin
             ymin = 0,
             ymax = binned$counts)
}
#' @title Vector of all variable classes in \code{dataReporter}
#'
#' @description Returns the names of the eight data classes for which
#' \code{dataReporter} is implemented, namely \code{"character"}, \code{"Date"},
#' \code{"factor"}, \code{"integer"}, \code{"labelled"},
#' \code{"haven_labelled"}, \code{"logical"} and
#' \code{"numeric"}.
#'
#' @return A character vector with the eight class names.
#'
#' @examples
#' allClasses()
#'
#' @export
allClasses <- function() {
  supported <- c("character", "Date", "factor", "integer", "labelled",
                 "haven_labelled", "logical", "numeric")
  supported
}
#' This object has entries \code{$name}
#' (the function names), \code{$description} (the function descriptions, as obtained from their
#' \code{description} attributes) and \code{$classes} (the classes each function is intended
#' to be called on, as obtained from their \code{classes} attributes).
#'
#' @seealso \code{\link{summaryFunction}} \code{\link{allVisualFunctions}}
#' \code{\link{allCheckFunctions}}
#'
#' @examples
#' allSummaryFunctions()
#'
#' @export
allSummaryFunctions <- function() {
  # Delegates to the shared helper, which keeps the objects whose class
  # vector contains "summaryFunction".
  allXFunctions("summaryFunction")
}
#Make a functionSummary of all functions of type X, that is,
#collect their names, description attributes and classes attributes
#into a list.
#X is the class name to look for; an object qualifies when X is among
#its classes. Candidates are all objects in the global environment plus
#the objects dataReporter makes available.
#Returns a list of class c("functionSummary", "list") with entries
#name, description and classes.
#Called from allVisualFunctions(), allSummaryFunctions(), allCheckFunctions().
allXFunctions <- function(X) {
  #ls("package:dataReporter") only works while the package is attached.
  #Fall back to the namespace exports so that e.g.
  #dataReporter::allCheckFunctions() also works without library(dataReporter).
  pkgNames <- if ("package:dataReporter" %in% search()) {
    ls("package:dataReporter")
  } else {
    sort(getNamespaceExports("dataReporter"))
  }
  candidates <- union(ls(envir = .GlobalEnv), pkgNames)

  allF <- Filter(function(x) X %in% class(get(x)), candidates)
  out <- list(name = allF,
              description = sapply(allF, function(x) description(get(x))),
              classes = lapply(allF, function(x) classes(get(x))))
  class(out) <- c("functionSummary", "list")
  out
}
#'
#' @details Note that NA, NaN and Inf values are ignored for numeric and integer variables, while
#' only NA values are ignored for factor, character, Date and (haven_)labelled variables. No values are
#' ignored for logical variables.
#'
#' @return An object of class \code{summaryResult} with the following entries: \code{$feature}
#' (the mode/median), \code{$result} (the central value of \code{v}) and \code{$value} (identical
#' to \code{$result}).
#'
#' If the mode is returned and it is not uniquely determined, the first value qualifying as a mode is
#' returned, when the variable is sorted according to \code{\link{sort}}.
#'
#' @seealso \code{\link{summaryFunction}}, \code{\link{summarize}}, \code{\link{summaryResult}},
#' \code{\link{allSummaryFunctions}}
#'
#' @examples
#' #central value of an integer variable:
#' centralValue(c(rep(1, 25), rep(2, 10), rep(3, 20)))
#'
#' #central value of a character variable:
#' centralValue(as.character(c(rep(1, 20), rep(2, 10), rep(3, 20))))
#'
#' @importFrom stats na.omit median
#' @export
centralValue <- function(v, ...) {
  UseMethod("centralValue")
}


#S3 methods: each one only routes v to the helper for its variable type

#' @export
centralValue.character <- function(v, ...) {
  centralValueCF(v)
}

#' @export
centralValue.factor <- function(v, ...) {
  centralValueCF(v)
}

#' @export
centralValue.labelled <- function(v, ...) {
  centralValueL(v)
}


#' @export
centralValue.haven_labelled <- function(v, ...) {
  centralValueL(v)
}


#only the numeric/integer methods forward the extra arguments (maxDecimals)

#' @export
centralValue.numeric <- function(v, ...) {
  centralValueIN(v, ...)
}

#' @export
centralValue.integer <- function(v, ...) {
  centralValueIN(v, ...)
}

#' @export
centralValue.logical <- function(v, ...) {
  centralValueB(v)
}

#' @export
centralValue.Date <- function(v, ...) {
  centralValueCF(v)
}


#register the generic as a summaryFunction with a description and the
#classes it can be called on

#' @include summaryFunction.R
centralValue <- summaryFunction(centralValue,
                                "Compute median for numeric variables, mode for categorical variables",
                                allClasses())


##########################################Not exported below#########################################


#helpers for each variable type

#logical variables: the mode, with NA counted as a value of its own
#(exclude = NULL). On ties which.max() picks the first maximum of the table.
centralValueB <- function(v) {
  counts <- table(v, exclude = NULL)
  vMode <- names(which.max(counts))[1]
  quotedMode <- paste0("\"", escapeRMDStyle(vMode), "\"")
  summaryResult(list(feature = "Mode",
                     result = quotedMode,
                     value = vMode))
}

#character, factor and Date variables: the mode after dropping NAs
##' @importFrom stats na.omit
centralValueCF <- function(v) {
  nonMissing <- na.omit(v)
  centralValueB(nonMissing)
}

#labelled variables: convert to factor first, then as for factors
centralValueL <- function(v) {
  asFactor <- dataReporter_as_factor(v)
  centralValueB(na.omit(asFactor))
}

#integer and numeric variables: the median after dropping missing values;
#$result is rounded to maxDecimals, $value is left unrounded.
#NOTE(review): na.omit() drops NA and NaN but leaves Inf/-Inf in place,
#whereas the roxygen @details say Inf values are ignored - confirm which
#of the two is intended.
##' @importFrom stats median na.omit
centralValueIN <- function(v, maxDecimals = 2) {
  med <- median(na.omit(v))
  summaryResult(list(feature = "Median",
                     result = round(med, maxDecimals),
                     value = med))
}
#' Note that
#' \code{$message} and \code{$problemValues} can be left empty (i.e.
#' \code{""} and \code{NULL}, respectively), if they are not relevant.
#'
#' @return An S3 object of class \code{checkResult}, identical to the inputted
#' list, \code{ls}, except for its class attribute.
#'
#' @seealso \code{\link{checkFunction}}
#'
#' @export
checkResult <- function(ls) {
  #Only the three documented entries are accepted; any other name means
  #the list is not a checkResult.
  extraNames <- setdiff(names(ls), c("problem", "message", "problemValues"))
  if (length(extraNames) != 0) {
    stop("The inputted list does not qualify as a checkResult")
  }
  class(ls) <- "checkResult"
  ls
}


#Print method for checkResult objects: writes the message if a problem
#was found and "No problems found." otherwise. Returns x invisibly.
#' @export
print.checkResult <- function(x, ...) {
  mes <- if (x$problem) x$message else "No problems found."

  #remove escaping and quoting designed for rmarkdown rendering
  mes <- gsub("\\\\\"", "", mes)

  cat(mes)

  #print methods should hand their argument back invisibly; previously the
  #NULL from cat() was returned, so `y <- print(res)` lost the object
  invisible(x)
}
#'
#' @examples
#' #Extract the classes of the checkFunction identifyMissing
#' classes(identifyMissing)
#'
#' #Extract the classes of the summaryFunction minMax
#' classes(minMax)
#'
#' #Extract the classes of the visualFunction basicVisual
#' classes(basicVisual)
#'
#' @include minMax.R identifyMissing.R basicVisual.R
#'
#' @export
classes <- function(x) {
  UseMethod("classes")
}

#anything that is not a check-, summary- or visualFunction has no classes
#' @export
classes.default <- function(x) {
  NULL
}

#the three function types all keep the information in the attribute "classes"

#' @export
classes.checkFunction <- function(x) {
  attr(x, "classes")
}

#' @export
classes.summaryFunction <- function(x) {
  attr(x, "classes")
}

#' @export
classes.visualFunction <- function(x) {
  attr(x, "classes")
}


#replacement form: overwrite the attribute "classes" and return the object

#' @rdname classes
#' @usage classes(x) <- value
#' @param value New value
#' @export classes<-
`classes<-` <- function(x, value) {
  attr(x, "classes") <- value
  x
}
#'
#' @seealso \code{\link{summarize}}, \code{\link{allSummaryFunctions}},
#' \code{\link{summaryFunction}}, \code{\link{summaryResult}}
#'
#' @examples
#' countMissing(c(1:100, rep(NA, 10)))
#'
#' @export
countMissing <- function(v, ...) {
  nObs <- length(v)
  noMissing <- sum(is.na(v))

  #An empty variable has nothing missing: report 0 % rather than the NaN
  #that 0/0 would produce.
  percentMissing <- if (nObs > 0) round(100 * noMissing / nObs, 2) else 0

  summaryResult(list(feature = "Number of missing obs.",
                     result = paste0(noMissing, " (", percentMissing, " %)"),
                     value = noMissing))
}

#register countMissing as a summaryFunction with a description and the
#classes it can be called on
#' @include summaryFunction.R
countMissing <- summaryFunction(countMissing,
                                "Compute proportion of missing observations",
                                allClasses())
## Recodes x by looking each element up in `from` and returning the element
## of `to` at the same position; `from` and `to` must have equal length.
## Elements of x without a match become NA. Tagged NAs (haven) are matched
## in a second pass on their tags.
dataReporter_haven_replace_with_old <- function(x, from, to)
{
  stopifnot(length(from) == length(to))

  ## First pass: ordinary values. NA is never treated as a match.
  idx <- match(x, from, incomparables = NA)
  found <- !is.na(idx)

  if (all(found)) {
    ## everything matched: take the recoded values directly from `to`
    out <- to[idx]
  } else {
    ## otherwise start from an all-NA vector and fill in the matches only
    out <- rep(NA, length(x))
    out[found] <- to[idx[found]]
  }

  ## Second pass: only needed when x holds tagged NAs, which are matched
  ## on their tags.
  tagged <- haven::is_tagged_na(x)
  if (any(tagged)) {
    tagIdx <- match(haven::na_tag(x), haven::na_tag(from), incomparables = NA)
    tagFound <- !is.na(tagIdx)
    out[tagFound] <- to[tagIdx[tagFound]]
  }

  out
}
#'
#' @examples
#' #Extract the description of the checkFunction identifyMissing
#' description(identifyMissing)
#'
#' #Extract the description of the summaryFunction minMax
#' description(minMax)
#'
#' #Extract the description of the visualFunction basicVisual
#' description(basicVisual)
#'
#' @include minMax.R identifyMissing.R basicVisual.R
#'
#' @export
description <- function(x) {
  UseMethod("description")
}

#fallback: the expression that was passed as x, as a character string
#NOTE(review): the roxygen text for description() says NULL is returned
#for other objects, but this method returns the deparsed argument -
#confirm which of the two is intended.
#' @export
description.default <- function(x) {
  deparse(substitute(x))
}

#the three function types all keep the text in the attribute "description"

#' @export
description.checkFunction <- function(x) {
  attr(x, "description")
}

#' @export
description.summaryFunction <- function(x) {
  attr(x, "description")
}

#' @export
description.visualFunction <- function(x) {
  attr(x, "description")
}


#replacement form: overwrite the attribute "description" and return the object

#' @rdname description
#' @usage description(x) <- value
#' @param value New value
#' @export description<-
`description<-` <- function(x, value) {
  attr(x, "description") <- value
  x
}
#'
#' @return A \code{\link{checkResult}} with three entries:
#' \code{$problem} (a logical indicating whether case issues were found),
#' \code{$message} (a message describing which values in \code{v} resulted
#' in case issues) and \code{$problemValues} (the problematic values
#' in their original format). Note that only unique problematic values
#' are listed and they are presented in alphabetical order.
#'
#' @seealso \code{\link{check}}, \code{\link{allCheckFunctions}},
#' \code{\link{checkFunction}}, \code{\link{checkResult}}
#'
#' @examples
#' identifyCaseIssues(c("val", "b", "1", "1", "vAl", "VAL", "oh", "OH"))
#'
#' @importFrom stats na.omit
#' @export
identifyCaseIssues <- function(v, nMax = 10) {
  UseMethod("identifyCaseIssues")
}


#add methods to generic identifyCaseIssues function
#' @export
identifyCaseIssues.character <- function(v, nMax = 10) {
  identifyCaseIssuesC(v, nMax = nMax)
}

#' @export
identifyCaseIssues.factor <- function(v, nMax = 10) {
  identifyCaseIssuesF(v, nMax = nMax)
}

#' @export
identifyCaseIssues.labelled <- function(v, nMax = 10) {
  identifyCaseIssuesL(v, nMax = nMax)
}

#' @export
identifyCaseIssues.haven_labelled <- function(v, nMax = 10) {
  identifyCaseIssuesL(v, nMax = nMax)
}

#make it a checkFunction
# NOTE(review): methods exist for labelled and haven_labelled as well, but only
# character and factor are listed as classes here - confirm this is intended.
#' @include checkFunction.R
identifyCaseIssues <- checkFunction(identifyCaseIssues, "Identify case issues",
                                    c("character", "factor"))


##########################################Not exported below#########################################

identifyCaseIssuesMessage <- "Note that there might be case problems with the following levels:"

#character variable
identifyCaseIssuesC <- function(v, nMax) {
  uniqueVals <- unique(na.omit(v))
  lowered <- tolower(uniqueVals)

  # a value clashes if another distinct value has the same lower-case form
  clash <- duplicated(lowered) | duplicated(lowered, fromLast = TRUE)

  problem <- any(clash)
  problemValues <- if (problem) sort(uniqueVals[clash]) else NULL

  outMessage <- messageGenerator(list(problem = problem,
                                      problemValues = problemValues),
                                 message = identifyCaseIssuesMessage,
                                 nMax = nMax)
  checkResult(list(problem = problem, message = outMessage,
                   problemValues = problemValues))
}


#factor variable: check the labels as plain character values
identifyCaseIssuesF <- function(v, nMax) {
  identifyCaseIssuesC(as.character(v), nMax = nMax)
}

#labelled variable: convert to factor first, then reuse the factor check
identifyCaseIssuesL <- function(v, nMax) {
  asFactor <- dataReporter_as_factor(v)
  identifyCaseIssuesF(asFactor, nMax)
}

# --------------------------------------------------------------------------------
# /R/identifyLoners.R:
# --------------------------------------------------------------------------------
#' A checkFunction for identifying sparsely represented values (loners)
#'
#' A \code{\link{checkFunction}} to be called from \code{\link{check}} that identifies values that
#' occur less than 6 times in factor, (haven_)labelled, or character variables (that is, loners).
#'
#' @param v A character, (haven_)labelled, or factor variable to check.
#'
#' @param nMax The maximum number of problematic values to report.
#' Default is \code{10}. Set to \code{Inf} if all problematic values are to be included
#' in the outputted message, or to \code{0} for no output.
#'
#' @return A \code{\link{checkResult}} with three entries:
#' \code{$problem} (a logical indicating whether loners were found),
#' \code{$message} (a message describing which values in \code{v} were loners) and
#' \code{$problemValues} (the problematic values in their original format).
#' Note that only unique problematic values
#' are listed and they are presented in alphabetical order.
#'
#' @details For character, (haven_)labelled, and factor variables, identify values that only have a
#' very low number of observations, as these categories might be
#' problematic when conducting an analysis. Unused factor levels are
#' not considered "loners". "Loners" are defined as values with 5 or fewer
#' observations, reflecting the commonly used rule of thumb for performing
#' chi squared tests.
#'
#' @seealso \code{\link{check}}, \code{\link{allCheckFunctions}},
#' \code{\link{checkFunction}}, \code{\link{checkResult}}
#'
#' @examples
#' identifyLoners(c(rep(c("a", "b", "c"), 10), "d", "d"))
#'
#' @importFrom stats na.omit
#' @export
identifyLoners <- function(v, nMax = 10) {
  UseMethod("identifyLoners")
}


#add methods to generic identifyLoners function
#' @export
identifyLoners.factor <- function(v, nMax = 10) {
  identifyLonersF(v, nMax = nMax)
}
#' @export
identifyLoners.labelled <- function(v, nMax = 10) {
  identifyLonersL(v, nMax = nMax)
}
#' @export
identifyLoners.haven_labelled <- function(v, nMax = 10) {
  identifyLonersL(v, nMax = nMax)
}
#' @export
identifyLoners.character <- function(v, nMax = 10) {
  identifyLonersC(v, nMax = nMax)
}


#make it a checkFunction
# NOTE(review): methods exist for labelled and haven_labelled as well, but only
# character and factor are listed as classes here - confirm this is intended.
#' @include checkFunction.R
identifyLoners <- checkFunction(identifyLoners, "Identify levels with < 6 obs.",
                                c("character", "factor"))

##########################################Not exported below#########################################

identifyLonersMessage <- "Note that the following levels have at most five observations:"

#For character/factor variables, identify values that only have a
#very low number of observations, as these categories might be
#problematic when conducting an analysis. Unused factor levels are
#not considered "loners". "Loners" have 5 or less observations.


#factor variables
identifyLonersF <- function(v, nMax) {
  # re-creating the factor without missing values drops unused levels
  used <- factor(na.omit(v))
  counts <- as.vector(table(used))
  loners <- levels(used)[counts <= 5]

  problem <- length(loners) > 0
  problemValues <- if (problem) loners else NULL

  outMessage <- messageGenerator(list(problem = problem,
                                      problemValues = problemValues),
                                 message = identifyLonersMessage,
                                 nMax = nMax)
  checkResult(list(problem = problem, message = outMessage,
                   problemValues = problemValues))
}

#character variables: treat the distinct strings as factor levels
identifyLonersC <- function(v, nMax) {
  identifyLonersF(factor(v), nMax)
}

#labelled variables: convert to factor first
identifyLonersL <- function(v, nMax) {
  identifyLonersF(dataReporter_as_factor(v), nMax)
}

# --------------------------------------------------------------------------------
# /R/identifyNums.R:
# --------------------------------------------------------------------------------
#' @title A checkFunction for identifying misclassified numeric variables
#'
#' @description A \code{\link{checkFunction}} to be called from
#' \code{\link{check}} for identifying numeric variables that have
#' been misclassified as categorical.
#'
#' @param v A character, factor, or (haven_)labelled variable to check.
#'
#' @param nVals An integer determining how many unique values a variable must have
#' before it can potentially be determined to be a misclassified numeric variable.
#' The default is \code{12}.
#'
#' @param ... Not in use.
#'
#' @return A \code{\link{checkResult}} with three entries:
#' \code{$problem} (a logical indicating the variable is suspected to be
#' a misclassified numeric variable), \code{$message} (if a problem was found,
#' the following message: "Note: The variable consists exclusively of numbers and takes
#' a lot of different values.
#' Is it perhaps a misclassified numeric variable?",
#' otherwise "") and \code{$problemValues} (always \code{NULL}).
#'
#' @details A categorical variable is suspected to be a misclassified
#' numeric variable if it has the following two properties: First,
#' it should consist exclusively of numbers (possibly including signs
#' and decimal points). Secondly, it must have at least \code{nVals} unique values.
#' The default value of \code{nVals} is 12, which means that
#' e.g. variables including answers on a scale from 0-10 will
#' not be recognized as misclassified numerics. Both \code{labelled} and
#' \code{haven_labelled} variables are converted to factors before the
#' check, so it is their labels that are inspected.
#'
#' @seealso \code{\link{check}}, \code{\link{allCheckFunctions}},
#' \code{\link{checkFunction}}, \code{\link{checkResult}}
#'
#' @examples
#' #Positive and negative numbers, saved as characters
#' identifyNums(c(as.character(-9:9)))
#'
#' #An ordinary character variable
#' identifyNums(c("a", "b", "c", "d", "e.f", "-a", 1:100))
#'
#'
#' @importFrom stats na.omit
#' @importFrom haven as_factor
#' @export
identifyNums <- function(v, nVals = 12, ...) {
  out <- list(problem = FALSE, message = "", problemValues = NULL)

  #note: update to haven made as_factor not work on character variables!
  # Unpack both flavours of labelled variables; previously only "labelled" was
  # handled here, so haven_labelled variables were checked on their raw codes.
  if (inherits(v, c("labelled", "haven_labelled"))) {
    v <- dataReporter_as_factor(v)
  }
  v <- na.omit(as.character(v))

  # too few distinct values to be suspicious
  if (length(unique(v)) < nVals) {
    return(checkResult(out))
  }

  v[v == ""] <- "a" #make sure v contains no empty strings
  v <- gsub("^-{1}", "", v) #remove signs (prefixed -)
  v <- sub("\\.{1}", "", v) #remove decimal points (first one only)
  v <- gsub("[[:digit:]]", "", v) #replace numbers with empty strings

  # nothing left in any value means every value was made up of digits only
  if (sum(nchar(v)) == 0) {
    out$problem <- TRUE
    out$message <- "Note: The variable consists exclusively of numbers and takes a lot of different values. Is it perhaps a misclassified numeric variable?"
  }
  checkResult(out)
}

#' @include checkFunction.R
identifyNums <- checkFunction(identifyNums,
                              "Identify misclassified numeric or integer variables",
                              c("character", "factor", "labelled", "haven_labelled"))

# --------------------------------------------------------------------------------
# /R/identifyWhitespace.R:
# --------------------------------------------------------------------------------
#' @title A checkFunction for identifying whitespace
#'
#' @description A checkFunction to be called from \code{\link{check}}
#' that identifies prefixed and suffixed whitespace(s) in character,
#' (haven_)labelled or factor variables.
#'
#' @param v A character, (haven_)labelled or factor variable to check.
#'
#' @param nMax The maximum number of problematic values to report.
#' Default is \code{10}. Set to \code{Inf} if all problematic values are to be included
#' in the outputted message, or to \code{0} for no output.
#'
#' @return A \code{\link{checkResult}} with three entries:
#' \code{$problem} (a logical indicating whether any whitespaces were
#' found), \code{$message} (a message describing which values were prefixed
#' or suffixed with whitespace) and \code{$problemValues} (the problematic
#' values). Note that only unique values are printed in the message, and that
#' they are sorted alphabetically.
#'
#' @seealso \code{\link{check}}, \code{\link{allCheckFunctions}},
#' \code{\link{checkFunction}}, \code{\link{checkResult}}
#'
#' @examples
#' identifyWhitespace(c("a", " b", "c", "d ", "e  "))
#'
#' @importFrom stats na.omit
#' @importFrom utils tail
#' @export
identifyWhitespace <- function(v, nMax = 10) {
  UseMethod("identifyWhitespace")
}


#add methods to generic identifyWhitespace function

#'@export
identifyWhitespace.character <- function(v, nMax = 10) {
  identifyWhitespaceC(v, nMax = nMax)
}

#'@export
identifyWhitespace.factor <- function(v, nMax = 10) {
  identifyWhitespaceF(v, nMax = nMax)
}

#'@export
identifyWhitespace.labelled <- function(v, nMax = 10) {
  identifyWhitespaceL(v, nMax = nMax)
}

#'@export
identifyWhitespace.haven_labelled <- function(v, nMax = 10) {
  identifyWhitespaceL(v, nMax = nMax)
}

#make it a checkFunction
#' @include checkFunction.R
identifyWhitespace <- checkFunction(identifyWhitespace, "Identify prefixed and suffixed whitespace",
                                    c("character", "factor", "labelled", "haven_labelled"))


##########################################Not exported below#########################################
identifyWhitespaceMessage <- "The following values appear with prefixed or suffixed white space:"

#character variables
identifyWhitespaceC <- function(v, nMax) {
  v <- na.omit(v)

  # compare the first and the last character of every value with a blank
  len <- nchar(v)
  leading <- substr(v, 1, 1) == " "
  trailing <- substr(v, len, len) == " "
  hasWs <- leading | trailing

  problem <- any(hasWs)
  problemValues <- if (problem) unique(v[hasWs]) else NULL

  outMessage <- messageGenerator(list(problem = problem,
                                      problemValues = problemValues),
                                 message = identifyWhitespaceMessage,
                                 nMax = nMax)
  checkResult(list(problem = problem, message = outMessage,
                   problemValues = problemValues))
}

#factor variables: check the labels as plain character values
identifyWhitespaceF <- function(v, nMax) {
  identifyWhitespaceC(as.character(v), nMax)
}

#labelled variables: convert to factor and drop missing values first
identifyWhitespaceL <- function(v, nMax) {
  identifyWhitespaceF(na.omit(dataReporter_as_factor(v)), nMax = nMax)
}



# --------------------------------------------------------------------------------
# /R/isCPR.R:
# --------------------------------------------------------------------------------
#' @title Check if a variable consists of Danish CPR numbers
#'
#' @description A \code{\link{checkFunction}} that checks if \code{v} consists exclusively
#' of valid Danish civil registration (CPR) numbers, ignoring missing values. This
#' function is intended for use as a precheck in \code{\link{makeDataReport}}, ensuring
#' that CPR numbers are not included in a \code{dataReporter} output document.
#'
#' @param v A variable (vector) to check. This variable is allowed to have any class.
#'
#' @param ... Not in use.
#'
#' @return A \code{\link{checkResult}} with three entries:
#' \code{$problem} (a logical indicating whether the variable consists
#' of CPR numbers), \code{$message} (if a problem was found,
#' the following message: "Warning: The variable seems to consist of
#' Danish civil registration (CPR) numbers.",
#' otherwise "") and \code{$problemValues} (always \code{NULL}).
#'
#' @examples
#'
#' CPRs <- c("010188-3639", "020187-1476", "040506-8664", "010290-3684", "010291-1180",
#' "010293-1599", "010294-1268", "010295-1360", "010296-3970", "010297-2007",
#' "010270-2905", "010271-0134", "010272-1403", "010273-3088", "010274-1633")
#' nonCPRs <- c(1:10)
#' mixedCPRs <- c(CPRs, nonCPRs)
#'
#' #identify problem
#' isCPR(CPRs)
#'
#' #no problem as there are no CPRs
#' isCPR(nonCPRs)
#'
#' #no problem because not ALL values are CPRs
#' isCPR(mixedCPRs)
#'
#' @seealso \code{\link{check}}, \code{\link{allCheckFunctions}},
#' \code{\link{checkFunction}}, \code{\link{checkResult}}
#'
#' @importFrom stats na.omit
#' @importFrom haven as_factor
#' @export
isCPR <- function(v, ...) { #Note: Implementation works until the year 2036...
  out <- list(problem = FALSE, message = "", problemValues = NULL)
  m <- "Warning: The variable seems to consist of Danish civil registration (CPR) numbers."

  if (inherits(v, c("labelled", "haven_labelled"))) {
    v <- dataReporter_as_factor(v)
  }

  v <- as.character(na.omit(v))
  if (length(v) == 0) return(checkResult(out)) #if v consists only of NAs

  # All values must share one layout: ten digits, or six digits, "-", four digits
  chars <- nchar(v)
  if (all(chars == 10)) {
    layout <- "^[0-9]{10}$"
  } else if (all(chars == 11)) {
    layout <- "^[0-9]{6}-[0-9]{4}$"
  } else {
    return(checkResult(out))
  }
  if (!all(grepl(layout, v))) return(checkResult(out))

  # The first six digits must form a valid DDMMYY date
  if (!all(isDanishDate(substring(v, 1, 6)))) return(checkResult(out))

  v <- gsub("-", "", v)

  year <- as.numeric(substring(v, 5, 6))
  digit7 <- as.numeric(substring(v, 7, 7))

  # Values for which the modulus 11 control below is skipped
  # (presumably numbers issued without a check digit - TODO confirm the rule)
  noCheckPl <- year < 36 & year >= 7 & digit7 >= 4 #is this right?

  if (!all(noCheckPl)) {
    # modulus 11 control: the weighted digit sum must be divisible by 11
    weights <- c(4, 3, 2, 7, 6, 5, 4, 3, 2, 1)
    passes <- vapply(strsplit(v[!noCheckPl], ""),
                     function(digits) sum(as.numeric(digits) * weights) %% 11 == 0,
                     logical(1))
    if (!all(passes)) return(checkResult(out))
  }

  out$problem <- TRUE
  out$message <- m
  checkResult(out)
}


#Make it a checkFunction
#' @include checkFunction.R allClasses.R
isCPR <- checkFunction(isCPR, "Identify Danish CPR numbers",
                       classes = allClasses())


##########################################Not exported below#########################################

#Checks whether strs contains only entries on the form DDMMYY.
#Returns a single TRUE/FALSE: FALSE as soon as one entry is not six digits,
#has a month outside 01-12 or a day outside 01-(days in that month).
#The year is not consulted, so 29 February is always accepted.
isDanishDate <- function(strs) {
  if (!all(grepl("^[0-9]{6}$", strs))) return(FALSE)

  ds <- as.numeric(substring(strs, 1, 2))
  ms <- as.numeric(substring(strs, 3, 4))

  # months must be checked before they are used as an index below
  if (any(ms < 1 | ms > 12)) return(FALSE)

  mds <- c(31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)
  maxDs <- mds[ms]

  if (any(ds < 1 | ds > maxDs)) return(FALSE)

  TRUE
}

# --------------------------------------------------------------------------------
# /R/isEmpty.R:
# --------------------------------------------------------------------------------
#' @title Old version of isSingular, kept for compatibility
#' @rdname isSingular
#' @include isSingular.R
#' @export
isEmpty <- isSingular
# --------------------------------------------------------------------------------
# /R/isKey.R:
# --------------------------------------------------------------------------------
#' @title Check if a variable qualifies as a key
#'
#' @description A \code{\link{checkFunction}} that checks if \code{v}
#' is a key, that is, if every observation has a unique value in \code{v} and
#' \code{v} is not a numeric/integer nor a Date variable. This
#' function is intended for use as a precheck in \code{\link{makeDataReport}}.
#'
#' @param v A variable (vector) to check. All variable types are allowed.
#'
#' @return A \code{\link{checkResult}} with three entries:
#' \code{$problem} (a logical indicating whether \code{v} is a key),
#' \code{$message} (if a problem was found, the following message:
#' "The variable is a key (distinct values for each observation).",
#' otherwise "") and \code{$problemValues} (always \code{NULL}).
#'
#' @details Note that numeric or integer variables are not considered candidates
#' for keys, as truly continuous measurements will most likely result in unique
#' values for each observation.
#'
#' @examples
#' keyVar <- c("a", "b", "c", "d", "e", "f")
#' notKeyVar <- c("a", "a", "b", "c", "d", "e", "f")
#'
#' isKey(keyVar)
#' isKey(notKeyVar)
#'
#' @seealso \code{\link{check}}, \code{\link{allCheckFunctions}},
#' \code{\link{checkFunction}}, \code{\link{checkResult}}
#'
#' @export
isKey <- function(v) {
  allDistinct <- length(unique(v)) == length(v)
  # numeric, integer and Date variables are never reported as keys
  excludedType <- inherits(v, c("numeric", "integer", "Date"))

  if (allDistinct && !excludedType) {
    return(checkResult(list(
      problem = TRUE,
      message = "The variable is a key (distinct values for each observation).",
      problemValues = NULL
    )))
  }
  checkResult(list(problem = FALSE, message = "", problemValues = NULL))
}


#make it a checkFunction
#' @include allClasses.R checkFunction.R
isKey <- checkFunction(isKey, "Check if the variable is a key", allClasses())

# --------------------------------------------------------------------------------
# /R/isSingular.R:
# --------------------------------------------------------------------------------
#' @title Check if a variable only contains a single value
#'
#' @description A \code{\link{checkFunction}} that checks if \code{v} only
#' contains a single unique value, aside from missing values. This
#' function is intended for use as a precheck in \code{\link{makeDataReport}}.
#'
#' @param v A variable (vector) to check. All variable types are allowed.
#'
#' @return A \code{\link{checkResult}} with three entries:
#' \code{$problem} (a logical indicating whether \code{v} contains only one value),
#' \code{$message} (if a problem was found, a message describing which single
#' value the variable takes and how many missing observations it contains, otherwise
#' ""), and \code{$problemValues} (always \code{NULL}).
#'
#' @examples
#' singularVar <- c(rep("a", 10), NA, NA)
#' notSingularVar <- c("a", "a", "b", "c", "d", "e", "f", NA, NA)
#'
#' isSingular(singularVar)
#' isSingular(notSingularVar)
#'
#' @seealso \code{\link{check}}, \code{\link{allCheckFunctions}},
#' \code{\link{checkFunction}}, \code{\link{checkResult}}
#'
#' @importFrom haven as_factor
#' @importFrom stats na.omit
#'
#' @export
isSingular <- function(v) {
  lV <- length(v)

  #otherwise na.omit does not work
  if (inherits(v, c("labelled", "haven_labelled"))) {
    v <- dataReporter_as_factor(v)
  }

  v <- na.omit(v)
  out <- list(problem = FALSE, message = "", problemValues = NULL)
  nVals <- length(unique(v))

  if (nVals <= 1) {
    out$problem <- TRUE
    if (nVals == 0) {
      # only missing values: report NA as the single value, no percentage
      out$message <- paste("The variable only takes one ",
                           "value: ", printProblemValues("NA"), ".",
                           sep = "")
    } else {
      pctMiss <- round(100 * (lV - length(v)) / lV, 2)
      out$message <- paste("The variable only takes one ",
                           "(non-missing) ",
                           "value: ", printProblemValues(as.character(v[1])), ".",
                           paste(" The variable contains",
                                 pctMiss,
                                 "\\% missing observations."),
                           sep = "")
    }
  }
  checkResult(out)
}


#make it a checkFunction
#' @include allClasses.R checkFunction.R
isSingular <- checkFunction(isSingular,
                            "Check if the variable contains only a single value",
                            allClasses())

# --------------------------------------------------------------------------------
# /R/isSupported.R:
# --------------------------------------------------------------------------------
#' @title Check if a variable has a class supported by dataReporter
#'
#' @description A \code{\link{checkFunction}} that checks if \code{v} has
#' one of the classes supported by dataReporter, namely \code{character},
#' \code{factor}, \code{numeric}, \code{integer}, \code{labelled},
#' \code{haven_labelled},
#' \code{logical} and \code{Date}
#' (including other classes that inherit
#' from any of these classes). A user supported list can be provided
#' in the \code{treatXasY} argument, which will let the user decide
#' how unsupported classes should be treated. This
#' function is intended for use as a precheck in \code{\link{makeDataReport}}.
#'
#' @param v A variable (vector) to check. All variable types are allowed.
#'
#' @return A \code{\link{checkResult}} with three entries:
#' \code{$problem} (a logical indicating whether the class of \code{v} is
#' unsupported), \code{$message} (if a problem was found, a message stating
#' the first class of \code{v} and that it is not supported by dataReporter,
#' otherwise ""), and \code{$problemValues} (always \code{NULL}).
#'
#' @examples
#' integerVar <- 1:10 #supported
#' rawVar <- as.raw(1:10) #not supported
#'
#' isSupported(integerVar)
#' isSupported(rawVar)
#'
#' @seealso \code{\link{check}}, \code{\link{allCheckFunctions}},
#' \code{\link{checkFunction}}, \code{\link{checkResult}}
#'
#' @export
isSupported <- function(v) {
  suppClasses <- c("character", "factor", "labelled", "haven_labelled",
                   "numeric", "integer",
                   "logical", "Date")
  vClasses <- class(v)

  # one supported entry anywhere in the class vector is enough
  supported <- any(vClasses %in% suppClasses)

  out <- list(problem = !supported, message = "", problemValues = NULL)
  if (!supported) {
    out$message <- paste("The variable has class", vClasses[1],
                         "which is not supported by dataReporter.")
  }
  checkResult(out)
}


#make it a checkFunction
#' @include allClasses.R checkFunction.R
isSupported <- checkFunction(isSupported,
                             "Check if the variable class is supported by dataReporter.",
                             allClasses())

# --------------------------------------------------------------------------------
# /R/makeXFunction.R:
# --------------------------------------------------------------------------------
#Turn the function named fName into a function of type X (among
#visualFunction, summaryFunction and checkFunction) by setting its class to
#c(X, "function") and attaching the attributes "description" and "classes".
#If description is NULL, fName is used as the description. If classes is
#NULL, the class names are derived from the names of the methods that
#methods() reports for fName (an empty character vector if there are none).
#' @importFrom utils methods
makeXFunction <- function(fName, description, classes, X) {
  #note: default pos (-1) will look in dataReporter namespace first
  #when called interactively. This results in weird behaviour when
  #users try to overwrite our functions. Hence position 1 is tried first.
  f <- if (exists(fName, 1)) get(fName, 1) else get(fName)

  if (is.null(description)) {
    description <- fName
  }

  if (is.null(classes)) {
    #methods() needs the name in order to work inside the function
    methodNames <- as.character(methods(fName))

    #if (length(theseMethods) == 0) {
    #  callEnv <- parent.env(as.environment(-1L))
    #  theseMethods <- as.character(utils::.S3methods(fName, envir = callEnv))
    #}

    classes <- if (length(methodNames) > 0) {
      #strip the "<fName>." prefix, leaving the class part of the method name
      sub(paste0(fName, "."), "", methodNames)
    } else {
      character()
    }
  }

  class(f) <- c(X, "function")
  attr(f, "description") <- description
  attr(f, "classes") <- classes
  f
}

#tryCatch(mget("identifyMissing", envir = as.environment(1),
#         inherits = T), finally = print("hello"
#                                               )
#         )


#foo <- function(x) UseMethod("foo")
#
#foo.character <- function(x) x
#foo.numeric <- function(x) x + 1
#
#makeAttributedFunction <- function(fName, classes = NULL) {
#  if (is.null(classes)) {
#    browser()
#    theseMethods <- as.character(methods(fName))
#    if (length(theseMethods) > 0) {
#      classes <- sub(paste(fName, ".", sep=""),
#                     "",
#                     theseMethods)
#    } else classes <- character()
#  }
#  f <- get(fName)
#  attr(f, "classes") <- classes
#  class(f) <- "attributedFunction"
#  f
#}
#
#foo <- makeAttributedFunction("foo")


# --------------------------------------------------------------------------------
# /R/minMax.R:
# --------------------------------------------------------------------------------
#' @title summaryFunction for minimum and maximum
#'
#' @description A \code{summaryFunction}, intended to be called from
#' \code{\link{summarize}}, which returns the minimum and maximum values of a variable.
#' NA and NaN values are removed prior to the computations; infinite values are kept.
#'
#' @param v A variable (vector) of type numeric, integer or Date.
#'
#' @inheritParams makeDataReport
#'
#' @return An object of class \code{summaryResult} with the following entries: \code{$feature}
#' ("Min. and max."), \code{$result} (the minimum and maximum of \code{v}, rounded to
#' \code{maxDecimals} decimals and separated by "; "), and \code{$value}
#' (minimum and maximum in their original format). If \code{v} has no
#' non-missing values, both are \code{NA}.
#'
#' @seealso \code{\link{summaryFunction}}, \code{\link{summarize}}, \code{\link{summaryResult}},
#' \code{\link{allSummaryFunctions}}
#'
#' @examples
#' minMax(c(1:100))
#'
#' @importFrom stats na.omit
#' @export
minMax <- function(v, maxDecimals = 2) {
  v <- na.omit(v) #maybe keep Infs instead?
  if (length(v) > 0) {
    minV <- min(v)
    maxV <- max(v)
  } else {
    # nothing left to summarize
    minV <- NA
    maxV <- NA
  }
  summaryResult(list(feature = "Min. and max.",
                     result = paste(round(minV, maxDecimals),
                                    round(maxV, maxDecimals),
                                    sep = "; "),
                     value = c(minV, maxV)))
}

#Make it a summaryFunction
#' @include summaryFunction.R
minMax <- summaryFunction(minMax, "Find minimum and maximum values",
                          c("integer", "numeric", "Date"))

# --------------------------------------------------------------------------------
# /R/misc.R:
# --------------------------------------------------------------------------------
#Miscellaneous methods that do not belong in any specific
#other file. Only methods are allowed in this file, other
#functions, however minor, should have their own files.

#Print overview of all functions of a certain type, used in
#allVisualFunctions(), allSummaryFunctions(),
#allCheckFunctions().
#' @importFrom pander pander
#' @export
print.functionSummary <- function(x, ...) {
  # collapse each entry's class vector into one comma separated string;
  # vapply keeps the result a character vector even when there are no entries
  x$classes <- vapply(x$classes,
                      function(cls) paste(cls, collapse = ", "),
                      character(1))
  pander::pander(data.frame(x, row.names = NULL), justify = "left")
}

# --------------------------------------------------------------------------------
# /R/quartiles.R:
# --------------------------------------------------------------------------------
#' @title summaryFunction for quartiles
#'
#' @description A \code{\link{summaryFunction}}, intended to be called from \code{\link{summarize}},
#' which calculates the 1st and 3rd quartiles of a variable. NA and NaN values are removed
#' prior to the computations; infinite values are kept.
#'
#' @param v A variable (vector) of type numeric, integer or Date.
#'
#' @inheritParams makeDataReport
#'
#' @details The quartiles are computed using the \code{\link[stats]{quantile}} function from \code{stats},
#' using type 7 quantiles for integer and numeric variables and type 1 quantiles for Date variables.
#'
#' @return An object of class \code{summaryResult} with the following entries: \code{$feature}
#' ("1st and 3rd quartiles"), \code{$result} (the 1st and 3rd quartiles of \code{v}) and
#' \code{$value} (the quartiles in their original format).
#'
#' @seealso \code{\link{summaryFunction}}, \code{\link{summarize}}, \code{\link{summaryResult}},
#' \code{\link{allSummaryFunctions}}
#'
#' @examples
#' quartiles(c(1:100))
#'
#' quartiles(rnorm(1000), maxDecimals = 4)
#'
#' @importFrom stats na.omit quantile
#' @export
quartiles <- function(v, maxDecimals = 2) {
  v <- na.omit(v) #maybe keep Inf's?

  # Dates get type 1 quantiles, everything else type 7
  qType <- if (inherits(v, "Date")) 1 else 7
  quants <- quantile(v, c(0.25, 0.75), type = qType)

  summaryResult(list(feature = "1st and 3rd quartiles",
                     result = paste(round(quants, maxDecimals),
                                    collapse = "; "),
                     value = c(quants)))
}

#' @include summaryFunction.R
quartiles <- summaryFunction(quartiles, "Compute 1st and 3rd quartiles",
                             classes = c("Date", "integer", "numeric"))


# --------------------------------------------------------------------------------
# /R/refCat.R:
# --------------------------------------------------------------------------------
#' @title summaryFunction that finds reference level for factor variables
#'
#' @description A \code{summaryFunction}, intended to be called from
#' \code{\link{summarize}}, which returns the reference level of a factor variable,
#' i.e. the first category as returned by \code{levels(v)}. This level will serve
#' as the reference category and get absorbed into the intercept for most standard
#' model fitting procedures and therefore, it may be convenient to know.
#'
#' @param v A variable (vector) of type factor.
#'
#' @param ... Not in use.
#'
#' @return An object of class \code{summaryResult} with the following entries: \code{$feature}
#' ("Reference category"), \code{$result} (the reference level of \code{v}), and \code{$value}
#' (identical to result).
#'
#' @seealso \code{\link{summaryFunction}}, \code{\link{summarize}}, \code{\link{summaryResult}},
#' \code{\link{allSummaryFunctions}}
#'
#' @examples
#' refCat(factor(letters))
#'
#' @importFrom stats na.omit
#' @export
refCat <- function(v, ...) {
  #the reference level is simply the first level of the factor
  firstLevel <- levels(v)[1]
  summaryResult(list(feature = "Reference category",
                     result = firstLevel,
                     value = firstLevel))
}

#Make it a summaryFunction
#' @include summaryFunction.R
refCat <- summaryFunction(refCat, "Find reference level",
                          c("factor"))
--------------------------------------------------------------------------------
/R/render.R:
--------------------------------------------------------------------------------
#' @title Simplified Rmarkdown rendering
#'
#' @description Render a Rmarkdown (.Rmd) file, \code{file}, to the output
#' format specified in its preamble. If no output format is specified,
#' it will be rendered to html.
#'
#' @param file A character string path to the file that is to be rendered.
#' This file must be of type Rmarkdown (.Rmd)
#'
#' @param quiet A logical. Should messages during rendering be suppressed?
#'
#' @details This function is merely a simplified version (in terms of
#' possible arguments) of the rendering function from the \code{rmarkdown} package.
#' Therefore, we refer to this function for more details:
#' \code{\link[rmarkdown]{render}}. We have included this simplified version in
#' \code{dataReporter} in order to help new R users with rendering their output
#' documents as generated by \code{\link{makeDataReport}}.
#'
#' @seealso \code{\link[rmarkdown]{render}}.
#'
#' @export
render <- function(file, quiet = FALSE) {
  #`quiet` defaults to FALSE (the default of rmarkdown::render) so that
  #render(file) works without having to spell the argument out.

  #Fail early with an actionable message when pandoc cannot be found
  if (!rmarkdown::pandoc_available()) {
    stop("pandoc appears not to be installed on the system. Please see the vignette for the rmarkdown package on how to install pandoc")
  }

  #Everything else is delegated to rmarkdown; its return value is passed on
  rmarkdown::render(file, quiet = quiet)
}
--------------------------------------------------------------------------------
/R/setChecks.R:
--------------------------------------------------------------------------------
#' Set check arguments for makeDataReport
#'
#' This function is a tool for easily specifying the \code{checks} argument of
#' \code{\link{makeDataReport}}. Note that all available check function options can be inspected
#' by calling \code{allCheckFunctions()}.
#'
#' @param character A character vector of function names to be used as checks for character
#' variables. The default options are available by calling \code{defaultCharacterChecks()}.
#'
#' @param factor A character vector of function names to be used as checks for factor
#' variables. The default options are available by calling \code{defaultFactorChecks()}.
#'
#' @param labelled A character vector of function names to be used as checks for labelled
#' variables. The default options are available by calling \code{defaultLabelledChecks()}.
#'
#' @param haven_labelled A character vector of function names to be used as checks for haven_labelled
#' variables. The default options are available by calling \code{defaultHavenlabelledChecks()}.
#'
#' @param numeric A character vector of function names to be used as checks for numeric
#' variables. The default options are available by calling \code{defaultNumericChecks()}.
#'
#' @param integer A character vector of function names to be used as checks for integer
#' variables.
The default options are available by calling \code{defaultIntegerChecks()}.
#'
#' @param logical A character vector of function names to be used as checks for logical
#' variables. The default options are available by calling \code{defaultLogicalChecks()}.
#'
#' @param Date A character vector of function names to be used as checks for Date
#' variables. The default options are available by calling \code{defaultDateChecks()}.
#'
#' @param all A character vector of function names to be used as checks for all
#' variables. Note that this overrules the choices made for specific variable types by using
#' the other arguments.
#'
#' @return A list with one entry for each data class supported by \code{makeDataReport}. Each
#' entry then contains a character vector of function names that are to be called as checks for
#' that variable type.
#'
#' @seealso \code{\link{makeDataReport}}, \code{\link{allCheckFunctions}},
#' \code{\link{defaultCharacterChecks}},
#' \code{\link{defaultFactorChecks}}, \code{\link{defaultLabelledChecks}},
#' \code{\link{defaultHavenlabelledChecks}},
#' \code{\link{defaultNumericChecks}}, \code{\link{defaultIntegerChecks}},
#' \code{\link{defaultLogicalChecks}}, \code{\link{defaultDateChecks}}
#' @examples
#' #Only identify missing values for characters, factors and labelled variables:
#' setChecks(character = "identifyMissing", factor = "identifyMissing",
#'           labelled = "identifyMissing")
#'
#' #Used in a call to makeDataReport():
#' \donttest{
#' data(toyData)
#' makeDataReport(toyData, checks = setChecks(character = "identifyMissing",
#'     factor = "identifyMissing", labelled = "identifyMissing"), replace = TRUE)
#' }
#'
#' @export
setChecks <- function(character = defaultCharacterChecks(),
                      factor = defaultFactorChecks(),
                      labelled = defaultLabelledChecks(),
                      haven_labelled = defaultHavenlabelledChecks(),
                      numeric = defaultNumericChecks(),
                      integer = defaultIntegerChecks(),
                      logical = defaultLogicalChecks(),
                      Date = defaultDateChecks(), all = NULL) {
  #`all` overrules every class-specific choice: each class gets the same checks
  if (!is.null(all)) {
    classNames <- c("character", "factor", "labelled", "haven_labelled",
                    "numeric", "integer", "logical", "Date")
    shared <- rep(list(all), length(classNames))
    names(shared) <- classNames
    return(shared)
  }

  #otherwise collect the class-specific choices (or their defaults)
  list(character = character, factor = factor,
       labelled = labelled, haven_labelled = haven_labelled,
       numeric = numeric,
       integer = integer, logical = logical,
       Date = Date)
}
--------------------------------------------------------------------------------
/R/setSummaries.R:
--------------------------------------------------------------------------------
#' Set summary arguments for makeDataReport
#'
#' This function is a tool for easily specifying the \code{summaries} argument of
#' \code{\link{makeDataReport}}. Note that all available summary function options can be inspected
#' by calling \code{allSummaryFunctions()}.
#'
#' @param character A character vector of function names to be used as summaries for character
#' variables. The default options are available by calling \code{defaultCharacterSummaries()}.
#'
#' @param factor A character vector of function names to be used as summaries for factor
#' variables. The default options are available by calling \code{defaultFactorSummaries()}.
#'
#' @param labelled A character vector of function names to be used as summaries for labelled
#' variables. The default options are available by calling \code{defaultLabelledSummaries()}.
#'
#' @param haven_labelled A character vector of function names to be used as summaries for haven_labelled
#' variables. The default options are available by calling \code{defaultHavenlabelledSummaries()}.
#'
#' @param numeric A character vector of function names to be used as summaries for numeric
#' variables.
The default options are available by calling \code{defaultNumericSummaries()}. 21 | #' 22 | #' @param integer A character vector of function names to be used as summaries for integer 23 | #' variables. The default options are available by calling \code{defaultIntegerSummaries()}. 24 | #' 25 | #' @param logical A character vector of function names to be used as summaries for logical 26 | #' variables. The default options are available by calling \code{defaultLogicalSummaries()}. 27 | #' 28 | #' @param Date A character vector of function names to be used as summaries for Date 29 | #' variables. The default options are available by calling \code{defaultDateSummaries()}. 30 | #' 31 | #' @param all A character vector of function names to be used as summaries for all 32 | #' variables. Note that this overrules the choices made for specific variable types by using 33 | #' the other arguments. 34 | #' 35 | #' @return A list with one entry for each data class supported by \code{makeDataReport}. Each 36 | #' entry then contains a character vector of function names that are to be called as summaries for 37 | #' that variable type. 
#'
#' @seealso \code{\link{makeDataReport}}, \code{\link{allSummaryFunctions}},
#' \code{\link{defaultCharacterSummaries}},
#' \code{\link{defaultFactorSummaries}}, \code{\link{defaultLabelledSummaries}},
#' \code{\link{defaultHavenlabelledSummaries}},
#' \code{\link{defaultNumericSummaries}}, \code{\link{defaultIntegerSummaries}},
#' \code{\link{defaultLogicalSummaries}}, \code{\link{defaultDateSummaries}}
#' @examples
#' #Don't include central value (median/mode) summary for numerical and integer
#' #variables:
#' setSummaries(numeric = defaultNumericSummaries(remove = "centralValue"),
#'              integer = defaultIntegerSummaries(remove = "centralValue"))
#'
#'
#' #Used in a call to makeDataReport():
#' \donttest{
#' data(toyData)
#' makeDataReport(toyData,
#'     summaries = setSummaries(numeric = defaultNumericSummaries(remove = "centralValue"),
#'     integer = defaultIntegerSummaries(remove = "centralValue")), replace = TRUE)
#' }
#'
#' @export
setSummaries <- function(character = defaultCharacterSummaries(),
                         factor = defaultFactorSummaries(),
                         labelled = defaultLabelledSummaries(),
                         haven_labelled = defaultHavenlabelledSummaries(),
                         numeric = defaultNumericSummaries(),
                         integer = defaultIntegerSummaries(),
                         logical = defaultLogicalSummaries(),
                         Date = defaultDateSummaries(), all = NULL) {
  #`all` overrules every class-specific choice: each class gets the same summaries
  if (!is.null(all)) {
    classNames <- c("character", "factor", "labelled", "haven_labelled",
                    "numeric", "integer", "logical", "Date")
    shared <- rep(list(all), length(classNames))
    names(shared) <- classNames
    return(shared)
  }

  #otherwise collect the class-specific choices (or their defaults)
  list(character = character, factor = factor,
       labelled = labelled,
       haven_labelled = haven_labelled,
       numeric = numeric,
       integer = integer, logical = logical,
       Date = Date)
}
--------------------------------------------------------------------------------
/R/setVisuals.R:
--------------------------------------------------------------------------------
#' Set
visual arguments for makeDataReport 2 | #' 3 | #' This function is a tool for easily specifying the \code{visuals} argument of 4 | #' \code{\link{makeDataReport}}. Note that only a single visual function can 5 | #' be provided for each variable type. If more than one is supplied, only 6 | #' the first one is used. The default is to use a single visual function for all 7 | #' variable types (as specified in the argument \code{all}), but class-specific choices 8 | #' of visual functions can also be used. Note that class-specific arguments overwrites 9 | #' the contents of \code{all}. Note that all available visual function options can be inspected 10 | #' by calling \code{allVisualFunctions()}. 11 | #' 12 | #' @param character A function name (character string) to be used as the visual function for character 13 | #' variables. If \code{NULL} (the default) the argument is ignored and the contents of the \code{all} 14 | #' argument is used instead. 15 | #' 16 | #' @param factor A function name (character string) to be used as the visual function for factor 17 | #' variables. If \code{NULL} (the default) the argument is ignored and the contents of the \code{all} 18 | #' argument is used instead. 19 | #' 20 | #' @param labelled A function name (character string) to be used as the visual function for labelled 21 | #' variables. If \code{NULL} (the default) the argument is ignored and the contents of the \code{all} 22 | #' argument is used instead. 23 | #' 24 | #' @param haven_labelled A function name (character string) to be used as the visual function for haven_labelled 25 | #' variables. If \code{NULL} (the default) the argument is ignored and the contents of the \code{all} 26 | #' argument is used instead. 27 | #' 28 | #' @param numeric A function name (character string) to be used as the visual function for numeric 29 | #' variables. If \code{NULL} (the default) the argument is ignored and the contents of the \code{all} 30 | #' argument is used instead. 
31 | #' 32 | #' @param integer A function name (character string) to be used as the visual function for integer 33 | #' variables. If \code{NULL} (the default) the argument is ignored and the contents of the \code{all} 34 | #' argument is used instead. 35 | #' 36 | #' @param logical A function name (character string) to be used as the visual function for logical 37 | #' variables. If \code{NULL} (the default) the argument is ignored and the contents of the \code{all} 38 | #' argument is used instead. 39 | #' 40 | #' @param Date A function name (character string) to be used as the visual function for Date 41 | #' variables. If \code{NULL} (the default) the argument is ignored and the contents of the \code{all} 42 | #' argument is used instead. 43 | #' 44 | #' @param all A function name (character string) to be used as the visual function for all 45 | #' variables. 46 | #' 47 | #' @return A list with one entry for each data class supported by \code{makeDataReport}. Each 48 | #' entry then contains a character string with a function name that is to be called as the visual 49 | #' function for that variable type. 
#'
#' @seealso \code{\link{makeDataReport}}, \code{\link{allVisualFunctions}}
#' @examples
#' #Set visual type to basicVisual for all variable types:
#' setVisuals(all = "basicVisual")
#'
#' #Used in a call to makeDataReport():
#' \donttest{
#' data(toyData)
#' makeDataReport(toyData, visuals = setVisuals(all = "basicVisual"), replace = TRUE)
#' }
#'
#' @export
setVisuals <- function(character = NULL,
                       factor = NULL,
                       labelled = NULL,
                       haven_labelled = NULL,
                       numeric = NULL,
                       integer = NULL,
                       logical = NULL,
                       Date = NULL, all = "standardVisual") {
  #gather the class-specific choices; list() keeps unset (NULL) entries
  chosen <- list(character = character, factor = factor,
                 labelled = labelled,
                 haven_labelled = haven_labelled,
                 numeric = numeric,
                 integer = integer, logical = logical,
                 Date = Date)

  #unset classes fall back on `all`, and only the first function name
  #supplied for a class is kept
  lapply(chosen, function(choice) {
    if (is.null(choice)) choice <- all
    choice[1]
  })
}
--------------------------------------------------------------------------------
/R/smartNum.R:
--------------------------------------------------------------------------------
#' @title Smart class to handle numerics as factor
#'
#' @description S3 class meant for representing numeric variables that act like
#' factor variables by taking only a few different values. This class
#' is used in makeDataReport() in order to get appropriate summaries, visualizations
#' and checks for such variables. In other words, such variables will be
#' treated like factor variables instead of numerics.
#'
#' @param v A numeric vector
#'
#' @return A \code{smartNum} object that is handled in \code{makeDataReport} in the same way as a factor.
#'
#' @export
smartNum <- function(v) {
  #remember class and descriptive attributes before converting to a factor
  oriClass <- class(v)
  origLabel <- attr(v, "label", exact = TRUE)
  origDesc <- attr(v, "shortDescription")
  v <- factor(v)
  #re-attach the saved attributes to the new factor; the original class is
  #stored so that oClass() can report it later
  attr(v, "label") <- origLabel
  attr(v, "shortDescription") <- origDesc
  attr(v, "originalClass") <- oriClass
  class(v) <- c("smartNum", "factor")
  v
}


#Get the original class of a smartNum or fakeLabelled object. Used in makeDataReport().
oClass <- function(v) UseMethod("oClass")

#Fallback: use the "originalClass" attribute if one is present, otherwise the
#ordinary class of v. (The attribute name was previously misspelled as
#"orginalClass", so the attribute was never found.)
#' @exportS3Method
oClass.default <- function(v) {
  oC <- attr(v, "originalClass", exact = TRUE)
  if (!is.null(oC)) return(oC)
  else class(v)
}

#' @exportS3Method
oClass.smartNum <- function(v) attr(v, "originalClass")

#' @exportS3Method
oClass.fakeLabelled <- function(v) attr(v, "originalClass")
--------------------------------------------------------------------------------
/R/summaryFunction.R:
--------------------------------------------------------------------------------
#' @title Create an object of class summaryFunction
#'
#' @description Convert a function, \code{f}, into an S3
#' \code{summaryFunction} object. This adds \code{f} to the
#' overview list returned by an \code{allSummaryFunctions()}
#' call.
#'
#' @inheritParams checkFunction
#'
#' @param description A character string describing the summary
#' returned by \code{f}. If \code{NULL} (the default), the
#' name of \code{f} will be used instead.
#'
#' @return A function of class \code{summaryFunction} which has two attributes,
#' namely \code{classes} and \code{description}.
16 | #' 17 | #' @details \code{summaryFunction} represents the functions used in 18 | #' \code{\link{summarize}} and \code{\link{makeDataReport}} for summarizing the 19 | #' features of variables in a dataset. 20 | #' 21 | #' An example of defining a new \code{summaryFunction} is given below. 22 | #' Note that the minimal requirements for such a function (in order for it to be 23 | #' compatible with \code{summarize()} and \code{makeDataReport()}) are the following 24 | #' input/output-structure: It must input at least two arguments, namely 25 | #' \code{v} (a vector variable) and \code{...}. Additional implemented 26 | #' arguments from \code{summarize()} and \code{makeDataReport()} include 27 | #' \code{maxDecimals}, see e.g. the pre-defined \code{summaryFunction} 28 | #' \code{\link{minMax}} for more details about how this argument should 29 | #' be used. 30 | #' The output must be a list with at least the two entries \code{$feature} 31 | #' (a short character string describing what was summarized) and \code{$result} 32 | #' (a value or a character string with the result of the summarization). 33 | #' However, if the result of a \code{summaryFunction} is furthermore 34 | #' converted to a \code{\link{summaryResult}} object, a \code{print()} 35 | #' method also becomes available for consistent formatting of 36 | #' \code{summaryFunction} results. 37 | #' 38 | #' Note that all available \code{summaryFunction}s are listed by the call 39 | #' \code{allSummaryFunctions()} and we recommend looking into these functions, 40 | #' if more knowledge about \code{summaryFunction}s is required. 41 | #' 42 | #' @include makeXFunction.R allClasses.R 43 | #' 44 | #' @seealso \code{\link{allSummaryFunctions}}, \code{\link{summarize}}, 45 | #' \code{\link{makeDataReport}}, \code{\link{checkResult}} 46 | #' 47 | #' @examples 48 | #' 49 | #' #Define a valid summaryFunction that can be called from summarize() 50 | #' #and makeDataReport().
This function counts how many zero entries a given
#' #variable has:
#' countZeros <- function(v, ...) {
#'   res <- length(which(v == 0))
#'   summaryResult(list(feature = "No. zeros", result = res, value = res))
#' }
#'
#' #Convert it to a summaryFunction object. We don't count zeros for
#' #logical variables, as they have a different meaning here (FALSE):
#' countZeros <- summaryFunction(countZeros, description = "Count number of zeros",
#'                               classes = setdiff(allClasses(), "logical"))
#'
#' #Call it directly :
#' countZeros(c(0, 0, 0, 1:100))
#'
#' #Call it via summarize():
#' data(cars)
#' summarize(cars, numericSummaries = c(defaultNumericSummaries(),
#'    "countZeros"))
#'
#' #Note that countZeros now appears in a allSummaryFunctions() call:
#' allSummaryFunctions()
#'
#' @export
summaryFunction <- function(f, description, classes = NULL) {
  #Capture the expression supplied as `f` as text (typically the function's
  #name); `f` itself is not evaluated here.
  f <- deparse(substitute(f))
  #Hand the name on together with the tag "summaryFunction". makeXFunction is
  #defined in makeXFunction.R; presumably it attaches `description` and
  #`classes` to the named function - TODO confirm against that file.
  makeXFunction(f, description, classes, "summaryFunction")
}



--------------------------------------------------------------------------------
/R/summaryResult.R:
--------------------------------------------------------------------------------
#' @title Create object of class summaryResult
#'
#' @description Convert a list resulting from the summaries performed in a
#' \code{\link{summaryFunction}} into a \code{summaryResult} object, thereby
#' supplying it with a \code{print()} method.
#'
#' @param ls A list with entries \code{$feature} (a character string describing
#' what summary was obtained), \code{$result} (the result of the summary, either
#' a value from the variable, a numeric or a character string) and
#' \code{$value} (the result in its most raw format, often identical to the
#' \code{$result} input).
#'
#' @return A S3 object of class \code{summaryResult}, identical to the inputted
#' list, \code{ls}, except for its class attribute. If \code{ls} has no
#' \code{$result} entry, one is created from \code{$value} (numeric values
#' are rounded to 4 decimals and everything is collapsed into one string).
#'
#' @seealso \code{\link{summaryFunction}}
#'
#' @export
summaryResult <- function(ls) {
  entryNames <- names(ls)
  #only the three documented entries are allowed
  if (length(setdiff(entryNames, c("feature", "result", "value"))) != 0) {
    stop("The inputted list does not qualify as a summaryResult")
  }
  #at least one of $value and $result must be supplied
  if (!("value" %in% entryNames) && !("result" %in% entryNames)) {
    stop("A summaryResult must have a $value slot or a $result slot")
  }
  #derive a printable $result from $value when none was given
  if (!("result" %in% entryNames)) {
    if (is.numeric(ls$value)) {
      ls$result <- round(ls$value, 4)
    } else {
      ls$result <- ls$value
    }
    #make sure e.g. vectors and lists are collapsed into one character string
    ls$result <- paste(ls$result, collapse = " ")
  }
  class(ls) <- "summaryResult"
  ls
}

#Prints "<feature>: <result>" (no trailing newline) and returns x invisibly.
#' @export
print.summaryResult <- function(x, ...) {
  mes <- paste0(x$feature, ": ",
                paste(x$result, collapse = ", "))
  cat(mes)
  invisible(x)
}
--------------------------------------------------------------------------------
/R/tableVisual.R:
--------------------------------------------------------------------------------
#' Produce tables for the makeDataReport visualizations.
#'
#' Produce a table of the distribution of a categorical (character, labelled, haven_labelled or factor) variable.
#' Note that \code{tableVisual} is a \code{\link{visualFunction}}, compatible with the
#' \code{\link{visualize}} and \code{\link{makeDataReport}} functions.
#'
#' @param v The variable (vector) to be plotted.
#' @param vnam The name of the variable.
#' @param doEval If TRUE, the table itself is returned. Otherwise, the function returns
#' a character string containing standalone R code for producing the table.
#'
#' @examples
#'
#' #Save a variable
#' myVar <- c("red", "blue", "red", "red", NA)
#'
#' #Plot a variable
#' tableVisual(myVar, "MyVar")
#'
#' #Produce code for plotting a variable
#' tableVisual(myVar, "MyVar", doEval = FALSE)
#'
#' @seealso \code{\link{visualize}}, \code{\link{basicVisual}}, \code{\link{standardVisual}}
#'
#' @importFrom pander pander
#' @export
tableVisual <- function(v, vnam, doEval = TRUE) {
  #one entry per distinct value; missing values always get their own entry
  freq <- table(v, useNA = "always")

  #share of all observations, in percent with up to two decimals
  share <- paste(round(as.vector(freq) / length(v), 4) * 100, "%", sep = "")

  #character matrix with one row per value and no row names
  tab <- cbind(value = names(freq),
               count = as.character(freq),
               percentage = share)

  #build the pander call with the table embedded, so that the deparsed
  #code contains the data it needs
  thisCall <- call("pander", x = tab, keep.trailing.zeros = TRUE)
  if (doEval) {
    return(eval(thisCall))
  }
  deparse(thisCall)
}




#' @include visualFunction.R
tableVisual <- visualFunction(tableVisual, "Distribution tables",
                              classes = c("character", "factor", "labelled", "haven_labelled"))


##########################################Not exported below#########################################

#v <- toyData$pill
#x <- table(v, useNA = "always")
#x <- t(rbind(x, paste(round(rbind(x/length(v)),4)*100, "%", sep = "")))
#dimnames(x)[[2]] <- c("count", "percentage")
#thisCall <- call("pander", x = x, caption = "vnam", keep.trailing.zeros = TRUE)
#eval(thisCall)
--------------------------------------------------------------------------------
/R/uniqueValues.R:
--------------------------------------------------------------------------------
#' @title summaryFunction for unique values
#'
#' @description A \code{\link{summaryFunction}} type function, intended to be called from
#' \code{\link{summarize}}, which counts the
#' number of unique (excluding \code{NA}s) values in a
variable.
#'
#' @param v A variable (vector).
#'
#' @param ... Not in use.
#'
#' @return An object of class \code{summaryResult} with the following entries:
#' \code{$feature} ("Number of unique values"), \code{$result} (the number of unique
#' values in \code{v}) and \code{$value} (identical to \code{$result}).
#'
#' @seealso \code{\link{summaryFunction}}, \code{\link{summarize}}, \code{\link{summaryResult}},
#' \code{\link{allSummaryFunctions}}
#'
#' @examples
#' uniqueValues(c(1:3, rep(NA, 10), Inf, NaN))
#'
#' @importFrom stats na.omit
#' @export
uniqueValues <- function(v, ...) UseMethod("uniqueValues")

#assign methods to generic uniqueValues function

#' @export
uniqueValues.character <- function(v, ...) uniqueValuesCFBI(v)

#' @export
uniqueValues.factor <- function(v, ...) uniqueValuesCFBI(v)

#' @export
uniqueValues.labelled <- function(v, ...) uniqueValuesL(v) #?PROBLEM?

#' @export
uniqueValues.haven_labelled <- function(v, ...) uniqueValuesL(v) #?PROBLEM?


#' @export
uniqueValues.numeric <- function(v, ...) uniqueValuesN(v)

#' @export
uniqueValues.integer <- function(v, ...) uniqueValuesCFBI(v)

#' @export
uniqueValues.logical <- function(v, ...) uniqueValuesCFBI(v)

#' @export
uniqueValues.Date <- function(v, ...) uniqueValuesCFBI(v)


#Make it a summaryFunction
#' @include summaryFunction.R
#' @export
uniqueValues <- summaryFunction(uniqueValues, "Count number of unique values", allClasses())


##########################################Not exported below#########################################


#methods for each variable type

#character, factor, integer, logical and Date variables (and the basis for the
#other helpers): count the distinct values left after na.omit()
uniqueValuesCFBI <- function(v) {
  noUnique <- length(unique(na.omit(v)))
  summaryResult(list(feature="Number of unique values",
                     result = noUnique,
                     value = noUnique))
}

#numeric variables: na.omit() also drops NaN, so NaN is counted as one extra
#unique value when present
uniqueValuesN <- function(v) {
  out <- uniqueValuesCFBI(v)

  #check for NaNs; $result and $value are updated together so that they
  #keep describing the same count
  if (any(is.nan(v))) {
    out$result <- out$result + 1
    out$value <- out$value + 1
  }

  out
}

#labelled and haven_labelled variables: convert with dataReporter_as_factor()
#(defined in dataReporter_as_factor.R) and count as for factors
uniqueValuesL <- function(v) {
  uniqueValuesCFBI(dataReporter_as_factor(v))
}

--------------------------------------------------------------------------------
/R/unpackLabelled.R:
--------------------------------------------------------------------------------
#Maybe delete? Deal when labelled stuff is implemented.
#this function is used in identifyMissing.R and
#identifyWhitespace.R
#Returns the values of v as character strings, followed by the contents of
#its "labels" attribute (if it has one).
unpackLabelled <- function(v) {
  values <- as.character(v)
  labelAttr <- attributes(v)$labels
  c(values, labelAttr)
}


--------------------------------------------------------------------------------
/R/utility.R:
--------------------------------------------------------------------------------
#' Find out if the whoami package binaries is installed (git + whoami)
#' @return logical that is TRUE if whoami and git can be found
#' @export
whoami_available <- function() {

  ## look both executables up on the path ("" means not found)
  whoamiPath <- as.character(Sys.which('whoami'))
  gitPath <- as.character(Sys.which('git'))

  ## available only if neither lookup came back empty
  !((whoamiPath == '') | (gitPath == ''))

}


--------------------------------------------------------------------------------
/R/variableType.R:
--------------------------------------------------------------------------------
#' @title Summary function for original class
#'
#' @description A \code{\link{summaryFunction}} type function, intended to be called from
#' \code{\link{summarize}}, which finds the
#' original class of a variable. This is just the class for all objects but those of class
#' \code{smartNum}.
#'
#' @param v A variable (vector).
#'
#' @param ... Not in use.
#'
#' @return An object of class \code{summaryResult} with the following entries:
#' \code{$feature} ("Variable type"), \code{$result} (the (original) class of
#' \code{v}) and \code{$value} (identical to \code{$result}).
#'
#' @seealso \code{\link{summarize}}
#'
#' @examples
#' #For standard variables:
#' varX <- c(rep(c(1,2,3), each=10))
#' class(varX)
#' variableType(varX)
#'
#' #For smartNum variables:
#' smartX <- dataReporter::smartNum(varX)
#' class(smartX)
#' variableType(smartX)
#'
#' @include smartNum.R
#' @export
variableType <- function(v, ...) {
  #oClass() (see smartNum.R) gives the class the variable had originally;
  #only the first class name is reported
  vClass <- oClass(v)[1]
  summaryResult(list(feature="Variable type", result = vClass, value = vClass))
}

#Make it a summaryFunction. The include below used to name this file itself;
#it now names summaryFunction.R, which defines the function called here.
#' @include summaryFunction.R
variableType <- summaryFunction(variableType, "Data class of variable", allClasses())
--------------------------------------------------------------------------------
/R/visualFunction.R:
--------------------------------------------------------------------------------
#' @title Create an object of class visualFunction
#'
#' @description Convert a function, \code{f}, into an S3
#' \code{visualFunction} object. This adds \code{f} to the
#' overview list returned by an \code{allVisualFunctions()}
#' call.
#'
#' @inheritParams checkFunction
#'
#' @param description A character string describing the visualization
#' returned by \code{f}. If \code{NULL} (the default), the name of
#' \code{f} will be used instead.
#'
#' @return A function of class \code{visualFunction} which has two attributes,
#' namely \code{classes} and \code{description}.
#'
#' @details \code{visualFunction} represents the functions used in
#' \code{\link{visualize}} and \code{\link{makeDataReport}} for plotting the
#' distributions of the variables in a dataset.
#'
#' An example of defining a new \code{visualFunction} is given below.
22 | #' Note that the minimal requirements for such a function (in order for it to be 23 | #' compatible with \code{visualize()} and \code{makeDataReport()}) are the following 24 | #' input/output-structure: It must input exactly the following three arguments, 25 | #' namely \code{v} (a vector variable), \code{vnam} (a character string with 26 | #' the name of the variable) and \code{doEval} (a logical). The last argument 27 | #' is supposed to control whether the function produces a plot in the 28 | #' graphic device (if \code{doEval = TRUE}) or instead returns a character 29 | #' string including \code{R} code for generating such a plot. In the latter 30 | #' setting, the code must be stand-alone, that is, it cannot depend on objects 31 | #' available in an environment. In practice, this will typically imply that 32 | #' the data variable is included in the code snippet. 33 | #' It is not strictly necessary to implement the \code{doEval = TRUE} setting 34 | #' for the \code{visualFunction} to be compatible with \code{\link{makeDataReport}}, 35 | #' but we recommend doing it anyway such that the function can also be used 36 | #' interactively. 37 | #' 38 | #' Note that all available \code{visualFunction}s are listed by the call 39 | #' \code{allVisualFunctions()} and we recommend looking into these functions, 40 | #' if more knowledge about \code{visualFunction}s is required.
41 | #' 42 | #' @include makeXFunction.R allClasses.R 43 | #' 44 | #' @seealso \code{\link{allVisualFunctions}}, \code{\link{visualize}}, 45 | #' \code{\link{makeDataReport}} 46 | #' 47 | #' @examples 48 | #' #Defining a new visualFunction: 49 | #' mosaicVisual <- function(v, vnam, doEval) { 50 | #' thisCall <- call("mosaicplot", table(v), main = vnam, xlab = "") 51 | #' if (doEval) { 52 | #' return(eval(thisCall)) 53 | #' } else return(deparse(thisCall)) 54 | #' } 55 | #' mosaicVisual <- visualFunction(mosaicVisual, description = "Mosaicplots from graphics", 56 | #' classes = allClasses()) 57 | #' 58 | #' #mosaicVisual is now included in an allVisualFunctions() call: 59 | #' allVisualFunctions() 60 | #' 61 | #' #Create a mosaic plot: 62 | #' ABCvar <- c(rep("a", 10), rep("b", 20), rep("c", 5)) 63 | #' mosaicVisual(ABCvar, "ABCvar", TRUE) 64 | #' 65 | #' #Create a character string with the code for a mosaic plot: 66 | #' mosaicVisual(ABCvar, "ABCVar", FALSE) 67 | #' 68 | #' #Extract or set description of a visualFunction: 69 | #' description(mosaicVisual) 70 | #' description(mosaicVisual) <- "A cubist version of a pie chart" 71 | #' description(mosaicVisual) 72 | #' 73 | #' 74 | #' @export 75 | visualFunction <- function(f, description, classes = NULL) { 76 | f <- deparse(substitute(f)) # replace f by the character form of the unevaluated expression supplied as f 77 | makeXFunction(f, description, classes, "visualFunction") 78 | } 79 | 80 | 81 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # dataReporter 2 | 3 | 4 | 5 | [![Travis-CI Build 6 | Status](https://travis-ci.org/ekstroem/dataReporter.svg?branch=master)](https://travis-ci.org/ekstroem/dataReporter) 7 | [![CRAN\_Release\_Badge](http://www.r-pkg.org/badges/version-ago/dataReporter)](https://CRAN.R-project.org/package=dataReporter) 8 | ![Download counter](http://cranlogs.r-pkg.org/badges/grand-total/dataReporter) 9 | 10 | 11 | dataReporter is an R package
for documenting and creating reports on data cleanliness. 12 | 13 | 14 | ## Installation 15 | 16 | This github page contains the *development version* of dataReporter. For the 17 | latest stable version download the package from CRAN directly using 18 | 19 | ```{r} 20 | install.packages("dataReporter") 21 | ``` 22 | 23 | To install the development version of dataReporter run the following 24 | commands from within R (requires that the `devtools` package is already installed) 25 | 26 | ```{r} 27 | devtools::install_github("ekstroem/dataReporter") 28 | ``` 29 | 30 | ## Package overview 31 | 32 | A super simple way to get started is to load the package and use the 33 | `makeDataReport()` function on a data frame (if you try to generate several 34 | reports for the same data, then it may be necessary to add the `replace=TRUE` 35 | argument to overwrite the existing report). 36 | 37 | ```{r} 38 | library("dataReporter") 39 | data(trees) 40 | makeDataReport(trees) 41 | ``` 42 | 43 | This will create a report with summaries and error checks for each 44 | variable in the `trees` data frame. The format of the report depends on your OS and whether 45 | you have a [LaTeX](https://www.latex-project.org/) installation on your computer, which 46 | is needed for creating pdf reports. 47 | 48 | 49 | ### Using dataReporter interactively 50 | 51 | The dataReporter package can also be used interactively by running checks 52 | for the individual variables or for all variables in the dataset 53 | 54 | ```{r} 55 | data(toyData) 56 | check(toyData$events) # Individual check of events 57 | check(toyData) # Check all variables at once 58 | ``` 59 | 60 | By default the standard battery of tests is run depending on the 61 | variable type. If we just want a specific test for, say, a numeric 62 | variable then we can specify that. All available checks can be viewed 63 | by calling `allCheckFunctions()`.
See [the 64 | documentation](https://www.jstatsoft.org/index.php/jss/article/view/v090i06/v90i06.pdf) 65 | for an overview of the checks available or how to create and include 66 | your own tests. 67 | 68 | 69 | ```{r} 70 | check(toyData$events, checks = setChecks(numeric = "identifyMissing")) 71 | ``` 72 | 73 | We can also access the graphics or summary tables that are produced for a variable by calling the `visualize` or `summarize` functions. One can visualize a single variable or a full dataset: 74 | 75 | ```{r} 76 | #Visualize a variable 77 | visualize(toyData$events) 78 | 79 | #Visualize a dataset 80 | visualize(toyData) 81 | ``` 82 | 83 | The same is true for summaries. Note also that the choice of checks/visualizations/summaries is customizable: 84 | 85 | ```{r} 86 | #Summarize a variable with default settings: 87 | summarize(toyData$events) 88 | 89 | #Summarize a variable with user-specified settings: 90 | summarize(toyData$events, summaries = setSummaries(all = c("centralValue", "minMax"))) 91 | ``` 92 | 93 | 94 | ## Detailed documentation 95 | 96 | You can read the main paper accompanying the package at the [Journal 97 | of Statistical 98 | Software](https://www.jstatsoft.org/article/view/v090i06). It provides 99 | a detailed introduction to the dataReporter package (originally launched under the name `dataMaid`). 100 | 101 | We also have two blog posts that provide an introduction to the package. They can be found [here (the primary one)](https://sandsynligvis.dk/2017/08/21/datamaid-your-personal-assistant-for-cleaning-up-the-data-cleaning-process/) and [here](https://sandsynligvis.dk/2018/03/03/generating-codebooks-in-r/). 102 | 103 | Moreover, we have 104 | created a vignette that describes how to extend dataReporter to include 105 | user-defined data screening checks, summaries and visualizations.
This 106 | vignette is called `extending_dataReporter`: 107 | 108 | ```{r} 109 | vignette("extending_dataReporter") 110 | ``` 111 | 112 | 113 | 114 | 115 | -------------------------------------------------------------------------------- /data/artData.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekstroem/dataReporter/90bcf67e591de0c3b3bfab2620c2d77851c432e1/data/artData.rda -------------------------------------------------------------------------------- /data/bigPresidentData.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekstroem/dataReporter/90bcf67e591de0c3b3bfab2620c2d77851c432e1/data/bigPresidentData.rda -------------------------------------------------------------------------------- /data/exampleData.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekstroem/dataReporter/90bcf67e591de0c3b3bfab2620c2d77851c432e1/data/exampleData.RData -------------------------------------------------------------------------------- /data/presidentData.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekstroem/dataReporter/90bcf67e591de0c3b3bfab2620c2d77851c432e1/data/presidentData.rda -------------------------------------------------------------------------------- /data/testData.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekstroem/dataReporter/90bcf67e591de0c3b3bfab2620c2d77851c432e1/data/testData.RData -------------------------------------------------------------------------------- /data/toyData.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekstroem/dataReporter/90bcf67e591de0c3b3bfab2620c2d77851c432e1/data/toyData.RData 
-------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | bibentry(bibtype = "Article", 2 | title = "{dataMaid}: Your Assistant for Documenting Supervised Data Quality Screening in {R}", 3 | author = c(person(given = c("Anne", "Helby"), 4 | family = "Petersen"), 5 | person(given = c("Claus", "Thorn"), 6 | family = "Ekstr{\\o}m")), 7 | journal = "Journal of Statistical Software", 8 | year = "2019", 9 | volume = "90", 10 | number = "6", 11 | pages = "1--38", 12 | doi = "10.18637/jss.v090.i06", 13 | 14 | header = "To cite dataReporter in publications use:" 15 | ) 16 | 17 | -------------------------------------------------------------------------------- /man/allCheckFunctions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/allCheckFunctions.R 3 | \name{allCheckFunctions} 4 | \alias{allCheckFunctions} 5 | \title{Overview of all available checkFunctions} 6 | \usage{ 7 | allCheckFunctions() 8 | } 9 | \value{ 10 | An object of class \code{functionSummary}. This object has entries \code{$name} 11 | (the function names), \code{$description} (the function descriptions, as obtained from their 12 | \code{description} attributes) and \code{$classes} (the classes each function is intended 13 | to be called on, as obtained from their \code{classes} attributes). 14 | } 15 | \description{ 16 | Produce an overview of all functions of class \code{checkFunction} 17 | available in the workspace or imported from packages. This overview includes 18 | the descriptions and a list of what classes the functions are each intended 19 | to be called on.
20 | } 21 | \examples{ 22 | allCheckFunctions() 23 | 24 | } 25 | \seealso{ 26 | \code{\link{checkFunction}} \code{\link{allVisualFunctions}} 27 | \code{\link{allSummaryFunctions}} 28 | } 29 | -------------------------------------------------------------------------------- /man/allClasses.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/allClasses.R 3 | \name{allClasses} 4 | \alias{allClasses} 5 | \title{Vector of all variable classes in \code{dataReporter}} 6 | \usage{ 7 | allClasses() 8 | } 9 | \description{ 10 | Returns the names of the eight data classes for which 11 | \code{dataReporter} is implemented, namely \code{"character"}, \code{"Date"}, 12 | \code{"factor"}, \code{"integer"}, \code{"labelled"}, 13 | \code{"haven_labelled"}, \code{"logical"} and 14 | \code{"numeric"}. 15 | } 16 | \examples{ 17 | allClasses() 18 | 19 | } 20 | -------------------------------------------------------------------------------- /man/allSummaryFunctions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/allSummaryFunctions.R 3 | \name{allSummaryFunctions} 4 | \alias{allSummaryFunctions} 5 | \title{Overview of all available summaryFunctions} 6 | \usage{ 7 | allSummaryFunctions() 8 | } 9 | \value{ 10 | An object of class \code{functionSummary}. This object has entries \code{$name} 11 | (the function names), \code{$description} (the function descriptions, as obtained from their 12 | \code{description} attributes) and \code{$classes} (the classes each function is intended 13 | to be called on, as obtained from their \code{classes} attributes). 14 | } 15 | \description{ 16 | Produce an overview of all functions of class \code{summaryFunction} 17 | available in the workspace or imported from packages.
This overview includes 18 | the descriptions and a list of what classes the functions are each intended 19 | to be called on. 20 | } 21 | \examples{ 22 | allSummaryFunctions() 23 | 24 | } 25 | \seealso{ 26 | \code{\link{summaryFunction}} \code{\link{allVisualFunctions}} 27 | \code{\link{allCheckFunctions}} 28 | } 29 | -------------------------------------------------------------------------------- /man/allVisualFunctions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/allVisualFunctions.R 3 | \name{allVisualFunctions} 4 | \alias{allVisualFunctions} 5 | \title{Overview of all available visualFunctions} 6 | \usage{ 7 | allVisualFunctions() 8 | } 9 | \value{ 10 | An object of class \code{functionSummary}. This object has entries \code{$name} 11 | (the function names), \code{$description} (the function descriptions, as obtained from their 12 | \code{description} attributes) and \code{$classes} (the classes each function is intended 13 | to be called on, as obtained from their \code{classes} attributes). 14 | } 15 | \description{ 16 | Produce an overview of all functions of class \code{visualFunction} 17 | available in the workspace or imported from packages. This overview includes 18 | the descriptions and a list of what classes the functions are each intended 19 | to be called on.
20 | } 21 | \examples{ 22 | allVisualFunctions() 23 | 24 | } 25 | \seealso{ 26 | \code{\link{visualFunction}} \code{\link{allCheckFunctions}} 27 | \code{\link{allSummaryFunctions}} 28 | } 29 | -------------------------------------------------------------------------------- /man/artData.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dataReporter-package.R 3 | \docType{data} 4 | \name{artData} 5 | \alias{artData} 6 | \title{Semi-artificial data about masterpieces of art} 7 | \format{ 8 | A data frame with 200 rows and 11 variables. 9 | \describe{ 10 | \item{ArtistID}{A unique ID used for cataloging the artists (fictional).} 11 | \item{ArtistName}{The name of the artist.} 12 | \item{NoOfMiddlenames}{The number of middlenames the artist has.} 13 | \item{Title}{The title of the painting.} 14 | \item{Year}{The approximate year in which the painting was made.} 15 | \item{Location}{The current location of the painting.} 16 | \item{Continent}{The continent of the current location of the painting.} 17 | \item{Width}{The width of the painting, in centimeters.} 18 | \item{Height}{The height of the painting, in centimeters.} 19 | \item{Media}{The media/materials of the painting.} 20 | \item{Movement}{The artistic movement(s) the painting belongs to.} 21 | } 22 | } 23 | \source{ 24 | Semi-artificial dataset constructed based on the Master Works of Art dataset available from 25 | \href{https://www.data-explorer.com/data/}{Data Explorer}. 26 | } 27 | \usage{ 28 | artData 29 | } 30 | \description{ 31 | A dataset with information about 200 paintings and their painters. 32 | Each observation in the dataset corresponds to a painting. A single artificial variable, 33 | namely an artist ID variable, has been included. Otherwise the information should 34 | be truthful.
35 | } 36 | \examples{ 37 | data(artData) 38 | 39 | } 40 | \keyword{datasets} 41 | -------------------------------------------------------------------------------- /man/basicVisual.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/basicVisual.R 3 | \name{basicVisual} 4 | \alias{basicVisual} 5 | \title{Produce distribution plots in the base R (graphics) style using \code{\link{plot}} and 6 | \code{\link{barplot}}} 7 | \usage{ 8 | basicVisual(v, vnam, doEval = TRUE) 9 | } 10 | \arguments{ 11 | \item{v}{The variable (vector) to be plotted.} 12 | 13 | \item{vnam}{The name of the variable which will appear as the title of the plot.} 14 | 15 | \item{doEval}{If TRUE, the plot itself is returned. Otherwise, the function returns 16 | a character string containing standalone R code for producing the plot.} 17 | } 18 | \description{ 19 | Plot the distribution of a variable, depending on its data class, using the base R 20 | plotting functions. Note that \code{basicVisual} is a \code{\link{visualFunction}}, compatible with the 21 | \code{\link{visualize}} and \code{\link{makeDataReport}} functions. 22 | } 23 | \details{ 24 | For character, factor, logical and (haven_)labelled variables, a barplot is produced. For numeric, 25 | integer or Date variables, \code{basicVisual} produces a histogram instead. Note that for 26 | integer and numeric variables, all non-finite (i.e. \code{NA}, \code{NaN}, \code{Inf}) values are 27 | removed prior to plotting. For character, factor, (haven_)labelled and logical variables, only \code{NA} 28 | values are removed. 
29 | } 30 | \examples{ 31 | 32 | #Save a variable 33 | myVar <- c(1:10) 34 | #Plot a variable 35 | basicVisual(myVar, "MyVar") 36 | 37 | #Produce code for plotting a variable 38 | basicVisual(myVar, "MyVar", doEval = FALSE) 39 | 40 | } 41 | \seealso{ 42 | \code{\link{visualize}}, \code{\link{standardVisual}} 43 | } 44 | -------------------------------------------------------------------------------- /man/basicVisualCFLB.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/basicVisual.R 3 | \name{basicVisualCFLB} 4 | \alias{basicVisualCFLB} 5 | \title{importFrom stats na.omit} 6 | \usage{ 7 | basicVisualCFLB(v, vnam, doEval = TRUE) 8 | } 9 | \arguments{ 10 | \item{v}{The variable (vector) to be plotted.} 11 | 12 | \item{vnam}{The name of the variable which will appear as the title of the plot.} 13 | 14 | \item{doEval}{If TRUE, the plot itself is returned. Otherwise, the function returns 15 | a character string containing standalone R code for producing the plot.} 16 | } 17 | \description{ 18 | importFrom stats na.omit 19 | } 20 | -------------------------------------------------------------------------------- /man/bigPresidentData.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dataReporter-package.R 3 | \docType{data} 4 | \name{bigPresidentData} 5 | \alias{bigPresidentData} 6 | \title{Semi-artificial data about the US presidents (extended version)} 7 | \format{ 8 | A data frame with 47 rows and 15 variables. 
9 | \describe{ 10 | \item{lastName}{A \code{Name} type variable containing the last name of the president.} 11 | \item{firstName}{A \code{Name} type variable containing the first name of the president.} 12 | \item{orderOfPresidency}{A factor variable indicating the order of the presidents (with George Washington 13 | as number 1 and Donald Trump as number 45).} 14 | \item{birthday}{A Date variable with the birthday of the president.} 15 | \item{dateOfDeath}{A Date variable with the date of the president's death.} 16 | \item{stateOfBirth}{A character variable with the state in which the president was born.} 17 | \item{party}{A character variable with the party to which the president was associated.} 18 | \item{presidencyBeginDate}{A Date variable with the date of inauguration of the president.} 19 | \item{presidencyEndDate}{A Date variable with the date at which the presidency ends.} 20 | \item{assassinationAttempt}{A numeric variable indicating whether there was an assassination 21 | attempt (\code{1}) or not (\code{0}) on the president.} 22 | \item{sex}{A factor variable with the sex of the president.} 23 | \item{ethnicity}{A factor variable with the ethnicity of the president.} 24 | \item{presidencyYears}{A numeric variable with the duration of the presidency, in years.} 25 | \item{ageAtInauguration}{A character variable with the age at inauguration.} 26 | \item{favoriteNumber}{A \code{complex} type variable with a fictional favorite number for 27 | each president.} 28 | } 29 | } 30 | \source{ 31 | Artificial dataset constructed based on the US president dataset available from 32 | \href{https://www.data-explorer.com/data/}{Data Explorer}. 33 | } 34 | \usage{ 35 | bigPresidentData 36 | } 37 | \description{ 38 | A dataset with information about the first 45 US presidents as well as a 46th 39 | person, who is not a US president, and a duplicate of one of the 45 actual presidents.
40 | The dataset was constructed to show the capabilities 41 | of \code{dataReporter} and therefore, it has been constructed to include errors and miscodings. 42 | Each observation in the dataset corresponds to a person. The dataset uses the 43 | non-standard class \code{Name} which is simply an attribute that has been added to 44 | two variables in order to show how \code{dataReporter} handles non-supported classes. Note that the dataset 45 | is an extended and more error-filled version of the dataset \code{presidentData} which is 46 | also included in the package. 47 | } 48 | \examples{ 49 | data(bigPresidentData) 50 | 51 | } 52 | \references{ 53 | Petersen AH, Ekstrøm CT (2019). “dataMaid: Your Assistant for Documenting Supervised Data Quality Screening in R.” _Journal of Statistical Software_, *90*(6), 1-38. doi: 10.18637/jss.v090.i06 ( \doi{10.18637/jss.v090.i06}). 54 | } 55 | \keyword{datasets} 56 | -------------------------------------------------------------------------------- /man/centralValue.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/centralValue.R 3 | \name{centralValue} 4 | \alias{centralValue} 5 | \title{summaryFunction for central values} 6 | \usage{ 7 | centralValue(v, ...) 8 | } 9 | \arguments{ 10 | \item{v}{A variable (vector).} 11 | 12 | \item{...}{Extra arguments to be passed to class-specific functions. These include 13 | \code{maxDecimals} (default is 2) which controls the rounding of integer and numeric 14 | values.} 15 | } 16 | \value{ 17 | An object of class \code{summaryResult} with the following entries: \code{$feature} 18 | (the mode/median),\code{$result} (the central value of \code{v}) and \code{$value} (identical 19 | to \code{$result}).
20 | 21 | If the mode is returned and it is not uniquely determined, the first value qualifying as a mode is 22 | returned, when the variable is sorted according to \code{\link{sort}}. 23 | } 24 | \description{ 25 | A \code{summaryFunction}, intended to be called from 26 | \code{\link{summarize}}, which returns the central value of a variable. 27 | For numeric and integer variables, this is the median. For 28 | character, factor, (haven_)labelled, Date and logical variables, the central value is the mode 29 | (i.e. the value that occurs the largest number of times). 30 | } 31 | \details{ 32 | Note that NA, NaN and Inf values are ignored for numeric and integer variables, while 33 | only NA values are ignored for factor, character, Date and (haven_)labelled variables. No values are 34 | ignored for logical variables. 35 | } 36 | \examples{ 37 | #central value of an integer variable: 38 | centralValue(c(rep(1, 25), rep(2, 10), rep(3, 20))) 39 | 40 | #central value of a character variable: 41 | centralValue(as.character(c(rep(1, 20), rep(2, 10), rep(3, 20)))) 42 | 43 | } 44 | \seealso{ 45 | \code{\link{summaryFunction}}, \code{\link{summarize}}, \code{\link{summaryResult}}, 46 | \code{\link{allSummaryFunctions}} 47 | } 48 | -------------------------------------------------------------------------------- /man/check.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/check.R 3 | \name{check} 4 | \alias{check} 5 | \title{Perform checks of potential errors in variable/dataset} 6 | \usage{ 7 | check(v, nMax = 10, checks = setChecks(), ...) 8 | } 9 | \arguments{ 10 | \item{v}{the vector or the dataset (\code{data.frame}) to be checked.} 11 | 12 | \item{nMax}{If a check is supposed to identify problematic values, 13 | this argument controls if all of these should be pasted onto the outputted 14 | message, or if only the first \code{nMax} should be included.
If set to \code{Inf}, 15 | all problematic values are printed.} 16 | 17 | \item{checks}{A list of checks to use on each supported variable type. We recommend 18 | using \code{\link{setChecks}} for creating this list and refer to the documentation 19 | of this function for more details.} 20 | 21 | \item{\dots}{Other arguments that are passed on to the checking functions. 22 | These include general parameters controlling how the check results are 23 | formatted (e.g. \code{maxDecimals}, which controls the number of decimals 24 | printed for numerical, problematic values).} 25 | } 26 | \value{ 27 | If \code{v} is a variable, a list of objects of class 28 | \code{\link{checkResult}}, which each summarizes the result of a 29 | \code{\link{checkFunction}} call performed on \code{v}. 30 | See \code{\link{checkResult}} for more details. If \code{v} is a 31 | \code{data.frame}, a list of lists of the form above 32 | is returned instead with one entry for each variable in \code{v}. 33 | } 34 | \description{ 35 | Run a set of validation checks to check a variable vector or a full dataset 36 | for potential errors. 37 | Which checks are performed depends on the class of the variable and on user 38 | inputs. 39 | } 40 | \details{ 41 | It should be noted that the default options for each variable type 42 | are returned by calling e.g. \code{defaultCharacterChecks()}, 43 | \code{defaultFactorChecks()}, \code{defaultNumericChecks()}, etc. A complete 44 | overview of all default options can be obtained by calling \code{setChecks()}. 45 | Moreover, all available \code{checkFunction}s (including both locally defined 46 | functions and functions imported from \code{dataReporter} or other packages) can 47 | be viewed by calling \code{allCheckFunctions()}.
48 | } 49 | \examples{ 50 | 51 | x <- 1:5 52 | check(x) 53 | 54 | #Annoyingly coded missing as 99 55 | y <- c(rnorm(100), rep(99, 10)) 56 | check(y) 57 | 58 | #Check y for outliers and print 4 decimals for problematic variables 59 | check(y, checks = setChecks(numeric = "identifyOutliers"), maxDecimals = 4) 60 | 61 | #Change what checks are performed on a variable, now only identifyMissing is called 62 | # for numeric variables 63 | check(y, checks = setChecks(numeric = "identifyMissing")) 64 | 65 | #Check a full data.frame at once 66 | data(cars) 67 | check(cars) 68 | 69 | #Check a full data.frame at once, while changing the standard settings for 70 | #several data classes at once. Here, we omit the check of miscoded missing values for factors 71 | #and we only do this check for numeric variables: 72 | check(cars, checks = setChecks(factor = defaultFactorChecks(remove = "identifyMissing"), 73 | numeric = "identifyMissing")) 74 | 75 | } 76 | \references{ 77 | Petersen AH, Ekstrøm CT (2019). “dataMaid: Your Assistant for Documenting Supervised Data Quality Screening in R.” _Journal of Statistical Software_, *90*(6), 1-38. doi: 10.18637/jss.v090.i06 ( \doi{10.18637/jss.v090.i06}).
78 | } 79 | \seealso{ 80 | \code{\link{setChecks}}, 81 | \code{\link{allCheckFunctions}} \code{\link{checkResult}} 82 | \code{\link{checkFunction}}, \code{\link{defaultCharacterChecks}}, 83 | \code{\link{defaultFactorChecks}}, \code{\link{defaultLabelledChecks}}, 84 | \code{\link{defaultHavenlabelledChecks}}, 85 | \code{\link{defaultNumericChecks}}, \code{\link{defaultIntegerChecks}}, 86 | \code{\link{defaultLogicalChecks}}, \code{\link{defaultDateChecks}} 87 | } 88 | \keyword{misc} 89 | -------------------------------------------------------------------------------- /man/checkResult.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/checkResult.R 3 | \name{checkResult} 4 | \alias{checkResult} 5 | \title{Create object of class checkResult} 6 | \usage{ 7 | checkResult(ls) 8 | } 9 | \arguments{ 10 | \item{ls}{A list with entries \code{$problem} (logical indicating whether 11 | a problem was found), \code{$message} (a character string containing a 12 | message describing the problem) and \code{$problemValues} (the values 13 | in the checked variables that were marked as problematic). Note that 14 | \code{$message} and \code{$problemValues} can be left empty (i.e. 15 | \code{""} and \code{NULL}, respectively), if they are not relevant.} 16 | } 17 | \value{ 18 | A S3 object of class \code{checkResult}, identical to the inputted 19 | list, \code{ls}, except for its class attribute. 20 | } 21 | \description{ 22 | Convert a list resulting from the checks performed in a 23 | \code{\link{checkFunction}} into a \code{checkResult} object, thereby 24 | supplying it with a \code{print()} method. 
25 | } 26 | \seealso{ 27 | \code{\link{checkFunction}} 28 | } 29 | -------------------------------------------------------------------------------- /man/classes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/classes.R 3 | \name{classes} 4 | \alias{classes} 5 | \alias{classes<-} 6 | \title{Extract the contents of the attribute \code{classes}} 7 | \usage{ 8 | classes(x) 9 | 10 | classes(x) <- value 11 | } 12 | \arguments{ 13 | \item{x}{The object for which the \code{classes} 14 | attribute should be extracted.} 15 | 16 | \item{value}{New value} 17 | } 18 | \value{ 19 | The classes for which \code{x} is intended to be called, 20 | given as a vector of characters. 21 | } 22 | \description{ 23 | If the object, \code{x}, is itself of 24 | class \code{\link{checkFunction}}, \code{\link{summaryFunction}} 25 | or \code{\link{visualFunction}}, the contents of \code{x}'s 26 | attribute \code{classes} is returned. Otherwise, \code{NULL} is 27 | returned. 28 | } 29 | \examples{ 30 | #Extract the classes of the checkFunction identifyMissing 31 | classes(identifyMissing) 32 | 33 | #Extract the classes of the summaryFunction minMax 34 | classes(minMax) 35 | 36 | #Extract the classes of the visualFunction basicVisual 37 | classes(basicVisual) 38 | 39 | } 40 | -------------------------------------------------------------------------------- /man/countMissing.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/countMissing.R 3 | \name{countMissing} 4 | \alias{countMissing} 5 | \title{Summary function for missing values} 6 | \usage{ 7 | countMissing(v, ...) 
8 | } 9 | \arguments{ 10 | \item{v}{A variable (vector).} 11 | 12 | \item{...}{Not in use.} 13 | } 14 | \value{ 15 | A \code{\link{summaryResult}} object with the following entries: 16 | \code{$feature} ("No. missing obs."), \code{$result} (the number and percentage 17 | missing observations) and \code{$value} (the number of missing observations). 18 | } 19 | \description{ 20 | A \code{\link{summaryFunction}}, intended to be called from 21 | \code{\link{summarize}} (and \code{\link{makeDataReport}}), which counts the 22 | number of missing (\code{NA}) values in a variable. 23 | } 24 | \examples{ 25 | countMissing(c(1:100, rep(NA, 10))) 26 | 27 | } 28 | \seealso{ 29 | \code{\link{summarize}}, \code{\link{allSummaryFunctions}}, 30 | \code{\link{summaryFunction}}, \code{\link{summaryResult}} 31 | } 32 | -------------------------------------------------------------------------------- /man/defaultCharacterChecks.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/check.R 3 | \name{defaultCharacterChecks} 4 | \alias{defaultCharacterChecks} 5 | \title{Default checks for character variables} 6 | \usage{ 7 | defaultCharacterChecks(remove = NULL, add = NULL) 8 | } 9 | \arguments{ 10 | \item{remove}{Character vector of function names. Checks to remove from the returned vector} 11 | 12 | \item{add}{Character vector of function names. Checks to add to the returned vector} 13 | } 14 | \value{ 15 | A vector of function names. 16 | } 17 | \description{ 18 | Default options for which checks to perform on 19 | character type variables in \code{\link{check}} and \code{\link{makeDataReport}}, 20 | possibly user-modified by adding extra function names using \code{add} or 21 | removing default function names with \code{remove}. 
22 | } 23 | -------------------------------------------------------------------------------- /man/defaultCharacterSummaries.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/summarize.R 3 | \name{defaultCharacterSummaries} 4 | \alias{defaultCharacterSummaries} 5 | \title{Default summary functions for character variables} 6 | \usage{ 7 | defaultCharacterSummaries(remove = NULL, add = NULL) 8 | } 9 | \arguments{ 10 | \item{remove}{Character vector of function names. Summary functions to remove from the returned vector} 11 | 12 | \item{add}{Character vector of function names. Summary functions to add to the returned vector} 13 | } 14 | \value{ 15 | A list of function names (as character strings). 16 | } 17 | \description{ 18 | Default options for which summaries to apply on 19 | character type variables in \code{\link{summarize}} and \code{\link{makeDataReport}}, 20 | possibly user-modified by adding extra function names using \code{add} or 21 | removing default function names with \code{remove}. 22 | } 23 | \examples{ 24 | #remove "variableType" from the summaries: 25 | defaultCharacterSummaries(remove = "variableType") 26 | 27 | } 28 | \seealso{ 29 | \code{\link{variableType}}, \code{\link{countMissing}}, \code{\link{uniqueValues}}, 30 | \code{\link{centralValue}} 31 | } 32 | -------------------------------------------------------------------------------- /man/defaultDateChecks.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/check.R 3 | \name{defaultDateChecks} 4 | \alias{defaultDateChecks} 5 | \title{Default checks for Date variables} 6 | \usage{ 7 | defaultDateChecks(remove = NULL, add = NULL) 8 | } 9 | \arguments{ 10 | \item{remove}{Character vector of function names. 
Checks to remove from the returned vector} 11 | 12 | \item{add}{Character vector of function names. Checks to add to the returned vector} 13 | } 14 | \value{ 15 | A vector of function names. 16 | } 17 | \description{ 18 | Default options for which checks to perform on 19 | Date type variables in \code{\link{check}} and \code{\link{makeDataReport}}, 20 | possibly user-modified by adding extra function names using \code{add} or 21 | removing default function names with \code{remove}. 22 | } 23 | -------------------------------------------------------------------------------- /man/defaultDateSummaries.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/summarize.R 3 | \name{defaultDateSummaries} 4 | \alias{defaultDateSummaries} 5 | \title{Default summary functions for Date variables} 6 | \usage{ 7 | defaultDateSummaries(remove = NULL, add = NULL) 8 | } 9 | \arguments{ 10 | \item{remove}{Character vector of function names. Summary functions to remove from the returned vector} 11 | 12 | \item{add}{Character vector of function names. Summary functions to add to the returned vector} 13 | } 14 | \value{ 15 | A list of function names (as character strings). 16 | } 17 | \description{ 18 | Default options for which summaries to apply on 19 | Date type variables in \code{\link{summarize}} and \code{\link{makeDataReport}}, 20 | possibly user-modified by adding extra function names using \code{add} or 21 | removing default function names with \code{remove}. 
22 | } 23 | \examples{ 24 | defaultDateSummaries() 25 | 26 | } 27 | \seealso{ 28 | \code{\link{variableType}}, \code{\link{countMissing}}, \code{\link{uniqueValues}}, 29 | \code{\link{centralValue}}, \code{\link{minMax}}, \code{\link{quartiles}} 30 | } 31 | -------------------------------------------------------------------------------- /man/defaultFactorChecks.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/check.R 3 | \name{defaultFactorChecks} 4 | \alias{defaultFactorChecks} 5 | \title{Default checks for factor variables} 6 | \usage{ 7 | defaultFactorChecks(remove = NULL, add = NULL) 8 | } 9 | \arguments{ 10 | \item{remove}{Character vector of function names. Checks to remove from the returned vector} 11 | 12 | \item{add}{Character vector of function names. Checks to add to the returned vector} 13 | } 14 | \value{ 15 | A vector of function names. 16 | } 17 | \description{ 18 | Default options for which checks to perform on 19 | factor type variables in \code{\link{check}} and \code{\link{makeDataReport}}, 20 | possibly user-modified by adding extra function names using \code{add} or 21 | removing default function names with \code{remove}. 22 | } 23 | -------------------------------------------------------------------------------- /man/defaultFactorSummaries.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/summarize.R 3 | \name{defaultFactorSummaries} 4 | \alias{defaultFactorSummaries} 5 | \title{Default summary functions for factor variables} 6 | \usage{ 7 | defaultFactorSummaries(remove = NULL, add = NULL) 8 | } 9 | \arguments{ 10 | \item{remove}{Character vector of function names. Summary functions to remove from the returned vector} 11 | 12 | \item{add}{Character vector of function names. 
Summary functions to add to the returned vector} 13 | } 14 | \value{ 15 | A list of function names (as character strings). 16 | } 17 | \description{ 18 | Default options for which summaries to apply on 19 | factor type variables in \code{\link{summarize}} and \code{\link{makeDataReport}}, 20 | possibly user-modified by adding extra function names using \code{add} or 21 | removing default function names with \code{remove}. 22 | } 23 | \examples{ 24 | #remove "countMissing" for the summaries: 25 | defaultFactorSummaries(remove = "countMissing") 26 | 27 | } 28 | \seealso{ 29 | \code{\link{variableType}}, \code{\link{countMissing}}, \code{\link{uniqueValues}}, 30 | \code{\link{centralValue}} 31 | } 32 | -------------------------------------------------------------------------------- /man/defaultHavenlabelledChecks.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/check.R 3 | \name{defaultHavenlabelledChecks} 4 | \alias{defaultHavenlabelledChecks} 5 | \title{Default checks for haven_labelled variables} 6 | \usage{ 7 | defaultHavenlabelledChecks(remove = NULL, add = NULL) 8 | } 9 | \arguments{ 10 | \item{remove}{Character vector of function names. Checks to remove from the returned vector} 11 | 12 | \item{add}{Character vector of function names. Checks to add to the returned vector} 13 | } 14 | \value{ 15 | A vector of function names. 16 | } 17 | \description{ 18 | Default options for which checks to perform on 19 | haven_labelled type variables in \code{\link{check}} and \code{\link{makeDataReport}}, 20 | possibly user-modified by adding extra function names using \code{add} or 21 | removing default function names with \code{remove}. 
22 | } 23 | -------------------------------------------------------------------------------- /man/defaultHavenlabelledSummaries.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/summarize.R 3 | \name{defaultHavenlabelledSummaries} 4 | \alias{defaultHavenlabelledSummaries} 5 | \title{Default summary functions for haven_labelled variables} 6 | \usage{ 7 | defaultHavenlabelledSummaries(remove = NULL, add = NULL) 8 | } 9 | \arguments{ 10 | \item{remove}{Character vector of function names. Summary functions to remove from the returned vector} 11 | 12 | \item{add}{Character vector of function names. Summary functions to add to the returned vector} 13 | } 14 | \value{ 15 | A list of function names (as character strings). 16 | } 17 | \description{ 18 | Default options for which summaries to apply on 19 | haven_labelled type variables in \code{\link{summarize}} and \code{\link{makeDataReport}}, 20 | possibly user-modified by adding extra function names using \code{add} or 21 | removing default function names with \code{remove}. 22 | } 23 | \examples{ 24 | #remove "centralValue": 25 | defaultHavenlabelledSummaries(remove = "centralValue") 26 | 27 | } 28 | \seealso{ 29 | \code{\link{variableType}}, 30 | \code{\link{countMissing}}, \code{\link{uniqueValues}}, \code{\link{centralValue}} 31 | } 32 | -------------------------------------------------------------------------------- /man/defaultIntegerChecks.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/check.R 3 | \name{defaultIntegerChecks} 4 | \alias{defaultIntegerChecks} 5 | \title{Default checks for integer variables} 6 | \usage{ 7 | defaultIntegerChecks(remove = NULL, add = NULL) 8 | } 9 | \arguments{ 10 | \item{remove}{Character vector of function names. 
Checks to remove from the returned vector} 11 | 12 | \item{add}{Character vector of function names. Checks to add to the returned vector} 13 | } 14 | \value{ 15 | A vector of function names. 16 | } 17 | \description{ 18 | Default options for which checks to perform on 19 | integer type variables in \code{\link{check}} and \code{\link{makeDataReport}}, 20 | possibly user-modified by adding extra function names using \code{add} or 21 | removing default function names with \code{remove}. 22 | } 23 | -------------------------------------------------------------------------------- /man/defaultIntegerSummaries.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/summarize.R 3 | \name{defaultIntegerSummaries} 4 | \alias{defaultIntegerSummaries} 5 | \title{Default summary functions for integer variables} 6 | \usage{ 7 | defaultIntegerSummaries(remove = NULL, add = NULL) 8 | } 9 | \arguments{ 10 | \item{remove}{Character vector of function names. Summary functions to remove from the returned vector} 11 | 12 | \item{add}{Character vector of function names. Summary functions to add to the returned vector} 13 | } 14 | \value{ 15 | A list of function names (as character strings). 16 | } 17 | \description{ 18 | Default options for which summaries to apply on 19 | integer type variables in \code{\link{summarize}} and \code{\link{makeDataReport}}, 20 | possibly user-modified by adding extra function names using \code{add} or 21 | removing default function names with \code{remove}. 
22 | } 23 | \examples{ 24 | #remove "countMissing": 25 | defaultIntegerSummaries(remove = "countMissing") 26 | 27 | } 28 | \seealso{ 29 | \code{\link{variableType}}, 30 | \code{\link{countMissing}}, \code{\link{uniqueValues}}, 31 | \code{\link{centralValue}}, \code{\link{quartiles}}, \code{\link{minMax}} 32 | } 33 | -------------------------------------------------------------------------------- /man/defaultLabelledChecks.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/check.R 3 | \name{defaultLabelledChecks} 4 | \alias{defaultLabelledChecks} 5 | \title{Default checks for labelled variables} 6 | \usage{ 7 | defaultLabelledChecks(remove = NULL, add = NULL) 8 | } 9 | \arguments{ 10 | \item{remove}{Character vector of function names. Checks to remove from the returned vector} 11 | 12 | \item{add}{Character vector of function names. Checks to add to the returned vector} 13 | } 14 | \value{ 15 | A vector of function names. 16 | } 17 | \description{ 18 | Default options for which checks to perform on 19 | labelled type variables in \code{\link{check}} and \code{\link{makeDataReport}}, 20 | possibly user-modified by adding extra function names using \code{add} or 21 | removing default function names with \code{remove}. 22 | } 23 | -------------------------------------------------------------------------------- /man/defaultLabelledSummaries.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/summarize.R 3 | \name{defaultLabelledSummaries} 4 | \alias{defaultLabelledSummaries} 5 | \title{Default summary functions for labelled variables} 6 | \usage{ 7 | defaultLabelledSummaries(remove = NULL, add = NULL) 8 | } 9 | \arguments{ 10 | \item{remove}{Character vector of function names. 
Summary functions to remove from the returned vector} 11 | 12 | \item{add}{Character vector of function names. Summary functions to add to the returned vector} 13 | } 14 | \value{ 15 | A list of function names (as character strings). 16 | } 17 | \description{ 18 | Default options for which summaries to apply on 19 | labelled type variables in \code{\link{summarize}} and \code{\link{makeDataReport}}, 20 | possibly user-modified by adding extra function names using \code{add} or 21 | removing default function names with \code{remove}. 22 | } 23 | \examples{ 24 | #remove "centralValue": 25 | defaultLabelledSummaries(remove = "centralValue") 26 | 27 | } 28 | \seealso{ 29 | \code{\link{variableType}}, 30 | \code{\link{countMissing}}, \code{\link{uniqueValues}}, \code{\link{centralValue}} 31 | } 32 | -------------------------------------------------------------------------------- /man/defaultLogicalChecks.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/check.R 3 | \name{defaultLogicalChecks} 4 | \alias{defaultLogicalChecks} 5 | \title{Default checks for logical variables} 6 | \usage{ 7 | defaultLogicalChecks(remove = NULL, add = NULL) 8 | } 9 | \arguments{ 10 | \item{remove}{Character vector of function names. Checks to remove from the returned vector} 11 | 12 | \item{add}{Character vector of function names. Checks to add to the returned vector} 13 | } 14 | \value{ 15 | A vector of function names. 16 | } 17 | \description{ 18 | Default options for which checks to perform on 19 | logical type variables in \code{\link{check}} and \code{\link{makeDataReport}}, 20 | possibly user-modified by adding extra function names using \code{add} or 21 | removing default function names with \code{remove}. 
22 | } 23 | -------------------------------------------------------------------------------- /man/defaultLogicalSummaries.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/summarize.R 3 | \name{defaultLogicalSummaries} 4 | \alias{defaultLogicalSummaries} 5 | \title{Default summary functions for logical variables} 6 | \usage{ 7 | defaultLogicalSummaries(remove = NULL, add = NULL) 8 | } 9 | \arguments{ 10 | \item{remove}{Character vector of function names. Summary functions to remove from the returned vector} 11 | 12 | \item{add}{Character vector of function names. Summary functions to add to the returned vector} 13 | } 14 | \value{ 15 | A list of function names (as character strings). 16 | } 17 | \description{ 18 | Default options for which summaries to apply on 19 | logical type variables in \code{\link{summarize}} and \code{\link{makeDataReport}}, 20 | possibly user-modified by adding extra function names using \code{add} or 21 | removing default function names with \code{remove}. 22 | } 23 | \examples{ 24 | #remove "uniqueValues": 25 | defaultLogicalSummaries(remove = "uniqueValues") 26 | 27 | } 28 | \seealso{ 29 | \code{\link{variableType}}, 30 | \code{\link{countMissing}}, \code{\link{uniqueValues}}, \code{\link{centralValue}} 31 | } 32 | -------------------------------------------------------------------------------- /man/defaultNumericChecks.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/check.R 3 | \name{defaultNumericChecks} 4 | \alias{defaultNumericChecks} 5 | \title{Default checks for numeric variables} 6 | \usage{ 7 | defaultNumericChecks(remove = NULL, add = NULL) 8 | } 9 | \arguments{ 10 | \item{remove}{Character vector of function names. Checks to remove from the returned vector} 11 | 12 | \item{add}{Character vector of function names. 
Checks to add to the returned vector} 13 | } 14 | \value{ 15 | A vector of function names. 16 | } 17 | \description{ 18 | Default options for which checks to perform on 19 | numeric type variables in \code{\link{check}} and \code{\link{makeDataReport}}, 20 | possibly user-modified by adding extra function names using \code{add} or 21 | removing default function names with \code{remove}. 22 | } 23 | -------------------------------------------------------------------------------- /man/defaultNumericSummaries.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/summarize.R 3 | \name{defaultNumericSummaries} 4 | \alias{defaultNumericSummaries} 5 | \title{Default summary functions for numeric variables} 6 | \usage{ 7 | defaultNumericSummaries(remove = NULL, add = NULL) 8 | } 9 | \arguments{ 10 | \item{remove}{Character vector of function names. Summary functions to remove from the returned vector} 11 | 12 | \item{add}{Character vector of function names. Summary functions to add to the returned vector} 13 | } 14 | \value{ 15 | A list of function names (as character strings). 16 | } 17 | \description{ 18 | Default options for which summaries to apply on 19 | numeric type variables in \code{\link{summarize}} and \code{\link{makeDataReport}}, 20 | possibly user-modified by adding extra function names using \code{add} or 21 | removing default function names with \code{remove}. 
22 | } 23 | \examples{ 24 | #remove "uniqueValues": 25 | defaultNumericSummaries(remove = "uniqueValues") 26 | 27 | } 28 | \seealso{ 29 | \code{\link{variableType}}, 30 | \code{\link{countMissing}}, \code{\link{uniqueValues}}, 31 | \code{\link{centralValue}}, \code{\link{quartiles}}, \code{\link{minMax}} 32 | } 33 | -------------------------------------------------------------------------------- /man/description.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/description.R 3 | \name{description} 4 | \alias{description} 5 | \alias{description<-} 6 | \title{Extract the contents of the attribute \code{description}} 7 | \usage{ 8 | description(x) 9 | 10 | description(x) <- value 11 | } 12 | \arguments{ 13 | \item{x}{The object for which the \code{description} 14 | attribute should be extracted.} 15 | 16 | \item{value}{New value} 17 | } 18 | \value{ 19 | A description of what \code{x} does, given as 20 | a character string. 21 | } 22 | \description{ 23 | If the object, \code{x}, is itself of 24 | class \code{\link{checkFunction}}, \code{\link{summaryFunction}} 25 | or \code{\link{visualFunction}}, the contents of \code{x}'s 26 | attribute \code{description} is returned. Otherwise, \code{NULL} is 27 | returned. 
28 | } 29 | \examples{ 30 | #Extract the description of the checkFunction identifyMissing 31 | description(identifyMissing) 32 | 33 | #Extract the description of the summaryFunction minMax 34 | description(minMax) 35 | 36 | #Extract the description of the visualFunction basicVisual 37 | description(basicVisual) 38 | 39 | } 40 | -------------------------------------------------------------------------------- /man/exampleData.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dataReporter-package.R 3 | \docType{data} 4 | \name{exampleData} 5 | \alias{exampleData} 6 | \title{Example data with zero-inflated variables} 7 | \format{ 8 | A \code{data.frame} with 300 observations on the following 6 variables. 9 | \describe{ 10 | \item{\code{addresses}}{a factor with fictitious US addresses} 11 | \item{\code{binomial}}{a numeric vector with a binomial distributed variable} 12 | \item{\code{poisson}}{a numeric vector with a Poisson distributed variable} 13 | \item{\code{gauss}}{a numeric vector with a Gaussian distributed variable} 14 | \item{\code{zigauss}}{a numeric vector with a zero-inflated Gaussian distributed variable} 15 | \item{\code{bpinteraction}}{a factor with interactions between binomial and poisson values} 16 | } 17 | } 18 | \source{ 19 | Artificial data 20 | } 21 | \usage{ 22 | exampleData 23 | } 24 | \description{ 25 | An artificial dataset, intended for presenting the extended features of \code{dataReporter}, 26 | which is a toolset for identifying potential errors in a dataset. 27 | } 28 | \examples{ 29 | 30 | isID <- function(v, nMax = NULL, ...) 
{ 31 | out <- list(problem = FALSE, message = "") 32 | if (class(v) \%in\% c("character", "factor", "labelled", "numeric", "integer")) { 33 | v <- as.character(v) 34 | lengths <- nchar(v) 35 | if (all(lengths > 10) & length(unique(lengths)) == 1) { 36 | out$problem <- TRUE 37 | out$message <- "Warning: This variable seems to contain ID codes!" 38 | } 39 | } 40 | out 41 | } 42 | 43 | 44 | countZeros <- function(v, ...) { 45 | res <- length(which(v == 0)) 46 | summaryResult(list(feature = "No. zeros", result = res, value = res)) 47 | } 48 | countZeros <- summaryFunction(countZeros, description = "Count number of zeros", 49 | classes = allClasses()) 50 | summarize(toyData, numericSummaries = c(defaultNumericSummaries())) 51 | 52 | 53 | mosaicVisual <- function(v, vnam, doEval) { 54 | thisCall <- call("mosaicplot", table(v), main = vnam, xlab = "") 55 | if (doEval) { 56 | return(eval(thisCall)) 57 | } else return(deparse(thisCall)) 58 | } 59 | mosaicVisual <- visualFunction(mosaicVisual, 60 | description = "Mosaic plots using graphics", 61 | classes = allClasses()) 62 | 63 | identifyColons <- function(v, nMax = Inf, ... 
) { 64 | v <- unique(na.omit(v)) 65 | problemMessage <- "Note: The following values include colons:" 66 | problem <- FALSE 67 | problemValues <- NULL 68 | 69 | problemValues <- v[sapply(gregexpr("[[:xdigit:]]:[[:xdigit:]]", v), 70 | function(x) all(x != -1))] 71 | 72 | if (length(problemValues) > 0) { 73 | problem <- TRUE 74 | } 75 | 76 | problemStatus <- list(problem = problem, 77 | problemValues = problemValues) 78 | outMessage <- messageGenerator(problemStatus, problemMessage, nMax) 79 | 80 | checkResult(list(problem = problem, 81 | message = outMessage, 82 | problemValues = problemValues)) 83 | } 84 | 85 | identifyColons <- checkFunction(identifyColons, 86 | description = "Identify non-suffixed nor -prefixed colons", 87 | classes = c("character", "factor", "labelled")) 88 | 89 | \donttest{ 90 | makeDataReport(exampleData, replace = TRUE, 91 | preChecks = c("isKey", "isEmpty", "isID"), 92 | allVisuals = "mosaicVisual", 93 | characterSummaries = c(defaultCharacterSummaries(), "countZeros"), 94 | factorSummaries = c(defaultFactorSummaries(), "countZeros"), 95 | labelledSummaries = c(defaultLabelledSummaries(), "countZeros"), 96 | numericSummaries = c(defaultNumericSummaries(), "countZeros"), 97 | integerSummaries = c(defaultIntegerSummaries(), "countZeros"), 98 | characterChecks = c(defaultCharacterChecks(), "identifyColons"), 99 | factorChecks = c(defaultFactorChecks(), "identifyColons"), 100 | labelledCheck = c(defaultLabelledChecks(), "identifyColons")) 101 | 102 | 103 | 104 | } 105 | 106 | } 107 | \keyword{datasets} 108 | -------------------------------------------------------------------------------- /man/figures/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekstroem/dataReporter/90bcf67e591de0c3b3bfab2620c2d77851c432e1/man/figures/logo.png -------------------------------------------------------------------------------- /man/identifyCaseIssues.Rd: 
-------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/identifyCaseIssues.R 3 | \name{identifyCaseIssues} 4 | \alias{identifyCaseIssues} 5 | \title{A checkFunction for identifying case issues} 6 | \usage{ 7 | identifyCaseIssues(v, nMax = 10) 8 | } 9 | \arguments{ 10 | \item{v}{A character, factor, haven_labelled or labelled variable to check.} 11 | 12 | \item{nMax}{The maximum number of problematic values to report. 13 | Default is \code{10}. Set to \code{Inf} if all problematic values are to be included 14 | in the outputted message, or to \code{0} for no output.} 15 | } 16 | \value{ 17 | A \code{\link{checkResult}} with three entries: 18 | \code{$problem} (a logical indicating whether case issues were found), 19 | \code{$message} (a message describing which values in \code{v} resulted 20 | in case issues) and \code{$problemValues} (the problematic values 21 | in their original format). Note that only unique problematic values 22 | are listed and they are presented in alphabetical order. 23 | } 24 | \description{ 25 | A \code{\link{checkFunction}} to be called from 26 | \code{\link{check}} that identifies values in a vector 27 | that appear multiple times with different case settings. 
28 | } 29 | \examples{ 30 | identifyCaseIssues(c("val", "b", "1", "1", "vAl", "VAL", "oh", "OH")) 31 | 32 | } 33 | \seealso{ 34 | \code{\link{check}}, \code{\link{allCheckFunctions}}, 35 | \code{\link{checkFunction}}, \code{\link{checkResult}} 36 | } 37 | -------------------------------------------------------------------------------- /man/identifyLoners.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/identifyLoners.R 3 | \name{identifyLoners} 4 | \alias{identifyLoners} 5 | \title{A checkFunction for identifying sparsely represented values (loners)} 6 | \usage{ 7 | identifyLoners(v, nMax = 10) 8 | } 9 | \arguments{ 10 | \item{v}{A character, (haven_)labelled, or factor variable to check.} 11 | 12 | \item{nMax}{The maximum number of problematic values to report. 13 | Default is \code{10}. Set to \code{Inf} if all problematic values are to be included 14 | in the outputted message, or to \code{0} for no output.} 15 | } 16 | \value{ 17 | A \code{\link{checkResult}} with three entries: 18 | \code{$problem} (a logical indicating whether loners were found), 19 | \code{$message} (a message describing which values in \code{v} were loners) and 20 | \code{$problemValues} (the problematic values in their original format). 21 | Note that only unique problematic values 22 | are listed and they are presented in alphabetical order. 23 | } 24 | \description{ 25 | A \code{\link{checkFunction}} to be called from \code{\link{check}} that identifies values that 26 | occur fewer than 6 times in factor, (haven_)labelled, or character variables (that is, loners). 27 | } 28 | \details{ 29 | For character, (haven_)labelled, and factor variables, identify values that only have a 30 | very low number of observations, as these categories might be 31 | problematic when conducting an analysis. Unused factor levels are 32 | not considered "loners". 
"Loners" are defined as values with 5 or fewer 33 | observations, reflecting the commonly used rule of thumb for performing 34 | chi squared tests. 35 | } 36 | \examples{ 37 | identifyLoners(c(rep(c("a", "b", "c"), 10), "d", "d")) 38 | 39 | } 40 | \seealso{ 41 | \code{\link{check}}, \code{\link{allCheckFunctions}}, 42 | \code{\link{checkFunction}}, \code{\link{checkResult}} 43 | } 44 | -------------------------------------------------------------------------------- /man/identifyMissing.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/identifyMissing.R 3 | \name{identifyMissing} 4 | \alias{identifyMissing} 5 | \title{A checkFunction for identifying miscoded missing values.} 6 | \usage{ 7 | identifyMissing(v, nMax = 10, ...) 8 | } 9 | \arguments{ 10 | \item{v}{A variable to check.} 11 | 12 | \item{nMax}{The maximum number of problematic values to report. 13 | Default is \code{10}. Set to \code{Inf} if all problematic values are to be included 14 | in the outputted message, or to \code{0} for no output.} 15 | 16 | \item{...}{Not in use.} 17 | } 18 | \value{ 19 | A \code{\link{checkResult}} with three entries: 20 | \code{$problem} (a logical indicating whether miscoded missing values were found), 21 | \code{$message} (a message describing which values in \code{v} were suspected to be 22 | miscoded missing values), and \code{$problemValues} (the problematic values 23 | in their original format). Note that only unique problematic values 24 | are listed and that they are presented in alphabetical order. 25 | } 26 | \description{ 27 | A checkFunction to be called from \code{\link{check}} that identifies values that 28 | appear to be miscoded missing values. 29 | } 30 | \details{ 31 | \code{identifyMissing} tries to identify common choices of missing values outside of the 32 | R standard (\code{NA}). 
These include special words (NaN and Inf (no matter the cases)), 33 | one or more -9/9's (e.g. 999, "99", -9, "-99"), one or more -8/8's (e.g. -8, 888, -8888), 34 | Stata style missing values (commencing with ".") and other character strings 35 | ("", " ", "-", "NA" miscoded as character). If the variable is numeric/integer or a 36 | character/factor variable consisting only of numbers and with more than 11 different values, 37 | the numeric miscoded missing values (999, 888, -99, -8 etc.) are 38 | only recognized as miscoded missing if they are maximum or minimum, respectively, and the distance 39 | between the second largest/smallest value and this maximum/minimum value is greater than one. 40 | } 41 | \examples{ 42 | 43 | #Identify miscoded numeric missing values 44 | v1 <- c(1:15, 99) 45 | v2 <- c(v1, 98) 46 | v3 <- c(-999, v2, 9999) 47 | identifyMissing(v1) 48 | identifyMissing(v2) 49 | identifyMissing(v3) 50 | identifyMissing(factor(v3)) 51 | 52 | } 53 | \seealso{ 54 | \code{\link{check}}, \code{\link{allCheckFunctions}}, 55 | \code{\link{checkFunction}}, \code{\link{checkResult}} 56 | } 57 | -------------------------------------------------------------------------------- /man/identifyNums.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/identifyNums.R 3 | \name{identifyNums} 4 | \alias{identifyNums} 5 | \title{A checkFunction} 6 | \usage{ 7 | identifyNums(v, nVals = 12, ...) 8 | } 9 | \arguments{ 10 | \item{v}{A character, factor, or (haven_)labelled variable to check.} 11 | 12 | \item{nVals}{An integer determining how many unique values a variable must have 13 | before it can potentially be determined to be a misclassified numeric variable. 
14 | The default is \code{12}.} 15 | 16 | \item{...}{Not in use.} 17 | } 18 | \value{ 19 | A \code{\link{checkResult}} with three entries: 20 | \code{$problem} (a logical indicating the variable is suspected to be 21 | a misclassified numeric variable), \code{$message} (if a problem was found, 22 | the following message: "Note: The variable consists exclusively of numbers and takes 23 | a lot of different values. Is it perhaps a misclassified numeric variable?", 24 | otherwise "") and \code{$problemValues} (always \code{NULL}). 25 | } 26 | \description{ 27 | A \code{\link{checkFunction}} to be called from 28 | \code{\link{check}} for identifying numeric variables that have 29 | been misclassified as categorical. 30 | } 31 | \details{ 32 | A categorical variable is suspected to be a misclassified 33 | numeric variable if it has the following two properties: First, 34 | it should consist exclusively of numbers (possibly including signs 35 | and decimal points). Secondly, it must have at least \code{nVals} unique values. 36 | The default value of \code{nVals} is 12, which means that 37 | e.g. variables including answers on a scale from 0-10 will 38 | not be recognized as misclassified numerics. 
39 | } 40 | \examples{ 41 | #Positive and negative numbers, saved as characters 42 | identifyNums(c(as.character(-9:9))) 43 | 44 | #An ordinary character variable 45 | identifyNums(c("a", "b", "c", "d", "e.f", "-a", 1:100)) 46 | 47 | 48 | } 49 | \seealso{ 50 | \code{\link{check}}, \code{\link{allCheckFunctions}}, 51 | \code{\link{checkFunction}}, \code{\link{checkResult}} 52 | } 53 | -------------------------------------------------------------------------------- /man/identifyOutliers.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/identifyOutliers.R 3 | \name{identifyOutliers} 4 | \alias{identifyOutliers} 5 | \title{A checkFunction for identifying outliers} 6 | \usage{ 7 | identifyOutliers(v, nMax = 10, maxDecimals = 2) 8 | } 9 | \arguments{ 10 | \item{v}{A Date, numeric or integer variable to check.} 11 | 12 | \item{nMax}{The maximum number of problematic values to report. 13 | Default is \code{10}. Set to \code{Inf} if all problematic values are to be included 14 | in the outputted message, or to \code{0} for no output.} 15 | 16 | \item{maxDecimals}{A positive integer or \code{Inf}. Number of decimals used when 17 | printing numerical values in the data summary and in problematic values from the 18 | data checks. If \code{Inf}, no rounding is performed.} 19 | } 20 | \value{ 21 | A \code{\link{checkResult}} with three entires: 22 | \code{$problem} (a logical indicating whether outliers were found), 23 | \code{$message} (a message describing which values are outliers) and 24 | \code{$problemValues} (the outlier values). 25 | } 26 | \description{ 27 | A checkFunction to be called from \code{\link{check}} that identifies outlier values 28 | in a Date/numeric/integer variable. 29 | } 30 | \details{ 31 | Outliers are identified based on an outlier rule that is 32 | appropriate for asymmetric data. 
Outliers are observations outside the range 33 | 34 | \deqn{Q1 - 1.5*exp(a*MC)*IQR ; Q3 + 1.5*exp(b*MC)*IQR } 35 | 36 | where Q1, Q3, and IQR are the first quartile, third quartile, and 37 | inter-quartile range, MC is the 'medcouple', a robust concept and 38 | estimator of skewness, and a and b are appropriate constants (-4 39 | and 3). The medcouple is defined as a scaled median difference of 40 | the left and right half of the distribution, and hence not based on the 41 | third moment as the classical skewness. 42 | 43 | When the data are symmetric, the measure reduces to the 44 | standard outlier rule also used in Tukey Boxplots (consistent with 45 | the \code{\link{boxplot}} function), i.e. outliers are values that are 46 | smaller than the 1st quartile minus 1.5 times the inter quartile range (IQR) 47 | or greater than the third quartile plus 1.5 times the IQR. 48 | 49 | For Date variables, the calculations are done on their raw numeric format (as 50 | obtained by using \code{\link{unclass}}), after which they are translated back to Dates. 51 | Note that no rounding is performed for Dates, no matter the value of \code{maxDecimals}. 
52 | } 53 | \examples{ 54 | identifyOutliers(c(1:10, 200, 200, 700)) 55 | 56 | } 57 | \seealso{ 58 | \code{\link{check}}, \code{\link{allCheckFunctions}}, 59 | \code{\link{checkFunction}}, \code{\link{checkResult}}, \code{\link[robustbase]{mc}} 60 | } 61 | -------------------------------------------------------------------------------- /man/identifyOutliersTBStyle.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/identifyOutliersTBStyle.R 3 | \name{identifyOutliersTBStyle} 4 | \alias{identifyOutliersTBStyle} 5 | \title{A checkFunction for identifying outliers Tukey Boxplot style} 6 | \usage{ 7 | identifyOutliersTBStyle(v, nMax = 10, maxDecimals = 2) 8 | } 9 | \arguments{ 10 | \item{v}{A numeric, integer or Date variable to check.} 11 | 12 | \item{nMax}{The maximum number of problematic values to report. 13 | Default is \code{10}. Set to \code{Inf} if all problematic values are to be included 14 | in the outputted message, or to \code{0} for no output.} 15 | 16 | \item{maxDecimals}{A positive integer or \code{Inf}. Number of decimals used when 17 | printing numerical values in the data summary and in problematic values from the 18 | data checks. If \code{Inf}, no rounding is performed.} 19 | } 20 | \value{ 21 | A \code{\link{checkResult}} with three entries: 22 | \code{$problem} (a logical indicating whether outliers were found), 23 | \code{$message} (a message describing which values are outliers) and 24 | \code{$problemValues} (the outlier values). 25 | } 26 | \description{ 27 | A checkFunction to be called from \code{\link{check}} that identifies outlier values 28 | in a numeric/integer/Date variable by use of the Tukey Boxplot method (consistent with the 29 | \code{\link{boxplot}} function). 30 | } 31 | \details{ 32 | Outliers are defined in the style of Tukey Boxplots (consistent with the 33 | \code{\link{boxplot}} function), i.e. 
as values that are smaller than the 1st quartile minus 34 | 1.5 times the inter quartile range (IQR) or greater than the third quartile plus 1.5 times the IQR. 35 | 36 | For Date variables, the calculations are done on their raw numeric format (as 37 | obtained by using \code{\link{unclass}}), after which they are translated back to Dates. 38 | Note that no rounding is performed for Dates, no matter the value of \code{maxDecimals}. 39 | } 40 | \examples{ 41 | identifyOutliersTBStyle(c(1:10, 200, 200, 700)) 42 | 43 | } 44 | \seealso{ 45 | \code{\link{check}}, \code{\link{allCheckFunctions}}, 46 | \code{\link{checkFunction}}, \code{\link{checkResult}} 47 | } 48 | -------------------------------------------------------------------------------- /man/identifyWhitespace.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/identifyWhitespace.R 3 | \name{identifyWhitespace} 4 | \alias{identifyWhitespace} 5 | \title{A checkFunction for identifying whitespace} 6 | \usage{ 7 | identifyWhitespace(v, nMax = 10) 8 | } 9 | \arguments{ 10 | \item{v}{A character, (haven_)labelled or factor variable to check.} 11 | 12 | \item{nMax}{The maximum number of problematic values to report. 13 | Default is \code{10}. Set to \code{Inf} if all problematic values are to be included 14 | in the outputted message, or to \code{0} for no output.} 15 | } 16 | \value{ 17 | A \code{\link{checkResult}} with three entries: 18 | \code{$problem} (a logical indicating whether any whitespaces were 19 | found), \code{$message} (a message describing which values were prefixed 20 | or suffixed with whitespace) and \code{$problemValues} (the problematic 21 | values). Note that only unique values are printed in the message, and that 22 | they are sorted alphabetically. 
23 | } 24 | \description{ 25 | A checkFunction to be called from \code{\link{check}} 26 | that identifies prefixed and suffixed whitespace(s) in character, 27 | (haven_)labelled or factor variables. 28 | } 29 | \examples{ 30 | identifyWhitespace(c("a", " b", "c", "d ", "e ")) 31 | 32 | } 33 | \seealso{ 34 | \code{\link{check}}, \code{\link{allCheckFunctions}}, 35 | \code{\link{checkFunction}}, \code{\link{checkResult}} 36 | } 37 | -------------------------------------------------------------------------------- /man/isCPR.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/isCPR.R 3 | \name{isCPR} 4 | \alias{isCPR} 5 | \title{Check if a variable consists of Danish CPR numbers} 6 | \usage{ 7 | isCPR(v, ...) 8 | } 9 | \arguments{ 10 | \item{v}{A variable (vector) to check. This variable is allowed to have any class.} 11 | 12 | \item{...}{Not in use.} 13 | } 14 | \value{ 15 | A \code{\link{checkResult}} with three entries: 16 | \code{$problem} (a logical indicating whether the variable consists 17 | of CPR numbers), \code{$message} (if a problem was found, 18 | the following message: "Warning: The variable seems to consist of 19 | Danish civil registration (CPR) numbers.", 20 | otherwise "") and \code{$problemValues} (always \code{NULL}). 21 | } 22 | \description{ 23 | A \code{\link{checkFunction}} that checks if \code{v} consists exclusively 24 | of valid Danish civil registration (CPR) numbers, ignoring missing values. This 25 | function is intended for use as a precheck in \code{\link{makeDataReport}}, ensuring 26 | that CPR numbers are not included in a \code{dataReporter} output document. 
27 | } 28 | \examples{ 29 | 30 | CPRs <- c("010188-3639", "020187-1476", "040506-8664", "010290-3684", "010291-1180", 31 | "010293-1599", "010294-1268", "010295-1360", "010296-3970", "010297-2007", 32 | "010270-2905", "010271-0134", "010272-1403", "010273-3088", "010274-1633") 33 | nonCPRs <- c(1:10) 34 | mixedCPRs <- c(CPRs, nonCPRs) 35 | 36 | #identify problem 37 | isCPR(CPRs) 38 | 39 | #no problem as there are no CPRs 40 | isCPR(nonCPRs) 41 | 42 | #no problem because not ALL values are CPRs 43 | isCPR(mixedCPRs) 44 | 45 | } 46 | \seealso{ 47 | \code{\link{check}}, \code{\link{allCheckFunctions}}, 48 | \code{\link{checkFunction}}, \code{\link{checkResult}} 49 | } 50 | -------------------------------------------------------------------------------- /man/isKey.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/isKey.R 3 | \name{isKey} 4 | \alias{isKey} 5 | \title{Check if a variable qualifies as a key} 6 | \usage{ 7 | isKey(v) 8 | } 9 | \arguments{ 10 | \item{v}{A variable (vector) to check. All variable types are allowed.} 11 | } 12 | \value{ 13 | A \code{\link{checkResult}} with three entires: 14 | \code{$problem} (a logical indicating whether \code{v} is a key), 15 | \code{$message} (if a problem was found, the following message: 16 | "The variable is a key (distinct values for each observation).", 17 | otherwise "") and \code{$problemValues} (always \code{NULL}). 18 | } 19 | \description{ 20 | A \code{\link{checkFunction}} that checks if \code{v} 21 | is a key, that is, if every observation has a unique value in \code{v} and 22 | \code{v} is not a numeric/integer nor a Date variable. This 23 | function is intended for use as a precheck in \code{\link{makeDataReport}}. 
24 | } 25 | \details{ 26 | Note that numeric or integer variables are not considered candidates 27 | for keys, as truly continuous measurements will most likely result in unique 28 | values for each observation. 29 | } 30 | \examples{ 31 | keyVar <- c("a", "b", "c", "d", "e", "f") 32 | notKeyVar <- c("a", "a", "b", "c", "d", "e", "f") 33 | 34 | isKey(keyVar) 35 | isKey(notKeyVar) 36 | 37 | } 38 | \seealso{ 39 | \code{\link{check}}, \code{\link{allCheckFunctions}}, 40 | \code{\link{checkFunction}}, \code{\link{checkResult}} 41 | } 42 | -------------------------------------------------------------------------------- /man/isSingular.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/isSingular.R, R/isEmpty.R 3 | \name{isSingular} 4 | \alias{isSingular} 5 | \alias{isEmpty} 6 | \title{Check if a variable only contains a single value} 7 | \usage{ 8 | isSingular(v) 9 | 10 | isEmpty(v) 11 | } 12 | \arguments{ 13 | \item{v}{A variable (vector) to check. All variable types are allowed.} 14 | } 15 | \value{ 16 | A \code{\link{checkResult}} with three entries: 17 | \code{$problem} (a logical indicating whether \code{v} contains only one value), 18 | \code{$message} (if a problem was found, a message describing which single 19 | value the variable takes and how many missing observations it contains, otherwise 20 | ""), and \code{$problemValues} (always \code{NULL}). 21 | } 22 | \description{ 23 | A \code{\link{checkFunction}} that checks if \code{v} only 24 | contains a single unique value, aside from missing values. This 25 | function is intended for use as a precheck in \code{\link{makeDataReport}}. 
26 | } 27 | \examples{ 28 | singularVar <- c(rep("a", 10), NA, NA) 29 | notSingularVar <- c("a", "a", "b", "c", "d", "e", "f", NA, NA) 30 | 31 | isSingular(singularVar) 32 | isSingular(notSingularVar) 33 | 34 | } 35 | \seealso{ 36 | \code{\link{check}}, \code{\link{allCheckFunctions}}, 37 | \code{\link{checkFunction}}, \code{\link{checkResult}} 38 | } 39 | -------------------------------------------------------------------------------- /man/isSupported.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/isSupported.R 3 | \name{isSupported} 4 | \alias{isSupported} 5 | \title{Check if a variable has a class supported by dataReporter} 6 | \usage{ 7 | isSupported(v) 8 | } 9 | \arguments{ 10 | \item{v}{A variable (vector) to check. All variable types are allowed.} 11 | } 12 | \value{ 13 | A \code{\link{checkResult}} with three entries: 14 | \code{$problem} (a logical indicating whether \code{v} contains only one value), 15 | \code{$message} (if a problem was found, a message describing which single 16 | value the variable takes and how many missing observations it contains, otherwise 17 | ""), and \code{$problemValues} (always \code{NULL}). 18 | } 19 | \description{ 20 | A \code{\link{checkFunction}} that checks if \code{v} has 21 | one of the classes supported by dataReporter, namely \code{character}, 22 | \code{factor}, \code{numeric}, \code{integer}, \code{labelled}, 23 | \code{haven_labelled}, 24 | \code{logical} and \code{Date} (including other classes that inherit 25 | from any of these classes). A user-supplied list can be provided 26 | in the \code{treatXasY} argument, which will let the user decide 27 | how unsupported classes should be treated. This 28 | function is intended for use as a precheck in \code{\link{makeDataReport}}. 
29 | } 30 | \examples{ 31 | integerVar <- 1:10 #supported 32 | rawVar <- as.raw(1:10) #not supported 33 | 34 | isSupported(integerVar) 35 | isSupported(rawVar) 36 | 37 | } 38 | \seealso{ 39 | \code{\link{check}}, \code{\link{allCheckFunctions}}, 40 | \code{\link{checkFunction}}, \code{\link{checkResult}} 41 | } 42 | -------------------------------------------------------------------------------- /man/makeCodebook.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/makeCodebook.R 3 | \name{makeCodebook} 4 | \alias{makeCodebook} 5 | \title{Produce a data codebook} 6 | \usage{ 7 | makeCodebook(data, vol = "", reportTitle = NULL, file = NULL, ...) 8 | } 9 | \arguments{ 10 | \item{data}{The dataset to be checked. This dataset should be of class \code{data.frame}, 11 | \code{tibble} or \code{matrix}. If it is of class \code{matrix}, it will be converted to a 12 | \code{data.frame}.} 13 | 14 | \item{vol}{Extra text string or numeric that is appended to the end of the output 15 | file name(s). For example, if the dataset is called "myData", no file argument is 16 | supplied and \code{vol=2}, the output file will be called "codebook_myData2.Rmd".} 17 | 18 | \item{reportTitle}{A text string. If supplied, this will be the printed title of the 19 | report. If left unspecified, the title will contain the name of the supplied dataset.} 20 | 21 | \item{file}{The filename of the outputted rmarkdown (.Rmd) file. 22 | If set to \code{NULL} (the default), the filename will be the name of \code{data} 23 | prefixed with "codebook_", if this qualifies as a valid file name (e.g. no special 24 | characters allowed). Otherwise, \code{makeCodebook()} tries to create a valid filename by 25 | substituting illegal characters. 
Note that a valid file is of type .Rmd, hence all 26 | filenames should have a ".Rmd"-suffix.} 27 | 28 | \item{...}{Additional parameters passed to \code{makeDataReport}.} 29 | } 30 | \description{ 31 | Make a data codebook that summarizes the contents of a dataset. 32 | The result is saved to an R markdown file which can be 33 | rendered into an easy-to-read codebook in pdf, html or word formats. 34 | } 35 | \references{ 36 | Petersen AH, Ekstrøm CT (2019). “dataMaid: Your Assistant for Documenting Supervised Data Quality Screening in R.” _Journal of Statistical Software_, *90*(6), 1-38. doi: 10.18637/jss.v090.i06 ( \doi{10.18637/jss.v090.i06}). 37 | } 38 | -------------------------------------------------------------------------------- /man/messageGenerator.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/messageGenerator.R 3 | \name{messageGenerator} 4 | \alias{messageGenerator} 5 | \title{Produce a message for the output of a checkFunction} 6 | \usage{ 7 | messageGenerator( 8 | problemStatus, 9 | message = "Note that a check function found the following problematic values:", 10 | nMax = 10 11 | ) 12 | } 13 | \arguments{ 14 | \item{problemStatus}{A list consisting of two entries: 15 | 16 | \code{$problem} - logical indicating whether a problem was found by the 17 | \code{checkFunction} responsible for making the \code{messageGenerator()} call, 18 | 19 | \code{$problemValues} - a vector of values from the variable that were 20 | deemed problematic (see details below).} 21 | 22 | \item{message}{Optional, but recommended. A message describing what problem the 23 | problem values are related to. If \code{NULL} a standard message is added using the name 24 | of the function that called \code{messageGenerator}.} 25 | 26 | \item{nMax}{Maximum number of problem values to be printed in the message. 
If the total 27 | number of problem values exceeds \code{nMax}, the number of omitted problem 28 | values is added to the message. Defaults to \code{10}. Set to \code{Inf} to have all problem 29 | values printed.} 30 | } 31 | \value{ 32 | A character string with a problem description. 33 | } 34 | \description{ 35 | Helper function for producing output messages for 36 | \code{\link{checkFunction}} type functions. 37 | } 38 | \details{ 39 | This function is a tool for building \code{\link{checkFunction}}s for the 40 | \code{dataReporter} \code{\link{makeDataReport}} function. \code{checkFunction}s will often identify a number 41 | of values in a variable that are somehow problematic. \code{messageGenerator} takes 42 | these values, pastes them together with a problem description and makes sure that the 43 | formatting is appropriate for being rendered in a \code{rmarkdown} document. 44 | We recommend writing short and precise problem descriptions (see examples), 45 | but if no message is supplied, the following message is generated: 46 | "Note that a check function found the following problematic values: [problem values]". 
47 | } 48 | \examples{ 49 | 50 | #Variables with/without underscores 51 | noUSVar <- c(1:10) 52 | USVar <- c("_a", "n_b", "b_", "_", 1:10) 53 | 54 | #Define a checkFunction using messageGenerator with a manual 55 | #problem description: 56 | identifyUnderscores <- function(v, nMax = Inf) { 57 | v <- as.character(v) 58 | underscorePlaces <- regexpr("_", v) > 0 59 | problemValues <- unique(v[underscorePlaces]) 60 | problem <- any(underscorePlaces) 61 | message <- messageGenerator(list(problemValues = problemValues, problem = problem), 62 | "The following values contain underscores:", 63 | nMax = nMax) 64 | checkResult(list(problem = problem, message = message, 65 | problemValues = problemValues)) 66 | } 67 | 68 | identifyUnderscores(noUSVar) #no problem 69 | identifyUnderscores(USVar) #problems 70 | 71 | #Only print the first two problem values in the message: 72 | identifyUnderscores(USVar, nMax = 2) 73 | 74 | #Define same function, but without a manual problem description in 75 | #the messageGenerator-call: 76 | identifyUnderscores2 <- function(v, nMax = Inf) { 77 | v <- as.character(v) 78 | underscorePlaces <- regexpr("_", v) > 0 79 | problemValues <- unique(v[underscorePlaces]) 80 | problem <- any(underscorePlaces) 81 | message <- messageGenerator(list(problemValues = problemValues, 82 | problem = problem), nMax = nMax) 83 | checkResult(list(problem = problem, message = message, 84 | problemValues = problemValues)) 85 | } 86 | 87 | identifyUnderscores2(noUSVar) #no problem 88 | identifyUnderscores2(USVar) #problems 89 | 90 | } 91 | \seealso{ 92 | \code{\link{check}}, \code{\link{checkFunction}}, \code{\link{makeDataReport}} 93 | } 94 | -------------------------------------------------------------------------------- /man/minMax.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/minMax.R 3 | \name{minMax} 4 | \alias{minMax} 5 | 
\title{summaryFunction for minimum and maximum} 6 | \usage{ 7 | minMax(v, maxDecimals = 2) 8 | } 9 | \arguments{ 10 | \item{v}{A variable (vector) of type numeric or integer.} 11 | 12 | \item{maxDecimals}{A positive integer or \code{Inf}. Number of decimals used when 13 | printing numerical values in the data summary and in problematic values from the 14 | data checks. If \code{Inf}, no rounding is performed.} 15 | } 16 | \value{ 17 | An object of class \code{summaryResult} with the following entries: \code{$feature} 18 | ("Min. and max."), \code{$result} (the minimum and maximum of \code{v}), and \code{$value} 19 | (minimum and maximum in their original format). 20 | } 21 | \description{ 22 | A \code{summaryFunction}, intended to be called from 23 | \code{\link{summarize}}, which returns the minimum and maximum values of a variable. 24 | NA, NaN and Inf values are removed prior to the computations. 25 | } 26 | \examples{ 27 | minMax(c(1:100)) 28 | 29 | } 30 | \seealso{ 31 | \code{\link{summaryFunction}}, \code{\link{summarize}}, \code{\link{summaryResult}}, 32 | \code{\link{allSummaryFunctions}} 33 | } 34 | -------------------------------------------------------------------------------- /man/presidentData.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dataReporter-package.R 3 | \docType{data} 4 | \name{presidentData} 5 | \alias{presidentData} 6 | \title{Semi-artificial data about the US presidents} 7 | \format{ 8 | A data frame with 46 rows and 11 variables. 
9 | \describe{ 10 | \item{lastName}{A \code{Name} type variable containing the last name of the president.} 11 | \item{firstName}{A \code{Name} type variable containing the first name of the president.} 12 | \item{orderOfPresidency}{A factor variable indicating the order of the presidents (with George Washington 13 | as number 1 and Donald Trump as number 45).} 14 | \item{birthday}{A Date variable with the birthday of the president} 15 | \item{stateOfBirth}{A character variable with the state in which the president was born.} 16 | \item{assassinationAttempt}{A numeric variable indicating whether there was an assassination 17 | attempt (\code{1}) or not (\code{0}) on the president.} 18 | \item{sex}{A factor variable with the sex of the president.} 19 | \item{ethnicity}{A factor variable with the ethnicity of the president.} 20 | \item{presidencyYears}{A numeric variable with the duration of the presidency, in years.} 21 | \item{ageAtInauguration}{A character variable with the age at inauguration.} 22 | \item{favoriteNumber}{A \code{complex} type variable with a fictional favorite number for 23 | each president.} 24 | } 25 | } 26 | \source{ 27 | Artificial dataset constructed based on the US president dataset available from 28 | \href{https://www.data-explorer.com/data/}{Data Explorer}. 29 | } 30 | \usage{ 31 | presidentData 32 | } 33 | \description{ 34 | A dataset with information about the first 45 US presidents as well as a 46th 35 | person, who is not a US president. The dataset was constructed to show the capabilities 36 | of \code{dataReporter} and therefore, it has been constructed to include errors and miscodings. 37 | Each observation in the dataset corresponds to a person. The dataset uses the 38 | non-standard class \code{Name} which is simply an attribute that has been added to 39 | two variables in order to show how \code{dataReporter} handles non-supported classes. 
40 | } 41 | \examples{ 42 | data(presidentData) 43 | 44 | } 45 | \references{ 46 | Petersen AH, Ekstrøm CT (2019). “dataMaid: Your Assistant for Documenting Supervised Data Quality Screening in R.” _Journal of Statistical Software_, *90*(6), 1-38. doi: 10.18637/jss.v090.i06 ( \doi{10.18637/jss.v090.i06}). 47 | } 48 | \keyword{datasets} 49 | -------------------------------------------------------------------------------- /man/quartiles.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/quartiles.R 3 | \name{quartiles} 4 | \alias{quartiles} 5 | \title{summaryFunction for quartiles} 6 | \usage{ 7 | quartiles(v, maxDecimals = 2) 8 | } 9 | \arguments{ 10 | \item{v}{A variable (vector) of type numeric or integer.} 11 | 12 | \item{maxDecimals}{A positive integer or \code{Inf}. Number of decimals used when 13 | printing numerical values in the data summary and in problematic values from the 14 | data checks. If \code{Inf}, no rounding is performed.} 15 | } 16 | \value{ 17 | An object of class \code{summaryResult} with the following entries: \code{$feature} 18 | ("1st and 3rd quartiles"), \code{$result} (the 1st and 3rd quartiles of \code{v}) and 19 | \code{$value} (the quartiles in their original format). 20 | } 21 | \description{ 22 | A \code{\link{summaryFunction}}, intended to be called from \code{\link{summarize}}, 23 | which calculates the 1st and 3rd quartiles of a variable. NA, NaN and Inf values are removed 24 | prior to the computations. 25 | } 26 | \details{ 27 | The quartiles are computed using the \code{\link[stats]{quantile}} function from \code{stats}, 28 | using type 7 quantiles for integer and numeric variables and type 1 quantiles for Date variables. 
29 | } 30 | \examples{ 31 | quartiles(c(1:100)) 32 | 33 | quartiles(rnorm(1000), maxDecimals = 4) 34 | 35 | } 36 | \seealso{ 37 | \code{\link{summaryFunction}}, \code{\link{summarize}}, \code{\link{summaryResult}}, 38 | \code{\link{allSummaryFunctions}} 39 | } 40 | -------------------------------------------------------------------------------- /man/refCat.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/refCat.R 3 | \name{refCat} 4 | \alias{refCat} 5 | \title{summaryFunction that finds reference level for factor variables} 6 | \usage{ 7 | refCat(v, ...) 8 | } 9 | \arguments{ 10 | \item{v}{A variable (vector) of type factor.} 11 | 12 | \item{...}{Not in use.} 13 | } 14 | \value{ 15 | An object of class \code{summaryResult} with the following entries: \code{$feature} 16 | ("Reference level"), \code{$result} (the reference level of \code{v}), and \code{$value} 17 | (identical to result). 18 | } 19 | \description{ 20 | A \code{summaryFunction}, intended to be called from 21 | \code{\link{summarize}}, which returns the reference level of a factor variable, 22 | i.e. the first category as returned by \code{levels(v)}. This level will serve 23 | as the reference category and get absorbed into the intercept for most standard 24 | model fitting procedures and therefore, it may be convenient to know. 
25 | } 26 | \examples{ 27 | refCat(factor(letters)) 28 | 29 | } 30 | \seealso{ 31 | \code{\link{summaryFunction}}, \code{\link{summarize}}, \code{\link{summaryResult}}, 32 | \code{\link{allSummaryFunctions}} 33 | } 34 | -------------------------------------------------------------------------------- /man/render.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/render.R 3 | \name{render} 4 | \alias{render} 5 | \title{Simplified Rmarkdown rendering} 6 | \usage{ 7 | render(file, quiet) 8 | } 9 | \arguments{ 10 | \item{file}{A character string path to the file that is to be rendered. 11 | This file must be of type Rmarkdown (.Rmd).} 12 | 13 | \item{quiet}{A logical. Should messages during rendering be suppressed?} 14 | } 15 | \description{ 16 | Render a Rmarkdown (.Rmd) file, \code{file}, to the output 17 | format specified in its preamble. If no output format is specified, 18 | it will be rendered to html. 19 | } 20 | \details{ 21 | This function is merely a simplified version (in terms of 22 | possible arguments) of the rendering function from the \code{rmarkdown} package. 23 | Therefore, we refer to that function for more details: 24 | \code{\link[rmarkdown]{render}}. We have included this simplified version in 25 | \code{dataReporter} in order to help new R users with rendering their output 26 | documents as generated by \code{\link{makeDataReport}}. 27 | } 28 | \seealso{ 29 | \code{\link[rmarkdown]{render}}. 
30 | } 31 | -------------------------------------------------------------------------------- /man/setChecks.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/setChecks.R 3 | \name{setChecks} 4 | \alias{setChecks} 5 | \title{Set check arguments for makeDataReport} 6 | \usage{ 7 | setChecks( 8 | character = defaultCharacterChecks(), 9 | factor = defaultFactorChecks(), 10 | labelled = defaultLabelledChecks(), 11 | haven_labelled = defaultHavenlabelledChecks(), 12 | numeric = defaultNumericChecks(), 13 | integer = defaultIntegerChecks(), 14 | logical = defaultLogicalChecks(), 15 | Date = defaultDateChecks(), 16 | all = NULL 17 | ) 18 | } 19 | \arguments{ 20 | \item{character}{A character vector of function names to be used as checks for character 21 | variables. The default options are available by calling \code{defaultCharacterChecks()}.} 22 | 23 | \item{factor}{A character vector of function names to be used as checks for factor 24 | variables. The default options are available by calling \code{defaultFactorChecks()}.} 25 | 26 | \item{labelled}{A character vector of function names to be used as checks for labelled 27 | variables. The default options are available by calling \code{defaultLabelledChecks()}.} 28 | 29 | \item{haven_labelled}{A character vector of function names to be used as checks for haven_labelled 30 | variables. The default options are available by calling \code{defaultHavenlabelledChecks()}.} 31 | 32 | \item{numeric}{A character vector of function names to be used as checks for numeric 33 | variables. The default options are available by calling \code{defaultNumericChecks()}.} 34 | 35 | \item{integer}{A character vector of function names to be used as checks for integer 36 | variables. 
The default options are available by calling \code{defaultIntegerChecks()}.} 37 | 38 | \item{logical}{A character vector of function names to be used as checks for logical 39 | variables. The default options are available by calling \code{defaultLogicalChecks()}.} 40 | 41 | \item{Date}{A character vector of function names to be used as checks for Date 42 | variables. The default options are available by calling \code{defaultDateChecks()}.} 43 | 44 | \item{all}{A character vector of function names to be used as checks for all 45 | variables. Note that this overrules the choices made for specific variable types by using 46 | the other arguments.} 47 | } 48 | \value{ 49 | A list with one entry for each data class supported by \code{makeDataReport}. Each 50 | entry then contains a character vector of function names that are to be called as checks for 51 | that variable type. 52 | } 53 | \description{ 54 | This function is a tool for easily specifying the \code{checks} argument of 55 | \code{\link{makeDataReport}}. Note that all available check function options can be inspected 56 | by calling \code{allCheckFunctions()}. 
57 | } 58 | \examples{ 59 | #Only identify missing values for characters, factors and labelled variables: 60 | setChecks(character = "identifyMissing", factor = "identifyMissing", 61 | labelled = "identifyMissing") 62 | 63 | #Used in a call to makeDataReport(): 64 | \donttest{ 65 | data(toyData) 66 | makeDataReport(toyData, checks = setChecks(character = "identifyMissing", 67 | factor = "identifyMissing", labelled = "identifyMissing"), replace = TRUE) 68 | } 69 | 70 | } 71 | \seealso{ 72 | \code{\link{makeDataReport}}, \code{\link{allCheckFunctions}}, 73 | \code{\link{defaultCharacterChecks}}, 74 | \code{\link{defaultFactorChecks}}, \code{\link{defaultLabelledChecks}}, 75 | \code{\link{defaultHavenlabelledChecks}}, 76 | \code{\link{defaultNumericChecks}}, \code{\link{defaultIntegerChecks}}, 77 | \code{\link{defaultLogicalChecks}}, \code{\link{defaultDateChecks}} 78 | } 79 | -------------------------------------------------------------------------------- /man/setSummaries.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/setSummaries.R 3 | \name{setSummaries} 4 | \alias{setSummaries} 5 | \title{Set summary arguments for makeDataReport} 6 | \usage{ 7 | setSummaries( 8 | character = defaultCharacterSummaries(), 9 | factor = defaultFactorSummaries(), 10 | labelled = defaultLabelledSummaries(), 11 | haven_labelled = defaultHavenlabelledSummaries(), 12 | numeric = defaultNumericSummaries(), 13 | integer = defaultIntegerSummaries(), 14 | logical = defaultLogicalSummaries(), 15 | Date = defaultDateSummaries(), 16 | all = NULL 17 | ) 18 | } 19 | \arguments{ 20 | \item{character}{A character vector of function names to be used as summaries for character 21 | variables. 
The default options are available by calling \code{defaultCharacterSummaries()}.} 22 | 23 | \item{factor}{A character vector of function names to be used as summaries for factor 24 | variables. The default options are available by calling \code{defaultFactorSummaries()}.} 25 | 26 | \item{labelled}{A character vector of function names to be used as summaries for labelled 27 | variables. The default options are available by calling \code{defaultLabelledSummaries()}.} 28 | 29 | \item{haven_labelled}{A character vector of function names to be used as summaries for haven_labelled 30 | variables. The default options are available by calling \code{defaultHavenlabelledSummaries()}.} 31 | 32 | \item{numeric}{A character vector of function names to be used as summaries for numeric 33 | variables. The default options are available by calling \code{defaultNumericSummaries()}.} 34 | 35 | \item{integer}{A character vector of function names to be used as summaries for integer 36 | variables. The default options are available by calling \code{defaultIntegerSummaries()}.} 37 | 38 | \item{logical}{A character vector of function names to be used as summaries for logical 39 | variables. The default options are available by calling \code{defaultLogicalSummaries()}.} 40 | 41 | \item{Date}{A character vector of function names to be used as summaries for Date 42 | variables. The default options are available by calling \code{defaultDateSummaries()}.} 43 | 44 | \item{all}{A character vector of function names to be used as summaries for all 45 | variables. Note that this overrules the choices made for specific variable types by using 46 | the other arguments.} 47 | } 48 | \value{ 49 | A list with one entry for each data class supported by \code{makeDataReport}. Each 50 | entry then contains a character vector of function names that are to be called as summaries for 51 | that variable type. 
52 | } 53 | \description{ 54 | This function is a tool for easily specifying the \code{summaries} argument of 55 | \code{\link{makeDataReport}}. Note that all available summary function options can be inspected 56 | by calling \code{allSummaryFunctions()}. 57 | } 58 | \examples{ 59 | #Don't include central value (median/mode) summary for numerical and integer 60 | #variables: 61 | setSummaries(numeric = defaultNumericSummaries(remove = "centralValue"), 62 | integer = defaultIntegerSummaries(remove = "centralValue")) 63 | 64 | 65 | #Used in a call to makeDataReport(): 66 | \donttest{ 67 | data(toyData) 68 | makeDataReport(toyData, 69 | summaries = setSummaries(numeric = defaultNumericSummaries(remove = "centralValue"), 70 | integer = defaultIntegerSummaries(remove = "centralValue")), replace = TRUE) 71 | } 72 | 73 | } 74 | \seealso{ 75 | \code{\link{makeDataReport}}, \code{\link{allSummaryFunctions}}, 76 | \code{\link{defaultCharacterSummaries}}, 77 | \code{\link{defaultFactorSummaries}}, \code{\link{defaultLabelledSummaries}}, 78 | \code{\link{defaultHavenlabelledSummaries}}, 79 | \code{\link{defaultNumericSummaries}}, \code{\link{defaultIntegerSummaries}}, 80 | \code{\link{defaultLogicalSummaries}}, \code{\link{defaultDateSummaries}} 81 | } 82 | -------------------------------------------------------------------------------- /man/setVisuals.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/setVisuals.R 3 | \name{setVisuals} 4 | \alias{setVisuals} 5 | \title{Set visual arguments for makeDataReport} 6 | \usage{ 7 | setVisuals( 8 | character = NULL, 9 | factor = NULL, 10 | labelled = NULL, 11 | haven_labelled = NULL, 12 | numeric = NULL, 13 | integer = NULL, 14 | logical = NULL, 15 | Date = NULL, 16 | all = "standardVisual" 17 | ) 18 | } 19 | \arguments{ 20 | \item{character}{A function name (character string) to be used as the visual function for character 21 
| variables. If \code{NULL} (the default) the argument is ignored and the contents of the \code{all} 22 | argument is used instead.} 23 | 24 | \item{factor}{A function name (character string) to be used as the visual function for factor 25 | variables. If \code{NULL} (the default) the argument is ignored and the contents of the \code{all} 26 | argument is used instead.} 27 | 28 | \item{labelled}{A function name (character string) to be used as the visual function for labelled 29 | variables. If \code{NULL} (the default) the argument is ignored and the contents of the \code{all} 30 | argument is used instead.} 31 | 32 | \item{haven_labelled}{A function name (character string) to be used as the visual function for haven_labelled 33 | variables. If \code{NULL} (the default) the argument is ignored and the contents of the \code{all} 34 | argument is used instead.} 35 | 36 | \item{numeric}{A function name (character string) to be used as the visual function for numeric 37 | variables. If \code{NULL} (the default) the argument is ignored and the contents of the \code{all} 38 | argument is used instead.} 39 | 40 | \item{integer}{A function name (character string) to be used as the visual function for integer 41 | variables. If \code{NULL} (the default) the argument is ignored and the contents of the \code{all} 42 | argument is used instead.} 43 | 44 | \item{logical}{A function name (character string) to be used as the visual function for logical 45 | variables. If \code{NULL} (the default) the argument is ignored and the contents of the \code{all} 46 | argument is used instead.} 47 | 48 | \item{Date}{A function name (character string) to be used as the visual function for Date 49 | variables. 
If \code{NULL} (the default) the argument is ignored and the contents of the \code{all} 50 | argument is used instead.} 51 | 52 | \item{all}{A function name (character string) to be used as the visual function for all 53 | variables.} 54 | } 55 | \value{ 56 | A list with one entry for each data class supported by \code{makeDataReport}. Each 57 | entry then contains a character string with a function name that is to be called as the visual 58 | function for that variable type. 59 | } 60 | \description{ 61 | This function is a tool for easily specifying the \code{visuals} argument of 62 | \code{\link{makeDataReport}}. Note that only a single visual function can 63 | be provided for each variable type. If more than one is supplied, only 64 | the first one is used. The default is to use a single visual function for all 65 | variable types (as specified in the argument \code{all}), but class-specific choices 66 | of visual functions can also be used. Note that class-specific arguments overwrite 67 | the contents of \code{all}. Note that all available visual function options can be inspected 68 | by calling \code{allVisualFunctions()}. 
69 | } 70 | \examples{ 71 | #Set visual type to basicVisual for all variable types: 72 | setVisuals(all = "basicVisual") 73 | 74 | #Used in a call to makeDataReport(): 75 | \donttest{ 76 | data(toyData) 77 | makeDataReport(toyData, visuals = setVisuals(all = "basicVisual"), replace = TRUE) 78 | } 79 | 80 | } 81 | \seealso{ 82 | \code{\link{makeDataReport}}, \code{\link{allVisualFunctions}} 83 | } 84 | -------------------------------------------------------------------------------- /man/smartNum.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/smartNum.R 3 | \name{smartNum} 4 | \alias{smartNum} 5 | \title{Smart class to handle numerics as factor} 6 | \usage{ 7 | smartNum(v) 8 | } 9 | \arguments{ 10 | \item{v}{A numeric vector} 11 | } 12 | \value{ 13 | A \code{smartNum} object that is handled in \code{makeDataReport} in the same way as a factor. 14 | } 15 | \description{ 16 | S3 class meant for representing numeric variables that act like 17 | factor variables by taking only a few different values. This class 18 | is used in makeDataReport() in order to get appropriate summaries, visualizations 19 | and checks for such variables. In other words, such variables will be 20 | treated like factor variables instead of numerics. 
21 | } 22 | -------------------------------------------------------------------------------- /man/standardVisual.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/standardVisual.R 3 | \name{standardVisual} 4 | \alias{standardVisual} 5 | \title{Produce distribution plots using ggplot from ggplot2.} 6 | \usage{ 7 | standardVisual(v, vnam, doEval = TRUE) 8 | } 9 | \arguments{ 10 | \item{v}{The variable (vector) to be plotted.} 11 | 12 | \item{vnam}{The name of the variable which will appear as the title of the plot.} 13 | 14 | \item{doEval}{If TRUE, the plot itself is returned. Otherwise, the function returns 15 | a character string containing standalone R code for producing the plot.} 16 | } 17 | \description{ 18 | Plot the distribution of a variable, depending on its data class, by use of ggplot2. 19 | Note that \code{standardVisual} is a \code{\link{visualFunction}}, compatible with the 20 | \code{\link{visualize}} and \code{\link{makeDataReport}} functions. 21 | } 22 | \details{ 23 | For character, factor, logical and (haven_)labelled variables, a barplot is produced. For numeric, 24 | integer or Date variables, \code{standardVisual} produces a histogram instead. Note that for 25 | integer and numeric variables, all non-finite (i.e. \code{NA}, \code{NaN}, \code{Inf}) values are 26 | removed prior to plotting. For character, Date, factor, (haven_)labelled and logical variables, 27 | only \code{NA} values are removed. 
28 | } 29 | \examples{ 30 | 31 | #Save a variable 32 | myVar <- c(1:10) 33 | 34 | #Plot a variable 35 | standardVisual(myVar, "MyVar") 36 | 37 | #Produce code for plotting a variable 38 | standardVisual(myVar, "MyVar", doEval = FALSE) 39 | 40 | } 41 | \seealso{ 42 | \code{\link{visualize}}, \code{\link{basicVisual}} 43 | } 44 | -------------------------------------------------------------------------------- /man/summaryFunction.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/summaryFunction.R 3 | \name{summaryFunction} 4 | \alias{summaryFunction} 5 | \title{Create an object of class summaryFunction} 6 | \usage{ 7 | summaryFunction(f, description, classes = NULL) 8 | } 9 | \arguments{ 10 | \item{f}{A function. See details and examples below for the 11 | exact requirements of this function.} 12 | 13 | \item{description}{A character string describing the summary 14 | returned by \code{f}. If \code{NULL} (the default), the 15 | name of \code{f} will be used instead.} 16 | 17 | \item{classes}{The classes for which \code{f} is intended to 18 | be called. If \code{NULL} (the default), one of two things happens. 19 | If \code{f} is not an S3 generic function, the \code{classes} 20 | attribute of \code{f} will be an empty character string. If 21 | \code{f} is an S3 generic function, an automatic look-up 22 | for methods will be conducted, and the \code{classes} attribute 23 | will then be filled out automatically. Note that the function 24 | \code{\link{allClasses}} (listing all classes used in \code{dataReporter}) 25 | might be useful.} 26 | } 27 | \value{ 28 | A function of class \code{summaryFunction} which has two attributes, 29 | namely \code{classes} and \code{description}. 30 | } 31 | \description{ 32 | Convert a function, \code{f}, into an S3 33 | \code{summaryFunction} object. 
This adds \code{f} to the 34 | overview list returned by an \code{allSummaryFunctions()} 35 | call. 36 | } 37 | \details{ 38 | \code{summaryFunction} represents the functions used in 39 | \code{\link{summarize}} and \code{\link{makeDataReport}} for summarizing the 40 | features of variables in a dataset. 41 | 42 | An example of defining a new \code{summaryFunction} is given below. 43 | Note that the minimal requirements for such a function (in order for it to be 44 | compatible with \code{summarize()} and \code{makeDataReport()}) is the following 45 | input/output-structure: It must input at least two arguments, namely 46 | \code{v} (a vector variable) and \code{...}. Additional implemented 47 | arguments from \code{summarize()} and \code{makeDataReport()} include 48 | \code{maxDecimals}, see e.g. the pre-defined \code{summaryFunction} 49 | \code{\link{minMax}} for more details about how this argument should 50 | be used. 51 | The output must be a list with at least the two entries \code{$feature} 52 | (a short character string describing what was summarized) and \code{$result} 53 | (a value or a character string with the result of the summarization). 54 | However, if the result of a \code{summaryFunction} is furthermore 55 | converted to a \code{\link{summaryResult}} object, a \code{print()} 56 | method also becomes available for consistent formatting of 57 | \code{summaryFunction} results. 58 | 59 | Note that all available \code{summaryFunction}s are listed by the call 60 | \code{allSummaryFunctions()} and we recommend looking into these functions, 61 | if more knowledge about \code{summaryFunction}s is required. 62 | } 63 | \examples{ 64 | 65 | #Define a valid summaryFunction that can be called from summarize() 66 | #and makeDataReport(). This function counts how many zero entries a given 67 | #variable has: 68 | countZeros <- function(v, ...) { 69 | res <- length(which(v == 0)) 70 | summaryResult(list(feature = "No. 
zeros", result = res, value = res)) 71 | } 72 | 73 | #Convert it to a summaryFunction object. We don't count zeros for 74 | #logical variables, as they have a different meaning here (FALSE): 75 | countZeros <- summaryFunction(countZeros, description = "Count number of zeros", 76 | classes = setdiff(allClasses(), "logical")) 77 | 78 | #Call it directly : 79 | countZeros(c(0, 0, 0, 1:100)) 80 | 81 | #Call it via summarize(): 82 | data(cars) 83 | summarize(cars, numericSummaries = c(defaultNumericSummaries(), 84 | "countZeros")) 85 | 86 | #Note that countZeros now appears in a allSummaryFunctions() call: 87 | allSummaryFunctions() 88 | 89 | } 90 | \seealso{ 91 | \code{\link{allSummaryFunctions}}, \code{\link{summarize}}, 92 | \code{\link{makeDataReport}}, \code{\link{checkResult}} 93 | } 94 | -------------------------------------------------------------------------------- /man/summaryResult.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/summaryResult.R 3 | \name{summaryResult} 4 | \alias{summaryResult} 5 | \title{Create object of class summaryResult} 6 | \usage{ 7 | summaryResult(ls) 8 | } 9 | \arguments{ 10 | \item{ls}{A list with entries \code{$feature} (a character string describing 11 | what summary was obtained), \code{$result} (the result of the summary, either 12 | a value from the variable, a numeric or a character string) and 13 | \code{$value} (the result in its most raw format, often identical to the 14 | \code{$result} input).} 15 | } 16 | \value{ 17 | A S3 object of class \code{summaryResult}, identical to the inputted 18 | list, \code{ls}, except for its class attribute. 19 | } 20 | \description{ 21 | Convert a list resulting from the summaries performed in a 22 | \code{\link{summaryFunction}} into a \code{summaryResult} object, thereby 23 | supplying it with a \code{print()} method. 
24 | } 25 | \seealso{ 26 | \code{\link{summaryFunction}} 27 | } 28 | -------------------------------------------------------------------------------- /man/tableVisual.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tableVisual.R 3 | \name{tableVisual} 4 | \alias{tableVisual} 5 | \title{Produce tables for the makeDataReport visualizations.} 6 | \usage{ 7 | tableVisual(v, vnam, doEval = TRUE) 8 | } 9 | \arguments{ 10 | \item{v}{The variable (vector) to be plotted.} 11 | 12 | \item{vnam}{The name of the variable.} 13 | 14 | \item{doEval}{If TRUE, the table itself is returned. Otherwise, the function returns 15 | a character string containing standalone R code for producing the table.} 16 | } 17 | \description{ 18 | Produce a table of the distribution of a categorical (character, labelled, haven_labelled or factor) variable. 19 | Note that \code{tableVisual} is a \code{\link{visualFunction}}, compatible with the 20 | \code{\link{visualize}} and \code{\link{makeDataReport}} functions. 21 | } 22 | \examples{ 23 | 24 | #Save a variable 25 | myVar <- c("red", "blue", "red", "red", NA) 26 | 27 | #Plot a variable 28 | tableVisual(myVar, "MyVar") 29 | 30 | #Produce code for plotting a variable 31 | tableVisual(myVar, "MyVar", doEval = FALSE) 32 | 33 | } 34 | \seealso{ 35 | \code{\link{visualize}}, \code{\link{basicVisual}}, \code{\link{standardVisual}} 36 | } 37 | -------------------------------------------------------------------------------- /man/testData.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dataReporter-package.R 3 | \docType{data} 4 | \name{testData} 5 | \alias{testData} 6 | \title{Extended example data to test the features of dataReporter} 7 | \format{ 8 | A data frame with 15 rows and 14 variables. 
9 | \describe{ 10 | \item{charVar}{A character vector with a single missing observation.} 11 | \item{factorVar}{A factor vector with a miscoded missing observation, \code{999}.} 12 | \item{numVar}{A numeric vector} 13 | \item{intVar}{An integer vector} 14 | \item{boolVar}{A logical vector with three missing observations.} 15 | \item{keyVar}{A character vector with unique codes for each observation.} 16 | \item{emptyVar}{A numeric vector where all entries are identical.} 17 | \item{numOutlierVar}{A numeric vector with a possible outlier (\code{100}).} 18 | \item{smartNumVar}{A numeric vector that takes only two different values.} 19 | \item{cprVar}{A character vector with levels in the format of Danish CPR numbers 20 | (social security numbers).} 21 | \item{cprKeyVar}{A character vector with levels in the format of Danish CPR numbers 22 | (social security numbers) with unique levels for each observation.} 23 | \item{miscodedMissingVar}{A character vector with levels corresponding to 24 | various miscoded (non-\code{NA}) missing codes.} 25 | \item{misclassifiedNumVar}{A misclassified factor variable, where every level 26 | is a number and many (12) different levels are in use.} 27 | \item{dateVar}{A Date vector.} 28 | \item{labelledVar}{A labelled vector with two missing observations.} 29 | } 30 | } 31 | \source{ 32 | Artificial data 33 | } 34 | \usage{ 35 | testData 36 | } 37 | \description{ 38 | A dataset of constructed data used as test bed when using \code{dataReporter} for identifying 39 | potential errors in a dataset. 
40 | } 41 | \examples{ 42 | data(testData) 43 | 44 | } 45 | \keyword{datasets} 46 | -------------------------------------------------------------------------------- /man/toyData.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dataReporter-package.R 3 | \docType{data} 4 | \name{toyData} 5 | \alias{toyData} 6 | \title{Small example data to show the features of dataReporter} 7 | \format{ 8 | A \code{data.frame} with 15 rows and 6 variables. 9 | \describe{ 10 | \item{pill}{A factor variable with two levels (\code{"red"} and \code{"blue"}) and a few 11 | (correctly coded) missing observations. This represents the colour of a pill.} 12 | \item{events}{A numeric variable with one obvious outlier value (\code{82}), two miscoded 13 | missing values (\code{999} and \code{NaN}) and a few correctly coded missing values. The number of previous events.} 14 | \item{region}{A factor variable where two of the levels (\code{"other"} and \code{"OTHER"}) 15 | are the same word with different case settings. Moreover, the variable includes a Stata-style 16 | miscoded missing value (\code{"."}). Used to represent geographical regions or treatment centers.} 17 | \item{change}{A numeric variable (random draws from a standard normal distribution). Representing a change in a measured variable.} 18 | \item{id}{A factor variable with unique codes for each observation (a character string 19 | with a number between 1 and 15), i.e. a key variable.} 20 | \item{spotifysong}{A factor variable that has the same level (\code{"Irrelevant"}) for all 21 | observations, i.e. an empty variable. 
The latest song played on Spotify.} 22 | } 23 | } 24 | \source{ 25 | Artificial data 26 | } 27 | \usage{ 28 | toyData 29 | } 30 | \description{ 31 | An artificial dataset, intended for presenting the key features of \code{dataReporter}, which is a 32 | toolset for identifying potential errors in a dataset. 33 | } 34 | \examples{ 35 | data(toyData) 36 | 37 | } 38 | \references{ 39 | Petersen AH, Ekstrøm CT (2019). “dataMaid: Your Assistant for Documenting Supervised Data Quality Screening in R.” _Journal of Statistical Software_, *90*(6), 1-38. doi: 10.18637/jss.v090.i06 ( \doi{10.18637/jss.v090.i06}). 40 | } 41 | \keyword{datasets} 42 | -------------------------------------------------------------------------------- /man/uniqueValues.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/uniqueValues.R 3 | \name{uniqueValues} 4 | \alias{uniqueValues} 5 | \title{summaryFunction for unique values} 6 | \usage{ 7 | uniqueValues(v, ...) 8 | } 9 | \arguments{ 10 | \item{v}{A variable (vector).} 11 | 12 | \item{...}{Not in use.} 13 | } 14 | \value{ 15 | An object of class \code{summaryResult} with the following entries: 16 | \code{$feature} ("No. unique values") and \code{$result} (the number of unique 17 | values in \code{v}). 18 | } 19 | \description{ 20 | A \code{\link{summaryFunction}} type function, intended to be called from 21 | \code{\link{summarize}}, which counts the 22 | number of unique (excluding \code{NA}s) values in a variable. 
23 | } 24 | \examples{ 25 | uniqueValues(c(1:3, rep(NA, 10), Inf, NaN)) 26 | 27 | } 28 | \seealso{ 29 | \code{\link{summaryFunction}}, \code{\link{summarize}}, \code{\link{summaryResult}}, 30 | \code{\link{allSummaryFunctions}} 31 | } 32 | -------------------------------------------------------------------------------- /man/variableType.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/variableType.R 3 | \name{variableType} 4 | \alias{variableType} 5 | \title{Summary function for original class} 6 | \usage{ 7 | variableType(v, ...) 8 | } 9 | \arguments{ 10 | \item{v}{A variable (vector).} 11 | 12 | \item{...}{Not in use.} 13 | } 14 | \value{ 15 | An object of class \code{summaryResult} with the following entries: 16 | \code{$feature} ("Variable type"), \code{$result} (the (original) class of 17 | \code{v}) and \code{$value} (identical to \code{$result}). 18 | } 19 | \description{ 20 | A \code{\link{summaryFunction}} type function, intended to be called from 21 | \code{\link{summarize}}, which finds the 22 | original class of a variable. This is just the class for all objects but those of class 23 | \code{smartNum}. 
24 | } 25 | \examples{ 26 | #For standard variables: 27 | varX <- c(rep(c(1,2,3), each=10)) 28 | class(varX) 29 | variableType(varX) 30 | 31 | #For smartNum variables: 32 | smartX <- dataReporter::smartNum(varX) 33 | class(smartX) 34 | variableType(smartX) 35 | 36 | } 37 | \seealso{ 38 | \code{\link{summarize}} 39 | } 40 | -------------------------------------------------------------------------------- /man/visualFunction.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/visualFunction.R 3 | \name{visualFunction} 4 | \alias{visualFunction} 5 | \title{Create an object of class visualFunction} 6 | \usage{ 7 | visualFunction(f, description, classes = NULL) 8 | } 9 | \arguments{ 10 | \item{f}{A function. See details and examples below for the 11 | exact requirements of this function.} 12 | 13 | \item{description}{A character string describing the visualization 14 | returned by \code{f}. If \code{NULL} (the default), the name of 15 | \code{f} will be used instead.} 16 | 17 | \item{classes}{The classes for which \code{f} is intended to 18 | be called. If \code{NULL} (the default), one of two things happens. 19 | If \code{f} is not an S3 generic function, the \code{classes} 20 | attribute of \code{f} will be an empty character string. If 21 | \code{f} is an S3 generic function, an automatic look-up 22 | for methods will be conducted, and the \code{classes} attribute 23 | will then be filled out automatically. Note that the function 24 | \code{\link{allClasses}} (listing all classes used in \code{dataReporter}) 25 | might be useful.} 26 | } 27 | \value{ 28 | A function of class \code{visualFunction} which has two attributes, 29 | namely \code{classes} and \code{description}. 30 | } 31 | \description{ 32 | Convert a function, \code{f}, into an S3 33 | \code{visualFunction} object. 
This adds \code{f} to the 34 | overview list returned by an \code{allVisualFunctions()} 35 | call. 36 | } 37 | \details{ 38 | \code{visualFunction} represents the functions used in 39 | \code{\link{visualize}} and \code{\link{makeDataReport}} for plotting the 40 | distributions of the variables in a dataset. 41 | 42 | An example of defining a new \code{visualFunction} is given below. 43 | Note that the minimal requirements for such a function (in order for it to be 44 | compatible with \code{visualize()} and \code{makeDataReport()}) is the following 45 | input/output-structure: It must input exactly the following three arguments, 46 | namely \code{v} (a vector variable), \code{vnam} (a character string with 47 | the name of the variable) and \code{doEval} (a logical). The last argument 48 | is supposed to control whether the function produces a plot in the 49 | graphic device (if \code{doEval = TRUE}) or instead returns a character 50 | string including \code{R} code for generating such a plot. In the latter 51 | setting, the code must be stand-alone, that is, it cannot depend on objects 52 | available in an environment. In practice, this will typically imply that 53 | the data variable is included in the code snippet. 54 | It is not strictly necessary to implement the \code{doEval = TRUE} setting 55 | for the \code{visualFunction} to be compatible with \code{\link{makeDataReport}}, 56 | but we recommend doing it anyway such that the function can also be used 57 | interactively. 58 | 59 | Note that all available \code{visualFunction}s are listed by the call 60 | \code{allVisualFunctions()} and we recommend looking into these functions, 61 | if more knowledge about \code{visualFunction}s is required. 
62 | } 63 | \examples{ 64 | #Defining a new visualFunction: 65 | mosaicVisual <- function(v, vnam, doEval) { 66 | thisCall <- call("mosaicplot", table(v), main = vnam, xlab = "") 67 | if (doEval) { 68 | return(eval(thisCall)) 69 | } else return(deparse(thisCall)) 70 | } 71 | mosaicVisual <- visualFunction(mosaicVisual, description = "Mosaicplots from graphics", 72 | classes = allClasses()) 73 | 74 | #mosaicVisual is now included in a allVisualFunctions() call: 75 | allVisualFunctions() 76 | 77 | #Create a mosaic plot: 78 | ABCvar <- c(rep("a", 10), rep("b", 20), rep("c", 5)) 79 | mosaicVisual(ABCvar, "ABCvar", TRUE) 80 | 81 | #Create a character string with the code for a mosaic plot: 82 | mosaicVisual(ABCvar, "ABCVar", FALSE) 83 | 84 | #Extract or set description of a visualFunction: 85 | description(mosaicVisual) 86 | description(mosaicVisual) <- "A cubist version of a pie chart" 87 | description(mosaicVisual) 88 | 89 | 90 | } 91 | \seealso{ 92 | \code{\link{allVisualFunctions}}, \code{\link{visualize}}, 93 | \code{\link{makeDataReport}} 94 | } 95 | -------------------------------------------------------------------------------- /man/visualize.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/visualize.R 3 | \name{visualize} 4 | \alias{visualize} 5 | \title{Produce distribution plots} 6 | \usage{ 7 | visualize(v, vnam = NULL, visuals = setVisuals(), doEval = TRUE, ...) 8 | } 9 | \arguments{ 10 | \item{v}{The variable (vector) or dataset (data.frame) which is to be plotted.} 11 | 12 | \item{vnam}{The name of the variable. This name might be printed on the plots, depending on the 13 | choice of plotting function. If not supplied, it will default to the name of \code{v}.} 14 | 15 | \item{visuals}{A list of visual functions to use on each supported variable type. 
We recommend 16 | using \code{\link{setVisuals}} for creating this list and refer to the documentation 17 | of this function for more details. This function allows for choosing variable-type dependent 18 | visuals. However, if \code{visualize()} is called on a full dataset, all visualizations 19 | must be of the same type and therefore, the \code{all} argument of \code{setVisuals} is used.} 20 | 21 | \item{doEval}{A logical. If \code{TRUE} (the default), \code{visualize} has the side effect of 22 | producing a plot (or multiple plots, if \code{v} is a data.frame). Otherwise, 23 | visualize returns a character string containing R-code for producing the plot (or, when \code{v} is 24 | a data.frame, a list of such character strings).} 25 | 26 | \item{...}{Additional arguments used for class-specific choices of visual functions 27 | (see \emph{details}).} 28 | } 29 | \description{ 30 | Generic shell function that calls a plotting function in order to produce a marginal 31 | distribution plot for a variable (or for each variable in a dataset). What type of plot is made 32 | might depend on the data class of the variable. 33 | } 34 | \details{ 35 | Visual functions can be supplied using their names (in character strings) using 36 | \code{setVisuals}. Note that only a single visual function is allowed for each variable class. 37 | The default visual settings can be inspected by calling \code{setVisuals()}. 38 | An overview of all available \code{visualFunction}s can be obtained by calling 39 | \code{\link{allVisualFunctions}}. 40 | 41 | A user defined visual function can be supplied using its function name. Details on how 42 | to construct valid visual functions are found in \code{\link{visualFunction}}. 
43 | } 44 | \examples{ 45 | #Standard use: Return standalone code for plotting a variable: 46 | visualize(c(1:10), "Variable 1", doEval = FALSE) 47 | 48 | #Define a new visualization function and call it using visualize either 49 | #using the all argument of setVisuals or a class specific argument: 50 | mosaicVisual <- function(v, vnam, doEval) { 51 | thisCall <- call("mosaicplot", table(v), main = vnam, xlab = "") 52 | if (doEval) { 53 | return(eval(thisCall)) 54 | } else return(deparse(thisCall)) 55 | } 56 | mosaicVisual <- visualFunction(mosaicVisual, 57 | description = "Mosaicplots from graphics", 58 | classes = allClasses()) 59 | 60 | #Inspect all options for visualFunctions: 61 | allVisualFunctions() 62 | 63 | #set mosaicVisual for all variable types: 64 | visualize(c("1", "1", "1", "2", "2", "a"), "My variable", 65 | visuals = setVisuals(all = "mosaicVisual")) 66 | 67 | #set mosaicVisual only for character variables: 68 | visualize(c("1", "1", "1", "2", "2", "a"), "My variable", 69 | visuals = setVisuals(character = "mosaicVisual")) 70 | 71 | #this will use standardVisual, as our variable is not numeric: 72 | visualize(c("1", "1", "1", "2", "2", "a"), "My variable", 73 | visuals = setVisuals(numeric = "mosaicVisual")) 74 | 75 | #return code for a mosaic plot 76 | visualize(c("1", "1", "1", "2", "2", "a"), "My variable", 77 | allVisuals = "mosaicVisual", doEval=FALSE) 78 | 79 | #Produce multiple plots easily by calling visualize on a full dataset: 80 | data(testData) 81 | testData2 <- testData[, c("charVar", "factorVar", "numVar", "intVar")] 82 | visualize(testData2) 83 | 84 | #When using visualize on a dataset, datatype specific arguments have no 85 | #influence: 86 | visualize(testData2, setVisuals(character = "basicVisual", 87 | factor = "basicVisual")) 88 | 89 | #But we can still use the "all" argument in setVisuals: 90 | visualize(testData2, visuals = setVisuals(all = "basicVisual")) 91 | 92 | 93 | } 94 | \references{ 95 | Petersen AH, Ekstrøm CT (2019). 
“dataMaid: Your Assistant for Documenting Supervised Data Quality Screening in R.” _Journal of Statistical Software_, *90*(6), 1-38. doi: 10.18637/jss.v090.i06 (\doi{10.18637/jss.v090.i06}). 96 | } 97 | \seealso{ 98 | \code{\link{setVisuals}}, \code{\link{allVisualFunctions}}, 99 | \code{\link{standardVisual}}, \code{\link{basicVisual}} 100 | } 101 | -------------------------------------------------------------------------------- /man/whoami_available.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utility.R 3 | \name{whoami_available} 4 | \alias{whoami_available} 5 | \title{Find out if the whoami package binaries are installed (git + whoami)} 6 | \usage{ 7 | whoami_available() 8 | } 9 | \value{ 10 | logical that is TRUE if whoami and git can be found 11 | } 12 | \description{ 13 | Find out if the whoami package binaries are installed (git + whoami) 14 | } 15 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(dataReporter) 3 | 4 | test_check("dataReporter") 5 | -------------------------------------------------------------------------------- /tests/testthat/atomic.R: -------------------------------------------------------------------------------- 1 | ## Define atomic types 2 | typel <- c(TRUE, FALSE, TRUE, TRUE, FALSE) 3 | typei <- c(1L, 2L, 3L, 4L, 5L) 4 | typed <- c(1.0, 2.3, 4.5, 6.7, 8.9) 5 | typec <- c(1 + 2i, 2 + 0i, 3 + 3i, 4 - 0i, 5+2i) 6 | types <- c(LETTERS[1:5], LETTERS[1:5]) 7 | typef <- factor(types) 8 | typelab <- labelled(types, labels=c(A="A", BB="B", CCC="C", DDDD="D", EEEEE="E")) 9 | typer <- sapply(types, charToRaw) 10 | typelist <- list(a=1:3, b=1:10) 11 | typeDate <- as.Date(c("1jan1960", "2jan1960", "31mar1960", "30jul1960"), "%d%b%Y") 12 | 
-------------------------------------------------------------------------------- /tests/testthat/spss_labelled.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekstroem/dataReporter/90bcf67e591de0c3b3bfab2620c2d77851c432e1/tests/testthat/spss_labelled.rda -------------------------------------------------------------------------------- /tests/testthat/testcheck.R: -------------------------------------------------------------------------------- 1 | context("dataReporter check") 2 | 3 | library(dataReporter) 4 | library(haven) 5 | 6 | Sys.setenv(TZ="Europe/Copenhagen") 7 | 8 | ## Define atomic types 9 | typel <- c(TRUE, FALSE, TRUE, TRUE, FALSE) 10 | typei <- c(1L, 2L, 3L, 4L, 5L) 11 | typed <- c(1.0, 2.3, 4.5, 6.7, 8.9) 12 | typec <- c(1 + 2i, 2 + 0i, 3 + 3i, 4 - 0i, 5+2i) 13 | types <- c(LETTERS[1:5], LETTERS[1:5]) 14 | typef <- factor(types) 15 | typelab <- labelled(types, labels=c(A="A", BB="B", CCC="C", DDDD="D", EEEEE="E")) 16 | typer <- sapply(types, charToRaw) 17 | typelist <- list(a=1:3, b=1:10) 18 | typeDate <- as.Date(c("1jan1960", "2jan1960", "31mar1960", "30jul1960"), "%d%b%Y", tz="Europe/Copenhagen") 19 | 20 | 21 | ## 22 | ## Check that the right checks are performed for each atomic type 23 | ## 24 | 25 | ## Each supported type should result in a list; unsupported types should give a warning 26 | test_that("check returns a list for (most) atomic vectors", { 27 | expect_is(check(typel), "list") 28 | expect_is(check(typei), "list") 29 | expect_is(check(typed), "list") 30 | expect_warning(check(typec)) 31 | expect_is(check(types), "list") 32 | expect_is(check(typef), "list") 33 | expect_is(check(typelab), "list") 34 | expect_warning(check(typelist)) 35 | expect_warning(check(typer)) 36 | expect_is(check(typeDate), "list") 37 | }) 38 | 39 | ## Check the right number of tests. 
This needs to be updated 40 | test_that("check return the right number of tests for (most) atomic vectors", { 41 | expect_equal(length(check(typel)), 1) 42 | expect_equal(length(check(typei)), 2) 43 | expect_equal(length(check(typed)), 2) 44 | ##expect_is(check(typec), "list") 45 | expect_equal(length(check(types)), 5) 46 | expect_equal(length(check(typef)), 5) 47 | expect_equal(length(check(typelab)), 5) 48 | ##expect_is(check(typer), "list") 49 | expect_equal(length(check(typeDate)), 2) 50 | }) 51 | -------------------------------------------------------------------------------- /tests/testthat/testisLoner.R: -------------------------------------------------------------------------------- 1 | context("check function: isLoner") 2 | 3 | library("dataReporter") 4 | 5 | 6 | ## Read atomic vectors 7 | source("atomic.R") 8 | 9 | x <- factor(LETTERS[c(4,4,1,2,1,3,2,4,2, 4,4,1,2,1,3,2,4,2, 4,4,1,2,1,3,2,4,2)+1], levels=c("A", "B", "C", "D", "E")) 10 | 11 | ## Level D occurs only 3 times in x and should be the only value flagged as a loner 12 | test_that("isLoner returns the right values", { 13 | expect_equal(dataReporter::identifyLoners(x)$problemValues, "D") 14 | }) 15 | 16 | -------------------------------------------------------------------------------- /tests/testthat/testmakeReport.R: -------------------------------------------------------------------------------- 1 | context("Testing input/output of makeDataReport") 2 | 3 | # Test for labeled 4 | 5 | # Read a dummy SPSS dataset 6 | load("spss_labelled.rda") 7 | 8 | test_that("Can parse an SPSS dataset with labels", { 9 | 10 | expect_null(makeDataReport(dummydata, render=FALSE, replace=TRUE, openResult=FALSE)) 11 | 12 | }) 13 | 14 | 15 | # Remove the report file generated above. NOTE(review): the name below still uses the old reporteR_ prefix - confirm it matches the file makeDataReport actually writes 16 | 17 | unlink("reporteR_dummydata.Rmd") -------------------------------------------------------------------------------- /tests/testthat/testminMax.R: -------------------------------------------------------------------------------- 1 | context("summary function: minMax") 2 | 3 | 
library(dataReporter) 4 | 5 | 6 | ## Read atomic vectors 7 | source("atomic.R") 8 | 9 | ## Supported types should give a summaryResult; unsupported types should raise an error 10 | test_that("minMax returns a summaryResult", { 11 | 12 | ## First check the atomic vectors 13 | expect_equal(minMax(typel)$value[1], 0L) 14 | expect_equal(minMax(typel)$value[2], 1L) 15 | expect_equal(minMax(typei)$value[1], 1L) 16 | expect_equal(minMax(typei)$value[2], 5L) 17 | expect_equal(minMax(typed)$value[1], 1.0) 18 | expect_equal(minMax(typed)$value[2], 8.9) 19 | 20 | expect_error(minMax(typec)) 21 | expect_error(minMax(types)) 22 | expect_error(minMax(typef)) 23 | expect_error(minMax(typer)) 24 | expect_error(minMax(typelab)) 25 | expect_error(minMax(typelist)) 26 | expect_equal(minMax(typeDate)$value[1], as.Date("1960-01-01")) 27 | 28 | 29 | ## Check the output format 30 | expect_is(minMax(1:5), "summaryResult") 31 | expect_is(minMax(c(NA, 1:5)), "summaryResult") 32 | expect_is(minMax(c(NA, NA, NA)), "summaryResult") 33 | expect_is(minMax(c(NA, NA, Inf)), "summaryResult") 34 | 35 | expect_equal(minMax(1:5)$value[1], 1) 36 | expect_equal(minMax(c(NA, 1:5))$value[1], 1) 37 | expect_equal(minMax(c(NA, NA, NA))$value[1], NA) 38 | expect_equal(minMax(c(NA, NA, Inf))$value[1], Inf) 39 | 40 | }) 41 | 42 | -------------------------------------------------------------------------------- /tests/testthat/testsummarize.R: -------------------------------------------------------------------------------- 1 | context("dataReporter summarize") 2 | 3 | library(dataReporter) 4 | library(haven) 5 | 6 | ## Define atomic types 7 | typel <- c(TRUE, FALSE, TRUE, TRUE, FALSE) 8 | typei <- c(1L, 2L, 3L, 4L, 5L) 9 | typed <- c(1.0, 2.3, 4.5, 6.7, 8.9) 10 | typec <- c(1 + 2i, 2 + 0i, 3 + 3i, 4 - 0i, 5+2i) 11 | types <- c(LETTERS[1:5], LETTERS[1:5]) 12 | typef <- factor(types) 13 | typelab <- labelled(types, labels=c(A="A", BB="B", CCC="C", DDDD="D", EEEEE="E")) 14 | typer <- sapply(types, charToRaw) 15 | typelist <- list(a=1:3, b=1:10) 16 | typeDate <- 
as.Date(c("1jan1960", "2jan1960", "31mar1960", "30jul1960"), "%d%b%Y") 17 | 18 | ## 19 | ## Check that the right summaries are performed for each atomic type 20 | ## 21 | 22 | ## Each supported type should result in a list; unsupported types should give a warning 23 | test_that("summarize returns a list for (most) atomic vectors", { 24 | expect_is(summarize(typel), "list") 25 | expect_is(summarize(typei), "list") 26 | expect_is(summarize(typed), "list") 27 | expect_warning(summarize(typec)) 28 | expect_is(summarize(types), "list") 29 | expect_is(summarize(typef), "list") 30 | expect_is(summarize(typelab), "list") 31 | expect_warning(summarize(typelist)) 32 | expect_warning(summarize(typer)) 33 | expect_is(summarize(typeDate), "list") 34 | }) 35 | 36 | ## Check the right number of summaries. This needs to be updated 37 | test_that("summarize return the right number of tests for (most) atomic vectors", { 38 | expect_equal(length(summarize(typel)), 4) 39 | expect_equal(length(summarize(typei)), 6) 40 | expect_equal(length(summarize(typed)), 6) 41 | ##expect_is(summarize(typec), "list") 42 | expect_equal(length(summarize(types)), 4) 43 | expect_equal(length(summarize(typef)), 5) 44 | expect_equal(length(summarize(typelab)), 4) 45 | ##expect_is(summarize(typer), "list") 46 | expect_equal(length(summarize(typeDate)), 6) 47 | }) 48 | -------------------------------------------------------------------------------- /tests/testthat/testvariableType.R: -------------------------------------------------------------------------------- 1 | context("summary function: variableType") 2 | 3 | library(dataReporter) 4 | 5 | 6 | ## Read atomic vectors 7 | source("atomic.R") 8 | 9 | 10 | test_that("variableType returns the right value", { 11 | ## First check the atomic vectors 12 | expect_equal(variableType(typel)$value, "logical") 13 | expect_equal(variableType(typei)$value, "integer") 14 | expect_equal(variableType(typed)$value, "numeric") 15 | expect_equal(variableType(typec)$value, "complex") 16 | expect_equal(variableType(types)$value, 
"character") 17 | expect_equal(variableType(typef)$value, "factor") 18 | expect_true(variableType(typelab)$value %in% c("labelled", "haven_labelled")) 19 | expect_equal(variableType(typelist)$value, "list") 20 | expect_equal(variableType(typeDate)$value, "Date") 21 | }) 22 | 23 | 24 | 25 | ## Each type should result in a summaryResult 26 | test_that("variableType returns a summaryResult", { 27 | 28 | ## Check the output format 29 | expect_is(variableType(typel), "summaryResult") 30 | expect_is(variableType(typei), "summaryResult") 31 | expect_is(variableType(typed), "summaryResult") 32 | expect_is(variableType(typec), "summaryResult") 33 | expect_is(variableType(types), "summaryResult") 34 | expect_is(variableType(typef), "summaryResult") 35 | expect_is(variableType(typelab), "summaryResult") 36 | expect_is(variableType(typelist), "summaryResult") 37 | expect_is(variableType(typeDate), "summaryResult") 38 | 39 | }) 40 | 41 | -------------------------------------------------------------------------------- /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | --------------------------------------------------------------------------------