├── .Rbuildignore ├── .github ├── .gitignore └── workflows │ ├── R-CMD-check.yaml │ ├── pkgdown.yaml │ ├── pr-commands.yaml │ └── test-coverage.yaml ├── .gitignore ├── DESCRIPTION ├── NAMESPACE ├── NEWS.md ├── R ├── arcsinh_x.R ├── bestLogConstant.R ├── bestNormalize-package.R ├── bestNormalize.R ├── binarize.R ├── boxcox.R ├── data.R ├── double_reverse_log.R ├── exp_x.R ├── lambert.R ├── log_x.R ├── no_transform.R ├── orderNorm.R ├── plots.R ├── sqrt_x.R ├── step_best_normalize.R ├── step_orderNorm.R └── yeojohnson.R ├── README.Rmd ├── README.md ├── bestNormalize.Rproj ├── codecov.yml ├── cran-comments.md ├── data └── autotrader.RData ├── inst └── CITATION ├── man ├── arcsinh_x.Rd ├── autotrader.Rd ├── bestLogConstant.Rd ├── bestNormalize-package.Rd ├── bestNormalize.Rd ├── binarize.Rd ├── boxcox.Rd ├── double_reverse_log.Rd ├── exp_x.Rd ├── lambert.Rd ├── log_x.Rd ├── no_transform.Rd ├── orderNorm.Rd ├── plot.bestNormalize.Rd ├── reexports.Rd ├── sqrt_x.Rd ├── step_best_normalize.Rd ├── step_orderNorm.Rd └── yeojohnson.Rd ├── revdep ├── .gitignore ├── README.md ├── cran.md ├── failures.md └── problems.md ├── tests ├── testthat-bestNormalize.R ├── testthat-custom-parallel.R ├── testthat-main-transforms.R ├── testthat-methods-step.R ├── testthat-minor-transforms.R └── testthat │ ├── Rplots.pdf │ ├── test_arcsinh_x.R │ ├── test_bestNormalize.R │ ├── test_binarize.R │ ├── test_bn_custom_normstat.R │ ├── test_bn_parallel.R │ ├── test_boxcox.R │ ├── test_custom_fn.R │ ├── test_double_reverse_log.R │ ├── test_exp_x.R │ ├── test_lambert.R │ ├── test_log_x.R │ ├── test_no_transform.R │ ├── test_orderNorm.R │ ├── test_plot_methods.R │ ├── test_print_methods.R │ ├── test_sqrt_x.R │ ├── test_step_fns.R │ └── test_yeojohnson.R └── vignettes ├── bestNormalize.Rmd ├── customization.Rmd └── parallel_timings.jpg /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^cran-comments\.md$ 4 | 
^README\.Rmd$ 5 | ^README-.*\.png$ 6 | ^\.travis\.yml$ 7 | ^doc$ 8 | ^Meta$ 9 | ^CRAN-RELEASE$ 10 | ^\.github$ 11 | ^codecov\.yml$ 12 | ^revdep$ 13 | ^\.git$ 14 | ^CRAN-SUBMISSION$ 15 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: R-CMD-check 10 | 11 | jobs: 12 | R-CMD-check: 13 | runs-on: ${{ matrix.config.os }} 14 | 15 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 16 | 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | config: 21 | - {os: macos-latest, r: 'release'} 22 | - {os: windows-latest, r: 'release'} 23 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 24 | - {os: ubuntu-latest, r: 'release'} 25 | - {os: ubuntu-latest, r: 'oldrel-1'} 26 | 27 | env: 28 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 29 | R_KEEP_PKG_SOURCE: yes 30 | 31 | steps: 32 | - uses: actions/checkout@v3 33 | 34 | - uses: r-lib/actions/setup-pandoc@v2 35 | 36 | - uses: r-lib/actions/setup-r@v2 37 | with: 38 | r-version: ${{ matrix.config.r }} 39 | http-user-agent: ${{ matrix.config.http-user-agent }} 40 | use-public-rspm: true 41 | 42 | - uses: r-lib/actions/setup-r-dependencies@v2 43 | with: 44 | extra-packages: any::rcmdcheck 45 | needs: check 46 | 47 | - uses: r-lib/actions/check-r-package@v2 48 | with: 49 | upload-snapshots: true 50 | -------------------------------------------------------------------------------- 
/.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - main 5 | - master 6 | 7 | name: pkgdown 8 | 9 | jobs: 10 | pkgdown: 11 | runs-on: ubuntu-18.04 12 | env: 13 | RSPM: https://packagemanager.rstudio.com/cran/__linux__/bionic/latest 14 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 15 | 16 | steps: 17 | - uses: actions/checkout@v2 18 | 19 | - uses: r-lib/actions/setup-r@v1 20 | id: install-r 21 | 22 | - uses: r-lib/actions/setup-pandoc@v1 23 | 24 | - name: Install pak and query dependencies 25 | run: | 26 | install.packages("pak", repos = "https://r-lib.github.io/p/pak/dev/") 27 | saveRDS(pak::pkg_deps("local::.", dependencies = TRUE), ".github/r-depends.rds") 28 | shell: Rscript {0} 29 | 30 | - name: Restore R package cache 31 | uses: actions/cache@v2 32 | with: 33 | path: | 34 | ${{ env.R_LIBS_USER }}/* 35 | !${{ env.R_LIBS_USER }}/pak 36 | key: ubuntu-18.04-${{ steps.install-r.outputs.installed-r-version }}-1-${{ hashFiles('.github/r-depends.rds') }} 37 | restore-keys: ubuntu-18.04-${{ steps.install-r.outputs.installed-r-version }}-1- 38 | 39 | - name: Install system dependencies 40 | if: runner.os == 'Linux' 41 | run: | 42 | pak::local_system_requirements(execute = TRUE) 43 | pak::pkg_system_requirements("pkgdown", execute = TRUE) 44 | shell: Rscript {0} 45 | 46 | - name: Install dependencies 47 | run: | 48 | pak::local_install_dev_deps(upgrade = TRUE, dependencies = c("all", "Config/Needs/website")) 49 | pak::pkg_install("pkgdown") 50 | shell: Rscript {0} 51 | 52 | - name: Install package 53 | run: R CMD INSTALL . 
54 | 55 | - name: Build and deploy pkgdown site 56 | run: | 57 | git config --local user.name "$GITHUB_ACTOR" 58 | git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" 59 | Rscript -e 'pkgdown::deploy_to_branch(new_process = FALSE)' 60 | -------------------------------------------------------------------------------- /.github/workflows/pr-commands.yaml: -------------------------------------------------------------------------------- 1 | on: 2 | issue_comment: 3 | types: [created] 4 | name: Commands 5 | jobs: 6 | document: 7 | if: startsWith(github.event.comment.body, '/document') 8 | name: document 9 | runs-on: macOS-latest 10 | env: 11 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 12 | steps: 13 | - uses: actions/checkout@v2 14 | - uses: r-lib/actions/pr-fetch@v1 15 | with: 16 | repo-token: ${{ secrets.GITHUB_TOKEN }} 17 | - uses: r-lib/actions/setup-r@v1 18 | - name: Install dependencies 19 | run: Rscript -e 'install.packages(c("remotes", "roxygen2"))' -e 'remotes::install_deps(dependencies = TRUE)' 20 | - name: Document 21 | run: Rscript -e 'roxygen2::roxygenise()' 22 | - name: commit 23 | run: | 24 | git config --local user.email "actions@github.com" 25 | git config --local user.name "GitHub Actions" 26 | git add man/\* NAMESPACE 27 | git commit -m 'Document' 28 | - uses: r-lib/actions/pr-push@v1 29 | with: 30 | repo-token: ${{ secrets.GITHUB_TOKEN }} 31 | style: 32 | if: startsWith(github.event.comment.body, '/style') 33 | name: style 34 | runs-on: macOS-latest 35 | env: 36 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 37 | steps: 38 | - uses: actions/checkout@v2 39 | - uses: r-lib/actions/pr-fetch@v1 40 | with: 41 | repo-token: ${{ secrets.GITHUB_TOKEN }} 42 | - uses: r-lib/actions/setup-r@v1 43 | - name: Install dependencies 44 | run: Rscript -e 'install.packages("styler")' 45 | - name: Style 46 | run: Rscript -e 'styler::style_pkg()' 47 | - name: commit 48 | run: | 49 | git config --local user.email "actions@github.com" 50 | git config --local 
user.name "GitHub Actions" 51 | git add \*.R 52 | git commit -m 'Style' 53 | - uses: r-lib/actions/pr-push@v1 54 | with: 55 | repo-token: ${{ secrets.GITHUB_TOKEN }} 56 | -------------------------------------------------------------------------------- /.github/workflows/test-coverage.yaml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - main 5 | - master 6 | pull_request: 7 | branches: 8 | - main 9 | - master 10 | 11 | name: test-coverage 12 | 13 | jobs: 14 | test-coverage: 15 | runs-on: ubuntu-18.04 16 | env: 17 | RSPM: https://packagemanager.rstudio.com/cran/__linux__/bionic/latest 18 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 19 | 20 | steps: 21 | - uses: actions/checkout@v2 22 | 23 | - uses: r-lib/actions/setup-r@v1 24 | id: install-r 25 | 26 | - name: Install pak and query dependencies 27 | run: | 28 | install.packages("pak", repos = "https://r-lib.github.io/p/pak/dev/") 29 | saveRDS(pak::pkg_deps("local::.", dependencies = TRUE), ".github/r-depends.rds") 30 | shell: Rscript {0} 31 | 32 | - name: Restore R package cache 33 | uses: actions/cache@v2 34 | with: 35 | path: | 36 | ${{ env.R_LIBS_USER }}/* 37 | !${{ env.R_LIBS_USER }}/pak 38 | key: ubuntu-18.04-${{ steps.install-r.outputs.installed-r-version }}-1-${{ hashFiles('.github/r-depends.rds') }} 39 | restore-keys: ubuntu-18.04-${{ steps.install-r.outputs.installed-r-version }}-1- 40 | 41 | - name: Install system dependencies 42 | if: runner.os == 'Linux' 43 | run: | 44 | pak::local_system_requirements(execute = TRUE) 45 | pak::pkg_system_requirements("covr", execute = TRUE) 46 | shell: Rscript {0} 47 | 48 | - name: Install dependencies 49 | run: | 50 | pak::local_install_dev_deps(upgrade = TRUE) 51 | pak::pkg_install("covr") 52 | shell: Rscript {0} 53 | 54 | - name: Test coverage 55 | run: covr::codecov() 56 | shell: Rscript {0} 57 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | doc 4 | Meta 5 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: bestNormalize 2 | Type: Package 3 | Title: Normalizing Transformation Functions 4 | Version: 1.9.1.9000 5 | Date: 2023-09-20 6 | Authors@R: person("Ryan Andrew", "Peterson", 7 | email = "ryan.a.peterson@cuanschutz.edu", 8 | role = c("aut", "cre"), 9 | comment = c(ORCID = "0000-0002-4650-5798")) 10 | Description: Estimate a suite of normalizing transformations, including 11 | a new adaptation of a technique based on ranks which can guarantee 12 | normally distributed transformed data if there are no ties: ordered 13 | quantile normalization (ORQ). ORQ normalization combines a rank-mapping 14 | approach with a shifted logit approximation that allows 15 | the transformation to work on data outside the original domain. It is 16 | also able to handle new data within the original domain via linear 17 | interpolation. The package is built to estimate the best normalizing 18 | transformation for a vector consistently and accurately. It implements 19 | the Box-Cox transformation, the Yeo-Johnson transformation, three types 20 | of Lambert WxF transformations, and the ordered quantile normalization 21 | transformation. It estimates the normalization efficacy of other 22 | commonly used transformations, and it allows users to specify 23 | custom transformations or normalization statistics. Finally, functionality 24 | can be integrated into a machine learning workflow via recipes. 
25 | URL: 26 | https://petersonr.github.io/bestNormalize/, 27 | https://github.com/petersonR/bestNormalize 28 | License: GPL-3 29 | Depends: 30 | R (>= 3.1.0) 31 | Imports: 32 | LambertW (>= 0.6.5), 33 | nortest, 34 | dplyr, 35 | doParallel, 36 | foreach, 37 | doRNG, 38 | recipes, 39 | tibble, 40 | methods, 41 | butcher, 42 | purrr, 43 | generics 44 | Suggests: 45 | knitr, 46 | rmarkdown, 47 | MASS, 48 | testthat, 49 | mgcv, 50 | parallel, 51 | ggplot2, 52 | scales, 53 | rlang, 54 | covr 55 | VignetteBuilder: knitr 56 | LazyData: true 57 | RoxygenNote: 7.3.1 58 | Encoding: UTF-8 59 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(axe_env,step_best_normalize) 4 | S3method(axe_env,step_orderNorm) 5 | S3method(bake,step_best_normalize) 6 | S3method(bake,step_orderNorm) 7 | S3method(plot,bestNormalize) 8 | S3method(plot,boxcox) 9 | S3method(plot,lambert) 10 | S3method(plot,orderNorm) 11 | S3method(plot,yeojohnson) 12 | S3method(predict,arcsinh_x) 13 | S3method(predict,bestLogConstant) 14 | S3method(predict,bestNormalize) 15 | S3method(predict,binarize) 16 | S3method(predict,boxcox) 17 | S3method(predict,center_scale) 18 | S3method(predict,double_reverse_log) 19 | S3method(predict,exp_x) 20 | S3method(predict,lambert) 21 | S3method(predict,log_x) 22 | S3method(predict,no_transform) 23 | S3method(predict,orderNorm) 24 | S3method(predict,sqrt_x) 25 | S3method(predict,yeojohnson) 26 | S3method(prep,step_best_normalize) 27 | S3method(prep,step_orderNorm) 28 | S3method(print,arcsinh_x) 29 | S3method(print,bestLogConstant) 30 | S3method(print,bestNormalize) 31 | S3method(print,binarize) 32 | S3method(print,boxcox) 33 | S3method(print,center_scale) 34 | S3method(print,double_reverse_log) 35 | S3method(print,exp_x) 36 | S3method(print,lambert) 37 | S3method(print,log_x) 38 | 
S3method(print,no_transform) 39 | S3method(print,orderNorm) 40 | S3method(print,sqrt_x) 41 | S3method(print,step_best_normalize) 42 | S3method(print,step_orderNorm) 43 | S3method(print,yeojohnson) 44 | S3method(required_pkgs,step_best_normalize) 45 | S3method(required_pkgs,step_orderNorm) 46 | S3method(tidy,bestNormalize) 47 | S3method(tidy,no_transform) 48 | S3method(tidy,step_best_normalize) 49 | S3method(tidy,step_orderNorm) 50 | export(arcsinh_x) 51 | export(bestLogConstant) 52 | export(bestNormalize) 53 | export(binarize) 54 | export(boxcox) 55 | export(center_scale) 56 | export(double_reverse_log) 57 | export(exp_x) 58 | export(lambert) 59 | export(log_x) 60 | export(no_transform) 61 | export(orderNorm) 62 | export(required_pkgs) 63 | export(sqrt_x) 64 | export(step_bestNormalize) 65 | export(step_bestNormalize_new) 66 | export(step_best_normalize) 67 | export(step_orderNorm) 68 | export(yeojohnson) 69 | importFrom(butcher,axe_env) 70 | importFrom(doParallel,registerDoParallel) 71 | importFrom(doRNG,"%dorng%") 72 | importFrom(foreach,"%dopar%") 73 | importFrom(foreach,foreach) 74 | importFrom(generics,required_pkgs) 75 | importFrom(graphics,legend) 76 | importFrom(graphics,lines) 77 | importFrom(graphics,plot) 78 | importFrom(graphics,points) 79 | importFrom(methods,is) 80 | importFrom(purrr,map) 81 | importFrom(recipes,add_step) 82 | importFrom(recipes,bake) 83 | importFrom(recipes,check_type) 84 | importFrom(recipes,ellipse_check) 85 | importFrom(recipes,is_trained) 86 | importFrom(recipes,prep) 87 | importFrom(recipes,print_step) 88 | importFrom(recipes,rand_id) 89 | importFrom(recipes,recipe) 90 | importFrom(recipes,recipes_eval_select) 91 | importFrom(recipes,sel2char) 92 | importFrom(recipes,step) 93 | importFrom(recipes,tidy) 94 | importFrom(stats,approx) 95 | importFrom(stats,density) 96 | importFrom(stats,fitted) 97 | importFrom(stats,glm) 98 | importFrom(stats,lm) 99 | importFrom(stats,median) 100 | importFrom(stats,optimize) 101 | 
importFrom(stats,pnorm) 102 | importFrom(stats,predict) 103 | importFrom(stats,predict.glm) 104 | importFrom(stats,qnorm) 105 | importFrom(stats,quantile) 106 | importFrom(stats,sd) 107 | importFrom(stats,var) 108 | importFrom(tibble,as_tibble) 109 | importFrom(tibble,tibble) 110 | -------------------------------------------------------------------------------- /R/arcsinh_x.R: -------------------------------------------------------------------------------- 1 | #' arcsinh(x) Transformation 2 | #' 3 | #' @name arcsinh_x 4 | #' @aliases predict.arcsinh_x 5 | #' 6 | #' @description Perform a arcsinh(x) transformation 7 | #' @param x A vector to normalize with with x 8 | #' @param standardize If TRUE, the transformed values are also centered and 9 | #' scaled, such that the transformation attempts a standard normal 10 | #' @param object an object of class 'arcsinh_x' 11 | #' @param newdata a vector of data to be (potentially reverse) transformed 12 | #' @param inverse if TRUE, performs reverse transformation 13 | #' @param ... additional arguments 14 | #' @details \code{arcsinh_x} performs an arcsinh transformation in the context of 15 | #' bestNormalize, such that it creates a transformation that can be estimated 16 | #' and applied to new data via the \code{predict} function. 
#'
#' The function is explicitly: log(x + sqrt(x^2 + 1))
#'
#' @return A list of class \code{arcsinh_x} with elements
#'   \item{x.t}{transformed original data}
#'   \item{x}{original data}
#'   \item{mean}{mean after transformation but prior to standardization}
#'   \item{sd}{sd after transformation but prior to standardization}
#'   \item{n}{number of nonmissing observations}
#'   \item{norm_stat}{Pearson's P / degrees of freedom}
#'   \item{standardize}{was the transformation standardized}
#'
#' The \code{predict} function returns the numeric value of the transformation
#' performed on new data, and allows for the inverse transformation as well.
#'
#' @examples
#' x <- rgamma(100, 1, 1)
#'
#' arcsinh_x_obj <- arcsinh_x(x)
#' arcsinh_x_obj
#' p <- predict(arcsinh_x_obj)
#' x2 <- predict(arcsinh_x_obj, newdata = p, inverse = TRUE)
#'
#' all.equal(x2, x)
#'
#' @importFrom stats sd
#' @export
arcsinh_x <- function(x, standardize = TRUE, ...) {
  stopifnot(is.numeric(x))

  # asinh(x) = log(x + sqrt(x^2 + 1)); defined for all real x
  x.t <- asinh(x)
  mu <- mean(x.t, na.rm = TRUE)
  sigma <- sd(x.t, na.rm = TRUE)
  if (standardize) x.t <- (x.t - mu) / sigma

  # Normality statistic: Pearson P / df (lower => more normal)
  ptest <- nortest::pearson.test(x.t)

  val <- list(
    x.t = x.t,
    x = x,
    mean = mu,
    sd = sigma,
    n = length(x.t) - sum(is.na(x)),
    norm_stat = unname(ptest$statistic / ptest$df),
    standardize = standardize
  )
  class(val) <- c('arcsinh_x', class(val))
  val
}

#' @rdname arcsinh_x
#' @method predict arcsinh_x
#' @export
predict.arcsinh_x <- function(object, newdata = NULL, inverse = FALSE, ...) {
  # Default to the fitted data (the transformed data when inverting).
  # Note: `&&` (scalar, short-circuiting) replaces the elementwise `&`
  # originally used in these scalar `if` conditions.
  if (is.null(newdata) && !inverse)
    newdata <- object$x
  if (is.null(newdata) && inverse)
    newdata <- object$x.t

  if (inverse) {
    # Undo standardization first, then invert asinh via sinh
    if (object$standardize)
      newdata <- newdata * object$sd + object$mean
    newdata <- sinh(newdata)
  } else {
    newdata <- asinh(newdata)
    if (object$standardize)
      newdata <- (newdata - object$mean) / object$sd
  }
  unname(newdata)
}

#' @rdname arcsinh_x
#' @method print arcsinh_x
#' @export
print.arcsinh_x <- function(x, ...) {
  cat(ifelse(x$standardize, "Standardized", "Non-Standardized"),
      'asinh(x) Transformation with', x$n, 'nonmissing obs.:\n',
      'Relevant statistics:\n',
      '- mean (before standardization) =', x$mean, '\n',
      '- sd (before standardization) =', x$sd, '\n')
}

# ------------------------------------------------------------------------------
# /R/bestLogConstant.R
# ------------------------------------------------------------------------------
#' Calculate and perform best normalizing log transformation (experimental)
#'
#' @aliases predict.bestLogConstant
#'
#' @description Similar to bestNormalize, this selects the
#'   best candidate constant for a log transformation on the basis
#'   of the Pearson P test statistic for normality. The
#'   transformation that has the lowest P (calculated on the transformed data)
#'   is selected. This function is currently in development and may not behave
#'   as expected.
#'
#'   See details for more information.
#'
#' @param x A vector to normalize
#' @param standardize If TRUE, the transformed values are also centered and
#'   scaled, such that the transformation attempts a standard normal. This will
#'   not change the normality statistic.
#' @param newdata a vector of data to be (reverse) transformed
#' @param inverse if TRUE, performs reverse transformation
#' @param object an object of class 'bestLogConstant'
#' @param a (optional) a list of candidate constants to choose from
#' @param ... additional arguments.

#' @details \code{bestLogConstant} estimates the optimal normalizing constant
#'   for a log transformation. This transformation can be performed on new data, and
#'   inverted, via the \code{predict} function.
#'
#' @return A list of class \code{bestLogConstant} with elements
#'
#'   \item{x.t}{transformed original data} \item{x}{original data}
#'   \item{norm_stats}{Pearson's P / degrees of freedom}
#'   \item{method}{out-of-sample or in-sample, number of folds + repeats}
#'   \item{chosen_transform}{the chosen constant transformation (of class `log_x`)}
#'   \item{other_transforms}{the other transformations (of class `log_x`)}
#'
#'   The \code{predict} function returns the numeric value of the transformation
#'   performed on new data, and allows for the inverse transformation as well.
#'
#'
#' @seealso \code{\link[bestNormalize]{bestNormalize}}, \code{\link{log_x}},
#' @export
bestLogConstant <- function(x,
                            a,
                            standardize = TRUE,
                            ...) {
  stopifnot(is.numeric(x))
  x.t <- list()

  # Default grid of candidate constants: 0.01, 0.1, 1, 10, ..., 1e9
  if (missing(a))
    a <- 10^(-2:9)

  args <- lapply(a, function(x) list("a" = x, "standardize" = standardize))

  method_calls <- rep("log_x", length = length(a))
  names(args) <- method_calls

  # Fit a log_x transformation for each candidate constant; failures are
  # kept as try-errors so a single bad constant does not abort the search
  for (i in seq_along(method_calls)) {
    args_i <- args[[i]]
    args_i$x <- x

    trans_i <- try(do.call(method_calls[i], args_i), silent = TRUE)
    x.t[[i]] <- trans_i
  }

  # Pearson P / df (lower => more normal)
  norm_stat_fn <- function(x) {
    val <- nortest::pearson.test(x)
    unname(val$stat / val$df)
  }

  norm_stats <- unlist(lapply(x.t, function(x) norm_stat_fn(x$x.t)))
  names(norm_stats) <- paste0("a=", a)
  best_idx <- which.min(norm_stats)

  val <- list(
    x.t = x.t[[best_idx]]$x.t,
    x = x,
    a = a,
    best_a = a[best_idx],
    norm_stats = norm_stats,
    chosen_transform = x.t[[best_idx]],
    # Bug fix: x.t has no names, so the previous subset
    # `x.t[names(x.t) != best_idx]` compared NULL names and always
    # returned an empty list. Drop the chosen fit by position instead.
    other_transforms = x.t[-best_idx],
    standardize = standardize
  )
  class(val) <- 'bestLogConstant'
  val
}

#' @rdname bestLogConstant
#' @method predict bestLogConstant
#' @importFrom stats predict
#' @export
predict.bestLogConstant <- function(object, newdata = NULL, inverse = FALSE, ...) {
  # Delegate to the chosen log_x transformation's predict method
  predict(object$chosen_transform, newdata = newdata, inverse = inverse, ...)
}

#' @rdname bestLogConstant
#' @method print bestLogConstant
#' @export
print.bestLogConstant <- function(x, ...) {

  results <- paste0(" - ", x$a, ": ", round(x$norm_stats, 4), collapse = "\n")

  prettyD <- paste0(
    'Estimated Normality Statistics\n',
    "(Pearson P / df, lower => more normal):\n",
    results, '\n')

  cat('Best Normalizing log constant transformation with', x$chosen_transform$n, 'Observations\n',
      prettyD, '\nBased off these, bestLogConstant chose:\n')
  print(x$chosen_transform)
}

# ------------------------------------------------------------------------------
# /R/bestNormalize-package.R
# ------------------------------------------------------------------------------
#' bestNormalize: Flexibly calculate the best normalizing transformation for a
#' vector
#'
#' The \code{bestNormalize} package provides several normalizing transformations, and introduces a
#' new transformation based off of the order statistics, \code{orderNorm}.
#' Perhaps the most useful function is \code{bestNormalize}, which attempts all
#' of these transformations and picks the best one based off of a goodness of
#' fit statistic.
#'
#' @docType package
#' @aliases bestNormalize-package
"_PACKAGE"

# ------------------------------------------------------------------------------
# /R/binarize.R
# ------------------------------------------------------------------------------
#' Binarize
#'
#' @name binarize
#' @aliases predict.binarize
#'
#' @description This function will perform a binarizing transformation, which
#'   could be used as a last resort if the data cannot be adequately normalized.
#'   This may be useful when accidentally attempting normalization of a binary
#'   vector (which could occur if implementing bestNormalize in an automated
#'   fashion).
#'
#'   Note that the transformation is not one-to-one, in contrast to the other
#'   functions in this package.
#'
#' @param x A vector to binarize
#' @param location_measure which location measure should be used? can either be
#'   "median", "mean", "mode", a number, or a function.
#' @param newdata a vector of data to be (reverse) transformed
#' @param inverse if TRUE, performs reverse transformation
#' @param object an object of class 'binarize'
#' @param ... additional arguments
#'
#' @return A list of class \code{binarize} with elements
#'   \item{x.t}{transformed original data}
#'   \item{x}{original data}
#'   \item{method}{location_measure used for original fitting}
#'   \item{location}{estimated location_measure}
#'   \item{n}{number of nonmissing observations}
#'   \item{norm_stat}{Pearson's P / degrees of freedom}
#'
#' The \code{predict} function with \code{inverse = FALSE} returns the numeric
#' value (0 or 1) of the transformation on \code{newdata} (which defaults to
#' the original data).
#'
#' If \code{inverse = TRUE}, since the transform is not 1-1, it will create
#' and return a factor that indicates where the original data was cut.
#'
#' @examples
#' x <- rgamma(100, 1, 1)
#' binarize_obj <- binarize(x)
#' (p <- predict(binarize_obj))
#'
#' predict(binarize_obj, newdata = p, inverse = TRUE)
#' @importFrom stats density median
#' @export
binarize <- function(x, location_measure = 'median') {
  stopifnot(is.numeric(x))

  # Check and set location measure; a number or a function is used directly,
  # otherwise the string selects a built-in estimator.
  if (is.numeric(location_measure)) {
    loc <- location_measure
  } else if (is.function(location_measure)) {
    loc <- location_measure(x)
  } else if (location_measure == 'median') {
    loc <- median(x, na.rm = TRUE)
  } else if (location_measure == 'mean') {
    loc <- mean(x, na.rm = TRUE)
  } else if (location_measure == 'mode') {
    # Mode estimated as the peak of a kernel density fit
    dens <- density(x[!is.na(x)])
    loc <- dens$x[which.max(dens$y)]
  } else {
    # Previously an unrecognized string fell through and later failed with
    # an opaque "object 'loc' not found"; fail fast with a clear message.
    stop("unrecognized location_measure; must be 'median', 'mean', 'mode', ",
         "a number, or a function", call. = FALSE)
  }

  x.t <- as.numeric(x > loc)

  ptest <- nortest::pearson.test(x.t)

  val <- list(
    x.t = x.t,
    x = x,
    method = location_measure,
    location = loc,
    n = length(x.t) - sum(is.na(x)),
    norm_stat = unname(ptest$statistic / ptest$df)
  )
  class(val) <- 'binarize'
  val
}


#' @rdname binarize
#' @method predict binarize
#' @export
predict.binarize <- function(object, newdata = NULL, inverse = FALSE, ...) {
  # Default to the original data (transformed data when inverting).
  # `&&` replaces the elementwise `&` originally used in this scalar `if`.
  if (is.null(newdata) && !inverse)
    newdata <- object$x
  if (is.null(newdata))
    newdata <- object$x.t

  if (!inverse)
    return(as.numeric(newdata > object$location))
  # Inverse is not 1-1; return a factor indicating which side of the
  # cutpoint each observation fell on.
  prettyLoc <- round(object$location,
                     getOption('digits', 2))
  labels <- c(paste0('< ', prettyLoc),
              paste0('>= ', prettyLoc))
  factor(newdata, levels = 0:1, labels = labels)
}


#' @rdname binarize
#' @method print binarize
#' @export
print.binarize <- function(x, ...) {
  cat('Binarize Transformation with', x$n,
      'nonmissing obs.\nEstimated Statistic:\n -', x$method,
      '=', x$location)
}

# ------------------------------------------------------------------------------
# /R/boxcox.R
# ------------------------------------------------------------------------------
#' Box-Cox Normalization
#'
#' @name boxcox
#' @aliases predict.boxcox
#'
#' @description Perform a Box-Cox transformation and center/scale a vector to
#'   attempt normalization
#' @param x A vector to normalize with Box-Cox
#' @param standardize If TRUE, the transformed values are also centered and
#'   scaled, such that the transformation attempts a standard normal
#' @param ... Additional arguments that can be passed to the estimation of the
#'   lambda parameter (lower, upper, epsilon)
#' @param object an object of class 'boxcox'
#' @param newdata a vector of data to be (reverse) transformed
#' @param inverse if TRUE, performs reverse transformation
#' @details \code{boxcox} estimates the optimal value of lambda for the Box-Cox
#'   transformation. This transformation can be performed on new data, and
#'   inverted, via the \code{predict} function.
#'
#'   The function will return an error if a user attempts to transform
#'   nonpositive data.
#'
#'
#' @return A list of class \code{boxcox} with elements
#'   \item{x.t}{transformed original data}
#'   \item{x}{original data}
#'   \item{mean}{mean after transformation but prior to standardization}
#'   \item{sd}{sd after transformation but prior to standardization}
#'   \item{lambda}{estimated lambda value for skew transformation}
#'   \item{n}{number of nonmissing observations}
#'   \item{norm_stat}{Pearson's P / degrees of freedom}
#'   \item{standardize}{was the transformation standardized}
#'
#' The \code{predict} function returns the numeric value of the transformation
#' performed on new data, and allows for the inverse transformation as well.
#'
#' @references Box, G. E. P. and Cox, D. R. (1964) An analysis of
#'   transformations. Journal of the Royal Statistical Society B, 26, 211-252.
#'
#' @examples
#' x <- rgamma(100, 1, 1)
#'
#' bc_obj <- boxcox(x)
#' bc_obj
#' p <- predict(bc_obj)
#' x2 <- predict(bc_obj, newdata = p, inverse = TRUE)
#'
#' all.equal(x2, x)
#' @seealso \code{\link[MASS]{boxcox}}
#' @importFrom stats sd
#' @export
boxcox <- function(x, standardize = TRUE, ...) {
  stopifnot(is.numeric(x))
  # Estimate lambda by profile likelihood (errors on nonpositive x)
  l <- estimate_boxcox_lambda(x, ...)
  x.t <- boxcox_trans(x, l)
  mu <- mean(x.t, na.rm = TRUE)
  sigma <- sd(x.t, na.rm = TRUE)
  if (standardize) x.t <- (x.t - mu) / sigma

  # Normality statistic: Pearson P / df (lower => more normal)
  ptest <- nortest::pearson.test(x.t)

  val <- list(
    x.t = x.t,
    x = x,
    mean = mu,
    sd = sigma,
    lambda = l,
    n = length(x.t) - sum(is.na(x)),
    norm_stat = unname(ptest$statistic / ptest$df),
    standardize = standardize
  )
  class(val) <- c('boxcox', class(val))
  val
}

#' @rdname boxcox
#' @method predict boxcox
#' @export
predict.boxcox <- function(object, newdata = NULL, inverse = FALSE, ...) {
  # Default to the fitted data (the transformed data when inverting).
  # `&&` replaces the elementwise `&` originally used in these scalar `if`s.
  if (is.null(newdata) && !inverse)
    newdata <- object$x
  if (is.null(newdata) && inverse)
    newdata <- object$x.t

  if (inverse) {
    # Undo standardization first, then invert the Box-Cox transform
    if (object$standardize) newdata <- newdata * object$sd + object$mean
    newdata <- inv_boxcox_trans(newdata, object$lambda)
  } else {
    newdata <- boxcox_trans(newdata, object$lambda)
    if (object$standardize) newdata <- (newdata - object$mean) / object$sd
  }
  unname(newdata)
}

#' @rdname boxcox
#' @method print boxcox
#' @export
print.boxcox <- function(x, ...) {
  cat(ifelse(x$standardize, "Standardized", "Non-Standardized"),
      'Box Cox Transformation with', x$n, 'nonmissing obs.:\n',
      'Estimated statistics:\n',
      '- lambda =', x$lambda, '\n',
      '- mean (before standardization) =', x$mean, '\n',
      '- sd (before standardization) =', x$sd, '\n')
}

# Modified version of boxcox from MASS package: profile-likelihood
# estimate of lambda on the interval [lower, upper].
#' @importFrom stats lm optimize
estimate_boxcox_lambda <- function(x, lower = -1, upper = 2, eps = .001) {
  n <- length(x)
  ccID <- !is.na(x)
  x <- x[ccID]

  if (any(x <= 0))
    stop("x must be positive")

  log_x <- log(x)
  xbar <- exp(mean(log_x))  # geometric mean, used to rescale the likelihood

  boxcox_loglik <- function(lambda) {
    gm0 <- xbar ^ (lambda - 1)
    # Near lambda = 0 the transform degenerates to log(x)
    z <- if (abs(lambda) <= eps)
      log_x / gm0
    else
      (x ^ lambda - 1) / (lambda * gm0)
    var_z <- var(z) * (n - 1) / n
    - .5 * n * log(var_z)
  }

  results <- optimize(boxcox_loglik, lower = lower,
                      upper = upper, maximum = TRUE,
                      tol = .0001)

  results$maximum
}

# Internal transformation functions
# Forward Box-Cox: (x^lambda - 1) / lambda, or log(x) when |lambda| < eps.
boxcox_trans <- function(x, lambda, eps = .001) {
  # Negative x is undefined for negative lambda; mark as NA
  if (lambda < 0)
    x[x < 0] <- NA
  if (abs(lambda) < eps)
    val <- log(x)
  else
    val <- (sign(x) * abs(x) ^ lambda - 1) / lambda
  val
}

# Inverse Box-Cox: (lambda * x + 1)^(1/lambda), or exp(x) when |lambda| < eps.
inv_boxcox_trans <- function(x, lambda, eps = .001) {
  # Values beyond -1/lambda are outside the transform's range; mark as NA
  if (lambda < 0)
    x[x > -1 / lambda] <- NA
  if (abs(lambda) < eps)
    val <- exp(x)
  else {
    x <- x * lambda + 1
    val <- sign(x) * abs(x) ^ (1 / lambda)
  }
  val
}

# ------------------------------------------------------------------------------
# /R/data.R
# ------------------------------------------------------------------------------
#' Prices of 6,283 cars listed on Autotrader
#'
#' A dataset containing the prices and other attributes of over 6000 cars in the Minneapolis area.
#'
#' @format A data frame with 6283 rows and 10 variables:
#' \describe{
#'   \item{price}{price, in US dollars}
#'   \item{Car_Info}{Raw description from website}
#'   \item{Link}{hyperlink to listing (must be appended to https://www.autotrader.com/)}
#'   \item{Make}{Car manufacturer}
#'   \item{Year}{Year car manufactured}
#'   \item{Location}{Location of listing}
#'   \item{Radius}{Radius chosen for search}
#'   \item{mileage}{mileage on vehicle}
#'   \item{status}{used/new/certified}
#'   \item{model}{make and model, separated by space}
#' }
#' @source \url{https://www.autotrader.com/}
"autotrader"

# ------------------------------------------------------------------------------
# /R/double_reverse_log.R
# ------------------------------------------------------------------------------
#' Double Reverse Log(x + a) Transformation
#'
#' @name double_reverse_log
#' @aliases predict.double_reverse_log
#'
#' @description First reverses scores, then perform a log_b(x)
#'   normalization transformation, and then reverses scores again.
#'
#' @param x A vector to normalize with x
#' @param b The base of the log (defaults to 10)
#' @param standardize If TRUE, the transformed values are also centered and
#'   scaled, such that the transformation attempts a standard normal
#' @param eps The cushion for the transformation range (defaults to 10 percent)
#' @param warn Should a warning result from infinite values?
#' @param object an object of class 'double_reverse_log'
#' @param newdata a vector of data to be (potentially reverse) transformed
#' @param inverse if TRUE, performs reverse transformation
#' @param ... additional arguments
#' @details \code{double_reverse_log} performs a simple log transformation in
#'   the context of bestNormalize, such that it creates a transformation that
#'   can be estimated and applied to new data via the \code{predict} function.
#'   The parameter a is essentially estimated by the training set by default
#'   (estimated as the minimum possible to some extent epsilon), while the
#'   base must be specified beforehand.
#'
#' @return A list of class \code{double_reverse_log} with elements
#'   \item{x.t}{transformed original data}
#'   \item{x}{original data}
#'   \item{mean}{mean after transformation but prior to standardization}
#'   \item{sd}{sd after transformation but prior to standardization}
#'   \item{b}{estimated base b value}
#'   \item{n}{number of nonmissing observations}
#'   \item{norm_stat}{Pearson's P / degrees of freedom}
#'   \item{standardize}{was the transformation standardized}
#'
#' The \code{predict} function returns the numeric value of the transformation
#' performed on new data, and allows for the inverse transformation as well.
#'
#' @examples
#' x <- rgamma(100, 1, 1)
#'
#' double_reverse_log_obj <- double_reverse_log(x)
#' double_reverse_log_obj
#' p <- predict(double_reverse_log_obj)
#' x2 <- predict(double_reverse_log_obj, newdata = p, inverse = TRUE)
#'
#' all.equal(x2, x)
#'
#' @importFrom stats sd
#' @export
double_reverse_log <- function(x,
                               b = 10,
                               standardize = TRUE,
                               eps = diff(range(x, na.rm = TRUE)) / 10,
                               warn = TRUE,
                               ...) {
  stopifnot(is.numeric(x))

  # Pad the observed range so new data slightly outside it still transforms.
  max_x <- max(x, na.rm = TRUE) + eps
  min_x <- min(x, na.rm = TRUE) - eps

  # Padded maximum on the transformed scale, used to re-reverse scores.
  max_xt <- log(max_x - min_x, base = b)

  # Perform transformation (reverse, log, reverse)
  x_rev <- max_x - x
  x.t_rev <- log(x_rev, base = b)
  x.t <- max_xt - x.t_rev

  # All-infinite transforms are unusable; partial infinities disable
  # standardization (mean/sd would be undefined).
  stopifnot(!all(infinite_idx <- is.infinite(x.t)))
  if (any(infinite_idx)) {
    # FIX: respect the documented `warn` argument; previously the warning
    # fired unconditionally, making the parameter a no-op.
    if (warn) warning("Some values are infinite")
    standardize <- FALSE
  }

  mu <- mean(x.t, na.rm = TRUE)
  sigma <- sd(x.t, na.rm = TRUE)
  if (standardize) x.t <- (x.t - mu) / sigma

  ptest <- nortest::pearson.test(x.t)

  val <- list(
    x.t = x.t,
    x = x,
    mean = mu,
    sd = sigma,
    b = b,
    eps = eps,
    n = length(x.t) - sum(is.na(x)),
    norm_stat = unname(ptest$statistic / ptest$df),
    standardize = standardize,
    max_x = max_x,
    min_x = min_x,
    max_xt = max_xt
  )
  class(val) <- c('double_reverse_log', class(val))
  val
}

#' @rdname double_reverse_log
#' @method predict double_reverse_log
#' @export
predict.double_reverse_log <- function(object, newdata = NULL, inverse = FALSE, ...) {
  # Default to the training data (or its transform, when inverting).
  if (is.null(newdata) & !inverse)
    newdata <- object$x
  if (is.null(newdata) & inverse)
    newdata <- object$x.t

  if (inverse) {
    # Undo standardization first, then invert reverse-log-reverse.
    if (object$standardize)
      newdata <- newdata * object$sd + object$mean
    new_xt_rev <- object$max_xt - newdata
    new_x_rev <- object$b^new_xt_rev
    newdata <- object$max_x - new_x_rev
  } else {
    new_x_rev <- object$max_x - newdata
    new_xt_rev <- log(new_x_rev, object$b)
    newdata <- object$max_xt - new_xt_rev
    if (object$standardize)
      newdata <- (newdata - object$mean) / object$sd
  }

  unname(newdata)
}

#' @rdname double_reverse_log
#' @method print double_reverse_log
#' @export
print.double_reverse_log <- function(x, ...) {
  # FIX: dropped the "- a =" line; double_reverse_log objects carry no `a`
  # element (that field belongs to log_x), so x$a was always NULL.
  cat(ifelse(x$standardize, "Standardized", "Non-Standardized"),
      'double reversed Log_b(x + a) Transformation with', x$n, 'nonmissing obs.:\n',
      'Relevant statistics:\n',
      '- b =', x$b, '\n',
      '- max(x) =', x$max_x, '; min(x) =', x$min_x, '\n',
      '- mean (before standardization) =', x$mean, '\n',
      '- sd (before standardization) =', x$sd, '\n')
}

# -- file: R/exp_x.R ----------------------------------------------------------

#' exp(x) Transformation
#'
#' @name exp_x
#' @aliases predict.exp_x
#'
#' @description Perform a exp(x) transformation
#' @param x A vector to normalize with x
#' @param standardize If TRUE, the transformed values are also centered and
#'   scaled, such that the transformation attempts a standard normal
#' @param warn Should a warning result from infinite values?
#' @param object an object of class 'exp_x'
#' @param newdata a vector of data to be (potentially reverse) transformed
#' @param inverse if TRUE, performs reverse transformation
#' @param ... additional arguments
#' @details \code{exp_x} performs a simple exponential transformation in the
#'   context of bestNormalize, such that it creates a transformation that can
#'   be estimated and applied to new data via the \code{predict} function.
#'
#' @return A list of class \code{exp_x} with elements
#'   \item{x.t}{transformed original data}
#'   \item{x}{original data}
#'   \item{mean}{mean after transformation but prior to standardization}
#'   \item{sd}{sd after transformation but prior to standardization}
#'   \item{n}{number of nonmissing observations}
#'   \item{norm_stat}{Pearson's P / degrees of freedom}
#'   \item{standardize}{was the transformation standardized}
#'
#' The \code{predict} function returns the numeric value of the transformation
#' performed on new data, and allows for the inverse transformation as well.
#'
#' @examples
#' x <- rgamma(100, 1, 1)
#'
#' exp_x_obj <- exp_x(x)
#' exp_x_obj
#' p <- predict(exp_x_obj)
#' x2 <- predict(exp_x_obj, newdata = p, inverse = TRUE)
#'
#' all.equal(x2, x)
#'
#' @importFrom stats sd
#' @export
exp_x <- function(x, standardize = TRUE, warn = TRUE, ...) {
  stopifnot(is.numeric(x))

  # Exponentiate, recording center/scale on the transformed scale.
  transformed <- exp(x)
  mu <- mean(transformed, na.rm = TRUE)
  sigma <- sd(transformed, na.rm = TRUE)
  blown_up <- is.infinite(transformed)
  if (standardize) transformed <- (transformed - mu) / sigma

  # exp() overflows to Inf for large x; such fits are unusable.
  if (any(blown_up)) {
    stop("infinite post-transformation values")
  }

  ptest <- nortest::pearson.test(transformed)

  fit <- list(
    x.t = transformed,
    x = x,
    mean = mu,
    sd = sigma,
    n = length(transformed) - sum(is.na(x)),
    norm_stat = unname(ptest$statistic / ptest$df),
    standardize = standardize
  )
  class(fit) <- c('exp_x', class(fit))
  fit
}

#' @rdname exp_x
#' @method predict exp_x
#' @export
predict.exp_x <- function(object, newdata = NULL, inverse = FALSE, ...) {
  # Default to the training data (or its transform, when inverting).
  if (is.null(newdata)) {
    newdata <- if (inverse) object$x.t else object$x
  }

  if (inverse) {
    # Undo standardization first, then take logs to invert exp().
    if (object$standardize)
      newdata <- newdata * object$sd + object$mean
    newdata <- log(newdata)
  } else {
    newdata <- exp(newdata)
    if (object$standardize)
      newdata <- (newdata - object$mean) / object$sd
  }
  unname(newdata)
}

#' @rdname exp_x
#' @method print exp_x
#' @export
print.exp_x <- function(x, ...) {
  # Human-readable summary of the fitted transformation.
  cat(ifelse(x$standardize, "Standardized", "Non-Standardized"),
      'exp(x) Transformation with', x$n, 'nonmissing obs.:\n',
      'Relevant statistics:\n',
      '- mean (before standardization) =', x$mean, '\n',
      '- sd (before standardization) =', x$sd, '\n')
}

# -- file: R/lambert.R --------------------------------------------------------

#' Lambert W x F Normalization
#'
#' @name lambert
#' @aliases predict.lambert
#'
#' @description Perform Lambert's W x F transformation and center/scale a
#'   vector to attempt normalization via the \code{LambertW} package.
#' @param x A vector to normalize with Box-Cox
#' @param type a character indicating which transformation to perform (options
#'   are "s", "h", and "hh", see details)
#' @param standardize If TRUE, the transformed values are also centered and
#'   scaled, such that the transformation attempts a standard normal
#' @param ... Additional arguments that can be passed to the
#'   LambertW::Gaussianize function
#' @param newdata a vector of data to be (reverse) transformed
#' @param inverse if TRUE, performs reverse transformation
#' @param object an object of class 'lambert'
#' @param warn should the function show warnings
#'
#' @details \code{lambert} uses the \code{LambertW} package to estimate a
#'   normalizing (or "Gaussianizing") transformation. This transformation can
#'   be performed on new data, and inverted, via the \code{predict} function.
#'
#' NOTE: The type = "s" argument is the only one that does the 1-1 transform
#' consistently, and so it is the only method currently used in
#' \code{bestNormalize()}. Use type = "h" or type = 'hh' at risk of not having
#' this estimate 1-1 transform.
#' These alternative types are effective when the data has exceptionally
#' heavy tails, e.g. the Cauchy distribution.
#'
#' Additionally, sometimes (depending on the distribution) this method will be
#' unable to extrapolate beyond the observed bounds. In these cases, NaN is
#' returned.
#'
#' @return A list of class \code{lambert} with elements
#'   \item{x.t}{transformed original data}
#'   \item{x}{original data}
#'   \item{mean}{mean after transformation but prior to standardization}
#'   \item{sd}{sd after transformation but prior to standardization}
#'   \item{tau.mat}{estimated parameters of LambertW::Gaussianize}
#'   \item{n}{number of nonmissing observations}
#'   \item{norm_stat}{Pearson's P / degrees of freedom}
#'   \item{standardize}{was the transformation standardized}
#'
#' The \code{predict} function returns the numeric value of the transformation
#' performed on new data, and allows for the inverse transformation as well.
#'
#' @references Georg M. Goerg (2016). LambertW: An R package for Lambert W x F
#'   Random Variables. R package version 0.6.4.
#'
#'   Georg M. Goerg (2011): Lambert W random variables - a new family of
#'   generalized skewed distributions with applications to risk estimation.
#'   Annals of Applied Statistics 3(5). 2197-2230.
#'
#'   Georg M. Goerg (2014): The Lambert Way to Gaussianize heavy-tailed data
#'   with the inverse of Tukey's h transformation as a special case. The
#'   Scientific World Journal.
#'
#' @examples
#' \dontrun{
#' x <- rgamma(100, 1, 1)
#'
#' lambert_obj <- lambert(x)
#' lambert_obj
#' p <- predict(lambert_obj)
#' x2 <- predict(lambert_obj, newdata = p, inverse = TRUE)
#'
#' all.equal(x2, x)
#' }
#'
#' @seealso \code{\link[LambertW]{Gaussianize}}
#' @importFrom stats sd
#' @export
lambert <- function(x, type = 's', standardize = TRUE, warn = FALSE, ...) {
  stopifnot(is.numeric(x))

  # Fit the Gaussianizing transform on complete cases only; NAs are
  # re-inserted afterward so the output aligns with the input.
  miss <- is.na(x)
  complete_x <- x[!miss]
  gfit <- unname(LambertW::Gaussianize(complete_x, type = type,
                                       return.tau.mat = TRUE, ...))

  transformed <- x
  transformed[!miss] <- gfit[[1]]
  tau.mat <- gfit[[2]]

  mu <- mean(transformed, na.rm = TRUE)
  sigma <- sd(transformed, na.rm = TRUE)

  if (standardize) transformed <- (transformed - mu) / sigma

  # Gaussianize returns a matrix; strip dims/names back to a plain vector.
  attributes(transformed) <- NULL

  ptest <- nortest::pearson.test(transformed)

  fit <- list(
    x.t = unname(transformed),
    x = x,
    mean = mu,
    sd = sigma,
    tau.mat = tau.mat,
    n = length(transformed) - sum(miss),
    type = type,
    norm_stat = unname(ptest$statistic / ptest$df),
    standardize = standardize
  )

  class(fit) <- 'lambert'
  fit
}

#' @rdname lambert
#' @method predict lambert
#' @export
predict.lambert <- function(object,
                            newdata = NULL,
                            inverse = FALSE,
                            ...) {
  # Default to the training data (or its transform, when inverting).
  if (is.null(newdata) & !inverse)
    newdata <- object$x
  if (is.null(newdata) & inverse)
    newdata <- object$x.t

  # When inverting, undo standardization before un-Gaussianizing.
  if (inverse & object$standardize)
    newdata <- newdata * object$sd + object$mean

  stopifnot(is.numeric(newdata))

  miss <- is.na(newdata)

  # Apply (or invert) the fitted Gaussianize on the non-missing entries.
  newdata[!miss] <- LambertW::Gaussianize(
    as.matrix(newdata[!miss]),
    type = object$type,
    tau.mat = object$tau.mat,
    inverse = inverse
  )

  if (!inverse & object$standardize)
    newdata <- (newdata - object$mean) / object$sd

  attributes(newdata) <- NULL
  unname(newdata)
}

#' @rdname lambert
#' @method print lambert
#' @export
print.lambert <- function(x, ...) {
  # Format each estimated tau parameter (beyond the first two rows,
  # which hold location/scale) as its own "- name = value" line.
  prettyTau <- apply(cbind('- ',
                           rownames(x$tau.mat)[-c(1:2)], ' = ',
                           round(x$tau.mat[-c(1:2)], 4), '\n'
                           ), 1, paste, collapse = '')

  cat(ifelse(x$standardize, "Standardized", "Non-Standardized"),
      'Lambert WxF Transformation of type', x$type,
      'with', x$n, 'nonmissing obs.:\n',
      'Estimated statistics:\n',
      prettyTau,
      '- mean (before standardization) =', x$mean, '\n',
      '- sd (before standardization) =', x$sd, '\n')
}

# -- file: R/log_x.R ----------------------------------------------------------

#' Log(x + a) Transformation
#'
#' @name log_x
#' @aliases predict.log_x
#'
#' @description Perform a log_b (x+a) normalization transformation
#' @param x A vector to normalize with x
#' @param standardize If TRUE, the transformed values are also centered and
#'   scaled, such that the transformation attempts a standard normal
#' @param a The constant to add to x (defaults to max(0, -min(x) + eps));
#'   see \code{bestLogConstant}
#' @param b The base of the log (defaults to 10)
#' @param eps The allowed error in the expression for the selected a
#' @param warn Should a warning result from infinite values?
#' @param object an object of class 'log_x'
#' @param newdata a vector of data to be (potentially reverse) transformed
#' @param inverse if TRUE, performs reverse transformation
#' @param ... additional arguments
#' @details \code{log_x} performs a simple log transformation in the context
#'   of bestNormalize, such that it creates a transformation that can be
#'   estimated and applied to new data via the \code{predict} function. The
#'   parameter a is essentially estimated by the training set by default
#'   (estimated as the minimum possible to some extent epsilon), while the
#'   base must be specified beforehand.
#'
#' @return A list of class \code{log_x} with elements
#'   \item{x.t}{transformed original data}
#'   \item{x}{original data}
#'   \item{mean}{mean after transformation but prior to standardization}
#'   \item{sd}{sd after transformation but prior to standardization}
#'   \item{a}{estimated a value}
#'   \item{b}{estimated base b value}
#'   \item{n}{number of nonmissing observations}
#'   \item{norm_stat}{Pearson's P / degrees of freedom}
#'   \item{standardize}{was the transformation standardized}
#'
#' The \code{predict} function returns the numeric value of the transformation
#' performed on new data, and allows for the inverse transformation as well.
#'
#' @examples
#' x <- rgamma(100, 1, 1)
#'
#' log_x_obj <- log_x(x)
#' log_x_obj
#' p <- predict(log_x_obj)
#' x2 <- predict(log_x_obj, newdata = p, inverse = TRUE)
#'
#' all.equal(x2, x)
#'
#' @importFrom stats sd
#' @export
log_x <- function(x, a = NULL, b = 10, standardize = TRUE, eps = .001, warn = TRUE, ...) {
  stopifnot(is.numeric(x))

  # Smallest admissible shift so that x + a > 0 (up to eps).
  min_a <- max(0, -(min(x, na.rm = TRUE) - eps))
  if (!length(a))
    a <- min_a
  if (a < min_a) {
    # FIX: the two message pieces previously ran together with no separator
    # ("...issuesStandardize..."); punctuation added between them.
    warning("Setting a < max(0, -(min(x) - eps)) can lead to transformation issues. ",
            "Standardize set to FALSE")
    standardize <- FALSE
  }

  x.t <- log(x + a, base = b)

  # All-infinite transforms are unusable; partial infinities disable
  # standardization (mean/sd would be undefined).
  stopifnot(!all(infinite_idx <- is.infinite(x.t)))
  if (any(infinite_idx)) {
    # FIX: respect the documented `warn` argument; previously the warning
    # fired unconditionally, making the parameter a no-op.
    if (warn) warning("Some values are infinite")
    standardize <- FALSE
  }

  mu <- mean(x.t, na.rm = TRUE)
  sigma <- sd(x.t, na.rm = TRUE)
  if (standardize) x.t <- (x.t - mu) / sigma

  ptest <- nortest::pearson.test(x.t)

  val <- list(
    x.t = x.t,
    x = x,
    mean = mu,
    sd = sigma,
    a = a,
    b = b,
    eps = eps,
    n = length(x.t) - sum(is.na(x)),
    norm_stat = unname(ptest$statistic / ptest$df),
    standardize = standardize
  )
  class(val) <- c('log_x', class(val))
  val
}

#' @rdname log_x
#' @method predict log_x
#' @export
predict.log_x <- function(object, newdata = NULL, inverse = FALSE, ...) {
  # Default to the training data (or its transform, when inverting).
  if (is.null(newdata) & !inverse)
    newdata <- object$x
  if (is.null(newdata) & inverse)
    newdata <- object$x.t

  if (inverse) {
    # Undo standardization first, then invert log_b(x + a).
    if (object$standardize)
      newdata <- newdata * object$sd + object$mean
    newdata <- object$b^newdata - object$a
  } else if (!inverse) {
    newdata <- log(newdata + object$a, object$b)
    if (object$standardize)
      newdata <- (newdata - object$mean) / object$sd
  }
  unname(newdata)
}

#' @rdname log_x
#' @method print log_x
#' @export
print.log_x <- function(x, ...) {
  # Human-readable summary of the fitted transformation.
  cat(ifelse(x$standardize, "Standardized", "Non-Standardized"),
      'Log_b(x + a) Transformation with', x$n, 'nonmissing obs.:\n',
      'Relevant statistics:\n',
      '- a =', x$a, '\n',
      '- b =', x$b, '\n',
      '- mean (before standardization) =', x$mean, '\n',
      '- sd (before standardization) =', x$sd, '\n')
}

# -- file: R/no_transform.R ---------------------------------------------------

#' Identity transformation and center/scale transform
#'
#' @name no_transform
#' @aliases predict.no_transform center_scale
#'
#' @description Perform an identity transformation. Admittedly it seems odd to
#'   have a dedicated function to essentially do I(x), but it makes sense to
#'   keep the same syntax as the other transformations so it plays nicely
#'   with them. As a benefit, the bestNormalize function will also show
#'   a comparable normalization statistic for the untransformed data. If
#'   \code{standardize == TRUE}, \code{center_scale} passes to bestNormalize
#'   instead.
#' @param x A vector
#' @param warn Should a warning result from infinite values?
#' @param object an object of class 'no_transform'
#' @param newdata a vector of data to be (potentially reverse) transformed
#' @param inverse if TRUE, performs reverse transformation
#' @param ... additional arguments
#' @details \code{no_transform} creates a identity transformation object
#'   that can be applied to new data via the \code{predict} function.
#'
#' @return A list of class \code{no_transform} with elements
#'   \item{x.t}{transformed original data}
#'   \item{x}{original data}
#'   \item{n}{number of nonmissing observations}
#'   \item{norm_stat}{Pearson's P / degrees of freedom}
#'
#' The \code{predict} function returns the numeric value of the transformation
#' performed on new data, and allows for the inverse transformation as well.
#'
#' @examples
#' x <- rgamma(100, 1, 1)
#'
#' no_transform_obj <- no_transform(x)
#' no_transform_obj
#' p <- predict(no_transform_obj)
#' x2 <- predict(no_transform_obj, newdata = p, inverse = TRUE)
#'
#' all.equal(x2, x)
#'
#' @importFrom stats sd
#' @export
no_transform <- function(x, warn = TRUE, ...) {
  stopifnot(is.numeric(x))

  # Identity: the "transform" is the data itself.
  x.t <- x

  if (all(infinite_idx <- is.infinite(x.t))) {
    stop("Transformation infinite for all x")
  }
  if (any(infinite_idx) & warn) {
    warning("Some values (but not all) transformed values are infinite")
  }

  # mean/sd are recorded for interface parity with the other transforms.
  mu <- mean(x.t, na.rm = TRUE)
  sigma <- sd(x.t, na.rm = TRUE)

  ptest <- nortest::pearson.test(x.t)

  val <- list(
    x.t = x.t,
    x = x,
    mean = mu,
    sd = sigma,
    n = length(x.t) - sum(is.na(x)),
    norm_stat = unname(ptest$statistic / ptest$df)
  )
  class(val) <- c('no_transform', class(val))
  val
}

#' @rdname no_transform
#' @method predict no_transform
#' @export
predict.no_transform <- function(object, newdata = NULL, inverse = FALSE, ...) {
  # Identity transform: forward and inverse both return the input unchanged.
  # FIX: removed the original's two redundant no-op branches
  # (newdata <- newdata in both directions).
  if (is.null(newdata))
    newdata <- if (inverse) object$x.t else object$x
  unname(newdata)
}

#' @rdname no_transform
#' @method print no_transform
#' @export
print.no_transform <- function(x, ...) {
  cat('I(x) Transformation with', x$n, 'nonmissing obs.\n')
}

#' @rdname no_transform
#' @importFrom stats sd
#' @export
center_scale <- function(x, warn = TRUE, ...) {
  stopifnot(is.numeric(x))

  x.t <- x

  if (all(infinite_idx <- is.infinite(x.t))) {
    stop("Transformation infinite for all x")
  }
  if (any(infinite_idx) & warn) {
    warning("Some values (but not all) transformed values are infinite")
  }

  # Center and scale; mean/sd are stored so predict() can invert.
  mu <- mean(x.t, na.rm = TRUE)
  sigma <- sd(x.t, na.rm = TRUE)

  x.t <- (x.t - mu) / sigma

  ptest <- nortest::pearson.test(x.t)

  val <- list(
    x.t = x.t,
    x = x,
    mean = mu,
    sd = sigma,
    n = length(x.t) - sum(is.na(x)),
    norm_stat = unname(ptest$statistic / ptest$df)
  )
  class(val) <- c('center_scale', class(val))
  val
}

#' @rdname no_transform
#' @method predict center_scale
#' @export
predict.center_scale <- function(object, newdata = NULL, inverse = FALSE, ...) {
  # Default to the training data (or its transform, when inverting).
  if (is.null(newdata) & !inverse)
    newdata <- object$x
  if (is.null(newdata) & inverse)
    newdata <- object$x.t

  if (inverse) {
    newdata <- newdata * object$sd + object$mean
  } else if (!inverse) {
    newdata <- (newdata - object$mean) / object$sd
  }
  unname(newdata)
}

#' @rdname no_transform
#' @method print center_scale
#' @export
print.center_scale <- function(x, ...) {
  cat('center_scale(x) Transformation with', x$n, 'nonmissing obs.\n',
      'Estimated statistics:\n',
      '- mean (before standardization) =', x$mean, '\n',
      '- sd (before standardization) =', x$sd, '\n')
}

#' @rdname no_transform
#' @param x A `no_transform` object.
#' @param ... not used
#' @importFrom tibble tibble
#' @export
tidy.no_transform <- function(x, ...) {
  # FIX: previously the function body ended in an assignment
  # (`value <- tibble(...)`), so the tibble was returned *invisibly*;
  # return it directly so `tidy(obj)` prints at the console as expected.
  tibble(
    "transform" = c("no_transform"),
    "norm_stat" = x$norm_stat,
    "chosen" = 1
  )
}

# -- file: R/plots.R ----------------------------------------------------------

#' Transformation plotting
#'
#' Plots transformation functions for objects produced by the bestNormalize
#' package
#'
#' @name plot.bestNormalize
#' @aliases plot.orderNorm plot.boxcox plot.yeojohnson plot.lambert
#'
#' @details The plots produced by the individual transformations are simply
#'   plots of the original values by the newly transformed values, with a line
#'   denoting where transformations would take place for new data.
#'
#'   For the bestNormalize object, this plots each of the possible
#'   transformations run by the original call to bestNormalize. The first
#'   argument in the "cols" parameter refers to the color of the chosen
#'   transformation.
#'
#' @rdname plot.bestNormalize
#' @param x a fitted transformation
#' @param inverse if TRUE, plots the inverse transformation
#' @param bounds a vector of bounds to plot for the transformation
#' @param cols a vector of colors to use for the transforms (see details)
#' @param methods a vector of transformations to plot
#' @param leg_loc the location of the legend on the plot
#' @param ...
#'   further parameters to be passed to \code{plot} and \code{lines}
#' @importFrom graphics legend lines plot points
#' @export
plot.bestNormalize <- function(x, inverse = FALSE, bounds = NULL,
                               cols = NULL,
                               methods = NULL,
                               leg_loc = 'top',
                               ...) {

  # Direction of the mapping: forward plots x -> g(x); inverse plots g(x) -> x.
  if (!inverse) {
    xvals <- x$x
    x_t <- x$x.t
  } else {
    xvals <- x$x.t
    x_t <- x$x
  }

  # Evaluation grid for each transformation curve.
  if (is.null(bounds)) {
    grid_x <- seq(min(xvals, na.rm = TRUE), max(xvals, na.rm = TRUE), length = 1000)
  } else {
    grid_x <- seq(min(bounds), max(bounds), length = 1000)
  }

  chosen_y <- predict(x, newdata = grid_x, inverse = inverse, warn = FALSE)

  # Curves for the competing transformations (all of them by default).
  if (is.null(methods)) {
    methods <- c(names(x$other_transforms))
    other_y <- lapply(x$other_transforms, function(obj) {
      predict(obj, newdata = grid_x, inverse = inverse, warn = FALSE)
    })
  } else {
    found <- intersect(names(x$other_transforms), methods)
    if (length(found) != length(methods))
      stop("Not all specified methods found")
    methods <- found

    keep <- x$other_transforms[names(x$other_transforms) %in% methods]

    other_y <- lapply(keep, function(obj) {
      predict(obj, newdata = grid_x, inverse = inverse, warn = FALSE)
    })
  }

  # Default palette: one color per curve, chosen transform first.
  if (is.null(cols)) {
    cols <- 1:(length(methods) + 1)
  }

  plot(grid_x, chosen_y, ylim = range(chosen_y, other_y, na.rm = TRUE),
       xlim = range(grid_x), type = 'l',
       col = cols[1], lwd = 2,
       xlab = ifelse(inverse, "g(x)", "x"),
       ylab = ifelse(!inverse, "g(x)", "x"),
       ...)

  lapply(1:length(other_y), function(i) {
    lines(grid_x, other_y[[i]], col = cols[i + 1], lwd = 2, ...)
  })

  labs <- c(class(x$chosen_transform)[1], methods)

  legend(leg_loc, labs, col = cols, bty = 'n', lwd = 2)

  # Rug of observed values along the appropriate axis.
  if (!inverse)
    points(x = jitter(xvals), y = rep(min(range(other_y, chosen_y, na.rm = TRUE)), length(xvals)), pch = '|')
  else
    points(x = rep(min(xvals, bounds), length(x$x)), y = x$x, pch = '_')
  invisible(x)
}

#' @rdname plot.bestNormalize
#' @importFrom graphics lines plot
#' @export
plot.orderNorm <- function(x, inverse = FALSE, bounds = NULL, ...) {

  # Direction of the mapping, as in plot.bestNormalize.
  if (!inverse) {
    xvals <- x$x
    x_t <- x$x.t
  } else {
    xvals <- x$x.t
    x_t <- x$x
  }

  # NB: a user-supplied `bounds` uses a coarser 100-point grid.
  if (is.null(bounds)) {
    grid_x <- seq(min(xvals, na.rm = TRUE), max(xvals, na.rm = TRUE), length = 1000)
  } else {
    grid_x <- seq(min(bounds, na.rm = TRUE), max(bounds, na.rm = TRUE), length = 100)
  }

  fitted_y <- predict(x, newdata = grid_x, inverse = inverse, warn = FALSE)
  plot(xvals, x_t, pch = 20, ylim = range(fitted_y, na.rm = TRUE), xlim = range(grid_x), ...)
  lines(grid_x, fitted_y, col = 'slateblue', lwd = 2, ...)
  invisible(x)
}

#' @rdname plot.bestNormalize
#' @importFrom graphics lines plot
#' @export
plot.boxcox <- plot.orderNorm

#' @rdname plot.bestNormalize
#' @importFrom graphics lines plot
#' @export
plot.yeojohnson <- plot.orderNorm

#' @rdname plot.bestNormalize
#' @importFrom graphics lines plot
#' @export
plot.lambert <- plot.orderNorm

# -- file: R/sqrt_x.R ---------------------------------------------------------

#' sqrt(x + a) Normalization
#'
#' @name sqrt_x
#' @aliases predict.sqrt_x
#'
#' @description Perform a sqrt (x+a) normalization transformation
#' @param x A vector to normalize with x
#' @param standardize If TRUE, the transformed values are also centered and
#'   scaled, such that the transformation attempts a standard normal
#' @param a The constant to add to x (defaults to max(0, -min(x)))
#' @param object an object of class 'sqrt_x'
#' @param newdata a vector of data to be (potentially reverse) transformed
#' @param inverse if TRUE, performs reverse transformation
#' @param ... additional arguments
#' @details \code{sqrt_x} performs a simple square-root transformation in the
#'   context of bestNormalize, such that it creates a transformation that can
#'   be estimated and applied to new data via the \code{predict} function. The
#'   parameter a is essentially estimated by the training set by default
#'   (estimated as the minimum possible), while the base
#'   must be specified beforehand.
21 | #' 22 | #' @return A list of class \code{sqrt_x} with elements \item{x.t}{transformed 23 | #' original data} 24 | #' \item{x}{original data} 25 | #' \item{mean}{mean after transformation but prior to standardization} 26 | #' \item{sd}{sd after transformation but prior to standardization} 27 | #' \item{n}{number of nonmissing observations} 28 | #' \item{norm_stat}{Pearson's P / degrees of freedom} 29 | #' \item{standardize}{was the transformation standardized} 30 | #' 31 | #' The \code{predict} function returns the numeric value of the transformation 32 | #' performed on new data, and allows for the inverse transformation as well. 33 | #' 34 | #' @examples 35 | #' x <- rgamma(100, 1, 1) 36 | #' 37 | #' sqrt_x_obj <- sqrt_x(x) 38 | #' sqrt_x_obj 39 | #' p <- predict(sqrt_x_obj) 40 | #' x2 <- predict(sqrt_x_obj, newdata = p, inverse = TRUE) 41 | #' 42 | #' all.equal(x2, x) 43 | #' 44 | #' @importFrom stats sd 45 | #' @export 46 | sqrt_x <- function(x, a = NULL, standardize = TRUE, ...) { 47 | stopifnot(is.numeric(x)) 48 | 49 | min_a <- max(0, -(min(x, na.rm = TRUE))) 50 | if(!length(a)) 51 | a <- min_a 52 | if(a < min_a) { 53 | warning("Setting a < max(0, -(min(x))) can lead to transformation issues", 54 | "Standardize set to FALSE") 55 | standardize <- FALSE 56 | } 57 | 58 | 59 | x.t <- sqrt(x + a) 60 | mu <- mean(x.t, na.rm = TRUE) 61 | sigma <- sd(x.t, na.rm = TRUE) 62 | if (standardize) x.t <- (x.t - mu) / sigma 63 | 64 | ptest <- nortest::pearson.test(x.t) 65 | 66 | val <- list( 67 | x.t = x.t, 68 | x = x, 69 | mean = mu, 70 | sd = sigma, 71 | a = a, 72 | n = length(x.t) - sum(is.na(x)), 73 | norm_stat = unname(ptest$statistic / ptest$df), 74 | standardize = standardize 75 | ) 76 | class(val) <- c('sqrt_x', class(val)) 77 | val 78 | } 79 | 80 | #' @rdname sqrt_x 81 | #' @method predict sqrt_x 82 | #' @export 83 | predict.sqrt_x <- function(object, newdata = NULL, inverse = FALSE, ...) 
{ 83 | if (is.null(newdata) & !inverse) 84 | newdata <- object$x 85 | if (is.null(newdata) & inverse) 86 | newdata <- object$x.t 87 | 88 | if (inverse) { 89 | if (object$standardize) 90 | newdata <- newdata * object$sd + object$mean 91 | newdata <- newdata^2 - object$a 92 | } else if (!inverse) { 93 | newdata <- sqrt(newdata + object$a) 94 | if (object$standardize) 95 | newdata <- (newdata - object$mean) / object$sd 96 | } 97 | unname(newdata) 98 | } 99 | 100 | #' @rdname sqrt_x 101 | #' @method print sqrt_x 102 | #' @export 103 | print.sqrt_x <- function(x, ...) { 104 | cat(ifelse(x$standardize, "Standardized", "Non-Standardized"), 105 | 'sqrt(x + a) Transformation with', x$n, 'nonmissing obs.:\n', 106 | 'Relevant statistics:\n', 107 | '- a =', x$a, '\n', 108 | '- mean (before standardization) =', x$mean, '\n', 109 | '- sd (before standardization) =', x$sd, '\n') 110 | } 111 | 112 | 113 | -------------------------------------------------------------------------------- /R/step_best_normalize.R: -------------------------------------------------------------------------------- 1 | #' Run bestNormalize transformation for \code{recipes} implementation 2 | #' 3 | #' @aliases step_bestNormalize step_bestNormalize_new 4 | #' 5 | #' @description `step_best_normalize` creates a specification of a recipe step 6 | #' (see `recipes` package) that will transform data using the best of a suite 7 | #' of normalization transformations estimated (by default) using 8 | #' cross-validation. 9 | #' 10 | #' @param recipe A formula or recipe 11 | #' @param ... One or more selector functions to choose which variables are 12 | #' affected by the step. See [selections()] for more details. For the `tidy` 13 | #' method, these are not currently used. 14 | #' @param role Not used by this step since no new variables are created. 15 | #' @param transform_info The fitted bestNormalize transformation object for 16 | #' each selected variable. This is `NULL` until computed by [prep.recipe()].
17 | #' @param transform_options options to be passed to bestNormalize 18 | #' @param num_unique An integer; variables with fewer unique values than this 19 | #' will not be evaluated for a transformation. 20 | #' @param trained For recipes functionality 21 | #' @param skip For recipes functionality 22 | #' @param id For recipes functionality 23 | #' 24 | #' 25 | #' 26 | #' @return An updated version of `recipe` with the new step added to the 27 | #' sequence of existing steps (if any). For the `tidy` method, a tibble with 28 | #' columns `terms` (the selectors or variables selected), `chosen_transform` 29 | #' (the transformation selected), and `cv_info` (cross-validation results). 30 | #' @concept preprocessing 31 | #' @concept transformation_methods 32 | #' @export 33 | #' 34 | #' @details The bestNormalize transformation can be used to rescale a variable 35 | #' to be more similar to a normal distribution. See `?bestNormalize` for more 36 | #' information; `step_best_normalize` is the implementation of `bestNormalize` 37 | #' in the `recipes` context. 38 | #' 39 | #' As of version 1.7, the `butcher` package can be used to (hopefully) improve 40 | #' scalability of this function on bigger data sets.
41 | #' 42 | #' @examples 43 | #' 44 | #' library(recipes) 45 | #' rec <- recipe(~ ., data = as.data.frame(iris)) 46 | #' 47 | #' bn_trans <- step_best_normalize(rec, all_numeric()) 48 | #' 49 | #' bn_estimates <- prep(bn_trans, training = as.data.frame(iris)) 50 | #' 51 | #' bn_data <- bake(bn_estimates, as.data.frame(iris)) 52 | #' 53 | #' plot(density(iris[, "Petal.Length"]), main = "before") 54 | #' plot(density(bn_data$Petal.Length), main = "after") 55 | #' 56 | #' tidy(bn_trans, number = 1) 57 | #' tidy(bn_estimates, number = 1) 58 | #' 59 | #' @seealso \code{\link[bestNormalize]{bestNormalize}} \code{\link{orderNorm}}, 60 | #' [recipe()] [prep.recipe()] [bake.recipe()] 61 | #' 62 | #' @importFrom recipes recipe rand_id add_step ellipse_check step 63 | #' 64 | step_best_normalize <- 65 | function(recipe, 66 | ..., 67 | role = NA, 68 | trained = FALSE, 69 | transform_info = NULL, 70 | transform_options = list(), 71 | num_unique = 5, 72 | skip = FALSE, 73 | id = rand_id("best_normalize")) { 74 | add_step( 75 | recipe, 76 | step_best_normalize_new( 77 | terms = ellipse_check(...), 78 | role = role, 79 | trained = trained, 80 | transform_info = transform_info, 81 | transform_options = transform_options, 82 | num_unique = num_unique, 83 | skip = skip, 84 | id = id 85 | ) 86 | ) 87 | } 88 | 89 | step_best_normalize_new <- 90 | function(terms, role, trained, transform_info, transform_options, num_unique, skip, id) { 91 | step( 92 | subclass = "best_normalize", 93 | terms = terms, 94 | role = role, 95 | trained = trained, 96 | transform_info = transform_info, 97 | transform_options = transform_options, 98 | num_unique = num_unique, 99 | skip = skip, 100 | id = id 101 | ) 102 | } 103 | 104 | #' @export 105 | #' @importFrom recipes prep recipes_eval_select check_type 106 | prep.step_best_normalize <- function(x, training, info = NULL, ...) 
{ 107 | col_names <- recipes_eval_select(x$terms, training, info) 108 | check_type(training[, col_names]) 109 | 110 | values <- apply( 111 | training[, col_names], 112 | 2, 113 | estimate_bn, 114 | transform_options = x$transform_options, 115 | num_unique = x$num_unique 116 | ) 117 | 118 | step_best_normalize_new( 119 | terms = x$terms, 120 | role = x$role, 121 | trained = TRUE, 122 | transform_info = values, 123 | transform_options = x$transform_options, 124 | num_unique = x$num_unique, 125 | skip = x$skip, 126 | id = x$id 127 | ) 128 | } 129 | 130 | #' @export 131 | #' @importFrom recipes bake 132 | #' @importFrom tibble as_tibble 133 | bake.step_best_normalize <- function(object, new_data, ...) { 134 | if (length(object$transform_info) == 0) 135 | return(as_tibble(new_data)) 136 | param <- names(object$transform_info) 137 | for (i in seq_along(object$transform_info)) 138 | new_data[, param[i]] <- 139 | predict(object$transform_info[[param[i]]], getElement(new_data, param[i]), warn = FALSE) 140 | as_tibble(new_data) 141 | } 142 | #' @export 143 | #' @importFrom recipes print_step 144 | print.step_best_normalize <- 145 | function(x, width = max(20, options()$width - 35), ...) 
{ 146 | title <- "bestNormalize transformation on " 147 | print_step(names(x$transform_info), x$terms, x$trained, width = width, title = title) 148 | invisible(x) 149 | } 150 | 151 | ## estimates the transformations 152 | estimate_bn <- function(dat, 153 | transform_options = list(), 154 | num_unique = 5) { 155 | 156 | # Returns the identity transformation if not enough unique values 157 | if (length(unique(dat)) < num_unique) 158 | return(no_transform(dat)) 159 | 160 | transform_options$x <- dat 161 | 162 | # Set some new defaults to bestNormalize unless otherwise specified 163 | if(is.null(transform_options$r)) 164 | transform_options$r <- 1 165 | if(is.null(transform_options$warn)) 166 | transform_options$warn <- FALSE 167 | if(is.null(transform_options$quiet)) 168 | transform_options$quiet <- TRUE 169 | 170 | res <- do.call(bestNormalize, transform_options) 171 | res 172 | } 173 | 174 | #' @rdname step_best_normalize 175 | #' @param x A `step_best_normalize` object. 176 | #' @export 177 | #' @importFrom recipes tidy is_trained sel2char 178 | #' @importFrom tibble tibble 179 | #' 180 | tidy.step_best_normalize <- function(x, ...) { 181 | if (is_trained(x)) { 182 | val <- lapply(x$transform_info, tidy) 183 | chosen <- sapply(val, function(xx) xx$transform[xx$chosen == 1]) 184 | res <- tibble(terms = names(x$transform_info), 185 | chosen_transform = chosen, 186 | cv_info = val) 187 | } else { 188 | term_names <- sel2char(x$terms) 189 | res <- tibble(terms = term_names, chosen = as.double(NA), cv_info = as.double(NA)) 190 | } 191 | res$id <- x$id 192 | res 193 | } 194 | 195 | #' @export 196 | step_bestNormalize <- function(...) { 197 | .Deprecated("step_best_normalize", package = "bestNormalize", old = "step_bestNormalize") 198 | step_best_normalize(...) 199 | } 200 | 201 | #' @export 202 | step_bestNormalize_new <- function(...) 
{ 203 | .Deprecated("step_best_normalize_new", package = "bestNormalize", old = "step_bestNormalize_new") 204 | step_best_normalize_new(...) 205 | } 206 | 207 | #' @rdname step_best_normalize 208 | #' @param x A `step_best_normalize` object. 209 | #' @importFrom butcher axe_env 210 | #' @importFrom purrr map 211 | #' @export 212 | axe_env.step_best_normalize <- function(x, ...) { 213 | x$terms <- purrr::map(x$terms, function(z) butcher::axe_env(z, ...)) 214 | x 215 | } 216 | 217 | #' @export 218 | required_pkgs.step_best_normalize <- function(x, ...) { 219 | c("bestNormalize") 220 | } 221 | -------------------------------------------------------------------------------- /R/step_orderNorm.R: -------------------------------------------------------------------------------- 1 | #' ORQ normalization (orderNorm) for \code{recipes} implementation 2 | #' 3 | #' @description `step_orderNorm` creates a specification of a recipe step (see 4 | #' `recipes` package) that will transform data using the ORQ (orderNorm) 5 | #' transformation, which approximates the "true" normalizing transformation if 6 | #' one exists. This is considerably faster than `step_bestNormalize`. 7 | #' 8 | #' @param recipe A formula or recipe 9 | #' @param ... One or more selector functions to choose which variables are 10 | #' affected by the step. See [selections()] for more details. For the `tidy` 11 | #' method, these are not currently used. 12 | #' @param role Not used by this step since no new variables are created. 13 | #' @param transform_info The fitted orderNorm transformation object for each 14 | #' selected variable. This is `NULL` until computed by [prep.recipe()]. 15 | #' @param transform_options options to be passed to orderNorm 16 | #' @param num_unique An integer; variables with fewer unique values than this 17 | #' will not be evaluated for a transformation.
18 | #' @param trained For recipes functionality 19 | #' @param skip For recipes functionality 20 | #' @param id For recipes functionality 21 | #' 22 | #' 23 | #' @return An updated version of `recipe` with the new step added to the 24 | #' sequence of existing steps (if any). For the `tidy` method, a tibble with 25 | #' columns `terms` (the selectors or variables selected) and `value` (the 26 | #' lambda estimate). 27 | #' @concept preprocessing 28 | #' @concept transformation_methods 29 | #' @export 30 | #' 31 | #' @details The orderNorm transformation can be used to rescale a variable to be 32 | #' more similar to a normal distribution. See `?orderNorm` for more 33 | #' information; `step_orderNorm` is the implementation of `orderNorm` in the 34 | #' `recipes` context. 35 | #' 36 | #' As of version 1.7, the `butcher` package can be used to (hopefully) improve 37 | #' scalability of this function on bigger data sets. 38 | #' 39 | #' @examples 40 | #' library(recipes) 41 | #' rec <- recipe(~ ., data = as.data.frame(iris)) 42 | #' 43 | #' orq_trans <- step_orderNorm(rec, all_numeric()) 44 | #' 45 | #' orq_estimates <- prep(orq_trans, training = as.data.frame(iris)) 46 | #' 47 | #' orq_data <- bake(orq_estimates, as.data.frame(iris)) 48 | #' 49 | #' plot(density(iris[, "Petal.Length"]), main = "before") 50 | #' plot(density(orq_data$Petal.Length), main = "after") 51 | #' 52 | #' tidy(orq_trans, number = 1) 53 | #' tidy(orq_estimates, number = 1) 54 | #' 55 | #' 56 | #' @seealso \code{\link[bestNormalize]{orderNorm}} \code{\link{bestNormalize}}, 57 | #' [recipe()] [prep.recipe()] [bake.recipe()] 58 | #' 59 | #' @references Ryan A. Peterson (2019). Ordered quantile normalization: a 60 | #' semiparametric transformation built for the cross-validation era. Journal 61 | #' of Applied Statistics, 1-16. 
62 | #' 63 | #' @importFrom recipes recipe rand_id add_step ellipse_check step 64 | #' 65 | step_orderNorm <- 66 | function(recipe, 67 | ..., 68 | role = NA, 69 | trained = FALSE, 70 | transform_info = NULL, 71 | transform_options = list(), 72 | num_unique = 5, 73 | skip = FALSE, 74 | id = rand_id("orderNorm")) { 75 | add_step( 76 | recipe, 77 | step_orderNorm_new( 78 | terms = ellipse_check(...), 79 | role = role, 80 | trained = trained, 81 | transform_info = transform_info, 82 | transform_options = transform_options, 83 | num_unique = num_unique, 84 | skip = skip, 85 | id = id 86 | ) 87 | ) 88 | } 89 | 90 | step_orderNorm_new <- 91 | function(terms, role, trained, transform_info, transform_options, num_unique, skip, id) { 92 | step( 93 | subclass = "orderNorm", 94 | terms = terms, 95 | role = role, 96 | trained = trained, 97 | transform_info = transform_info, 98 | transform_options = transform_options, 99 | num_unique = num_unique, 100 | skip = skip, 101 | id = id 102 | ) 103 | } 104 | 105 | #' @export 106 | #' @importFrom recipes prep recipes_eval_select check_type 107 | prep.step_orderNorm <- function(x, training, info = NULL, ...) { 108 | col_names <- recipes_eval_select(x$terms, training, info) 109 | check_type(training[, col_names]) 110 | 111 | values <- apply( 112 | training[, col_names], 113 | 2, 114 | estimate_orq, 115 | transform_options = x$transform_options, 116 | num_unique = x$num_unique 117 | ) 118 | 119 | step_orderNorm_new( 120 | terms = x$terms, 121 | role = x$role, 122 | trained = TRUE, 123 | transform_info = values, 124 | transform_options = x$transform_options, 125 | num_unique = x$num_unique, 126 | skip = x$skip, 127 | id = x$id 128 | ) 129 | } 130 | 131 | #' @export 132 | #' @importFrom tibble as_tibble 133 | bake.step_orderNorm <- function(object, new_data, ...) 
{ 134 | if (length(object$transform_info) == 0) 135 | return(as_tibble(new_data)) 136 | param <- names(object$transform_info) 137 | for (i in seq_along(object$transform_info)) 138 | new_data[, param[i]] <- 139 | predict(object$transform_info[[param[i]]], getElement(new_data, param[i]), warn = FALSE) 140 | as_tibble(new_data) 141 | } 142 | 143 | #' @export 144 | #' @importFrom recipes print_step 145 | print.step_orderNorm <- 146 | function(x, width = max(20, options()$width - 35), ...) { 147 | title <- "orderNorm transformation on " 148 | print_step(names(x$transform_info), x$terms, x$trained, width = width, title = title) 149 | invisible(x) 150 | } 151 | 152 | ## estimates the transformations 153 | estimate_orq <- function(dat, 154 | transform_options = list(), 155 | num_unique = 5) { 156 | 157 | # Returns the identity transformation if not enough unique values 158 | if (length(unique(dat)) < num_unique) 159 | return(no_transform(dat)) 160 | 161 | transform_options$x <- dat 162 | 163 | # Set some new defaults to orderNorm unless otherwise specified 164 | if(is.null(transform_options$warn)) 165 | transform_options$warn <- FALSE 166 | 167 | res <- do.call(orderNorm, transform_options) 168 | res 169 | } 170 | 171 | #' @rdname step_orderNorm 172 | #' @param x A `step_orderNorm` object. 173 | #' @export 174 | #' @importFrom recipes tidy is_trained sel2char 175 | #' @importFrom tibble tibble 176 | tidy.step_orderNorm <- function(x, ...) { 177 | if (is_trained(x)) { 178 | res <- tibble(terms = names(x$transform_info), 179 | value = x$transform_info) 180 | } else { 181 | term_names <- sel2char(x$terms) 182 | res <- tibble(terms = term_names, value = as.double(NA)) 183 | } 184 | res$id <- x$id 185 | res 186 | } 187 | 188 | #' @rdname step_orderNorm 189 | #' @param x A `step_orderNorm` object. 190 | #' @importFrom butcher axe_env 191 | #' @importFrom purrr map 192 | #' @export 193 | axe_env.step_orderNorm <- function(x, ...) 
{ 194 | x$terms <- purrr::map(x$terms, function(z) butcher::axe_env(z, ...)) 195 | x 196 | } 197 | 198 | 199 | #' @importFrom generics required_pkgs 200 | #' @export 201 | generics::required_pkgs 202 | 203 | #' @export 204 | required_pkgs.step_orderNorm <- function(x, ...) { 205 | c("bestNormalize") 206 | } 207 | -------------------------------------------------------------------------------- /R/yeojohnson.R: -------------------------------------------------------------------------------- 1 | #'Yeo-Johnson Normalization 2 | #' 3 | #'@name yeojohnson 4 | #'@aliases predict.yeojohnson 5 | #' 6 | #'@description Perform a Yeo-Johnson Transformation and center/scale a vector to 7 | #' attempt normalization 8 | #'@param x A vector to normalize with Yeo-Johnson 9 | #'@param eps A value to compare lambda against to see if it is equal to zero 10 | #' @param standardize If TRUE, the transformed values are also centered and 11 | #' scaled, such that the transformation attempts a standard normal 12 | #'@param ... Additional arguments that can be passed to the estimation of the 13 | #' lambda parameter (lower, upper) 14 | #'@param newdata a vector of data to be (reverse) transformed 15 | #'@param inverse if TRUE, performs reverse transformation 16 | #'@param object an object of class 'yeojohnson' 17 | #'@details \code{yeojohnson} estimates the optimal value of lambda for the 18 | #' Yeo-Johnson transformation. This transformation can be performed on new 19 | #' data, and inverted, via the \code{predict} function. 20 | #' 21 | #' The Yeo-Johnson is similar to the Box-Cox method, however it allows for the 22 | #' transformation of nonpositive data as well. The \code{step_YeoJohnson} 23 | #' function in the \code{recipes} package is another useful resource (see 24 | #' references). 
25 | #' 26 | #'@return A list of class \code{yeojohnson} with elements 27 | #' 28 | #' \item{x.t}{transformed original data} 29 | #' \item{x}{original data} 30 | #' \item{mean}{mean after transformation but prior to standardization} 31 | #' \item{sd}{sd after transformation but prior to standardization} 32 | #' \item{lambda}{estimated lambda value for skew transformation} 33 | #' \item{n}{number of nonmissing observations} 34 | #' \item{norm_stat}{Pearson's P / degrees of freedom} 35 | #' \item{standardize}{Was the transformation standardized} 36 | #' 37 | #' The \code{predict} function returns the numeric value of the transformation 38 | #' performed on new data, and allows for the inverse transformation as well. 39 | #' 40 | #'@references Yeo, I. K., & Johnson, R. A. (2000). A new family of power 41 | #' transformations to improve normality or symmetry. Biometrika. 42 | #' 43 | #' Max Kuhn and Hadley Wickham (2017). recipes: Preprocessing Tools to Create 44 | #' Design Matrices. R package version 0.1.0.9000. 45 | #' https://github.com/topepo/recipes 46 | #' 47 | #' 48 | #' 49 | #' @examples 50 | #' 51 | #' x <- rgamma(100, 1, 1) 52 | #' 53 | #' yeojohnson_obj <- yeojohnson(x) 54 | #' yeojohnson_obj 55 | #' p <- predict(yeojohnson_obj) 56 | #' x2 <- predict(yeojohnson_obj, newdata = p, inverse = TRUE) 57 | #' 58 | #' all.equal(x2, x) 59 | #' 60 | #'@importFrom stats sd 61 | #'@export 62 | yeojohnson <- function(x, eps = .001, standardize = TRUE, ...) { 63 | stopifnot(is.numeric(x)) 64 | lambda <- estimate_yeojohnson_lambda(x, eps = eps, ...) 
65 | x.t <- x 66 | na_idx <- is.na(x) 67 | x.t[!na_idx] <- yeojohnson_trans(x[!na_idx], lambda, eps) 68 | mu <- mean(x.t, na.rm = TRUE) 69 | sigma <- sd(x.t, na.rm = TRUE) 70 | if (standardize) x.t <- (x.t - mu) / sigma 71 | 72 | ptest <- nortest::pearson.test(x.t) 73 | 74 | val <- list( 75 | x.t = x.t, 76 | x = x, 77 | mean = mu, 78 | sd = sigma, 79 | lambda = lambda, 80 | eps = eps, 81 | n = length(x.t) - sum(na_idx), 82 | norm_stat = unname(ptest$statistic / ptest$df), 83 | standardize = standardize 84 | ) 85 | class(val) <- 'yeojohnson' 86 | val 87 | } 88 | 89 | #' @rdname yeojohnson 90 | #' @method predict yeojohnson 91 | #' @export 92 | predict.yeojohnson <- function(object, 93 | newdata = NULL, 94 | inverse = FALSE, 95 | ...) { 96 | if (is.null(newdata) & !inverse) 97 | newdata <- object$x 98 | if (is.null(newdata)) 99 | newdata <- object$x.t 100 | 101 | na_idx <- is.na(newdata) 102 | 103 | if (inverse) { 104 | if(object$standardize) newdata <- newdata * object$sd + object$mean 105 | newdata[!na_idx] <- inv_yeojohnson_trans(newdata[!na_idx], object$lambda) 106 | } else { 107 | newdata[!na_idx] <- yeojohnson_trans(newdata[!na_idx], object$lambda) 108 | if(object$standardize) newdata <- (newdata - object$mean) / object$sd 109 | } 110 | 111 | unname(newdata) 112 | } 113 | 114 | #' @rdname yeojohnson 115 | #' @method print yeojohnson 116 | #' @export 117 | print.yeojohnson <- function(x, ...) { 118 | cat(ifelse(x$standardize, "Standardized", "Non-Standardized"), 119 | 'Yeo-Johnson Transformation with', x$n, 'nonmissing obs.:\n', 120 | 'Estimated statistics:\n', 121 | '- lambda =', x$lambda, '\n', 122 | '- mean (before standardization) =', x$mean, '\n', 123 | '- sd (before standardization) =', x$sd, '\n') 124 | } 125 | 126 | # Helper functions that estimates yj lambda parameter 127 | #' @importFrom stats var optimize 128 | estimate_yeojohnson_lambda <- function(x, lower = -5, upper = 5, eps = .001, ...) 
{ 129 | n <- length(x) 130 | ccID <- !is.na(x) 131 | x <- x[ccID] 132 | pos_idx = which(x >= 0) 133 | neg_idx = which(x < 0) 134 | 135 | constant <- sum(sign(x) * log(abs(x) + 1)) 136 | 137 | # See references, Yeo & Johnson Biometrika (2000) 138 | yj_loglik <- function(lambda) { 139 | x_t <- yeojohnson_trans(x, lambda, eps, pos_idx = pos_idx, neg_idx = neg_idx) 140 | x_t_bar <- mean(x_t) 141 | x_t_var <- var(x_t) * (n - 1) / n 142 | -0.5 * n * log(x_t_var) + (lambda - 1) * constant 143 | } 144 | 145 | results <- optimize(yj_loglik, lower = lower, 146 | upper = upper, maximum = TRUE, 147 | tol = .0001) 148 | 149 | results$maximum 150 | } 151 | 152 | yeojohnson_trans <- function(x, lambda, eps = .001, pos_idx, neg_idx) { 153 | if(missing(pos_idx)) 154 | pos_idx <- which(x >= 0) 155 | if(missing(neg_idx)) 156 | neg_idx <- which(x < 0) 157 | 158 | # Transform nonnegative values 159 | if (length(pos_idx)>0) { 160 | if (abs(lambda) < eps) { 161 | x[pos_idx] <- log(x[pos_idx] + 1) 162 | } else { 163 | x[pos_idx] <- ((x[pos_idx] + 1) ^ lambda - 1) / lambda 164 | } 165 | } 166 | 167 | # Transform negative values 168 | if (length(neg_idx)>0){ 169 | if (abs(lambda - 2) < eps) { 170 | x[neg_idx] <- - log(-x[neg_idx] + 1) 171 | } else { 172 | x[neg_idx] <- - ((-x[neg_idx] + 1) ^ (2 - lambda) - 1) / (2 - lambda) 173 | } 174 | } 175 | x 176 | } 177 | 178 | inv_yeojohnson_trans <- function(x, lambda, eps = .001) { 179 | val <- x 180 | neg_idx <- x < 0 181 | 182 | if(any(!neg_idx)) { 183 | if(abs(lambda) < eps) { 184 | val[!neg_idx] <- exp(x[!neg_idx]) - 1 185 | } else { 186 | val[!neg_idx] <- (x[!neg_idx] * lambda + 1) ^ (1 / lambda) - 1 187 | } 188 | } 189 | if(any(neg_idx)) { 190 | if(abs(lambda - 2) < eps) { 191 | val[neg_idx] <- -expm1(-x[neg_idx]) 192 | } else { 193 | val[neg_idx] <- 1 - (-(2 - lambda) * x[neg_idx] + 1) ^ (1 / (2 - lambda)) 194 | } 195 | } 196 | val 197 | } 198 | -------------------------------------------------------------------------------- /README.Rmd:
-------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r, echo = FALSE, include=FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>", 11 | fig.path = "README-" 12 | ) 13 | library(bestNormalize) 14 | ``` 15 | 16 | 17 | # bestNormalize: Flexibly calculate the best normalizing transformation for a vector 18 | 19 | [![CRAN version](https://www.r-pkg.org/badges/version/bestNormalize)](https://cran.r-project.org/package=bestNormalize) 20 | [![R-CMD-check](https://github.com/petersonR/bestNormalize/workflows/R-CMD-check/badge.svg)](https://github.com/petersonR/bestNormalize/actions) 21 | [![downloads](https://cranlogs.r-pkg.org/badges/bestNormalize)](https://cran.r-project.org/package=bestNormalize) 22 | [![Codecov test coverage](https://codecov.io/gh/petersonR/bestNormalize/branch/master/graph/badge.svg)](https://app.codecov.io/gh/petersonR/bestNormalize?branch=master) 23 | 24 | 25 | The `bestNormalize` R package was designed to help find a normalizing transformation for a vector. There are many techniques that have been developed in this aim, however each has been subject to their own strengths/weaknesses, and it is unclear on how to decide which will work best until the data is observed. This package will look at a range of possible transformations and return the best one, i.e. the one that makes it look the *most* normal. 26 | 27 | Note that some authors use the term "normalize" differently than in this package. We define "normalize": to transform a vector of data in such a way that the transformed values follow a Gaussian distribution (or equivalently, a bell curve). This is in contrast to other such techniques designed to transform values to the 0-1 range, or to the -1 to 1 range. 28 | 29 | This package also introduces a new adaptation of a normalization technique, which we call Ordered Quantile normalization (`orderNorm()`, or ORQ). 
ORQ transforms the data based off of a rank mapping to the normal distribution. This allows us to *guarantee* normally distributed transformed data (if ties are not present). The adaptation uses a shifted logit approximation on the ranks transformation to perform the transformation on newly observed data outside of the original domain. On new data within the original domain, the transformation uses linear interpolation of the fitted transformation. 30 | 31 | To evaluate the efficacy of the normalization technique, the `bestNormalize()` function implements repeated cross-validation to estimate the Pearson's P statistic divided by its degrees of freedom. This is called the "Normality statistic", and if it is close to 1 (or less), then the transformation can be thought of as working well. The function is designed to select the transformation that produces the lowest P / df value, when estimated on out-of-sample data (estimating this on in-sample data will always choose the orderNorm technique, and is generally not the main goal of these procedures). 
32 | 33 | ## Installation 34 | 35 | You can install the most recent (devel) version of bestNormalize from GitHub with: 36 | 37 | ```{r gh-installation, eval = FALSE} 38 | # install.packages("devtools") 39 | devtools::install_github("petersonR/bestNormalize") 40 | ``` 41 | 42 | Or, you can download it from CRAN with: 43 | 44 | ```{r cran-installation, eval = FALSE} 45 | install.packages("bestNormalize") 46 | ``` 47 | 48 | ## Example 49 | 50 | In this example, we generate 1000 draws from a gamma distribution, and normalize them: 51 | 52 | ```{r, eval = FALSE} 53 | library(bestNormalize) 54 | ``` 55 | 56 | ```{r example} 57 | set.seed(100) 58 | x <- rgamma(1000, 1, 1) 59 | 60 | # Estimate best transformation with repeated cross-validation 61 | BN_obj <- bestNormalize(x, allow_lambert_s = TRUE) 62 | BN_obj 63 | 64 | # Perform transformation 65 | gx <- predict(BN_obj) 66 | 67 | # Perform reverse transformation 68 | x2 <- predict(BN_obj, newdata = gx, inverse = TRUE) 69 | 70 | # Prove the transformation is 1:1 71 | all.equal(x2, x) 72 | 73 | ``` 74 | 75 | As of version 1.3, the package supports leave-one-out cross-validation as well. ORQ normalization works very well when the size of the test dataset is low relative to the training data set, so it will often be selected via leave-one-out cross-validation (which is why we set `allow_orderNorm = FALSE` here). 76 | 77 | ```{r} 78 | (BN_loo <- bestNormalize(x, allow_orderNorm = FALSE, allow_lambert_s = TRUE, loo = TRUE)) 79 | ``` 80 | 81 | It is also possible to visualize these transformations: 82 | 83 | ```{r, eval = FALSE} 84 | plot(BN_obj, leg_loc = "bottomright") 85 | ``` 86 | 87 | 88 | For a more in depth tutorial, please consult [the package vignette](https://CRAN.R-project.org/package=bestNormalize/vignettes/bestNormalize.html), or the [package website](https://petersonr.github.io/bestNormalize/). 
89 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # bestNormalize: Flexibly calculate the best normalizing transformation for a vector 5 | 6 | [![CRAN 7 | version](https://www.r-pkg.org/badges/version/bestNormalize)](https://cran.r-project.org/package=bestNormalize) 8 | [![R-CMD-check](https://github.com/petersonR/bestNormalize/workflows/R-CMD-check/badge.svg)](https://github.com/petersonR/bestNormalize/actions) 9 | [![downloads](https://cranlogs.r-pkg.org/badges/bestNormalize)](https://cran.r-project.org/package=bestNormalize) 10 | [![Codecov test 11 | coverage](https://codecov.io/gh/petersonR/bestNormalize/branch/master/graph/badge.svg)](https://app.codecov.io/gh/petersonR/bestNormalize?branch=master) 12 | 13 | The `bestNormalize` R package was designed to help find a normalizing 14 | transformation for a vector. There are many techniques that have been 15 | developed in this aim, however each has been subject to their own 16 | strengths/weaknesses, and it is unclear on how to decide which will work 17 | best until the data is observed. This package will look at a range of 18 | possible transformations and return the best one, i.e. the one that 19 | makes it look the *most* normal. 20 | 21 | Note that some authors use the term “normalize” differently than in this 22 | package. We define “normalize”: to transform a vector of data in such a 23 | way that the transformed values follow a Gaussian distribution (or 24 | equivalently, a bell curve). This is in contrast to other such 25 | techniques designed to transform values to the 0-1 range, or to the -1 26 | to 1 range. 27 | 28 | This package also introduces a new adaptation of a normalization 29 | technique, which we call Ordered Quantile normalization (`orderNorm()`, 30 | or ORQ). ORQ transforms the data based off of a rank mapping to the 31 | normal distribution. 
This allows us to *guarantee* normally distributed 32 | transformed data (if ties are not present). The adaptation uses a 33 | shifted logit approximation on the ranks transformation to perform the 34 | transformation on newly observed data outside of the original domain. On 35 | new data within the original domain, the transformation uses linear 36 | interpolation of the fitted transformation. 37 | 38 | To evaluate the efficacy of the normalization technique, the 39 | `bestNormalize()` function implements repeated cross-validation to 40 | estimate the Pearson’s P statistic divided by its degrees of freedom. 41 | This is called the “Normality statistic”, and if it is close to 1 (or 42 | less), then the transformation can be thought of as working well. The 43 | function is designed to select the transformation that produces the 44 | lowest P / df value, when estimated on out-of-sample data (estimating 45 | this on in-sample data will always choose the orderNorm technique, and 46 | is generally not the main goal of these procedures). 
47 | 48 | ## Installation 49 | 50 | You can install the most recent (devel) version of bestNormalize from 51 | GitHub with: 52 | 53 | ``` r 54 | # install.packages("devtools") 55 | devtools::install_github("petersonR/bestNormalize") 56 | ``` 57 | 58 | Or, you can download it from CRAN with: 59 | 60 | ``` r 61 | install.packages("bestNormalize") 62 | ``` 63 | 64 | ## Example 65 | 66 | In this example, we generate 1000 draws from a gamma distribution, and 67 | normalize them: 68 | 69 | ``` r 70 | library(bestNormalize) 71 | ``` 72 | 73 | ``` r 74 | set.seed(100) 75 | x <- rgamma(1000, 1, 1) 76 | 77 | # Estimate best transformation with repeated cross-validation 78 | BN_obj <- bestNormalize(x, allow_lambert_s = TRUE) 79 | #> Warning: package 'lamW' was built under R version 4.0.5 80 | BN_obj 81 | #> Best Normalizing transformation with 1000 Observations 82 | #> Estimated Normality Statistics (Pearson P / df, lower => more normal): 83 | #> - arcsinh(x): 3.6204 84 | #> - Box-Cox: 0.96 85 | #> - Center+scale: 6.7851 86 | #> - Exp(x): 50.8513 87 | #> - Lambert's W (type s): 1.0572 88 | #> - Log_b(x+a): 1.908 89 | #> - orderNorm (ORQ): 1.0516 90 | #> - sqrt(x + a): 1.4556 91 | #> - Yeo-Johnson: 1.7385 92 | #> Estimation method: Out-of-sample via CV with 10 folds and 5 repeats 93 | #> 94 | #> Based off these, bestNormalize chose: 95 | #> Standardized Box Cox Transformation with 1000 nonmissing obs.: 96 | #> Estimated statistics: 97 | #> - lambda = 0.2739638 98 | #> - mean (before standardization) = -0.3870903 99 | #> - sd (before standardization) = 1.045498 100 | 101 | # Perform transformation 102 | gx <- predict(BN_obj) 103 | 104 | # Perform reverse transformation 105 | x2 <- predict(BN_obj, newdata = gx, inverse = TRUE) 106 | 107 | # Prove the transformation is 1:1 108 | all.equal(x2, x) 109 | #> [1] TRUE 110 | ``` 111 | 112 | As of version 1.3, the package supports leave-one-out cross-validation 113 | as well. 
ORQ normalization works very well when the size of the test 114 | dataset is low relative to the training data set, so it will often be 115 | selected via leave-one-out cross-validation (which is why we set 116 | `allow_orderNorm = FALSE` here). 117 | 118 | ``` r 119 | (BN_loo <- bestNormalize(x, allow_orderNorm = FALSE, allow_lambert_s = TRUE, loo = TRUE)) 120 | #> Best Normalizing transformation with 1000 Observations 121 | #> Estimated Normality Statistics (Pearson P / df, lower => more normal): 122 | #> - arcsinh(x): 14.0712 123 | #> - Box-Cox: 0.8077 124 | #> - Center+scale: 26.5181 125 | #> - Exp(x): 451.435 126 | #> - Lambert's W (type s): 1.269 127 | #> - Log_b(x+a): 4.5374 128 | #> - sqrt(x + a): 3.3655 129 | #> - Yeo-Johnson: 5.7997 130 | #> Estimation method: Out-of-sample via leave-one-out CV 131 | #> 132 | #> Based off these, bestNormalize chose: 133 | #> Standardized Box Cox Transformation with 1000 nonmissing obs.: 134 | #> Estimated statistics: 135 | #> - lambda = 0.2739638 136 | #> - mean (before standardization) = -0.3870903 137 | #> - sd (before standardization) = 1.045498 138 | ``` 139 | 140 | It is also possible to visualize these transformations: 141 | 142 | ``` r 143 | plot(BN_obj, leg_loc = "bottomright") 144 | ``` 145 | 146 | For a more in depth tutorial, please consult [the package 147 | vignette](https://CRAN.R-project.org/package=bestNormalize/vignettes/bestNormalize.html), 148 | or the [package website](https://petersonr.github.io/bestNormalize/). 
149 | -------------------------------------------------------------------------------- /bestNormalize.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --no-multiarch --with-keep.source 18 | PackageRoxygenize: rd,collate,namespace 19 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | 3 | coverage: 4 | status: 5 | project: 6 | default: 7 | target: auto 8 | threshold: 1% 9 | informational: true 10 | patch: 11 | default: 12 | target: auto 13 | threshold: 1% 14 | informational: true 15 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | This is a submission of bestNormalize 1.9.1 that 2 | fixes issues introduced by an update to lamW. 3 | 4 | ## Test environments 5 | - GitHub Actions (ubuntu-18.04): 3.5, 3.6, oldrel, release, devel 6 | - GitHub Actions (windows): 3.6, release 7 | - Github Actions (macOS): release 8 | - Local (macOS): release 9 | - CRAN Windows Builder: release 10 | 11 | ## R CMD check results 12 | There were no ERRORs, WARNINGs, or NOTEs. 13 | 14 | ## revdepcheck results 15 | 16 | We checked 3 reverse dependencies (2 from CRAN + 1 from Bioconductor), comparing R CMD check results across CRAN and dev versions of this package. 
17 | 18 | * We saw 0 new problems 19 | * We failed to check 0 packages 20 | 21 | -------------------------------------------------------------------------------- /data/autotrader.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petersonR/bestNormalize/46c0ea73e57eb2130bac9078b501688fd48d65c9/data/autotrader.RData -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | citHeader("To cite 'bestNormalize' software in publications, use (1); to cite the orderNorm (ORQ) technique use (2), or cite both.") 2 | 3 | bibentry(bibtype = "article", 4 | title = "{Finding Optimal Normalizing Transformations via 5 | bestNormalize}", 6 | author = as.person("Ryan A. Peterson"), 7 | year = "2021", 8 | journal = "{The R Journal}", 9 | doi = "10.32614/RJ-2021-041", 10 | pages = "310--329", 11 | volume = "13", 12 | number = "1", 13 | textVersion = 14 | "(1) Peterson, R. A. (2021). Finding Optimal Normalizing Transformations via bestNormalize. The R Journal, 13:1, 310-329, DOI:10.32614/RJ-2021-041" 15 | ) 16 | 17 | bibentry(bibtype = "article", 18 | title = "Ordered quantile normalization: a semiparametric transformation built for the cross-validation era", 19 | author = c(as.person("Ryan A. Peterson"), 20 | as.person("Joseph E. Cavanaugh")), 21 | year = "2020", 22 | journal = "Journal of Applied Statistics", 23 | publisher = "Taylor & Francis", 24 | pages = "2312-2327", 25 | doi = "10.1080/02664763.2019.1630372", 26 | volume = "47", 27 | number = "13-15", 28 | textVersion = 29 | "(2) Ryan A. Peterson & Joseph E. Cavanaugh (2020). Ordered quantile normalization: a semiparametric transformation built for the cross-validation era. 
Journal of Applied Statistics, 47:13-15, 2312-2327, DOI: 10.1080/02664763.2019.1630372" 30 | ) 31 | 32 | -------------------------------------------------------------------------------- /man/arcsinh_x.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/arcsinh_x.R 3 | \name{arcsinh_x} 4 | \alias{arcsinh_x} 5 | \alias{predict.arcsinh_x} 6 | \alias{print.arcsinh_x} 7 | \title{arcsinh(x) Transformation} 8 | \usage{ 9 | arcsinh_x(x, standardize = TRUE, ...) 10 | 11 | \method{predict}{arcsinh_x}(object, newdata = NULL, inverse = FALSE, ...) 12 | 13 | \method{print}{arcsinh_x}(x, ...) 14 | } 15 | \arguments{ 16 | \item{x}{A vector to normalize with with x} 17 | 18 | \item{standardize}{If TRUE, the transformed values are also centered and 19 | scaled, such that the transformation attempts a standard normal} 20 | 21 | \item{...}{additional arguments} 22 | 23 | \item{object}{an object of class 'arcsinh_x'} 24 | 25 | \item{newdata}{a vector of data to be (potentially reverse) transformed} 26 | 27 | \item{inverse}{if TRUE, performs reverse transformation} 28 | } 29 | \value{ 30 | A list of class \code{arcsinh_x} with elements 31 | \item{x.t}{transformed 32 | original data} 33 | \item{x}{original data} 34 | \item{mean}{mean after transformation but prior to standardization} 35 | \item{sd}{sd after transformation but prior to standardization} 36 | \item{n}{number of nonmissing observations} 37 | \item{norm_stat}{Pearson's P / degrees of freedom} 38 | \item{standardize}{was the transformation standardized} 39 | 40 | The \code{predict} function returns the numeric value of the transformation 41 | performed on new data, and allows for the inverse transformation as well. 
42 | } 43 | \description{ 44 | Perform an arcsinh(x) transformation 45 | } 46 | \details{ 47 | \code{arcsinh_x} performs an arcsinh transformation in the context of 48 | bestNormalize, such that it creates a transformation that can be estimated 49 | and applied to new data via the \code{predict} function. 50 | 51 | The function is explicitly: log(x + sqrt(x^2 + 1)) 52 | } 53 | \examples{ 54 | x <- rgamma(100, 1, 1) 55 | 56 | arcsinh_x_obj <- arcsinh_x(x) 57 | arcsinh_x_obj 58 | p <- predict(arcsinh_x_obj) 59 | x2 <- predict(arcsinh_x_obj, newdata = p, inverse = TRUE) 60 | 61 | all.equal(x2, x) 62 | 63 | } 64 | -------------------------------------------------------------------------------- /man/autotrader.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{autotrader} 5 | \alias{autotrader} 6 | \title{Prices of 6,283 cars listed on Autotrader} 7 | \format{ 8 | A data frame with 6283 rows and 10 variables: 9 | \describe{ 10 | \item{price}{price, in US dollars} 11 | \item{Car_Info}{Raw description from website} 12 | \item{Link}{hyperlink to listing (must be appended to https://www.autotrader.com/)} 13 | \item{Make}{Car manufacturer} 14 | \item{Year}{Year car manufactured} 15 | \item{Location}{Location of listing} 16 | \item{Radius}{Radius chosen for search} 17 | \item{mileage}{mileage on vehicle} 18 | \item{status}{used/new/certified} 19 | \item{model}{make and model, separated by space} 20 | } 21 | } 22 | \source{ 23 | \url{https://www.autotrader.com/} 24 | } 25 | \usage{ 26 | autotrader 27 | } 28 | \description{ 29 | A dataset containing the prices and other attributes of over 6000 cars in the Minneapolis area.
30 | } 31 | \keyword{datasets} 32 | -------------------------------------------------------------------------------- /man/bestLogConstant.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bestLogConstant.R 3 | \name{bestLogConstant} 4 | \alias{bestLogConstant} 5 | \alias{predict.bestLogConstant} 6 | \alias{print.bestLogConstant} 7 | \title{Calculate and perform best normalizing log transformation (experimental)} 8 | \usage{ 9 | bestLogConstant(x, a, standardize = TRUE, ...) 10 | 11 | \method{predict}{bestLogConstant}(object, newdata = NULL, inverse = FALSE, ...) 12 | 13 | \method{print}{bestLogConstant}(x, ...) 14 | } 15 | \arguments{ 16 | \item{x}{A vector to normalize} 17 | 18 | \item{a}{(optional) a list of candidate constants to choose from} 19 | 20 | \item{standardize}{If TRUE, the transformed values are also centered and 21 | scaled, such that the transformation attempts a standard normal. This will 22 | not change the normality statistic.} 23 | 24 | \item{...}{additional arguments.} 25 | 26 | \item{object}{an object of class 'bestLogConstant'} 27 | 28 | \item{newdata}{a vector of data to be (reverse) transformed} 29 | 30 | \item{inverse}{if TRUE, performs reverse transformation} 31 | } 32 | \value{ 33 | A list of class \code{bestLogConstant} with elements 34 | 35 | \item{x.t}{transformed original data} \item{x}{original data} 36 | \item{norm_stats}{Pearson's Pearson's P / degrees of freedom} 37 | \item{method}{out-of-sample or in-sample, number of folds + repeats} 38 | \item{chosen_constant}{the chosen constant transformation (of class `log_x`)} 39 | \item{other_transforms}{the other transformations (of class `log_x`)} 40 | 41 | The \code{predict} function returns the numeric value of the transformation 42 | performed on new data, and allows for the inverse transformation as well. 
43 | } 44 | \description{ 45 | Similar to bestNormalize, this selects the 46 | best candidate constant for a log transformation on the basis 47 | of the Pearson P test statistic for normality. The 48 | transformation that has the lowest P (calculated on the transformed data) 49 | is selected. This function is currently in development and may not behave 50 | as expected. 51 | 52 | See details for more information. 53 | } 54 | \details{ 55 | \code{bestLogConstant} estimates the optimal normalizing constant 56 | for a log transformation. This transformation can be performed on new data, and 57 | inverted, via the \code{predict} function. 58 | } 59 | \seealso{ 60 | \code{\link[bestNormalize]{bestNormalize}}, \code{\link{log_x}}, 61 | } 62 | -------------------------------------------------------------------------------- /man/bestNormalize-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bestNormalize-package.R 3 | \docType{package} 4 | \name{bestNormalize-package} 5 | \alias{bestNormalize-package} 6 | \title{bestNormalize: Flexibly calculate the best normalizing transformation for a 7 | vector} 8 | \description{ 9 | The \code{bestNormalize} package provides several normalizing transformations, and introduces a 10 | new transformation based off of the order statistics, \code{orderNorm}. 11 | Perhaps the most useful function is \code{bestNormalize}, which attempts all 12 | of these transformations and picks the best one based off of a goodness of 13 | fit statistic. 
14 | } 15 | \seealso{ 16 | Useful links: 17 | \itemize{ 18 | \item \url{https://petersonr.github.io/bestNormalize/} 19 | \item \url{https://github.com/petersonR/bestNormalize} 20 | } 21 | 22 | } 23 | \author{ 24 | \strong{Maintainer}: Ryan Andrew Peterson \email{ryan.a.peterson@cuanschutz.edu} (\href{https://orcid.org/0000-0002-4650-5798}{ORCID}) 25 | 26 | } 27 | -------------------------------------------------------------------------------- /man/bestNormalize.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bestNormalize.R 3 | \name{bestNormalize} 4 | \alias{bestNormalize} 5 | \alias{predict.bestNormalize} 6 | \alias{print.bestNormalize} 7 | \alias{tidy.bestNormalize} 8 | \title{Calculate and perform best normalizing transformation} 9 | \usage{ 10 | bestNormalize( 11 | x, 12 | standardize = TRUE, 13 | allow_orderNorm = TRUE, 14 | allow_lambert_s = FALSE, 15 | allow_lambert_h = FALSE, 16 | allow_exp = TRUE, 17 | out_of_sample = TRUE, 18 | cluster = NULL, 19 | k = 10, 20 | r = 5, 21 | loo = FALSE, 22 | warn = FALSE, 23 | quiet = FALSE, 24 | tr_opts = list(), 25 | new_transforms = list(), 26 | norm_stat_fn = NULL, 27 | ... 28 | ) 29 | 30 | \method{predict}{bestNormalize}(object, newdata = NULL, inverse = FALSE, ...) 31 | 32 | \method{print}{bestNormalize}(x, ...) 33 | 34 | \method{tidy}{bestNormalize}(x, ...) 35 | } 36 | \arguments{ 37 | \item{x}{A `bestNormalize` object.} 38 | 39 | \item{standardize}{If TRUE, the transformed values are also centered and 40 | scaled, such that the transformation attempts a standard normal. This will 41 | not change the normality statistic.} 42 | 43 | \item{allow_orderNorm}{set to FALSE if orderNorm should not be applied} 44 | 45 | \item{allow_lambert_s}{Set to FALSE if the lambertW of type "s" should not 46 | be applied (see details). 
Expect about 2-3x elapsed computing time if TRUE.} 47 | 48 | \item{allow_lambert_h}{Set to TRUE if the lambertW of type "h" and "hh" 49 | should be applied (see details). Expect about 2-4x elapsed computing time.} 50 | 51 | \item{allow_exp}{Set to TRUE if the exponential transformation should be 52 | applied (sometimes this will cause errors with heavy right skew)} 53 | 54 | \item{out_of_sample}{if FALSE, estimates quickly in-sample performance} 55 | 56 | \item{cluster}{name of cluster set using \code{makeCluster}} 57 | 58 | \item{k}{number of folds} 59 | 60 | \item{r}{number of repeats} 61 | 62 | \item{loo}{should leave-one-out CV be used instead of repeated CV? (see 63 | details)} 64 | 65 | \item{warn}{Should bestNormalize warn when a method doesn't work?} 66 | 67 | \item{quiet}{Should a progress-bar not be displayed for cross-validation 68 | progress?} 69 | 70 | \item{tr_opts}{a list (of lists), specifying options to be passed to each 71 | transformation (see details)} 72 | 73 | \item{new_transforms}{a named list of new transformation functions and their 74 | predict methods (see details)} 75 | 76 | \item{norm_stat_fn}{if specified, a function to calculate to assess normality 77 | (default is the Pearson chi-squared statistic divided by its d.f.)} 78 | 79 | \item{...}{not used} 80 | 81 | \item{object}{an object of class 'bestNormalize'} 82 | 83 | \item{newdata}{a vector of data to be (reverse) transformed} 84 | 85 | \item{inverse}{if TRUE, performs reverse transformation} 86 | } 87 | \value{ 88 | A list of class \code{bestNormalize} with elements 89 | 90 | \item{x.t}{transformed original data} \item{x}{original data} 91 | \item{norm_stats}{Pearson's Pearson's P / degrees of freedom} 92 | \item{method}{out-of-sample or in-sample, number of folds + repeats} 93 | \item{chosen_transform}{the chosen transformation (of appropriate class)} 94 | \item{other_transforms}{the other transformations (of appropriate class)} 95 | \item{oos_preds}{Out-of-sample predictions (if 
loo == TRUE) or 96 | normalization stats} 97 | 98 | The \code{predict} function returns the numeric value of the transformation 99 | performed on new data, and allows for the inverse transformation as well. 100 | } 101 | \description{ 102 | Performs a suite of normalizing transformations, and selects the 103 | best one on the basis of the Pearson P test statistic for normality. The 104 | transformation that has the lowest P (calculated on the transformed data) 105 | is selected. See details for more information. 106 | } 107 | \details{ 108 | \code{bestNormalize} estimates the optimal normalizing 109 | transformation. This transformation can be performed on new data, and 110 | inverted, via the \code{predict} function. 111 | 112 | This function currently estimates the Yeo-Johnson transformation, 113 | the Box Cox transformation (if the data is positive), the log_10(x+a) 114 | transformation, the square-root (x+a) transformation, and the arcsinh 115 | transformation. a is set to max(0, -min(x) + eps) by default. If 116 | allow_orderNorm == TRUE and if out_of_sample == FALSE then the ordered 117 | quantile normalization technique will likely be chosen since it essentially 118 | forces the data to follow a normal distribution. More information on the 119 | orderNorm technique can be found in the package vignette, or using 120 | \code{?orderNorm}. 121 | 122 | 123 | Repeated cross-validation is used by default to estimate the out-of-sample 124 | performance of each transformation if out_of_sample = TRUE. While this can 125 | take some time, users can speed it up by creating a cluster via the 126 | \code{parallel} package's \code{makeCluster} function, and passing the name 127 | of this cluster to \code{bestNormalize} via the cl argument. For best 128 | performance, we recommend the number of clusters to be set to the number of 129 | repeats r. 
Care should be taken to account for the number of observations 130 | per fold; too small a number and the estimated normality statistic could be 131 | inaccurate, or at least suffer from high variability. 132 | 133 | 134 | As of version 1.3, users can use leave-one-out cross-validation as well for 135 | each method by setting \code{loo} to \code{TRUE}. This will take a lot of 136 | time for bigger vectors, but it will have the most accurate estimate of 137 | normalization efficacy. Note that if this method is selected, arguments 138 | \code{k, r} are ignored. This method will still work in parallel with the 139 | \code{cluster} argument. 140 | 141 | 142 | Note that the Lambert transformation of type "h" or "hh" can be done by 143 | setting allow_lambert_h = TRUE, however this can take significantly longer 144 | to run. 145 | 146 | Use \code{tr_opts} in order to set options for each transformation. For 147 | instance, if you want to override the default a selection for \code{log_x}, 148 | set \code{tr_opts$log_x = list(a = 1)}. 149 | 150 | See the package's vignette on how to use custom functions with 151 | bestNormalize. All it takes is to create an S3 class and predict method for 152 | the new transformation and load it into the environment, then the new 153 | custom function (and its predict method) can be passed to bestNormalize 154 | with \code{new_transforms}.
155 | } 156 | \examples{ 157 | 158 | x <- rgamma(100, 1, 1) 159 | 160 | \dontrun{ 161 | # With Repeated CV 162 | BN_obj <- bestNormalize(x) 163 | BN_obj 164 | p <- predict(BN_obj) 165 | x2 <- predict(BN_obj, newdata = p, inverse = TRUE) 166 | 167 | all.equal(x2, x) 168 | } 169 | 170 | 171 | \dontrun{ 172 | # With leave-one-out CV 173 | BN_obj <- bestNormalize(x, loo = TRUE) 174 | BN_obj 175 | p <- predict(BN_obj) 176 | x2 <- predict(BN_obj, newdata = p, inverse = TRUE) 177 | 178 | all.equal(x2, x) 179 | } 180 | 181 | # Without CV 182 | BN_obj <- bestNormalize(x, allow_orderNorm = FALSE, out_of_sample = FALSE) 183 | BN_obj 184 | p <- predict(BN_obj) 185 | x2 <- predict(BN_obj, newdata = p, inverse = TRUE) 186 | 187 | all.equal(x2, x) 188 | 189 | } 190 | \seealso{ 191 | \code{\link[bestNormalize]{boxcox}}, \code{\link{orderNorm}}, 192 | \code{\link{yeojohnson}} 193 | } 194 | -------------------------------------------------------------------------------- /man/binarize.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/binarize.R 3 | \name{binarize} 4 | \alias{binarize} 5 | \alias{predict.binarize} 6 | \alias{print.binarize} 7 | \title{Binarize} 8 | \usage{ 9 | binarize(x, location_measure = "median") 10 | 11 | \method{predict}{binarize}(object, newdata = NULL, inverse = FALSE, ...) 12 | 13 | \method{print}{binarize}(x, ...) 14 | } 15 | \arguments{ 16 | \item{x}{A vector to binarize} 17 | 18 | \item{location_measure}{which location measure should be used? 
can either be 19 | "median", "mean", "mode", a number, or a function.} 20 | 21 | \item{object}{an object of class 'binarize'} 22 | 23 | \item{newdata}{a vector of data to be (reverse) transformed} 24 | 25 | \item{inverse}{if TRUE, performs reverse transformation} 26 | 27 | \item{...}{additional arguments} 28 | } 29 | \value{ 30 | A list of class \code{binarize} with elements 31 | \item{x.t}{transformed original data} 32 | \item{x}{original data} 33 | \item{method}{location_measure used for original fitting} 34 | \item{location}{estimated location_measure} 35 | \item{n}{number of nonmissing observations} 36 | \item{norm_stat}{Pearson's P / degrees of freedom} 37 | 38 | The \code{predict} function with \code{inverse = FALSE} returns the numeric 39 | value (0 or 1) of the transformation on \code{newdata} (which defaults to 40 | the original data). 41 | 42 | If \code{inverse = TRUE}, since the transform is not 1-1, it will create 43 | and return a factor that indicates where the original data was cut. 44 | } 45 | \description{ 46 | This function will perform a binarizing transformation, which 47 | could be used as a last resort if the data cannot be adequately normalized. 48 | This may be useful when accidentally attempting normalization of a binary 49 | vector (which could occur if implementing bestNormalize in an automated 50 | fashion). 51 | 52 | Note that the transformation is not one-to-one, in contrast to the other 53 | functions in this package. 
54 | } 55 | \examples{ 56 | x <- rgamma(100, 1, 1) 57 | binarize_obj <- binarize(x) 58 | (p <- predict(binarize_obj)) 59 | 60 | predict(binarize_obj, newdata = p, inverse = TRUE) 61 | } 62 | -------------------------------------------------------------------------------- /man/boxcox.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/boxcox.R 3 | \name{boxcox} 4 | \alias{boxcox} 5 | \alias{predict.boxcox} 6 | \alias{print.boxcox} 7 | \title{Box-Cox Normalization} 8 | \usage{ 9 | boxcox(x, standardize = TRUE, ...) 10 | 11 | \method{predict}{boxcox}(object, newdata = NULL, inverse = FALSE, ...) 12 | 13 | \method{print}{boxcox}(x, ...) 14 | } 15 | \arguments{ 16 | \item{x}{A vector to normalize with Box-Cox} 17 | 18 | \item{standardize}{If TRUE, the transformed values are also centered and 19 | scaled, such that the transformation attempts a standard normal} 20 | 21 | \item{...}{Additional arguments that can be passed to the estimation of the 22 | lambda parameter (lower, upper, epsilon)} 23 | 24 | \item{object}{an object of class 'boxcox'} 25 | 26 | \item{newdata}{a vector of data to be (reverse) transformed} 27 | 28 | \item{inverse}{if TRUE, performs reverse transformation} 29 | } 30 | \value{ 31 | A list of class \code{boxcox} with elements 32 | \item{x.t}{transformed 33 | original data} 34 | \item{x}{original data} 35 | \item{mean}{mean after transformation but prior to standardization} 36 | \item{sd}{sd after transformation but prior to standardization} 37 | \item{lambda}{estimated lambda value for skew transformation} 38 | \item{n}{number of nonmissing observations} 39 | \item{norm_stat}{Pearson's P / degrees of freedom} 40 | \item{standardize}{was the transformation standardized} 41 | 42 | The \code{predict} function returns the numeric value of the transformation 43 | performed on new data, and allows for the inverse transformation as well. 
44 | } 45 | \description{ 46 | Perform a Box-Cox transformation and center/scale a vector to 47 | attempt normalization 48 | } 49 | \details{ 50 | \code{boxcox} estimates the optimal value of lambda for the Box-Cox 51 | transformation. This transformation can be performed on new data, and 52 | inverted, via the \code{predict} function. 53 | 54 | The function will return an error if a user attempt to transform nonpositive 55 | data. 56 | } 57 | \examples{ 58 | x <- rgamma(100, 1, 1) 59 | 60 | bc_obj <- boxcox(x) 61 | bc_obj 62 | p <- predict(bc_obj) 63 | x2 <- predict(bc_obj, newdata = p, inverse = TRUE) 64 | 65 | all.equal(x2, x) 66 | } 67 | \references{ 68 | Box, G. E. P. and Cox, D. R. (1964) An analysis of 69 | transformations. Journal of the Royal Statistical Society B, 26, 211-252. 70 | } 71 | \seealso{ 72 | \code{\link[MASS]{boxcox}} 73 | } 74 | -------------------------------------------------------------------------------- /man/double_reverse_log.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/double_reverse_log.R 3 | \name{double_reverse_log} 4 | \alias{double_reverse_log} 5 | \alias{predict.double_reverse_log} 6 | \alias{print.double_reverse_log} 7 | \title{Double Reverse Log(x + a) Transformation} 8 | \usage{ 9 | double_reverse_log( 10 | x, 11 | b = 10, 12 | standardize = TRUE, 13 | eps = diff(range(x, na.rm = TRUE))/10, 14 | warn = TRUE, 15 | ... 16 | ) 17 | 18 | \method{predict}{double_reverse_log}(object, newdata = NULL, inverse = FALSE, ...) 19 | 20 | \method{print}{double_reverse_log}(x, ...) 
21 | } 22 | \arguments{ 23 | \item{x}{A vector to normalize with x} 24 | 25 | \item{b}{The base of the log (defaults to 10)} 26 | 27 | \item{standardize}{If TRUE, the transformed values are also centered and 28 | scaled, such that the transformation attempts a standard normal} 29 | 30 | \item{eps}{The cushion for the transformation range (defaults to 10 percent)} 31 | 32 | \item{warn}{Should a warning result from infinite values?} 33 | 34 | \item{...}{additional arguments} 35 | 36 | \item{object}{an object of class 'double_reverse_log'} 37 | 38 | \item{newdata}{a vector of data to be (potentially reverse) transformed} 39 | 40 | \item{inverse}{if TRUE, performs reverse transformation} 41 | } 42 | \value{ 43 | A list of class \code{double_reverse_log} with elements 44 | \item{x.t}{transformed 45 | original data} 46 | \item{x}{original data} 47 | \item{mean}{mean after transformation but prior to standardization} 48 | \item{sd}{sd after transformation but prior to standardization} 49 | \item{b}{estimated base b value} 50 | \item{n}{number of nonmissing observations} 51 | \item{norm_stat}{Pearson's P / degrees of freedom} 52 | \item{standardize}{was the transformation standardized} 53 | 54 | The \code{predict} function returns the numeric value of the transformation 55 | performed on new data, and allows for the inverse transformation as well. 56 | } 57 | \description{ 58 | First reverses scores, then performs a log_b(x) 59 | normalization transformation, and then reverses scores again. 60 | } 61 | \details{ 62 | \code{double_reverse_log} performs a simple log transformation in the 63 | context of bestNormalize, such that it creates a transformation that can be 64 | estimated and applied to new data via the \code{predict} function. The parameter 65 | a is essentially estimated by the training set by default (estimated as the 66 | minimum possible to some extent epsilon), while the base must be specified 67 | beforehand.
68 | } 69 | \examples{ 70 | x <- rgamma(100, 1, 1) 71 | 72 | double_reverse_log_obj <- double_reverse_log(x) 73 | double_reverse_log_obj 74 | p <- predict(double_reverse_log_obj) 75 | x2 <- predict(double_reverse_log_obj, newdata = p, inverse = TRUE) 76 | 77 | all.equal(x2, x) 78 | 79 | } 80 | -------------------------------------------------------------------------------- /man/exp_x.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/exp_x.R 3 | \name{exp_x} 4 | \alias{exp_x} 5 | \alias{predict.exp_x} 6 | \alias{print.exp_x} 7 | \title{exp(x) Transformation} 8 | \usage{ 9 | exp_x(x, standardize = TRUE, warn = TRUE, ...) 10 | 11 | \method{predict}{exp_x}(object, newdata = NULL, inverse = FALSE, ...) 12 | 13 | \method{print}{exp_x}(x, ...) 14 | } 15 | \arguments{ 16 | \item{x}{A vector to normalize with with x} 17 | 18 | \item{standardize}{If TRUE, the transformed values are also centered and 19 | scaled, such that the transformation attempts a standard normal} 20 | 21 | \item{warn}{Should a warning result from infinite values?} 22 | 23 | \item{...}{additional arguments} 24 | 25 | \item{object}{an object of class 'exp_x'} 26 | 27 | \item{newdata}{a vector of data to be (potentially reverse) transformed} 28 | 29 | \item{inverse}{if TRUE, performs reverse transformation} 30 | } 31 | \value{ 32 | A list of class \code{exp_x} with elements 33 | \item{x.t}{transformed 34 | original data} 35 | \item{x}{original data} 36 | \item{mean}{mean after transformation but prior to standardization} 37 | \item{sd}{sd after transformation but prior to standardization} 38 | \item{n}{number of nonmissing observations} 39 | \item{norm_stat}{Pearson's P / degrees of freedom} 40 | \item{standardize}{was the transformation standardized} 41 | 42 | The \code{predict} function returns the numeric value of the transformation 43 | performed on new data, and allows for the 
inverse transformation as well. 44 | } 45 | \description{ 46 | Perform a exp(x) transformation 47 | } 48 | \details{ 49 | \code{exp_x} performs a simple exponential transformation in the context of 50 | bestNormalize, such that it creates a transformation that can be estimated 51 | and applied to new data via the \code{predict} function. 52 | } 53 | \examples{ 54 | x <- rgamma(100, 1, 1) 55 | 56 | exp_x_obj <- exp_x(x) 57 | exp_x_obj 58 | p <- predict(exp_x_obj) 59 | x2 <- predict(exp_x_obj, newdata = p, inverse = TRUE) 60 | 61 | all.equal(x2, x) 62 | 63 | } 64 | -------------------------------------------------------------------------------- /man/lambert.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lambert.R 3 | \name{lambert} 4 | \alias{lambert} 5 | \alias{predict.lambert} 6 | \alias{print.lambert} 7 | \title{Lambert W x F Normalization} 8 | \usage{ 9 | lambert(x, type = "s", standardize = TRUE, warn = FALSE, ...) 10 | 11 | \method{predict}{lambert}(object, newdata = NULL, inverse = FALSE, ...) 12 | 13 | \method{print}{lambert}(x, ...) 
14 | } 15 | \arguments{ 16 | \item{x}{A vector to normalize with the Lambert W x F transformation} 17 | 18 | \item{type}{a character indicating which transformation to perform (options 19 | are "s", "h", and "hh", see details)} 20 | 21 | \item{standardize}{If TRUE, the transformed values are also centered and 22 | scaled, such that the transformation attempts a standard normal} 23 | 24 | \item{warn}{should the function show warnings} 25 | 26 | \item{...}{Additional arguments that can be passed to the 27 | LambertW::Gaussianize function} 28 | 29 | \item{object}{an object of class 'lambert'} 30 | 31 | \item{newdata}{a vector of data to be (reverse) transformed} 32 | 33 | \item{inverse}{if TRUE, performs reverse transformation} 34 | } 35 | \value{ 36 | A list of class \code{lambert} with elements 37 | \item{x.t}{transformed original data} 38 | \item{x}{original data} 39 | \item{mean}{mean after transformation but prior to standardization} 40 | \item{sd}{sd after transformation but prior to standardization} 41 | \item{tau.mat}{estimated parameters of LambertW::Gaussianize} 42 | \item{n}{number of nonmissing observations} 43 | \item{norm_stat}{Pearson's P / degrees of freedom} 44 | \item{standardize}{was the transformation standardized} 45 | 46 | The \code{predict} function returns the numeric value of the transformation 47 | performed on new data, and allows for the inverse transformation as well. 48 | } 49 | \description{ 50 | Perform Lambert's W x F transformation and center/scale a vector 51 | to attempt normalization via the \code{LambertW} package. 52 | } 53 | \details{ 54 | \code{lambert} uses the \code{LambertW} package to estimate a 55 | normalizing (or "Gaussianizing") transformation. This transformation can be 56 | performed on new data, and inverted, via the \code{predict} function. 57 | 58 | NOTE: The type = "s" argument is the only one that does the 1-1 transform 59 | consistently, and so it is the only method currently used in 60 | \code{bestNormalize()}.
Use type = "h" or type = 'hh' at risk of not having 61 | this estimate 1-1 transform. These alternative types are effective when the 62 | data has exceptionally heavy tails, e.g. the Cauchy distribution. 63 | 64 | Additionally, sometimes (depending on the distribution) this method will be 65 | unable to extrapolate beyond the observed bounds. In these cases, NaN is 66 | returned. 67 | } 68 | \examples{ 69 | \dontrun{ 70 | x <- rgamma(100, 1, 1) 71 | 72 | lambert_obj <- lambert(x) 73 | lambert_obj 74 | p <- predict(lambert_obj) 75 | x2 <- predict(lambert_obj, newdata = p, inverse = TRUE) 76 | 77 | all.equal(x2, x) 78 | } 79 | 80 | } 81 | \references{ 82 | Georg M. Goerg (2016). LambertW: An R package for Lambert W x F 83 | Random Variables. R package version 0.6.4. 84 | 85 | Georg M. Goerg (2011): Lambert W random variables - a new family of 86 | generalized skewed distributions with applications to risk estimation. 87 | Annals of Applied Statistics 3(5). 2197-2230. 88 | 89 | Georg M. Goerg (2014): The Lambert Way to Gaussianize heavy-tailed data 90 | with the inverse of Tukey's h transformation as a special case. The 91 | Scientific World Journal. 92 | } 93 | \seealso{ 94 | \code{\link[LambertW]{Gaussianize}} 95 | } 96 | -------------------------------------------------------------------------------- /man/log_x.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/log_x.R 3 | \name{log_x} 4 | \alias{log_x} 5 | \alias{predict.log_x} 6 | \alias{print.log_x} 7 | \title{Log(x + a) Transformation} 8 | \usage{ 9 | log_x(x, a = NULL, b = 10, standardize = TRUE, eps = 0.001, warn = TRUE, ...) 10 | 11 | \method{predict}{log_x}(object, newdata = NULL, inverse = FALSE, ...) 12 | 13 | \method{print}{log_x}(x, ...) 
14 | } 15 | \arguments{ 16 | \item{x}{A vector to normalize with with x} 17 | 18 | \item{a}{The constant to add to x (defaults to max(0, -min(x) + eps)); 19 | see \code{bestLogConstant}} 20 | 21 | \item{b}{The base of the log (defaults to 10)} 22 | 23 | \item{standardize}{If TRUE, the transformed values are also centered and 24 | scaled, such that the transformation attempts a standard normal} 25 | 26 | \item{eps}{The allowed error in the expression for the selected a} 27 | 28 | \item{warn}{Should a warning result from infinite values?} 29 | 30 | \item{...}{additional arguments} 31 | 32 | \item{object}{an object of class 'log_x'} 33 | 34 | \item{newdata}{a vector of data to be (potentially reverse) transformed} 35 | 36 | \item{inverse}{if TRUE, performs reverse transformation} 37 | } 38 | \value{ 39 | A list of class \code{log_x} with elements 40 | \item{x.t}{transformed 41 | original data} 42 | \item{x}{original data} 43 | \item{mean}{mean after transformation but prior to standardization} 44 | \item{sd}{sd after transformation but prior to standardization} 45 | \item{a}{estimated a value} 46 | \item{b}{estimated base b value} 47 | \item{n}{number of nonmissing observations} 48 | \item{norm_stat}{Pearson's P / degrees of freedom} 49 | \item{standardize}{was the transformation standardized} 50 | 51 | The \code{predict} function returns the numeric value of the transformation 52 | performed on new data, and allows for the inverse transformation as well. 53 | } 54 | \description{ 55 | Perform a log_b (x+a) normalization transformation 56 | } 57 | \details{ 58 | \code{log_x} performs a simple log transformation in the context of 59 | bestNormalize, such that it creates a transformation that can be estimated 60 | and applied to new data via the \code{predict} function. The parameter a is 61 | essentially estimated by the training set by default (estimated as the minimum 62 | possible to some extent epsilon), while the base must be 63 | specified beforehand. 
64 | } 65 | \examples{ 66 | x <- rgamma(100, 1, 1) 67 | 68 | log_x_obj <- log_x(x) 69 | log_x_obj 70 | p <- predict(log_x_obj) 71 | x2 <- predict(log_x_obj, newdata = p, inverse = TRUE) 72 | 73 | all.equal(x2, x) 74 | 75 | } 76 | -------------------------------------------------------------------------------- /man/no_transform.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/no_transform.R 3 | \name{no_transform} 4 | \alias{no_transform} 5 | \alias{predict.no_transform} 6 | \alias{center_scale} 7 | \alias{print.no_transform} 8 | \alias{predict.center_scale} 9 | \alias{print.center_scale} 10 | \alias{tidy.no_transform} 11 | \title{Identity transformation and center/scale transform} 12 | \usage{ 13 | no_transform(x, warn = TRUE, ...) 14 | 15 | \method{predict}{no_transform}(object, newdata = NULL, inverse = FALSE, ...) 16 | 17 | \method{print}{no_transform}(x, ...) 18 | 19 | center_scale(x, warn = TRUE, ...) 20 | 21 | \method{predict}{center_scale}(object, newdata = NULL, inverse = FALSE, ...) 22 | 23 | \method{print}{center_scale}(x, ...) 24 | 25 | \method{tidy}{no_transform}(x, ...) 
26 | } 27 | \arguments{ 28 | \item{x}{A `no_transform` object.} 29 | 30 | \item{warn}{Should a warning result from infinite values?} 31 | 32 | \item{...}{not used} 33 | 34 | \item{object}{an object of class 'no_transform'} 35 | 36 | \item{newdata}{a vector of data to be (potentially reverse) transformed} 37 | 38 | \item{inverse}{if TRUE, performs reverse transformation} 39 | } 40 | \value{ 41 | A list of class \code{no_transform} with elements 42 | \item{x.t}{transformed original data} 43 | \item{x}{original data} 44 | \item{n}{number of nonmissing observations} 45 | \item{norm_stat}{Pearson's P / degrees of freedom} 46 | 47 | The \code{predict} function returns the numeric value of the transformation 48 | performed on new data, and allows for the inverse transformation as well. 49 | } 50 | \description{ 51 | Perform an identity transformation. Admittedly it seems odd to 52 | have a dedicated function to essentially do I(x), but it makes sense to 53 | keep the same syntax as the other transformations so it plays nicely 54 | with them. As a benefit, the bestNormalize function will also show 55 | a comparable normalization statistic for the untransformed data. If 56 | \code{standardize == TRUE}, \code{center_scale} passes to bestNormalize instead. 57 | } 58 | \details{ 59 | \code{no_transform} creates a identity transformation object 60 | that can be applied to new data via the \code{predict} function. 
61 | } 62 | \examples{ 63 | x <- rgamma(100, 1, 1) 64 | 65 | no_transform_obj <- no_transform(x) 66 | no_transform_obj 67 | p <- predict(no_transform_obj) 68 | x2 <- predict(no_transform_obj, newdata = p, inverse = TRUE) 69 | 70 | all.equal(x2, x) 71 | 72 | } 73 | -------------------------------------------------------------------------------- /man/orderNorm.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/orderNorm.R 3 | \name{orderNorm} 4 | \alias{orderNorm} 5 | \alias{predict.orderNorm} 6 | \alias{print.orderNorm} 7 | \title{Calculate and perform Ordered Quantile normalizing transformation} 8 | \usage{ 9 | orderNorm(x, n_logit_fit = min(length(x), 10000), ..., warn = TRUE) 10 | 11 | \method{predict}{orderNorm}(object, newdata = NULL, inverse = FALSE, warn = TRUE, ...) 12 | 13 | \method{print}{orderNorm}(x, ...) 14 | } 15 | \arguments{ 16 | \item{x}{A vector to normalize} 17 | 18 | \item{n_logit_fit}{Number of points used to fit logit approximation} 19 | 20 | \item{...}{additional arguments} 21 | 22 | \item{warn}{transforms outside observed range or ties will yield warning} 23 | 24 | \item{object}{an object of class 'orderNorm'} 25 | 26 | \item{newdata}{a vector of data to be (reverse) transformed} 27 | 28 | \item{inverse}{if TRUE, performs reverse transformation} 29 | } 30 | \value{ 31 | A list of class \code{orderNorm} with elements 32 | 33 | \item{x.t}{transformed original data} \item{x}{original data} 34 | \item{n}{number of nonmissing observations} \item{ties_status}{indicator if 35 | ties are present} \item{fit}{fit to be used for extrapolation, if needed} 36 | \item{norm_stat}{Pearson's P / degrees of freedom} 37 | 38 | The \code{predict} function returns the numeric value of the transformation 39 | performed on new data, and allows for the inverse transformation as well. 
40 | } 41 | \description{ 42 | The Ordered Quantile (ORQ) normalization transformation, 43 | \code{orderNorm()}, is a rank-based procedure by which the values of a 44 | vector are mapped to their percentile, which is then mapped to the same 45 | percentile of the normal distribution. Without the presence of ties, this 46 | essentially guarantees that the transformation leads to a uniform 47 | distribution. 48 | 49 | The transformation is: \deqn{g(x) = \Phi ^ {-1} ((rank(x) - .5) / 50 | (length(x)))} 51 | 52 | Where \eqn{\Phi} refers to the standard normal cdf, rank(x) refers to each 53 | observation's rank, and length(x) refers to the number of observations. 54 | 55 | By itself, this method is certainly not new; the earliest mention of it 56 | that I could find is in a 1947 paper by Bartlett (see references). This 57 | formula was outlined explicitly in Van der Waerden, and expounded upon in 58 | Beasley (2009). However there is a key difference to this version of it, as 59 | explained below. 60 | 61 | Using linear interpolation between these percentiles, the ORQ normalization 62 | becomes a 1-1 transformation that can be applied to new data. However, 63 | outside of the observed domain of x, it is unclear how to extrapolate the 64 | transformation. In the ORQ normalization procedure, a binomial glm with a 65 | logit link is used on the ranks in order to extrapolate beyond the bounds 66 | of the original domain of x. The inverse normal CDF is then applied to 67 | these extrapolated predictions in order to extrapolate the transformation. 68 | This mitigates the influence of heavy-tailed distributions while preserving 69 | the 1-1 nature of the transformation. The extrapolation will provide a 70 | warning unless warn = FALSE.) However, we found that the extrapolation was 71 | able to perform very well even on data as heavy-tailed as a Cauchy 72 | distribution (paper to be published). 
73 | 74 | The fit used to perform the extrapolation uses a default of 10000 75 | observations (or length(x) if that is less). This added approximation 76 | improves the scalability, both computationally and in terms of memory used. 77 | Do not set this value to be too low (e.g. <100), as there is no benefit to 78 | doing so. Increase if your test data set is large relative to 10000 and/or 79 | if you are worried about losing signal in the extremes of the range. 80 | 81 | This transformation can be performed on new data and inverted via the 82 | \code{predict} function. 83 | } 84 | \examples{ 85 | 86 | x <- rgamma(100, 1, 1) 87 | 88 | orderNorm_obj <- orderNorm(x) 89 | orderNorm_obj 90 | p <- predict(orderNorm_obj) 91 | x2 <- predict(orderNorm_obj, newdata = p, inverse = TRUE) 92 | 93 | all.equal(x2, x) 94 | } 95 | \references{ 96 | Bartlett, M. S. "The Use of Transformations." Biometrics, vol. 3, no. 1, 97 | 1947, pp. 39-52. JSTOR www.jstor.org/stable/3001536. 98 | 99 | Van der Waerden BL. Order tests for the two-sample problem and their power. 100 | 1952;55:453-458. Ser A. 101 | 102 | Beasley TM, Erickson S, Allison DB. Rank-based inverse normal transformations 103 | are increasingly used, but are they merited? Behav. Genet. 2009;39(5): 104 | 580-595. 
pmid:19526352 105 | } 106 | \seealso{ 107 | \code{\link{boxcox}}, \code{\link{lambert}}, 108 | \code{\link{bestNormalize}}, \code{\link{yeojohnson}} 109 | } 110 | -------------------------------------------------------------------------------- /man/plot.bestNormalize.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plots.R 3 | \name{plot.bestNormalize} 4 | \alias{plot.bestNormalize} 5 | \alias{plot.orderNorm} 6 | \alias{plot.boxcox} 7 | \alias{plot.yeojohnson} 8 | \alias{plot.lambert} 9 | \title{Transformation plotting} 10 | \usage{ 11 | \method{plot}{bestNormalize}( 12 | x, 13 | inverse = FALSE, 14 | bounds = NULL, 15 | cols = NULL, 16 | methods = NULL, 17 | leg_loc = "top", 18 | ... 19 | ) 20 | 21 | \method{plot}{orderNorm}(x, inverse = FALSE, bounds = NULL, ...) 22 | 23 | \method{plot}{boxcox}(x, inverse = FALSE, bounds = NULL, ...) 24 | 25 | \method{plot}{yeojohnson}(x, inverse = FALSE, bounds = NULL, ...) 26 | 27 | \method{plot}{lambert}(x, inverse = FALSE, bounds = NULL, ...) 28 | } 29 | \arguments{ 30 | \item{x}{a fitted transformation} 31 | 32 | \item{inverse}{if TRUE, plots the inverse transformation} 33 | 34 | \item{bounds}{a vector of bounds to plot for the transformation} 35 | 36 | \item{cols}{a vector of colors to use for the transforms (see details)} 37 | 38 | \item{methods}{a vector of transformations to plot} 39 | 40 | \item{leg_loc}{the location of the legend on the plot} 41 | 42 | \item{...}{further parameters to be passed to \code{plot} and \code{lines}} 43 | } 44 | \description{ 45 | Plots transformation functions for objects produced by the bestNormalize 46 | package 47 | } 48 | \details{ 49 | The plots produced by the individual transformations are simply 50 | plots of the original values by the newly transformed values, with a line 51 | denoting where transformations would take place for new data. 
52 | 53 | For the bestNormalize object, this plots each of the possible 54 | transformations run by the original call to bestNormalize. The first 55 | argument in the "cols" parameter refers to the color of the chosen 56 | transformation. 57 | } 58 | -------------------------------------------------------------------------------- /man/reexports.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/step_orderNorm.R 3 | \docType{import} 4 | \name{reexports} 5 | \alias{reexports} 6 | \alias{required_pkgs} 7 | \title{Objects exported from other packages} 8 | \keyword{internal} 9 | \description{ 10 | These objects are imported from other packages. Follow the links 11 | below to see their documentation. 12 | 13 | \describe{ 14 | \item{generics}{\code{\link[generics]{required_pkgs}}} 15 | }} 16 | 17 | -------------------------------------------------------------------------------- /man/sqrt_x.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sqrt_x.R 3 | \name{sqrt_x} 4 | \alias{sqrt_x} 5 | \alias{predict.sqrt_x} 6 | \alias{print.sqrt_x} 7 | \title{sqrt(x + a) Normalization} 8 | \usage{ 9 | sqrt_x(x, a = NULL, standardize = TRUE, ...) 10 | 11 | \method{predict}{sqrt_x}(object, newdata = NULL, inverse = FALSE, ...) 12 | 13 | \method{print}{sqrt_x}(x, ...) 
14 | } 15 | \arguments{ 16 | \item{x}{A vector to normalize with sqrt(x + a)} 17 | 18 | \item{a}{The constant to add to x (defaults to max(0, -min(x)))} 19 | 20 | \item{standardize}{If TRUE, the transformed values are also centered and 21 | scaled, such that the transformation attempts a standard normal} 22 | 23 | \item{...}{additional arguments} 24 | 25 | \item{object}{an object of class 'sqrt_x'} 26 | 27 | \item{newdata}{a vector of data to be (potentially reverse) transformed} 28 | 29 | \item{inverse}{if TRUE, performs reverse transformation} 30 | } 31 | \value{ 32 | A list of class \code{sqrt_x} with elements \item{x.t}{transformed 33 | original data} 34 | \item{x}{original data} 35 | \item{mean}{mean after transformation but prior to standardization} 36 | \item{sd}{sd after transformation but prior to standardization} 37 | \item{n}{number of nonmissing observations} 38 | \item{norm_stat}{Pearson's P / degrees of freedom} 39 | \item{standardize}{was the transformation standardized} 40 | 41 | The \code{predict} function returns the numeric value of the transformation 42 | performed on new data, and allows for the inverse transformation as well. 43 | } 44 | \description{ 45 | Perform a sqrt (x+a) normalization transformation 46 | } 47 | \details{ 48 | \code{sqrt_x} performs a simple square-root transformation in the 49 | context of bestNormalize, such that it creates a transformation that can be 50 | estimated and applied to new data via the \code{predict} function. The 51 | parameter a is essentially estimated by the training set by default 52 | (estimated as the minimum value 53 | such that x + a is nonnegative).
54 | } 55 | \examples{ 56 | x <- rgamma(100, 1, 1) 57 | 58 | sqrt_x_obj <- sqrt_x(x) 59 | sqrt_x_obj 60 | p <- predict(sqrt_x_obj) 61 | x2 <- predict(sqrt_x_obj, newdata = p, inverse = TRUE) 62 | 63 | all.equal(x2, x) 64 | 65 | } 66 | -------------------------------------------------------------------------------- /man/step_best_normalize.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/step_best_normalize.R 3 | \name{step_best_normalize} 4 | \alias{step_best_normalize} 5 | \alias{step_bestNormalize} 6 | \alias{step_bestNormalize_new} 7 | \alias{tidy.step_best_normalize} 8 | \alias{axe_env.step_best_normalize} 9 | \title{Run bestNormalize transformation for \code{recipes} implementation} 10 | \usage{ 11 | step_best_normalize( 12 | recipe, 13 | ..., 14 | role = NA, 15 | trained = FALSE, 16 | transform_info = NULL, 17 | transform_options = list(), 18 | num_unique = 5, 19 | skip = FALSE, 20 | id = rand_id("best_normalize") 21 | ) 22 | 23 | \method{tidy}{step_best_normalize}(x, ...) 24 | 25 | \method{axe_env}{step_best_normalize}(x, ...) 26 | } 27 | \arguments{ 28 | \item{recipe}{A formula or recipe} 29 | 30 | \item{...}{One or more selector functions to choose which variables are 31 | affected by the step. See [selections()] for more details. For the `tidy` 32 | method, these are not currently used.} 33 | 34 | \item{role}{Not used by this step since no new variables are created.} 35 | 36 | \item{trained}{For recipes functionality} 37 | 38 | \item{transform_info}{A numeric vector of transformation values. 
This (was 39 | transform_info) is `NULL` until computed by [prep.recipe()].} 40 | 41 | \item{transform_options}{options to be passed to bestNormalize} 42 | 43 | \item{num_unique}{An integer where data that have fewer possible values will 44 | not be evaluated for a transformation.} 45 | 46 | \item{skip}{For recipes functionality} 47 | 48 | \item{id}{For recipes functionality} 49 | 50 | \item{x}{A `step_best_normalize` object.} 51 | } 52 | \value{ 53 | An updated version of `recipe` with the new step added to the 54 | sequence of existing steps (if any). For the `tidy` method, a tibble with 55 | columns `terms` (the selectors or variables selected) and `value` (the 56 | lambda estimate). 57 | } 58 | \description{ 59 | `step_best_normalize` creates a specification of a recipe step 60 | (see `recipes` package) that will transform data using the best of a suite 61 | of normalization transformations estimated (by default) using 62 | cross-validation. 63 | } 64 | \details{ 65 | The bestnormalize transformation can be used to rescale a variable 66 | to be more similar to a normal distribution. See `?bestNormalize` for more 67 | information; `step_best_normalize` is the implementation of `bestNormalize` 68 | in the `recipes` context. 69 | 70 | As of version 1.7, the `butcher` package can be used to (hopefully) improve 71 | scalability of this function on bigger data sets.
72 | } 73 | \examples{ 74 | 75 | library(recipes) 76 | rec <- recipe(~ ., data = as.data.frame(iris)) 77 | 78 | bn_trans <- step_best_normalize(rec, all_numeric()) 79 | 80 | bn_estimates <- prep(bn_trans, training = as.data.frame(iris)) 81 | 82 | bn_data <- bake(bn_estimates, as.data.frame(iris)) 83 | 84 | plot(density(iris[, "Petal.Length"]), main = "before") 85 | plot(density(bn_data$Petal.Length), main = "after") 86 | 87 | tidy(bn_trans, number = 1) 88 | tidy(bn_estimates, number = 1) 89 | 90 | } 91 | \seealso{ 92 | \code{\link[bestNormalize]{bestNormalize}} \code{\link{orderNorm}}, 93 | [recipe()] [prep.recipe()] [bake.recipe()] 94 | } 95 | \concept{preprocessing} 96 | \concept{transformation_methods} 97 | -------------------------------------------------------------------------------- /man/step_orderNorm.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/step_orderNorm.R 3 | \name{step_orderNorm} 4 | \alias{step_orderNorm} 5 | \alias{tidy.step_orderNorm} 6 | \alias{axe_env.step_orderNorm} 7 | \title{ORQ normalization (orderNorm) for \code{recipes} implementation} 8 | \usage{ 9 | step_orderNorm( 10 | recipe, 11 | ..., 12 | role = NA, 13 | trained = FALSE, 14 | transform_info = NULL, 15 | transform_options = list(), 16 | num_unique = 5, 17 | skip = FALSE, 18 | id = rand_id("orderNorm") 19 | ) 20 | 21 | \method{tidy}{step_orderNorm}(x, ...) 22 | 23 | \method{axe_env}{step_orderNorm}(x, ...) 24 | } 25 | \arguments{ 26 | \item{recipe}{A formula or recipe} 27 | 28 | \item{...}{One or more selector functions to choose which variables are 29 | affected by the step. See [selections()] for more details. 
For the `tidy` 30 | method, these are not currently used.} 31 | 32 | \item{role}{Not used by this step since no new variables are created.} 33 | 34 | \item{trained}{For recipes functionality} 35 | 36 | \item{transform_info}{A numeric vector of transformation values. This (was 37 | transform_info) is `NULL` until computed by [prep.recipe()].} 38 | 39 | \item{transform_options}{options to be passed to orderNorm} 40 | 41 | \item{num_unique}{An integer where data that have fewer possible values will 42 | not be evaluated for a transformation.} 43 | 44 | \item{skip}{For recipes functionality} 45 | 46 | \item{id}{For recipes functionality} 47 | 48 | \item{x}{A `step_orderNorm` object.} 49 | } 50 | \value{ 51 | An updated version of `recipe` with the new step added to the 52 | sequence of existing steps (if any). For the `tidy` method, a tibble with 53 | columns `terms` (the selectors or variables selected) and `value` (the 54 | lambda estimate). 55 | } 56 | \description{ 57 | `step_orderNorm` creates a specification of a recipe step (see 58 | `recipes` package) that will transform data using the ORQ (orderNorm) 59 | transformation, which approximates the "true" normalizing transformation if 60 | one exists. This is considerably faster than `step_bestNormalize`. 61 | } 62 | \details{ 63 | The orderNorm transformation can be used to rescale a variable to be 64 | more similar to a normal distribution. See `?orderNorm` for more 65 | information; `step_orderNorm` is the implementation of `orderNorm` in the 66 | `recipes` context. 67 | 68 | As of version 1.7, the `butcher` package can be used to (hopefully) improve 69 | scalability of this function on bigger data sets. 
70 | } 71 | \examples{ 72 | library(recipes) 73 | rec <- recipe(~ ., data = as.data.frame(iris)) 74 | 75 | orq_trans <- step_orderNorm(rec, all_numeric()) 76 | 77 | orq_estimates <- prep(orq_trans, training = as.data.frame(iris)) 78 | 79 | orq_data <- bake(orq_estimates, as.data.frame(iris)) 80 | 81 | plot(density(iris[, "Petal.Length"]), main = "before") 82 | plot(density(orq_data$Petal.Length), main = "after") 83 | 84 | tidy(orq_trans, number = 1) 85 | tidy(orq_estimates, number = 1) 86 | 87 | 88 | } 89 | \references{ 90 | Ryan A. Peterson (2019). Ordered quantile normalization: a 91 | semiparametric transformation built for the cross-validation era. Journal 92 | of Applied Statistics, 1-16. 93 | } 94 | \seealso{ 95 | \code{\link[bestNormalize]{orderNorm}} \code{\link{bestNormalize}}, 96 | [recipe()] [prep.recipe()] [bake.recipe()] 97 | } 98 | \concept{preprocessing} 99 | \concept{transformation_methods} 100 | -------------------------------------------------------------------------------- /man/yeojohnson.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/yeojohnson.R 3 | \name{yeojohnson} 4 | \alias{yeojohnson} 5 | \alias{predict.yeojohnson} 6 | \alias{print.yeojohnson} 7 | \title{Yeo-Johnson Normalization} 8 | \usage{ 9 | yeojohnson(x, eps = 0.001, standardize = TRUE, ...) 10 | 11 | \method{predict}{yeojohnson}(object, newdata = NULL, inverse = FALSE, ...) 12 | 13 | \method{print}{yeojohnson}(x, ...) 
14 | } 15 | \arguments{ 16 | \item{x}{A vector to normalize with Yeo-Johnson} 17 | 18 | \item{eps}{A value to compare lambda against to see if it is equal to zero} 19 | 20 | \item{standardize}{If TRUE, the transformed values are also centered and 21 | scaled, such that the transformation attempts a standard normal} 22 | 23 | \item{...}{Additional arguments that can be passed to the estimation of the 24 | lambda parameter (lower, upper)} 25 | 26 | \item{object}{an object of class 'yeojohnson'} 27 | 28 | \item{newdata}{a vector of data to be (reverse) transformed} 29 | 30 | \item{inverse}{if TRUE, performs reverse transformation} 31 | } 32 | \value{ 33 | A list of class \code{yeojohnson} with elements 34 | 35 | \item{x.t}{transformed original data} 36 | \item{x}{original data} 37 | \item{mean}{mean after transformation but prior to standardization} 38 | \item{sd}{sd after transformation but prior to standardization} 39 | \item{lambda}{estimated lambda value for skew transformation} 40 | \item{n}{number of nonmissing observations} 41 | \item{norm_stat}{Pearson's P / degrees of freedom} 42 | \item{standardize}{Was the transformation standardized} 43 | 44 | The \code{predict} function returns the numeric value of the transformation 45 | performed on new data, and allows for the inverse transformation as well. 46 | } 47 | \description{ 48 | Perform a Yeo-Johnson Transformation and center/scale a vector to 49 | attempt normalization 50 | } 51 | \details{ 52 | \code{yeojohnson} estimates the optimal value of lambda for the 53 | Yeo-Johnson transformation. This transformation can be performed on new 54 | data, and inverted, via the \code{predict} function. 55 | 56 | The Yeo-Johnson is similar to the Box-Cox method, however it allows for the 57 | transformation of nonpositive data as well. The \code{step_YeoJohnson} 58 | function in the \code{recipes} package is another useful resource (see 59 | references). 
60 | } 61 | \examples{ 62 | 63 | x <- rgamma(100, 1, 1) 64 | 65 | yeojohnson_obj <- yeojohnson(x) 66 | yeojohnson_obj 67 | p <- predict(yeojohnson_obj) 68 | x2 <- predict(yeojohnson_obj, newdata = p, inverse = TRUE) 69 | 70 | all.equal(x2, x) 71 | 72 | } 73 | \references{ 74 | Yeo, I. K., & Johnson, R. A. (2000). A new family of power 75 | transformations to improve normality or symmetry. Biometrika. 76 | 77 | Max Kuhn and Hadley Wickham (2017). recipes: Preprocessing Tools to Create 78 | Design Matrices. R package version 0.1.0.9000. 79 | https://github.com/topepo/recipes 80 | } 81 | -------------------------------------------------------------------------------- /revdep/.gitignore: -------------------------------------------------------------------------------- 1 | checks 2 | library 3 | checks.noindex 4 | library.noindex 5 | cloud.noindex 6 | data.sqlite 7 | *.html 8 | -------------------------------------------------------------------------------- /revdep/README.md: -------------------------------------------------------------------------------- 1 | # Platform 2 | 3 | |field |value | 4 | |:--------|:----------------------------------------| 5 | |version |R version 4.2.1 (2022-06-23) | 6 | |os |macOS Ventura 13.0.1 | 7 | |system |x86_64, darwin17.0 | 8 | |ui |RStudio | 9 | |language |(EN) | 10 | |collate |en_US.UTF-8 | 11 | |ctype |en_US.UTF-8 | 12 | |tz |America/Denver | 13 | |date |2023-08-17 | 14 | |rstudio |2022.12.0+353 Elsbeth Geranium (desktop) | 15 | |pandoc |NA | 16 | 17 | # Dependencies 18 | 19 | |package |old |new |Δ | 20 | |:-------------|:-----|:------|:--| 21 | |bestNormalize |1.9.0 |1.9.1 |* | 22 | |clock |NA |0.7.0 |* | 23 | |cpp11 |NA |0.4.6 |* | 24 | |digest |NA |0.6.33 |* | 25 | |dplyr |NA |1.1.2 |* | 26 | |future |NA |1.33.0 |* | 27 | |future.apply |NA |1.11.0 |* | 28 | |ggplot2 |NA |3.4.3 |* | 29 | |lamW |NA |2.2.0 |* | 30 | |parallelly |NA |1.36.0 |* | 31 | |progressr |NA |0.14.0 |* | 32 | |purrr |NA |1.0.2 |* | 33 | |Rcpp |NA |1.0.11 |* | 
34 | |recipes |NA |1.0.7 |* | 35 | |rlang |NA |1.1.1 |* | 36 | |tzdb |NA |0.4.0 |* | 37 | |vctrs |NA |0.6.3 |* | 38 | |viridisLite |NA |0.4.2 |* | 39 | 40 | # Revdeps 41 | 42 | -------------------------------------------------------------------------------- /revdep/cran.md: -------------------------------------------------------------------------------- 1 | ## revdepcheck results 2 | 3 | We checked 3 reverse dependencies (2 from CRAN + 1 from Bioconductor), comparing R CMD check results across CRAN and dev versions of this package. 4 | 5 | * We saw 0 new problems 6 | * We failed to check 0 packages 7 | 8 | -------------------------------------------------------------------------------- /revdep/failures.md: -------------------------------------------------------------------------------- 1 | *Wow, no problems at all. :)* -------------------------------------------------------------------------------- /revdep/problems.md: -------------------------------------------------------------------------------- 1 | *Wow, no problems at all. 
:)* -------------------------------------------------------------------------------- /tests/testthat-bestNormalize.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(bestNormalize) 3 | 4 | test_check(package = 'bestNormalize', filter = "bestNormalize") 5 | -------------------------------------------------------------------------------- /tests/testthat-custom-parallel.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(bestNormalize) 3 | 4 | test_check(package = 'bestNormalize', filter = "parallel|custom") 5 | -------------------------------------------------------------------------------- /tests/testthat-main-transforms.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(bestNormalize) 3 | 4 | test_check(package = 'bestNormalize', filter = "boxcox|yeojohnson|lambert|orderNorm") 5 | -------------------------------------------------------------------------------- /tests/testthat-methods-step.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(bestNormalize) 3 | 4 | test_check(package = 'bestNormalize', filter = "methods|step") 5 | -------------------------------------------------------------------------------- /tests/testthat-minor-transforms.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(bestNormalize) 3 | 4 | test_check(package = 'bestNormalize', filter = "arcsinh|binarize|exp_x|log|no_transform|sqrt|center_scale") 5 | -------------------------------------------------------------------------------- /tests/testthat/Rplots.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petersonR/bestNormalize/46c0ea73e57eb2130bac9078b501688fd48d65c9/tests/testthat/Rplots.pdf 
-------------------------------------------------------------------------------- /tests/testthat/test_arcsinh_x.R: -------------------------------------------------------------------------------- 1 | context('arcsinh_x functionality') 2 | 3 | data(iris) 4 | train <- iris$Petal.Width 5 | arcsinh_x_obj <- arcsinh_x(train) 6 | 7 | test_that('arcsinh_x Transforms original data consistently', { 8 | expect_equal(arcsinh_x_obj$x.t, predict(arcsinh_x_obj)) 9 | expect_equal(arcsinh_x_obj$x, predict(arcsinh_x_obj, inverse = TRUE)) 10 | }) 11 | 12 | test_that('arcsinh_x Transforms new data consistently', { 13 | nd <- seq(0, 4, length = 100) 14 | pred <- predict(arcsinh_x_obj, newdata = nd) 15 | expect_true(!any(is.na(pred))) 16 | 17 | nd2 <- predict(arcsinh_x_obj, newdata = pred, inverse = TRUE) 18 | expect_equal(nd, nd2) 19 | }) 20 | 21 | test_that('arcsinh_x correctly handles missing original data', { 22 | b <- arcsinh_x(c(NA, train)) 23 | expect_equal(as.numeric(NA), b$x.t[1]) 24 | expect_equal(as.numeric(NA), predict(b)[1]) 25 | expect_equal(as.numeric(NA), predict(b, inverse = TRUE)[1]) 26 | }) 27 | 28 | test_that('arcsinh_x correctly handles missing new data', { 29 | b <- arcsinh_x(train) 30 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA))[2]) 31 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA), inverse = TRUE)[2]) 32 | }) 33 | 34 | # Test standardization 35 | arcsinh_x_obj <- arcsinh_x(train, standardize = FALSE) 36 | 37 | test_that('arcsinh_x Transforms original data consistently', { 38 | expect_equal(arcsinh_x_obj$x.t, predict(arcsinh_x_obj)) 39 | expect_equal(arcsinh_x_obj$x, predict(arcsinh_x_obj, inverse = TRUE)) 40 | }) 41 | 42 | test_that('arcsinh_x Transforms new data consistently', { 43 | nd <- seq(0, 4, length = 100) 44 | pred <- predict(arcsinh_x_obj, newdata = nd) 45 | expect_true(!any(is.na(pred))) 46 | 47 | nd2 <- predict(arcsinh_x_obj, newdata = pred, inverse = TRUE) 48 | expect_equal(nd, nd2) 49 | }) 50 | 51 | 
test_that('arcsinh_x correctly handles missing original data', { 52 | b <- arcsinh_x(c(NA, train), standardize = FALSE) 53 | expect_equal(as.numeric(NA), b$x.t[1]) 54 | expect_equal(as.numeric(NA), predict(b)[1]) 55 | expect_equal(as.numeric(NA), predict(b, inverse = TRUE)[1]) 56 | }) 57 | 58 | test_that('arcsinh_x correctly handles missing new data', { 59 | b <- arcsinh_x(train, standardize = FALSE) 60 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA))[2]) 61 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA), inverse = TRUE)[2]) 62 | }) 63 | 64 | -------------------------------------------------------------------------------- /tests/testthat/test_bestNormalize.R: -------------------------------------------------------------------------------- 1 | context('bestNormalize functionality') 2 | 3 | data(iris) 4 | train <- iris$Petal.Width 5 | 6 | BNobject <- suppressWarnings(bestNormalize(train, out_of_sample = FALSE, quiet = TRUE)) 7 | BNobject4 <- bestNormalize(train, allow_orderNorm = FALSE, out_of_sample = FALSE, quiet = TRUE) 8 | BNobject5 <- suppressWarnings(bestNormalize(train, out_of_sample = TRUE, quiet = TRUE)) 9 | # BNobject6 <- suppressWarnings(bestNormalize(train, quiet = TRUE, loo = TRUE, allow_lambert_s = TRUE)) 10 | 11 | # Test transformations 12 | test_that('BestNormalize transformations with positive data', { 13 | 14 | working_transforms <- c(names(BNobject$other_transforms), class(BNobject$chosen_transform)) 15 | 16 | expect_equal(working_transforms, 17 | c("arcsinh_x", "boxcox", "center_scale", "double_reverse_log", "exp_x", 18 | "log_x", "sqrt_x", "yeojohnson", "orderNorm")) 19 | 20 | expect_equal(BNobject$x.t, predict(BNobject)) 21 | expect_equal(BNobject$x, predict(BNobject, inverse = TRUE)) 22 | expect_equal(BNobject4$x.t, predict(BNobject4)) 23 | expect_equal(BNobject4$x, predict(BNobject4, inverse = TRUE)) 24 | }) 25 | 26 | # 27 | BNobject <- suppressWarnings(bestNormalize(c(-1, train), quiet = TRUE)) 28 | BNobject4 <- 
suppressWarnings(bestNormalize(c(-1, train), allow_orderNorm = FALSE, out_of_sample = FALSE, quiet = TRUE)) 29 | test_that('BestNormalize transformations with mixed data, in-sample', { 30 | expect_equal(BNobject$x.t, predict(BNobject)) 31 | expect_equal(BNobject$x, predict(BNobject, inverse = TRUE)) 32 | expect_equal(BNobject4$x.t, predict(BNobject4), check.attributes = FALSE) 33 | expect_equal(BNobject4$x, predict(BNobject4, inverse = TRUE)) 34 | }) 35 | 36 | 37 | BNobject <- suppressWarnings(bestNormalize(c(-train), out_of_sample = FALSE, quiet = TRUE)) 38 | BNobject4 <- suppressWarnings(bestNormalize(c(-train), allow_orderNorm = FALSE, out_of_sample = FALSE, quiet = TRUE)) 39 | test_that('BestNormalize transformations with negative data', { 40 | expect_equal(BNobject$x.t, predict(BNobject)) 41 | expect_equal(BNobject$x, predict(BNobject, inverse = TRUE)) 42 | expect_equal(BNobject4$x.t, predict(BNobject4)) 43 | expect_equal(BNobject4$x, predict(BNobject4, inverse = TRUE)) 44 | }) 45 | 46 | train2 <- c(train, -1, NA) 47 | BNobject <- suppressWarnings(bestNormalize(train2, out_of_sample = FALSE, quiet = TRUE)) 48 | BNobject4 <- suppressWarnings(bestNormalize(train2, allow_orderNorm = FALSE, out_of_sample = FALSE, quiet = TRUE)) 49 | test_that('BestNormalize transformations with mixed data and missing values', { 50 | expect_equal(BNobject$x.t, predict(BNobject)) 51 | expect_equal(BNobject$x, predict(BNobject, inverse = TRUE)) 52 | expect_equal(BNobject4$x.t, predict(BNobject4), check.attributes = FALSE) 53 | expect_equal(BNobject4$x, predict(BNobject4, inverse = TRUE)) 54 | }) 55 | 56 | test_that('bestNormalize handles missing original data', { 57 | suppressWarnings(b <- bestNormalize(c(NA, train), out_of_sample = FALSE, quiet = TRUE)) 58 | expect_equal(as.numeric(NA), b$x.t[1]) 59 | expect_equal(as.numeric(NA), predict(b)[1]) 60 | expect_equal(as.numeric(NA), predict(b, inverse = TRUE)[1]) 61 | }) 62 | 63 | test_that('bestNormalize handles missing new data', { 64 
| suppressWarnings(b <- bestNormalize(train, out_of_sample = FALSE, quiet = TRUE)) 65 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA))[2]) 66 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA), inverse = TRUE)[2]) 67 | }) 68 | 69 | # Test standardize = FALSE 70 | train2 <- c(train, -1, NA) 71 | BNobject <- suppressWarnings(bestNormalize(train2, standardize = FALSE, out_of_sample = FALSE, quiet = TRUE)) 72 | BNobject4 <- suppressWarnings(bestNormalize(train2, standardize = FALSE, 73 | allow_orderNorm = FALSE, out_of_sample = FALSE, quiet = TRUE)) 74 | test_that('BestNormalize transformations without standardization', { 75 | expect_equal(BNobject$x.t, predict(BNobject)) 76 | expect_equal(BNobject$x, predict(BNobject, inverse = TRUE)) 77 | expect_equal(BNobject4$x.t, predict(BNobject4), check.attributes = FALSE) 78 | expect_equal(BNobject4$x, predict(BNobject4, inverse = TRUE)) 79 | }) 80 | 81 | test_that('bestNormalize without standardization handles missing original data', { 82 | suppressWarnings(b <- bestNormalize(c(NA, train), standardize = FALSE, out_of_sample = FALSE, quiet = TRUE)) 83 | expect_equal(as.numeric(NA), b$x.t[1]) 84 | expect_equal(as.numeric(NA), predict(b)[1]) 85 | expect_equal(as.numeric(NA), predict(b, inverse = TRUE)[1]) 86 | }) 87 | 88 | test_that('bestNormalize without standardization handles missing new data', { 89 | suppressWarnings(b <- bestNormalize(train, standardize = FALSE, out_of_sample = FALSE, quiet = TRUE)) 90 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA))[2]) 91 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA), inverse = TRUE)[2]) 92 | }) 93 | 94 | # Test lambert functionality in bestNormalize 95 | test_that("bestNormalize works with lambert of type s", { 96 | skip_on_cran() 97 | skip_on_ci() 98 | b <- suppressWarnings(bestNormalize(train, allow_lambert_s = TRUE, quiet = TRUE, out_of_sample = FALSE)) 99 | expect_true(!is.null(b$other_transforms$lambert_s)) 100 | 
expect_true(is.null(b$other_transforms$lambert_h)) 101 | expect_true(is.null(b$other_transforms$lambert_hh)) 102 | }) 103 | 104 | test_that("bestNormalize works with lambert of type h", { 105 | skip_on_cran() 106 | skip_on_ci() 107 | b <- suppressWarnings(bestNormalize(train, allow_lambert_h = TRUE, allow_lambert_s = FALSE, quiet = TRUE, out_of_sample = FALSE)) 108 | expect_true(is.null(b$other_transforms$lambert_s)) 109 | expect_true(!is.null(b$other_transforms$lambert_h)) 110 | expect_true(!is.null(b$other_transforms$lambert_hh)) 111 | }) 112 | 113 | test_that("bestNormalize works with lambert of type h", { 114 | skip_on_cran() 115 | skip_on_ci() 116 | b <- suppressWarnings(bestNormalize(train, allow_lambert_h = TRUE, allow_lambert_s = FALSE, quiet = TRUE, out_of_sample = FALSE)) 117 | expect_true(is.null(b$other_transforms$lambert_s)) 118 | expect_true(!is.null(b$other_transforms$lambert_h)) 119 | expect_true(!is.null(b$other_transforms$lambert_hh)) 120 | }) 121 | 122 | test_that("options work for bestNormalize", { 123 | ## Log_x 124 | expect_silent(b <- bestNormalize(train, tr_opts = list(log_x = list(a = 1)), warn = FALSE)) 125 | expect_equal(b$other_transforms$log_x$a, 1) 126 | expect_silent(b <- bestNormalize(train, tr_opts = list(log_x = list(a = 100)), warn = FALSE)) 127 | expect_equal(b$other_transforms$log_x$a, 100) 128 | 129 | expect_silent(b <- bestNormalize(train, tr_opts = list(log_x = list(a = 1, b = 5)), warn = FALSE)) 130 | expect_equal(b$other_transforms$log_x$b, 5) 131 | expect_silent(b <- bestNormalize(train, tr_opts = list(log_x = list(a = 100, b = 10)), warn = FALSE)) 132 | expect_equal(b$other_transforms$log_x$b, 10) 133 | 134 | expect_silent(b <- bestNormalize(train, tr_opts = list(log_x = list(eps = 1)), warn = FALSE)) 135 | expect_equal(b$other_transforms$log_x$eps, 1) 136 | expect_silent(b <- bestNormalize(train, tr_opts = list(log_x = list(eps = 100)), warn = FALSE)) 137 | expect_equal(b$other_transforms$log_x$eps, 100) 138 | 139 | ## 
Sqrt_x 140 | expect_silent(b <- bestNormalize(train, tr_opts = list(sqrt_x = list(a = 1)), warn = FALSE)) 141 | expect_equal(b$other_transforms$sqrt_x$a, 1) 142 | expect_silent(b <- bestNormalize(train, tr_opts = list(sqrt_x = list(a = 100)), warn = FALSE)) 143 | expect_equal(b$other_transforms$sqrt_x$a, 100) 144 | 145 | ## yeo_johnson 146 | expect_silent(b <- bestNormalize(train, tr_opts = list(yeojohnson = list(eps = 0.1)), warn = FALSE)) 147 | expect_equal(b$other_transforms$yeojohnson$eps, .1) 148 | expect_silent(b <- bestNormalize(train, tr_opts = list(yeojohnson = list(eps = .01)), warn = FALSE)) 149 | expect_equal(b$other_transforms$yeojohnson$eps, .01) 150 | }) 151 | -------------------------------------------------------------------------------- /tests/testthat/test_binarize.R: -------------------------------------------------------------------------------- 1 | context('binarize functionality') 2 | 3 | data(iris) 4 | 5 | train <- iris$Petal.Width 6 | binarize.obj <- binarize(train) 7 | bn2 <- binarize(train, "mean") 8 | bn3 <- binarize(train, "mode") 9 | 10 | test_that('binarize ransforms original data consistently' , { 11 | expect_equal(binarize.obj$x.t, predict(binarize.obj)) 12 | expect_equal(bn2$x.t, predict(bn2)) 13 | expect_equal(bn3$x.t, predict(bn3)) 14 | }) 15 | 16 | test_that('binarize Transforms new data', { 17 | nd <- seq(0, 4, length = 100) 18 | pred <- predict(binarize.obj, newdata = nd) 19 | expect_true(!any(is.na(pred))) 20 | 21 | pred <- predict(bn2, newdata = nd) 22 | expect_true(!any(is.na(pred))) 23 | 24 | pred <- predict(bn3, newdata = nd) 25 | expect_true(!any(is.na(pred))) 26 | }) 27 | 28 | test_that('binarize correctly handles missing original data', { 29 | b <- binarize(c(NA, train)) 30 | expect_equal(as.numeric(NA), b$x.t[1]) 31 | expect_equal(as.numeric(NA), predict(b)[1]) 32 | expect_equal(as.numeric(NA), as.numeric(predict(b, inverse = TRUE)[1])) 33 | 34 | b <- binarize(c(NA, train), "mean") 35 | expect_equal(as.numeric(NA), 
b$x.t[1]) 36 | expect_equal(as.numeric(NA), predict(b)[1]) 37 | expect_equal(as.numeric(NA), as.numeric(predict(b, inverse = TRUE)[1])) 38 | 39 | b <- binarize(c(NA, train), "mode") 40 | expect_equal(as.numeric(NA), b$x.t[1]) 41 | expect_equal(as.numeric(NA), predict(b)[1]) 42 | expect_equal(as.numeric(NA), as.numeric(predict(b, inverse = TRUE)[1])) 43 | }) 44 | 45 | test_that('binarize correctly handles missing new data', { 46 | b <- binarize(train) 47 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA))[2]) 48 | expect_equal(as.numeric(NA), as.numeric(predict(b, newdata = c(1, NA), inverse = TRUE)[2])) 49 | 50 | b <- binarize(train, "mean") 51 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA))[2]) 52 | expect_equal(as.numeric(NA), as.numeric(predict(b, newdata = c(1, NA), inverse = TRUE)[2])) 53 | 54 | b <- binarize(train, "mode") 55 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA))[2]) 56 | expect_equal(as.numeric(NA), as.numeric(predict(b, newdata = c(1, NA), inverse = TRUE)[2])) 57 | }) -------------------------------------------------------------------------------- /tests/testthat/test_bn_parallel.R: -------------------------------------------------------------------------------- 1 | context('bestNormalize parallel functionality') 2 | 3 | skip_on_cran() 4 | set.seed(1) 5 | 6 | data(iris) 7 | iris <- iris[1:30,] 8 | 9 | train <- iris$Petal.Width 10 | 11 | cl <- parallel::makeCluster(2) 12 | 13 | test_that("Parallel functionality works for RCV", { 14 | b <- bestNormalize(train, warn = FALSE, cluster = cl, r = 2, quiet = TRUE) 15 | expect_true(is.null(b$other_transforms$lambert_h)) 16 | 17 | b <- bestNormalize(train, warn = FALSE, cluster = cl, allow_lambert_s = TRUE, r = 2, quiet = TRUE) 18 | expect_true(!is.null(b$other_transforms$lambert_s)) 19 | expect_true(is.null(b$other_transforms$lambert_h)) 20 | 21 | b <- bestNormalize(train, warn = FALSE, cluster = cl, allow_lambert_h = TRUE, allow_lambert_s = FALSE, r = 2, quiet = TRUE) 
22 | expect_true(is.null(b$other_transforms$lambert_s)) 23 | expect_true(!is.null(b$other_transforms$lambert_h)) 24 | 25 | b <- bestNormalize(train, warn = FALSE, cluster = cl, allow_lambert_h = TRUE, allow_lambert_s = TRUE, r = 2, quiet = TRUE) 26 | expect_true(!is.null(b$other_transforms$lambert_s)) 27 | expect_true(!is.null(b$other_transforms$lambert_h)) 28 | 29 | }) 30 | 31 | train <- iris$Petal.Width 32 | 33 | test_that("Parallel functionality works for LOO", { 34 | b <- bestNormalize(train, warn = FALSE, cluster = cl, loo = TRUE, quiet = TRUE) 35 | b <- bestNormalize(train, warn = FALSE, allow_lambert_h = TRUE, 36 | allow_lambert_s = TRUE, loo = TRUE, quiet = FALSE) 37 | expect_true(!is.null(b$other_transforms$lambert_s)) 38 | expect_true(!is.null(b$other_transforms$lambert_h)) 39 | 40 | }) 41 | 42 | ## Test custom functions and parallelization 43 | 44 | ## Define user-function 45 | cuberoot_x <- function(x, a = NULL, standardize = TRUE) { 46 | stopifnot(is.numeric(x)) 47 | 48 | min_a <- max(0, -(min(x, na.rm = TRUE))) 49 | if(!length(a)) 50 | a <- min_a 51 | if(a < min_a) { 52 | warning("Setting a < max(0, -(min(x))) can lead to transformation issues", 53 | "Standardize set to FALSE") 54 | standardize <- FALSE 55 | } 56 | 57 | 58 | x.t <- (x + a)^(1/3) 59 | mu <- mean(x.t, na.rm = TRUE) 60 | sigma <- sd(x.t, na.rm = TRUE) 61 | if (standardize) x.t <- (x.t - mu) / sigma 62 | 63 | # Get in-sample normality statistic results 64 | ptest <- nortest::pearson.test(x.t) 65 | 66 | val <- list( 67 | x.t = x.t, 68 | x = x, 69 | mean = mu, 70 | sd = sigma, 71 | a = a, 72 | n = length(x.t) - sum(is.na(x)), 73 | norm_stat = unname(ptest$statistic / ptest$df), 74 | standardize = standardize 75 | ) 76 | 77 | # Assign class 78 | class(val) <- c('cuberoot_x', class(val)) 79 | val 80 | } 81 | 82 | predict.cuberoot_x <- function(object, newdata = NULL, inverse = FALSE, ...) 
{ 83 | 84 | # If no data supplied and not inverse 85 | if (is.null(newdata) & !inverse) 86 | newdata <- object$x 87 | 88 | # If no data supplied and inverse 89 | if (is.null(newdata) & inverse) 90 | newdata <- object$x.t 91 | 92 | # Actually performing transformations 93 | 94 | # Perform inverse transformation as estimated 95 | if (inverse) { 96 | 97 | # Reverse-standardize 98 | if (object$standardize) 99 | newdata <- newdata * object$sd + object$mean 100 | 101 | # Reverse-cube-root (cube) 102 | newdata <- newdata^3 - object$a 103 | 104 | 105 | # Otherwise, perform transformation as estimated 106 | } else if (!inverse) { 107 | # Take cube root 108 | newdata <- (newdata + object$a)^(1/3) 109 | 110 | # Standardize to mean 0, sd 1 111 | if (object$standardize) 112 | newdata <- (newdata - object$mean) / object$sd 113 | } 114 | 115 | # Return transformed data 116 | unname(newdata) 117 | } 118 | 119 | ## Define user-function 120 | quadroot_x <- function(x, a = NULL, standardize = TRUE) { 121 | stopifnot(is.numeric(x)) 122 | 123 | min_a <- max(0, -(min(x, na.rm = TRUE))) 124 | if(!length(a)) 125 | a <- min_a 126 | if(a < min_a) { 127 | warning("Setting a < max(0, -(min(x))) can lead to transformation issues", 128 | "Standardize set to FALSE") 129 | standardize <- FALSE 130 | } 131 | 132 | 133 | x.t <- (x + a)^(1/4) 134 | mu <- mean(x.t, na.rm = TRUE) 135 | sigma <- sd(x.t, na.rm = TRUE) 136 | if (standardize) x.t <- (x.t - mu) / sigma 137 | 138 | # Get in-sample normality statistic results 139 | ptest <- nortest::pearson.test(x.t) 140 | 141 | val <- list( 142 | x.t = x.t, 143 | x = x, 144 | mean = mu, 145 | sd = sigma, 146 | a = a, 147 | n = length(x.t) - sum(is.na(x)), 148 | norm_stat = unname(ptest$statistic / ptest$df), 149 | standardize = standardize 150 | ) 151 | 152 | # Assign class 153 | class(val) <- c('quadroot_x', class(val)) 154 | val 155 | } 156 | 157 | predict.quadroot_x <- function(object, newdata = NULL, inverse = FALSE, ...) 
{ 158 | 159 | # If no data supplied and not inverse 160 | if (is.null(newdata) & !inverse) 161 | newdata <- object$x 162 | 163 | # If no data supplied and inverse 164 | if (is.null(newdata) & inverse) 165 | newdata <- object$x.t 166 | 167 | # Actually performing transformations 168 | 169 | # Perform inverse transformation as estimated 170 | if (inverse) { 171 | 172 | # Reverse-standardize 173 | if (object$standardize) 174 | newdata <- newdata * object$sd + object$mean 175 | 176 | # Reverse-quad-root (quad) 177 | newdata <- newdata^4 - object$a 178 | 179 | 180 | # Otherwise, perform transformation as estimated 181 | } else if (!inverse) { 182 | # Take quad root 183 | newdata <- (newdata + object$a)^(1/4) 184 | 185 | # Standardize to mean 0, sd 1 186 | if (object$standardize) 187 | newdata <- (newdata - object$mean) / object$sd 188 | } 189 | 190 | # Return transformed data 191 | unname(newdata) 192 | } 193 | 194 | 195 | new_transforms <- list( 196 | quadroot_x = quadroot_x, 197 | predict.quadroot_x = predict.quadroot_x, 198 | cuberoot_x = cuberoot_x, 199 | predict.cuberoot_x = predict.cuberoot_x 200 | ) 201 | 202 | test_that("Parallel functionality works for RCV", { 203 | expect_silent(b <- bestNormalize(train, warn = FALSE, cluster = cl, r = 2, quiet = TRUE, new_transforms = new_transforms)) 204 | expect_true(is.null(b$other_transforms$lambert_h)) 205 | 206 | expect_silent(b <- bestNormalize(train, warn = FALSE, cluster = cl, allow_lambert_s = TRUE, 207 | r = 2, quiet = TRUE, new_transforms = new_transforms)) 208 | expect_true(!is.null(b$other_transforms$lambert_s)) 209 | expect_true(is.null(b$other_transforms$lambert_h)) 210 | 211 | expect_silent(b <- bestNormalize(train, warn = FALSE, cluster = cl, allow_lambert_h = TRUE, 212 | r = 2, quiet = TRUE, new_transforms = new_transforms)) 213 | expect_true(!is.null(b$other_transforms$lambert_h)) 214 | 215 | expect_silent(b <- bestNormalize(train, warn = FALSE, cluster = cl, allow_lambert_h = TRUE, 216 | allow_lambert_s = 
TRUE, r = 2, quiet = TRUE, new_transforms = new_transforms)) 217 | expect_true(!is.null(b$other_transforms$lambert_s)) 218 | expect_true(!is.null(b$other_transforms$lambert_h)) 219 | 220 | }) 221 | 222 | train <- iris$Petal.Width 223 | 224 | test_that("Parallel functionality works for LOO", { 225 | b <- bestNormalize(train, warn = FALSE, cluster = cl, loo = TRUE, quiet = TRUE, new_transforms = new_transforms) 226 | b <- bestNormalize(train, warn = FALSE, cluster = cl, allow_lambert_h = TRUE, 227 | allow_lambert_s = TRUE, loo = TRUE, quiet = TRUE, new_transforms = new_transforms) 228 | expect_true(!is.null(b$other_transforms$lambert_s)) 229 | expect_true(!is.null(b$other_transforms$lambert_h)) 230 | 231 | }) 232 | 233 | 234 | parallel::stopCluster(cl) 235 | 236 | -------------------------------------------------------------------------------- /tests/testthat/test_boxcox.R: -------------------------------------------------------------------------------- 1 | context('boxcox functionality') 2 | 3 | data(iris) 4 | train <- iris$Petal.Width 5 | boxcox_obj <- boxcox(train) 6 | 7 | test_that('boxcox Transforms original data consistently', { 8 | expect_equal(boxcox_obj$x.t, predict(boxcox_obj)) 9 | expect_equal(boxcox_obj$x, predict(boxcox_obj, inverse = T)) 10 | }) 11 | 12 | test_that('boxcox Transforms new data consistently', { 13 | nd <- seq(0, 4, length = 100) 14 | pred <- predict(boxcox_obj, newdata = nd) 15 | expect_true(!any(is.na(pred))) 16 | 17 | nd2 <- predict(boxcox_obj, newdata = pred, inverse = TRUE) 18 | expect_equal(nd, nd2) 19 | }) 20 | 21 | test_that('boxcox does not try to estimate with negatives' , { 22 | expect_error(boxcox(c(-1, 2, 3))) 23 | }) 24 | 25 | test_that('boxcox correctly handles missing original data', { 26 | b <- boxcox(c(NA, train)) 27 | expect_equal(as.numeric(NA), b$x.t[1]) 28 | expect_equal(as.numeric(NA), predict(b)[1]) 29 | expect_equal(as.numeric(NA), predict(b, inverse = TRUE)[1]) 30 | }) 31 | 32 | test_that('boxcox correctly handles 
missing new data', { 33 | b <- boxcox(train) 34 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA))[2]) 35 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA), inverse = TRUE)[2]) 36 | }) 37 | 38 | # Test standardization 39 | boxcox_obj <- boxcox(train, standardize = FALSE) 40 | 41 | test_that('boxcox Transforms original data consistently', { 42 | expect_equal(boxcox_obj$x.t, predict(boxcox_obj)) 43 | expect_equal(boxcox_obj$x, predict(boxcox_obj, inverse = TRUE)) 44 | }) 45 | 46 | test_that('boxcox Transforms new data consistently', { 47 | nd <- seq(0, 4, length = 100) 48 | pred <- predict(boxcox_obj, newdata = nd) 49 | expect_true(!any(is.na(pred))) 50 | 51 | nd2 <- predict(boxcox_obj, newdata = pred, inverse = TRUE) 52 | expect_equal(nd, nd2) 53 | }) 54 | 55 | test_that('boxcox does not try to estimate with negatives' , { 56 | expect_error(boxcox(c(-1, 2, 3), standardize = FALSE)) 57 | }) 58 | 59 | test_that('boxcox correctly handles missing original data', { 60 | b <- boxcox(c(NA, train), standardize = FALSE) 61 | expect_equal(as.numeric(NA), b$x.t[1]) 62 | expect_equal(as.numeric(NA), predict(b)[1]) 63 | expect_equal(as.numeric(NA), predict(b, inverse = TRUE)[1]) 64 | }) 65 | 66 | test_that('boxcox correctly handles missing new data', { 67 | b <- boxcox(train, standardize = FALSE) 68 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA))[2]) 69 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA), inverse = TRUE)[2]) 70 | }) 71 | -------------------------------------------------------------------------------- /tests/testthat/test_double_reverse_log.R: -------------------------------------------------------------------------------- 1 | context('double_reverse_log functionality') 2 | 3 | data(iris) 4 | train <- iris$Petal.Width 5 | double_reverse_log_obj <- double_reverse_log(train) 6 | 7 | test_that('double_reverse_log Transforms original data consistently', { 8 | expect_equal(double_reverse_log_obj$x.t, 
predict(double_reverse_log_obj)) 9 | expect_equal(double_reverse_log_obj$x, predict(double_reverse_log_obj, inverse = TRUE)) 10 | }) 11 | 12 | test_that('double_reverse_log Transforms new data consistently', { 13 | nd <- seq(0, 2.7, length = 100) 14 | pred <- predict(double_reverse_log_obj, newdata = nd) 15 | expect_true(!any(is.na(pred))) 16 | 17 | nd2 <- predict(double_reverse_log_obj, newdata = pred, inverse = TRUE) 18 | expect_equal(nd, nd2) 19 | }) 20 | 21 | test_that('double_reverse_log correctly handles missing original data', { 22 | b <- double_reverse_log(c(NA, train)) 23 | expect_equal(as.numeric(NA), b$x.t[1]) 24 | expect_equal(as.numeric(NA), predict(b)[1]) 25 | expect_equal(as.numeric(NA), predict(b, inverse = TRUE)[1]) 26 | }) 27 | 28 | test_that('double_reverse_log correctly handles missing new data', { 29 | b <- double_reverse_log(train) 30 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA))[2]) 31 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA), inverse = TRUE)[2]) 32 | }) 33 | 34 | # Test standardization 35 | double_reverse_log_obj <- double_reverse_log(train, standardize = FALSE) 36 | 37 | test_that('double_reverse_log Transforms original data consistently', { 38 | expect_equal(double_reverse_log_obj$x.t, predict(double_reverse_log_obj)) 39 | expect_equal(double_reverse_log_obj$x, predict(double_reverse_log_obj, inverse = TRUE)) 40 | }) 41 | 42 | test_that('double_reverse_log Transforms new data consistently', { 43 | nd <- seq(1, 2.7, length = 100) 44 | pred <- predict(double_reverse_log_obj, newdata = nd) 45 | expect_true(!any(is.na(pred))) 46 | 47 | nd2 <- predict(double_reverse_log_obj, newdata = pred, inverse = TRUE) 48 | expect_equal(nd, nd2) 49 | }) 50 | 51 | test_that('double_reverse_log correctly handles missing original data', { 52 | b <- double_reverse_log(c(NA, train), standardize = FALSE) 53 | expect_equal(as.numeric(NA), b$x.t[1]) 54 | expect_equal(as.numeric(NA), predict(b)[1]) 55 | 
expect_equal(as.numeric(NA), predict(b, inverse = TRUE)[1]) 56 | }) 57 | 58 | test_that('double_reverse_log correctly handles missing new data', { 59 | b <- double_reverse_log(train, standardize = FALSE) 60 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA))[2]) 61 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA), inverse = TRUE)[2]) 62 | }) 63 | 64 | 65 | double_reverse_log_obj <- double_reverse_log(train, b = exp(1)) 66 | 67 | test_that('double_reverse_log Transforms new data the same regardless of predict order', { 68 | nd <- seq(0, 2.7, length = 100) 69 | 70 | pred <- predict(double_reverse_log_obj, newdata = nd) 71 | nd2 <- predict(double_reverse_log_obj, newdata = pred, inverse = TRUE) 72 | expect_equal(nd, nd2) 73 | 74 | p1 <- sapply(nd, function(nd_i) predict(double_reverse_log_obj, newdata = nd_i)) 75 | p2 <- predict(double_reverse_log_obj, newdata = nd) 76 | expect_equal(p1, p2) 77 | 78 | }) 79 | 80 | double_reverse_log_obj <- double_reverse_log(train, eps = 1.6) 81 | 82 | test_that('double_reverse_log Transforms new data well if padding increased', { 83 | nd <- seq(0, 4, length = 100) 84 | 85 | pred <- predict(double_reverse_log_obj, newdata = nd) 86 | nd2 <- predict(double_reverse_log_obj, newdata = pred, inverse = TRUE) 87 | expect_equal(nd, nd2) 88 | 89 | p1 <- sapply(nd, function(nd_i) predict(double_reverse_log_obj, newdata = nd_i)) 90 | p2 <- predict(double_reverse_log_obj, newdata = nd) 91 | expect_equal(p1, p2) 92 | 93 | }) 94 | -------------------------------------------------------------------------------- /tests/testthat/test_exp_x.R: -------------------------------------------------------------------------------- 1 | context('exp_x functionality') 2 | 3 | data(iris) 4 | train <- iris$Petal.Width 5 | exp_x_obj <- exp_x(train) 6 | 7 | test_that('exp_x Transforms original data consistently', { 8 | expect_equal(exp_x_obj$x.t, predict(exp_x_obj)) 9 | expect_equal(exp_x_obj$x, predict(exp_x_obj, inverse = T)) 10 | }) 11 | 12 | 
test_that('exp_x Transforms new data consistently', { 13 | nd <- seq(0, 4, length = 100) 14 | pred <- predict(exp_x_obj, newdata = nd) 15 | expect_true(!any(is.na(pred))) 16 | 17 | nd2 <- predict(exp_x_obj, newdata = pred, inverse = TRUE) 18 | expect_equal(nd, nd2) 19 | }) 20 | 21 | test_that('exp_x correctly handles missing original data', { 22 | b <- exp_x(c(NA, train)) 23 | expect_equal(as.numeric(NA), b$x.t[1]) 24 | expect_equal(as.numeric(NA), predict(b)[1]) 25 | expect_equal(as.numeric(NA), predict(b, inverse = TRUE)[1]) 26 | }) 27 | 28 | test_that('exp_x correctly handles missing new data', { 29 | b <- exp_x(train) 30 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA))[2]) 31 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA), inverse = TRUE)[2]) 32 | }) 33 | 34 | # Test standardization 35 | exp_x_obj <- exp_x(train, standardize = FALSE) 36 | 37 | test_that('exp_x Transforms original data consistently', { 38 | expect_equal(exp_x_obj$x.t, predict(exp_x_obj)) 39 | expect_equal(exp_x_obj$x, predict(exp_x_obj, inverse = TRUE)) 40 | }) 41 | 42 | test_that('exp_x Transforms new data consistently', { 43 | nd <- seq(0, 4, length = 100) 44 | pred <- predict(exp_x_obj, newdata = nd) 45 | expect_true(!any(is.na(pred))) 46 | 47 | nd2 <- predict(exp_x_obj, newdata = pred, inverse = TRUE) 48 | expect_equal(nd, nd2) 49 | }) 50 | 51 | test_that('exp_x correctly handles missing original data', { 52 | b <- exp_x(c(NA, train), standardize = FALSE) 53 | expect_equal(as.numeric(NA), b$x.t[1]) 54 | expect_equal(as.numeric(NA), predict(b)[1]) 55 | expect_equal(as.numeric(NA), predict(b, inverse = TRUE)[1]) 56 | }) 57 | 58 | test_that('exp_x correctly handles missing new data', { 59 | b <- exp_x(train, standardize = FALSE) 60 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA))[2]) 61 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA), inverse = TRUE)[2]) 62 | }) 63 | 64 | 
-------------------------------------------------------------------------------- /tests/testthat/test_lambert.R: -------------------------------------------------------------------------------- 1 | context('lambert functionality') 2 | 3 | ## LambertW package doesn't work quite right on linux and 4 | # solaris machines (it uses too many threads) 5 | skip_on_os("linux") 6 | skip_on_os("solaris") 7 | 8 | data(iris) 9 | 10 | train <- iris$Petal.Width 11 | 12 | lambert_obj <- lambert(train) 13 | 14 | test_that('lambert Transforms original data consistently' , { 15 | expect_equal(lambert_obj$x.t, predict(lambert_obj)) 16 | expect_equal(lambert_obj$x, predict(lambert_obj, inverse = TRUE)) 17 | }) 18 | 19 | test_that('lambert Transforms new data consistently', { 20 | nd <- seq(-1, 4, length = 100) 21 | pred <- predict(lambert_obj, newdata = nd) 22 | expect_true(!any(is.na(pred))) 23 | 24 | nd2 <- predict(lambert_obj, newdata = pred, inverse = TRUE) 25 | expect_equal(nd, nd2) 26 | }) 27 | 28 | lambert_obj <- lambert(train, standardize = FALSE) 29 | 30 | test_that('lambert without standardization Transforms original data consistently' , { 31 | expect_equal(lambert_obj$x.t, predict(lambert_obj)) 32 | expect_equal(lambert_obj$x, predict(lambert_obj, inverse = TRUE)) 33 | }) 34 | 35 | test_that('lambert without standardization Transforms new data consistently', { 36 | nd <- seq(-1, 4, length = 100) 37 | pred <- predict(lambert_obj, newdata = nd) 38 | expect_true(!any(is.na(pred))) 39 | 40 | nd2 <- predict(lambert_obj, newdata = pred, inverse = TRUE) 41 | expect_equal(nd, nd2) 42 | }) 43 | 44 | # for type = 'h' 45 | 46 | lambert_obj <- lambert(train, type = 'h') 47 | 48 | test_that('lambert Transforms original data consistently' , { 49 | expect_equal(lambert_obj$x.t, predict(lambert_obj)) 50 | expect_equal(lambert_obj$x, predict(lambert_obj, inverse = TRUE)) 51 | }) 52 | 53 | test_that('lambert Transforms new data consistently', { 54 | nd <- seq(0, 4, length = 100) 55 | pred <- 
predict(lambert_obj, newdata = nd) 56 | expect_true(!any(is.na(pred))) 57 | 58 | nd2 <- predict(lambert_obj, newdata = pred, inverse = TRUE) 59 | expect_equal(nd, nd2, tolerance = .001) 60 | }) 61 | 62 | test_that('lambert correctly handles missing original data', { 63 | b <- lambert(c(NA, train)) 64 | expect_equal(as.numeric(NA), b$x.t[1]) 65 | expect_equal(as.numeric(NA), predict(b)[1]) 66 | expect_equal(as.numeric(NA), predict(b, inverse = TRUE)[1]) 67 | }) 68 | 69 | test_that('lambert correctly handles missing new data', { 70 | b <- lambert(train) 71 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA))[2]) 72 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA), inverse = TRUE)[2]) 73 | }) 74 | -------------------------------------------------------------------------------- /tests/testthat/test_log_x.R: -------------------------------------------------------------------------------- 1 | context('log_x functionality') 2 | 3 | data(iris) 4 | train <- iris$Petal.Width 5 | log_x_obj <- log_x(train) 6 | 7 | test_that('log_x Transforms original data consistently', { 8 | expect_equal(log_x_obj$x.t, predict(log_x_obj)) 9 | expect_equal(log_x_obj$x, predict(log_x_obj, inverse = TRUE)) 10 | }) 11 | 12 | test_that('log_x Transforms new data consistently', { 13 | nd <- seq(0, 4, length = 100) 14 | pred <- predict(log_x_obj, newdata = nd) 15 | expect_true(!any(is.na(pred))) 16 | 17 | nd2 <- predict(log_x_obj, newdata = pred, inverse = TRUE) 18 | expect_equal(nd, nd2) 19 | }) 20 | 21 | test_that('log_x correctly handles missing original data', { 22 | b <- log_x(c(NA, train)) 23 | expect_equal(as.numeric(NA), b$x.t[1]) 24 | expect_equal(as.numeric(NA), predict(b)[1]) 25 | expect_equal(as.numeric(NA), predict(b, inverse = TRUE)[1]) 26 | }) 27 | 28 | test_that('log_x correctly handles missing new data', { 29 | b <- log_x(train) 30 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA))[2]) 31 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA), 
inverse = TRUE)[2]) 32 | }) 33 | 34 | # Test standardization 35 | log_x_obj <- log_x(train, standardize = FALSE) 36 | 37 | test_that('log_x Transforms original data consistently', { 38 | expect_equal(log_x_obj$x.t, predict(log_x_obj)) 39 | expect_equal(log_x_obj$x, predict(log_x_obj, inverse = TRUE)) 40 | }) 41 | 42 | test_that('log_x Transforms new data consistently', { 43 | nd <- seq(0, 4, length = 100) 44 | pred <- predict(log_x_obj, newdata = nd) 45 | expect_true(!any(is.na(pred))) 46 | 47 | nd2 <- predict(log_x_obj, newdata = pred, inverse = TRUE) 48 | expect_equal(nd, nd2) 49 | }) 50 | 51 | test_that('log_x correctly handles missing original data', { 52 | b <- log_x(c(NA, train), standardize = FALSE) 53 | expect_equal(as.numeric(NA), b$x.t[1]) 54 | expect_equal(as.numeric(NA), predict(b)[1]) 55 | expect_equal(as.numeric(NA), predict(b, inverse = TRUE)[1]) 56 | }) 57 | 58 | test_that('log_x correctly handles missing new data', { 59 | b <- log_x(train, standardize = FALSE) 60 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA))[2]) 61 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA), inverse = TRUE)[2]) 62 | }) 63 | 64 | 65 | log_x_obj <- log_x(train, a = 1) 66 | 67 | test_that('log_x Transforms new data consistently (given a)', { 68 | nd <- seq(0, 4, length = 100) 69 | pred <- predict(log_x_obj, newdata = nd) 70 | expect_true(!any(is.na(pred))) 71 | 72 | nd2 <- predict(log_x_obj, newdata = pred, inverse = TRUE) 73 | expect_equal(nd, nd2) 74 | }) 75 | 76 | log_x_obj <- log_x(train, a = 1, b = exp(1)) 77 | 78 | test_that('log_x Transforms new data consistently (given a and b)', { 79 | nd <- seq(0, 4, length = 100) 80 | pred <- predict(log_x_obj, newdata = nd) 81 | expect_true(!any(is.na(pred))) 82 | 83 | nd2 <- predict(log_x_obj, newdata = pred, inverse = TRUE) 84 | expect_equal(nd, nd2) 85 | }) 86 | -------------------------------------------------------------------------------- /tests/testthat/test_no_transform.R: 
-------------------------------------------------------------------------------- 1 | context('no_transform functionality') 2 | 3 | data(iris) 4 | train <- iris$Petal.Width 5 | no_transform_obj <- no_transform(train) 6 | 7 | test_that('no_transform Transforms original data consistently', { 8 | expect_equal(no_transform_obj$x.t, predict(no_transform_obj)) 9 | expect_equal(no_transform_obj$x, predict(no_transform_obj, inverse = TRUE)) 10 | }) 11 | 12 | test_that('no_transform Transforms new data consistently', { 13 | nd <- seq(0, 4, length = 100) 14 | pred <- predict(no_transform_obj, newdata = nd) 15 | expect_true(!any(is.na(pred))) 16 | 17 | nd2 <- predict(no_transform_obj, newdata = pred, inverse = TRUE) 18 | expect_equal(nd, nd2) 19 | }) 20 | 21 | test_that('no_transform correctly handles missing original data', { 22 | b <- no_transform(c(NA, train)) 23 | expect_equal(as.numeric(NA), b$x.t[1]) 24 | expect_equal(as.numeric(NA), predict(b)[1]) 25 | expect_equal(as.numeric(NA), predict(b, inverse = TRUE)[1]) 26 | }) 27 | 28 | test_that('no_transform correctly handles missing new data', { 29 | b <- no_transform(train) 30 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA))[2]) 31 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA), inverse = TRUE)[2]) 32 | }) 33 | 34 | data(iris) 35 | train <- iris$Petal.Width 36 | center_scale_obj <- center_scale(train) 37 | 38 | test_that('center_scale Transforms original data consistently', { 39 | expect_equal(center_scale_obj$x.t, predict(center_scale_obj)) 40 | expect_equal(center_scale_obj$x, predict(center_scale_obj, inverse = TRUE)) 41 | }) 42 | 43 | test_that('center_scale Transforms new data consistently', { 44 | nd <- seq(0, 4, length = 100) 45 | pred <- predict(center_scale_obj, newdata = nd) 46 | expect_true(!any(is.na(pred))) 47 | 48 | nd2 <- predict(center_scale_obj, newdata = pred, inverse = TRUE) 49 | expect_equal(nd, nd2) 50 | }) 51 | 52 | test_that('center_scale correctly handles missing original 
data', { 53 | b <- center_scale(c(NA, train)) 54 | expect_equal(as.numeric(NA), b$x.t[1]) 55 | expect_equal(as.numeric(NA), predict(b)[1]) 56 | expect_equal(as.numeric(NA), predict(b, inverse = TRUE)[1]) 57 | }) 58 | 59 | test_that('center_scale correctly handles missing new data', { 60 | b <- center_scale(train) 61 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA))[2]) 62 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA), inverse = TRUE)[2]) 63 | }) 64 | 65 | -------------------------------------------------------------------------------- /tests/testthat/test_orderNorm.R: -------------------------------------------------------------------------------- 1 | context('orderNorm functionality') 2 | 3 | data(iris) 4 | 5 | train <- iris$Petal.Width 6 | orderNorm_obj <- suppressWarnings(orderNorm(train)) 7 | 8 | test_that('orderNorm transforms original data consistently', { 9 | expect_equal(orderNorm_obj$x.t, predict(orderNorm_obj)) 10 | expect_equal(orderNorm_obj$x, predict(orderNorm_obj, inverse = TRUE)) 11 | }) 12 | 13 | test_that('orderNorm Transforms new data consistently', { 14 | nd <- seq(0, 4, length = 100) 15 | expect_warning(pred <- predict(orderNorm_obj, newdata = nd)) 16 | expect_true(!any(is.na(pred))) 17 | expect_warning(nd2 <- predict(orderNorm_obj, newdata = pred, inverse = TRUE)) 18 | expect_equal(nd, nd2) 19 | }) 20 | 21 | test_that('orderNorm correctly handles missing original data', { 22 | suppressWarnings(b <- orderNorm(c(NA, train))) 23 | expect_equal(as.numeric(NA), b$x.t[1]) 24 | expect_equal(as.numeric(NA), predict(b)[1]) 25 | expect_equal(as.numeric(NA), predict(b, inverse = TRUE)[1]) 26 | }) 27 | 28 | test_that('orderNorm correctly handles missing new data', { 29 | suppressWarnings(b <- orderNorm(train)) 30 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA))[2]) 31 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA), inverse = TRUE)[2]) 32 | }) 33 | 34 | test_that('orderNorm Transforms new data 
consistently using n_logit_fit < n', { 35 | orderNorm_obj <- suppressWarnings(orderNorm(train, n_logit_fit = 50)) 36 | nd <- seq(0, 4, length = 100) 37 | expect_warning(pred <- predict(orderNorm_obj, newdata = nd)) 38 | expect_true(!any(is.na(pred))) 39 | expect_warning(nd2 <- predict(orderNorm_obj, newdata = pred, inverse = TRUE)) 40 | expect_equal(nd, nd2) 41 | 42 | expect_equal(nrow(orderNorm_obj$fit$model), 50) 43 | }) -------------------------------------------------------------------------------- /tests/testthat/test_plot_methods.R: -------------------------------------------------------------------------------- 1 | context('plot method functionality') 2 | 3 | data(iris) 4 | 5 | train <- c(NA, -1, iris$Petal.Width) 6 | 7 | bn <- suppressWarnings(bestNormalize(train, quiet = TRUE)) 8 | test_that('bestNormalize plot method runs (in-sample) without error or warning', { 9 | expect_silent(plot(bn)) 10 | }) 11 | 12 | m2plot <- names(bn$other_transforms) 13 | m2plot <- m2plot[!(m2plot %in% c("exp_x"))] 14 | 15 | test_that('bestNormalize plot method runs (out-of-sample) without error or warning', { 16 | expect_warning(plot(bn, bounds = c(-2, 10), methods = m2plot)) 17 | }) 18 | 19 | on <- suppressWarnings(orderNorm(train)) 20 | test_that('orderNorm plot method runs without error or warning', { 21 | expect_silent(plot(on)) 22 | expect_silent(plot(on, bounds = c(-2, 10))) 23 | }) 24 | 25 | test_that('lambert plot method runs without error or warning', { 26 | skip_on_cran() 27 | skip_on_travis() 28 | expect_silent(lw <- suppressWarnings(lambert(train))) 29 | expect_silent(plot(lw)) 30 | }) 31 | 32 | bc <- suppressWarnings(boxcox(train[train > 0])) 33 | test_that('boxcox plot method runs without error or warning', { 34 | expect_silent(plot(bc)) 35 | }) 36 | 37 | yj <- suppressWarnings(yeojohnson(train)) 38 | test_that('yeojohnson plot method runs without error or warning', { 39 | expect_silent(plot(yj)) 40 | }) 41 | 
-------------------------------------------------------------------------------- /tests/testthat/test_print_methods.R: -------------------------------------------------------------------------------- 1 | context('print method functionality') 2 | 3 | data(iris) 4 | 5 | train <- c(NA, -1, iris$Petal.Width) 6 | 7 | bn <- suppressWarnings(bestNormalize(train, quiet = TRUE)) 8 | test_that('bestNormalize print method runs without error or warning', { 9 | expect_output(print(bn)) 10 | }) 11 | 12 | on <- suppressWarnings(orderNorm(train)) 13 | test_that('orderNorm print method runs without error or warning', { 14 | expect_output(print(on)) 15 | }) 16 | 17 | test_that('lambert print method runs without error or warning', { 18 | skip_on_cran() 19 | skip_on_travis() 20 | lw <- suppressWarnings(lambert(train)) 21 | 22 | expect_output(print(lw)) 23 | }) 24 | 25 | bc <- suppressWarnings(boxcox(train[train > 0])) 26 | test_that('boxcox print method runs without error or warning', { 27 | expect_output(print(bc)) 28 | }) 29 | 30 | ex <- suppressWarnings(exp_x(train)) 31 | test_that('exp_x print method runs without error or warning', { 32 | expect_output(print(ex)) 33 | }) 34 | 35 | lx <- suppressWarnings(log_x(train)) 36 | test_that('log_x print method runs without error or warning', { 37 | expect_output(print(lx)) 38 | }) 39 | 40 | nt <- suppressWarnings(no_transform(train)) 41 | test_that('no_transform print method runs without error or warning', { 42 | expect_output(print(nt)) 43 | expect_silent(val <- tidy(nt)) 44 | }) 45 | 46 | cs <- suppressWarnings(center_scale(train)) 47 | test_that('center_scale print method runs without error or warning', { 48 | expect_output(print(cs)) 49 | }) 50 | 51 | sx <- suppressWarnings(sqrt_x(train)) 52 | test_that('sqrt_x print method runs without error or warning', { 53 | expect_output(print(sx)) 54 | }) 55 | 56 | yj <- suppressWarnings(yeojohnson(train)) 57 | test_that('yeojohnson print method runs without error or warning', { 58 | 
expect_output(print(yj)) 59 | }) 60 | 61 | bn <- suppressWarnings(binarize(train)) 62 | test_that('binarize print method runs without error or warning', { 63 | expect_output(print(bn)) 64 | }) 65 | 66 | as <- suppressWarnings(arcsinh_x(train)) 67 | test_that('arcsinh_x print method runs without error or warning', { 68 | expect_output(print(as)) 69 | }) 70 | 71 | 72 | -------------------------------------------------------------------------------- /tests/testthat/test_sqrt_x.R: -------------------------------------------------------------------------------- 1 | context('sqrt_x functionality') 2 | 3 | data(iris) 4 | train <- iris$Petal.Width 5 | sqrt_x_obj <- sqrt_x(train) 6 | 7 | test_that('sqrt_x Transforms original data consistently', { 8 | expect_equal(sqrt_x_obj$x.t, predict(sqrt_x_obj)) 9 | expect_equal(sqrt_x_obj$x, predict(sqrt_x_obj, inverse = TRUE)) 10 | }) 11 | 12 | test_that('sqrt_x Transforms new data consistently', { 13 | nd <- seq(0, 4, length = 100) 14 | pred <- predict(sqrt_x_obj, newdata = nd) 15 | expect_true(!any(is.na(pred))) 16 | 17 | nd2 <- predict(sqrt_x_obj, newdata = pred, inverse = TRUE) 18 | expect_equal(nd, nd2) 19 | }) 20 | 21 | test_that('sqrt_x correctly handles missing original data', { 22 | b <- sqrt_x(c(NA, train)) 23 | expect_equal(as.numeric(NA), b$x.t[1]) 24 | expect_equal(as.numeric(NA), predict(b)[1]) 25 | expect_equal(as.numeric(NA), predict(b, inverse = TRUE)[1]) 26 | }) 27 | 28 | test_that('sqrt_x correctly handles missing new data', { 29 | b <- sqrt_x(train) 30 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA))[2]) 31 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA), inverse = TRUE)[2]) 32 | }) 33 | 34 | # Test standardization 35 | sqrt_x_obj <- sqrt_x(train, standardize = FALSE) 36 | 37 | test_that('sqrt_x Transforms original data consistently', { 38 | expect_equal(sqrt_x_obj$x.t, predict(sqrt_x_obj)) 39 | expect_equal(sqrt_x_obj$x, predict(sqrt_x_obj, inverse = TRUE)) 40 | }) 41 | 42 | 
test_that('sqrt_x Transforms new data consistently', { 43 | nd <- seq(0, 4, length = 100) 44 | pred <- predict(sqrt_x_obj, newdata = nd) 45 | expect_true(!any(is.na(pred))) 46 | 47 | nd2 <- predict(sqrt_x_obj, newdata = pred, inverse = TRUE) 48 | expect_equal(nd, nd2) 49 | }) 50 | 51 | test_that('sqrt_x correctly handles missing original data', { 52 | b <- sqrt_x(c(NA, train), standardize = FALSE) 53 | expect_equal(as.numeric(NA), b$x.t[1]) 54 | expect_equal(as.numeric(NA), predict(b)[1]) 55 | expect_equal(as.numeric(NA), predict(b, inverse = TRUE)[1]) 56 | }) 57 | 58 | test_that('sqrt_x correctly handles missing new data', { 59 | b <- sqrt_x(train, standardize = FALSE) 60 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA))[2]) 61 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA), inverse = TRUE)[2]) 62 | }) 63 | 64 | 65 | sqrt_x_obj <- sqrt_x(train, a = 1) 66 | 67 | test_that('sqrt_x Transforms new data consistently (given a)', { 68 | nd <- seq(0, 4, length = 100) 69 | pred <- predict(sqrt_x_obj, newdata = nd) 70 | expect_true(!any(is.na(pred))) 71 | 72 | nd2 <- predict(sqrt_x_obj, newdata = pred, inverse = TRUE) 73 | expect_equal(nd, nd2) 74 | }) -------------------------------------------------------------------------------- /tests/testthat/test_step_fns.R: -------------------------------------------------------------------------------- 1 | context('step_* recipes functionality') 2 | library(recipes) 3 | 4 | # par(mfrow = c(2,1)) 5 | rec <- recipe(~ ., data = as.data.frame(iris)) 6 | set.seed(1) 7 | 8 | test_that('step_* transformations with iris data', { 9 | dt1 <- orderNorm(iris$Petal.Length, warn = FALSE)$x.t 10 | 11 | ## Using BoxCox 12 | 13 | expect_silent(bc_trans <- step_BoxCox(rec, all_numeric())) 14 | expect_silent(bc_estimates <- prep(bc_trans, training = as.data.frame(iris))) 15 | expect_silent(bc_data <- bake(bc_estimates, as.data.frame(iris))) 16 | # plot(density(iris[, "Petal.Length"]), main = "before") 17 | # 
plot(density(bc_data$Petal.Length), main = "after") 18 | expect_equal(nrow(tidy(bc_trans, number = 1)), 1) 19 | expect_equal(nrow(tidy(bc_estimates, number = 1)), 4) 20 | 21 | ## Using bestNormalize 22 | 23 | # Check deprecation 24 | expect_warning(bn_trans <- step_bestNormalize(rec, all_numeric())) 25 | 26 | # Check step_best_normalize 27 | expect_silent(bn_trans <- step_best_normalize(rec, all_numeric())) 28 | expect_silent(bn_estimates <- prep(bn_trans, training = as.data.frame(iris))) 29 | expect_silent(bn_data <- bake(bn_estimates, as.data.frame(iris))) 30 | # plot(density(iris[, "Petal.Length"]), main = "before") 31 | # plot(density(bn_data$Petal.Length), main = "after") 32 | expect_equal(nrow(tidy(bn_trans, number = 1)), 1) 33 | expect_equal(nrow(tidy(bn_estimates, number = 1)), 4) 34 | expect_equal(unname(tidy(bn_estimates, number = 1)$chosen_transform[3]), "orderNorm") 35 | expect_identical(dt1, bn_data$Petal.Length) 36 | 37 | ## LOO 38 | expect_silent(bn_trans <- step_best_normalize(rec, all_numeric(), transform_options = list(loo = TRUE, allow_orderNorm = FALSE))) 39 | expect_silent(bn_estimates <- prep(bn_trans, training = as.data.frame(iris))) 40 | expect_silent(bn_data <- bake(bn_estimates, as.data.frame(iris))) 41 | # plot(density(iris[, "Petal.Length"]), main = "before") 42 | # plot(density(bn_data$Petal.Length), main = "after") 43 | expect_equal(nrow(tidy(bn_trans, number = 1)), 1) 44 | expect_equal(nrow(tidy(bn_estimates, number = 1)), 4) 45 | expect_s3_class(tidy(bn_estimates, number = 1)$cv_info[[3]]$tr_object[[1]], "double_reverse_log") 46 | 47 | ## Faster (use in-sample metrics, does NOT use orderNorm) 48 | expect_silent(bn_trans <- step_best_normalize(rec, all_numeric(), transform_options = list(out_of_sample = FALSE, allow_orderNorm = FALSE))) 49 | expect_silent(bn_estimates <- prep(bn_trans, training = as.data.frame(iris))) 50 | expect_silent(bn_data <- bake(bn_estimates, as.data.frame(iris))) 51 | # plot(density(iris[, "Petal.Length"]), 
main = "before") 52 | # plot(density(bn_data$Petal.Length), main = "after") 53 | expect_equal(nrow(tidy(bn_trans, number = 1)), 1) 54 | expect_equal(nrow(tidy(bn_estimates, number = 1)), 4) 55 | expect_s3_class(tidy(bn_estimates, number = 1)$cv_info[[3]]$tr_object[[1]], "double_reverse_log") 56 | 57 | ## Fastest (only use ORQ (orderNorm) transformation) 58 | expect_silent(orq_trans <- step_orderNorm(rec, all_numeric())) 59 | expect_silent(orq_estimates <- prep(orq_trans, training = as.data.frame(iris))) 60 | expect_silent(orq_data <- bake(orq_estimates, as.data.frame(iris))) 61 | # plot(density(iris[, "Petal.Length"]), main = "before") 62 | # plot(density(orq_data$Petal.Length), main = "after") 63 | expect_equal(nrow(tidy(orq_trans, number = 1)), 1) 64 | expect_equal(nrow(tidy(orq_estimates, number = 1)), 4) 65 | expect_s3_class(tidy(orq_estimates, number = 1)$value[[3]], "orderNorm") 66 | expect_identical(dt1, orq_data$Petal.Length) 67 | 68 | }) 69 | 70 | iris2 <- iris 71 | iris2$Petal.Length[c(5,10, 19)] <- c(NA, -1, -3) 72 | iris2$group <- rep(1:2, nrow(iris)/2) 73 | 74 | test_that('step_* transformations with missing/negative/discrete data', { 75 | dt1 <- orderNorm(iris2$Petal.Length, warn = FALSE)$x.t 76 | 77 | ## Using BoxCox (should have warning due to negative data) 78 | expect_silent(bc_trans <- step_BoxCox(rec, all_numeric())) 79 | if(packageVersion("recipes") > "0.1.16") 80 | expect_warning(bc_estimates <- prep(bc_trans, training = as.data.frame(iris2))) else 81 | expect_silent(bc_estimates <- prep(bc_trans, training = as.data.frame(iris2))) 82 | expect_silent(bc_data <- bake(bc_estimates, as.data.frame(iris2))) 83 | # plot(density(iris2[, "Petal.Length"]), main = "before") 84 | # plot(density(bc_data$Petal.Length), main = "after") 85 | expect_equal(nrow(tidy(bc_trans, number = 1)), 1) 86 | expect_equal(nrow(tidy(bc_estimates, number = 1)), 3) 87 | 88 | ## Using bestNormalize 89 | expect_silent(bn_trans <- step_best_normalize(rec, all_numeric())) 90 | 
expect_silent(bn_estimates <- prep(bn_trans, training = as.data.frame(iris2))) 91 | expect_silent(bn_data <- bake(bn_estimates, as.data.frame(iris2))) 92 | # plot(density(iris2[, "Petal.Length"]), main = "before") 93 | # plot(density(bn_data$Petal.Length), main = "after") 94 | expect_equal(nrow(tidy(bn_trans, number = 1)), 1) 95 | expect_equal(nrow(tidy(bn_estimates, number = 1)), 4) 96 | expect_s3_class(tidy(bn_estimates, number = 1)$cv_info[[3]]$tr_object[[1]], "orderNorm") 97 | expect_identical(dt1, bn_data$Petal.Length) 98 | 99 | ## LOO 100 | expect_silent(bn_trans <- step_best_normalize(rec, all_numeric(), transform_options = list(loo = TRUE, allow_orderNorm = FALSE))) 101 | expect_silent(bn_estimates <- prep(bn_trans, training = as.data.frame(iris2))) 102 | expect_silent(bn_data <- bake(bn_estimates, as.data.frame(iris2))) 103 | # plot(density(iris2[, "Petal.Length"]), main = "before") 104 | # plot(density(bn_data$Petal.Length), main = "after") 105 | expect_equal(nrow(tidy(bn_trans, number = 1)), 1) 106 | expect_equal(nrow(tidy(bn_estimates, number = 1)), 4) 107 | # expect_s3_class(tidy(bn_estimates, number = 1)$cv_info[[3]]$tr_object[[1]], "lambert") 108 | 109 | ## Faster (use in-sample metrics, does NOT use orderNorm) 110 | expect_silent(bn_trans <- step_best_normalize(rec, all_numeric(), transform_options = list(out_of_sample = FALSE, allow_orderNorm = FALSE))) 111 | expect_silent(bn_estimates <- prep(bn_trans, training = as.data.frame(iris2))) 112 | expect_silent(bn_data <- bake(bn_estimates, as.data.frame(iris2))) 113 | # plot(density(iris2[, "Petal.Length"]), main = "before") 114 | # plot(density(bn_data$Petal.Length), main = "after") 115 | expect_equal(nrow(tidy(bn_trans, number = 1)), 1) 116 | expect_equal(nrow(tidy(bn_estimates, number = 1)), 4) 117 | expect_s3_class(tidy(bn_estimates, number = 1)$cv_info[[3]]$tr_object[[1]], "double_reverse_log") 118 | 119 | ## Fastest (only use ORQ (orderNorm) transformation) 120 | expect_silent(orq_trans <- 
step_orderNorm(rec, all_numeric())) 121 | expect_silent(orq_estimates <- prep(orq_trans, training = as.data.frame(iris2))) 122 | expect_silent(orq_data <- bake(orq_estimates, as.data.frame(iris2))) 123 | # plot(density(iris2[, "Petal.Length"]), main = "before") 124 | # plot(density(orq_data$Petal.Length), main = "after") 125 | expect_equal(nrow(tidy(orq_trans, number = 1)), 1) 126 | expect_equal(nrow(tidy(orq_estimates, number = 1)), 4) 127 | expect_s3_class(tidy(orq_estimates, number = 1)$value[[3]], "orderNorm") 128 | expect_identical(dt1, orq_data$Petal.Length) 129 | }) 130 | 131 | ## A little help from butcher package 132 | terms_empty_env <- function(axed, step_number) { 133 | expect_identical(attr(axed$steps[[step_number]]$terms[[1]], ".Environment"), 134 | rlang::base_env()) 135 | } 136 | 137 | impute_empty_env <- function(axed, step_number) { 138 | expect_identical(attr(axed$steps[[step_number]]$impute_with[[1]], ".Environment"), 139 | rlang::base_env()) 140 | } 141 | 142 | test_that("recipe + step_best_normalize + axe_env() works", { 143 | rec <- recipe(~ ., data = as.data.frame(state.x77)) %>% 144 | step_best_normalize(rec, all_numeric()) 145 | x <- axe_env(rec) 146 | terms_empty_env(x, 1) 147 | }) 148 | 149 | test_that("recipe + step_orderNorm + axe_env() works", { 150 | rec <- recipe(~ ., data = as.data.frame(state.x77)) %>% 151 | step_orderNorm(rec, all_numeric()) 152 | x <- axe_env(rec) 153 | terms_empty_env(x, 1) 154 | }) 155 | -------------------------------------------------------------------------------- /tests/testthat/test_yeojohnson.R: -------------------------------------------------------------------------------- 1 | context('yeojohnson functionality') 2 | 3 | data(iris) 4 | 5 | train <- iris$Petal.Width 6 | 7 | yeojohnson_obj <- yeojohnson(train) 8 | 9 | test_that('yeojohnson Transforms original data consistently', { 10 | expect_equal(yeojohnson_obj$x.t, predict(yeojohnson_obj)) 11 | expect_equal(yeojohnson_obj$x, predict(yeojohnson_obj, 
inverse = TRUE)) 12 | }) 13 | 14 | test_that('yeojohnson Transforms new data consistently', { 15 | nd <- seq(0, 4, length = 100) 16 | pred <- predict(yeojohnson_obj, newdata = nd) 17 | expect_true(!any(is.na(pred))) 18 | 19 | nd2 <- predict(yeojohnson_obj, newdata = pred, inverse = TRUE) 20 | expect_equal(nd, nd2) 21 | }) 22 | 23 | test_that('yeojohnson correctly handles missing original data', { 24 | b <- yeojohnson(c(NA, train)) 25 | expect_equal(as.numeric(NA), b$x.t[1]) 26 | expect_equal(as.numeric(NA), predict(b)[1]) 27 | expect_equal(as.numeric(NA), predict(b, inverse = TRUE)[1]) 28 | }) 29 | 30 | test_that('yeojohnson correctly handles missing new data', { 31 | b <- yeojohnson(train) 32 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA))[2]) 33 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA), inverse = TRUE)[2]) 34 | }) 35 | 36 | # without standardization 37 | 38 | yeojohnson_obj <- yeojohnson(train, standardize = FALSE) 39 | 40 | test_that('yeojohnson without standardization Transforms original data consistently', { 41 | expect_equal(yeojohnson_obj$x.t, predict(yeojohnson_obj)) 42 | expect_equal(yeojohnson_obj$x, predict(yeojohnson_obj, inverse = TRUE)) 43 | }) 44 | 45 | test_that('yeojohnson without standardization Transforms new data consistently', { 46 | nd <- seq(0, 4, length = 100) 47 | pred <- predict(yeojohnson_obj, newdata = nd) 48 | expect_true(!any(is.na(pred))) 49 | 50 | nd2 <- predict(yeojohnson_obj, newdata = pred, inverse = TRUE) 51 | expect_equal(nd, nd2) 52 | }) 53 | 54 | test_that('yeojohnson without standardization correctly handles missing original data', { 55 | b <- yeojohnson(c(NA, train), standardize = FALSE) 56 | expect_equal(as.numeric(NA), b$x.t[1]) 57 | expect_equal(as.numeric(NA), predict(b)[1]) 58 | expect_equal(as.numeric(NA), predict(b, inverse = TRUE)[1]) 59 | }) 60 | 61 | test_that('yeojohnson without standardization correctly handles missing new data', { 62 | b <- yeojohnson(train, standardize = FALSE) 
63 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA))[2]) 64 | expect_equal(as.numeric(NA), predict(b, newdata = c(1, NA), inverse = TRUE)[2]) 65 | }) 66 | -------------------------------------------------------------------------------- /vignettes/customization.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Customization within bestNormalize" 3 | author: "Ryan A Peterson" 4 | date: "`r Sys.Date()`" 5 | output: 6 | rmarkdown::html_vignette: 7 | toc: true 8 | vignette: > 9 | %\VignetteIndexEntry{Customization within bestNormalize} 10 | %\VignetteEngine{knitr::rmarkdown} 11 | %\VignetteEncoding{UTF-8} 12 | --- 13 | 14 | ```{r setup, include=FALSE} 15 | knitr::opts_chunk$set(echo = TRUE, fig.height = 5, fig.width = 7) 16 | library(bestNormalize) 17 | ``` 18 | 19 | # Custom functions with bestNormalize 20 | 21 | This vignette will go over the steps required to implement a custom user-defined function within the `bestNormalize` framework. 22 | 23 | There are 3 steps. 24 | 25 | 1) Create transformation function 26 | 27 | 2) Create predict method for transformation function (that can be applied to new data) 28 | 29 | 3) Pass through new function and predict method to bestNormalize 30 | 31 | ## Example: cube-root 32 | 33 | ## S3 methods 34 | 35 | Here, we start by defining a new function that we'll call `cuberoot_x`, which will take an argument `a` (as does the `sqrt_x` function) which will try to add a constant if it sees any negative numbers in `x`. It will also take the argument `standardize` which will center and scale the transformed data so that it's centered at 0 with SD = 1. 36 | 37 | ```{r} 38 | ## Define user-function 39 | cuberoot_x <- function(x, a = NULL, standardize = TRUE, ...) 
{ 40 | stopifnot(is.numeric(x)) 41 | 42 | min_a <- max(0, -(min(x, na.rm = TRUE))) 43 | if(!length(a)) 44 | a <- min_a 45 | if(a < min_a) { 46 | warning("Setting a < max(0, -(min(x))) can lead to transformation issues", 47 | "Standardize set to FALSE") 48 | standardize <- FALSE 49 | } 50 | 51 | 52 | x.t <- (x + a)^(1/3) 53 | mu <- mean(x.t, na.rm = TRUE) 54 | sigma <- sd(x.t, na.rm = TRUE) 55 | if (standardize) x.t <- (x.t - mu) / sigma 56 | 57 | # Get in-sample normality statistic results 58 | ptest <- nortest::pearson.test(x.t) 59 | 60 | val <- list( 61 | x.t = x.t, 62 | x = x, 63 | mean = mu, 64 | sd = sigma, 65 | a = a, 66 | n = length(x.t) - sum(is.na(x)), 67 | norm_stat = unname(ptest$statistic / ptest$df), 68 | standardize = standardize 69 | ) 70 | 71 | # Assign class 72 | class(val) <- c('cuberoot_x', class(val)) 73 | val 74 | } 75 | 76 | ``` 77 | 78 | Note that we assigned the returned object a class of the same name as the function; this is necessary for successful implementation within `bestNormalize`. We'll also need an associated `predict` method that is used to apply the transformation to newly observed data. 79 | 80 | ```{r} 81 | 82 | predict.cuberoot_x <- function(object, newdata = NULL, inverse = FALSE, ...)
{ 83 | 84 | # If no data supplied and not inverse 85 | if (is.null(newdata) & !inverse) 86 | newdata <- object$x 87 | 88 | # If no data supplied and inverse 89 | if (is.null(newdata) & inverse) 90 | newdata <- object$x.t 91 | 92 | # Actually performing transformations 93 | 94 | # Perform inverse transformation as estimated 95 | if (inverse) { 96 | 97 | # Reverse-standardize 98 | if (object$standardize) 99 | newdata <- newdata * object$sd + object$mean 100 | 101 | # Reverse-cube-root (cube) 102 | newdata <- newdata^3 - object$a 103 | 104 | 105 | # Otherwise, perform transformation as estimated 106 | } else if (!inverse) { 107 | # Take cube root 108 | newdata <- (newdata + object$a)^(1/3) 109 | 110 | # Standardize to mean 0, sd 1 111 | if (object$standardize) 112 | newdata <- (newdata - object$mean) / object$sd 113 | } 114 | 115 | # Return transformed data 116 | unname(newdata) 117 | } 118 | 119 | ``` 120 | 121 | ## Optional: print method 122 | 123 | This will be printed when bestNormalize selects your custom method or when you print an object returned by your new custom function. 124 | 125 | ```{r} 126 | print.cuberoot_x <- function(x, ...) { 127 | cat(ifelse(x$standardize, "Standardized", "Non-Standardized"), 128 | 'cuberoot(x + a) Transformation with', x$n, 'nonmissing obs.:\n', 129 | 'Relevant statistics:\n', 130 | '- a =', x$a, '\n', 131 | '- mean (before standardization) =', x$mean, '\n', 132 | '- sd (before standardization) =', x$sd, '\n') 133 | } 134 | 135 | ``` 136 | 137 | Note: if you can find a similar transformation in the source code, it's easy to model your code after it. For instance, for `cuberoot_x` and `predict.cuberoot_x`, I used `sqrt_x.R` as a template file. 
138 | 139 | ## Implementing with bestNormalize 140 | 141 | ```{r} 142 | # Store custom functions into list 143 | custom_transform <- list( 144 | cuberoot_x = cuberoot_x, 145 | predict.cuberoot_x = predict.cuberoot_x, 146 | print.cuberoot_x = print.cuberoot_x 147 | ) 148 | 149 | set.seed(123129) 150 | x <- rgamma(100, 1, 1) 151 | (b <- bestNormalize(x = x, new_transforms = custom_transform, standardize = FALSE)) 152 | ``` 153 | 154 | Evidently, the cube-rooting was the best normalizing transformation! 155 | 156 | ## Sanity check 157 | 158 | Is this code actually performing the cube-rooting? 159 | 160 | ```{r} 161 | all.equal(x^(1/3), b$chosen_transform$x.t) 162 | all.equal(x^(1/3), predict(b)) 163 | ``` 164 | 165 | It does indeed. 166 | 167 | # Using custom normalization statistics 168 | 169 | The bestNormalize package can estimate any univariate statistic using its CV framework. A user-defined function can be passed in through the `norm_stat_fn` argument, and this function will then be applied in lieu of the Pearson test statistic divided by its degree of freedom. 170 | 171 | The user-defined function must take an argument `x`, which indicates the data on which a user wants to evaluate the statistic. 172 | 173 | Here is an example using Lilliefors (Kolmogorov-Smirnov) normality test statistic: 174 | 175 | ```{r} 176 | bestNormalize(x, norm_stat_fn = function(x) nortest::lillie.test(x)$stat) 177 | ``` 178 | 179 | Here is an example using Lilliefors (Kolmogorov-Smirnov) normality test's p-value: 180 | 181 | ```{r} 182 | (dont_do_this <- bestNormalize(x, norm_stat_fn = function(x) nortest::lillie.test(x)$p)) 183 | ``` 184 | 185 | Note: `bestNormalize` will attempt to minimize this statistic by default, which is definitely not what you want to do when calculating the p-value. This is seen in the example above, as the WORST normalization transformation is chosen. 
186 | 187 | In this case, a user is advised to either manually select the best one: 188 | 189 | ```{r} 190 | best_transform <- names(which.max(dont_do_this$norm_stats)) 191 | (do_this <- dont_do_this$other_transforms[[best_transform]]) 192 | ``` 193 | 194 | Or, the user can reverse their defined statistic (in this case by subtracting it from 1): 195 | 196 | ```{r} 197 | (do_this <- bestNormalize(x, norm_stat_fn = function(x) 1-nortest::lillie.test(x)$p)) 198 | ``` 199 | 200 | -------------------------------------------------------------------------------- /vignettes/parallel_timings.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petersonR/bestNormalize/46c0ea73e57eb2130bac9078b501688fd48d65c9/vignettes/parallel_timings.jpg --------------------------------------------------------------------------------