├── .github ├── .gitignore ├── workflows │ ├── lock.yaml │ ├── pkgdown.yaml │ ├── test-coverage.yaml │ ├── R-CMD-check.yaml │ └── pr-commands.yaml └── CODE_OF_CONDUCT.md ├── docs └── CNAME ├── vignettes ├── .gitignore ├── binary-data.Rmd └── continuous-data.Rmd ├── LICENSE ├── .gitignore ├── data ├── ames_new.rda ├── qsar_binary.rda ├── okc_binary.RData └── datalist ├── man ├── figures │ └── logo.png ├── print.apd_pca.Rd ├── print.apd_hat_values.Rd ├── score.Rd ├── print.apd_similarity.Rd ├── binary.Rd ├── autoplot.apd_similarity.Rd ├── autoplot.apd_pca.Rd ├── ames_new.Rd ├── okc_binary.Rd ├── applicable-package.Rd ├── score.apd_pca.Rd ├── score.apd_hat_values.Rd ├── score.apd_similarity.Rd ├── apd_hat_values.Rd ├── apd_pca.Rd └── apd_similarity.Rd ├── tests ├── testthat.R ├── testthat │ ├── test-misc.R │ ├── _snaps │ │ ├── misc.md │ │ ├── pca-fit.md │ │ ├── pca-score.md │ │ ├── print.md │ │ ├── hat_values-score.md │ │ ├── hat_values-fit.md │ │ └── similarity.md │ ├── test-print.R │ ├── test-hat_values-score.R │ ├── test-plot.R │ ├── test-pca-score.R │ ├── test-pca-fit.R │ ├── test-hat_values-fit.R │ └── test-similarity.R └── spelling.R ├── pkgdown └── favicon │ ├── favicon.ico │ ├── favicon-16x16.png │ ├── favicon-32x32.png │ ├── apple-touch-icon.png │ ├── apple-touch-icon-60x60.png │ ├── apple-touch-icon-76x76.png │ └── apple-touch-icon-120x120.png ├── NEWS.md ├── R ├── applicable-package.R ├── misc.R ├── zzz.R ├── score.R ├── 0.R ├── s3_register.R ├── helpers.R ├── print.R ├── plot.R ├── data.R ├── hat_values-score.R ├── pca-score.R ├── hat_values-fit.R ├── pca-fit.R └── similarity.R ├── inst └── WORDLIST ├── .Rbuildignore ├── codecov.yml ├── applicable.Rproj ├── _pkgdown.yml ├── cran-comments.md ├── LICENSE.md ├── DESCRIPTION ├── NAMESPACE ├── README.md └── README.Rmd /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | 
-------------------------------------------------------------------------------- /docs/CNAME: -------------------------------------------------------------------------------- 1 | applicable.tidymodels.org -------------------------------------------------------------------------------- /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2023 2 | COPYRIGHT HOLDER: applicable authors 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rhistory 2 | .Rprofile 3 | .Rproj.user 4 | .DS_Store 5 | inst/doc 6 | -------------------------------------------------------------------------------- /data/ames_new.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/applicable/HEAD/data/ames_new.rda -------------------------------------------------------------------------------- /data/qsar_binary.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/applicable/HEAD/data/qsar_binary.rda -------------------------------------------------------------------------------- /man/figures/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/applicable/HEAD/man/figures/logo.png -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(applicable) 3 | 4 | test_check("applicable") 5 | 
-------------------------------------------------------------------------------- /data/okc_binary.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/applicable/HEAD/data/okc_binary.RData -------------------------------------------------------------------------------- /pkgdown/favicon/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/applicable/HEAD/pkgdown/favicon/favicon.ico -------------------------------------------------------------------------------- /data/datalist: -------------------------------------------------------------------------------- 1 | qsar_binary: binary_tr binary_unk 2 | okc_binary: okc_binary_train okc_binary_test 3 | ames_new: ames_new 4 | -------------------------------------------------------------------------------- /pkgdown/favicon/favicon-16x16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/applicable/HEAD/pkgdown/favicon/favicon-16x16.png -------------------------------------------------------------------------------- /pkgdown/favicon/favicon-32x32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/applicable/HEAD/pkgdown/favicon/favicon-32x32.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/applicable/HEAD/pkgdown/favicon/apple-touch-icon.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-60x60.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tidymodels/applicable/HEAD/pkgdown/favicon/apple-touch-icon-60x60.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-76x76.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/applicable/HEAD/pkgdown/favicon/apple-touch-icon-76x76.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-120x120.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/applicable/HEAD/pkgdown/favicon/apple-touch-icon-120x120.png -------------------------------------------------------------------------------- /tests/testthat/test-misc.R: -------------------------------------------------------------------------------- 1 | test_that("`names0` fails if `num` is less than 1", { 2 | num <- 0 3 | expect_snapshot(error = TRUE, 4 | names0(num) 5 | ) 6 | }) 7 | -------------------------------------------------------------------------------- /tests/testthat/_snaps/misc.md: -------------------------------------------------------------------------------- 1 | # `names0` fails if `num` is less than 1 2 | 3 | Code 4 | names0(num) 5 | Error 6 | `num` should be > 0 7 | 8 | -------------------------------------------------------------------------------- /tests/spelling.R: -------------------------------------------------------------------------------- 1 | if (requireNamespace("spelling", quietly = TRUE)) { 2 | spelling::spell_check_test( 3 | vignettes = TRUE, error = FALSE, skip_on_cran = TRUE 4 | ) 5 | } 6 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | applicable 0.0.1.1 2 | ================== 3 | 4 | Minor patch release: fixed failing units tests due to recent 
package updates. 5 | 6 | applicable 0.0.1 7 | ================== 8 | 9 | * First CRAN version. 10 | -------------------------------------------------------------------------------- /R/applicable-package.R: -------------------------------------------------------------------------------- 1 | #' @keywords internal 2 | "_PACKAGE" 3 | 4 | # The following block is used by usethis to automatically manage 5 | # roxygen namespace tags. Modify with care! 6 | ## usethis namespace: start 7 | ## usethis namespace: end 8 | NULL 9 | -------------------------------------------------------------------------------- /R/misc.R: -------------------------------------------------------------------------------- 1 | # from recipes:::names0 2 | names0 <- function(num, prefix = "x") { 3 | if (num < 1) { 4 | rlang::abort("`num` should be > 0") 5 | } 6 | ind <- format(1:num) 7 | ind <- gsub(" ", "0", ind) 8 | paste0(prefix, ind) 9 | } 10 | -------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | # nocov start 2 | # takes after https://raw.githubusercontent.com/r-lib/vctrs/master/R/zzz.R 3 | .onLoad <- function(libname, pkgname) { 4 | s3_register("ggplot2::autoplot", "apd_similarity") 5 | s3_register("ggplot2::autoplot", "apd_pca") 6 | } 7 | 8 | # nocov end 9 | -------------------------------------------------------------------------------- /inst/WORDLIST: -------------------------------------------------------------------------------- 1 | al 2 | Ames 3 | Chemoinformatics 4 | CRC 5 | De 6 | doi 7 | et 8 | extensibility 9 | Gillet 10 | Jaccard 11 | Lifecycle 12 | Netzeva 13 | OkCupid 14 | pca 15 | pcas 16 | pre 17 | QSAR 18 | Springer 19 | tibble 20 | X'X 21 | intercal 22 | CMD 23 | reprex 24 | tidymodels 25 | -------------------------------------------------------------------------------- /.Rbuildignore: 
-------------------------------------------------------------------------------- 1 | ^applicable\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^_pkgdown\.yml$ 4 | ^docs$ 5 | ^pkgdown$ 6 | ^README.Rmd$ 7 | .github 8 | ^\.github/workflows/check-full\.yaml$ 9 | ^LICENSE\.md$ 10 | ^cran-comments\.md$ 11 | ^NEWS\.md$ 12 | ^CODE_OF_CONDUCT\.md$ 13 | ^\.github$ 14 | ^codecov\.yml$ 15 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | 3 | coverage: 4 | status: 5 | project: 6 | default: 7 | target: auto 8 | threshold: 1% 9 | informational: true 10 | patch: 11 | default: 12 | target: auto 13 | threshold: 1% 14 | informational: true 15 | -------------------------------------------------------------------------------- /tests/testthat/_snaps/pca-fit.md: -------------------------------------------------------------------------------- 1 | # pcs is provided 2 | 3 | Code 4 | new_apd_pca(blueprint = hardhat::default_xy_blueprint()) 5 | Error 6 | argument "pcs" is missing, with no default 7 | 8 | # `new_apd_pca` fails when blueprint is numeric 9 | 10 | Code 11 | new_apd_pca(pcs = 1, blueprint = 1) 12 | Error 13 | blueprint should be a blueprint, not a numeric. 
14 | 15 | -------------------------------------------------------------------------------- /applicable.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace 22 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: https://applicable.tidymodels.org 2 | 3 | template: 4 | package: tidytemplate 5 | bootstrap: 5 6 | bslib: 7 | primary: "#CA225E" 8 | 9 | includes: 10 | in_header: | 11 | 12 | 13 | # https://github.com/tidyverse/tidytemplate for css 14 | 15 | development: 16 | mode: auto 17 | 18 | 19 | figures: 20 | fig.width: 8 21 | fig.height: 5.75 22 | -------------------------------------------------------------------------------- /man/print.apd_pca.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/print.R 3 | \name{print.apd_pca} 4 | \alias{print.apd_pca} 5 | \title{Print number of predictors and principal components used.} 6 | \usage{ 7 | \method{print}{apd_pca}(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{A \code{apd_pca} object.} 11 | 12 | \item{...}{Not currently used, but required for extensibility.} 13 | } 14 | \value{ 15 | None 16 | } 17 | \description{ 18 | Print number of predictors and principal components used. 
19 | } 20 | \examples{ 21 | 22 | model <- apd_pca(~ Sepal.Length + Sepal.Width, iris) 23 | print(model) 24 | } 25 | -------------------------------------------------------------------------------- /man/print.apd_hat_values.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/print.R 3 | \name{print.apd_hat_values} 4 | \alias{print.apd_hat_values} 5 | \title{Print number of predictors and principal components used.} 6 | \usage{ 7 | \method{print}{apd_hat_values}(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{A \code{apd_hat_values} object.} 11 | 12 | \item{...}{Not currently used, but required for extensibility.} 13 | } 14 | \value{ 15 | None 16 | } 17 | \description{ 18 | Print number of predictors and principal components used. 19 | } 20 | \examples{ 21 | 22 | model <- apd_hat_values(~ Sepal.Length + Sepal.Width, iris) 23 | print(model) 24 | } 25 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## R CMD check results 2 | 3 | 0 errors | 0 warnings | 0 note 4 | 5 | ## 0.0.1.1 Submission 6 | 7 | This release fixes failing unit tests. 8 | 9 | ### Review - 2020-06-14 10 | 11 | Fix the following unit tests: 12 | 13 | > 14 | ── 1. Failure: `score_apd_pca_numeric` pcs output matches `stats::predict` output 15 | `actual_output` not equivalent to `expected`. 16 | current is not list-like 17 | > 18 | ── 2. Failure: `score` pcs output matches `stats::predict` output 19 | `actual_output` not equivalent to `expected`. 20 | current is not list-like 21 | > 22 | ── 3. Failure: `score_apd_pca_bridge` output is correct 23 | `actual_output` not equivalent to `expected`. 
24 | current is not list-like 25 | -------------------------------------------------------------------------------- /tests/testthat/_snaps/pca-score.md: -------------------------------------------------------------------------------- 1 | # `score_apd_pca_numeric` fails when model has no pcs argument 2 | 3 | Code 4 | score_apd_pca_numeric(mtcars, mtcars) 5 | Error 6 | The model must contain a pcs argument. 7 | 8 | # `score` fails when predictors only contain factors 9 | 10 | Code 11 | score(model, iris$Species) 12 | Error 13 | The class of `new_data`, 'factor', is not recognized. 14 | 15 | # `score` fails when predictors are vectors 16 | 17 | Code 18 | score(object) 19 | Error 20 | `object` is not of a recognized type. 21 | Only data.frame, matrix, recipe, and formula objects are allowed. 22 | A data.frame was specified. 23 | 24 | -------------------------------------------------------------------------------- /tests/testthat/_snaps/print.md: -------------------------------------------------------------------------------- 1 | # print wording for `apd_pca` is correct 2 | 3 | Code 4 | print(x1) 5 | Output 6 | # Predictors: 7 | 1 8 | # Principal Components: 9 | 1 component was needed 10 | to capture at least 95% of the 11 | total variation in the predictors. 12 | 13 | --- 14 | 15 | Code 16 | print(x2) 17 | Output 18 | # Predictors: 19 | 2 20 | # Principal Components: 21 | 2 components were needed 22 | to capture at least 95% of the 23 | total variation in the predictors. 
24 | 25 | # print for apd_hat_values work as expected 26 | 27 | Code 28 | print(x) 29 | Output 30 | # Predictors: 31 | 11 32 | 33 | -------------------------------------------------------------------------------- /man/score.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/score.R 3 | \name{score} 4 | \alias{score} 5 | \alias{score.default} 6 | \title{A scoring function} 7 | \usage{ 8 | score(object, ...) 9 | 10 | \method{score}{default}(object, ...) 11 | } 12 | \arguments{ 13 | \item{object}{Depending on the context: 14 | \itemize{ 15 | \item A \strong{data frame} of predictors. 16 | \item A \strong{matrix} of predictors. 17 | \item A \strong{recipe} specifying a set of preprocessing steps 18 | created from \code{\link[recipes:recipe]{recipes::recipe()}}. 19 | }} 20 | 21 | \item{...}{Not currently used, but required for extensibility.} 22 | } 23 | \value{ 24 | A tibble of predictions. 25 | } 26 | \description{ 27 | A scoring function 28 | } 29 | -------------------------------------------------------------------------------- /tests/testthat/_snaps/hat_values-score.md: -------------------------------------------------------------------------------- 1 | # `score_apd_hat_values_numeric` fails when model has no pcs argument 2 | 3 | Code 4 | score_apd_hat_values_numeric(mtcars, mtcars) 5 | Error 6 | The model must contain an XtX_inv argument. 7 | 8 | # `score` fails when predictors only contain factors 9 | 10 | Code 11 | score(model, iris$Species) 12 | Error 13 | The class of `new_data`, 'factor', is not recognized. 14 | 15 | # `score` fails when predictors are vectors 16 | 17 | Code 18 | score(object) 19 | Error 20 | `object` is not of a recognized type. 21 | Only data.frame, matrix, recipe, and formula objects are allowed. 22 | A data.frame was specified. 
23 | 24 | -------------------------------------------------------------------------------- /man/print.apd_similarity.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/print.R 3 | \name{print.apd_similarity} 4 | \alias{print.apd_similarity} 5 | \title{Print number of predictors and principal components used.} 6 | \usage{ 7 | \method{print}{apd_similarity}(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{A \code{apd_similarity} object.} 11 | 12 | \item{...}{Not currently used, but required for extensibility.} 13 | } 14 | \value{ 15 | None 16 | } 17 | \description{ 18 | Print number of predictors and principal components used. 19 | } 20 | \examples{ 21 | 22 | set.seed(535) 23 | tr_x <- matrix( 24 | sample(0:1, size = 20 * 50, prob = rep(.5, 2), replace = TRUE), 25 | ncol = 20 26 | ) 27 | model <- apd_similarity(tr_x) 28 | print(model) 29 | } 30 | -------------------------------------------------------------------------------- /man/binary.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{binary} 5 | \alias{binary} 6 | \alias{qsar_binary} 7 | \alias{binary_tr} 8 | \alias{binary_unk} 9 | \title{Binary QSAR Data} 10 | \value{ 11 | \item{binary_tr,binary_ukn}{data frame frames with 67 columns} 12 | } 13 | \description{ 14 | Binary QSAR Data 15 | } 16 | \details{ 17 | These data are from two different sources on quantitative 18 | structure-activity relationship (QSAR) modeling and contain 67 predictors 19 | that are either 0 or 1. The training set contains 4,330 samples and there 20 | are five unknown samples (both from the \code{Mutagen} data in the \code{QSARdata} 21 | package). 
22 | } 23 | \examples{ 24 | data(qsar_binary) 25 | str(binary_tr) 26 | } 27 | \keyword{datasets} 28 | -------------------------------------------------------------------------------- /tests/testthat/test-print.R: -------------------------------------------------------------------------------- 1 | test_that("print wording for `apd_pca` is correct", { 2 | x1 <- apd_pca(~Sepal.Length, iris) 3 | 4 | expect_snapshot( 5 | print(x1) 6 | ) 7 | 8 | x2 <- apd_pca(~ Sepal.Length + Sepal.Width, iris) 9 | 10 | expect_snapshot( 11 | print(x2) 12 | ) 13 | }) 14 | 15 | test_that("print for `apd_pca` displays correct threshold", { 16 | threshold <- 0.72 17 | x <- apd_pca(~Sepal.Length, iris, threshold) 18 | percentage <- capture.output(x) 19 | percentage <- regmatches( 20 | percentage, 21 | regexpr("(\\d+)%", percentage) 22 | ) 23 | 24 | expected_output <- threshold * 100 25 | expected_output <- paste0(expected_output, "%") 26 | 27 | expect_equal( 28 | percentage, 29 | expected_output 30 | ) 31 | }) 32 | 33 | test_that("print for apd_hat_values work as expected", { 34 | x <- apd_hat_values(mtcars) 35 | 36 | expect_snapshot( 37 | print(x) 38 | ) 39 | }) 40 | -------------------------------------------------------------------------------- /man/autoplot.apd_similarity.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot.R 3 | \name{autoplot.apd_similarity} 4 | \alias{autoplot.apd_similarity} 5 | \title{Plot the cumulative distribution function for similarity metrics} 6 | \usage{ 7 | \method{autoplot}{apd_similarity}(object, ...) 8 | } 9 | \arguments{ 10 | \item{object}{An object produced by \code{apd_similarity}.} 11 | 12 | \item{...}{Not currently used.} 13 | } 14 | \value{ 15 | A \code{ggplot} object that shows the cumulative probability versus the 16 | unique similarity values in the training set. 
Not that for large samples, 17 | this is an approximation based on a random sample of 5,000 training set 18 | points. 19 | } 20 | \description{ 21 | Plot the cumulative distribution function for similarity metrics 22 | } 23 | \examples{ 24 | set.seed(535) 25 | tr_x <- matrix( 26 | sample(0:1, size = 20 * 50, prob = rep(.5, 2), replace = TRUE), 27 | ncol = 20 28 | ) 29 | model <- apd_similarity(tr_x) 30 | } 31 | -------------------------------------------------------------------------------- /tests/testthat/_snaps/hat_values-fit.md: -------------------------------------------------------------------------------- 1 | # XtX_inv is provided 2 | 3 | Code 4 | new_apd_hat_values(blueprint = hardhat::default_xy_blueprint()) 5 | Error 6 | argument "XtX_inv" is missing, with no default 7 | 8 | # `new_apd_hat_values` fails when blueprint is numeric 9 | 10 | Code 11 | new_apd_hat_values(XtX_inv = 1, blueprint = 1) 12 | Error 13 | blueprint should be a blueprint, not a numeric. 14 | 15 | # `apd_hat_values` fails when matrix has more predictors than samples 16 | 17 | Code 18 | apd_hat_values(bad_data) 19 | Error 20 | The number of columns must be less than the number of rows. 21 | 22 | # `apd_hat_values` fails when the matrix X^tX is singular 23 | 24 | Code 25 | apd_hat_values(bad_data) 26 | Error 27 | Unable to compute the hat values of the matrix X of 28 | predictors because the matrix resulting from multiplying 29 | the transpose of X by X is singular. 30 | 31 | -------------------------------------------------------------------------------- /man/autoplot.apd_pca.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot.R 3 | \name{autoplot.apd_pca} 4 | \alias{autoplot.apd_pca} 5 | \title{Plot the distribution function for pcas} 6 | \usage{ 7 | \method{autoplot}{apd_pca}(object, ...) 
8 | } 9 | \arguments{ 10 | \item{object}{An object produced by \code{apd_pca}.} 11 | 12 | \item{...}{An optional set of \code{dplyr} selectors, such as \code{dplyr::matches()} or 13 | \code{dplyr::starts_with()} for selecting which variables should be shown in the 14 | plot.} 15 | } 16 | \value{ 17 | A \code{ggplot} object that shows the distribution function for each 18 | principal component. 19 | } 20 | \description{ 21 | Plot the distribution function for pcas 22 | } 23 | \examples{ 24 | library(ggplot2) 25 | library(dplyr) 26 | library(modeldata) 27 | data(biomass) 28 | 29 | biomass_ad <- apd_pca(biomass[, 3:8]) 30 | 31 | autoplot(biomass_ad) 32 | # Using selectors in `...` 33 | autoplot(biomass_ad, distance) + scale_x_log10() 34 | autoplot(biomass_ad, matches("PC[1-2]")) 35 | } 36 | -------------------------------------------------------------------------------- /man/ames_new.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{ames_new} 5 | \alias{ames_new} 6 | \title{Recent Ames Iowa Houses} 7 | \source{ 8 | De Cock, D. (2011). "Ames, Iowa: Alternative to the Boston Housing 9 | Data as an End of Semester Regression Project," \emph{Journal of Statistics 10 | Education}, Volume 19, Number 3. 11 | 12 | \url{https://www.cityofames.org/government/departments-divisions-a-h/city-assessor} 13 | 14 | \url{http://jse.amstat.org/v19n3/decock/DataDocumentation.txt} 15 | 16 | \url{http://jse.amstat.org/v19n3/decock.pdf} 17 | } 18 | \value{ 19 | \item{ames_new}{a tibble} 20 | } 21 | \description{ 22 | More data related to the set described by De Cock (2011) where data where 23 | data were recorded for 2,930 properties in Ames IA. 24 | } 25 | \details{ 26 | This data sets includes three more properties added since the original 27 | reference. 
There are less fields in this data set; only those that could be 28 | transcribed from the assessor's office were included. 29 | } 30 | \keyword{datasets} 31 | -------------------------------------------------------------------------------- /tests/testthat/test-hat_values-score.R: -------------------------------------------------------------------------------- 1 | test_that("`score_apd_hat_values_numeric` fails when model has no pcs argument", { 2 | expect_snapshot(error = TRUE, 3 | score_apd_hat_values_numeric(mtcars, mtcars) 4 | ) 5 | }) 6 | 7 | test_that("`score` fails when predictors only contain factors", { 8 | model <- apd_hat_values(~., iris) 9 | expect_snapshot(error = TRUE, 10 | score(model, iris$Species) 11 | ) 12 | }) 13 | 14 | test_that("`score` fails when predictors are vectors", { 15 | object <- iris 16 | 17 | expect_snapshot(error = TRUE, 18 | score(object) 19 | ) 20 | }) 21 | 22 | test_that("`score` calculated hat_values are correct", { 23 | model <- apd_hat_values(mtcars %>% dplyr::slice(1:15)) 24 | predictors <- as.matrix(mtcars %>% dplyr::slice(16:30)) 25 | 26 | proj_matrix <- predictors %*% model$XtX_inv %*% t(predictors) 27 | expected <- diag(proj_matrix) 28 | 29 | actual_output <- score(model, predictors) 30 | actual_output <- actual_output$hat_values 31 | 32 | # Data frame method 33 | expect_equal(ignore_attr = TRUE, 34 | actual_output, 35 | expected 36 | ) 37 | }) 38 | -------------------------------------------------------------------------------- /man/okc_binary.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{okc_binary} 5 | \alias{okc_binary} 6 | \alias{okc_binary_train} 7 | \alias{okc_binary_test} 8 | \title{OkCupid Binary Predictors} 9 | \source{ 10 | Kim (2015), "OkCupid Data for Introductory Statistics and Data Science Courses", \emph{Journal of Statistics Education}, Volume 23, 
Number 2. \url{https://www.tandfonline.com/doi/abs/10.1080/10691898.2015.11889737} 11 | 12 | Kuhn and Johnson (2020), \emph{Feature Engineering and Selection}, Chapman and Hall/CRC . \url{https://bookdown.org/max/FES/} and \url{https://github.com/topepo/FES} 13 | } 14 | \value{ 15 | \item{okc_binary_train,okc_binary_test}{data frame frames with 61 columns} 16 | } 17 | \description{ 18 | OkCupid Binary Predictors 19 | } 20 | \details{ 21 | Data originally from Kim (2015) includes a training and test set 22 | consistent with Kuhn and Johnson (2020). Predictors include ethnicity 23 | indicators and a set of keywords derived from text essay data. 24 | } 25 | \examples{ 26 | data(okc_binary) 27 | str(okc_binary_train) 28 | } 29 | \keyword{datasets} 30 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2023 applicable authors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /R/score.R: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # ---------------------- Model fit generic interface -------------------------- 3 | # ----------------------------------------------------------------------------- 4 | 5 | #' A scoring function 6 | #' 7 | #' @param object Depending on the context: 8 | #' 9 | #' * A __data frame__ of predictors. 10 | #' * A __matrix__ of predictors. 11 | #' * A __recipe__ specifying a set of preprocessing steps 12 | #' created from [recipes::recipe()]. 13 | #' 14 | #' @param ... Not currently used, but required for extensibility. 15 | #' 16 | #' @return 17 | #' 18 | #' A tibble of predictions. 19 | #' 20 | #' @export 21 | score <- function(object, ...) { 22 | UseMethod("score") 23 | } 24 | 25 | #' @export 26 | #' @export score.default 27 | #' @rdname score 28 | score.default <- function(object, ...) { 29 | cls <- class(object)[1] 30 | message <- 31 | "`object` is not of a recognized type. 32 | Only data.frame, matrix, recipe, and formula objects are allowed. 33 | A {cls} was specified." 
34 | message <- glue::glue(message) 35 | rlang::abort(message = message) 36 | } 37 | -------------------------------------------------------------------------------- /.github/workflows/lock.yaml: -------------------------------------------------------------------------------- 1 | name: 'Lock Threads' 2 | 3 | on: 4 | schedule: 5 | - cron: '0 0 * * *' 6 | 7 | jobs: 8 | lock: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: dessant/lock-threads@v2 12 | with: 13 | github-token: ${{ github.token }} 14 | issue-lock-inactive-days: '14' 15 | # issue-exclude-labels: '' 16 | # issue-lock-labels: 'outdated' 17 | issue-lock-comment: > 18 | This issue has been automatically locked. If you believe you have 19 | found a related problem, please file a new issue (with a reprex: 20 | ) and link to this issue. 21 | issue-lock-reason: '' 22 | pr-lock-inactive-days: '14' 23 | # pr-exclude-labels: 'wip' 24 | pr-lock-labels: '' 25 | pr-lock-comment: > 26 | This pull request has been automatically locked. If you believe you 27 | have found a related problem, please file a new issue (with a reprex: 28 | ) and link to this issue. 29 | pr-lock-reason: '' 30 | # process-only: 'issues' 31 | -------------------------------------------------------------------------------- /man/applicable-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/applicable-package.R 3 | \docType{package} 4 | \name{applicable-package} 5 | \alias{applicable} 6 | \alias{applicable-package} 7 | \title{applicable: A Compilation of Applicability Domain Methods} 8 | \description{ 9 | \if{html}{\figure{logo.png}{options: style='float: right' alt='logo' width='120'}} 10 | 11 | A modeling package compiling applicability domain methods in R. It combines different methods to measure the amount of extrapolation new samples can have from the training set. 
# Tests for autoplot.apd_pca (R/plot.R). Each test fits an apd_pca model on
# mtcars and checks that the data and axis labels embedded in the returned
# ggplot object match the model's percentile table (`ad$pctls`) reshaped to
# long format with tidyr::gather(), mirroring what autoplot() does internally.

test_that("output of autoplot.apd_pca is correct when no options are provided", {
  ad <- apd_pca(mtcars)
  ad_plot <- ggplot2::autoplot(ad)

  # Expected plot data: all percentile columns, gathered to long format.
  pctls <- ad$pctls %>%
    tidyr::gather(component, value, -percentile)

  expect_equal(ad_plot$data, pctls)
  expect_equal(ad_plot$labels$x, "abs(value)")
  expect_equal(ad_plot$labels$y, "percentile")
})

test_that("output of autoplot.apd_pca is correct when options=matches are provided", {
  ad <- apd_pca(mtcars)
  # `...` selectors are forwarded to the plot; only PC1-PC5 should be shown.
  ad_plot <- ggplot2::autoplot(ad, matches("PC[1-5]"))

  # Expected data restricted to the selected components (plus percentile).
  pctls <- ad$pctls %>%
    select(matches("PC[1-5]"), percentile) %>%
    tidyr::gather(component, value, -percentile)

  expect_equal(ad_plot$data, pctls)
  expect_equal(ad_plot$labels$x, "abs(value)")
  expect_equal(ad_plot$labels$y, "percentile")
})

test_that("output of autoplot.apd_pca is correct when options=distance are provided", {
  ad <- apd_pca(mtcars)
  # A bare string selector should also work (selects the "distance" column).
  ad_plot <- ggplot2::autoplot(ad, "distance")

  pctls <- ad$pctls %>%
    select(matches("distance"), percentile) %>%
    tidyr::gather(component, value, -percentile)

  expect_equal(ad_plot$data, pctls)
  expect_equal(ad_plot$labels$x, "abs(value)")
  expect_equal(ad_plot$labels$y, "percentile")
})
35 | } 36 | \examples{ 37 | train <- mtcars[1:20, ] 38 | test <- mtcars[21:32, -1] 39 | 40 | # Fit 41 | mod <- apd_pca(mpg ~ cyl + log(drat), train) 42 | 43 | # Predict, with preprocessing 44 | score(mod, test) 45 | } 46 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | release: 9 | types: [published] 10 | workflow_dispatch: 11 | 12 | name: pkgdown 13 | 14 | jobs: 15 | pkgdown: 16 | runs-on: ubuntu-latest 17 | # Only restrict concurrency for non-PR jobs 18 | concurrency: 19 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 20 | env: 21 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 22 | steps: 23 | - uses: actions/checkout@v3 24 | 25 | - uses: r-lib/actions/setup-pandoc@v2 26 | 27 | - uses: r-lib/actions/setup-r@v2 28 | with: 29 | use-public-rspm: true 30 | 31 | - uses: r-lib/actions/setup-r-dependencies@v2 32 | with: 33 | extra-packages: any::pkgdown, local::. 
34 | needs: website 35 | 36 | - name: Build site 37 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 38 | shell: Rscript {0} 39 | 40 | - name: Deploy to GitHub pages 🚀 41 | if: github.event_name != 'pull_request' 42 | uses: JamesIves/github-pages-deploy-action@v4.4.1 43 | with: 44 | clean: false 45 | branch: gh-pages 46 | folder: docs 47 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: applicable 2 | Title: A Compilation of Applicability Domain Methods 3 | Version: 0.0.1.1 4 | Authors@R: c( 5 | person("Marly", "Gotti", , "marlygotti@gmail.com", role = c("aut", "cre")), 6 | person("Max", "Kuhn", , "max@posit.co", role = "aut"), 7 | person(given = "Posit Software, PBC", role = c("cph", "fnd")) 8 | ) 9 | Description: A modeling package compiling applicability domain methods in 10 | R. It combines different methods to measure the amount of 11 | extrapolation new samples can have from the training set. See Netzeva 12 | et al (2005) for an overview of 13 | applicability domains. 
14 | License: MIT + file LICENSE 15 | URL: https://github.com/tidymodels/applicable, 16 | https://applicable.tidymodels.org 17 | BugReports: https://github.com/tidymodels/applicable/issues 18 | Depends: 19 | ggplot2, 20 | R (>= 3.4) 21 | Imports: 22 | dplyr, 23 | glue, 24 | hardhat (>= 0.1.2), 25 | Matrix, 26 | proxyC, 27 | purrr, 28 | rlang, 29 | stats, 30 | tibble, 31 | tidyr, 32 | tidyselect, 33 | utils 34 | Suggests: 35 | covr, 36 | knitr, 37 | modeldata, 38 | recipes (>= 0.1.7), 39 | rmarkdown, 40 | spelling, 41 | testthat (>= 3.0.0), 42 | xml2 43 | VignetteBuilder: 44 | knitr 45 | Encoding: UTF-8 46 | Language: en-US 47 | LazyData: true 48 | Roxygen: list(markdown = TRUE) 49 | RoxygenNote: 7.2.3 50 | Config/Needs/website: tidyverse/tidytemplate 51 | Config/testthat/edition: 3 52 | -------------------------------------------------------------------------------- /man/score.apd_hat_values.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/hat_values-score.R 3 | \name{score.apd_hat_values} 4 | \alias{score.apd_hat_values} 5 | \title{Score new samples using hat values} 6 | \usage{ 7 | \method{score}{apd_hat_values}(object, new_data, type = "numeric", ...) 8 | } 9 | \arguments{ 10 | \item{object}{A \code{apd_hat_values} object.} 11 | 12 | \item{new_data}{A data frame or matrix of new predictors.} 13 | 14 | \item{type}{A single character. The type of predictions to generate. 15 | Valid options are: 16 | \itemize{ 17 | \item \code{"numeric"} for a numeric value that summarizes the hat values for 18 | each sample across the training set. 19 | }} 20 | 21 | \item{...}{Not used, but required for extensibility.} 22 | } 23 | \value{ 24 | A tibble of predictions. The number of rows in the tibble is guaranteed 25 | to be the same as the number of rows in \code{new_data}. 
For \code{type = "numeric"}, 26 | the tibble contains two columns \code{hat_values} and \code{hat_values_pctls}. The 27 | column \code{hat_values_pctls} is in percent units so that a value of 11.5 28 | indicates that, in the training set, 11.5 percent of the training set 29 | samples had smaller values than the sample being scored. 30 | } 31 | \description{ 32 | Score new samples using hat values 33 | } 34 | \examples{ 35 | train_data <- mtcars[1:20, ] 36 | test_data <- mtcars[21:32, ] 37 | 38 | hat_values_model <- apd_hat_values(train_data) 39 | 40 | hat_values_scoring <- score(hat_values_model, new_data = test_data) 41 | hat_values_scoring 42 | } 43 | -------------------------------------------------------------------------------- /.github/workflows/test-coverage.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: test-coverage 10 | 11 | jobs: 12 | test-coverage: 13 | runs-on: ubuntu-latest 14 | env: 15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | 20 | - uses: r-lib/actions/setup-r@v2 21 | with: 22 | use-public-rspm: true 23 | 24 | - uses: r-lib/actions/setup-r-dependencies@v2 25 | with: 26 | extra-packages: any::covr 27 | needs: coverage 28 | 29 | - name: Test coverage 30 | run: | 31 | covr::codecov( 32 | quiet = FALSE, 33 | clean = FALSE, 34 | install_path = file.path(Sys.getenv("RUNNER_TEMP"), "package") 35 | ) 36 | shell: Rscript {0} 37 | 38 | - name: Show testthat output 39 | if: always() 40 | run: | 41 | ## -------------------------------------------------------------------- 42 | find ${{ runner.temp }}/package -name 'testthat.Rout*' -exec cat '{}' \; || true 43 | shell: bash 44 | 45 | - 
name: Upload test results 46 | if: failure() 47 | uses: actions/upload-artifact@v3 48 | with: 49 | name: coverage-test-failures 50 | path: ${{ runner.temp }}/package 51 | -------------------------------------------------------------------------------- /R/0.R: -------------------------------------------------------------------------------- 1 | #' @importFrom dplyr %>% 2 | #' @importFrom dplyr select 3 | #' @importFrom dplyr slice 4 | #' @importFrom dplyr matches 5 | #' @importFrom dplyr starts_with 6 | #' @importFrom dplyr rename_all 7 | #' @importFrom dplyr mutate 8 | #' @importFrom dplyr mutate_all 9 | #' @importFrom dplyr group_by 10 | #' @importFrom dplyr ungroup 11 | #' @importFrom dplyr count 12 | #' @importFrom dplyr sample_n 13 | #' @importFrom glue glue 14 | #' @importFrom tibble as_tibble 15 | #' @importFrom tibble tibble 16 | #' @importFrom purrr map_dfc 17 | #' @importFrom purrr map2_dfc 18 | #' @importFrom rlang abort 19 | #' @importFrom rlang enquos 20 | #' @importFrom rlang arg_match 21 | #' @importFrom stats predict 22 | #' @importFrom stats prcomp 23 | #' @importFrom stats approx 24 | #' @importFrom stats quantile 25 | #' @importFrom stats ecdf 26 | #' @importFrom stats setNames 27 | #' @importFrom hardhat validate_prediction_size 28 | #' @importFrom hardhat forge 29 | #' @importFrom hardhat mold 30 | #' @importFrom hardhat new_model 31 | #' @importFrom ggplot2 ggplot geom_step xlab ylab aes autoplot 32 | #' @importFrom Matrix Matrix colSums 33 | #' @importFrom tidyselect vars_select 34 | #' @importFrom tidyr gather 35 | #' @importFrom proxyC simil 36 | 37 | # ------------------------------------------------------------------------------ 38 | # nocov 39 | 40 | # Reduce false positives when R CMD check runs its "no visible binding for 41 | # global variable" check 42 | #' @importFrom utils globalVariables 43 | utils::globalVariables( 44 | c("cumulative", "n", "sim", "percentile", "component", "value") 45 | ) 46 | 47 | # nocov end 48 | 
# nocov start

# Dynamically register an S3 method for a generic owned by another package,
# without taking a hard dependency on that package (vendored from vctrs).
#
# generic: a string of the form "package::generic".
# class:   the class the method dispatches on.
# method:  the method function; when NULL, `generic.class` is looked up in the
#          calling package's namespace.
#
# Returns invisibly. Registration also re-runs via an onLoad hook so it
# survives the target package being unloaded and reloaded.
s3_register <- function(generic, class, method = NULL) {
  stopifnot(is.character(generic), length(generic) == 1)
  stopifnot(is.character(class), length(class) == 1)

  pieces <- strsplit(generic, "::")[[1]]
  stopifnot(length(pieces) == 2)
  package <- pieces[[1]]
  generic <- pieces[[2]]

  caller <- parent.frame()

  # Where to look up `generic.class` when `method` is NULL: the caller's
  # namespace when called from package code, otherwise the caller itself.
  get_method_env <- function() {
    top <- topenv(caller)
    if (isNamespace(top)) {
      asNamespace(environmentName(top))
    } else {
      caller
    }
  }

  # Fix: the helper previously declared an unused `env` parameter yet was
  # always called with a single argument; match upstream vctrs and drop it.
  get_method <- function(method) {
    if (is.null(method)) {
      get(paste0(generic, ".", class), envir = get_method_env())
    } else {
      method
    }
  }

  method_fn <- get_method(method)
  stopifnot(is.function(method_fn))

  # Always register hook in case package is later unloaded & reloaded.
  setHook(
    packageEvent(package, "onLoad"),
    function(...) {
      ns <- asNamespace(package)

      # Refresh the method, it might have been updated by `devtools::load_all()`
      method_fn <- get_method(method)

      registerS3method(generic, class, method_fn, envir = ns)
    }
  )

  # Avoid registration failures during loading (pkgload or regular).
  if (!isNamespaceLoaded(package)) {
    return(invisible())
  }

  envir <- asNamespace(package)

  # Only register if the generic can be accessed in the target namespace.
  if (exists(generic, envir)) {
    registerS3method(generic, class, method_fn, envir = envir)
  }

  invisible()
}

# nocov end
For \code{type = "numeric"}, 29 | the tibble contains a column called "similarity". If \code{add_percentile = TRUE}, 30 | an additional column called \code{similarity_pctl} will be added. These values are 31 | in percent units so that a value of 11.5 indicates that, in the training set, 32 | 11.5 percent of the training set samples had smaller values than the sample 33 | being scored. 34 | } 35 | \description{ 36 | Score new samples using similarity methods 37 | } 38 | \examples{ 39 | \donttest{ 40 | data(qsar_binary) 41 | 42 | jacc_sim <- apd_similarity(binary_tr) 43 | 44 | mean_sim <- score(jacc_sim, new_data = binary_unk) 45 | mean_sim 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /man/apd_hat_values.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/hat_values-fit.R 3 | \name{apd_hat_values} 4 | \alias{apd_hat_values} 5 | \alias{apd_hat_values.default} 6 | \alias{apd_hat_values.data.frame} 7 | \alias{apd_hat_values.matrix} 8 | \alias{apd_hat_values.formula} 9 | \alias{apd_hat_values.recipe} 10 | \title{Fit a \code{apd_hat_values}} 11 | \usage{ 12 | apd_hat_values(x, ...) 13 | 14 | \method{apd_hat_values}{default}(x, ...) 15 | 16 | \method{apd_hat_values}{data.frame}(x, ...) 17 | 18 | \method{apd_hat_values}{matrix}(x, ...) 19 | 20 | \method{apd_hat_values}{formula}(formula, data, ...) 21 | 22 | \method{apd_hat_values}{recipe}(x, data, ...) 23 | } 24 | \arguments{ 25 | \item{x}{Depending on the context: 26 | \itemize{ 27 | \item A \strong{data frame} of predictors. 28 | \item A \strong{matrix} of predictors. 29 | \item A \strong{recipe} specifying a set of preprocessing steps 30 | created from \code{\link[recipes:recipe]{recipes::recipe()}}. 
31 | }} 32 | 33 | \item{...}{Not currently used, but required for extensibility.} 34 | 35 | \item{formula}{A formula specifying the predictor terms on the right-hand 36 | side. No outcome should be specified.} 37 | 38 | \item{data}{When a \strong{recipe} or \strong{formula} is used, \code{data} is specified as: 39 | \itemize{ 40 | \item A \strong{data frame} containing the predictors. 41 | }} 42 | } 43 | \value{ 44 | A \code{apd_hat_values} object. 45 | } 46 | \description{ 47 | \code{apd_hat_values()} fits a model. 48 | } 49 | \examples{ 50 | predictors <- mtcars[, -1] 51 | 52 | # Data frame interface 53 | mod <- apd_hat_values(predictors) 54 | 55 | # Formula interface 56 | mod2 <- apd_hat_values(mpg ~ ., mtcars) 57 | 58 | # Recipes interface 59 | library(recipes) 60 | rec <- recipe(mpg ~ ., mtcars) 61 | rec <- step_log(rec, disp) 62 | mod3 <- apd_hat_values(rec, mtcars) 63 | } 64 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | # 4 | # NOTE: This workflow is overkill for most R packages and 5 | # check-standard.yaml is likely a better choice. 6 | # usethis::use_github_action("check-standard") will install it. 
7 | on: 8 | push: 9 | branches: [main, master] 10 | pull_request: 11 | branches: [main, master] 12 | 13 | name: R-CMD-check 14 | 15 | jobs: 16 | R-CMD-check: 17 | runs-on: ${{ matrix.config.os }} 18 | 19 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 20 | 21 | strategy: 22 | fail-fast: false 23 | matrix: 24 | config: 25 | - {os: macos-latest, r: 'release'} 26 | 27 | - {os: windows-latest, r: 'release'} 28 | # Use 3.6 to trigger usage of RTools35 29 | - {os: windows-latest, r: '3.6'} 30 | # use 4.1 to check with rtools40's older compiler 31 | - {os: windows-latest, r: '4.1'} 32 | 33 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 34 | - {os: ubuntu-latest, r: 'release'} 35 | - {os: ubuntu-latest, r: 'oldrel-1'} 36 | 37 | env: 38 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 39 | R_KEEP_PKG_SOURCE: yes 40 | 41 | steps: 42 | - uses: actions/checkout@v3 43 | 44 | - uses: r-lib/actions/setup-pandoc@v2 45 | 46 | - uses: r-lib/actions/setup-r@v2 47 | with: 48 | r-version: ${{ matrix.config.r }} 49 | http-user-agent: ${{ matrix.config.http-user-agent }} 50 | use-public-rspm: true 51 | 52 | - uses: r-lib/actions/setup-r-dependencies@v2 53 | with: 54 | extra-packages: any::rcmdcheck 55 | needs: check 56 | 57 | - uses: r-lib/actions/check-r-package@v2 58 | with: 59 | upload-snapshots: true 60 | -------------------------------------------------------------------------------- /man/apd_pca.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pca-fit.R 3 | \name{apd_pca} 4 | \alias{apd_pca} 5 | \alias{apd_pca.default} 6 | \alias{apd_pca.data.frame} 7 | \alias{apd_pca.matrix} 8 | \alias{apd_pca.formula} 9 | \alias{apd_pca.recipe} 10 | \title{Fit a \code{apd_pca}} 11 | \usage{ 12 | apd_pca(x, ...) 13 | 14 | \method{apd_pca}{default}(x, ...) 15 | 16 | \method{apd_pca}{data.frame}(x, threshold = 0.95, ...) 
17 | 18 | \method{apd_pca}{matrix}(x, threshold = 0.95, ...) 19 | 20 | \method{apd_pca}{formula}(formula, data, threshold = 0.95, ...) 21 | 22 | \method{apd_pca}{recipe}(x, data, threshold = 0.95, ...) 23 | } 24 | \arguments{ 25 | \item{x}{Depending on the context: 26 | \itemize{ 27 | \item A \strong{data frame} of predictors. 28 | \item A \strong{matrix} of predictors. 29 | \item A \strong{recipe} specifying a set of preprocessing steps 30 | created from \code{\link[recipes:recipe]{recipes::recipe()}}. 31 | }} 32 | 33 | \item{...}{Not currently used, but required for extensibility.} 34 | 35 | \item{threshold}{A number indicating the percentage of variance desired from 36 | the principal components. It must be a number greater than 0 and less or 37 | equal than 1.} 38 | 39 | \item{formula}{A formula specifying the predictor terms on the right-hand 40 | side. No outcome should be specified.} 41 | 42 | \item{data}{When a \strong{recipe} or \strong{formula} is used, \code{data} is specified as: 43 | \itemize{ 44 | \item A \strong{data frame} containing the predictors. 45 | }} 46 | } 47 | \value{ 48 | A \code{apd_pca} object. 49 | } 50 | \description{ 51 | \code{apd_pca()} fits a model. 52 | } 53 | \details{ 54 | The function computes the principal components that account for 55 | up to either 95\% or the provided \code{threshold} of variability. It also 56 | computes the percentiles of the absolute value of the principal components. 57 | Additionally, it calculates the mean of each principal component. 
# Find the distance between each row of principal component scores and the
# per-component means calculated on the training set.
#
# pcs:       matrix (or data frame) of PC scores, one row per sample.
# pca_means: numeric vector of training-set means, one per component.
#
# Returns a numeric vector of Euclidean distances, one per row of `pcs`.
find_distance_to_pca_means <- function(pcs, pca_means) {
  # Center each column at its training mean, then take the row-wise norm.
  # rowSums() is the vectorized equivalent of the previous
  # apply(sq_diff, 1, ...) loop and avoids the per-row function calls.
  diffs <- sweep(pcs, 2, pca_means)
  sqrt(rowSums(diffs^2))
}

# -----------------------------------------------------------------------------

# Reference percentiles of `x`: the empirical CDF evaluated on a 101-point
# grid over [0, 1]. Returned unnamed for downstream interpolation.
get_ref_percentile <- function(x) {
  res <- stats::ecdf(x)
  grid <- seq(0, 1, length = 101)
  res <- stats::quantile(res, grid)
  unname(res)
}

# -----------------------------------------------------------------------------

# Compute (X'X)^-1 for the hat-value calculation, aborting with an
# informative message when X'X is singular.
get_inv <- function(X) {
  if (!is.matrix(X)) {
    X <- as.matrix(X)
  }

  XpX <- t(X) %*% X

  # Robustness fix: the previous code matched the *exact* error string
  # (including a trailing newline), which is brittle across R versions and
  # message translations. Detect the singular case with a substring test and
  # re-raise any other failure with its original condition message.
  XpX_inv <- tryCatch(
    qr.solve(XpX),
    error = function(e) {
      msg <- conditionMessage(e)
      if (grepl("singular", msg, fixed = TRUE)) {
        msg <- paste(
          "Unable to compute the hat values of the matrix X of",
          "predictors because the matrix resulting from multiplying",
          "the transpose of X by X is singular.",
          sep = "\n"
        )
      }
      rlang::abort(message = msg)
    }
  )

  dimnames(XpX_inv) <- NULL
  XpX_inv
}

# -----------------------------------------------------------------------------

# Interpolate the percentile of new values against the reference percentiles.
# Values below/above the reference range are clamped to 0/1 respectively
# (approx() would otherwise return NA outside the range).
get_new_percentile <- function(ref, x_new, grid) {
  res <- stats::approx(ref, grid, xout = x_new)$y
  res[x_new < min(ref, na.rm = TRUE)] <- 0
  res[x_new > max(ref, na.rm = TRUE)] <- 1
  res
}
# Tests for R/pca-score.R: both the internal bridge/implementation functions
# (score_apd_pca_numeric, score_apd_pca_bridge) and the exported score()
# method for apd_pca. The "matches predict" tests fit on the first 15 rows of
# mtcars and score the next 15, comparing the PC columns of the output
# against stats::predict() on the underlying prcomp fit.

test_that("`score_apd_pca_numeric` fails when model has no pcs argument", {
  # mtcars is not an apd_pca model, so `model$pcs` is missing.
  expect_snapshot(error = TRUE,
    score_apd_pca_numeric(mtcars, mtcars)
  )
})

test_that("`score` fails when predictors only contain factors", {
  model <- apd_pca(~., iris)
  # iris$Species is a factor vector, not a data frame of predictors.
  expect_snapshot(error = TRUE,
    score(model, iris$Species)
  )
})

test_that("`score` fails when predictors are vectors", {
  object <- iris

  # A plain data frame dispatches to score.default, which aborts.
  expect_snapshot(error = TRUE,
    score(object)
  )
})

test_that("`score_apd_pca_numeric` pcs output matches `stats::predict` output", {
  model <- apd_pca(mtcars %>% dplyr::slice(1:15))
  predictors <- as.matrix(mtcars %>% dplyr::slice(16:30))

  # Reference: project new rows onto the retained principal components.
  expected <- stats::predict(model$pcs, predictors)
  expected <- as.data.frame(expected[, 1:model$num_comp, drop = FALSE])

  # Select columns of the form PC{number}
  actual_output <- score_apd_pca_numeric(model, predictors) %>%
    dplyr::select(dplyr::matches("^PC\\d+$"))

  # Data frame method
  expect_equal(ignore_attr = TRUE,
    actual_output,
    expected
  )
})

test_that("`score` pcs output matches `stats::predict` output", {
  model <- apd_pca(mtcars %>% dplyr::slice(1:15))
  predictors <- as.matrix(mtcars %>% dplyr::slice(16:30))

  # Reference: project new rows onto the retained principal components.
  expected <- stats::predict(model$pcs, predictors)
  expected <- as.data.frame(expected[, 1:model$num_comp, drop = FALSE])

  # Select columns of the form PC{number}
  actual_output <- score(model, predictors) %>%
    dplyr::select(dplyr::matches("^PC\\d+$"))

  # Data frame method
  expect_equal(ignore_attr = TRUE,
    actual_output,
    expected
  )
})

test_that("`score_apd_pca_bridge` output is correct", {
  model <- apd_pca(mtcars %>% dplyr::slice(1:15))
  predictors <- as.matrix(mtcars %>% dplyr::slice(16:30))

  # Reference: project new rows onto the retained principal components.
  expected <- stats::predict(model$pcs, predictors)
  expected <- as.data.frame(expected[, 1:model$num_comp, drop = FALSE])

  # Select columns of the form PC{number}
  actual_output <- score_apd_pca_bridge("numeric", model, predictors) %>%
    dplyr::select(dplyr::matches("^PC\\d+$"))

  # Data frame method
  expect_equal(ignore_attr = TRUE,
    actual_output,
    expected
  )
})
#' Print the number of predictors and principal components used.
#'
#' @param x A `apd_pca` object.
#'
#' @param ... Not currently used, but required for extensibility.
#'
#' @return None
#'
#' @examples
#'
#' model <- apd_pca(~ Sepal.Length + Sepal.Width, iris)
#' print(model)
#' @export
print.apd_pca <- function(x, ...) {
  predictors_count <- ncol(x$blueprint$ptypes$predictors)
  percentage <- x$threshold * 100
  num_comp <- x$num_comp

  # Singular/plural phrasing for the number of retained components.
  wording <- if (num_comp == 1) "component was" else "components were"

  cat(glue::glue(
    "# Predictors:
{predictors_count}
# Principal Components:
{num_comp} {wording} needed
to capture at least {percentage}% of the
total variation in the predictors."
  ))

  invisible(x)
}

#' Print the number of predictors used.
#'
#' @param x A `apd_hat_values` object.
#'
#' @param ... Not currently used, but required for extensibility.
#'
#' @return None
#'
#' @examples
#'
#' model <- apd_hat_values(~ Sepal.Length + Sepal.Width, iris)
#' print(model)
#' @export
print.apd_hat_values <- function(x, ...) {
  predictors_count <- ncol(x$blueprint$ptypes$predictors)

  cat(glue::glue(
    "# Predictors:
{predictors_count}"
  ))

  invisible(x)
}

#' Print a summary of a similarity applicability domain model.
#'
#' @param x A `apd_similarity` object.
#'
#' @param ... Not currently used, but required for extensibility.
#'
#' @return None
#'
#' @examples
#'
#' set.seed(535)
#' tr_x <- matrix(
#'   sample(0:1, size = 20 * 50, prob = rep(.5, 2), replace = TRUE),
#'   ncol = 20
#' )
#' model <- apd_similarity(tr_x)
#' print(model)
#' @export
print.apd_similarity <- function(x, ...) {
  cat("Applicability domain via similarity\n")
  cat(
    "Reference data were", ncol(x$ref_data), "variables collected on",
    nrow(x$ref_data), "data points.\n"
  )
  # Report how scores for new data are summarized: a quantile when one was
  # supplied at fit time, otherwise the mean.
  if (is.na(x$quantile)) {
    cat("New data summarized using the mean.\n")
  } else {
    cat(
      "New data summarized using the ", round(x$quantile * 100, 1),
      "th percentile.\n",
      sep = ""
    )
  }
  invisible(x)
}
51 | unique similarity values in the training set. Note that for large samples, 52 | this is an approximation based on a random sample of 5,000 training set 53 | points.
12 | #' @return \item{binary_tr,binary_unk}{data frames with 67 columns}
45 | More data related to the set described by De Cock (2011) where data were 46 | recorded for 2,930 properties in Ames IA. 47 | 48 | This data set includes three more properties added since the original 49 | reference. There are fewer fields in this data set; only those that could be 50 | transcribed from the assessor's office were included.
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | issue_comment: 5 | types: [created] 6 | 7 | name: Commands 8 | 9 | jobs: 10 | document: 11 | if: ${{ github.event.issue.pull_request && (github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') && startsWith(github.event.comment.body, '/document') }} 12 | name: document 13 | runs-on: ubuntu-latest 14 | env: 15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 16 | steps: 17 | - uses: actions/checkout@v3 18 | 19 | - uses: r-lib/actions/pr-fetch@v2 20 | with: 21 | repo-token: ${{ secrets.GITHUB_TOKEN }} 22 | 23 | - uses: r-lib/actions/setup-r@v2 24 | with: 25 | use-public-rspm: true 26 | 27 | - uses: r-lib/actions/setup-r-dependencies@v2 28 | with: 29 | extra-packages: any::roxygen2 30 | needs: pr-document 31 | 32 | - name: Document 33 | run: roxygen2::roxygenise() 34 | shell: Rscript {0} 35 | 36 | - name: commit 37 | run: | 38 | git config --local user.name "$GITHUB_ACTOR" 39 | git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" 40 | git add man/\* NAMESPACE 41 | git commit -m 'Document' 42 | 43 | - uses: r-lib/actions/pr-push@v2 44 | with: 45 | repo-token: ${{ secrets.GITHUB_TOKEN }} 46 | 47 | style: 48 | if: ${{ github.event.issue.pull_request && (github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') && startsWith(github.event.comment.body, '/style') }} 49 | name: style 50 | runs-on: ubuntu-latest 51 | env: 52 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 53 | steps: 54 | - uses: actions/checkout@v3 55 | 56 | - uses: r-lib/actions/pr-fetch@v2 57 | with: 58 | repo-token: ${{ secrets.GITHUB_TOKEN }} 59 | 60 | - uses: r-lib/actions/setup-r@v2 61 | 62 | - name: Install dependencies 63 | run: install.packages("styler") 64 | shell: Rscript {0} 65 | 66 | - name: Style 67 | run: styler::style_pkg() 68 | shell: Rscript {0} 69 | 70 | - name: commit 71 | run: | 72 | git config --local user.name 
"$GITHUB_ACTOR" 73 | git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" 74 | git add \*.R 75 | git commit -m 'Style' 76 | 77 | - uses: r-lib/actions/pr-push@v2 78 | with: 79 | repo-token: ${{ secrets.GITHUB_TOKEN }} 80 | -------------------------------------------------------------------------------- /tests/testthat/_snaps/similarity.md: -------------------------------------------------------------------------------- 1 | # bad args 2 | 3 | Code 4 | apd_similarity(tr_x, quantile = 2) 5 | Error 6 | The `quantile` argument should be NA or a single numeric value in [0, 1]. 7 | 8 | --- 9 | 10 | Code 11 | apd_similarity(tr_x_sp) 12 | Error 13 | `x` is not of a recognized type. 14 | Only data.frame, matrix, recipe, and formula objects are allowed. 15 | A dgCMatrix was specified. 16 | 17 | # printed output 18 | 19 | Code 20 | print(apd_similarity(tr_x)) 21 | Output 22 | Applicability domain via similarity 23 | Reference data were 20 variables collected on 50 data points. 24 | New data summarized using the mean. 25 | 26 | --- 27 | 28 | Code 29 | print(apd_similarity(tr_x)) 30 | Output 31 | Applicability domain via similarity 32 | Reference data were 20 variables collected on 50 data points. 33 | New data summarized using the mean. 34 | 35 | --- 36 | 37 | Code 38 | print(apd_similarity(tr_x)) 39 | Output 40 | Applicability domain via similarity 41 | Reference data were 20 variables collected on 50 data points. 42 | New data summarized using the mean. 43 | 44 | --- 45 | 46 | Code 47 | print(apd_similarity(tr_x, quantile = 0.13)) 48 | Output 49 | Applicability domain via similarity 50 | Reference data were 20 variables collected on 50 data points. 51 | New data summarized using the 13th percentile. 52 | 53 | # apd_similarity fails when quantile is neither NA nor a number in [0, 1] 54 | 55 | Code 56 | apd_similarity(tr_x, quantile = -1) 57 | Error 58 | The `quantile` argument should be NA or a single numeric value in [0, 1]. 
59 | 60 | --- 61 | 62 | Code 63 | apd_similarity(tr_x, quantile = 3) 64 | Error 65 | The `quantile` argument should be NA or a single numeric value in [0, 1]. 66 | 67 | --- 68 | 69 | Code 70 | apd_similarity(tr_x, quantile = "la") 71 | Error 72 | The `quantile` argument should be NA or a single numeric value in [0, 1]. 73 | 74 | # apd_similarity outputs warning with zero variance variables 75 | 76 | Code 77 | apd_similarity(bad_data) 78 | Warning 79 | The following variables had zero variance and were removed: a, b, and d 80 | Output 81 | Applicability domain via similarity 82 | Reference data were 1 variables collected on 2 data points. 83 | New data summarized using the mean. 84 | 85 | # apd_similarity fails when all the variables have zero variance 86 | 87 | Code 88 | apd_similarity(bad_data) 89 | Error 90 | All variables have a single unique value. 91 | 92 | # apd_similarity fails data is not binary 93 | 94 | Code 95 | apd_similarity(bad_data) 96 | Error 97 | The following variables are not binary: b, and d 98 | 99 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # applicable 5 | 6 | 7 | 8 | [![R-CMD-check](https://github.com/tidymodels/applicable/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/tidymodels/applicable/actions/workflows/R-CMD-check.yaml) 9 | [![Codecov test 10 | coverage](https://codecov.io/gh/tidymodels/applicable/branch/main/graph/badge.svg)](https://app.codecov.io/gh/tidymodels/applicable?branch=main) 11 | [![Lifecycle:experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html) 12 | [![CRAN 13 | status](https://www.r-pkg.org/badges/version/applicable)](https://cran.r-project.org/package=applicable) 14 | 15 | 16 | ## Introduction 17 | 18 | There are times when a model’s prediction should be taken with some 19 | 
skepticism. For example, if a new data point is substantially different 20 | from the training set, its predicted value may be suspect. In chemistry, 21 | it is not uncommon to create an “applicability domain” model that 22 | measures the amount of potential extrapolation new samples have from the 23 | training set. applicable contains different methods to measure how much 24 | a new data point is an extrapolation from the original data (if at all). 25 | 26 | ## Installation 27 | 28 | You can install the released version of applicable from 29 | [CRAN](https://CRAN.R-project.org) with: 30 | 31 | ``` r 32 | install.packages("applicable") 33 | ``` 34 | 35 | Install the development version of applicable from 36 | [GitHub](https://github.com/) with: 37 | 38 | ``` r 39 | # install.packages("pak") 40 | pak::pak("tidymodels/applicable") 41 | ``` 42 | 43 | ## Vignettes 44 | 45 | To learn about how to use applicable, check out the vignettes: 46 | 47 | - `vignette("binary-data", "applicable")`: Learn different methods to 48 | analyze binary data. 49 | 50 | - `vignette("continuous-data", "applicable")`: Learn different methods 51 | to analyze continuous data. 52 | 53 | ## Contributing 54 | 55 | This project is released with a [Contributor Code of 56 | Conduct](https://contributor-covenant.org/version/2/0/CODE_OF_CONDUCT.html). 57 | By contributing to this project, you agree to abide by its terms. 58 | 59 | - For questions and discussions about tidymodels packages, modeling, and 60 | machine learning, please [post on RStudio 61 | Community](https://community.rstudio.com/new-topic?category_id=15&tags=tidymodels,question). 62 | 63 | - If you think you have encountered a bug, please [submit an 64 | issue](https://github.com/tidymodels/applicable/issues). 65 | 66 | - Either way, learn how to create and share a 67 | [reprex](https://reprex.tidyverse.org/articles/articles/learn-reprex.html) 68 | (a minimal, reproducible example), to clearly communicate about your 69 | code. 
44 | similarity scores. These can be summarized into the mean similarity.
45 | 46 | * `autoplot`: shows the cumulative probability versus the unique similarity 47 | values in the training set. 48 | 49 | * `score`: scores new samples using similarity methods. In particular, it 50 | calculates the similarity scores and if `add_percentile = TRUE`, it also 51 | estimates the percentile of the similarity scores. 52 | 53 | ## Example 54 | 55 | The example data is from two QSAR data sets where binary fingerprints are used 56 | as predictors. 57 | 58 | ```{r} 59 | data(qsar_binary) 60 | ``` 61 | 62 | Let us construct the model: 63 | 64 | ```{r} 65 | jacc_sim <- apd_similarity(binary_tr) 66 | jacc_sim 67 | ``` 68 | 69 | As we can see below, this is a fairly diverse training set: 70 | 71 | ```{r jac-plot} 72 | #| fig-alt: "Empirical cumulative distribution chart. Mean similarity along the x-axis, Cumulative Probability along the why axis. Reading from left to right, values stay close to 0 from x = 0 to x = 0.25, from x = 0.25 to x = 0.4 there is a near-linear upwards trend to about y = 0.70. After that y = 1." 73 | library(ggplot2) 74 | 75 | # Plot the empirical cumulative distribution function for the training set 76 | autoplot(jacc_sim) 77 | ``` 78 | 79 | We can compare the similarity between new samples and the training set: 80 | 81 | ```{r} 82 | # Summarize across all training set similarities 83 | mean_sim <- score(jacc_sim, new_data = binary_unk) 84 | mean_sim 85 | ``` 86 | 87 | Samples 3 and 5 are definitely extrapolations based on these predictors. 88 | In other words, the new samples are not similar to the training set and so 89 | predictions on them may not be very reliable. 
90 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | editor_options: 4 | chunk_output_type: console 5 | --- 6 | 7 | 8 | 9 | ```{r, include = FALSE} 10 | knitr::opts_chunk$set( 11 | collapse = TRUE, 12 | comment = "#>", 13 | fig.path = "man/figures/README-", 14 | out.width = "100%" 15 | ) 16 | 17 | options(rlang__backtrace_on_error = "reminder") 18 | 19 | ``` 20 | 21 | # applicable 22 | 23 | 24 | [![R-CMD-check](https://github.com/tidymodels/applicable/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/tidymodels/applicable/actions/workflows/R-CMD-check.yaml) 25 | [![Codecov test coverage](https://codecov.io/gh/tidymodels/applicable/branch/main/graph/badge.svg)](https://app.codecov.io/gh/tidymodels/applicable?branch=main) 26 | [![Lifecycle:experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html) 27 | [![CRAN status](https://www.r-pkg.org/badges/version/applicable)](https://cran.r-project.org/package=applicable) 28 | 29 | 30 | ## Introduction 31 | 32 | There are times when a model's prediction should be taken with some skepticism. For example, if a new data point is substantially different from the training set, its predicted value may be suspect. In chemistry, it is not uncommon to create an "applicability domain" model that measures the amount of potential extrapolation new samples have from the training set. applicable contains different methods to measure how much a new data point is an extrapolation from the original data (if at all). 
33 | 34 | ## Installation 35 | 36 | You can install the released version of applicable from [CRAN](https://CRAN.R-project.org) with: 37 | 38 | ``` r 39 | install.packages("applicable") 40 | ``` 41 | 42 | Install the development version of applicable from [GitHub](https://github.com/) with: 43 | 44 | ``` r 45 | # install.packages("pak") 46 | pak::pak("tidymodels/applicable") 47 | ``` 48 | 49 | ## Vignettes 50 | 51 | To learn about how to use applicable, check out the vignettes: 52 | 53 | - `vignette("binary-data", "applicable")`: Learn different methods to analyze binary data. 54 | 55 | - `vignette("continuous-data", "applicable")`: Learn different methods to analyze continuous data. 56 | 57 | ## Contributing 58 | 59 | This project is released with a [Contributor Code of Conduct](https://contributor-covenant.org/version/2/0/CODE_OF_CONDUCT.html). By contributing to this project, you agree to abide by its terms. 60 | 61 | - For questions and discussions about tidymodels packages, modeling, and machine learning, please [post on RStudio Community](https://community.rstudio.com/new-topic?category_id=15&tags=tidymodels,question). 62 | 63 | - If you think you have encountered a bug, please [submit an issue](https://github.com/tidymodels/applicable/issues). 64 | 65 | - Either way, learn how to create and share a [reprex](https://reprex.tidyverse.org/articles/articles/learn-reprex.html) (a minimal, reproducible example), to clearly communicate about your code. 66 | 67 | - Check out further details on [contributing guidelines for tidymodels packages](https://www.tidymodels.org/contribute/) and [how to get help](https://www.tidymodels.org/help/). 
68 | -------------------------------------------------------------------------------- /tests/testthat/test-pca-fit.R: -------------------------------------------------------------------------------- 1 | test_that("`new_apd_pca` arguments are assigned correctly", { 2 | x <- new_apd_pca( 3 | "pcs", 4 | "pca_means", 5 | "pctls", 6 | "threshold", 7 | "num_comp", 8 | blueprint = hardhat::default_xy_blueprint() 9 | ) 10 | 11 | expect_equal(names(x), c("pcs", "pca_means", "pctls", "threshold", "num_comp", "blueprint")) 12 | expect_equal(x$pcs, "pcs") 13 | expect_equal(x$pca_means, "pca_means") 14 | expect_equal(x$pctls, "pctls") 15 | expect_equal(x$threshold, "threshold") 16 | expect_equal(x$num_comp, "num_comp") 17 | expect_equal(x$blueprint, hardhat::default_xy_blueprint()) 18 | }) 19 | 20 | test_that("pcs is provided", { 21 | expect_snapshot(error = TRUE, 22 | new_apd_pca(blueprint = hardhat::default_xy_blueprint()) 23 | ) 24 | }) 25 | 26 | test_that("`new_apd_pca` fails when blueprint is numeric", { 27 | expect_snapshot(error = TRUE, 28 | new_apd_pca(pcs = 1, blueprint = 1) 29 | ) 30 | }) 31 | 32 | test_that("`new_apd_pca` returned blueprint is of class hardhat_blueprint", { 33 | x <- new_apd_pca( 34 | "pcs", 35 | "pca_means", 36 | "pctls", 37 | "threshold", 38 | "num_comp", 39 | blueprint = hardhat::default_xy_blueprint() 40 | ) 41 | 42 | expect_s3_class(x$blueprint, "hardhat_blueprint") 43 | }) 44 | 45 | 46 | test_that("`apd_pca` fails when model is not of class apd_pca", { 47 | model <- apd_pca(~ Sepal.Length + Species, iris) 48 | expect_s3_class(model, "apd_pca") 49 | }) 50 | 51 | test_that("`apd_pca` fails when model is not of class hardhat_model", { 52 | model <- apd_pca(~ Sepal.Length + Species, iris) 53 | expect_s3_class(model, "hardhat_model") 54 | }) 55 | 56 | test_that("pcs matches `prcomp` output for the data frame method", { 57 | expected <- stats::prcomp(mtcars, center = TRUE, scale. 
= TRUE) 58 | expected$x <- NULL 59 | 60 | # Data frame method 61 | expect_equal(ignore_attr = TRUE, 62 | apd_pca(mtcars)$pcs, 63 | expected 64 | ) 65 | }) 66 | 67 | test_that("pcs matches `prcomp` output for the formula method", { 68 | expected <- stats::prcomp(mtcars, center = TRUE, scale. = TRUE) 69 | expected$x <- NULL 70 | 71 | # Formula method 72 | expect_equal(ignore_attr = TRUE, 73 | apd_pca(~., mtcars)$pcs, 74 | expected 75 | ) 76 | }) 77 | 78 | test_that("pcs matches `prcomp` output for the recipe method", { 79 | expected <- stats::prcomp(mtcars, center = TRUE, scale. = TRUE) 80 | expected$x <- NULL 81 | 82 | # Recipe method 83 | rec <- recipes::recipe(~., mtcars) 84 | expect_equal(ignore_attr = TRUE, 85 | apd_pca(rec, data = mtcars)$pcs, 86 | expected 87 | ) 88 | }) 89 | 90 | test_that("pcs matches `prcomp` output for the matrix method", { 91 | expected <- stats::prcomp(mtcars, center = TRUE, scale. = TRUE) 92 | expected$x <- NULL 93 | 94 | # Matrix method 95 | expect_equal(ignore_attr = TRUE, 96 | apd_pca(as.matrix(mtcars))$pcs, 97 | expected 98 | ) 99 | }) 100 | 101 | test_that("`apd_pca` is not defined for vectors", { 102 | cls <- class(mtcars$mpg)[1] 103 | expected_message <- glue::glue("`x` is not of a recognized type. 104 | Only data.frame, matrix, recipe, and formula objects are allowed. 
105 | A {cls} was specified.") 106 | 107 | expect_condition( 108 | apd_pca(mtcars$mpg), 109 | expected_message 110 | ) 111 | }) 112 | -------------------------------------------------------------------------------- /R/hat_values-score.R: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # ---------------------- Model function implementation ------------------------ 3 | # ----------------------------------------------------------------------------- 4 | 5 | score_apd_hat_values_numeric <- function(model, predictors) { 6 | if (!("XtX_inv" %in% names(model))) { 7 | rlang::abort("The model must contain an XtX_inv argument.") 8 | } 9 | 10 | proj_matrix <- predictors %*% model$XtX_inv %*% t(predictors) 11 | hat_values <- diag(proj_matrix) 12 | 13 | hat_values_pctls <- get_new_percentile( 14 | model$pctls$hat_values_pctls, 15 | hat_values, 16 | model$pctls$percentile 17 | ) 18 | 19 | tibble::as_tibble( 20 | cbind( 21 | hat_values, 22 | hat_values_pctls 23 | ) 24 | ) 25 | } 26 | 27 | # ----------------------------------------------------------------------------- 28 | # ------------------------ Model function bridge ------------------------------ 29 | # ----------------------------------------------------------------------------- 30 | 31 | score_apd_hat_values_bridge <- function(type, model, predictors) { 32 | predictors <- as.matrix(predictors) 33 | 34 | score_function <- get_hat_values_score_function(type) 35 | predictions <- score_function(model, predictors) 36 | 37 | hardhat::validate_prediction_size(predictions, predictors) 38 | 39 | predictions 40 | } 41 | 42 | get_hat_values_score_function <- function(type) { 43 | switch(type, 44 | numeric = score_apd_hat_values_numeric 45 | ) 46 | } 47 | 48 | # ----------------------------------------------------------------------------- 49 | # ----------------------- Model function interface 
---------------------------- 50 | # ----------------------------------------------------------------------------- 51 | 52 | #' Score new samples using hat values 53 | #' 54 | #' @param object A `apd_hat_values` object. 55 | #' 56 | #' @param new_data A data frame or matrix of new predictors. 57 | #' 58 | #' @param type A single character. The type of predictions to generate. 59 | #' Valid options are: 60 | #' 61 | #' - `"numeric"` for a numeric value that summarizes the hat values for 62 | #' each sample across the training set. 63 | #' 64 | #' @param ... Not used, but required for extensibility. 65 | #' 66 | #' @return 67 | #' 68 | #' A tibble of predictions. The number of rows in the tibble is guaranteed 69 | #' to be the same as the number of rows in `new_data`. For `type = "numeric"`, 70 | #' the tibble contains two columns `hat_values` and `hat_values_pctls`. The 71 | #' column `hat_values_pctls` is in percent units so that a value of 11.5 72 | #' indicates that, in the training set, 11.5 percent of the training set 73 | #' samples had smaller values than the sample being scored. 74 | #' 75 | #' @examples 76 | #' train_data <- mtcars[1:20, ] 77 | #' test_data <- mtcars[21:32, ] 78 | #' 79 | #' hat_values_model <- apd_hat_values(train_data) 80 | #' 81 | #' hat_values_scoring <- score(hat_values_model, new_data = test_data) 82 | #' hat_values_scoring 83 | #' @export 84 | score.apd_hat_values <- function(object, new_data, type = "numeric", ...) 
{ 85 | forged <- hardhat::forge(new_data, object$blueprint) 86 | rlang::arg_match(type, valid_predict_types()) 87 | score_apd_hat_values_bridge(type, object, forged$predictors) 88 | } 89 | 90 | # ----------------------------------------------------------------------------- 91 | # ----------------------- Helper functions ------------------------------------ 92 | # ----------------------------------------------------------------------------- 93 | 94 | valid_predict_types <- function() { 95 | c("numeric") 96 | } 97 | -------------------------------------------------------------------------------- /R/pca-score.R: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # ---------------------- Model function implementation ------------------------ 3 | # ----------------------------------------------------------------------------- 4 | 5 | score_apd_pca_numeric <- function(model, predictors) { 6 | if (!("pcs" %in% names(model))) { 7 | rlang::abort("The model must contain a pcs argument.") 8 | } 9 | 10 | # Predict output and subset using `num_comp` 11 | predicted_output <- stats::predict(model$pcs, predictors) 12 | predicted_output <- predicted_output[, 1:model$num_comp, drop = FALSE] 13 | 14 | # Compute distances between new pca values and the pca means 15 | dists <- find_distance_to_pca_means(predicted_output, model$pca_means) 16 | 17 | predicted_output <- 18 | as_tibble(predicted_output) %>% 19 | setNames(names0(ncol(predicted_output), "PC")) %>% 20 | mutate(distance = dists) 21 | 22 | # Compute percentile of new pca values 23 | new_pctls <- purrr::map2_dfc( 24 | model$pctls %>% 25 | dplyr::select(-percentile), 26 | predicted_output %>% mutate_all(abs), 27 | get_new_percentile, 28 | grid = model$pctls$percentile 29 | ) %>% 30 | dplyr::rename_all(paste0, "_pctl") 31 | 32 | tibble::as_tibble( 33 | cbind( 34 | predicted_output, 35 | new_pctls 36 | ) 37 | ) 38 | 
} 39 | 40 | # ----------------------------------------------------------------------------- 41 | # ------------------------ Model function bridge ------------------------------ 42 | # ----------------------------------------------------------------------------- 43 | 44 | score_apd_pca_bridge <- function(type, model, predictors) { 45 | predictors <- as.matrix(predictors) 46 | 47 | score_function <- get_pca_score_function(type) 48 | predictions <- score_function(model, predictors) 49 | 50 | hardhat::validate_prediction_size(predictions, predictors) 51 | 52 | predictions 53 | } 54 | 55 | # ----------------------------------------------------------------------------- 56 | # ----------------------- Model function interface ---------------------------- 57 | # ----------------------------------------------------------------------------- 58 | 59 | #' Predict from a `apd_pca` 60 | #' 61 | #' @param object A `apd_pca` object. 62 | #' 63 | #' @param new_data A data frame or matrix of new samples. 64 | #' 65 | #' @param type A single character. The type of predictions to generate. 66 | #' Valid options are: 67 | #' 68 | #' - `"numeric"` for numeric predictions. 69 | #' 70 | #' @param ... Not used, but required for extensibility. 71 | #' 72 | #' @details The function computes the principal components of the new data and 73 | #' their percentiles as compared to the training data. The number of principal 74 | #' components computed depends on the `threshold` given at fit time. It also 75 | #' computes the multivariate distance between each principal component and its 76 | #' mean. 77 | #' 78 | #' @return 79 | #' 80 | #' A tibble of predictions. The number of rows in the tibble is guaranteed 81 | #' to be the same as the number of rows in `new_data`. 
82 | #' 83 | #' @examples 84 | #' train <- mtcars[1:20, ] 85 | #' test <- mtcars[21:32, -1] 86 | #' 87 | #' # Fit 88 | #' mod <- apd_pca(mpg ~ cyl + log(drat), train) 89 | #' 90 | #' # Predict, with preprocessing 91 | #' score(mod, test) 92 | #' @export 93 | score.apd_pca <- function(object, new_data, type = "numeric", ...) { 94 | forged <- hardhat::forge(new_data, object$blueprint) 95 | rlang::arg_match(type, valid_predict_types()) 96 | score_apd_pca_bridge(type, object, forged$predictors) 97 | } 98 | 99 | # ----------------------------------------------------------------------------- 100 | # ----------------------- Helper functions ------------------------------------ 101 | # ----------------------------------------------------------------------------- 102 | 103 | get_pca_score_function <- function(type) { 104 | switch(type, 105 | numeric = score_apd_pca_numeric 106 | ) 107 | } 108 | 109 | valid_predict_types <- function() { 110 | c("numeric") 111 | } 112 | -------------------------------------------------------------------------------- /man/apd_similarity.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/similarity.R 3 | \name{apd_similarity} 4 | \alias{apd_similarity} 5 | \alias{apd_similarity.default} 6 | \alias{apd_similarity.data.frame} 7 | \alias{apd_similarity.matrix} 8 | \alias{apd_similarity.formula} 9 | \alias{apd_similarity.recipe} 10 | \title{Applicability domain methods using binary similarity analysis} 11 | \usage{ 12 | apd_similarity(x, ...) 13 | 14 | \method{apd_similarity}{default}(x, quantile = NA_real_, ...) 15 | 16 | \method{apd_similarity}{data.frame}(x, quantile = NA_real_, ...) 17 | 18 | \method{apd_similarity}{matrix}(x, quantile = NA_real_, ...) 19 | 20 | \method{apd_similarity}{formula}(formula, data, quantile = NA_real_, ...) 21 | 22 | \method{apd_similarity}{recipe}(x, data, quantile = NA_real_, ...) 
23 | } 24 | \arguments{ 25 | \item{x}{Depending on the context: 26 | \itemize{ 27 | \item A \strong{data frame} of binary predictors. 28 | \item A \strong{matrix} of binary predictors. 29 | \item A \strong{recipe} specifying a set of preprocessing steps 30 | created from \code{\link[recipes:recipe]{recipes::recipe()}}. 31 | }} 32 | 33 | \item{...}{Options to pass to \code{proxyC::simil()}, such as \code{method}. If no 34 | options are specified, \code{method = "jaccard"} is used.} 35 | 36 | \item{quantile}{A real number between 0 and 1 or NA for how the similarity 37 | values for each sample versus the training set should be summarized. A value 38 | of \code{NA} specifies that the mean similarity is computed. Otherwise, the 39 | appropriate quantile is computed.} 40 | 41 | \item{formula}{A formula specifying the predictor terms on the right-hand 42 | side. No outcome should be specified.} 43 | 44 | \item{data}{When a \strong{recipe} or \strong{formula} is used, \code{data} is specified as: 45 | \itemize{ 46 | \item A \strong{data frame} containing the binary predictors. Any predictors with 47 | no 1's will be removed (with a warning). 48 | }} 49 | } 50 | \value{ 51 | A \code{apd_similarity} object. 52 | } 53 | \description{ 54 | \code{apd_similarity()} is used to analyze samples in terms of similarity scores 55 | for binary data. All features in the data should be binary (i.e. zero or 56 | one). 57 | } 58 | \details{ 59 | The function computes measures of similarity for different samples 60 | points. For example, suppose samples \code{A} and \code{B} both contain \emph{p} binary 61 | variables. First, a 2x2 table is constructed between \code{A} and \code{B} \emph{across 62 | their elements}. The table will contain \emph{p} entries across the four cells 63 | (see the example below). From this, different measures of likeness are 64 | computed. 
65 | 66 | For a training set of \emph{n} samples, a new sample is compared to each, 67 | resulting in \emph{n} similarity scores. These can be summarized into a single 68 | value; the mean similarity is used by default by the scoring function. 69 | 70 | For this method, the computational methods are fairly taxing for large data 71 | sets. The training set must be stored (albeit in a sparse matrix format) so 72 | object sizes may become large. 73 | 74 | By default, the computations are run in parallel using \emph{all possible 75 | cores}. To change this, call the \code{setThreadOptions} function in the 76 | \code{RcppParallel} package. 77 | } 78 | \examples{ 79 | \donttest{ 80 | data(qsar_binary) 81 | 82 | jacc_sim <- apd_similarity(binary_tr) 83 | jacc_sim 84 | 85 | # plot the empirical cumulative distribution function (ECDF) for the training set: 86 | library(ggplot2) 87 | autoplot(jacc_sim) 88 | 89 | # Example calculations for two samples: 90 | A <- as.matrix(binary_tr[1, ]) 91 | B <- as.matrix(binary_tr[2, ]) 92 | xtab <- table(A, B) 93 | xtab 94 | 95 | # Jaccard statistic 96 | xtab[2, 2] / (xtab[1, 2] + xtab[2, 1] + xtab[2, 2]) 97 | 98 | # Hamman statistic 99 | ((xtab[1, 1] + xtab[2, 2]) - (xtab[1, 2] + xtab[2, 1])) / sum(xtab) 100 | 101 | # Faith statistic 102 | (xtab[1, 1] + xtab[2, 2] / 2) / sum(xtab) 103 | 104 | # Summarize across all training set similarities 105 | mean_sim <- score(jacc_sim, new_data = binary_unk) 106 | mean_sim 107 | } 108 | } 109 | \references{ 110 | Leach, A. and Gillet V. (2007). \emph{An Introduction to 111 | Chemoinformatics}.
Springer, New York 112 | } 113 | -------------------------------------------------------------------------------- /tests/testthat/test-hat_values-fit.R: -------------------------------------------------------------------------------- 1 | test_that("`new_apd_hat_values` arguments are assigned correctly", { 2 | x <- new_apd_hat_values( 3 | "XtX_inv", 4 | "pctls", 5 | blueprint = hardhat::default_xy_blueprint() 6 | ) 7 | 8 | expect_equal(names(x), c("XtX_inv", "pctls", "blueprint")) 9 | expect_equal(x$XtX_inv, "XtX_inv") 10 | expect_equal(x$pctls, "pctls") 11 | expect_equal(x$blueprint, hardhat::default_xy_blueprint()) 12 | }) 13 | 14 | test_that("XtX_inv is provided", { 15 | expect_snapshot(error = TRUE, 16 | new_apd_hat_values(blueprint = hardhat::default_xy_blueprint()) 17 | ) 18 | }) 19 | 20 | test_that("`new_apd_hat_values` fails when blueprint is numeric", { 21 | expect_snapshot(error = TRUE, 22 | new_apd_hat_values(XtX_inv = 1, blueprint = 1) 23 | ) 24 | }) 25 | 26 | test_that("`new_apd_hat_values` returned blueprint is of class hardhat_blueprint", { 27 | x <- new_apd_hat_values( 28 | "XtX_inv", 29 | "pctls", 30 | blueprint = hardhat::default_xy_blueprint() 31 | ) 32 | 33 | expect_s3_class(x$blueprint, "hardhat_blueprint") 34 | }) 35 | 36 | test_that("`apd_hat_values` fails when model is not of class apd_hat_values", { 37 | model <- apd_hat_values(~ Sepal.Length + Species, iris) 38 | expect_s3_class(model, "apd_hat_values") 39 | }) 40 | 41 | test_that("`apd_hat_values` fails when model is not of class hardhat_model", { 42 | model <- apd_hat_values(~ Sepal.Length + Species, iris) 43 | expect_s3_class(model, "hardhat_model") 44 | }) 45 | 46 | test_that("`apd_hat_values` is defined for data.frame objects", { 47 | x <- apd_hat_values(mtcars) 48 | X <- as.matrix(mtcars) 49 | XpX <- t(X) %*% X 50 | XtX_inv <- qr.solve(XpX) 51 | dimnames(XtX_inv) <- NULL 52 | 53 | expect_equal(class(x), c("apd_hat_values", "hardhat_model", "hardhat_scalar")) 54 | 
expect_equal(names(x), c("XtX_inv", "pctls", "blueprint")) 55 | expect_equal(x$XtX_inv, XtX_inv) 56 | }) 57 | 58 | test_that("`apd_hat_values` is defined for formula objects", { 59 | x <- apd_hat_values(~ Sepal.Width + Sepal.Length, iris) 60 | X <- as.matrix(iris %>% select(Sepal.Width, Sepal.Length)) 61 | XpX <- t(X) %*% X 62 | XtX_inv <- qr.solve(XpX) 63 | dimnames(XtX_inv) <- NULL 64 | 65 | expect_equal(class(x), c("apd_hat_values", "hardhat_model", "hardhat_scalar")) 66 | expect_equal(names(x), c("XtX_inv", "pctls", "blueprint")) 67 | expect_equal(x$XtX_inv, XtX_inv) 68 | }) 69 | 70 | test_that("`apd_hat_values` is defined for recipe objects", { 71 | rec <- recipes::recipe(~ Sepal.Width + Sepal.Length, iris) 72 | x <- apd_hat_values(rec, data = iris) 73 | X <- as.matrix(iris %>% select(Sepal.Width, Sepal.Length)) 74 | XpX <- t(X) %*% X 75 | XtX_inv <- qr.solve(XpX) 76 | dimnames(XtX_inv) <- NULL 77 | 78 | expect_equal(class(x), c("apd_hat_values", "hardhat_model", "hardhat_scalar")) 79 | expect_equal(names(x), c("XtX_inv", "pctls", "blueprint")) 80 | expect_equal(x$XtX_inv, XtX_inv) 81 | }) 82 | 83 | test_that("`apd_hat_values` is defined for matrix objects", { 84 | X <- as.matrix(iris %>% select(-Species)) 85 | x <- apd_hat_values(X) 86 | XpX <- t(X) %*% X 87 | XtX_inv <- qr.solve(XpX) 88 | dimnames(XtX_inv) <- NULL 89 | 90 | expect_equal(class(x), c("apd_hat_values", "hardhat_model", "hardhat_scalar")) 91 | expect_equal(names(x), c("XtX_inv", "pctls", "blueprint")) 92 | expect_equal(x$XtX_inv, XtX_inv) 93 | }) 94 | 95 | test_that("`apd_hat_values` is not defined for vectors", { 96 | cls <- class(mtcars$mpg)[1] 97 | expected_message <- glue::glue("`x` is not of a recognized type. 98 | Only data.frame, matrix, recipe, and formula objects are allowed. 
99 | A {cls} was specified.") 100 | 101 | expect_condition( 102 | apd_hat_values(mtcars$mpg), 103 | expected_message 104 | ) 105 | }) 106 | 107 | test_that("`apd_hat_values` fails when matrix has more predictors than samples", { 108 | bad_data <- mtcars %>% 109 | slice(1:5) 110 | 111 | expect_snapshot(error = TRUE, 112 | apd_hat_values(bad_data) 113 | ) 114 | }) 115 | 116 | test_that("`apd_hat_values` fails when the matrix X^tX is singular", { 117 | bad_data <- matrix( 118 | rep(0, 6), 119 | nrow = 3 120 | ) 121 | colnames(bad_data) <- c("A", "B") 122 | 123 | expect_snapshot(error = TRUE, 124 | apd_hat_values(bad_data) 125 | ) 126 | }) 127 | 128 | test_that("`get_inv` behaves correctly when the input is not a matrix", { 129 | X <- c(1:5) 130 | expect_error(get_inv(X), NA) 131 | }) 132 | -------------------------------------------------------------------------------- /R/hat_values-fit.R: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # ---------------------- Model Constructor ------------------------------------ 3 | # ----------------------------------------------------------------------------- 4 | 5 | new_apd_hat_values <- function(XtX_inv, pctls, blueprint) { 6 | hardhat::new_model( 7 | XtX_inv = XtX_inv, 8 | pctls = pctls, 9 | blueprint = blueprint, 10 | class = "apd_hat_values" 11 | ) 12 | } 13 | 14 | # ----------------------------------------------------------------------------- 15 | # ---------------------- Model function implementation ------------------------ 16 | # ----------------------------------------------------------------------------- 17 | 18 | apd_hat_values_impl <- function(predictors) { 19 | X <- as.matrix(predictors) 20 | dimnames(X) <- NULL 21 | 22 | XtX_inv <- get_inv(X) 23 | 24 | P <- X %*% XtX_inv %*% t(X) 25 | hat_values <- diag(P) 26 | 27 | # Calculate percentile for all PCs and distances 28 | pctls <- 
as.data.frame(get_ref_percentile(hat_values)) %>% 29 | setNames("hat_values_pctls") %>% 30 | mutate(percentile = seq(0, 100, length = 101)) 31 | 32 | res <- 33 | list( 34 | XtX_inv = XtX_inv, 35 | pctls = pctls 36 | ) 37 | 38 | res 39 | } 40 | 41 | # ----------------------------------------------------------------------------- 42 | # ------------------------ Model function bridge ------------------------------ 43 | # ----------------------------------------------------------------------------- 44 | 45 | apd_hat_values_bridge <- function(processed, ...) { 46 | predictors <- processed$predictors 47 | 48 | if (ncol(predictors) >= nrow(predictors)) { 49 | rlang::abort("The number of columns must be less than the number of rows.") 50 | } 51 | 52 | fit <- apd_hat_values_impl(predictors) 53 | 54 | new_apd_hat_values( 55 | XtX_inv = fit$XtX_inv, 56 | pctls = fit$pctls, 57 | blueprint = processed$blueprint 58 | ) 59 | } 60 | 61 | # ----------------------------------------------------------------------------- 62 | # ----------------------- Model function interface ---------------------------- 63 | # ----------------------------------------------------------------------------- 64 | 65 | #' Fit a `apd_hat_values` 66 | #' 67 | #' `apd_hat_values()` fits a model. 68 | #' 69 | #' @param x Depending on the context: 70 | #' 71 | #' * A __data frame__ of predictors. 72 | #' * A __matrix__ of predictors. 73 | #' * A __recipe__ specifying a set of preprocessing steps 74 | #' created from [recipes::recipe()]. 75 | #' 76 | #' @param data When a __recipe__ or __formula__ is used, `data` is specified as: 77 | #' 78 | #' * A __data frame__ containing the predictors. 79 | #' 80 | #' @param formula A formula specifying the predictor terms on the right-hand 81 | #' side. No outcome should be specified. 82 | #' 83 | #' @param ... Not currently used, but required for extensibility. 84 | #' 85 | #' @return 86 | #' 87 | #' A `apd_hat_values` object. 
88 | #' 89 | #' @examples 90 | #' predictors <- mtcars[, -1] 91 | #' 92 | #' # Data frame interface 93 | #' mod <- apd_hat_values(predictors) 94 | #' 95 | #' # Formula interface 96 | #' mod2 <- apd_hat_values(mpg ~ ., mtcars) 97 | #' 98 | #' # Recipes interface 99 | #' library(recipes) 100 | #' rec <- recipe(mpg ~ ., mtcars) 101 | #' rec <- step_log(rec, disp) 102 | #' mod3 <- apd_hat_values(rec, mtcars) 103 | #' @export 104 | apd_hat_values <- function(x, ...) { 105 | UseMethod("apd_hat_values") 106 | } 107 | 108 | # Default method 109 | 110 | #' @export 111 | #' @rdname apd_hat_values 112 | apd_hat_values.default <- function(x, ...) { 113 | cls <- class(x)[1] 114 | message <- 115 | "`x` is not of a recognized type. 116 | Only data.frame, matrix, recipe, and formula objects are allowed. 117 | A {cls} was specified." 118 | message <- glue::glue(message) 119 | rlang::abort(message = message) 120 | } 121 | 122 | # Data frame method 123 | 124 | #' @export 125 | #' @rdname apd_hat_values 126 | apd_hat_values.data.frame <- function(x, ...) { 127 | processed <- hardhat::mold(x, NA_real_) 128 | apd_hat_values_bridge(processed, ...) 129 | } 130 | 131 | # Matrix method 132 | 133 | #' @export 134 | #' @rdname apd_hat_values 135 | apd_hat_values.matrix <- function(x, ...) { 136 | processed <- hardhat::mold(x, NA_real_) 137 | apd_hat_values_bridge(processed, ...) 138 | } 139 | 140 | # Formula method 141 | 142 | #' @export 143 | #' @rdname apd_hat_values 144 | apd_hat_values.formula <- function(formula, data, ...) { 145 | processed <- hardhat::mold(formula, data) 146 | apd_hat_values_bridge(processed, ...) 147 | } 148 | 149 | # Recipe method 150 | 151 | #' @export 152 | #' @rdname apd_hat_values 153 | apd_hat_values.recipe <- function(x, data, ...) { 154 | processed <- hardhat::mold(x, data) 155 | apd_hat_values_bridge(processed, ...) 
156 | } 157 | -------------------------------------------------------------------------------- /.github/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, caste, color, religion, or sexual 10 | identity and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the overall 26 | community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or advances of 31 | any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email address, 35 | without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for 
clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at codeofconduct@posit.co. 63 | All complaints will be reviewed and investigated promptly and fairly. 64 | 65 | All community leaders are obligated to respect the privacy and security of the 66 | reporter of any incident. 67 | 68 | ## Enforcement Guidelines 69 | 70 | Community leaders will follow these Community Impact Guidelines in determining 71 | the consequences for any action they deem in violation of this Code of Conduct: 72 | 73 | ### 1. Correction 74 | 75 | **Community Impact**: Use of inappropriate language or other behavior deemed 76 | unprofessional or unwelcome in the community. 77 | 78 | **Consequence**: A private, written warning from community leaders, providing 79 | clarity around the nature of the violation and an explanation of why the 80 | behavior was inappropriate. A public apology may be requested. 81 | 82 | ### 2. 
Warning 83 | 84 | **Community Impact**: A violation through a single incident or series of 85 | actions. 86 | 87 | **Consequence**: A warning with consequences for continued behavior. No 88 | interaction with the people involved, including unsolicited interaction with 89 | those enforcing the Code of Conduct, for a specified period of time. This 90 | includes avoiding interactions in community spaces as well as external channels 91 | like social media. Violating these terms may lead to a temporary or permanent 92 | ban. 93 | 94 | ### 3. Temporary Ban 95 | 96 | **Community Impact**: A serious violation of community standards, including 97 | sustained inappropriate behavior. 98 | 99 | **Consequence**: A temporary ban from any sort of interaction or public 100 | communication with the community for a specified period of time. No public or 101 | private interaction with the people involved, including unsolicited interaction 102 | with those enforcing the Code of Conduct, is allowed during this period. 103 | Violating these terms may lead to a permanent ban. 104 | 105 | ### 4. Permanent Ban 106 | 107 | **Community Impact**: Demonstrating a pattern of violation of community 108 | standards, including sustained inappropriate behavior, harassment of an 109 | individual, or aggression toward or disparagement of classes of individuals. 110 | 111 | **Consequence**: A permanent ban from any sort of public interaction within the 112 | community. 113 | 114 | ## Attribution 115 | 116 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 117 | version 2.1, available at 118 | <https://www.contributor-covenant.org/version/2/1/code_of_conduct.html>. 119 | 120 | Community Impact Guidelines were inspired by 121 | [Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/inclusion). 122 | 123 | For answers to common questions about this code of conduct, see the FAQ at 124 | <https://www.contributor-covenant.org/faq>. Translations are available at <https://www.contributor-covenant.org/translations>.
125 | 126 | [homepage]: https://www.contributor-covenant.org 127 | -------------------------------------------------------------------------------- /R/pca-fit.R: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # ---------------------- Model Constructor ------------------------------------ 3 | # ----------------------------------------------------------------------------- 4 | 5 | new_apd_pca <- function(pcs, pca_means, pctls, threshold, num_comp, blueprint) { 6 | hardhat::new_model( 7 | pcs = pcs, 8 | pca_means = pca_means, 9 | pctls = pctls, 10 | threshold = threshold, 11 | num_comp = num_comp, 12 | blueprint = blueprint, 13 | class = "apd_pca" 14 | ) 15 | } 16 | 17 | # ----------------------------------------------------------------------------- 18 | # ---------------------- Model function implementation ------------------------ 19 | # ----------------------------------------------------------------------------- 20 | 21 | apd_pca_impl <- function(predictors, threshold) { 22 | pcs <- stats::prcomp( 23 | predictors, 24 | center = TRUE, 25 | scale. 
= TRUE, 26 | retx = TRUE 27 | ) 28 | 29 | # TODO: verify threshold \in (0, 1] 30 | eigs <- pcs$sdev^2 31 | cum_sum <- cumsum(eigs) / sum(eigs) 32 | num_comp <- sum(cum_sum <= threshold) + 1 33 | 34 | # Update `pcs` count to `num_comp` 35 | pcs$x <- pcs$x[, 1:num_comp, drop = FALSE] 36 | 37 | # Find the mean of each principal component 38 | pca_means <- colMeans(pcs$x) 39 | 40 | # Compute distances between each principal component and its mean 41 | distance <- find_distance_to_pca_means(pcs$x, pca_means) 42 | pctls <- as_tibble(pcs$x) %>% 43 | setNames(names0(ncol(pcs$x), "PC")) %>% 44 | mutate_all(abs) %>% 45 | mutate(distance = distance) 46 | 47 | # Calculate percentile for all PCs and distances 48 | pctls <- map_dfc(pctls, get_ref_percentile) %>% 49 | mutate(percentile = seq(0, 100, length = 101)) 50 | 51 | pcs$x <- NULL 52 | 53 | res <- list( 54 | pcs = pcs, 55 | pctls = pctls, 56 | pca_means = pca_means, 57 | threshold = threshold, 58 | num_comp = num_comp 59 | ) 60 | res 61 | } 62 | 63 | # ----------------------------------------------------------------------------- 64 | # ------------------------ Model function bridge ------------------------------ 65 | # ----------------------------------------------------------------------------- 66 | 67 | apd_pca_bridge <- function(processed, threshold, ...) { 68 | predictors <- processed$predictors 69 | 70 | fit <- apd_pca_impl(predictors, threshold) 71 | 72 | new_apd_pca( 73 | pcs = fit$pcs, 74 | pca_means = fit$pca_means, 75 | pctls = fit$pctls, 76 | threshold = fit$threshold, 77 | num_comp = fit$num_comp, 78 | blueprint = processed$blueprint 79 | ) 80 | } 81 | 82 | # ----------------------------------------------------------------------------- 83 | # ----------------------- Model function interface ---------------------------- 84 | # ----------------------------------------------------------------------------- 85 | 86 | #' Fit a `apd_pca` 87 | #' 88 | #' `apd_pca()` fits a model. 
89 | #' 90 | #' @param x Depending on the context: 91 | #' 92 | #' * A __data frame__ of predictors. 93 | #' * A __matrix__ of predictors. 94 | #' * A __recipe__ specifying a set of preprocessing steps 95 | #' created from [recipes::recipe()]. 96 | #' 97 | #' @param data When a __recipe__ or __formula__ is used, `data` is specified as: 98 | #' 99 | #' * A __data frame__ containing the predictors. 100 | #' 101 | #' @param formula A formula specifying the predictor terms on the right-hand 102 | #' side. No outcome should be specified. 103 | #' 104 | #' @param threshold A number indicating the percentage of variance desired from 105 | #' the principal components. It must be a number greater than 0 and less or 106 | #' equal than 1. 107 | #' 108 | #' @param ... Not currently used, but required for extensibility. 109 | #' 110 | #' @details The function computes the principal components that account for 111 | #' up to either 95% or the provided `threshold` of variability. It also 112 | #' computes the percentiles of the absolute value of the principal components. 113 | #' Additionally, it calculates the mean of each principal component. 114 | #' 115 | #' @return 116 | #' 117 | #' A `apd_pca` object. 118 | #' 119 | #' @examples 120 | #' predictors <- mtcars[, -1] 121 | #' 122 | #' # Data frame interface 123 | #' mod <- apd_pca(predictors) 124 | #' 125 | #' # Formula interface 126 | #' mod2 <- apd_pca(mpg ~ ., mtcars) 127 | #' 128 | #' # Recipes interface 129 | #' library(recipes) 130 | #' rec <- recipe(mpg ~ ., mtcars) 131 | #' rec <- step_log(rec, disp) 132 | #' mod3 <- apd_pca(rec, mtcars) 133 | #' @export 134 | apd_pca <- function(x, ...) { 135 | UseMethod("apd_pca") 136 | } 137 | 138 | # Default method 139 | 140 | #' @export 141 | #' @rdname apd_pca 142 | apd_pca.default <- function(x, ...) { 143 | cls <- class(x)[1] 144 | message <- 145 | "`x` is not of a recognized type. 146 | Only data.frame, matrix, recipe, and formula objects are allowed. 
147 | A {cls} was specified." 148 | message <- glue::glue(message) 149 | rlang::abort(message = message) 150 | } 151 | 152 | # Data frame method 153 | 154 | #' @export 155 | #' @rdname apd_pca 156 | apd_pca.data.frame <- function(x, threshold = 0.95, ...) { 157 | processed <- hardhat::mold(x, NA_real_) 158 | apd_pca_bridge(processed, threshold, ...) 159 | } 160 | 161 | # Matrix method 162 | 163 | #' @export 164 | #' @rdname apd_pca 165 | apd_pca.matrix <- function(x, threshold = 0.95, ...) { 166 | processed <- hardhat::mold(x, NA_real_) 167 | apd_pca_bridge(processed, threshold, ...) 168 | } 169 | 170 | # Formula method 171 | 172 | #' @export 173 | #' @rdname apd_pca 174 | apd_pca.formula <- function(formula, data, threshold = 0.95, ...) { 175 | processed <- hardhat::mold(formula, data) 176 | apd_pca_bridge(processed, threshold, ...) 177 | } 178 | 179 | # Recipe method 180 | 181 | #' @export 182 | #' @rdname apd_pca 183 | apd_pca.recipe <- function(x, data, threshold = 0.95, ...) { 184 | processed <- hardhat::mold(x, data) 185 | apd_pca_bridge(processed, threshold, ...) 
186 | } 187 | -------------------------------------------------------------------------------- /tests/testthat/test-similarity.R: -------------------------------------------------------------------------------- 1 | library(proxyC) 2 | library(Matrix) 3 | library(recipes) 4 | library(ggplot2) 5 | 6 | # ------------------------------------------------------------------------------ 7 | 8 | # simulate a small data set 9 | make_data <- function(p, n, rate = .5) { 10 | x <- matrix(sample(0:1, size = p * n, prob = rep(rate, 2), replace = TRUE), ncol = p) 11 | colnames(x) <- paste0("x", 1:p) 12 | x 13 | } 14 | 15 | set.seed(535) 16 | tr_x <- make_data(20, 50) 17 | un_x <- make_data(20, 10) 18 | 19 | tr_x_sp <- Matrix(tr_x, sparse = TRUE) 20 | un_x_sp <- Matrix(un_x, sparse = TRUE) 21 | 22 | tr_scores <- simil(tr_x_sp, tr_x_sp, method = "jaccard") 23 | un_scores <- simil(tr_x_sp, un_x_sp, method = "jaccard") 24 | 25 | mean_tr <- apply(tr_scores, 1, mean) 26 | mean_tab <- as.data.frame(table(mean_tr), stringsAsFactors = FALSE) 27 | mean_tab$mean_tr <- as.numeric(mean_tab$mean_tr) 28 | mean_tab$cumulative <- cumsum(mean_tab$Freq) / 50 29 | 30 | # ------------------------------------------------------------------------------ 31 | 32 | test_that("matrix method - mean similarity", { 33 | tmp <- apd_similarity(tr_x) 34 | tmp_scores <- score(tmp, un_x) 35 | expect_equal(tmp_scores$similarity, apply(un_scores, 2, mean)) 36 | expect_equal(tmp$options, list(method = "jaccard")) 37 | expect_equal(tmp$ref_data, tr_x_sp) 38 | expect_equal(mean_tab$mean_tr, tmp$ref_scores$sim) 39 | expect_equal(mean_tab$Freq, tmp$ref_scores$n) 40 | expect_equal(mean_tab$cumulative, tmp$ref_scores$cumulative) 41 | }) 42 | 43 | test_that("data frame method - quantile similarity", { 44 | tmp <- apd_similarity(tr_x, quantile = .1) 45 | tmp_scores <- score(tmp, un_x) 46 | expect_equal(tmp_scores$similarity, apply(un_scores, 2, quantile, probs = .1)) 47 | }) 48 | 49 | test_that("formula method - mean 
similarity", { 50 | sim_form <- as.formula(" ~.") 51 | tmp <- apd_similarity(sim_form, tr_x) 52 | tmp_scores <- score(tmp, un_x) 53 | expect_equal(tmp_scores$similarity, apply(un_scores, 2, mean)) 54 | expect_equal(tmp$options, list(method = "jaccard")) 55 | expect_equal(tmp$ref_data, tr_x_sp) 56 | expect_equal(mean_tab$mean_tr, tmp$ref_scores$sim) 57 | expect_equal(mean_tab$Freq, tmp$ref_scores$n) 58 | expect_equal(mean_tab$cumulative, tmp$ref_scores$cumulative) 59 | }) 60 | 61 | # ------------------------------------------------------------------------------ 62 | 63 | test_that("data frame method - mean similarity", { 64 | tmp <- apd_similarity(as.data.frame(tr_x)) 65 | tmp_scores <- score(tmp, as.data.frame(un_x)) 66 | expect_equal(tmp_scores$similarity, apply(un_scores, 2, mean)) 67 | expect_equal(tmp$options, list(method = "jaccard")) 68 | expect_equal(tmp$ref_data, tr_x_sp) 69 | expect_equal(mean_tab$mean_tr, tmp$ref_scores$sim) 70 | expect_equal(mean_tab$Freq, tmp$ref_scores$n) 71 | expect_equal(mean_tab$cumulative, tmp$ref_scores$cumulative) 72 | }) 73 | 74 | 75 | test_that("matrix method - quantile similarity", { 76 | tmp <- apd_similarity(as.data.frame(tr_x), quantile = .1) 77 | tmp_scores <- score(tmp, as.data.frame(un_x)) 78 | expect_equal(tmp_scores$similarity, apply(un_scores, 2, quantile, probs = .1)) 79 | }) 80 | 81 | test_that("formula method - quantile similarity", { 82 | sim_form <- as.formula(" ~.") 83 | tmp <- apd_similarity(sim_form, tr_x, quantile = .1) 84 | tmp_scores <- score(tmp, as.data.frame(un_x)) 85 | expect_equal(tmp_scores$similarity, apply(un_scores, 2, quantile, probs = .1)) 86 | }) 87 | 88 | # ------------------------------------------------------------------------------ 89 | 90 | test_that("recipe method - mean similarity", { 91 | rec <- 92 | recipe(~., data = as.data.frame(tr_x)) %>% 93 | step_zv(all_predictors()) 94 | tmp <- apd_similarity(rec, as.data.frame(tr_x)) 95 | tmp_scores <- score(tmp, as.data.frame(un_x)) 96 | 
expect_equal(tmp_scores$similarity, apply(un_scores, 2, mean)) 97 | expect_equal(tmp$options, list(method = "jaccard")) 98 | expect_equal(tmp$ref_data, tr_x_sp) 99 | expect_equal(mean_tab$mean_tr, tmp$ref_scores$sim) 100 | expect_equal(mean_tab$Freq, tmp$ref_scores$n) 101 | expect_equal(mean_tab$cumulative, tmp$ref_scores$cumulative) 102 | }) 103 | 104 | 105 | test_that("matrix method - quantile similarity", { 106 | rec <- 107 | recipe(~., data = as.data.frame(tr_x)) %>% 108 | step_zv(all_predictors()) 109 | tmp <- apd_similarity(rec, as.data.frame(tr_x), quantile = .1) 110 | tmp_scores <- score(tmp, as.data.frame(un_x)) 111 | expect_equal(tmp_scores$similarity, apply(un_scores, 2, quantile, probs = .1)) 112 | }) 113 | 114 | # ------------------------------------------------------------------------------ 115 | 116 | test_that("bad args", { 117 | expect_snapshot(error = TRUE, 118 | apd_similarity(tr_x, quantile = 2) 119 | ) 120 | expect_snapshot(error = TRUE, 121 | apd_similarity(tr_x_sp) 122 | ) 123 | }) 124 | 125 | # ------------------------------------------------------------------------------ 126 | 127 | test_that("printed output", { 128 | expect_snapshot(print(apd_similarity(tr_x))) 129 | expect_snapshot(print(apd_similarity(tr_x))) 130 | expect_snapshot(print(apd_similarity(tr_x))) 131 | expect_snapshot(print(apd_similarity(tr_x, quantile = .13))) 132 | }) 133 | # ------------------------------------------------------------------------------ 134 | 135 | test_that("plot output", { 136 | ad <- apd_similarity(tr_x) 137 | ad_plot <- autoplot(ad) 138 | expect_equal(ad_plot$data, ad$ref_scores) 139 | expect_equal(ad_plot$labels$x, "mean similarity (training set)") 140 | expect_equal(ad_plot$labels$y, "Cumulative Probability") 141 | }) 142 | 143 | # ------------------------------------------------------------------------------ 144 | 145 | test_that("apd_similarity fails when quantile is neither NA nor a number in [0, 1]", { 146 | 147 | expect_snapshot(error = TRUE, 
148 | apd_similarity(tr_x, quantile = -1) 149 | ) 150 | 151 | expect_snapshot(error = TRUE, 152 | apd_similarity(tr_x, quantile = 3) 153 | ) 154 | 155 | expect_snapshot(error = TRUE, 156 | apd_similarity(tr_x, quantile = "la") 157 | ) 158 | }) 159 | 160 | # ------------------------------------------------------------------------------ 161 | 162 | test_that("apd_similarity outputs warning with zero variance variables ", { 163 | bad_data <- list( 164 | "a" = c(0, 0), 165 | "b" = c(0, 0), 166 | "c" = c(1, 1), 167 | "d" = c(0, 0) 168 | ) 169 | bad_data <- as.data.frame(bad_data) 170 | 171 | expect_snapshot( 172 | apd_similarity(bad_data) 173 | ) 174 | }) 175 | 176 | # ------------------------------------------------------------------------------ 177 | 178 | test_that("apd_similarity fails when all the variables have zero variance", { 179 | bad_data <- list( 180 | "a" = c(0, 0), 181 | "b" = c(0, 0), 182 | "d" = c(0, 0) 183 | ) 184 | bad_data <- as.data.frame(bad_data) 185 | 186 | expect_snapshot(error = TRUE, 187 | apd_similarity(bad_data) 188 | ) 189 | }) 190 | 191 | # ------------------------------------------------------------------------------ 192 | 193 | test_that("apd_similarity fails data is not binary", { 194 | bad_data <- list( 195 | "a" = c(0, 0), 196 | "b" = c(1, 3), 197 | "c" = c(1, 1), 198 | "d" = c(2, 0) 199 | ) 200 | bad_data <- as.data.frame(bad_data) 201 | 202 | expect_snapshot(error = TRUE, 203 | apd_similarity(bad_data) 204 | ) 205 | }) 206 | -------------------------------------------------------------------------------- /vignettes/continuous-data.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Applicability domain methods for continuous data" 3 | output: rmarkdown::html_vignette 4 | vignette: > 5 | %\VignetteIndexEntry{continuous-data} 6 | %\VignetteEngine{knitr::rmarkdown} 7 | %\VignetteEncoding{UTF-8} 8 | --- 9 | 10 | ```{r, include = FALSE} 11 | knitr::opts_chunk$set( 12 | collapse = TRUE, 
13 | comment = "#>" 14 | ) 15 | 16 | prev_options <- options(width = 100) 17 | 18 | library(ggplot2) 19 | theme_set(theme_bw()) 20 | ``` 21 | 22 | ## Introduction 23 | 24 | ```{r} 25 | library(applicable) 26 | ``` 27 | 28 | `applicable` provides the following methods to analyze the applicability domain of your model: 29 | 30 | * Principal component analysis 31 | * Hat values statistics 32 | 33 | ## Example 34 | 35 | We will use the Ames IA housing data for our example. 36 | 37 | ```{r ames_data, message=FALSE} 38 | library(modeldata) 39 | data(ames) 40 | ``` 41 | 42 | There are `r format(nrow(ames), big.mark = ",")` properties in the data. 43 | 44 | The Sale Price was recorded along with `r ncol(ames)` predictors, including: 45 | 46 | * Location (e.g. neighborhood) and lot information. 47 | * House components (garage, fireplace, pool, porch, etc.). 48 | * General assessments such as overall quality and condition. 49 | * Number of bedrooms, baths, and so on. 50 | 51 | More details can be found in [De Cock (2011, Journal of Statistics Education)](http://jse.amstat.org/v19n3/decock.pdf). 52 | 53 | The raw data are at [`http://jse.amstat.org/v19n3/decock/AmesHousing.txt`](http://jse.amstat.org/v19n3/decock/AmesHousing.txt) but we 54 | will use a processed version found in the 55 | [`AmesHousing`](https://github.com/topepo/AmesHousing) package. `applicable` 56 | also contains an update for these data for three new properties (although fewer 57 | fields were collected on these). 58 | 59 | To pre-process the training set, we will use the _recipes_ package. We first 60 | tell the recipes that there is an additional value for the neighborhood in these 61 | data, then direct it to create dummy variables for all categorical predictors. In 62 | cases where there are no levels observed for a factor, we eliminate predictors 63 | with a single unique value, then estimate a transformation that will make the 64 | predictor distributions more symmetric. 
After these, the data are centered and 65 | scaled. These same transformations will be applied to the new data points using 66 | the statistics estimated from the training set. 67 | 68 | ```{r prep_data, message=FALSE} 69 | library(recipes) 70 | library(dplyr) 71 | 72 | ames_cols <- intersect(names(ames), names(ames_new)) 73 | 74 | training_data <- 75 | ames %>% 76 | # For consistency, only analyze the data on new properties 77 | dplyr::select(one_of(ames_cols)) %>% 78 | mutate( 79 | # There is a new neighborhood in ames_new 80 | Neighborhood = as.character(Neighborhood), 81 | Neighborhood = factor(Neighborhood, levels = levels(ames_new$Neighborhood)) 82 | ) 83 | 84 | 85 | training_recipe <- 86 | recipe( ~ ., data = training_data) %>% 87 | step_dummy(all_nominal()) %>% 88 | # Remove variables that have the same value for every data point. 89 | step_zv(all_predictors()) %>% 90 | # Transform variables to be distributed as Gaussian-like as possible. 91 | step_YeoJohnson(all_numeric()) %>% 92 | # Normalize numeric data to have a mean of zero and 93 | # standard deviation of one. 94 | step_normalize(all_numeric()) 95 | ``` 96 | 97 | 98 | ### Principal Component Analysis 99 | 100 | The following functions in `applicable` are used for principal component 101 | analysis: 102 | 103 | * `apd_pca`: computes the principal components that account for up 104 | to either 95% or the provided `threshold` of variability. It also computes the 105 | percentiles of the principal components and the mean of each principal 106 | component. 107 | * `autoplot`: plots the distribution function for pcas. You can also provide an 108 | optional set of `dplyr` selectors, such as `dplyr::matches()` or 109 | `dplyr::starts_with()`, for selecting which variables should be shown in the 110 | plot. 111 | * `score`: calculates the principal components of the new data and their 112 | percentiles as compared to the training data. 
The number of principal 113 | components computed depends on the `threshold` given at fit time. It also 114 | computes the multivariate distance between each principal component and its 115 | mean. 116 | 117 | Let us apply `apd_pca` modeling function to our data: 118 | 119 | ```{r} 120 | ames_pca <- apd_pca(training_recipe, training_data) 121 | ames_pca 122 | ``` 123 | 124 | Since no `threshold` was provided, the function computed the number of 125 | principal components that accounted for at most 95% of the total variance. 126 | 127 | For illustration, setting `threshold = 0.25` or 25%, we now need only 10 principal components: 128 | 129 | ```{r} 130 | ames_pca <- apd_pca(training_recipe, training_data, threshold = 0.25) 131 | ames_pca 132 | ``` 133 | 134 | Plotting the distribution function for the PCA scores is also helpful: 135 | 136 | ```{r autoplot} 137 | #| fig-alt: "Faceted line chart. abs(value) along the x-axis, percentile along the y-axis. The facets are distance, followed by PC 1 through 12. All lines go up fairly fast." 138 | library(ggplot2) 139 | autoplot(ames_pca) 140 | ``` 141 | 142 | You can use regular expressions to plot a smaller subset of the pca statistics: 143 | 144 | ```{r, echo = FALSE, results = "hold"} 145 | autoplot(ames_pca, matches("PC0[1-5]")) 146 | autoplot(ames_pca, distance) + scale_x_log10() 147 | ``` 148 | 149 | The `score` function compares the training data to new samples. Let's go back to 150 | the case where we capture 95% of the variation in the predictors and score the 151 | new samples. 
Since we used the recipe interface, we can give the score function 152 | the original data: 153 | 154 | ```{r new_sample} 155 | ames_pca <- apd_pca(training_recipe, training_data) 156 | pca_score <- score(ames_pca, ames_new) 157 | pca_score %>% select(matches("PC00[1-3]"), contains("distance")) 158 | ``` 159 | 160 | Notice how the samples, displayed in red, are fairly dissimilar to the training 161 | set in the first component: 162 | 163 | ```{r, echo = FALSE} 164 | #| fig-alt: "Histogram chart. PC001 along the x-axis, count along the y-axis. A vertical red line is placed inside the distribution." 165 | training_scores <- score(ames_pca, training_data) 166 | ggplot(training_scores, aes(x = PC001)) + 167 | geom_histogram(col = "white", binwidth = .5) + 168 | geom_vline(xintercept = pca_score$PC001, col = "red") 169 | ``` 170 | 171 | What is driving the first component? We can look at which predictors have the 172 | largest values in the rotation matrix (i.e. the values that define the linear 173 | combinations in the PC scores). The top five are: 174 | 175 | ```{r} 176 | # `ames_pca$pcs` is the output of `prcomp()` 177 | comp_one <- ames_pca$pcs$rotation[, 1] 178 | comp_one[order(abs(comp_one), decreasing = TRUE)] %>% head(5) 179 | ``` 180 | 181 | These three houses are extreme in the most influential variable (year built) since 182 | they were new homes. The also tend to have fairly large garages: 183 | 184 | ```{r, echo = FALSE} 185 | #| fig-alt: "Histogram chart. Garage_Area along the x-axis, count along the y-axis. Two red vertical lines are places. One on the edge of the distribution, another outside the distribution." 186 | ggplot(training_data, aes(x = Garage_Area)) + 187 | geom_histogram(col = "white", binwidth = 50) + 188 | geom_vline(xintercept = ames_new$Garage_Area, col = "red") 189 | ``` 190 | 191 | This may be what is driving the first component. 
192 | 
193 | However, the overall distance values are relatively small, which indicates that,
194 | overall, these new houses are not outside the mainstream of the data.
195 | 
196 | 
197 | 
198 | ### Hat Values
199 | 
200 | The [Hat or leverage values](https://en.wikipedia.org/wiki/Leverage_(statistics)) are based on the numerics of linear regression. They measure the distance of a data point to the center of the training set distribution. For example, if the numeric training set matrix was $X_{n \times p}$, the _hat matrix_ for the training set would be computed using
201 | 
202 | $$H = X(X^\intercal X)^{-1}X^\intercal$$
203 | 
204 | The corresponding hat values for the training set would be the diagonals of $H$. These values can be computed using `stats::hatvalues(lm_model)` but only for an `lm` model object. Also, it cannot compute the values for new samples.
205 | 
206 | Suppose that we had a new, unknown sample (as a $p \times 1$ data vector $u$). The hat value for this sample would be
207 | 
208 | $$h = u^\intercal(X^\intercal X)^{-1}u$$.
209 | 
210 | The following functions in `applicable` are used to compute the hat values of your model:
211 | 
212 | * `apd_hat_values`: computes the matrix $(X^\intercal X)^{-1}$.
213 | * `score`: calculates the hat values of new samples and their percentiles.
214 | 
215 | Two caveats for using the hat values:
216 | 
217 | 1. The numerical methods are less tolerant than PCA. For example, extremely correlated predictors will degrade the ability of the hat values to be effectively used. Also, since an inverse is used, there cannot be any linear dependencies within $X$. To resolve the former issue, the recipe step `recipes::step_corr()` can be used to reduce correlation. For the latter issue, `recipes::step_lincomp()` will identify and remove linear dependencies in the data (as shown below).
218 | 
219 | 1. When using a linear or logistic model, the model adds an intercept column of ones to $X$.
For equivalent computations, you should add a vector or ones to the data or use `recipes::step_intercept()`. 220 | 221 | Let us apply `apd_hat_values` modeling function to our data (while ensuring that there are no linear dependencies): 222 | 223 | ```{r} 224 | non_singular_recipe <- 225 | training_recipe %>% 226 | step_lincomb(all_predictors()) 227 | 228 | # Recipe interface 229 | ames_hat <- apd_hat_values(non_singular_recipe, training_data) 230 | ``` 231 | 232 | 233 | ```{r reset_options} 234 | 235 | options(prev_options) 236 | 237 | ``` 238 | -------------------------------------------------------------------------------- /R/similarity.R: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | 3 | new_apd_similarity <- function(quantile, ref_data, options, ref_scores, blueprint) { 4 | hardhat::new_model( 5 | quantile = quantile, 6 | ref_data = ref_data, 7 | blueprint = blueprint, 8 | options = options, 9 | ref_scores = ref_scores, 10 | class = "apd_similarity" 11 | ) 12 | } 13 | 14 | # ----------------------------------------------------------------------------- 15 | # ---------------------- Model function implementation ------------------------ 16 | # ----------------------------------------------------------------------------- 17 | 18 | apd_similarity_impl <- function(predictors, quantile, options) { 19 | if (!any("method" == names(options))) { 20 | options$method <- "jaccard" 21 | } 22 | 23 | res <- list(quantile = quantile, ref_data = predictors, options = options) 24 | 25 | p <- nrow(predictors) 26 | keep_n <- min(p, 5000) 27 | sampling <- sample.int(p, keep_n) 28 | ref_scores <- 29 | tibble::tibble( 30 | sim = score_apd_similarity_numeric(res, predictors[sampling, , drop = FALSE], options) 31 | ) %>% 32 | dplyr::group_by(sim) %>% 33 | dplyr::count() %>% 34 | dplyr::ungroup() %>% 35 | dplyr::mutate(cumulative = cumsum(n) / sum(n)) 36 | 
res$ref_scores <- ref_scores 37 | res 38 | } 39 | 40 | # ----------------------------------------------------------------------------- 41 | # ---------------------- Model function bridge -------------------------------- 42 | # ----------------------------------------------------------------------------- 43 | 44 | apd_similarity_bridge <- function(processed, quantile = NA_real_, ...) { 45 | opts <- list(...) 46 | 47 | msg <- "The `quantile` argument should be NA or a single numeric value in [0, 1]." 48 | if (!is.na(quantile) && (!is.numeric(quantile) || length(quantile) != 1)) { 49 | rlang::abort(msg) 50 | } 51 | if (!is.na(quantile) && (quantile < 0 | quantile > 1)) { 52 | rlang::abort(msg) 53 | } 54 | 55 | predictors <- processed$predictors 56 | if (!is.matrix(predictors)) { 57 | predictors <- as.matrix(predictors) 58 | } 59 | 60 | not_bin <- apply(predictors, 2, function(x) any(x != 1 & x != 0)) 61 | if (any(not_bin)) { 62 | bad_x <- colnames(predictors)[not_bin] 63 | bad_x <- glue::glue_collapse(bad_x, sep = ", ", last = ", and ") 64 | rlang::abort( 65 | glue( 66 | "The following variables are not binary: {bad_x}" 67 | ) 68 | ) 69 | } 70 | 71 | if (!inherits(predictors, "dgCMatrix")) { 72 | predictors <- Matrix::Matrix(predictors, sparse = TRUE) 73 | } 74 | 75 | # check for binary and not zero-vars 76 | 77 | zv <- Matrix::colSums(predictors) 78 | if (all(zv == 0)) { 79 | rlang::abort("All variables have a single unique value.") 80 | } else { 81 | if (any(zv == 0)) { 82 | bad_x <- colnames(predictors)[zv == 0] 83 | bad_x <- glue::glue_collapse(bad_x, sep = ", ", last = ", and ") 84 | rlang::warn( 85 | glue( 86 | "The following variables had zero variance and were removed: {bad_x}" 87 | ) 88 | ) 89 | predictors <- predictors[, zv > 0, drop = FALSE] 90 | } 91 | } 92 | 93 | fit <- apd_similarity_impl(predictors, quantile = quantile, options = opts) 94 | 95 | new_apd_similarity( 96 | quantile = quantile, 97 | ref_data = fit$ref_data, 98 | options = fit$options, 99 | 
ref_scores = fit$ref_scores, 100 | blueprint = processed$blueprint 101 | ) 102 | } 103 | 104 | # ----------------------------------------------------------------------------- 105 | # ---------------------- Model function interface ----------------------------- 106 | # ----------------------------------------------------------------------------- 107 | 108 | #' Applicability domain methods using binary similarity analysis 109 | #' 110 | #' `apd_similarity()` is used to analyze samples in terms of similarity scores 111 | #' for binary data. All features in the data should be binary (i.e. zero or 112 | #' one). 113 | #' 114 | #' @param x Depending on the context: 115 | #' 116 | #' * A __data frame__ of binary predictors. 117 | #' * A __matrix__ of binary predictors. 118 | #' * A __recipe__ specifying a set of preprocessing steps 119 | #' created from [recipes::recipe()]. 120 | #' 121 | #' @param data When a __recipe__ or __formula__ is used, `data` is specified as: 122 | #' 123 | #' * A __data frame__ containing the binary predictors. Any predictors with 124 | #' no 1's will be removed (with a warning). 125 | #' 126 | #' @param formula A formula specifying the predictor terms on the right-hand 127 | #' side. No outcome should be specified. 128 | #' 129 | #' @param quantile A real number between 0 and 1 or NA for how the similarity 130 | #' values for each sample versus the training set should be summarized. A value 131 | #' of `NA` specifies that the mean similarity is computed. Otherwise, the 132 | #' appropriate quantile is computed. 133 | #' 134 | #' @param ... Options to pass to `proxyC::simil()`, such as `method`. If no 135 | #' options are specified, `method = "jaccard"` is used. 136 | #' 137 | #' @details The function computes measures of similarity for different samples 138 | #' points. For example, suppose samples `A` and `B` both contain _p_ binary 139 | #' variables. First, a 2x2 table is constructed between `A` and `B` _across 140 | #' their elements_. 
The table will contain _p_ entries across the four cells
141 | #' (see the example below). From this, different measures of likeness are
142 | #' computed.
143 | #'
144 | #' For a training set of _n_ samples, a new sample is compared to each,
145 | #' resulting in _n_ similarity scores. These can be summarized into a single
146 | #' value; the mean similarity is used by default by the scoring function.
147 | #'
148 | #' For this method, the computational methods are fairly taxing for large data
149 | #' sets. The training set must be stored (albeit in a sparse matrix format) so
150 | #' object sizes may become large.
151 | #'
152 | #' By default, the computations are run in parallel using _all possible
153 | #' cores_. To change this, call the `setThreadOptions` function in the
154 | #' `RcppParallel` package.
155 | #'
156 | #' @return
157 | #'
158 | #' An `apd_similarity` object.
159 | #'
160 | #' @references Leach, A. and Gillet V. (2007). _An Introduction to
161 | #' Chemoinformatics_.
Springer, New York 162 | #' @examples 163 | #' \donttest{ 164 | #' data(qsar_binary) 165 | #' 166 | #' jacc_sim <- apd_similarity(binary_tr) 167 | #' jacc_sim 168 | #' 169 | #' # plot the empirical cumulative distribution function (ECDF) for the training set: 170 | #' library(ggplot2) 171 | #' autoplot(jacc_sim) 172 | #' 173 | #' # Example calculations for two samples: 174 | #' A <- as.matrix(binary_tr[1, ]) 175 | #' B <- as.matrix(binary_tr[2, ]) 176 | #' xtab <- table(A, B) 177 | #' xtab 178 | #' 179 | #' # Jaccard statistic 180 | #' xtab[2, 2] / (xtab[1, 2] + xtab[2, 1] + xtab[2, 2]) 181 | #' 182 | #' # Hamman statistic 183 | #' ((xtab[1, 1] + xtab[2, 2]) - (xtab[1, 2] + xtab[2, 1])) / sum(xtab) 184 | #' 185 | #' # Faith statistic 186 | #' (xtab[1, 1] + xtab[2, 2] / 2) / sum(xtab) 187 | #' 188 | #' # Summarize across all training set similarities 189 | #' mean_sim <- score(jacc_sim, new_data = binary_unk) 190 | #' mean_sim 191 | #' } 192 | #' @export 193 | apd_similarity <- function(x, ...) { 194 | UseMethod("apd_similarity") 195 | } 196 | 197 | # Default method 198 | 199 | #' @export 200 | #' @rdname apd_similarity 201 | apd_similarity.default <- function(x, quantile = NA_real_, ...) { 202 | cls <- class(x)[1] 203 | message <- 204 | "`x` is not of a recognized type. 205 | Only data.frame, matrix, recipe, and formula objects are allowed. 206 | A {cls} was specified." 207 | message <- glue::glue(message) 208 | rlang::abort(message = message) 209 | } 210 | 211 | # Data frame method 212 | 213 | #' @export 214 | #' @rdname apd_similarity 215 | apd_similarity.data.frame <- function(x, quantile = NA_real_, ...) { 216 | processed <- hardhat::mold(x, NA_real_) 217 | apd_similarity_bridge(processed, quantile = quantile, ...) 218 | } 219 | 220 | # Matrix method 221 | 222 | #' @export 223 | #' @rdname apd_similarity 224 | apd_similarity.matrix <- function(x, quantile = NA_real_, ...) 
{ 225 | processed <- hardhat::mold(x, NA_real_) 226 | apd_similarity_bridge(processed, quantile = quantile, ...) 227 | } 228 | 229 | # Formula method 230 | 231 | #' @export 232 | #' @rdname apd_similarity 233 | apd_similarity.formula <- function(formula, data, quantile = NA_real_, ...) { 234 | processed <- hardhat::mold(formula, data) 235 | apd_similarity_bridge(processed, quantile = quantile, ...) 236 | } 237 | 238 | # Recipe method 239 | 240 | #' @export 241 | #' @rdname apd_similarity 242 | apd_similarity.recipe <- function(x, data, quantile = NA_real_, ...) { 243 | processed <- hardhat::mold(x, data) 244 | apd_similarity_bridge(processed, quantile = quantile, ...) 245 | } 246 | 247 | # ----------------------------------------------------------------------------- 248 | # ---------------------- Scoring function implementation ---------------------- 249 | # ----------------------------------------------------------------------------- 250 | 251 | score_apd_similarity_numeric <- function(model, predictors, options) { 252 | predictors <- 253 | predictors[, colnames(predictors) %in% colnames(model$ref_data), drop = FALSE] 254 | 255 | if (!is.matrix(predictors)) { 256 | predictors <- as.matrix(predictors) 257 | } 258 | 259 | if (!inherits(predictors, "dgCMatrix")) { 260 | predictors <- Matrix::Matrix(predictors, sparse = TRUE) 261 | } 262 | 263 | cl <- 264 | rlang::call2( 265 | "simil", 266 | .ns = "proxyC", 267 | x = rlang::expr(model$ref_data), 268 | y = rlang::expr(predictors), 269 | !!!model$options 270 | ) 271 | sims <- rlang::eval_tidy(cl) 272 | if (is.na(model$quantile)) { 273 | res <- apply(sims, 2, mean, na.rm = TRUE) 274 | } else { 275 | res <- apply(sims, 2, quantile, probs = model$quantile, na.rm = TRUE) 276 | } 277 | res 278 | } 279 | 280 | # ----------------------------------------------------------------------------- 281 | # ---------------------- Scoring function bridge ------------------------------ 282 | # 
----------------------------------------------------------------------------- 283 | 284 | score_apd_similarity_bridge <- function(type, model, predictors) { 285 | score_function <- get_sim_score_function(type) 286 | 287 | predictions <- score_function(model, predictors, options) 288 | predictions <- tibble::tibble(similarity = predictions) 289 | 290 | hardhat::validate_prediction_size(predictions, predictors) 291 | 292 | predictions 293 | } 294 | 295 | get_sim_score_function <- function(type) { 296 | switch(type, 297 | numeric = score_apd_similarity_numeric 298 | ) 299 | } 300 | 301 | # ----------------------------------------------------------------------------- 302 | # ---------------------- Scoring function interface --------------------------- 303 | # ----------------------------------------------------------------------------- 304 | 305 | #' Score new samples using similarity methods 306 | #' 307 | #' @param object A `apd_similarity` object. 308 | #' 309 | #' @param new_data A data frame or matrix of new predictors. 310 | #' 311 | #' @param type A single character. The type of predictions to generate. 312 | #' Valid options are: 313 | #' 314 | #' - `"numeric"` for a numeric value that summarizes the similarity values for 315 | #' each sample across the training set. 316 | #' 317 | #' @param add_percentile A single logical; should the percentile of the 318 | #' similarity score _relative to the training set values_ by computed? 319 | #' 320 | #' @param ... Not used, but required for extensibility. 321 | #' 322 | #' @return 323 | #' 324 | #' A tibble of predictions. The number of rows in the tibble is guaranteed 325 | #' to be the same as the number of rows in `new_data`. For `type = "numeric"`, 326 | #' the tibble contains a column called "similarity". If `add_percentile = TRUE`, 327 | #' an additional column called `similarity_pctl` will be added. 
These values are 328 | #' in percent units so that a value of 11.5 indicates that, in the training set, 329 | #' 11.5 percent of the training set samples had smaller values than the sample 330 | #' being scored. 331 | #' 332 | #' @examples 333 | #' \donttest{ 334 | #' data(qsar_binary) 335 | #' 336 | #' jacc_sim <- apd_similarity(binary_tr) 337 | #' 338 | #' mean_sim <- score(jacc_sim, new_data = binary_unk) 339 | #' mean_sim 340 | #' } 341 | #' @export 342 | score.apd_similarity <- function(object, new_data, type = "numeric", add_percentile = TRUE, ...) { 343 | forged <- hardhat::forge(new_data, object$blueprint) 344 | rlang::arg_match(type, valid_predict_types()) 345 | res <- score_apd_similarity_bridge(type, object, forged$predictors) 346 | if (add_percentile) { 347 | res$similarity_pctl <- sim_percentile(res$similarity, object$ref_scores) 348 | } 349 | res 350 | } 351 | 352 | # ----------------------------------------------------------------------------- 353 | # ---------------------- Helper functions ------------------------------------- 354 | # ----------------------------------------------------------------------------- 355 | 356 | sim_percentile <- function(sims, ref) { 357 | res <- stats::approx(ref$sim, ref$cumulative, xout = sims)$y 358 | res[sims < min(ref$sim, na.rm = TRUE)] <- 0 359 | res[sims > max(ref$sim, na.rm = TRUE)] <- 1 360 | 361 | res * 100 362 | } 363 | --------------------------------------------------------------------------------