├── .github ├── .gitignore ├── workflows │ ├── lock.yaml │ ├── pkgdown.yaml │ ├── test-coverage.yaml │ ├── R-CMD-check.yaml │ └── pr-commands.yaml └── CODE_OF_CONDUCT.md ├── docs └── CNAME ├── vignettes ├── .gitignore ├── binary-data.Rmd └── continuous-data.Rmd ├── LICENSE ├── .gitignore ├── data ├── ames_new.rda ├── qsar_binary.rda ├── okc_binary.RData └── datalist ├── man ├── figures │ └── logo.png ├── print.apd_pca.Rd ├── print.apd_hat_values.Rd ├── score.Rd ├── print.apd_similarity.Rd ├── binary.Rd ├── autoplot.apd_similarity.Rd ├── autoplot.apd_pca.Rd ├── ames_new.Rd ├── okc_binary.Rd ├── applicable-package.Rd ├── score.apd_pca.Rd ├── score.apd_hat_values.Rd ├── score.apd_similarity.Rd ├── apd_hat_values.Rd ├── apd_pca.Rd └── apd_similarity.Rd ├── tests ├── testthat.R ├── testthat │ ├── test-misc.R │ ├── _snaps │ │ ├── misc.md │ │ ├── pca-fit.md │ │ ├── pca-score.md │ │ ├── print.md │ │ ├── hat_values-score.md │ │ ├── hat_values-fit.md │ │ └── similarity.md │ ├── test-print.R │ ├── test-hat_values-score.R │ ├── test-plot.R │ ├── test-pca-score.R │ ├── test-pca-fit.R │ ├── test-hat_values-fit.R │ └── test-similarity.R └── spelling.R ├── pkgdown └── favicon │ ├── favicon.ico │ ├── favicon-16x16.png │ ├── favicon-32x32.png │ ├── apple-touch-icon.png │ ├── apple-touch-icon-60x60.png │ ├── apple-touch-icon-76x76.png │ └── apple-touch-icon-120x120.png ├── NEWS.md ├── R ├── applicable-package.R ├── misc.R ├── zzz.R ├── score.R ├── 0.R ├── s3_register.R ├── helpers.R ├── print.R ├── plot.R ├── data.R ├── hat_values-score.R ├── pca-score.R ├── hat_values-fit.R ├── pca-fit.R └── similarity.R ├── inst └── WORDLIST ├── .Rbuildignore ├── codecov.yml ├── applicable.Rproj ├── _pkgdown.yml ├── cran-comments.md ├── LICENSE.md ├── DESCRIPTION ├── NAMESPACE ├── README.md └── README.Rmd /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | 
-------------------------------------------------------------------------------- /docs/CNAME: -------------------------------------------------------------------------------- 1 | applicable.tidymodels.org -------------------------------------------------------------------------------- /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2023 2 | COPYRIGHT HOLDER: applicable authors 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rhistory 2 | .Rprofile 3 | .Rproj.user 4 | .DS_Store 5 | inst/doc 6 | -------------------------------------------------------------------------------- /data/ames_new.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/applicable/HEAD/data/ames_new.rda -------------------------------------------------------------------------------- /data/qsar_binary.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/applicable/HEAD/data/qsar_binary.rda -------------------------------------------------------------------------------- /man/figures/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/applicable/HEAD/man/figures/logo.png -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(applicable) 3 | 4 | test_check("applicable") 5 | 
-------------------------------------------------------------------------------- /data/okc_binary.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/applicable/HEAD/data/okc_binary.RData -------------------------------------------------------------------------------- /pkgdown/favicon/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/applicable/HEAD/pkgdown/favicon/favicon.ico -------------------------------------------------------------------------------- /data/datalist: -------------------------------------------------------------------------------- 1 | qsar_binary: binary_tr binary_unk 2 | okc_binary: okc_binary_train okc_binary_test 3 | ames_new: ames_new 4 | -------------------------------------------------------------------------------- /pkgdown/favicon/favicon-16x16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/applicable/HEAD/pkgdown/favicon/favicon-16x16.png -------------------------------------------------------------------------------- /pkgdown/favicon/favicon-32x32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/applicable/HEAD/pkgdown/favicon/favicon-32x32.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/applicable/HEAD/pkgdown/favicon/apple-touch-icon.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-60x60.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tidymodels/applicable/HEAD/pkgdown/favicon/apple-touch-icon-60x60.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-76x76.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/applicable/HEAD/pkgdown/favicon/apple-touch-icon-76x76.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-120x120.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/applicable/HEAD/pkgdown/favicon/apple-touch-icon-120x120.png -------------------------------------------------------------------------------- /tests/testthat/test-misc.R: -------------------------------------------------------------------------------- 1 | test_that("`names0` fails if `num` is less than 1", { 2 | num <- 0 3 | expect_snapshot(error = TRUE, 4 | names0(num) 5 | ) 6 | }) 7 | -------------------------------------------------------------------------------- /tests/testthat/_snaps/misc.md: -------------------------------------------------------------------------------- 1 | # `names0` fails if `num` is less than 1 2 | 3 | Code 4 | names0(num) 5 | Error 6 | `num` should be > 0 7 | 8 | -------------------------------------------------------------------------------- /tests/spelling.R: -------------------------------------------------------------------------------- 1 | if (requireNamespace("spelling", quietly = TRUE)) { 2 | spelling::spell_check_test( 3 | vignettes = TRUE, error = FALSE, skip_on_cran = TRUE 4 | ) 5 | } 6 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | applicable 0.0.1.1 2 | ================== 3 | 4 | Minor patch release: fixed failing units tests due to recent 
package updates. 5 | 6 | applicable 0.0.1 7 | ================== 8 | 9 | * First CRAN version. 10 | -------------------------------------------------------------------------------- /R/applicable-package.R: -------------------------------------------------------------------------------- 1 | #' @keywords internal 2 | "_PACKAGE" 3 | 4 | # The following block is used by usethis to automatically manage 5 | # roxygen namespace tags. Modify with care! 6 | ## usethis namespace: start 7 | ## usethis namespace: end 8 | NULL 9 | -------------------------------------------------------------------------------- /R/misc.R: -------------------------------------------------------------------------------- 1 | # from recipes:::names0 2 | names0 <- function(num, prefix = "x") { 3 | if (num < 1) { 4 | rlang::abort("`num` should be > 0") 5 | } 6 | ind <- format(1:num) 7 | ind <- gsub(" ", "0", ind) 8 | paste0(prefix, ind) 9 | } 10 | -------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | # nocov start 2 | # takes after https://raw.githubusercontent.com/r-lib/vctrs/master/R/zzz.R 3 | .onLoad <- function(libname, pkgname) { 4 | s3_register("ggplot2::autoplot", "apd_similarity") 5 | s3_register("ggplot2::autoplot", "apd_pca") 6 | } 7 | 8 | # nocov end 9 | -------------------------------------------------------------------------------- /inst/WORDLIST: -------------------------------------------------------------------------------- 1 | al 2 | Ames 3 | Chemoinformatics 4 | CRC 5 | De 6 | doi 7 | et 8 | extensibility 9 | Gillet 10 | Jaccard 11 | Lifecycle 12 | Netzeva 13 | OkCupid 14 | pca 15 | pcas 16 | pre 17 | QSAR 18 | Springer 19 | tibble 20 | X'X 21 | intercal 22 | CMD 23 | reprex 24 | tidymodels 25 | -------------------------------------------------------------------------------- /.Rbuildignore: 
-------------------------------------------------------------------------------- 1 | ^applicable\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^_pkgdown\.yml$ 4 | ^docs$ 5 | ^pkgdown$ 6 | ^README.Rmd$ 7 | .github 8 | ^\.github/workflows/check-full\.yaml$ 9 | ^LICENSE\.md$ 10 | ^cran-comments\.md$ 11 | ^NEWS\.md$ 12 | ^CODE_OF_CONDUCT\.md$ 13 | ^\.github$ 14 | ^codecov\.yml$ 15 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | 3 | coverage: 4 | status: 5 | project: 6 | default: 7 | target: auto 8 | threshold: 1% 9 | informational: true 10 | patch: 11 | default: 12 | target: auto 13 | threshold: 1% 14 | informational: true 15 | -------------------------------------------------------------------------------- /tests/testthat/_snaps/pca-fit.md: -------------------------------------------------------------------------------- 1 | # pcs is provided 2 | 3 | Code 4 | new_apd_pca(blueprint = hardhat::default_xy_blueprint()) 5 | Error 6 | argument "pcs" is missing, with no default 7 | 8 | # `new_apd_pca` fails when blueprint is numeric 9 | 10 | Code 11 | new_apd_pca(pcs = 1, blueprint = 1) 12 | Error 13 | blueprint should be a blueprint, not a numeric. 
14 | 15 | -------------------------------------------------------------------------------- /applicable.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace 22 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: https://applicable.tidymodels.org 2 | 3 | template: 4 | package: tidytemplate 5 | bootstrap: 5 6 | bslib: 7 | primary: "#CA225E" 8 | 9 | includes: 10 | in_header: | 11 | 12 | 13 | # https://github.com/tidyverse/tidytemplate for css 14 | 15 | development: 16 | mode: auto 17 | 18 | 19 | figures: 20 | fig.width: 8 21 | fig.height: 5.75 22 | -------------------------------------------------------------------------------- /man/print.apd_pca.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/print.R 3 | \name{print.apd_pca} 4 | \alias{print.apd_pca} 5 | \title{Print number of predictors and principal components used.} 6 | \usage{ 7 | \method{print}{apd_pca}(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{A \code{apd_pca} object.} 11 | 12 | \item{...}{Not currently used, but required for extensibility.} 13 | } 14 | \value{ 15 | None 16 | } 17 | \description{ 18 | Print number of predictors and principal components used. 
19 | } 20 | \examples{ 21 | 22 | model <- apd_pca(~ Sepal.Length + Sepal.Width, iris) 23 | print(model) 24 | } 25 | -------------------------------------------------------------------------------- /man/print.apd_hat_values.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/print.R 3 | \name{print.apd_hat_values} 4 | \alias{print.apd_hat_values} 5 | \title{Print number of predictors and principal components used.} 6 | \usage{ 7 | \method{print}{apd_hat_values}(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{A \code{apd_hat_values} object.} 11 | 12 | \item{...}{Not currently used, but required for extensibility.} 13 | } 14 | \value{ 15 | None 16 | } 17 | \description{ 18 | Print number of predictors and principal components used. 19 | } 20 | \examples{ 21 | 22 | model <- apd_hat_values(~ Sepal.Length + Sepal.Width, iris) 23 | print(model) 24 | } 25 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## R CMD check results 2 | 3 | 0 errors | 0 warnings | 0 note 4 | 5 | ## 0.0.1.1 Submission 6 | 7 | This release fixes failing unit tests. 8 | 9 | ### Review - 2020-06-14 10 | 11 | Fix the following unit tests: 12 | 13 | > 14 | ── 1. Failure: `score_apd_pca_numeric` pcs output matches `stats::predict` output 15 | `actual_output` not equivalent to `expected`. 16 | current is not list-like 17 | > 18 | ── 2. Failure: `score` pcs output matches `stats::predict` output 19 | `actual_output` not equivalent to `expected`. 20 | current is not list-like 21 | > 22 | ── 3. Failure: `score_apd_pca_bridge` output is correct 23 | `actual_output` not equivalent to `expected`. 
24 | current is not list-like 25 | -------------------------------------------------------------------------------- /tests/testthat/_snaps/pca-score.md: -------------------------------------------------------------------------------- 1 | # `score_apd_pca_numeric` fails when model has no pcs argument 2 | 3 | Code 4 | score_apd_pca_numeric(mtcars, mtcars) 5 | Error 6 | The model must contain a pcs argument. 7 | 8 | # `score` fails when predictors only contain factors 9 | 10 | Code 11 | score(model, iris$Species) 12 | Error 13 | The class of `new_data`, 'factor', is not recognized. 14 | 15 | # `score` fails when predictors are vectors 16 | 17 | Code 18 | score(object) 19 | Error 20 | `object` is not of a recognized type. 21 | Only data.frame, matrix, recipe, and formula objects are allowed. 22 | A data.frame was specified. 23 | 24 | -------------------------------------------------------------------------------- /tests/testthat/_snaps/print.md: -------------------------------------------------------------------------------- 1 | # print wording for `apd_pca` is correct 2 | 3 | Code 4 | print(x1) 5 | Output 6 | # Predictors: 7 | 1 8 | # Principal Components: 9 | 1 component was needed 10 | to capture at least 95% of the 11 | total variation in the predictors. 12 | 13 | --- 14 | 15 | Code 16 | print(x2) 17 | Output 18 | # Predictors: 19 | 2 20 | # Principal Components: 21 | 2 components were needed 22 | to capture at least 95% of the 23 | total variation in the predictors. 
24 | 25 | # print for apd_hat_values work as expected 26 | 27 | Code 28 | print(x) 29 | Output 30 | # Predictors: 31 | 11 32 | 33 | -------------------------------------------------------------------------------- /man/score.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/score.R 3 | \name{score} 4 | \alias{score} 5 | \alias{score.default} 6 | \title{A scoring function} 7 | \usage{ 8 | score(object, ...) 9 | 10 | \method{score}{default}(object, ...) 11 | } 12 | \arguments{ 13 | \item{object}{Depending on the context: 14 | \itemize{ 15 | \item A \strong{data frame} of predictors. 16 | \item A \strong{matrix} of predictors. 17 | \item A \strong{recipe} specifying a set of preprocessing steps 18 | created from \code{\link[recipes:recipe]{recipes::recipe()}}. 19 | }} 20 | 21 | \item{...}{Not currently used, but required for extensibility.} 22 | } 23 | \value{ 24 | A tibble of predictions. 25 | } 26 | \description{ 27 | A scoring function 28 | } 29 | -------------------------------------------------------------------------------- /tests/testthat/_snaps/hat_values-score.md: -------------------------------------------------------------------------------- 1 | # `score_apd_hat_values_numeric` fails when model has no pcs argument 2 | 3 | Code 4 | score_apd_hat_values_numeric(mtcars, mtcars) 5 | Error 6 | The model must contain an XtX_inv argument. 7 | 8 | # `score` fails when predictors only contain factors 9 | 10 | Code 11 | score(model, iris$Species) 12 | Error 13 | The class of `new_data`, 'factor', is not recognized. 14 | 15 | # `score` fails when predictors are vectors 16 | 17 | Code 18 | score(object) 19 | Error 20 | `object` is not of a recognized type. 21 | Only data.frame, matrix, recipe, and formula objects are allowed. 22 | A data.frame was specified. 
23 | 24 | -------------------------------------------------------------------------------- /man/print.apd_similarity.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/print.R 3 | \name{print.apd_similarity} 4 | \alias{print.apd_similarity} 5 | \title{Print number of predictors and principal components used.} 6 | \usage{ 7 | \method{print}{apd_similarity}(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{A \code{apd_similarity} object.} 11 | 12 | \item{...}{Not currently used, but required for extensibility.} 13 | } 14 | \value{ 15 | None 16 | } 17 | \description{ 18 | Print number of predictors and principal components used. 19 | } 20 | \examples{ 21 | 22 | set.seed(535) 23 | tr_x <- matrix( 24 | sample(0:1, size = 20 * 50, prob = rep(.5, 2), replace = TRUE), 25 | ncol = 20 26 | ) 27 | model <- apd_similarity(tr_x) 28 | print(model) 29 | } 30 | -------------------------------------------------------------------------------- /man/binary.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{binary} 5 | \alias{binary} 6 | \alias{qsar_binary} 7 | \alias{binary_tr} 8 | \alias{binary_unk} 9 | \title{Binary QSAR Data} 10 | \value{ 11 | \item{binary_tr,binary_ukn}{data frame frames with 67 columns} 12 | } 13 | \description{ 14 | Binary QSAR Data 15 | } 16 | \details{ 17 | These data are from two different sources on quantitative 18 | structure-activity relationship (QSAR) modeling and contain 67 predictors 19 | that are either 0 or 1. The training set contains 4,330 samples and there 20 | are five unknown samples (both from the \code{Mutagen} data in the \code{QSARdata} 21 | package). 
22 | } 23 | \examples{ 24 | data(qsar_binary) 25 | str(binary_tr) 26 | } 27 | \keyword{datasets} 28 | -------------------------------------------------------------------------------- /tests/testthat/test-print.R: -------------------------------------------------------------------------------- 1 | test_that("print wording for `apd_pca` is correct", { 2 | x1 <- apd_pca(~Sepal.Length, iris) 3 | 4 | expect_snapshot( 5 | print(x1) 6 | ) 7 | 8 | x2 <- apd_pca(~ Sepal.Length + Sepal.Width, iris) 9 | 10 | expect_snapshot( 11 | print(x2) 12 | ) 13 | }) 14 | 15 | test_that("print for `apd_pca` displays correct threshold", { 16 | threshold <- 0.72 17 | x <- apd_pca(~Sepal.Length, iris, threshold) 18 | percentage <- capture.output(x) 19 | percentage <- regmatches( 20 | percentage, 21 | regexpr("(\\d+)%", percentage) 22 | ) 23 | 24 | expected_output <- threshold * 100 25 | expected_output <- paste0(expected_output, "%") 26 | 27 | expect_equal( 28 | percentage, 29 | expected_output 30 | ) 31 | }) 32 | 33 | test_that("print for apd_hat_values work as expected", { 34 | x <- apd_hat_values(mtcars) 35 | 36 | expect_snapshot( 37 | print(x) 38 | ) 39 | }) 40 | -------------------------------------------------------------------------------- /man/autoplot.apd_similarity.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot.R 3 | \name{autoplot.apd_similarity} 4 | \alias{autoplot.apd_similarity} 5 | \title{Plot the cumulative distribution function for similarity metrics} 6 | \usage{ 7 | \method{autoplot}{apd_similarity}(object, ...) 8 | } 9 | \arguments{ 10 | \item{object}{An object produced by \code{apd_similarity}.} 11 | 12 | \item{...}{Not currently used.} 13 | } 14 | \value{ 15 | A \code{ggplot} object that shows the cumulative probability versus the 16 | unique similarity values in the training set. 
Not that for large samples, 17 | this is an approximation based on a random sample of 5,000 training set 18 | points. 19 | } 20 | \description{ 21 | Plot the cumulative distribution function for similarity metrics 22 | } 23 | \examples{ 24 | set.seed(535) 25 | tr_x <- matrix( 26 | sample(0:1, size = 20 * 50, prob = rep(.5, 2), replace = TRUE), 27 | ncol = 20 28 | ) 29 | model <- apd_similarity(tr_x) 30 | } 31 | -------------------------------------------------------------------------------- /tests/testthat/_snaps/hat_values-fit.md: -------------------------------------------------------------------------------- 1 | # XtX_inv is provided 2 | 3 | Code 4 | new_apd_hat_values(blueprint = hardhat::default_xy_blueprint()) 5 | Error 6 | argument "XtX_inv" is missing, with no default 7 | 8 | # `new_apd_hat_values` fails when blueprint is numeric 9 | 10 | Code 11 | new_apd_hat_values(XtX_inv = 1, blueprint = 1) 12 | Error 13 | blueprint should be a blueprint, not a numeric. 14 | 15 | # `apd_hat_values` fails when matrix has more predictors than samples 16 | 17 | Code 18 | apd_hat_values(bad_data) 19 | Error 20 | The number of columns must be less than the number of rows. 21 | 22 | # `apd_hat_values` fails when the matrix X^tX is singular 23 | 24 | Code 25 | apd_hat_values(bad_data) 26 | Error 27 | Unable to compute the hat values of the matrix X of 28 | predictors because the matrix resulting from multiplying 29 | the transpose of X by X is singular. 30 | 31 | -------------------------------------------------------------------------------- /man/autoplot.apd_pca.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot.R 3 | \name{autoplot.apd_pca} 4 | \alias{autoplot.apd_pca} 5 | \title{Plot the distribution function for pcas} 6 | \usage{ 7 | \method{autoplot}{apd_pca}(object, ...) 
8 | } 9 | \arguments{ 10 | \item{object}{An object produced by \code{apd_pca}.} 11 | 12 | \item{...}{An optional set of \code{dplyr} selectors, such as \code{dplyr::matches()} or 13 | \code{dplyr::starts_with()} for selecting which variables should be shown in the 14 | plot.} 15 | } 16 | \value{ 17 | A \code{ggplot} object that shows the distribution function for each 18 | principal component. 19 | } 20 | \description{ 21 | Plot the distribution function for pcas 22 | } 23 | \examples{ 24 | library(ggplot2) 25 | library(dplyr) 26 | library(modeldata) 27 | data(biomass) 28 | 29 | biomass_ad <- apd_pca(biomass[, 3:8]) 30 | 31 | autoplot(biomass_ad) 32 | # Using selectors in `...` 33 | autoplot(biomass_ad, distance) + scale_x_log10() 34 | autoplot(biomass_ad, matches("PC[1-2]")) 35 | } 36 | -------------------------------------------------------------------------------- /man/ames_new.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{ames_new} 5 | \alias{ames_new} 6 | \title{Recent Ames Iowa Houses} 7 | \source{ 8 | De Cock, D. (2011). "Ames, Iowa: Alternative to the Boston Housing 9 | Data as an End of Semester Regression Project," \emph{Journal of Statistics 10 | Education}, Volume 19, Number 3. 11 | 12 | \url{https://www.cityofames.org/government/departments-divisions-a-h/city-assessor} 13 | 14 | \url{http://jse.amstat.org/v19n3/decock/DataDocumentation.txt} 15 | 16 | \url{http://jse.amstat.org/v19n3/decock.pdf} 17 | } 18 | \value{ 19 | \item{ames_new}{a tibble} 20 | } 21 | \description{ 22 | More data related to the set described by De Cock (2011) where data where 23 | data were recorded for 2,930 properties in Ames IA. 24 | } 25 | \details{ 26 | This data sets includes three more properties added since the original 27 | reference. 
There are less fields in this data set; only those that could be 28 | transcribed from the assessor's office were included. 29 | } 30 | \keyword{datasets} 31 | -------------------------------------------------------------------------------- /tests/testthat/test-hat_values-score.R: -------------------------------------------------------------------------------- 1 | test_that("`score_apd_hat_values_numeric` fails when model has no pcs argument", { 2 | expect_snapshot(error = TRUE, 3 | score_apd_hat_values_numeric(mtcars, mtcars) 4 | ) 5 | }) 6 | 7 | test_that("`score` fails when predictors only contain factors", { 8 | model <- apd_hat_values(~., iris) 9 | expect_snapshot(error = TRUE, 10 | score(model, iris$Species) 11 | ) 12 | }) 13 | 14 | test_that("`score` fails when predictors are vectors", { 15 | object <- iris 16 | 17 | expect_snapshot(error = TRUE, 18 | score(object) 19 | ) 20 | }) 21 | 22 | test_that("`score` calculated hat_values are correct", { 23 | model <- apd_hat_values(mtcars %>% dplyr::slice(1:15)) 24 | predictors <- as.matrix(mtcars %>% dplyr::slice(16:30)) 25 | 26 | proj_matrix <- predictors %*% model$XtX_inv %*% t(predictors) 27 | expected <- diag(proj_matrix) 28 | 29 | actual_output <- score(model, predictors) 30 | actual_output <- actual_output$hat_values 31 | 32 | # Data frame method 33 | expect_equal(ignore_attr = TRUE, 34 | actual_output, 35 | expected 36 | ) 37 | }) 38 | -------------------------------------------------------------------------------- /man/okc_binary.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{okc_binary} 5 | \alias{okc_binary} 6 | \alias{okc_binary_train} 7 | \alias{okc_binary_test} 8 | \title{OkCupid Binary Predictors} 9 | \source{ 10 | Kim (2015), "OkCupid Data for Introductory Statistics and Data Science Courses", \emph{Journal of Statistics Education}, Volume 23, 
Number 2. \url{https://www.tandfonline.com/doi/abs/10.1080/10691898.2015.11889737} 11 | 12 | Kuhn and Johnson (2020), \emph{Feature Engineering and Selection}, Chapman and Hall/CRC . \url{https://bookdown.org/max/FES/} and \url{https://github.com/topepo/FES} 13 | } 14 | \value{ 15 | \item{okc_binary_train,okc_binary_test}{data frame frames with 61 columns} 16 | } 17 | \description{ 18 | OkCupid Binary Predictors 19 | } 20 | \details{ 21 | Data originally from Kim (2015) includes a training and test set 22 | consistent with Kuhn and Johnson (2020). Predictors include ethnicity 23 | indicators and a set of keywords derived from text essay data. 24 | } 25 | \examples{ 26 | data(okc_binary) 27 | str(okc_binary_train) 28 | } 29 | \keyword{datasets} 30 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2023 applicable authors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /R/score.R: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # ---------------------- Model fit generic interface -------------------------- 3 | # ----------------------------------------------------------------------------- 4 | 5 | #' A scoring function 6 | #' 7 | #' @param object Depending on the context: 8 | #' 9 | #' * A __data frame__ of predictors. 10 | #' * A __matrix__ of predictors. 11 | #' * A __recipe__ specifying a set of preprocessing steps 12 | #' created from [recipes::recipe()]. 13 | #' 14 | #' @param ... Not currently used, but required for extensibility. 15 | #' 16 | #' @return 17 | #' 18 | #' A tibble of predictions. 19 | #' 20 | #' @export 21 | score <- function(object, ...) { 22 | UseMethod("score") 23 | } 24 | 25 | #' @export 26 | #' @export score.default 27 | #' @rdname score 28 | score.default <- function(object, ...) { 29 | cls <- class(object)[1] 30 | message <- 31 | "`object` is not of a recognized type. 32 | Only data.frame, matrix, recipe, and formula objects are allowed. 33 | A {cls} was specified." 
34 | message <- glue::glue(message) 35 | rlang::abort(message = message) 36 | } 37 | -------------------------------------------------------------------------------- /.github/workflows/lock.yaml: -------------------------------------------------------------------------------- 1 | name: 'Lock Threads' 2 | 3 | on: 4 | schedule: 5 | - cron: '0 0 * * *' 6 | 7 | jobs: 8 | lock: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: dessant/lock-threads@v2 12 | with: 13 | github-token: ${{ github.token }} 14 | issue-lock-inactive-days: '14' 15 | # issue-exclude-labels: '' 16 | # issue-lock-labels: 'outdated' 17 | issue-lock-comment: > 18 | This issue has been automatically locked. If you believe you have 19 | found a related problem, please file a new issue (with a reprex: 20 | ) and link to this issue. 21 | issue-lock-reason: '' 22 | pr-lock-inactive-days: '14' 23 | # pr-exclude-labels: 'wip' 24 | pr-lock-labels: '' 25 | pr-lock-comment: > 26 | This pull request has been automatically locked. If you believe you 27 | have found a related problem, please file a new issue (with a reprex: 28 | ) and link to this issue. 29 | pr-lock-reason: '' 30 | # process-only: 'issues' 31 | -------------------------------------------------------------------------------- /man/applicable-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/applicable-package.R 3 | \docType{package} 4 | \name{applicable-package} 5 | \alias{applicable} 6 | \alias{applicable-package} 7 | \title{applicable: A Compilation of Applicability Domain Methods} 8 | \description{ 9 | \if{html}{\figure{logo.png}{options: style='float: right' alt='logo' width='120'}} 10 | 11 | A modeling package compiling applicability domain methods in R. It combines different methods to measure the amount of extrapolation new samples can have from the training set. 
# Tests for autoplot.apd_pca (R/plot.R). Each test fits an apd_pca model on
# mtcars and checks that the data and axis labels embedded in the returned
# ggplot object match the model's percentile table (`ad$pctls`) reshaped to
# long format with tidyr::gather(), mirroring what autoplot() does internally.

test_that("output of autoplot.apd_pca is correct when no options are provided", {
  ad <- apd_pca(mtcars)
  ad_plot <- ggplot2::autoplot(ad)

  # Expected plot data: all percentile columns, gathered to long format.
  pctls <- ad$pctls %>%
    tidyr::gather(component, value, -percentile)

  expect_equal(ad_plot$data, pctls)
  expect_equal(ad_plot$labels$x, "abs(value)")
  expect_equal(ad_plot$labels$y, "percentile")
})

test_that("output of autoplot.apd_pca is correct when options=matches are provided", {
  ad <- apd_pca(mtcars)
  # `...` selectors are forwarded to the plot; only PC1-PC5 should be shown.
  ad_plot <- ggplot2::autoplot(ad, matches("PC[1-5]"))

  # Expected data restricted to the selected components (plus percentile).
  pctls <- ad$pctls %>%
    select(matches("PC[1-5]"), percentile) %>%
    tidyr::gather(component, value, -percentile)

  expect_equal(ad_plot$data, pctls)
  expect_equal(ad_plot$labels$x, "abs(value)")
  expect_equal(ad_plot$labels$y, "percentile")
})

test_that("output of autoplot.apd_pca is correct when options=distance are provided", {
  ad <- apd_pca(mtcars)
  # A bare string selector should also work (selects the "distance" column).
  ad_plot <- ggplot2::autoplot(ad, "distance")

  pctls <- ad$pctls %>%
    select(matches("distance"), percentile) %>%
    tidyr::gather(component, value, -percentile)

  expect_equal(ad_plot$data, pctls)
  expect_equal(ad_plot$labels$x, "abs(value)")
  expect_equal(ad_plot$labels$y, "percentile")
})
35 | } 36 | \examples{ 37 | train <- mtcars[1:20, ] 38 | test <- mtcars[21:32, -1] 39 | 40 | # Fit 41 | mod <- apd_pca(mpg ~ cyl + log(drat), train) 42 | 43 | # Predict, with preprocessing 44 | score(mod, test) 45 | } 46 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | release: 9 | types: [published] 10 | workflow_dispatch: 11 | 12 | name: pkgdown 13 | 14 | jobs: 15 | pkgdown: 16 | runs-on: ubuntu-latest 17 | # Only restrict concurrency for non-PR jobs 18 | concurrency: 19 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 20 | env: 21 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 22 | steps: 23 | - uses: actions/checkout@v3 24 | 25 | - uses: r-lib/actions/setup-pandoc@v2 26 | 27 | - uses: r-lib/actions/setup-r@v2 28 | with: 29 | use-public-rspm: true 30 | 31 | - uses: r-lib/actions/setup-r-dependencies@v2 32 | with: 33 | extra-packages: any::pkgdown, local::. 
34 | needs: website 35 | 36 | - name: Build site 37 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 38 | shell: Rscript {0} 39 | 40 | - name: Deploy to GitHub pages 🚀 41 | if: github.event_name != 'pull_request' 42 | uses: JamesIves/github-pages-deploy-action@v4.4.1 43 | with: 44 | clean: false 45 | branch: gh-pages 46 | folder: docs 47 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: applicable 2 | Title: A Compilation of Applicability Domain Methods 3 | Version: 0.0.1.1 4 | Authors@R: c( 5 | person("Marly", "Gotti", , "marlygotti@gmail.com", role = c("aut", "cre")), 6 | person("Max", "Kuhn", , "max@posit.co", role = "aut"), 7 | person(given = "Posit Software, PBC", role = c("cph", "fnd")) 8 | ) 9 | Description: A modeling package compiling applicability domain methods in 10 | R. It combines different methods to measure the amount of 11 | extrapolation new samples can have from the training set. See Netzeva 12 | et al (2005) for an overview of 13 | applicability domains. 
14 | License: MIT + file LICENSE 15 | URL: https://github.com/tidymodels/applicable, 16 | https://applicable.tidymodels.org 17 | BugReports: https://github.com/tidymodels/applicable/issues 18 | Depends: 19 | ggplot2, 20 | R (>= 3.4) 21 | Imports: 22 | dplyr, 23 | glue, 24 | hardhat (>= 0.1.2), 25 | Matrix, 26 | proxyC, 27 | purrr, 28 | rlang, 29 | stats, 30 | tibble, 31 | tidyr, 32 | tidyselect, 33 | utils 34 | Suggests: 35 | covr, 36 | knitr, 37 | modeldata, 38 | recipes (>= 0.1.7), 39 | rmarkdown, 40 | spelling, 41 | testthat (>= 3.0.0), 42 | xml2 43 | VignetteBuilder: 44 | knitr 45 | Encoding: UTF-8 46 | Language: en-US 47 | LazyData: true 48 | Roxygen: list(markdown = TRUE) 49 | RoxygenNote: 7.2.3 50 | Config/Needs/website: tidyverse/tidytemplate 51 | Config/testthat/edition: 3 52 | -------------------------------------------------------------------------------- /man/score.apd_hat_values.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/hat_values-score.R 3 | \name{score.apd_hat_values} 4 | \alias{score.apd_hat_values} 5 | \title{Score new samples using hat values} 6 | \usage{ 7 | \method{score}{apd_hat_values}(object, new_data, type = "numeric", ...) 8 | } 9 | \arguments{ 10 | \item{object}{A \code{apd_hat_values} object.} 11 | 12 | \item{new_data}{A data frame or matrix of new predictors.} 13 | 14 | \item{type}{A single character. The type of predictions to generate. 15 | Valid options are: 16 | \itemize{ 17 | \item \code{"numeric"} for a numeric value that summarizes the hat values for 18 | each sample across the training set. 19 | }} 20 | 21 | \item{...}{Not used, but required for extensibility.} 22 | } 23 | \value{ 24 | A tibble of predictions. The number of rows in the tibble is guaranteed 25 | to be the same as the number of rows in \code{new_data}. 
For \code{type = "numeric"}, 26 | the tibble contains two columns \code{hat_values} and \code{hat_values_pctls}. The 27 | column \code{hat_values_pctls} is in percent units so that a value of 11.5 28 | indicates that, in the training set, 11.5 percent of the training set 29 | samples had smaller values than the sample being scored. 30 | } 31 | \description{ 32 | Score new samples using hat values 33 | } 34 | \examples{ 35 | train_data <- mtcars[1:20, ] 36 | test_data <- mtcars[21:32, ] 37 | 38 | hat_values_model <- apd_hat_values(train_data) 39 | 40 | hat_values_scoring <- score(hat_values_model, new_data = test_data) 41 | hat_values_scoring 42 | } 43 | -------------------------------------------------------------------------------- /.github/workflows/test-coverage.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: test-coverage 10 | 11 | jobs: 12 | test-coverage: 13 | runs-on: ubuntu-latest 14 | env: 15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | 20 | - uses: r-lib/actions/setup-r@v2 21 | with: 22 | use-public-rspm: true 23 | 24 | - uses: r-lib/actions/setup-r-dependencies@v2 25 | with: 26 | extra-packages: any::covr 27 | needs: coverage 28 | 29 | - name: Test coverage 30 | run: | 31 | covr::codecov( 32 | quiet = FALSE, 33 | clean = FALSE, 34 | install_path = file.path(Sys.getenv("RUNNER_TEMP"), "package") 35 | ) 36 | shell: Rscript {0} 37 | 38 | - name: Show testthat output 39 | if: always() 40 | run: | 41 | ## -------------------------------------------------------------------- 42 | find ${{ runner.temp }}/package -name 'testthat.Rout*' -exec cat '{}' \; || true 43 | shell: bash 44 | 45 | - 
name: Upload test results 46 | if: failure() 47 | uses: actions/upload-artifact@v3 48 | with: 49 | name: coverage-test-failures 50 | path: ${{ runner.temp }}/package 51 | -------------------------------------------------------------------------------- /R/0.R: -------------------------------------------------------------------------------- 1 | #' @importFrom dplyr %>% 2 | #' @importFrom dplyr select 3 | #' @importFrom dplyr slice 4 | #' @importFrom dplyr matches 5 | #' @importFrom dplyr starts_with 6 | #' @importFrom dplyr rename_all 7 | #' @importFrom dplyr mutate 8 | #' @importFrom dplyr mutate_all 9 | #' @importFrom dplyr group_by 10 | #' @importFrom dplyr ungroup 11 | #' @importFrom dplyr count 12 | #' @importFrom dplyr sample_n 13 | #' @importFrom glue glue 14 | #' @importFrom tibble as_tibble 15 | #' @importFrom tibble tibble 16 | #' @importFrom purrr map_dfc 17 | #' @importFrom purrr map2_dfc 18 | #' @importFrom rlang abort 19 | #' @importFrom rlang enquos 20 | #' @importFrom rlang arg_match 21 | #' @importFrom stats predict 22 | #' @importFrom stats prcomp 23 | #' @importFrom stats approx 24 | #' @importFrom stats quantile 25 | #' @importFrom stats ecdf 26 | #' @importFrom stats setNames 27 | #' @importFrom hardhat validate_prediction_size 28 | #' @importFrom hardhat forge 29 | #' @importFrom hardhat mold 30 | #' @importFrom hardhat new_model 31 | #' @importFrom ggplot2 ggplot geom_step xlab ylab aes autoplot 32 | #' @importFrom Matrix Matrix colSums 33 | #' @importFrom tidyselect vars_select 34 | #' @importFrom tidyr gather 35 | #' @importFrom proxyC simil 36 | 37 | # ------------------------------------------------------------------------------ 38 | # nocov 39 | 40 | # Reduce false positives when R CMD check runs its "no visible binding for 41 | # global variable" check 42 | #' @importFrom utils globalVariables 43 | utils::globalVariables( 44 | c("cumulative", "n", "sim", "percentile", "component", "value") 45 | ) 46 | 47 | # nocov end 48 | 
# nocov start

# Dynamically register an S3 method for a generic owned by another package,
# without taking a hard dependency on that package (vendored from vctrs).
#
# generic: a string of the form "package::generic".
# class:   the class the method dispatches on.
# method:  the method function; when NULL, `generic.class` is looked up in the
#          calling package's namespace.
#
# Returns invisibly. Registration also re-runs via an onLoad hook so it
# survives the target package being unloaded and reloaded.
s3_register <- function(generic, class, method = NULL) {
  stopifnot(is.character(generic), length(generic) == 1)
  stopifnot(is.character(class), length(class) == 1)

  pieces <- strsplit(generic, "::")[[1]]
  stopifnot(length(pieces) == 2)
  package <- pieces[[1]]
  generic <- pieces[[2]]

  caller <- parent.frame()

  # Where to look up `generic.class` when `method` is NULL: the caller's
  # namespace when called from package code, otherwise the caller itself.
  get_method_env <- function() {
    top <- topenv(caller)
    if (isNamespace(top)) {
      asNamespace(environmentName(top))
    } else {
      caller
    }
  }

  # Fix: the helper previously declared an unused `env` parameter yet was
  # always called with a single argument; match upstream vctrs and drop it.
  get_method <- function(method) {
    if (is.null(method)) {
      get(paste0(generic, ".", class), envir = get_method_env())
    } else {
      method
    }
  }

  method_fn <- get_method(method)
  stopifnot(is.function(method_fn))

  # Always register hook in case package is later unloaded & reloaded.
  setHook(
    packageEvent(package, "onLoad"),
    function(...) {
      ns <- asNamespace(package)

      # Refresh the method, it might have been updated by `devtools::load_all()`
      method_fn <- get_method(method)

      registerS3method(generic, class, method_fn, envir = ns)
    }
  )

  # Avoid registration failures during loading (pkgload or regular).
  if (!isNamespaceLoaded(package)) {
    return(invisible())
  }

  envir <- asNamespace(package)

  # Only register if the generic can be accessed in the target namespace.
  if (exists(generic, envir)) {
    registerS3method(generic, class, method_fn, envir = envir)
  }

  invisible()
}

# nocov end
For \code{type = "numeric"}, 29 | the tibble contains a column called "similarity". If \code{add_percentile = TRUE}, 30 | an additional column called \code{similarity_pctl} will be added. These values are 31 | in percent units so that a value of 11.5 indicates that, in the training set, 32 | 11.5 percent of the training set samples had smaller values than the sample 33 | being scored. 34 | } 35 | \description{ 36 | Score new samples using similarity methods 37 | } 38 | \examples{ 39 | \donttest{ 40 | data(qsar_binary) 41 | 42 | jacc_sim <- apd_similarity(binary_tr) 43 | 44 | mean_sim <- score(jacc_sim, new_data = binary_unk) 45 | mean_sim 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /man/apd_hat_values.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/hat_values-fit.R 3 | \name{apd_hat_values} 4 | \alias{apd_hat_values} 5 | \alias{apd_hat_values.default} 6 | \alias{apd_hat_values.data.frame} 7 | \alias{apd_hat_values.matrix} 8 | \alias{apd_hat_values.formula} 9 | \alias{apd_hat_values.recipe} 10 | \title{Fit a \code{apd_hat_values}} 11 | \usage{ 12 | apd_hat_values(x, ...) 13 | 14 | \method{apd_hat_values}{default}(x, ...) 15 | 16 | \method{apd_hat_values}{data.frame}(x, ...) 17 | 18 | \method{apd_hat_values}{matrix}(x, ...) 19 | 20 | \method{apd_hat_values}{formula}(formula, data, ...) 21 | 22 | \method{apd_hat_values}{recipe}(x, data, ...) 23 | } 24 | \arguments{ 25 | \item{x}{Depending on the context: 26 | \itemize{ 27 | \item A \strong{data frame} of predictors. 28 | \item A \strong{matrix} of predictors. 29 | \item A \strong{recipe} specifying a set of preprocessing steps 30 | created from \code{\link[recipes:recipe]{recipes::recipe()}}. 
31 | }} 32 | 33 | \item{...}{Not currently used, but required for extensibility.} 34 | 35 | \item{formula}{A formula specifying the predictor terms on the right-hand 36 | side. No outcome should be specified.} 37 | 38 | \item{data}{When a \strong{recipe} or \strong{formula} is used, \code{data} is specified as: 39 | \itemize{ 40 | \item A \strong{data frame} containing the predictors. 41 | }} 42 | } 43 | \value{ 44 | A \code{apd_hat_values} object. 45 | } 46 | \description{ 47 | \code{apd_hat_values()} fits a model. 48 | } 49 | \examples{ 50 | predictors <- mtcars[, -1] 51 | 52 | # Data frame interface 53 | mod <- apd_hat_values(predictors) 54 | 55 | # Formula interface 56 | mod2 <- apd_hat_values(mpg ~ ., mtcars) 57 | 58 | # Recipes interface 59 | library(recipes) 60 | rec <- recipe(mpg ~ ., mtcars) 61 | rec <- step_log(rec, disp) 62 | mod3 <- apd_hat_values(rec, mtcars) 63 | } 64 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | # 4 | # NOTE: This workflow is overkill for most R packages and 5 | # check-standard.yaml is likely a better choice. 6 | # usethis::use_github_action("check-standard") will install it. 
7 | on: 8 | push: 9 | branches: [main, master] 10 | pull_request: 11 | branches: [main, master] 12 | 13 | name: R-CMD-check 14 | 15 | jobs: 16 | R-CMD-check: 17 | runs-on: ${{ matrix.config.os }} 18 | 19 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 20 | 21 | strategy: 22 | fail-fast: false 23 | matrix: 24 | config: 25 | - {os: macos-latest, r: 'release'} 26 | 27 | - {os: windows-latest, r: 'release'} 28 | # Use 3.6 to trigger usage of RTools35 29 | - {os: windows-latest, r: '3.6'} 30 | # use 4.1 to check with rtools40's older compiler 31 | - {os: windows-latest, r: '4.1'} 32 | 33 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 34 | - {os: ubuntu-latest, r: 'release'} 35 | - {os: ubuntu-latest, r: 'oldrel-1'} 36 | 37 | env: 38 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 39 | R_KEEP_PKG_SOURCE: yes 40 | 41 | steps: 42 | - uses: actions/checkout@v3 43 | 44 | - uses: r-lib/actions/setup-pandoc@v2 45 | 46 | - uses: r-lib/actions/setup-r@v2 47 | with: 48 | r-version: ${{ matrix.config.r }} 49 | http-user-agent: ${{ matrix.config.http-user-agent }} 50 | use-public-rspm: true 51 | 52 | - uses: r-lib/actions/setup-r-dependencies@v2 53 | with: 54 | extra-packages: any::rcmdcheck 55 | needs: check 56 | 57 | - uses: r-lib/actions/check-r-package@v2 58 | with: 59 | upload-snapshots: true 60 | -------------------------------------------------------------------------------- /man/apd_pca.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pca-fit.R 3 | \name{apd_pca} 4 | \alias{apd_pca} 5 | \alias{apd_pca.default} 6 | \alias{apd_pca.data.frame} 7 | \alias{apd_pca.matrix} 8 | \alias{apd_pca.formula} 9 | \alias{apd_pca.recipe} 10 | \title{Fit a \code{apd_pca}} 11 | \usage{ 12 | apd_pca(x, ...) 13 | 14 | \method{apd_pca}{default}(x, ...) 15 | 16 | \method{apd_pca}{data.frame}(x, threshold = 0.95, ...) 
17 | 18 | \method{apd_pca}{matrix}(x, threshold = 0.95, ...) 19 | 20 | \method{apd_pca}{formula}(formula, data, threshold = 0.95, ...) 21 | 22 | \method{apd_pca}{recipe}(x, data, threshold = 0.95, ...) 23 | } 24 | \arguments{ 25 | \item{x}{Depending on the context: 26 | \itemize{ 27 | \item A \strong{data frame} of predictors. 28 | \item A \strong{matrix} of predictors. 29 | \item A \strong{recipe} specifying a set of preprocessing steps 30 | created from \code{\link[recipes:recipe]{recipes::recipe()}}. 31 | }} 32 | 33 | \item{...}{Not currently used, but required for extensibility.} 34 | 35 | \item{threshold}{A number indicating the percentage of variance desired from 36 | the principal components. It must be a number greater than 0 and less or 37 | equal than 1.} 38 | 39 | \item{formula}{A formula specifying the predictor terms on the right-hand 40 | side. No outcome should be specified.} 41 | 42 | \item{data}{When a \strong{recipe} or \strong{formula} is used, \code{data} is specified as: 43 | \itemize{ 44 | \item A \strong{data frame} containing the predictors. 45 | }} 46 | } 47 | \value{ 48 | A \code{apd_pca} object. 49 | } 50 | \description{ 51 | \code{apd_pca()} fits a model. 52 | } 53 | \details{ 54 | The function computes the principal components that account for 55 | up to either 95\% or the provided \code{threshold} of variability. It also 56 | computes the percentiles of the absolute value of the principal components. 57 | Additionally, it calculates the mean of each principal component. 
# Find the distance between each row of principal component scores and the
# per-component means calculated on the training set.
#
# pcs:       matrix (or data frame) of PC scores, one row per sample.
# pca_means: numeric vector of training-set means, one per component.
#
# Returns a numeric vector of Euclidean distances, one per row of `pcs`.
find_distance_to_pca_means <- function(pcs, pca_means) {
  # Center each column at its training mean, then take the row-wise norm.
  # rowSums() is the vectorized equivalent of the previous
  # apply(sq_diff, 1, ...) loop and avoids the per-row function calls.
  diffs <- sweep(pcs, 2, pca_means)
  sqrt(rowSums(diffs^2))
}

# -----------------------------------------------------------------------------

# Reference percentiles of `x`: the empirical CDF evaluated on a 101-point
# grid over [0, 1]. Returned unnamed for downstream interpolation.
get_ref_percentile <- function(x) {
  res <- stats::ecdf(x)
  grid <- seq(0, 1, length = 101)
  res <- stats::quantile(res, grid)
  unname(res)
}

# -----------------------------------------------------------------------------

# Compute (X'X)^-1 for the hat-value calculation, aborting with an
# informative message when X'X is singular.
get_inv <- function(X) {
  if (!is.matrix(X)) {
    X <- as.matrix(X)
  }

  XpX <- t(X) %*% X

  # Robustness fix: the previous code matched the *exact* error string
  # (including a trailing newline), which is brittle across R versions and
  # message translations. Detect the singular case with a substring test and
  # re-raise any other failure with its original condition message.
  XpX_inv <- tryCatch(
    qr.solve(XpX),
    error = function(e) {
      msg <- conditionMessage(e)
      if (grepl("singular", msg, fixed = TRUE)) {
        msg <- paste(
          "Unable to compute the hat values of the matrix X of",
          "predictors because the matrix resulting from multiplying",
          "the transpose of X by X is singular.",
          sep = "\n"
        )
      }
      rlang::abort(message = msg)
    }
  )

  dimnames(XpX_inv) <- NULL
  XpX_inv
}

# -----------------------------------------------------------------------------

# Interpolate the percentile of new values against the reference percentiles.
# Values below/above the reference range are clamped to 0/1 respectively
# (approx() would otherwise return NA outside the range).
get_new_percentile <- function(ref, x_new, grid) {
  res <- stats::approx(ref, grid, xout = x_new)$y
  res[x_new < min(ref, na.rm = TRUE)] <- 0
  res[x_new > max(ref, na.rm = TRUE)] <- 1
  res
}
# Tests for R/pca-score.R: both the internal bridge/implementation functions
# (score_apd_pca_numeric, score_apd_pca_bridge) and the exported score()
# method for apd_pca. The "matches predict" tests fit on the first 15 rows of
# mtcars and score the next 15, comparing the PC columns of the output
# against stats::predict() on the underlying prcomp fit.

test_that("`score_apd_pca_numeric` fails when model has no pcs argument", {
  # mtcars is not an apd_pca model, so `model$pcs` is missing.
  expect_snapshot(error = TRUE,
    score_apd_pca_numeric(mtcars, mtcars)
  )
})

test_that("`score` fails when predictors only contain factors", {
  model <- apd_pca(~., iris)
  # iris$Species is a factor vector, not a data frame of predictors.
  expect_snapshot(error = TRUE,
    score(model, iris$Species)
  )
})

test_that("`score` fails when predictors are vectors", {
  object <- iris

  # A plain data frame dispatches to score.default, which aborts.
  expect_snapshot(error = TRUE,
    score(object)
  )
})

test_that("`score_apd_pca_numeric` pcs output matches `stats::predict` output", {
  model <- apd_pca(mtcars %>% dplyr::slice(1:15))
  predictors <- as.matrix(mtcars %>% dplyr::slice(16:30))

  # Reference: project new rows onto the retained principal components.
  expected <- stats::predict(model$pcs, predictors)
  expected <- as.data.frame(expected[, 1:model$num_comp, drop = FALSE])

  # Select columns of the form PC{number}
  actual_output <- score_apd_pca_numeric(model, predictors) %>%
    dplyr::select(dplyr::matches("^PC\\d+$"))

  # Data frame method
  expect_equal(ignore_attr = TRUE,
    actual_output,
    expected
  )
})

test_that("`score` pcs output matches `stats::predict` output", {
  model <- apd_pca(mtcars %>% dplyr::slice(1:15))
  predictors <- as.matrix(mtcars %>% dplyr::slice(16:30))

  # Reference: project new rows onto the retained principal components.
  expected <- stats::predict(model$pcs, predictors)
  expected <- as.data.frame(expected[, 1:model$num_comp, drop = FALSE])

  # Select columns of the form PC{number}
  actual_output <- score(model, predictors) %>%
    dplyr::select(dplyr::matches("^PC\\d+$"))

  # Data frame method
  expect_equal(ignore_attr = TRUE,
    actual_output,
    expected
  )
})

test_that("`score_apd_pca_bridge` output is correct", {
  model <- apd_pca(mtcars %>% dplyr::slice(1:15))
  predictors <- as.matrix(mtcars %>% dplyr::slice(16:30))

  # Reference: project new rows onto the retained principal components.
  expected <- stats::predict(model$pcs, predictors)
  expected <- as.data.frame(expected[, 1:model$num_comp, drop = FALSE])

  # Select columns of the form PC{number}
  actual_output <- score_apd_pca_bridge("numeric", model, predictors) %>%
    dplyr::select(dplyr::matches("^PC\\d+$"))

  # Data frame method
  expect_equal(ignore_attr = TRUE,
    actual_output,
    expected
  )
})
#' Print the number of predictors and principal components used.
#'
#' @param x A `apd_pca` object.
#'
#' @param ... Not currently used, but required for extensibility.
#'
#' @return None
#'
#' @examples
#'
#' model <- apd_pca(~ Sepal.Length + Sepal.Width, iris)
#' print(model)
#' @export
print.apd_pca <- function(x, ...) {
  predictors_count <- ncol(x$blueprint$ptypes$predictors)
  percentage <- x$threshold * 100
  num_comp <- x$num_comp

  # Singular/plural phrasing for the number of retained components.
  wording <- if (num_comp == 1) "component was" else "components were"

  cat(glue::glue(
    "# Predictors:
{predictors_count}
# Principal Components:
{num_comp} {wording} needed
to capture at least {percentage}% of the
total variation in the predictors."
  ))

  invisible(x)
}

#' Print the number of predictors used.
#'
#' @param x A `apd_hat_values` object.
#'
#' @param ... Not currently used, but required for extensibility.
#'
#' @return None
#'
#' @examples
#'
#' model <- apd_hat_values(~ Sepal.Length + Sepal.Width, iris)
#' print(model)
#' @export
print.apd_hat_values <- function(x, ...) {
  predictors_count <- ncol(x$blueprint$ptypes$predictors)

  cat(glue::glue(
    "# Predictors:
{predictors_count}"
  ))

  invisible(x)
}

#' Print a summary of a similarity applicability domain model.
#'
#' @param x A `apd_similarity` object.
#'
#' @param ... Not currently used, but required for extensibility.
#'
#' @return None
#'
#' @examples
#'
#' set.seed(535)
#' tr_x <- matrix(
#'   sample(0:1, size = 20 * 50, prob = rep(.5, 2), replace = TRUE),
#'   ncol = 20
#' )
#' model <- apd_similarity(tr_x)
#' print(model)
#' @export
print.apd_similarity <- function(x, ...) {
  cat("Applicability domain via similarity\n")
  cat(
    "Reference data were", ncol(x$ref_data), "variables collected on",
    nrow(x$ref_data), "data points.\n"
  )
  # Report how scores for new data are summarized: a quantile when one was
  # supplied at fit time, otherwise the mean.
  if (is.na(x$quantile)) {
    cat("New data summarized using the mean.\n")
  } else {
    cat(
      "New data summarized using the ", round(x$quantile * 100, 1),
      "th percentile.\n",
      sep = ""
    )
  }
  invisible(x)
}
51 | unique similarity values in the training set. Note that for large samples, 52 | this is an approximation based on a random sample of 5,000 training set 53 | points.
12 | #' @return \item{binary_tr,binary_unk}{data frames with 67 columns}
45 | More data related to the set described by De Cock (2011) where data were 46 | recorded for 2,930 properties in Ames IA. 47 | 48 | This data set includes three more properties added since the original 49 | reference. There are fewer fields in this data set; only those that could be 50 | transcribed from the assessor's office were included.
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | issue_comment: 5 | types: [created] 6 | 7 | name: Commands 8 | 9 | jobs: 10 | document: 11 | if: ${{ github.event.issue.pull_request && (github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') && startsWith(github.event.comment.body, '/document') }} 12 | name: document 13 | runs-on: ubuntu-latest 14 | env: 15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 16 | steps: 17 | - uses: actions/checkout@v3 18 | 19 | - uses: r-lib/actions/pr-fetch@v2 20 | with: 21 | repo-token: ${{ secrets.GITHUB_TOKEN }} 22 | 23 | - uses: r-lib/actions/setup-r@v2 24 | with: 25 | use-public-rspm: true 26 | 27 | - uses: r-lib/actions/setup-r-dependencies@v2 28 | with: 29 | extra-packages: any::roxygen2 30 | needs: pr-document 31 | 32 | - name: Document 33 | run: roxygen2::roxygenise() 34 | shell: Rscript {0} 35 | 36 | - name: commit 37 | run: | 38 | git config --local user.name "$GITHUB_ACTOR" 39 | git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" 40 | git add man/\* NAMESPACE 41 | git commit -m 'Document' 42 | 43 | - uses: r-lib/actions/pr-push@v2 44 | with: 45 | repo-token: ${{ secrets.GITHUB_TOKEN }} 46 | 47 | style: 48 | if: ${{ github.event.issue.pull_request && (github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') && startsWith(github.event.comment.body, '/style') }} 49 | name: style 50 | runs-on: ubuntu-latest 51 | env: 52 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 53 | steps: 54 | - uses: actions/checkout@v3 55 | 56 | - uses: r-lib/actions/pr-fetch@v2 57 | with: 58 | repo-token: ${{ secrets.GITHUB_TOKEN }} 59 | 60 | - uses: r-lib/actions/setup-r@v2 61 | 62 | - name: Install dependencies 63 | run: install.packages("styler") 64 | shell: Rscript {0} 65 | 66 | - name: Style 67 | run: styler::style_pkg() 68 | shell: Rscript {0} 69 | 70 | - name: commit 71 | run: | 72 | git config --local user.name 
"$GITHUB_ACTOR" 73 | git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" 74 | git add \*.R 75 | git commit -m 'Style' 76 | 77 | - uses: r-lib/actions/pr-push@v2 78 | with: 79 | repo-token: ${{ secrets.GITHUB_TOKEN }} 80 | -------------------------------------------------------------------------------- /tests/testthat/_snaps/similarity.md: -------------------------------------------------------------------------------- 1 | # bad args 2 | 3 | Code 4 | apd_similarity(tr_x, quantile = 2) 5 | Error 6 | The `quantile` argument should be NA or a single numeric value in [0, 1]. 7 | 8 | --- 9 | 10 | Code 11 | apd_similarity(tr_x_sp) 12 | Error 13 | `x` is not of a recognized type. 14 | Only data.frame, matrix, recipe, and formula objects are allowed. 15 | A dgCMatrix was specified. 16 | 17 | # printed output 18 | 19 | Code 20 | print(apd_similarity(tr_x)) 21 | Output 22 | Applicability domain via similarity 23 | Reference data were 20 variables collected on 50 data points. 24 | New data summarized using the mean. 25 | 26 | --- 27 | 28 | Code 29 | print(apd_similarity(tr_x)) 30 | Output 31 | Applicability domain via similarity 32 | Reference data were 20 variables collected on 50 data points. 33 | New data summarized using the mean. 34 | 35 | --- 36 | 37 | Code 38 | print(apd_similarity(tr_x)) 39 | Output 40 | Applicability domain via similarity 41 | Reference data were 20 variables collected on 50 data points. 42 | New data summarized using the mean. 43 | 44 | --- 45 | 46 | Code 47 | print(apd_similarity(tr_x, quantile = 0.13)) 48 | Output 49 | Applicability domain via similarity 50 | Reference data were 20 variables collected on 50 data points. 51 | New data summarized using the 13th percentile. 52 | 53 | # apd_similarity fails when quantile is neither NA nor a number in [0, 1] 54 | 55 | Code 56 | apd_similarity(tr_x, quantile = -1) 57 | Error 58 | The `quantile` argument should be NA or a single numeric value in [0, 1]. 
59 | 60 | --- 61 | 62 | Code 63 | apd_similarity(tr_x, quantile = 3) 64 | Error 65 | The `quantile` argument should be NA or a single numeric value in [0, 1]. 66 | 67 | --- 68 | 69 | Code 70 | apd_similarity(tr_x, quantile = "la") 71 | Error 72 | The `quantile` argument should be NA or a single numeric value in [0, 1]. 73 | 74 | # apd_similarity outputs warning with zero variance variables 75 | 76 | Code 77 | apd_similarity(bad_data) 78 | Warning 79 | The following variables had zero variance and were removed: a, b, and d 80 | Output 81 | Applicability domain via similarity 82 | Reference data were 1 variables collected on 2 data points. 83 | New data summarized using the mean. 84 | 85 | # apd_similarity fails when all the variables have zero variance 86 | 87 | Code 88 | apd_similarity(bad_data) 89 | Error 90 | All variables have a single unique value. 91 | 92 | # apd_similarity fails data is not binary 93 | 94 | Code 95 | apd_similarity(bad_data) 96 | Error 97 | The following variables are not binary: b, and d 98 | 99 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # applicable 5 | 6 | 7 | 8 | [![R-CMD-check](https://github.com/tidymodels/applicable/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/tidymodels/applicable/actions/workflows/R-CMD-check.yaml) 9 | [![Codecov test 10 | coverage](https://codecov.io/gh/tidymodels/applicable/branch/main/graph/badge.svg)](https://app.codecov.io/gh/tidymodels/applicable?branch=main) 11 | [![Lifecycle:experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html) 12 | [![CRAN 13 | status](https://www.r-pkg.org/badges/version/applicable)](https://cran.r-project.org/package=applicable) 14 | 15 | 16 | ## Introduction 17 | 18 | There are times when a model’s prediction should be taken with some 19 | 
skepticism. For example, if a new data point is substantially different 20 | from the training set, its predicted value may be suspect. In chemistry, 21 | it is not uncommon to create an “applicability domain” model that 22 | measures the amount of potential extrapolation new samples have from the 23 | training set. applicable contains different methods to measure how much 24 | a new data point is an extrapolation from the original data (if at all). 25 | 26 | ## Installation 27 | 28 | You can install the released version of applicable from 29 | [CRAN](https://CRAN.R-project.org) with: 30 | 31 | ``` r 32 | install.packages("applicable") 33 | ``` 34 | 35 | Install the development version of applicable from 36 | [GitHub](https://github.com/) with: 37 | 38 | ``` r 39 | # install.packages("pak") 40 | pak::pak("tidymodels/applicable") 41 | ``` 42 | 43 | ## Vignettes 44 | 45 | To learn about how to use applicable, check out the vignettes: 46 | 47 | - `vignette("binary-data", "applicable")`: Learn different methods to 48 | analyze binary data. 49 | 50 | - `vignette("continuous-data", "applicable")`: Learn different methods 51 | to analyze continuous data. 52 | 53 | ## Contributing 54 | 55 | This project is released with a [Contributor Code of 56 | Conduct](https://contributor-covenant.org/version/2/0/CODE_OF_CONDUCT.html). 57 | By contributing to this project, you agree to abide by its terms. 58 | 59 | - For questions and discussions about tidymodels packages, modeling, and 60 | machine learning, please [post on RStudio 61 | Community](https://community.rstudio.com/new-topic?category_id=15&tags=tidymodels,question). 62 | 63 | - If you think you have encountered a bug, please [submit an 64 | issue](https://github.com/tidymodels/applicable/issues). 65 | 66 | - Either way, learn how to create and share a 67 | [reprex](https://reprex.tidyverse.org/articles/articles/learn-reprex.html) 68 | (a minimal, reproducible example), to clearly communicate about your 69 | code. 
44 | similarity scores. These can be summarized into the mean similarity.
45 | 46 | * `autoplot`: shows the cumulative probability versus the unique similarity 47 | values in the training set. 48 | 49 | * `score`: scores new samples using similarity methods. In particular, it 50 | calculates the similarity scores and if `add_percentile = TRUE`, it also 51 | estimates the percentile of the similarity scores. 52 | 53 | ## Example 54 | 55 | The example data is from two QSAR data sets where binary fingerprints are used 56 | as predictors. 57 | 58 | ```{r} 59 | data(qsar_binary) 60 | ``` 61 | 62 | Let us construct the model: 63 | 64 | ```{r} 65 | jacc_sim <- apd_similarity(binary_tr) 66 | jacc_sim 67 | ``` 68 | 69 | As we can see below, this is a fairly diverse training set: 70 | 71 | ```{r jac-plot} 72 | #| fig-alt: "Empirical cumulative distribution chart. Mean similarity along the x-axis, Cumulative Probability along the why axis. Reading from left to right, values stay close to 0 from x = 0 to x = 0.25, from x = 0.25 to x = 0.4 there is a near-linear upwards trend to about y = 0.70. After that y = 1." 73 | library(ggplot2) 74 | 75 | # Plot the empirical cumulative distribution function for the training set 76 | autoplot(jacc_sim) 77 | ``` 78 | 79 | We can compare the similarity between new samples and the training set: 80 | 81 | ```{r} 82 | # Summarize across all training set similarities 83 | mean_sim <- score(jacc_sim, new_data = binary_unk) 84 | mean_sim 85 | ``` 86 | 87 | Samples 3 and 5 are definitely extrapolations based on these predictors. 88 | In other words, the new samples are not similar to the training set and so 89 | predictions on them may not be very reliable. 
90 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | editor_options: 4 | chunk_output_type: console 5 | --- 6 | 7 | 8 | 9 | ```{r, include = FALSE} 10 | knitr::opts_chunk$set( 11 | collapse = TRUE, 12 | comment = "#>", 13 | fig.path = "man/figures/README-", 14 | out.width = "100%" 15 | ) 16 | 17 | options(rlang__backtrace_on_error = "reminder") 18 | 19 | ``` 20 | 21 | # applicable 22 | 23 | 24 | [![R-CMD-check](https://github.com/tidymodels/applicable/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/tidymodels/applicable/actions/workflows/R-CMD-check.yaml) 25 | [![Codecov test coverage](https://codecov.io/gh/tidymodels/applicable/branch/main/graph/badge.svg)](https://app.codecov.io/gh/tidymodels/applicable?branch=main) 26 | [![Lifecycle:experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html) 27 | [![CRAN status](https://www.r-pkg.org/badges/version/applicable)](https://cran.r-project.org/package=applicable) 28 | 29 | 30 | ## Introduction 31 | 32 | There are times when a model's prediction should be taken with some skepticism. For example, if a new data point is substantially different from the training set, its predicted value may be suspect. In chemistry, it is not uncommon to create an "applicability domain" model that measures the amount of potential extrapolation new samples have from the training set. applicable contains different methods to measure how much a new data point is an extrapolation from the original data (if at all). 
33 | 34 | ## Installation 35 | 36 | You can install the released version of applicable from [CRAN](https://CRAN.R-project.org) with: 37 | 38 | ``` r 39 | install.packages("applicable") 40 | ``` 41 | 42 | Install the development version of applicable from [GitHub](https://github.com/) with: 43 | 44 | ``` r 45 | # install.packages("pak") 46 | pak::pak("tidymodels/applicable") 47 | ``` 48 | 49 | ## Vignettes 50 | 51 | To learn about how to use applicable, check out the vignettes: 52 | 53 | - `vignette("binary-data", "applicable")`: Learn different methods to analyze binary data. 54 | 55 | - `vignette("continuous-data", "applicable")`: Learn different methods to analyze continuous data. 56 | 57 | ## Contributing 58 | 59 | This project is released with a [Contributor Code of Conduct](https://contributor-covenant.org/version/2/0/CODE_OF_CONDUCT.html). By contributing to this project, you agree to abide by its terms. 60 | 61 | - For questions and discussions about tidymodels packages, modeling, and machine learning, please [post on RStudio Community](https://community.rstudio.com/new-topic?category_id=15&tags=tidymodels,question). 62 | 63 | - If you think you have encountered a bug, please [submit an issue](https://github.com/tidymodels/applicable/issues). 64 | 65 | - Either way, learn how to create and share a [reprex](https://reprex.tidyverse.org/articles/articles/learn-reprex.html) (a minimal, reproducible example), to clearly communicate about your code. 66 | 67 | - Check out further details on [contributing guidelines for tidymodels packages](https://www.tidymodels.org/contribute/) and [how to get help](https://www.tidymodels.org/help/). 
68 | -------------------------------------------------------------------------------- /tests/testthat/test-pca-fit.R: -------------------------------------------------------------------------------- 1 | test_that("`new_apd_pca` arguments are assigned correctly", { 2 | x <- new_apd_pca( 3 | "pcs", 4 | "pca_means", 5 | "pctls", 6 | "threshold", 7 | "num_comp", 8 | blueprint = hardhat::default_xy_blueprint() 9 | ) 10 | 11 | expect_equal(names(x), c("pcs", "pca_means", "pctls", "threshold", "num_comp", "blueprint")) 12 | expect_equal(x$pcs, "pcs") 13 | expect_equal(x$pca_means, "pca_means") 14 | expect_equal(x$pctls, "pctls") 15 | expect_equal(x$threshold, "threshold") 16 | expect_equal(x$num_comp, "num_comp") 17 | expect_equal(x$blueprint, hardhat::default_xy_blueprint()) 18 | }) 19 | 20 | test_that("pcs is provided", { 21 | expect_snapshot(error = TRUE, 22 | new_apd_pca(blueprint = hardhat::default_xy_blueprint()) 23 | ) 24 | }) 25 | 26 | test_that("`new_apd_pca` fails when blueprint is numeric", { 27 | expect_snapshot(error = TRUE, 28 | new_apd_pca(pcs = 1, blueprint = 1) 29 | ) 30 | }) 31 | 32 | test_that("`new_apd_pca` returned blueprint is of class hardhat_blueprint", { 33 | x <- new_apd_pca( 34 | "pcs", 35 | "pca_means", 36 | "pctls", 37 | "threshold", 38 | "num_comp", 39 | blueprint = hardhat::default_xy_blueprint() 40 | ) 41 | 42 | expect_s3_class(x$blueprint, "hardhat_blueprint") 43 | }) 44 | 45 | 46 | test_that("`apd_pca` fails when model is not of class apd_pca", { 47 | model <- apd_pca(~ Sepal.Length + Species, iris) 48 | expect_s3_class(model, "apd_pca") 49 | }) 50 | 51 | test_that("`apd_pca` fails when model is not of class hardhat_model", { 52 | model <- apd_pca(~ Sepal.Length + Species, iris) 53 | expect_s3_class(model, "hardhat_model") 54 | }) 55 | 56 | test_that("pcs matches `prcomp` output for the data frame method", { 57 | expected <- stats::prcomp(mtcars, center = TRUE, scale. 
= TRUE) 58 | expected$x <- NULL 59 | 60 | # Data frame method 61 | expect_equal(ignore_attr = TRUE, 62 | apd_pca(mtcars)$pcs, 63 | expected 64 | ) 65 | }) 66 | 67 | test_that("pcs matches `prcomp` output for the formula method", { 68 | expected <- stats::prcomp(mtcars, center = TRUE, scale. = TRUE) 69 | expected$x <- NULL 70 | 71 | # Formula method 72 | expect_equal(ignore_attr = TRUE, 73 | apd_pca(~., mtcars)$pcs, 74 | expected 75 | ) 76 | }) 77 | 78 | test_that("pcs matches `prcomp` output for the recipe method", { 79 | expected <- stats::prcomp(mtcars, center = TRUE, scale. = TRUE) 80 | expected$x <- NULL 81 | 82 | # Recipe method 83 | rec <- recipes::recipe(~., mtcars) 84 | expect_equal(ignore_attr = TRUE, 85 | apd_pca(rec, data = mtcars)$pcs, 86 | expected 87 | ) 88 | }) 89 | 90 | test_that("pcs matches `prcomp` output for the matrix method", { 91 | expected <- stats::prcomp(mtcars, center = TRUE, scale. = TRUE) 92 | expected$x <- NULL 93 | 94 | # Matrix method 95 | expect_equal(ignore_attr = TRUE, 96 | apd_pca(as.matrix(mtcars))$pcs, 97 | expected 98 | ) 99 | }) 100 | 101 | test_that("`apd_pca` is not defined for vectors", { 102 | cls <- class(mtcars$mpg)[1] 103 | expected_message <- glue::glue("`x` is not of a recognized type. 104 | Only data.frame, matrix, recipe, and formula objects are allowed. 
105 | A {cls} was specified.") 106 | 107 | expect_condition( 108 | apd_pca(mtcars$mpg), 109 | expected_message 110 | ) 111 | }) 112 | -------------------------------------------------------------------------------- /R/hat_values-score.R: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # ---------------------- Model function implementation ------------------------ 3 | # ----------------------------------------------------------------------------- 4 | 5 | score_apd_hat_values_numeric <- function(model, predictors) { 6 | if (!("XtX_inv" %in% names(model))) { 7 | rlang::abort("The model must contain an XtX_inv argument.") 8 | } 9 | 10 | proj_matrix <- predictors %*% model$XtX_inv %*% t(predictors) 11 | hat_values <- diag(proj_matrix) 12 | 13 | hat_values_pctls <- get_new_percentile( 14 | model$pctls$hat_values_pctls, 15 | hat_values, 16 | model$pctls$percentile 17 | ) 18 | 19 | tibble::as_tibble( 20 | cbind( 21 | hat_values, 22 | hat_values_pctls 23 | ) 24 | ) 25 | } 26 | 27 | # ----------------------------------------------------------------------------- 28 | # ------------------------ Model function bridge ------------------------------ 29 | # ----------------------------------------------------------------------------- 30 | 31 | score_apd_hat_values_bridge <- function(type, model, predictors) { 32 | predictors <- as.matrix(predictors) 33 | 34 | score_function <- get_hat_values_score_function(type) 35 | predictions <- score_function(model, predictors) 36 | 37 | hardhat::validate_prediction_size(predictions, predictors) 38 | 39 | predictions 40 | } 41 | 42 | get_hat_values_score_function <- function(type) { 43 | switch(type, 44 | numeric = score_apd_hat_values_numeric 45 | ) 46 | } 47 | 48 | # ----------------------------------------------------------------------------- 49 | # ----------------------- Model function interface 
---------------------------- 50 | # ----------------------------------------------------------------------------- 51 | 52 | #' Score new samples using hat values 53 | #' 54 | #' @param object A `apd_hat_values` object. 55 | #' 56 | #' @param new_data A data frame or matrix of new predictors. 57 | #' 58 | #' @param type A single character. The type of predictions to generate. 59 | #' Valid options are: 60 | #' 61 | #' - `"numeric"` for a numeric value that summarizes the hat values for 62 | #' each sample across the training set. 63 | #' 64 | #' @param ... Not used, but required for extensibility. 65 | #' 66 | #' @return 67 | #' 68 | #' A tibble of predictions. The number of rows in the tibble is guaranteed 69 | #' to be the same as the number of rows in `new_data`. For `type = "numeric"`, 70 | #' the tibble contains two columns `hat_values` and `hat_values_pctls`. The 71 | #' column `hat_values_pctls` is in percent units so that a value of 11.5 72 | #' indicates that, in the training set, 11.5 percent of the training set 73 | #' samples had smaller values than the sample being scored. 74 | #' 75 | #' @examples 76 | #' train_data <- mtcars[1:20, ] 77 | #' test_data <- mtcars[21:32, ] 78 | #' 79 | #' hat_values_model <- apd_hat_values(train_data) 80 | #' 81 | #' hat_values_scoring <- score(hat_values_model, new_data = test_data) 82 | #' hat_values_scoring 83 | #' @export 84 | score.apd_hat_values <- function(object, new_data, type = "numeric", ...) 
{ 85 | forged <- hardhat::forge(new_data, object$blueprint) 86 | rlang::arg_match(type, valid_predict_types()) 87 | score_apd_hat_values_bridge(type, object, forged$predictors) 88 | } 89 | 90 | # ----------------------------------------------------------------------------- 91 | # ----------------------- Helper functions ------------------------------------ 92 | # ----------------------------------------------------------------------------- 93 | 94 | valid_predict_types <- function() { 95 | c("numeric") 96 | } 97 | -------------------------------------------------------------------------------- /R/pca-score.R: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # ---------------------- Model function implementation ------------------------ 3 | # ----------------------------------------------------------------------------- 4 | 5 | score_apd_pca_numeric <- function(model, predictors) { 6 | if (!("pcs" %in% names(model))) { 7 | rlang::abort("The model must contain a pcs argument.") 8 | } 9 | 10 | # Predict output and subset using `num_comp` 11 | predicted_output <- stats::predict(model$pcs, predictors) 12 | predicted_output <- predicted_output[, 1:model$num_comp, drop = FALSE] 13 | 14 | # Compute distances between new pca values and the pca means 15 | dists <- find_distance_to_pca_means(predicted_output, model$pca_means) 16 | 17 | predicted_output <- 18 | as_tibble(predicted_output) %>% 19 | setNames(names0(ncol(predicted_output), "PC")) %>% 20 | mutate(distance = dists) 21 | 22 | # Compute percentile of new pca values 23 | new_pctls <- purrr::map2_dfc( 24 | model$pctls %>% 25 | dplyr::select(-percentile), 26 | predicted_output %>% mutate_all(abs), 27 | get_new_percentile, 28 | grid = model$pctls$percentile 29 | ) %>% 30 | dplyr::rename_all(paste0, "_pctl") 31 | 32 | tibble::as_tibble( 33 | cbind( 34 | predicted_output, 35 | new_pctls 36 | ) 37 | ) 38 | 
} 39 | 40 | # ----------------------------------------------------------------------------- 41 | # ------------------------ Model function bridge ------------------------------ 42 | # ----------------------------------------------------------------------------- 43 | 44 | score_apd_pca_bridge <- function(type, model, predictors) { 45 | predictors <- as.matrix(predictors) 46 | 47 | score_function <- get_pca_score_function(type) 48 | predictions <- score_function(model, predictors) 49 | 50 | hardhat::validate_prediction_size(predictions, predictors) 51 | 52 | predictions 53 | } 54 | 55 | # ----------------------------------------------------------------------------- 56 | # ----------------------- Model function interface ---------------------------- 57 | # ----------------------------------------------------------------------------- 58 | 59 | #' Predict from a `apd_pca` 60 | #' 61 | #' @param object A `apd_pca` object. 62 | #' 63 | #' @param new_data A data frame or matrix of new samples. 64 | #' 65 | #' @param type A single character. The type of predictions to generate. 66 | #' Valid options are: 67 | #' 68 | #' - `"numeric"` for numeric predictions. 69 | #' 70 | #' @param ... Not used, but required for extensibility. 71 | #' 72 | #' @details The function computes the principal components of the new data and 73 | #' their percentiles as compared to the training data. The number of principal 74 | #' components computed depends on the `threshold` given at fit time. It also 75 | #' computes the multivariate distance between each principal component and its 76 | #' mean. 77 | #' 78 | #' @return 79 | #' 80 | #' A tibble of predictions. The number of rows in the tibble is guaranteed 81 | #' to be the same as the number of rows in `new_data`. 
82 | #' 83 | #' @examples 84 | #' train <- mtcars[1:20, ] 85 | #' test <- mtcars[21:32, -1] 86 | #' 87 | #' # Fit 88 | #' mod <- apd_pca(mpg ~ cyl + log(drat), train) 89 | #' 90 | #' # Predict, with preprocessing 91 | #' score(mod, test) 92 | #' @export 93 | score.apd_pca <- function(object, new_data, type = "numeric", ...) { 94 | forged <- hardhat::forge(new_data, object$blueprint) 95 | rlang::arg_match(type, valid_predict_types()) 96 | score_apd_pca_bridge(type, object, forged$predictors) 97 | } 98 | 99 | # ----------------------------------------------------------------------------- 100 | # ----------------------- Helper functions ------------------------------------ 101 | # ----------------------------------------------------------------------------- 102 | 103 | get_pca_score_function <- function(type) { 104 | switch(type, 105 | numeric = score_apd_pca_numeric 106 | ) 107 | } 108 | 109 | valid_predict_types <- function() { 110 | c("numeric") 111 | } 112 | -------------------------------------------------------------------------------- /man/apd_similarity.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/similarity.R 3 | \name{apd_similarity} 4 | \alias{apd_similarity} 5 | \alias{apd_similarity.default} 6 | \alias{apd_similarity.data.frame} 7 | \alias{apd_similarity.matrix} 8 | \alias{apd_similarity.formula} 9 | \alias{apd_similarity.recipe} 10 | \title{Applicability domain methods using binary similarity analysis} 11 | \usage{ 12 | apd_similarity(x, ...) 13 | 14 | \method{apd_similarity}{default}(x, quantile = NA_real_, ...) 15 | 16 | \method{apd_similarity}{data.frame}(x, quantile = NA_real_, ...) 17 | 18 | \method{apd_similarity}{matrix}(x, quantile = NA_real_, ...) 19 | 20 | \method{apd_similarity}{formula}(formula, data, quantile = NA_real_, ...) 21 | 22 | \method{apd_similarity}{recipe}(x, data, quantile = NA_real_, ...) 
23 | } 24 | \arguments{ 25 | \item{x}{Depending on the context: 26 | \itemize{ 27 | \item A \strong{data frame} of binary predictors. 28 | \item A \strong{matrix} of binary predictors. 29 | \item A \strong{recipe} specifying a set of preprocessing steps 30 | created from \code{\link[recipes:recipe]{recipes::recipe()}}. 31 | }} 32 | 33 | \item{...}{Options to pass to \code{proxyC::simil()}, such as \code{method}. If no 34 | options are specified, \code{method = "jaccard"} is used.} 35 | 36 | \item{quantile}{A real number between 0 and 1 or NA for how the similarity 37 | values for each sample versus the training set should be summarized. A value 38 | of \code{NA} specifies that the mean similarity is computed. Otherwise, the 39 | appropriate quantile is computed.} 40 | 41 | \item{formula}{A formula specifying the predictor terms on the right-hand 42 | side. No outcome should be specified.} 43 | 44 | \item{data}{When a \strong{recipe} or \strong{formula} is used, \code{data} is specified as: 45 | \itemize{ 46 | \item A \strong{data frame} containing the binary predictors. Any predictors with 47 | no 1's will be removed (with a warning). 48 | }} 49 | } 50 | \value{ 51 | A \code{apd_similarity} object. 52 | } 53 | \description{ 54 | \code{apd_similarity()} is used to analyze samples in terms of similarity scores 55 | for binary data. All features in the data should be binary (i.e. zero or 56 | one). 57 | } 58 | \details{ 59 | The function computes measures of similarity for different samples 60 | points. For example, suppose samples \code{A} and \code{B} both contain \emph{p} binary 61 | variables. First, a 2x2 table is constructed between \code{A} and \code{B} \emph{across 62 | their elements}. The table will contain \emph{p} entries across the four cells 63 | (see the example below). From this, different measures of likeness are 64 | computed. 
65 | 66 | For a training set of \emph{n} samples, a new sample is compared to each, 67 | resulting in \emph{n} similarity scores. These can be summarized into a single 68 | value; the mean similarity is used by default by the scoring function. 69 | 70 | For this method, the computational methods are fairly taxing for large data 71 | sets. The training set must be stored (albeit in a sparse matrix format) so 72 | object sizes may become large. 73 | 74 | By default, the computations are run in parallel using \emph{all possible 75 | cores}. To change this, call the \code{setThreadOptions} function in the 76 | \code{RcppParallel} package. 77 | } 78 | \examples{ 79 | \donttest{ 80 | data(qsar_binary) 81 | 82 | jacc_sim <- apd_similarity(binary_tr) 83 | jacc_sim 84 | 85 | # plot the empirical cumulative distribution function (ECDF) for the training set: 86 | library(ggplot2) 87 | autoplot(jacc_sim) 88 | 89 | # Example calculations for two samples: 90 | A <- as.matrix(binary_tr[1, ]) 91 | B <- as.matrix(binary_tr[2, ]) 92 | xtab <- table(A, B) 93 | xtab 94 | 95 | # Jaccard statistic 96 | xtab[2, 2] / (xtab[1, 2] + xtab[2, 1] + xtab[2, 2]) 97 | 98 | # Hamman statistic 99 | ((xtab[1, 1] + xtab[2, 2]) - (xtab[1, 2] + xtab[2, 1])) / sum(xtab) 100 | 101 | # Faith statistic 102 | (xtab[1, 1] + xtab[2, 2] / 2) / sum(xtab) 103 | 104 | # Summarize across all training set similarities 105 | mean_sim <- score(jacc_sim, new_data = binary_unk) 106 | mean_sim 107 | } 108 | } 109 | \references{ 110 | Leach, A. and Gillet V. (2007). \emph{An Introduction to 111 | Chemoinformatics}.
Springer, New York 112 | } 113 | -------------------------------------------------------------------------------- /tests/testthat/test-hat_values-fit.R: -------------------------------------------------------------------------------- 1 | test_that("`new_apd_hat_values` arguments are assigned correctly", { 2 | x <- new_apd_hat_values( 3 | "XtX_inv", 4 | "pctls", 5 | blueprint = hardhat::default_xy_blueprint() 6 | ) 7 | 8 | expect_equal(names(x), c("XtX_inv", "pctls", "blueprint")) 9 | expect_equal(x$XtX_inv, "XtX_inv") 10 | expect_equal(x$pctls, "pctls") 11 | expect_equal(x$blueprint, hardhat::default_xy_blueprint()) 12 | }) 13 | 14 | test_that("XtX_inv is provided", { 15 | expect_snapshot(error = TRUE, 16 | new_apd_hat_values(blueprint = hardhat::default_xy_blueprint()) 17 | ) 18 | }) 19 | 20 | test_that("`new_apd_hat_values` fails when blueprint is numeric", { 21 | expect_snapshot(error = TRUE, 22 | new_apd_hat_values(XtX_inv = 1, blueprint = 1) 23 | ) 24 | }) 25 | 26 | test_that("`new_apd_hat_values` returned blueprint is of class hardhat_blueprint", { 27 | x <- new_apd_hat_values( 28 | "XtX_inv", 29 | "pctls", 30 | blueprint = hardhat::default_xy_blueprint() 31 | ) 32 | 33 | expect_s3_class(x$blueprint, "hardhat_blueprint") 34 | }) 35 | 36 | test_that("`apd_hat_values` fails when model is not of class apd_hat_values", { 37 | model <- apd_hat_values(~ Sepal.Length + Species, iris) 38 | expect_s3_class(model, "apd_hat_values") 39 | }) 40 | 41 | test_that("`apd_hat_values` fails when model is not of class hardhat_model", { 42 | model <- apd_hat_values(~ Sepal.Length + Species, iris) 43 | expect_s3_class(model, "hardhat_model") 44 | }) 45 | 46 | test_that("`apd_hat_values` is defined for data.frame objects", { 47 | x <- apd_hat_values(mtcars) 48 | X <- as.matrix(mtcars) 49 | XpX <- t(X) %*% X 50 | XtX_inv <- qr.solve(XpX) 51 | dimnames(XtX_inv) <- NULL 52 | 53 | expect_equal(class(x), c("apd_hat_values", "hardhat_model", "hardhat_scalar")) 54 | 
expect_equal(names(x), c("XtX_inv", "pctls", "blueprint")) 55 | expect_equal(x$XtX_inv, XtX_inv) 56 | }) 57 | 58 | test_that("`apd_hat_values` is defined for formula objects", { 59 | x <- apd_hat_values(~ Sepal.Width + Sepal.Length, iris) 60 | X <- as.matrix(iris %>% select(Sepal.Width, Sepal.Length)) 61 | XpX <- t(X) %*% X 62 | XtX_inv <- qr.solve(XpX) 63 | dimnames(XtX_inv) <- NULL 64 | 65 | expect_equal(class(x), c("apd_hat_values", "hardhat_model", "hardhat_scalar")) 66 | expect_equal(names(x), c("XtX_inv", "pctls", "blueprint")) 67 | expect_equal(x$XtX_inv, XtX_inv) 68 | }) 69 | 70 | test_that("`apd_hat_values` is defined for recipe objects", { 71 | rec <- recipes::recipe(~ Sepal.Width + Sepal.Length, iris) 72 | x <- apd_hat_values(rec, data = iris) 73 | X <- as.matrix(iris %>% select(Sepal.Width, Sepal.Length)) 74 | XpX <- t(X) %*% X 75 | XtX_inv <- qr.solve(XpX) 76 | dimnames(XtX_inv) <- NULL 77 | 78 | expect_equal(class(x), c("apd_hat_values", "hardhat_model", "hardhat_scalar")) 79 | expect_equal(names(x), c("XtX_inv", "pctls", "blueprint")) 80 | expect_equal(x$XtX_inv, XtX_inv) 81 | }) 82 | 83 | test_that("`apd_hat_values` is defined for matrix objects", { 84 | X <- as.matrix(iris %>% select(-Species)) 85 | x <- apd_hat_values(X) 86 | XpX <- t(X) %*% X 87 | XtX_inv <- qr.solve(XpX) 88 | dimnames(XtX_inv) <- NULL 89 | 90 | expect_equal(class(x), c("apd_hat_values", "hardhat_model", "hardhat_scalar")) 91 | expect_equal(names(x), c("XtX_inv", "pctls", "blueprint")) 92 | expect_equal(x$XtX_inv, XtX_inv) 93 | }) 94 | 95 | test_that("`apd_hat_values` is not defined for vectors", { 96 | cls <- class(mtcars$mpg)[1] 97 | expected_message <- glue::glue("`x` is not of a recognized type. 98 | Only data.frame, matrix, recipe, and formula objects are allowed. 
99 | A {cls} was specified.") 100 | 101 | expect_condition( 102 | apd_hat_values(mtcars$mpg), 103 | expected_message 104 | ) 105 | }) 106 | 107 | test_that("`apd_hat_values` fails when matrix has more predictors than samples", { 108 | bad_data <- mtcars %>% 109 | slice(1:5) 110 | 111 | expect_snapshot(error = TRUE, 112 | apd_hat_values(bad_data) 113 | ) 114 | }) 115 | 116 | test_that("`apd_hat_values` fails when the matrix X^tX is singular", { 117 | bad_data <- matrix( 118 | rep(0, 6), 119 | nrow = 3 120 | ) 121 | colnames(bad_data) <- c("A", "B") 122 | 123 | expect_snapshot(error = TRUE, 124 | apd_hat_values(bad_data) 125 | ) 126 | }) 127 | 128 | test_that("`get_inv` behaves correctly when the input is not a matrix", { 129 | X <- c(1:5) 130 | expect_error(get_inv(X), NA) 131 | }) 132 | -------------------------------------------------------------------------------- /R/hat_values-fit.R: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # ---------------------- Model Constructor ------------------------------------ 3 | # ----------------------------------------------------------------------------- 4 | 5 | new_apd_hat_values <- function(XtX_inv, pctls, blueprint) { 6 | hardhat::new_model( 7 | XtX_inv = XtX_inv, 8 | pctls = pctls, 9 | blueprint = blueprint, 10 | class = "apd_hat_values" 11 | ) 12 | } 13 | 14 | # ----------------------------------------------------------------------------- 15 | # ---------------------- Model function implementation ------------------------ 16 | # ----------------------------------------------------------------------------- 17 | 18 | apd_hat_values_impl <- function(predictors) { 19 | X <- as.matrix(predictors) 20 | dimnames(X) <- NULL 21 | 22 | XtX_inv <- get_inv(X) 23 | 24 | P <- X %*% XtX_inv %*% t(X) 25 | hat_values <- diag(P) 26 | 27 | # Calculate percentile for all PCs and distances 28 | pctls <- 
as.data.frame(get_ref_percentile(hat_values)) %>% 29 | setNames("hat_values_pctls") %>% 30 | mutate(percentile = seq(0, 100, length = 101)) 31 | 32 | res <- 33 | list( 34 | XtX_inv = XtX_inv, 35 | pctls = pctls 36 | ) 37 | 38 | res 39 | } 40 | 41 | # ----------------------------------------------------------------------------- 42 | # ------------------------ Model function bridge ------------------------------ 43 | # ----------------------------------------------------------------------------- 44 | 45 | apd_hat_values_bridge <- function(processed, ...) { 46 | predictors <- processed$predictors 47 | 48 | if (ncol(predictors) >= nrow(predictors)) { 49 | rlang::abort("The number of columns must be less than the number of rows.") 50 | } 51 | 52 | fit <- apd_hat_values_impl(predictors) 53 | 54 | new_apd_hat_values( 55 | XtX_inv = fit$XtX_inv, 56 | pctls = fit$pctls, 57 | blueprint = processed$blueprint 58 | ) 59 | } 60 | 61 | # ----------------------------------------------------------------------------- 62 | # ----------------------- Model function interface ---------------------------- 63 | # ----------------------------------------------------------------------------- 64 | 65 | #' Fit a `apd_hat_values` 66 | #' 67 | #' `apd_hat_values()` fits a model. 68 | #' 69 | #' @param x Depending on the context: 70 | #' 71 | #' * A __data frame__ of predictors. 72 | #' * A __matrix__ of predictors. 73 | #' * A __recipe__ specifying a set of preprocessing steps 74 | #' created from [recipes::recipe()]. 75 | #' 76 | #' @param data When a __recipe__ or __formula__ is used, `data` is specified as: 77 | #' 78 | #' * A __data frame__ containing the predictors. 79 | #' 80 | #' @param formula A formula specifying the predictor terms on the right-hand 81 | #' side. No outcome should be specified. 82 | #' 83 | #' @param ... Not currently used, but required for extensibility. 84 | #' 85 | #' @return 86 | #' 87 | #' A `apd_hat_values` object. 
88 | #' 89 | #' @examples 90 | #' predictors <- mtcars[, -1] 91 | #' 92 | #' # Data frame interface 93 | #' mod <- apd_hat_values(predictors) 94 | #' 95 | #' # Formula interface 96 | #' mod2 <- apd_hat_values(mpg ~ ., mtcars) 97 | #' 98 | #' # Recipes interface 99 | #' library(recipes) 100 | #' rec <- recipe(mpg ~ ., mtcars) 101 | #' rec <- step_log(rec, disp) 102 | #' mod3 <- apd_hat_values(rec, mtcars) 103 | #' @export 104 | apd_hat_values <- function(x, ...) { 105 | UseMethod("apd_hat_values") 106 | } 107 | 108 | # Default method 109 | 110 | #' @export 111 | #' @rdname apd_hat_values 112 | apd_hat_values.default <- function(x, ...) { 113 | cls <- class(x)[1] 114 | message <- 115 | "`x` is not of a recognized type. 116 | Only data.frame, matrix, recipe, and formula objects are allowed. 117 | A {cls} was specified." 118 | message <- glue::glue(message) 119 | rlang::abort(message = message) 120 | } 121 | 122 | # Data frame method 123 | 124 | #' @export 125 | #' @rdname apd_hat_values 126 | apd_hat_values.data.frame <- function(x, ...) { 127 | processed <- hardhat::mold(x, NA_real_) 128 | apd_hat_values_bridge(processed, ...) 129 | } 130 | 131 | # Matrix method 132 | 133 | #' @export 134 | #' @rdname apd_hat_values 135 | apd_hat_values.matrix <- function(x, ...) { 136 | processed <- hardhat::mold(x, NA_real_) 137 | apd_hat_values_bridge(processed, ...) 138 | } 139 | 140 | # Formula method 141 | 142 | #' @export 143 | #' @rdname apd_hat_values 144 | apd_hat_values.formula <- function(formula, data, ...) { 145 | processed <- hardhat::mold(formula, data) 146 | apd_hat_values_bridge(processed, ...) 147 | } 148 | 149 | # Recipe method 150 | 151 | #' @export 152 | #' @rdname apd_hat_values 153 | apd_hat_values.recipe <- function(x, data, ...) { 154 | processed <- hardhat::mold(x, data) 155 | apd_hat_values_bridge(processed, ...) 
156 | } 157 | -------------------------------------------------------------------------------- /.github/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, caste, color, religion, or sexual 10 | identity and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the overall 26 | community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or advances of 31 | any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email address, 35 | without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for 
clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at codeofconduct@posit.co. 63 | All complaints will be reviewed and investigated promptly and fairly. 64 | 65 | All community leaders are obligated to respect the privacy and security of the 66 | reporter of any incident. 67 | 68 | ## Enforcement Guidelines 69 | 70 | Community leaders will follow these Community Impact Guidelines in determining 71 | the consequences for any action they deem in violation of this Code of Conduct: 72 | 73 | ### 1. Correction 74 | 75 | **Community Impact**: Use of inappropriate language or other behavior deemed 76 | unprofessional or unwelcome in the community. 77 | 78 | **Consequence**: A private, written warning from community leaders, providing 79 | clarity around the nature of the violation and an explanation of why the 80 | behavior was inappropriate. A public apology may be requested. 81 | 82 | ### 2. 
Warning 83 | 84 | **Community Impact**: A violation through a single incident or series of 85 | actions. 86 | 87 | **Consequence**: A warning with consequences for continued behavior. No 88 | interaction with the people involved, including unsolicited interaction with 89 | those enforcing the Code of Conduct, for a specified period of time. This 90 | includes avoiding interactions in community spaces as well as external channels 91 | like social media. Violating these terms may lead to a temporary or permanent 92 | ban. 93 | 94 | ### 3. Temporary Ban 95 | 96 | **Community Impact**: A serious violation of community standards, including 97 | sustained inappropriate behavior. 98 | 99 | **Consequence**: A temporary ban from any sort of interaction or public 100 | communication with the community for a specified period of time. No public or 101 | private interaction with the people involved, including unsolicited interaction 102 | with those enforcing the Code of Conduct, is allowed during this period. 103 | Violating these terms may lead to a permanent ban. 104 | 105 | ### 4. Permanent Ban 106 | 107 | **Community Impact**: Demonstrating a pattern of violation of community 108 | standards, including sustained inappropriate behavior, harassment of an 109 | individual, or aggression toward or disparagement of classes of individuals. 110 | 111 | **Consequence**: A permanent ban from any sort of public interaction within the 112 | community. 113 | 114 | ## Attribution 115 | 116 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 117 | version 2.1, available at 118 | <https://www.contributor-covenant.org/version/2/1/code_of_conduct.html>. 119 | 120 | Community Impact Guidelines were inspired by 121 | [Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/inclusion). 122 | 123 | For answers to common questions about this code of conduct, see the FAQ at 124 | <https://www.contributor-covenant.org/faq>. Translations are available at <https://www.contributor-covenant.org/translations>.
125 | 126 | [homepage]: https://www.contributor-covenant.org 127 | -------------------------------------------------------------------------------- /R/pca-fit.R: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # ---------------------- Model Constructor ------------------------------------ 3 | # ----------------------------------------------------------------------------- 4 | 5 | new_apd_pca <- function(pcs, pca_means, pctls, threshold, num_comp, blueprint) { 6 | hardhat::new_model( 7 | pcs = pcs, 8 | pca_means = pca_means, 9 | pctls = pctls, 10 | threshold = threshold, 11 | num_comp = num_comp, 12 | blueprint = blueprint, 13 | class = "apd_pca" 14 | ) 15 | } 16 | 17 | # ----------------------------------------------------------------------------- 18 | # ---------------------- Model function implementation ------------------------ 19 | # ----------------------------------------------------------------------------- 20 | 21 | apd_pca_impl <- function(predictors, threshold) { 22 | pcs <- stats::prcomp( 23 | predictors, 24 | center = TRUE, 25 | scale. 
= TRUE, 26 | retx = TRUE 27 | ) 28 | 29 | # TODO: verify threshold \in (0, 1] 30 | eigs <- pcs$sdev^2 31 | cum_sum <- cumsum(eigs) / sum(eigs) 32 | num_comp <- sum(cum_sum <= threshold) + 1 33 | 34 | # Update `pcs` count to `num_comp` 35 | pcs$x <- pcs$x[, 1:num_comp, drop = FALSE] 36 | 37 | # Find the mean of each principal component 38 | pca_means <- colMeans(pcs$x) 39 | 40 | # Compute distances between each principal component and its mean 41 | distance <- find_distance_to_pca_means(pcs$x, pca_means) 42 | pctls <- as_tibble(pcs$x) %>% 43 | setNames(names0(ncol(pcs$x), "PC")) %>% 44 | mutate_all(abs) %>% 45 | mutate(distance = distance) 46 | 47 | # Calculate percentile for all PCs and distances 48 | pctls <- map_dfc(pctls, get_ref_percentile) %>% 49 | mutate(percentile = seq(0, 100, length = 101)) 50 | 51 | pcs$x <- NULL 52 | 53 | res <- list( 54 | pcs = pcs, 55 | pctls = pctls, 56 | pca_means = pca_means, 57 | threshold = threshold, 58 | num_comp = num_comp 59 | ) 60 | res 61 | } 62 | 63 | # ----------------------------------------------------------------------------- 64 | # ------------------------ Model function bridge ------------------------------ 65 | # ----------------------------------------------------------------------------- 66 | 67 | apd_pca_bridge <- function(processed, threshold, ...) { 68 | predictors <- processed$predictors 69 | 70 | fit <- apd_pca_impl(predictors, threshold) 71 | 72 | new_apd_pca( 73 | pcs = fit$pcs, 74 | pca_means = fit$pca_means, 75 | pctls = fit$pctls, 76 | threshold = fit$threshold, 77 | num_comp = fit$num_comp, 78 | blueprint = processed$blueprint 79 | ) 80 | } 81 | 82 | # ----------------------------------------------------------------------------- 83 | # ----------------------- Model function interface ---------------------------- 84 | # ----------------------------------------------------------------------------- 85 | 86 | #' Fit a `apd_pca` 87 | #' 88 | #' `apd_pca()` fits a model. 
89 | #' 90 | #' @param x Depending on the context: 91 | #' 92 | #' * A __data frame__ of predictors. 93 | #' * A __matrix__ of predictors. 94 | #' * A __recipe__ specifying a set of preprocessing steps 95 | #' created from [recipes::recipe()]. 96 | #' 97 | #' @param data When a __recipe__ or __formula__ is used, `data` is specified as: 98 | #' 99 | #' * A __data frame__ containing the predictors. 100 | #' 101 | #' @param formula A formula specifying the predictor terms on the right-hand 102 | #' side. No outcome should be specified. 103 | #' 104 | #' @param threshold A number indicating the percentage of variance desired from 105 | #' the principal components. It must be a number greater than 0 and less or 106 | #' equal than 1. 107 | #' 108 | #' @param ... Not currently used, but required for extensibility. 109 | #' 110 | #' @details The function computes the principal components that account for 111 | #' up to either 95% or the provided `threshold` of variability. It also 112 | #' computes the percentiles of the absolute value of the principal components. 113 | #' Additionally, it calculates the mean of each principal component. 114 | #' 115 | #' @return 116 | #' 117 | #' A `apd_pca` object. 118 | #' 119 | #' @examples 120 | #' predictors <- mtcars[, -1] 121 | #' 122 | #' # Data frame interface 123 | #' mod <- apd_pca(predictors) 124 | #' 125 | #' # Formula interface 126 | #' mod2 <- apd_pca(mpg ~ ., mtcars) 127 | #' 128 | #' # Recipes interface 129 | #' library(recipes) 130 | #' rec <- recipe(mpg ~ ., mtcars) 131 | #' rec <- step_log(rec, disp) 132 | #' mod3 <- apd_pca(rec, mtcars) 133 | #' @export 134 | apd_pca <- function(x, ...) { 135 | UseMethod("apd_pca") 136 | } 137 | 138 | # Default method 139 | 140 | #' @export 141 | #' @rdname apd_pca 142 | apd_pca.default <- function(x, ...) { 143 | cls <- class(x)[1] 144 | message <- 145 | "`x` is not of a recognized type. 146 | Only data.frame, matrix, recipe, and formula objects are allowed. 
147 | A {cls} was specified." 148 | message <- glue::glue(message) 149 | rlang::abort(message = message) 150 | } 151 | 152 | # Data frame method 153 | 154 | #' @export 155 | #' @rdname apd_pca 156 | apd_pca.data.frame <- function(x, threshold = 0.95, ...) { 157 | processed <- hardhat::mold(x, NA_real_) 158 | apd_pca_bridge(processed, threshold, ...) 159 | } 160 | 161 | # Matrix method 162 | 163 | #' @export 164 | #' @rdname apd_pca 165 | apd_pca.matrix <- function(x, threshold = 0.95, ...) { 166 | processed <- hardhat::mold(x, NA_real_) 167 | apd_pca_bridge(processed, threshold, ...) 168 | } 169 | 170 | # Formula method 171 | 172 | #' @export 173 | #' @rdname apd_pca 174 | apd_pca.formula <- function(formula, data, threshold = 0.95, ...) { 175 | processed <- hardhat::mold(formula, data) 176 | apd_pca_bridge(processed, threshold, ...) 177 | } 178 | 179 | # Recipe method 180 | 181 | #' @export 182 | #' @rdname apd_pca 183 | apd_pca.recipe <- function(x, data, threshold = 0.95, ...) { 184 | processed <- hardhat::mold(x, data) 185 | apd_pca_bridge(processed, threshold, ...) 
186 | } 187 | -------------------------------------------------------------------------------- /tests/testthat/test-similarity.R: -------------------------------------------------------------------------------- 1 | library(proxyC) 2 | library(Matrix) 3 | library(recipes) 4 | library(ggplot2) 5 | 6 | # ------------------------------------------------------------------------------ 7 | 8 | # simulate a small data set 9 | make_data <- function(p, n, rate = .5) { 10 | x <- matrix(sample(0:1, size = p * n, prob = rep(rate, 2), replace = TRUE), ncol = p) 11 | colnames(x) <- paste0("x", 1:p) 12 | x 13 | } 14 | 15 | set.seed(535) 16 | tr_x <- make_data(20, 50) 17 | un_x <- make_data(20, 10) 18 | 19 | tr_x_sp <- Matrix(tr_x, sparse = TRUE) 20 | un_x_sp <- Matrix(un_x, sparse = TRUE) 21 | 22 | tr_scores <- simil(tr_x_sp, tr_x_sp, method = "jaccard") 23 | un_scores <- simil(tr_x_sp, un_x_sp, method = "jaccard") 24 | 25 | mean_tr <- apply(tr_scores, 1, mean) 26 | mean_tab <- as.data.frame(table(mean_tr), stringsAsFactors = FALSE) 27 | mean_tab$mean_tr <- as.numeric(mean_tab$mean_tr) 28 | mean_tab$cumulative <- cumsum(mean_tab$Freq) / 50 29 | 30 | # ------------------------------------------------------------------------------ 31 | 32 | test_that("matrix method - mean similarity", { 33 | tmp <- apd_similarity(tr_x) 34 | tmp_scores <- score(tmp, un_x) 35 | expect_equal(tmp_scores$similarity, apply(un_scores, 2, mean)) 36 | expect_equal(tmp$options, list(method = "jaccard")) 37 | expect_equal(tmp$ref_data, tr_x_sp) 38 | expect_equal(mean_tab$mean_tr, tmp$ref_scores$sim) 39 | expect_equal(mean_tab$Freq, tmp$ref_scores$n) 40 | expect_equal(mean_tab$cumulative, tmp$ref_scores$cumulative) 41 | }) 42 | 43 | test_that("data frame method - quantile similarity", { 44 | tmp <- apd_similarity(tr_x, quantile = .1) 45 | tmp_scores <- score(tmp, un_x) 46 | expect_equal(tmp_scores$similarity, apply(un_scores, 2, quantile, probs = .1)) 47 | }) 48 | 49 | test_that("formula method - mean 
similarity", { 50 | sim_form <- as.formula(" ~.") 51 | tmp <- apd_similarity(sim_form, tr_x) 52 | tmp_scores <- score(tmp, un_x) 53 | expect_equal(tmp_scores$similarity, apply(un_scores, 2, mean)) 54 | expect_equal(tmp$options, list(method = "jaccard")) 55 | expect_equal(tmp$ref_data, tr_x_sp) 56 | expect_equal(mean_tab$mean_tr, tmp$ref_scores$sim) 57 | expect_equal(mean_tab$Freq, tmp$ref_scores$n) 58 | expect_equal(mean_tab$cumulative, tmp$ref_scores$cumulative) 59 | }) 60 | 61 | # ------------------------------------------------------------------------------ 62 | 63 | test_that("data frame method - mean similarity", { 64 | tmp <- apd_similarity(as.data.frame(tr_x)) 65 | tmp_scores <- score(tmp, as.data.frame(un_x)) 66 | expect_equal(tmp_scores$similarity, apply(un_scores, 2, mean)) 67 | expect_equal(tmp$options, list(method = "jaccard")) 68 | expect_equal(tmp$ref_data, tr_x_sp) 69 | expect_equal(mean_tab$mean_tr, tmp$ref_scores$sim) 70 | expect_equal(mean_tab$Freq, tmp$ref_scores$n) 71 | expect_equal(mean_tab$cumulative, tmp$ref_scores$cumulative) 72 | }) 73 | 74 | 75 | test_that("matrix method - quantile similarity", { 76 | tmp <- apd_similarity(as.data.frame(tr_x), quantile = .1) 77 | tmp_scores <- score(tmp, as.data.frame(un_x)) 78 | expect_equal(tmp_scores$similarity, apply(un_scores, 2, quantile, probs = .1)) 79 | }) 80 | 81 | test_that("formula method - quantile similarity", { 82 | sim_form <- as.formula(" ~.") 83 | tmp <- apd_similarity(sim_form, tr_x, quantile = .1) 84 | tmp_scores <- score(tmp, as.data.frame(un_x)) 85 | expect_equal(tmp_scores$similarity, apply(un_scores, 2, quantile, probs = .1)) 86 | }) 87 | 88 | # ------------------------------------------------------------------------------ 89 | 90 | test_that("recipe method - mean similarity", { 91 | rec <- 92 | recipe(~., data = as.data.frame(tr_x)) %>% 93 | step_zv(all_predictors()) 94 | tmp <- apd_similarity(rec, as.data.frame(tr_x)) 95 | tmp_scores <- score(tmp, as.data.frame(un_x)) 96 | 
expect_equal(tmp_scores$similarity, apply(un_scores, 2, mean)) 97 | expect_equal(tmp$options, list(method = "jaccard")) 98 | expect_equal(tmp$ref_data, tr_x_sp) 99 | expect_equal(mean_tab$mean_tr, tmp$ref_scores$sim) 100 | expect_equal(mean_tab$Freq, tmp$ref_scores$n) 101 | expect_equal(mean_tab$cumulative, tmp$ref_scores$cumulative) 102 | }) 103 | 104 | 105 | test_that("matrix method - quantile similarity", { 106 | rec <- 107 | recipe(~., data = as.data.frame(tr_x)) %>% 108 | step_zv(all_predictors()) 109 | tmp <- apd_similarity(rec, as.data.frame(tr_x), quantile = .1) 110 | tmp_scores <- score(tmp, as.data.frame(un_x)) 111 | expect_equal(tmp_scores$similarity, apply(un_scores, 2, quantile, probs = .1)) 112 | }) 113 | 114 | # ------------------------------------------------------------------------------ 115 | 116 | test_that("bad args", { 117 | expect_snapshot(error = TRUE, 118 | apd_similarity(tr_x, quantile = 2) 119 | ) 120 | expect_snapshot(error = TRUE, 121 | apd_similarity(tr_x_sp) 122 | ) 123 | }) 124 | 125 | # ------------------------------------------------------------------------------ 126 | 127 | test_that("printed output", { 128 | expect_snapshot(print(apd_similarity(tr_x))) 129 | expect_snapshot(print(apd_similarity(tr_x))) 130 | expect_snapshot(print(apd_similarity(tr_x))) 131 | expect_snapshot(print(apd_similarity(tr_x, quantile = .13))) 132 | }) 133 | # ------------------------------------------------------------------------------ 134 | 135 | test_that("plot output", { 136 | ad <- apd_similarity(tr_x) 137 | ad_plot <- autoplot(ad) 138 | expect_equal(ad_plot$data, ad$ref_scores) 139 | expect_equal(ad_plot$labels$x, "mean similarity (training set)") 140 | expect_equal(ad_plot$labels$y, "Cumulative Probability") 141 | }) 142 | 143 | # ------------------------------------------------------------------------------ 144 | 145 | test_that("apd_similarity fails when quantile is neither NA nor a number in [0, 1]", { 146 | 147 | expect_snapshot(error = TRUE, 
148 | apd_similarity(tr_x, quantile = -1) 149 | ) 150 | 151 | expect_snapshot(error = TRUE, 152 | apd_similarity(tr_x, quantile = 3) 153 | ) 154 | 155 | expect_snapshot(error = TRUE, 156 | apd_similarity(tr_x, quantile = "la") 157 | ) 158 | }) 159 | 160 | # ------------------------------------------------------------------------------ 161 | 162 | test_that("apd_similarity outputs warning with zero variance variables ", { 163 | bad_data <- list( 164 | "a" = c(0, 0), 165 | "b" = c(0, 0), 166 | "c" = c(1, 1), 167 | "d" = c(0, 0) 168 | ) 169 | bad_data <- as.data.frame(bad_data) 170 | 171 | expect_snapshot( 172 | apd_similarity(bad_data) 173 | ) 174 | }) 175 | 176 | # ------------------------------------------------------------------------------ 177 | 178 | test_that("apd_similarity fails when all the variables have zero variance", { 179 | bad_data <- list( 180 | "a" = c(0, 0), 181 | "b" = c(0, 0), 182 | "d" = c(0, 0) 183 | ) 184 | bad_data <- as.data.frame(bad_data) 185 | 186 | expect_snapshot(error = TRUE, 187 | apd_similarity(bad_data) 188 | ) 189 | }) 190 | 191 | # ------------------------------------------------------------------------------ 192 | 193 | test_that("apd_similarity fails data is not binary", { 194 | bad_data <- list( 195 | "a" = c(0, 0), 196 | "b" = c(1, 3), 197 | "c" = c(1, 1), 198 | "d" = c(2, 0) 199 | ) 200 | bad_data <- as.data.frame(bad_data) 201 | 202 | expect_snapshot(error = TRUE, 203 | apd_similarity(bad_data) 204 | ) 205 | }) 206 | -------------------------------------------------------------------------------- /vignettes/continuous-data.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Applicability domain methods for continuous data" 3 | output: rmarkdown::html_vignette 4 | vignette: > 5 | %\VignetteIndexEntry{continuous-data} 6 | %\VignetteEngine{knitr::rmarkdown} 7 | %\VignetteEncoding{UTF-8} 8 | --- 9 | 10 | ```{r, include = FALSE} 11 | knitr::opts_chunk$set( 12 | collapse = TRUE, 
13 | comment = "#>" 14 | ) 15 | 16 | prev_options <- options(width = 100) 17 | 18 | library(ggplot2) 19 | theme_set(theme_bw()) 20 | ``` 21 | 22 | ## Introduction 23 | 24 | ```{r} 25 | library(applicable) 26 | ``` 27 | 28 | `applicable` provides the following methods to analyze the applicability domain of your model: 29 | 30 | * Principal component analysis 31 | * Hat values statistics 32 | 33 | ## Example 34 | 35 | We will use the Ames IA housing data for our example. 36 | 37 | ```{r ames_data, message=FALSE} 38 | library(modeldata) 39 | data(ames) 40 | ``` 41 | 42 | There are `r format(nrow(ames), big.mark = ",")` properties in the data. 43 | 44 | The Sale Price was recorded along with `r ncol(ames)` predictors, including: 45 | 46 | * Location (e.g. neighborhood) and lot information. 47 | * House components (garage, fireplace, pool, porch, etc.). 48 | * General assessments such as overall quality and condition. 49 | * Number of bedrooms, baths, and so on. 50 | 51 | More details can be found in [De Cock (2011, Journal of Statistics Education)](http://jse.amstat.org/v19n3/decock.pdf). 52 | 53 | The raw data are at [`http://jse.amstat.org/v19n3/decock/AmesHousing.txt`](http://jse.amstat.org/v19n3/decock/AmesHousing.txt) but we 54 | will use a processed version found in the 55 | [`AmesHousing`](https://github.com/topepo/AmesHousing) package. `applicable` 56 | also contains an update for these data for three new properties (although fewer 57 | fields were collected on these). 58 | 59 | To pre-process the training set, we will use the _recipes_ package. We first 60 | tell the recipes that there is an additional value for the neighborhood in these 61 | data, then direct it to create dummy variables for all categorical predictors. In 62 | cases where there are no levels observed for a factor, we eliminate predictors 63 | with a single unique value, then estimate a transformation that will make the 64 | predictor distributions more symmetric. 
After these, the data are centered and 65 | scaled. These same transformations will be applied to the new data points using 66 | the statistics estimated from the training set. 67 | 68 | ```{r prep_data, message=FALSE} 69 | library(recipes) 70 | library(dplyr) 71 | 72 | ames_cols <- intersect(names(ames), names(ames_new)) 73 | 74 | training_data <- 75 | ames %>% 76 | # For consistency, only analyze the data on new properties 77 | dplyr::select(one_of(ames_cols)) %>% 78 | mutate( 79 | # There is a new neighborhood in ames_new 80 | Neighborhood = as.character(Neighborhood), 81 | Neighborhood = factor(Neighborhood, levels = levels(ames_new$Neighborhood)) 82 | ) 83 | 84 | 85 | training_recipe <- 86 | recipe( ~ ., data = training_data) %>% 87 | step_dummy(all_nominal()) %>% 88 | # Remove variables that have the same value for every data point. 89 | step_zv(all_predictors()) %>% 90 | # Transform variables to be distributed as Gaussian-like as possible. 91 | step_YeoJohnson(all_numeric()) %>% 92 | # Normalize numeric data to have a mean of zero and 93 | # standard deviation of one. 94 | step_normalize(all_numeric()) 95 | ``` 96 | 97 | 98 | ### Principal Component Analysis 99 | 100 | The following functions in `applicable` are used for principal component 101 | analysis: 102 | 103 | * `apd_pca`: computes the principal components that account for up 104 | to either 95% or the provided `threshold` of variability. It also computes the 105 | percentiles of the principal components and the mean of each principal 106 | component. 107 | * `autoplot`: plots the distribution function for pcas. You can also provide an 108 | optional set of `dplyr` selectors, such as `dplyr::matches()` or 109 | `dplyr::starts_with()`, for selecting which variables should be shown in the 110 | plot. 111 | * `score`: calculates the principal components of the new data and their 112 | percentiles as compared to the training data. 
The number of principal 113 | components computed depends on the `threshold` given at fit time. It also 114 | computes the multivariate distance between each principal component and its 115 | mean. 116 | 117 | Let us apply `apd_pca` modeling function to our data: 118 | 119 | ```{r} 120 | ames_pca <- apd_pca(training_recipe, training_data) 121 | ames_pca 122 | ``` 123 | 124 | Since no `threshold` was provided, the function computed the number of 125 | principal components that accounted for at most 95% of the total variance. 126 | 127 | For illustration, setting `threshold = 0.25` or 25%, we now need only 10 principal components: 128 | 129 | ```{r} 130 | ames_pca <- apd_pca(training_recipe, training_data, threshold = 0.25) 131 | ames_pca 132 | ``` 133 | 134 | Plotting the distribution function for the PCA scores is also helpful: 135 | 136 | ```{r autoplot} 137 | #| fig-alt: "Faceted line chart. abs(value) along the x-axis, percentile along the y-axis. The facets are distance, followed by PC 1 through 12. All lines go up fairly fast." 138 | library(ggplot2) 139 | autoplot(ames_pca) 140 | ``` 141 | 142 | You can use regular expressions to plot a smaller subset of the pca statistics: 143 | 144 | ```{r, echo = FALSE, results = "hold"} 145 | autoplot(ames_pca, matches("PC0[1-5]")) 146 | autoplot(ames_pca, distance) + scale_x_log10() 147 | ``` 148 | 149 | The `score` function compares the training data to new samples. Let's go back to 150 | the case where we capture 95% of the variation in the predictors and score the 151 | new samples. 
Since we used the recipe interface, we can give the score function 152 | the original data: 153 | 154 | ```{r new_sample} 155 | ames_pca <- apd_pca(training_recipe, training_data) 156 | pca_score <- score(ames_pca, ames_new) 157 | pca_score %>% select(matches("PC00[1-3]"), contains("distance")) 158 | ``` 159 | 160 | Notice how the samples, displayed in red, are fairly dissimilar to the training 161 | set in the first component: 162 | 163 | ```{r, echo = FALSE} 164 | #| fig-alt: "Histogram chart. PC001 along the x-axis, count along the y-axis. A vertical red line is placed inside the distribution." 165 | training_scores <- score(ames_pca, training_data) 166 | ggplot(training_scores, aes(x = PC001)) + 167 | geom_histogram(col = "white", binwidth = .5) + 168 | geom_vline(xintercept = pca_score$PC001, col = "red") 169 | ``` 170 | 171 | What is driving the first component? We can look at which predictors have the 172 | largest values in the rotation matrix (i.e. the values that define the linear 173 | combinations in the PC scores). The top five are: 174 | 175 | ```{r} 176 | # `ames_pca$pcs` is the output of `prcomp()` 177 | comp_one <- ames_pca$pcs$rotation[, 1] 178 | comp_one[order(abs(comp_one), decreasing = TRUE)] %>% head(5) 179 | ``` 180 | 181 | These three houses are extreme in the most influential variable (year built) since 182 | they were new homes. The also tend to have fairly large garages: 183 | 184 | ```{r, echo = FALSE} 185 | #| fig-alt: "Histogram chart. Garage_Area along the x-axis, count along the y-axis. Two red vertical lines are places. One on the edge of the distribution, another outside the distribution." 186 | ggplot(training_data, aes(x = Garage_Area)) + 187 | geom_histogram(col = "white", binwidth = 50) + 188 | geom_vline(xintercept = ames_new$Garage_Area, col = "red") 189 | ``` 190 | 191 | This may be what is driving the first component. 
192 | 
193 | However, the overall distance values are relatively small, which indicates that,
194 | overall, these new houses are not outside the mainstream of the data.
195 | 
196 | 
197 | 
198 | ### Hat Values
199 | 
200 | The [Hat or leverage values](https://en.wikipedia.org/wiki/Leverage_(statistics)) are based on the numerics of linear regression. They measure the distance of a data point to the center of the training set distribution. For example, if the numeric training set matrix was $X_{n \times p}$, the _hat matrix_ for the training set would be computed using
201 | 
202 | $$H = X(X^\intercal X)^{-1}X^\intercal$$
203 | 
204 | The corresponding hat values for the training set would be the diagonals of $H$. These values can be computed using `stats::hatvalues(lm_model)` but only for an `lm` model object. Also, it cannot compute the values for new samples.
205 | 
206 | Suppose that we had a new, unknown sample (as a $p \times 1$ data vector $u$). The hat value for this sample would be
207 | 
208 | $$h = u^\intercal(X^\intercal X)^{-1}u$$.
209 | 
210 | The following functions in `applicable` are used to compute the hat values of your model:
211 | 
212 | * `apd_hat_values`: computes the matrix $(X^\intercal X)^{-1}$.
213 | * `score`: calculates the hat values of new samples and their percentiles.
214 | 
215 | Two caveats for using the hat values:
216 | 
217 | 1. The numerical methods are less tolerant than PCA. For example, extremely correlated predictors will degrade the ability of the hat values to be effectively used. Also, since an inverse is used, there cannot be any linear dependencies within $X$. To resolve the former issue, the recipe step `recipes::step_corr()` can be used to reduce correlation. For the latter issue, `recipes::step_lincomp()` will identify and remove linear dependencies in the data (as shown below).
218 | 
219 | 1. When using a linear or logistic model, the model adds an intercept column of ones to $X$.
For equivalent computations, you should add a vector or ones to the data or use `recipes::step_intercept()`. 220 | 221 | Let us apply `apd_hat_values` modeling function to our data (while ensuring that there are no linear dependencies): 222 | 223 | ```{r} 224 | non_singular_recipe <- 225 | training_recipe %>% 226 | step_lincomb(all_predictors()) 227 | 228 | # Recipe interface 229 | ames_hat <- apd_hat_values(non_singular_recipe, training_data) 230 | ``` 231 | 232 | 233 | ```{r reset_options} 234 | 235 | options(prev_options) 236 | 237 | ``` 238 | -------------------------------------------------------------------------------- /R/similarity.R: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | 3 | new_apd_similarity <- function(quantile, ref_data, options, ref_scores, blueprint) { 4 | hardhat::new_model( 5 | quantile = quantile, 6 | ref_data = ref_data, 7 | blueprint = blueprint, 8 | options = options, 9 | ref_scores = ref_scores, 10 | class = "apd_similarity" 11 | ) 12 | } 13 | 14 | # ----------------------------------------------------------------------------- 15 | # ---------------------- Model function implementation ------------------------ 16 | # ----------------------------------------------------------------------------- 17 | 18 | apd_similarity_impl <- function(predictors, quantile, options) { 19 | if (!any("method" == names(options))) { 20 | options$method <- "jaccard" 21 | } 22 | 23 | res <- list(quantile = quantile, ref_data = predictors, options = options) 24 | 25 | p <- nrow(predictors) 26 | keep_n <- min(p, 5000) 27 | sampling <- sample.int(p, keep_n) 28 | ref_scores <- 29 | tibble::tibble( 30 | sim = score_apd_similarity_numeric(res, predictors[sampling, , drop = FALSE], options) 31 | ) %>% 32 | dplyr::group_by(sim) %>% 33 | dplyr::count() %>% 34 | dplyr::ungroup() %>% 35 | dplyr::mutate(cumulative = cumsum(n) / sum(n)) 36 | 
res$ref_scores <- ref_scores 37 | res 38 | } 39 | 40 | # ----------------------------------------------------------------------------- 41 | # ---------------------- Model function bridge -------------------------------- 42 | # ----------------------------------------------------------------------------- 43 | 44 | apd_similarity_bridge <- function(processed, quantile = NA_real_, ...) { 45 | opts <- list(...) 46 | 47 | msg <- "The `quantile` argument should be NA or a single numeric value in [0, 1]." 48 | if (!is.na(quantile) && (!is.numeric(quantile) || length(quantile) != 1)) { 49 | rlang::abort(msg) 50 | } 51 | if (!is.na(quantile) && (quantile < 0 | quantile > 1)) { 52 | rlang::abort(msg) 53 | } 54 | 55 | predictors <- processed$predictors 56 | if (!is.matrix(predictors)) { 57 | predictors <- as.matrix(predictors) 58 | } 59 | 60 | not_bin <- apply(predictors, 2, function(x) any(x != 1 & x != 0)) 61 | if (any(not_bin)) { 62 | bad_x <- colnames(predictors)[not_bin] 63 | bad_x <- glue::glue_collapse(bad_x, sep = ", ", last = ", and ") 64 | rlang::abort( 65 | glue( 66 | "The following variables are not binary: {bad_x}" 67 | ) 68 | ) 69 | } 70 | 71 | if (!inherits(predictors, "dgCMatrix")) { 72 | predictors <- Matrix::Matrix(predictors, sparse = TRUE) 73 | } 74 | 75 | # check for binary and not zero-vars 76 | 77 | zv <- Matrix::colSums(predictors) 78 | if (all(zv == 0)) { 79 | rlang::abort("All variables have a single unique value.") 80 | } else { 81 | if (any(zv == 0)) { 82 | bad_x <- colnames(predictors)[zv == 0] 83 | bad_x <- glue::glue_collapse(bad_x, sep = ", ", last = ", and ") 84 | rlang::warn( 85 | glue( 86 | "The following variables had zero variance and were removed: {bad_x}" 87 | ) 88 | ) 89 | predictors <- predictors[, zv > 0, drop = FALSE] 90 | } 91 | } 92 | 93 | fit <- apd_similarity_impl(predictors, quantile = quantile, options = opts) 94 | 95 | new_apd_similarity( 96 | quantile = quantile, 97 | ref_data = fit$ref_data, 98 | options = fit$options, 99 | 
ref_scores = fit$ref_scores, 100 | blueprint = processed$blueprint 101 | ) 102 | } 103 | 104 | # ----------------------------------------------------------------------------- 105 | # ---------------------- Model function interface ----------------------------- 106 | # ----------------------------------------------------------------------------- 107 | 108 | #' Applicability domain methods using binary similarity analysis 109 | #' 110 | #' `apd_similarity()` is used to analyze samples in terms of similarity scores 111 | #' for binary data. All features in the data should be binary (i.e. zero or 112 | #' one). 113 | #' 114 | #' @param x Depending on the context: 115 | #' 116 | #' * A __data frame__ of binary predictors. 117 | #' * A __matrix__ of binary predictors. 118 | #' * A __recipe__ specifying a set of preprocessing steps 119 | #' created from [recipes::recipe()]. 120 | #' 121 | #' @param data When a __recipe__ or __formula__ is used, `data` is specified as: 122 | #' 123 | #' * A __data frame__ containing the binary predictors. Any predictors with 124 | #' no 1's will be removed (with a warning). 125 | #' 126 | #' @param formula A formula specifying the predictor terms on the right-hand 127 | #' side. No outcome should be specified. 128 | #' 129 | #' @param quantile A real number between 0 and 1 or NA for how the similarity 130 | #' values for each sample versus the training set should be summarized. A value 131 | #' of `NA` specifies that the mean similarity is computed. Otherwise, the 132 | #' appropriate quantile is computed. 133 | #' 134 | #' @param ... Options to pass to `proxyC::simil()`, such as `method`. If no 135 | #' options are specified, `method = "jaccard"` is used. 136 | #' 137 | #' @details The function computes measures of similarity for different samples 138 | #' points. For example, suppose samples `A` and `B` both contain _p_ binary 139 | #' variables. First, a 2x2 table is constructed between `A` and `B` _across 140 | #' their elements_. 
The table will contain _p_ entries across the four cells
141 | #' (see the example below). From this, different measures of likeness are
142 | #' computed.
143 | #'
144 | #' For a training set of _n_ samples, a new sample is compared to each,
145 | #' resulting in _n_ similarity scores. These can be summarized into a single
146 | #' value; the mean similarity is used by default by the scoring function.
147 | #'
148 | #' For this method, the computational methods are fairly taxing for large data
149 | #' sets. The training set must be stored (albeit in a sparse matrix format) so
150 | #' object sizes may become large.
151 | #'
152 | #' By default, the computations are run in parallel using _all possible
153 | #' cores_. To change this, call the `setThreadOptions` function in the
154 | #' `RcppParallel` package.
155 | #'
156 | #' @return
157 | #'
158 | #' An `apd_similarity` object.
159 | #'
160 | #' @references Leach, A. and Gillet V. (2007). _An Introduction to
161 | #' Chemoinformatics_.
Springer, New York 162 | #' @examples 163 | #' \donttest{ 164 | #' data(qsar_binary) 165 | #' 166 | #' jacc_sim <- apd_similarity(binary_tr) 167 | #' jacc_sim 168 | #' 169 | #' # plot the empirical cumulative distribution function (ECDF) for the training set: 170 | #' library(ggplot2) 171 | #' autoplot(jacc_sim) 172 | #' 173 | #' # Example calculations for two samples: 174 | #' A <- as.matrix(binary_tr[1, ]) 175 | #' B <- as.matrix(binary_tr[2, ]) 176 | #' xtab <- table(A, B) 177 | #' xtab 178 | #' 179 | #' # Jaccard statistic 180 | #' xtab[2, 2] / (xtab[1, 2] + xtab[2, 1] + xtab[2, 2]) 181 | #' 182 | #' # Hamman statistic 183 | #' ((xtab[1, 1] + xtab[2, 2]) - (xtab[1, 2] + xtab[2, 1])) / sum(xtab) 184 | #' 185 | #' # Faith statistic 186 | #' (xtab[1, 1] + xtab[2, 2] / 2) / sum(xtab) 187 | #' 188 | #' # Summarize across all training set similarities 189 | #' mean_sim <- score(jacc_sim, new_data = binary_unk) 190 | #' mean_sim 191 | #' } 192 | #' @export 193 | apd_similarity <- function(x, ...) { 194 | UseMethod("apd_similarity") 195 | } 196 | 197 | # Default method 198 | 199 | #' @export 200 | #' @rdname apd_similarity 201 | apd_similarity.default <- function(x, quantile = NA_real_, ...) { 202 | cls <- class(x)[1] 203 | message <- 204 | "`x` is not of a recognized type. 205 | Only data.frame, matrix, recipe, and formula objects are allowed. 206 | A {cls} was specified." 207 | message <- glue::glue(message) 208 | rlang::abort(message = message) 209 | } 210 | 211 | # Data frame method 212 | 213 | #' @export 214 | #' @rdname apd_similarity 215 | apd_similarity.data.frame <- function(x, quantile = NA_real_, ...) { 216 | processed <- hardhat::mold(x, NA_real_) 217 | apd_similarity_bridge(processed, quantile = quantile, ...) 218 | } 219 | 220 | # Matrix method 221 | 222 | #' @export 223 | #' @rdname apd_similarity 224 | apd_similarity.matrix <- function(x, quantile = NA_real_, ...) 
{ 225 | processed <- hardhat::mold(x, NA_real_) 226 | apd_similarity_bridge(processed, quantile = quantile, ...) 227 | } 228 | 229 | # Formula method 230 | 231 | #' @export 232 | #' @rdname apd_similarity 233 | apd_similarity.formula <- function(formula, data, quantile = NA_real_, ...) { 234 | processed <- hardhat::mold(formula, data) 235 | apd_similarity_bridge(processed, quantile = quantile, ...) 236 | } 237 | 238 | # Recipe method 239 | 240 | #' @export 241 | #' @rdname apd_similarity 242 | apd_similarity.recipe <- function(x, data, quantile = NA_real_, ...) { 243 | processed <- hardhat::mold(x, data) 244 | apd_similarity_bridge(processed, quantile = quantile, ...) 245 | } 246 | 247 | # ----------------------------------------------------------------------------- 248 | # ---------------------- Scoring function implementation ---------------------- 249 | # ----------------------------------------------------------------------------- 250 | 251 | score_apd_similarity_numeric <- function(model, predictors, options) { 252 | predictors <- 253 | predictors[, colnames(predictors) %in% colnames(model$ref_data), drop = FALSE] 254 | 255 | if (!is.matrix(predictors)) { 256 | predictors <- as.matrix(predictors) 257 | } 258 | 259 | if (!inherits(predictors, "dgCMatrix")) { 260 | predictors <- Matrix::Matrix(predictors, sparse = TRUE) 261 | } 262 | 263 | cl <- 264 | rlang::call2( 265 | "simil", 266 | .ns = "proxyC", 267 | x = rlang::expr(model$ref_data), 268 | y = rlang::expr(predictors), 269 | !!!model$options 270 | ) 271 | sims <- rlang::eval_tidy(cl) 272 | if (is.na(model$quantile)) { 273 | res <- apply(sims, 2, mean, na.rm = TRUE) 274 | } else { 275 | res <- apply(sims, 2, quantile, probs = model$quantile, na.rm = TRUE) 276 | } 277 | res 278 | } 279 | 280 | # ----------------------------------------------------------------------------- 281 | # ---------------------- Scoring function bridge ------------------------------ 282 | # 
----------------------------------------------------------------------------- 283 | 284 | score_apd_similarity_bridge <- function(type, model, predictors) { 285 | score_function <- get_sim_score_function(type) 286 | 287 | predictions <- score_function(model, predictors, options) 288 | predictions <- tibble::tibble(similarity = predictions) 289 | 290 | hardhat::validate_prediction_size(predictions, predictors) 291 | 292 | predictions 293 | } 294 | 295 | get_sim_score_function <- function(type) { 296 | switch(type, 297 | numeric = score_apd_similarity_numeric 298 | ) 299 | } 300 | 301 | # ----------------------------------------------------------------------------- 302 | # ---------------------- Scoring function interface --------------------------- 303 | # ----------------------------------------------------------------------------- 304 | 305 | #' Score new samples using similarity methods 306 | #' 307 | #' @param object A `apd_similarity` object. 308 | #' 309 | #' @param new_data A data frame or matrix of new predictors. 310 | #' 311 | #' @param type A single character. The type of predictions to generate. 312 | #' Valid options are: 313 | #' 314 | #' - `"numeric"` for a numeric value that summarizes the similarity values for 315 | #' each sample across the training set. 316 | #' 317 | #' @param add_percentile A single logical; should the percentile of the 318 | #' similarity score _relative to the training set values_ by computed? 319 | #' 320 | #' @param ... Not used, but required for extensibility. 321 | #' 322 | #' @return 323 | #' 324 | #' A tibble of predictions. The number of rows in the tibble is guaranteed 325 | #' to be the same as the number of rows in `new_data`. For `type = "numeric"`, 326 | #' the tibble contains a column called "similarity". If `add_percentile = TRUE`, 327 | #' an additional column called `similarity_pctl` will be added. 
These values are 328 | #' in percent units so that a value of 11.5 indicates that, in the training set, 329 | #' 11.5 percent of the training set samples had smaller values than the sample 330 | #' being scored. 331 | #' 332 | #' @examples 333 | #' \donttest{ 334 | #' data(qsar_binary) 335 | #' 336 | #' jacc_sim <- apd_similarity(binary_tr) 337 | #' 338 | #' mean_sim <- score(jacc_sim, new_data = binary_unk) 339 | #' mean_sim 340 | #' } 341 | #' @export 342 | score.apd_similarity <- function(object, new_data, type = "numeric", add_percentile = TRUE, ...) { 343 | forged <- hardhat::forge(new_data, object$blueprint) 344 | rlang::arg_match(type, valid_predict_types()) 345 | res <- score_apd_similarity_bridge(type, object, forged$predictors) 346 | if (add_percentile) { 347 | res$similarity_pctl <- sim_percentile(res$similarity, object$ref_scores) 348 | } 349 | res 350 | } 351 | 352 | # ----------------------------------------------------------------------------- 353 | # ---------------------- Helper functions ------------------------------------- 354 | # ----------------------------------------------------------------------------- 355 | 356 | sim_percentile <- function(sims, ref) { 357 | res <- stats::approx(ref$sim, ref$cumulative, xout = sims)$y 358 | res[sims < min(ref$sim, na.rm = TRUE)] <- 0 359 | res[sims > max(ref$sim, na.rm = TRUE)] <- 1 360 | 361 | res * 100 362 | } 363 | --------------------------------------------------------------------------------