├── .github
├── .gitignore
└── workflows
│ ├── recheck.yml
│ ├── pkgdown.yaml
│ └── R-CMD-check.yaml
├── hex
└── lookout.png
├── man
├── figures
│ ├── logo.png
│ ├── README-pressure-1.png
│ ├── README-unnamed-chunk-2-1.png
│ ├── README-unnamed-chunk-2-2.png
│ └── README-unnamed-chunk-3-1.png
├── reexports.Rd
├── autoplot.lookoutliers.Rd
├── lookout_ts.Rd
├── autoplot.persistingoutliers.Rd
├── lookout-package.Rd
├── find_tda_bw.Rd
├── persisting_outliers.Rd
├── mvscale.Rd
└── lookout.Rd
├── pkgdown
├── favicon
│ ├── favicon.ico
│ ├── favicon-16x16.png
│ ├── favicon-32x32.png
│ ├── apple-touch-icon.png
│ ├── apple-touch-icon-60x60.png
│ ├── apple-touch-icon-76x76.png
│ ├── apple-touch-icon-120x120.png
│ ├── apple-touch-icon-152x152.png
│ └── apple-touch-icon-180x180.png
└── extra.css
├── .gitignore
├── NEWS.md
├── .Rbuildignore
├── R
├── utils.R
├── lookout-package.R
├── print.R
├── lookout_ts.R
├── bandwidth.R
├── autoplot_lookout.R
├── autoplot_persistence.R
├── outlier_persistence.R
├── mvscale.R
└── lookoutliers.R
├── lookout.Rproj
├── NAMESPACE
├── _pkgdown.yml
├── DESCRIPTION
├── README.Rmd
└── README.md
/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 |
--------------------------------------------------------------------------------
/hex/lookout.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sevvandi/lookout/HEAD/hex/lookout.png
--------------------------------------------------------------------------------
/man/figures/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sevvandi/lookout/HEAD/man/figures/logo.png
--------------------------------------------------------------------------------
/pkgdown/favicon/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sevvandi/lookout/HEAD/pkgdown/favicon/favicon.ico
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | inst/doc
6 | .DS_Store
7 | .history
8 | docs
9 |
--------------------------------------------------------------------------------
/man/figures/README-pressure-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sevvandi/lookout/HEAD/man/figures/README-pressure-1.png
--------------------------------------------------------------------------------
/pkgdown/favicon/favicon-16x16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sevvandi/lookout/HEAD/pkgdown/favicon/favicon-16x16.png
--------------------------------------------------------------------------------
/pkgdown/favicon/favicon-32x32.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sevvandi/lookout/HEAD/pkgdown/favicon/favicon-32x32.png
--------------------------------------------------------------------------------
/pkgdown/favicon/apple-touch-icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sevvandi/lookout/HEAD/pkgdown/favicon/apple-touch-icon.png
--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sevvandi/lookout/HEAD/man/figures/README-unnamed-chunk-2-1.png
--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-2-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sevvandi/lookout/HEAD/man/figures/README-unnamed-chunk-2-2.png
--------------------------------------------------------------------------------
/man/figures/README-unnamed-chunk-3-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sevvandi/lookout/HEAD/man/figures/README-unnamed-chunk-3-1.png
--------------------------------------------------------------------------------
/pkgdown/favicon/apple-touch-icon-60x60.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sevvandi/lookout/HEAD/pkgdown/favicon/apple-touch-icon-60x60.png
--------------------------------------------------------------------------------
/pkgdown/favicon/apple-touch-icon-76x76.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sevvandi/lookout/HEAD/pkgdown/favicon/apple-touch-icon-76x76.png
--------------------------------------------------------------------------------
/pkgdown/favicon/apple-touch-icon-120x120.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sevvandi/lookout/HEAD/pkgdown/favicon/apple-touch-icon-120x120.png
--------------------------------------------------------------------------------
/pkgdown/favicon/apple-touch-icon-152x152.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sevvandi/lookout/HEAD/pkgdown/favicon/apple-touch-icon-152x152.png
--------------------------------------------------------------------------------
/pkgdown/favicon/apple-touch-icon-180x180.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sevvandi/lookout/HEAD/pkgdown/favicon/apple-touch-icon-180x180.png
--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
1 | # lookout 2.0.0
2 |
3 | * Added a `NEWS.md` file to track changes to the package.
4 | * Updated lookout algorithm as per Hyndman, Kandanaarachchi and Turner (2025).
5 |
--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | ^LICENSE\.md$
4 | ^\.travis\.yml$
5 | ^README\.Rmd$
6 | ^\.github$
7 | ^_pkgdown\.yml$
8 | ^docs$
9 | ^pkgdown$
10 | ^hex$
11 | ^\.history$
12 |
--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
# Rescale every column of X linearly onto [0, 1].
# A column whose minimum equals its maximum is returned unchanged, which
# avoids a division by zero.
unitize <- function(X) {
  for (j in seq_len(NCOL(X))) {
    column <- X[, j]
    upper <- max(column)
    lower <- min(column)
    if (upper == lower) {
      next
    }
    X[, j] <- (column - lower) / (upper - lower)
  }
  X
}
12 |
--------------------------------------------------------------------------------
/R/lookout-package.R:
--------------------------------------------------------------------------------
1 | #' @importFrom ggplot2 ggplot aes geom_raster xlab ylab geom_point
2 | #' @importFrom ggplot2 autoplot
3 | #' @export
4 | ggplot2::autoplot
5 | NULL
6 |
7 | #' @docType package
8 | #' @aliases NULL lookout-package
9 | #' @keywords internal
10 | "_PACKAGE"
11 |
12 | # The following block is used by usethis to automatically manage
13 | # roxygen namespace tags. Modify with care!
14 | ## usethis namespace: start
15 | ## usethis namespace: end
16 | NULL
17 |
--------------------------------------------------------------------------------
/.github/workflows/recheck.yml:
--------------------------------------------------------------------------------
1 | on:
2 | workflow_dispatch:
3 | inputs:
4 | which:
5 | type: choice
6 | description: Which dependents to check
7 | options:
8 | - strong
9 | - most
10 |
11 | name: Reverse dependency check
12 |
13 | jobs:
14 | revdep_check:
15 | name: Reverse check ${{ inputs.which }} dependents
16 | uses: r-devel/recheck/.github/workflows/recheck.yml@v1
17 | with:
18 | which: ${{ inputs.which }}
19 |
--------------------------------------------------------------------------------
/man/reexports.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lookout-package.R
3 | \docType{import}
4 | \name{reexports}
5 | \alias{reexports}
6 | \alias{autoplot}
7 | \title{Objects exported from other packages}
8 | \keyword{internal}
9 | \description{
10 | These objects are imported from other packages. Follow the links
11 | below to see their documentation.
12 |
13 | \describe{
14 | \item{ggplot2}{\code{\link[ggplot2]{autoplot}}}
15 | }}
16 |
17 |
--------------------------------------------------------------------------------
/lookout.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 | ProjectId: 6a5358b0-e0d8-4db8-9b9a-a0f3e182d4bf
3 |
4 | RestoreWorkspace: Default
5 | SaveWorkspace: Default
6 | AlwaysSaveHistory: Default
7 |
8 | EnableCodeIndexing: Yes
9 | UseSpacesForTab: Yes
10 | NumSpacesForTab: 2
11 | Encoding: UTF-8
12 |
13 | RnwWeave: Sweave
14 | LaTeX: pdfLaTeX
15 |
16 | AutoAppendNewline: Yes
17 | StripTrailingWhitespace: Yes
18 |
19 | BuildType: Package
20 | PackageUseDevtools: Yes
21 | PackageInstallArgs: --no-multiarch --with-keep.source
22 | PackageRoxygenize: rd,collate,namespace
23 |
--------------------------------------------------------------------------------
/R/print.R:
--------------------------------------------------------------------------------
# Print method for objects of class "persistingoutliers".
# Writes a header, the stored call (x$call) and the lookout bandwidth
# (x$lookoutbw) to the console, then returns `x` invisibly so the method
# follows the usual print() contract and can be used in pipelines.
#' @method print persistingoutliers
#' @export
print.persistingoutliers <- function(x, ...) {
  cat("Persistent outliers using lookout algorithm")
  cat("\n\nCall: ")
  print(x$call)
  cat("\nLookout bandwidth: ", x$lookoutbw, "\n")
  invisible(x)
}
10 |
# Print method for objects of class "lookoutliers".
# Writes a header, the stored call (x$call) and the table of detected
# outliers (x$outliers) to the console, then returns `x` invisibly so the
# method follows the usual print() contract.
#' @method print lookoutliers
#' @export
print.lookoutliers <- function(x, ...) {
  cat("Leave-one-out KDE outliers using lookout algorithm")
  cat("\n\nCall: ")
  print(x$call)
  cat("\n")
  print(x$outliers)
  cat("\n")
  invisible(x)
}
21 |
--------------------------------------------------------------------------------
/pkgdown/extra.css:
--------------------------------------------------------------------------------
1 | h1,
2 | .h1 {
3 | font-size: 2.5rem;
4 | font-weight: 700;
5 | }
6 |
7 | h2,
8 | .h2 {
9 | font-size: 2.0rem;
10 | font-weight: 700;
11 | }
12 |
13 | h3,
14 | .h3 {
15 | font-size: 1.5rem;
16 | font-weight: 700;
17 | }
18 |
19 | .bg-primary .navbar-nav .show>.nav-link,
20 | .bg-primary .navbar-nav .nav-link.active,
21 | .bg-primary .navbar-nav .nav-link:hover,
22 | .bg-primary .navbar-nav .nav-link:focus {
23 | color: #ffb81c !important;
24 | }
25 |
26 | .text-muted {
27 | color: #ffb81c !important;
28 | }
29 |
30 | .algolia-autocomplete .aa-dropdown-menu .aa-suggestion {
31 | color: #234460;
32 | }
33 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | S3method(autoplot,lookoutliers)
4 | S3method(autoplot,persistingoutliers)
5 | S3method(print,lookoutliers)
6 | S3method(print,persistingoutliers)
7 | export(autoplot)
8 | export(find_tda_bw)
9 | export(lookout)
10 | export(lookout_ts)
11 | export(mvscale)
12 | export(persisting_outliers)
13 | importFrom(ggplot2,aes)
14 | importFrom(ggplot2,autoplot)
15 | importFrom(ggplot2,geom_point)
16 | importFrom(ggplot2,geom_raster)
17 | importFrom(ggplot2,ggplot)
18 | importFrom(ggplot2,xlab)
19 | importFrom(ggplot2,ylab)
20 | importFrom(stats,dist)
21 | importFrom(stats,median)
22 | importFrom(stats,quantile)
23 | importFrom(stats,sd)
24 |
--------------------------------------------------------------------------------
/_pkgdown.yml:
--------------------------------------------------------------------------------
1 | url: https://sevvandi.github.io/lookout/
2 | template:
3 | bootstrap: 5
4 | theme: tango
5 | bootswatch: flatly
6 | bslib:
7 | base_font: { google: "Fira Sans" }
8 | heading_font: { google: "Fira Sans" }
9 | code_font: "Hack, mono"
10 | primary: "#234460"
11 | link-color: "#234460"
12 | includes:
13 | in_header:
14 |
15 | authors:
16 | Sevvandi Kandanaarachchi:
17 | href: https://sevvandi.github.io
18 | Rob Hyndman:
19 | href: https://robjhyndman.com
20 |
21 | navbar:
22 | type: light
23 |
24 | figures:
25 | dev: ragg::agg_png
26 | dpi: 300
27 | dev.args: []
28 | fig.ext: png
29 | fig.width: 8
30 | fig.height: 5
31 |
--------------------------------------------------------------------------------
/man/autoplot.lookoutliers.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/autoplot_lookout.R
3 | \name{autoplot.lookoutliers}
4 | \alias{autoplot.lookoutliers}
5 | \title{Plots outliers identified by lookout algorithm.}
6 | \usage{
7 | \method{autoplot}{lookoutliers}(object, columns = 1:2, ...)
8 | }
9 | \arguments{
10 | \item{object}{The output of the function \code{lookout}.}
11 |
12 | \item{columns}{Which columns of the original data to plot
13 | (specified as either numbers or strings)}
14 |
15 | \item{...}{Other arguments currently ignored.}
16 | }
17 | \value{
18 | A ggplot object.
19 | }
20 | \description{
21 | Scatterplot of two columns from the data set with outliers highlighted.
22 | }
23 | \examples{
24 | X <- rbind(
25 | data.frame(
26 | x = rnorm(500),
27 | y = rnorm(500)
28 | ),
29 | data.frame(
30 | x = rnorm(5, mean = 10, sd = 0.2),
31 | y = rnorm(5, mean = 10, sd = 0.2)
32 | )
33 | )
34 | lo <- lookout(X)
35 | autoplot(lo)
36 | }
37 |
--------------------------------------------------------------------------------
/man/lookout_ts.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lookout_ts.R
3 | \name{lookout_ts}
4 | \alias{lookout_ts}
5 | \title{Identifies outliers in univariate time series using the algorithm lookout.}
6 | \usage{
7 | lookout_ts(x, scale = FALSE, ...)
8 | }
9 | \arguments{
10 | \item{x}{The input univariate time series.}
11 |
12 | \item{scale}{If \code{TRUE}, the data is standardized. Using the old version,
13 | unit scaling is applied so that each column is in the range \code{[0,1]}.
14 | Under the new version, robust rotation and scaling is used so that the columns
15 | are approximately uncorrelated with unit variance. Default is \code{FALSE}.}
16 |
17 | \item{...}{Other arguments are passed to \code{\link{lookout}}.}
18 | }
19 | \value{
20 | A lookout object.
21 | }
22 | \description{
23 | This is the time series implementation of lookout which identifies outliers
24 | in the double differenced time series.
25 | }
26 | \examples{
27 | set.seed(1)
28 | x <- arima.sim(list(order = c(1, 1, 0), ar = 0.8), n = 200)
29 | x[50] <- x[50] + 10
30 | plot(x)
31 | lo <- lookout_ts(x)
32 | lo
33 | }
34 | \seealso{
35 | \code{\link{lookout}}
36 | }
37 |
--------------------------------------------------------------------------------
/man/autoplot.persistingoutliers.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/autoplot_persistence.R
3 | \name{autoplot.persistingoutliers}
4 | \alias{autoplot.persistingoutliers}
5 | \title{Plots outlier persistence for a range of significance levels.}
6 | \usage{
7 | \method{autoplot}{persistingoutliers}(object, alpha = object$alpha, ...)
8 | }
9 | \arguments{
10 | \item{object}{The output of the function \code{persisting_outliers}.}
11 |
12 | \item{alpha}{The significance levels to plot.}
13 |
14 | \item{...}{Other arguments currently ignored.}
15 | }
16 | \value{
17 | A ggplot object.
18 | }
19 | \description{
20 | This function plots outlier persistence for a range of significance levels
21 | using the algorithm lookout, an outlier detection method that uses
22 | leave-one-out kernel density estimates and generalized Pareto distributions
23 | to find outliers.
24 | }
25 | \examples{
26 | X <- rbind(
27 | data.frame(
28 | x = rnorm(500),
29 | y = rnorm(500)
30 | ),
31 | data.frame(
32 | x = rnorm(5, mean = 10, sd = 0.2),
33 | y = rnorm(5, mean = 10, sd = 0.2)
34 | )
35 | )
36 | plot(X, pch = 19)
37 | outliers <- persisting_outliers(X, scale = FALSE)
38 | autoplot(outliers)
39 | }
40 |
--------------------------------------------------------------------------------
/R/lookout_ts.R:
--------------------------------------------------------------------------------
#' Identifies outliers in univariate time series using the algorithm lookout.
#'
#' This is the time series implementation of lookout which identifies outliers
#' in the double differenced time series. When several consecutive time points
#' are flagged, only the one with the smallest outlier probability in each run
#' is kept.
#' @param x The input univariate time series.
#' @param scale Passed to \code{\link{lookout}}: if \code{TRUE}, the double
#' differenced series is scaled before outliers are identified. Default is
#' \code{FALSE}.
#' @inheritParams lookout
#' @param ... Other arguments are passed to \code{\link{lookout}}.
#' @return A lookout object.
#' @seealso \code{\link{lookout}}
#'
#' @examples
#' set.seed(1)
#' x <- arima.sim(list(order = c(1, 1, 0), ar = 0.8), n = 200)
#' x[50] <- x[50] + 10
#' plot(x)
#' lo <- lookout_ts(x)
#' lo
#' @export lookout_ts
lookout_ts <- function(x, scale = FALSE, ...) {
  # Second differences, padded with a leading zero so that u[i] is the second
  # difference centred on x[i]; u therefore has one element fewer than x.
  u <- c(0, diff(diff(x)))
  out <- lookout(u, scale = scale, ...)
  outliers <- out$outliers[, 1]
  # Keep only the most extreme outlier in each consecutive sequence of outliers
  if (length(outliers) > 1) {
    # A gap larger than 1 between successive indices starts a new run
    run_id <- cumsum(c(1, diff(outliers) > 1))
    # Within each run, keep the index with the smallest outlier probability
    keep <- unlist(
      lapply(
        split(outliers, run_id),
        function(idx) idx[which.min(out$outlier_probability[idx])]
      ),
      use.names = FALSE
    )
    # drop = FALSE preserves the tabular structure when a single row remains
    out$outliers <- out$outliers[out$outliers[, 1] %in% keep, , drop = FALSE]
  }
  out
}
37 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: lookout
2 | Type: Package
3 | Title: Leave One Out Kernel Density Estimates for Outlier Detection
4 | Version: 2.0.0
5 | Authors@R: c(
6 | person("Sevvandi", "Kandanaarachchi", email = "sevvandik@gmail.com",
7 | role = c("aut", "cre"), comment = c(ORCID = "0000-0002-0337-0395")),
8 | person("Rob", "Hyndman", email = "rob.hyndman@monash.edu",
9 | role = c("aut"), comment = c(ORCID = "0000-0002-2140-5352")),
10 | person("Chris", "Fraley", role = "ctb", email = "fraley@u.washington.edu")
11 | )
12 | Maintainer: Sevvandi Kandanaarachchi <sevvandik@gmail.com>
13 | Description: Outlier detection using leave-one-out kernel density estimates and
14 | extreme value theory. The bandwidth for kernel density estimates is computed
15 | using persistent homology, a technique in topological data analysis. Using
16 | peak-over-threshold method, a generalized Pareto distribution is fitted to
17 | the log of leave-one-out kde values to identify outliers.
18 | License: GPL-3
19 | Encoding: UTF-8
20 | LazyData: true
21 | Roxygen: list(markdown = TRUE)
22 | RoxygenNote: 7.3.3
23 | BugReports: https://github.com/sevvandi/lookout/issues
24 | Imports:
25 | evd,
26 | ggplot2,
27 | RANN,
28 | robustbase,
29 | stats,
30 | TDAstats,
31 | tidyr
32 | Suggests:
33 | knitr,
34 | rmarkdown
35 | URL: https://sevvandi.github.io/lookout/, https://github.com/sevvandi/lookout
36 |
--------------------------------------------------------------------------------
/man/lookout-package.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lookout-package.R
3 | \docType{package}
4 | \name{lookout-package}
5 | \alias{lookout-package}
6 | \title{lookout: Leave One Out Kernel Density Estimates for Outlier Detection}
7 | \description{
8 | \if{html}{\figure{logo.png}{options: style='float: right' alt='logo' width='120'}}
9 |
10 | Outlier detection using leave-one-out kernel density estimates and extreme value theory. The bandwidth for kernel density estimates is computed using persistent homology, a technique in topological data analysis. Using peak-over-threshold method, a generalized Pareto distribution is fitted to the log of leave-one-out kde values to identify outliers.
11 | }
12 | \seealso{
13 | Useful links:
14 | \itemize{
15 | \item \url{https://sevvandi.github.io/lookout/}
16 | \item \url{https://github.com/sevvandi/lookout}
17 | \item Report bugs at \url{https://github.com/sevvandi/lookout/issues}
18 | }
19 |
20 | }
21 | \author{
22 | \strong{Maintainer}: Sevvandi Kandanaarachchi \email{sevvandik@gmail.com} (\href{https://orcid.org/0000-0002-0337-0395}{ORCID})
23 |
24 | Authors:
25 | \itemize{
26 | \item Rob Hyndman \email{rob.hyndman@monash.edu} (\href{https://orcid.org/0000-0002-2140-5352}{ORCID})
27 | }
28 |
29 | Other contributors:
30 | \itemize{
31 | \item Chris Fraley \email{fraley@u.washington.edu} [contributor]
32 | }
33 |
34 | }
35 | \keyword{internal}
36 |
--------------------------------------------------------------------------------
/.github/workflows/pkgdown.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
3 | on:
4 | push:
5 | branches: [main, master]
6 | pull_request:
7 | release:
8 | types: [published]
9 | workflow_dispatch:
10 |
11 | name: pkgdown.yaml
12 |
13 | permissions: read-all
14 |
15 | jobs:
16 | pkgdown:
17 | runs-on: ubuntu-latest
18 | # Only restrict concurrency for non-PR jobs
19 | concurrency:
20 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }}
21 | env:
22 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
23 | permissions:
24 | contents: write
25 | steps:
26 | - uses: actions/checkout@v4
27 |
28 | - uses: r-lib/actions/setup-pandoc@v2
29 |
30 | - uses: r-lib/actions/setup-r@v2
31 | with:
32 | use-public-rspm: true
33 |
34 | - uses: r-lib/actions/setup-r-dependencies@v2
35 | with:
36 | extra-packages: any::pkgdown, local::.
37 | needs: website
38 |
39 | - name: Build site
40 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE)
41 | shell: Rscript {0}
42 |
43 | - name: Deploy to GitHub pages 🚀
44 | if: github.event_name != 'pull_request'
45 | uses: JamesIves/github-pages-deploy-action@v4.5.0
46 | with:
47 | clean: false
48 | branch: gh-pages
49 | folder: docs
50 |
--------------------------------------------------------------------------------
/man/find_tda_bw.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/bandwidth.R
3 | \name{find_tda_bw}
4 | \alias{find_tda_bw}
5 | \title{Identifies bandwidth for outlier detection.}
6 | \usage{
7 | find_tda_bw(X, fast = TRUE, gamma = 0.97, use_differences = FALSE)
8 | }
9 | \arguments{
10 | \item{X}{The numerical input data in a data.frame, matrix or tibble format.}
11 |
12 | \item{fast}{If \code{TRUE} (default), makes the computation faster by
13 | sub-setting the data for the bandwidth calculation.}
14 |
15 | \item{gamma}{Parameter for bandwidth calculation giving the quantile of the
16 | Rips death radii to use for the bandwidth. Default is \code{0.97}. Ignored
17 | under the old version; where the lower limit of the maximum Rips death radii
18 | difference is used. Also ignored if \code{bw} is provided.}
19 |
20 | \item{use_differences}{If TRUE, the bandwidth is set to the lower point
21 | of the maximum Rips death radii differences. If FALSE,
22 | the gamma quantile of the Rips death radii is used. Default is FALSE.}
23 | }
24 | \value{
25 | The bandwidth
26 | }
27 | \description{
28 | This function identifies the bandwidth that is used in the kernel density
29 | estimate computation. The function uses topological data analysis (TDA)
30 | to find the bandwidth.
31 | }
32 | \examples{
33 | X <- rbind(
34 | data.frame(
35 | x = rnorm(500),
36 | y = rnorm(500)
37 | ),
38 | data.frame(
39 | x = rnorm(5, mean = 10, sd = 0.2),
40 | y = rnorm(5, mean = 10, sd = 0.2)
41 | )
42 | )
43 | find_tda_bw(X, fast = TRUE)
44 |
45 | }
46 |
--------------------------------------------------------------------------------
/.github/workflows/R-CMD-check.yaml:
--------------------------------------------------------------------------------
1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
3 | on:
4 | push:
5 | branches: [main, master]
6 | pull_request:
7 |
8 | name: R-CMD-check.yaml
9 |
10 | permissions: read-all
11 |
12 | jobs:
13 | R-CMD-check:
14 | runs-on: ${{ matrix.config.os }}
15 |
16 | name: ${{ matrix.config.os }} (${{ matrix.config.r }})
17 |
18 | strategy:
19 | fail-fast: false
20 | matrix:
21 | config:
22 | - {os: macOS-latest, r: 'release'}
23 | - {os: windows-latest, r: 'release'}
24 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'}
25 | - {os: ubuntu-latest, r: 'release'}
26 | - {os: ubuntu-latest, r: 'oldrel-1'}
27 |
28 | env:
29 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
30 | R_KEEP_PKG_SOURCE: yes
31 |
32 | steps:
33 | - uses: actions/checkout@v4
34 |
35 | - uses: r-lib/actions/setup-pandoc@v2
36 |
37 | - uses: r-lib/actions/setup-r@v2
38 | with:
39 | r-version: ${{ matrix.config.r }}
40 | http-user-agent: ${{ matrix.config.http-user-agent }}
41 | use-public-rspm: true
42 |
43 | - name: Install XQuartz on macOS
44 | if: runner.os == 'macOS'
45 | run: |
46 | brew install --cask xquartz
47 |
48 | - uses: r-lib/actions/setup-r-dependencies@v2
49 | with:
50 | extra-packages: any::rcmdcheck
51 | needs: check
52 |
53 | - uses: r-lib/actions/check-r-package@v2
54 | with:
55 | upload-snapshots: true
56 | build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")'
57 |
--------------------------------------------------------------------------------
/R/bandwidth.R:
--------------------------------------------------------------------------------
#' Identifies bandwidth for outlier detection.
#'
#' This function identifies the bandwidth that is used in the kernel density
#' estimate computation. The function uses topological data analysis (TDA)
#' to find the bandwidth.
#'
#' @inheritParams lookout
#' @param use_differences If TRUE, the bandwidth is set to the lower point
#' of the maximum Rips death radii differences. If FALSE,
#' the gamma quantile of the Rips death radii is used. Default is FALSE.
#'
#' @return The bandwidth
#'
#' @examples
#' X <- rbind(
#'   data.frame(
#'     x = rnorm(500),
#'     y = rnorm(500)
#'   ),
#'   data.frame(
#'     x = rnorm(5, mean = 10, sd = 0.2),
#'     y = rnorm(5, mean = 10, sd = 0.2)
#'   )
#' )
#' find_tda_bw(X, fast = TRUE)
#'
#' @export
find_tda_bw <- function(X, fast = TRUE, gamma = 0.97, use_differences = FALSE) {
  stopifnot(gamma > 0 && gamma <= 1)
  X <- as.matrix(X)

  # select a subset of X for tda computation
  if (fast) {
    inds <- subset_for_tda(X)
    # drop = FALSE keeps Xsub a matrix even when X has a single column or
    # only one row is selected
    Xsub <- X[inds, , drop = FALSE]
  } else {
    Xsub <- X
  }

  # Univariate data is passed as a distance matrix; multivariate data is
  # passed as a point cloud restricted to dimension-0 features
  if (NCOL(X) == 1L) {
    phom <- TDAstats::calculate_homology(dist(Xsub), format = "distmat")
  } else {
    phom <- TDAstats::calculate_homology(Xsub, dim = 0)
  }

  # The third column of the homology output is used as the death radii
  death_radi <- phom[, 3L]

  if (use_differences) {
    # Only radii at or above the median are considered, so that very small
    # death radii are not chosen
    med_radi <- median(death_radi)
    death_radi_upper <- death_radi[death_radi >= med_radi]
    dr_thres_diff <- diff(death_radi_upper)
    # Lower end point of the largest gap between successive radii
    death_radi_upper[which.max(dr_thres_diff)]
  } else {
    # gamma-quantile of the death radii, raised to the power 2 / (number of
    # columns of X)
    m <- NCOL(X)
    unname(quantile(death_radi, probs = gamma, type = 8L)^(2 / m))
  }
}
59 |
--------------------------------------------------------------------------------
/R/autoplot_lookout.R:
--------------------------------------------------------------------------------
#' Plots outliers identified by lookout algorithm.
#'
#' Scatterplot of two columns from the data set with outliers highlighted.
#' If only one column is selected (or the data has a single column), the
#' points are drawn along a horizontal line at zero.
#'
#' @param object The output of the function `lookout`.
#' @param columns Which columns of the original data to plot
#' (specified as either numbers or strings). Only the first two selected
#' columns are used. Numeric values larger than the number of columns are
#' ignored; an error is raised if no existing column is selected.
#' @param ... Other arguments currently ignored.
#'
#' @return A ggplot object.
#'
#' @examples
#' X <- rbind(
#'   data.frame(
#'     x = rnorm(500),
#'     y = rnorm(500)
#'   ),
#'   data.frame(
#'     x = rnorm(5, mean = 10, sd = 0.2),
#'     y = rnorm(5, mean = 10, sd = 0.2)
#'   )
#' )
#' lo <- lookout(X)
#' autoplot(lo)
#' @export
autoplot.lookoutliers <- function(object, columns = 1:2, ...) {
  # Column names (V1, V2, ... when the data has none)
  varnames <- colnames(object$data)
  if (is.null(varnames)) {
    varnames <- paste0("V", seq_len(NCOL(object$data)))
  }
  X <- as.data.frame(object$data)
  colnames(X) <- varnames
  if (is.character(columns)) {
    columns <- match(columns, varnames)
  } else {
    columns <- columns[columns <= NCOL(X)]
  }

  # Only the first two selected columns are plotted; fail early with a clear
  # message instead of an "undefined columns" error later on
  used <- columns[seq_len(min(length(columns), 2L))]
  if (length(used) == 0L || anyNA(used)) {
    stop(
      "`columns` must select at least one existing column of the data.",
      call. = FALSE
    )
  }

  # Outliers
  # Local binding for the name used inside aes() below; presumably present to
  # avoid an R CMD check "no visible binding" note. The plotted values come
  # from the X$outliers column.
  outliers <- NULL
  X$outliers <- rep(FALSE, NROW(X))
  X$outliers[object$outliers[, "Outliers"]] <- TRUE

  # y axis: second selected column, or a constant zero if there is only one
  if (length(columns) > 1) {
    ..y <- X[, columns[2L]]
    ..yvar <- varnames[columns[2L]]
  } else {
    ..y <- 0
    ..yvar <- ""
  }

  # Produce plot
  p <- ggplot2::ggplot(X, ggplot2::aes(x = X[, columns[1L]], y = ..y)) +
    ggplot2::geom_point(ggplot2::aes(col = outliers)) +
    ggplot2::labs(x = varnames[columns[1L]], y = ..yvar) +
    ggplot2::scale_color_manual(values = c(`TRUE` = "red", `FALSE` = "black")) +
    ggplot2::guides(color = "none")
  # Univariate data: the y axis carries no information, so hide its breaks
  if (NCOL(object$data) == 1L) {
    p <- p + ggplot2::scale_y_continuous(breaks = NULL)
  }
  p
}
65 |
--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | output: github_document
3 | ---
4 |
5 |
6 |
7 | ```{r, include = FALSE}
8 | knitr::opts_chunk$set(
9 | collapse = TRUE,
10 | comment = "#>",
11 | fig.path = "man/figures/README-",
12 | out.width = "100%"
13 | )
14 | ```
15 |
16 | # lookout
17 |
18 |
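19 | [![R-CMD-check](https://github.com/sevvandi/lookout/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/sevvandi/lookout/actions/workflows/R-CMD-check.yaml)
20 | [![CRAN status](https://www.r-pkg.org/badges/version/lookout)](https://CRAN.R-project.org/package=lookout)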
21 |
22 |
23 | **lookout** identifies outliers in data using leave-one-out kernel density estimates and extreme value theory. The bandwidth for kernel density estimates is computed using persistent homology, a technique in topological data analysis. Using the peak-over-threshold method, a Generalized Pareto Distribution is fitted to the log of leave-one-out kde values to identify outliers.
24 |
25 | See [Kandanaarachchi and Hyndman (2021)](https://robjhyndman.com/publications/lookout/) for the underlying methodology.
26 |
27 | ## Installation
28 |
29 | You can install the released version of lookout from [CRAN](https://CRAN.R-project.org) with:
30 |
31 | ``` r
32 | #install.packages("lookout")
33 | ```
34 |
35 | And the development version from [GitHub](https://github.com/) with:
36 |
37 | ``` r
38 | # install.packages("devtools")
39 | devtools::install_github("sevvandi/lookout")
40 | ```
41 |
42 | ## Example
43 |
44 | ```{r}
45 | library(lookout)
46 | lo <- lookout(faithful)
47 | lo
48 | autoplot(lo)
49 | ```
50 |
51 | Next we look at outlier persistence. The outlier persistence plot shows the outliers that persist over a range of bandwidth values for different levels of significance. The strength is inversely proportional to the level of significance. If the level of significance is 0.01, then the strength is 10 and if it is 0.1, then the strength is 1.
52 |
53 | ```{r}
54 | persistence <- persisting_outliers(faithful)
55 | autoplot(persistence)
56 | ```
57 |
--------------------------------------------------------------------------------
/man/persisting_outliers.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/outlier_persistence.R
3 | \name{persisting_outliers}
4 | \alias{persisting_outliers}
5 | \title{Computes outlier persistence for a range of significance values.}
6 | \usage{
7 | persisting_outliers(
8 | X,
9 | alpha = seq(0.01, 0.1, by = 0.01),
10 | st_qq = 0.9,
11 | scale = TRUE,
12 | num_steps = 20,
13 | old_version = FALSE
14 | )
15 | }
16 | \arguments{
17 | \item{X}{The input data in a matrix, data.frame, or tibble format. All
18 | columns should be numeric.}
19 |
20 | \item{alpha}{Grid of significance levels.}
21 |
22 | \item{st_qq}{The starting quantile for death radii sequence. This will be
23 | used to compute the starting bandwidth value.}
24 |
25 | \item{scale}{If \code{TRUE}, the data is scaled. Default is \code{TRUE}. Which
26 | scaling method is used depends on the \code{old_version} parameter.
27 | See \code{\link{lookout}} for details.}
28 |
29 | \item{num_steps}{The length of the bandwidth sequence.}
30 |
31 | \item{old_version}{Logical indicator of which version of the algorithm to use.}
32 | }
33 | \value{
34 | A list with the following components:
35 | \item{\code{out}}{A 3D array of \code{N x num_steps x num_alpha} where
36 | \code{N} denotes the number of observations, \code{num_steps} denotes the
37 | length of the bandwidth sequence, and \code{num_alpha} denotes the number of
38 | significance levels. This is a binary array and the entries are set to 1 if
39 | that observation is an outlier for that particular bandwidth and significance
40 | level.}
41 | \item{\code{bw}}{The set of bandwidth values.}
42 | \item{\code{gpdparas}}{The GPD parameters used. }
43 | \item{\code{lookoutbw}}{The bandwidth chosen by the algorithm \code{lookout}
44 | using persistent homology.}
45 | }
46 | \description{
47 | This function computes outlier persistence for a range of significance
48 | values, using the algorithm lookout, an outlier detection method that uses
49 | leave-one-out kernel density estimates and generalized Pareto distributions
50 | to find outliers.
51 | }
52 | \examples{
53 | X <- rbind(
54 | data.frame(
55 | x = rnorm(500),
56 | y = rnorm(500)
57 | ),
58 | data.frame(
59 | x = rnorm(5, mean = 10, sd = 0.2),
60 | y = rnorm(5, mean = 10, sd = 0.2)
61 | )
62 | )
63 | plot(X, pch = 19)
64 | outliers <- persisting_outliers(X, scale = FALSE)
65 | outliers
66 | autoplot(outliers)
67 | }
68 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | # lookout
5 |
6 |
7 |
8 | [R-CMD-check](https://github.com/sevvandi/lookout/actions/workflows/R-CMD-check.yaml)
9 | [CRAN status](https://CRAN.R-project.org/package=lookout)
11 |
12 |
13 | **lookout** identifies outliers in data using leave-one-out kernel
14 | density estimates and extreme value theory. The bandwidth for kernel
15 | density estimates is computed using persistent homology, a technique in
16 | topological data analysis. Using the peak-over-threshold method, a
17 | Generalized Pareto Distribution is fitted to the log of leave-one-out
18 | kde values to identify outliers.
19 |
20 | See [Kandanaarachchi and Hyndman
21 | (2021)](https://robjhyndman.com/publications/lookout/) for the
22 | underlying methodology.
23 |
24 | ## Installation
25 |
26 | You can install the released version of lookout from
27 | [CRAN](https://CRAN.R-project.org) with:
28 |
29 | ``` r
30 | install.packages("lookout")
31 | ```
32 |
33 | And the development version from [GitHub](https://github.com/) with:
34 |
35 | ``` r
36 | # install.packages("devtools")
37 | devtools::install_github("sevvandi/lookout")
38 | ```
39 |
40 | ## Example
41 |
42 | ``` r
43 | library(lookout)
44 | lo <- lookout(faithful)
45 | lo
46 | #> Leave-out-out KDE outliers using lookout algorithm
47 | #>
48 | #> Call: lookout(X = faithful)
49 | #>
50 | #> Outliers Probability
51 | #> 1 6 0.005553188
52 | #> 2 24 0.006423949
53 | #> 3 46 0.007934127
54 | #> 4 149 0.008300670
55 | #> 5 158 0.007242257
56 | #> 6 197 0.004333429
57 | #> 7 211 0.000000000
58 | #> 8 244 0.004956339
59 | autoplot(lo)
60 | ```
61 |
62 |
63 |
64 | Next we look at outlier persistence. The outlier persistence plot shows
65 | the outliers that persist over a range of bandwidth values for different
66 | levels of significance. The strength is inversely proportional to the
67 | level of significance. If the level of significance is 0.01, then the
68 | strength is 10 and if it is 0.1, then the strength is 1.
69 |
70 | ``` r
71 | persistence <- persisting_outliers(faithful)
72 | autoplot(persistence)
73 | ```
74 |
75 |
76 |
--------------------------------------------------------------------------------
/man/mvscale.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/mvscale.R
3 | \name{mvscale}
4 | \alias{mvscale}
5 | \title{Compute robust multivariate scaled data}
6 | \usage{
7 | mvscale(
8 | object,
9 | center = stats::median,
10 | scale = robustbase::s_Qn,
11 | cov = robustbase::covOGK,
12 | warning = TRUE
13 | )
14 | }
15 | \arguments{
16 | \item{object}{A vector, matrix, or data frame containing some numerical data.}
17 |
18 | \item{center}{A function to compute the center of each numerical variable. Set
19 | to NULL if no centering is required.}
20 |
21 | \item{scale}{A function to scale each numerical variable. When
22 | \code{cov = robustbase::covOGK()}, it is passed as the \code{sigmamu} argument.}
23 |
24 | \item{cov}{A function to compute the covariance matrix. Set to NULL if no rotation required.}
25 |
26 | \item{warning}{Should a warning be issued if non-numeric columns are ignored?}
27 | }
28 | \value{
29 | A vector, matrix or data frame of the same size and class as \code{object},
30 | but with numerical variables replaced by scaled versions.
31 | }
32 | \description{
33 | A multivariate version of \code{\link[base:scale]{base::scale()}}, that takes account
34 | of the covariance matrix of the data, and uses robust estimates
35 | of center, scale and covariance by default. The centers are removed using medians, the
36 | scale function is the robust Qn estimator, and the covariance matrix is estimated using a
37 | robust OGK estimate. The data are scaled using the Cholesky decomposition of
38 | the inverse covariance. Then the scaled data are returned.
39 | }
40 | \details{
41 | Optionally, the centering and scaling can be done for each variable
42 | separately, so there is no rotation of the data, by setting \code{cov = NULL}.
43 | Also optionally, non-robust methods can be used by specifying \code{center = mean},
44 | \code{scale = stats::sd()}, and \code{cov = stats::cov()}. Any non-numeric columns are retained
45 | with a warning.
46 | }
47 | \examples{
48 | # Univariate z-scores (no rotation)
49 | z <- mvscale(faithful, center = mean, scale = sd, cov = NULL, warning = FALSE)
50 | # Non-robust scaling with rotation
51 | z <- mvscale(faithful, center = mean, cov = stats::cov, warning = FALSE)
52 | # Robust scaling and rotation
53 | z <- mvscale(faithful, warning = FALSE)
54 | }
55 | \seealso{
56 | \code{\link[base:scale]{base::scale()}}, \code{\link[stats:sd]{stats::sd()}}, \code{\link[stats:cor]{stats::cov()}}, \code{\link[robustbase:covOGK]{robustbase::covOGK()}}, \code{\link[robustbase:Qn]{robustbase::s_Qn()}}
57 | }
58 | \author{
59 | Rob J Hyndman
60 | }
61 |
--------------------------------------------------------------------------------
/R/autoplot_persistence.R:
--------------------------------------------------------------------------------
#' Plots outlier persistence for a range of significance levels.
#'
#' This function plots outlier persistence for a range of significance levels
#' using the algorithm lookout, an outlier detection method that uses
#' leave-one-out kernel density estimates and generalized Pareto distributions
#' to find outliers.
#'
#' The plot is a raster with bandwidth on the horizontal axis and observation
#' index on the vertical axis. The fill ("Strength") of a cell is the number of
#' selected significance levels at which that observation is flagged as an
#' outlier for that bandwidth. Trailing bandwidths at which no observation is
#' flagged are dropped, but at least one bandwidth is always shown.
#'
#' @param object The output of the function `persisting_outliers`.
#' @param alpha The significance levels to plot. At least one value must match
#' (to 4 decimal places) a level stored in `object$alpha`, otherwise an error
#' is raised.
#' @param ... Other arguments currently ignored.
#'
#' @return A ggplot object.
#'
#' @examples
#' X <- rbind(
#'   data.frame(
#'     x = rnorm(500),
#'     y = rnorm(500)
#'   ),
#'   data.frame(
#'     x = rnorm(5, mean = 10, sd = 0.2),
#'     y = rnorm(5, mean = 10, sd = 0.2)
#'   )
#' )
#' plot(X, pch = 19)
#' outliers <- persisting_outliers(X, scale = FALSE)
#' autoplot(outliers)
#' @export
autoplot.persistingoutliers <- function(object, alpha = object$alpha, ...) {
  # Rounding guards against floating-point mismatches between the requested
  # levels and the stored grid
  which_alpha <- round(object$alpha, 4) %in% round(alpha, 4)
  if (!any(which_alpha)) {
    stop("No specified alpha values available.")
  }

  # Strength: number of selected levels at which each observation is flagged
  outwts <- apply(object$out[, , which_alpha, drop = FALSE], c(1, 2), sum)
  outwtsg <- cbind.data.frame(seq_len(NROW(outwts)), outwts)
  colnames(outwtsg)[1] <- "Observation"

  # Drop trailing bandwidths with no outliers, but always keep the first
  # bandwidth column; drop = FALSE keeps a data frame even when nothing is
  # flagged at any bandwidth
  last_col <- max(2L, which(colSums(outwtsg) != 0))
  outwtsg <- outwtsg[, seq_len(last_col), drop = FALSE]

  # Long form
  dfl <- tidyr::pivot_longer(
    outwtsg,
    -Observation,
    names_to = "bw",
    values_to = "Strength"
  )
  # Add bandwidths (the pivoted column names are the bandwidth indices)
  dfl$Bandwidth <- object$bw[as.integer(dfl$bw)]

  # Colours
  if (length(alpha) > 1) {
    col_pal1 <- c(
      "white",
      "#ffffcc",
      "#ffeda0",
      "#fed976",
      "#feb24c",
      "#fd8d3c",
      "#fc4e2a",
      "#e31a1c",
      "#bd0026",
      "#800026"
    )
  } else {
    col_pal1 <- c("white", "black")
  }

  # Dummy bindings so the column names used inside aes() are not reported as
  # undefined globals
  Observation <- Bandwidth <- Strength <- NULL
  p <- ggplot2::ggplot(
    dfl,
    ggplot2::aes(x = Bandwidth, y = Observation, fill = Strength)
  ) +
    ggplot2::geom_raster() +
    # Anchor the scale at zero so the first palette colour always means
    # "not an outlier", even when every plotted strength is zero
    ggplot2::scale_fill_gradientn(
      colours = col_pal1,
      limits = c(0, max(1, dfl$Strength))
    ) +
    ggplot2::theme(
      panel.grid.major = ggplot2::element_blank(),
      panel.grid.minor = ggplot2::element_blank(),
      panel.background = ggplot2::element_blank(),
      axis.line = ggplot2::element_line(colour = "black")
    )
  if (length(alpha) == 1L) {
    p <- p + ggplot2::theme(legend.position = "none")
  }

  p
}
87 |
--------------------------------------------------------------------------------
/man/lookout.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/lookoutliers.R
3 | \name{lookout}
4 | \alias{lookout}
5 | \title{Identifies outliers using the algorithm lookout.}
6 | \usage{
7 | lookout(
8 | X,
9 | alpha = 0.01,
10 | beta = 0.9,
11 | gamma = 0.97,
12 | bw = NULL,
13 | gpd = NULL,
14 | scale = TRUE,
15 | fast = NROW(X) > 1000,
16 | old_version = FALSE
17 | )
18 | }
19 | \arguments{
20 | \item{X}{The numerical input data in a data.frame, matrix or tibble format.}
21 |
22 | \item{alpha}{The level of significance. Default is \code{0.01}. So there is
23 | a 1/100 chance of any point being falsely classified as an outlier.}
24 |
25 | \item{beta}{The quantile threshold used in the GPD estimation. Default is \code{0.90}.
26 | To ensure there is enough data available, values greater than 0.90 are set to 0.90.}
27 |
28 | \item{gamma}{Parameter for bandwidth calculation giving the quantile of the
29 | Rips death radii to use for the bandwidth. Default is \code{0.97}. Ignored
30 | under the old version; where the lower limit of the maximum Rips death radii
31 | difference is used. Also ignored if \code{bw} is provided.}
32 |
33 | \item{bw}{Bandwidth parameter. If \code{NULL} (default), the bandwidth is
34 | found using Persistent Homology.}
35 |
36 | \item{gpd}{Generalized Pareto distribution parameters. If \code{NULL} (the
37 | default), these are estimated from the data.}
38 |
39 | \item{scale}{If \code{TRUE}, the data is standardized. Using the old version,
40 | unit scaling is applied so that each column is in the range \code{[0,1]}.
41 | Under the new version, robust rotation and scaling is used so that the columns
42 | are approximately uncorrelated with unit variance. Default is \code{TRUE}.}
43 |
44 | \item{fast}{If \code{TRUE} (default), makes the computation faster by
45 | sub-setting the data for the bandwidth calculation.}
46 |
47 | \item{old_version}{Logical indicator of which version of the algorithm to use.
48 | Default is FALSE, meaning the newer version is used.}
49 | }
50 | \value{
51 | A list with the following components:
52 | \item{\code{outliers}}{The set of outliers.}
53 | \item{\code{outlier_probability}}{The GPD probability of the data.}
54 | \item{\code{outlier_scores}}{The outlier scores of the data.}
55 | \item{\code{bandwidth}}{The bandwidth selected using persistent homology. }
56 | \item{\code{kde}}{The kernel density estimate values.}
57 | \item{\code{lookde}}{The leave-one-out kde values.}
58 | \item{\code{gpd}}{The fitted GPD parameters.}
59 | }
60 | \description{
61 | This function identifies outliers using the algorithm lookout, an outlier
62 | detection method that uses leave-one-out kernel density estimates and
63 | generalized Pareto distributions to find outliers.
64 | }
65 | \examples{
66 | X <- rbind(
67 | data.frame(
68 | x = rnorm(500),
69 | y = rnorm(500)
70 | ),
71 | data.frame(
72 | x = rnorm(5, mean = 10, sd = 0.2),
73 | y = rnorm(5, mean = 10, sd = 0.2)
74 | )
75 | )
76 | lo <- lookout(X)
77 | lo
78 | autoplot(lo)
79 | }
80 |
--------------------------------------------------------------------------------
/R/outlier_persistence.R:
--------------------------------------------------------------------------------
#' Computes outlier persistence for a range of significance values.
#'
#' This function computes outlier persistence for a range of significance
#' values, using the algorithm lookout, an outlier detection method that uses
#' leave-one-out kernel density estimates and generalized Pareto distributions
#' to find outliers.
#'
#' The generalized Pareto parameters are estimated once, at the bandwidth
#' reported as \code{lookoutbw}, and then reused for every bandwidth in the
#' sequence.
#'
#' @param X The input data in a matrix, data.frame, or tibble format. All
#' columns should be numeric.
#' @param alpha Grid of significance levels.
#' @param st_qq The starting quantile for death radii sequence. This will be
#' used to compute the starting bandwidth value.
#' @param scale If \code{TRUE}, the data is scaled. Default is \code{TRUE}. Which
#' scaling method is used depends on the \code{old_version} parameter.
#' See \code{\link{lookout}} for details.
#' @param old_version Logical indicator of which version of the algorithm to use.
#' @param num_steps The length of the bandwidth sequence.
#'
#' @return A list of class \code{persistingoutliers} with the following
#' components:
#' \item{\code{out}}{A 3D array of \code{N x num_steps x num_alpha} where
#' \code{N} denotes the number of observations, \code{num_steps} denotes the
#' length of the bandwidth sequence, and \code{num_alpha} denotes the number of
#' significance levels. This is a binary array and the entries are set to 1 if
#' that observation is an outlier for that particular bandwidth and significance
#' level.}
#' \item{\code{bw}}{The set of bandwidth values.}
#' \item{\code{gpdparas}}{The GPD parameters used. }
#' \item{\code{lookoutbw}}{The bandwidth chosen by the algorithm \code{lookout}
#' using persistent homology.}
#' \item{\code{alpha}}{The grid of significance levels supplied.}
#' \item{\code{call}}{The matched function call.}
#'
#' @examples
#' X <- rbind(
#'   data.frame(
#'     x = rnorm(500),
#'     y = rnorm(500)
#'   ),
#'   data.frame(
#'     x = rnorm(5, mean = 10, sd = 0.2),
#'     y = rnorm(5, mean = 10, sd = 0.2)
#'   )
#' )
#' plot(X, pch = 19)
#' outliers <- persisting_outliers(X, scale = FALSE)
#' outliers
#' autoplot(outliers)
#' @export
persisting_outliers <- function(
  X,
  alpha = seq(0.01, 0.1, by = 0.01),
  st_qq = 0.9,
  scale = TRUE,
  num_steps = 20,
  old_version = FALSE
) {
  # Coerce to a matrix and rescale if requested
  X <- as.matrix(X)
  if (scale) {
    X <- if (old_version) unitize(X) else mvscale(X)
  }

  # Persistent homology; univariate data goes in as a distance matrix
  phom <- if (NCOL(X) == 1L) {
    TDAstats::calculate_homology(dist(X), format = "distmat")
  } else {
    TDAstats::calculate_homology(X, dim = 0)
  }
  death_radi <- phom[, 3L]

  # Bandwidth grid: from the st_qq quantile of the death radii up to the
  # largest death radius times sqrt(5)
  bw_vals <- seq(
    quantile(death_radi, probs = st_qq),
    max(death_radi) * sqrt(5),
    length.out = num_steps
  )

  # Reference bandwidth: among death radii at or above the median, take the
  # radius at the start of the largest consecutive gap
  above_median <- death_radi >= quantile(death_radi, probs = 0.5)
  widest_gap <- which.max(diff(death_radi[above_median]))
  ind <- widest_gap + min(which(above_median)) - 1L
  bw_fixed <- death_radi[ind] * sqrt(5)

  # Estimate the GPD parameters once at the reference bandwidth
  reference_fit <- lookout(
    X,
    alpha = 0.05,
    scale = FALSE,
    bw = bw_fixed,
    old_version = old_version
  )
  paras <- reference_fit$gpd[1:2]

  # Flag outliers for every bandwidth / significance level combination
  output <- array(0, dim = c(dim(X)[1], num_steps, length(alpha)))
  for (i in seq_along(bw_vals)) {
    probs <- lookout(
      X,
      alpha = 0.05,
      scale = FALSE,
      bw = bw_vals[i],
      gpd = paras,
      old_version = old_version
    )$outlier_probability
    for (j in seq_along(alpha)) {
      output[which(probs < alpha[j]), i, j] <- 1
    }
  }

  # Return results
  result <- list(
    out = output,
    bw = bw_vals,
    gpdparas = paras,
    lookoutbw = bw_fixed,
    alpha = alpha,
    call = match.call()
  )
  class(result) <- "persistingoutliers"
  result
}
124 |
--------------------------------------------------------------------------------
/R/mvscale.R:
--------------------------------------------------------------------------------
#' Compute robust multivariate scaled data
#'
#' @description A multivariate version of [base::scale()], that takes account
#' of the covariance matrix of the data, and uses robust estimates
#' of center, scale and covariance by default. The centers are removed using medians, the
#' scale function is the robust Qn estimator, and the covariance matrix is estimated using a
#' robust OGK estimate. The data are scaled using the Cholesky decomposition of
#' the inverse covariance. Then the scaled data are returned.
#'
#' @details Optionally, the centering and scaling can be done for each variable
#' separately, so there is no rotation of the data, by setting `cov = NULL`.
#' Also optionally, non-robust methods can be used by specifying `center = mean`,
#' `scale = stats::sd`, and `cov = stats::cov`. Any non-numeric columns are retained
#' with a warning. If only one numeric column is present, it is centered and
#' divided by its scale, with no rotation. If there are no numeric columns,
#' `object` is returned unchanged.
#'
#' @param object A vector, matrix, or data frame containing some numerical data.
#' @param center A function to compute the center of each numerical variable. Set
#' to NULL if no centering is required.
#' @param scale A function to scale each numerical variable. When
#' `cov = robustbase::covOGK`, it is passed as the `sigmamu` argument.
#' @param cov A function to compute the covariance matrix. Set to NULL if no rotation required.
#' @param warning Should a warning be issued if non-numeric columns are ignored?
#' @return A vector, matrix or data frame of the same size and class as `object`,
#' but with numerical variables replaced by scaled versions. When `cov` is not
#' NULL, the scaled columns are renamed `z1`, `z2`, ....
#' @seealso [base::scale()], [stats::sd()], [stats::cov()], [robustbase::covOGK()], [robustbase::s_Qn()]
#' @author Rob J Hyndman
#' @examples
#' # Univariate z-scores (no rotation)
#' z <- mvscale(faithful, center = mean, scale = sd, cov = NULL, warning = FALSE)
#' # Non-robust scaling with rotation
#' z <- mvscale(faithful, center = mean, cov = stats::cov, warning = FALSE)
#' # Robust scaling and rotation
#' z <- mvscale(faithful, warning = FALSE)
#' @export
mvscale <- function(
  object,
  center = stats::median,
  scale = robustbase::s_Qn,
  cov = robustbase::covOGK,
  warning = TRUE
) {
  vec <- FALSE # Indicator if object is a vector
  is_mat <- inherits(object, "matrix")
  is_df <- inherits(object, "data.frame")
  # We find the numerical columns and convert to a matrix
  # First deal with vector inputs
  if (NCOL(object) == 1L && !is_mat && !is_df) {
    numeric_col <- is.numeric(object)
    if (!numeric_col) {
      stop("Input must be numeric")
    }
    vec <- TRUE
    mat <- as.matrix(object)
  } else if (is_mat) {
    # It is already a matrix
    if (!is.numeric(object)) {
      stop("Input must be numeric")
    }
    numeric_col <- rep(TRUE, NCOL(object))
    mat <- object
  } else {
    # It must be a data frame. So let's find the numeric columns
    numeric_col <- unlist(lapply(object, is.numeric))
    if (!all(numeric_col) && warning) {
      warning(
        "Ignoring non-numeric columns: ",
        paste(names(object)[!numeric_col], collapse = ", ")
      )
    }
    mat <- as.matrix(object[, numeric_col, drop = FALSE])
  }
  # Dimension of the numeric part only: non-numeric columns must not decide
  # whether the univariate or the multivariate branch is used
  d <- NCOL(mat)
  if (d == 0L) {
    # Nothing to scale
    return(object)
  }
  # Remove centers
  if (!is.null(center)) {
    med <- apply(mat, 2, center)
    mat <- sweep(mat, 2L, med)
  }
  # Create more resilient version of scale function
  if (!is.null(scale)) {
    my_scale <- function(x, ..., na.rm = TRUE) {
      s <- scale(x, ..., na.rm = na.rm)
      # Avoid division by zero. Only the last element is the scale: when
      # called with mu.too = TRUE (as robustbase::covOGK does) the result is
      # c(location, scale), and a zero location must be left alone.
      k <- length(s)
      if (k > 0L && !is.na(s[k]) && s[k] == 0) {
        s[k] <- 1
      }
      s
    }
  } else {
    my_scale <- function(x, ..., na.rm = TRUE) {
      1
    }
  }
  # Scale
  if (d == 1L) {
    z <- mat / my_scale(mat)
    if (vec) {
      return(c(z))
    }
  } else if (!is.null(cov)) {
    if (identical(cov, robustbase::covOGK)) {
      S <- cov(mat, sigmamu = my_scale)$cov
    } else {
      S <- cov(mat)
    }
    Sinv <- try(solve(S), silent = TRUE)
    if (inherits(Sinv, "try-error")) {
      # Add a small ridge to the covariance matrix to avoid singularity issues
      Sinv <- try(solve(S + diag(1e-6, nrow(S), ncol(S))), silent = TRUE)
      if (inherits(Sinv, "try-error")) {
        # Add a bigger ridge
        Sinv <- solve(S + diag(1e-2, nrow(S), ncol(S)))
      }
    }
    # With Sinv = t(U) %*% U, the rotated data have identity covariance under S
    U <- chol(Sinv)
    z <- mat %*% t(U)
  } else {
    s <- apply(mat, 2, my_scale)
    z <- sweep(mat, 2L, s, "/")
  }
  # Convert back to matrix, data frame or tibble if necessary
  idx <- which(numeric_col)
  for (i in seq_along(idx)) {
    object[, idx[i]] <- z[, i]
  }
  # Rename columns if there has been rotation. colnames<- works for both
  # matrices and data frames, whereas names<- on a matrix would attach a
  # names attribute to the individual elements.
  if (!is.null(cov)) {
    colnames(object)[numeric_col] <- paste0("z", seq_len(sum(numeric_col)))
  }
  object
}
126 |
--------------------------------------------------------------------------------
/R/lookoutliers.R:
--------------------------------------------------------------------------------
#' Identifies outliers using the algorithm lookout.
#'
#' This function identifies outliers using the algorithm lookout, an outlier
#' detection method that uses leave-one-out kernel density estimates and
#' generalized Pareto distributions to find outliers.
#'
#' @param X The numerical input data in a data.frame, matrix or tibble format.
#' @param alpha The level of significance. Default is \code{0.01}. So there is
#' a 1/100 chance of any point being falsely classified as an outlier.
#' @param beta The quantile threshold used in the GPD estimation. Default is \code{0.90}.
#' To ensure there is enough data available, values greater than 0.90 are set to 0.90.
#' @param gamma Parameter for bandwidth calculation giving the quantile of the
#' Rips death radii to use for the bandwidth. Default is \code{0.97}. Ignored
#' under the old version; where the lower limit of the maximum Rips death radii
#' difference is used. Also ignored if \code{bw} is provided.
#' @param bw Bandwidth parameter. If \code{NULL} (default), the bandwidth is
#' found using Persistent Homology.
#' @param gpd Generalized Pareto distribution parameters. If `NULL` (the
#' default), these are estimated from the data.
#' @param scale If \code{TRUE}, the data is standardized. Using the old version,
#' unit scaling is applied so that each column is in the range \code{[0,1]}.
#' Under the new version, robust rotation and scaling is used so that the columns
#' are approximately uncorrelated with unit variance. Default is \code{TRUE}.
#' @param fast If \code{TRUE}, makes the computation faster by
#' sub-setting the data for the bandwidth calculation. The default is
#' \code{TRUE} when \code{X} has more than 1000 rows.
#' @param old_version Logical indicator of which version of the algorithm to use.
#' Default is FALSE, meaning the newer version is used.
#' @return A list of class \code{lookoutliers} with the following components:
#' \item{\code{data}}{The input data as supplied.}
#' \item{\code{outliers}}{The set of outliers, as a data frame giving the
#' index and probability of each one.}
#' \item{\code{outlier_probability}}{The GPD probability of the data.}
#' \item{\code{outlier_scores}}{The outlier scores of the data.}
#' \item{\code{bandwidth}}{The bandwidth used: \code{bw} if supplied,
#' otherwise the one selected using persistent homology.}
#' \item{\code{kde}}{The kernel density estimate values.}
#' \item{\code{lookde}}{The leave-one-out kde values.}
#' \item{\code{gpd}}{The fitted GPD parameters.}
#' \item{\code{call}}{The matched function call.}
#'
#' @examples
#' X <- rbind(
#'   data.frame(
#'     x = rnorm(500),
#'     y = rnorm(500)
#'   ),
#'   data.frame(
#'     x = rnorm(5, mean = 10, sd = 0.2),
#'     y = rnorm(5, mean = 10, sd = 0.2)
#'   )
#' )
#' lo <- lookout(X)
#' lo
#' autoplot(lo)
#' @export lookout
#' @importFrom stats dist quantile median sd
lookout <- function(
  X,
  alpha = 0.01,
  beta = 0.90,
  gamma = 0.97,
  bw = NULL,
  gpd = NULL,
  scale = TRUE,
  fast = NROW(X) > 1000,
  old_version = FALSE
) {
  # alpha, beta and gamma need to be between 0 and 1
  if (alpha < 0 || alpha > 1) {
    stop("alpha should be between 0 and 1.")
  }
  if (beta < 0 || beta > 1) {
    stop("beta should be between 0 and 1.")
  }
  if (gamma < 0 || gamma > 1) {
    stop("gamma should be between 0 and 1.")
  }

  # Prepare X matrix
  origX <- X
  X <- as.matrix(X)
  if (scale) {
    if (old_version) {
      X <- unitize(X)
    } else {
      X <- mvscale(X)
    }
  }

  # Find bandwidth and scale for Epanechnikov kernel
  if (is.null(bw)) {
    bandwidth <- find_tda_bw(
      X,
      fast = fast,
      gamma,
      use_differences = old_version
    ) *
      sqrt(5)
  } else {
    bandwidth <- bw
  }

  # find kde and lookde estimates
  kdeobj <- lookde(X, bandwidth = bandwidth, fast = fast)
  log_dens <- -log(kdeobj$kde)

  # find POT GPD parameters, threshold capped at 0.90
  beta <- min(0.9, beta)
  qq <- quantile(log_dens, probs = beta)

  # check if there are points above the quantile
  if (!any(log_dens > qq)) {
    stop("No points above the quantile for GPD estimation")
  }

  if (is.null(gpd)) {
    M1 <- evd::fpot(log_dens, qq, std.err = FALSE)
    gpd <- M1$estimate[1L:2L]
    if (gpd[2] > 0 && !old_version) {
      # This should only be done in the new lookout
      # This shows that shape is estimated to be positive.
      # This should not be the case because log densities are bounded
      M1 <- evd::fpot(log_dens, qq, shape = 0, std.err = FALSE)
      gpd <- c(M1$estimate, 0)
    }
  }
  # for these Generalized Pareto distribution parameters, compute the
  # probabilities of leave-one-out kernel density estimates
  potlookde <- evd::pgpd(
    -log(kdeobj$lookde),
    loc = qq,
    scale = gpd[1],
    shape = gpd[2],
    lower.tail = FALSE
  ) *
    (1 - beta)

  outscores <- 1 - potlookde
  # select outliers according to threshold
  outliers <- which(potlookde < alpha)
  dfout <- cbind.data.frame(outliers, potlookde[outliers])
  colnames(dfout) <- c("Outliers", "Probability")

  structure(
    list(
      data = origX,
      outliers = dfout,
      outlier_probability = potlookde,
      outlier_scores = outscores,
      bandwidth = bandwidth,
      kde = kdeobj$kde,
      lookde = kdeobj$lookde,
      gpd = gpd,
      call = match.call()
    ),
    class = "lookoutliers"
  )
}
156 |
157 |
# Epanechnikov kernel density estimate and its leave-one-out version.
#   x         data (coerced to a matrix, one observation per row)
#   bandwidth kernel bandwidth; neighbours further away than this contribute 0
#   fast      if TRUE, only a capped number of nearest neighbours is searched
# Returns a list with the data matrix `x`, the density estimates `kde`, and
# the leave-one-out estimates `lookde` (truncated below at zero).
lookde <- function(x, bandwidth, fast) {
  x <- as.matrix(x)
  n_obs <- NROW(x)

  # Number of neighbours to search: everything, or (for speed) a count
  # between 100 and 500 that never exceeds the sample size
  n_nbrs <- if (fast) {
    min(max(ceiling(n_obs / 200), 100), n_obs, 500)
  } else {
    n_obs
  }

  # Neighbour distances; those beyond the bandwidth are blanked out so that
  # they drop out of the kernel sum
  nn_dists <- RANN::nn2(x, k = n_nbrs)$nn.dists
  nn_dists[nn_dists > bandwidth] <- NA_real_
  kernel_sums <- rowSums(1 - (nn_dists / bandwidth)^2, na.rm = TRUE)
  kde_vals <- 0.75 / (n_obs * bandwidth) * kernel_sums

  # Leave one out: remove each point's own (zero-distance) contribution and
  # renormalise by n - 1
  self_term <- 0.75 / ((n_obs - 1) * (bandwidth))
  loo_vals <- n_obs * kde_vals / (n_obs - 1) - self_term

  list(x = x, kde = kde_vals, lookde = pmax(loo_vals, 0))
}
183 |
184 |
# Select a subset of rows ("exemplars") of X for the bandwidth calculation.
#
# Leader algorithm in HDoutliers
# Adapted from HDoutliers function getHDmembers
# We cannot call that function because the algorithm only comes to
# effect if the number of rows are greater than 10000
# And we have used RANN::nn2, which is a faster algorithm.
#
# Rows are visited in order; a row becomes a new exemplar unless it lies
# within `radius` of an existing exemplar (distances are measured on the
# unitized data). Returns the row indices of the exemplars.
subset_for_tda <- function(X) {
  X <- as.matrix(X)

  n <- nrow(X)
  p <- ncol(X)

  # With fewer than two rows there is nothing to thin out
  if (n < 2L) {
    return(seq_len(n))
  }

  Xu <- unitize(X)

  sds <- apply(Xu, 2, sd)
  sd_radius <- sqrt(sum(sds^2))
  radius <- min(0.1 / (log(n)^(1 / p)), sd_radius)
  exemplars <- 1

  for (i in seq_len(n)[-1L]) {
    # Query row i against the exemplars plus itself: the first neighbour is
    # row i at distance zero, the second is its closest exemplar
    KNN <- RANN::nn2(
      data = Xu[c(exemplars, i), , drop = FALSE],
      query = Xu[i, , drop = FALSE],
      k = 2
    )
    d <- KNN$nn.dists[1, 2]
    if (d < radius) {
      # Already represented by a nearby exemplar
      next
    }
    exemplars <- c(exemplars, i)
  }
  exemplars
}
226 |
--------------------------------------------------------------------------------