├── .github ├── .gitignore └── workflows │ ├── recheck.yml │ ├── pkgdown.yaml │ └── R-CMD-check.yaml ├── hex └── lookout.png ├── man ├── figures │ ├── logo.png │ ├── README-pressure-1.png │ ├── README-unnamed-chunk-2-1.png │ ├── README-unnamed-chunk-2-2.png │ └── README-unnamed-chunk-3-1.png ├── reexports.Rd ├── autoplot.lookoutliers.Rd ├── lookout_ts.Rd ├── autoplot.persistingoutliers.Rd ├── lookout-package.Rd ├── find_tda_bw.Rd ├── persisting_outliers.Rd ├── mvscale.Rd └── lookout.Rd ├── pkgdown ├── favicon │ ├── favicon.ico │ ├── favicon-16x16.png │ ├── favicon-32x32.png │ ├── apple-touch-icon.png │ ├── apple-touch-icon-60x60.png │ ├── apple-touch-icon-76x76.png │ ├── apple-touch-icon-120x120.png │ ├── apple-touch-icon-152x152.png │ └── apple-touch-icon-180x180.png └── extra.css ├── .gitignore ├── NEWS.md ├── .Rbuildignore ├── R ├── utils.R ├── lookout-package.R ├── print.R ├── lookout_ts.R ├── bandwidth.R ├── autoplot_lookout.R ├── autoplot_persistence.R ├── outlier_persistence.R ├── mvscale.R └── lookoutliers.R ├── lookout.Rproj ├── NAMESPACE ├── _pkgdown.yml ├── DESCRIPTION ├── README.Rmd └── README.md /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /hex/lookout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sevvandi/lookout/HEAD/hex/lookout.png -------------------------------------------------------------------------------- /man/figures/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sevvandi/lookout/HEAD/man/figures/logo.png -------------------------------------------------------------------------------- /pkgdown/favicon/favicon.ico: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sevvandi/lookout/HEAD/pkgdown/favicon/favicon.ico -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | inst/doc 6 | .DS_Store 7 | .history 8 | docs 9 | -------------------------------------------------------------------------------- /man/figures/README-pressure-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sevvandi/lookout/HEAD/man/figures/README-pressure-1.png -------------------------------------------------------------------------------- /pkgdown/favicon/favicon-16x16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sevvandi/lookout/HEAD/pkgdown/favicon/favicon-16x16.png -------------------------------------------------------------------------------- /pkgdown/favicon/favicon-32x32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sevvandi/lookout/HEAD/pkgdown/favicon/favicon-32x32.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sevvandi/lookout/HEAD/pkgdown/favicon/apple-touch-icon.png -------------------------------------------------------------------------------- /man/figures/README-unnamed-chunk-2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sevvandi/lookout/HEAD/man/figures/README-unnamed-chunk-2-1.png -------------------------------------------------------------------------------- /man/figures/README-unnamed-chunk-2-2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/sevvandi/lookout/HEAD/man/figures/README-unnamed-chunk-2-2.png -------------------------------------------------------------------------------- /man/figures/README-unnamed-chunk-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sevvandi/lookout/HEAD/man/figures/README-unnamed-chunk-3-1.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-60x60.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sevvandi/lookout/HEAD/pkgdown/favicon/apple-touch-icon-60x60.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-76x76.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sevvandi/lookout/HEAD/pkgdown/favicon/apple-touch-icon-76x76.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-120x120.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sevvandi/lookout/HEAD/pkgdown/favicon/apple-touch-icon-120x120.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-152x152.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sevvandi/lookout/HEAD/pkgdown/favicon/apple-touch-icon-152x152.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-180x180.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sevvandi/lookout/HEAD/pkgdown/favicon/apple-touch-icon-180x180.png -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # lookout 2.0.0 2 | 3 | * Added a `NEWS.md` file to track changes to the package. 4 | * Updated lookout algorithm as per Hyndman, Kandanaarachchi and Turner (2025). 5 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^LICENSE\.md$ 4 | ^\.travis\.yml$ 5 | ^README\.Rmd$ 6 | ^\.github$ 7 | ^_pkgdown\.yml$ 8 | ^docs$ 9 | ^pkgdown$ 10 | ^hex$ 11 | .history 12 | -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | # Unitize each column of X 2 | unitize <- function(X) { 3 | for (col in seq_len(NCOL(X))) { 4 | maxcol <- max(X[, col]) 5 | mincol <- min(X[, col]) 6 | if (maxcol != mincol) { 7 | X[, col] <- (X[, col] - mincol) / (maxcol - mincol) 8 | } 9 | } 10 | X 11 | } 12 | -------------------------------------------------------------------------------- /R/lookout-package.R: -------------------------------------------------------------------------------- 1 | #' @importFrom ggplot2 ggplot aes geom_raster xlab ylab geom_point 2 | #' @importFrom ggplot2 autoplot 3 | #' @export 4 | ggplot2::autoplot 5 | NULL 6 | 7 | #' @docType package 8 | #' @aliases NULL lookout-package 9 | #' @keywords internal 10 | "_PACKAGE" 11 | 12 | # The following block is used by usethis to automatically manage 13 | # roxygen namespace tags. Modify with care! 
14 | ## usethis namespace: start 15 | ## usethis namespace: end 16 | NULL 17 | -------------------------------------------------------------------------------- /.github/workflows/recheck.yml: -------------------------------------------------------------------------------- 1 | on: 2 | workflow_dispatch: 3 | inputs: 4 | which: 5 | type: choice 6 | description: Which dependents to check 7 | options: 8 | - strong 9 | - most 10 | 11 | name: Reverse dependency check 12 | 13 | jobs: 14 | revdep_check: 15 | name: Reverse check ${{ inputs.which }} dependents 16 | uses: r-devel/recheck/.github/workflows/recheck.yml@v1 17 | with: 18 | which: ${{ inputs.which }} 19 | -------------------------------------------------------------------------------- /man/reexports.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lookout-package.R 3 | \docType{import} 4 | \name{reexports} 5 | \alias{reexports} 6 | \alias{autoplot} 7 | \title{Objects exported from other packages} 8 | \keyword{internal} 9 | \description{ 10 | These objects are imported from other packages. Follow the links 11 | below to see their documentation. 
12 | 13 | \describe{ 14 | \item{ggplot2}{\code{\link[ggplot2]{autoplot}}} 15 | }} 16 | 17 | -------------------------------------------------------------------------------- /lookout.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | ProjectId: 6a5358b0-e0d8-4db8-9b9a-a0f3e182d4bf 3 | 4 | RestoreWorkspace: Default 5 | SaveWorkspace: Default 6 | AlwaysSaveHistory: Default 7 | 8 | EnableCodeIndexing: Yes 9 | UseSpacesForTab: Yes 10 | NumSpacesForTab: 2 11 | Encoding: UTF-8 12 | 13 | RnwWeave: Sweave 14 | LaTeX: pdfLaTeX 15 | 16 | AutoAppendNewline: Yes 17 | StripTrailingWhitespace: Yes 18 | 19 | BuildType: Package 20 | PackageUseDevtools: Yes 21 | PackageInstallArgs: --no-multiarch --with-keep.source 22 | PackageRoxygenize: rd,collate,namespace 23 | -------------------------------------------------------------------------------- /R/print.R: -------------------------------------------------------------------------------- 1 | #' @method print persistingoutliers 2 | #' @export 3 | 4 | print.persistingoutliers <- function(x, ...) { 5 | cat("Persistent outliers using lookout algorithm") 6 | cat("\n\nCall: ") 7 | print(x$call) 8 | cat("\nLookout bandwidth: ", x$lookoutbw, "\n") 9 | } 10 | 11 | #' @method print lookoutliers 12 | #' @export 13 | print.lookoutliers <- function(x, ...) 
{ 14 | cat("Leave-out-out KDE outliers using lookout algorithm") 15 | cat("\n\nCall: ") 16 | print(x$call) 17 | cat("\n") 18 | print(x$outliers) 19 | cat("\n") 20 | } 21 | -------------------------------------------------------------------------------- /pkgdown/extra.css: -------------------------------------------------------------------------------- 1 | h1, 2 | .h1 { 3 | font-size: 2.5rem; 4 | font-weight: 700; 5 | } 6 | 7 | h2, 8 | .h2 { 9 | font-size: 2.0rem; 10 | font-weight: 700; 11 | } 12 | 13 | h3, 14 | .h3 { 15 | font-size: 1.5rem; 16 | font-weight: 700; 17 | } 18 | 19 | .bg-primary .navbar-nav .show>.nav-link, 20 | .bg-primary .navbar-nav .nav-link.active, 21 | .bg-primary .navbar-nav .nav-link:hover, 22 | .bg-primary .navbar-nav .nav-link:focus { 23 | color: #ffb81c !important; 24 | } 25 | 26 | .text-muted { 27 | color: #ffb81c !important; 28 | } 29 | 30 | .algolia-autocomplete .aa-dropdown-menu .aa-suggestion { 31 | color: #234460; 32 | } 33 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(autoplot,lookoutliers) 4 | S3method(autoplot,persistingoutliers) 5 | S3method(print,lookoutliers) 6 | S3method(print,persistingoutliers) 7 | export(autoplot) 8 | export(find_tda_bw) 9 | export(lookout) 10 | export(lookout_ts) 11 | export(mvscale) 12 | export(persisting_outliers) 13 | importFrom(ggplot2,aes) 14 | importFrom(ggplot2,autoplot) 15 | importFrom(ggplot2,geom_point) 16 | importFrom(ggplot2,geom_raster) 17 | importFrom(ggplot2,ggplot) 18 | importFrom(ggplot2,xlab) 19 | importFrom(ggplot2,ylab) 20 | importFrom(stats,dist) 21 | importFrom(stats,median) 22 | importFrom(stats,quantile) 23 | importFrom(stats,sd) 24 | -------------------------------------------------------------------------------- /_pkgdown.yml: 
-------------------------------------------------------------------------------- 1 | url: https://sevvandi.github.io/lookout/ 2 | template: 3 | bootstrap: 5 4 | theme: tango 5 | bootswatch: flatly 6 | bslib: 7 | base_font: { google: "Fira Sans" } 8 | heading_font: { google: "Fira Sans" } 9 | code_font: "Hack, mono" 10 | primary: "#234460" 11 | link-color: "#234460" 12 | includes: 13 | in_header: 14 | 15 | authors: 16 | Sevvandi Kandanaarachchi: 17 | href: https://sevvandi.github.io 18 | Rob Hyndman: 19 | href: https://robjhyndman.com 20 | 21 | navbar: 22 | type: light 23 | 24 | figures: 25 | dev: ragg::agg_png 26 | dpi: 300 27 | dev.args: [] 28 | fig.ext: png 29 | fig.width: 8 30 | fig.height: 5 31 | -------------------------------------------------------------------------------- /man/autoplot.lookoutliers.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/autoplot_lookout.R 3 | \name{autoplot.lookoutliers} 4 | \alias{autoplot.lookoutliers} 5 | \title{Plots outliers identified by lookout algorithm.} 6 | \usage{ 7 | \method{autoplot}{lookoutliers}(object, columns = 1:2, ...) 8 | } 9 | \arguments{ 10 | \item{object}{The output of the function \code{lookout}.} 11 | 12 | \item{columns}{Which columns of the original data to plot 13 | (specified as either numbers or strings)} 14 | 15 | \item{...}{Other arguments currently ignored.} 16 | } 17 | \value{ 18 | A ggplot object. 19 | } 20 | \description{ 21 | Scatterplot of two columns from the data set with outliers highlighted. 
22 | } 23 | \examples{ 24 | X <- rbind( 25 | data.frame( 26 | x = rnorm(500), 27 | y = rnorm(500) 28 | ), 29 | data.frame( 30 | x = rnorm(5, mean = 10, sd = 0.2), 31 | y = rnorm(5, mean = 10, sd = 0.2) 32 | ) 33 | ) 34 | lo <- lookout(X) 35 | autoplot(lo) 36 | } 37 | -------------------------------------------------------------------------------- /man/lookout_ts.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lookout_ts.R 3 | \name{lookout_ts} 4 | \alias{lookout_ts} 5 | \title{Identifies outliers in univariate time series using the algorithm lookout.} 6 | \usage{ 7 | lookout_ts(x, scale = FALSE, ...) 8 | } 9 | \arguments{ 10 | \item{x}{The input univariate time series.} 11 | 12 | \item{scale}{If \code{TRUE}, the data is standardized. Using the old version, 13 | unit scaling is applied so that each column is in the range \code{[0,1]}. 14 | Under the new version, robust rotation and scaling is used so that the columns 15 | are approximately uncorrelated with unit variance. Default is \code{FALSE}.} 16 | 17 | \item{...}{Other arguments are passed to \code{\link{lookout}}.} 18 | } 19 | \value{ 20 | A lookout object. 21 | } 22 | \description{ 23 | This is the time series implementation of lookout which identifies outliers 24 | in the double differenced time series. 
25 | } 26 | \examples{ 27 | set.seed(1) 28 | x <- arima.sim(list(order = c(1, 1, 0), ar = 0.8), n = 200) 29 | x[50] <- x[50] + 10 30 | plot(x) 31 | lo <- lookout_ts(x) 32 | lo 33 | } 34 | \seealso{ 35 | \code{\link{lookout}} 36 | } 37 | -------------------------------------------------------------------------------- /man/autoplot.persistingoutliers.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/autoplot_persistence.R 3 | \name{autoplot.persistingoutliers} 4 | \alias{autoplot.persistingoutliers} 5 | \title{Plots outlier persistence for a range of significance levels.} 6 | \usage{ 7 | \method{autoplot}{persistingoutliers}(object, alpha = object$alpha, ...) 8 | } 9 | \arguments{ 10 | \item{object}{The output of the function \code{persisting_outliers}.} 11 | 12 | \item{alpha}{The significance levels to plot.} 13 | 14 | \item{...}{Other arguments currently ignored.} 15 | } 16 | \value{ 17 | A ggplot object. 18 | } 19 | \description{ 20 | This function plots outlier persistence for a range of significance levels 21 | using the algorithm lookout, an outlier detection method that uses 22 | leave-one-out kernel density estimates and generalized Pareto distributions 23 | to find outliers. 24 | } 25 | \examples{ 26 | X <- rbind( 27 | data.frame( 28 | x = rnorm(500), 29 | y = rnorm(500) 30 | ), 31 | data.frame( 32 | x = rnorm(5, mean = 10, sd = 0.2), 33 | y = rnorm(5, mean = 10, sd = 0.2) 34 | ) 35 | ) 36 | plot(X, pch = 19) 37 | outliers <- persisting_outliers(X, scale = FALSE) 38 | autoplot(outliers) 39 | } 40 | -------------------------------------------------------------------------------- /R/lookout_ts.R: -------------------------------------------------------------------------------- 1 | #' Identifies outliers in univariate time series using the algorithm lookout. 
2 | #' 3 | #' This is the time series implementation of lookout which identifies outliers 4 | #' in the double differenced time series. 5 | #' @param x The input univariate time series. 6 | #' @inheritParams lookout 7 | #' @param ... Other arguments are passed to \code{\link{lookout}}. 8 | #' @return A lookout object. 9 | #' @seealso \code{\link{lookout}} 10 | #' 11 | #' @examples 12 | #' set.seed(1) 13 | #' x <- arima.sim(list(order = c(1, 1, 0), ar = 0.8), n = 200) 14 | #' x[50] <- x[50] + 10 15 | #' plot(x) 16 | #' lo <- lookout_ts(x) 17 | #' lo 18 | #' @export lookout_ts 19 | lookout_ts <- function(x, scale = FALSE, ...) { 20 | u <- c(0, diff(diff(x))) 21 | out <- lookout(u, scale = scale, ...) 22 | outliers <- out$outliers[, 1] 23 | # Keep only the most extreme outlier(s) in each consecutive sequence of outliers 24 | if (length(outliers) > 1) { 25 | oo <- c() 26 | clust <- cumsum(c(1, diff(outliers) > 1)) 27 | len <- max(clust) 28 | for (kk in seq_len(len)) { 29 | inds <- outliers[which(clust == kk)] 30 | oo <- c(oo, inds[which.min(out$outlier_probability[inds])]) 31 | } 32 | inds <- which(out$outliers[, 1] %in% oo) 33 | out$outliers <- out$outliers[inds, ] 34 | } 35 | out 36 | } 37 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: lookout 2 | Type: Package 3 | Title: Leave One Out Kernel Density Estimates for Outlier Detection 4 | Version: 2.0.0 5 | Authors@R: c( 6 | person("Sevvandi", "Kandanaarachchi", email = "sevvandik@gmail.com", 7 | role = c("aut", "cre"), comment = c(ORCID = "0000-0002-0337-0395")), 8 | person("Rob", "Hyndman", email = "rob.hyndman@monash.edu", 9 | role = c("aut"), comment = c(ORCID = "0000-0002-2140-5352")), 10 | person("Chris", "Fraley", role = "ctb", email = "fraley@u.washington.edu") 11 | ) 12 | Maintainer: Sevvandi Kandanaarachchi 13 | Description: Outlier detection using leave-one-out kernel 
density estimates and 14 | extreme value theory. The bandwidth for kernel density estimates is computed 15 | using persistent homology, a technique in topological data analysis. Using 16 | peak-over-threshold method, a generalized Pareto distribution is fitted to 17 | the log of leave-one-out kde values to identify outliers. 18 | License: GPL-3 19 | Encoding: UTF-8 20 | LazyData: true 21 | Roxygen: list(markdown = TRUE) 22 | RoxygenNote: 7.3.3 23 | BugReports: https://github.com/sevvandi/lookout/issues 24 | Imports: 25 | evd, 26 | ggplot2, 27 | RANN, 28 | robustbase, 29 | stats, 30 | TDAstats, 31 | tidyr 32 | Suggests: 33 | knitr, 34 | rmarkdown 35 | URL: https://sevvandi.github.io/lookout/, https://github.com/sevvandi/lookout 36 | -------------------------------------------------------------------------------- /man/lookout-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lookout-package.R 3 | \docType{package} 4 | \name{lookout-package} 5 | \alias{lookout-package} 6 | \title{lookout: Leave One Out Kernel Density Estimates for Outlier Detection} 7 | \description{ 8 | \if{html}{\figure{logo.png}{options: style='float: right' alt='logo' width='120'}} 9 | 10 | Outlier detection using leave-one-out kernel density estimates and extreme value theory. The bandwidth for kernel density estimates is computed using persistent homology, a technique in topological data analysis. Using peak-over-threshold method, a generalized Pareto distribution is fitted to the log of leave-one-out kde values to identify outliers. 
11 | } 12 | \seealso{ 13 | Useful links: 14 | \itemize{ 15 | \item \url{https://sevvandi.github.io/lookout/} 16 | \item \url{https://github.com/sevvandi/lookout} 17 | \item Report bugs at \url{https://github.com/sevvandi/lookout/issues} 18 | } 19 | 20 | } 21 | \author{ 22 | \strong{Maintainer}: Sevvandi Kandanaarachchi \email{sevvandik@gmail.com} (\href{https://orcid.org/0000-0002-0337-0395}{ORCID}) 23 | 24 | Authors: 25 | \itemize{ 26 | \item Rob Hyndman \email{rob.hyndman@monash.edu} (\href{https://orcid.org/0000-0002-2140-5352}{ORCID}) 27 | } 28 | 29 | Other contributors: 30 | \itemize{ 31 | \item Chris Fraley \email{fraley@u.washington.edu} [contributor] 32 | } 33 | 34 | } 35 | \keyword{internal} 36 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | release: 8 | types: [published] 9 | workflow_dispatch: 10 | 11 | name: pkgdown.yaml 12 | 13 | permissions: read-all 14 | 15 | jobs: 16 | pkgdown: 17 | runs-on: ubuntu-latest 18 | # Only restrict concurrency for non-PR jobs 19 | concurrency: 20 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 21 | env: 22 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 23 | permissions: 24 | contents: write 25 | steps: 26 | - uses: actions/checkout@v4 27 | 28 | - uses: r-lib/actions/setup-pandoc@v2 29 | 30 | - uses: r-lib/actions/setup-r@v2 31 | with: 32 | use-public-rspm: true 33 | 34 | - uses: r-lib/actions/setup-r-dependencies@v2 35 | with: 36 | extra-packages: any::pkgdown, local::. 
37 | needs: website 38 | 39 | - name: Build site 40 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 41 | shell: Rscript {0} 42 | 43 | - name: Deploy to GitHub pages 🚀 44 | if: github.event_name != 'pull_request' 45 | uses: JamesIves/github-pages-deploy-action@v4.5.0 46 | with: 47 | clean: false 48 | branch: gh-pages 49 | folder: docs 50 | -------------------------------------------------------------------------------- /man/find_tda_bw.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bandwidth.R 3 | \name{find_tda_bw} 4 | \alias{find_tda_bw} 5 | \title{Identifies bandwidth for outlier detection.} 6 | \usage{ 7 | find_tda_bw(X, fast = TRUE, gamma = 0.97, use_differences = FALSE) 8 | } 9 | \arguments{ 10 | \item{X}{The numerical input data in a data.frame, matrix or tibble format.} 11 | 12 | \item{fast}{If \code{TRUE} (default), makes the computation faster by 13 | sub-setting the data for the bandwidth calculation.} 14 | 15 | \item{gamma}{Parameter for bandwidth calculation giving the quantile of the 16 | Rips death radii to use for the bandwidth. Default is \code{0.97}. Ignored 17 | under the old version; where the lower limit of the maximum Rips death radii 18 | difference is used. Also ignored if \code{bw} is provided.} 19 | 20 | \item{use_differences}{If TRUE, the bandwidth is set to the lower point 21 | of the maximum Rips death radii differences. If FALSE, 22 | the gamma quantile of the Rips death radii is used. Default is FALSE.} 23 | } 24 | \value{ 25 | The bandwidth 26 | } 27 | \description{ 28 | This function identifies the bandwidth that is used in the kernel density 29 | estimate computation. The function uses topological data analysis (TDA) 30 | to find the bandwidth. 
31 | } 32 | \examples{ 33 | X <- rbind( 34 | data.frame( 35 | x = rnorm(500), 36 | y = rnorm(500) 37 | ), 38 | data.frame( 39 | x = rnorm(5, mean = 10, sd = 0.2), 40 | y = rnorm(5, mean = 10, sd = 0.2) 41 | ) 42 | ) 43 | find_tda_bw(X, fast = TRUE) 44 | 45 | } 46 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | 8 | name: R-CMD-check.yaml 9 | 10 | permissions: read-all 11 | 12 | jobs: 13 | R-CMD-check: 14 | runs-on: ${{ matrix.config.os }} 15 | 16 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 17 | 18 | strategy: 19 | fail-fast: false 20 | matrix: 21 | config: 22 | - {os: macOS-latest, r: 'release'} 23 | - {os: windows-latest, r: 'release'} 24 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 25 | - {os: ubuntu-latest, r: 'release'} 26 | - {os: ubuntu-latest, r: 'oldrel-1'} 27 | 28 | env: 29 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 30 | R_KEEP_PKG_SOURCE: yes 31 | 32 | steps: 33 | - uses: actions/checkout@v4 34 | 35 | - uses: r-lib/actions/setup-pandoc@v2 36 | 37 | - uses: r-lib/actions/setup-r@v2 38 | with: 39 | r-version: ${{ matrix.config.r }} 40 | http-user-agent: ${{ matrix.config.http-user-agent }} 41 | use-public-rspm: true 42 | 43 | - name: Install XQuartz on macOS 44 | if: runner.os == 'macOS' 45 | run: | 46 | brew install --cask xquartz 47 | 48 | - uses: r-lib/actions/setup-r-dependencies@v2 49 | with: 50 | extra-packages: any::rcmdcheck 51 | needs: check 52 | 53 | - uses: r-lib/actions/check-r-package@v2 54 | with: 55 | upload-snapshots: true 56 | build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")' 57 | 
# File: R/bandwidth.R

#' Identifies bandwidth for outlier detection.
#'
#' This function identifies the bandwidth that is used in the kernel density
#' estimate computation. The function uses topological data analysis (TDA)
#' to find the bandwidth.
#'
#' The death radii (third column of the persistent homology output) are
#' computed on `X`, or on a subset of `X` when `fast = TRUE`. When
#' `use_differences = FALSE`, the `gamma` quantile of the death radii is raised
#' to the power `2 / m`, where `m` is the number of columns of `X`.
#'
#' @inheritParams lookout
#' @param use_differences If TRUE, the bandwidth is set to the lower point
#' of the maximum Rips death radii differences. If FALSE,
#' the gamma quantile of the Rips death radii is used. Default is FALSE.
#'
#' @return The bandwidth
#'
#' @examples
#' X <- rbind(
#'   data.frame(
#'     x = rnorm(500),
#'     y = rnorm(500)
#'   ),
#'   data.frame(
#'     x = rnorm(5, mean = 10, sd = 0.2),
#'     y = rnorm(5, mean = 10, sd = 0.2)
#'   )
#' )
#' find_tda_bw(X, fast = TRUE)
#'
#' @export
find_tda_bw <- function(X, fast = TRUE, gamma = 0.97, use_differences = FALSE) {
  # gamma is used as a single quantile level, so insist on one number in (0, 1].
  # Separate conditions avoid `&&` on a possibly non-scalar argument.
  stopifnot(is.numeric(gamma), length(gamma) == 1L, gamma > 0, gamma <= 1)
  X <- as.matrix(X)

  # Select a subset of X for the TDA computation
  if (fast) {
    inds <- subset_for_tda(X)
    # drop = FALSE keeps a matrix even when a single row or column is selected
    Xsub <- X[inds, , drop = FALSE]
  } else {
    Xsub <- X
  }

  # Univariate data is passed as a distance matrix; multivariate data is
  # passed directly, restricted to dimension 0.
  if (NCOL(X) == 1L) {
    phom <- TDAstats::calculate_homology(dist(Xsub), format = "distmat")
  } else {
    phom <- TDAstats::calculate_homology(Xsub, dim = 0)
  }

  death_radi <- phom[, 3L]

  if (use_differences) {
    # Only radii at or above the median are considered, so that very small
    # death radii are not chosen
    death_radi_upper <- death_radi[death_radi >= median(death_radi)]
    # Lower end of the largest gap between successive upper death radii
    death_radi_upper[which.max(diff(death_radi_upper))]
  } else {
    m <- NCOL(X)
    unname(quantile(death_radi, probs = gamma, type = 8L)^(2 / m))
  }
}
-------------------------------------------------------------------------------- /R/autoplot_lookout.R: -------------------------------------------------------------------------------- 1 | #' Plots outliers identified by lookout algorithm. 2 | #' 3 | #' Scatterplot of two columns from the data set with outliers highlighted. 4 | #' 5 | #' @param object The output of the function `lookout`. 6 | #' @param columns Which columns of the original data to plot 7 | #' (specified as either numbers or strings) 8 | #' @param ... Other arguments currently ignored. 9 | #' 10 | #' @return A ggplot object. 11 | #' 12 | #' @examples 13 | #' X <- rbind( 14 | #' data.frame( 15 | #' x = rnorm(500), 16 | #' y = rnorm(500) 17 | #' ), 18 | #' data.frame( 19 | #' x = rnorm(5, mean = 10, sd = 0.2), 20 | #' y = rnorm(5, mean = 10, sd = 0.2) 21 | #' ) 22 | #' ) 23 | #' lo <- lookout(X) 24 | #' autoplot(lo) 25 | #' @export 26 | autoplot.lookoutliers <- function(object, columns = 1:2, ...) { 27 | # Column names 28 | varnames <- colnames(object$data) 29 | if (is.null(varnames)) { 30 | varnames <- paste0("V", seq(NCOL(object$data))) 31 | } 32 | X <- as.data.frame(object$data) 33 | colnames(X) <- varnames 34 | if (is.character(columns)) { 35 | columns <- match(columns, varnames) 36 | } else { 37 | columns <- columns[columns <= NCOL(X)] 38 | } 39 | 40 | # Outliers 41 | outliers <- NULL 42 | X$outliers <- rep(FALSE, NROW(X)) 43 | X$outliers[object$outliers[, "Outliers"]] <- TRUE 44 | 45 | # y axis 46 | if (length(columns) > 1) { 47 | ..y <- X[, columns[2L]] 48 | ..yvar <- varnames[columns[2L]] 49 | } else { 50 | ..y <- 0 51 | ..yvar <- "" 52 | } 53 | 54 | # Produce plot 55 | p <- ggplot2::ggplot(X, ggplot2::aes(x = X[, columns[1L]], y = ..y)) + 56 | ggplot2::geom_point(ggplot2::aes(col = outliers)) + 57 | ggplot2::labs(x = varnames[columns[1L]], y = ..yvar) + 58 | ggplot2::scale_color_manual(values = c(`TRUE` = "red", `FALSE` = "black")) + 59 | ggplot2::guides(color = "none") 60 | if 
(NCOL(object$data) == 1L) { 61 | p <- p + ggplot2::scale_y_continuous(breaks = NULL) 62 | } 63 | p 64 | } 65 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r, include = FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>", 11 | fig.path = "man/figures/README-", 12 | out.width = "100%" 13 | ) 14 | ``` 15 | 16 | # lookout 17 | 18 | 19 | [![R-CMD-check](https://github.com/sevvandi/lookout/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/sevvandi/lookout/actions/workflows/R-CMD-check.yaml) 20 | [![CRAN status](https://www.r-pkg.org/badges/version/lookout)](https://CRAN.R-project.org/package=lookout) 21 | 22 | 23 | **lookout** identifies outliers in data using leave-one-out kernel density estimates and extreme value theory. The bandwidth for kernel density estimates is computed using persistent homology, a technique in topological data analysis. Using the peak-over-threshold method, a Generalized Pareto Distribution is fitted to the log of leave-one-out kde values to identify outliers. 24 | 25 | See [Kandanaarachchi and Hyndman (2021)](https://robjhyndman.com/publications/lookout/) for the underlying methodology. 26 | 27 | ## Installation 28 | 29 | You can install the released version of lookout from [CRAN](https://CRAN.R-project.org) with: 30 | 31 | ``` r 32 | #install.packages("lookout") 33 | ``` 34 | 35 | And the development version from [GitHub](https://github.com/) with: 36 | 37 | ``` r 38 | # install.packages("devtools") 39 | devtools::install_github("sevvandi/lookout") 40 | ``` 41 | 42 | ## Example 43 | 44 | ```{r} 45 | library(lookout) 46 | lo <- lookout(faithful) 47 | lo 48 | autoplot(lo) 49 | ``` 50 | 51 | Next we look at outlier persistence. 
The outlier persistence plot shows the outliers that persist over a range of bandwidth values for different levels of significance. The strength is inversely proportional to the level of significance. If the level of significance is 0.01, then the strength is 10 and if it is 0.1, then the strength is 1. 52 | 53 | ```{r} 54 | persistence <- persisting_outliers(faithful) 55 | autoplot(persistence) 56 | ``` 57 | -------------------------------------------------------------------------------- /man/persisting_outliers.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/outlier_persistence.R 3 | \name{persisting_outliers} 4 | \alias{persisting_outliers} 5 | \title{Computes outlier persistence for a range of significance values.} 6 | \usage{ 7 | persisting_outliers( 8 | X, 9 | alpha = seq(0.01, 0.1, by = 0.01), 10 | st_qq = 0.9, 11 | scale = TRUE, 12 | num_steps = 20, 13 | old_version = FALSE 14 | ) 15 | } 16 | \arguments{ 17 | \item{X}{The input data in a matrix, data.frame, or tibble format. All 18 | columns should be numeric.} 19 | 20 | \item{alpha}{Grid of significance levels.} 21 | 22 | \item{st_qq}{The starting quantile for death radii sequence. This will be 23 | used to compute the starting bandwidth value.} 24 | 25 | \item{scale}{If \code{TRUE}, the data is scaled. Default is \code{TRUE}. Which 26 | scaling method is used depends on the \code{old_version} parameter. 
27 | See \code{\link{lookout}} for details.} 28 | 29 | \item{num_steps}{The length of the bandwidth sequence.} 30 | 31 | \item{old_version}{Logical indicator of which version of the algorithm to use.} 32 | } 33 | \value{ 34 | A list with the following components: 35 | \item{\code{out}}{A 3D array of \code{N x num_steps x num_alpha} where 36 | \code{N} denotes the number of observations, \code{num_steps} denote the 37 | length of the bandwidth sequence, and \code{num_alpha} denotes the number of 38 | significance levels. This is a binary array and the entries are set to 1 if 39 | that observation is an outlier for that particular bandwidth and significance 40 | level.} 41 | \item{\code{bw}}{The set of bandwidth values.} 42 | \item{\code{gpdparas}}{The GPD parameters used. } 43 | \item{\code{lookoutbw}}{The bandwidth chosen by the algorithm \code{lookout} 44 | using persistent homology.} 45 | } 46 | \description{ 47 | This function computes outlier persistence for a range of significance 48 | values, using the algorithm lookout, an outlier detection method that uses 49 | leave-one-out kernel density estimates and generalized Pareto distributions 50 | to find outliers. 
51 | } 52 | \examples{ 53 | X <- rbind( 54 | data.frame( 55 | x = rnorm(500), 56 | y = rnorm(500) 57 | ), 58 | data.frame( 59 | x = rnorm(5, mean = 10, sd = 0.2), 60 | y = rnorm(5, mean = 10, sd = 0.2) 61 | ) 62 | ) 63 | plot(X, pch = 19) 64 | outliers <- persisting_outliers(X, scale = FALSE) 65 | outliers 66 | autoplot(outliers) 67 | } 68 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # lookout 5 | 6 | 7 | 8 | [![R-CMD-check](https://github.com/sevvandi/lookout/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/sevvandi/lookout/actions/workflows/R-CMD-check.yaml) 9 | [![CRAN 10 | status](https://www.r-pkg.org/badges/version/lookout)](https://CRAN.R-project.org/package=lookout) 11 | 12 | 13 | **lookout** identifies outliers in data using leave-one-out kernel 14 | density estimates and extreme value theory. The bandwidth for kernel 15 | density estimates is computed using persistent homology, a technique in 16 | topological data analysis. Using the peak-over-threshold method, a 17 | Generalized Pareto Distribution is fitted to the log of leave-one-out 18 | kde values to identify outliers. 19 | 20 | See [Kandanaarachchi and Hyndman 21 | (2021)](https://robjhyndman.com/publications/lookout/) for the 22 | underlying methodology. 
23 | 24 | ## Installation 25 | 26 | You can install the released version of lookout from 27 | [CRAN](https://CRAN.R-project.org) with: 28 | 29 | ``` r 30 | #install.packages("lookout") 31 | ``` 32 | 33 | And the development version from [GitHub](https://github.com/) with: 34 | 35 | ``` r 36 | # install.packages("devtools") 37 | devtools::install_github("sevvandi/lookout") 38 | ``` 39 | 40 | ## Example 41 | 42 | ``` r 43 | library(lookout) 44 | lo <- lookout(faithful) 45 | lo 46 | #> Leave-out-out KDE outliers using lookout algorithm 47 | #> 48 | #> Call: lookout(X = faithful) 49 | #> 50 | #> Outliers Probability 51 | #> 1 6 0.005553188 52 | #> 2 24 0.006423949 53 | #> 3 46 0.007934127 54 | #> 4 149 0.008300670 55 | #> 5 158 0.007242257 56 | #> 6 197 0.004333429 57 | #> 7 211 0.000000000 58 | #> 8 244 0.004956339 59 | autoplot(lo) 60 | ``` 61 | 62 | 63 | 64 | Next we look at outlier persistence. The outlier persistence plot shows 65 | the outliers that persist over a range of bandwidth values for different 66 | levels of significance. The strength is inversely proportional to the 67 | level of significance. If the level of significance is 0.01, then the 68 | strength is 10 and if it is 0.1, then the strength is 1. 
69 | 70 | ``` r 71 | persistence <- persisting_outliers(faithful) 72 | autoplot(persistence) 73 | ``` 74 | 75 | 76 | -------------------------------------------------------------------------------- /man/mvscale.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/mvscale.R 3 | \name{mvscale} 4 | \alias{mvscale} 5 | \title{Compute robust multivariate scaled data} 6 | \usage{ 7 | mvscale( 8 | object, 9 | center = stats::median, 10 | scale = robustbase::s_Qn, 11 | cov = robustbase::covOGK, 12 | warning = TRUE 13 | ) 14 | } 15 | \arguments{ 16 | \item{object}{A vector, matrix, or data frame containing some numerical data.} 17 | 18 | \item{center}{A function to compute the center of each numerical variable. Set 19 | to NULL if no centering is required.} 20 | 21 | \item{scale}{A function to scale each numerical variable. When 22 | \code{cov = robustbase::covOGK()}, it is passed as the \code{sigmamu} argument.} 23 | 24 | \item{cov}{A function to compute the covariance matrix. Set to NULL if no rotation required.} 25 | 26 | \item{warning}{Should a warning be issued if non-numeric columns are ignored?} 27 | } 28 | \value{ 29 | A vector, matrix or data frame of the same size and class as \code{object}, 30 | but with numerical variables replaced by scaled versions. 31 | } 32 | \description{ 33 | A multivariate version of \code{\link[base:scale]{base::scale()}}, that takes account 34 | of the covariance matrix of the data, and uses robust estimates 35 | of center, scale and covariance by default. The centers are removed using medians, the 36 | scale function is the IQR, and the covariance matrix is estimated using a 37 | robust OGK estimate. The data are scaled using the Cholesky decomposition of 38 | the inverse covariance. Then the scaled data are returned. 
39 | } 40 | \details{ 41 | Optionally, the centering and scaling can be done for each variable 42 | separately, so there is no rotation of the data, by setting \code{cov = NULL}. 43 | Also optionally, non-robust methods can be used by specifying \code{center = mean}, 44 | \code{scale = stats::sd()}, and \code{cov = stats::cov()}. Any non-numeric columns are retained 45 | with a warning. 46 | } 47 | \examples{ 48 | # Univariate z-scores (no rotation) 49 | z <- mvscale(faithful, center = mean, scale = sd, cov = NULL, warning = FALSE) 50 | # Non-robust scaling with rotation 51 | z <- mvscale(faithful, center = mean, cov = stats::cov, warning = FALSE) 52 | # Robust scaling and rotation 53 | z <- mvscale(faithful, warning = FALSE) 54 | } 55 | \seealso{ 56 | \code{\link[base:scale]{base::scale()}}, \code{\link[stats:sd]{stats::sd()}}, \code{\link[stats:cor]{stats::cov()}}, \code{\link[robustbase:covOGK]{robustbase::covOGK()}}, \code{\link[robustbase:Qn]{robustbase::s_Qn()}} 57 | } 58 | \author{ 59 | Rob J Hyndman 60 | } 61 | -------------------------------------------------------------------------------- /R/autoplot_persistence.R: -------------------------------------------------------------------------------- 1 | #' Plots outlier persistence for a range of significance levels. 2 | #' 3 | #' This function plots outlier persistence for a range of significance levels 4 | #' using the algorithm lookout, an outlier detection method that uses 5 | #' leave-one-out kernel density estimates and generalized Pareto distributions 6 | #' to find outliers. 7 | #' 8 | #' @param object The output of the function `persisting_outliers`. 9 | #' @param alpha The significance levels to plot. 10 | #' @param ... Other arguments currently ignored. 11 | #' 12 | #' @return A ggplot object. 
#'
#' @examples
#' X <- rbind(
#'   data.frame(
#'     x = rnorm(500),
#'     y = rnorm(500)
#'   ),
#'   data.frame(
#'     x = rnorm(5, mean = 10, sd = 0.2),
#'     y = rnorm(5, mean = 10, sd = 0.2)
#'   )
#' )
#' plot(X, pch = 19)
#' outliers <- persisting_outliers(X, scale = FALSE)
#' autoplot(outliers)
#' @export
autoplot.persistingoutliers <- function(object, alpha = object$alpha, ...) {
  # Select the requested significance levels (compared after rounding to
  # 4 decimals so that floating point noise does not prevent a match)
  which_alpha <- (round(object$alpha, 4) %in% round(alpha, 4))
  if (all(!which_alpha)) {
    stop("No specified alpha values available.")
  }

  # Strength = number of selected alpha levels at which each observation is
  # flagged, for every bandwidth (rows: observations, columns: bandwidths)
  outwts <- apply(object$out[, , which_alpha, drop = FALSE], c(1, 2), sum)
  outwtsg <- cbind.data.frame(seq_len(NROW(outwts)), outwts)
  colnames(outwtsg)[1] <- "Observation"

  # Drop trailing bandwidths at which nothing is flagged. Always keep at least
  # one bandwidth column, and never let the data frame collapse to a vector,
  # so that the reshape below also works when no outliers were found at all.
  last_col <- max(which(colSums(outwtsg) != 0))
  last_col <- min(max(last_col, 2L), NCOL(outwtsg))
  outwtsg <- outwtsg[, seq_len(last_col), drop = FALSE]

  # Long form
  dfl <- tidyr::pivot_longer(
    outwtsg,
    -Observation,
    names_to = "bw",
    values_to = "Strength"
  )
  # Add bandwidths (the "bw" column holds the bandwidth index as a string)
  dfl$Bandwidth <- object$bw[as.integer(dfl$bw)]

  # Colours: a sequential palette for several alpha levels, plain
  # white/black when a single level is plotted
  if (length(alpha) > 1) {
    col_pal1 <- c(
      "white",
      "#ffffcc",
      "#ffeda0",
      "#fed976",
      "#feb24c",
      "#fd8d3c",
      "#fc4e2a",
      "#e31a1c",
      "#bd0026",
      "#800026"
    )
  } else {
    col_pal1 <- c("white", "black")
  }

  # NULL bindings keep R CMD check quiet about the symbols used in aes()
  Observation <- Bandwidth <- Strength <- NULL
  p <- ggplot2::ggplot(
    dfl,
    ggplot2::aes(x = Bandwidth, y = Observation, fill = Strength)
  ) +
    ggplot2::geom_raster() +
    ggplot2::scale_fill_gradientn(colours = col_pal1) +
    ggplot2::theme(
      panel.grid.major = ggplot2::element_blank(),
      panel.grid.minor = ggplot2::element_blank(),
      panel.background = ggplot2::element_blank(),
      axis.line = ggplot2::element_line(colour = "black")
    )
  if (length(alpha) == 1L) {
    # A legend adds nothing when strength is just flagged / not flagged
    p <- p + ggplot2::theme(legend.position = "none")
  }

  p
}
87 | -------------------------------------------------------------------------------- /man/lookout.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lookoutliers.R 3 | \name{lookout} 4 | \alias{lookout} 5 | \title{Identifies outliers using the algorithm lookout.} 6 | \usage{ 7 | lookout( 8 | X, 9 | alpha = 0.01, 10 | beta = 0.9, 11 | gamma = 0.97, 12 | bw = NULL, 13 | gpd = NULL, 14 | scale = TRUE, 15 | fast = NROW(X) > 1000, 16 | old_version = FALSE 17 | ) 18 | } 19 | \arguments{ 20 | \item{X}{The numerical input data in a data.frame, matrix or tibble format.} 21 | 22 | \item{alpha}{The level of significance. Default is \code{0.01}. So there is 23 | a 1/100 chance of any point being falsely classified as an outlier.} 24 | 25 | \item{beta}{The quantile threshold used in the GPD estimation. Default is \code{0.90}. 26 | To ensure there is enough data available, values greater than 0.90 are set to 0.90.} 27 | 28 | \item{gamma}{Parameter for bandwidth calculation giving the quantile of the 29 | Rips death radii to use for the bandwidth. Default is \code{0.97}. Ignored 30 | under the old version; where the lower limit of the maximum Rips death radii 31 | difference is used. Also ignored if \code{bw} is provided.} 32 | 33 | \item{bw}{Bandwidth parameter. If \code{NULL} (default), the bandwidth is 34 | found using Persistent Homology.} 35 | 36 | \item{gpd}{Generalized Pareto distribution parameters. If \code{NULL} (the 37 | default), these are estimated from the data.} 38 | 39 | \item{scale}{If \code{TRUE}, the data is standardized. Using the old version, 40 | unit scaling is applied so that each column is in the range \code{[0,1]}. 41 | Under the new version, robust rotation and scaling is used so that the columns 42 | are approximately uncorrelated with unit variance. 
Default is \code{TRUE}.} 43 | 44 | \item{fast}{If \code{TRUE} (default), makes the computation faster by 45 | sub-setting the data for the bandwidth calculation.} 46 | 47 | \item{old_version}{Logical indicator of which version of the algorithm to use. 48 | Default is FALSE, meaning the newer version is used.} 49 | } 50 | \value{ 51 | A list with the following components: 52 | \item{\code{outliers}}{The set of outliers.} 53 | \item{\code{outlier_probability}}{The GPD probability of the data.} 54 | \item{\code{outlier_scores}}{The outlier scores of the data.} 55 | \item{\code{bandwidth}}{The bandwdith selected using persistent homology. } 56 | \item{\code{kde}}{The kernel density estimate values.} 57 | \item{\code{lookde}}{The leave-one-out kde values.} 58 | \item{\code{gpd}}{The fitted GPD parameters.} 59 | } 60 | \description{ 61 | This function identifies outliers using the algorithm lookout, an outlier 62 | detection method that uses leave-one-out kernel density estimates and 63 | generalized Pareto distributions to find outliers. 64 | } 65 | \examples{ 66 | X <- rbind( 67 | data.frame( 68 | x = rnorm(500), 69 | y = rnorm(500) 70 | ), 71 | data.frame( 72 | x = rnorm(5, mean = 10, sd = 0.2), 73 | y = rnorm(5, mean = 10, sd = 0.2) 74 | ) 75 | ) 76 | lo <- lookout(X) 77 | lo 78 | autoplot(lo) 79 | } 80 | -------------------------------------------------------------------------------- /R/outlier_persistence.R: -------------------------------------------------------------------------------- 1 | #' Computes outlier persistence for a range of significance values. 2 | #' 3 | #' This function computes outlier persistence for a range of significance 4 | #' values, using the algorithm lookout, an outlier detection method that uses 5 | #' leave-one-out kernel density estimates and generalized Pareto distributions 6 | #' to find outliers. 7 | #' 8 | #' @param X The input data in a matrix, data.frame, or tibble format. All 9 | #' columns should be numeric. 
10 | #' @param alpha Grid of significance levels. 11 | #' @param st_qq The starting quantile for death radii sequence. This will be 12 | #' used to compute the starting bandwidth value. 13 | #' @param scale If \code{TRUE}, the data is scaled. Default is \code{TRUE}. Which 14 | #' scaling method is used depends on the \code{old_version} parameter. 15 | #' See \code{\link{lookout}} for details. 16 | #' @param old_version Logical indicator of which version of the algorithm to use. 17 | #' @param num_steps The length of the bandwidth sequence. 18 | #' 19 | #' @return A list with the following components: 20 | #' \item{\code{out}}{A 3D array of \code{N x num_steps x num_alpha} where 21 | #' \code{N} denotes the number of observations, \code{num_steps} denotes the 22 | #' length of the bandwidth sequence, and \code{num_alpha} denotes the number of 23 | #' significance levels. This is a binary array and the entries are set to 1 if 24 | #' that observation is an outlier for that particular bandwidth and significance 25 | #' level.} 26 | #' \item{\code{bw}}{The set of bandwidth values.} 27 | #' \item{\code{gpdparas}}{The GPD parameters used.
#' }
#' \item{\code{lookoutbw}}{The bandwidth chosen by the algorithm \code{lookout}
#' using persistent homology.}
#'
#' @examples
#' X <- rbind(
#'   data.frame(
#'     x = rnorm(500),
#'     y = rnorm(500)
#'   ),
#'   data.frame(
#'     x = rnorm(5, mean = 10, sd = 0.2),
#'     y = rnorm(5, mean = 10, sd = 0.2)
#'   )
#' )
#' plot(X, pch = 19)
#' outliers <- persisting_outliers(X, scale = FALSE)
#' outliers
#' autoplot(outliers)
#' @export

persisting_outliers <- function(
  X,
  alpha = seq(0.01, 0.1, by = 0.01),
  st_qq = 0.9,
  scale = TRUE,
  num_steps = 20,
  old_version = FALSE
) {
  # Coerce to a matrix and, if requested, scale it with the method that
  # belongs to the chosen version of the algorithm
  X <- as.matrix(X)
  if (scale) {
    X <- if (old_version) unitize(X) else mvscale(X)
  }

  # Persistent homology; univariate data is passed as a distance matrix
  phom <- if (NCOL(X) == 1L) {
    TDAstats::calculate_homology(dist(X), format = "distmat")
  } else {
    TDAstats::calculate_homology(X, dim = 0)
  }
  death_radi <- phom[, 3L]

  # Bandwidth grid: from the st_qq quantile of the death radii up to the
  # largest death radius scaled by sqrt(5)
  bw_vals <- seq(
    quantile(death_radi, probs = st_qq),
    max(death_radi) * sqrt(5),
    length.out = num_steps
  )

  # Reference bandwidth: among the death radii at or above their median,
  # take the one just before the largest jump between consecutive values
  median_radius <- quantile(death_radi, probs = 0.5)
  upper_half <- death_radi >= median_radius
  widest_gap <- which.max(diff(death_radi[upper_half]))
  first_upper <- min(which(upper_half))
  bw_fixed <- death_radi[widest_gap + first_upper - 1L] * sqrt(5)

  # GPD parameters are estimated once, at the reference bandwidth, and then
  # reused for every bandwidth in the grid
  reference_fit <- lookout(
    X,
    alpha = 0.05,
    scale = FALSE,
    bw = bw_fixed,
    old_version = old_version
  )
  paras <- reference_fit$gpd[1:2]

  # Flag outliers for every bandwidth / significance level combination
  output <- array(0, dim = c(dim(X)[1], num_steps, length(alpha)))
  for (step in seq_along(bw_vals)) {
    fit <- lookout(
      X,
      alpha = 0.05,
      scale = FALSE,
      bw = bw_vals[step],
      gpd = paras,
      old_version = old_version
    )
    probs <- fit$outlier_probability
    for (level in seq_along(alpha)) {
      output[which(probs < alpha[level]), step, level] <- 1
    }
  }

  # Return results
  structure(
    list(
      out = output,
      bw = bw_vals,
      gpdparas = paras,
      lookoutbw = bw_fixed,
      alpha = alpha,
      call = match.call()
    ),
    class = "persistingoutliers"
  )
}
-------------------------------------------------------------------------------- /R/mvscale.R: --------------------------------------------------------------------------------
#' Compute robust multivariate scaled data
#'
#' @description A multivariate version of [base::scale()], that takes account
#' of the covariance matrix of the data, and uses robust estimates
#' of center, scale and covariance by default. The centers are removed using medians, the
#' scale function is the robust Qn estimator, and the covariance matrix is estimated using a
#' robust OGK estimate. The data are scaled using the Cholesky decomposition of
#' the inverse covariance. Then the scaled data are returned.
#'
#' @details Optionally, the centering and scaling can be done for each variable
#' separately, so there is no rotation of the data, by setting `cov = NULL`.
#' Also optionally, non-robust methods can be used by specifying `center = mean`,
#' `scale = stats::sd()`, and `cov = stats::cov()`. Any non-numeric columns are retained
#' with a warning.
#'
#' @param object A vector, matrix, or data frame containing some numerical data.
#' @param center A function to compute the center of each numerical variable. Set
#' to NULL if no centering is required.
#' @param scale A function to scale each numerical variable. When
#' `cov = robustbase::covOGK()`, it is passed as the `sigmamu` argument.
#' @param cov A function to compute the covariance matrix.
#' Set to NULL if no rotation required.
#' @param warning Should a warning be issued if non-numeric columns are ignored?
#' @return A vector, matrix or data frame of the same size and class as `object`,
#' but with numerical variables replaced by scaled versions. Unless `cov = NULL`,
#' the numerical columns of matrix and data frame inputs are renamed
#' `z1`, `z2`, ....
#' @seealso [base::scale()], [stats::sd()], [stats::cov()], [robustbase::covOGK()], [robustbase::s_Qn()]
#' @author Rob J Hyndman
#' @examples
#' # Univariate z-scores (no rotation)
#' z <- mvscale(faithful, center = mean, scale = sd, cov = NULL, warning = FALSE)
#' # Non-robust scaling with rotation
#' z <- mvscale(faithful, center = mean, cov = stats::cov, warning = FALSE)
#' # Robust scaling and rotation
#' z <- mvscale(faithful, warning = FALSE)
#' @export
mvscale <- function(
  object,
  center = stats::median,
  scale = robustbase::s_Qn,
  cov = robustbase::covOGK,
  warning = TRUE
) {
  d <- NCOL(object)
  vec <- FALSE # Indicator if object is a vector
  # We find the numerical columns and convert to a matrix
  # First deal with vector inputs
  if (
    d == 1L &&
      !inherits(object, "matrix") &&
      !inherits(object, "data.frame")
  ) {
    numeric_col <- is.numeric(object)
    if (!numeric_col) {
      stop("Input must be numeric")
    }
    vec <- TRUE
    mat <- as.matrix(object)
  } else if (inherits(object, "matrix")) {
    # It is already a matrix
    if (!is.numeric(object)) {
      stop("Input must be numeric")
    }
    numeric_col <- rep(TRUE, NCOL(object))
    mat <- object
  } else {
    # It must be a data frame. So let's find the numeric columns
    numeric_col <- unlist(lapply(object, is.numeric))
    if (!all(numeric_col) && warning) {
      warning(
        "Ignoring non-numeric columns: ",
        paste(names(object)[!numeric_col], collapse = ", ")
      )
    }
    mat <- as.matrix(object[, numeric_col])
  }
  # Remove centers
  if (!is.null(center)) {
    med <- apply(mat, 2, center)
    mat <- sweep(mat, 2L, med)
  }
  # Create more resilient version of scale function
  if (!is.null(scale)) {
    my_scale <- function(x, ..., na.rm = TRUE) {
      s <- scale(x, ..., na.rm = na.rm)
      s[s == 0] <- 1 # Avoid division by zero
      s
    }
  } else {
    # No scaling requested: every variable gets unit scale
    my_scale <- function(x, ..., na.rm = TRUE) {
      1
    }
  }
  # Scale
  if (d == 1L) {
    z <- mat / my_scale(mat)
    if (vec) {
      # Vector in, plain vector out
      return(c(z))
    }
  } else if (!is.null(cov)) {
    if (identical(cov, robustbase::covOGK)) {
      # covOGK needs the scale function as its sigmamu argument and returns
      # a list; the covariance matrix is its `cov` component
      S <- cov(mat, sigmamu = my_scale)$cov
    } else {
      S <- cov(mat)
    }
    Sinv <- try(solve(S), silent = TRUE)
    if (inherits(Sinv, "try-error")) {
      # Add a small ridge to the covariance matrix to avoid singularity issues
      Sinv <- try(solve(S + diag(1e-6, nrow(S), ncol(S))), silent = TRUE)
      if (inherits(Sinv, "try-error")) {
        # Add a bigger ridge
        Sinv <- solve(S + diag(1e-2, nrow(S), ncol(S)))
      }
    }
    # chol() returns U with t(U) %*% U = Sinv, so mat %*% t(U) has
    # covariance U %*% S %*% t(U) = I
    U <- chol(Sinv)
    z <- mat %*% t(U)
  } else {
    # No rotation: scale each column separately
    s <- apply(mat, 2, my_scale)
    z <- sweep(mat, 2L, s, "/")
  }
  # Convert back to matrix, data frame or tibble if necessary
  idx <- which(numeric_col)
  for (i in seq_along(idx)) {
    object[, idx[i]] <- z[, i]
  }
  # Rename columns if there has been rotation
  if (!is.null(cov)) {
    new_names <- paste0("z", seq_len(sum(numeric_col)))
    if (is.matrix(object)) {
      # names<- on a matrix would attach element names rather than rename
      # its columns, so set the column names explicitly
      colnames(object) <- new_names
    } else {
      names(object)[numeric_col] <- new_names
    }
  }
  object
}
-------------------------------------------------------------------------------- /R/lookoutliers.R:
--------------------------------------------------------------------------------
#' Identifies outliers using the algorithm lookout.
#'
#' This function identifies outliers using the algorithm lookout, an outlier
#' detection method that uses leave-one-out kernel density estimates and
#' generalized Pareto distributions to find outliers.
#'
#' @param X The numerical input data in a data.frame, matrix or tibble format.
#' @param alpha The level of significance. Default is \code{0.01}. So there is
#' a 1/100 chance of any point being falsely classified as an outlier.
#' @param beta The quantile threshold used in the GPD estimation. Default is \code{0.90}.
#' To ensure there is enough data available, values greater than 0.90 are set to 0.90.
#' @param gamma Parameter for bandwidth calculation giving the quantile of the
#' Rips death radii to use for the bandwidth. Default is \code{0.97}. Ignored
#' under the old version, where the lower limit of the maximum Rips death radii
#' difference is used. Also ignored if \code{bw} is provided.
#' @param bw Bandwidth parameter. If \code{NULL} (default), the bandwidth is
#' found using Persistent Homology.
#' @param gpd Generalized Pareto distribution parameters. If `NULL` (the
#' default), these are estimated from the data.
#' @param scale If \code{TRUE}, the data is standardized. Using the old version,
#' unit scaling is applied so that each column is in the range \code{[0,1]}.
#' Under the new version, robust rotation and scaling is used so that the columns
#' are approximately uncorrelated with unit variance. Default is \code{TRUE}.
#' @param fast If \code{TRUE}, makes the computation faster by
#' sub-setting the data for the bandwidth calculation. The default is
#' \code{TRUE} when \code{X} has more than 1000 rows.
#' @param old_version Logical indicator of which version of the algorithm to use.
#' Default is FALSE, meaning the newer version is used.
#' @return A list with the following components:
#' \item{\code{data}}{The input data, as supplied.}
#' \item{\code{outliers}}{The set of outliers.}
#' \item{\code{outlier_probability}}{The GPD probability of the data.}
#' \item{\code{outlier_scores}}{The outlier scores of the data.}
#' \item{\code{bandwidth}}{The bandwidth selected using persistent homology,
#' or the value of \code{bw} if it was supplied.}
#' \item{\code{kde}}{The kernel density estimate values.}
#' \item{\code{lookde}}{The leave-one-out kde values.}
#' \item{\code{gpd}}{The fitted GPD parameters.}
#' \item{\code{call}}{The matched function call.}
#'
#' @examples
#' X <- rbind(
#'   data.frame(
#'     x = rnorm(500),
#'     y = rnorm(500)
#'   ),
#'   data.frame(
#'     x = rnorm(5, mean = 10, sd = 0.2),
#'     y = rnorm(5, mean = 10, sd = 0.2)
#'   )
#' )
#' lo <- lookout(X)
#' lo
#' autoplot(lo)
#' @export lookout
#' @importFrom stats dist quantile median sd
lookout <- function(
  X,
  alpha = 0.01,
  beta = 0.90,
  gamma = 0.97,
  bw = NULL,
  gpd = NULL,
  scale = TRUE,
  fast = NROW(X) > 1000,
  old_version = FALSE
) {
  # alpha, beta and gamma need to be between 0 and 1
  if (alpha < 0 || alpha > 1) {
    stop("alpha should be between 0 and 1.")
  }
  if (beta < 0 || beta > 1) {
    stop("beta should be between 0 and 1.")
  }
  if (gamma < 0 || gamma > 1) {
    stop("gamma should be between 0 and 1.")
  }

  # Prepare X matrix (the unscaled input is kept for the returned object)
  origX <- X
  X <- as.matrix(X)
  if (scale) {
    if (old_version) {
      X <- unitize(X)
    } else {
      X <- mvscale(X)
    }
  }

  # Find bandwidth and scale for Epanechnikov kernel
  if (is.null(bw)) {
    bandwidth <- find_tda_bw(
      X,
      fast = fast,
      gamma,
      use_differences = old_version
    ) *
      sqrt(5)
  } else {
    bandwidth <- bw
  }

  # find kde and lookde estimates
  kdeobj <- lookde(X, bandwidth = bandwidth, fast = fast)
  log_dens <- -log(kdeobj$kde)

  # find POT GPD parameters, threshold 0.90
  beta <- min(0.9, beta)
  qq <- quantile(log_dens, probs = beta)

  # check if there are points above the quantile
  if (!any(log_dens > qq)) {
    stop("No points above the quantile for GPD estimation")
  }

  if (is.null(gpd)) {
    M1 <- evd::fpot(log_dens, qq, std.err = FALSE)
    gpd <- M1$estimate[1L:2L]
    if (gpd[2] > 0 && !old_version) {
      # This should only be done in the new lookout
      # This shows that shape is estimated to be positive.
      # This should not be the case because log densities are bounded,
      # so refit with the shape fixed at zero.
      M1 <- evd::fpot(log_dens, qq, shape = 0, std.err = FALSE)
      gpd <- c(M1$estimate, 0)
    }
  }
  # for these Generalized Pareto distribution parameters, compute the
  # probabilities of leave-one-out kernel density estimates; the GPD tail
  # probability is multiplied by (1 - beta), beta being the quantile level
  # used for the threshold qq
  potlookde <- evd::pgpd(
    -log(kdeobj$lookde),
    loc = qq,
    scale = gpd[1],
    shape = gpd[2],
    lower.tail = FALSE
  ) *
    (1 - beta)

  outscores <- 1 - potlookde
  # select outliers according to threshold
  outliers <- which(potlookde < alpha)
  dfout <- cbind.data.frame(outliers, potlookde[outliers])
  colnames(dfout) <- c("Outliers", "Probability")

  structure(
    list(
      data = origX,
      outliers = dfout,
      outlier_probability = potlookde,
      outlier_scores = outscores,
      bandwidth = bandwidth,
      kde = kdeobj$kde,
      lookde = kdeobj$lookde,
      gpd = gpd,
      call = match.call()
    ),
    class = "lookoutliers"
  )
}


# Kernel density and leave-one-out kernel density estimates at each row of x,
# using an Epanechnikov-type kernel with the given bandwidth on
# nearest-neighbour distances. Returns a list with the data matrix (x), the
# density estimates (kde) and the leave-one-out estimates (lookde, floored
# at zero).
lookde <- function(x, bandwidth, fast) {
  x <- as.matrix(x)
  nn <- NROW(x)

  if (fast) {
    # To make the nearest neighbour distance computation faster
    # select a kk different to nn as follows
    kk <- min(max(ceiling(nn / 200), 100), nn, 500)
  } else {
    kk <- nn
  }

  # Epanechnikov kernel density estimate
  # Neighbours further away than the bandwidth contribute nothing, so their
  # distances are set to NA and skipped by rowSums(na.rm = TRUE)
  nn_dist <- RANN::nn2(x, k = kk)$nn.dists
  nn_dist[nn_dist > bandwidth] <- NA_real_
  phat <- 0.75 /
    (nn * bandwidth) *
    rowSums(1 - (nn_dist / bandwidth)^2, na.rm = TRUE)

  # leave one out: rescale from nn to nn - 1 points and subtract the kernel
  # contribution of a zero distance
  kdevalsloo <- 0.75 / ((nn - 1) * (bandwidth))
  lookde <- nn * phat / (nn - 1) - kdevalsloo

  list(x = x, kde = phat, lookde = pmax(lookde, 0))
}


# Select a subset of rows of X (returned as row indices, the "exemplars")
# such that every later exemplar is at least `radius` away, in unit-scaled
# space, from all exemplars chosen before it.
subset_for_tda <- function(X) {
  # Leader algorithm in HDoutliers
  # Inserted from HDoutliers function getHDmembers
  # We cannot call that function because the algorithm only comes to
  # effect if the number of rows are greater than 10000
  # And we have used RANN::nn2, which is a faster algorithm.

  X <- as.matrix(X)

  n <- nrow(X)
  p <- ncol(X)

  Xu <- unitize(X)

  sds <- apply(Xu, 2, sd)
  sd_radius <- sqrt(sum(sds^2))
  radius <- min(0.1 / (log(n)^(1 / p)), sd_radius)
  exemplars <- 1

  # seq_len(n)[-1L] is empty when there is a single row, unlike 2:n
  for (i in seq_len(n)[-1L]) {
    # Row i is appended to the exemplars, so its first neighbour is itself
    # (distance zero) and the second neighbour is the closest exemplar
    KNN <- RANN::nn2(
      data = Xu[c(exemplars, i), , drop = FALSE],
      query = Xu[i, , drop = FALSE],
      k = 2
    )
    d <- KNN$nn.dists[1, 2]
    if (d < radius) {
      # Close enough to an existing exemplar: not a new exemplar
      next
    }
    exemplars <- c(exemplars, i)
  }
  exemplars
}
--------------------------------------------------------------------------------