├── .Rbuildignore
├── .github
│   ├── .gitignore
│   └── workflows
│       ├── R-CMD-check.yaml
│       ├── pkgdown.yaml
│       └── test-coverage.yaml
├── .gitignore
├── CRAN-RELEASE
├── CRAN-SUBMISSION
├── DESCRIPTION
├── NAMESPACE
├── NEWS.md
├── R
│   ├── 00_global_vars.R
│   ├── anomalize-package.R
│   ├── anomalize.R
│   ├── anomalize_clean.R
│   ├── anomalize_methods.R
│   ├── plot_anomalies.R
│   ├── plot_anomaly_decomposition.R
│   ├── prep_tbl_time.R
│   ├── tidyquant_theme_compat.R
│   ├── tidyverse_cran_downloads.R
│   ├── time_apply.R
│   ├── time_decompose.R
│   ├── time_decompose_methods.R
│   ├── time_frequency.R
│   ├── time_recompose.R
│   ├── time_scale_template.R
│   ├── utils.R
│   └── zzz.R
├── README.Rmd
├── README.md
├── _pkgdown.yml
├── anomalize.Rproj
├── codecov.yml
├── cran-comments.md
├── data-raw
│   └── tidyverse_cran_downloads.R
├── data
│   └── tidyverse_cran_downloads.rda
├── man
│   ├── anomalize-package.Rd
│   ├── anomalize.Rd
│   ├── anomalize_methods.Rd
│   ├── clean_anomalies.Rd
│   ├── decompose_methods.Rd
│   ├── figures
│   │   ├── README-tidyverse_anoms_1-1.png
│   │   ├── README-unnamed-chunk-3-1.png
│   │   └── logo.png
│   ├── plot_anomalies.Rd
│   ├── plot_anomaly_decomposition.Rd
│   ├── prep_tbl_time.Rd
│   ├── tidyverse_cran_downloads.Rd
│   ├── time_apply.Rd
│   ├── time_decompose.Rd
│   ├── time_frequency.Rd
│   ├── time_recompose.Rd
│   └── time_scale_template.Rd
├── pkgdown
│   ├── extra.css
│   └── favicon
│       ├── apple-touch-icon-120x120.png
│       ├── apple-touch-icon-152x152.png
│       ├── apple-touch-icon-180x180.png
│       ├── apple-touch-icon-60x60.png
│       ├── apple-touch-icon-76x76.png
│       ├── apple-touch-icon.png
│       ├── favicon-16x16.png
│       ├── favicon-32x32.png
│       └── favicon.ico
├── tests
│   ├── testthat.R
│   └── testthat
│       ├── _snaps
│       │   ├── anomalize.md
│       │   ├── plot_anomaly_decomposition.md
│       │   ├── time_decompose.md
│       │   └── time_recompose.md
│       ├── test-anomalize.R
│       ├── test-clean_anomalies.R
│       ├── test-plot_anomalies.R
│       ├── test-plot_anomaly_decomposition.R
│       ├── test-prep_tbl_time.R
│       ├── test-time_apply.R
│       ├── test-time_decompose.R
│       ├── test-time_frequency.R
│       ├── test-time_recompose.R
│       └── test-utils.R
└── vignettes
    ├── .gitignore
    ├── anomalize_methods.Rmd
    ├── anomalize_quick_start_guide.Rmd
    └── forecasting_with_cleaned_anomalies.Rmd

/.Rbuildignore:
--------------------------------------------------------------------------------
^.*\.Rproj$
^\.Rproj\.user$
^README\.Rmd$
^cran-comments\.md$
^_pkgdown\.yml$
^docs$
^data-raw$
^\.travis\.yml$
^codecov\.yml$
^doc$
^Meta$
^CRAN-RELEASE$
^CRAN-SUBMISSION$
^\.github$
^pkgdown$

--------------------------------------------------------------------------------
/.github/.gitignore:
--------------------------------------------------------------------------------
*.html

--------------------------------------------------------------------------------
/.github/workflows/R-CMD-check.yaml:
--------------------------------------------------------------------------------
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures?
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: R-CMD-check 10 | 11 | jobs: 12 | R-CMD-check: 13 | runs-on: ${{ matrix.config.os }} 14 | 15 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 16 | 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | config: 21 | - {os: macos-latest, r: 'release'} 22 | - {os: windows-latest, r: 'release'} 23 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 24 | - {os: ubuntu-latest, r: 'release'} 25 | - {os: ubuntu-latest, r: 'oldrel-1'} 26 | 27 | env: 28 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 29 | R_KEEP_PKG_SOURCE: yes 30 | 31 | steps: 32 | - uses: actions/checkout@v3 33 | 34 | - uses: r-lib/actions/setup-pandoc@v2 35 | 36 | - uses: r-lib/actions/setup-r@v2 37 | with: 38 | r-version: ${{ matrix.config.r }} 39 | http-user-agent: ${{ matrix.config.http-user-agent }} 40 | use-public-rspm: true 41 | 42 | - uses: r-lib/actions/setup-r-dependencies@v2 43 | with: 44 | extra-packages: any::rcmdcheck 45 | needs: check 46 | 47 | - uses: r-lib/actions/check-r-package@v2 48 | with: 49 | upload-snapshots: true 50 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | release: 9 | types: [published] 10 | workflow_dispatch: 11 | 12 | name: pkgdown 13 | 14 | jobs: 15 | pkgdown: 16 | runs-on: ubuntu-latest 17 | # Only restrict concurrency for non-PR jobs 18 | concurrency: 19 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 20 | env: 21 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 22 | permissions: 23 | contents: write 24 | steps: 25 | - uses: actions/checkout@v3 26 | 27 | - uses: r-lib/actions/setup-pandoc@v2 28 | 29 | - uses: r-lib/actions/setup-r@v2 30 | with: 31 | use-public-rspm: true 32 | 33 | - uses: r-lib/actions/setup-r-dependencies@v2 34 | with: 35 | extra-packages: any::pkgdown, local::. 36 | needs: website 37 | 38 | - name: Build site 39 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 40 | shell: Rscript {0} 41 | 42 | - name: Deploy to GitHub pages 🚀 43 | if: github.event_name != 'pull_request' 44 | uses: JamesIves/github-pages-deploy-action@v4.4.1 45 | with: 46 | clean: false 47 | branch: gh-pages 48 | folder: docs 49 | -------------------------------------------------------------------------------- /.github/workflows/test-coverage.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: test-coverage 10 | 11 | jobs: 12 | test-coverage: 13 | runs-on: ubuntu-latest 14 | env: 15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | 20 | - uses: r-lib/actions/setup-r@v2 21 | with: 22 | use-public-rspm: true 23 | 24 | - uses: r-lib/actions/setup-r-dependencies@v2 25 | with: 26 | extra-packages: any::covr 27 | needs: coverage 28 | 29 | - name: Test coverage 30 | run: | 31 | covr::codecov( 32 | quiet = FALSE, 33 | clean = FALSE, 34 | install_path = file.path(normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"), "package") 35 | ) 36 | shell: Rscript {0} 37 | 38 | - name: Show testthat output 39 | if: always() 40 | run: | 41 | ## -------------------------------------------------------------------- 42 | find ${{ runner.temp }}/package -name 'testthat.Rout*' -exec cat '{}' \; || true 43 | shell: bash 44 | 45 | - name: Upload test results 46 | if: failure() 47 | uses: actions/upload-artifact@v3 48 | with: 49 | name: coverage-test-failures 50 | path: ${{ runner.temp }}/package 51 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | inst/doc 6 | doc 7 | Meta 8 | /doc/ 9 | /Meta/ 10 | docs 11 | .DS_Store 12 | -------------------------------------------------------------------------------- /CRAN-RELEASE: -------------------------------------------------------------------------------- 1 | This package was submitted to CRAN on 2020-10-20. 2 | Once it is accepted, delete this file and tag the release (commit de0d706). 3 | -------------------------------------------------------------------------------- /CRAN-SUBMISSION: -------------------------------------------------------------------------------- 1 | Version: 0.3.0 2 | Date: 2023-10-31 20:39:42 UTC 3 | SHA: ceae56d649369a8300cf32d511743439683bc5a4 4 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: anomalize 2 | Type: Package 3 | Title: Tidy Anomaly Detection 4 | Version: 0.3.0.9000 5 | Authors@R: c( 6 | person("Matt", "Dancho", email = "mdancho@business-science.io", role = c("aut", "cre")), 7 | person("Davis", "Vaughan", email = "dvaughan@business-science.io", role = c("aut")) 8 | ) 9 | Description: 10 | The 'anomalize' package enables a "tidy" workflow for detecting anomalies in data. 11 | The main functions are time_decompose(), anomalize(), and time_recompose(). 12 | When combined, it's quite simple to decompose time series, detect anomalies, 13 | and create bands separating the "normal" data from the anomalous data at scale (i.e. for multiple time series). 14 | Time series decomposition is used to remove trend and seasonal components via the time_decompose() function 15 | and methods include seasonal decomposition of time series by Loess ("stl") and 16 | seasonal decomposition by piecewise medians ("twitter"). The anomalize() function implements 17 | two methods for anomaly detection of residuals including using an inner quartile range ("iqr") 18 | and generalized extreme studentized deviation ("gesd"). 
These methods are based on 19 | those used in the 'forecast' package and the Twitter 'AnomalyDetection' package. 20 | Refer to the associated functions for specific references for these methods. 21 | URL: https://business-science.github.io/anomalize/, https://github.com/business-science/anomalize 22 | BugReports: https://github.com/business-science/anomalize/issues 23 | License: GPL (>= 3) 24 | Encoding: UTF-8 25 | LazyData: true 26 | Depends: 27 | R (>= 3.0.0) 28 | Imports: 29 | dplyr, 30 | glue, 31 | timetk, 32 | sweep, 33 | tibbletime (>= 0.1.5), 34 | purrr, 35 | rlang, 36 | tibble, 37 | tidyr (>= 1.0.0), 38 | ggplot2 (>= 3.4.0) 39 | RoxygenNote: 7.2.3 40 | Roxygen: list(markdown = TRUE) 41 | Suggests: 42 | tidyquant, 43 | stringr, 44 | testthat (>= 3.0.0), 45 | knitr, 46 | rmarkdown 47 | VignetteBuilder: knitr 48 | Config/testthat/edition: 3 49 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(anomalize,default) 4 | S3method(anomalize,grouped_df) 5 | S3method(anomalize,tbl_df) 6 | S3method(clean_anomalies,default) 7 | S3method(clean_anomalies,tbl_df) 8 | S3method(plot_anomalies,default) 9 | S3method(plot_anomalies,tbl_time) 10 | S3method(plot_anomaly_decomposition,default) 11 | S3method(plot_anomaly_decomposition,grouped_tbl_time) 12 | S3method(plot_anomaly_decomposition,tbl_time) 13 | S3method(prep_tbl_time,data.frame) 14 | S3method(prep_tbl_time,default) 15 | S3method(prep_tbl_time,tbl_time) 16 | S3method(time_apply,data.frame) 17 | S3method(time_apply,default) 18 | S3method(time_apply,grouped_df) 19 | S3method(time_decompose,default) 20 | S3method(time_decompose,grouped_df) 21 | S3method(time_decompose,grouped_tbl_time) 22 | S3method(time_decompose,tbl_df) 23 | S3method(time_decompose,tbl_time) 24 | S3method(time_recompose,default) 25 | S3method(time_recompose,grouped_df) 26 | S3method(time_recompose,grouped_tbl_time) 27 | S3method(time_recompose,tbl_df) 28 | S3method(time_recompose,tbl_time) 29 | export(anomalize) 30 | export(clean_anomalies) 31 | export(decompose_stl) 32 | export(decompose_twitter) 33 | export(gesd) 34 | export(get_time_scale_template) 35 | export(iqr) 36 | export(plot_anomalies) 37 | export(plot_anomaly_decomposition) 38 | export(prep_tbl_time) 39 | export(set_time_scale_template) 40 | export(time_apply) 41 | export(time_decompose) 42 | export(time_frequency) 43 | export(time_recompose) 44 | export(time_scale_template) 45 | export(time_trend) 46 | import(ggplot2) 47 | importFrom(dplyr,"%>%") 48 | importFrom(dplyr,contains) 49 | importFrom(dplyr,n) 50 | importFrom(dplyr,quo_name) 51 | importFrom(dplyr,row_number) 52 | importFrom(ggplot2,"%+replace%") 53 | importFrom(rlang,"!!!") 54 | importFrom(rlang,"!!") 55 | importFrom(rlang,":=") 56 | importFrom(rlang,.data) 57 | importFrom(stats,as.formula) 58 | importFrom(stats,mad) 59 | importFrom(stats,median) 60 | importFrom(stats,qt) 61 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # anomalize (development version) 2 | 3 | * anomalize works better with ggplot2 3.4.0 4 | 5 | * anomalize no longer depends on tidyverse, devtools and roxygen2 (@olivroy, #70) 6 | 7 | # anomalize 0.3.0 8 | 9 | Prepare for supercession by `timetk`. Note that `anomalize` R package will be maintained for backwards compatibility. 
Users may wish to add these two lines of code to existing codebases that use the legacy anomalize R package:

``` r
library(anomalize)

anomalize <- anomalize::anomalize
plot_anomalies <- anomalize::plot_anomalies
```

# anomalize 0.2.4

Republished on CRAN.

# anomalize 0.2.2

__Bug Fixes__

- `theme_tq()`: Fix issues with `%+replace%`, `theme_gray`, and `rel` not found.

# anomalize 0.2.1

__Bug Fixes__

* Fix issue with sign error in GESD Method (Issue #46).
* Require `tibbletime` >= 0.1.5

# anomalize 0.2.0

* `clean_anomalies()` - A new function to simplify cleaning anomalies by replacing them with the trend and seasonal components. This is useful in preparing data for forecasting.

* `tidyr` v1.0.0 and `tibbletime` v0.1.3 compatibility - Improvements to incorporate the upgraded `tidyr` package.

# anomalize 0.1.1

* [Issue #2](https://github.com/business-science/anomalize/issues/2): Bugfixes for various `ggplot2` issues in `plot_anomalies()`. Solves "Error in FUN(X[[i]], ...) : object '.group' not found".
* [Issue #6](https://github.com/business-science/anomalize/issues/6): Bugfixes for invalid unary operator error in `plot_anomaly_decomposition()`. Solves "Error in -x : invalid argument to unary operator".


# anomalize 0.1.0

* Added a `NEWS.md` file to track changes to the package.

--------------------------------------------------------------------------------
/R/00_global_vars.R:
--------------------------------------------------------------------------------
globalVariables(c(
  "n",
  ".",
  ".period_groups",
  "data",
  "abs_diff_lower",
  "abs_diff_upper",
  "below_max_anoms",
  "centerline",
  "critical_value",
  "direction",
  "index",
  "limit_lower",
  "limit_upper",
  "max_abs_diff",
  "outlier",
  "outlier_reported",
  "sorting",
  "test_statistic",
  "value",
  "observed",
  "random",
  "remainder",
  "seasadj",
  "season",
  "trend",
  "target",
  "anomaly",
  "key",
  "median_spans",
  "recomposed_l1",
  "recomposed_l2",
  "data_names",
  "nested.col"
))

--------------------------------------------------------------------------------
/R/anomalize-package.R:
--------------------------------------------------------------------------------
#' @description
#' The 'anomalize' package enables a "tidy" workflow for detecting anomalies in data.
#' The main functions are time_decompose(), anomalize(), and time_recompose().
#' When combined, it's quite simple to decompose time series, detect anomalies,
#' and create bands separating the "normal" data from the anomalous data at scale (i.e. for multiple time series).
#' Time series decomposition is used to remove trend and seasonal components via the time_decompose() function
#' and methods include seasonal decomposition of time series by Loess and
#' seasonal decomposition by piecewise medians. The anomalize() function implements
#' two methods for anomaly detection of residuals including using an interquartile range
#' and generalized extreme studentized deviation. These methods are based on
#' those used in the `forecast` package and the Twitter `AnomalyDetection` package.
#' Refer to the associated functions for specific references for these methods.
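#'
#' For example, a minimal end-to-end sketch of the main workflow (this mirrors the
#' package examples; `library(dplyr)` supplies the pipe and the
#' `tidyverse_cran_downloads` dataset ships with the package):
#'
#' ``` r
#' library(dplyr)
#' library(anomalize)
#'
#' tidyverse_cran_downloads %>%
#'     time_decompose(count, method = "stl") %>%
#'     anomalize(remainder, method = "iqr") %>%
#'     time_recompose() %>%
#'     plot_anomalies(time_recomposed = TRUE, ncol = 3)
#' ```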
#'
#' To learn more about `anomalize`, start with the vignettes:
#' `browseVignettes(package = "anomalize")`
#' @aliases anomalize-package
#' @keywords internal
"_PACKAGE"

## usethis namespace: start
#' @importFrom rlang := !! !!! .data
#' @importFrom dplyr %>% n row_number contains quo_name
#' @importFrom stats median mad qt as.formula
#' @import ggplot2
## usethis namespace: end
NULL

--------------------------------------------------------------------------------
/R/anomalize.R:
--------------------------------------------------------------------------------
#' Detect anomalies using the tidyverse
#'
#' The `anomalize()` function is used to detect outliers in a distribution
#' with no trend or seasonality present. It takes the output of [time_decompose()],
#' which has been de-trended, and applies anomaly detection methods to identify outliers.
#'
#' @inheritParams time_apply
#' @param data A `tibble` or `tbl_time` object.
#' @param method The anomaly detection method. One of `"iqr"` or `"gesd"`.
#' The IQR method is faster at the expense of possibly not being quite as accurate.
#' The GESD method has the best properties for outlier detection, but is loop-based
#' and therefore a bit slower.
#' @param alpha Controls the width of the "normal" range.
#' Lower values are more conservative while higher values are less prone
#' to incorrectly classifying "normal" observations.
#' @param max_anoms The maximum percent of anomalies permitted to be identified.
#' @param verbose A boolean. If `TRUE`, will return a list containing useful information
#' about the anomalies. If `FALSE`, just returns the data expanded with the anomalies and
#' the lower (l1) and upper (l2) bounds.
#'
#' @return Returns a `tibble` / `tbl_time` object or list depending on the value of `verbose`.
#'
#' @details
#' The returned data has three additional columns:
#' "remainder_l1" (lower limit for anomalies), "remainder_l2" (upper limit for
#' anomalies), and "anomaly" (Yes/No).
#'
#' Use [time_decompose()] to decompose a time series prior to performing
#' anomaly detection with `anomalize()`. Typically, `anomalize()` is
#' performed on the "remainder" of the time series decomposition.
#'
#' For non-time series data (data without trend), the `anomalize()` function can
#' be used without time series decomposition.
#'
#' The `anomalize()` function uses two methods for outlier detection,
#' each with its own benefits.
#'
#' __IQR__:
#'
#' The IQR method uses the interquartile range between the 25% and 75% quantiles to establish a baseline distribution around
#' the median. With the default `alpha = 0.05`, the limits are established by expanding
#' the 25/75 baseline by an IQR Factor of 3 (3X). The IQR Factor = 0.15 / alpha (hence 3X with alpha = 0.05).
#' To increase the IQR Factor controlling the limits, decrease the alpha, which makes
#' it more difficult to be an outlier. Increase alpha to make it easier to be an outlier.
#'
#' The IQR method is used in [`forecast::tsoutliers()`](https://github.com/robjhyndman/forecast).
#'
#'
#' __GESD__:
#'
#' The GESD method (Generalized Extreme Studentized Deviate Test) progressively
#' eliminates outliers using a Student's t-test comparing the test statistic to a critical value.
#' Each time an outlier is removed, the test statistic is updated. Once the test statistic
#' drops below the critical value, all outliers are considered removed. Because this method
#' involves continuous updating via a loop, it is slower than the IQR method. However, it
#' tends to be the best performing method for outlier removal.
#'
#' The GESD method is used in [`AnomalyDetection::AnomalyDetectionTs()`](https://github.com/twitter/AnomalyDetection).
#'
#' @references
#' 1. [How to correct outliers once detected for time series data forecasting? Cross Validated, https://stats.stackexchange.com](https://stats.stackexchange.com/questions/69874/how-to-correct-outliers-once-detected-for-time-series-data-forecasting)
#' 2. [Cross Validated: Simple algorithm for online outlier detection of a generic time series. Cross Validated, https://stats.stackexchange.com](https://stats.stackexchange.com/questions/1142/simple-algorithm-for-online-outlier-detection-of-a-generic-time-series?)
#' 3. [Owen S. Vallis, Jordan Hochenbaum and Arun Kejariwal (2014).
#' A Novel Technique for Long-Term Anomaly Detection in the Cloud. Twitter Inc.](https://www.usenix.org/system/files/conference/hotcloud14/hotcloud14-vallis.pdf)
#' 4. [Owen S. Vallis, Jordan Hochenbaum and Arun Kejariwal (2014). AnomalyDetection: Anomaly Detection Using
#' Seasonal Hybrid Extreme Studentized Deviate Test. R package version 1.0.](https://github.com/twitter/AnomalyDetection)
#' 5. Alex T.C. Lau (November/December 2015). GESD - A Robust and Effective Technique for Dealing with Multiple Outliers. ASTM Standardization News. www.astm.org/sn
#'
#' @seealso
#' Anomaly Detection Methods (power `anomalize`)
#' - [iqr()]
#' - [gesd()]
#'
#' Time Series Anomaly Detection Functions (anomaly detection workflow):
#' - [time_decompose()]
#' - [time_recompose()]
#'
#' @examples
#' \dontrun{
#' library(dplyr)
#'
#' # Needed to pass CRAN check / This is loaded by default
#' set_time_scale_template(time_scale_template())
#'
#' tidyverse_cran_downloads %>%
#'     time_decompose(count, method = "stl") %>%
#'     anomalize(remainder, method = "iqr")
#' }
#'
#' @export
anomalize <- function(data, target, method = c("iqr", "gesd"),
                      alpha = 0.05, max_anoms = 0.20, verbose = FALSE) {
  UseMethod("anomalize", data)
}

#' @export
anomalize.default <- function(data, target, method = c("iqr", "gesd"),
                              alpha = 0.05, max_anoms = 0.20, verbose = FALSE) {
  stop("Error anomalize(): Object is not of class `tbl_df` or `tbl_time`.", call. = FALSE)
}

#' @export
anomalize.tbl_df <- function(data, target, method = c("iqr", "gesd"),
                             alpha = 0.05, max_anoms = 0.20, verbose = FALSE) {

  # Checks
  if (missing(target)) stop('Error in anomalize(): argument "target" is missing, with no default', call. = FALSE)

  # Setup
  target_expr <- rlang::enquo(target)

  method <- tolower(method[[1]])
  x <- data %>% dplyr::pull(!! target_expr)

  # Detect Anomalies
  # method <- tolower(method[[1]])
  # args <- list(x = data %>% dplyr::pull(!!
target_expr), 118 | # alpha = alpha, 119 | # max_anoms = max_anoms, 120 | # verbose = TRUE) 121 | # 122 | # outlier_list <- do.call(method, args) 123 | 124 | # Explicitly call functions 125 | if (method == "iqr") { 126 | outlier_list <- anomalize::iqr(x = x, 127 | alpha = alpha, 128 | max_anoms = max_anoms, 129 | verbose = TRUE) 130 | } else if (method == "gesd") { 131 | outlier_list <- anomalize::gesd(x = x, 132 | alpha = alpha, 133 | max_anoms = max_anoms, 134 | verbose = TRUE) 135 | 136 | } else { 137 | stop("The `method` selected is invalid.", call. = FALSE) 138 | } 139 | 140 | outlier <- outlier_list$outlier 141 | limit_lower <- outlier_list$critical_limits[[1]] 142 | limit_upper <- outlier_list$critical_limits[[2]] 143 | 144 | # Returns 145 | ret <- data %>% 146 | dplyr::mutate(!!paste0(dplyr::quo_name(target_expr), "_l1") := limit_lower, 147 | !!paste0(dplyr::quo_name(target_expr), "_l2") := limit_upper) %>% 148 | tibble::add_column(anomaly = outlier) 149 | 150 | if (verbose) { 151 | ret <- list( 152 | anomalized_tbl = ret, 153 | anomaly_details = outlier_list 154 | ) 155 | 156 | return(ret) 157 | 158 | } else { 159 | return(ret) 160 | } 161 | 162 | } 163 | 164 | #' @export 165 | anomalize.grouped_df <- function(data, target, method = c("iqr", "gesd"), 166 | alpha = 0.05, max_anoms = 0.20, verbose = FALSE, ...) { 167 | 168 | # Checks 169 | if (missing(target)) stop('Error in anomalize(): argument "target" is missing, with no default', call. = FALSE) 170 | if (verbose) warning(glue::glue("Cannot use 'verbose = TRUE' with grouped data.")) 171 | 172 | # Setup 173 | target_expr <- dplyr::enquo(target) 174 | 175 | ret <- data %>% 176 | grouped_mapper( 177 | .f = anomalize, 178 | target = !! target_expr, 179 | method = method[[1]], 180 | alpha = alpha, 181 | max_anoms = max_anoms, 182 | verbose = FALSE, 183 | ...) 184 | 185 | return(ret) 186 | 187 | } 188 | 189 | -------------------------------------------------------------------------------- /R/anomalize_clean.R: -------------------------------------------------------------------------------- 1 | #' Clean anomalies from anomalized data 2 | #' 3 | #' @param data A `tibble` or `tbl_time` object. 4 | #' 5 | #' @return Returns a `tibble` / `tbl_time` object with a new column "observed_cleaned". 6 | #' 7 | #' @details 8 | #' The `clean_anomalies()` function is used to replace outliers with the seasonal and trend component. 9 | #' This is often desirable when forecasting with noisy time series data to improve trend detection. 10 | #' 11 | #' To clean anomalies, the input data must be detrended with `time_decompose()` and anomalized with `anomalize()`. 12 | #' The data can also be recomposed with `time_recompose()`. 
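#'
#' As a sketch of what the cleaning step computes for the STL method (this mirrors
#' the internal logic defined below and is shown for illustration only):
#'
#' ``` r
#' # Anomalous observations are replaced by the recomposed season + trend;
#' # all other observations pass through unchanged:
#' # observed_cleaned = ifelse(anomaly == "Yes", season + trend, observed)
#' ```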
13 | #' 14 | #' @seealso 15 | #' Time Series Anomaly Detection Functions (anomaly detection workflow): 16 | #' - [time_decompose()] 17 | #' - [anomalize()] 18 | #' - [time_recompose()] 19 | #' 20 | #' @examples 21 | #' 22 | #' \dontrun{ 23 | #' library(dplyr) 24 | #' 25 | #' # Needed to pass CRAN check / This is loaded by default 26 | #' set_time_scale_template(time_scale_template()) 27 | #' 28 | #' data(tidyverse_cran_downloads) 29 | #' 30 | #' tidyverse_cran_downloads %>% 31 | #' time_decompose(count, method = "stl") %>% 32 | #' anomalize(remainder, method = "iqr") %>% 33 | #' clean_anomalies() 34 | #' } 35 | #' 36 | #' @export 37 | clean_anomalies <- function(data) { 38 | UseMethod("clean_anomalies", data) 39 | } 40 | 41 | #' @export 42 | clean_anomalies.default <- function(data) { 43 | stop("Error clean_anomalies(): Object is not of class `tbl_df` or `tbl_time`.", call. = FALSE) 44 | } 45 | 46 | #' @export 47 | clean_anomalies.tbl_df <- function(data) { 48 | 49 | # Checks 50 | check_clean_anomalies_input(data) 51 | 52 | # Get method col 53 | method_col <- get_method_col(data) 54 | 55 | if (method_col == "trend") { 56 | data %>% 57 | dplyr::mutate(observed_cleaned = ifelse(anomaly == "Yes", season + trend, observed)) 58 | } else { 59 | data %>% 60 | dplyr::mutate(observed_cleaned = ifelse(anomaly == "Yes", season + median_spans, observed)) 61 | } 62 | 63 | } 64 | 65 | check_clean_anomalies_input <- function(data) { 66 | 67 | data_names <- names(data) 68 | 69 | # Detect method - STL or Twitter 70 | method_names <- c("trend", "median_spans") 71 | method_name_in_data <- any(method_names %in% data_names) 72 | 73 | # Check - No method name in data 74 | if (!method_name_in_data) stop("Error clean_anomalies(): Output does not contain a column named trend or median_spans. This may occur if the output was not detrended with time_decompose().", call. = FALSE) 75 | 76 | # Check - Required names from time_decompose() 77 | required_names <- c("observed", "season") 78 | required_names_in_data <- all(required_names %in% data_names) 79 | if (!required_names_in_data) stop("Error clean_anomalies(): Output does not contain columns named observed and season. This may occur if the output was not detrended with time_decompose().", call. = FALSE) 80 | 81 | # Check - Required names from time_decompose() 82 | required_names <- c("anomaly") 83 | required_names_in_data <- all(required_names %in% data_names) 84 | if (!required_names_in_data) stop("Error clean_anomalies(): Output does not contain columns named anomaly. This may occur if the output was not anomalized with anomalize().", call. = FALSE) 85 | 86 | 87 | } 88 | 89 | 90 | get_method_col <- function(data) { 91 | 92 | data_names <- names(data) 93 | 94 | # Detect method - STL or Twitter 95 | method_names <- c("trend", "median_spans") 96 | method_name_in_data <- method_names %in% data_names 97 | 98 | method_names[method_name_in_data] 99 | 100 | } 101 | 102 | 103 | -------------------------------------------------------------------------------- /R/anomalize_methods.R: -------------------------------------------------------------------------------- 1 | #' Methods that power anomalize() 2 | #' 3 | #' @inheritParams anomalize 4 | #' @param x A vector of numeric data. 5 | #' @param verbose A boolean. If `TRUE`, will return a list containing useful information 6 | #' about the anomalies. If `FALSE`, just returns a vector of "Yes" / "No" values. 7 | #' 8 | #' @return Returns character vector or list depending on the value of `verbose`. 
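#'
#' For example, a short sketch of working with the `verbose = TRUE` output (the
#' components are those returned by the functions below):
#'
#' ``` r
#' set.seed(100)
#' x <- rnorm(100)
#'
#' out <- iqr(x, alpha = 0.05, max_anoms = 0.2, verbose = TRUE)
#' out$critical_limits  # named vector: limit_lower, limit_upper
#' out$outlier_report   # tibble ranking observations against the limits
#' ```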
9 | #' 10 | #' 11 | #' @seealso [anomalize()] 12 | #' 13 | #' @examples 14 | #' 15 | #' set.seed(100) 16 | #' x <- rnorm(100) 17 | #' idx_outliers <- sample(100, size = 5) 18 | #' x[idx_outliers] <- x[idx_outliers] + 10 19 | #' 20 | #' iqr(x, alpha = 0.05, max_anoms = 0.2) 21 | #' iqr(x, alpha = 0.05, max_anoms = 0.2, verbose = TRUE) 22 | #' 23 | #' gesd(x, alpha = 0.05, max_anoms = 0.2) 24 | #' gesd(x, alpha = 0.05, max_anoms = 0.2, verbose = TRUE) 25 | #' 26 | #' 27 | #' @references 28 | #' - The IQR method is used in [`forecast::tsoutliers()`](https://github.com/robjhyndman/forecast/blob/master/R/clean.R) 29 | #' - The GESD method is used in Twitter's [`AnomalyDetection`](https://github.com/twitter/AnomalyDetection) package and is also available as a function in [@raunakms's GESD method](https://github.com/raunakms/GESD/blob/master/runGESD.R) 30 | #' 31 | #' @name anomalize_methods 32 | 33 | # 1A. IQR Method ---- 34 | 35 | #' @export 36 | #' @rdname anomalize_methods 37 | iqr <- function(x, alpha = 0.05, max_anoms = 0.2, verbose = FALSE) { 38 | quantile_x <- stats::quantile(x, prob = c(0.25, 0.75), na.rm = TRUE) 39 | iq_range <- quantile_x[[2]] - quantile_x[[1]] 40 | limits <- quantile_x + (0.15 / alpha) * iq_range * c(-1, 1) 41 | 42 | outlier_idx <- ((x < limits[1]) | (x > limits[2])) 43 | outlier_vals <- x[outlier_idx] 44 | outlier_response <- ifelse(outlier_idx == TRUE, "Yes", "No") 45 | 46 | vals_tbl <- tibble::tibble(value = x) %>% 47 | tibble::rownames_to_column(var = "index") %>% 48 | # Establish limits and assess if outside of limits 49 | dplyr::mutate( 50 | limit_lower = limits[1], 51 | limit_upper = limits[2], 52 | abs_diff_lower = ifelse(value <= limit_lower, abs(value - limit_lower), 0), 53 | abs_diff_upper = ifelse(value >= limit_upper, abs(value - limit_upper), 0), 54 | max_abs_diff = ifelse(abs_diff_lower > abs_diff_upper, abs_diff_lower, abs_diff_upper) 55 | ) %>% 56 | dplyr::select(index, dplyr::everything()) %>% 57 | dplyr::select(-c(abs_diff_lower, abs_diff_upper)) %>% 58 | # Sort by absolute distance from centerline of limits 59 | dplyr::mutate( 60 | centerline = (limit_upper + limit_lower) / 2, 61 | sorting = abs(value - centerline) 62 | ) %>% 63 | dplyr::arrange(dplyr::desc(sorting)) %>% 64 | dplyr::select(-c(centerline, sorting)) %>% 65 | tibble::rownames_to_column(var = "rank") %>% 66 | dplyr::mutate( 67 | rank = as.numeric(rank), 68 | index = as.numeric(index) 69 | ) %>% 70 | # Identify outliers 71 | dplyr::arrange(dplyr::desc(max_abs_diff)) %>% 72 | dplyr::mutate( 73 | outlier = ifelse(max_abs_diff > 0, "Yes", "No"), 74 | below_max_anoms = ifelse(dplyr::row_number() / dplyr::n() > max_anoms, 75 | "No", "Yes" 76 | ), 77 | outlier_reported = ifelse(outlier == "Yes" & below_max_anoms == "Yes", 78 | "Yes", "No" 79 | ), 80 | direction = dplyr::case_when( 81 | (outlier_reported == "Yes") & (value > limit_upper) ~ "Up", 82 | (outlier_reported == "Yes") & (value < limit_lower) ~ "Down", 83 | TRUE ~ "NA" 84 | ), 85 | direction = ifelse(direction == "NA", NA, direction) 86 | ) 87 | 88 | vals_tbl_filtered <- vals_tbl %>% 89 | dplyr::filter(below_max_anoms == "Yes") %>% 90 | dplyr::select(-c(max_abs_diff:below_max_anoms)) %>% 91 | dplyr::rename(outlier = outlier_reported) 92 | 93 | # Critical Limits 94 | if (any(vals_tbl$outlier == "No")) { 95 | # Non outliers identified, pick first limit 96 | limit_tbl <- vals_tbl %>% 97 | dplyr::filter(outlier == "No") %>% 98 | dplyr::slice(1) 99 | limits_vec <- c( 100 | limit_lower = limit_tbl$limit_lower, 101 | limit_upper = 
limit_tbl$limit_upper 102 | ) 103 | } else { 104 | # All outliers, pick last limits 105 | limit_tbl <- vals_tbl %>% 106 | dplyr::slice(n()) 107 | limits_vec <- c( 108 | limit_lower = limit_tbl$limit_lower, 109 | limit_upper = limit_tbl$limit_upper 110 | ) 111 | } 112 | 113 | # Return results 114 | if (verbose) { 115 | outlier_list <- list( 116 | outlier = vals_tbl %>% dplyr::arrange(index) %>% dplyr::pull(outlier_reported), 117 | outlier_idx = vals_tbl %>% dplyr::filter(outlier_reported == "Yes") %>% dplyr::pull(index), 118 | outlier_vals = vals_tbl %>% dplyr::filter(outlier_reported == "Yes") %>% dplyr::pull(value), 119 | outlier_direction = vals_tbl %>% dplyr::filter(outlier_reported == "Yes") %>% dplyr::pull(direction), 120 | critical_limits = limits_vec, 121 | outlier_report = vals_tbl_filtered 122 | ) 123 | return(outlier_list) 124 | } else { 125 | return(vals_tbl %>% dplyr::arrange(index) %>% dplyr::pull(outlier_reported)) 126 | } 127 | } 128 | 129 | 130 | 131 | # 1B. GESD: Generalized Extreme Studentized Deviate Test ---- 132 | 133 | #' @export 134 | #' @rdname anomalize_methods 135 | gesd <- function(x, alpha = 0.05, max_anoms = 0.2, verbose = FALSE) { 136 | 137 | # Variables 138 | n <- length(x) 139 | r <- trunc(n * max_anoms) # use max anoms to limit loop 140 | R <- numeric(length = r) # test statistics for 'r' outliers 141 | 142 | lambda <- numeric(length = r) # critical values for 'r' outliers 143 | outlier_ind <- numeric(length = r) # removed outlier observation values 144 | outlier_val <- numeric(length = r) # removed outlier observation values 145 | m <- 0 # number of outliers 146 | x_new <- x # temporary observation values 147 | median_new <- numeric(length = r) 148 | mad_new <- numeric(length = r) 149 | 150 | # Outlier detection 151 | for (i in seq_len(r)) { 152 | 153 | # Compute test statistic 154 | median_new[i] <- median(x_new) 155 | mad_new[i] <- mad(x_new) 156 | 157 | z <- abs(x_new - median(x_new)) / (mad(x_new) + .Machine$double.eps) # Z-scores 158 | 159 | max_ind <- which(z == max(z), arr.ind = T)[1] # in case of ties, return first one 160 | R[i] <- z[max_ind] # max Z-score 161 | outlier_val[i] <- x_new[max_ind] # removed outlier observation values 162 | outlier_ind[i] <- which(x_new[max_ind] == x, arr.ind = T)[1] # index of removed outlier observation values 163 | x_new <- x_new[-max_ind] # remove observation that maximizes |x_i - x_mean| 164 | 165 | # Compute critical values 166 | p <- 1 - alpha / (2 * (n - i + 1)) # probability 167 | t_pv <- qt(p, df = (n - i - 1)) # Critical value from Student's t distribution 168 | lambda[i] <- ((n - i) * t_pv) / (sqrt((n - i - 1 + t_pv^2) * (n - i + 1))) 169 | 170 | # Find exact number of outliers 171 | # largest 'i' such that R_i > lambda_i 172 | if (!is.na(R[i]) & !is.na(lambda[i])) { # qt can produce NaNs 173 | if (R[i] > lambda[i]) { 174 | m <- i 175 | } 176 | } 177 | } 178 | 179 | vals_tbl <- tibble::tibble( 180 | rank = as.numeric(1:r), 181 | index = outlier_ind, 182 | value = outlier_val, 183 | test_statistic = R, 184 | critical_value = lambda, 185 | median = median_new, 186 | mad = mad_new, 187 | limit_lower = median - critical_value * mad, 188 | limit_upper = critical_value * mad + median 189 | ) %>% 190 | dplyr::mutate( 191 | outlier = ifelse(test_statistic > critical_value, "Yes", "No"), 192 | direction = dplyr::case_when( 193 | (outlier == "Yes") & (value > limit_upper) ~ "Up", 194 | (outlier == "Yes") & (value < limit_lower) ~ "Down", 195 | TRUE ~ "NA" 196 | ), 197 | direction = ifelse(direction == "NA", NA, 
direction) 198 | ) %>% 199 | dplyr::select(-c(test_statistic:mad)) 200 | 201 | outlier_index <- vals_tbl %>% dplyr::filter(outlier == "Yes") %>% dplyr::pull(index) 202 | outlier_idx <- seq_along(x) %in% outlier_index 203 | outlier_response <- ifelse(outlier_idx == TRUE, "Yes", "No") 204 | 205 | # Critical Limits 206 | if (any(vals_tbl$outlier == "No")) { 207 | # Non outliers identified, pick first limit 208 | limit_tbl <- vals_tbl %>% 209 | dplyr::filter(outlier == "No") %>% 210 | dplyr::slice(1) 211 | limits_vec <- c( 212 | limit_lower = limit_tbl$limit_lower, 213 | limit_upper = limit_tbl$limit_upper 214 | ) 215 | } else { 216 | # All outliers, pick last limits 217 | limit_tbl <- vals_tbl %>% 218 | dplyr::slice(n()) 219 | limits_vec <- c( 220 | limit_lower = limit_tbl$limit_lower, 221 | limit_upper = limit_tbl$limit_upper 222 | ) 223 | } 224 | 225 | # Return results 226 | if (verbose) { 227 | outlier_list <- list( 228 | outlier = outlier_response, 229 | outlier_idx = outlier_index, 230 | outlier_vals = vals_tbl %>% dplyr::filter(outlier == "Yes") %>% dplyr::pull(value), 231 | outlier_direction = vals_tbl %>% dplyr::filter(outlier == "Yes") %>% dplyr::pull(direction), 232 | critical_limits = limits_vec, 233 | outlier_report = vals_tbl 234 | ) 235 | return(outlier_list) 236 | } else { 237 | return(outlier_response) 238 | } 239 | } 240 | 241 | -------------------------------------------------------------------------------- /R/plot_anomalies.R: -------------------------------------------------------------------------------- 1 | #' Visualize the anomalies in one or multiple time series 2 | #' 3 | #' @param data A `tibble` or `tbl_time` object. 4 | #' @param time_recomposed A boolean. If `TRUE`, will use the `time_recompose()` bands to 5 | #' place bands as approximate limits around the "normal" data. 6 | #' @param ncol Number of columns to display. Set to 1 for single column by default. 7 | #' @param color_no Color for non-anomalous data. 8 | #' @param color_yes Color for anomalous data. 9 | #' @param fill_ribbon Fill color for the time_recomposed ribbon. 10 | #' @param alpha_dots Controls the transparency of the dots. Reduce when too many dots on the screen. 11 | #' @param alpha_circles Controls the transparency of the circles that identify anomalies. 12 | #' @param alpha_ribbon Controls the transparency of the time_recomposed ribbon. 13 | #' @param size_dots Controls the size of the dots. 14 | #' @param size_circles Controls the size of the circles that identify anomalies. 15 | #' 16 | #' @return Returns a `ggplot` object. 17 | #' 18 | #' @details 19 | #' Plotting function for visualizing anomalies on one or more time series. 20 | #' Multiple time series must be grouped using `dplyr::group_by()`. 
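#'
#' For example, a sketch of a grouped, styled plot (the argument values are
#' illustrative only):
#'
#' ``` r
#' library(dplyr)
#'
#' tidyverse_cran_downloads %>%
#'     time_decompose(count, method = "stl") %>%
#'     anomalize(remainder, method = "iqr") %>%
#'     time_recompose() %>%
#'     plot_anomalies(time_recomposed = TRUE, ncol = 3, alpha_dots = 0.5)
#' ```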
21 | #' 22 | #' @seealso [plot_anomaly_decomposition()] 23 | #' 24 | #' @examples 25 | #' 26 | #' \dontrun{ 27 | #' library(dplyr) 28 | #' library(ggplot2) 29 | #' 30 | #' 31 | #' #### SINGLE TIME SERIES #### 32 | #' tidyverse_cran_downloads %>% 33 | #' filter(package == "tidyquant") %>% 34 | #' ungroup() %>% 35 | #' time_decompose(count, method = "stl") %>% 36 | #' anomalize(remainder, method = "iqr") %>% 37 | #' time_recompose() %>% 38 | #' plot_anomalies(time_recomposed = TRUE) 39 | #' 40 | #' 41 | #' #### MULTIPLE TIME SERIES #### 42 | #' tidyverse_cran_downloads %>% 43 | #' time_decompose(count, method = "stl") %>% 44 | #' anomalize(remainder, method = "iqr") %>% 45 | #' time_recompose() %>% 46 | #' plot_anomalies(time_recomposed = TRUE, ncol = 3) 47 | #' } 48 | #' 49 | #' @export 50 | plot_anomalies <- function(data, time_recomposed = FALSE, ncol = 1, 51 | color_no = "#2c3e50", color_yes = "#e31a1c", fill_ribbon = "grey70", 52 | alpha_dots = 1, alpha_circles = 1, alpha_ribbon = 1, 53 | size_dots = 1.5, size_circles = 4) { 54 | 55 | UseMethod("plot_anomalies", data) 56 | } 57 | 58 | #' @export 59 | plot_anomalies.default <- function(data, time_recomposed = FALSE, ncol = 1, 60 | color_no = "#2c3e50", color_yes = "#e31a1c", fill_ribbon = "grey70", 61 | alpha_dots = 1, alpha_circles = 1, alpha_ribbon = 1, 62 | size_dots = 1.5, size_circles = 4) { 63 | stop("Object is not of class `tbl_time`.", call. = FALSE) 64 | } 65 | 66 | #' @export 67 | plot_anomalies.tbl_time <- function(data, time_recomposed = FALSE, ncol = 1, 68 | color_no = "#2c3e50", color_yes = "#e31a1c", fill_ribbon = "grey70", 69 | alpha_dots = 1, alpha_circles = 1, alpha_ribbon = 1, 70 | size_dots = 1.5, size_circles = 4) { 71 | 72 | # Checks 73 | column_names <- names(data) 74 | check_names <- c("observed", "anomaly") %in% column_names 75 | if (!all(check_names)) stop('Error in plot_anomalies(): key names are missing. Make sure observed:remainder, anomaly, recomposed_l1, and recomposed_l2 are present', call. = FALSE) 76 | 77 | # Setup 78 | date_expr <- tibbletime::get_index_quo(data) 79 | date_col <- tibbletime::get_index_char(data) 80 | 81 | g <- data %>% 82 | ggplot2::ggplot(ggplot2::aes(x = .data[[date_col]], y = .data[["observed"]])) 83 | 84 | 85 | if (time_recomposed) { 86 | check_names <- c("recomposed_l1", "recomposed_l2") %in% column_names 87 | if (!all(check_names)) stop('Error in plot_anomalies(): key names are missing. Make sure recomposed_l1 and recomposed_l2 are present', call. 
= FALSE) 88 | 89 | g <- g + 90 | ggplot2::geom_ribbon(ggplot2::aes(ymin = recomposed_l1, ymax = recomposed_l2), 91 | fill = fill_ribbon) 92 | 93 | } 94 | 95 | g <- g + 96 | ggplot2::geom_point(ggplot2::aes(color = .data[["anomaly"]]), size = size_dots, alpha = alpha_dots) + 97 | ggplot2::geom_point(ggplot2::aes(x = .data[[date_col]], y = .data[["observed"]], color = .data[["anomaly"]]), 98 | size = size_circles, shape = 1, alpha = alpha_circles, 99 | data = data %>% dplyr::filter(anomaly == "Yes"), 100 | inherit.aes = FALSE) + 101 | theme_tq() + 102 | ggplot2::scale_color_manual(values = c("No" = color_no, "Yes" = color_yes)) + 103 | ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 30, hjust = 1)) 104 | 105 | 106 | 107 | 108 | if (dplyr::is.grouped_df(data)) { 109 | 110 | facet_group <- dplyr::groups(data) %>% 111 | purrr::map(quo_name) %>% 112 | unlist() %>% 113 | paste0(collapse = " + ") 114 | 115 | g <- g + 116 | ggplot2::facet_wrap(as.formula(paste0(" ~ ", facet_group)), 117 | scales = "free_y", ncol = ncol) 118 | } 119 | 120 | return(g) 121 | 122 | } 123 | -------------------------------------------------------------------------------- /R/plot_anomaly_decomposition.R: -------------------------------------------------------------------------------- 1 | #' Visualize the time series decomposition with anomalies shown 2 | #' 3 | #' @param data A `tibble` or `tbl_time` object. 4 | #' @param ncol Number of columns to display. Set to 1 for single column by default. 5 | #' @param color_no Color for non-anomalous data. 6 | #' @param color_yes Color for anomalous data. 7 | #' @param alpha_dots Controls the transparency of the dots. Reduce when too many dots on the screen. 8 | #' @param alpha_circles Controls the transparency of the circles that identify anomalies. 9 | #' @param size_dots Controls the size of the dots. 10 | #' @param size_circles Controls the size of the circles that identify anomalies. 11 | #' @param strip.position Controls the placement of the strip that identifies the time series decomposition components. 12 | #' 13 | #' @return Returns a `ggplot` object. 14 | #' 15 | #' @details 16 | #' The first step in reviewing the anomaly detection process is to evaluate 17 | #' a single times series to observe how the algorithm is selecting anomalies. 18 | #' The `plot_anomaly_decomposition()` function is used to gain 19 | #' an understanding as to whether or not the method is detecting anomalies correctly and 20 | #' whether or not parameters such as decomposition method, anomalize method, 21 | #' alpha, frequency, and so on should be adjusted. 
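#'
#' For example, a sketch of iterating on the parameters for a single series (the
#' method, frequency, and alpha values below are illustrative, not recommendations):
#'
#' ``` r
#' library(dplyr)
#'
#' tidyverse_cran_downloads %>%
#'     filter(package == "lubridate") %>%
#'     ungroup() %>%
#'     time_decompose(count, method = "twitter", frequency = "1 week") %>%
#'     anomalize(remainder, method = "gesd", alpha = 0.025) %>%
#'     plot_anomaly_decomposition()
#' ```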
22 | #' 23 | #' @seealso [plot_anomalies()] 24 | #' 25 | #' @examples 26 | #' 27 | #' library(dplyr) 28 | #' library(ggplot2) 29 | #' 30 | #' tidyverse_cran_downloads %>% 31 | #' filter(package == "tidyquant") %>% 32 | #' ungroup() %>% 33 | #' time_decompose(count, method = "stl") %>% 34 | #' anomalize(remainder, method = "iqr") %>% 35 | #' plot_anomaly_decomposition() 36 | #' 37 | #' @export 38 | plot_anomaly_decomposition <- function(data, ncol = 1, color_no = "#2c3e50", color_yes = "#e31a1c", 39 | alpha_dots = 1, alpha_circles = 1, size_dots = 1.5, size_circles = 4, 40 | strip.position = "right") { 41 | UseMethod("plot_anomaly_decomposition", data) 42 | 43 | } 44 | 45 | #' @export 46 | plot_anomaly_decomposition.default <- function(data, ncol = 1, color_no = "#2c3e50", color_yes = "#e31a1c", 47 | alpha_dots = 1, alpha_circles = 1, size_dots = 1.5, size_circles = 4, 48 | strip.position = "right") { 49 | stop("Object is not of class `tbl_time`.", call. = FALSE) 50 | 51 | 52 | } 53 | 54 | #' @export 55 | plot_anomaly_decomposition.grouped_tbl_time <- function(data, ncol = 1, color_no = "#2c3e50", color_yes = "#e31a1c", 56 | alpha_dots = 1, alpha_circles = 1, size_dots = 1.5, size_circles = 4, 57 | strip.position = "right") { 58 | stop("Object cannot be grouped. Select a single time series for evaluation, and use `dplyr::ungroup()`.", call. = FALSE) 59 | 60 | 61 | } 62 | 63 | #' @export 64 | plot_anomaly_decomposition.tbl_time <- function(data, ncol = 1, color_no = "#2c3e50", color_yes = "#e31a1c", 65 | alpha_dots = 1, alpha_circles = 1, size_dots = 1.5, size_circles = 4, 66 | strip.position = "right") { 67 | 68 | # Checks 69 | column_names <- names(data) 70 | check_names <- c("observed", "remainder", "anomaly", "remainder_l1", "remainder_l2") %in% column_names 71 | if (!all(check_names)) stop('Error in plot_anomaly_decomposition(): key names are missing. Make sure observed:remainder, remainder_l1, and remainder_l2 are present', call. = FALSE) 72 | 73 | 74 | # Setup 75 | date_expr <- tibbletime::get_index_quo(data) 76 | date_col <- tibbletime::get_index_char(data) 77 | 78 | data_anomaly_tbl <- data %>% 79 | dplyr::select(!!date_expr, observed:remainder, anomaly) %>% 80 | tidyr::gather(key = key, value = value, -dplyr::one_of(c(!! date_col, 'anomaly')), factor_key = T) 81 | 82 | g <- data_anomaly_tbl %>% 83 | ggplot2::ggplot(ggplot2::aes(x = .data[[date_col]], y = .data$value, color = .data$anomaly)) + 84 | # Points 85 | ggplot2::geom_point(size = size_dots, alpha = alpha_dots) + 86 | # Circles 87 | ggplot2::geom_point(size = size_circles, shape = 1, alpha = alpha_circles, 88 | data = data_anomaly_tbl %>% dplyr::filter(anomaly == "Yes")) + 89 | # Horizontal Line at Y = 0 90 | ggplot2::geom_hline(yintercept = 0, color = palette_light()[[1]]) + 91 | theme_tq() + 92 | ggplot2::facet_wrap(~ key, ncol = ncol, scales = "free_y", strip.position = strip.position) + 93 | ggplot2::scale_color_manual(values = c("No" = color_no, "Yes" = color_yes)) + 94 | ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 30, hjust = 1)) 95 | 96 | 97 | return(g) 98 | 99 | } 100 | -------------------------------------------------------------------------------- /R/prep_tbl_time.R: -------------------------------------------------------------------------------- 1 | #' Automatically create tibbletime objects from tibbles 2 | #' 3 | #' @param data A `tibble`. 4 | #' @param message A boolean. 
If `TRUE`, returns a message indicating any 5 | #' conversion details important to know during the conversion to `tbl_time` class. 6 | #' 7 | #' @return Returns a `tibbletime` object of class `tbl_time`. 8 | #' 9 | #' @details 10 | #' Detects a date or datetime index column and automatically 11 | #' 12 | #' 13 | #' @examples 14 | #' 15 | #' library(dplyr) 16 | #' library(tibbletime) 17 | #' 18 | #' data_tbl <- tibble( 19 | #' date = seq.Date(from = as.Date("2018-01-01"), by = "day", length.out = 10), 20 | #' value = rnorm(10) 21 | #' ) 22 | #' 23 | #' prep_tbl_time(data_tbl) 24 | #' 25 | #' @export 26 | prep_tbl_time <- function(data, message = FALSE) { 27 | UseMethod("prep_tbl_time", data) 28 | } 29 | 30 | #' @export 31 | prep_tbl_time.default <- function(data, message = FALSE) { 32 | stop("Object is not of class `data.frame`.", call. = FALSE) 33 | } 34 | 35 | 36 | #' @export 37 | prep_tbl_time.data.frame <- function(data, message = FALSE) { 38 | 39 | cl <- class(data)[[1]] 40 | 41 | idx <- tryCatch(timetk::tk_get_timeseries_variables(data)[[1]], error = function(e) stop("Error in prep_tbl_time(): No date or datetime column found.")) 42 | 43 | data <- data %>% 44 | tibbletime::as_tbl_time(index = !! rlang::sym(idx)) 45 | 46 | if (message) message(glue::glue("Converting from {cl} to {class(data)[[1]]}. 47 | Auto-index message: index = {idx}")) 48 | 49 | return(data) 50 | } 51 | 52 | #' @export 53 | prep_tbl_time.tbl_time <- function(data, message = FALSE) { 54 | return(data) 55 | } 56 | 57 | -------------------------------------------------------------------------------- /R/tidyquant_theme_compat.R: -------------------------------------------------------------------------------- 1 | # tidyquant functions copied to remove dependency on tidyquant 2 | 3 | #' @importFrom ggplot2 %+replace% 4 | 5 | theme_tq <- function(base_size = 11, base_family = "") { 6 | 7 | # Tidyquant colors 8 | blue <- "#2c3e50" 9 | green <- "#18BC9C" 10 | white <- "#FFFFFF" 11 | grey <- "grey80" 12 | 13 | # Starts with theme_grey and then modify some parts 14 | ggplot2::theme_grey(base_size = base_size, base_family = base_family) %+replace% 15 | ggplot2::theme( 16 | 17 | # Base Inherited Elements 18 | line = ggplot2::element_line(colour = blue, linewidth = 0.5, linetype = 1, 19 | lineend = "butt"), 20 | rect = ggplot2::element_rect(fill = white, colour = blue, 21 | linewidth = 0.5, linetype = 1), 22 | text = ggplot2::element_text(family = base_family, face = "plain", 23 | colour = blue, size = base_size, 24 | lineheight = 0.9, hjust = 0.5, vjust = 0.5, angle = 0, 25 | margin = ggplot2::margin(), debug = FALSE), 26 | 27 | # Axes 28 | axis.line = ggplot2::element_blank(), 29 | axis.text = ggplot2::element_text(size = ggplot2::rel(0.8)), 30 | axis.ticks = ggplot2::element_line(color = grey, linewidth = ggplot2::rel(1/3)), 31 | axis.title = ggplot2::element_text(size = ggplot2::rel(1.0)), 32 | 33 | # Panel 34 | panel.background = ggplot2::element_rect(fill = white, color = NA), 35 | panel.border = ggplot2::element_rect(fill = NA, linewidth = ggplot2::rel(1/2), color = blue), 36 | panel.grid.major = ggplot2::element_line(color = grey, linewidth = ggplot2::rel(1/3)), 37 | panel.grid.minor = ggplot2::element_line(color = grey, linewidth = ggplot2::rel(1/3)), 38 | panel.grid.minor.x = ggplot2::element_blank(), 39 | panel.spacing = ggplot2::unit(.75, "cm"), 40 | 41 | # Legend 42 | legend.key = ggplot2::element_rect(fill = white, color = NA), 43 | legend.position = "bottom", 44 | 45 | # Strip (Used with multiple panels) 46 | 
strip.background = ggplot2::element_rect(fill = blue, color = blue), 47 | strip.text = ggplot2::element_text(color = white, size = ggplot2::rel(0.8), margin = ggplot2::margin(t = 5, b = 5)), 48 | 49 | # Plot 50 | plot.title = ggplot2::element_text(size = ggplot2::rel(1.2), hjust = 0, 51 | margin = ggplot2::margin(t = 0, r = 0, b = 4, l = 0, unit = "pt")), 52 | plot.subtitle = ggplot2::element_text(size = ggplot2::rel(0.9), hjust = 0, 53 | margin = ggplot2::margin(t = 0, r = 0, b = 3, l = 0, unit = "pt")), 54 | 55 | # Complete theme 56 | complete = TRUE 57 | ) 58 | } 59 | 60 | theme_tq_dark <- function(base_size = 11, base_family = "") { 61 | 62 | # Tidyquant colors 63 | blue <- "#2c3e50" 64 | green <- "#18BC9C" 65 | white <- "#FFFFFF" 66 | grey <- "grey50" 67 | 68 | # Starts with theme_tq and then invert some colors 69 | theme_tq(base_size = base_size, base_family = base_family) %+replace% 70 | ggplot2::theme( 71 | 72 | # Axes 73 | axis.ticks = ggplot2::element_line(color = blue, linewidth = ggplot2::rel(1/3)), 74 | 75 | # Panel 76 | panel.background = ggplot2::element_rect(fill = grey, color = NA), 77 | panel.grid.major = ggplot2::element_line(color = white, linewidth = ggplot2::rel(1/3)), 78 | panel.grid.minor = ggplot2::element_line(color = white, linewidth = ggplot2::rel(1/3)), 79 | 80 | # Complete theme 81 | complete = TRUE 82 | ) 83 | } 84 | 85 | theme_tq_green <- function(base_size = 11, base_family = "") { 86 | 87 | # Tidyquant colors 88 | blue <- "#2c3e50" 89 | green <- "#18BC9C" 90 | white <- "#FFFFFF" 91 | grey <- "grey80" 92 | 93 | # Starts with theme_tq and then invert some colors 94 | theme_tq(base_size = base_size, base_family = base_family) %+replace% 95 | ggplot2::theme( 96 | 97 | # Axes 98 | axis.ticks = ggplot2::element_line(color = blue, linewidth = ggplot2::rel(1/3)), 99 | 100 | # Panel 101 | panel.background = ggplot2::element_rect(fill = green, color = NA), 102 | panel.grid.major = ggplot2::element_line(color = white, linewidth = ggplot2::rel(1/3)), 103 | panel.grid.minor = ggplot2::element_line(color = white, linewidth = ggplot2::rel(1/3)), 104 | 105 | # Complete theme 106 | complete = TRUE 107 | ) 108 | } 109 | 110 | scale_color_tq <- function(..., theme = "light") { 111 | 112 | pal <- switch(theme, 113 | "light" = unname(palette_light()) %>% rep(100), 114 | "dark" = unname(palette_dark()) %>% rep(100), 115 | "green" = unname(palette_green() %>% rep(100)) 116 | ) 117 | 118 | ggplot2::scale_color_manual(values = pal) 119 | } 120 | 121 | palette_light <- function() { 122 | c( 123 | blue = "#2c3e50", # blue 124 | red = "#e31a1c", # red 125 | green = "#18BC9C", # green 126 | yellow = "#CCBE93", # yellow 127 | steel_blue = "#a6cee3", # steel_blue 128 | navy_blue = "#1f78b4", # navy_blue 129 | light_green = "#b2df8a", # light_green 130 | pink = "#fb9a99", # pink 131 | light_orange = "#fdbf6f", # light_orange 132 | orange = "#ff7f00", # orange 133 | light_purple = "#cab2d6", # light_purple 134 | purple = "#6a3d9a" # purple 135 | ) %>% toupper() 136 | } 137 | 138 | palette_dark <- function() { 139 | # Brighter version of palette_light 140 | c( 141 | blue = "#0055AA", # blue 142 | red = "#C40003", # red 143 | green = "#00C19B", # green 144 | yellow = "#EAC862", # yellow 145 | steel_blue = "#7FD2FF", # steel_blue 146 | navy_blue = "#007ED3", # navy_blue 147 | light_green = "#b2df8a", # light_green 148 | pink = "#FFACAA", # pink 149 | light_orange = "#FF9D1E", # light_orange 150 | lime_green = "#C3EF00", # lime_green 151 | light_purple = "#cab2d6", # light_purple 152 | purple 
= "#894FC6" # purple 153 | ) %>% toupper() 154 | } 155 | 156 | palette_green <- function() { 157 | # Green compatible version of palette_light 158 | c( 159 | blue = "#0055AA", # blue 160 | red = "#C40003", # red 161 | yellow = "#EAC862", # yellow 162 | steel_blue = "#7FD2FF", # steel_blue 163 | navy_blue = "#007ED3", # navy_blue 164 | creme = "#F6F4F3", # creme 165 | pink = "#FFACAA", # pink 166 | light_orange = "#FF9D1E", # light_orange 167 | lime_green = "#C3EF00", # lime_green 168 | light_purple = "#cab2d6", # light_purple 169 | purple = "#894FC6", # purple 170 | brown = "#592E2E" # brown 171 | ) %>% toupper() 172 | } 173 | 174 | palette_light <- function() { 175 | c( 176 | blue = "#2c3e50", # blue 177 | red = "#e31a1c", # red 178 | green = "#18BC9C", # green 179 | yellow = "#CCBE93", # yellow 180 | steel_blue = "#a6cee3", # steel_blue 181 | navy_blue = "#1f78b4", # navy_blue 182 | light_green = "#b2df8a", # light_green 183 | pink = "#fb9a99", # pink 184 | light_orange = "#fdbf6f", # light_orange 185 | orange = "#ff7f00", # orange 186 | light_purple = "#cab2d6", # light_purple 187 | purple = "#6a3d9a" # purple 188 | ) %>% toupper() 189 | } 190 | -------------------------------------------------------------------------------- /R/tidyverse_cran_downloads.R: -------------------------------------------------------------------------------- 1 | #' Downloads of various "tidyverse" packages from CRAN 2 | #' 3 | #' A dataset containing the daily download counts from 2017-01-01 to 2018-03-01 4 | #' for the following tidyverse packages: 5 | #' - `tidyr` 6 | #' - `lubridate` 7 | #' - `dplyr` 8 | #' - `broom` 9 | #' - `tidyquant` 10 | #' - `tidytext` 11 | #' - `ggplot2` 12 | #' - `purrr` 13 | #' - `stringr` 14 | #' - `forcats` 15 | #' - `knitr` 16 | #' - `readr` 17 | #' - `tibble` 18 | #' - `tidyverse` 19 | #' 20 | #' 21 | #' @format A `grouped_tbl_time` object with 6,375 rows and 3 variables: 22 | #' \describe{ 23 | #' \item{date}{Date of the daily observation} 24 | #' \item{count}{Number of downloads that day} 25 | #' \item{package}{The package corresponding to the daily download number} 26 | #' } 27 | #' 28 | #' @source 29 | #' The package downloads come from CRAN by way of the `cranlogs` package. 30 | "tidyverse_cran_downloads" 31 | -------------------------------------------------------------------------------- /R/time_apply.R: -------------------------------------------------------------------------------- 1 | #' Apply a function to a time series by period 2 | #' 3 | #' @inheritParams tibbletime::collapse_by 4 | #' @param data A `tibble` with a date or datetime index. 5 | #' @param target A column to apply the function to 6 | #' @param period A time-based definition (e.g. "1 week"). 7 | #' or a numeric number of observations per frequency (e.g. 10). 8 | #' See [tibbletime::collapse_by()] for period notation. 9 | #' @param .fun A function to apply (e.g. `median`) 10 | #' @param ... Additional parameters passed to the function, `.fun` 11 | #' @param message A boolean. If `message = TRUE`, the frequency used is output 12 | #' along with the units in the scale of the data. 13 | #' 14 | #' @return Returns a `tibbletime` object of class `tbl_time`. 15 | #' 16 | #' @details 17 | #' Uses a time-based period to apply functions to. This is useful in circumstances where you want to 18 | #' compare the observation values to aggregated values such as `mean()` or `median()` 19 | #' during a set time-based period. 
The returned output extends the 20 | #' length of the data frame so the differences can easily be computed. 21 | #' 22 | #' 23 | #' @examples 24 | #' 25 | #' library(dplyr) 26 | #' 27 | #' # Basic Usage 28 | #' tidyverse_cran_downloads %>% 29 | #' time_apply(count, period = "1 week", .fun = mean, na.rm = TRUE) 30 | #' 31 | #' @export 32 | time_apply <- function(data, target, period, .fun, ..., 33 | start_date = NULL, side = "end", clean = FALSE, message = TRUE) { 34 | 35 | UseMethod("time_apply", data) 36 | 37 | } 38 | 39 | #' @export 40 | time_apply.default <- function(data, target, period, .fun, ..., 41 | start_date = NULL, side = "end", clean = FALSE, message = TRUE) { 42 | stop("Object is not of class `tbl_df` or `tbl_time`.", call. = FALSE) 43 | } 44 | 45 | 46 | #' @export 47 | time_apply.data.frame <- function(data, target, period, .fun, ..., 48 | start_date = NULL, side = "end", clean = FALSE, message = TRUE) { 49 | 50 | # Checks 51 | if (missing(target)) stop('Error in time_apply(): argument "target" is missing, with no default', call. = FALSE) 52 | if (missing(period)) stop('Error in time_apply(): argument "period" is missing, with no default', call. = FALSE) 53 | if (missing(.fun)) stop('Error in time_apply(): argument ".fun" is missing, with no default', call. = FALSE) 54 | 55 | 56 | # Setup inputs 57 | data <- prep_tbl_time(data, message = F) 58 | 59 | date_col_expr <- tibbletime::get_index_quo(data) 60 | date_col_name <- dplyr::quo_name(date_col_expr) 61 | 62 | target_expr <- dplyr::enquo(target) 63 | 64 | # Function apply logic 65 | if (is.character(period)) { 66 | # See collapse_by for valid character sequences (e.g. "1 Y") 67 | ret <- data %>% 68 | tibbletime::collapse_by(period = period, clean = clean, start_date = start_date, side = side) %>% 69 | dplyr::group_by(!! tibbletime::get_index_quo(.)) %>% 70 | dplyr::mutate(time_apply = .fun(!! target_expr, ...)) %>% 71 | dplyr::ungroup() %>% 72 | dplyr::mutate(!! date_col_name := data %>% dplyr::pull(!! date_col_expr)) 73 | 74 | } else { 75 | # Numeric (e.g. every 15 data points) 76 | ret <- data %>% 77 | dplyr::mutate( 78 | .period_groups = c(0, (1:(nrow(.) - 1) %/% period)) 79 | ) %>% 80 | dplyr::group_by(.period_groups) %>% 81 | dplyr::mutate( 82 | time_apply = .fun(!! target_expr, ...) 83 | ) %>% 84 | dplyr::ungroup() %>% 85 | dplyr::select(-.period_groups) 86 | } 87 | 88 | return(ret) 89 | 90 | } 91 | 92 | #' @export 93 | time_apply.grouped_df <- function(data, target, period, .fun, ..., 94 | start_date = NULL, side = "end", clean = FALSE, message = TRUE) { 95 | 96 | # Checks 97 | if (missing(target)) stop('Error in time_apply(): argument "target" is missing, with no default', call. = FALSE) 98 | if (missing(period)) stop('Error in time_apply(): argument "period" is missing, with no default', call. = FALSE) 99 | if (missing(.fun)) stop('Error in time_apply(): argument ".fun" is missing, with no default', call. = FALSE) 100 | 101 | 102 | # Setup 103 | data <- prep_tbl_time(data, message = F) 104 | 105 | target_expr <- dplyr::enquo(target) 106 | 107 | # Map time_apply.data.frame 108 | ret <- data %>% 109 | grouped_mapper( 110 | .f = time_apply, 111 | target = !! target_expr, 112 | period = period, 113 | .fun = .fun, 114 | ... 
= ..., 115 | start_date = start_date, 116 | side = side, 117 | clean = clean, 118 | message = message) 119 | 120 | return(ret) 121 | 122 | } 123 | 124 | -------------------------------------------------------------------------------- /R/time_decompose.R: -------------------------------------------------------------------------------- 1 | #' Decompose a time series in preparation for anomaly detection 2 | #' 3 | #' @inheritParams anomalize 4 | #' @param data A `tibble` or `tbl_time` object. 5 | #' @param method The time series decomposition method. One of `"stl"` or `"twitter"`. 6 | #' The STL method uses seasonal decomposition (see [decompose_stl()]). 7 | #' The Twitter method uses `trend` to remove the trend (see [decompose_twitter()]). 8 | #' @param frequency Controls the seasonal adjustment (removal of seasonality). 9 | #' Input can be either "auto", a time-based definition (e.g. "1 week"), 10 | #' or a numeric number of observations per frequency (e.g. 10). 11 | #' Refer to [time_frequency()]. 12 | #' @param trend Controls the trend component. 13 | #' For stl, the trend controls the sensitivity of the lowess smoother, which is used to remove the trend and isolate the remainder. 14 | #' For twitter, the trend controls the period width of the median spans, which are used to remove the trend and center the remainder. 15 | #' @param ... Additional parameters passed to the underlying method functions. 16 | #' @param merge A boolean. `FALSE` by default. If `TRUE`, will append results to the original data. 17 | #' @param message A boolean. If `TRUE`, will output information related to `tbl_time` conversions, frequencies, 18 | #' and trend / median spans (if applicable). 19 | #' 20 | #' @return Returns a `tbl_time` object. 21 | #' 22 | #' @details 23 | #' The `time_decompose()` function generates a time series decomposition on 24 | #' `tbl_time` objects. The function is "tidy" in the sense that it works 25 | #' on data frames. It is designed to work with time-based data, and as such 26 | #' must have a column that contains date or datetime information. The function 27 | #' also works with grouped data. The function implements several methods 28 | #' of time series decomposition, each with benefits. 29 | #' 30 | #' __STL__: 31 | #' 32 | #' The STL method (`method = "stl"`) implements time series decomposition using 33 | #' the underlying [decompose_stl()] function. If you are familiar with [stats::stl()], 34 | #' the function is a "tidy" version that is designed to work with `tbl_time` objects. 35 | #' The decomposition separates the "season" and "trend" components from 36 | #' the "observed" values, leaving the "remainder" for anomaly detection. 37 | #' The user can control two parameters: `frequency` and `trend`. 38 | #' The `frequency` parameter adjusts the "season" component that is removed 39 | #' from the "observed" values. The `trend` parameter adjusts the 40 | #' trend window (the `t.window` parameter from `stl()`) that is used. 41 | #' The user may supply both `frequency` 42 | #' and `trend` as time-based durations (e.g. "90 days"), numeric values 43 | #' (e.g. 180), or "auto", which predetermines the frequency and/or trend 44 | #' based on the scale of the time series. 45 | #' 46 | #' 47 | #' __Twitter__: 48 | #' 49 | #' The Twitter method (`method = "twitter"`) implements time series decomposition using 50 | #' the methodology from the Twitter [AnomalyDetection](https://github.com/twitter/AnomalyDetection) package.
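Before the Twitter details, here are the STL knobs described above in code form (a sketch; the package's own examples appear further below):

``` r
library(dplyr)
library(anomalize)

# STL decomposition with explicit time-based spans instead of "auto"
tidyverse_cran_downloads %>%
  filter(package == "lubridate") %>%
  ungroup() %>%
  time_decompose(count, method = "stl",
                 frequency = "1 week", trend = "3 months")
```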
51 | #' The decomposition separates the "seasonal" component and then removes 52 | #' the median data, which is a different approach than the STL method for removing 53 | #' the trend. This approach works very well for low-growth + high seasonality data. 54 | #' STL may be a better approach when trend is a large factor. 55 | #' The user can control two parameters: `frequency` and `trend`. 56 | #' The `frequency` parameter adjusts the "season" component that is removed 57 | #' from the "observed" values. The `trend` parameter adjusts the 58 | #' period width of the median spans that are used. The user may supply both `frequency` 59 | #' and `trend` as time-based durations (e.g. "90 days") or numeric values 60 | #' (e.g. 180) or "auto", which predetermines the frequency and/or median spans 61 | #' based on the scale of the time series. 62 | #' 63 | #' @references 64 | #' 1. CLEVELAND, R. B., CLEVELAND, W. S., MCRAE, J. E., AND TERPENNING, I. 65 | #' STL: A Seasonal-Trend Decomposition Procedure Based on Loess. Journal of Official Statistics, Vol. 6, No. 1 (1990), pp. 3-73. 66 | #' 2. [Owen S. Vallis, Jordan Hochenbaum and Arun Kejariwal (2014). 67 | #' A Novel Technique for Long-Term Anomaly Detection in the Cloud. Twitter Inc.](https://www.usenix.org/system/files/conference/hotcloud14/hotcloud14-vallis.pdf) 68 | #' 3. [Owen S. Vallis, Jordan Hochenbaum and Arun Kejariwal (2014). AnomalyDetection: Anomaly Detection Using 69 | #' Seasonal Hybrid Extreme Studentized Deviate Test. R package version 1.0.](https://github.com/twitter/AnomalyDetection) 70 | #' 71 | #' @seealso 72 | #' Decomposition Methods (Powers `time_decompose`) 73 | #' - [decompose_stl()] 74 | #' - [decompose_twitter()] 75 | #' 76 | #' Time Series Anomaly Detection Functions (anomaly detection workflow): 77 | #' - [anomalize()] 78 | #' - [time_recompose()] 79 | #' 80 | #' @examples 81 | #' 82 | #' library(dplyr) 83 | #' 84 | #' # Basic Usage 85 | #' tidyverse_cran_downloads %>% 86 | #' time_decompose(count, method = "stl") 87 | #' 88 | #' # twitter 89 | #' tidyverse_cran_downloads %>% 90 | #' time_decompose(count, 91 | #' method = "twitter", 92 | #' frequency = "1 week", 93 | #' trend = "2 months", 94 | #' merge = TRUE, 95 | #' message = FALSE) 96 | #' 97 | #' @export 98 | time_decompose <- function(data, target, method = c("stl", "twitter"), 99 | frequency = "auto", trend = "auto", ..., merge = FALSE, message = TRUE) { 100 | UseMethod("time_decompose", data) 101 | } 102 | 103 | #' @export 104 | time_decompose.default <- function(data, target, method = c("stl", "twitter"), 105 | frequency = "auto", trend = "auto", ..., merge = FALSE, message = TRUE) { 106 | stop("Error time_decompose(): Object is not of class `tbl_df` or `tbl_time`.", call. = FALSE) 107 | } 108 | 109 | #' @export 110 | time_decompose.tbl_time <- function(data, target, method = c("stl", "twitter"), 111 | frequency = "auto", trend = "auto", ..., merge = FALSE, message = TRUE) { 112 | 113 | # Checks 114 | if (missing(target)) stop('Error in time_decompose(): argument "target" is missing, with no default', call. = FALSE) 115 | 116 | # Setup 117 | target_expr <- dplyr::enquo(target) 118 | method <- tolower(method[[1]]) 119 | 120 | # Set method 121 | if (method == "twitter") { 122 | decomp_tbl <- data %>% 123 | decompose_twitter(!! target_expr, frequency = frequency, trend = trend, message = message, ...) 124 | } else if (method == "stl") { 125 | decomp_tbl <- data %>% 126 | decompose_stl(!! target_expr, frequency = frequency, trend = trend, message = message, ...) 
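    # At this point `decomp_tbl` holds one row per observation with the
    # decomposition columns appended: for method = "stl" ->
    #   date, observed, season, trend, remainder
    # and for method = "twitter" ->
    #   date, observed, season, median_spans, remainder
    # (see decompose_stl() / decompose_twitter() in time_decompose_methods.R)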
127 | # } else if (method == "multiplicative") { 128 | # decomp_tbl <- data %>% 129 | # decompose_multiplicative(!! target_expr, frequency = frequency, message = message, ...) 130 | } else { 131 | stop(paste0("method = '", method[[1]], "' is not a valid option.")) 132 | } 133 | 134 | # Merge if desired 135 | if (merge) { 136 | ret <- merge_two_tibbles(data, decomp_tbl, .f = time_decompose) 137 | } else { 138 | ret <- decomp_tbl 139 | } 140 | 141 | return(ret) 142 | 143 | } 144 | 145 | #' @export 146 | time_decompose.tbl_df <- function(data, target, method = c("stl", "twitter"), 147 | frequency = "auto", trend = "auto", ..., merge = FALSE, message = TRUE) { 148 | 149 | # Checks 150 | if (missing(target)) stop('Error in time_decompose(): argument "target" is missing, with no default', call. = FALSE) 151 | 152 | # Prep 153 | data <- prep_tbl_time(data, message = message) 154 | 155 | # Send to time_decompose.tbl_time 156 | time_decompose(data = data, 157 | target = !! dplyr::enquo(target), 158 | method = method[[1]], 159 | frequency = frequency, 160 | trend = trend, 161 | ... = ..., 162 | merge = merge, 163 | message = message) 164 | 165 | } 166 | 167 | 168 | 169 | 170 | #' @export 171 | time_decompose.grouped_tbl_time <- function(data, target, method = c("stl", "twitter"), 172 | frequency = "auto", trend = "auto", ..., merge = FALSE, message = FALSE) { 173 | 174 | # Checks 175 | if (missing(target)) stop('Error in time_decompose(): argument "target" is missing, with no default', call. = FALSE) 176 | 177 | # Setup 178 | target_expr <- dplyr::enquo(target) 179 | 180 | # Mapping 181 | ret <- data %>% 182 | grouped_mapper( 183 | .f = time_decompose, 184 | target = !! target_expr, 185 | method = method[[1]], 186 | frequency = frequency, 187 | trend = trend, 188 | ... = ..., 189 | merge = merge, 190 | message = message) 191 | 192 | return(ret) 193 | 194 | } 195 | 196 | #' @export 197 | time_decompose.grouped_df <- function(data, target, method = c("stl", "twitter"), 198 | frequency = "auto", trend = "auto", ..., merge = FALSE, message = FALSE) { 199 | 200 | data <- prep_tbl_time(data, message = message) 201 | 202 | # Send to grouped_tbl_time 203 | time_decompose(data = data, 204 | target = !! dplyr::enquo(target), 205 | method = method[[1]], 206 | frequency = frequency, 207 | trend = trend, 208 | ... = ..., 209 | merge = merge, 210 | message = message) 211 | 212 | } 213 | 214 | 215 | -------------------------------------------------------------------------------- /R/time_decompose_methods.R: -------------------------------------------------------------------------------- 1 | #' Methods that power time_decompose() 2 | #' 3 | #' @inheritParams time_decompose 4 | #' 5 | #' @return A `tbl_time` object containing the time series decomposition. 6 | #' 7 | #' @seealso [time_decompose()] 8 | #' 9 | #' @examples 10 | #' 11 | #' library(dplyr) 12 | #' 13 | #' tidyverse_cran_downloads %>% 14 | #' ungroup() %>% 15 | #' filter(package == "tidyquant") %>% 16 | #' decompose_stl(count) 17 | #' 18 | #' 19 | #' @references 20 | #' - The "twitter" method is used in Twitter's [`AnomalyDetection` package](https://github.com/twitter/AnomalyDetection) 21 | #' 22 | #' @name decompose_methods 23 | 24 | # 2A. Twitter ---- 25 | 26 | #' @export 27 | #' @rdname decompose_methods 28 | decompose_twitter <- function(data, target, frequency = "auto", trend = "auto", message = TRUE) { 29 | 30 | # Checks 31 | if (missing(target)) stop('Error in decompose_twitter(): argument "target" is missing, with no default', call. 
= FALSE) 32 | # if (!is.null(median_spans)) 33 | # if (!is.numeric(median_spans)) stop('Error in decompse_twitter(): argument "median_spans" must be numeric.', call. = FALSE) 34 | 35 | data <- prep_tbl_time(data) 36 | date_col_vals <- tibbletime::get_index_col(data) 37 | 38 | target_expr <- dplyr::enquo(target) 39 | 40 | date_col_name <- timetk::tk_get_timeseries_variables(data)[[1]] 41 | date_col_expr <- rlang::sym(date_col_name) 42 | 43 | freq <- time_frequency(data, period = frequency, message = message) 44 | # trnd <- time_trend(data, period = trend) 45 | 46 | # Time Series Decomposition 47 | decomp_tbl <- data %>% 48 | dplyr::pull(!! target_expr) %>% 49 | stats::ts(frequency = freq) %>% 50 | stats::stl(s.window = "periodic", robust = TRUE) %>% 51 | sweep::sw_tidy_decomp() %>% 52 | dplyr::select(-c(index, seasadj)) %>% 53 | # forecast::mstl() %>% 54 | # as.tibble() %>% 55 | tibble::add_column(!! date_col_name := date_col_vals, .after = 0) %>% 56 | purrr::set_names(c(date_col_name, "observed", "season", "trend", "remainder")) %>% 57 | dplyr::mutate(seasadj = observed - season) %>% 58 | dplyr::select(!!date_col_expr, observed, season, seasadj, trend, remainder) 59 | 60 | # Median Span Logic 61 | trnd <- time_trend(data, period = trend, message = FALSE) 62 | median_spans_needed <- round(nrow(data) / trnd) 63 | 64 | decomp_tbl <- decomp_tbl %>% 65 | dplyr::mutate( 66 | .period_groups = rep(1:median_spans_needed, length.out = nrow(.)) %>% sort() 67 | ) %>% 68 | dplyr::group_by(.period_groups) %>% 69 | dplyr::mutate(median_spans = median(observed, na.rm = T)) %>% 70 | dplyr::ungroup() %>% 71 | dplyr::select(-.period_groups) 72 | 73 | if (message) { 74 | med_span <- decomp_tbl %>% 75 | dplyr::count(median_spans) %>% 76 | dplyr::pull(n) %>% 77 | stats::median(na.rm = TRUE) 78 | 79 | med_scale <- decomp_tbl %>% 80 | timetk::tk_index() %>% 81 | timetk::tk_get_timeseries_summary() %>% 82 | dplyr::pull(scale) 83 | 84 | message(glue::glue("median_span = {med_span} {med_scale}s")) 85 | } 86 | 87 | # Remainder calculation 88 | decomp_tbl <- decomp_tbl %>% 89 | dplyr::mutate( 90 | remainder = observed - season - median_spans 91 | ) %>% 92 | dplyr::select(!! date_col_expr, observed, season, median_spans, remainder) 93 | 94 | decomp_tbl <- anomalize::prep_tbl_time(decomp_tbl) 95 | 96 | return(decomp_tbl) 97 | 98 | } 99 | 100 | # NOT USED 101 | # Helper function for decompose_twitter 102 | # time_median <- function(data, target, period = "auto", template = time_scale_template(), message = TRUE) { 103 | # 104 | # # Setup inputs 105 | # data <- prep_tbl_time(data, message = F) 106 | # 107 | # date_col_expr <- tibbletime::get_index_quo(data) 108 | # date_col_name <- dplyr::quo_name(date_col_expr) 109 | # 110 | # target_expr <- dplyr::enquo(target) 111 | # 112 | # # For median_span (trend) = "auto" use template 113 | # if (period == "auto") { 114 | # 115 | # # Get timeseries summary attributes 116 | # ts_summary <- data %>% 117 | # tibbletime::get_index_col() %>% 118 | # timetk::tk_get_timeseries_summary() 119 | # 120 | # ts_scale <- ts_summary$scale 121 | # 122 | # period <- template %>% 123 | # target_time_decomposition_scale(ts_scale, "trend", index_shift = 0) 124 | # 125 | # } 126 | # 127 | # # Use time_apply() 128 | # ret <- data %>% 129 | # time_apply(!! 
target_expr, period = period, 130 | # .fun = median, na.rm = T, clean = F, message = message) %>% 131 | # dplyr::rename(median_spans = time_apply) 132 | # 133 | # if (message) message(glue::glue("median_span = {period}")) 134 | # 135 | # return(ret) 136 | # 137 | # } 138 | 139 | 140 | # 2B. STL ---- 141 | 142 | #' @export 143 | #' @rdname decompose_methods 144 | decompose_stl <- function(data, target, frequency = "auto", trend = "auto", message = TRUE) { 145 | 146 | # Checks 147 | if (missing(target)) stop('Error in decompose_stl(): argument "target" is missing, with no default', call. = FALSE) 148 | 149 | 150 | data <- prep_tbl_time(data) 151 | date_col_vals <- tibbletime::get_index_col(data) 152 | 153 | target_expr <- dplyr::enquo(target) 154 | 155 | date_col_name <- timetk::tk_get_timeseries_variables(data)[[1]] 156 | date_col_expr <- rlang::sym(date_col_name) 157 | 158 | freq <- time_frequency(data, period = frequency, message = message) 159 | trnd <- time_trend(data, period = trend, message = message) 160 | 161 | # Time Series Decomposition 162 | decomp_tbl <- data %>% 163 | dplyr::pull(!! target_expr) %>% 164 | stats::ts(frequency = freq) %>% 165 | stats::stl(s.window = "periodic", t.window = trnd, robust = TRUE) %>% 166 | sweep::sw_tidy_decomp() %>% 167 | # forecast::mstl() %>% 168 | # as.tibble() %>% 169 | tibble::add_column(!! date_col_name := date_col_vals, .after = 0) %>% 170 | dplyr::select(!! date_col_expr, observed, season, trend, remainder) 171 | 172 | decomp_tbl <- anomalize::prep_tbl_time(decomp_tbl) 173 | 174 | return(decomp_tbl) 175 | 176 | } 177 | 178 | 179 | 180 | # NOT USED: USE TRANSFORMATIONS INSTEAD 181 | # # 2C. Multiplicative 182 | # 183 | # #' @export 184 | # #' @rdname decompose_methods 185 | # decompose_multiplicative <- function(data, target, frequency = "auto", trend = "auto", message = TRUE) { 186 | # 187 | # # Checks 188 | # if (missing(target)) stop('Error in decompose_multiplicative(): argument "target" is missing, with no default', call. = FALSE) 189 | # 190 | # # Setup inputs 191 | # data <- prep_tbl_time(data) 192 | # date_col_vals <- tibbletime::get_index_col(data) 193 | # 194 | # target_expr <- dplyr::enquo(target) 195 | # 196 | # date_col_name <- timetk::tk_get_timeseries_variables(data)[[1]] 197 | # date_col_expr <- rlang::sym(date_col_name) 198 | # 199 | # frequency <- anomalize::time_frequency(data, period = frequency, message = message) 200 | # # Note that trend is unused in super smoother (`supsmu()`) 201 | # 202 | # # Time Series Decomposition 203 | # decomp_tbl <- data %>% 204 | # dplyr::pull(!! target_expr) %>% 205 | # stats::ts(frequency = frequency) %>% 206 | # stats::decompose(type = "multiplicative") %>% 207 | # sweep::sw_tidy_decomp() %>% 208 | # dplyr::select(-index) %>% 209 | # dplyr::rename(remainder = random) %>% 210 | # dplyr::select(observed, season, seasadj, trend, remainder) %>% 211 | # tibble::add_column(!! 
date_col_name := date_col_vals, .after = 0) %>% 212 | # # Fix trend and remainder 213 | # dplyr::mutate( 214 | # trend = stats::supsmu(seq_along(observed), seasadj)$y, 215 | # remainder = observed / (trend * season) 216 | # ) %>% 217 | # dplyr::select(-seasadj) 218 | # 219 | # decomp_tbl <- anomalize::prep_tbl_time(decomp_tbl) 220 | # 221 | # return(decomp_tbl) 222 | # 223 | # } 224 | -------------------------------------------------------------------------------- /R/time_frequency.R: -------------------------------------------------------------------------------- 1 | #' Generate a time series frequency from a periodicity 2 | #' 3 | #' @param data A `tibble` with a date or datetime index. 4 | #' @param period Either "auto", a time-based definition (e.g. "14 days"), 5 | #' or a numeric number of observations per frequency (e.g. 10). 6 | #' See [tibbletime::collapse_by()] for period notation. 7 | #' @param message A boolean. If `message = TRUE`, the frequency used is output 8 | #' along with the units in the scale of the data. 9 | #' 10 | #' @return Returns a scalar numeric value indicating the number of observations in the frequency or trend span. 11 | #' 12 | #' @details 13 | #' A frequency is loosely defined as the number of observations that comprise a cycle 14 | #' in a data set. The trend is loosely defined as the time span that can 15 | #' be aggregated across to visualize the central tendency of the data. 16 | #' It's often easiest to think of frequency and trend in terms of the time-based units 17 | #' that the data is already in. __This is what `time_frequency()` and `time_trend()` 18 | #' enable: using time-based periods to define the frequency or trend.__ 19 | #' 20 | #' __Frequency__: 21 | #' 22 | #' As an example, a weekly cycle is often 5-days (for working 23 | #' days) or 7-days (for calendar days). Rather than specify a frequency of 5 or 7, 24 | #' the user can specify `period = "1 week"`, and 25 | #' `time_frequency()` will detect the scale of the time series and return 5 or 7 26 | #' based on the actual data. 27 | #' 28 | #' The `period` argument has three basic options for returning a frequency. 29 | #' Options include: 30 | #' - `"auto"`: A target frequency is determined using a pre-defined template (see `template` below). 31 | #' - `time-based duration`: (e.g. "1 week" or "2 quarters" per cycle) 32 | #' - `numeric number of observations`: (e.g. 5 for 5 observations per cycle) 33 | #' 34 | #' The `template` argument is only used when `period = "auto"`. The template is a tibble 35 | #' of three features: `time_scale`, `frequency`, and `trend`. The algorithm will inspect 36 | #' the scale of the time series and select the best frequency that matches the scale and 37 | #' number of observations per target frequency. A frequency is then chosen to be the 38 | #' best match. The predefined template is stored in a function `time_scale_template()`. 39 | #' However, the user can supply his or her own template by changing the values 40 | #' for frequency in the data frame and saving it to `anomalize_options$time_scale_template`. 41 | #' 42 | #' __Trend__: 43 | #' 44 | #' As an example, the trend of daily data is often best aggregated by evaluating 45 | #' the moving average over a quarter or a month span. Rather than specify the number 46 | #' of days in a quarter or month, the user can specify "1 quarter" or "1 month", 47 | #' and the `time_trend()` function will return the correct number of observations 48 | #' per trend cycle.
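For example, the period-to-observations translation can be exercised directly (a sketch on the bundled daily data; the commented output is what the call is expected to print):

``` r
library(dplyr)
library(anomalize)

# Daily data collapsed by "1 week" -> about 7 observations per cycle
tidyverse_cran_downloads %>%
  filter(package == "tidyquant") %>%
  ungroup() %>%
  time_frequency(period = "1 week")
#> frequency = 7 days
#> [1] 7
```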
In addition, there is an option, `period = "auto"`, to 49 | #' auto-detect an appropriate trend span depending on the data. The `template` 50 | #' is used to define the appropriate trend span. 51 | #' 52 | #' @examples 53 | #' 54 | #' library(dplyr) 55 | #' 56 | #' data(tidyverse_cran_downloads) 57 | #' 58 | #' #### FREQUENCY DETECTION #### 59 | #' 60 | #' # period = "auto" 61 | #' tidyverse_cran_downloads %>% 62 | #' filter(package == "tidyquant") %>% 63 | #' ungroup() %>% 64 | #' time_frequency(period = "auto") 65 | #' 66 | #' time_scale_template() 67 | #' 68 | #' # period = "1 month" 69 | #' tidyverse_cran_downloads %>% 70 | #' filter(package == "tidyquant") %>% 71 | #' ungroup() %>% 72 | #' time_frequency(period = "1 month") 73 | #' 74 | #' #### TREND DETECTION #### 75 | #' 76 | #' tidyverse_cran_downloads %>% 77 | #' filter(package == "tidyquant") %>% 78 | #' ungroup() %>% 79 | #' time_trend(period = "auto") 80 | 81 | 82 | #' @export 83 | #' @rdname time_frequency 84 | time_frequency <- function(data, period = "auto", message = TRUE) { 85 | 86 | # Checks 87 | if (!is.data.frame(data)) stop("Error time_frequency(): Object must inherit class `data.frame`, `tbl_df` or `tbl_time`.") 88 | 89 | if (dplyr::is.grouped_df(data)) 90 | stop(glue::glue("Error time_frequency(): Cannot use on a grouped data frame. 91 | Frequency should be performed on a single time series.")) 92 | 93 | # Setup inputs 94 | template <- get_time_scale_template() 95 | data <- prep_tbl_time(data, message = F) 96 | 97 | index_expr <- data %>% tibbletime::get_index_quo() 98 | index_name <- dplyr::quo_name(index_expr) 99 | 100 | # Get timeseries summary attributes 101 | ts_summary <- data %>% 102 | tibbletime::get_index_col() %>% 103 | timetk::tk_get_timeseries_summary() 104 | 105 | ts_nobs <- ts_summary$n.obs 106 | ts_scale <- ts_summary$scale 107 | 108 | 109 | if (is.numeric(period)) { 110 | # 1. Numeric Periods 111 | freq <- period 112 | 113 | } else if (period != "auto") { 114 | # 2. Text (e.g. period = "14 days") 115 | freq <- data %>% 116 | tibbletime::collapse_by(period = period) %>% 117 | dplyr::count(!! index_expr) %>% 118 | dplyr::pull(n) %>% 119 | stats::median(na.rm = T) 120 | 121 | } else { 122 | # 3. period = "auto" 123 | 124 | periodicity_target <- template %>% 125 | target_time_decomposition_scale(time_scale = ts_scale, target = "frequency", index_shift = 0) 126 | 127 | freq <- data %>% 128 | tibbletime::collapse_by(period = periodicity_target) %>% 129 | dplyr::count(!! index_expr) %>% 130 | dplyr::pull(n) %>% 131 | stats::median(na.rm = T) 132 | 133 | # Insufficient observations: nobs-to-freq should be at least 3-1 134 | if (ts_nobs < 3*freq) { 135 | periodicity_target <- template %>% 136 | target_time_decomposition_scale(time_scale = ts_scale, target = "frequency", index_shift = 1) 137 | 138 | freq <- data %>% 139 | tibbletime::collapse_by(period = periodicity_target) %>% 140 | dplyr::count(!! 
index_expr) %>% 141 | dplyr::pull(n) %>% 142 | stats::median(na.rm = T) 143 | } 144 | 145 | if (ts_nobs < 3*freq) { 146 | freq <- 1 147 | } 148 | } 149 | 150 | if (message) { 151 | freq_string <- glue::glue("frequency = {freq} {ts_scale}s") 152 | message(freq_string) 153 | } 154 | 155 | return(freq) 156 | } 157 | 158 | #' @export 159 | #' @rdname time_frequency 160 | time_trend <- function(data, period = "auto", message = TRUE) { 161 | 162 | # Checks 163 | if (!is.data.frame(data)) stop("Error time_trend(): Object must inherit class `data.frame`, `tbl_df` or `tbl_time`.") 164 | 165 | if (dplyr::is.grouped_df(data)) 166 | stop(glue::glue("Cannot use on a grouped data frame. 167 | Frequency should be performed on a single time series.")) 168 | 169 | # Setup inputs 170 | template <- get_time_scale_template() 171 | data <- prep_tbl_time(data, message = F) 172 | 173 | index_expr <- data %>% tibbletime::get_index_quo() 174 | index_name <- dplyr::quo_name(index_expr) 175 | 176 | # Get timeseries summary attributes 177 | ts_summary <- data %>% 178 | tibbletime::get_index_col() %>% 179 | timetk::tk_get_timeseries_summary() 180 | 181 | ts_nobs <- ts_summary$n.obs 182 | ts_scale <- ts_summary$scale 183 | 184 | 185 | if (is.numeric(period)) { 186 | # 1. Numeric Periods 187 | trend <- period 188 | 189 | } else if (period != "auto") { 190 | # 2. Text (e.g. period = "14 days") 191 | trend <- data %>% 192 | tibbletime::collapse_by(period = period) %>% 193 | dplyr::count(!! index_expr) %>% 194 | dplyr::pull(n) %>% 195 | stats::median(na.rm = T) 196 | 197 | } else { 198 | # 3. period = "auto" 199 | 200 | periodicity_target <- template %>% 201 | target_time_decomposition_scale(time_scale = ts_scale, target = "trend", index_shift = 0) 202 | 203 | trend <- data %>% 204 | tibbletime::collapse_by(period = periodicity_target) %>% 205 | dplyr::count(!! index_expr) %>% 206 | dplyr::pull(n) %>% 207 | stats::median(na.rm = T) 208 | 209 | # Insufficient observations: nobs-to-trend should be at least 2-1 210 | if (ts_nobs / trend < 2) { 211 | periodicity_target <- template %>% 212 | target_time_decomposition_scale(time_scale = ts_scale, target = "trend", index_shift = 1) 213 | 214 | trend <- data %>% 215 | tibbletime::collapse_by(period = periodicity_target) %>% 216 | dplyr::count(!! index_expr) %>% 217 | dplyr::pull(n) %>% 218 | stats::median(na.rm = T) 219 | 220 | trend <- ceiling(trend) 221 | 222 | } 223 | 224 | if (ts_nobs / trend < 2) { 225 | trend <- ts_nobs 226 | } 227 | } 228 | 229 | if (message) { 230 | trend_string <- glue::glue("trend = {trend} {ts_scale}s") 231 | message(trend_string) 232 | } 233 | 234 | return(trend) 235 | } 236 | 237 | # Helper function to get the time decomposition scale 238 | target_time_decomposition_scale <- function(template, time_scale, target = c("frequency", "trend"), index_shift = 0) { 239 | 240 | target_expr <- rlang::sym(target[[1]]) 241 | 242 | idx <- which(template$time_scale == time_scale) - index_shift 243 | key_value <- template$time_scale[idx] 244 | 245 | template %>% 246 | dplyr::filter(time_scale == key_value) %>% 247 | dplyr::pull(!! target_expr) 248 | } 249 | -------------------------------------------------------------------------------- /R/time_recompose.R: -------------------------------------------------------------------------------- 1 | #' Recompose bands separating anomalies from "normal" observations 2 | #' 3 | #' @param data A `tibble` or `tbl_time` object that has been 4 | #' processed with `time_decompose()` and `anomalize()`. 
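The `time_trend()` helper defined above resolves `period = "auto"` through the same template lookup; a quick sketch on the bundled daily data (the commented output is approximate and depends on the data's span):

``` r
library(dplyr)
library(anomalize)

# "auto" resolves via the template: daily scale -> "3 months" trend span
tidyverse_cran_downloads %>%
  filter(package == "tidyquant") %>%
  ungroup() %>%
  time_trend(period = "auto")
#> trend = 91 days
#> [1] 91
```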
5 | #' 6 | #' @return Returns a `tbl_time` object. 7 | #' 8 | #' @details 9 | #' The `time_recompose()` function is used to generate bands around the 10 | #' "normal" levels of observed values. The function uses the remainder_l1 11 | #' and remainder_l2 levels produced during the [anomalize()] step 12 | #' and the season and trend/median_spans values from the [time_decompose()] 13 | #' step to reconstruct bands around the normal values. 14 | #' 15 | #' The following key names are required: observed:remainder from the 16 | #' `time_decompose()` step and remainder_l1 and remainder_l2 from the 17 | #' `anomalize()` step. 18 | #' 19 | #' 20 | #' @seealso 21 | #' Time Series Anomaly Detection Functions (anomaly detection workflow): 22 | #' - [time_decompose()] 23 | #' - [anomalize()] 24 | #' 25 | #' @examples 26 | #' 27 | #' library(dplyr) 28 | #' 29 | #' data(tidyverse_cran_downloads) 30 | #' 31 | #' # Basic Usage 32 | #' tidyverse_cran_downloads %>% 33 | #' time_decompose(count, method = "stl") %>% 34 | #' anomalize(remainder, method = "iqr") %>% 35 | #' time_recompose() 36 | #' 37 | #' 38 | #' @export 39 | time_recompose <- function(data) { 40 | UseMethod("time_recompose", data) 41 | } 42 | 43 | #' @export 44 | time_recompose.default <- function(data) { 45 | stop("Error time_recompose(): Object is not of class `tbl_df` or `tbl_time`.", call. = FALSE) 46 | } 47 | 48 | #' @export 49 | time_recompose.tbl_time <- function(data) { 50 | 51 | # Checks 52 | column_names <- names(data) 53 | check_names <- c("observed", "remainder", "remainder_l1", "remainder_l2") %in% column_names 54 | if (!all(check_names)) stop('Error in time_recompose(): key names are missing. Make sure observed:remainder, remainder_l1, and remainder_l2 are present', call. = FALSE) 55 | 56 | # Setup 57 | # target_expr <- dplyr::enquo(target) 58 | # method <- tolower(method[[1]]) 59 | 60 | l1 <- data %>% 61 | dplyr::select(observed:remainder, contains("_l1")) %>% 62 | dplyr::select(-c(observed, remainder)) %>% 63 | apply(MARGIN = 1, FUN = sum) 64 | 65 | l2 <- data %>% 66 | dplyr::select(observed:remainder, contains("_l2")) %>% 67 | dplyr::select(-c(observed, remainder)) %>% 68 | apply(MARGIN = 1, FUN = sum) 69 | 70 | ret <- data %>% 71 | # add_column(!! paste0(quo_name(target_expr), "_l1") := l1) 72 | tibble::add_column( 73 | recomposed_l1 = l1, 74 | recomposed_l2 = l2 75 | ) 76 | 77 | return(ret) 78 | 79 | } 80 | 81 | #' @export 82 | time_recompose.tbl_df <- function(data) { 83 | 84 | # Prep 85 | data <- prep_tbl_time(data, message = FALSE) 86 | 87 | # Send to time_recompose.tbl_time 88 | time_recompose(data = data) 89 | 90 | } 91 | 92 | 93 | #' @export 94 | time_recompose.grouped_tbl_time <- function(data) { 95 | 96 | # Checks 97 | column_names <- names(data) 98 | check_names <- c("observed", "remainder", "remainder_l1", "remainder_l2") %in% column_names 99 | if (!all(check_names)) stop('Error in time_recompose(): key names are missing. Make sure observed:remainder, remainder_l1, and remainder_l2 are present', call. 
= FALSE) 100 | 101 | # Setup 102 | group_names <- dplyr::groups(data) 103 | group_vars_expr <- rlang::syms(group_names) 104 | 105 | # Recompose l1 and l2 bands 106 | l1 <- data %>% 107 | dplyr::ungroup() %>% 108 | dplyr::select(observed:remainder, contains("_l1")) %>% 109 | dplyr::select(-c(observed, remainder)) %>% 110 | apply(MARGIN = 1, FUN = sum) 111 | 112 | l2 <- data %>% 113 | dplyr::ungroup() %>% 114 | dplyr::select(observed:remainder, contains("_l2")) %>% 115 | dplyr::select(-c(observed, remainder)) %>% 116 | apply(MARGIN = 1, FUN = sum) 117 | 118 | ret <- data %>% 119 | dplyr::ungroup() %>% 120 | tibble::add_column( 121 | recomposed_l1 = l1, 122 | recomposed_l2 = l2 123 | ) %>% 124 | dplyr::group_by(!!! group_vars_expr) 125 | 126 | return(ret) 127 | 128 | } 129 | 130 | #' @export 131 | time_recompose.grouped_df <- function(data) { 132 | 133 | data <- prep_tbl_time(data, message = FALSE) 134 | 135 | # Send to grouped_tbl_time 136 | time_recompose(data = data) 137 | 138 | } 139 | 140 | 141 | 142 | 143 | -------------------------------------------------------------------------------- /R/time_scale_template.R: -------------------------------------------------------------------------------- 1 | #' Get and modify time scale template 2 | #' 3 | #' @param data A `tibble` with "time_scale", "frequency", and "trend" columns. 4 | #' 5 | #' 6 | #' @details 7 | #' 8 | #' Used to get and set the time scale template, which is used by `time_frequency()` 9 | #' and `time_trend()` when `period = "auto"`. 10 | #' 11 | #' @seealso [time_frequency()], [time_trend()] 12 | #' 13 | #' @examples 14 | #' 15 | #' get_time_scale_template() 16 | #' 17 | #' set_time_scale_template(time_scale_template()) 18 | #' 19 | 20 | 21 | 22 | #' @export 23 | #' @rdname time_scale_template 24 | set_time_scale_template <- function(data) { 25 | if (!missing(data)) { 26 | options(time_scale_template = data) 27 | } 28 | #getOption('time_scale_template') 29 | } 30 | 31 | #' @export 32 | #' @rdname time_scale_template 33 | get_time_scale_template <- function() { 34 | getOption('time_scale_template') 35 | } 36 | 37 | #' @export 38 | #' @rdname time_scale_template 39 | time_scale_template <- function() { 40 | 41 | tibble::tribble( 42 | ~ "time_scale", ~ "frequency", ~ "trend", 43 | "second", "1 hour", "12 hours", 44 | "minute", "1 day", "14 days", 45 | "hour", "1 day", "1 month", 46 | "day", "1 week", "3 months", 47 | "week", "1 quarter", "1 year", 48 | "month", "1 year", "5 years", 49 | "quarter", "1 year", "10 years", 50 | "year", "5 years", "30 years" 51 | ) 52 | 53 | } 54 | 55 | 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | # UTILITY FUNCTIONS ---- 2 | 3 | # 1. Mapping Functions ----- 4 | 5 | grouped_mapper <- function(data, target, .f, ...) { 6 | 7 | data <- prep_tbl_time(data, message = FALSE) 8 | 9 | target_expr <- dplyr::enquo(target) 10 | 11 | group_names <- dplyr::group_vars(data) 12 | 13 | ret <- data %>% 14 | dplyr::group_nest() %>% 15 | dplyr::mutate(nested.col = purrr::map( 16 | .x = data, 17 | .f = .f, 18 | target = !! target_expr, 19 | ...) 20 | ) %>% 21 | dplyr::select(-data) %>% 22 | tidyr::unnest(cols = nested.col) %>% 23 | dplyr::group_by_at(.vars = group_names) 24 | 25 | # if (merge) { 26 | # ret <- merge_two_tibbles(tib1 = data, tib2 = ret, .f = .f) 27 | # } 28 | 29 | return(ret) 30 | 31 | } 32 | 33 | # 2.
Merging Time-Based Tibbles ----- 34 | 35 | merge_two_tibbles <- function(tib1, tib2, .f) { 36 | 37 | # Merge results 38 | if (identical(nrow(tib1), nrow(tib2))) { 39 | 40 | # Arrange dates - Possibility of issue if dates not descending in tib1 41 | tib1 <- arrange_by_date(tib1) 42 | 43 | # Drop date column and groups 44 | tib2 <- drop_date_and_group_cols(tib2) 45 | 46 | # Replace bad names 47 | tib2 <- replace_bad_names(tib2, .f) 48 | 49 | # Replace duplicate names 50 | tib2 <- replace_duplicate_colnames(tib1, tib2) 51 | 52 | ret <- dplyr::bind_cols(tib1, tib2) 53 | 54 | } else { 55 | 56 | stop("Could not join. Incompatible structures.") 57 | } 58 | 59 | return(ret) 60 | } 61 | 62 | replace_duplicate_colnames <- function(tib1, tib2) { 63 | 64 | # Collect column names 65 | name_list_tib1 <- colnames(tib1) 66 | name_list_tib2 <- colnames(tib2) 67 | name_list <- c(name_list_tib1, name_list_tib2) 68 | 69 | duplicates_exist <- detect_duplicates(name_list) 70 | 71 | # Iteratively add .1, .2, .3 ... onto end of column names 72 | if (duplicates_exist) { 73 | 74 | i <- 1 75 | 76 | while (duplicates_exist) { 77 | 78 | dup_names_stripped <- 79 | strsplit(name_list[duplicated(name_list)], 80 | split = "\\.\\.") %>% 81 | sapply(function(x) x[[1]]) 82 | 83 | name_list[duplicated(name_list)] <- 84 | paste0(dup_names_stripped, "..", i) 85 | 86 | i <- i + 1 87 | 88 | duplicates_exist <- detect_duplicates(name_list) 89 | 90 | } 91 | 92 | name_list_tib2 <- name_list[(ncol(tib1) + 1):length(name_list)] 93 | 94 | colnames(tib2) <- name_list_tib2 95 | } 96 | 97 | return(tib2) 98 | } 99 | 100 | detect_duplicates <- function(name_list) { 101 | 102 | name_list %>% 103 | duplicated() %>% 104 | any() 105 | } 106 | 107 | # bad / restricted names are names that get selected unintentionally by OHLC functions 108 | replace_bad_names <- function(tib, fun_name) { 109 | 110 | bad_names_regex <- "open|high|low|close|volume|adjusted|price" 111 | 112 | name_list_tib <- colnames(tib) 113 | name_list_tib_lower <- tolower(name_list_tib) 114 | 115 | detect_bad_names <- grepl(pattern = bad_names_regex, 116 | x = name_list_tib_lower) 117 | 118 | if (any(detect_bad_names)) { 119 | 120 | len <- length(name_list_tib_lower[detect_bad_names]) 121 | name_list_tib[detect_bad_names] <- rep(fun_name, length.out = len) 122 | 123 | } 124 | 125 | colnames(tib) <- name_list_tib 126 | 127 | return(tib) 128 | } 129 | 130 | arrange_by_date <- function(tib) { 131 | 132 | if (dplyr::is.grouped_df(tib)) { 133 | 134 | group_names <- dplyr::group_vars(tib) 135 | 136 | arrange_date <- function(tib) { 137 | date_col <- timetk::tk_get_timeseries_variables(tib)[[1]] 138 | tib %>% 139 | dplyr::arrange(!! rlang::sym(date_col)) 140 | } 141 | 142 | tib <- tib %>% 143 | tidyr::nest() %>% 144 | dplyr::mutate(nested.col = 145 | purrr::map(data, arrange_date) 146 | ) %>% 147 | dplyr::select(-data) %>% 148 | tidyr::unnest(cols = nested.col) %>% 149 | dplyr::group_by_at(.vars = group_names) 150 | 151 | 152 | } else { 153 | date_col <- timetk::tk_get_timeseries_variables(tib)[[1]] 154 | tib <- tib %>% 155 | dplyr::arrange(!!
rlang::sym(date_col)) 156 | 157 | } 158 | 159 | return(tib) 160 | } 161 | 162 | drop_date_and_group_cols <- function(tib) { 163 | 164 | date_col <- timetk::tk_get_timeseries_variables(tib)[[1]] 165 | group_cols <- dplyr::groups(tib) %>% 166 | as.character() 167 | cols_to_remove <- c(date_col, group_cols) 168 | tib_names <- colnames(tib) 169 | cols_to_remove_logical <- tib_names %in% cols_to_remove 170 | tib_names_without_date_or_group <- tib_names[!cols_to_remove_logical] 171 | 172 | tib <- tib %>% 173 | dplyr::ungroup() %>% 174 | dplyr::select(!!! rlang::syms(tib_names_without_date_or_group)) 175 | 176 | return(tib) 177 | } 178 | -------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | 2 | # By default set the time_scale_template option to time_scale_template() 3 | .onLoad = function(libname, pkgname) { 4 | options( 5 | time_scale_template = time_scale_template() 6 | ) 7 | } 8 | 9 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | # Anomalize is being Superseded by Timetk: 6 | 7 | # anomalize 8 | 9 | 10 | [![R-CMD-check](https://github.com/business-science/anomalize/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/business-science/anomalize/actions/workflows/R-CMD-check.yaml) 11 | [![Lifecycle Status](https://img.shields.io/badge/lifecycle-superceded-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html) 12 | [![Coverage status](https://codecov.io/gh/business-science/anomalize/branch/master/graph/badge.svg)](https://app.codecov.io/github/business-science/anomalize?branch=master) 13 | [![CRAN_Status_Badge](http://www.r-pkg.org/badges/version/anomalize)](https://cran.r-project.org/package=anomalize) 14 | ![](http://cranlogs.r-pkg.org/badges/anomalize?color=brightgreen) 15 | ![](http://cranlogs.r-pkg.org/badges/grand-total/anomalize?color=brightgreen) 16 | 17 | 18 | 19 | 20 | ```{r setup, include = FALSE} 21 | knitr::opts_chunk$set( 22 | collapse = TRUE, 23 | comment = "#>", 24 | fig.path = "man/figures/README-", 25 | out.width = "100%", 26 | dpi = 200, 27 | message = F, 28 | warning = F 29 | ) 30 | library(anomalize) 31 | library(dplyr) # for pipe 32 | ``` 33 | 34 | 35 | The `anomalize` package functionality has been superseded by `timetk`. We suggest you begin to use `timetk::anomalize()` to benefit from enhanced functionality and improvements going forward. [Learn more about Anomaly Detection with `timetk` here.](https://business-science.github.io/timetk/articles/TK08_Automatic_Anomaly_Detection.html) 36 | 37 | The original `anomalize` package functionality will be maintained for previous code bases that use the legacy functionality. 38 | 39 | To prevent the new `timetk` functionality from conflicting with old `anomalize` code, use these lines: 40 | 41 | ``` r 42 | library(anomalize) 43 | 44 | anomalize <- anomalize::anomalize 45 | plot_anomalies <- anomalize::plot_anomalies 46 | ``` 47 | 48 | 49 | 50 | 51 | 52 | 53 | > Tidy anomaly detection 54 | 55 | `anomalize` enables a tidy workflow for detecting anomalies in data. The main functions are `time_decompose()`, `anomalize()`, and `time_recompose()`. When combined, it's quite simple to decompose time series, detect anomalies, and create bands separating the "normal" data from the anomalous data.
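In code, that workflow is three piped calls (shown in full, with visualization, in Getting Started below):

``` r
library(dplyr)
library(anomalize)

tidyverse_cran_downloads %>%
  time_decompose(count) %>%  # split into season, trend, and remainder
  anomalize(remainder) %>%   # flag outliers in the remainder
  time_recompose()           # rebuild bands around the "normal" range
```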
56 | 57 | ## Anomalize In 2 Minutes (YouTube) 58 | 59 | Anomalize 61 | 62 | Check out our entire [Software Intro Series](https://www.youtube.com/watch?v=Gk_HwjhlQJs&list=PLo32uKohmrXsYNhpdwr15W143rX6uMAze) on YouTube! 63 | 64 | ## Installation 65 | 66 | You can install the development version with `devtools` or the most recent CRAN version with `install.packages()`: 67 | 68 | ``` r 69 | # devtools::install_github("business-science/anomalize") 70 | install.packages("anomalize") 71 | ``` 72 | 73 | ## How It Works 74 | 75 | `anomalize` has three main functions: 76 | 77 | - `time_decompose()`: Separates the time series into seasonal, trend, and remainder components 78 | - `anomalize()`: Applies anomaly detection methods to the remainder component. 79 | - `time_recompose()`: Calculates limits that separate the "normal" data from the anomalies! 80 | 81 | ## Getting Started 82 | 83 | Load the `anomalize` package. Usually, you will also load the tidyverse as well! 84 | 85 | ```{r, eval = F} 86 | library(anomalize) 87 | library(tidyverse) 88 | # NOTE: timetk now has anomaly detection built in, which 89 | # will get the new functionality going forward. 90 | # Use this script to prevent overwriting legacy anomalize: 91 | 92 | anomalize <- anomalize::anomalize 93 | plot_anomalies <- anomalize::plot_anomalies 94 | ``` 95 | 96 | 97 | Next, let's get some data. `anomalize` ships with a data set called `tidyverse_cran_downloads` that contains the daily CRAN download counts for 15 "tidy" packages from 2017-01-01 to 2018-03-01. 98 | 99 | Suppose we want to determine which daily download "counts" are anomalous. It's as easy as using the three main functions (`time_decompose()`, `anomalize()`, and `time_recompose()`) along with a visualization function, `plot_anomalies()`. 100 | 101 | ```{r tidyverse_anoms_1, fig.height=8} 102 | tidyverse_cran_downloads %>% 103 | # Data Manipulation / Anomaly Detection 104 | time_decompose(count, method = "stl") %>% 105 | anomalize(remainder, method = "iqr") %>% 106 | time_recompose() %>% 107 | # Anomaly Visualization 108 | plot_anomalies(time_recomposed = TRUE, ncol = 3, alpha_dots = 0.25) + 109 | ggplot2::labs(title = "Tidyverse Anomalies", subtitle = "STL + IQR Methods") 110 | ``` 111 | 112 | Check out the [`anomalize` Quick Start Guide](https://business-science.github.io/anomalize/articles/anomalize_quick_start_guide.html). 113 | 114 | ## Reducing Forecast Error by 32% 115 | 116 | Yes! Anomalize has a new function, `clean_anomalies()`, that can be used to repair time series prior to forecasting. We have a [brand new vignette - Reduce Forecast Error (by 32%) with Cleaned Anomalies](https://business-science.github.io/anomalize/articles/forecasting_with_cleaned_anomalies.html). 117 | ```{r} 118 | tidyverse_cran_downloads %>% 119 | dplyr::filter(package == "lubridate") %>% 120 | dplyr::ungroup() %>% 121 | time_decompose(count) %>% 122 | anomalize(remainder) %>% 123 | 124 | # New function that cleans & repairs anomalies! 125 | clean_anomalies() %>% 126 | 127 | dplyr::select(date, anomaly, observed, observed_cleaned) %>% 128 | dplyr::filter(anomaly == "Yes") 129 | ``` 130 | 131 | 132 | ## But Wait, There's More! 133 | 134 | There are several extra capabilities: 135 | 136 | - `plot_anomaly_decomposition()` for visualizing the inner workings of how the algorithm detects anomalies in the "remainder".
137 | 138 | ```{r, fig.height=7} 139 | tidyverse_cran_downloads %>% 140 | dplyr::filter(package == "lubridate") %>% 141 | dplyr::ungroup() %>% 142 | time_decompose(count) %>% 143 | anomalize(remainder) %>% 144 | plot_anomaly_decomposition() + 145 | ggplot2::labs(title = "Decomposition of Anomalized Lubridate Downloads") 146 | ``` 147 | 148 | For more information on the `anomalize` methods and the inner workings, please see ["Anomalize Methods" Vignette](https://business-science.github.io/anomalize/articles/anomalize_methods.html). 149 | 150 | ## References 151 | 152 | Several other packages were instrumental in developing anomaly detection methods used in `anomalize`: 153 | 154 | - Twitter's `AnomalyDetection`, which implements decomposition using median spans and the Generalized Extreme Studentized Deviation (GESD) test for anomalies. 155 | - `forecast::tsoutliers()` function, which implements the IQR method. 156 | 157 | # Interested in Learning Anomaly Detection? 158 | 159 | Business Science offers two 1-hour courses on Anomaly Detection: 160 | 161 | - [Learning Lab 18](https://university.business-science.io/p/learning-labs-pro) - Time Series Anomaly Detection with `anomalize` 162 | 163 | - [Learning Lab 17](https://university.business-science.io/p/learning-labs-pro) - Anomaly Detection with `H2O` Machine Learning 164 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Anomalize is being Superseded by Timetk: 3 | 4 | # anomalize 5 | 6 | 7 | 8 | [![R-CMD-check](https://github.com/business-science/anomalize/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/business-science/anomalize/actions/workflows/R-CMD-check.yaml) 9 | [![Lifecycle 10 | Status](https://img.shields.io/badge/lifecycle-superceded-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html) 11 | [![Coverage 12 | status](https://codecov.io/gh/business-science/anomalize/branch/master/graph/badge.svg)](https://app.codecov.io/github/business-science/anomalize?branch=master) 13 | [![CRAN_Status_Badge](http://www.r-pkg.org/badges/version/anomalize)](https://cran.r-project.org/package=anomalize) 14 | ![](http://cranlogs.r-pkg.org/badges/anomalize?color=brightgreen) 15 | ![](http://cranlogs.r-pkg.org/badges/grand-total/anomalize?color=brightgreen) 16 | 17 | 18 | 19 | 20 | The `anomalize` package functionality has been superseded by `timetk`. 21 | We suggest you begin to use `timetk::anomalize()` to benefit from 22 | enhanced functionality and improvements going forward. [Learn more 23 | about Anomaly Detection with `timetk` 24 | here.](https://business-science.github.io/timetk/articles/TK08_Automatic_Anomaly_Detection.html) 25 | 26 | The original `anomalize` package functionality will be maintained for 27 | previous code bases that use the legacy functionality. 28 | 29 | To prevent the new `timetk` functionality from conflicting with old 30 | `anomalize` code, use these lines: 31 | 32 | ``` r 33 | library(anomalize) 34 | 35 | anomalize <- anomalize::anomalize 36 | plot_anomalies <- anomalize::plot_anomalies 37 | ``` 38 | 39 | 40 | 41 | > Tidy anomaly detection 42 | 43 | `anomalize` enables a tidy workflow for detecting anomalies in data. The 44 | main functions are `time_decompose()`, `anomalize()`, and 45 | `time_recompose()`.
When combined, it’s quite simple to decompose time 46 | series, detect anomalies, and create bands separating the “normal” data 47 | from the anomalous data. 48 | 49 | ## Anomalize In 2 Minutes (YouTube) 50 | 51 | Anomalize 53 | 54 | Check out our entire [Software Intro 55 | Series](https://www.youtube.com/watch?v=Gk_HwjhlQJs&list=PLo32uKohmrXsYNhpdwr15W143rX6uMAze) 56 | on YouTube! 57 | 58 | ## Installation 59 | 60 | You can install the development version with `devtools` or the most 61 | recent CRAN version with `install.packages()`: 62 | 63 | ``` r 64 | # devtools::install_github("business-science/anomalize") 65 | install.packages("anomalize") 66 | ``` 67 | 68 | ## How It Works 69 | 70 | `anomalize` has three main functions: 71 | 72 | - `time_decompose()`: Separates the time series into seasonal, trend, 73 | and remainder components 74 | - `anomalize()`: Applies anomaly detection methods to the remainder 75 | component. 76 | - `time_recompose()`: Calculates limits that separate the “normal” data 77 | from the anomalies! 78 | 79 | ## Getting Started 80 | 81 | Load the `anomalize` package. Usually, you will also load the tidyverse 82 | as well! 83 | 84 | ``` r 85 | library(anomalize) 86 | library(tidyverse) 87 | # NOTE: timetk now has anomaly detection built in, which 88 | # will get the new functionality going forward. 89 | # Use this script to prevent overwriting legacy anomalize: 90 | 91 | anomalize <- anomalize::anomalize 92 | plot_anomalies <- anomalize::plot_anomalies 93 | ``` 94 | 95 | Next, let’s get some data. `anomalize` ships with a data set called 96 | `tidyverse_cran_downloads` that contains the daily CRAN download counts 97 | for 15 “tidy” packages from 2017-01-01 to 2018-03-01. 98 | 99 | Suppose we want to determine which daily download “counts” are 100 | anomalous. It’s as easy as using the three main functions 101 | (`time_decompose()`, `anomalize()`, and `time_recompose()`) along with a 102 | visualization function, `plot_anomalies()`. 103 | 104 | ``` r 105 | tidyverse_cran_downloads %>% 106 | # Data Manipulation / Anomaly Detection 107 | time_decompose(count, method = "stl") %>% 108 | anomalize(remainder, method = "iqr") %>% 109 | time_recompose() %>% 110 | # Anomaly Visualization 111 | plot_anomalies(time_recomposed = TRUE, ncol = 3, alpha_dots = 0.25) + 112 | ggplot2::labs(title = "Tidyverse Anomalies", subtitle = "STL + IQR Methods") 113 | ``` 114 | 115 | 116 | 117 | Check out the [`anomalize` Quick Start 118 | Guide](https://business-science.github.io/anomalize/articles/anomalize_quick_start_guide.html). 119 | 120 | ## Reducing Forecast Error by 32% 121 | 122 | Yes! Anomalize has a new function, `clean_anomalies()`, that can be used 123 | to repair time series prior to forecasting. We have a [brand new 124 | vignette - Reduce Forecast Error (by 32%) with Cleaned 125 | Anomalies](https://business-science.github.io/anomalize/articles/forecasting_with_cleaned_anomalies.html). 126 | 127 | ``` r 128 | tidyverse_cran_downloads %>% 129 | dplyr::filter(package == "lubridate") %>% 130 | dplyr::ungroup() %>% 131 | time_decompose(count) %>% 132 | anomalize(remainder) %>% 133 | 134 | # New function that cleans & repairs anomalies! 135 | clean_anomalies() %>% 136 | 137 | dplyr::select(date, anomaly, observed, observed_cleaned) %>% 138 | dplyr::filter(anomaly == "Yes") 139 | #> # A time tibble: 19 × 4 140 | #> # Index: date 141 | #> date anomaly observed observed_cleaned 142 | #> 143 | #> 1 2017-01-12 Yes -1.14e-13 3522. 144 | #> 2 2017-04-19 Yes 8.55e+ 3 5202. 
145 | #> 3 2017-09-01 Yes 3.98e-13 4137. 146 | #> 4 2017-09-07 Yes 9.49e+ 3 4871. 147 | #> 5 2017-10-30 Yes 1.20e+ 4 6413. 148 | #> 6 2017-11-13 Yes 1.03e+ 4 6641. 149 | #> 7 2017-11-14 Yes 1.15e+ 4 7250. 150 | #> 8 2017-12-04 Yes 1.03e+ 4 6519. 151 | #> 9 2017-12-05 Yes 1.06e+ 4 7099. 152 | #> 10 2017-12-27 Yes 3.69e+ 3 7073. 153 | #> 11 2018-01-01 Yes 1.87e+ 3 6418. 154 | #> 12 2018-01-05 Yes -5.68e-14 6293. 155 | #> 13 2018-01-13 Yes 7.64e+ 3 4141. 156 | #> 14 2018-02-07 Yes 1.19e+ 4 8539. 157 | #> 15 2018-02-08 Yes 1.17e+ 4 8237. 158 | #> 16 2018-02-09 Yes -5.68e-14 7780. 159 | #> 17 2018-02-10 Yes 0 5478. 160 | #> 18 2018-02-23 Yes -5.68e-14 8519. 161 | #> 19 2018-02-24 Yes 0 6218. 162 | ``` 163 | 164 | ## But Wait, There's More! 165 | 166 | There are several extra capabilities: 167 | 168 | - `plot_anomaly_decomposition()` for visualizing the inner workings of 169 | how the algorithm detects anomalies in the "remainder". 170 | 171 | ``` r 172 | tidyverse_cran_downloads %>% 173 | dplyr::filter(package == "lubridate") %>% 174 | dplyr::ungroup() %>% 175 | time_decompose(count) %>% 176 | anomalize(remainder) %>% 177 | plot_anomaly_decomposition() + 178 | ggplot2::labs(title = "Decomposition of Anomalized Lubridate Downloads") 179 | ``` 180 | 181 | 182 | 183 | For more information on the `anomalize` methods and the inner workings, 184 | please see ["Anomalize Methods" 185 | Vignette](https://business-science.github.io/anomalize/articles/anomalize_methods.html). 186 | 187 | ## References 188 | 189 | Several other packages were instrumental in developing anomaly detection 190 | methods used in `anomalize`: 191 | 192 | - Twitter's `AnomalyDetection`, which implements decomposition using 193 | median spans and the Generalized Extreme Studentized Deviation (GESD) 194 | test for anomalies. 195 | - `forecast::tsoutliers()` function, which implements the IQR method. 196 | 197 | # Interested in Learning Anomaly Detection?
198 | 199 | Business Science offers two 1-hour courses on Anomaly Detection: 200 | 201 | - [Learning Lab 202 | 18](https://university.business-science.io/p/learning-labs-pro) - Time 203 | Series Anomaly Detection with `anomalize` 204 | 205 | - [Learning Lab 206 | 17](https://university.business-science.io/p/learning-labs-pro) - 207 | Anomaly Detection with `H2O` Machine Learning 208 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: https://business-science.github.io/anomalize/ 2 | template: 3 | bootstrap: 5 4 | bootswatch: flatly 5 | params: 6 | ganalytics: UA-76139189-1 7 | navbar: 8 | bg: primary 9 | title: timetk 10 | left: 11 | - icon: fa-home 12 | href: index.html 13 | - text: Start 14 | href: articles/anomalize_quick_start_guide.html 15 | - text: Articles 16 | href: articles/index.html 17 | - text: API 18 | href: reference/index.html 19 | menu: 20 | - text: API Functions 21 | - icon: fa-home 22 | text: Function Reference 23 | href: reference/index.html 24 | - text: '---' 25 | - text: Change History 26 | - text: News 27 | href: news/index.html 28 | right: 29 | - icon: fa-github 30 | href: https://github.com/business-science/timetk 31 | reference: 32 | - title: General 33 | contents: tidyverse_cran_downloads 34 | - title: Anomalize workflow 35 | desc: __The main functions used to anomalize time series data.__ 36 | contents: 37 | - starts_with("time_decompose") 38 | - anomalize 39 | - starts_with("time_recompose") 40 | - clean_anomalies 41 | - title: Visualization functions 42 | desc: __Plotting utilities for visualizing anomalies.__ 43 | contents: starts_with("plot_") 44 | - title: Frequency and trend 45 | desc: __Working with the frequency, trend, and time scale.__ 46 | contents: 47 | - ends_with("frequency") 48 | - ends_with("trend") 49 | - contains("time_scale") 50 | - title: Methods 51 | desc: __Functions that power the main anomalize functions.__ 52 | contents: 53 | - starts_with("decompose_") 54 | - iqr 55 | - gesd 56 | - title: Misc 57 | desc: __Miscellaneous functions and utilites.__ 58 | contents: 59 | - starts_with("prep_") 60 | - time_apply 61 | 62 | -------------------------------------------------------------------------------- /anomalize.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 4 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace 22 | 23 | UseNativePipeOperator: No 24 | 25 | SpellingDictionary: en_US 26 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | 3 | coverage: 4 | status: 5 | project: 6 | default: 7 | target: auto 8 | threshold: 1% 9 | patch: 10 | default: 11 | target: auto 12 | threshold: 1% 13 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## Test environments 2 | * local 
OS X install, R 3.5.3 3 | * ubuntu 14.04 (on travis-ci), R 3.5.3 4 | * win-builder (devel and release) 5 | 6 | ## R CMD check results 7 | 8 | 0 errors | 0 warnings | 0 notes 9 | 10 | * This is a new release. 11 | -------------------------------------------------------------------------------- /data-raw/tidyverse_cran_downloads.R: -------------------------------------------------------------------------------- 1 | library(dplyr) 2 | library(tibbletime) 3 | library(cranlogs) 4 | 5 | pkgs <- c( 6 | "tidyr", "lubridate", "dplyr", 7 | "broom", "tidyquant", "tidytext", 8 | "ggplot2", "purrr", "glue", 9 | "stringr", "forcats", "knitr", 10 | "readr", "tibble", "tidyverse" 11 | ) 12 | 13 | tidyverse_cran_downloads <- cran_downloads(pkgs, from = "2017-01-01", to = "2018-03-01") %>% 14 | group_by(package) %>% 15 | as_tbl_time(date) 16 | 17 | tidyverse_cran_downloads 18 | -------------------------------------------------------------------------------- /data/tidyverse_cran_downloads.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/business-science/anomalize/f5d37063c83bb0b4b4256aed81dead489414b89c/data/tidyverse_cran_downloads.rda -------------------------------------------------------------------------------- /man/anomalize-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/anomalize-package.R 3 | \docType{package} 4 | \name{anomalize-package} 5 | \alias{anomalize-package} 6 | \alias{_PACKAGE} 7 | \title{anomalize: Tidy Anomaly Detection} 8 | \description{ 9 | The 'anomalize' package enables a "tidy" workflow for detecting anomalies in data. 10 | The main functions are time_decompose(), anomalize(), and time_recompose(). 11 | When combined, it's quite simple to decompose time series, detect anomalies, 12 | and create bands separating the "normal" data from the anomalous data at scale (i.e. for multiple time series). 13 | Time series decomposition is used to remove trend and seasonal components via the time_decompose() function, 14 | and methods include seasonal decomposition of time series by Loess and 15 | seasonal decomposition by piecewise medians. The anomalize() function implements 16 | two methods for anomaly detection of residuals: an interquartile range (IQR) method 17 | and the generalized extreme studentized deviate (GESD) test. These methods are based on 18 | those used in the \code{forecast} package and the Twitter \code{AnomalyDetection} package. 19 | Refer to the associated functions for specific references for these methods.
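In practice, the workflow reads as a single pipe. A minimal sketch, assembled from the examples used throughout this documentation:

\preformatted{
library(dplyr)
library(anomalize)

tidyverse_cran_downloads \%>\%
  time_decompose(count, method = "stl") \%>\%  # remove season and trend
  anomalize(remainder, method = "iqr") \%>\%   # flag outliers in the remainder
  time_recompose()                            # rebuild bands around "normal" data
}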
20 | 21 | To learn more about \code{anomalize}, start with the vignettes: 22 | \code{browseVignettes(package = "anomalize")} 23 | } 24 | \seealso{ 25 | Useful links: 26 | \itemize{ 27 | \item \url{https://business-science.github.io/anomalize/} 28 | \item \url{https://github.com/business-science/anomalize} 29 | \item Report bugs at \url{https://github.com/business-science/anomalize/issues} 30 | } 31 | 32 | } 33 | \author{ 34 | \strong{Maintainer}: Matt Dancho \email{mdancho@business-science.io} 35 | 36 | Authors: 37 | \itemize{ 38 | \item Davis Vaughan \email{dvaughan@business-science.io} 39 | } 40 | 41 | } 42 | \keyword{internal} 43 | -------------------------------------------------------------------------------- /man/anomalize.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/anomalize.R 3 | \name{anomalize} 4 | \alias{anomalize} 5 | \title{Detect anomalies using the tidyverse} 6 | \usage{ 7 | anomalize( 8 | data, 9 | target, 10 | method = c("iqr", "gesd"), 11 | alpha = 0.05, 12 | max_anoms = 0.2, 13 | verbose = FALSE 14 | ) 15 | } 16 | \arguments{ 17 | \item{data}{A \code{tibble} or \code{tbl_time} object.} 18 | 19 | \item{target}{A column to apply the function to} 20 | 21 | \item{method}{The anomaly detection method. One of \code{"iqr"} or \code{"gesd"}. 22 | The IQR method is faster at the expense of possibly not being quite as accurate. 23 | The GESD method has the best properties for outlier detection, but is loop-based 24 | and therefore a bit slower.} 25 | 26 | \item{alpha}{Controls the width of the "normal" range. 27 | Lower values are more conservative while higher values are less prone 28 | to incorrectly classifying "normal" observations.} 29 | 30 | \item{max_anoms}{The maximum percent of anomalies permitted to be identified.} 31 | 32 | \item{verbose}{A boolean. If \code{TRUE}, will return a list containing useful information 33 | about the anomalies. If \code{FALSE}, just returns the data expanded with the anomalies and 34 | the lower (l1) and upper (l2) bounds.} 35 | } 36 | \value{ 37 | Returns a \code{tibble} / \code{tbl_time} object or list depending on the value of \code{verbose}. 38 | } 39 | \description{ 40 | The \code{anomalize()} function is used to detect outliers in a distribution 41 | with no trend or seasonality present. It takes the output of \code{\link[=time_decompose]{time_decompose()}}, 42 | which has been de-trended, and applies anomaly detection methods to identify outliers. 43 | } 44 | \details{ 45 | The return has three columns: 46 | "remainder_l1" (lower limit for anomalies), "remainder_l2" (upper limit for 47 | anomalies), and "anomaly" (Yes/No). 48 | 49 | Use \code{\link[=time_decompose]{time_decompose()}} to decompose a time series prior to performing 50 | anomaly detection with \code{anomalize()}. Typically, \code{anomalize()} is 51 | performed on the "remainder" of the time series decomposition. 52 | 53 | For non-time series data (data without trend), the \code{anomalize()} function can 54 | be used without time series decomposition. 55 | 56 | The \code{anomalize()} function uses two methods for outlier detection, 57 | each with benefits. 58 | 59 | \strong{IQR}: 60 | 61 | The IQR Method uses an interquartile range of 25\% and 75\% to establish a baseline distribution around 62 | the median. With the default \code{alpha = 0.05}, the limits are established by expanding 63 | the 25/75 baseline by an IQR Factor of 3 (3X).
The IQR Factor = 0.15 / alpha (hence 3X with alpha = 0.05). 64 | To increase the IQR Factor controlling the limits, decrease the alpha, which makes 65 | it more difficult to be an outlier. Increase alpha to make it easier to be an outlier. 66 | 67 | The IQR method is used in \href{https://github.com/robjhyndman/forecast}{\code{forecast::tsoutliers()}}. 68 | 69 | \strong{GESD}: 70 | 71 | The GESD Method (Generalized Extreme Studentized Deviate Test) progressively 72 | eliminates outliers using a Student's t-test that compares the test statistic to a critical value. 73 | Each time an outlier is removed, the test statistic is updated. Once the test statistic 74 | drops below the critical value, all outliers are considered removed. Because this method 75 | involves continuous updating via a loop, it is slower than the IQR method. However, it 76 | tends to be the best performing method for outlier removal. 77 | 78 | The GESD method is used in \href{https://github.com/twitter/AnomalyDetection}{\code{AnomalyDetection::AnomalyDetectionTs()}}. 79 | } 80 | \examples{ 81 | \dontrun{ 82 | library(dplyr) 83 | 84 | # Needed to pass CRAN check / This is loaded by default 85 | set_time_scale_template(time_scale_template()) 86 | 87 | tidyverse_cran_downloads \%>\% 88 | time_decompose(count, method = "stl") \%>\% 89 | anomalize(remainder, method = "iqr") 90 | } 91 | 92 | } 93 | \references{ 94 | \enumerate{ 95 | \item \href{https://stats.stackexchange.com/questions/69874/how-to-correct-outliers-once-detected-for-time-series-data-forecasting}{How to correct outliers once detected for time series data forecasting? Cross Validated, https://stats.stackexchange.com} 96 | \item \href{https://stats.stackexchange.com/questions/1142/simple-algorithm-for-online-outlier-detection-of-a-generic-time-series?}{Cross Validated: Simple algorithm for online outlier detection of a generic time series. Cross Validated, https://stats.stackexchange.com} 97 | \item \href{https://www.usenix.org/system/files/conference/hotcloud14/hotcloud14-vallis.pdf}{Owen S. Vallis, Jordan Hochenbaum and Arun Kejariwal (2014). A Novel Technique for Long-Term Anomaly Detection in the Cloud. Twitter Inc.} 98 | \item \href{https://github.com/twitter/AnomalyDetection}{Owen S. Vallis, Jordan Hochenbaum and Arun Kejariwal (2014). AnomalyDetection: Anomaly Detection Using Seasonal Hybrid Extreme Studentized Deviate Test. R package version 1.0.} 99 | \item Alex T.C. Lau (November/December 2015). GESD - A Robust and Effective Technique for Dealing with Multiple Outliers. ASTM Standardization News.
www.astm.org/sn 100 | } 101 | } 102 | \seealso{ 103 | Anomaly Detection Methods (Powers \code{anomalize}) 104 | \itemize{ 105 | \item \code{\link[=iqr]{iqr()}} 106 | \item \code{\link[=gesd]{gesd()}} 107 | } 108 | 109 | Time Series Anomaly Detection Functions (anomaly detection workflow): 110 | \itemize{ 111 | \item \code{\link[=time_decompose]{time_decompose()}} 112 | \item \code{\link[=time_recompose]{time_recompose()}} 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /man/anomalize_methods.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/anomalize_methods.R 3 | \name{anomalize_methods} 4 | \alias{anomalize_methods} 5 | \alias{iqr} 6 | \alias{gesd} 7 | \title{Methods that power anomalize()} 8 | \usage{ 9 | iqr(x, alpha = 0.05, max_anoms = 0.2, verbose = FALSE) 10 | 11 | gesd(x, alpha = 0.05, max_anoms = 0.2, verbose = FALSE) 12 | } 13 | \arguments{ 14 | \item{x}{A vector of numeric data.} 15 | 16 | \item{alpha}{Controls the width of the "normal" range. 17 | Lower values are more conservative while higher values are less prone 18 | to incorrectly classifying "normal" observations.} 19 | 20 | \item{max_anoms}{The maximum percent of anomalies permitted to be identified.} 21 | 22 | \item{verbose}{A boolean. If \code{TRUE}, will return a list containing useful information 23 | about the anomalies. If \code{FALSE}, just returns a vector of "Yes" / "No" values.} 24 | } 25 | \value{ 26 | Returns a character vector or list depending on the value of \code{verbose}. 27 | } 28 | \description{ 29 | Methods that power anomalize() 30 | } 31 | \examples{ 32 | 33 | set.seed(100) 34 | x <- rnorm(100) 35 | idx_outliers <- sample(100, size = 5) 36 | x[idx_outliers] <- x[idx_outliers] + 10 37 | 38 | iqr(x, alpha = 0.05, max_anoms = 0.2) 39 | iqr(x, alpha = 0.05, max_anoms = 0.2, verbose = TRUE) 40 | 41 | gesd(x, alpha = 0.05, max_anoms = 0.2) 42 | gesd(x, alpha = 0.05, max_anoms = 0.2, verbose = TRUE) 43 | 44 | 45 | } 46 | \references{ 47 | \itemize{ 48 | \item The IQR method is used in \href{https://github.com/robjhyndman/forecast/blob/master/R/clean.R}{\code{forecast::tsoutliers()}} 49 | \item The GESD method is used in Twitter's \href{https://github.com/twitter/AnomalyDetection}{\code{AnomalyDetection}} package and is also available as a function in \href{https://github.com/raunakms/GESD/blob/master/runGESD.R}{@raunakms's GESD method} 50 | } 51 | } 52 | \seealso{ 53 | \code{\link[=anomalize]{anomalize()}} 54 | } 55 | -------------------------------------------------------------------------------- /man/clean_anomalies.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/anomalize_clean.R 3 | \name{clean_anomalies} 4 | \alias{clean_anomalies} 5 | \title{Clean anomalies from anomalized data} 6 | \usage{ 7 | clean_anomalies(data) 8 | } 9 | \arguments{ 10 | \item{data}{A \code{tibble} or \code{tbl_time} object.} 11 | } 12 | \value{ 13 | Returns a \code{tibble} / \code{tbl_time} object with a new column "observed_cleaned". 14 | } 15 | \description{ 16 | Clean anomalies from anomalized data 17 | } 18 | \details{ 19 | The \code{clean_anomalies()} function is used to replace outliers with the seasonal and trend components. 20 | This is often desirable when forecasting with noisy time series data to improve trend detection.
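Conceptually, the replacement works like this (an illustrative sketch of the cleaning logic, not a verbatim excerpt of the implementation):

\preformatted{
# For rows flagged as anomalies, the observed value is swapped for the
# sum of the decomposition's seasonal and trend components:
# observed_cleaned = season + trend          # STL method
# observed_cleaned = season + median_spans   # Twitter method
}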
21 | 22 | To clean anomalies, the input data must be detrended with \code{time_decompose()} and anomalized with \code{anomalize()}. 23 | The data can also be recomposed with \code{time_recompose()}. 24 | } 25 | \examples{ 26 | 27 | \dontrun{ 28 | library(dplyr) 29 | 30 | # Needed to pass CRAN check / This is loaded by default 31 | set_time_scale_template(time_scale_template()) 32 | 33 | data(tidyverse_cran_downloads) 34 | 35 | tidyverse_cran_downloads \%>\% 36 | time_decompose(count, method = "stl") \%>\% 37 | anomalize(remainder, method = "iqr") \%>\% 38 | clean_anomalies() 39 | } 40 | 41 | } 42 | \seealso{ 43 | Time Series Anomaly Detection Functions (anomaly detection workflow): 44 | \itemize{ 45 | \item \code{\link[=time_decompose]{time_decompose()}} 46 | \item \code{\link[=anomalize]{anomalize()}} 47 | \item \code{\link[=time_recompose]{time_recompose()}} 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /man/decompose_methods.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/time_decompose_methods.R 3 | \name{decompose_methods} 4 | \alias{decompose_methods} 5 | \alias{decompose_twitter} 6 | \alias{decompose_stl} 7 | \title{Methods that power time_decompose()} 8 | \usage{ 9 | decompose_twitter( 10 | data, 11 | target, 12 | frequency = "auto", 13 | trend = "auto", 14 | message = TRUE 15 | ) 16 | 17 | decompose_stl(data, target, frequency = "auto", trend = "auto", message = TRUE) 18 | } 19 | \arguments{ 20 | \item{data}{A \code{tibble} or \code{tbl_time} object.} 21 | 22 | \item{target}{A column to apply the function to} 23 | 24 | \item{frequency}{Controls the seasonal adjustment (removal of seasonality). 25 | Input can be either "auto", a time-based definition (e.g. "1 week"), 26 | or a numeric number of observations per frequency (e.g. 10). 27 | Refer to \code{\link[=time_frequency]{time_frequency()}}.} 28 | 29 | \item{trend}{Controls the trend component. 30 | For stl, the trend controls the sensitivity of the lowess smoother, which is used to extract the trend, leaving the remainder. 31 | For twitter, the trend controls the period width of the medians, which are used to remove the trend and center the remainder.} 32 | 33 | \item{message}{A boolean. If \code{TRUE}, will output information related to \code{tbl_time} conversions, frequencies, 34 | and trend / median spans (if applicable).} 35 | } 36 | \value{ 37 | A \code{tbl_time} object containing the time series decomposition.
38 | } 39 | \description{ 40 | Methods that power time_decompose() 41 | } 42 | \examples{ 43 | 44 | library(dplyr) 45 | 46 | tidyverse_cran_downloads \%>\% 47 | ungroup() \%>\% 48 | filter(package == "tidyquant") \%>\% 49 | decompose_stl(count) 50 | 51 | 52 | } 53 | \references{ 54 | \itemize{ 55 | \item The "twitter" method is used in Twitter's \href{https://github.com/twitter/AnomalyDetection}{\code{AnomalyDetection} package} 56 | } 57 | } 58 | \seealso{ 59 | \code{\link[=time_decompose]{time_decompose()}} 60 | } 61 | -------------------------------------------------------------------------------- /man/figures/README-tidyverse_anoms_1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/business-science/anomalize/f5d37063c83bb0b4b4256aed81dead489414b89c/man/figures/README-tidyverse_anoms_1-1.png -------------------------------------------------------------------------------- /man/figures/README-unnamed-chunk-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/business-science/anomalize/f5d37063c83bb0b4b4256aed81dead489414b89c/man/figures/README-unnamed-chunk-3-1.png -------------------------------------------------------------------------------- /man/figures/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/business-science/anomalize/f5d37063c83bb0b4b4256aed81dead489414b89c/man/figures/logo.png -------------------------------------------------------------------------------- /man/plot_anomalies.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot_anomalies.R 3 | \name{plot_anomalies} 4 | \alias{plot_anomalies} 5 | \title{Visualize the anomalies in one or multiple time series} 6 | \usage{ 7 | plot_anomalies( 8 | data, 9 | time_recomposed = FALSE, 10 | ncol = 1, 11 | color_no = "#2c3e50", 12 | color_yes = "#e31a1c", 13 | fill_ribbon = "grey70", 14 | alpha_dots = 1, 15 | alpha_circles = 1, 16 | alpha_ribbon = 1, 17 | size_dots = 1.5, 18 | size_circles = 4 19 | ) 20 | } 21 | \arguments{ 22 | \item{data}{A \code{tibble} or \code{tbl_time} object.} 23 | 24 | \item{time_recomposed}{A boolean. If \code{TRUE}, will use the \code{time_recompose()} bands to 25 | place bands as approximate limits around the "normal" data.} 26 | 27 | \item{ncol}{Number of columns to display. Set to 1 for single column by default.} 28 | 29 | \item{color_no}{Color for non-anomalous data.} 30 | 31 | \item{color_yes}{Color for anomalous data.} 32 | 33 | \item{fill_ribbon}{Fill color for the time_recomposed ribbon.} 34 | 35 | \item{alpha_dots}{Controls the transparency of the dots. Reduce when too many dots on the screen.} 36 | 37 | \item{alpha_circles}{Controls the transparency of the circles that identify anomalies.} 38 | 39 | \item{alpha_ribbon}{Controls the transparency of the time_recomposed ribbon.} 40 | 41 | \item{size_dots}{Controls the size of the dots.} 42 | 43 | \item{size_circles}{Controls the size of the circles that identify anomalies.} 44 | } 45 | \value{ 46 | Returns a \code{ggplot} object. 47 | } 48 | \description{ 49 | Visualize the anomalies in one or multiple time series 50 | } 51 | \details{ 52 | Plotting function for visualizing anomalies on one or more time series. 53 | Multiple time series must be grouped using \code{dplyr::group_by()}. 
54 | } 55 | \examples{ 56 | 57 | \dontrun{ 58 | library(dplyr) 59 | library(ggplot2) 60 | 61 | 62 | #### SINGLE TIME SERIES #### 63 | tidyverse_cran_downloads \%>\% 64 | filter(package == "tidyquant") \%>\% 65 | ungroup() \%>\% 66 | time_decompose(count, method = "stl") \%>\% 67 | anomalize(remainder, method = "iqr") \%>\% 68 | time_recompose() \%>\% 69 | plot_anomalies(time_recomposed = TRUE) 70 | 71 | 72 | #### MULTIPLE TIME SERIES #### 73 | tidyverse_cran_downloads \%>\% 74 | time_decompose(count, method = "stl") \%>\% 75 | anomalize(remainder, method = "iqr") \%>\% 76 | time_recompose() \%>\% 77 | plot_anomalies(time_recomposed = TRUE, ncol = 3) 78 | } 79 | 80 | } 81 | \seealso{ 82 | \code{\link[=plot_anomaly_decomposition]{plot_anomaly_decomposition()}} 83 | } 84 | -------------------------------------------------------------------------------- /man/plot_anomaly_decomposition.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot_anomaly_decomposition.R 3 | \name{plot_anomaly_decomposition} 4 | \alias{plot_anomaly_decomposition} 5 | \title{Visualize the time series decomposition with anomalies shown} 6 | \usage{ 7 | plot_anomaly_decomposition( 8 | data, 9 | ncol = 1, 10 | color_no = "#2c3e50", 11 | color_yes = "#e31a1c", 12 | alpha_dots = 1, 13 | alpha_circles = 1, 14 | size_dots = 1.5, 15 | size_circles = 4, 16 | strip.position = "right" 17 | ) 18 | } 19 | \arguments{ 20 | \item{data}{A \code{tibble} or \code{tbl_time} object.} 21 | 22 | \item{ncol}{Number of columns to display. Set to 1 for single column by default.} 23 | 24 | \item{color_no}{Color for non-anomalous data.} 25 | 26 | \item{color_yes}{Color for anomalous data.} 27 | 28 | \item{alpha_dots}{Controls the transparency of the dots. Reduce when too many dots on the screen.} 29 | 30 | \item{alpha_circles}{Controls the transparency of the circles that identify anomalies.} 31 | 32 | \item{size_dots}{Controls the size of the dots.} 33 | 34 | \item{size_circles}{Controls the size of the circles that identify anomalies.} 35 | 36 | \item{strip.position}{Controls the placement of the strip that identifies the time series decomposition components.} 37 | } 38 | \value{ 39 | Returns a \code{ggplot} object. 40 | } 41 | \description{ 42 | Visualize the time series decomposition with anomalies shown 43 | } 44 | \details{ 45 | The first step in reviewing the anomaly detection process is to evaluate 46 | a single time series to observe how the algorithm is selecting anomalies. 47 | The \code{plot_anomaly_decomposition()} function is used to gain 48 | an understanding as to whether or not the method is detecting anomalies correctly and 49 | whether parameters such as the decomposition method, anomalize method, 50 | alpha, frequency, and so on should be adjusted.
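For instance, \code{alpha} can be tuned and the result re-plotted (a sketch adapted from the example below; recall that lowering \code{alpha} widens the "normal" range, so fewer points are flagged):

\preformatted{
library(dplyr)

tidyverse_cran_downloads \%>\%
  filter(package == "tidyquant") \%>\%
  ungroup() \%>\%
  time_decompose(count, method = "stl") \%>\%
  anomalize(remainder, method = "iqr", alpha = 0.025) \%>\%  # wider limits
  plot_anomaly_decomposition()
}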
51 | } 52 | \examples{ 53 | 54 | library(dplyr) 55 | library(ggplot2) 56 | 57 | tidyverse_cran_downloads \%>\% 58 | filter(package == "tidyquant") \%>\% 59 | ungroup() \%>\% 60 | time_decompose(count, method = "stl") \%>\% 61 | anomalize(remainder, method = "iqr") \%>\% 62 | plot_anomaly_decomposition() 63 | 64 | } 65 | \seealso{ 66 | \code{\link[=plot_anomalies]{plot_anomalies()}} 67 | } 68 | -------------------------------------------------------------------------------- /man/prep_tbl_time.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/prep_tbl_time.R 3 | \name{prep_tbl_time} 4 | \alias{prep_tbl_time} 5 | \title{Automatically create tibbletime objects from tibbles} 6 | \usage{ 7 | prep_tbl_time(data, message = FALSE) 8 | } 9 | \arguments{ 10 | \item{data}{A \code{tibble}.} 11 | 12 | \item{message}{A boolean. If \code{TRUE}, returns a message indicating any 13 | conversion details important to know during the conversion to \code{tbl_time} class.} 14 | } 15 | \value{ 16 | Returns a \code{tibbletime} object of class \code{tbl_time}. 17 | } 18 | \description{ 19 | Automatically create tibbletime objects from tibbles 20 | } 21 | \details{ 22 | Detects a date or datetime index column and automatically converts the input to a \code{tbl_time} object. 23 | } 24 | \examples{ 25 | 26 | library(dplyr) 27 | library(tibbletime) 28 | 29 | data_tbl <- tibble( 30 | date = seq.Date(from = as.Date("2018-01-01"), by = "day", length.out = 10), 31 | value = rnorm(10) 32 | ) 33 | 34 | prep_tbl_time(data_tbl) 35 | 36 | } 37 | -------------------------------------------------------------------------------- /man/tidyverse_cran_downloads.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tidyverse_cran_downloads.R 3 | \docType{data} 4 | \name{tidyverse_cran_downloads} 5 | \alias{tidyverse_cran_downloads} 6 | \title{Downloads of various "tidyverse" packages from CRAN} 7 | \format{ 8 | A \code{grouped_tbl_time} object with 6,375 rows and 3 variables: 9 | \describe{ 10 | \item{date}{Date of the daily observation} 11 | \item{count}{Number of downloads that day} 12 | \item{package}{The package corresponding to the daily download number} 13 | } 14 | } 15 | \source{ 16 | The package downloads come from CRAN by way of the \code{cranlogs} package.
17 | } 18 | \usage{ 19 | tidyverse_cran_downloads 20 | } 21 | \description{ 22 | A dataset containing the daily download counts from 2017-01-01 to 2018-03-01 23 | for the following tidyverse packages: 24 | \itemize{ 25 | \item \code{tidyr} 26 | \item \code{lubridate} 27 | \item \code{dplyr} 28 | \item \code{broom} 29 | \item \code{tidyquant} 30 | \item \code{tidytext} 31 | \item \code{ggplot2} 32 | \item \code{purrr} 33 | \item \code{glue} 34 | \item \code{stringr} 35 | \item \code{forcats} 36 | \item \code{knitr} 37 | \item \code{readr} 38 | \item \code{tibble} 39 | \item \code{tidyverse} 40 | } 41 | } 42 | \keyword{datasets} 43 | -------------------------------------------------------------------------------- /man/time_apply.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/time_apply.R 3 | \name{time_apply} 4 | \alias{time_apply} 5 | \title{Apply a function to a time series by period} 6 | \usage{ 7 | time_apply( 8 | data, 9 | target, 10 | period, 11 | .fun, 12 | ..., 13 | start_date = NULL, 14 | side = "end", 15 | clean = FALSE, 16 | message = TRUE 17 | ) 18 | } 19 | \arguments{ 20 | \item{data}{A \code{tibble} with a date or datetime index.} 21 | 22 | \item{target}{A column to apply the function to} 23 | 24 | \item{period}{A time-based definition (e.g. "1 week"), 25 | or a numeric number of observations per frequency (e.g. 10). 26 | See \code{\link[tibbletime:collapse_by]{tibbletime::collapse_by()}} for period notation.} 27 | 28 | \item{.fun}{A function to apply (e.g. \code{median})} 29 | 30 | \item{...}{Additional parameters passed to the function, \code{.fun}} 31 | 32 | \item{start_date}{Optional argument used to 33 | specify the start date for the 34 | first group. The default is to start at the closest period boundary 35 | below the minimum date in the supplied index.} 36 | 37 | \item{side}{Whether to return the date at the beginning or the end of 38 | the new period. By default, the "end" of the period. 39 | Use "start" to change to the start of the period.} 40 | 41 | \item{clean}{Whether or not to round the collapsed index up / down to the next 42 | period boundary. The decision to round up / down is controlled by the side 43 | argument.} 44 | 45 | \item{message}{A boolean. If \code{message = TRUE}, the frequency used is output 46 | along with the units in the scale of the data.} 47 | } 48 | \value{ 49 | Returns a \code{tibbletime} object of class \code{tbl_time}. 50 | } 51 | \description{ 52 | Apply a function to a time series by period 53 | } 54 | \details{ 55 | Applies a function to a time series over a time-based period. This is useful when you want to 56 | compare the observation values to aggregated values such as \code{mean()} or \code{median()} 57 | computed over a set time-based period. The returned output extends the 58 | length of the data frame so the differences can easily be computed.
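For instance (a sketch; the name of the appended column is assumed here to be \code{time_apply} — check the returned tibble in your session):

\preformatted{
library(dplyr)

tidyverse_cran_downloads \%>\%
  time_apply(count, period = "1 week", .fun = mean, na.rm = TRUE) \%>\%
  mutate(diff_from_weekly_mean = count - time_apply)  # assumed column name
}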
59 | } 60 | \examples{ 61 | 62 | library(dplyr) 63 | 64 | # Basic Usage 65 | tidyverse_cran_downloads \%>\% 66 | time_apply(count, period = "1 week", .fun = mean, na.rm = TRUE) 67 | 68 | } 69 | -------------------------------------------------------------------------------- /man/time_decompose.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/time_decompose.R 3 | \name{time_decompose} 4 | \alias{time_decompose} 5 | \title{Decompose a time series in preparation for anomaly detection} 6 | \usage{ 7 | time_decompose( 8 | data, 9 | target, 10 | method = c("stl", "twitter"), 11 | frequency = "auto", 12 | trend = "auto", 13 | ..., 14 | merge = FALSE, 15 | message = TRUE 16 | ) 17 | } 18 | \arguments{ 19 | \item{data}{A \code{tibble} or \code{tbl_time} object.} 20 | 21 | \item{target}{A column to apply the function to} 22 | 23 | \item{method}{The time series decomposition method. One of \code{"stl"} or \code{"twitter"}. 24 | The STL method uses seasonal decomposition (see \code{\link[=decompose_stl]{decompose_stl()}}). 25 | The Twitter method uses \code{trend} to remove the trend (see \code{\link[=decompose_twitter]{decompose_twitter()}}).} 26 | 27 | \item{frequency}{Controls the seasonal adjustment (removal of seasonality). 28 | Input can be either "auto", a time-based definition (e.g. "1 week"), 29 | or a numeric number of observations per frequency (e.g. 10). 30 | Refer to \code{\link[=time_frequency]{time_frequency()}}.} 31 | 32 | \item{trend}{Controls the trend component. 33 | For stl, the trend controls the sensitivity of the lowess smoother, which is used to extract the trend, leaving the remainder. 34 | For twitter, the trend controls the period width of the medians, which are used to remove the trend and center the remainder.} 35 | 36 | \item{...}{Additional parameters passed to the underlying method functions.} 37 | 38 | \item{merge}{A boolean. \code{FALSE} by default. If \code{TRUE}, will append results to the original data.} 39 | 40 | \item{message}{A boolean. If \code{TRUE}, will output information related to \code{tbl_time} conversions, frequencies, 41 | and trend / median spans (if applicable).} 42 | } 43 | \value{ 44 | Returns a \code{tbl_time} object. 45 | } 46 | \description{ 47 | Decompose a time series in preparation for anomaly detection 48 | } 49 | \details{ 50 | The \code{time_decompose()} function generates a time series decomposition on 51 | \code{tbl_time} objects. The function is "tidy" in the sense that it works 52 | on data frames. It is designed to work with time-based data, and as such 53 | must have a column that contains date or datetime information. The function 54 | also works with grouped data. The function implements several methods 55 | of time series decomposition, each with benefits. 56 | 57 | \strong{STL}: 58 | 59 | The STL method (\code{method = "stl"}) implements time series decomposition using 60 | the underlying \code{\link[=decompose_stl]{decompose_stl()}} function. If you are familiar with \code{\link[stats:stl]{stats::stl()}}, 61 | the function is a "tidy" version that is designed to work with \code{tbl_time} objects. 62 | The decomposition separates the "season" and "trend" components from 63 | the "observed" values leaving the "remainder" for anomaly detection. 64 | The user can control two parameters: \code{frequency} and \code{trend}. 65 | The \code{frequency} parameter adjusts the "season" component that is removed 66 | from the "observed" values.
The \code{trend} parameter adjusts the 67 | trend window (\code{t.window} parameter from \code{stl()}) that is used. 68 | The user may supply both \code{frequency} 69 | and \code{trend} as time-based durations (e.g. "90 days"), numeric values 70 | (e.g. 180), or "auto", which predetermines the frequency and/or trend 71 | based on the scale of the time series. 72 | 73 | \strong{Twitter}: 74 | 75 | The Twitter method (\code{method = "twitter"}) implements time series decomposition using 76 | the methodology from the Twitter \href{https://github.com/twitter/AnomalyDetection}{AnomalyDetection} package. 77 | The decomposition separates the "seasonal" component and then removes 78 | the median data, which is a different approach than the STL method for removing 79 | the trend. This approach works very well for low-growth + high seasonality data. 80 | STL may be a better approach when trend is a large factor. 81 | The user can control two parameters: \code{frequency} and \code{trend}. 82 | The \code{frequency} parameter adjusts the "season" component that is removed 83 | from the "observed" values. The \code{trend} parameter adjusts the 84 | period width of the median spans that are used. The user may supply both \code{frequency} 85 | and \code{trend} as time-based durations (e.g. "90 days"), numeric values 86 | (e.g. 180), or "auto", which predetermines the frequency and/or median spans 87 | based on the scale of the time series. 88 | } 89 | \examples{ 90 | 91 | library(dplyr) 92 | 93 | # Basic Usage 94 | tidyverse_cran_downloads \%>\% 95 | time_decompose(count, method = "stl") 96 | 97 | # twitter 98 | tidyverse_cran_downloads \%>\% 99 | time_decompose(count, 100 | method = "twitter", 101 | frequency = "1 week", 102 | trend = "2 months", 103 | merge = TRUE, 104 | message = FALSE) 105 | 106 | } 107 | \references{ 108 | \enumerate{ 109 | \item CLEVELAND, R. B., CLEVELAND, W. S., MCRAE, J. E., AND TERPENNING, I. 110 | STL: A Seasonal-Trend Decomposition Procedure Based on Loess. Journal of Official Statistics, Vol. 6, No. 1 (1990), pp. 3-73. 111 | \item \href{https://www.usenix.org/system/files/conference/hotcloud14/hotcloud14-vallis.pdf}{Owen S. Vallis, Jordan Hochenbaum and Arun Kejariwal (2014). A Novel Technique for Long-Term Anomaly Detection in the Cloud. Twitter Inc.} 112 | \item \href{https://github.com/twitter/AnomalyDetection}{Owen S. Vallis, Jordan Hochenbaum and Arun Kejariwal (2014). AnomalyDetection: Anomaly Detection Using Seasonal Hybrid Extreme Studentized Deviate Test.
R package version 1.0.} 113 | } 114 | } 115 | \seealso{ 116 | Decomposition Methods (Powers \code{time_decompose}) 117 | \itemize{ 118 | \item \code{\link[=decompose_stl]{decompose_stl()}} 119 | \item \code{\link[=decompose_twitter]{decompose_twitter()}} 120 | } 121 | 122 | Time Series Anomaly Detection Functions (anomaly detection workflow): 123 | \itemize{ 124 | \item \code{\link[=anomalize]{anomalize()}} 125 | \item \code{\link[=time_recompose]{time_recompose()}} 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /man/time_frequency.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/time_frequency.R 3 | \name{time_frequency} 4 | \alias{time_frequency} 5 | \alias{time_trend} 6 | \title{Generate a time series frequency from a periodicity} 7 | \usage{ 8 | time_frequency(data, period = "auto", message = TRUE) 9 | 10 | time_trend(data, period = "auto", message = TRUE) 11 | } 12 | \arguments{ 13 | \item{data}{A \code{tibble} with a date or datetime index.} 14 | 15 | \item{period}{Either "auto", a time-based definition (e.g. "14 days"), 16 | or a numeric number of observations per frequency (e.g. 10). 17 | See \code{\link[tibbletime:collapse_by]{tibbletime::collapse_by()}} for period notation.} 18 | 19 | \item{message}{A boolean. If \code{message = TRUE}, the frequency used is output 20 | along with the units in the scale of the data.} 21 | } 22 | \value{ 23 | Returns a scalar numeric value indicating the number of observations in the frequency or trend span. 24 | } 25 | \description{ 26 | Generate a time series frequency from a periodicity 27 | } 28 | \details{ 29 | A frequency is loosely defined as the number of observations that comprise a cycle 30 | in a data set. The trend is loosely defined as the time span that can 31 | be aggregated across to visualize the central tendency of the data. 32 | It's often easiest to think of frequency and trend in terms of the time-based units 33 | that the data is already in. \strong{This is what \code{time_frequency()} and \code{time_trend()} 34 | enable: using time-based periods to define the frequency or trend.} 35 | 36 | \strong{Frequency}: 37 | 38 | As an example, a weekly cycle is often 5-days (for working 39 | days) or 7-days (for calendar days). Rather than specify a frequency of 5 or 7, 40 | the user can specify \code{period = "1 week"}, and 41 | \code{time_frequency()} will detect the scale of the time series and return 5 or 7 42 | based on the actual data. 43 | 44 | The \code{period} argument has three basic options for returning a frequency. 45 | Options include: 46 | \itemize{ 47 | \item \code{"auto"}: A target frequency is determined using a pre-defined template (see \code{template} below). 48 | \item \verb{time-based duration}: (e.g. "1 week" or "2 quarters" per cycle) 49 | \item \verb{numeric number of observations}: (e.g. 5 for 5 observations per cycle) 50 | } 51 | 52 | The \code{template} argument is only used when \code{period = "auto"}. The template is a tibble 53 | of three features: \code{time_scale}, \code{frequency}, and \code{trend}. The algorithm will inspect 54 | the scale of the time series and select the best frequency that matches the scale and 55 | number of observations per target frequency. A frequency is then chosen to be the 56 | best match. The predefined template is stored in the function \code{time_scale_template()}.
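The template can be fetched, tweaked, and re-registered with the accessors documented under \code{time_scale_template()} (a sketch; the specific frequency tweak shown is hypothetical):

\preformatted{
template <- time_scale_template()
# Hypothetical tweak: use a 2-week frequency for day-scaled data
template$frequency[template$time_scale == "day"] <- "2 weeks"
set_time_scale_template(template)
}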
57 | In this way, the user can supply his or her own template, changing the frequency and trend values 58 | in the data frame and registering the result with \code{set_time_scale_template()}. 59 | 60 | \strong{Trend}: 61 | 62 | As an example, the trend of daily data is often best aggregated by evaluating 63 | the moving average over a quarter or a month span. Rather than specify the number 64 | of days in a quarter or month, the user can specify "1 quarter" or "1 month", 65 | and the \code{time_trend()} function will return the correct number of observations 66 | per trend cycle. In addition, there is an option, \code{period = "auto"}, to 67 | auto-detect an appropriate trend span depending on the data. The \code{template} 68 | is used to define the appropriate trend span. 69 | } 70 | \examples{ 71 | 72 | library(dplyr) 73 | 74 | data(tidyverse_cran_downloads) 75 | 76 | #### FREQUENCY DETECTION #### 77 | 78 | # period = "auto" 79 | tidyverse_cran_downloads \%>\% 80 | filter(package == "tidyquant") \%>\% 81 | ungroup() \%>\% 82 | time_frequency(period = "auto") 83 | 84 | time_scale_template() 85 | 86 | # period = "1 month" 87 | tidyverse_cran_downloads \%>\% 88 | filter(package == "tidyquant") \%>\% 89 | ungroup() \%>\% 90 | time_frequency(period = "1 month") 91 | 92 | #### TREND DETECTION #### 93 | 94 | tidyverse_cran_downloads \%>\% 95 | filter(package == "tidyquant") \%>\% 96 | ungroup() \%>\% 97 | time_trend(period = "auto") 98 | } 99 | -------------------------------------------------------------------------------- /man/time_recompose.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/time_recompose.R 3 | \name{time_recompose} 4 | \alias{time_recompose} 5 | \title{Recompose bands separating anomalies from "normal" observations} 6 | \usage{ 7 | time_recompose(data) 8 | } 9 | \arguments{ 10 | \item{data}{A \code{tibble} or \code{tbl_time} object that has been 11 | processed with \code{time_decompose()} and \code{anomalize()}.} 12 | } 13 | \value{ 14 | Returns a \code{tbl_time} object. 15 | } 16 | \description{ 17 | Recompose bands separating anomalies from "normal" observations 18 | } 19 | \details{ 20 | The \code{time_recompose()} function is used to generate bands around the 21 | "normal" levels of observed values. The function uses the remainder_l1 22 | and remainder_l2 levels produced during the \code{\link[=anomalize]{anomalize()}} step 23 | and the season and trend/median_spans values from the \code{\link[=time_decompose]{time_decompose()}} 24 | step to reconstruct bands around the normal values. 25 | 26 | The following key names are required: observed:remainder from the 27 | \code{time_decompose()} step and remainder_l1 and remainder_l2 from the 28 | \code{anomalize()} step.
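In essence, the bands are rebuilt by adding the remainder limits back onto the recomposed series (an illustrative sketch of the band logic, not a verbatim excerpt of the implementation):

\preformatted{
# recomposed_l1 = season + trend + remainder_l1   # lower band
# recomposed_l2 = season + trend + remainder_l2   # upper band
# (for the Twitter method, median_spans takes the place of trend)
}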
29 | } 30 | \examples{ 31 | 32 | library(dplyr) 33 | 34 | data(tidyverse_cran_downloads) 35 | 36 | # Basic Usage 37 | tidyverse_cran_downloads \%>\% 38 | time_decompose(count, method = "stl") \%>\% 39 | anomalize(remainder, method = "iqr") \%>\% 40 | time_recompose() 41 | 42 | 43 | } 44 | \seealso{ 45 | Time Series Anomaly Detection Functions (anomaly detection workflow): 46 | \itemize{ 47 | \item \code{\link[=time_decompose]{time_decompose()}} 48 | \item \code{\link[=anomalize]{anomalize()}} 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /man/time_scale_template.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/time_scale_template.R 3 | \name{set_time_scale_template} 4 | \alias{set_time_scale_template} 5 | \alias{get_time_scale_template} 6 | \alias{time_scale_template} 7 | \title{Get and modify time scale template} 8 | \usage{ 9 | set_time_scale_template(data) 10 | 11 | get_time_scale_template() 12 | 13 | time_scale_template() 14 | } 15 | \arguments{ 16 | \item{data}{A \code{tibble} with a "time_scale", "frequency", and "trend" columns.} 17 | } 18 | \description{ 19 | Get and modify time scale template 20 | } 21 | \details{ 22 | Used to get and set the time scale template, which is used by \code{time_frequency()} 23 | and \code{time_trend()} when \code{period = "auto"}. 24 | } 25 | \examples{ 26 | 27 | get_time_scale_template() 28 | 29 | set_time_scale_template(time_scale_template()) 30 | 31 | } 32 | \seealso{ 33 | \code{\link[=time_frequency]{time_frequency()}}, \code{\link[=time_trend]{time_trend()}} 34 | } 35 | -------------------------------------------------------------------------------- /pkgdown/extra.css: -------------------------------------------------------------------------------- 1 | 2 | .navbar-brand { 3 | color: #FFFFFF !important; 4 | } 5 | 6 | .nav-link { 7 | color: #FFFFFF !important; 8 | } 9 | 10 | .navbar-dark .navbar-nav .active>.nav-link { 11 | background-color: #18bc9c; 12 | } 13 | 14 | pre.downlit.sourceCode{ 15 | border-color: #7daad7 !important; 16 | border-radius: 3px; 17 | box-shadow: 2px 2px 2px #999; 18 | } 19 | 20 | .navbar-dark input[type="search"] { 21 | background-color:white; 22 | color: #2c3e50; 23 | } 24 | 25 | a { 26 | color: #18bc9c; 27 | } 28 | 29 | code a:any-link { 30 | color: #18bc9c !important; 31 | text-decoration-color: #919aa1; 32 | } 33 | 34 | h1, h2, h3, h4 { 35 | padding-top: 20px; 36 | } 37 | 38 | body { 39 | font-weight: 400 !important; 40 | } 41 | 42 | 43 | thead { 44 | font-size: 20px; 45 | } 46 | 47 | 48 | div.comparison thead tr th:first-child, 49 | div.comparison tbody tr td:first-child { 50 | width: 12em; 51 | min-width: 12em; 52 | max-width: 12em; 53 | word-break: break-all; 54 | } 55 | 56 | div.comparison table { 57 | border-collapse: collapse; 58 | } 59 | 60 | div.comparison tr { 61 | border-color: #b4bcc2; 62 | border: solid; 63 | border-width: 1px 0; 64 | } 65 | 66 | div.comparison .header { 67 | border-color: #b4bcc2; 68 | border: solid; 69 | border-width: 2px 0; 70 | } 71 | 72 | .ref-index h3 { 73 | color: #18bc9c; 74 | } 75 | 76 | 77 | /*-- scss:defaults --*/ 78 | 79 | .navbar { 80 | background-color: #2C3E50 !important; 81 | } 82 | 83 | 84 | /* sidebar formatting */ 85 | 86 | .sidebar a.nav-link { 87 | font-size: 14.4px; 88 | font-weight: 400; 89 | } 90 | 91 | .sidebar code:not(.sourceCode) { 92 | font-size: 11px !important; 93 | } 94 | 95 | 
.sidebar-item-container .text-start { 96 | font-weight: 600; 97 | font-size: 14.4px !important; 98 | } 99 | 100 | .sidebar-item-text { 101 | /*color: rgba(60, 60, 60, 0.7);*/ 102 | font-weight: 500; 103 | font-size: 14px; 104 | line-height: 22px; 105 | } 106 | 107 | .sidebar-item { 108 | margin-top: 0px; 109 | } 110 | 111 | .sidebar-item-section { 112 | padding-top: 16px; 113 | } 114 | 115 | .sidebar-section { 116 | padding-left: 0px !important; 117 | } 118 | 119 | .sidebar-item-section .sidebar-item-section { 120 | padding-top: 0px; 121 | padding-left: 10px; 122 | } 123 | 124 | 125 | /* navbar formatting */ 126 | 127 | @media (max-device-width: 600px) { 128 | .navbar { 129 | padding-top: 1rem !important; 130 | padding-bottom: 1rem !important; 131 | } 132 | .navbar-title { 133 | font-size: 0.8rem !important; 134 | } 135 | } 136 | 137 | 138 | .cell { 139 | margin-bottom: 1rem; 140 | } 141 | 142 | .cell > .sourceCode { 143 | margin-bottom: 0; 144 | } 145 | 146 | .cell-output > pre { 147 | margin-bottom: 0; 148 | } 149 | 150 | .cell-output > pre, .cell-output > .sourceCode > pre, .cell-output-stdout > pre { 151 | margin-left: 0.8rem; 152 | margin-top: 0; 153 | background: none; 154 | border-left: 2px solid #18bc9c; 155 | border-top-left-radius: 0; 156 | border-top-right-radius: 0; 157 | } 158 | 159 | .cell-output > .sourceCode { 160 | border: none; 161 | background: none; 162 | margin-top: 0; 163 | } 164 | 165 | .cell-output > div { 166 | display: inline-block; 167 | } 168 | 169 | div.description { 170 | padding-left: 2px; 171 | padding-top: 5px; 172 | font-style: italic; 173 | font-size: 135%; 174 | opacity: 70%; 175 | } 176 | 177 | /* show_doc signature */ 178 | blockquote > pre { 179 | font-size: 14px; 180 | } 181 | 182 | .table { 183 | font-size: 16px; 184 | /* disable striped tables */ 185 | --bs-table-striped-bg: var(--bs-table-bg); 186 | } 187 | 188 | .quarto-figure-center > figure > figcaption { 189 | text-align: center; 190 | } 191 | 192 | .figure-caption { 193 | font-size: 75%; 194 | font-style: italic; 195 | } 196 | 197 | /* new */ 198 | // @font-face { 199 | // font-family: 'Inter'; 200 | // src: url('./assets/Inter-VariableFont.ttf') format('ttf') 201 | // } 202 | 203 | :root { 204 | --primary: #2c3350; 205 | --secondary: #18bc9c; 206 | } 207 | 208 | html, body { 209 | color: #374151; 210 | font-family: 'Inter', sans-serif; 211 | } 212 | 213 | header { 214 | transform: translateY(0) !important; 215 | } 216 | 217 | #title-block-header { 218 | margin-block-end: 2rem; 219 | } 220 | 221 | #quarto-sidebar { 222 | top: 62px !important; 223 | z-index: 100; 224 | } 225 | 226 | .content a { 227 | color: #18bc9c; 228 | text-decoration: none; 229 | font-weight: 600; 230 | border-bottom: 1px solid var(--secondary); 231 | } 232 | 233 | .content a:hover { 234 | border-bottom: 2px solid var(--secondary); 235 | } 236 | 237 | a > code { 238 | background-color: transparent !important; 239 | } 240 | 241 | a > code:hover { 242 | color: var(--primary) !important; 243 | } 244 | 245 | 246 | .aa-SubmitIcon { 247 | // fill: rgba(17, 24,39, 0.6) !important; 248 | height: 20px !important; 249 | margin-top: -2px; 250 | } 251 | 252 | .navbar-brand-logo { 253 | -webkit-filter: drop-shadow(3px 3px 3px #222); 254 | } 255 | 256 | .navbar #quarto-search { 257 | margin-left: -2px; 258 | } 259 | 260 | .navbar-container { 261 | max-width: 1280px; 262 | margin: 0 auto; 263 | } 264 | 265 | .content { 266 | width: 100%; 267 | } 268 | 269 | h1, h2, h3, h4, h5, h6 { 270 | margin-top: 3rem !important; 271 | 
text-transform: none; 272 | } 273 | 274 | .dropdown-header { 275 | margin-top: 1rem !important; 276 | } 277 | 278 | h1.title { 279 | font-weight: 800; 280 | font-size: 1.875rem; 281 | line-height: 2.25rem; 282 | } 283 | 284 | div.description { 285 | font-style: normal; 286 | font-size: .875rem; 287 | line-height: 1.25rem; 288 | } 289 | 290 | p { 291 | margin-bottom: 1.25rem; 292 | } 293 | 294 | /* menu */ 295 | .sidebar-menu-container > ul > li:first-child > .sidebar-item-container > a > span { 296 | font-weight: 600 !important; 297 | font-size: 0.875rem; 298 | color: var(--secondary); 299 | } 300 | 301 | div.sidebar-item-container { 302 | color: #323232; 303 | } 304 | 305 | .sidebar-divider.hi { 306 | color: rgb(0,0,0, 0.2); 307 | margin-top: 0.5rem; 308 | margin-bottom: 1rem; 309 | } 310 | 311 | #quarto-margin-sidebar { 312 | top: 63px !important; 313 | } 314 | 315 | .menu-text { 316 | font-weight: 400; 317 | } 318 | 319 | 320 | ul.sidebar-section { 321 | padding-left: 0; 322 | } 323 | 324 | .sidebar-link { 325 | line-height: 2.125rem; 326 | padding: 0 0.5rem; 327 | } 328 | 329 | .sidebar-menu-container { 330 | padding-right: 0 !important; 331 | } 332 | 333 | ul.sidebar-section .sidebar-link { 334 | padding-left: 1rem; 335 | width: 100%; 336 | } 337 | 338 | .sidebar-link.active { 339 | background: rgba(255, 112, 0, 0.1); 340 | border-radius: 0.25rem; 341 | } 342 | 343 | .sidebar-link.active span { 344 | font-weight: 600 !important; 345 | color: var(--secondary); 346 | } 347 | 348 | .callout { 349 | border-left: auto !important; 350 | border-radius: 1rem; 351 | padding: 0.75rem; 352 | } 353 | 354 | .callout-tip { 355 | background: rgba(63,182,24, 0.05); 356 | border: 1px solid rgba(63,182,24, 0.25) !important; 357 | } 358 | 359 | .callout-note { 360 | background: rgba(59 , 130, 246, 0.05); 361 | border: 1px solid rgba(59, 130, 246, 0.25) !important; 362 | } 363 | 364 | .callout-style-default > .callout-header { 365 | background: none !important; 366 | } 367 | 368 | 369 | 370 | .cell-output { 371 | margin-top: 1rem; 372 | } 373 | 374 | .cell-output pre { 375 | border-radius: 0.375rem; 376 | } 377 | 378 | .cell-output > div { 379 | overflow-x: scroll; 380 | } 381 | 382 | .code-copy-button { 383 | margin: 0.5rem; 384 | } 385 | 386 | 387 | 388 | .cell-output > div { 389 | border: 1px solid rgba(100, 116, 139, 0.2) !important; 390 | border-radius: 1rem; 391 | margin-bottom: 3rem; 392 | margin-top: 3rem; 393 | } 394 | 395 | table, .table { 396 | 397 | font-size: 0.875rem; 398 | margin-bottom: 0; 399 | max-width: 100%; 400 | overflow-x: scroll; 401 | display: block; 402 | } 403 | 404 | thead { 405 | background: rgba(12, 18, 26, 0.02); 406 | border-bottom-color: rgba(100, 116, 139, 0.2) !important; 407 | } 408 | 409 | thead tr:first-child { 410 | background-color: rgb(249, 250, 251, 0.7) !important; 411 | } 412 | 413 | thead tr:first-child th:first-child { 414 | border-radius: 1rem 0 0 0; 415 | } 416 | 417 | thead tr:first-child th:last-child { 418 | border-radius: 0 1rem 0 0; 419 | } 420 | 421 | th, td { 422 | padding: 0.5rem 1rem !important; 423 | white-space: nowrap !important; 424 | text-transform: none !important; 425 | } 426 | 427 | td a, td a code { 428 | white-space: nowrap !important; 429 | } 430 | 431 | tbody { 432 | border-color: transparent !important; 433 | border-top: none !important; 434 | } 435 | 436 | tbody tr:last-child td:first-child { 437 | border-radius: 0 0 0 1rem; 438 | } 439 | 440 | tr.even, tr.odd { 441 | line-height: 2rem; 442 | } 443 | 444 | tr:hover { 445 | 
background-color: rgba(17, 24, 39, 0.05); 446 | } 447 | 448 | td:first-child, td:last-child { 449 | padding: 0.25rem 1rem !important; 450 | } 451 | 452 | .dropdown-menu.show { 453 | background: white; 454 | border: none; 455 | border-radius: 0.5rem; 456 | box-shadow: 0 2px 4px rgba(0,0,0,0.1); 457 | padding-top: 0.5rem !important; 458 | padding-bottom: 0.25rem !important; 459 | } 460 | 461 | .dropdown-menu li { 462 | padding: 0.25rem 1rem !important; 463 | } 464 | 465 | .dropdown-menu li:hover { 466 | background-color: #e9ecef; 467 | } 468 | 469 | .js-plotly-plot .plotly { 470 | border: none !important; 471 | } 472 | 473 | .svg-container { 474 | border: none !important; 475 | } 476 | 477 | .svg-container > svg { 478 | border-radius: 2rem; 479 | } 480 | 481 | // .plotly-graph-div { 482 | // border-radius: 5rem; 483 | // } 484 | 485 | @media (max-width: 991.98px) { 486 | #quarto-sidebar-glass.show { 487 | z-index: 10001; 488 | } 489 | 490 | #quarto-sidebar { 491 | top: 0 !important; 492 | z-index: 10002 !important; 493 | } 494 | 495 | #quarto-sidebar .sidebar-menu-container { 496 | min-width: unset; 497 | width: calc(100% - 32px); 498 | } 499 | 500 | #quarto-sidebar.show { 501 | max-width: calc(100vw - 32px); 502 | width: 320px !important; 503 | } 504 | } 505 | -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-120x120.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/business-science/anomalize/f5d37063c83bb0b4b4256aed81dead489414b89c/pkgdown/favicon/apple-touch-icon-120x120.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-152x152.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/business-science/anomalize/f5d37063c83bb0b4b4256aed81dead489414b89c/pkgdown/favicon/apple-touch-icon-152x152.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-180x180.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/business-science/anomalize/f5d37063c83bb0b4b4256aed81dead489414b89c/pkgdown/favicon/apple-touch-icon-180x180.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-60x60.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/business-science/anomalize/f5d37063c83bb0b4b4256aed81dead489414b89c/pkgdown/favicon/apple-touch-icon-60x60.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon-76x76.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/business-science/anomalize/f5d37063c83bb0b4b4256aed81dead489414b89c/pkgdown/favicon/apple-touch-icon-76x76.png -------------------------------------------------------------------------------- /pkgdown/favicon/apple-touch-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/business-science/anomalize/f5d37063c83bb0b4b4256aed81dead489414b89c/pkgdown/favicon/apple-touch-icon.png -------------------------------------------------------------------------------- /pkgdown/favicon/favicon-16x16.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/business-science/anomalize/f5d37063c83bb0b4b4256aed81dead489414b89c/pkgdown/favicon/favicon-16x16.png -------------------------------------------------------------------------------- /pkgdown/favicon/favicon-32x32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/business-science/anomalize/f5d37063c83bb0b4b4256aed81dead489414b89c/pkgdown/favicon/favicon-32x32.png -------------------------------------------------------------------------------- /pkgdown/favicon/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/business-science/anomalize/f5d37063c83bb0b4b4256aed81dead489414b89c/pkgdown/favicon/favicon.ico -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | # This file is part of the standard setup for testthat. 2 | # It is recommended that you do not modify it. 3 | # 4 | # Where should you do additional test configuration? 5 | # Learn more about the roles of various files in: 6 | # * https://r-pkgs.org/testing-design.html#sec-tests-files-overview 7 | # * https://testthat.r-lib.org/articles/special-files.html 8 | 9 | library(testthat) 10 | library(anomalize) 11 | 12 | test_check("anomalize") 13 | -------------------------------------------------------------------------------- /tests/testthat/_snaps/anomalize.md: -------------------------------------------------------------------------------- 1 | # gesd can handle low variance data 2 | 3 | Code 4 | low_var %>% time_decompose(count, method = "twitter") %>% anomalize(remainder, 5 | method = "gesd") %>% expect_message("Converting") 6 | Message 7 | frequency = 7 days 8 | median_span = 2090 days 9 | 10 | -------------------------------------------------------------------------------- /tests/testthat/_snaps/plot_anomaly_decomposition.md: -------------------------------------------------------------------------------- 1 | # returns a ggplot 2 | 3 | Code 4 | g <- tidyverse_cran_downloads %>% dplyr::filter(package == "tidyquant") %>% 5 | dplyr::ungroup() %>% time_decompose(count, method = "stl") %>% anomalize( 6 | remainder, method = "iqr") 7 | Message 8 | frequency = 7 days 9 | trend = 91 days 10 | 11 | -------------------------------------------------------------------------------- /tests/testthat/_snaps/time_decompose.md: -------------------------------------------------------------------------------- 1 | # single tbl_df 2 | 3 | Code 4 | stl_tbl_time <- tidyverse_cran_downloads %>% dplyr::filter(package == 5 | "lubridate") %>% dplyr::ungroup() %>% dplyr::as_tibble() %>% time_decompose( 6 | count, method = "stl", frequency = "auto", trend = "auto") 7 | Message 8 | Converting from tbl_df to tbl_time. 
9 | Auto-index message: index = date 10 | frequency = 7 days 11 | trend = 91 days 12 | 13 | -------------------------------------------------------------------------------- /tests/testthat/_snaps/time_recompose.md: -------------------------------------------------------------------------------- 1 | # time_recompose works on tbl_time 2 | 3 | Code 4 | single_recomp <- tidyverse_cran_downloads %>% dplyr::filter(package == 5 | "tidyquant") %>% dplyr::ungroup() %>% time_decompose(count, method = "stl") %>% 6 | anomalize(remainder, method = "iqr") %>% time_recompose() 7 | Message 8 | frequency = 7 days 9 | trend = 91 days 10 | 11 | -------------------------------------------------------------------------------- /tests/testthat/test-anomalize.R: -------------------------------------------------------------------------------- 1 | # Setup 2 | tq_dloads <- tidyverse_cran_downloads %>% 3 | dplyr::ungroup() %>% 4 | dplyr::filter(package == "tidyquant") 5 | 6 | # Low-variance data 7 | low_var <- dplyr::tibble( 8 | time = Sys.Date(), 9 | count = c( 10 | 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 11 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 12 | 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 13 | 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 14 | 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 15 | 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 16 | 0, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 17 | 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 18 | 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 19 | 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 20 | 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 21 | 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 22 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 23 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 24 | 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 25 | 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 26 | 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 27 | 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 28 | 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 29 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 30 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 31 | 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 32 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 33 | 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 34 | 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 35 | 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 1, 0, 1, 0, 0, 36 | 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 1, 0, 37 | 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 38 | 0, 0, 1, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 39 | 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 40 | 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 41 | 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 42 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 43 | 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 44 | 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 45 | 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 46 | 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
1, 0, 1, 47 | 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 48 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 49 | 0, 0, 0, 0, 0, 0, 1, 0, 1, 3, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 50 | 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 51 | 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 1, 1, 0, 52 | 0, 0, 0, 0, 3, 0, 0, 1, 2, 2, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 53 | 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 54 | 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 1, 55 | 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 56 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 57 | 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 2, 1, 58 | 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 2, 1, 3, 0, 2, 0, 0, 0, 59 | 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 1, 2, 1, 0, 0, 0, 0, 1, 0, 0, 2, 60 | 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 61 | 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 62 | 0, 0, 2, 2, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 63 | 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 64 | 1, 1, 0, 0, 1, 0, 1, 0, 2, 0, 0, 1, 1, 0, 0, 2, 0, 0, 0, 0, 0, 65 | 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 66 | 0, 0, 1, 0, 0, 1, 3, 0, 1, 0, 0, 3, 0, 0, 0, 0, 2, 1, 0, 0, 1, 67 | 0, 0, 2, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 1, 68 | 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 2, 0, 0, 69 | 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 2, 70 | 1, 3, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 71 | 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 1, 72 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 2, 73 | 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 74 | 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 75 | 0, 0, 0, 0, 2, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 76 | 1, 2, 0, 1, 1, 2, 0, 0, 0, 0, 2, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 77 | 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 1, 78 | 3, 2, 2, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 79 | 0, 0, 0, 0, 2, 0, 0, 0, 1, 5, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 80 | 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 3, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 81 | 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0, 0, 82 | 2, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 83 | 1, 0, 2, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 84 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 85 | 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 2, 1, 3, 2, 0, 0, 0, 0, 0, 0, 86 | 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 87 | 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 88 | 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 89 | 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 90 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 91 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 92 | 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 93 | 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 94 | 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 95 | 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 96 | 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 97 | 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 98 | 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 99 | 1, 
2, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 100 | 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0, 1, 101 | 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 4, 0, 0, 0, 0, 0, 102 | 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 2, 0, 1, 103 | 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 2, 0, 0, 104 | 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 105 | 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 106 | 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 107 | 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 1, 0, 1, 0, 0, 108 | 0, 2, 1, 1, 0, 0, 1, 2, 0, 2, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 109 | 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0 110 | ) 111 | ) %>% 112 | dplyr::mutate(time = time + dplyr::row_number()) 113 | 114 | # Tests 115 | 116 | test_that("iqr_tbl_df works", { 117 | 118 | iqr_tbl_df <- tq_dloads %>% 119 | anomalize(count, method = "iqr") 120 | 121 | expect_equal(nrow(iqr_tbl_df), 425) 122 | expect_equal(ncol(iqr_tbl_df), 6) 123 | 124 | }) 125 | 126 | test_that("gesd_tbl_df works", { 127 | 128 | gesd_tbl_df <- tq_dloads %>% 129 | anomalize(count, method = "gesd") 130 | 131 | expect_equal(nrow(gesd_tbl_df), 425) 132 | expect_equal(ncol(gesd_tbl_df), 6) 133 | 134 | }) 135 | 136 | test_that("gesd can handle low variance data", { 137 | 138 | low_var %>% 139 | anomalize(count, method = "gesd") %>% 140 | expect_no_error() 141 | # Capture messages in snapshots 142 | low_var %>% 143 | time_decompose(count, method = "stl") %>% 144 | anomalize(remainder, method = "gesd") %>% 145 | expect_message("Converting") %>% 146 | expect_message("frequency") %>% 147 | expect_message("trend") 148 | expect_snapshot({ 149 | 150 | 151 | low_var %>% 152 | time_decompose(count, method = "twitter") %>% 153 | anomalize(remainder, method = "gesd") %>% 154 | expect_message("Converting") 155 | }) 156 | 157 | 158 | 159 | }) 160 | 161 | test_that("iqr_grouped_df works", { 162 | 163 | iqr_grouped_df <- tidyverse_cran_downloads %>% 164 | dplyr::ungroup() %>% 165 | dplyr::filter(package %in% c("tidyquant", "tidytext")) %>% 166 | dplyr::group_by(package) %>% 167 | anomalize(count, method = "iqr") 168 | 169 | expect_equal(nrow(iqr_grouped_df), 850) 170 | expect_equal(ncol(iqr_grouped_df), 6) 171 | 172 | }) 173 | 174 | test_that("gesd_grouped_df works", { 175 | 176 | gesd_grouped_df <- tidyverse_cran_downloads %>% 177 | dplyr::ungroup() %>% 178 | dplyr::filter(package %in% c("tidyquant", "tidytext")) %>% 179 | dplyr::group_by(package) %>% 180 | anomalize(count, method = "gesd") 181 | 182 | expect_equal(nrow(gesd_grouped_df), 850) 183 | expect_equal(ncol(gesd_grouped_df), 6) 184 | 185 | }) 186 | -------------------------------------------------------------------------------- /tests/testthat/test-clean_anomalies.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | data_stl <- tidyverse_cran_downloads %>% 4 | time_decompose(count, method = "stl") %>% 5 | anomalize(remainder, method = "iqr") 6 | 7 | data_twitter <- tidyverse_cran_downloads %>% 8 | time_decompose(count, method = "twitter") %>% 9 | anomalize(remainder, method = "iqr") 10 | 11 | 12 | test_that("bad data returns error", { 13 | 14 | expect_error(clean_anomalies(2)) 15 | 16 | }) 17 | 18 | test_that("Clean Anomalies from STL Method", { 19 | expect_match(names(clean_anomalies(data_stl)), "observed_cleaned", all = FALSE) 20 | }) 21 | 22 | test_that("Clean Anomalies from Twitter Method", { 23 | expect_match(names(clean_anomalies(data_twitter)), 
"observed_cleaned", all = FALSE) 24 | }) 25 | -------------------------------------------------------------------------------- /tests/testthat/test-plot_anomalies.R: -------------------------------------------------------------------------------- 1 | test_that("errors on incorrect input", { 2 | expect_error(plot_anomalies(3)) 3 | }) 4 | 5 | test_that("returns a ggplot", { 6 | g <- tidyverse_cran_downloads %>% 7 | time_decompose(count, method = "stl") %>% 8 | anomalize(remainder, method = "iqr") %>% 9 | time_recompose() %>% 10 | plot_anomalies(time_recomposed = TRUE, ncol = 3) 11 | expect_s3_class(g, "ggplot") 12 | }) 13 | -------------------------------------------------------------------------------- /tests/testthat/test-plot_anomaly_decomposition.R: -------------------------------------------------------------------------------- 1 | test_that("errors on incorrect input", { 2 | expect_error(plot_anomaly_decomposition(3)) 3 | }) 4 | 5 | test_that("returns a ggplot", { 6 | expect_snapshot( 7 | g <- tidyverse_cran_downloads %>% 8 | dplyr::filter(package == "tidyquant") %>% 9 | dplyr::ungroup() %>% 10 | time_decompose(count, method = "stl") %>% 11 | anomalize(remainder, method = "iqr") 12 | ) 13 | 14 | expect_s3_class(plot_anomaly_decomposition(g), "ggplot") 15 | }) 16 | -------------------------------------------------------------------------------- /tests/testthat/test-prep_tbl_time.R: -------------------------------------------------------------------------------- 1 | test_that("prep_tbl_time errors on incorrect input", { 2 | expect_error(prep_tbl_time(1)) 3 | expect_error(prep_tbl_time(dplyr::tibble(x = stats::rnorm(100)))) 4 | }) 5 | 6 | test_that("converts tibble to tbl_time", { 7 | data_tbl <- dplyr::tibble( 8 | date = seq.Date(from = as.Date("2018-01-01"), by = "day", length.out = 10), 9 | value = rnorm(10) 10 | ) 11 | 12 | expect_s3_class(prep_tbl_time(data_tbl), class = "tbl_time") 13 | expect_message(prep_tbl_time(data_tbl, message = T)) 14 | }) 15 | 16 | test_that("tbl_time returns tbl_time", { 17 | data_tbl <- dplyr::tibble( 18 | date = seq.Date(from = as.Date("2018-01-01"), by = "day", length.out = 10), 19 | value = rnorm(10) 20 | ) %>% 21 | tibbletime::as_tbl_time(date) 22 | 23 | expect_s3_class(prep_tbl_time(data_tbl), class = "tbl_time") 24 | 25 | }) 26 | -------------------------------------------------------------------------------- /tests/testthat/test-time_apply.R: -------------------------------------------------------------------------------- 1 | test_that("errors on incorrect input", { 2 | expect_error(time_apply(2)) 3 | expect_error(tidyverse_cran_downloads %>% time_apply()) 4 | }) 5 | 6 | 7 | test_that("grouped_tbl_time works", { 8 | grouped_tbl_time_mean <- tidyverse_cran_downloads %>% 9 | time_apply(count, period = "1 week", .fun = mean, na.rm = TRUE) 10 | expect_equal(ncol(grouped_tbl_time_mean), 4) 11 | }) 12 | 13 | test_that("tbl_time works", { 14 | grouped_tbl_time_mean <- tidyverse_cran_downloads %>% 15 | dplyr::filter(package == "tidyquant") %>% 16 | dplyr::ungroup() %>% 17 | time_apply(count, period = "1 week", .fun = mean, na.rm = TRUE) 18 | expect_equal(ncol(grouped_tbl_time_mean), 4) 19 | }) 20 | 21 | -------------------------------------------------------------------------------- /tests/testthat/test-time_decompose.R: -------------------------------------------------------------------------------- 1 | test_that("Incorrect data type errors", { 2 | expect_error(time_decompose(5)) 3 | }) 4 | 5 | test_that("No target errors", { 6 | 
expect_error(time_decompose(tidyverse_cran_downloads)) 7 | expect_error(time_decompose(dplyr::ungroup(tidyverse_cran_downloads))) 8 | }) 9 | 10 | test_that("single tbl_df", { 11 | # Capture output 12 | expect_snapshot( 13 | stl_tbl_time <- tidyverse_cran_downloads %>% 14 | dplyr::filter(package == "lubridate") %>% 15 | dplyr::ungroup() %>% 16 | dplyr::as_tibble() %>% 17 | time_decompose(count, method = "stl", frequency = "auto", trend = "auto") 18 | ) 19 | expect_equal(ncol(stl_tbl_time), 5) 20 | expect_equal(nrow(stl_tbl_time), 425) 21 | 22 | }) 23 | 24 | test_that("grouped tbl_df", { 25 | stl_tbl_time <- tidyverse_cran_downloads %>% 26 | dplyr::as_tibble() %>% 27 | dplyr::group_by(package) %>% 28 | time_decompose(count, method = "stl", frequency = "auto", trend = "auto") 29 | 30 | expect_equal(ncol(stl_tbl_time), 6) 31 | expect_equal(nrow(stl_tbl_time), 6375) 32 | 33 | }) 34 | 35 | test_that("method = stl, auto freq/trend", { 36 | stl_tbl_time <- tidyverse_cran_downloads %>% 37 | time_decompose(count, method = "stl", frequency = "auto", trend = "auto") 38 | 39 | expect_equal(ncol(stl_tbl_time), 6) 40 | expect_equal(nrow(stl_tbl_time), 6375) 41 | expect_equal(dplyr::n_groups(stl_tbl_time), 15) 42 | 43 | }) 44 | 45 | test_that("method = stl, character freq/trend", { 46 | stl_tbl_time <- tidyverse_cran_downloads %>% 47 | time_decompose(count, method = "stl", frequency = "1 month", trend = "3 months") 48 | 49 | expect_equal(ncol(stl_tbl_time), 6) 50 | expect_equal(nrow(stl_tbl_time), 6375) 51 | expect_equal(dplyr::n_groups(stl_tbl_time), 15) 52 | 53 | }) 54 | 55 | test_that("method = stl, numeric freq/trend", { 56 | stl_tbl_time <- tidyverse_cran_downloads %>% 57 | time_decompose(count, method = "stl", frequency = 7, trend = 30) 58 | 59 | expect_equal(ncol(stl_tbl_time), 6) 60 | expect_equal(nrow(stl_tbl_time), 6375) 61 | expect_equal(dplyr::n_groups(stl_tbl_time), 15) 62 | 63 | }) 64 | 65 | test_that("method = twitter, auto freq/trend", { 66 | twitter_tbl_time <- tidyverse_cran_downloads %>% 67 | time_decompose(count, method = "twitter", frequency = "auto", trend = "auto") 68 | 69 | expect_equal(ncol(twitter_tbl_time), 6) 70 | expect_equal(nrow(twitter_tbl_time), 6375) 71 | expect_equal(dplyr::n_groups(twitter_tbl_time), 15) 72 | 73 | }) 74 | 75 | test_that("method = twitter, character freq/trend", { 76 | twitter_tbl_time <- tidyverse_cran_downloads %>% 77 | time_decompose(count, method = "twitter", frequency = "1 week", trend = "1 month") 78 | 79 | expect_equal(ncol(twitter_tbl_time), 6) 80 | expect_equal(nrow(twitter_tbl_time), 6375) 81 | expect_equal(dplyr::n_groups(twitter_tbl_time), 15) 82 | 83 | }) 84 | 85 | test_that("method = twitter, numeric freq/trend", { 86 | twitter_tbl_time <- tidyverse_cran_downloads %>% 87 | time_decompose(count, method = "twitter", frequency = 7, trend = 90) 88 | 89 | expect_equal(ncol(twitter_tbl_time), 6) 90 | expect_equal(nrow(twitter_tbl_time), 6375) 91 | expect_equal(dplyr::n_groups(twitter_tbl_time), 15) 92 | 93 | }) 94 | 95 | # test_that("method = multiplicative, auto freq/trend", { 96 | # mult_tbl_time <- tidyverse_cran_downloads %>% 97 | # time_decompose(count, method = "multiplicative", frequency = "auto", trend = "auto") 98 | # 99 | # expect_equal(ncol(mult_tbl_time), 6) 100 | # expect_equal(nrow(mult_tbl_time), 6375) 101 | # expect_equal(dplyr::n_groups(mult_tbl_time), 15) 102 | # 103 | # }) 104 | # 105 | # test_that("method = multiplicative, character freq/trend", { 106 | # mult_tbl_time <- tidyverse_cran_downloads %>% 107 | # 
time_decompose(count, method = "multiplicative", frequency = "1 week", trend = "1 month") 108 | # 109 | # expect_equal(ncol(mult_tbl_time), 6) 110 | # expect_equal(nrow(mult_tbl_time), 6375) 111 | # expect_equal(dplyr::n_groups(mult_tbl_time), 15) 112 | # 113 | # }) 114 | # 115 | # test_that("method = multiplicative, numeric freq/trend", { 116 | # mult_tbl_time <- tidyverse_cran_downloads %>% 117 | # time_decompose(count, method = "multiplicative", frequency = 7, trend = 90) 118 | # 119 | # expect_equal(ncol(mult_tbl_time), 6) 120 | # expect_equal(nrow(mult_tbl_time), 6375) 121 | # expect_equal(dplyr::n_groups(mult_tbl_time), 15) 122 | # 123 | # }) 124 | 125 | test_that("grouped_df works", { 126 | grouped_data <- tidyverse_cran_downloads %>% 127 | dplyr::as_tibble() %>% 128 | dplyr::group_by(package) %>% 129 | time_decompose(count) 130 | 131 | expect_equal(ncol(grouped_data), 6) 132 | expect_equal(nrow(grouped_data), 6375) 133 | expect_equal(dplyr::n_groups(grouped_data), 15) 134 | 135 | }) 136 | -------------------------------------------------------------------------------- /tests/testthat/test-time_frequency.R: -------------------------------------------------------------------------------- 1 | # Setup 2 | 3 | tq_dloads <- tidyverse_cran_downloads %>% 4 | dplyr::ungroup() %>% 5 | dplyr::filter(package == "tidyquant") 6 | 7 | tq_dloads_small <- tq_dloads %>% 8 | dplyr::slice_head(n = 60) 9 | 10 | # Tests 11 | 12 | test_that("time_frequency fails with incorrect input", { 13 | expect_error(time_frequency(5)) 14 | expect_error(time_frequency(tidyverse_cran_downloads)) 15 | }) 16 | 17 | test_that("time_trend fails with incorrect input", { 18 | expect_error(time_trend(5)) 19 | expect_error(time_trend(tidyverse_cran_downloads)) 20 | }) 21 | 22 | test_that("time_frequency works: period = 'auto'", { 23 | 24 | expect_message(freq <- time_frequency(tq_dloads)) 25 | 26 | expect_equal(freq, 7) 27 | 28 | }) 29 | 30 | test_that("time_frequency works: period = '1 month'", { 31 | 32 | expect_message(freq <- time_frequency(tq_dloads, period = "1 month")) 33 | 34 | expect_equal(freq, 31) 35 | 36 | }) 37 | 38 | test_that("time_frequency works: period = 5", { 39 | 40 | expect_message(freq <- time_frequency(tq_dloads, period = 5)) 41 | 42 | expect_equal(freq, 5) 43 | 44 | }) 45 | 46 | 47 | 48 | test_that("time_trend works: period = 'auto'", { 49 | 50 | expect_message(trend <- time_trend(tq_dloads)) 51 | 52 | expect_equal(trend, 91) 53 | 54 | }) 55 | 56 | test_that("time_trend works: period = '90 days'", { 57 | 58 | expect_message(trend <- time_trend(tq_dloads, period = "30 days")) 59 | 60 | expect_equal(trend, 30) 61 | 62 | }) 63 | 64 | test_that("time_trend works: period = 90", { 65 | 66 | expect_message(trend <- time_trend(tq_dloads, period = 90)) 67 | 68 | expect_equal(trend, 90) 69 | 70 | }) 71 | 72 | test_that("time_trend works with small data: period = 'auto'", { 73 | 74 | expect_message(trend <- time_trend(tq_dloads_small)) 75 | 76 | expect_equal(trend, 28) 77 | 78 | }) 79 | 80 | -------------------------------------------------------------------------------- /tests/testthat/test-time_recompose.R: -------------------------------------------------------------------------------- 1 | test_that("errors on incorrect input", { 2 | expect_error(time_recompose(5)) 3 | }) 4 | 5 | test_that("time_recompose works on grouped_tbl_time", { 6 | grouped_recomp <- tidyverse_cran_downloads %>% 7 | time_decompose(count, method = "stl") %>% 8 | anomalize(remainder, method = "iqr") %>% 9 | time_recompose() 10 | 
expect_contains(names(grouped_recomp), "recomposed_l2") 11 | }) 12 | 13 | test_that("time_recompose works on tbl_time", { 14 | expect_snapshot( 15 | single_recomp <- tidyverse_cran_downloads %>% 16 | dplyr::filter(package == "tidyquant") %>% 17 | dplyr::ungroup() %>% 18 | time_decompose(count, method = "stl") %>% 19 | anomalize(remainder, method = "iqr") %>% 20 | time_recompose() 21 | ) 22 | expect_contains(names(single_recomp), "recomposed_l2") 23 | }) 24 | 25 | -------------------------------------------------------------------------------- /tests/testthat/test-utils.R: -------------------------------------------------------------------------------- 1 | test_that("utils: time_decompose `merge = TRUE` works", { 2 | merged_decomposition <- tidyverse_cran_downloads %>% 3 | time_decompose(count, merge = TRUE) 4 | expect_equal(ncol(merged_decomposition), 7) 5 | }) 6 | -------------------------------------------------------------------------------- /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /vignettes/anomalize_methods.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Anomalize Methods" 3 | author: "Business Science" 4 | date: "`r Sys.Date()`" 5 | output: 6 | rmarkdown::html_vignette: 7 | toc: TRUE 8 | vignette: > 9 | %\VignetteIndexEntry{Anomalize Methods} 10 | %\VignetteEngine{knitr::rmarkdown} 11 | %\VignetteEncoding{UTF-8} 12 | --- 13 | 14 | ```{r setup, include = FALSE} 15 | knitr::opts_chunk$set( 16 | collapse = TRUE, 17 | comment = "#>", 18 | warning = F, 19 | fig.align = "center" 20 | ) 21 | 22 | library(anomalize) 23 | # load necessary tidyverse packages for analysis 24 | library(dplyr) 25 | library(ggplot2) 26 | 27 | # NOTE: timetk now has anomaly detection built in, which 28 | # will get the new functionality going forward. 29 | 30 | anomalize <- anomalize::anomalize 31 | plot_anomalies <- anomalize::plot_anomalies 32 | ``` 33 | 34 | Anomaly detection is critical to many disciplines, but possibly nowhere more important than in __time series analysis__. A time series is a sequential set of values tracked over a time duration. The definition we use for an __anomaly__ is simple: an anomaly is something that happens that (1) was unexpected or (2) was caused by an abnormal event. Therefore, the problem we intend to solve with `anomalize` is providing methods to accurately detect these "anomalous" events. 35 | 36 | The methods that `anomalize` uses can be separated into two main tasks: 37 | 38 | 1. Generating Time Series Analysis Remainders 39 | 2. Detecting Anomalies in the Remainders 40 | 41 | ## 1. Generating Time Series Analysis Remainders 42 | 43 | Anomaly detection is performed on __remainders__ from a time series analysis, from which both of the following have been removed: 44 | 45 | * __Seasonal Components__: Cyclic patterns, usually occurring on a daily cycle for minute or hour data or on a weekly cycle for daily data 46 | * __Trend Components__: Longer-term growth that happens over many observations. 47 | 48 | Therefore, the first objective is to generate remainders from a time series. Some analysis techniques are better suited to this task than others, and they are probably not the ones you would expect. 49 | 50 | There are many ways that a time series can be deconstructed to produce residuals. We have tried many, including ARIMA, Machine Learning (Regression), Seasonal Decomposition, and so on.
For anomaly detection, we have seen the best performance using __seasonal decomposition__. Most high-performance machine learning techniques perform poorly for anomaly detection because of _overfitting_, which downplays the difference between the actual value and the fitted value. This defeats the objective of anomaly detection, wherein we need to highlight the anomaly. Seasonal decomposition does very well for this task, removing the right features (i.e. seasonal and trend components) while preserving the characteristics of anomalies in the residuals. 51 | 52 | The `anomalize` package implements two techniques for seasonal decomposition: 53 | 54 | 1. __STL__: Seasonal Decomposition of Time Series by Loess 55 | 2. __Twitter__: Seasonal Decomposition of Time Series by Median 56 | 57 | Each method has pros and cons. 58 | 59 | ### 1.A. STL 60 | 61 | The STL method uses the `stl()` function from the `stats` package. STL works very well in circumstances where a long-term trend is present. The Loess algorithm typically does a very good job at detecting the trend. However, in circumstances where the seasonal component is more dominant than the trend, the Twitter method tends to perform better. 62 | 63 | ### 1.B. Twitter 64 | 65 | The Twitter method is a decomposition method similar to that used in Twitter's `AnomalyDetection` package. The Twitter method works identically to STL for removing the seasonal component. The main difference is in removing the trend, which is performed by removing the median of the data rather than fitting a smoother. The median works well when a long-term trend is less dominant than the short-term seasonal component. This is because the smoother tends to overfit the anomalies. 66 | 67 | ### 1.C. Comparison of STL and Twitter Decomposition Methods 68 | 69 | Load two libraries to perform the comparison. 70 | 71 | ```r 72 | library(tidyverse) 73 | library(anomalize) 74 | 75 | # NOTE: timetk now has anomaly detection built in, which 76 | # will get the new functionality going forward. 77 | 78 | anomalize <- anomalize::anomalize 79 | plot_anomalies <- anomalize::plot_anomalies 80 | ``` 81 | 82 | 83 | Collect data on the daily downloads of the `lubridate` package. This comes from the `tidyverse_cran_downloads` data set that is part of the `anomalize` package. 84 | 85 | ```{r} 86 | # Data on `lubridate` package daily downloads 87 | lubridate_download_history <- tidyverse_cran_downloads %>% 88 | filter(package == "lubridate") %>% 89 | ungroup() 90 | 91 | # Output first 10 observations 92 | lubridate_download_history %>% 93 | head(10) %>% 94 | knitr::kable() 95 | ``` 96 | 97 | We can visualize the differences between the two decomposition methods. 98 | 99 | 100 | ```{r, fig.show='hold', fig.height=7, fig.align='default'} 101 | # STL Decomposition Method 102 | p1 <- lubridate_download_history %>% 103 | time_decompose(count, 104 | method = "stl", 105 | frequency = "1 week", 106 | trend = "3 months") %>% 107 | anomalize(remainder) %>% 108 | plot_anomaly_decomposition() + 109 | ggtitle("STL Decomposition") 110 | 111 | # Twitter Decomposition Method 112 | p2 <- lubridate_download_history %>% 113 | time_decompose(count, 114 | method = "twitter", 115 | frequency = "1 week", 116 | trend = "3 months") %>% 117 | anomalize(remainder) %>% 118 | plot_anomaly_decomposition() + 119 | ggtitle("Twitter Decomposition") 120 | 121 | # Show plots 122 | p1 123 | p2 124 | ``` 125 | 126 | 127 | We can see that the seasonal components for both the STL and Twitter decompositions are exactly the same.
The difference is the trend component: 128 | 129 | * STL: The STL trend follows a Loess smoother with a trend window of 91 days (as defined by `trend = "3 months"`). The remainder of the decomposition is centered. 130 | 131 | * Twitter: The Twitter trend is a series of medians that are removed. The median spans are selected so that each span contains a roughly equal number of observations. Because of this, the trend span is 85 days, which is slightly less than the 91 days (or 3 months). 132 | 133 | ### 1.D. Transformations 134 | 135 | In certain circumstances, such as multiplicative trends, the residuals (remainders) are heteroskedastic, meaning the variance changes as the time series progresses (e.g. the remainders fan out). In these cases it becomes difficult to detect anomalies, especially in the low-variance regions. Logarithmic or power transformations can help in these situations. This is beyond the scope of the methods and is not implemented in the current version of `anomalize`. However, these transformations can be performed on the incoming target, and the output can be inverse-transformed. 136 | 137 | 138 | ## 2. Detecting Anomalies in the Remainders 139 | 140 | Once a time series analysis is completed and the remainder has the desired characteristics, the remainders can be analyzed. The challenge is that anomalies are high leverage points that distort the distribution. The `anomalize` package implements two methods that are resistant to these high leverage points: 141 | 142 | 1. __IQR__: Interquartile Range 143 | 2. __GESD__: Generalized Extreme Studentized Deviate Test 144 | 145 | Both methods have pros and cons. 146 | 147 | 148 | ### 2.A. IQR 149 | 150 | The IQR method is similar to the method used in the `forecast` package for anomaly removal within the `tsoutliers()` function. It takes a distribution and uses the interquartile range (between the 25% and 75% quartiles) to establish the distribution of the remainder. Limits are set by default to a factor of 3X above and below the interquartile range, and any remainders beyond the limits are considered anomalies. 151 | 152 | The `alpha` parameter adjusts the 3X factor. By default, `alpha = 0.05` for consistency with the GESD method. An `alpha = 0.025` results in a 6X factor, expanding the limits and making it more difficult for data to be an anomaly. Conversely, an `alpha = 0.10` contracts the limits to a factor of 1.5X, making it easier for data to be an anomaly. 153 | 154 | The IQR method does not depend on any loops and is therefore faster and more easily scaled than the GESD method. However, it may not be as accurate in detecting anomalies since the high leverage anomalies can skew the centerline (median) of the IQR. 155 | 156 | ### 2.B. GESD 157 | 158 | The GESD method is used in Twitter's `AnomalyDetection` package. It involves an iterative evaluation of the Generalized Extreme Studentized Deviate test, which progressively evaluates anomalies, removing the worst offenders and recalculating the test statistic and critical value. The critical values progressively contract as more high leverage points are removed. 159 | 160 | The `alpha` parameter adjusts the width of the critical values. By default, `alpha = 0.05`. 161 | 162 | The GESD method is iterative, and therefore more expensive than the IQR method. The main benefit is that GESD is less susceptible to high leverage points since the distribution of the data is progressively re-analyzed as anomalies are removed. A minimal sketch of this iteration follows.
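To make the iterative tightening concrete, here is a minimal sketch of the classic GESD (Rosner) procedure described above. This is an illustration only: `gesd_sketch()` is a hypothetical name, not the package's `gesd()` function, which additionally handles ranking and produces the outlier report used later in this vignette.

```r
# Sketch of the classic GESD (Rosner) test. Illustrative only, not
# anomalize's internal implementation.
gesd_sketch <- function(x, alpha = 0.05, max_anoms = 0.2) {
  n <- length(x)
  r <- floor(max_anoms * n)  # maximum number of candidate anomalies
  idx <- seq_len(n)
  candidates <- integer(0)
  n_detected <- 0

  for (i in seq_len(r)) {
    # Test statistic: most extreme standardized deviation remaining
    z <- abs(x - mean(x)) / sd(x)
    worst <- which.max(z)

    # Critical value (lambda), recomputed on the shrinking series
    m <- length(x)
    t_crit <- qt(1 - alpha / (2 * m), df = m - 2)
    lambda <- (m - 1) * t_crit / sqrt((m - 2 + t_crit^2) * m)

    # GESD flags the largest i for which the statistic exceeds lambda
    if (z[worst] > lambda) n_detected <- i
    candidates <- c(candidates, idx[worst])

    # Remove the worst offender and repeat
    idx <- idx[-worst]
    x <- x[-worst]
  }

  candidates[seq_len(n_detected)]  # indices of detected anomalies
}
```

The key point visible in the sketch is that both the test statistic and the critical value are recomputed on the shrinking series, which is why the GESD limits tighten as high leverage points are removed.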
163 | 164 | ### 2.C. Comparison of IQR and GESD Methods 165 | 166 | We can generate anomalous data to illustrate how the two methods compare to each other. 167 | 168 | ```{r, fig.height=3, fig.width=5} 169 | # Generate anomalies 170 | set.seed(100) 171 | x <- rnorm(100) 172 | idx_outliers <- sample(100, size = 5) 173 | x[idx_outliers] <- x[idx_outliers] + 10 174 | 175 | # Visualize simulated anomalies 176 | qplot(1:length(x), x, 177 | main = "Simulated Anomalies", 178 | xlab = "Index") 179 | ``` 180 | 181 | Two functions, `iqr()` and `gesd()`, power `anomalize()`. We can use these intermediate functions to illustrate the anomaly detection characteristics. 182 | 183 | ```{r, fig.show="hold", fig.width=5} 184 | # Analyze outliers: Outlier Report is available with verbose = TRUE 185 | iqr_outliers <- iqr(x, alpha = 0.05, max_anoms = 0.2, verbose = TRUE)$outlier_report 186 | 187 | gesd_outliers <- gesd(x, alpha = 0.05, max_anoms = 0.2, verbose = TRUE)$outlier_report 188 | 189 | # plotting function for anomaly plots 190 | ggsetup <- function(data) { 191 | data %>% 192 | ggplot(aes(rank, value, color = outlier)) + 193 | geom_point() + 194 | geom_line(aes(y = limit_upper), color = "red", linetype = 2) + 195 | geom_line(aes(y = limit_lower), color = "red", linetype = 2) + 196 | geom_text(aes(label = index), vjust = -1.25) + 197 | theme_bw() + 198 | scale_color_manual(values = c("No" = "#2c3e50", "Yes" = "#e31a1c")) + 199 | expand_limits(y = 13) + 200 | theme(legend.position = "bottom") 201 | } 202 | 203 | 204 | # Visualize 205 | p3 <- iqr_outliers %>% 206 | ggsetup() + 207 | ggtitle("IQR: Top outliers sorted by rank") 208 | 209 | p4 <- gesd_outliers %>% 210 | ggsetup() + 211 | ggtitle("GESD: Top outliers sorted by rank") 212 | 213 | # Show plots 214 | p3 215 | p4 216 | ``` 217 | 218 | 219 | We can see that the IQR limits don't vary, whereas the GESD limits get more stringent as anomalies are removed from the data. As a result, the GESD method tends to be more accurate in detecting anomalies at the expense of incurring more processing time for the looped anomaly removal. This expense is most noticeable with larger data sets (many observations or many time series). 220 | 221 | ## 3. Conclusion 222 | 223 | The `anomalize` package implements several useful and accurate techniques for anomaly detection. The user should now have a better understanding of how the algorithms work along with the strengths and weaknesses of each method. 224 | 225 | ## 4. References 226 | 227 | 228 | 1. [How to correct outliers once detected for time series data forecasting? Cross Validated, https://stats.stackexchange.com](https://stats.stackexchange.com/questions/69874/how-to-correct-outliers-once-detected-for-time-series-data-forecasting) 229 | 230 | 2. [Cross Validated: Simple algorithm for online outlier detection of a generic time series. Cross Validated, https://stats.stackexchange.com](https://stats.stackexchange.com/questions/1142/simple-algorithm-for-online-outlier-detection-of-a-generic-time-series?) 231 | 232 | 3. [Owen S. Vallis, Jordan Hochenbaum and Arun Kejariwal (2014). A Novel Technique for Long-Term Anomaly Detection in the Cloud. Twitter Inc.](https://www.usenix.org/system/files/conference/hotcloud14/hotcloud14-vallis.pdf) 233 | 234 | 4. [Owen S. Vallis, Jordan Hochenbaum and Arun Kejariwal (2014). AnomalyDetection: Anomaly Detection Using Seasonal Hybrid Extreme Studentized Deviate Test. R package version 1.0.](https://github.com/twitter/AnomalyDetection) 235 | 236 | 5. Alex T.C.
Lau (November/December 2015). GESD - A Robust and Effective Technique for Dealing with Multiple Outliers. ASTM Standardization News. www.astm.org/sn 237 | 238 | 239 | # Interested in Learning Anomaly Detection? 240 | 241 | Business Science offers two 1-hour courses on Anomaly Detection: 242 | 243 | - [Learning Lab 18](https://university.business-science.io/p/learning-labs-pro) - Time Series Anomaly Detection with `anomalize` 244 | 245 | - [Learning Lab 17](https://university.business-science.io/p/learning-labs-pro) - Anomaly Detection with `H2O` Machine Learning 246 | 247 | -------------------------------------------------------------------------------- /vignettes/anomalize_quick_start_guide.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Anomalize Quick Start Guide" 3 | author: "Business Science" 4 | date: "`r Sys.Date()`" 5 | output: 6 | rmarkdown::html_vignette: 7 | toc: TRUE 8 | toc_depth: 2 9 | vignette: > 10 | %\VignetteIndexEntry{Anomalize Quick Start Guide} 11 | %\VignetteEngine{knitr::rmarkdown} 12 | %\VignetteEncoding{UTF-8} 13 | --- 14 | 15 | 16 | ```{r setup, include = FALSE} 17 | knitr::opts_chunk$set( 18 | collapse = TRUE, 19 | comment = "#>", 20 | warning = F, 21 | fig.align = "center" 22 | ) 23 | 24 | library(tibbletime) 25 | library(dplyr) 26 | library(ggplot2) 27 | library(anomalize) 28 | # NOTE: timetk now has anomaly detection built in, which 29 | # will get the new functionality going forward. 30 | 31 | anomalize <- anomalize::anomalize 32 | plot_anomalies <- anomalize::plot_anomalies 33 | ``` 34 | 35 | The `anomalize` package is a feature-rich package for performing anomaly detection. It's geared towards time series analysis, which is one of the biggest needs for understanding when anomalies occur. We have a quick start section called "5-Minutes to Anomalize" for those looking to jump right in. We also have a detailed section on parameter adjustment for those looking to understand what knobs they can turn. Finally, for those really looking to get under the hood, we have another vignette called "Anomalize Methods" that gets into a deep discussion of the STL, Twitter, IQR, and GESD methods that power `anomalize`. 36 | 37 | ## Anomalize Intro on YouTube 38 | 39 | As a first step, you may wish to watch our `anomalize` introduction video on YouTube. 40 | 41 | Anomalize 43 | 44 | Check out our entire [Software Intro Series](https://www.youtube.com/watch?v=Gk_HwjhlQJs&list=PLo32uKohmrXsYNhpdwr15W143rX6uMAze) on YouTube! 45 | 46 | ## 5-Minutes To Anomalize 47 | 48 | Load libraries. 49 | 50 | ```r 51 | library(tidyverse) 52 | library(tibbletime) 53 | library(anomalize) 54 | 55 | # NOTE: timetk now has anomaly detection built in, which 56 | # will get the new functionality going forward. 57 | 58 | anomalize <- anomalize::anomalize 59 | plot_anomalies <- anomalize::plot_anomalies 60 | ``` 61 | 62 | Get some data. We'll use the `tidyverse_cran_downloads` data set that comes with `anomalize`. A few points: 63 | 64 | * It's a `tibbletime` object (class `tbl_time`), which is the object structure that `anomalize` works with because it's time aware! Tibbles (class `tbl_df`) will automatically be converted (see the short sketch after this list). 65 | 66 | * It contains daily download counts on 15 "tidy" packages spanning 2017-01-01 to 2018-03-01. The 15 packages are already grouped for your convenience. 67 | 68 | * It's all set up and ready to analyze with `anomalize`!
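If you are bringing your own data, the conversion can also be made explicit. Below is a small illustrative sketch (the tibble `my_data` is hypothetical); `anomalize` would otherwise perform this conversion for you automatically.

```r
library(tibbletime)

# Hypothetical example data: any tibble with a date column works
my_data <- dplyr::tibble(
    date  = seq.Date(from = as.Date("2018-01-01"), by = "day", length.out = 100),
    value = rnorm(100)
)

# Convert to a time-aware tbl_time with `date` as the index
my_data_time <- my_data %>% as_tbl_time(date)
```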
69 | 70 | ```{r} 71 | tidyverse_cran_downloads 72 | ``` 73 | 74 | We can use the general workflow for anomaly detection, which involves three main functions: 75 | 76 | 1. `time_decompose()`: Separates the time series into seasonal, trend, and remainder components. 77 | 2. `anomalize()`: Applies anomaly detection methods to the remainder component. 78 | 3. `time_recompose()`: Calculates limits that separate the "normal" data from the anomalies! 79 | 80 | ```{r} 81 | tidyverse_cran_downloads_anomalized <- tidyverse_cran_downloads %>% 82 | time_decompose(count, merge = TRUE) %>% 83 | anomalize(remainder) %>% 84 | time_recompose() 85 | 86 | tidyverse_cran_downloads_anomalized %>% glimpse() 87 | ``` 88 | 89 | Let's explain what happened: 90 | 91 | 1. `time_decompose(count, merge = TRUE)`: This performs a time series decomposition on the "count" column using seasonal decomposition. It created four columns: 92 | * "observed": The observed values (actuals) 93 | * "season": The seasonal or cyclic trend. The default for daily data is a weekly seasonality. 94 | * "trend": This is the long-term trend. The default is a Loess smoother using spans of 3 months for daily data. 95 | * "remainder": This is what we want to analyze for outliers. It is simply the observed minus both the season and trend. 96 | * Setting `merge = TRUE` keeps the original data with the newly created columns. 97 | 98 | 2. `anomalize(remainder)`: This performs anomaly detection on the remainder column. It creates three new columns: 99 | * "remainder_l1": The lower limit of the remainder 100 | * "remainder_l2": The upper limit of the remainder 101 | * "anomaly": Yes/No telling us whether or not the observation is an anomaly 102 | 103 | 3. `time_recompose()`: This recomposes the season, trend, remainder_l1, and remainder_l2 columns into new limits that bound the observed values. The two new columns created are: 104 | * "recomposed_l1": The lower bound of outliers around the observed value 105 | * "recomposed_l2": The upper bound of outliers around the observed value 106 | 107 | We can then visualize the anomalies using the `plot_anomalies()` function. 108 | 109 | ```{r, fig.height=8, fig.width=6} 110 | tidyverse_cran_downloads_anomalized %>% 111 | plot_anomalies(ncol = 3, alpha_dots = 0.25) 112 | ``` 113 | 114 | 115 | ## Parameter Adjustment 116 | 117 | Now that you have an overview of the package, you can begin to adjust the parameter settings. The first settings you may wish to explore are related to time series decomposition: trend and seasonality. The second are related to anomaly detection: alpha and max anoms. 118 | 119 | ### Adjusting Decomposition Trend and Seasonality 120 | 121 | Adjusting the trend and seasonality is fundamental to time series analysis and specifically time series decomposition. With `anomalize`, it's simple to make adjustments because everything is done with date or datetime information, so you can intuitively select increments by time spans that make sense (e.g. "5 minutes" or "1 month"). 122 | 123 | To get started, let's isolate one of the packages: lubridate. 124 | 125 | ```{r} 126 | lubridate_daily_downloads <- tidyverse_cran_downloads %>% 127 | filter(package == "lubridate") %>% 128 | ungroup() 129 | 130 | lubridate_daily_downloads 131 | ``` 132 | 133 | Next, let's perform anomaly detection.
134 | 135 | ```{r} 136 | lubridate_daily_downloads_anomalized <- lubridate_daily_downloads %>% 137 | time_decompose(count) %>% 138 | anomalize(remainder) %>% 139 | time_recompose() 140 | 141 | lubridate_daily_downloads_anomalized %>% glimpse() 142 | ``` 143 | 144 | First, notice that a `frequency` and a `trend` were automatically selected for us. This is by design. The arguments `frequency = "auto"` and `trend = "auto"` are the defaults. We can visualize this decomposition using `plot_anomaly_decomposition()`. 145 | 146 | ```{r, fig.width=5, fig.height=6} 147 | p1 <- lubridate_daily_downloads_anomalized %>% 148 | plot_anomaly_decomposition() + 149 | ggtitle("Freq/Trend = 'auto'") 150 | 151 | p1 152 | ``` 153 | 154 | 155 | 156 | When "auto" is used, `get_time_scale_template()` is used to determine logical frequency and trend spans based on the scale of the data. You can uncover the logic: 157 | 158 | ```{r} 159 | get_time_scale_template() 160 | ``` 161 | 162 | What this means is that if the scale is 1 day (meaning the difference between each data point is 1 day), then the frequency will be 7 days (or 1 week) and the trend will be around 90 days (or 3 months). This logic tends to work quite well for anomaly detection, but you may wish to adjust it. There are two ways: 163 | 164 | 1. Local parameter adjustment 165 | 2. Global parameter adjustment 166 | 167 | #### Local Parameter Adjustment 168 | 169 | Local parameter adjustment can be performed by tweaking the in-function parameters. Below we adjust `trend = "14 days"`, which makes for quite an overfit trend. 170 | 171 | ```{r, fig.show="hold", fig.height=6, fig.align="default"} 172 | # Local adjustment via time_decompose 173 | p2 <- lubridate_daily_downloads %>% 174 | time_decompose(count, 175 | frequency = "auto", 176 | trend = "14 days") %>% 177 | anomalize(remainder) %>% 178 | plot_anomaly_decomposition() + 179 | ggtitle("Trend = 14 Days (Local)") 180 | 181 | # Show plots 182 | p1 183 | p2 184 | ``` 185 | 186 | #### Global Parameter Adjustment 187 | 188 | We can also adjust globally by using `set_time_scale_template()` to update the default template to one that we prefer. We'll change the "3 month" trend to "14 days" (2 weeks) for time scale = "day". Use `time_scale_template()` to retrieve the time scale template that `anomalize` begins with, then `mutate()` the trend field in the desired location, and use `set_time_scale_template()` to update the template in the global options. We can retrieve the updated template using `get_time_scale_template()` to verify the change has been executed properly. 189 | 190 | ```{r} 191 | # Globally change time scale template options 192 | time_scale_template() %>% 193 | mutate(trend = ifelse(time_scale == "day", "14 days", trend)) %>% 194 | set_time_scale_template() 195 | 196 | get_time_scale_template() 197 | ``` 198 | 199 | Finally, we can re-run `time_decompose()` with the defaults, and we can see that the trend is "14 days". 200 | 201 | ```{r, fig.width=5, fig.height=6} 202 | p3 <- lubridate_daily_downloads %>% 203 | time_decompose(count) %>% 204 | anomalize(remainder) %>% 205 | plot_anomaly_decomposition() + 206 | ggtitle("Trend = 14 Days (Global)") 207 | 208 | p3 209 | ``` 210 | 211 | Let's reset the time scale template back to the original defaults.
212 | 213 | ```{r} 214 | # Set time scale template to the original defaults 215 | time_scale_template() %>% 216 | set_time_scale_template() 217 | 218 | # Verify the change 219 | get_time_scale_template() 220 | ``` 221 | 222 | 223 | ### Adjusting Anomaly Detection Alpha and Max Anoms 224 | 225 | `alpha` and `max_anoms` are the two parameters that control the `anomalize()` function. Here's how they work. 226 | 227 | #### Alpha 228 | 229 | We can adjust `alpha`, which is set to 0.05 by default. At the default, the bands just cover the outside of the range. 230 | 231 | ```{r, fig.height=5, fig.width=5} 232 | p4 <- lubridate_daily_downloads %>% 233 | time_decompose(count) %>% 234 | anomalize(remainder, alpha = 0.05, max_anoms = 0.2) %>% 235 | time_recompose() %>% 236 | plot_anomalies(time_recomposed = TRUE) + 237 | ggtitle("alpha = 0.05") 238 | 239 | p4 240 | ``` 241 | 242 | We can decrease `alpha`, which widens the bands, making it more difficult for a point to be an outlier. Notice that the bands doubled in size. 243 | 244 | ```{r, fig.show="hold", fig.align="default"} 245 | p5 <- lubridate_daily_downloads %>% 246 | time_decompose(count) %>% 247 | anomalize(remainder, alpha = 0.025, max_anoms = 0.2) %>% 248 | time_recompose() %>% 249 | plot_anomalies(time_recomposed = TRUE) + 250 | ggtitle("alpha = 0.025") 251 | 252 | p4 253 | p5 254 | ``` 255 | 256 | #### Max Anoms 257 | 258 | The `max_anoms` parameter is used to control the maximum percentage of data that can be an anomaly. This is useful in cases where `alpha` is too difficult to tune, and you really want to focus on the most egregious anomalies. 259 | 260 | Let's adjust `alpha = 0.3` so pretty much anything is an outlier. Now let's try a comparison between `max_anoms = 0.2` (20% anomalies allowed) and `max_anoms = 0.05` (5% anomalies allowed). 261 | 262 | ```{r, fig.show="hold", fig.align="default"} 263 | p6 <- lubridate_daily_downloads %>% 264 | time_decompose(count) %>% 265 | anomalize(remainder, alpha = 0.3, max_anoms = 0.2) %>% 266 | time_recompose() %>% 267 | plot_anomalies(time_recomposed = TRUE) + 268 | ggtitle("20% Anomalies") 269 | 270 | p7 <- lubridate_daily_downloads %>% 271 | time_decompose(count) %>% 272 | anomalize(remainder, alpha = 0.3, max_anoms = 0.05) %>% 273 | time_recompose() %>% 274 | plot_anomalies(time_recomposed = TRUE) + 275 | ggtitle("5% Anomalies") 276 | 277 | p6 278 | p7 279 | ``` 280 | 281 | In reality, you'll probably want to leave `alpha` in the range of 0.10 to 0.02, but it makes a nice illustration of how you can also use `max_anoms` to ensure only the most egregious anomalies are identified. 282 | 283 | 284 | 285 | ## Further Understanding: Methods 286 | 287 | If you haven't had your fill and want to dive into the methods that power anomalize, check out the vignette, "Anomalize Methods". 288 | 289 | 290 | # Interested in Learning Anomaly Detection?
291 | 292 | Business Science offers two 1-hour courses on Anomaly Detection: 293 | 294 | - [Learning Lab 18](https://university.business-science.io/p/learning-labs-pro) - Time Series Anomaly Detection with `anomalize` 295 | 296 | - [Learning Lab 17](https://university.business-science.io/p/learning-labs-pro) - Anomaly Detection with `H2O` Machine Learning 297 | -------------------------------------------------------------------------------- /vignettes/forecasting_with_cleaned_anomalies.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Reduce Forecast Error with Cleaned Anomalies" 3 | author: "Business Science" 4 | date: "`r Sys.Date()`" 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{Reduce Forecast Error with Cleaned Anomalies} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | ```{r, include = FALSE} 13 | knitr::opts_chunk$set( 14 | collapse = TRUE, 15 | comment = "#>", 16 | warning = F, 17 | fig.align = "center" 18 | ) 19 | 20 | library(dplyr) 21 | library(ggplot2) 22 | library(tidyquant) 23 | library(anomalize) 24 | library(timetk) 25 | ``` 26 | 27 | 28 | 29 | > Forecasting error can often be reduced 20% to 50% by repairing anomalous data 30 | 31 | ## Example - Reducing Forecasting Error by 32% 32 | 33 | We can often get better forecast performance by cleaning anomalous data prior to forecasting. This is the perfect use case for integrating the `clean_anomalies()` function into your ___forecast workflow___. 34 | 35 | ```r 36 | library(tidyverse) 37 | library(tidyquant) 38 | library(anomalize) 39 | library(timetk) 40 | ``` 41 | 42 | ```{r} 43 | # NOTE: timetk now has anomaly detection built in, which 44 | # will get the new functionality going forward. 45 | # Use this script to prevent overwriting legacy anomalize: 46 | 47 | anomalize <- anomalize::anomalize 48 | plot_anomalies <- anomalize::plot_anomalies 49 | ``` 50 | 51 | Here is a short example with the `tidyverse_cran_downloads` dataset that comes with `anomalize`. __We'll see how we can reduce the forecast error by 32% simply by repairing anomalies.__ 52 | 53 | ```{r} 54 | tidyverse_cran_downloads 55 | ``` 56 | 57 | Let's take one package with some extreme events. We can home in on `lubridate`, which has some outliers that we can fix. 58 | 59 | ```{r, fig.height=8, fig.width=6} 60 | tidyverse_cran_downloads %>% 61 | ggplot(aes(date, count, color = package)) + 62 | geom_point(alpha = 0.5) + 63 | facet_wrap(~ package, ncol = 3, scales = "free_y") + 64 | scale_color_viridis_d() + 65 | theme_tq() 66 | ``` 67 | 68 | 69 | ## Forecasting Lubridate Downloads 70 | 71 | Let's focus on downloads of the `lubridate` R package. 72 | 73 | ```{r} 74 | lubridate_tbl <- tidyverse_cran_downloads %>% 75 | ungroup() %>% 76 | filter(package == "lubridate") 77 | ``` 78 | 79 | First, we'll make a function, `forecast_mae()`, that can take either the cleaned or the uncleaned series as training input and calculate the forecast error against the future (uncleaned) observations. 80 | 81 | The modeling function uses the following criteria: 82 | 83 | - Splits the `data` into training and testing sets that maintain the correct time-series sequence, using the `prop` argument. 84 | - Models the daily time series of the training set from either the observed column (demonstrates no cleaning) or the cleaned column (demonstrates the improvement from cleaning), as specified by the `col_train` argument. 85 | - Compares the predictions to the observed values, as specified by the `col_test` argument.
86 | 87 | ```{r} 88 | forecast_mae <- function(data, col_train, col_test, prop = 0.8) { 89 | 90 | predict_expr <- enquo(col_train) 91 | actual_expr <- enquo(col_test) 92 | 93 | idx_train <- 1:(floor(prop * nrow(data))) 94 | 95 | train_tbl <- data %>% filter(row_number() %in% idx_train) 96 | test_tbl <- data %>% filter(!row_number() %in% idx_train) 97 | 98 | # Model using training data 99 | model_formula <- as.formula(paste0(quo_name(predict_expr), " ~ index.num + year + quarter + month.lbl + day + wday.lbl")) 100 | 101 | model_glm <- train_tbl %>% 102 | tk_augment_timeseries_signature() %>% 103 | glm(model_formula, data = .) 104 | 105 | # Make Prediction 106 | suppressWarnings({ 107 | # Suppress rank-deficient warning 108 | prediction <- predict(model_glm, newdata = test_tbl %>% tk_augment_timeseries_signature()) 109 | actual <- test_tbl %>% pull(!! actual_expr) 110 | }) 111 | 112 | # Calculate MAE 113 | mae <- mean(abs(prediction - actual)) 114 | 115 | return(mae) 116 | 117 | } 118 | ``` 119 | 120 | ## Workflow for Cleaning Anomalies 121 | 122 | We will use the `anomalize` workflow of decomposing (`time_decompose()`) and identifying anomalies (`anomalize()`). We use the function __`clean_anomalies()` to add a new column called "observed_cleaned" that is repaired by replacing all anomalies with the trend + seasonal components from the decompose operation__. We can now experiment to see the improvement in forecasting performance by comparing a forecast made with "observed" versus "observed_cleaned". 123 | 124 | ```{r} 125 | lubridate_anomalized_tbl <- lubridate_tbl %>% 126 | time_decompose(count) %>% 127 | anomalize(remainder) %>% 128 | 129 | # Function to clean & repair anomalous data 130 | clean_anomalies() 131 | 132 | lubridate_anomalized_tbl 133 | ``` 134 | 135 | ## Before Cleaning with anomalize 136 | 137 | ```{r} 138 | lubridate_anomalized_tbl %>% 139 | forecast_mae(col_train = observed, col_test = observed, prop = 0.8) 140 | ``` 141 | 142 | ## After Cleaning with anomalize 143 | 144 | ```{r} 145 | lubridate_anomalized_tbl %>% 146 | forecast_mae(col_train = observed_cleaned, col_test = observed, prop = 0.8) 147 | ``` 148 | 149 | ## 32% Reduction in Forecast Error 150 | 151 | This is approximately a 32% reduction in forecast error as measured by Mean Absolute Error (MAE). 152 | 153 | ```{r} 154 | (2755 - 4054) / 4054 155 | ``` 156 | 157 | # Interested in Learning Anomaly Detection? 158 | 159 | Business Science offers two 1-hour courses on Anomaly Detection: 160 | 161 | - [Learning Lab 18](https://university.business-science.io/p/learning-labs-pro) - Time Series Anomaly Detection with `anomalize` 162 | 163 | - [Learning Lab 17](https://university.business-science.io/p/learning-labs-pro) - Anomaly Detection with `H2O` Machine Learning 164 | --------------------------------------------------------------------------------