├── .ci ├── build-docs.sh ├── install.sh ├── lint-r-code.R ├── report_to_covr.sh ├── setup.sh └── test.sh ├── .github ├── CODEOWNERS ├── dependabot.yml └── workflows │ ├── build-docs.yaml │ └── ci.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── NEWS.md ├── README.md ├── cleanup_local.sh ├── cran-comments.md ├── r-pkg ├── .Rbuildignore ├── DESCRIPTION ├── LICENSE ├── NAMESPACE ├── R │ ├── assertions.R │ ├── chomp_aggs.R │ ├── chomp_hits.R │ ├── es_search.R │ ├── get_fields.R │ ├── helperfuns.R │ ├── logging.R │ ├── parse_date_time.R │ ├── unpack_nested_data.R │ └── uptasticsearch.R ├── _pkgdown.yml ├── inst │ └── testdata │ │ └── .gitkeep ├── man │ ├── chomp_aggs.Rd │ ├── chomp_hits.Rd │ ├── doc_shared.Rd │ ├── es_search.Rd │ ├── get_fields.Rd │ ├── parse_date_time.Rd │ └── unpack_nested_data.Rd ├── tests │ ├── testthat.R │ └── testthat │ │ ├── test-assertions.R │ │ ├── test-chomp_aggs.R │ │ ├── test-chomp_hits.R │ │ ├── test-es_search.R │ │ ├── test-get_fields.R │ │ ├── test-integration.R │ │ ├── test-parse_date_time.R │ │ └── test-unpack_nested_data.R └── vignettes │ └── FAQ.Rmd ├── setup_local.sh └── test-data ├── aggs_cardinality.json ├── aggs_date_histogram.json ├── aggs_date_histogram_cardinality.json ├── aggs_date_histogram_extended_stats.json ├── aggs_date_histogram_histogram.json ├── aggs_date_histogram_percentiles.json ├── aggs_date_histogram_significant_terms.json ├── aggs_date_histogram_stats.json ├── aggs_date_histogram_terms.json ├── aggs_extended_stats.json ├── aggs_histogram.json ├── aggs_percentiles.json ├── aggs_significant_terms.json ├── aggs_stats.json ├── aggs_terms.json ├── aggs_terms_cardinality.json ├── aggs_terms_date_histogram.json ├── aggs_terms_date_histogram_cardinality.json ├── aggs_terms_date_histogram_extended_stats.json ├── aggs_terms_date_histogram_percentiles.json ├── aggs_terms_date_histogram_significant_terms.json ├── aggs_terms_date_histogram_stats.json ├── aggs_terms_date_histogram_terms.json ├── aggs_terms_extended_stats.json ├── aggs_terms_histogram.json ├── aggs_terms_percentiles.json ├── aggs_terms_significant_terms.json ├── aggs_terms_stats.json ├── aggs_terms_terms.json ├── empty_terms.json ├── es5_shakespeare_mapping.json ├── es6_shakespeare_mapping.json ├── es7_shakespeare_mapping.json ├── es_hits.json ├── legacy_shakespeare_mapping.json ├── one_index_mapping.json ├── one_var_agg.json ├── sample.json ├── sample_es7.json ├── three_var_agg.json └── two_index_mapping.json /.ci/build-docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # failure is a natural part of life 4 | set -e -u -o pipefail 5 | 6 | # setup LaTeX stuff 7 | brew install basictex 8 | export PATH="/Library/TeX/texbin:$PATH" 9 | sudo tlmgr --verify-repo=none update --self 10 | sudo tlmgr --verify-repo=none install inconsolata helvetic rsfs 11 | 12 | # install dependencies 13 | Rscript -e "install.packages(c('assertthat', 'curl', 'data.table', 'futile.logger', 'jsonlite', 'knitr', 'markdown', 'pkgdown', 'purrr', 'roxygen2', 'stringr'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" 14 | 15 | cp NEWS.md ./r-pkg/ 16 | cp README.md ./r-pkg/ 17 | 18 | # build the docs 19 | pushd ./r-pkg 20 | R CMD INSTALL --with-keep.source . 
21 | Rscript -e "roxygen2::roxygenize()" 22 | Rscript -e "pkgdown::build_site()" 23 | popd 24 | -------------------------------------------------------------------------------- /.ci/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # failure is a natural part of life 4 | set -e -u -o pipefail 5 | 6 | R CMD INSTALL \ 7 | --clean \ 8 | ./r-pkg 9 | -------------------------------------------------------------------------------- /.ci/lint-r-code.R: -------------------------------------------------------------------------------- 1 | 2 | library(lintr) # nolint[unused_import] 3 | 4 | args <- commandArgs( 5 | trailingOnly = TRUE 6 | ) 7 | SOURCE_DIR <- args[[1L]] 8 | 9 | FILES_TO_LINT <- list.files( 10 | path = SOURCE_DIR 11 | , pattern = "\\.r$" 12 | , all.files = TRUE 13 | , ignore.case = TRUE 14 | , full.names = TRUE 15 | , recursive = TRUE 16 | , include.dirs = FALSE 17 | ) 18 | 19 | # text to use for pipe operators from packages like 'magrittr' 20 | pipe_text <- paste0( 21 | "For consistency and the sake of being explicit, this project's code " 22 | , "does not use the pipe operator." 23 | ) 24 | 25 | # text to use for functions that should only be called interactively 26 | interactive_text <- paste0( 27 | "Functions like '?', 'help', and 'install.packages()' should only be used " 28 | , "interactively, not in package code." 29 | ) 30 | 31 | LINTERS_TO_USE <- list( 32 | "absolute_path" = lintr::absolute_path_linter() 33 | , "any_duplicated" = lintr::any_duplicated_linter() 34 | , "any_is_na" = lintr::any_is_na_linter() 35 | , "assignment" = lintr::assignment_linter() 36 | , "backport" = lintr::backport_linter() 37 | , "boolean_arithmetic" = lintr::boolean_arithmetic_linter() 38 | , "braces" = lintr::brace_linter() 39 | , "class_equals" = lintr::class_equals_linter() 40 | , "commas" = lintr::commas_linter() 41 | , "conjunct_test" = lintr::conjunct_test_linter() 42 | , "duplicate_argument" = lintr::duplicate_argument_linter() 43 | , "empty_assignment" = lintr::empty_assignment_linter() 44 | , "equals_na" = lintr::equals_na_linter() 45 | , "fixed_regex" = lintr::fixed_regex_linter() 46 | , "for_loop_index" = lintr::for_loop_index_linter() 47 | , "function_left" = lintr::function_left_parentheses_linter() 48 | , "function_return" = lintr::function_return_linter() 49 | , "implicit_assignment" = lintr::implicit_assignment_linter() 50 | , "infix_spaces" = lintr::infix_spaces_linter() 51 | , "inner_combine" = lintr::inner_combine_linter() 52 | , "is_numeric" = lintr::is_numeric_linter() 53 | , "lengths" = lintr::lengths_linter() 54 | , "length_levels" = lintr::length_levels_linter() 55 | , "length_test" = lintr::length_test_linter() 56 | , "line_length" = lintr::line_length_linter(length = 150L) 57 | , "literal_coercion" = lintr::literal_coercion_linter() 58 | , "matrix" = lintr::matrix_apply_linter() 59 | , "missing_argument" = lintr::missing_argument_linter() 60 | , "non_portable_path" = lintr::nonportable_path_linter() 61 | , "numeric_leading_zero" = lintr::numeric_leading_zero_linter() 62 | , "outer_negation" = lintr::outer_negation_linter() 63 | , "package_hooks" = lintr::package_hooks_linter() 64 | , "paren_body" = lintr::paren_body_linter() 65 | , "paste" = lintr::paste_linter() 66 | , "quotes" = lintr::quotes_linter() 67 | , "redundant_equals" = lintr::redundant_equals_linter() 68 | , "regex_subset" = lintr::regex_subset_linter() 69 | , "routine_registration" = lintr::routine_registration_linter() 70 | , "scalar_in" = 
lintr::scalar_in_linter() 71 | , "semicolon" = lintr::semicolon_linter() 72 | , "seq" = lintr::seq_linter() 73 | , "spaces_inside" = lintr::spaces_inside_linter() 74 | , "spaces_left_parens" = lintr::spaces_left_parentheses_linter() 75 | , "sprintf" = lintr::sprintf_linter() 76 | , "string_boundary" = lintr::string_boundary_linter() 77 | #, "todo_comments" = lintr::todo_comment_linter(c("todo", "fixme", "to-do")) 78 | , "trailing_blank" = lintr::trailing_blank_lines_linter() 79 | , "trailing_white" = lintr::trailing_whitespace_linter() 80 | , "true_false" = lintr::T_and_F_symbol_linter() 81 | , "undesirable_function" = lintr::undesirable_function_linter( 82 | fun = c( 83 | "cbind" = paste0( 84 | "cbind is an unsafe way to build up a data frame. merge() or direct " 85 | , "column assignment is preferred." 86 | ) 87 | , "help" = interactive_text 88 | , "ifelse" = "The use of ifelse() is dangerous because it will silently allow mixing types." 89 | , "install.packages" = interactive_text 90 | , "rbind" = "data.table::rbindlist() is faster and safer than rbind(), and is preferred in this project." 91 | , "require" = paste0( 92 | "library() is preferred to require() because it will raise an error immediately " 93 | , "if a package is missing." 94 | ) 95 | ) 96 | ) 97 | , "undesirable_operator" = lintr::undesirable_operator_linter( 98 | op = c( 99 | "%>%" = pipe_text 100 | , "%.%" = pipe_text 101 | , "%..%" = pipe_text 102 | , "?" = interactive_text 103 | , "??" = interactive_text 104 | ) 105 | ) 106 | , "unnecessary_concatenation" = lintr::unnecessary_concatenation_linter() 107 | , "unnecessary_lambda" = lintr::unnecessary_lambda_linter() 108 | , "unreachable_code" = lintr::unreachable_code_linter() 109 | , "unused_import" = lintr::unused_import_linter() 110 | , "vector_logic" = lintr::vector_logic_linter() 111 | , "whitespace" = lintr::whitespace_linter() 112 | ) 113 | 114 | cat(sprintf("Found %i R files to lint\n", length(FILES_TO_LINT))) 115 | 116 | results <- NULL 117 | 118 | for (r_file in FILES_TO_LINT) { 119 | 120 | this_result <- lintr::lint( 121 | filename = r_file 122 | , linters = LINTERS_TO_USE 123 | , cache = FALSE 124 | ) 125 | 126 | print( 127 | sprintf( 128 | "Found %i linting errors in %s" 129 | , length(this_result) 130 | , r_file 131 | ) 132 | , quote = FALSE 133 | ) 134 | 135 | results <- c(results, this_result) 136 | 137 | } 138 | 139 | issues_found <- length(results) 140 | 141 | if (issues_found > 0L) { 142 | print(results) 143 | } 144 | 145 | quit(save = "no", status = issues_found) 146 | -------------------------------------------------------------------------------- /.ci/report_to_covr.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # failure is a natural part of life 4 | set -e -u -o pipefail 5 | 6 | Rscript -e " \ 7 | Sys.setenv(NOT_CRAN = 'true'); \ 8 | covr::codecov('r-pkg/') \ 9 | " 10 | -------------------------------------------------------------------------------- /.ci/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # failure is a natural part of life 4 | set -e -u -o pipefail 5 | 6 | # `devscripts` is required for 'checkbashisms' (https://github.com/r-lib/actions/issues/111) 7 | sudo apt-get update 8 | sudo apt-get install \ 9 | --no-install-recommends \ 10 | -y \ 11 | --allow-downgrades \ 12 | libcurl4-openssl-dev \ 13 | curl \ 14 | devscripts \ 15 | texinfo \ 16 | texlive-latex-recommended \ 17 | texlive-fonts-recommended \ 18 | 
texlive-fonts-extra \ 19 | tidy \ 20 | qpdf 21 | 22 | Rscript -e "install.packages(c('covr', 'curl', 'data.table', 'futile.logger', 'jsonlite', 'knitr', 'lintr', 'markdown', 'purrr', 'stringr', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" 23 | cp test-data/* r-pkg/inst/testdata/ 24 | -------------------------------------------------------------------------------- /.ci/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # failure is a natural part of life 4 | set -e -u -o pipefail 5 | 6 | R CMD build ./r-pkg 7 | export _R_CHECK_CRAN_INCOMING_=false 8 | R CMD check \ 9 | --as-cran \ 10 | ./*.tar.gz 11 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # This file controls default reviewers for 'uptasticsearch' code. 2 | # See https://help.github.com/en/articles/about-code-owners 3 | # for details 4 | # 5 | # Maintainers are encouraged to use their best discretion in 6 | # setting reviewers on PRs manually, but this file should 7 | # offer a reasonable automatic best-guess. 8 | # 9 | # NOTE: according to GitHub, the LAST rule matched in this 10 | # file will determine who is added to a PR for review 11 | 12 | # Default reviewers for all code 13 | * @jameslamb @austin3dickey 14 | 15 | # community files 16 | LICENSE @jameslamb @austin3dickey @bburns632 17 | CONDUCT.md @jameslamb @austin3dickey @bburns632 18 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | --- 2 | version: 2 3 | updates: 4 | - package-ecosystem: github-actions 5 | directory: / 6 | schedule: 7 | interval: monthly 8 | groups: 9 | ci-dependencies: 10 | patterns: 11 | - "*" 12 | commit-message: 13 | prefix: "[ci]" 14 | labels: 15 | - maintenance 16 | -------------------------------------------------------------------------------- /.github/workflows/build-docs.yaml: -------------------------------------------------------------------------------- 1 | name: build-docs 2 | 3 | concurrency: 4 | group: docs-build-on-${{ github.event_name }}-from-${{ github.ref_name }} 5 | cancel-in-progress: true 6 | 7 | on: 8 | # run only when called by other workflows 9 | workflow_call: 10 | inputs: 11 | deploy: 12 | required: true 13 | type: boolean 14 | default: false 15 | description: "set to true to publish docs" 16 | 17 | jobs: 18 | build: 19 | runs-on: macos-latest 20 | steps: 21 | - uses: actions/checkout@v4 22 | with: 23 | fetch-depth: 0 24 | - name: set up R 25 | uses: r-lib/actions/setup-r@v2 26 | with: 27 | r-version: release 28 | - name: set up pandoc 29 | uses: r-lib/actions/setup-pandoc@v2 30 | - name: build docs 31 | run: | 32 | .ci/build-docs.sh 33 | - uses: actions/upload-pages-artifact@v3 34 | with: 35 | path: ./r-pkg/docs 36 | 37 | deploy: 38 | needs: 39 | - build 40 | if: inputs.deploy 41 | 42 | # Grant GITHUB_TOKEN the permissions required to make a Pages deployment 43 | permissions: 44 | pages: write # to deploy to Pages 45 | id-token: write # to verify the deployment originates from an appropriate source 46 | 47 | # Deploy to the github-pages environment 48 | environment: 49 | name: github-pages 50 | url: ${{ steps.deployment.outputs.page_url }} 51 | 52 | runs-on: ubuntu-latest 53 | steps: 54 | - name: Deploy to GitHub Pages 55 | id: deployment 56 | 
uses: actions/deploy-pages@v4 57 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: ci 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | # run by clicking buttons in the GitHub Actions UI 11 | workflow_dispatch: 12 | inputs: 13 | deploy-docs: 14 | description: 'Update the docs site?' 15 | required: true 16 | type: boolean 17 | 18 | # automatically cancel in-progress builds if another commit is pushed 19 | concurrency: 20 | group: ${{ github.workflow }}-${{ github.ref }} 21 | cancel-in-progress: true 22 | 23 | env: 24 | # parallelize compilation (extra important for Linux, where CRAN doesn't supply pre-compiled binaries) 25 | MAKEFLAGS: "-j4" 26 | 27 | jobs: 28 | build-docs: 29 | uses: ./.github/workflows/build-docs.yaml 30 | with: 31 | deploy: ${{ (github.event_name == 'push' && startsWith(github.ref, 'refs/tags')) || (github.event_name == 'workflow_dispatch' && inputs.deploy-docs == true) }} 32 | secrets: inherit 33 | lint: 34 | name: lint 35 | runs-on: ubuntu-latest 36 | timeout-minutes: 30 37 | steps: 38 | - name: checkout repository 39 | uses: actions/checkout@v4 40 | with: 41 | fetch-depth: 0 42 | - uses: pre-commit/action@v3.0.1 43 | - name: set up R 44 | uses: r-lib/actions/setup-r@v2 45 | - name: run lintr 46 | run: | 47 | Rscript -e "install.packages('lintr')" 48 | Rscript ./.ci/lint-r-code.R $(pwd) 49 | test: 50 | name: test (ES ${{ matrix.es_version }}) 51 | runs-on: ubuntu-latest 52 | timeout-minutes: 60 53 | strategy: 54 | fail-fast: false 55 | matrix: 56 | es_version: 57 | - 1.7.6 58 | - 2.4.6 59 | - 5.6.16 60 | - 6.8.15 61 | - 7.0.1 62 | - 7.17.22 63 | - 8.0.1 64 | - 8.5.3 65 | - 8.10.4 66 | - 8.15.5 67 | - 8.17.2 68 | steps: 69 | - name: checkout repository 70 | uses: actions/checkout@v4 71 | with: 72 | fetch-depth: 1 73 | - name: set up R 74 | uses: r-lib/actions/setup-r@v2 75 | with: 76 | r-version: release 77 | - name: set up pandoc 78 | uses: r-lib/actions/setup-pandoc@v2 79 | - name: run tests 80 | shell: bash 81 | run: | 82 | export ES_VERSION=${{ matrix.es_version }} 83 | $GITHUB_WORKSPACE/.ci/setup.sh 84 | $GITHUB_WORKSPACE/.ci/install.sh 85 | $GITHUB_WORKSPACE/setup_local.sh ${{ matrix.es_version }} 86 | $GITHUB_WORKSPACE/.ci/test.sh 87 | $GITHUB_WORKSPACE/.ci/report_to_covr.sh 88 | all-successful: 89 | if: always() 90 | runs-on: ubuntu-latest 91 | needs: 92 | - build-docs 93 | - lint 94 | - test 95 | steps: 96 | - name: Decide whether the needed jobs succeeded or failed 97 | uses: re-actors/alls-green@v1.2.2 98 | with: 99 | jobs: ${{ toJSON(needs) }} 100 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | 5 | # Session Data files 6 | .RData 7 | 8 | # Example code in package build process 9 | *-Ex.R 10 | 11 | # Output files from R CMD build 12 | *.tar.gz 13 | *.Rcheck/ 14 | 15 | # RStudio files 16 | *.Rproj 17 | .Rproj.user/ 18 | 19 | # produced vignettes 20 | vignettes/*.html 21 | vignettes/*.pdf 22 | 23 | # Temporary files created by R markdown 24 | *.utf8.md 25 | *.knit.md 26 | .Rproj.user 27 | 28 | # Data files 29 | *.Rda 30 | *.pdf 31 | *.csv 32 | 33 | # system files 34 | *.DS_Store 35 | 36 | # misc testing files 37 | sandbox/ 38 | lib/ 39 | coverage.html 40 | 41 | # shared files copied into 
package at build time 42 | r-pkg/NEWS.md 43 | r-pkg/README.md 44 | r-pkg/inst/testdata/*.json 45 | 46 | # Python stuff 47 | **/.pytest_cache/ 48 | **/__pycache__/ 49 | **/dist/ 50 | **/htmlcov/ 51 | **/*.egg-info/ 52 | 53 | # As long as we're storing the pkgdown site 54 | # at the repo root, should protect against 55 | # people committing files in r-pkg 56 | r-pkg/docs/ 57 | 58 | # backup files from command-line tools 59 | *.bak 60 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | exclude: | 3 | (?x)^( 4 | test-data/.* 5 | )$ 6 | repos: 7 | - repo: https://github.com/pre-commit/pre-commit-hooks 8 | rev: v5.0.0 9 | hooks: 10 | - id: end-of-file-fixer 11 | - id: trailing-whitespace 12 | - repo: https://github.com/maxwinterstein/shfmt-py 13 | rev: v3.7.0.1 14 | hooks: 15 | - id: shfmt 16 | args: ["--indent=4", "--space-redirects", "--write"] 17 | - repo: https://github.com/shellcheck-py/shellcheck-py 18 | rev: v0.10.0.1 19 | hooks: 20 | - id: shellcheck 21 | args: ["--exclude=SC2002"] 22 | - repo: https://github.com/codespell-project/codespell 23 | rev: v2.4.1 24 | hooks: 25 | - id: codespell 26 | # additional_dependencies: [tomli] 27 | # args: ["--toml", "pyproject.toml"] 28 | -------------------------------------------------------------------------------- /CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 6 | 7 | ## Our Standards 8 | 9 | **Examples of behavior that contributes to creating a positive environment include:** 10 | 11 | * Using welcoming and inclusive language 12 | * Being respectful of differing viewpoints and experiences 13 | * Gracefully accepting constructive criticism 14 | * Focusing on what is best for the community 15 | * Showing empathy towards other community members 16 | 17 | **Examples of unacceptable behavior by participants include:** 18 | 19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 20 | * Trolling, insulting/derogatory comments, and personal or political attacks 21 | * Public or private harassment 22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 23 | * Other conduct which could reasonably be considered inappropriate in a professional setting 24 | 25 | ## Our Responsibilities 26 | 27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 28 | 29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 
30 | 31 | ## Scope 32 | 33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 34 | 35 | ## Enforcement 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team (see the "Maintainer" field in file `r-pkg/DESCRIPTION`). All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 38 | 39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 40 | 41 | ## Attribution 42 | 43 | This Code of Conduct is adapted from the Contributor Covenant, version 1.4, available at http://contributor-covenant.org/version/1/4. 44 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2018, Uptake 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: build 2 | build: 3 | cp test-data/* r-pkg/inst/testdata/ 4 | R CMD BUILD r-pkg/ 5 | 6 | .PHONY: coverage 7 | coverage: 8 | echo "Calculating test coverage..." 
9 | Rscript -e "Sys.setenv(NOT_CRAN = 'true'); coverage <- covr::package_coverage('r-pkg/'); print(coverage); covr::report(coverage, './coverage.html')" 10 | echo "Done calculating coverage" 11 | open coverage.html 12 | 13 | .PHONY: install 14 | install: build 15 | R CMD INSTALL r-pkg/ 16 | 17 | .PHONY: test 18 | test: build 19 | R CMD CHECK --as-cran uptasticsearch_*.tar.gz 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # uptasticsearch 2 | 3 | [![GitHub Actions Build Status](https://github.com/uptake/uptasticsearch/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/uptake/uptasticsearch/actions/workflows/ci.yml) 4 | [![codecov](https://codecov.io/gh/uptake/uptasticsearch/branch/main/graph/badge.svg)](https://app.codecov.io/gh/uptake/uptasticsearch) 5 | [![CRAN\_Status\_Badge](https://www.r-pkg.org/badges/version-last-release/uptasticsearch)](https://cran.r-project.org/package=uptasticsearch) 6 | [![CRAN\_Download\_Badge](https://cranlogs.r-pkg.org/badges/grand-total/uptasticsearch)](https://cran.r-project.org/package=uptasticsearch) 7 | 8 | ## Introduction 9 | 10 | `uptasticsearch` tackles the issue of getting data out of Elasticsearch and into a tabular format in R. 11 | It should work for all versions of Elasticsearch from 1.0.0 onwards, but [is not regularly tested against all of them](https://github.com/uptake/uptasticsearch/blob/main/CONTRIBUTING.md#gha). 12 | If you run into a problem, please [open an issue](https://github.com/uptake/uptasticsearch/issues). 13 | 14 | # Table of contents 15 | 16 | * [How it Works](#howitworks) 17 | * [Installation](#installation) 18 | * [R](#rinstallation) 19 | * [Usage Examples](#examples) 20 | * [Get a Batch of Documents](#example1) 21 | * [Aggregation Results](#example2) 22 | 23 | ## How it Works 24 | 25 | The core functionality of this package is the `es_search()` function. 26 | This returns a `data.table` containing the parsed result of any given query. Note that this includes `aggs` queries. 27 | 28 | ## Installation 29 | 30 | ### R 31 | 32 | ![Lifecycle Maturing](https://img.shields.io/badge/lifecycle-maturing-blue.svg) 33 | 34 | Releases of this package can be installed from CRAN: 35 | 36 | ```r 37 | install.packages( 38 | 'uptasticsearch' 39 | , repos = "http://cran.rstudio.com" 40 | ) 41 | ``` 42 | 43 | or from `conda-forge` 44 | 45 | ```shell 46 | conda install -c conda-forge r-uptasticsearch 47 | ``` 48 | 49 | To use the development version of the package, which has the newest changes, you can install directly from GitHub 50 | 51 | ```r 52 | remotes::install_github( 53 | "uptake/uptasticsearch" 54 | , subdir = "r-pkg" 55 | ) 56 | ``` 57 | 58 | ## Usage Examples 59 | 60 | The examples presented here pertain to a fictional Elasticsearch index holding some information on a movie theater business. 61 | 62 | ### Example 1: Get a Batch of Documents 63 | 64 | The most common use case for this package will be the case where you have an Elasticsearch query and want to get a data frame representation of many resulting documents. 65 | 66 | In the example below, we use `uptasticsearch` to look for all survey results in which customers said their satisfaction was "low" or "very low" and mentioned food in their comments. 
67 | 68 | ```r 69 | library(uptasticsearch) 70 | 71 | # Build your query in an R string 72 | qbody <- '{ 73 | "query": { 74 | "filtered": { 75 | "filter": { 76 | "bool": { 77 | "must": [ 78 | { 79 | "exists": { 80 | "field": "customer_comments" 81 | } 82 | }, 83 | { 84 | "terms": { 85 | "overall_satisfaction": ["very low", "low"] 86 | } 87 | } 88 | ] 89 | } 90 | } 91 | }, 92 | "query": { 93 | "match_phrase": { 94 | "customer_comments": "food" 95 | } 96 | } 97 | } 98 | }' 99 | 100 | # Execute the query, parse into a data.table 101 | commentDT <- es_search( 102 | es_host = 'http://mydb.mycompany.com:9200' 103 | , es_index = "survey_results" 104 | , query_body = qbody 105 | , scroll = "1m" 106 | , n_cores = 4 107 | ) 108 | ``` 109 | 110 | ### Example 2: Aggregation Results 111 | 112 | Elasticsearch ships with a rich set of aggregations for creating summarized views of your data. 113 | `uptasticsearch` has built-in support for these aggregations. 114 | 115 | In the example below, we use `uptasticsearch` to create daily timeseries of summary statistics like total revenue and average payment amount. 116 | 117 | ```r 118 | library(uptasticsearch) 119 | 120 | # Build your query in an R string 121 | qbody <- '{ 122 | "query": { 123 | "filtered": { 124 | "filter": { 125 | "bool": { 126 | "must": [ 127 | { 128 | "exists": { 129 | "field": "pmt_amount" 130 | } 131 | } 132 | ] 133 | } 134 | } 135 | } 136 | }, 137 | "aggs": { 138 | "timestamp": { 139 | "date_histogram": { 140 | "field": "timestamp", 141 | "interval": "day" 142 | }, 143 | "aggs": { 144 | "revenue": { 145 | "extended_stats": { 146 | "field": "pmt_amount" 147 | } 148 | } 149 | } 150 | } 151 | }, 152 | "size": 0 153 | }' 154 | 155 | # Execute the query, parse result into a data.table 156 | revenueDT <- es_search( 157 | es_host = 'http://mydb.mycompany.com:9200' 158 | , es_index = "transactions" 159 | , size = 1000 160 | , query_body = qbody 161 | , n_cores = 1 162 | ) 163 | ``` 164 | 165 | In the example above, we used the [date_histogram](https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-datehistogram-aggregation.html) and [extended_stats](https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-metrics-extendedstats-aggregation.html) aggregations. 166 | `es_search()` has built-in support for many other aggregations and combinations of aggregations, with more on the way. 167 | Please see the table below for the current status of the package. 168 | Note that names of the form "agg1 - agg2" refer to the ability to handle aggregations nested inside other aggregations. 169 | 170 | |Agg type | support? 
| 171 | |:------------------------------------------|:--------:| 172 | |["cardinality"][1] |YES | 173 | |["date_histogram"][2] |YES | 174 | |date_histogram - cardinality |YES | 175 | |date_histogram - extended_stats |YES | 176 | |date_histogram - histogram |YES | 177 | |date_histogram - percentiles |YES | 178 | |date_histogram - significant_terms |YES | 179 | |date_histogram - stats |YES | 180 | |date_histogram - terms |YES | 181 | |["extended_stats"][3] |YES | 182 | |["histogram"][4] |YES | 183 | |["percentiles"][5] |YES | 184 | |["significant terms"][6] |YES | 185 | |["stats"][7] |YES | 186 | |["terms"][8] |YES | 187 | |terms - cardinality |YES | 188 | |terms - date_histogram |YES | 189 | |terms - date_histogram - cardinality |YES | 190 | |terms - date_histogram - extended_stats |YES | 191 | |terms - date_histogram - histogram |YES | 192 | |terms - date_histogram - percentiles |YES | 193 | |terms - date_histogram - significant_terms |YES | 194 | |terms - date_histogram - stats |YES | 195 | |terms - date_histogram - terms |YES | 196 | |terms - extended_stats |YES | 197 | |terms - histogram |YES | 198 | |terms - percentiles |YES | 199 | |terms - significant_terms |YES | 200 | |terms - stats |YES | 201 | |terms - terms |YES | 202 | 203 | [1]: https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-metrics-cardinality-aggregation.html 204 | [2]: https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-datehistogram-aggregation.html 205 | [3]: https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-metrics-extendedstats-aggregation.html 206 | [4]: https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-histogram-aggregation.html 207 | [5]: https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-metrics-percentile-aggregation.html 208 | [6]: https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-significantterms-aggregation.html 209 | [7]: https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-metrics-stats-aggregation.html 210 | [8]: https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-terms-aggregation.html 211 | -------------------------------------------------------------------------------- /cleanup_local.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e -u -o pipefail 4 | 5 | # Remove testing directory 6 | echo "removing testing directory" 7 | rm -r ./sandbox 8 | 9 | # Kill the running container 10 | echo "killing running container" 11 | docker kill "$(docker ps -ql)" 12 | 13 | echo "done cleaning up test environment" 14 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | # CRAN Submission History 2 | 3 | ## v0.0.2 - Submission 1 - (July 17, 2017) 4 | 5 | ### Test environments 6 | * Alpine 3.5 (on Jenkins CI), R 3.4.0 7 | * local CentOS 7.3, R 3.4.0 8 | * local OS X, R 3.3.2 9 | * local Windows 10, R 3.3.2 10 | * Windows via `devtools::build_win()` 11 | 12 | ### `R CMD check` results 13 | * There were no ERRORs, WARNINGs. 14 | * One NOTE from `checking CRAN incoming feasibility ...` can be safely ignored since it's a note that notifies CRAN that this is a new maintainer/submission. 
15 | 16 | ### CRAN Response 17 | * Automatic checking upon CRAN submission yielded two notes. One was the "incoming feasibility..." item we mentioned above, which is not an issue. 18 | * The other note said that `Author field differs from that derived from Authors@R`. This did not arise when running `R CMD check --as-cran` locally, but it looks like "fnd" is not a supported tag for an author. Removed that tag. 19 | 20 | ## v0.0.2 - Submission 2 - (July 17, 2017) 21 | 22 | ### CRAN Response 23 | * Need to use the [CRAN preferred method](https://cran.r-project.org/web/licenses/BSD_3_clause) of declaring the BSD 3-Clause license 24 | * Need to quote software names 25 | 26 | ## v0.0.2 - Submission 3 - (July 18, 2017) 27 | 28 | ### CRAN Response 29 | * No lingering issues. v0.0.2 released to CRAN! 30 | 31 | ## v0.1.0 - Submission 1 - (August 28, 2017) 32 | 33 | ### `R CMD check` results 34 | * No issues 35 | 36 | ### CRAN Response 37 | * Need to use CRAN canonical form (http://cran.r-project.org/package=uptasticsearch) 38 | 39 | ## v0.1.0 - Submission 2 - (August 28, 2017) 40 | 41 | ### `R CMD check` results 42 | * No issues 43 | 44 | ### CRAN Response 45 | * CRAN canonical form uses HTTPS (https://cran.r-project.org/package=uptasticsearch) 46 | 47 | ## v0.1.0 - Submission 3 - (August 29, 2017) 48 | 49 | ### `R CMD check` results 50 | * No issues 51 | 52 | ### CRAN Response 53 | * CRAN URLs are still missing HTTPS (submitter error) 54 | 55 | ## v0.1.0 - Submission 4 - (August 29, 2017) 56 | 57 | ### `R CMD check` results 58 | * No issues 59 | 60 | ### CRAN Response 61 | * Still missing HTTPS in CRAN URLs (we'd been editing the README at the repo root, not the one built with the package) 62 | * Reviewers asked if examples in "\dontrun" could be run instead 63 | 64 | ## v0.1.0 - Submission 5 - (August 29, 2017) 65 | 66 | ### `R CMD check` results 67 | * No issues 68 | 69 | ### CRAN Response 70 | * No lingering issues. v0.1.0 released to CRAN! 71 | 72 | ## v0.2.0 - Submission 1 - (April 12, 2018) 73 | 74 | ### `R CMD check` results 75 | * No issues 76 | 77 | ### CRAN Response 78 | * No issues. v0.2.0 released to CRAN! 79 | 80 | ## v0.3.0 - Submission 1 - (June 18, 2018) 81 | 82 | ### `R CMD check` results 83 | * No issues 84 | 85 | ### CRAN Response 86 | * No issues. v0.3.0 released to CRAN! 87 | 88 | ## v0.3.1 - Submission 1 - (January 28, 2019) 89 | 90 | ### `R CMD check` results 91 | * Issues on several platforms, of the form `premature EOF...`. This is a result of forgetting to put the test data in the package tarball before upload. 92 | 93 | ### CRAN Response 94 | * Upload a new version with this fixed or your package comes down in 7 days 95 | 96 | ## v0.3.1 - Submission 2 - (January 29, 2019) 97 | 98 | ### `R CMD check` results 99 | * Empty links in `NEWS.md` 100 | 101 | ### CRAN Response 102 | * Upload a new version with this fixed or your package comes down in 7 days 103 | 104 | ## v0.3.1 - Submission 3 - (January 30, 2019) 105 | 106 | ### `R CMD check` results 107 | * No issues 108 | 109 | ### CRAN Response 110 | * No issues. v0.3.1 released to CRAN! 111 | 112 | ## v0.4.0 - Submission 1 - (September 9, 2019) 113 | 114 | In this submission, we changed maintainer from `james.lamb@uptake.com` to `jaylamb20@gmail.com`. Added this note in the initial submission: 115 | 116 | > This is a release to add support for Elasticsearch 7.x, a major release stream that has been General Availability since April 2019. 
117 | 118 | > You may see that the maintainer email is changing from "james.lamb@uptake.com" to "jaylamb20@gmail.com". This is a contact info update only, not an actual maintainer change. The "uptake.com" address is tied to the company that holds copyright over this project (https://github.com/uptake/uptasticsearch/blob/master/LICENSE#L3). I no longer work there but have received their permission to continue on as the maintainer. If you need confirmation you can contact my coauthors who still work there (austin.dickey@uptake.com, nick.paras@uptake.com) or that company's legal team (dennis.lee@uptake.com) 119 | 120 | ### `R CMD check` results 121 | * No issues 122 | 123 | ### CRAN Response 124 | * Release was auto-accepted, but the response email said "We are waiting for confirmation from the old maintainer address now.". I responded and re-iterated the message above about changed maintainer email. No response yet. We are blocked until they respond. 125 | * CRAN seems ok with the maintainer change, noted that we have one bad link in `README.md`, "`./CONTRIBUTING.md"`. Needs to be changed to a fully-specified URL. 126 | 127 | ## v0.4.0 - Submission 2 - (September 11, 2019) 128 | 129 | ### `R CMD check` results 130 | * No issues 131 | 132 | ### CRAN Response 133 | * No issues. v0.4.0 released to CRAN! 134 | 135 | ## v1.0.0 - Submission 1 - (February 24, 2025) 136 | 137 | Submitted with the following comments. 138 | 139 | > This is the first release of 'uptasticsearch' since 2019. 140 | > It mainly seeks to preserve the package on CRAN by removing use of deprecated-and-soon-to-be-removed functionality in 'testthat' (https://github.com/uptake/uptasticsearch/issues/223). 141 | 142 | ### `R CMD check` results 143 | 144 | * No issues 145 | 146 | ### CRAN Response 147 | 148 | * No issues. v1.0.0 released to CRAN! 
149 | -------------------------------------------------------------------------------- /r-pkg/.Rbuildignore: -------------------------------------------------------------------------------- 1 | 2 | # Files currently checked into the repo 3 | ^docs$ 4 | ^_pkgdown\.yml$ 5 | ^cran-comments\.md$ 6 | ^tests/testthat/test-integration_tests\.R$ 7 | ^inst/testdata/shakespeare_mapping\.json$ 8 | ^CONDUCT\.md$ 9 | ^LICENSE\.MD$ 10 | ^.travis\.yml$ 11 | ^setup_local.sh$ 12 | ^cleanup_local.sh$ 13 | ^coverage.sh$ 14 | 15 | # History files 16 | ^\.Rhistory* 17 | ^\.Rapp\.history* 18 | 19 | # Session Data files 20 | ^\.RData$ 21 | 22 | # Example code in package build process 23 | .*-Ex\.R 24 | 25 | # Output files from R CMD build 26 | .*\.tar\.gz 27 | 28 | # Output files from R CMD check 29 | .*\.Rcheck/ 30 | 31 | # RStudio files 32 | .*\.Rproj 33 | \.Rproj\.user/ 34 | 35 | # produced vignettes 36 | vignettes/*\.html 37 | vignettes/*\.pdf 38 | 39 | # Temporary files created by R markdown 40 | .*\.utf8\.md 41 | .*\.knit\.md 42 | ^\.Rproj\.user$ 43 | 44 | # Data files 45 | .*\.Rda 46 | .*\.pdf 47 | .*\.csv 48 | 49 | # system files 50 | .*\.DS_Store 51 | ^.*\.Rproj$ 52 | 53 | # Temporary files generated by local testing 54 | ^lib$ 55 | ^sandbox$ 56 | ^coverage.html$ 57 | 58 | # Stuff 59 | .Rbuildignore 60 | .*\.gitkeep 61 | -------------------------------------------------------------------------------- /r-pkg/DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: uptasticsearch 2 | Type: Package 3 | Title: Get Data Frame Representations of 'Elasticsearch' Results 4 | Version: 1.0.0.9999 5 | Authors@R: c( 6 | person("James", "Lamb", email = "jaylamb20@gmail.com", role = c("aut", "cre")), 7 | person("Nick", "Paras", role = c("aut")), 8 | person("Austin", "Dickey", role = c("aut")), 9 | person("Michael", "Frasco", email = "mfrasco6@gmail.com", role = c("ctb")), 10 | person("Weiwen", "Gu", role = c("ctb")), 11 | person("Will", "Dearden", role = c("ctb")), 12 | person("Uptake Technologies Inc.", role = c("cph"))) 13 | Maintainer: James Lamb <jaylamb20@gmail.com> 14 | Description: 15 | 'Elasticsearch' is an open-source, distributed, document-based datastore 16 | (). 17 | It provides an 'HTTP' 'API' for querying the database and extracting datasets, but that 18 | 'API' was not designed for common data science workflows like pulling large batches of 19 | records and normalizing those documents into a data frame that can be used as a training 20 | dataset for statistical models. 'uptasticsearch' provides an interface for 'Elasticsearch' 21 | that is explicitly designed to make these data science workflows easy and fun. 22 | Depends: 23 | R (>= 3.3.0) 24 | Imports: 25 | curl, 26 | data.table, 27 | futile.logger, 28 | jsonlite, 29 | purrr, 30 | stats, 31 | stringr 32 | Suggests: 33 | knitr, 34 | markdown, 35 | testthat 36 | License: BSD_3_clause + file LICENSE 37 | URL: https://github.com/uptake/uptasticsearch 38 | BugReports: https://github.com/uptake/uptasticsearch/issues 39 | RoxygenNote: 7.3.2 40 | VignetteBuilder: knitr 41 | Encoding: UTF-8 42 | -------------------------------------------------------------------------------- /r-pkg/LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2017 2 | COPYRIGHT HOLDER: Uptake Technologies Inc. 3 | ORGANIZATION: Uptake Technologies Inc. 
4 | -------------------------------------------------------------------------------- /r-pkg/NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(chomp_aggs) 4 | export(chomp_hits) 5 | export(es_search) 6 | export(get_fields) 7 | export(parse_date_time) 8 | export(unpack_nested_data) 9 | importFrom(curl,curl_fetch_memory) 10 | importFrom(curl,handle_setheaders) 11 | importFrom(curl,handle_setopt) 12 | importFrom(curl,new_handle) 13 | importFrom(data.table,":=") 14 | importFrom(data.table,as.data.table) 15 | importFrom(data.table,copy) 16 | importFrom(data.table,data.table) 17 | importFrom(data.table,is.data.table) 18 | importFrom(data.table,rbindlist) 19 | importFrom(data.table,setcolorder) 20 | importFrom(data.table,setkeyv) 21 | importFrom(data.table,setnames) 22 | importFrom(data.table,uniqueN) 23 | importFrom(futile.logger,flog.debug) 24 | importFrom(futile.logger,flog.fatal) 25 | importFrom(futile.logger,flog.info) 26 | importFrom(futile.logger,flog.warn) 27 | importFrom(jsonlite,fromJSON) 28 | importFrom(parallel,clusterMap) 29 | importFrom(parallel,detectCores) 30 | importFrom(parallel,makeForkCluster) 31 | importFrom(parallel,makePSOCKcluster) 32 | importFrom(parallel,stopCluster) 33 | importFrom(purrr,map2) 34 | importFrom(purrr,map_if) 35 | importFrom(purrr,map_int) 36 | importFrom(purrr,map_lgl) 37 | importFrom(purrr,simplify) 38 | importFrom(stats,runif) 39 | importFrom(stringr,str_extract) 40 | importFrom(stringr,str_replace_all) 41 | importFrom(stringr,str_split) 42 | importFrom(stringr,str_split_fixed) 43 | -------------------------------------------------------------------------------- /r-pkg/R/assertions.R: -------------------------------------------------------------------------------- 1 | 2 | # [title] assert something and raise an exception if it isn't true 3 | # [name] .assert 4 | # [description] If the condition passed to .assert() does not evaluate to TRUE, 5 | # issues a FATAL-level log message and then raises an R exception, 6 | # both with the content of `msg`. 7 | .assert <- function(expr, msg) { 8 | res <- eval(expr, envir = parent.frame()) 9 | if (isTRUE(res)) { 10 | return(invisible(TRUE)) 11 | } 12 | .log_fatal(msg) 13 | } 14 | 15 | # [title] check if an object is a count 16 | # [name] .is_count 17 | # [description] Returns TRUE if `x` is a single positive integer 18 | # and FALSE otherwise. 19 | .is_count <- function(x) { 20 | return( 21 | length(x) == 1 && 22 | is.numeric(x) && 23 | !is.na(x) && 24 | x > 0 && 25 | trunc(x) == x 26 | ) 27 | } 28 | 29 | # [title] check if an object is a scalar logical 30 | # [name] .is_flag 31 | # [description] Returns TRUE if `x` is `TRUE` or `FALSE` 32 | # and `FALSE` otherwise. 33 | .is_flag <- function(x) { 34 | return( 35 | is.logical(x) && 36 | length(x) == 1L && 37 | !is.na(x) 38 | ) 39 | } 40 | 41 | # [title] check if an object is a string 42 | # [name] .is_string 43 | # [description] Returns TRUE if `x` is a non-empty string 44 | # and FALSE otherwise. 45 | .is_string <- function(x) { 46 | return( 47 | is.character(x) && 48 | length(x) == 1L && 49 | !is.na(x) && 50 | x != "" 51 | ) 52 | } 53 | 54 | # [title] check if an object is a writeable filepath that exists 55 | # [name] .is_writeable 56 | # [description] Returns TRUE if `x` is a filepath that already exists 57 | # and is writeable, and FALSE otherwise. 
58 | .is_writeable <- function(x) { 59 | return( 60 | .is_string(x) && 61 | file.exists(x) && 62 | file.access(x, mode = 2L)[[1L]] == 0L 63 | ) 64 | } 65 | -------------------------------------------------------------------------------- /r-pkg/R/chomp_hits.R: -------------------------------------------------------------------------------- 1 | #' @title Hits to data.tables 2 | #' @name chomp_hits 3 | #' @description A function for converting Elasticsearch docs into R data.tables. It 4 | #' uses \code{\link[jsonlite]{fromJSON}} with \code{flatten = TRUE} to convert a 5 | #' JSON into an R data.frame, and formats it into a data.table. 6 | #' @importFrom jsonlite fromJSON 7 | #' @importFrom data.table as.data.table setnames 8 | #' @export 9 | #' @param hits_json A character vector. If its length is greater than 1, its elements will be pasted 10 | #' together. This can contain a JSON returned from a \code{search} query in 11 | #' Elasticsearch, or a filepath or URL pointing at one. 12 | #' @param keep_nested_data_cols a boolean (default TRUE); whether to keep columns that are nested 13 | #' arrays in the original JSON. A warning will be given if these 14 | #' columns are deleted. 15 | #' @examples 16 | #' # A sample raw result from a hits query: 17 | #' result <- '[{"_source":{"timestamp":"2017-01-01","cust_name":"Austin","details":{ 18 | #' "cust_class":"big_spender","location":"chicago","pastPurchases":[{"film":"The Notebook", 19 | #' "pmt_amount":6.25},{"film":"The Town","pmt_amount":8.00},{"film":"Zootopia","pmt_amount":7.50, 20 | #' "matinee":true}]}}},{"_source":{"timestamp":"2017-02-02","cust_name":"James","details":{ 21 | #' "cust_class":"peasant","location":"chicago","pastPurchases":[{"film":"Minions", 22 | #' "pmt_amount":6.25,"matinee":true},{"film":"Rogue One","pmt_amount":10.25},{"film":"Bridesmaids", 23 | #' "pmt_amount":8.75},{"film":"Bridesmaids","pmt_amount":6.25,"matinee":true}]}}},{"_source":{ 24 | #' "timestamp":"2017-03-03","cust_name":"Nick","details":{"cust_class":"critic","location":"cannes", 25 | #' "pastPurchases":[{"film":"Aala Kaf Ifrit","pmt_amount":0,"matinee":true},{ 26 | #' "film":"Dopo la guerra (Apres la Guerre)","pmt_amount":0,"matinee":true},{ 27 | #' "film":"Avengers: Infinity War","pmt_amount":12.75}]}}}]' 28 | #' 29 | #' # Chomp into a data.table 30 | #' sampleChompedDT <- chomp_hits(hits_json = result, keep_nested_data_cols = TRUE) 31 | #' print(sampleChompedDT) 32 | #' 33 | #' # (Note: use es_search() to get here in one step) 34 | #' 35 | #' # Unpack by details.pastPurchases 36 | #' unpackedDT <- unpack_nested_data(chomped_df = sampleChompedDT 37 | #' , col_to_unpack = "details.pastPurchases") 38 | #' print(unpackedDT) 39 | chomp_hits <- function(hits_json = NULL, keep_nested_data_cols = TRUE) { 40 | 41 | # If nothing was passed to hits_json, return NULL and warn 42 | if (is.null(hits_json)) { 43 | msg <- "You did not pass any input data to chomp_hits. Returning NULL." 44 | .log_warn(msg) 45 | return(invisible(NULL)) 46 | } 47 | 48 | if (!is.character(hits_json)) { 49 | msg <- paste0("The first argument of chomp_hits must be a character vector." 50 | , "You may have passed an R list. In that case, if you already " 51 | , "used jsonlite::fromJSON(), you can just call " 52 | , "data.table::as.data.table().") 53 | .log_fatal(msg) 54 | } 55 | 56 | # Parse the input JSON to a list object 57 | jsonList <- jsonlite::fromJSON(hits_json, flatten = TRUE) 58 | 59 | # If this came from a raw query result, we need to grab the hits.hits element. 
60 | # Otherwise, just assume we have a list of hits 61 | if (all(c("took", "timed_out", "_shards", "hits") %in% names(jsonList))) { 62 | batchDT <- data.table::as.data.table(jsonList[["hits"]][["hits"]]) 63 | } else { 64 | batchDT <- data.table::as.data.table(jsonList) 65 | } 66 | 67 | # Strip "_source" from all the column names because blegh 68 | data.table::setnames(batchDT, gsub("_source.", "", names(batchDT), fixed = TRUE)) 69 | 70 | # Warn the user if there's nested data 71 | colTypes <- sapply(batchDT, mode) 72 | if (any(colTypes == "list")) { 73 | if (keep_nested_data_cols) { 74 | msg <- paste( 75 | "Keeping the following nested data columns." 76 | , "Consider using unpack_nested_data for one:\n" 77 | , toString(names(colTypes)[colTypes == "list"]) 78 | ) 79 | .log_info(msg) 80 | } else { 81 | 82 | msg <- paste( 83 | "Deleting the following nested data columns:\n" 84 | , toString(names(colTypes)[colTypes == "list"]) 85 | ) 86 | .log_warn(msg) 87 | batchDT <- batchDT[, !names(colTypes[colTypes == "list"]), with = FALSE] 88 | } 89 | } 90 | 91 | return(batchDT) 92 | } 93 | -------------------------------------------------------------------------------- /r-pkg/R/helperfuns.R: -------------------------------------------------------------------------------- 1 | # [title] Extract the content of an HTTP response into a different format 2 | # [name] .content 3 | # [description] Mainly here to making mocking easier in testing. 4 | # [references] https://testthat.r-lib.org/reference/local_mocked_bindings.html#namespaced-calls 5 | #' @importFrom jsonlite fromJSON 6 | .content <- function(response, as) { 7 | text_content <- rawToChar(response$content) 8 | if (as == "text") { 9 | return(text_content) 10 | } 11 | 12 | # if not plain text, assume we want to parse JSON into an R list 13 | return(jsonlite::fromJSON( 14 | txt = text_content 15 | , simplifyVector = FALSE 16 | , simplifyDataFrame = FALSE 17 | , simplifyMatrix = FALSE 18 | )) 19 | } 20 | 21 | # [title] Get a random length-n string 22 | # [name] .random_string 23 | # [description] Get a random length-n string of lowercase letters. 24 | # Note that this uses sample() and so might produce deterministic 25 | # results in programs where set.seed() is used to control randomness. 26 | .random_string <- function(num_characters) { 27 | return( 28 | paste( 29 | sample(letters, replace = TRUE, size = num_characters) 30 | , collapse = "" 31 | ) 32 | ) 33 | } 34 | 35 | # [title] List out HTTP codes that should be treated as retryable 36 | # [name] .should_retry 37 | # [description] Here because {curl} doesn't ship a retry mechanism, so this library 38 | # implements its own. 39 | .should_retry <- function(response) { 40 | retryable_error_codes <- c( 41 | # 408 - timeout 42 | 408L 43 | # 422 - unprocessable entity 44 | , 422L 45 | # 425 - too early 46 | , 425L 47 | # 429 - too many requests 48 | , 429L 49 | # 500 - internal server error 50 | , 500L 51 | # 502 - bad gateway 52 | , 502L 53 | # 503 - service unavailable 54 | , 503L 55 | # 504 - gateway timeout 56 | , 504L 57 | ) 58 | return(response$status_code %in% retryable_error_codes) 59 | } 60 | 61 | # [title] Retry an HTTP requests a couple times (if necessary) 62 | # [name] .retry 63 | # [description] Implements exponential backoff with jitter, around failed requests. 64 | # See .should_retry() for details on which status codes are considered retryable. 65 | # This is here because {curl} does not have a built-in retry API. 
66 | #' @importFrom curl curl_fetch_memory 67 | #' @importFrom stats runif 68 | .retry <- function(handle, url) { 69 | 70 | max_retries <- 3L 71 | attempt_count <- 1L 72 | while (attempt_count <= max_retries) { 73 | 74 | # if this isn't the 1st attempt, apply backoff 75 | if (attempt_count > 1L) { 76 | # exponential backoff with jitter 77 | # 78 | # 1.45s + {jitter} 79 | # 2.10s + {jitter} 80 | # 3.05s + {jitter} 81 | # etc., etc. 82 | # 83 | # ref: https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/ 84 | sleep_seconds <- 1.45 ** (attempt_count - 1L) + stats::runif(n = 1L, min = 0.1, max = 0.5) 85 | .log_debug(sprintf("Sleeping for %.2f seconds before retrying.", sleep_seconds)) 86 | Sys.sleep(sleep_seconds) 87 | } 88 | 89 | # execute request 90 | response <- curl::curl_fetch_memory( 91 | url = url 92 | , handle = handle 93 | ) 94 | 95 | # check if the response should be retried 96 | if (.should_retry(response)) { 97 | .log_debug(sprintf( 98 | "Request failed (status code %i): '%s %s'" 99 | , response$status_code 100 | , response$method 101 | , response$url 102 | )) 103 | attempt_count <- attempt_count + 1L 104 | } else { 105 | break 106 | } 107 | } 108 | return(response) 109 | } 110 | 111 | # [title] Execute an HTTP request and return the result 112 | # [name] .request 113 | # [description] Mainly here to making mocking easier in testing, but this 114 | # also centralizes the mechanism for HTTP request execution in one place. 115 | # [references] https://testthat.r-lib.org/reference/local_mocked_bindings.html#namespaced-calls 116 | #' @importFrom curl handle_setheaders handle_setopt new_handle 117 | .request <- function(verb, url, body) { 118 | handle <- curl::new_handle() 119 | 120 | # set headers 121 | # 122 | # This can safely be hard-coded here because every payload this library 123 | # posts and every response body it receives is JSON data. 124 | curl::handle_setheaders( 125 | handle = handle 126 | , "Accept" = "application/json" # nolint[non_portable_path] 127 | , "Content-Type" = "application/json" # nolint[non_portable_path] 128 | ) 129 | 130 | # set HTTP method 131 | curl::handle_setopt(handle = handle, customrequest = verb) 132 | 133 | # add body 134 | if (!is.null(body)) { 135 | curl::handle_setopt( 136 | handle = handle 137 | , copypostfields = body 138 | ) 139 | } 140 | 141 | # actually execute request 142 | response <- .retry( 143 | handle = handle 144 | , url = url 145 | ) 146 | 147 | return(invisible(response)) 148 | } 149 | 150 | # [title] Raise an exception if an HTTP response indicates an error 151 | # [name] .stop_for_status 152 | # [description] 3xx, 4xx, and 5xx responses are treated as errors. 153 | # curl should automatically follow redirects (which is what most 154 | # 3xx responses are), so if that's working well then this code should 155 | # never actually see a 3xx response. 156 | .stop_for_status <- function(response) { 157 | if (response$status_code <= 300L) { 158 | return(invisible(NULL)) 159 | } 160 | .log_fatal(sprintf( 161 | "Request failed (status code %i): '%s %s'" 162 | , response$status_code 163 | , response$method 164 | , response$url 165 | )) 166 | } 167 | -------------------------------------------------------------------------------- /r-pkg/R/logging.R: -------------------------------------------------------------------------------- 1 | #' @importFrom futile.logger flog.debug 2 | .log_debug <- function(...) { 3 | futile.logger::flog.debug(...) 4 | } 5 | 6 | #' @importFrom futile.logger flog.info 7 | .log_info <- function(...) 
{ 8 | futile.logger::flog.info(...) 9 | } 10 | 11 | #' @importFrom futile.logger flog.warn 12 | .log_warn <- function(...) { 13 | futile.logger::flog.warn(...) 14 | warning(...) 15 | } 16 | 17 | #' @importFrom futile.logger flog.fatal 18 | .log_fatal <- function(...) { 19 | futile.logger::flog.fatal(...) 20 | stop(...) 21 | } 22 | -------------------------------------------------------------------------------- /r-pkg/R/parse_date_time.R: -------------------------------------------------------------------------------- 1 | #' @title Parse date-times from Elasticsearch records 2 | #' @name parse_date_time 3 | #' @description Given a data.table with date-time strings, 4 | #' this function converts those dates-times to type POSIXct with the appropriate 5 | #' time zone. Assumption is that dates are of the form "2016-07-25T22:15:19Z" 6 | #' where T is just a separator and the last letter is a military timezone. 7 | #' 8 | #' This is a side-effect-free function: it returns a new data.table and the 9 | #' input data.table is unmodified. 10 | #' @importFrom data.table copy is.data.table 11 | #' @importFrom purrr map2 simplify 12 | #' @importFrom stringr str_extract 13 | #' @export 14 | #' @param input_df a data.table with one or more date-time columns you want to convert 15 | #' @param date_cols Character vector of column names to convert. Columns should have 16 | #' string dates of the form "2016-07-25T22:15:19Z". 17 | #' @param assume_tz Timezone to convert to if parsing fails. Default is UTC 18 | #' @references \url{https://www.timeanddate.com/time/zones/military} 19 | #' @references \url{https://en.wikipedia.org/wiki/List_of_tz_database_time_zones} 20 | #' @examples 21 | #' # Sample es_search(), chomp_hits(), or chomp_aggs() output: 22 | #' someDT <- data.table::data.table(id = 1:5 23 | #' , company = c("Apple", "Apple", "Banana", "Banana", "Cucumber") 24 | #' , timestamp = c("2015-03-14T09:26:53B", "2015-03-14T09:26:54B" 25 | #' , "2031-06-28T08:53:07Z", "2031-06-28T08:53:08Z" 26 | #' , "2000-01-01")) 27 | #' 28 | #' # Note that the date field is character right now 29 | #' str(someDT) 30 | #' 31 | #' # Let's fix that! 32 | #' someDT <- parse_date_time(input_df = someDT 33 | #' , date_cols = "timestamp" 34 | #' , assume_tz = "UTC") 35 | #' str(someDT) 36 | parse_date_time <- function(input_df 37 | , date_cols 38 | , assume_tz = "UTC" 39 | ) { 40 | 41 | # Break if input_df isn't actually a data.table 42 | if (!data.table::is.data.table(input_df)) { 43 | msg <- paste("parse_date_time expects to receive a data.table object." 44 | , "You provided an object of class" 45 | , toString(class(input_df)) 46 | , "to input_df.") 47 | .log_fatal(msg) 48 | } 49 | 50 | # Break if date_cols is not a character vector 51 | if (!identical(class(date_cols), "character")) { 52 | msg <- paste("The date_cols argument in parse_date_time expects", 53 | "a character vector of column names. 
You gave an object", 54 | "of class", toString(class(date_cols))) 55 | .log_fatal(msg) 56 | } 57 | 58 | # Break if any of the date_cols are not actually in this DT 59 | if (!all(date_cols %in% names(input_df))) { 60 | not_there <- date_cols[!(date_cols %in% names(input_df))] 61 | msg <- paste("The following columns, which you passed to date_cols,", 62 | "do not actually exist in input_df:", 63 | toString(not_there)) 64 | .log_fatal(msg) 65 | } 66 | 67 | # Other input checks we don't have explicit error messages for 68 | .assert(.is_string(assume_tz), "Argument 'assume_tz' must be a non-empty string") 69 | 70 | # Work on a copy of the DT to avoid side effects 71 | outDT <- data.table::copy(input_df) 72 | 73 | # Map one-letter TZs to valid timezones to be passed to lubridate functions 74 | # Military (one-letter) times: 75 | # Mapping UTC to etc --> https://en.wikipedia.org/wiki/List_of_tz_database_time_zones 76 | tzHash <- vector("character") 77 | # nolint start 78 | tzHash["A"] <- "Etc/GMT-1" # UTC +1 79 | tzHash["B"] <- "Etc/GMT-2" # UTC +2 80 | tzHash["C"] <- "Etc/GMT-3" # UTC +3 81 | tzHash["D"] <- "Etc/GMT-4" # UTC +4 82 | tzHash["E"] <- "Etc/GMT-5" # UTC +5 83 | tzHash["F"] <- "Etc/GMT-6" # UTC +6 84 | tzHash["G"] <- "Etc/GMT-7" # UTC +7 85 | tzHash["H"] <- "Etc/GMT-8" # UTC +8 86 | tzHash["I"] <- "Etc/GMT-9" # UTC +9 87 | tzHash["K"] <- "Etc/GMT-10" # UTC +10 88 | tzHash["L"] <- "Etc/GMT-11" # UTC +11 89 | tzHash["M"] <- "Etc/GMT-12" # UTC +12 90 | tzHash["N"] <- "Etc/GMT+1" # UTC -1 91 | tzHash["O"] <- "Etc/GMT+2" # UTC -2 92 | tzHash["P"] <- "Etc/GMT+3" # UTC -3 93 | tzHash["Q"] <- "Etc/GMT+4" # UTC -4 94 | tzHash["R"] <- "Etc/GMT+5" # UTC -5 95 | tzHash["S"] <- "Etc/GMT+6" # UTC -6 96 | tzHash["T"] <- "Etc/GMT+7" # UTC -7 97 | tzHash["U"] <- "Etc/GMT+8" # UTC -8 98 | tzHash["V"] <- "Etc/GMT+9" # UTC -9 99 | tzHash["W"] <- "Etc/GMT+10" # UTC -10 100 | tzHash["X"] <- "Etc/GMT+11" # UTC -11 101 | tzHash["Y"] <- "Etc/GMT+12" # UTC -12 102 | tzHash["Z"] <- "UTC" # UTC 103 | # nolint end 104 | 105 | # Parse dates, return POSIXct UTC dates 106 | for (dateCol in date_cols) { 107 | 108 | # Grab this vector to work on 109 | dateVec <- outDT[[dateCol]] 110 | 111 | # Parse out timestamps and military timezone strings 112 | dateTimes <- paste0( 113 | stringr::str_extract(dateVec, "^\\d{4}-\\d{2}-\\d{2}") # nolint[non_portable_path] 114 | , " " 115 | , stringr::str_extract(dateVec, "\\d{2}:\\d{2}:\\d{2}") 116 | ) 117 | tzKeys <- stringr::str_extract(dateVec, "[A-Za-z]{1}$") 118 | 119 | # Grab a vector of timezones 120 | timeZones <- tzHash[tzKeys] 121 | timeZones[is.na(timeZones)] <- assume_tz 122 | 123 | # Combine the timestamp and timezone vector to convert to POSIXct 124 | dateTimes <- purrr::map2( 125 | dateTimes 126 | , timeZones 127 | , function(dateTime, timeZone) { 128 | return(as.POSIXct(dateTime, tz = timeZone)) 129 | } 130 | ) 131 | 132 | utcDates <- as.POSIXct.numeric( 133 | purrr::simplify(dateTimes) 134 | , origin = "1970-01-01" 135 | , tz = "UTC" 136 | ) 137 | 138 | # Put back in the data.table 139 | outDT[, (dateCol) := utcDates] 140 | } 141 | 142 | return(outDT) 143 | } 144 | -------------------------------------------------------------------------------- /r-pkg/R/unpack_nested_data.R: -------------------------------------------------------------------------------- 1 | #' @title Unpack a nested data.table 2 | #' @name unpack_nested_data 3 | #' @description After calling a \code{chomp_*} function or \code{es_search}, if 4 | #' you had a nested array in the JSON, its 
corresponding column in the 5 | #' resulting data.table is a data.frame itself (or a list of vectors). This 6 | #' function expands that nested column out, adding its data to the original 7 | #' data.table, and duplicating metadata down the rows as necessary. 8 | #' 9 | #' This is a side-effect-free function: it returns a new data.table and the 10 | #' input data.table is unmodified. 11 | #' @importFrom data.table as.data.table copy is.data.table rbindlist setnames 12 | #' @importFrom purrr map_if map_lgl map_int 13 | #' @export 14 | #' @param chomped_df a data.table 15 | #' @param col_to_unpack a character vector of length one: the column name to unpack 16 | #' @examples 17 | #' # A sample raw result from a hits query: 18 | #' result <- '[{"_source":{"timestamp":"2017-01-01","cust_name":"Austin","details":{ 19 | #' "cust_class":"big_spender","location":"chicago","pastPurchases":[{"film":"The Notebook", 20 | #' "pmt_amount":6.25},{"film":"The Town","pmt_amount":8.00},{"film":"Zootopia","pmt_amount":7.50, 21 | #' "matinee":true}]}}},{"_source":{"timestamp":"2017-02-02","cust_name":"James","details":{ 22 | #' "cust_class":"peasant","location":"chicago","pastPurchases":[{"film":"Minions", 23 | #' "pmt_amount":6.25,"matinee":true},{"film":"Rogue One","pmt_amount":10.25},{"film":"Bridesmaids", 24 | #' "pmt_amount":8.75},{"film":"Bridesmaids","pmt_amount":6.25,"matinee":true}]}}},{"_source":{ 25 | #' "timestamp":"2017-03-03","cust_name":"Nick","details":{"cust_class":"critic","location":"cannes", 26 | #' "pastPurchases":[{"film":"Aala Kaf Ifrit","pmt_amount":0,"matinee":true},{ 27 | #' "film":"Dopo la guerra (Apres la Guerre)","pmt_amount":0,"matinee":true},{ 28 | #' "film":"Avengers: Infinity War","pmt_amount":12.75}]}}}]' 29 | #' 30 | #' # Chomp into a data.table 31 | #' sampleChompedDT <- chomp_hits(hits_json = result, keep_nested_data_cols = TRUE) 32 | #' print(sampleChompedDT) 33 | #' 34 | #' # (Note: use es_search() to get here in one step) 35 | #' 36 | #' # Unpack by details.pastPurchases 37 | #' unpackedDT <- unpack_nested_data(chomped_df = sampleChompedDT 38 | #' , col_to_unpack = "details.pastPurchases") 39 | #' print(unpackedDT) 40 | unpack_nested_data <- function(chomped_df, col_to_unpack) { 41 | 42 | # Input checks 43 | if (!data.table::is.data.table(chomped_df)) { 44 | msg <- "For unpack_nested_data, chomped_df must be a data.table" 45 | .log_fatal(msg) 46 | } 47 | if (!.is_string(col_to_unpack)) { 48 | msg <- "For unpack_nested_data, col_to_unpack must be a character of length 1" 49 | .log_fatal(msg) 50 | } 51 | if (!(col_to_unpack %in% names(chomped_df))) { 52 | msg <- "For unpack_nested_data, col_to_unpack must be one of the column names" 53 | .log_fatal(msg) 54 | } 55 | 56 | inDT <- data.table::copy(chomped_df) 57 | 58 | # Define a column name to store original row ID 59 | repeat { 60 | joinCol <- .random_string(36L) 61 | if (!(joinCol %in% names(inDT))) { 62 | break 63 | } 64 | } 65 | inDT[, (joinCol) := .I] 66 | 67 | # Take out the packed column 68 | listDT <- inDT[[col_to_unpack]] 69 | inDT[, (col_to_unpack) := NULL] 70 | 71 | # Check for empty column 72 | if (all(purrr::map_int(listDT, NROW) == 0)) { 73 | msg <- "The column given to unpack_nested_data had no data in it." 
74 | .log_fatal(msg) 75 | } 76 | 77 | listDT[lengths(listDT) == 0] <- NA 78 | 79 | is_df <- purrr::map_lgl(listDT, is.data.frame) 80 | is_list <- purrr::map_lgl(listDT, is.list) 81 | is_atomic <- purrr::map_lgl(listDT, is.atomic) 82 | is_na <- is.na(listDT) 83 | 84 | # Bind packed column into one data.table 85 | if (all(is_atomic)) { 86 | newDT <- data.table::as.data.table(unlist(listDT)) 87 | newDT[, (joinCol) := rep(seq_along(listDT), lengths(listDT))] 88 | } else if (all(is_df | is_list | is_na)) { 89 | # Find name to use for NA columns 90 | first_df <- min(which(is_df)) 91 | col_name <- names(listDT[[first_df]])[1] 92 | 93 | .prep_na_row <- function(x, col_name) { 94 | x <- data.table::as.data.table(x) 95 | names(x) <- col_name 96 | return(x) 97 | } 98 | 99 | # If the packed column contains data.tables, we use rbindlist 100 | newDT <- purrr::map_if(listDT, is_na, .prep_na_row, col_name = col_name) 101 | newDT <- data.table::rbindlist(newDT, fill = TRUE, idcol = joinCol) 102 | } else { 103 | msg <- paste0("Each row in column ", col_to_unpack, " must be a data frame or a vector.") 104 | .log_fatal(msg) 105 | } 106 | 107 | # Join it back in 108 | outDT <- inDT[newDT, on = joinCol] 109 | outDT[, (joinCol) := NULL] 110 | 111 | # In the case of all atomic... 112 | if ("V1" %in% names(outDT)) { 113 | data.table::setnames(outDT, "V1", col_to_unpack) 114 | } 115 | 116 | return(outDT) 117 | } 118 | -------------------------------------------------------------------------------- /r-pkg/R/uptasticsearch.R: -------------------------------------------------------------------------------- 1 | # Globals to make R CMD check not spit out "no visible binding for global 2 | # variable" notes. 3 | # Basically, R CMD check doesn't like it when you don't quote the "V1" in 4 | # a call like DT[, V1]. 5 | # See: http://stackoverflow.com/a/12429344 6 | # Also: see hadley's comments on his own post there. They're great. 7 | 8 | utils::globalVariables(c( 9 | "." 10 | , ".I" 11 | , ".id" 12 | , "alias" 13 | , "field" 14 | , "index" 15 | , "V1" 16 | , "V2" 17 | )) 18 | 19 | 20 | # NULL object for common parameter documentation 21 | #' @param es_host A string identifying an Elasticsearch host. This should be of the form 22 | #' \code{[transfer_protocol][hostname]:[port]}. For example, \code{'http://myindex.thing.com:9200'}. 23 | #' @param es_index The name of an Elasticsearch index to be queried. Note that passing 24 | #' \code{NULL} is not supported. Technically, not passing an index 25 | #' to Elasticsearch is legal and results in searching over all indexes. 26 | #' To be sure that this very expensive query is not executed by accident, 27 | #' uptasticsearch forbids this. If you want to execute a query over 28 | #' all indexes in the cluster, set this argument to \code{"_all"}. 29 | #' @name doc_shared 30 | #' @title NULL Object For Common Documentation 31 | #' @description This is a NULL object with documentation so that later functions can call 32 | #' inheritParams 33 | #' @keywords internal 34 | NULL 35 | -------------------------------------------------------------------------------- /r-pkg/_pkgdown.yml: -------------------------------------------------------------------------------- 1 | template: 2 | bootstrap: 5 3 | params: 4 | bootswatch: flatly 5 | 6 | # this needs to be specified and to match 'URL:' in DESCRIPTION, 7 | # or pkgdown raises this error: 8 | # 9 | # URLs not ok. 10 | # In _pkgdown.yml, url is missing. 
11 | # 12 | url: https://github.com/uptake/uptasticsearch 13 | 14 | repo: 15 | url: 16 | home: https://github.com/uptake/uptasticsearch/ 17 | source: https://github.com/uptake/uptasticsearch/tree/main/r-pkg/ 18 | issue: https://github.com/uptake/uptasticsearch/issues 19 | user: https://github.com/ 20 | 21 | reference: 22 | - title: Main function 23 | contents: 24 | - es_search 25 | - title: Parse raw JSON into data.table 26 | contents: 27 | - starts_with("chomp_") 28 | - title: Utilities 29 | contents: 30 | - unpack_nested_data 31 | - parse_date_time 32 | - title: Exploratory functions 33 | contents: 34 | - get_fields 35 | -------------------------------------------------------------------------------- /r-pkg/inst/testdata/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uptake/uptasticsearch/62d4739912db1e56cba7771f9903d6e551e557dc/r-pkg/inst/testdata/.gitkeep -------------------------------------------------------------------------------- /r-pkg/man/chomp_aggs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/chomp_aggs.R 3 | \name{chomp_aggs} 4 | \alias{chomp_aggs} 5 | \title{Aggs query to data.table} 6 | \usage{ 7 | chomp_aggs(aggs_json = NULL) 8 | } 9 | \arguments{ 10 | \item{aggs_json}{A character vector. If its length is greater than 1, its elements will be pasted 11 | together. This can contain a JSON returned from an \code{aggs} query in 12 | Elasticsearch, or a filepath or URL pointing at one.} 13 | } 14 | \value{ 15 | A data.table representation of the result or NULL if the aggregation result is empty. 16 | } 17 | \description{ 18 | Given some raw JSON from an aggs query in Elasticsearch, parse the 19 | aggregations into a data.table. 20 | } 21 | \examples{ 22 | # A sample raw result from an aggs query combining date_histogram and extended_stats: 23 | result <- '{"aggregations":{"dateTime":{"buckets":[{"key_as_string":"2016-12-01T00:00:00.000Z", 24 | "key":1480550400000,"doc_count":123,"num_potatoes":{"count":120,"min":0,"max":40,"avg":15, 25 | "sum":1800,"sum_of_squares":28000,"variance":225,"std_deviation":15,"std_deviation_bounds":{ 26 | "upper":26,"lower":13}}},{"key_as_string":"2017-01-01T00:00:00.000Z","key":1483228800000, 27 | "doc_count":134,"num_potatoes":{"count":131,"min":0,"max":39,"avg":16,"sum":2096, 28 | "sum_of_squares":34000,"variance":225,"std_deviation":15,"std_deviation_bounds":{"upper":26, 29 | "lower":13}}}]}}}' 30 | 31 | # Parse into a data.table 32 | aggDT <- chomp_aggs(aggs_json = result) 33 | print(aggDT) 34 | } 35 | -------------------------------------------------------------------------------- /r-pkg/man/chomp_hits.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/chomp_hits.R 3 | \name{chomp_hits} 4 | \alias{chomp_hits} 5 | \title{Hits to data.tables} 6 | \usage{ 7 | chomp_hits(hits_json = NULL, keep_nested_data_cols = TRUE) 8 | } 9 | \arguments{ 10 | \item{hits_json}{A character vector. If its length is greater than 1, its elements will be pasted 11 | together. This can contain a JSON returned from a \code{search} query in 12 | Elasticsearch, or a filepath or URL pointing at one.} 13 | 14 | \item{keep_nested_data_cols}{a boolean (default TRUE); whether to keep columns that are nested 15 | arrays in the original JSON. 
A warning will be given if these 16 | columns are deleted.} 17 | } 18 | \description{ 19 | A function for converting Elasticsearch docs into R data.tables. It 20 | uses \code{\link[jsonlite]{fromJSON}} with \code{flatten = TRUE} to convert a 21 | JSON into an R data.frame, and formats it into a data.table. 22 | } 23 | \examples{ 24 | # A sample raw result from a hits query: 25 | result <- '[{"_source":{"timestamp":"2017-01-01","cust_name":"Austin","details":{ 26 | "cust_class":"big_spender","location":"chicago","pastPurchases":[{"film":"The Notebook", 27 | "pmt_amount":6.25},{"film":"The Town","pmt_amount":8.00},{"film":"Zootopia","pmt_amount":7.50, 28 | "matinee":true}]}}},{"_source":{"timestamp":"2017-02-02","cust_name":"James","details":{ 29 | "cust_class":"peasant","location":"chicago","pastPurchases":[{"film":"Minions", 30 | "pmt_amount":6.25,"matinee":true},{"film":"Rogue One","pmt_amount":10.25},{"film":"Bridesmaids", 31 | "pmt_amount":8.75},{"film":"Bridesmaids","pmt_amount":6.25,"matinee":true}]}}},{"_source":{ 32 | "timestamp":"2017-03-03","cust_name":"Nick","details":{"cust_class":"critic","location":"cannes", 33 | "pastPurchases":[{"film":"Aala Kaf Ifrit","pmt_amount":0,"matinee":true},{ 34 | "film":"Dopo la guerra (Apres la Guerre)","pmt_amount":0,"matinee":true},{ 35 | "film":"Avengers: Infinity War","pmt_amount":12.75}]}}}]' 36 | 37 | # Chomp into a data.table 38 | sampleChompedDT <- chomp_hits(hits_json = result, keep_nested_data_cols = TRUE) 39 | print(sampleChompedDT) 40 | 41 | # (Note: use es_search() to get here in one step) 42 | 43 | # Unpack by details.pastPurchases 44 | unpackedDT <- unpack_nested_data(chomped_df = sampleChompedDT 45 | , col_to_unpack = "details.pastPurchases") 46 | print(unpackedDT) 47 | } 48 | -------------------------------------------------------------------------------- /r-pkg/man/doc_shared.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/uptasticsearch.R 3 | \name{doc_shared} 4 | \alias{doc_shared} 5 | \title{NULL Object For Common Documentation} 6 | \arguments{ 7 | \item{es_host}{A string identifying an Elasticsearch host. This should be of the form 8 | \code{[transfer_protocol][hostname]:[port]}. For example, \code{'http://myindex.thing.com:9200'}.} 9 | 10 | \item{es_index}{The name of an Elasticsearch index to be queried. Note that passing 11 | \code{NULL} is not supported. Technically, not passing an index 12 | to Elasticsearch is legal and results in searching over all indexes. 13 | To be sure that this very expensive query is not executed by accident, 14 | uptasticsearch forbids this. 
If you want to execute a query over 15 | all indexes in the cluster, set this argument to \code{"_all"}.} 16 | } 17 | \description{ 18 | This is a NULL object with documentation so that later functions can call 19 | inheritParams 20 | } 21 | \keyword{internal} 22 | -------------------------------------------------------------------------------- /r-pkg/man/es_search.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/es_search.R 3 | \name{es_search} 4 | \alias{es_search} 5 | \title{Execute an Elasticsearch query and get a data.table} 6 | \usage{ 7 | es_search( 8 | es_host, 9 | es_index, 10 | size = 10000, 11 | query_body = "{}", 12 | scroll = "5m", 13 | max_hits = Inf, 14 | n_cores = ceiling(parallel::detectCores()/2), 15 | break_on_duplicates = TRUE, 16 | ignore_scroll_restriction = FALSE, 17 | intermediates_dir = getwd() 18 | ) 19 | } 20 | \arguments{ 21 | \item{es_host}{A string identifying an Elasticsearch host. This should be of the form 22 | \code{[transfer_protocol][hostname]:[port]}. For example, \code{'http://myindex.thing.com:9200'}.} 23 | 24 | \item{es_index}{The name of an Elasticsearch index to be queried. Note that passing 25 | \code{NULL} is not supported. Technically, not passing an index 26 | to Elasticsearch is legal and results in searching over all indexes. 27 | To be sure that this very expensive query is not executed by accident, 28 | uptasticsearch forbids this. If you want to execute a query over 29 | all indexes in the cluster, set this argument to \code{"_all"}.} 30 | 31 | \item{size}{Number of records per page of results. 32 | See \href{https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-body.html#request-body-search-from-size}{Elasticsearch docs} for more. 33 | Note that this will be reset to 0 if you submit a \code{query_body} with 34 | an "aggs" request in it. Also see \code{max_hits}.} 35 | 36 | \item{query_body}{String with a valid Elasticsearch query. Default is an empty query.} 37 | 38 | \item{scroll}{How long should the scroll context be held open? This should be a 39 | duration string like "1m" (for one minute) or "15s" (for 15 seconds). 40 | The scroll context will be refreshed every time you ask Elasticsearch 41 | for another record, so this parameter should just be the amount of 42 | time you expect to pass between requests. See the 43 | \href{https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-body.html#request-body-search-scroll}{Elasticsearch scroll/pagination docs} 44 | for more information.} 45 | 46 | \item{max_hits}{Integer. If specified, \code{es_search} will stop pulling data as soon 47 | as it has pulled this many hits. Default is \code{Inf}, meaning that 48 | all possible hits will be pulled.} 49 | 50 | \item{n_cores}{Number of cores to distribute fetching and processing over.} 51 | 52 | \item{break_on_duplicates}{Boolean, defaults to TRUE. \code{es_search} uses the size of the 53 | final object it returns to check whether or not some data were lost 54 | during the processing. If you have duplicates in the source data, you 55 | will have to set this flag to FALSE and just trust that no data have 56 | been lost. Sorry :( .} 57 | 58 | \item{ignore_scroll_restriction}{There is a cost associated with keeping an 59 | Elasticsearch scroll context open. By default, 60 | this function does not allow arguments to \code{scroll} 61 | which exceed one hour. 
This is done to prevent 62 | costly mistakes made by novice Elasticsearch users. 63 | If you understand the cost of keeping the context 64 | open for a long time and would like to pass a \code{scroll} 65 | value longer than an hour, set \code{ignore_scroll_restriction} 66 | to \code{TRUE}.} 67 | 68 | \item{intermediates_dir}{When scrolling over search results, this function writes 69 | intermediate results to disk. By default, `es_search` will create a temporary 70 | directory in whatever working directory the function is called from. If you 71 | want to change this behavior, provide a path here. `es_search` will create 72 | and write to a temporary directory under whatever path you provide.} 73 | } 74 | \description{ 75 | Given a query and some optional parameters, \code{es_search} gets results 76 | from HTTP requests to Elasticsearch and returns a data.table 77 | representation of those results. 78 | } 79 | \examples{ 80 | \dontrun{ 81 | 82 | ###=== Example 1: Get low-scoring food survey results ===### 83 | 84 | query_body <- '{"query":{"filtered":{"filter":{"bool":{"must":[ 85 | {"exists":{"field":"customer_comments"}}, 86 | {"terms":{"overall_satisfaction":["very low","low"]}}]}}}, 87 | "query":{"match_phrase":{"customer_comments":"food"}}}}' 88 | 89 | # Execute the query, parse into a data.table 90 | commentDT <- es_search(es_host = 'http://mydb.mycompany.com:9200' 91 | , es_index = "survey_results" 92 | , query_body = query_body 93 | , scroll = "1m" 94 | , n_cores = 4) 95 | 96 | ###=== Example 2: Time series agg features ===### 97 | 98 | # Create query that will give you daily summary stats for revenue 99 | query_body <- '{"query":{"filtered":{"filter":{"bool":{"must":[ 100 | {"exists":{"field":"pmt_amount"}}]}}}}, 101 | "aggs":{"timestamp":{"date_histogram":{"field":"timestamp","interval":"day"}, 102 | "aggs":{"revenue":{"extended_stats":{"field":"pmt_amount"}}}}},"size":0}' 103 | 104 | # Execute the query and get the result 105 | resultDT <- es_search(es_host = "http://es.custdb.mycompany.com:9200" 106 | , es_index = 'ticket_sales' 107 | , query_body = query_body) 108 | } 109 | } 110 | \references{ 111 | \href{https://www.elastic.co/guide/en/elasticsearch/reference/6.7/search-request-scroll.html}{Elasticsearch 6 scrolling strategy} 112 | } 113 | -------------------------------------------------------------------------------- /r-pkg/man/get_fields.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_fields.R 3 | \name{get_fields} 4 | \alias{get_fields} 5 | \title{Get the names and data types of the indexed fields in an index} 6 | \usage{ 7 | get_fields(es_host, es_indices = "_all") 8 | } 9 | \arguments{ 10 | \item{es_host}{A string identifying an Elasticsearch host. This should be of the form 11 | \code{[transfer_protocol][hostname]:[port]}. For example, \code{'http://myindex.thing.com:9200'}.} 12 | 13 | \item{es_indices}{A character vector that contains the names of indices for 14 | which to get mappings. Default is \code{'_all'}, which means 15 | get the mapping for all indices. Names of indices can be 16 | treated as regular expressions.} 17 | } 18 | \value{ 19 | A data.table containing four columns: index, type, field, and data_type 20 | } 21 | \description{ 22 | For a given Elasticsearch index, return the mapping from field name 23 | to data type for all indexed fields. 
24 | } 25 | \examples{ 26 | \dontrun{ 27 | # get the mapping for all indexed fields in the ticket_sales and customers indices 28 | mappingDT <- get_fields(es_host = "http://es.custdb.mycompany.com:9200" 29 | , es_indices = c("ticket_sales", "customers")) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /r-pkg/man/parse_date_time.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/parse_date_time.R 3 | \name{parse_date_time} 4 | \alias{parse_date_time} 5 | \title{Parse date-times from Elasticsearch records} 6 | \usage{ 7 | parse_date_time(input_df, date_cols, assume_tz = "UTC") 8 | } 9 | \arguments{ 10 | \item{input_df}{a data.table with one or more date-time columns you want to convert} 11 | 12 | \item{date_cols}{Character vector of column names to convert. Columns should have 13 | string dates of the form "2016-07-25T22:15:19Z".} 14 | 15 | \item{assume_tz}{Timezone to convert to if parsing fails. Default is UTC} 16 | } 17 | \description{ 18 | Given a data.table with date-time strings, 19 | this function converts those dates-times to type POSIXct with the appropriate 20 | time zone. Assumption is that dates are of the form "2016-07-25T22:15:19Z" 21 | where T is just a separator and the last letter is a military timezone. 22 | 23 | This is a side-effect-free function: it returns a new data.table and the 24 | input data.table is unmodified. 25 | } 26 | \examples{ 27 | # Sample es_search(), chomp_hits(), or chomp_aggs() output: 28 | someDT <- data.table::data.table(id = 1:5 29 | , company = c("Apple", "Apple", "Banana", "Banana", "Cucumber") 30 | , timestamp = c("2015-03-14T09:26:53B", "2015-03-14T09:26:54B" 31 | , "2031-06-28T08:53:07Z", "2031-06-28T08:53:08Z" 32 | , "2000-01-01")) 33 | 34 | # Note that the date field is character right now 35 | str(someDT) 36 | 37 | # Let's fix that! 38 | someDT <- parse_date_time(input_df = someDT 39 | , date_cols = "timestamp" 40 | , assume_tz = "UTC") 41 | str(someDT) 42 | } 43 | \references{ 44 | \url{https://www.timeanddate.com/time/zones/military} 45 | 46 | \url{https://en.wikipedia.org/wiki/List_of_tz_database_time_zones} 47 | } 48 | -------------------------------------------------------------------------------- /r-pkg/man/unpack_nested_data.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/unpack_nested_data.R 3 | \name{unpack_nested_data} 4 | \alias{unpack_nested_data} 5 | \title{Unpack a nested data.table} 6 | \usage{ 7 | unpack_nested_data(chomped_df, col_to_unpack) 8 | } 9 | \arguments{ 10 | \item{chomped_df}{a data.table} 11 | 12 | \item{col_to_unpack}{a character vector of length one: the column name to unpack} 13 | } 14 | \description{ 15 | After calling a \code{chomp_*} function or \code{es_search}, if 16 | you had a nested array in the JSON, its corresponding column in the 17 | resulting data.table is a data.frame itself (or a list of vectors). This 18 | function expands that nested column out, adding its data to the original 19 | data.table, and duplicating metadata down the rows as necessary. 20 | 21 | This is a side-effect-free function: it returns a new data.table and the 22 | input data.table is unmodified. 
23 | } 24 | \examples{ 25 | # A sample raw result from a hits query: 26 | result <- '[{"_source":{"timestamp":"2017-01-01","cust_name":"Austin","details":{ 27 | "cust_class":"big_spender","location":"chicago","pastPurchases":[{"film":"The Notebook", 28 | "pmt_amount":6.25},{"film":"The Town","pmt_amount":8.00},{"film":"Zootopia","pmt_amount":7.50, 29 | "matinee":true}]}}},{"_source":{"timestamp":"2017-02-02","cust_name":"James","details":{ 30 | "cust_class":"peasant","location":"chicago","pastPurchases":[{"film":"Minions", 31 | "pmt_amount":6.25,"matinee":true},{"film":"Rogue One","pmt_amount":10.25},{"film":"Bridesmaids", 32 | "pmt_amount":8.75},{"film":"Bridesmaids","pmt_amount":6.25,"matinee":true}]}}},{"_source":{ 33 | "timestamp":"2017-03-03","cust_name":"Nick","details":{"cust_class":"critic","location":"cannes", 34 | "pastPurchases":[{"film":"Aala Kaf Ifrit","pmt_amount":0,"matinee":true},{ 35 | "film":"Dopo la guerra (Apres la Guerre)","pmt_amount":0,"matinee":true},{ 36 | "film":"Avengers: Infinity War","pmt_amount":12.75}]}}}]' 37 | 38 | # Chomp into a data.table 39 | sampleChompedDT <- chomp_hits(hits_json = result, keep_nested_data_cols = TRUE) 40 | print(sampleChompedDT) 41 | 42 | # (Note: use es_search() to get here in one step) 43 | 44 | # Unpack by details.pastPurchases 45 | unpackedDT <- unpack_nested_data(chomped_df = sampleChompedDT 46 | , col_to_unpack = "details.pastPurchases") 47 | print(unpackedDT) 48 | } 49 | -------------------------------------------------------------------------------- /r-pkg/tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(uptasticsearch) # nolint[unused_import] 3 | 4 | testthat::test_check( 5 | package = "uptasticsearch" 6 | , stop_on_failure = TRUE 7 | , stop_on_warning = FALSE 8 | , reporter = testthat::SummaryReporter$new() 9 | ) 10 | -------------------------------------------------------------------------------- /r-pkg/tests/testthat/test-assertions.R: -------------------------------------------------------------------------------- 1 | test_that(".is_count() works", { 2 | expect_true(.is_count(1L)) 3 | expect_true(.is_count(8L)) 4 | expect_false(.is_count(-2L)) 5 | expect_false(.is_count(0)) 6 | expect_false(.is_count(15.2)) 7 | expect_false(.is_count(NA)) 8 | expect_false(.is_count(NA_character_)) 9 | expect_false(.is_count(NA_integer_)) 10 | expect_false(.is_count(NA_real_)) 11 | expect_false(.is_count(c(1L, 2L))) 12 | expect_false(.is_count("a-number")) 13 | expect_false(.is_count(NULL)) 14 | expect_false(.is_count(TRUE)) 15 | }) 16 | 17 | test_that(".is_flag() works", { 18 | expect_true(.is_flag(TRUE)) 19 | expect_true(.is_flag(FALSE)) 20 | expect_false(.is_flag(-1)) 21 | expect_false(.is_flag(-1L)) 22 | expect_false(.is_flag(0)) 23 | expect_false(.is_flag(0L)) 24 | expect_false(.is_flag(1)) 25 | expect_false(.is_flag(1L)) 26 | expect_false(.is_flag(15.2)) 27 | expect_false(.is_flag(NA)) 28 | expect_false(.is_flag(NA_character_)) 29 | expect_false(.is_flag(NA_integer_)) 30 | expect_false(.is_flag(NA_real_)) 31 | expect_false(.is_flag(c(1L, 2L))) 32 | expect_false(.is_flag("a-number")) 33 | expect_false(.is_flag(NULL)) 34 | }) 35 | 36 | test_that(".is_string() works", { 37 | expect_true(.is_string("abc")) 38 | expect_true(.is_string(" ")) 39 | expect_false(.is_string("")) 40 | expect_false(.is_string(-2L)) 41 | expect_false(.is_string(0)) 42 | expect_false(.is_string(15.2)) 43 | expect_false(.is_string(NA)) 44 | expect_false(.is_string(NA_character_)) 45 
| expect_false(.is_string(NA_integer_)) 46 | expect_false(.is_string(NA_real_)) 47 | expect_false(.is_string(c(1L, 2L))) 48 | expect_false(.is_string(NULL)) 49 | expect_false(.is_string(TRUE)) 50 | }) 51 | 52 | test_that(".is_writeable() works", { 53 | expect_true(.is_writeable(getwd())) 54 | expect_false(.is_writeable(file.path(tempdir(), "some-nonsense"))) 55 | expect_false(.is_writeable("")) 56 | expect_false(.is_writeable(-2L)) 57 | expect_false(.is_writeable(0)) 58 | expect_false(.is_writeable(15.2)) 59 | expect_false(.is_writeable(NA)) 60 | expect_false(.is_writeable(NA_character_)) 61 | expect_false(.is_writeable(NA_integer_)) 62 | expect_false(.is_writeable(NA_real_)) 63 | expect_false(.is_writeable(c(1L, 2L))) 64 | expect_false(.is_writeable("a-number")) 65 | expect_false(.is_writeable(NULL)) 66 | expect_false(.is_writeable(TRUE)) 67 | }) 68 | -------------------------------------------------------------------------------- /r-pkg/tests/testthat/test-chomp_hits.R: -------------------------------------------------------------------------------- 1 | 2 | # Configure logger (suppress all logs in testing) 3 | loggerOptions <- futile.logger::logger.options() 4 | if (!identical(loggerOptions, list())) { 5 | origLogThreshold <- loggerOptions[[1]][["threshold"]] 6 | } else { 7 | origLogThreshold <- futile.logger::INFO 8 | } 9 | futile.logger::flog.threshold(0) 10 | 11 | # This is effectively a test of running elastic::Search(raw = TRUE) and passing it through chomp_hits() 12 | test_that("chomp_hits should work from a one-element character vector", { 13 | # nolint start 14 | jsonString <- '{"took": 54, "timed_out": false, "_shards": {"total": 16,"successful": 16, "failed": 0}, 15 | "hits": { 16 | "total": 46872, 17 | "max_score": 0.882234, 18 | "hits": [ 19 | {"_index": "redsawx", "_type": "ballplayer", "_id": "abc123", "_score": 0.882234, 20 | "_source": {"name": "David Ortiz", "stats" : {"yrs_played": 20, "final_season": {"avg": 0.315, "HR": 38, "R": 79}, 21 | "full_career": {"avg": 0.286, "HR": 541, "R": 1419}}}}, 22 | {"_index": "redsawx", "_type": "ballplayer", "_id": "def567", "_score": 0.882234, 23 | "_source": {"name": "Kevin Youkilis", "stats" : {"yrs_played": 10, "final_season": {"avg": 0.219, "HR": 2, "R": 12}, 24 | "full_career": {"avg": 0.281, "HR": 150, "R": 653}}}}, 25 | {"_index": "redsawx", "_type": "ballplayer", "_id": "abc567", "_score": 0.882234, 26 | "_source": {"name": "Trot Nixon", "stats" : {"yrs_played": 12, "final_season": {"avg": 0.171, "HR": 1, "R": 2}, 27 | "full_career": {"avg": 0.274, "HR": 137, "R": 579}}}}, 28 | {"_index": "redsawx", "_type": "ballplayer", "_id": "def123", "_score": 0.882234, 29 | "_source": {"name": "Manny Ramirez", "stats" : {"yrs_played": 19, "final_season": {"avg": 0.059, "HR": 0, "R": 0}, 30 | "full_career": {"avg": 0.312, "HR": 555, "R": 1544}}}}, 31 | {"_index": "redsawx", "_type": "ballplayer", "_id": "ghi890", "_score": 0.882234, 32 | "_source": {"name": "Jason Varitek", "stats" : {"yrs_played": 15, "final_season": {"avg": 0.221, "HR": 11, "R": 32}, 33 | "full_career": {"avg": 0.256, "HR": "193", "R": 664}}}} 34 | ]}}' 35 | # nolint end 36 | chompDT <- chomp_hits(hits_json = jsonString) 37 | expect_true(data.table::is.data.table(chompDT)) 38 | expect_equivalent(dim(chompDT), c(5, 12)) 39 | expect_true(all(c("_id", "_index", "_score", "name", "stats.final_season.avg", 40 | "stats.final_season.HR", "stats.final_season.R", "stats.full_career.avg", 41 | "stats.full_career.HR", "stats.full_career.R", "stats.yrs_played", "_type") %in% 42 | 
names(chompDT))) 43 | expect_identical(chompDT$stats.full_career.R, as.integer(c(1419, 653, 579, 1544, 664))) 44 | expect_identical(chompDT$stats.full_career.HR, as.character(c(541, 150, 137, 555, 193))) 45 | }) 46 | 47 | # What if we're passing the hits array, not the entire result? 48 | test_that("chomp_hits should work with just the hits array", { 49 | # nolint start 50 | jsonString <- '[ 51 | {"_index": "redsawx", "_type": "ballplayer", "_id": "abc123", "_score": 0.882234, 52 | "_source": {"name": "David Ortiz", "stats" : {"yrs_played": 20, "final_season": {"avg": 0.315, "HR": 38, "R": 79}, 53 | "full_career": {"avg": 0.286, "HR": 541, "R": 1419}}}}, 54 | {"_index": "redsawx", "_type": "ballplayer", "_id": "def567", "_score": 0.882234, 55 | "_source": {"name": "Kevin Youkilis", "stats" : {"yrs_played": 10, "final_season": {"avg": 0.219, "HR": 2, "R": 12}, 56 | "full_career": {"avg": 0.281, "HR": 150, "R": 653}}}}, 57 | {"_index": "redsawx", "_type": "ballplayer", "_id": "abc567", "_score": 0.882234, 58 | "_source": {"name": "Trot Nixon", "stats" : {"yrs_played": 12, "final_season": {"avg": 0.171, "HR": 1, "R": 2}, 59 | "full_career": {"avg": 0.274, "HR": 137, "R": 579}}}}, 60 | {"_index": "redsawx", "_type": "ballplayer", "_id": "def123", "_score": 0.882234, 61 | "_source": {"name": "Manny Ramirez", "stats" : {"yrs_played": 19, "final_season": {"avg": 0.059, "HR": 0, "R": 0}, 62 | "full_career": {"avg": 0.312, "HR": 555, "R": 1544}}}}, 63 | {"_index": "redsawx", "_type": "ballplayer", "_id": "ghi890", "_score": 0.882234, 64 | "_source": {"name": "Jason Varitek", "stats" : {"yrs_played": 15, "final_season": {"avg": 0.221, "HR": 11, "R": 32}, 65 | "full_career": {"avg": 0.256, "HR": "193", "R": 664}}}} 66 | ]' 67 | # nolint end 68 | chompDT <- chomp_hits(hits_json = jsonString) 69 | expect_true(data.table::is.data.table(chompDT)) 70 | expect_equivalent(dim(chompDT), c(5, 12)) 71 | expect_true(all(c("_id", "_index", "_score", "name", "stats.final_season.avg", 72 | "stats.final_season.HR", "stats.final_season.R", "stats.full_career.avg", 73 | "stats.full_career.HR", "stats.full_career.R", "stats.yrs_played", "_type") %in% 74 | names(chompDT))) 75 | expect_identical(chompDT$stats.full_career.R, as.integer(c(1419, 653, 579, 1544, 664))) 76 | expect_identical(chompDT$stats.full_career.HR, as.character(c(541, 150, 137, 555, 193))) 77 | }) 78 | 79 | # This tests the type of data representation you'd get from reading in a JSON file with readLines 80 | test_that("chomp_hits should work from a multi-element character vector", { 81 | test_json <- system.file("testdata", "es_hits.json", package = "uptasticsearch") 82 | jsonVec <- suppressWarnings(readLines(test_json)) 83 | chompDT <- chomp_hits(hits_json = jsonVec) 84 | expect_true(data.table::is.data.table(chompDT)) 85 | expect_equivalent(dim(chompDT), c(5, 12)) 86 | expect_true(all(c("_id", "_index", "_score", "name", "stats.final_season.avg", 87 | "stats.final_season.HR", "stats.final_season.R", "stats.full_career.avg", 88 | "stats.full_career.HR", "stats.full_career.R", "stats.yrs_played", "_type") %in% 89 | names(chompDT))) 90 | expect_identical(chompDT$stats.full_career.R, as.integer(c(1419, 653, 579, 1544, 664))) 91 | expect_identical(chompDT$stats.full_career.HR, as.character(c(541, 150, 137, 555, 193))) 92 | }) 93 | 94 | # In case you need to have a non-R, non-Python run queries for you and store them in a file 95 | test_that("chomp_hits should work from a file", { 96 | test_json <- system.file("testdata", "es_hits.json", package = 
"uptasticsearch") 97 | chompDT <- chomp_hits(hits_json = test_json) 98 | expect_true(data.table::is.data.table(chompDT)) 99 | expect_equivalent(dim(chompDT), c(5, 12)) 100 | expect_true(all(c("_id", "_index", "_score", "name", "stats.final_season.avg", 101 | "stats.final_season.HR", "stats.final_season.R", "stats.full_career.avg", 102 | "stats.full_career.HR", "stats.full_career.R", "stats.yrs_played", "_type") %in% 103 | names(chompDT))) 104 | expect_identical(chompDT$stats.full_career.R, as.integer(c(1419, 653, 579, 1544, 664))) 105 | expect_identical(chompDT$stats.full_career.HR, as.character(c(541, 150, 137, 555, 193))) 106 | }) 107 | 108 | # Should warn and return null if you don't provide any data 109 | test_that("chomp_hits should return NULL if you do not provide data", { 110 | result <- suppressWarnings(chomp_hits(hits_json = NULL)) 111 | expect_true(is.null(result)) 112 | expect_warning(chomp_hits(hits_json = NULL), 113 | regexp = "You did not pass any input data to chomp_hits") 114 | }) 115 | 116 | # Should break if you pass the wrong kind of input 117 | test_that("chomp_hits should break if you pass the wrong input", { 118 | expect_error(chomp_hits(hits_json = data.frame(a = 1:5)), 119 | regexp = "The first argument of chomp_hits must be a character vector") 120 | }) 121 | 122 | # Should warn if the resulting data is nested with default keep_nested_data_cols = FALSE 123 | test_that("chomp_hits should warn and delete if the resulting data is nested with keep_nested_data_cols = FALSE", { 124 | expect_warning({ 125 | chomped <- chomp_hits( 126 | hits_json = '[{"test1":[{"a":1}],"test2":2}]' 127 | , keep_nested_data_cols = FALSE 128 | ) 129 | }, regexp = "Deleting the following nested data columns:") 130 | expect_equal(names(chomped), "test2") 131 | }) 132 | 133 | ##### TEST TEAR DOWN ##### 134 | futile.logger::flog.threshold(origLogThreshold) 135 | -------------------------------------------------------------------------------- /r-pkg/tests/testthat/test-es_search.R: -------------------------------------------------------------------------------- 1 | 2 | # Configure logger (suppress all logs in testing) 3 | loggerOptions <- futile.logger::logger.options() 4 | if (!identical(loggerOptions, list())) { 5 | origLogThreshold <- loggerOptions[[1]][["threshold"]] 6 | } else { 7 | origLogThreshold <- futile.logger::INFO 8 | } 9 | futile.logger::flog.threshold(0) 10 | 11 | # Should reject NULL index 12 | test_that("es_search should reject NULL index", { 13 | expect_error({ 14 | es_search( 15 | es_host = "http://mycompany.com:9200" 16 | , es_index = NULL 17 | ) 18 | }, regexp = "You passed NULL to es_index") 19 | }) 20 | 21 | # Should reject bad queries 22 | test_that("es_search should reject malformed queries", { 23 | # Length greater than 1 24 | expect_error({ 25 | es_search( 26 | es_host = "http://mycompany.com:9200" 27 | , es_index = "_all" 28 | , query = c( 29 | '{"_source": {"include": ["stuff.*"]},' 30 | , '{"aggs": {"superman": {"terms": {"field": "hi"}}}}}' 31 | ) 32 | ) 33 | }, regexp = "You gave an object of length 2") 34 | 35 | # Specified as a list (like you might get from jsonlite::fromJSON) 36 | expect_error({ 37 | es_search( 38 | es_host = "http://mycompany.com:9200" 39 | , es_index = "_all" 40 | , query = list( 41 | '{"_source": {"include": ["stuff.*"]},{"aggs": {"superman": {"terms": {"field": "hi"}}}}}' 42 | ) 43 | ) 44 | }, regexp = "query_body should be a single string") 45 | }) 46 | 47 | #---- .ConvertToSec 48 | 49 | # .ConvertToSec should work for seconds 50 | 
test_that(".ConvertToSec should work for seconds", 51 | expect_identical(60, uptasticsearch:::.ConvertToSec("60s"))) 52 | 53 | # .ConverToSec should work for minutes 54 | test_that(".ConvertToSec should work for minutes", 55 | expect_identical(600, uptasticsearch:::.ConvertToSec("10m"))) 56 | 57 | # .ConvertToSec should work for hours 58 | test_that(".ConvertToSec should work for hours", 59 | expect_identical(72000, uptasticsearch:::.ConvertToSec("20h"))) 60 | 61 | # .ConvertToSec should work for days 62 | test_that(".ConvertToSec should work for days", 63 | expect_identical(172800, uptasticsearch:::.ConvertToSec("2d"))) 64 | 65 | # .ConvertToSec should work for weeks 66 | test_that(".ConvertToSec should work for weeks", 67 | expect_identical(3024000, uptasticsearch:::.ConvertToSec("5w"))) 68 | 69 | # .ConvertToSec should break on unsupported timeStrings 70 | test_that(".ConvertToSec should work for seconds", 71 | expect_error(uptasticsearch:::.ConvertToSec("50Y") 72 | , regexp = "Could not figure out units of datemath")) 73 | 74 | #---- ValidateAndFormatHost 75 | 76 | # .ValidateAndFormatHost should break if you give it a non-character input 77 | test_that(".ValidateAndFormatHost should break if you give it a non-character input", 78 | expect_error(uptasticsearch:::.ValidateAndFormatHost(9200) 79 | , regexp = "es_host should be a string")) 80 | 81 | # .ValidateAndFormatHost should break if you give it a multi-element vector 82 | test_that(".ValidateAndFormatHost should break if you give it a multi-element vector", 83 | expect_error(uptasticsearch:::.ValidateAndFormatHost(c("http://", "mydb.mycompany.com:9200")) 84 | , regexp = "es_host should be length 1")) 85 | 86 | # .ValidateAndFormatHost should warn you and drop trailing slashes if you have them 87 | test_that(".ValidateAndFormatHost should handle trailing slashes", { 88 | # single slash 89 | newHost <- uptasticsearch:::.ValidateAndFormatHost("http://mydb.mycompany.com:9200/") 90 | expect_identical(newHost, "http://mydb.mycompany.com:9200") 91 | 92 | # objectively ridiculous number of slashes 93 | newHost2 <- uptasticsearch:::.ValidateAndFormatHost("http://mydb.mycompany.com:9200/////////") 94 | expect_identical(newHost2, "http://mydb.mycompany.com:9200") 95 | }) 96 | 97 | # .ValidateAndFormatHost should break if you don't have a port 98 | test_that(".ValidateAndFormatHost should break if you don't have a port", 99 | expect_error(uptasticsearch:::.ValidateAndFormatHost("http://mydb.mycompany.com") 100 | , regexp = "No port found in es_host")) 101 | 102 | # .ValidateAndFormatHost should warn if you don't have a valid transfer protocol 103 | test_that(".ValidateAndFormatHost should warn and use http if you don't give a port", { 104 | # single slash 105 | expect_warning({ 106 | hostWithTransfer <- uptasticsearch:::.ValidateAndFormatHost("mydb.mycompany.com:9200") 107 | }, regexp = "You did not provide a transfer protocol") 108 | expect_identical(hostWithTransfer, "http://mydb.mycompany.com:9200") 109 | }) 110 | 111 | #---- .major_version 112 | test_that(".major_version should correctly parse semver version strings", { 113 | 114 | # yay random tests 115 | for (i in 1:50) { 116 | v1 <- as.character(sample(0:9, size = 1)) 117 | v2 <- as.character(sample(0:9, size = 1)) 118 | v3 <- as.character(sample(0:9, size = 1)) 119 | test_version <- paste0(v1, ".", v2, ".", v3) 120 | expect_identical( 121 | uptasticsearch:::.major_version(test_version) 122 | , v1 123 | , info = paste0("version that broke this: ", test_version) 124 | ) 125 | } 126 | }) 
127 | 128 | ##### TEST TEAR DOWN ##### 129 | futile.logger::flog.threshold(origLogThreshold) 130 | -------------------------------------------------------------------------------- /r-pkg/tests/testthat/test-get_fields.R: -------------------------------------------------------------------------------- 1 | # Configure logger (suppress all logs in testing) 2 | loggerOptions <- futile.logger::logger.options() 3 | if (!identical(loggerOptions, list())) { 4 | origLogThreshold <- loggerOptions[[1]][["threshold"]] 5 | } else { 6 | origLogThreshold <- futile.logger::INFO 7 | } 8 | futile.logger::flog.threshold(0) 9 | 10 | 11 | #--- get_fields 12 | 13 | # Gives an informative error if es_indices is NULL or an empty string 14 | test_that("get_fields should give an informative error if es_indices is NULL or an empty string", { 15 | expect_error(get_fields(es_host = "http://es.custdb.mycompany.com:9200" 16 | , es_indices = NULL), 17 | regexp = "Argument 'es_indices' must be a non-empty character vector") 18 | expect_error(get_fields(es_host = "http://es.custdb.mycompany.com:9200" 19 | , es_indices = ""), 20 | regexp = "get_fields must be passed a valid es_indices") 21 | }) 22 | 23 | # works as expected when mocked 24 | test_that("get_fields works as expected when mocked", { 25 | 26 | test_json <- system.file("testdata", "two_index_mapping.json", package = "uptasticsearch") 27 | aliasDT <- data.table::data.table( 28 | alias = c("alias1", "alias2") 29 | , index = c("company", "otherIndex") 30 | ) 31 | testthat::with_mocked_bindings( 32 | `.content` = function(...) { 33 | return(jsonlite::fromJSON(txt = test_json)) 34 | }, 35 | `.get_aliases` = function(...) { 36 | return(aliasDT) 37 | }, 38 | `.get_es_version` = function(...) { 39 | return("6") 40 | } 41 | , 42 | `.request` = function(...) { 43 | return(NULL) 44 | }, 45 | `.stop_for_status` = function(...) 
{ 46 | return(NULL) 47 | }, 48 | { 49 | outDT <- get_fields( 50 | es_host = "http://db.mycompany.com:9200" 51 | , es_indices = c("company", "hotel") 52 | ) 53 | data.table::setkey(outDT, NULL) 54 | expected <- data.table::data.table( 55 | index = c(rep("alias1", 3), rep("hotel", 5)) 56 | , type = c(rep("building", 3), rep("bed_room", 2), rep("conference_room", 3)) 57 | , field = c("id", "address", "address.keyword", "num_beds", "description" 58 | , "num_people", "purpose", "purpose.keyword") 59 | , data_type = c("long", "text", "keyword", "integer", "text", "integer" 60 | , "text", "keyword") 61 | ) 62 | expect_identical(outDT, expected) 63 | } 64 | ) 65 | }) 66 | 67 | #--- .flatten_mapping 68 | 69 | # Works if one index is passed 70 | test_that(".flatten_mapping should work if the mapping for one index is provided", { 71 | test_json <- system.file("testdata", "one_index_mapping.json", package = "uptasticsearch") 72 | mapping <- jsonlite::fromJSON(txt = test_json) 73 | mappingDT <- uptasticsearch:::.flatten_mapping(mapping = mapping) 74 | expected <- data.table::data.table( 75 | index = rep("basketball", 5) 76 | , type = rep("players", 5) 77 | , field = c("team", "name.first", "name.last", "age", "position") 78 | , data_type = c("keyword", "text", "text", "integer", "keyword") 79 | ) 80 | expect_identical(mappingDT, expected) 81 | }) 82 | 83 | # works if multiple indices are passed 84 | test_that(".flatten_mapping should work if the mapping for multiple indices are provided", { 85 | test_json <- system.file("testdata", "two_index_mapping.json", package = "uptasticsearch") 86 | mapping <- jsonlite::fromJSON(txt = test_json) 87 | mappingDT <- uptasticsearch:::.flatten_mapping(mapping = mapping) 88 | expected <- data.table::data.table( 89 | index = c(rep("company", 3), rep("hotel", 5)) 90 | , type = c(rep("building", 3), rep("bed_room", 2), rep("conference_room", 3)) 91 | , field = c("id", "address", "address.keyword", "num_beds", "description" 92 | , "num_people", "purpose", "purpose.keyword") 93 | , data_type = c("long", "text", "keyword", "integer", "text", "integer" 94 | , "text", "keyword") 95 | ) 96 | expect_identical(mappingDT, expected) 97 | }) 98 | 99 | ##### TEST TEAR DOWN ##### 100 | futile.logger::flog.threshold(origLogThreshold) 101 | -------------------------------------------------------------------------------- /r-pkg/tests/testthat/test-parse_date_time.R: -------------------------------------------------------------------------------- 1 | # Configure logger (suppress all logs in testing) 2 | loggerOptions <- futile.logger::logger.options() 3 | if (!identical(loggerOptions, list())) { 4 | origLogThreshold <- loggerOptions[[1]][["threshold"]] 5 | } else { 6 | origLogThreshold <- futile.logger::INFO 7 | } 8 | futile.logger::flog.threshold(0) 9 | 10 | # Correctly adjusts UTC date-times 11 | test_that("parse_date_time should transform the indicated date_cols to POSIXct with timezone UTC if they're given in UTC", { 12 | testDT <- data.table::data.table( 13 | id = c("a", "b", "c") 14 | , dateTime = c("2016-07-16T21:15:00Z", "2015-04-16T02:15:00Z", "2015-03-04T15:25:00Z") 15 | ) 16 | newDT <- parse_date_time(testDT, date_cols = "dateTime") 17 | 18 | expect_true(inherits(newDT$dateTime, "POSIXct")) 19 | expect_identical( 20 | newDT 21 | , data.table::data.table( 22 | id = c("a", "b", "c") 23 | , dateTime = as.POSIXct(c("2016-07-16 21:15:00", "2015-04-16 02:15:00", "2015-03-04 15:25:00"), tz = "UTC") 24 | ) 25 | ) 26 | }) 27 | 28 | # Correctly adjusts non-UTC date-times 29 | 
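# (Worked example for the test below: the trailing letter on each timestamp is
# a military time zone designator. "Z" is UTC, "A" is UTC+1, and "B" is UTC+2,
# per the mapping inside parse_date_time(). So "2015-04-16T02:15:00A", which is
# 02:15 at UTC+1, becomes 01:15 UTC, and "2015-03-04T15:25:00B", which is 15:25
# at UTC+2, becomes 13:25 UTC. That is exactly what the expected data.table in
# this test asserts.)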
test_that("parse_date_time should transform the indicated date_cols to POSIXct with timezone UTC correctly even if the dates are not specified in UTC", { # nolint[line_length] 30 | testDT <- data.table::data.table( 31 | id = c("a", "b", "c") 32 | , dateTime = c("2016-07-16T21:15:00Z", "2015-04-16T02:15:00A", "2015-03-04T15:25:00B") 33 | ) 34 | newDT <- parse_date_time(testDT, date_cols = "dateTime") 35 | 36 | expect_true(inherits(newDT$dateTime, "POSIXct")) 37 | expect_identical( 38 | newDT 39 | , data.table::data.table( 40 | id = c("a", "b", "c") 41 | , dateTime = as.POSIXct(c("2016-07-16 21:15:00", "2015-04-16 01:15:00", "2015-03-04 13:25:00"), tz = "UTC") 42 | ) 43 | ) 44 | }) 45 | 46 | # Returns object of class POSIXct 47 | test_that("parse_date_time should transform the indicated date_cols to class POSIXct", { 48 | testDT <- data.table::data.table( 49 | id = c("a", "b", "c") 50 | , dateTime = c("2016-07-16T21:15:00Z", "2015-04-16T02:15:00Z", "2015-03-04T15:25:00Z") 51 | ) 52 | newDT <- parse_date_time(testDT, date_cols = "dateTime") 53 | 54 | expect_true(inherits(newDT$dateTime, "POSIXct")) 55 | expect_identical( 56 | newDT 57 | , data.table::data.table( 58 | id = c("a", "b", "c") 59 | , dateTime = as.POSIXct(c("2016-07-16 21:15:00", "2015-04-16 02:15:00", "2015-03-04 15:25:00"), tz = "UTC") 60 | ) 61 | ) 62 | }) 63 | 64 | # Works for one date column 65 | test_that("parse_date_time should perform adjustments only on the columns you ask it to", { 66 | testDT <- data.table::data.table( 67 | id = c("a", "b", "c") 68 | , dateTime = c("2016-07-16T21:15:00Z", "2015-04-16T02:15:00Z", "2015-03-04T15:25:00Z") 69 | , otherDate = c("2014-03-11T12:15:00Z", "2015-04-16T02:15:00Z", "2015-03-04T15:25:00Z") 70 | ) 71 | newDT <- parse_date_time(testDT, date_cols = "dateTime") 72 | 73 | expect_true(all(c("dateTime", "otherDate") %in% names(newDT))) 74 | expect_true(inherits(newDT$dateTime, "POSIXct")) 75 | expect_true(is.character(newDT$otherDate)) 76 | expect_identical( 77 | newDT 78 | , data.table::data.table( 79 | id = c("a", "b", "c") 80 | , dateTime = as.POSIXct(c("2016-07-16 21:15:00", "2015-04-16 02:15:00", "2015-03-04 15:25:00"), tz = "UTC") 81 | , otherDate = c("2014-03-11T12:15:00Z", "2015-04-16T02:15:00Z", "2015-03-04T15:25:00Z") 82 | ) 83 | ) 84 | }) 85 | 86 | # works for multiple date columns 87 | test_that("parse_date_time should perform adjustments for multiple data columns if asked", { 88 | testDT <- data.table::data.table( 89 | id = c("a", "b", "c") 90 | , dateTime = c("2016-07-16T21:15:00Z", "2015-04-16T02:15:00Z", "2015-03-04T15:25:00Z") 91 | , otherDate = c("2014-03-11T12:15:00Z", "2015-04-16T02:15:00Z", "2015-03-04T15:25:00Z") 92 | ) 93 | newDT <- parse_date_time(testDT, date_cols = c("dateTime", "otherDate")) 94 | 95 | expect_true(all(c("dateTime", "otherDate") %in% names(newDT))) 96 | expect_true(inherits(newDT$dateTime, "POSIXct")) 97 | expect_true(inherits(newDT$otherDate, "POSIXct")) 98 | expect_identical( 99 | newDT 100 | , data.table::data.table( 101 | id = c("a", "b", "c") 102 | , dateTime = as.POSIXct(c("2016-07-16 21:15:00", "2015-04-16 02:15:00", "2015-03-04 15:25:00"), tz = "UTC") 103 | , otherDate = as.POSIXct(c("2014-03-11 12:15:00", "2015-04-16 02:15:00", "2015-03-04 15:25:00"), tz = "UTC") 104 | ) 105 | ) 106 | }) 107 | 108 | # Gives an informative error if date_cols is not character vector 109 | test_that("parse_date_time should give an informative error if you pass non-character stuff to date_cols", { 110 | testDT <- data.table::data.table( 111 | id = c("a", "b", 
"c") 112 | , dateTime = c("2016-07-16T21:15:00Z", "2015-04-16T02:15:00Z", "2015-03-04T15:25:00Z") 113 | ) 114 | 115 | expect_error({ 116 | parse_date_time(testDT, date_cols = list("dateTime")) 117 | }, regexp = "The date_cols argument in parse_date_time expects a character vector") 118 | }) 119 | 120 | # Gives informative error if inputDT is not a data.table 121 | test_that("parse_date_time should give an informative error if you don't pass it a data.table", { 122 | testDF <- data.frame( 123 | id = c("a", "b", "c") 124 | , dateTime = c("2016-07-16T21:15:00Z", "2015-04-16T02:15:00Z", "2015-03-04T15:25:00Z") 125 | ) 126 | 127 | expect_error({ 128 | parse_date_time(testDF, date_cols = "dateTime") 129 | }, regexp = "parse_date_time expects to receive a data\\.table object") # nolint[non_portable_path] 130 | }) 131 | 132 | # Gives informative error if you ask to adjust date_cols that don't exist 133 | test_that("parse_date_time should give an informative error if you give it dateCol names that don't exist in the DT", { 134 | testDT <- data.table::data.table( 135 | id = c("a", "b", "c") 136 | , dateTime = c("2016-07-16T21:15:00Z", "2015-04-16T02:15:00Z", "2015-03-04T15:25:00Z") 137 | ) 138 | 139 | expect_error({ 140 | parse_date_time(testDT, date_cols = c("dateTime", "dateTyme")) 141 | }, regexp = "do not actually exist in input_df") 142 | }) 143 | 144 | # Does not have side effects (works on a copy) 145 | test_that("parse_date_time should leave the original DT unchanged", { 146 | testDT <- data.table::data.table( 147 | id = c("a", "b", "c") 148 | , dateTime = c("2016-07-16T21:15:00Z", "2015-04-16T02:15:00Z", "2015-03-04T15:25:00Z") 149 | ) 150 | 151 | beforeDT <- data.table::copy(testDT) 152 | origAddress <- data.table::address(testDT) 153 | newDT <- parse_date_time(testDT, date_cols = "dateTime") 154 | 155 | expect_identical(testDT, beforeDT) 156 | expect_identical(origAddress, data.table::address(testDT)) 157 | expect_true(origAddress != data.table::address(newDT)) 158 | }) 159 | 160 | # Substitutes in assume_tz if missing a timezone 161 | test_that("parse_date_time should leave the original DT unchanged", { 162 | 163 | testDT <- data.table::data.table( 164 | id = c("a", "b", "c") 165 | , dateTime = c("2016-07-16T21:15:00", "2015-04-16T02:15:00Z", "2015-03-04T15:25:00Z") 166 | ) 167 | beforeDT <- data.table::copy(testDT) 168 | origAddress <- data.table::address(testDT) 169 | newDT <- parse_date_time(testDT, date_cols = "dateTime", assume_tz = "UTC") 170 | 171 | expect_identical(newDT[id == "a", dateTime], as.POSIXct("2016-07-16 21:15:00", tz = "UTC")) 172 | }) 173 | 174 | ##### TEST TEAR DOWN ##### 175 | futile.logger::flog.threshold(origLogThreshold) 176 | -------------------------------------------------------------------------------- /r-pkg/tests/testthat/test-unpack_nested_data.R: -------------------------------------------------------------------------------- 1 | 2 | # Configure logger (suppress all logs in testing) 3 | loggerOptions <- futile.logger::logger.options() 4 | if (!identical(loggerOptions, list())) { 5 | origLogThreshold <- loggerOptions[[1]][["threshold"]] 6 | } else { 7 | origLogThreshold <- futile.logger::INFO 8 | } 9 | futile.logger::flog.threshold(0) 10 | 11 | #--- unpack_nested_data 12 | 13 | # Should work with result of chomp_hits 14 | test_that("unpack_nested_data should work with the result of chomp_hits", { 15 | # nolint start 16 | test_json <- '[{"_source":{"dateTime":"2017-01-01","username":"Austin1","details":{ 17 | 
"interactions":400,"userType":"active","appData":[{"appName":"farmville","minutes":500}, 18 | {"appName":"candy_crush","value":350},{"appName":"angry_birds","typovalue":422}]}}}, 19 | {"_source":{"dateTime":"2017-02-02","username":"Austin2","details":{"interactions":5, 20 | "userType":"very_active","appData":[{"appName":"minesweeper","value":28},{"appName": 21 | "pokemon_go","value":190},{"appName":"pokemon_stay","value":1},{"appName":"block_dude", 22 | "value":796}]}}}]' 23 | # nolint end 24 | sampleChompedDT <- chomp_hits(test_json 25 | , keep_nested_data_cols = TRUE) 26 | unpackedDT <- unpack_nested_data(chomped_df = sampleChompedDT 27 | , col_to_unpack = "details.appData") 28 | expect_true(data.table::is.data.table(unpackedDT)) 29 | expect_equivalent(dim(unpackedDT), c(7, 8)) 30 | expect_named(unpackedDT, c("dateTime", "username", "details.interactions", 31 | "details.userType", "appName", "minutes", "value", "typovalue")) 32 | expect_identical(unpackedDT$appName, c("farmville", "candy_crush", "angry_birds", 33 | "minesweeper", "pokemon_go", "pokemon_stay", 34 | "block_dude")) 35 | expect_identical(unpackedDT$username, c(rep("Austin1", 3), rep("Austin2", 4))) 36 | expect_true(sum(is.na(unpackedDT$minutes)) == 6) 37 | }) 38 | 39 | # Should work if the array is a simple array rather than an array of maps 40 | test_that("unpack_nested_data should work if the array is a simple array", { 41 | # nolint start 42 | test_json <- '[{"_source":{"dateTime":"2017-01-01","username":"Austin1","details":{ 43 | "interactions":400,"userType":"active","minutes":[500,350,422]}}}, 44 | {"_source":{"dateTime":"2017-02-02","username":"Austin2","details":{"interactions":0, 45 | "userType":"never","minutes":[]}}}, 46 | {"_source":{"dateTime":"2017-03-03","username":"Austin3","details":{"interactions":5, 47 | "userType":"very_active","minutes":[28,190,1,796]}}}]' 48 | # nolint end 49 | sampleChompedDT <- chomp_hits(test_json 50 | , keep_nested_data_cols = TRUE) 51 | unpackedDT <- unpack_nested_data(chomped_df = sampleChompedDT 52 | , col_to_unpack = "details.minutes") 53 | expect_true(data.table::is.data.table(unpackedDT)) 54 | expect_equivalent(dim(unpackedDT), c(8, 5)) 55 | expect_named(unpackedDT, c("dateTime", "username", "details.interactions", 56 | "details.userType", "details.minutes")) 57 | expect_equivalent(unpackedDT$details.minutes, c(500, 350, 422, NA, 28, 190, 1, 796)) 58 | expect_identical(unpackedDT$username, c(rep("Austin1", 3), "Austin2", rep("Austin3", 4))) 59 | }) 60 | 61 | # Should break if chomped_df is not a data.table 62 | test_that("unpack_nested_data should break if you don't pass a data.table", { 63 | expect_error(unpack_nested_data(chomped_df = 42 64 | , col_to_unpack = "blah"), 65 | regexp = "chomped_df must be a data.table") 66 | }) 67 | 68 | # Should break if col_to_unpack is not a string 69 | test_that("unpack_nested_data should break if col_to_unpack is not a string", { 70 | expect_error(unpack_nested_data(chomped_df = data.table::data.table(wow = 7) 71 | , col_to_unpack = 8), 72 | regexp = "col_to_unpack must be a character of length 1") 73 | }) 74 | 75 | # Should break if col_to_unpack is not of length 1 76 | test_that("unpack_nested_data should break if col_to_unpack is not of length 1", { 77 | expect_error(unpack_nested_data(chomped_df = data.table::data.table(wow = 7) 78 | , col_to_unpack = c("a", "b")), 79 | regexp = "col_to_unpack must be a character of length 1") 80 | }) 81 | 82 | # Should break if col_to_unpack is not one of the column names 83 | 
test_that("unpack_nested_data should break if col_to_unpack is not one of the column names", { 84 | expect_error(unpack_nested_data(chomped_df = data.table::data.table(wow = 7) 85 | , col_to_unpack = "a"), 86 | regexp = "col_to_unpack must be one of the column names") 87 | }) 88 | 89 | # Should break if the column doesn't include any data 90 | test_that("unpack_nested_data should break if the column doesn't include any data", { 91 | expect_error(unpack_nested_data(chomped_df = data.table::data.table(wow = 7, dang = list()) 92 | , col_to_unpack = "dang"), 93 | regexp = "The column given to unpack_nested_data had no data in it") 94 | }) 95 | 96 | test_that("unpack_nested_data should break if the column contains something that is not a dataframe or vector", { 97 | DT <- data.table::data.table(x = 1:2, y = list(list(2), 3)) 98 | expect_error(unpack_nested_data(chomped_df = DT, col_to_unpack = "y") 99 | , regexp = "must be a data frame or a vector") 100 | }) 101 | 102 | test_that("unpack_nested_data should handle NA and empty rows", { 103 | DT <- data.table::data.table(x = 1:2, y = list(z = NA, data.table::data.table(w = 5:6, z = 7:8))) 104 | DT2 <- data.table::data.table(x = 1:2, y = list(z = list(), data.table::data.table(w = 5:6, z = 7:8))) 105 | unpackedDT <- data.table::data.table( 106 | x = c(1, 2, 2) 107 | , w = c(NA, 5, 6) 108 | , z = c(NA, 7, 8) 109 | ) 110 | expect_equal(unpack_nested_data(DT, col_to_unpack = "y"), unpackedDT) 111 | expect_equal(unpack_nested_data(DT2, col_to_unpack = "y"), unpackedDT) 112 | }) 113 | 114 | 115 | ##### TEST TEAR DOWN ##### 116 | futile.logger::flog.threshold(origLogThreshold) 117 | -------------------------------------------------------------------------------- /r-pkg/vignettes/FAQ.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Frequently Asked Questions" 3 | author: "Stephanie Kirmer" 4 | date: "`r Sys.Date()`" 5 | output: 6 | markdown::html_format: 7 | options: 8 | toc: true 9 | toc_depth: 2 10 | number_sections: true 11 | vignette: > 12 | %\VignetteIndexEntry{FAQ - Help with Uptasticsearch Functionalities} 13 | %\VignetteEngine{knitr::knitr} 14 | %\VignetteEncoding{UTF-8} 15 | --- 16 | 17 | # Introduction 18 | 19 | Welcome to uptasticsearch! This package exists to help R users connect to elasticsearch databases smoothly and easily. However, sometimes things go wrong! This FAQ is an ongoing project to catalog common errors and questions people have about this system, and provide simple and useful answers. 20 | 21 | If your question is not presented here, and google doesn't help, go ahead and post an issue on github so somebody can help you. 22 | 23 | *** 24 | 25 | # Questions 26 | 27 | ## Query Syntax Problems 28 | Developing queries of your own for elasticsearch is sometimes tough. If you are finding that the queries you write are not valid, there may be many reasons. You are probably in this situation if your error is `Bad Request (HTTP 400)` or similar. 29 | 30 | ### Troubleshooting Guide 31 | 32 | * Are all your **brackets and curly braces** correct and paired? Check just to make sure. 33 | * Are you **quoting** things correctly? This syntax calls for an awful lot of quotation marks, don't forget them. 34 | 35 | ## Query Returns No Results 36 | 37 | After you have verified with certainty that your query is appropriately structured and written, you might still have challenges. What to do if you get the error `Query is syntactically valid but 0 documents were matched. Returning NULL` ? 
38 | 39 | ### Troubleshooting Guide 40 | 41 | * Are your **search terms** named and described correctly? Is everything spelled right? 42 | * Are you looking in the **correct index**? Perhaps your document is in a different one. 43 | * If you are passing **dates or datetimes**, is your input formatted just right? 44 | * IMPORTANT: **Are the terms you are using indexed**? This is a tricky one. Your term may exist and have data in the documents, but if your particular Elasticsearch database has not indexed that term, you won't be able to use it for searching. This does not mean that the documents aren't there, only that you can't use that term to find them. 45 | * Are you sure the **document exists**? It might just not be there. 46 | 47 | *** 48 | 49 | # Contribute to this Guide! 50 | 51 | We are always happy to add more questions and answers to this guide. If you have figured out the solution to a tricky issue, please submit a PR on GitHub to share it here. 52 | -------------------------------------------------------------------------------- /setup_local.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | echo "collecting arguments..." 6 | 7 | ES_VERSION=${1} 8 | echo "Elasticsearch version: $ES_VERSION" 9 | 10 | WDIR=$(pwd) 11 | TESTDIR=${WDIR}/sandbox 12 | SAMPLE_DATA_FILE=$(pwd)/test-data/sample.json 13 | ES_HOST="127.0.0.1" 14 | ES_PORT="9200" 15 | 16 | echo "Starting up Elasticsearch..." 17 | 18 | case "${ES_VERSION}" in 19 | 20 | 1.7.6) 21 | docker run --rm -d -p "${ES_PORT}:9200" elasticsearch:1.7.6 22 | MAPPING_FILE=$(pwd)/test-data/legacy_shakespeare_mapping.json 23 | ;; 24 | 2.4.6) 25 | docker run --rm -d -p "${ES_PORT}:9200" elasticsearch:2.4.6 26 | MAPPING_FILE=$(pwd)/test-data/legacy_shakespeare_mapping.json 27 | ;; 28 | 5.6.16) 29 | docker run --rm -d -p "${ES_PORT}:9200" \ 30 | -e "xpack.security.enabled=false" \ 31 | docker.elastic.co/elasticsearch/elasticsearch:5.6.16 32 | MAPPING_FILE=$(pwd)/test-data/es5_shakespeare_mapping.json 33 | ;; 34 | 6.8.15) 35 | docker run --rm -d -p "${ES_PORT}:9200" \ 36 | -e "discovery.type=single-node" \ 37 | -e "xpack.security.enabled=false" \ 38 | docker.elastic.co/elasticsearch/elasticsearch:6.8.15 39 | MAPPING_FILE=$(pwd)/test-data/es6_shakespeare_mapping.json 40 | ;; 41 | 7.0.1) 42 | docker run --rm -d -p "${ES_PORT}:9200" \ 43 | -e "discovery.type=single-node" \ 44 | -e "xpack.security.enabled=false" \ 45 | docker.elastic.co/elasticsearch/elasticsearch:7.0.1 46 | MAPPING_FILE=$(pwd)/test-data/es7_shakespeare_mapping.json 47 | SAMPLE_DATA_FILE=$(pwd)/test-data/sample_es7.json 48 | ;; 49 | 7.17.22) 50 | docker run --rm -d -p "${ES_PORT}:9200" \ 51 | -e "discovery.type=single-node" \ 52 | -e "xpack.security.enabled=false" \ 53 | docker.elastic.co/elasticsearch/elasticsearch:7.17.22 54 | MAPPING_FILE=$(pwd)/test-data/es7_shakespeare_mapping.json 55 | SAMPLE_DATA_FILE=$(pwd)/test-data/sample_es7.json 56 | ;; 57 | 8.0.1) 58 | docker run --rm -d -p "${ES_PORT}:9200" \ 59 | -e "discovery.type=single-node" \ 60 | -e "xpack.security.enabled=false" \ 61 | docker.elastic.co/elasticsearch/elasticsearch:8.0.1 62 | MAPPING_FILE=$(pwd)/test-data/es7_shakespeare_mapping.json 63 | SAMPLE_DATA_FILE=$(pwd)/test-data/sample_es7.json 64 | ;; 65 | 8.5.3) 66 | docker run --rm -d -p "${ES_PORT}:9200" \ 67 | -e "discovery.type=single-node" \ 68 | -e "xpack.security.enabled=false" \ 69 | docker.elastic.co/elasticsearch/elasticsearch:8.5.3 70 |
MAPPING_FILE=$(pwd)/test-data/es7_shakespeare_mapping.json 71 | SAMPLE_DATA_FILE=$(pwd)/test-data/sample_es7.json 72 | ;; 73 | 8.10.4) 74 | docker run --rm -d -p "${ES_PORT}:9200" \ 75 | -e "discovery.type=single-node" \ 76 | -e "xpack.security.enabled=false" \ 77 | docker.elastic.co/elasticsearch/elasticsearch:8.10.4 78 | MAPPING_FILE=$(pwd)/test-data/es7_shakespeare_mapping.json 79 | SAMPLE_DATA_FILE=$(pwd)/test-data/sample_es7.json 80 | ;; 81 | 8.15.5) 82 | docker run --rm -d -p "${ES_PORT}:9200" \ 83 | -e "discovery.type=single-node" \ 84 | -e "xpack.security.enabled=false" \ 85 | docker.elastic.co/elasticsearch/elasticsearch:8.15.5 86 | MAPPING_FILE=$(pwd)/test-data/es7_shakespeare_mapping.json 87 | SAMPLE_DATA_FILE=$(pwd)/test-data/sample_es7.json 88 | ;; 89 | 8.17.2) 90 | docker run --rm -d -p "${ES_PORT}:9200" \ 91 | -e "discovery.type=single-node" \ 92 | -e "xpack.security.enabled=false" \ 93 | docker.elastic.co/elasticsearch/elasticsearch:8.17.2 94 | MAPPING_FILE=$(pwd)/test-data/es7_shakespeare_mapping.json 95 | SAMPLE_DATA_FILE=$(pwd)/test-data/sample_es7.json 96 | ;; 97 | *) 98 | echo "Did not recognize version ${ES_VERSION}. Not starting Elasticsearch" 99 | exit 1 100 | ;; 101 | esac 102 | 103 | echo "Elasticsearch v${ES_VERSION} is now running at http://${ES_HOST}:9200" 104 | 105 | echo "Setting up local testing environment" 106 | 107 | # Creating testing directory 108 | mkdir -p "${TESTDIR}" 109 | 110 | # Get data 111 | cp "${MAPPING_FILE}" "${TESTDIR}/shakespeare_mapping.json" 112 | cp "${SAMPLE_DATA_FILE}" "${TESTDIR}/sample.json" 113 | cd "${TESTDIR}" 114 | 115 | # give the cluster a chance 116 | sleep 30 117 | 118 | # Create shakespeare index and shakespeare mapping 119 | curl -X PUT "http://${ES_HOST}:9200/shakespeare" \ 120 | -H 'Content-Type: application/json' \ 121 | -d @shakespeare_mapping.json 122 | 123 | # Upload data 124 | curl -X POST "http://${ES_HOST}:9200/shakespeare/_bulk" \ 125 | -H 'Content-Type: application/json' \ 126 | --data-binary @sample.json 127 | 128 | # Add an intentionally empty index 129 | curl -X PUT "http://${ES_HOST}:9200/empty_index" \ 130 | -H 'Content-Type: application/json' \ 131 | -d @shakespeare_mapping.json 132 | 133 | # Refresh all indices 134 | curl -X POST "http://${ES_HOST}:9200/_refresh" 135 | 136 | # Check that we got something 137 | curl -X GET "http://${ES_HOST}:9200/shakespeare/_search?size=1" 138 | 139 | cd "${WDIR}" 140 | 141 | echo "" 142 | echo "Your local environment is ready." 
143 | -------------------------------------------------------------------------------- /test-data/aggs_cardinality.json: -------------------------------------------------------------------------------- 1 | { 2 | "took": 30, 3 | "timed_out": false, 4 | "_shards": { 5 | "total": 12, 6 | "successful": 12, 7 | "failed": 0 8 | }, 9 | "hits": { 10 | "total": 2651, 11 | "max_score": 0, 12 | "hits": [] 13 | }, 14 | "aggregations": { 15 | "number_of_things": { 16 | "value": 777 17 | } 18 | } 19 | } -------------------------------------------------------------------------------- /test-data/aggs_date_histogram.json: -------------------------------------------------------------------------------- 1 | { 2 | "took": 41, 3 | "timed_out": false, 4 | "_shards": { 5 | "total": 64, 6 | "successful": 64, 7 | "failed": 0 8 | }, 9 | "hits": { 10 | "total": 2627223, 11 | "max_score": 0, 12 | "hits": [] 13 | }, 14 | "aggregations": { 15 | "report_week": { 16 | "buckets": [ 17 | { 18 | "key_as_string": "2017-02-27T00:00:00.000Z", 19 | "key": 1488153600000, 20 | "doc_count": 201674 21 | }, 22 | { 23 | "key_as_string": "2017-03-06T00:00:00.000Z", 24 | "key": 1488758400000, 25 | "doc_count": 295596 26 | }, 27 | { 28 | "key_as_string": "2017-03-13T00:00:00.000Z", 29 | "key": 1489363200000, 30 | "doc_count": 277618 31 | }, 32 | { 33 | "key_as_string": "2017-03-20T00:00:00.000Z", 34 | "key": 1489968000000, 35 | "doc_count": 259233 36 | }, 37 | { 38 | "key_as_string": "2017-03-27T00:00:00.000Z", 39 | "key": 1490572800000, 40 | "doc_count": 265538 41 | }, 42 | { 43 | "key_as_string": "2017-04-03T00:00:00.000Z", 44 | "key": 1491177600000, 45 | "doc_count": 299502 46 | }, 47 | { 48 | "key_as_string": "2017-04-10T00:00:00.000Z", 49 | "key": 1491782400000, 50 | "doc_count": 303826 51 | }, 52 | { 53 | "key_as_string": "2017-04-17T00:00:00.000Z", 54 | "key": 1492387200000, 55 | "doc_count": 305400 56 | }, 57 | { 58 | "key_as_string": "2017-04-24T00:00:00.000Z", 59 | "key": 1492992000000, 60 | "doc_count": 325883 61 | }, 62 | { 63 | "key_as_string": "2017-05-01T00:00:00.000Z", 64 | "key": 1493596800000, 65 | "doc_count": 92953 66 | } 67 | ] 68 | } 69 | } 70 | } -------------------------------------------------------------------------------- /test-data/aggs_date_histogram_cardinality.json: -------------------------------------------------------------------------------- 1 | { 2 | "took": 38, 3 | "timed_out": false, 4 | "_shards": { 5 | "total": 64, 6 | "successful": 64, 7 | "failed": 0 8 | }, 9 | "hits": { 10 | "total": 2627223, 11 | "max_score": 0, 12 | "hits": [] 13 | }, 14 | "aggregations": { 15 | "report_week": { 16 | "buckets": [ 17 | { 18 | "key_as_string": "2017-02-27T00:00:00.000Z", 19 | "key": 1488153600000, 20 | "doc_count": 201674, 21 | "num_customers": { 22 | "value": 4 23 | } 24 | }, 25 | { 26 | "key_as_string": "2017-03-06T00:00:00.000Z", 27 | "key": 1488758400000, 28 | "doc_count": 295596, 29 | "num_customers": { 30 | "value": 5 31 | } 32 | }, 33 | { 34 | "key_as_string": "2017-03-13T00:00:00.000Z", 35 | "key": 1489363200000, 36 | "doc_count": 277618, 37 | "num_customers": { 38 | "value": 5 39 | } 40 | }, 41 | { 42 | "key_as_string": "2017-03-20T00:00:00.000Z", 43 | "key": 1489968000000, 44 | "doc_count": 259233, 45 | "num_customers": { 46 | "value": 5 47 | } 48 | }, 49 | { 50 | "key_as_string": "2017-03-27T00:00:00.000Z", 51 | "key": 1490572800000, 52 | "doc_count": 265538, 53 | "num_customers": { 54 | "value": 5 55 | } 56 | }, 57 | { 58 | "key_as_string": "2017-04-03T00:00:00.000Z", 59 | "key": 1491177600000, 60 | 
"doc_count": 299502, 61 | "num_customers": { 62 | "value": 5 63 | } 64 | }, 65 | { 66 | "key_as_string": "2017-04-10T00:00:00.000Z", 67 | "key": 1491782400000, 68 | "doc_count": 303826, 69 | "num_customers": { 70 | "value": 5 71 | } 72 | }, 73 | { 74 | "key_as_string": "2017-04-17T00:00:00.000Z", 75 | "key": 1492387200000, 76 | "doc_count": 305400, 77 | "num_customers": { 78 | "value": 4 79 | } 80 | }, 81 | { 82 | "key_as_string": "2017-04-24T00:00:00.000Z", 83 | "key": 1492992000000, 84 | "doc_count": 325883, 85 | "num_customers": { 86 | "value": 4 87 | } 88 | }, 89 | { 90 | "key_as_string": "2017-05-01T00:00:00.000Z", 91 | "key": 1493596800000, 92 | "doc_count": 92953, 93 | "num_customers": { 94 | "value": 4 95 | } 96 | } 97 | ] 98 | } 99 | } 100 | } -------------------------------------------------------------------------------- /test-data/aggs_date_histogram_extended_stats.json: -------------------------------------------------------------------------------- 1 | { 2 | "took": 27, 3 | "timed_out": false, 4 | "_shards": { 5 | "total": 64, 6 | "successful": 64, 7 | "failed": 0 8 | }, 9 | "hits": { 10 | "total": 2627223, 11 | "max_score": 0, 12 | "hits": [] 13 | }, 14 | "aggregations": { 15 | "report_week": { 16 | "buckets": [ 17 | { 18 | "key_as_string": "2017-02-27T00:00:00.000Z", 19 | "key": 1488153600000, 20 | "doc_count": 201674, 21 | "some_score": { 22 | "count": 201674, 23 | "min": 0, 24 | "max": 3, 25 | "avg": 1.572527941132719, 26 | "sum": 317138, 27 | "sum_of_squares": 575494, 28 | "variance": 0.3807413638101676, 29 | "std_deviation": 0.6170424327468635, 30 | "std_deviation_bounds": { 31 | "upper": 2.806612806626446, 32 | "lower": 0.338443075638992 33 | } 34 | } 35 | }, 36 | { 37 | "key_as_string": "2017-03-06T00:00:00.000Z", 38 | "key": 1488758400000, 39 | "doc_count": 295596, 40 | "some_score": { 41 | "count": 295596, 42 | "min": 0, 43 | "max": 7, 44 | "avg": 1.5650110285660157, 45 | "sum": 462611, 46 | "sum_of_squares": 832155, 47 | "variance": 0.3659172758225649, 48 | "std_deviation": 0.604910965202785, 49 | "std_deviation_bounds": { 50 | "upper": 2.7748329589715857, 51 | "lower": 0.35518909816044575 52 | } 53 | } 54 | }, 55 | { 56 | "key_as_string": "2017-03-13T00:00:00.000Z", 57 | "key": 1489363200000, 58 | "doc_count": 277618, 59 | "some_score": { 60 | "count": 277618, 61 | "min": 0, 62 | "max": 7, 63 | "avg": 1.5557384607626306, 64 | "sum": 431901, 65 | "sum_of_squares": 772061, 66 | "variance": 0.36069708397207323, 67 | "std_deviation": 0.6005806223747759, 68 | "std_deviation_bounds": { 69 | "upper": 2.7568997055121827, 70 | "lower": 0.3545772160130787 71 | } 72 | } 73 | }, 74 | { 75 | "key_as_string": "2017-03-20T00:00:00.000Z", 76 | "key": 1489968000000, 77 | "doc_count": 259233, 78 | "some_score": { 79 | "count": 259233, 80 | "min": 0, 81 | "max": 7, 82 | "avg": 1.5482635312633808, 83 | "sum": 401361, 84 | "sum_of_squares": 717589, 85 | "variance": 0.37100369485597195, 86 | "std_deviation": 0.609100726363031, 87 | "std_deviation_bounds": { 88 | "upper": 2.766464983989443, 89 | "lower": 0.3300620785373187 90 | } 91 | } 92 | }, 93 | { 94 | "key_as_string": "2017-03-27T00:00:00.000Z", 95 | "key": 1490572800000, 96 | "doc_count": 265538, 97 | "some_score": { 98 | "count": 265538, 99 | "min": 0, 100 | "max": 7, 101 | "avg": 1.5432329836031, 102 | "sum": 409787, 103 | "sum_of_squares": 729499, 104 | "variance": 0.36568093963288295, 105 | "std_deviation": 0.6047155857367023, 106 | "std_deviation_bounds": { 107 | "upper": 2.7526641550765047, 108 | "lower": 0.3338018121296955 
109 | } 110 | } 111 | }, 112 | { 113 | "key_as_string": "2017-04-03T00:00:00.000Z", 114 | "key": 1491177600000, 115 | "doc_count": 299502, 116 | "some_score": { 117 | "count": 299502, 118 | "min": 0, 119 | "max": 7, 120 | "avg": 1.539488884882238, 121 | "sum": 461080, 122 | "sum_of_squares": 818386, 123 | "variance": 0.3624632388381306, 124 | "std_deviation": 0.6020491996823271, 125 | "std_deviation_bounds": { 126 | "upper": 2.743587284246892, 127 | "lower": 0.33539048551758377 128 | } 129 | } 130 | }, 131 | { 132 | "key_as_string": "2017-04-10T00:00:00.000Z", 133 | "key": 1491782400000, 134 | "doc_count": 303826, 135 | "some_score": { 136 | "count": 303826, 137 | "min": 0, 138 | "max": 7, 139 | "avg": 1.5399274584795244, 140 | "sum": 467870, 141 | "sum_of_squares": 831860, 142 | "variance": 0.36657211693925107, 143 | "std_deviation": 0.6054519939179746, 144 | "std_deviation_bounds": { 145 | "upper": 2.7508314463154733, 146 | "lower": 0.3290234706435753 147 | } 148 | } 149 | }, 150 | { 151 | "key_as_string": "2017-04-17T00:00:00.000Z", 152 | "key": 1492387200000, 153 | "doc_count": 305400, 154 | "some_score": { 155 | "count": 305400, 156 | "min": 0, 157 | "max": 3, 158 | "avg": 1.5349738048461035, 159 | "sum": 468781, 160 | "sum_of_squares": 829427, 161 | "variance": 0.35972640730333577, 162 | "std_deviation": 0.5997719627519578, 163 | "std_deviation_bounds": { 164 | "upper": 2.734517730350019, 165 | "lower": 0.33542987934218793 166 | } 167 | } 168 | }, 169 | { 170 | "key_as_string": "2017-04-24T00:00:00.000Z", 171 | "key": 1492992000000, 172 | "doc_count": 325883, 173 | "some_score": { 174 | "count": 325883, 175 | "min": 0, 176 | "max": 3, 177 | "avg": 1.506402604615767, 178 | "sum": 490911, 179 | "sum_of_squares": 851439, 180 | "variance": 0.34346495817661304, 181 | "std_deviation": 0.5860588350810975, 182 | "std_deviation_bounds": { 183 | "upper": 2.678520274777962, 184 | "lower": 0.3342849344535721 185 | } 186 | } 187 | }, 188 | { 189 | "key_as_string": "2017-05-01T00:00:00.000Z", 190 | "key": 1493596800000, 191 | "doc_count": 92953, 192 | "some_score": { 193 | "count": 92953, 194 | "min": 0, 195 | "max": 3, 196 | "avg": 1.5381429324497327, 197 | "sum": 142975, 198 | "sum_of_squares": 252321, 199 | "variance": 0.34861719614213066, 200 | "std_deviation": 0.5904381391323994, 201 | "std_deviation_bounds": { 202 | "upper": 2.7190192107145315, 203 | "lower": 0.35726665418493386 204 | } 205 | } 206 | } 207 | ] 208 | } 209 | } 210 | } -------------------------------------------------------------------------------- /test-data/aggs_date_histogram_histogram.json: -------------------------------------------------------------------------------- 1 | { 2 | "took": 64, 3 | "timed_out": false, 4 | "_shards": { 5 | "total": 64, 6 | "successful": 64, 7 | "failed": 0 8 | }, 9 | "hits": { 10 | "total": 2627223, 11 | "max_score": 0, 12 | "hits": [] 13 | }, 14 | "aggregations": { 15 | "report_week": { 16 | "buckets": [ 17 | { 18 | "key_as_string": "2017-02-27T00:00:00.000Z", 19 | "key": 1488153600000, 20 | "doc_count": 201674, 21 | "num_customers": { 22 | "buckets": [ 23 | { 24 | "key": 0, 25 | "doc_count": 96724 26 | }, 27 | { 28 | "key": 2, 29 | "doc_count": 104950 30 | } 31 | ] 32 | } 33 | }, 34 | { 35 | "key_as_string": "2017-03-06T00:00:00.000Z", 36 | "key": 1488758400000, 37 | "doc_count": 295596, 38 | "num_customers": { 39 | "buckets": [ 40 | { 41 | "key": 0, 42 | "doc_count": 141532 43 | }, 44 | { 45 | "key": 2, 46 | "doc_count": 154061 47 | }, 48 | { 49 | "key": 6, 50 | "doc_count": 3 51 | } 52 | ] 
53 | } 54 | }, 55 | { 56 | "key_as_string": "2017-03-13T00:00:00.000Z", 57 | "key": 1489363200000, 58 | "doc_count": 277618, 59 | "num_customers": { 60 | "buckets": [ 61 | { 62 | "key": 0, 63 | "doc_count": 137844 64 | }, 65 | { 66 | "key": 2, 67 | "doc_count": 139770 68 | }, 69 | { 70 | "key": 6, 71 | "doc_count": 4 72 | } 73 | ] 74 | } 75 | }, 76 | { 77 | "key_as_string": "2017-03-20T00:00:00.000Z", 78 | "key": 1489968000000, 79 | "doc_count": 259233, 80 | "num_customers": { 81 | "buckets": [ 82 | { 83 | "key": 0, 84 | "doc_count": 131999 85 | }, 86 | { 87 | "key": 2, 88 | "doc_count": 127233 89 | }, 90 | { 91 | "key": 6, 92 | "doc_count": 1 93 | } 94 | ] 95 | } 96 | }, 97 | { 98 | "key_as_string": "2017-03-27T00:00:00.000Z", 99 | "key": 1490572800000, 100 | "doc_count": 265538, 101 | "num_customers": { 102 | "buckets": [ 103 | { 104 | "key": 0, 105 | "doc_count": 135852 106 | }, 107 | { 108 | "key": 2, 109 | "doc_count": 129683 110 | }, 111 | { 112 | "key": 6, 113 | "doc_count": 3 114 | } 115 | ] 116 | } 117 | }, 118 | { 119 | "key_as_string": "2017-04-03T00:00:00.000Z", 120 | "key": 1491177600000, 121 | "doc_count": 299502, 122 | "num_customers": { 123 | "buckets": [ 124 | { 125 | "key": 0, 126 | "doc_count": 152149 127 | }, 128 | { 129 | "key": 2, 130 | "doc_count": 147352 131 | }, 132 | { 133 | "key": 6, 134 | "doc_count": 1 135 | } 136 | ] 137 | } 138 | }, 139 | { 140 | "key_as_string": "2017-04-10T00:00:00.000Z", 141 | "key": 1491782400000, 142 | "doc_count": 303826, 143 | "num_customers": { 144 | "buckets": [ 145 | { 146 | "key": 0, 147 | "doc_count": 152587 148 | }, 149 | { 150 | "key": 2, 151 | "doc_count": 151237 152 | }, 153 | { 154 | "key": 6, 155 | "doc_count": 2 156 | } 157 | ] 158 | } 159 | }, 160 | { 161 | "key_as_string": "2017-04-17T00:00:00.000Z", 162 | "key": 1492387200000, 163 | "doc_count": 305400, 164 | "num_customers": { 165 | "buckets": [ 166 | { 167 | "key": 0, 168 | "doc_count": 155831 169 | }, 170 | { 171 | "key": 2, 172 | "doc_count": 149569 173 | } 174 | ] 175 | } 176 | }, 177 | { 178 | "key_as_string": "2017-04-24T00:00:00.000Z", 179 | "key": 1492992000000, 180 | "doc_count": 325883, 181 | "num_customers": { 182 | "buckets": [ 183 | { 184 | "key": 0, 185 | "doc_count": 174351 186 | }, 187 | { 188 | "key": 2, 189 | "doc_count": 151532 190 | } 191 | ] 192 | } 193 | }, 194 | { 195 | "key_as_string": "2017-05-01T00:00:00.000Z", 196 | "key": 1493596800000, 197 | "doc_count": 92953, 198 | "num_customers": { 199 | "buckets": [ 200 | { 201 | "key": 0, 202 | "doc_count": 47062 203 | }, 204 | { 205 | "key": 2, 206 | "doc_count": 45891 207 | } 208 | ] 209 | } 210 | } 211 | ] 212 | } 213 | } 214 | } -------------------------------------------------------------------------------- /test-data/aggs_date_histogram_percentiles.json: -------------------------------------------------------------------------------- 1 | { 2 | "took": 793, 3 | "timed_out": false, 4 | "_shards": { 5 | "total": 64, 6 | "successful": 64, 7 | "failed": 0 8 | }, 9 | "hits": { 10 | "total": 2627223, 11 | "max_score": 0, 12 | "hits": [] 13 | }, 14 | "aggregations": { 15 | "report_week": { 16 | "buckets": [ 17 | { 18 | "key_as_string": "2017-02-27T00:00:00.000Z", 19 | "key": 1488153600000, 20 | "doc_count": 201674, 21 | "some_score": { 22 | "values": { 23 | "1.0": -34.088625462317765, 24 | "5.0": -22.21437155815218, 25 | "25.0": 0, 26 | "50.0": 0, 27 | "75.0": 0.9635160402586782, 28 | "95.0": 39.92845350581187, 29 | "99.0": 54.1784650053889 30 | } 31 | } 32 | }, 33 | { 34 | "key_as_string": 
"2017-03-06T00:00:00.000Z", 35 | "key": 1488758400000, 36 | "doc_count": 295596, 37 | "some_score": { 38 | "values": { 39 | "1.0": -34.08282723410825, 40 | "5.0": -22.21305824228935, 41 | "25.0": 0, 42 | "50.0": 0, 43 | "75.0": 11.410704161577858, 44 | "95.0": 39.92881631128357, 45 | "99.0": 54.41136809871141 46 | } 47 | } 48 | }, 49 | { 50 | "key_as_string": "2017-03-13T00:00:00.000Z", 51 | "key": 1489363200000, 52 | "doc_count": 277618, 53 | "some_score": { 54 | "values": { 55 | "1.0": -34.08776755675079, 56 | "5.0": -22.204749690684626, 57 | "25.0": 0, 58 | "50.0": 0, 59 | "75.0": 0, 60 | "95.0": 40.69975507593814, 61 | "99.0": 55.713896441756184 62 | } 63 | } 64 | }, 65 | { 66 | "key_as_string": "2017-03-20T00:00:00.000Z", 67 | "key": 1489968000000, 68 | "doc_count": 259233, 69 | "some_score": { 70 | "values": { 71 | "1.0": -34.090477892822264, 72 | "5.0": -22.183271306999618, 73 | "25.0": 0, 74 | "50.0": 0, 75 | "75.0": 0.0000076293945, 76 | "95.0": 41.17598738972316, 77 | "99.0": 55.81825399052243 78 | } 79 | } 80 | }, 81 | { 82 | "key_as_string": "2017-03-27T00:00:00.000Z", 83 | "key": 1490572800000, 84 | "doc_count": 265538, 85 | "some_score": { 86 | "values": { 87 | "1.0": -33.6658307712262, 88 | "5.0": -22.1560300289784, 89 | "25.0": 0, 90 | "50.0": 0, 91 | "75.0": 0, 92 | "95.0": 40.98001281894075, 93 | "99.0": 53.650719571905 94 | } 95 | } 96 | }, 97 | { 98 | "key_as_string": "2017-04-03T00:00:00.000Z", 99 | "key": 1491177600000, 100 | "doc_count": 299502, 101 | "some_score": { 102 | "values": { 103 | "1.0": -25.785169981452103, 104 | "5.0": -22.198657666424893, 105 | "25.0": 0, 106 | "50.0": 0, 107 | "75.0": 0, 108 | "95.0": 40.17350207009979, 109 | "99.0": 53.226301237661175 110 | } 111 | } 112 | }, 113 | { 114 | "key_as_string": "2017-04-10T00:00:00.000Z", 115 | "key": 1491782400000, 116 | "doc_count": 303826, 117 | "some_score": { 118 | "values": { 119 | "1.0": -23.592878827369006, 120 | "5.0": -22.19212706309159, 121 | "25.0": 0, 122 | "50.0": 0, 123 | "75.0": 0, 124 | "95.0": 40.54370418041331, 125 | "99.0": 52.52955001574485 126 | } 127 | } 128 | }, 129 | { 130 | "key_as_string": "2017-04-17T00:00:00.000Z", 131 | "key": 1492387200000, 132 | "doc_count": 305400, 133 | "some_score": { 134 | "values": { 135 | "1.0": -23.491448460820184, 136 | "5.0": -22.20308940649364, 137 | "25.0": 0, 138 | "50.0": 0, 139 | "75.0": 0, 140 | "95.0": 42.20244370485052, 141 | "99.0": 54.15284094789408 142 | } 143 | } 144 | }, 145 | { 146 | "key_as_string": "2017-04-24T00:00:00.000Z", 147 | "key": 1492992000000, 148 | "doc_count": 325883, 149 | "some_score": { 150 | "values": { 151 | "1.0": -23.50024845031057, 152 | "5.0": -22.20040238272332, 153 | "25.0": 0, 154 | "50.0": 0, 155 | "75.0": 0, 156 | "95.0": 41.52234592261954, 157 | "99.0": 56.313834199795735 158 | } 159 | } 160 | }, 161 | { 162 | "key_as_string": "2017-05-01T00:00:00.000Z", 163 | "key": 1493596800000, 164 | "doc_count": 92953, 165 | "some_score": { 166 | "values": { 167 | "1.0": -24.467312120035082, 168 | "5.0": -22.212801839940852, 169 | "25.0": 0, 170 | "50.0": 0, 171 | "75.0": 0, 172 | "95.0": 41.001332251036345, 173 | "99.0": 55.490378534226195 174 | } 175 | } 176 | } 177 | ] 178 | } 179 | } 180 | } -------------------------------------------------------------------------------- /test-data/aggs_date_histogram_stats.json: -------------------------------------------------------------------------------- 1 | { 2 | "took": 21, 3 | "timed_out": false, 4 | "_shards": { 5 | "total": 64, 6 | "successful": 64, 7 | "failed": 0 8 | }, 
9 | "hits": { 10 | "total": 2627223, 11 | "max_score": 0, 12 | "hits": [] 13 | }, 14 | "aggregations": { 15 | "report_week": { 16 | "buckets": [ 17 | { 18 | "key_as_string": "2017-02-27T00:00:00.000Z", 19 | "key": 1488153600000, 20 | "doc_count": 201674, 21 | "some_score": { 22 | "count": 201674, 23 | "min": 0, 24 | "max": 3, 25 | "avg": 1.572527941132719, 26 | "sum": 317138 27 | } 28 | }, 29 | { 30 | "key_as_string": "2017-03-06T00:00:00.000Z", 31 | "key": 1488758400000, 32 | "doc_count": 295596, 33 | "some_score": { 34 | "count": 295596, 35 | "min": 0, 36 | "max": 7, 37 | "avg": 1.5650110285660157, 38 | "sum": 462611 39 | } 40 | }, 41 | { 42 | "key_as_string": "2017-03-13T00:00:00.000Z", 43 | "key": 1489363200000, 44 | "doc_count": 277618, 45 | "some_score": { 46 | "count": 277618, 47 | "min": 0, 48 | "max": 7, 49 | "avg": 1.5557384607626306, 50 | "sum": 431901 51 | } 52 | }, 53 | { 54 | "key_as_string": "2017-03-20T00:00:00.000Z", 55 | "key": 1489968000000, 56 | "doc_count": 259233, 57 | "some_score": { 58 | "count": 259233, 59 | "min": 0, 60 | "max": 7, 61 | "avg": 1.5482635312633808, 62 | "sum": 401361 63 | } 64 | }, 65 | { 66 | "key_as_string": "2017-03-27T00:00:00.000Z", 67 | "key": 1490572800000, 68 | "doc_count": 265538, 69 | "some_score": { 70 | "count": 265538, 71 | "min": 0, 72 | "max": 7, 73 | "avg": 1.5432329836031, 74 | "sum": 409787 75 | } 76 | }, 77 | { 78 | "key_as_string": "2017-04-03T00:00:00.000Z", 79 | "key": 1491177600000, 80 | "doc_count": 299502, 81 | "some_score": { 82 | "count": 299502, 83 | "min": 0, 84 | "max": 7, 85 | "avg": 1.539488884882238, 86 | "sum": 461080 87 | } 88 | }, 89 | { 90 | "key_as_string": "2017-04-10T00:00:00.000Z", 91 | "key": 1491782400000, 92 | "doc_count": 303826, 93 | "some_score": { 94 | "count": 303826, 95 | "min": 0, 96 | "max": 7, 97 | "avg": 1.5399274584795244, 98 | "sum": 467870 99 | } 100 | }, 101 | { 102 | "key_as_string": "2017-04-17T00:00:00.000Z", 103 | "key": 1492387200000, 104 | "doc_count": 305400, 105 | "some_score": { 106 | "count": 305400, 107 | "min": 0, 108 | "max": 3, 109 | "avg": 1.5349738048461035, 110 | "sum": 468781 111 | } 112 | }, 113 | { 114 | "key_as_string": "2017-04-24T00:00:00.000Z", 115 | "key": 1492992000000, 116 | "doc_count": 325883, 117 | "some_score": { 118 | "count": 325883, 119 | "min": 0, 120 | "max": 3, 121 | "avg": 1.506402604615767, 122 | "sum": 490911 123 | } 124 | }, 125 | { 126 | "key_as_string": "2017-05-01T00:00:00.000Z", 127 | "key": 1493596800000, 128 | "doc_count": 92953, 129 | "some_score": { 130 | "count": 92953, 131 | "min": 0, 132 | "max": 3, 133 | "avg": 1.5381429324497327, 134 | "sum": 142975 135 | } 136 | } 137 | ] 138 | } 139 | } 140 | } -------------------------------------------------------------------------------- /test-data/aggs_date_histogram_terms.json: -------------------------------------------------------------------------------- 1 | { 2 | "took": 44, 3 | "timed_out": false, 4 | "_shards": { 5 | "total": 64, 6 | "successful": 64, 7 | "failed": 0 8 | }, 9 | "hits": { 10 | "total": 103069, 11 | "max_score": 0, 12 | "hits": [] 13 | }, 14 | "aggregations": { 15 | "report_week": { 16 | "buckets": [ 17 | { 18 | "key_as_string": "2017-02-27T00:00:00.000Z", 19 | "key": 1488153600000, 20 | "doc_count": 7838, 21 | "theater_number": { 22 | "doc_count_error_upper_bound": 0, 23 | "sum_other_doc_count": 0, 24 | "buckets": [ 25 | { 26 | "key": 0, 27 | "doc_count": 5273 28 | }, 29 | { 30 | "key": 1, 31 | "doc_count": 2055 32 | }, 33 | { 34 | "key": 3, 35 | "doc_count": 510 36 | } 37 | ] 
38 | } 39 | }, 40 | { 41 | "key_as_string": "2017-03-06T00:00:00.000Z", 42 | "key": 1488758400000, 43 | "doc_count": 11608, 44 | "theater_number": { 45 | "doc_count_error_upper_bound": 0, 46 | "sum_other_doc_count": 0, 47 | "buckets": [ 48 | { 49 | "key": 0, 50 | "doc_count": 8025 51 | }, 52 | { 53 | "key": 1, 54 | "doc_count": 2931 55 | }, 56 | { 57 | "key": 3, 58 | "doc_count": 652 59 | } 60 | ] 61 | } 62 | }, 63 | { 64 | "key_as_string": "2017-03-13T00:00:00.000Z", 65 | "key": 1489363200000, 66 | "doc_count": 12043, 67 | "theater_number": { 68 | "doc_count_error_upper_bound": 0, 69 | "sum_other_doc_count": 0, 70 | "buckets": [ 71 | { 72 | "key": 0, 73 | "doc_count": 8306 74 | }, 75 | { 76 | "key": 1, 77 | "doc_count": 3009 78 | }, 79 | { 80 | "key": 3, 81 | "doc_count": 728 82 | } 83 | ] 84 | } 85 | }, 86 | { 87 | "key_as_string": "2017-03-20T00:00:00.000Z", 88 | "key": 1489968000000, 89 | "doc_count": 11918, 90 | "theater_number": { 91 | "doc_count_error_upper_bound": 0, 92 | "sum_other_doc_count": 0, 93 | "buckets": [ 94 | { 95 | "key": 0, 96 | "doc_count": 8118 97 | }, 98 | { 99 | "key": 1, 100 | "doc_count": 3098 101 | }, 102 | { 103 | "key": 3, 104 | "doc_count": 700 105 | }, 106 | { 107 | "key": 2, 108 | "doc_count": 2 109 | } 110 | ] 111 | } 112 | }, 113 | { 114 | "key_as_string": "2017-03-27T00:00:00.000Z", 115 | "key": 1490572800000, 116 | "doc_count": 11580, 117 | "theater_number": { 118 | "doc_count_error_upper_bound": 0, 119 | "sum_other_doc_count": 0, 120 | "buckets": [ 121 | { 122 | "key": 0, 123 | "doc_count": 8126 124 | }, 125 | { 126 | "key": 1, 127 | "doc_count": 2834 128 | }, 129 | { 130 | "key": 3, 131 | "doc_count": 619 132 | } 133 | ] 134 | } 135 | }, 136 | { 137 | "key_as_string": "2017-04-03T00:00:00.000Z", 138 | "key": 1491177600000, 139 | "doc_count": 11404, 140 | "theater_number": { 141 | "doc_count_error_upper_bound": 0, 142 | "sum_other_doc_count": 0, 143 | "buckets": [ 144 | { 145 | "key": 0, 146 | "doc_count": 7976 147 | }, 148 | { 149 | "key": 1, 150 | "doc_count": 2753 151 | }, 152 | { 153 | "key": 3, 154 | "doc_count": 675 155 | } 156 | ] 157 | } 158 | }, 159 | { 160 | "key_as_string": "2017-04-10T00:00:00.000Z", 161 | "key": 1491782400000, 162 | "doc_count": 10583, 163 | "theater_number": { 164 | "doc_count_error_upper_bound": 0, 165 | "sum_other_doc_count": 0, 166 | "buckets": [ 167 | { 168 | "key": 0, 169 | "doc_count": 7267 170 | }, 171 | { 172 | "key": 1, 173 | "doc_count": 2706 174 | }, 175 | { 176 | "key": 3, 177 | "doc_count": 610 178 | } 179 | ] 180 | } 181 | }, 182 | { 183 | "key_as_string": "2017-04-17T00:00:00.000Z", 184 | "key": 1492387200000, 185 | "doc_count": 11358, 186 | "theater_number": { 187 | "doc_count_error_upper_bound": 0, 188 | "sum_other_doc_count": 0, 189 | "buckets": [ 190 | { 191 | "key": 0, 192 | "doc_count": 7916 193 | }, 194 | { 195 | "key": 1, 196 | "doc_count": 2756 197 | }, 198 | { 199 | "key": 3, 200 | "doc_count": 686 201 | } 202 | ] 203 | } 204 | }, 205 | { 206 | "key_as_string": "2017-04-24T00:00:00.000Z", 207 | "key": 1492992000000, 208 | "doc_count": 11303, 209 | "theater_number": { 210 | "doc_count_error_upper_bound": 0, 211 | "sum_other_doc_count": 0, 212 | "buckets": [ 213 | { 214 | "key": 0, 215 | "doc_count": 7864 216 | }, 217 | { 218 | "key": 1, 219 | "doc_count": 2812 220 | }, 221 | { 222 | "key": 3, 223 | "doc_count": 614 224 | } 225 | ] 226 | } 227 | }, 228 | { 229 | "key_as_string": "2017-05-01T00:00:00.000Z", 230 | "key": 1493596800000, 231 | "doc_count": 3434, 232 | "theater_number": { 233 | 
"doc_count_error_upper_bound": 0, 234 | "sum_other_doc_count": 0, 235 | "buckets": [ 236 | { 237 | "key": 0, 238 | "doc_count": 2436 239 | }, 240 | { 241 | "key": 1, 242 | "doc_count": 810 243 | }, 244 | { 245 | "key": 3, 246 | "doc_count": 188 247 | } 248 | ] 249 | } 250 | } 251 | ] 252 | } 253 | } 254 | } -------------------------------------------------------------------------------- /test-data/aggs_extended_stats.json: -------------------------------------------------------------------------------- 1 | { 2 | "took": 194, 3 | "timed_out": false, 4 | "_shards": { 5 | "total": 64, 6 | "successful": 64, 7 | "failed": 0 8 | }, 9 | "hits": { 10 | "total": 92958, 11 | "max_score": 0, 12 | "hits": [] 13 | }, 14 | "aggregations": { 15 | "affinity_score": { 16 | "count": 59068, 17 | "min": -37.73445, 18 | "max": 70.62504577636719, 19 | "avg": 1.6455430412652865, 20 | "sum": 97198.93636145795, 21 | "sum_of_squares": 21853691.855166968, 22 | "variance": 367.26733293524387, 23 | "std_deviation": 19.164220123324714, 24 | "std_deviation_bounds": { 25 | "upper": 39.97398328791471, 26 | "lower": -36.682897205384144 27 | } 28 | } 29 | } 30 | } -------------------------------------------------------------------------------- /test-data/aggs_histogram.json: -------------------------------------------------------------------------------- 1 | { 2 | "took": 36, 3 | "timed_out": false, 4 | "_shards": { 5 | "total": 64, 6 | "successful": 64, 7 | "failed": 0 8 | }, 9 | "hits": { 10 | "total": 2627232, 11 | "max_score": 0, 12 | "hits": [] 13 | }, 14 | "aggregations": { 15 | "affinity_score": { 16 | "buckets": [ 17 | { 18 | "key": -50, 19 | "doc_count": 21470 20 | }, 21 | { 22 | "key": -25, 23 | "doc_count": 331525 24 | }, 25 | { 26 | "key": 0, 27 | "doc_count": 1096008 28 | }, 29 | { 30 | "key": 25, 31 | "doc_count": 263747 32 | }, 33 | { 34 | "key": 50, 35 | "doc_count": 27445 36 | } 37 | ] 38 | } 39 | } 40 | } -------------------------------------------------------------------------------- /test-data/aggs_percentiles.json: -------------------------------------------------------------------------------- 1 | { 2 | "took": 76, 3 | "timed_out": false, 4 | "_shards": { 5 | "total": 64, 6 | "successful": 64, 7 | "failed": 0 8 | }, 9 | "hits": { 10 | "total": 92958, 11 | "max_score": 0, 12 | "hits": [] 13 | }, 14 | "aggregations": { 15 | "affinity_score": { 16 | "values": { 17 | "1.0": -24.4674287519375, 18 | "5.0": -22.212802690289852, 19 | "25.0": 0, 20 | "50.0": 0, 21 | "65.489756": 0, 22 | "75.0": 0, 23 | "95.0": 40.997696236818356, 24 | "99.0": 55.490141729049355 25 | } 26 | } 27 | } 28 | } -------------------------------------------------------------------------------- /test-data/aggs_significant_terms.json: -------------------------------------------------------------------------------- 1 | { 2 | "took": 343, 3 | "timed_out": false, 4 | "_shards": { 5 | "total": 64, 6 | "successful": 64, 7 | "failed": 0 8 | }, 9 | "hits": { 10 | "total": 103104, 11 | "max_score": 0, 12 | "hits": [] 13 | }, 14 | "aggregations": { 15 | "top_tweet_keywords": { 16 | "doc_count": 103104, 17 | "buckets": [ 18 | { 19 | "key": "no", 20 | "doc_count": 72807, 21 | "score": 137.9584061112563, 22 | "bg_count": 384901 23 | }, 24 | { 25 | "key": "cont", 26 | "doc_count": 66740, 27 | "score": 135.87842669458297, 28 | "bg_count": 328493 29 | }, 30 | { 31 | "key": "sa", 32 | "doc_count": 64397, 33 | "score": 125.67996557134086, 34 | "bg_count": 330583 35 | }, 36 | { 37 | "key": "norm", 38 | "doc_count": 65314, 39 | "score": 125.59086038715985, 
40 | "bg_count": 340281 41 | }, 42 | { 43 | "key": "nor", 44 | "doc_count": 65314, 45 | "score": 125.58381289257261, 46 | "bg_count": 340300 47 | } 48 | ] 49 | } 50 | } 51 | } -------------------------------------------------------------------------------- /test-data/aggs_stats.json: -------------------------------------------------------------------------------- 1 | { 2 | "took": 137, 3 | "timed_out": false, 4 | "_shards": { 5 | "total": 64, 6 | "successful": 64, 7 | "failed": 0 8 | }, 9 | "hits": { 10 | "total": 92958, 11 | "max_score": 0, 12 | "hits": [] 13 | }, 14 | "aggregations": { 15 | "affinity_score": { 16 | "count": 59068, 17 | "min": -37.73445, 18 | "max": 70.62504577636719, 19 | "avg": 1.6455430412652863, 20 | "sum": 97198.93636145793 21 | } 22 | } 23 | } -------------------------------------------------------------------------------- /test-data/aggs_terms.json: -------------------------------------------------------------------------------- 1 | { 2 | "took": 17, 3 | "timed_out": false, 4 | "_shards": { 5 | "total": 64, 6 | "successful": 64, 7 | "failed": 0 8 | }, 9 | "hits": { 10 | "total": 120468, 11 | "max_score": 0, 12 | "hits": [] 13 | }, 14 | "aggregations": { 15 | "magic_number": { 16 | "doc_count_error_upper_bound": 0, 17 | "sum_other_doc_count": 2988, 18 | "buckets": [ 19 | { 20 | "key": 3, 21 | "doc_count": 24996 22 | }, 23 | { 24 | "key": 9, 25 | "doc_count": 22329 26 | }, 27 | { 28 | "key": 19, 29 | "doc_count": 21830 30 | }, 31 | { 32 | "key": 8, 33 | "doc_count": 13440 34 | }, 35 | { 36 | "key": 5, 37 | "doc_count": 11663 38 | }, 39 | { 40 | "key": 2, 41 | "doc_count": 9896 42 | }, 43 | { 44 | "key": 4, 45 | "doc_count": 6860 46 | }, 47 | { 48 | "key": 1, 49 | "doc_count": 3676 50 | }, 51 | { 52 | "key": 14, 53 | "doc_count": 1561 54 | }, 55 | { 56 | "key": 6, 57 | "doc_count": 1229 58 | } 59 | ] 60 | } 61 | } 62 | } -------------------------------------------------------------------------------- /test-data/aggs_terms_cardinality.json: -------------------------------------------------------------------------------- 1 | { 2 | "took": 5, 3 | "timed_out": false, 4 | "_shards": { 5 | "total": 64, 6 | "successful": 64, 7 | "failed": 0 8 | }, 9 | "hits": { 10 | "total": 120468, 11 | "max_score": 0, 12 | "hits": [] 13 | }, 14 | "aggregations": { 15 | "customerNumber": { 16 | "doc_count_error_upper_bound": 0, 17 | "sum_other_doc_count": 51313, 18 | "buckets": [ 19 | { 20 | "key": 3, 21 | "doc_count": 24996, 22 | "purchase_types": { 23 | "value": 4 24 | } 25 | }, 26 | { 27 | "key": 9, 28 | "doc_count": 22329, 29 | "purchase_types": { 30 | "value": 4 31 | } 32 | }, 33 | { 34 | "key": 19, 35 | "doc_count": 21830, 36 | "purchase_types": { 37 | "value": 2 38 | } 39 | } 40 | ] 41 | } 42 | } 43 | } -------------------------------------------------------------------------------- /test-data/aggs_terms_date_histogram.json: -------------------------------------------------------------------------------- 1 | { 2 | "took": 85, 3 | "timed_out": false, 4 | "_shards": { 5 | "total": 64, 6 | "successful": 64, 7 | "failed": 0 8 | }, 9 | "hits": { 10 | "total": 3299133, 11 | "max_score": 0, 12 | "hits": [] 13 | }, 14 | "aggregations": { 15 | "customerNumber": { 16 | "doc_count_error_upper_bound": 0, 17 | "sum_other_doc_count": 1610025, 18 | "buckets": [ 19 | { 20 | "key": 3, 21 | "doc_count": 635876, 22 | "purchase_date": { 23 | "buckets": [ 24 | { 25 | "key_as_string": "2017-02-27T00:00:00.000Z", 26 | "key": 1488153600000, 27 | "doc_count": 40944 28 | }, 29 | { 30 | "key_as_string": 
"2017-03-06T00:00:00.000Z", 31 | "key": 1488758400000, 32 | "doc_count": 67822 33 | }, 34 | { 35 | "key_as_string": "2017-03-13T00:00:00.000Z", 36 | "key": 1489363200000, 37 | "doc_count": 69171 38 | }, 39 | { 40 | "key_as_string": "2017-03-20T00:00:00.000Z", 41 | "key": 1489968000000, 42 | "doc_count": 62926 43 | }, 44 | { 45 | "key_as_string": "2017-03-27T00:00:00.000Z", 46 | "key": 1490572800000, 47 | "doc_count": 70643 48 | }, 49 | { 50 | "key_as_string": "2017-04-03T00:00:00.000Z", 51 | "key": 1491177600000, 52 | "doc_count": 76538 53 | }, 54 | { 55 | "key_as_string": "2017-04-10T00:00:00.000Z", 56 | "key": 1491782400000, 57 | "doc_count": 80135 58 | }, 59 | { 60 | "key_as_string": "2017-04-17T00:00:00.000Z", 61 | "key": 1492387200000, 62 | "doc_count": 72677 63 | }, 64 | { 65 | "key_as_string": "2017-04-24T00:00:00.000Z", 66 | "key": 1492992000000, 67 | "doc_count": 70024 68 | }, 69 | { 70 | "key_as_string": "2017-05-01T00:00:00.000Z", 71 | "key": 1493596800000, 72 | "doc_count": 24996 73 | } 74 | ] 75 | } 76 | }, 77 | { 78 | "key": 5, 79 | "doc_count": 529046, 80 | "purchase_date": { 81 | "buckets": [ 82 | { 83 | "key_as_string": "2017-02-27T00:00:00.000Z", 84 | "key": 1488153600000, 85 | "doc_count": 41429 86 | }, 87 | { 88 | "key_as_string": "2017-03-06T00:00:00.000Z", 89 | "key": 1488758400000, 90 | "doc_count": 60928 91 | }, 92 | { 93 | "key_as_string": "2017-03-13T00:00:00.000Z", 94 | "key": 1489363200000, 95 | "doc_count": 65796 96 | }, 97 | { 98 | "key_as_string": "2017-03-20T00:00:00.000Z", 99 | "key": 1489968000000, 100 | "doc_count": 63584 101 | }, 102 | { 103 | "key_as_string": "2017-03-27T00:00:00.000Z", 104 | "key": 1490572800000, 105 | "doc_count": 60740 106 | }, 107 | { 108 | "key_as_string": "2017-04-03T00:00:00.000Z", 109 | "key": 1491177600000, 110 | "doc_count": 81163 111 | }, 112 | { 113 | "key_as_string": "2017-04-10T00:00:00.000Z", 114 | "key": 1491782400000, 115 | "doc_count": 65028 116 | }, 117 | { 118 | "key_as_string": "2017-04-17T00:00:00.000Z", 119 | "key": 1492387200000, 120 | "doc_count": 40006 121 | }, 122 | { 123 | "key_as_string": "2017-04-24T00:00:00.000Z", 124 | "key": 1492992000000, 125 | "doc_count": 38709 126 | }, 127 | { 128 | "key_as_string": "2017-05-01T00:00:00.000Z", 129 | "key": 1493596800000, 130 | "doc_count": 11663 131 | } 132 | ] 133 | } 134 | }, 135 | { 136 | "key": 19, 137 | "doc_count": 524186, 138 | "purchase_date": { 139 | "buckets": [ 140 | { 141 | "key_as_string": "2017-02-27T00:00:00.000Z", 142 | "key": 1488153600000, 143 | "doc_count": 49385 144 | }, 145 | { 146 | "key_as_string": "2017-03-06T00:00:00.000Z", 147 | "key": 1488758400000, 148 | "doc_count": 42337 149 | }, 150 | { 151 | "key_as_string": "2017-03-13T00:00:00.000Z", 152 | "key": 1489363200000, 153 | "doc_count": 48440 154 | }, 155 | { 156 | "key_as_string": "2017-03-20T00:00:00.000Z", 157 | "key": 1489968000000, 158 | "doc_count": 29970 159 | }, 160 | { 161 | "key_as_string": "2017-03-27T00:00:00.000Z", 162 | "key": 1490572800000, 163 | "doc_count": 37824 164 | }, 165 | { 166 | "key_as_string": "2017-04-03T00:00:00.000Z", 167 | "key": 1491177600000, 168 | "doc_count": 94017 169 | }, 170 | { 171 | "key_as_string": "2017-04-10T00:00:00.000Z", 172 | "key": 1491782400000, 173 | "doc_count": 79809 174 | }, 175 | { 176 | "key_as_string": "2017-04-17T00:00:00.000Z", 177 | "key": 1492387200000, 178 | "doc_count": 47820 179 | }, 180 | { 181 | "key_as_string": "2017-04-24T00:00:00.000Z", 182 | "key": 1492992000000, 183 | "doc_count": 72754 184 | }, 185 | { 186 | 
"key_as_string": "2017-05-01T00:00:00.000Z", 187 | "key": 1493596800000, 188 | "doc_count": 21830 189 | } 190 | ] 191 | } 192 | } 193 | ] 194 | } 195 | } 196 | } -------------------------------------------------------------------------------- /test-data/aggs_terms_extended_stats.json: -------------------------------------------------------------------------------- 1 | { 2 | "took": 418, 3 | "timed_out": false, 4 | "_shards": { 5 | "total": 64, 6 | "successful": 64, 7 | "failed": 0 8 | }, 9 | "hits": { 10 | "total": 120468, 11 | "max_score": 0, 12 | "hits": [] 13 | }, 14 | "aggregations": { 15 | "campaign_status": { 16 | "doc_count_error_upper_bound": 0, 17 | "sum_other_doc_count": 55, 18 | "buckets": [ 19 | { 20 | "key": "market_to", 21 | "doc_count": 107736, 22 | "some_score": { 23 | "count": 74717, 24 | "min": -175.1807861328125, 25 | "max": 151.98361, 26 | "avg": 4.2240326056588025, 27 | "sum": 315607.04419700877, 28 | "sum_of_squares": 187165555.7172291, 29 | "variance": 2487.150464713055, 30 | "std_deviation": 49.87133911088667, 31 | "std_deviation_bounds": { 32 | "upper": 103.96671082743215, 33 | "lower": -95.51864561611454 34 | } 35 | } 36 | }, 37 | { 38 | "key": "maybe", 39 | "doc_count": 10548, 40 | "some_score": { 41 | "count": 10456, 42 | "min": -90.16599, 43 | "max": 148.19164, 44 | "avg": 117.63368726205155, 45 | "sum": 1229977.834012011, 46 | "sum_of_squares": 169895700.71152127, 47 | "variance": 2410.9480533757473, 48 | "std_deviation": 49.10140581873137, 49 | "std_deviation_bounds": { 50 | "upper": 215.8364988995143, 51 | "lower": 19.430875624588808 52 | } 53 | } 54 | }, 55 | { 56 | "key": "ignore", 57 | "doc_count": 2129, 58 | "some_score": { 59 | "count": 2127, 60 | "min": -90.16599, 61 | "max": 148.35457, 62 | "avg": 75.17809260768585, 63 | "sum": 159903.80297654783, 64 | "sum_of_squares": 24416683.030253433, 65 | "variance": 5827.654029977135, 66 | "std_deviation": 76.33907276078965, 67 | "std_deviation_bounds": { 68 | "upper": 227.85623812926514, 69 | "lower": -77.50005291389344 70 | } 71 | } 72 | } 73 | ] 74 | } 75 | } 76 | } -------------------------------------------------------------------------------- /test-data/aggs_terms_histogram.json: -------------------------------------------------------------------------------- 1 | { 2 | "took": 295, 3 | "timed_out": false, 4 | "_shards": { 5 | "total": 64, 6 | "successful": 64, 7 | "failed": 0 8 | }, 9 | "hits": { 10 | "total": 120468, 11 | "max_score": 0, 12 | "hits": [] 13 | }, 14 | "aggregations": { 15 | "campaign_status": { 16 | "doc_count_error_upper_bound": 0, 17 | "sum_other_doc_count": 55, 18 | "buckets": [ 19 | { 20 | "key": "ignore", 21 | "doc_count": 107736, 22 | "affinity_score": { 23 | "buckets": [ 24 | { 25 | "key": -50, 26 | "doc_count": 5262 27 | }, 28 | { 29 | "key": 0, 30 | "doc_count": 66695 31 | }, 32 | { 33 | "key": 50, 34 | "doc_count": 2760 35 | } 36 | ] 37 | } 38 | }, 39 | { 40 | "key": "maybe", 41 | "doc_count": 10548, 42 | "affinity_score": { 43 | "buckets": [ 44 | { 45 | "key": -50, 46 | "doc_count": 9099 47 | }, 48 | { 49 | "key": 0, 50 | "doc_count": 1357 51 | } 52 | ] 53 | } 54 | }, 55 | { 56 | "key": "market_to", 57 | "doc_count": 2129, 58 | "affinity_score": { 59 | "buckets": [ 60 | { 61 | "key": -50, 62 | "doc_count": 1097 63 | }, 64 | { 65 | "key": 0, 66 | "doc_count": 1030 67 | } 68 | ] 69 | } 70 | } 71 | ] 72 | } 73 | } 74 | } -------------------------------------------------------------------------------- /test-data/aggs_terms_percentiles.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "took": 142, 3 | "timed_out": false, 4 | "_shards": { 5 | "total": 64, 6 | "successful": 64, 7 | "failed": 0 8 | }, 9 | "hits": { 10 | "total": 120468, 11 | "max_score": 0, 12 | "hits": [] 13 | }, 14 | "aggregations": { 15 | "campaign_status": { 16 | "doc_count_error_upper_bound": 0, 17 | "sum_other_doc_count": 55, 18 | "buckets": [ 19 | { 20 | "key": "maybe", 21 | "doc_count": 107736, 22 | "some_score": { 23 | "values": { 24 | "1.0": -112.12122535844742, 25 | "5.0": -86.76868939171038, 26 | "25.0": 0, 27 | "50.0": 0, 28 | "60.58934": 0, 29 | "75.0": 0, 30 | "95.0": 117.86637213803532, 31 | "99.0": 129.32387561889348 32 | } 33 | } 34 | }, 35 | { 36 | "key": "ignore", 37 | "doc_count": 10548, 38 | "some_score": { 39 | "values": { 40 | "1.0": 0, 41 | "5.0": 0, 42 | "25.0": 117.86835624469992, 43 | "50.0": 148.06826928571428, 44 | "60.58934": 148.0793833809623, 45 | "75.0": 148.09198967492816, 46 | "95.0": 148.10842188873627, 47 | "99.0": 148.1524185 48 | } 49 | } 50 | }, 51 | { 52 | "key": "market_to", 53 | "doc_count": 2129, 54 | "some_score": { 55 | "values": { 56 | "1.0": -90.1644744, 57 | "5.0": 0, 58 | "25.0": 0, 59 | "50.0": 148.1146593939394, 60 | "60.58934": 148.15548840481475, 61 | "75.0": 148.1812391941392, 62 | "95.0": 148.31047099999998, 63 | "99.0": 148.3409206 64 | } 65 | } 66 | } 67 | ] 68 | } 69 | } 70 | } -------------------------------------------------------------------------------- /test-data/aggs_terms_significant_terms.json: -------------------------------------------------------------------------------- 1 | { 2 | "took": 236, 3 | "timed_out": false, 4 | "_shards": { 5 | "total": 64, 6 | "successful": 64, 7 | "failed": 0 8 | }, 9 | "hits": { 10 | "total": 3433, 11 | "max_score": 0, 12 | "hits": [] 13 | }, 14 | "aggregations": { 15 | "popularity_score": { 16 | "doc_count_error_upper_bound": 11, 17 | "sum_other_doc_count": 2625, 18 | "buckets": [ 19 | { 20 | "key": "summaries", 21 | "doc_count": 388, 22 | "comment_term": { 23 | "doc_count": 388, 24 | "buckets": [ 25 | { 26 | "key": "suggeste", 27 | "doc_count": 375, 28 | "score": 4473.743549520632, 29 | "bg_count": 22318 30 | }, 31 | { 32 | "key": "suggested", 33 | "doc_count": 375, 34 | "score": 4473.743549520632, 35 | "bg_count": 22318 36 | }, 37 | { 38 | "key": "strong", 39 | "doc_count": 376, 40 | "score": 4065.286271318061, 41 | "bg_count": 24691 42 | }, 43 | { 44 | "key": "stron", 45 | "doc_count": 376, 46 | "score": 4065.286271318061, 47 | "bg_count": 24691 48 | }, 49 | { 50 | "key": "stro", 51 | "doc_count": 376, 52 | "score": 4064.133797943521, 53 | "bg_count": 24698 54 | }, 55 | { 56 | "key": "heavy", 57 | "doc_count": 376, 58 | "score": 3803.2106294847563, 59 | "bg_count": 26392 60 | }, 61 | { 62 | "key": "action", 63 | "doc_count": 367, 64 | "score": 2914.3560174102226, 65 | "bg_count": 32810 66 | }, 67 | { 68 | "key": "actio", 69 | "doc_count": 367, 70 | "score": 2914.3560174102226, 71 | "bg_count": 32810 72 | }, 73 | { 74 | "key": "gorgeous", 75 | "doc_count": 4, 76 | "score": 2840.6390424062065, 77 | "bg_count": 4 78 | }, 79 | { 80 | "key": "suggest", 81 | "doc_count": 371, 82 | "score": 2748.930000046098, 83 | "bg_count": 35546 84 | } 85 | ] 86 | } 87 | }, 88 | { 89 | "key": "opinion", 90 | "doc_count": 230, 91 | "comment_term": { 92 | "doc_count": 230, 93 | "buckets": [ 94 | { 95 | "key": "check", 96 | "doc_count": 3, 97 | "score": 1957.5013232514177, 98 | "bg_count": 9 99 | }, 100 | { 101 | "key": "sealer", 102 | 
"doc_count": 6, 103 | "score": 1531.941678310183, 104 | "bg_count": 46 105 | }, 106 | { 107 | "key": "scrape", 108 | "doc_count": 6, 109 | "score": 1531.941678310183, 110 | "bg_count": 46 111 | }, 112 | { 113 | "key": "splines", 114 | "doc_count": 26, 115 | "score": 1341.9556298577054, 116 | "bg_count": 986 117 | }, 118 | { 119 | "key": "doesn'", 120 | "doc_count": 6, 121 | "score": 1304.9834908632638, 122 | "bg_count": 54 123 | }, 124 | { 125 | "key": "doesn't", 126 | "doc_count": 6, 127 | "score": 1304.9834908632638, 128 | "bg_count": 54 129 | }, 130 | { 131 | "key": "love", 132 | "doc_count": 26, 133 | "score": 1292.1523001831285, 134 | "bg_count": 1024 135 | }, 136 | { 137 | "key": "miles", 138 | "doc_count": 17, 139 | "score": 1240.5437448180944, 140 | "bg_count": 456 141 | }, 142 | { 143 | "key": "mile", 144 | "doc_count": 17, 145 | "score": 1240.5437448180944, 146 | "bg_count": 456 147 | }, 148 | { 149 | "key": "doesn", 150 | "doc_count": 6, 151 | "score": 1194.3894588446378, 152 | "bg_count": 59 153 | } 154 | ] 155 | } 156 | }, 157 | { 158 | "key": "reviews", 159 | "doc_count": 190, 160 | "comment_term": { 161 | "doc_count": 190, 162 | "buckets": [ 163 | { 164 | "key": "hey", 165 | "doc_count": 13, 166 | "score": 16157.088567867037, 167 | "bg_count": 30 168 | }, 169 | { 170 | "key": "whoa", 171 | "doc_count": 6, 172 | "score": 14750.373787099328, 173 | "bg_count": 7 174 | }, 175 | { 176 | "key": "only", 177 | "doc_count": 14, 178 | "score": 10410.191831332715, 179 | "bg_count": 54 180 | }, 181 | { 182 | "key": "no", 183 | "doc_count": 14, 184 | "score": 10410.191831332715, 185 | "bg_count": 54 186 | }, 187 | { 188 | "key": "not", 189 | "doc_count": 5, 190 | "score": 10243.310743965174, 191 | "bg_count": 7 192 | }, 193 | { 194 | "key": "first", 195 | "doc_count": 3, 196 | "score": 8604.387340720223, 197 | "bg_count": 3 198 | }, 199 | { 200 | "key": "fly", 201 | "doc_count": 4, 202 | "score": 2415.2500014579387, 203 | "bg_count": 19 204 | }, 205 | { 206 | "key": "sizes", 207 | "doc_count": 4, 208 | "score": 2415.2500014579387, 209 | "bg_count": 19 210 | }, 211 | { 212 | "key": "unacc", 213 | "doc_count": 9, 214 | "score": 2370.5534939793097, 215 | "bg_count": 98 216 | }, 217 | { 218 | "key": "unacce", 219 | "doc_count": 9, 220 | "score": 2370.5534939793097, 221 | "bg_count": 98 222 | } 223 | ] 224 | } 225 | } 226 | ] 227 | } 228 | } 229 | } -------------------------------------------------------------------------------- /test-data/aggs_terms_stats.json: -------------------------------------------------------------------------------- 1 | { 2 | "took": 4, 3 | "timed_out": false, 4 | "_shards": { 5 | "total": 64, 6 | "successful": 64, 7 | "failed": 0 8 | }, 9 | "hits": { 10 | "total": 120468, 11 | "max_score": 0, 12 | "hits": [] 13 | }, 14 | "aggregations": { 15 | "customerNumber": { 16 | "doc_count_error_upper_bound": 0, 17 | "sum_other_doc_count": 51313, 18 | "buckets": [ 19 | { 20 | "key": 3, 21 | "doc_count": 24996, 22 | "some_score": { 23 | "count": 24996, 24 | "min": 0, 25 | "max": 2, 26 | "avg": 0.06052968474955993, 27 | "sum": 1513 28 | } 29 | }, 30 | { 31 | "key": 9, 32 | "doc_count": 22329, 33 | "some_score": { 34 | "count": 22329, 35 | "min": 0, 36 | "max": 1, 37 | "avg": 0.009807873169420932, 38 | "sum": 219 39 | } 40 | }, 41 | { 42 | "key": 19, 43 | "doc_count": 21830, 44 | "some_score": { 45 | "count": 21830, 46 | "min": 0, 47 | "max": 0, 48 | "avg": 0, 49 | "sum": 0 50 | } 51 | } 52 | ] 53 | } 54 | } 55 | } 
-------------------------------------------------------------------------------- /test-data/aggs_terms_terms.json: -------------------------------------------------------------------------------- 1 | { 2 | "took": 9, 3 | "timed_out": false, 4 | "_shards": { 5 | "total": 5, 6 | "successful": 5, 7 | "failed": 0 8 | }, 9 | "hits": { 10 | "total": 120468, 11 | "max_score": 0, 12 | "hits": [] 13 | }, 14 | "aggregations": { 15 | "magic_number": { 16 | "doc_count_error_upper_bound": 0, 17 | "sum_other_doc_count": 51313, 18 | "buckets": [ 19 | { 20 | "key": 3, 21 | "doc_count": 24996, 22 | "customerType": { 23 | "doc_count_error_upper_bound": 0, 24 | "sum_other_doc_count": 0, 25 | "buckets": [ 26 | { 27 | "key": "type_a", 28 | "doc_count": 24996 29 | } 30 | ] 31 | } 32 | }, 33 | { 34 | "key": 9, 35 | "doc_count": 22329, 36 | "customerType": { 37 | "doc_count_error_upper_bound": 0, 38 | "sum_other_doc_count": 0, 39 | "buckets": [ 40 | { 41 | "key": "type_a", 42 | "doc_count": 22329 43 | } 44 | ] 45 | } 46 | }, 47 | { 48 | "key": 19, 49 | "doc_count": 21830, 50 | "customerType": { 51 | "doc_count_error_upper_bound": 0, 52 | "sum_other_doc_count": 0, 53 | "buckets": [ 54 | { 55 | "key": "type_a", 56 | "doc_count": 21830 57 | } 58 | ] 59 | } 60 | } 61 | ] 62 | } 63 | } 64 | } -------------------------------------------------------------------------------- /test-data/empty_terms.json: -------------------------------------------------------------------------------- 1 | { 2 | "took": 15, 3 | "timed_out": false, 4 | "_shards": { 5 | "total": 5, 6 | "successful": 5, 7 | "skipped": 0, 8 | "failed": 0 9 | }, 10 | "hits": { 11 | "total": 48, 12 | "max_score": 0.0, 13 | "hits": [] 14 | }, 15 | "aggregations": { 16 | "blegh": { 17 | "doc_count_error_upper_bound": 0, 18 | "sum_other_doc_count": 0, 19 | "buckets": [] 20 | } 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /test-data/es5_shakespeare_mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "mappings": { 3 | "_default_": { 4 | "properties": { 5 | "speaker": { 6 | "type": "string", 7 | "fielddata": true 8 | }, 9 | "play_name": { 10 | "type": "string", 11 | "fielddata": true 12 | }, 13 | "line_id": { 14 | "type": "integer" 15 | }, 16 | "line_number": { 17 | "type": "string", 18 | "fielddata": true 19 | }, 20 | "speech_number": { 21 | "type": "integer" 22 | }, 23 | "text_entry": { 24 | "type": "string", 25 | "fielddata": true 26 | } 27 | } 28 | } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /test-data/es6_shakespeare_mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "mappings": { 3 | "_default_": { 4 | "properties": { 5 | "speaker": { 6 | "type": "text", 7 | "fielddata": true 8 | }, 9 | "play_name": { 10 | "type": "text", 11 | "fielddata": true 12 | }, 13 | "line_id": { 14 | "type": "integer" 15 | }, 16 | "line_number": { 17 | "type": "text", 18 | "fielddata": true 19 | }, 20 | "speech_number": { 21 | "type": "integer" 22 | }, 23 | "text_entry": { 24 | "type": "text", 25 | "fielddata": true 26 | } 27 | } 28 | } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /test-data/es7_shakespeare_mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "mappings": { 3 | "properties": { 4 | "speaker": { 5 | "type": "keyword" 6 | }, 7 | "play_name": { 8 | "type": 
"keyword" 9 | }, 10 | "line_id": { 11 | "type": "integer" 12 | }, 13 | "line_number": { 14 | "type": "keyword" 15 | }, 16 | "speech_number": { 17 | "type": "keyword" 18 | }, 19 | "text_entry": { 20 | "type": "text" 21 | } 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /test-data/es_hits.json: -------------------------------------------------------------------------------- 1 | { 2 | "took": 54, 3 | "timed_out": false, 4 | "_shards": { 5 | "total": 16, 6 | "successful": 16, 7 | "failed": 0 8 | }, 9 | "hits": { 10 | "total": 46872, 11 | "max_score": 0.882234, 12 | "hits": [ 13 | {"_index": "staging_redsawx", "_type": "ballplayer", "_id": "abc123", "_score": 0.882234, 14 | "_source": {"name": "David Ortiz", "stats" : {"yrs_played": 20, "final_season": {"avg": 0.315, "HR": 38, "R": 79}, 15 | "full_career": {"avg": 0.286, "HR": 541, "R": 1419}}}}, 16 | {"_index": "staging_redsawx", "_type": "ballplayer", "_id": "def567", "_score": 0.882234, 17 | "_source": {"name": "Kevin Youkilis", "stats" : {"yrs_played": 10, "final_season": {"avg": 0.219, "HR": 2, "R": 12}, 18 | "full_career": {"avg": 0.281, "HR": 150, "R": 653}}}}, 19 | {"_index": "staging_redsawx", "_type": "ballplayer", "_id": "abc567", "_score": 0.882234, 20 | "_source": {"name": "Trot Nixon", "stats" : {"yrs_played": 12, "final_season": {"avg": 0.171, "HR": 1, "R": 2}, 21 | "full_career": {"avg": 0.274, "HR": 137, "R": 579}}}}, 22 | {"_index": "staging_redsawx", "_type": "ballplayer", "_id": "def123", "_score": 0.882234, 23 | "_source": {"name": "Manny Ramirez", "stats" : {"yrs_played": 19, "final_season": {"avg": 0.059, "HR": 0, "R": 0}, 24 | "full_career": {"avg": 0.312, "HR": 555, "R": 1544}}}}, 25 | {"_index": "staging_redsawx", "_type": "ballplayer", "_id": "ghi890", "_score": 0.882234, 26 | "_source": {"name": "Jason Varitek", "stats" : {"yrs_played": 15, "final_season": {"avg": 0.221, "HR": 11, "R": 32}, 27 | "full_career": {"avg": 0.256, "HR": "193", "R": 664}}}} 28 | ] 29 | } 30 | } -------------------------------------------------------------------------------- /test-data/legacy_shakespeare_mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "mappings": { 3 | "_default_": { 4 | "properties": { 5 | "speaker": { 6 | "type": "string" 7 | }, 8 | "play_name": { 9 | "type": "string" 10 | }, 11 | "line_id": { 12 | "type": "integer" 13 | }, 14 | "line_number": { 15 | "type": "string" 16 | }, 17 | "speech_number": { 18 | "type": "integer" 19 | }, 20 | "text_entry": { 21 | "type": "string" 22 | } 23 | } 24 | } 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /test-data/one_index_mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "basketball": { 3 | "mappings": { 4 | "players": { 5 | "properties": { 6 | "team": { 7 | "type": "keyword" 8 | }, 9 | "name": { 10 | "properties": { 11 | "first": { 12 | "type": "text" 13 | }, 14 | "last": { 15 | "type": "text" 16 | } 17 | } 18 | }, 19 | "age": { 20 | "type": "integer" 21 | }, 22 | "position": { 23 | "type": "keyword" 24 | } 25 | } 26 | } 27 | } 28 | } 29 | } -------------------------------------------------------------------------------- /test-data/one_var_agg.json: -------------------------------------------------------------------------------- 1 | { 2 | "took": 5, 3 | "timed_out": false, 4 | "_shards": { 5 | "total": 16, 6 | "successful": 16, 7 | "failed": 0 8 | }, 9 | "hits": { 
10 | "total": 110207, 11 | "max_score": 0, 12 | "hits": [] 13 | }, 14 | "aggregations": { 15 | "some_variable": { 16 | "doc_count_error_upper_bound": 0, 17 | "sum_other_doc_count": 0, 18 | "buckets": [ 19 | { 20 | "key": "level1", 21 | "doc_count": 62159 22 | }, 23 | { 24 | "key": "level2", 25 | "doc_count": 21576 26 | }, 27 | { 28 | "key": "level3", 29 | "doc_count": 10575 30 | } 31 | ] 32 | } 33 | } 34 | } -------------------------------------------------------------------------------- /test-data/three_var_agg.json: -------------------------------------------------------------------------------- 1 | { 2 | "took": 494, 3 | "timed_out": false, 4 | "_shards": { 5 | "total": 16, 6 | "successful": 16, 7 | "failed": 0 8 | }, 9 | "hits": { 10 | "total": 11335918, 11 | "max_score": 0, 12 | "hits": [] 13 | }, 14 | "aggregations": { 15 | "a_grouping_var": { 16 | "doc_count_error_upper_bound": 0, 17 | "sum_other_doc_count": 526088, 18 | "buckets": [ 19 | { 20 | "key": 0, 21 | "doc_count": 3403964, 22 | "another_one": { 23 | "doc_count_error_upper_bound": 23422, 24 | "sum_other_doc_count": 2941783, 25 | "buckets": [ 26 | { 27 | "key": 2915, 28 | "doc_count": 188629, 29 | "yet_another_one": { 30 | "doc_count_error_upper_bound": 0, 31 | "sum_other_doc_count": 0, 32 | "buckets": [ 33 | { 34 | "key": "lupe_fiasco", 35 | "doc_count": 168098 36 | }, 37 | { 38 | "key": "tech_n9ne", 39 | "doc_count": 20531 40 | } 41 | ] 42 | } 43 | }, 44 | { 45 | "key": 3952, 46 | "doc_count": 146357, 47 | "yet_another_one": { 48 | "doc_count_error_upper_bound": 0, 49 | "sum_other_doc_count": 0, 50 | "buckets": [ 51 | { 52 | "key": "lupe_fiasco", 53 | "doc_count": 145484 54 | }, 55 | { 56 | "key": "tech_n9ne", 57 | "doc_count": 873 58 | } 59 | ] 60 | } 61 | }, 62 | { 63 | "key": 2632, 64 | "doc_count": 127195, 65 | "yet_another_one": { 66 | "doc_count_error_upper_bound": 0, 67 | "sum_other_doc_count": 0, 68 | "buckets": [ 69 | { 70 | "key": "lupe_fiasco", 71 | "doc_count": 121318 72 | }, 73 | { 74 | "key": "tech_n9ne", 75 | "doc_count": 5877 76 | } 77 | ] 78 | } 79 | } 80 | ] 81 | } 82 | }, 83 | { 84 | "key": 2, 85 | "doc_count": 3360049, 86 | "another_one": { 87 | "doc_count_error_upper_bound": 13449, 88 | "sum_other_doc_count": 2105828, 89 | "buckets": [ 90 | { 91 | "key": 2349, 92 | "doc_count": 542582, 93 | "yet_another_one": { 94 | "doc_count_error_upper_bound": 0, 95 | "sum_other_doc_count": 0, 96 | "buckets": [ 97 | { 98 | "key": "childish_gambino", 99 | "doc_count": 485820 100 | }, 101 | { 102 | "key": "tech_n9ne", 103 | "doc_count": 56762 104 | } 105 | ] 106 | } 107 | }, 108 | { 109 | "key": 2201, 110 | "doc_count": 505387, 111 | "yet_another_one": { 112 | "doc_count_error_upper_bound": 0, 113 | "sum_other_doc_count": 0, 114 | "buckets": [ 115 | { 116 | "key": "childish_gambino", 117 | "doc_count": 470503 118 | }, 119 | { 120 | "key": "tech_n9ne", 121 | "doc_count": 34884 122 | } 123 | ] 124 | } 125 | }, 126 | { 127 | "key": 2247, 128 | "doc_count": 206252, 129 | "yet_another_one": { 130 | "doc_count_error_upper_bound": 0, 131 | "sum_other_doc_count": 0, 132 | "buckets": [ 133 | { 134 | "key": "childish_gambino", 135 | "doc_count": 188375 136 | }, 137 | { 138 | "key": "tech_n9ne", 139 | "doc_count": 17877 140 | } 141 | ] 142 | } 143 | } 144 | ] 145 | } 146 | }, 147 | { 148 | "key": 1, 149 | "doc_count": 2600800, 150 | "another_one": { 151 | "doc_count_error_upper_bound": 17346, 152 | "sum_other_doc_count": 1692470, 153 | "buckets": [ 154 | { 155 | "key": 2126, 156 | "doc_count": 433735, 157 | "yet_another_one": { 
158 | "doc_count_error_upper_bound": 0, 159 | "sum_other_doc_count": 0, 160 | "buckets": [ 161 | { 162 | "key": "lupe_fiasco", 163 | "doc_count": 405476 164 | }, 165 | { 166 | "key": "tech_n9ne", 167 | "doc_count": 28259 168 | } 169 | ] 170 | } 171 | }, 172 | { 173 | "key": 777, 174 | "doc_count": 277387, 175 | "yet_another_one": { 176 | "doc_count_error_upper_bound": 0, 177 | "sum_other_doc_count": 0, 178 | "buckets": [ 179 | { 180 | "key": "lupe_fiasco", 181 | "doc_count": 241894 182 | }, 183 | { 184 | "key": "tech_n9ne", 185 | "doc_count": 35493 186 | } 187 | ] 188 | } 189 | }, 190 | { 191 | "key": 663, 192 | "doc_count": 197208, 193 | "yet_another_one": { 194 | "doc_count_error_upper_bound": 0, 195 | "sum_other_doc_count": 0, 196 | "buckets": [ 197 | { 198 | "key": "lupe_fiasco", 199 | "doc_count": 193540 200 | }, 201 | { 202 | "key": "tech_n9ne", 203 | "doc_count": 3668 204 | } 205 | ] 206 | } 207 | } 208 | ] 209 | } 210 | } 211 | ] 212 | } 213 | } 214 | } -------------------------------------------------------------------------------- /test-data/two_index_mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "company": { 3 | "mappings": { 4 | "building": { 5 | "properties": { 6 | "id": { 7 | "type": "long" 8 | }, 9 | "address": { 10 | "type": "text", 11 | "fields": { 12 | "keyword": { 13 | "type": "keyword", 14 | "ignore_above": 256 15 | } 16 | } 17 | } 18 | } 19 | } 20 | } 21 | }, 22 | "hotel": { 23 | "mappings": { 24 | "bed_room": { 25 | "properties": { 26 | "num_beds": { 27 | "type": "integer" 28 | }, 29 | "description": { 30 | "type": "text" 31 | } 32 | } 33 | }, 34 | "conference_room": { 35 | "properties": { 36 | "num_people": { 37 | "type": "integer" 38 | }, 39 | "purpose": { 40 | "type": "text", 41 | "fields": { 42 | "keyword": { 43 | "type": "keyword", 44 | "ignore_above": 256 45 | } 46 | } 47 | } 48 | } 49 | } 50 | } 51 | } 52 | } --------------------------------------------------------------------------------