├── .Rbuildignore ├── .gitignore ├── .travis.yml ├── DESCRIPTION ├── NAMESPACE ├── NEWS.md ├── R ├── api.R ├── elasticsearchr.R ├── mappings.R └── utils.R ├── README.md ├── appveyor.yml ├── cran-comments.md ├── elasticsearchr.Rproj ├── man ├── aggs.Rd ├── check_http_code_throw_error.Rd ├── cleaned_field_names.Rd ├── create_bulk_upload_file.Rd ├── create_metadata.Rd ├── elastic.Rd ├── elastic_predicates.Rd ├── elastic_version.Rd ├── elasticsearchr.Rd ├── extract_query_results.Rd ├── from_size_search.Rd ├── grapes-create-grapes.Rd ├── grapes-delete-grapes.Rd ├── grapes-index-grapes.Rd ├── grapes-info-grapes.Rd ├── grapes-search-grapes.Rd ├── index_bulk_dataframe.Rd ├── list_fields.Rd ├── list_indices.Rd ├── mapping_default_simple.Rd ├── mapping_fielddata_true.Rd ├── plus-.elastic_api.Rd ├── print.elastic_api.Rd ├── query.Rd ├── scroll_search.Rd ├── select_fields.Rd ├── sort_on.Rd ├── valid_connection.Rd └── valid_json.Rd ├── tests ├── testthat.R └── testthat │ ├── helper-elasticsearch_test_data.R │ ├── test-api.R │ └── test-utils.R └── vignettes └── quick_start.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^Meta$ 2 | ^doc$ 3 | ^CRAN-RELEASE$ 4 | ^.*\.Rproj$ 5 | ^\.Rproj\.user$ 6 | ^\.travis\.yml$ 7 | .Rprofile 8 | man-roxygen 9 | ^README\.Rmd$ 10 | ^appveyor\.yml$ 11 | ^cran-comments\.md$ 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | Meta 2 | doc 3 | .Rproj.user 4 | .Rhistory 5 | .RData 6 | inst/doc 7 | *.html 8 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | 3 | language: r 4 | 5 | r_packages: 6 | - rmarkdown 7 | # - covr 8 | 9 | #after_success: 10 | # - Rscript -e 'library(covr);codecov()' 11 | 
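Before the package sources below, a brief orientation: the files that follow define a small DSL built around an `elastic` resource object and the `%index%`, `%create%`, `%delete%`, `%info%` and `%search%` operators. A minimal end-to-end sketch, using only functions exported by the package — it assumes a live Elasticsearch cluster at http://localhost:9200 and mirrors the index/doc-type names used in the package's own examples:

```r
# Sketch of the elasticsearchr DSL defined in R/api.R (assumes a live local cluster).
library(elasticsearchr)

es <- elastic("http://localhost:9200", "iris", "data")

# index a data frame - each row becomes a document
es %index% iris

# build a search: DSL objects combine with `+`
all_docs <- query('{"match_all": {}}')
by_width <- sort_on('[{"sepal_width": {"order": "asc"}}]')
results  <- es %search% (all_docs + by_width)

# aggregations follow the same pattern
avg_width  <- aggs('{"avg_sepal_width": {"avg": {"field": "sepal_width"}}}')
summary_df <- es %search% (all_docs + avg_width)

# delete the index when finished (explicit approval required)
es %delete% TRUE
```

Note how the query, sort and aggregation objects are plain wrappers around JSON fragments; `+` only concatenates their `api_call` strings, which is why any combination shown in `+.elastic_api` below is possible.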
-------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: elasticsearchr 2 | Type: Package 3 | Version: 0.3.1 4 | Title: A Lightweight Interface for Interacting with Elasticsearch from R 5 | Date: 2019-07-28 6 | Author: Alex Ioannides 7 | Maintainer: Alex Ioannides 8 | Description: A lightweight R interface to 'Elasticsearch' - a NoSQL search-engine and 9 | column store database (see the 'Elasticsearch' website for more 10 | information). This package implements a simple Domain-Specific Language (DSL) for indexing, 11 | deleting, querying, sorting and aggregating data using 'Elasticsearch'. 12 | License: Apache License 2.0 13 | URL: https://github.com/alexioannides/elasticsearchr 14 | BugReports: https://github.com/alexioannides/elasticsearchr/issues 15 | LazyData: TRUE 16 | Imports: 17 | httr, 18 | jsonlite, 19 | dplyr 20 | Suggests: 21 | knitr, 22 | testthat, 23 | rmarkdown 24 | VignetteBuilder: knitr 25 | RoxygenNote: 6.1.1 26 | Encoding: UTF-8 27 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method("%create%",elastic_rescource) 4 | S3method("%delete%",elastic_rescource) 5 | S3method("%index%",elastic_rescource) 6 | S3method("%info%",elastic_rescource) 7 | S3method("%search%",elastic) 8 | S3method("+",elastic_api) 9 | S3method(print,elastic_api) 10 | export("%create%") 11 | export("%delete%") 12 | export("%index%") 13 | export("%info%") 14 | export("%search%") 15 | export(aggs) 16 | export(elastic) 17 | export(is_elastic) 18 | export(is_elastic_aggs) 19 | export(is_elastic_api) 20 | export(is_elastic_info) 21 | export(is_elastic_query) 22 | export(is_elastic_rescource) 23 | export(is_elastic_sort) 24 | export(is_elastic_source_filter) 25 | export(list_fields) 26 | 
export(list_indices) 27 | export(mapping_default_simple) 28 | export(mapping_fielddata_true) 29 | export(query) 30 | export(select_fields) 31 | export(sort_on) 32 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # elasticsearchr 0.3.1 2 | 3 | * Changed back-end to use dplyr::bind_rows as opposed to do.call(rbind) to handle NAs consistently. 4 | 5 | 6 | # elasticsearchr 0.3.0 7 | 8 | * Added support for source filtering with the `select_fields` function. 9 | * Added the `%info%` operator to be used for retrieving cluster and index information. 10 | * Added the `list_indices` function for retrieving a list of all available indices. 11 | * Added the `list_fields` function for retrieving a list of all available fields in an index. 12 | 13 | 14 | # elasticsearchr 0.2.3 15 | 16 | * Refactored `valid_url` with the enhanced `valid_connection` function, based on HTTP requests to the cluster health API. 17 | 18 | 19 | # elasticsearchr 0.2.2 20 | 21 | * Added support for HTTPS endpoints. 22 | 23 | 24 | # elasticsearchr 0.2.1 25 | 26 | * Modified all HTTP requests to have `Content-Type` headers for Elasticsearch 6.x compatibility; 27 | * Fixed an issue where `extract_aggs_results` could not handle results from base metric aggregations; 28 | * Fixed an issue where `valid_url` would not return `TRUE` when Elasticsearch port numbers are not 4 digits long; 29 | 30 | 31 | # elasticsearchr 0.2.0 32 | 33 | * BREAKING CHANGE: `sort` has been renamed to `sort_on` to avoid clashing with the sort function in base R; 34 | * Fixed an issue with `valid_url` that was causing an error on r-oldrel-windows-ix86+x86_64; 35 | * Fixed data.frame index bug - Elasticsearch Bulk API was failing when data.frame was large (>15mb); and, 36 | * Fixed an issue with `scroll_search` printing dots to stdout as opposed to returning them as message output. 
37 | * Enhanced `scroll_search` such that it retrieves a maximum of 10,000 documents per scroll to speed up query retrieval; 38 | 39 | 40 | # elasticsearchr 0.1.0 41 | 42 | * Initial release. 43 | -------------------------------------------------------------------------------- /R/api.R: -------------------------------------------------------------------------------- 1 | # Copyright 2016-2019 Alex Ioannides 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | # ---- classes, methods and predicates ------------------------------------------------------------ 17 | 18 | 19 | #' elasticsearchr predicate functions. 20 | #' 21 | #' Predicate functions for identifying different elasticsearchr object types. 22 | #' 23 | #' @param x An elasticsearchr object. 24 | #' @return Boolean. 
25 | #' @name elastic_predicates 26 | NULL 27 | 28 | #' @export 29 | #' @rdname elastic_predicates 30 | is_elastic <- function(x) inherits(x, "elastic") 31 | 32 | #' @export 33 | #' @rdname elastic_predicates 34 | is_elastic_rescource <- function(x) inherits(x, "elastic_rescource") 35 | 36 | #' @export 37 | #' @rdname elastic_predicates 38 | is_elastic_api <- function(x) inherits(x, "elastic_api") 39 | 40 | #' @export 41 | #' @rdname elastic_predicates 42 | is_elastic_query <- function(x) inherits(x, "elastic_query") 43 | 44 | #' @export 45 | #' @rdname elastic_predicates 46 | is_elastic_aggs <- function(x) inherits(x, "elastic_aggs") 47 | 48 | #' @export 49 | #' @rdname elastic_predicates 50 | is_elastic_sort <- function(x) inherits(x, "elastic_sort") 51 | 52 | #' @export 53 | #' @rdname elastic_predicates 54 | is_elastic_source_filter <- function(x) inherits(x, "elastic_source_filter") 55 | 56 | #' @export 57 | #' @rdname elastic_predicates 58 | is_elastic_info <- function(x) inherits(x, "elastic_info") 59 | 60 | 61 | #' elastic_resource class constructor. 62 | #' 63 | #' Objects of this class contain all of the information required to locate documents in an 64 | #' Elasticsearch cluster. 65 | #' 66 | #' @export 67 | #' 68 | #' @param cluster_url URL to the Elastic cluster. 69 | #' @param index The name of an index on the Elasticsearch cluster. 70 | #' @param doc_type [optional] The name of a document type within the index. 71 | #' @return An \code{elastic_rescource} object. 
72 | 73 | #' @examples 74 | #' \dontrun{ 75 | #' my_data <- elastic("http://localhost:9200", "iris", "data") 76 | #' } 77 | elastic <- function(cluster_url, index, doc_type = NULL) { 78 | stopifnot(is.character(cluster_url), is.character(index), is.character(doc_type) | is.null(doc_type), 79 | valid_connection(cluster_url)) 80 | 81 | if (substr(cluster_url, nchar(cluster_url), nchar(cluster_url)) == "/") { 82 | valid_index_url <- paste0(cluster_url, index) 83 | } else { 84 | valid_index_url <- paste0(cluster_url, "/", index) 85 | } 86 | 87 | if (is.null(doc_type)) { 88 | valid_search_endpoint <- paste0(valid_index_url, "/_search") 89 | } else { 90 | valid_search_endpoint <- paste0(valid_index_url, "/", doc_type, "/_search") 91 | } 92 | 93 | structure(list("search_url" = valid_search_endpoint, "cluster_url" = cluster_url, 94 | "index" = index, "doc_type" = doc_type), class = c("elastic_rescource", "elastic")) 95 | } 96 | 97 | 98 | #' Define Elasticsearch query. 99 | #' 100 | #' @export 101 | #' 102 | #' @param json JSON object describing the query that needs to be executed. 103 | #' @param size [optional] The number of documents to return. If left unspecified, then the default 104 | #' is to return all documents. 105 | #' @return An \code{elastic_query} object. 106 | #' 107 | #' @seealso \url{https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html} 108 | #' 109 | #' @examples 110 | #' all_docs <- query('{"match_all": {}}') 111 | query <- function(json, size = 0) { 112 | stopifnot(valid_json(json)) 113 | api_call <- paste0('"query":', json) 114 | structure(list("api_call" = api_call, "size" = size), 115 | class = c("elastic_query", "elastic_api", "elastic")) 116 | } 117 | 118 | 119 | #' Define Elasticsearch query sort. 120 | #' 121 | #' @export 122 | #' 123 | #' @param json JSON object describing the sorting required on the query results. 124 | #' @return An \code{elastic_sort} object. 
125 | 126 | #' @seealso \url{https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-sort.html} 127 | #' 128 | #' @examples 129 | #' sort_by_key <- sort_on('[{"sort_key": {"order": "asc"}}]') 130 | sort_on <- function(json) { 131 | stopifnot(valid_json(json)) 132 | api_call <- paste0('"sort":', json) 133 | structure(list("api_call" = api_call), class = c("elastic_sort", "elastic_api", "elastic")) 134 | } 135 | 136 | 137 | #' Define Elasticsearch query source filter. 138 | #' 139 | #' @export 140 | #' 141 | #' @param json JSON object describing the fields to return from the document source (the source filter). 142 | #' @return An \code{elastic_source_filter} object. 143 | #' 144 | #' @seealso \url{https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-source-filtering.html} 145 | #' 146 | select_fields <- function(json) { 147 | stopifnot(valid_json(json)) 148 | api_call <- paste0('"_source": ', json) 149 | structure(list("api_call" = api_call), 150 | class = c("elastic_source_filter", "elastic_api", "elastic")) 151 | } 152 | 153 | 154 | #' Define Elasticsearch aggregation. 155 | #' 156 | #' @export 157 | #' 158 | #' @param json JSON object describing the aggregation that needs to be executed. 159 | #' @return An \code{elastic_aggs} object. 160 | #' 161 | #' @seealso \url{https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations.html} 162 | #' 163 | #' @examples 164 | #' avg_sepal_width_per_cat <- aggs('{"avg_sepal_width_per_cat": { 165 | #' "terms": {"field": "species"}, 166 | #' "aggs": {"avg_sepal_width": {"avg": {"field": "sepal_width"}}}} 167 | #' }') 168 | aggs <- function(json) { 169 | stopifnot(valid_json(json)) 170 | api_call <- paste0('"aggs":', json) 171 | structure(list("api_call" = api_call, "size" = 0), 172 | class = c("elastic_aggs", "elastic_api", "elastic")) 173 | } 174 | 175 | 176 | 177 | #' List the fields available in an index. 
178 | #' @export 179 | #' 180 | #' @return An \code{elastic_info} object. 181 | #' 182 | #' @examples 183 | #' list_fields() 184 | list_fields <- function() { 185 | endpoint <- "/_mapping" 186 | 187 | process_response <- function(response) { 188 | index_mapping <- httr::content(response, as = "parsed") 189 | fields <- names(index_mapping[[1]]$mappings$data$properties) 190 | fields 191 | } 192 | 193 | structure(list("endpoint" = endpoint, "process_response" = process_response), 194 | class = c("elastic_info", "elastic_api", "elastic")) 195 | } 196 | 197 | 198 | #' List of indices in cluster information. 199 | #' 200 | #' @export 201 | #' 202 | #' @return An \code{elastic_info} object. 203 | #' 204 | #' @examples 205 | #' list_indices() 206 | list_indices <- function() { 207 | endpoint <- "/_mapping" 208 | process_response <- function(response) names(httr::content(response, as = "parsed")) 209 | structure(list("endpoint" = endpoint, "process_response" = process_response), 210 | class = c("elastic_info", "elastic_api", "elastic")) 211 | } 212 | 213 | 214 | # ---- operators ---------------------------------------------------------------------------------- 215 | 216 | 217 | #' Get cluster and index (meta) data. 218 | #' 219 | #' An operator to be used with requests for information 220 | #' 221 | #' @export 222 | #' 223 | #' @param rescource An \code{elastic_rescource} object that contains the information on the 224 | #' Elasticsearch cluster, index and document type, where the indexed data will reside. If this does 225 | #' not already exist, it will be created automatically. 226 | #' @param info \code{elastic_info} object. 
227 | 228 | #' @examples 229 | #' \dontrun{ 230 | #' elastic("http://localhost:9200", "iris", "data") %info% list_indices() 231 | #' elastic("http://localhost:9200", "iris", "data") %info% list_fields() 232 | #' } 233 | `%info%` <- function(rescource, info) UseMethod("%info%") 234 | 235 | #' @export 236 | `%info%.elastic_rescource` <- function(rescource, info) { 237 | stopifnot(is_elastic_rescource(rescource), is_elastic_info(info)) 238 | api_call <- paste0(rescource$cluster_url, info$endpoint) 239 | response <- httr::GET(api_call) 240 | info$process_response(response) 241 | } 242 | 243 | 244 | #' Index a data frame. 245 | #' 246 | #' Inserting records (or documents) into Elasticsearch is referred to as "indexing" the data. This 247 | #' function considers each row of a data frame as a document to be indexed into an Elasticsearch 248 | #' index. 249 | #' 250 | #' If the data frame contains a column named 'id', then this will be used to assign document ids. 251 | #' Otherwise, Elasticsearch will automatically assign the documents random ids. 252 | #' 253 | #' @export 254 | #' 255 | #' @param rescource An \code{elastic_rescource} object that contains the information on the 256 | #' Elasticsearch cluster, index and document type, where the indexed data will reside. If this does 257 | #' not already exist, it will be created automatically. 258 | #' @param df data.frame whose rows will be indexed as documents in the Elasticsearch cluster. 
259 | 260 | #' @seealso \url{https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-index_.html} 261 | #' 262 | #' @examples 263 | #' \dontrun{ 264 | #' elastic("http://localhost:9200", "iris", "data") %index% iris 265 | #' } 266 | `%index%` <- function(rescource, df) UseMethod("%index%") 267 | 268 | #' @export 269 | `%index%.elastic_rescource` <- function(rescource, df) { 270 | stopifnot(is_elastic_rescource(rescource), is.data.frame(df), !is.null(rescource$doc_type)) 271 | colnames(df) <- cleaned_field_names(colnames(df)) 272 | 273 | df_size_mb <- utils::object.size(df) / (1000 * 1000) 274 | chunk_size_mb <- 10 275 | num_data_chunks <- ceiling(df_size_mb / chunk_size_mb) 276 | num_rows_per_chunk <- ceiling(nrow(df) / num_data_chunks) 277 | chunk_indices <- lapply(X = seq(1, nrow(df), num_rows_per_chunk), 278 | FUN = function(x) c(x, min(nrow(df), x + num_rows_per_chunk - 1))) 279 | 280 | lapply(X = chunk_indices, FUN = function(x) index_bulk_dataframe(rescource, df[x[1]:x[2], ])) 281 | message("... data successfully indexed", appendLF = FALSE) 282 | } 283 | 284 | 285 | #' Create Elasticsearch index with custom mapping. 286 | #' 287 | #' Mappings are the closest concept to traditional database 'schema'. This function allows the 288 | #' creation of Elasticsearch indices with custom mappings. If left unspecified, Elasticsearch will 289 | #' infer the type of each field based on the first document indexed. 290 | #' 291 | #' @export 292 | #' 293 | #' @param rescource An \code{elastic_rescource} object that contains the information on the 294 | #' Elasticsearch cluster, index and document type, where the indexed data will reside. If this does 295 | #' not already exist, it will be created automatically. 296 | #' @param mapping A JSON object containing the mapping details required for the index. 
297 | #' 298 | #' @seealso \url{https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping.html} 299 | #' 300 | #' @examples 301 | #' \dontrun{ 302 | #' elastic("http://localhost:9200", "iris", "data") %create% mapping_default_simple() 303 | #' } 304 | `%create%` <- function(rescource, mapping) UseMethod("%create%") 305 | 306 | #' @export 307 | `%create%.elastic_rescource` <- function(rescource, mapping) { 308 | response <- httr::PUT(paste(rescource$cluster_url, rescource$index, sep = "/"), body = mapping, 309 | httr::add_headers("Content-Type" = "application/json")) 310 | check_http_code_throw_error(response) 311 | message(paste("...", rescource$index, "has been created")) 312 | } 313 | 314 | 315 | #' Delete Elasticsearch index. 316 | #' 317 | #' Delete all of the documents within a particular document type (if specified), or delete an 318 | #' entire index (if the document type is unspecified.) 319 | #' 320 | #' @export 321 | #' 322 | #' @param rescource An \code{elastic_rescource} object that contains the information on the 323 | #' Elasticsearch cluster, index and document type, where the indexed data will reside. If this does 324 | #' not already exist, it will be created automatically. 325 | #' @param approve Must be equal to \code{"TRUE"} for deletion for all documents in a rescource, 326 | #' OR be a character vector of document ids if only specific documents need to be deleted. 
327 | #' 328 | #' @examples 329 | #' \dontrun{ 330 | #' elastic("http://localhost:9200", "iris", "data") %delete% TRUE 331 | #' } 332 | `%delete%` <- function(rescource, approve) UseMethod("%delete%") 333 | 334 | #' @export 335 | `%delete%.elastic_rescource` <- function(rescource, approve) { 336 | if (is.character(approve) & is.vector(approve)) { 337 | ids <- approve 338 | } else { 339 | if (approve != TRUE) stop("please approve deletion") else ids <- NULL 340 | } 341 | 342 | if (is.null(ids)) { 343 | if (is.null(rescource$doc_type)) { 344 | response <- httr::DELETE(paste(rescource$cluster_url, rescource$index, sep = "/")) 345 | check_http_code_throw_error(response) 346 | message(paste0("... ", rescource$index, " has been deleted")) 347 | } else { 348 | api_call_payload <- '{"query": {"match_all": {}}}' 349 | doc_type_ids <- scroll_search(rescource, api_call_payload, extract_id_results)$id 350 | metadata <- create_metadata("delete", rescource$index, rescource$doc_type, doc_type_ids) 351 | deletions_file <- create_bulk_delete_file(metadata) 352 | response <- httr::PUT(url = rescource$cluster_url, 353 | path = "/_bulk", 354 | body = httr::upload_file(deletions_file), 355 | httr::add_headers("Content-Type" = "application/json")) 356 | 357 | file.remove(deletions_file) 358 | check_http_code_throw_error(response) 359 | message(paste0("... ", rescource$index, "/", rescource$doc_type, " has been deleted")) 360 | } 361 | } else { 362 | metadata <- create_metadata("delete", rescource$index, rescource$doc_type, ids) 363 | deletions_file <- create_bulk_delete_file(metadata) 364 | response <- httr::PUT(url = rescource$cluster_url, 365 | path = "/_bulk", 366 | body = httr::upload_file(deletions_file), 367 | httr::add_headers("Content-Type" = "application/json")) 368 | 369 | file.remove(deletions_file) 370 | check_http_code_throw_error(response) 371 | message(paste0("... 
", paste0(ids, collapse = ", "), " have been deleted")) 372 | } 373 | } 374 | 375 | 376 | #' Execute query or search. 377 | #' 378 | #' @export 379 | #' 380 | #' @param rescource An \code{elastic_rescource} object that contains the information on the 381 | #' Elasticsearch cluster, index and document type, where the indexed data will reside. If this does 382 | #' not already exist, it will be created automatically. 383 | #' @param search \code{elastic_query} or \code{elastic_aggs} object. 384 | #' @return A data.frame of search or aggregation results. 385 | #' 386 | #' @examples 387 | #' \dontrun{ 388 | #' results <- elastic("http://localhost:9200", "iris", "data") %search% query('{"match_all": {}}') 389 | #' head(results) 390 | #' # sepal_length sepal_width petal_length petal_width species 391 | #' # 1 4.8 3.0 1.4 0.1 setosa 392 | #' # 2 4.3 3.0 1.1 0.1 setosa 393 | #' # 3 5.8 4.0 1.2 0.2 setosa 394 | #' # 4 5.1 3.5 1.4 0.3 setosa 395 | #' # 5 5.2 3.5 1.5 0.2 setosa 396 | #' # 6 5.2 3.4 1.4 0.2 setosa 397 | #' } 398 | `%search%` <- function(rescource, search) UseMethod("%search%") 399 | 400 | #' @export 401 | `%search%.elastic` <- function(rescource, search) { 402 | stopifnot(is_elastic_rescource(rescource) & is_elastic_api(search)) 403 | 404 | if (is_elastic_query(search)) { 405 | if (search$size != 0) { 406 | api_call_payload <- paste0('{"size":', search$size, ', ', search$api_call, '}') 407 | return(from_size_search(rescource, api_call_payload)) 408 | 409 | } else { 410 | api_call_payload <- paste0('{"size": 10000', ', ', search$api_call, '}') 411 | return(scroll_search(rescource, api_call_payload)) 412 | 413 | } 414 | } else if (is_elastic_aggs(search)) { 415 | api_call_payload <- paste0('{"size":', search$size, ', ', search$api_call, '}') 416 | return(from_size_search(rescource, api_call_payload)) 417 | } 418 | } 419 | 420 | 421 | #' Define Elasticsearch aggregation on a secific subset of documents. 
422 | #' 423 | #' Sometimes it is necessary to perform an aggregation on the results of a query (i.e. on a subset 424 | #' of all the available documents). This is achieved by adding an \code{aggs} object to a 425 | #' \code{query} object. 426 | #' 427 | #' @export 428 | #' 429 | #' @param x \code{elastic_query} object. 430 | #' @param y \code{elastic_aggs} or \code{elastic_sort} object. 431 | #' @return \code{elastic_aggs} object that contains the query information required for the 432 | #' aggregation. 433 | #' 434 | #' @examples 435 | #' all_docs <- query('{"match_all": {}}') 436 | #' avg_sepal_width_per_cat <- aggs('{"avg_sepal_width_per_cat": { 437 | #' "terms": {"field": "species"}, 438 | #' "aggs": {"avg_sepal_width": {"avg": {"field": "sepal_width"}}}} 439 | #' }') 440 | #' all_docs + avg_sepal_width_per_cat 441 | #' 442 | #' sort_by_sepal_width <- sort_on('[{"sepal_width": {"order": "asc"}}]') 443 | #' all_docs + sort_by_sepal_width 444 | `+.elastic_api` <- function(x, y) { 445 | stopifnot(is_elastic_query(x) & is_elastic_aggs(y) | 446 | is_elastic_aggs(x) & is_elastic_query(y) | 447 | is_elastic_query(x) & is_elastic_sort(y) | 448 | is_elastic_sort(x) & is_elastic_query(y) | 449 | is_elastic_source_filter(x) & is_elastic_query(y) | 450 | is_elastic_query(x) & is_elastic_source_filter(y) | 451 | is_elastic_source_filter(x) & is_elastic_sort(y) | 452 | is_elastic_sort(x) & is_elastic_source_filter(y) | 453 | is_elastic_query(x) & is_elastic_query(y)) 454 | 455 | if (is_elastic_query(x) & is_elastic_sort(y) | is_elastic_query(y) & is_elastic_sort(x)) { 456 | query <- if (is_elastic_query(x)) x else y 457 | sort <- if (is_elastic_sort(x)) x else y 458 | combined_api_call <- paste0(query$api_call, ',', sort$api_call) 459 | structure(list("api_call" = combined_api_call, "size" = query$size), 460 | class = c("elastic_query", "elastic_api", "elastic")) 461 | 462 | } else if (is_elastic_source_filter(x) & is_elastic_query(y) | 463 | is_elastic_query(x) & 
is_elastic_source_filter(y)) { 464 | query <- if (is_elastic_query(x)) x else y 465 | source_filter <- if (is_elastic_source_filter(x)) x else y 466 | combined_api_call <- paste0(source_filter$api_call, ',', query$api_call) 467 | structure(list("api_call" = combined_api_call, "size" = query$size), 468 | class = c("elastic_query", "elastic_api", "elastic")) 469 | 470 | } else if (is_elastic_source_filter(x) & is_elastic_sort(y) | 471 | is_elastic_sort(x) & is_elastic_source_filter(y)) { 472 | sort <- if (is_elastic_sort(x)) x else y 473 | source_filter <- if (is_elastic_source_filter(x)) x else y 474 | combined_api_call <- paste0(source_filter$api_call, ',', sort$api_call) 475 | structure(list("api_call" = combined_api_call), 476 | class = c("elastic_query", "elastic_api", "elastic")) 477 | 478 | } else if (is_elastic_query(x) & is_elastic_query(y)) { 479 | if (!is.null(x$size)) { 480 | query_body <- x 481 | } else if (!is.null(y$size)) { 482 | query_body <- y 483 | } else { 484 | stop("no main query body in left or right operands", call. = FALSE) 485 | } 486 | 487 | query_src_filter_sort <- if (is.null(x$size)) x else y 488 | combined_api_call <- paste0(query_body$api_call, ',', query_src_filter_sort$api_call) 489 | structure(list("api_call" = combined_api_call, "size" = query_body$size), 490 | class = c("elastic_query", "elastic_api", "elastic")) 491 | 492 | } else if (is_elastic_query(x) & is_elastic_aggs(y) | is_elastic_query(y) & is_elastic_aggs(x)) { 493 | query <- if (is_elastic_query(x)) x else y 494 | aggs <- if (is_elastic_aggs(x)) x else y 495 | combined_api_call <- paste0(query$api_call, ',', aggs$api_call) 496 | structure(list("api_call" = combined_api_call, "size" = 0), 497 | class = c("elastic_aggs", "elastic_api", "elastic")) 498 | } 499 | } 500 | 501 | 502 | #' Pretty-print aggs and query JSON objects. 503 | #' 504 | #' @export 505 | #' 506 | #' @param x \code{elastic_query} or \code{elastic_aggs} object. 507 | #' @param ... 
For consistency with all other \code{print} methods. 508 | #' @return Character string of pretty-printed JSON object. 509 | #' 510 | #' @examples 511 | #' all_docs <- query('{"match_all": {}}') 512 | #' print(all_docs) 513 | print.elastic_api <- function(x, ...) { 514 | size_call <- if (!is.null(x$size)) paste0('"size": ', x$size, ', ') else '' 515 | complete_json <- paste0('{', size_call, x$api_call, '}') 516 | jsonlite::prettify(complete_json) 517 | } 518 | -------------------------------------------------------------------------------- /R/elasticsearchr.R: -------------------------------------------------------------------------------- 1 | # Copyright 2016-2019 Alex Ioannides 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | #' elasticsearchr: a lightweight Elasticsearch client for R. 17 | #' 18 | #' Allows you to index, update and delete documents as well as run queries and aggregations. 19 | #' 20 | #' @docType package 21 | #' @name elasticsearchr 22 | NULL 23 | -------------------------------------------------------------------------------- /R/mappings.R: -------------------------------------------------------------------------------- 1 | # Copyright 2016-2019 Alex Ioannides 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | #' Simple Elasticsearch default mappings for non-text-search analytics 17 | #' 18 | #' This mapping switches-off the text analyser for all fields of type 'string' (i.e. switches off 19 | #' free text search), allows all text search to work with case-insensitive lowercase terms, and 20 | #' maps any field with the name 'timestamp' to type 'date', so long as it has the appropriate 21 | #' string or long format. 22 | #' 23 | #' @export 24 | mapping_default_simple <- function() { 25 | jsonlite::prettify( 26 | '{ 27 | "settings": { 28 | "index": { 29 | "analysis": { 30 | "analyzer": { 31 | "analyzer_lowercase": { 32 | "tokenizer": "keyword", 33 | "filter": "lowercase" 34 | } 35 | } 36 | } 37 | } 38 | }, 39 | "mappings": { 40 | "_default_": { 41 | "dynamic_templates": [ 42 | { 43 | "strings": { 44 | "match_mapping_type": "string", 45 | "mapping": { 46 | "type": "string", 47 | "analyzer": "analyzer_lowercase" 48 | } 49 | } 50 | }, 51 | { 52 | "time": { 53 | "match": "timestamp", 54 | "mapping": { 55 | "type": "date", 56 | "format": "yyyy-MM-dd HH:mm:ss.SSS||yyyy-MM-dd||epoch_millis" 57 | } 58 | } 59 | }] 60 | } 61 | } 62 | }') 63 | } 64 | 65 | 66 | #' Elasticsearch 5.x default mappings enabling fielddata for text fields 67 | #' 68 | #' A default mapping that enables fielddata for all string/text fields in Elasticsearch 5.x. 
69 | 70 | #' @export 71 | mapping_fielddata_true <- function() { 72 | jsonlite::prettify( 73 | '{ 74 | "mappings": { 75 | "_default_": { 76 | "dynamic_templates": [ 77 | { 78 | "strings": { 79 | "match_mapping_type": "string", 80 | "mapping": { 81 | "type": "text", 82 | "fielddata": true 83 | } 84 | } 85 | } 86 | ] 87 | } 88 | } 89 | }') 90 | } 91 | -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | # Copyright 2016-2019 Alex Ioannides 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | #' Validate healthy Elasticsearch connection. 17 | #' 18 | #' Validates healthy Elasticsearch connections by attempting to call the cluster healthcheck 19 | #' endpoint. In doing so, it defends against incorrect URLs to Elasticsearch clusters. Requires 20 | #' that URLs point directly to a master node - i.e. the endpoint that would return the default 21 | #' Elasticsearch message, "You Know, for Search", e.g. `http://localhost:9200`. 22 | #' 23 | #' @param url The URL to validate. 
24 | #' @return Boolean 25 | #' 26 | #' @examples 27 | #' \dontrun{ 28 | #' url <- "http://localhost:9200" 29 | #' valid_connection(url) 30 | #' # TRUE 31 | #' 32 | #' url <- "http://localhost:9201" 33 | #' valid_connection(url) 34 | #' # Error: Failed to connect to localhost port 9201: Connection refused 35 | #' } 36 | valid_connection <- function(url) { 37 | if (substr(url, nchar(url), nchar(url)) == "/") { 38 | healthcheck_endpoint <- paste0(url, "_cluster/health") 39 | } else { 40 | healthcheck_endpoint <- paste0(url, "/_cluster/health") 41 | } 42 | 43 | tryCatch( 44 | { 45 | response <- httr::GET(healthcheck_endpoint) 46 | if (response$status_code != 200) { 47 | msg <- paste0(healthcheck_endpoint, " does not return cluster health:\n", 48 | httr::content(response, as = "text")) 49 | stop(msg, call. = FALSE) 50 | } 51 | 52 | response_parsed <- httr::content(response, as = "parsed") 53 | if (response_parsed$status != "red") { 54 | return(TRUE) 55 | } else { 56 | stop("Elasticsearch cluster status is red", call. = FALSE) 57 | } 58 | }, 59 | error = function(e) stop(e$message, call. = FALSE) 60 | ) 61 | } 62 | 63 | 64 | #' Elasticsearch version 65 | #' 66 | #' Returns the major, minor and build version numbers for an Elasticsearch cluster, given a valid 67 | #' URL to an Elasticsearch cluster. 68 | #' 69 | #' @param url A valid URL to an Elasticsearch cluster. 70 | #' @return A list with the \code{major}, \code{minor} and \code{build} numbers. 
71 | #' 72 | #' @examples 73 | #' \dontrun{ 74 | #' elastic_version("http://localhost:9200") 75 | #' $major 76 | #' [1] 5 77 | #' 78 | #' $minor 79 | #' [1] 0 80 | #' 81 | #' $build 82 | #' [1] 1 83 | #' } 84 | elastic_version <- function(url) { 85 | valid_connection(url) 86 | response <- httr::GET(url) 87 | check_http_code_throw_error(response) 88 | version_string <- httr::content(response)$version$number 89 | version <- as.integer(strsplit(version_string, "\\.")[[1]]) 90 | 91 | list("major" = version[1], "minor" = version[2], "build" = version[3]) 92 | } 93 | 94 | 95 | #' Valid JSON string predicate function 96 | #' 97 | #' @param json Candidate JSON object as a string. 98 | #' @return Boolean. 99 | #' 100 | #' @examples 101 | #' \dontrun{ 102 | #' good_json <- '{"id": 1}' 103 | #' valid_json(good_json) 104 | #' # TRUE 105 | #' 106 | #' bad_json <- '{"id": 1a}' 107 | #' valid_json(bad_json) 108 | #' # FALSE 109 | #' } 110 | valid_json <- function(json) { 111 | stopifnot(is.character(json)) 112 | jsonlite::validate(json) 113 | } 114 | 115 | 116 | #' Sanitise column names. 117 | #' 118 | #' Convert data frame column names into an Elasticsearch compatible format. 119 | #' 120 | #' Elasticsearch will not ingest field names with periods ("."), such as "Sepal.Width", as these 121 | #' are reserved for nested objects (in the JSON sense). This function replaces all periods with 122 | #' underscores ("_") and then converts everything to lowercase for simplicity. 123 | #' 124 | #' @param colnames A character vector containing data frame column names. 125 | #' @return A character vector with 'clean' column names.
126 | #' 127 | #' @examples 128 | #' \dontrun{ 129 | #' df <- iris 130 | #' colnames(df) <- cleaned_field_names(colnames(df)) 131 | #' colnames(df) 132 | #' # "sepal_length" "sepal_width" "petal_length" "petal_width" "species" 133 | #' } 134 | cleaned_field_names <- function(colnames) { 135 | tolower(gsub("\\.", "_", colnames)) 136 | } 137 | 138 | 139 | #' Create Bulk API metadata. 140 | #' 141 | #' The fastest way to index, delete or update many documents is via the Bulk API. This requires 142 | #' that each document have the action combined with the document's metadata (index, type and id) 143 | #' sent to the API. This information is encapsulated as a JSON object, which this function is 144 | #' responsible for generating. 145 | #' 146 | #' @param action One of: "index", "create", "update" or "delete". 147 | #' @param index The name of the index where the documents reside (or will reside). 148 | #' @param doc_type The name of the document type where the documents reside (or will reside). 149 | #' @param id [optional] Character vector of document ids. 150 | #' @param n [optional] Integer number of repeated metadata description objects that need to be 151 | #' returned (if \code{id} is not specified). 152 | #' 153 | #' @return A character vector of Bulk API document information objects. 154 | #' 155 | #' @seealso \url{https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-bulk.html} 156 | #' for more information on what is required by the Elasticsearch Bulk API.
157 | #' 158 | #' @examples 159 | #' \dontrun{ 160 | #' create_metadata("index", "iris", "data", n = 2) 161 | #' '{\"index\": {\"_index\": \"iris\", \"_type\": \"data\"}}' 162 | #' '{\"index\": {\"_index\": \"iris\", \"_type\": \"data\"}}' 163 | #' } 164 | create_metadata <- function(action, index, doc_type, id = NULL, n = NULL) { 165 | stopifnot(action %in% c("index", "create", "update", "delete")) 166 | 167 | if (!is.null(id)) { 168 | metadata <- paste0('{"', action, '": {"_index": "', index, '", "_type": "', doc_type, '", "_id": "', id, '"}}') 169 | } else if (!is.null(n)) { 170 | metadata_line <- paste0('{"', action, '": {"_index": "', index, '", "_type": "', doc_type, '"}}') 171 | metadata <- rep(metadata_line, n) 172 | } else { 173 | metadata <- paste0('{"', action, '": {"_index": "', index, '", "_type": "', doc_type, '"}}') 174 | } 175 | 176 | metadata 177 | } 178 | 179 | 180 | #' Create Bulk API data file. 181 | #' 182 | #' The fastest way to index, delete or update many documents is via the Bulk API. This function 183 | #' assembles a text file comprising data and/or actions in the format required by the Bulk API. 184 | #' This is ready to be POSTed to the Bulk API. 185 | #' 186 | #' @param metadata A character vector of Bulk API document information objects, as generated by 187 | #' \code{create_metadata(...)}. 188 | #' @param df [optional] A data.frame with data for indexing or updating. 189 | #' @return The name of the temporary file containing the data for the Elasticsearch Bulk API. 190 | #' 191 | #' @seealso \url{https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-bulk.html} 192 | #' for more information on what is required by the Elasticsearch Bulk API.
193 | #' 194 | #' @examples 195 | #' \dontrun{ 196 | #' bulk_upload_info <- create_metadata("index", "iris", "data", n = nrow(iris)) 197 | #' create_bulk_upload_file(bulk_upload_info, iris) 198 | #' # "/var/folders/__/yz_l30s48xj6m_0059b_2twr0000gn/T//RtmpQnvUOt/file98194322b8" 199 | #' 200 | #' bulk_delete_info <- create_metadata("delete", "iris", "data", n = nrow(iris)) 201 | #' create_bulk_delete_file(bulk_delete_info) 202 | #' # "/var/folders/__/yz_l30s48xj6m_0059b_2twr0000gn/T//RtmpQnvUOt/file98194322b8" 203 | #' } 204 | #' @name create_bulk_upload_file 205 | NULL 206 | 207 | #' @rdname create_bulk_upload_file 208 | create_bulk_upload_file <- function(metadata, df = NULL) { 209 | if (!is.null(df)) { 210 | jsonlite::stream_out(df, file(temp_filename1 <- tempfile()), POSIXt = "ISO8601", 211 | pagesize = 1000, verbose = FALSE, digits = 8, always_decimal = TRUE) 212 | 213 | json_data <- readLines(temp_filename1) 214 | file.remove(temp_filename1) 215 | 216 | final_upload_data <- rep(NA, length(json_data) * 2) 217 | idx_even <- seq(2, length(json_data) * 2, 2) 218 | idx_odd <- seq(1, length(json_data) * 2, 2) 219 | final_upload_data[idx_odd] <- metadata 220 | final_upload_data[idx_even] <- json_data 221 | 222 | writeLines(final_upload_data, file(temp_filename2 <- tempfile())) 223 | } else { 224 | writeLines(metadata, file(temp_filename2 <- tempfile())) 225 | } 226 | 227 | closeAllConnections() 228 | temp_filename2 229 | } 230 | 231 | #' @rdname create_bulk_upload_file 232 | create_bulk_delete_file <- function(metadata) { 233 | writeLines(metadata, file(temp_filename <- tempfile())) 234 | closeAllConnections() 235 | temp_filename 236 | } 237 | 238 | 239 | #' Index data frame with Elasticsearch Bulk API 240 | #' 241 | #' Helper function to orchestrate the assembly of the Bulk API upload file, the HTTP request to 242 | #' Elasticsearch and the handling of any subsequent response errors.
Its primary purpose is to be called 243 | #' repeatedly on 'chunks' of a data frame that is too big to be indexed with a single call to the 244 | #' Bulk API (and hence the split into smaller, more manageable chunks). 245 | #' 246 | #' @param rescource An \code{elastic_rescource} object that contains the information on the 247 | #' Elasticsearch cluster, index and document type, where the indexed data will reside. If this does 248 | #' not already exist, it will be created automatically. 249 | #' @param df data.frame whose rows will be indexed as documents in the Elasticsearch cluster. 250 | #' @return NULL 251 | #' 252 | #' @examples 253 | #' \dontrun{ 254 | #' rescource <- elastic("http://localhost:9200", "iris", "data") 255 | #' index_bulk_dataframe(rescource, iris) 256 | #' } 257 | index_bulk_dataframe <- function(rescource, df) { 258 | has_ids <- "id" %in% colnames(df) 259 | num_docs <- nrow(df) 260 | 261 | if (has_ids) { 262 | metadata <- create_metadata("index", rescource$index, rescource$doc_type, df$id) 263 | } else { 264 | metadata <- create_metadata("index", rescource$index, rescource$doc_type, n = num_docs) 265 | } 266 | 267 | bulk_data_file <- create_bulk_upload_file(metadata, df) 268 | response <- httr::PUT(url = rescource$cluster_url, 269 | path = "/_bulk", 270 | body = httr::upload_file(bulk_data_file), 271 | httr::add_headers("Content-Type" = "application/json")) 272 | 273 | file.remove(bulk_data_file) 274 | if (httr::status_code(response) == 200 & !httr::content(response)$errors) { 275 | message("...", appendLF = FALSE) 276 | } else if (httr::content(response)$errors) { 277 | messages <- httr::content(response)$items 278 | warning(jsonlite::prettify(httr::content(response, as = "text"))) 279 | } else { 280 | check_http_code_throw_error(response) 281 | } 282 | 283 | NULL 284 | } 285 | 286 | 287 | #' Execute query with from-size search API.
288 | #' 289 | #' The from-size search API allows a maximum of 10,000 search results (the maximum 'size') to be 290 | #' returned in one call to the API. The 'from' in the name of the API refers to where in the order 291 | #' of all qualifying documents (as ordered by their search score) results should start to be 292 | #' returned from. Anything larger than 10,000 and the results need to be fetched using the 293 | #' scroll-search API (which is slower as it involves making multiple call-back requests). This API 294 | #' is particularly well suited to returning aggregation results. 295 | #' 296 | #' @param rescource An \code{elastic} rescource object describing the documents on which the query 297 | #' is to be executed. 298 | #' @param api_call_payload A character string containing the JSON payload that describes the query 299 | #' to be executed. 300 | #' @return A data.frame of documents returned from the query. 301 | #' 302 | #' @seealso \url{https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-from-size.html} 303 | #' for more information on what is required by the Elasticsearch from-size API.
304 | #' 305 | #' @examples 306 | #' \dontrun{ 307 | #' elastic_rescource <- elastic("http://localhost:9200", "iris", "data") 308 | #' query_json <- '{"query": {"match_all": {}}}' 309 | #' results <- from_size_search(elastic_rescource, query_json) 310 | #' head(results) 311 | #' # sepal_length sepal_width petal_length petal_width species 312 | #' # 1 4.8 3.0 1.4 0.1 setosa 313 | #' # 2 4.3 3.0 1.1 0.1 setosa 314 | #' # 3 5.8 4.0 1.2 0.2 setosa 315 | #' # 4 5.1 3.5 1.4 0.3 setosa 316 | #' # 5 5.2 3.5 1.5 0.2 setosa 317 | #' # 6 5.2 3.4 1.4 0.2 setosa 318 | #' } 319 | from_size_search <- function(rescource, api_call_payload) { 320 | 321 | response <- httr::POST(rescource$search_url, body = api_call_payload, 322 | httr::add_headers("Content-Type" = "application/json")) 323 | 324 | check_http_code_throw_error(response) 325 | 326 | parsed_response <- jsonlite::fromJSON(httr::content(response, as = 'text')) 327 | if ("aggregations" %in% names(parsed_response)) { 328 | return_data <- extract_aggs_results(response) 329 | if (length(return_data) == 0) stop("empty response to request") 330 | } else { 331 | return_data <- extract_query_results(response) 332 | if (length(return_data) == 0) stop("empty response to request") 333 | } 334 | 335 | return_data 336 | } 337 | 338 | 339 | #' Execute a query with the scroll-search API. 340 | #' 341 | #' The scroll-search API works by returning a 'token' to the user that allows search results to be 342 | #' returned one 'page' at a time. Thus, large query results (in excess of the 10,000 documents 343 | #' maximum size offered by the from-size search API) can be retrieved by making multiple calls after 344 | #' the initial query was sent. Although a slower process end-to-end, this API is particularly well 345 | #' suited to returning large query results. 346 | #' 347 | #' @param rescource An \code{elastic} rescource object describing the documents on which the query 348 | #' is to be executed.
349 | #' @param api_call_payload A character string containing the JSON payload that describes the query 350 | #' to be executed. 351 | #' @param extract_function A function to be used for extracting the data from the responses sent 352 | #' back from the scroll-search API. Defaults to \code{extract_query_results}, which extracts query 353 | #' results, for when the scroll-search API is being used for retrieving query results (as opposed 354 | #' to aggregations or document ids, etc.). 355 | #' @return A data.frame of documents returned from the query. 356 | #' 357 | #' @seealso \url{https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-scroll.html} 358 | #' for more information on what is required by the Elasticsearch scroll-search API. 359 | #' 360 | #' @examples 361 | #' \dontrun{ 362 | #' elastic_rescource <- elastic("http://localhost:9200", "iris", "data") 363 | #' query_json <- '{"query": {"match_all": {}}}' 364 | #' results <- scroll_search(elastic_rescource, query_json) 365 | #' head(results) 366 | #' # sepal_length sepal_width petal_length petal_width species 367 | #' # 1 4.8 3.0 1.4 0.1 setosa 368 | #' # 2 4.3 3.0 1.1 0.1 setosa 369 | #' # 3 5.8 4.0 1.2 0.2 setosa 370 | #' # 4 5.1 3.5 1.4 0.3 setosa 371 | #' # 5 5.2 3.5 1.5 0.2 setosa 372 | #' # 6 5.2 3.4 1.4 0.2 setosa 373 | #' } 374 | scroll_search <- function(rescource, api_call_payload, extract_function = extract_query_results) { 375 | scroll_search_url <- paste0(rescource$cluster_url, "/_search/scroll") 376 | scroll_results <- list() 377 | 378 | initial_scroll_search_url <- paste0(rescource$search_url, "?size=10000&scroll=1m") 379 | initial_response <- httr::POST(initial_scroll_search_url, body = api_call_payload, 380 | httr::add_headers("Content-Type" = "application/json")) 381 | 382 | check_http_code_throw_error(initial_response) 383 | 384 | scroll_results[[1]] <- extract_function(initial_response) 385 | next_scroll_id <- httr::content(initial_response)$`_scroll_id` 386
| has_next <- TRUE 387 | n <- 2 388 | while (has_next) { 389 | message("...", appendLF = FALSE) 390 | next_api_payload <- paste0('{"scroll": "1m", "scroll_id": "', next_scroll_id, '"}') 391 | next_response <- httr::POST(scroll_search_url, body = next_api_payload, 392 | httr::add_headers("Content-Type" = "application/json")) 393 | 394 | check_http_code_throw_error(next_response) 395 | if(length(httr::content(next_response)$hits$hits) > 0) { 396 | scroll_results[[n]] <- extract_function(next_response) 397 | next_scroll_id <- httr::content(next_response)$`_scroll_id` 398 | n <- n + 1 399 | } else { 400 | has_next <- FALSE 401 | } 402 | } 403 | 404 | as.data.frame(dplyr::bind_rows(scroll_results), stringsAsFactors = FALSE) 405 | } 406 | 407 | 408 | #' Elasticsearch HTTP response data extraction functions. 409 | #' 410 | #' Functions for extracting the different types of data that can be contained in a response to a 411 | #' search API request. 412 | #' 413 | #' @name extract_query_results 414 | #' @param response An HTTP response from a response to a search API request. 415 | #' @return A data.frame of response results. 416 | NULL 417 | 418 | #' @rdname extract_query_results 419 | extract_query_results <- function(response) { 420 | df <- jsonlite::fromJSON(httr::content(response, as = 'text'))$hits$hits$`_source` 421 | if (length(df) == 0) stop("no query results returned") 422 | jsonlite::flatten(df) 423 | } 424 | 425 | #' @rdname extract_query_results 426 | extract_aggs_results <- function(response) { 427 | data <- jsonlite::fromJSON(httr::content(response, as = 'text')) 428 | # are results from a bucket aggregation or metric aggregation? 
429 | if ("buckets" %in% names(data$aggregations[[1]])) { 430 | df <- data$aggregations[[1]]$buckets 431 | } else { 432 | df <- as.data.frame(data$aggregations) 433 | } 434 | if (length(df) == 0) stop("no aggs results returned") 435 | jsonlite::flatten(df) 436 | } 437 | 438 | #' @rdname extract_query_results 439 | extract_id_results <- function(response) { 440 | ids <- jsonlite::fromJSON(httr::content(response, as = 'text'))$hits$hits$`_id` 441 | if (length(ids) == 0) stop("no ids returned") 442 | data.frame(id = ids) 443 | } 444 | 445 | 446 | #' HTTP response error handling. 447 | #' 448 | #' If an HTTP request returns a status code that is not 200 or 201, then this function throws an 449 | #' exception and prints the prettified response contents to stderr. 450 | #' 451 | #' @param response An HTTP response from a search API request. 452 | #' @return Exception with prettified JSON response printed to stderr. 453 | check_http_code_throw_error <- function(response) { 454 | response_code <- httr::status_code(response) 455 | if (!(response_code %in% c(200, 201))) { 456 | stop(paste("Elasticsearch returned a status code of", httr::status_code(response), "\n"), 457 | jsonlite::prettify(httr::content(response, as = "text"))) 458 | } 459 | } 460 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | [![Build Status](https://travis-ci.org/AlexIoannides/elasticsearchr.svg?branch=master)](https://travis-ci.org/AlexIoannides/elasticsearchr) [![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/AlexIoannides/elasticsearchr?branch=master&svg=true)](https://ci.appveyor.com/project/AlexIoannides/elasticsearchr) [![cran version](http://www.r-pkg.org/badges/version/elasticsearchr)](https://cran.r-project.org/package=elasticsearchr) [![rstudio mirror
downloads](http://cranlogs.r-pkg.org/badges/grand-total/elasticsearchr)](https://github.com/metacran/cranlogs.app) 3 | 4 | - built and tested using Elasticsearch v2.x, v5.x, v6.x and v7.x. 5 | 6 | ![][esr_img] 7 | 8 | # elasticsearchr: a Lightweight Elasticsearch Client for R 9 | 10 | [Elasticsearch][es] is a distributed [NoSQL][nosql] document store search-engine and [column-oriented database][es_column], whose **fast** (near real-time) reads and powerful aggregation engine make it an excellent choice as an 'analytics database' for R&D, production-use or both. Installation is simple, it ships with sensible default settings that allow it to work effectively out-of-the-box, and all interaction is made via a set of intuitive and extremely [well documented][es_docs] [RESTful][restful] APIs. I've been using it for two years now and I am evangelical. 11 | 12 | The `elasticsearchr` package implements a simple Domain-Specific Language (DSL) for indexing, deleting, querying, sorting and aggregating data in Elasticsearch, from within R. The main purpose of this package is to remove the labour involved with assembling HTTP requests to Elasticsearch's REST APIs and processing the responses. Instead, users of this package need only send and receive data frames to Elasticsearch resources. Users needing richer functionality are encouraged to investigate the excellent `elastic` package from the good people at [rOpenSci][ropensci]. 13 | 14 | This package is available on [CRAN][cran] or from [this GitHub repository][githubrepo]. To install the latest development version from GitHub, make sure that you have the `devtools` package installed (this comes bundled with RStudio), and then execute the following on the R command line: 15 | 16 | ```r 17 | devtools::install_github("alexioannides/elasticsearchr") 18 | ``` 19 | 20 | ## Installing Elasticsearch 21 | 22 | Elasticsearch can be downloaded [here][es_download], where the instructions for installing and starting it can also be found. 
OS X users (such as myself) can also make use of [Homebrew][homebrew] to install it with the command, 23 | 24 | ```bash 25 | $ brew install elasticsearch 26 | ``` 27 | 28 | And then start it by executing `$ elasticsearch` from within any Terminal window. Successful installation and start-up can be checked by navigating any web browser to `http://localhost:9200`, where the following message should greet you (give or take the cluster name that changes with every restart), 29 | 30 | ```js 31 | { 32 | "name" : "RF6t1Gr", 33 | "cluster_name" : "elasticsearch", 34 | "cluster_uuid" : "pag7iIG-TK271EahH0B0yA", 35 | "version" : { 36 | "number" : "6.1.1", 37 | "build_hash" : "bd92e7f", 38 | "build_date" : "2017-12-17T20:23:25.338Z", 39 | "build_snapshot" : false, 40 | "lucene_version" : "7.1.0", 41 | "minimum_wire_compatibility_version" : "5.6.0", 42 | "minimum_index_compatibility_version" : "5.0.0" 43 | }, 44 | "tagline" : "You Know, for Search" 45 | } 46 | ``` 47 | 48 | ## Elasticsearch 101 49 | 50 | If you followed the installation steps above, you have just installed a single Elasticsearch 'node'. When **not** testing on your laptop, Elasticsearch usually comes in clusters of nodes (usually there are at least 3). The easiest way to get access to a managed Elasticsearch cluster is by using the [Elastic Cloud][es_cloud] managed service provided by [Elastic][elastic] (note that Amazon Web Services offer something similar too). For the rest of this brief tutorial I will assume you're running a single node on your laptop (a great way of working with data that is too big for memory). 51 | 52 | In Elasticsearch a 'row' of data is stored as a 'document'.
A document is a [JSON][json] object - for example, the first row of R's `iris` dataset, 53 | 54 | ```r 55 | # sepal_length sepal_width petal_length petal_width species 56 | # 1 5.1 3.5 1.4 0.2 setosa 57 | ``` 58 | 59 | would be represented as follows using JSON, 60 | 61 | ```js 62 | { 63 | "sepal_length": 5.1, 64 | "sepal_width": 3.5, 65 | "petal_length": 1.4, 66 | "petal_width": 0.2, 67 | "species": "setosa" 68 | } 69 | ``` 70 | 71 | Documents are classified into 'types' and stored in an 'index'. In a crude analogy with traditional SQL databases that is often used, we would associate an index with a database instance and the document types as tables within that database. In practice this example is not accurate - it is better to think of all documents as residing in a single - possibly sparse - table (defined by the index), where the document types represent non-unique sub-sets of columns in the table. This is especially so as fields that occur in multiple document types (within the same index) must have the same data-type - for example, if `"name"` exists in document type `customer` as well as in document type `address`, then `"name"` will need to be a `string` in both. Note that 'types' are being slowly phased out and in Elasticsearch v7.x there will only be indices. 72 | 73 | Each document is considered a 'resource' that has a Uniform Resource Locator (URL) associated with it. Elasticsearch URLs all have the following format: `http://your_cluster:9200/your_index/your_doc_type/your_doc_id`. For example, the above `iris` document could be living at `http://localhost:9200/iris/data/1` - you could even point a web browser to this location and investigate the document's contents. 74 | 75 | Although Elasticsearch - like most NoSQL databases - is often referred to as being 'schema free', as we have already seen this is not entirely correct.
What is true, however, is that the schema - or 'mapping' as it's called in Elasticsearch - does not _need_ to be declared up-front (although you certainly can do this). Elasticsearch is more than capable of guessing the types of fields based on new data indexed for the first time. 76 | 77 | For more information on any of these basic concepts take a look [here][basic_concepts]. 78 | 79 | ## `elasticsearchr`: a Quick Start 80 | 81 | `elasticsearchr` is a **lightweight** client - by this I mean that it only aims to do 'just enough' work to make using Elasticsearch with R easy and intuitive. You will still need to read the [Elasticsearch documentation][es_docs] to understand how to compose queries and aggregations. What follows is a quick summary of what is possible. 82 | 83 | ### Elasticsearch Data Resources 84 | 85 | Elasticsearch resources, as defined by the URLs described above, are defined as `elastic` objects in `elasticsearchr`. For example, 86 | 87 | ```r 88 | es <- elastic("http://localhost:9200", "iris", "data") 89 | ``` 90 | 91 | Refers to documents of type 'data' in the 'iris' index located on an Elasticsearch node on my laptop. Note that: 92 | - it is possible to leave the document type empty if you need to refer to all documents in an index; and, 93 | - `elastic` objects can be defined even if the underlying resources have yet to be brought into existence. 94 | 95 | ### Indexing New Data 96 | 97 | To index (insert) data from a data frame, use the `%index%` operator as follows: 98 | 99 | ```r 100 | elastic("http://localhost:9200", "iris", "data") %index% iris 101 | ``` 102 | 103 | In this example, the `iris` dataset is indexed into the 'iris' index and given a document type called 'data'. Note that I have not provided any document ids here. **To explicitly specify document ids there must be a column in the data frame that is labelled `id`**, from which the document ids will be taken.
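As a minimal sketch (assuming a local cluster is running), explicit document ids can be supplied by adding an `id` column before indexing - the ids below are constructed purely for illustration:

```r
df <- iris
df$id <- seq_len(nrow(df))  # illustrative ids: 1, 2, 3, ...

elastic("http://localhost:9200", "iris", "data") %index% df
```

With ids assigned this way, the first row of `iris` would be addressable at `http://localhost:9200/iris/data/1`, per the URL format described above.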
104 | 105 | ### Deleting Data 106 | 107 | Documents can be deleted in three different ways using the `%delete%` operator. Firstly, an entire index (including the mapping information) can be erased by referencing just the index in the resource - e.g., 108 | 109 | ```r 110 | elastic("http://localhost:9200", "iris") %delete% TRUE 111 | ``` 112 | 113 | Alternatively, documents can be deleted on a type-by-type basis leaving the index and its mappings untouched, by referencing both the index and the document type as the resource - e.g., 114 | 115 | ```r 116 | elastic("http://localhost:9200", "iris", "data") %delete% TRUE 117 | ``` 118 | 119 | Finally, specific documents can be deleted by referencing their ids directly - e.g., 120 | 121 | ```r 122 | elastic("http://localhost:9200", "iris", "data") %delete% c("1", "2", "3", "4", "5") 123 | ``` 124 | 125 | ### Queries 126 | 127 | Any type of query that Elasticsearch makes available can be defined in a `query` object using the native Elasticsearch JSON syntax - e.g. to match every document we could use the `match_all` query, 128 | 129 | ```r 130 | for_everything <- query('{ 131 | "match_all": {} 132 | }') 133 | ``` 134 | 135 | To execute this query we use the `%search%` operator on the appropriate resource - e.g., 136 | 137 | ```r 138 | elastic("http://localhost:9200", "iris", "data") %search% for_everything 139 | 140 | # sepal_length sepal_width petal_length petal_width species 141 | # 1 4.9 3.0 1.4 0.2 setosa 142 | # 2 4.9 3.1 1.5 0.1 setosa 143 | # 3 5.8 4.0 1.2 0.2 setosa 144 | # 4 5.4 3.9 1.3 0.4 setosa 145 | # 5 5.1 3.5 1.4 0.3 setosa 146 | # 6 5.4 3.4 1.7 0.2 setosa 147 | # ... 148 | ``` 149 | 150 | #### Selecting a Subset of Fields to Return 151 | 152 | Sometimes only a subset of all the available fields needs to be returned, so it is much more efficient for Elasticsearch only to return the required data as opposed to all of it.
This can be achieved as follows, 153 | 154 | ```r 155 | selected_fields <- select_fields('{ 156 | "includes": ["sepal_length", "species"] 157 | }') 158 | 159 | elastic("http://localhost:9200", "iris", "data") %search% (for_everything + selected_fields) 160 | 161 | # species sepal_length 162 | # 1 setosa 4.3 163 | # 2 setosa 5.7 164 | # 3 setosa 5.1 165 | # 4 setosa 5.1 166 | # 5 setosa 4.8 167 | # 6 setosa 5.0 168 | ``` 169 | 170 | The selected fields are defined using Elasticsearch's [source filtering API][source_filtering]. 171 | 172 | #### Sorting Query Results 173 | 174 | Query results can be sorted on multiple fields by defining a `sort` object using the same Elasticsearch JSON syntax - e.g. to sort by `sepal_width` in ascending order the required `sort` object would be defined as, 175 | 176 | ```r 177 | by_sepal_width <- sort_on('{"sepal_width": {"order": "asc"}}') 178 | ``` 179 | 180 | This is then added to a `query` object whose results we want sorted and executed using the `%search%` operator as before - e.g., 181 | 182 | ```r 183 | elastic("http://localhost:9200", "iris", "data") %search% (for_everything + by_sepal_width) 184 | 185 | # sepal_length sepal_width petal_length petal_width species 186 | # 1 5.0 2.0 3.5 1.0 versicolor 187 | # 2 6.0 2.2 5.0 1.5 virginica 188 | # 3 6.0 2.2 4.0 1.0 versicolor 189 | # 4 6.2 2.2 4.5 1.5 versicolor 190 | # 5 4.5 2.3 1.3 0.3 setosa 191 | # 6 6.3 2.3 4.4 1.3 versicolor 192 | # ... 193 | ``` 194 | 195 | ### Aggregations 196 | 197 | Similarly, any type of aggregation that Elasticsearch makes available can be defined in an `aggs` object - e.g. 
to compute the average `sepal_width` per-species of flower we would specify the following aggregation, 198 | 199 | ```r 200 | avg_sepal_width <- aggs('{ 201 | "avg_sepal_width_per_species": { 202 | "terms": { 203 | "field": "species", 204 | "size": 3 205 | }, 206 | "aggs": { 207 | "avg_sepal_width": { 208 | "avg": { 209 | "field": "sepal_width" 210 | } 211 | } 212 | } 213 | } 214 | }') 215 | ``` 216 | 217 | _(Elasticsearch 5.x and 6.x users please note that when using the out-of-the-box mappings the above aggregation requires that `"field": "species"` be changed to `"field": "species.keyword"` - see [here][es_five_mappings] for more information as to why)_ 218 | 219 | This aggregation is also executed via the `%search%` operator on the appropriate resource - e.g., 220 | 221 | ```r 222 | elastic("http://localhost:9200", "iris", "data") %search% avg_sepal_width 223 | 224 | # key doc_count avg_sepal_width.value 225 | # 1 setosa 50 3.428 226 | # 2 versicolor 50 2.770 227 | # 3 virginica 50 2.974 228 | ``` 229 | 230 | Queries and aggregations can be combined such that the aggregations are computed on the results of the query. 
For example, to execute the combination of the above query and aggregation, we would execute, 231 | 232 | ```r 233 | elastic("http://localhost:9200", "iris", "data") %search% (for_everything + avg_sepal_width) 234 | 235 | # key doc_count avg_sepal_width.value 236 | # 1 setosa 50 3.428 237 | # 2 versicolor 50 2.770 238 | # 3 virginica 50 2.974 239 | ``` 240 | 241 | where the combination yields, 242 | 243 | ```r 244 | print(for_everything + avg_sepal_width) 245 | 246 | # { 247 | # "size": 0, 248 | # "query": { 249 | # "match_all": { 250 | # 251 | # } 252 | # }, 253 | # "aggs": { 254 | # "avg_sepal_width_per_species": { 255 | # "terms": { 256 | # "field": "species", 257 | # "size": 0 258 | # }, 259 | # "aggs": { 260 | # "avg_sepal_width": { 261 | # "avg": { 262 | # "field": "sepal_width" 263 | # } 264 | # } 265 | # } 266 | # } 267 | # } 268 | # } 269 | ``` 270 | 271 | For comprehensive coverage of all query and aggregations types please refer to the rather excellent [official documentation][es_docs] (newcomers to Elasticsearch are advised to start with the 'Query String' query). 272 | 273 | ### Mappings 274 | 275 | We have also included the ability to create an empty index with a custom mapping, using the `%create%` operator - e.g., 276 | 277 | ```r 278 | elastic("http://localhost:9200", "iris") %create% mapping_default_simple() 279 | ``` 280 | 281 | Where in this instance `mapping_default_simple()` is a default mapping that I have shipped with `elasticsearchr`. It switches-off the text analyser for all fields of type 'string' (i.e. switches off free text search), allows all text search to work with case-insensitive lower-case terms, and maps any field with the name 'timestamp' to type 'date', so long as it has the appropriate string or long format. 282 | 283 | ### Cluster and Index Information 284 | 285 | We have also added the ability to retrieve basic information from the cluster, using the `%info%` operator. 
For example, to retrieve a list of all available indices in the cluster, 286 | 287 | ```r 288 | elastic("http://localhost:9200", "*") %info% list_indices() 289 | 290 | # [1] "iris" 291 | ``` 292 | 293 | Or to list all of the available fields in an index, 294 | 295 | ```r 296 | elastic("http://localhost:9200", "iris") %info% list_fields() 297 | 298 | # [1] "petal_length" "petal_width" "sepal_length" "sepal_width" "species" 299 | ``` 300 | 301 | ## Acknowledgements 302 | 303 | A big thank you to Hadley Wickham and Jeroen Ooms, the authors of the `httr` and `jsonlite` packages that `elasticsearchr` leans upon _heavily_. And, to the other contributors and supporters - your efforts are greatly appreciated! 304 | 305 | 306 | [esr_img]: https://alexioannides.github.io/images/r/elasticsearchr/elasticsearchr2.png "Elasticsearchr" 307 | 308 | [elastic]: https://www.elastic.co "Elastic corp." 309 | 310 | [es]: https://www.elastic.co/products/elasticsearch "Elasticsearch" 311 | 312 | [es_column]: https://www.elastic.co/blog/elasticsearch-as-a-column-store "Elasticsearch as a Column Store" 313 | 314 | [cran]: https://cran.r-project.org/package=elasticsearchr "elasticsearchr on CRAN" 315 | 316 | [githubrepo]: https://github.com/AlexIoannides/elasticsearchr "Alex's GitHub repository" 317 | 318 | [githubissues]: https://github.com/AlexIoannides/elasticsearchr/issues "elasticsearchr issues" 319 | 320 | [es_download]: https://www.elastic.co/downloads/elasticsearch "Download" 321 | 322 | [nosql]: https://en.wikipedia.org/wiki/NoSQL "What is NoSQL?" 323 | 324 | [es_docs]: https://www.elastic.co/guide/en/elasticsearch/reference/current/index.html "Elasticsearch documentation" 325 | 326 | [restful]: https://en.wikipedia.org/wiki/Representational_state_transfer "RESTful?"
327 | 328 | [ropensci]: https://github.com/ropensci/elastic "rOpenSci" 329 | 330 | [homebrew]: http://brew.sh/ "Homebrew for OS X" 331 | 332 | [es_cloud]: https://www.elastic.co/cloud/as-a-service "Elastic Cloud" 333 | 334 | [json]: https://en.wikipedia.org/wiki/JSON "JSON" 335 | 336 | [basic_concepts]: https://www.elastic.co/guide/en/elasticsearch/reference/current/elasticsearch-intro.html "Basic Concepts" 337 | 338 | [source_filtering]: https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-source-filtering.html "Source Filters" 339 | 340 | [es_five_mappings]: https://www.elastic.co/guide/en/elasticsearch/reference/5.0/breaking_50_mapping_changes.html "Text fields in Elasticsearch 5.x" 341 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | # DO NOT CHANGE the "init" and "install" sections below 2 | 3 | # Download script file from GitHub 4 | init: 5 | ps: | 6 | $ErrorActionPreference = "Stop" 7 | Invoke-WebRequest http://raw.github.com/krlmlr/r-appveyor/master/scripts/appveyor-tool.ps1 -OutFile "..\appveyor-tool.ps1" 8 | Import-Module '..\appveyor-tool.ps1' 9 | 10 | install: 11 | ps: Bootstrap 12 | 13 | # Adapt as necessary starting from here 14 | 15 | build_script: 16 | - travis-tool.sh install_deps 17 | 18 | test_script: 19 | - travis-tool.sh run_tests 20 | 21 | on_failure: 22 | - 7z a failure.zip *.Rcheck\* 23 | - appveyor PushArtifact failure.zip 24 | 25 | artifacts: 26 | - path: '*.Rcheck\**\*.log' 27 | name: Logs 28 | 29 | - path: '*.Rcheck\**\*.out' 30 | name: Logs 31 | 32 | - path: '*.Rcheck\**\*.fail' 33 | name: Logs 34 | 35 | - path: '*.Rcheck\**\*.Rout' 36 | name: Logs 37 | 38 | - path: '\*_*.tar.gz' 39 | name: Bits 40 | 41 | - path: '\*_*.zip' 42 | name: Bits 43 | -------------------------------------------------------------------------------- /cran-comments.md: 
-------------------------------------------------------------------------------- 1 | ## Test environments 2 | 3 | * local OS X install, R 3.6.1 4 | * Ubuntu 14.04.5 LTS (on travis-ci), R version 3.6.1 (2017-01-27) 5 | * Windows Server 2012 R2 x64 (build 9600) (on AppVeyor), R version 3.6.1 Patched (2019-07-24 r76883) 6 | 7 | ## R CMD check results 8 | 9 | 0 errors | 0 warnings | 0 notes 10 | 11 | ## Reverse dependencies 12 | 13 | There are currently no reverse dependencies. 14 | -------------------------------------------------------------------------------- /elasticsearchr.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: knitr 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace,vignette 22 | -------------------------------------------------------------------------------- /man/aggs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/api.R 3 | \name{aggs} 4 | \alias{aggs} 5 | \title{Define Elasticsearch aggregation.} 6 | \usage{ 7 | aggs(json) 8 | } 9 | \arguments{ 10 | \item{json}{JSON object describing the aggregation that needs to be executed.} 11 | } 12 | \value{ 13 | An \code{elastic_aggs} object. 14 | } 15 | \description{ 16 | Define Elasticsearch aggregation.
17 | } 18 | \examples{ 19 | avg_sepal_width_per_cat <- aggs('{"avg_sepal_width_per_cat": { 20 | "terms": {"field": "species"}, 21 | "aggs": {"avg_sepal_width": {"avg": {"field": "sepal_width"}}}} 22 | }') 23 | } 24 | \seealso{ 25 | \url{https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations.html} 26 | } 27 | -------------------------------------------------------------------------------- /man/check_http_code_throw_error.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{check_http_code_throw_error} 4 | \alias{check_http_code_throw_error} 5 | \title{HTTP response error handling.} 6 | \usage{ 7 | check_http_code_throw_error(response) 8 | } 9 | \arguments{ 10 | \item{response}{An HTTP response from a search API request.} 11 | } 12 | \value{ 13 | Exception with prettified JSON response printed to stderr. 14 | } 15 | \description{ 16 | If an HTTP request returns a status code that is not 200, then this function throws an 17 | exception and prints the prettified response contents to stderr. 18 | } 19 | -------------------------------------------------------------------------------- /man/cleaned_field_names.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{cleaned_field_names} 4 | \alias{cleaned_field_names} 5 | \title{Sanitise column names.} 6 | \usage{ 7 | cleaned_field_names(colnames) 8 | } 9 | \arguments{ 10 | \item{colnames}{A character vector containing data frame column names.} 11 | } 12 | \value{ 13 | A character vector with 'clean' column names. 14 | } 15 | \description{ 16 | Convert data frame column names into an Elasticsearch-compatible format.
17 | } 18 | \details{ 19 | Elasticsearch will not ingest field names with periods ("."), such as "Sepal.Width", as these 20 | are reserved for nested objects (in the JSON sense). This function replaces all periods with 21 | underscores ("_") and then converts everything to lowercase for simplicity. 22 | } 23 | \examples{ 24 | \dontrun{ 25 | df <- iris 26 | colnames(df) <- cleaned_field_names(colnames(df)) 27 | colnames(df) 28 | # "sepal_length" "sepal_width" "petal_length" "petal_width" "species" 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /man/create_bulk_upload_file.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{create_bulk_upload_file} 4 | \alias{create_bulk_upload_file} 5 | \alias{create_bulk_delete_file} 6 | \title{Create Bulk API data file.} 7 | \usage{ 8 | create_bulk_upload_file(metadata, df = NULL) 9 | 10 | create_bulk_delete_file(metadata) 11 | } 12 | \arguments{ 13 | \item{metadata}{A character vector of Bulk API document information objects, as generated by 14 | \code{create_metadata(...)}.} 15 | 16 | \item{df}{[optional] A data.frame with data for indexing or updating.} 17 | } 18 | \value{ 19 | The name of the temporary file containing the data for the Elasticsearch Bulk API. 20 | } 21 | \description{ 22 | The fastest way to index, delete or update many documents is via the Bulk API. This function 23 | assembles a text file comprising data and/or actions in the format required by the Bulk API. 24 | This is ready to be POSTed to the Bulk API.
25 | } 26 | \examples{ 27 | \dontrun{ 28 | bulk_upload_info <- create_metadata("index", "iris", "data", n = nrow(iris)) 29 | create_bulk_upload_file(bulk_upload_info, iris) 30 | # "/var/folders/__/yz_l30s48xj6m_0059b_2twr0000gn/T//RtmpQnvUOt/file98194322b8" 31 | 32 | bulk_delete_info <- create_metadata("delete", "iris", "data", n = nrow(iris)) 33 | create_bulk_delete_file(bulk_delete_info) 34 | # "/var/folders/__/yz_l30s48xj6m_0059b_2twr0000gn/T//RtmpQnvUOt/file98194322b8" 35 | } 36 | } 37 | \seealso{ 38 | \url{https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-bulk.html} 39 | for more information on the information required by the Elasticsearch Bulk API. 40 | } 41 | -------------------------------------------------------------------------------- /man/create_metadata.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{create_metadata} 4 | \alias{create_metadata} 5 | \title{Create Bulk API metadata.} 6 | \usage{ 7 | create_metadata(action, index, doc_type, id = NULL, n = NULL) 8 | } 9 | \arguments{ 10 | \item{action}{One of: "index", "create", "update" or "delete".} 11 | 12 | \item{index}{The name of the index where the documents reside (or will reside).} 13 | 14 | \item{doc_type}{The name of the document type where the documents reside (or will reside).} 15 | 16 | \item{id}{[optional] Character vector of document ids.} 17 | 18 | \item{n}{[optional] Integer number of repeated metadata description objects that need to be 19 | returned (if \code{id} is not specified).} 20 | } 21 | \value{ 22 | A character vector of Bulk API document information objects. 23 | } 24 | \description{ 25 | The fastest way to index, delete or update many documents is via the Bulk API. This requires 26 | that each document have the action combined with the document's metadata (index, type and id) 27 | sent to the API.
This information is encapsulated as a JSON object, which this function is 28 | responsible for generating. 29 | } 30 | \examples{ 31 | \dontrun{ 32 | create_metadata("index", "iris", "data", n = 2) 33 | '{\\"index\\": {\\"_index\\": \\"iris\\", \\"_type\\": \\"data\\"}}' 34 | '{\\"index\\": {\\"_index\\": \\"iris\\", \\"_type\\": \\"data\\"}}' 35 | } 36 | } 37 | \seealso{ 38 | \url{https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-bulk.html} 39 | for more information on the information required by the Elasticsearch Bulk API. 40 | } 41 | -------------------------------------------------------------------------------- /man/elastic.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/api.R 3 | \name{elastic} 4 | \alias{elastic} 5 | \title{elastic_resource class constructor.} 6 | \usage{ 7 | elastic(cluster_url, index, doc_type = NULL) 8 | } 9 | \arguments{ 10 | \item{cluster_url}{URL to the Elastic cluster.} 11 | 12 | \item{index}{The name of an index on the Elasticsearch cluster.} 13 | 14 | \item{doc_type}{[optional] The name of a document type within the index.} 15 | } 16 | \value{ 17 | An \code{elastic_rescource} object. 18 | } 19 | \description{ 20 | Objects of this class contain all of the information required to locate documents in an 21 | Elasticsearch cluster.
22 | } 23 | \examples{ 24 | \dontrun{ 25 | my_data <- elastic("http://localhost:9200", "iris", "data") 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /man/elastic_predicates.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/api.R 3 | \name{elastic_predicates} 4 | \alias{elastic_predicates} 5 | \alias{is_elastic} 6 | \alias{is_elastic_rescource} 7 | \alias{is_elastic_api} 8 | \alias{is_elastic_query} 9 | \alias{is_elastic_aggs} 10 | \alias{is_elastic_sort} 11 | \alias{is_elastic_source_filter} 12 | \alias{is_elastic_info} 13 | \title{elasticsearchr predicate functions.} 14 | \usage{ 15 | is_elastic(x) 16 | 17 | is_elastic_rescource(x) 18 | 19 | is_elastic_api(x) 20 | 21 | is_elastic_query(x) 22 | 23 | is_elastic_aggs(x) 24 | 25 | is_elastic_sort(x) 26 | 27 | is_elastic_source_filter(x) 28 | 29 | is_elastic_info(x) 30 | } 31 | \arguments{ 32 | \item{x}{An elasticsearchr object.} 33 | } 34 | \value{ 35 | Boolean. 36 | } 37 | \description{ 38 | Predicate functions for identifying different elasticsearchr object types. 39 | } 40 | -------------------------------------------------------------------------------- /man/elastic_version.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{elastic_version} 4 | \alias{elastic_version} 5 | \title{Elasticsearch version} 6 | \usage{ 7 | elastic_version(url) 8 | } 9 | \arguments{ 10 | \item{url}{A valid URL to an Elasticsearch cluster.} 11 | } 12 | \value{ 13 | A list with the \code{major}, \code{minor} and \code{build} numbers. 14 | } 15 | \description{ 16 | Returns the major, minor and build version numbers for an Elasticsearch cluster, given a valid 17 | URL to an Elasticsearch cluster. 
18 | } 19 | \examples{ 20 | \dontrun{ 21 | elastic_version("http://localhost:9200") 22 | $major 23 | [1] 5 24 | 25 | $minor 26 | [1] 0 27 | 28 | $build 29 | [1] 1 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /man/elasticsearchr.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/elasticsearchr.R 3 | \docType{package} 4 | \name{elasticsearchr} 5 | \alias{elasticsearchr} 6 | \alias{elasticsearchr-package} 7 | \title{elasticsearchr: a lightweight Elasticsearch client for R.} 8 | \description{ 9 | Allows you to index, update and delete documents as well as run queries and aggregations. 10 | } 11 | -------------------------------------------------------------------------------- /man/extract_query_results.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{extract_query_results} 4 | \alias{extract_query_results} 5 | \alias{extract_aggs_results} 6 | \alias{extract_id_results} 7 | \title{Elasticsearch HTTP response data extraction functions.} 8 | \usage{ 9 | extract_query_results(response) 10 | 11 | extract_aggs_results(response) 12 | 13 | extract_id_results(response) 14 | } 15 | \arguments{ 16 | \item{response}{An HTTP response from a search API request.} 17 | } 18 | \value{ 19 | A data.frame of response results. 20 | } 21 | \description{ 22 | Functions for extracting the different types of data that can be contained in a response to a 23 | search API request.
24 | } 25 | -------------------------------------------------------------------------------- /man/from_size_search.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{from_size_search} 4 | \alias{from_size_search} 5 | \title{Execute query with from-size search API.} 6 | \usage{ 7 | from_size_search(rescource, api_call_payload) 8 | } 9 | \arguments{ 10 | \item{rescource}{An \code{elastic} resource object describing the documents on which the query is to 11 | be executed.} 12 | 13 | \item{api_call_payload}{A character string containing the JSON payload that describes the query 14 | to be executed.} 15 | } 16 | \value{ 17 | A data.frame of documents returned from the query. 18 | } 19 | \description{ 20 | The from-size search API allows a maximum of 10,000 search results (the maximum 'size') to be 21 | returned in one call to the API. The 'from' in the name of the API refers to where in the order 22 | of all qualifying documents (as ordered by their search score) results should start to be 23 | returned from. Anything larger than 10,000 needs to be fetched using the 24 | scroll-search API (which is slower as it involves making multiple call-back requests). This API 25 | is particularly well suited to returning aggregation results.
26 | } 27 | \examples{ 28 | \dontrun{ 29 | elastic_rescource <- elastic("http://localhost:9200", "iris", "data") 30 | query_json <- '{"query": {"match_all": {}}}' 31 | results <- from_size_search(elastic_rescource, query_json) 32 | head(results) 33 | # sepal_length sepal_width petal_length petal_width species 34 | # 1 4.8 3.0 1.4 0.1 setosa 35 | # 2 4.3 3.0 1.1 0.1 setosa 36 | # 3 5.8 4.0 1.2 0.2 setosa 37 | # 4 5.1 3.5 1.4 0.3 setosa 38 | # 5 5.2 3.5 1.5 0.2 setosa 39 | # 6 5.2 3.4 1.4 0.2 setosa 40 | } 41 | } 42 | \seealso{ 43 | \url{https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-from-size.html} 44 | for more information on the information required by the Elasticsearch from-size API. 45 | } 46 | -------------------------------------------------------------------------------- /man/grapes-create-grapes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/api.R 3 | \name{\%create\%} 4 | \alias{\%create\%} 5 | \title{Create Elasticsearch index with custom mapping.} 6 | \usage{ 7 | rescource \%create\% mapping 8 | } 9 | \arguments{ 10 | \item{rescource}{An \code{elastic_rescource} object that contains the information on the 11 | Elasticsearch cluster, index and document type, where the indexed data will reside. If this does 12 | not already exist, it will be created automatically.} 13 | 14 | \item{mapping}{A JSON object containing the mapping details required for the index.} 15 | } 16 | \description{ 17 | Mappings are the closest concept to traditional database 'schema'. This function allows the 18 | creation of Elasticsearch indices with custom mappings. If left unspecified, Elasticsearch will 19 | infer the type of each field based on the first document indexed.
20 | } 21 | \examples{ 22 | \dontrun{ 23 | elastic("http://localhost:9200", "iris", "data") \%create\% mapping_default_simple() 24 | } 25 | } 26 | \seealso{ 27 | \url{https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping.html} 28 | } 29 | -------------------------------------------------------------------------------- /man/grapes-delete-grapes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/api.R 3 | \name{\%delete\%} 4 | \alias{\%delete\%} 5 | \title{Delete Elasticsearch index.} 6 | \usage{ 7 | rescource \%delete\% approve 8 | } 9 | \arguments{ 10 | \item{rescource}{An \code{elastic_rescource} object that contains the information on the 11 | Elasticsearch cluster, index and document type, where the indexed data will reside. If this does 12 | not already exist, it will be created automatically.} 13 | 14 | \item{approve}{Must be equal to \code{"TRUE"} for deletion of all documents in a resource, 15 | OR be a character vector of document ids if only specific documents need to be deleted.} 16 | } 17 | \description{ 18 | Delete all of the documents within a particular document type (if specified), or delete an 19 | entire index (if the document type is unspecified).
20 | } 21 | \examples{ 22 | \dontrun{ 23 | elastic("http://localhost:9200", "iris", "data") \%delete\% TRUE 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /man/grapes-index-grapes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/api.R 3 | \name{\%index\%} 4 | \alias{\%index\%} 5 | \title{Index a data frame.} 6 | \usage{ 7 | rescource \%index\% df 8 | } 9 | \arguments{ 10 | \item{rescource}{An \code{elastic_rescource} object that contains the information on the 11 | Elasticsearch cluster, index and document type, where the indexed data will reside. If this does 12 | not already exist, it will be created automatically.} 13 | 14 | \item{df}{data.frame whose rows will be indexed as documents in the Elasticsearch cluster.} 15 | } 16 | \description{ 17 | Inserting records (or documents) into Elasticsearch is referred to as 'indexing' the data. This 18 | function considers each row of a data frame as a document to be indexed into an Elasticsearch 19 | index. 20 | } 21 | \details{ 22 | If the data frame contains a column named 'id', then this will be used to assign document ids. 23 | Otherwise, Elasticsearch will automatically assign the documents random ids.
24 | } 25 | \examples{ 26 | \dontrun{ 27 | elastic("http://localhost:9200", "iris", "data") \%index\% iris 28 | } 29 | } 30 | \seealso{ 31 | \url{https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-index_.html} 32 | } 33 | -------------------------------------------------------------------------------- /man/grapes-info-grapes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/api.R 3 | \name{\%info\%} 4 | \alias{\%info\%} 5 | \title{Get cluster and index (meta) data.} 6 | \usage{ 7 | rescource \%info\% info 8 | } 9 | \arguments{ 10 | \item{rescource}{An \code{elastic_rescource} object that contains the information on the 11 | Elasticsearch cluster, index and document type, where the indexed data will reside. If this does 12 | not already exist, it will be created automatically.} 13 | 14 | \item{info}{\code{elastic_info} object.} 15 | } 16 | \description{ 17 | An operator to be used with requests for information. 18 | } 19 | \examples{ 20 | \dontrun{ 21 | elastic("http://localhost:9200", "iris", "data") \%info\% list_indices() 22 | elastic("http://localhost:9200", "iris", "data") \%info\% list_fields() 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /man/grapes-search-grapes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/api.R 3 | \name{\%search\%} 4 | \alias{\%search\%} 5 | \title{Execute query or search.} 6 | \usage{ 7 | rescource \%search\% search 8 | } 9 | \arguments{ 10 | \item{rescource}{An \code{elastic_rescource} object that contains the information on the 11 | Elasticsearch cluster, index and document type, where the indexed data will reside.
If this does 12 | not already exist, it will be created automatically.} 13 | 14 | \item{search}{\code{elastic_query} or \code{elastic_aggs} object.} 15 | } 16 | \value{ 17 | A data.frame of search or aggregation results. 18 | } 19 | \description{ 20 | Execute query or search. 21 | } 22 | \examples{ 23 | \dontrun{ 24 | results <- elastic("http://localhost:9200", "iris", "data") \%search\% query('{"match_all": {}}') 25 | head(results) 26 | # sepal_length sepal_width petal_length petal_width species 27 | # 1 4.8 3.0 1.4 0.1 setosa 28 | # 2 4.3 3.0 1.1 0.1 setosa 29 | # 3 5.8 4.0 1.2 0.2 setosa 30 | # 4 5.1 3.5 1.4 0.3 setosa 31 | # 5 5.2 3.5 1.5 0.2 setosa 32 | # 6 5.2 3.4 1.4 0.2 setosa 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /man/index_bulk_dataframe.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{index_bulk_dataframe} 4 | \alias{index_bulk_dataframe} 5 | \title{Index data frame with Elasticsearch Bulk API} 6 | \usage{ 7 | index_bulk_dataframe(rescource, df) 8 | } 9 | \arguments{ 10 | \item{rescource}{An \code{elastic_rescource} object that contains the information on the 11 | Elasticsearch cluster, index and document type, where the indexed data will reside. If this does 12 | not already exist, it will be created automatically.} 13 | 14 | \item{df}{data.frame whose rows will be indexed as documents in the Elasticsearch cluster.} 15 | } 16 | \description{ 17 | Helper function to orchestrate the assembly of the Bulk API upload file, the HTTP request to 18 | Elasticsearch and the handling of any subsequent response errors. Its primary purpose is to be called 19 | repeatedly on 'chunks' of a data frame that is too big to be indexed with a single call to the 20 | Bulk API (and hence the split into smaller, more manageable chunks).
21 | } 22 | \examples{ 23 | \dontrun{ 24 | rescource <- elastic("http://localhost:9200", "iris", "data") 25 | index_bulk_dataframe(rescource, iris) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /man/list_fields.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/api.R 3 | \name{list_fields} 4 | \alias{list_fields} 5 | \title{List of fields in index information.} 6 | \usage{ 7 | list_fields() 8 | } 9 | \value{ 10 | An \code{elastic_info} object. 11 | } 12 | \description{ 13 | List of fields in index information. 14 | } 15 | \examples{ 16 | list_fields() 17 | } 18 | -------------------------------------------------------------------------------- /man/list_indices.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/api.R 3 | \name{list_indices} 4 | \alias{list_indices} 5 | \title{List of indices in cluster information.} 6 | \usage{ 7 | list_indices() 8 | } 9 | \value{ 10 | An \code{elastic_info} object. 11 | } 12 | \description{ 13 | List of indices in cluster information. 14 | } 15 | \examples{ 16 | list_indices() 17 | } 18 | -------------------------------------------------------------------------------- /man/mapping_default_simple.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/mappings.R 3 | \name{mapping_default_simple} 4 | \alias{mapping_default_simple} 5 | \title{Simple Elasticsearch default mappings for non-text-search analytics} 6 | \usage{ 7 | mapping_default_simple() 8 | } 9 | \description{ 10 | This mapping switches off the text analyser for all fields of type 'string' (i.e.
switches off 11 | free text search), allows all text search to work with case-insensitive lowercase terms, and 12 | maps any field with the name 'timestamp' to type 'date', so long as it has the appropriate 13 | string or long format. 14 | } 15 | -------------------------------------------------------------------------------- /man/mapping_fielddata_true.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/mappings.R 3 | \name{mapping_fielddata_true} 4 | \alias{mapping_fielddata_true} 5 | \title{Elasticsearch 5.x default mappings enabling fielddata for text fields} 6 | \usage{ 7 | mapping_fielddata_true() 8 | } 9 | \description{ 10 | A default mapping that enables fielddata for all string/text fields in Elasticsearch 5.x. 11 | } 12 | -------------------------------------------------------------------------------- /man/plus-.elastic_api.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/api.R 3 | \name{+.elastic_api} 4 | \alias{+.elastic_api} 5 | \title{Define Elasticsearch aggregation on a specific subset of documents.} 6 | \usage{ 7 | \method{+}{elastic_api}(x, y) 8 | } 9 | \arguments{ 10 | \item{x}{\code{elastic_query} object.} 11 | 12 | \item{y}{\code{elastic_aggs} or \code{elastic_sort} object.} 13 | } 14 | \value{ 15 | \code{elastic_aggs} object that contains the query information required for the 16 | aggregation. 17 | } 18 | \description{ 19 | Sometimes it is necessary to perform an aggregation on the results of a query (i.e. on a subset 20 | of all the available documents). This is achieved by adding an \code{aggs} object to a 21 | \code{query} object.
22 | } 23 | \examples{ 24 | all_docs <- query('{"match_all": {}}') 25 | avg_sepal_width_per_cat <- aggs('{"avg_sepal_width_per_cat": { 26 | "terms": {"field": "species"}, 27 | "aggs": {"avg_sepal_width": {"avg": {"field": "sepal_width"}}}} 28 | }') 29 | all_docs + avg_sepal_width_per_cat 30 | 31 | sort_by_sepal_width <- sort_on('[{"sepal_width": {"order": "asc"}}]') 32 | all_docs + sort_by_sepal_width 33 | } 34 | -------------------------------------------------------------------------------- /man/print.elastic_api.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/api.R 3 | \name{print.elastic_api} 4 | \alias{print.elastic_api} 5 | \title{Pretty-print aggs and query JSON objects.} 6 | \usage{ 7 | \method{print}{elastic_api}(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{\code{elastic_query} or \code{elastic_aggs} object.} 11 | 12 | \item{...}{For consistency with all other \code{print} methods.} 13 | } 14 | \value{ 15 | Character string of pretty-printed JSON object. 16 | } 17 | \description{ 18 | Pretty-print aggs and query JSON objects. 19 | } 20 | \examples{ 21 | all_docs <- query('{"match_all": {}}') 22 | print(all_docs) 23 | } 24 | -------------------------------------------------------------------------------- /man/query.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/api.R 3 | \name{query} 4 | \alias{query} 5 | \title{Define Elasticsearch query.} 6 | \usage{ 7 | query(json, size = 0) 8 | } 9 | \arguments{ 10 | \item{json}{JSON object describing the query that needs to be executed.} 11 | 12 | \item{size}{[optional] The number of documents to return. If left unspecified, then the default 13 | is to return all documents.} 14 | } 15 | \value{ 16 | An \code{elastic_query} object.
17 | } 18 | \description{ 19 | Define Elasticsearch query. 20 | } 21 | \examples{ 22 | all_docs <- query('{"match_all": {}}') 23 | } 24 | \seealso{ 25 | \url{https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html} 26 | } 27 | -------------------------------------------------------------------------------- /man/scroll_search.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{scroll_search} 4 | \alias{scroll_search} 5 | \title{Execute a query with the scroll-search API.} 6 | \usage{ 7 | scroll_search(rescource, api_call_payload, 8 | extract_function = extract_query_results) 9 | } 10 | \arguments{ 11 | \item{rescource}{An \code{elastic} resource object describing the documents on which the query is to 12 | be executed.} 13 | 14 | \item{api_call_payload}{A character string containing the JSON payload that describes the query 15 | to be executed.} 16 | 17 | \item{extract_function}{A function to be used for extracting the data from the responses sent 18 | back from the scroll-search API. Defaults to \code{extract_query_results} that extracts query 19 | results, for when the scroll-search API is being used for retrieving query results (as opposed 20 | to aggregations or document ids, etc.).} 21 | } 22 | \value{ 23 | A data.frame of documents returned from the query. 24 | } 25 | \description{ 26 | The scroll-search API works by returning a 'token' to the user that allows search results to be 27 | returned one 'page' at a time. Thus, large query results (in excess of the 10,000-document 28 | maximum offered by the from-size search API) can be retrieved by making multiple calls after the 29 | initial query has been sent. Although a slower process end-to-end, this API is particularly well 30 | suited to returning large query results.
31 | } 32 | \examples{ 33 | \dontrun{ 34 | elastic_rescource <- elastic("http://localhost:9200", "iris", "data") 35 | query_json <- '{"query": {"match_all": {}}}' 36 | results <- scroll_search(elastic_rescource, query_json) 37 | head(results) 38 | # sepal_length sepal_width petal_length petal_width species 39 | # 1 4.8 3.0 1.4 0.1 setosa 40 | # 2 4.3 3.0 1.1 0.1 setosa 41 | # 3 5.8 4.0 1.2 0.2 setosa 42 | # 4 5.1 3.5 1.4 0.3 setosa 43 | # 5 5.2 3.5 1.5 0.2 setosa 44 | # 6 5.2 3.4 1.4 0.2 setosa 45 | } 46 | } 47 | \seealso{ 48 | \url{https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-scroll.html} 49 | for more information on the Elasticsearch scroll-search API. 50 | } 51 | -------------------------------------------------------------------------------- /man/select_fields.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/api.R 3 | \name{select_fields} 4 | \alias{select_fields} 5 | \title{Define Elasticsearch query source filter.} 6 | \usage{ 7 | select_fields(json) 8 | } 9 | \arguments{ 10 | \item{json}{JSON object describing the source filter that needs to be applied.} 11 | } 12 | \value{ 13 | An \code{elastic_source_filter} object. 14 | } 15 | \description{ 16 | Define Elasticsearch query source filter. 
17 | } 18 | \seealso{ 19 | \url{https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-source-filtering.html} 20 | } 21 | -------------------------------------------------------------------------------- /man/sort_on.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/api.R 3 | \name{sort_on} 4 | \alias{sort_on} 5 | \title{Define Elasticsearch query sort.} 6 | \usage{ 7 | sort_on(json) 8 | } 9 | \arguments{ 10 | \item{json}{JSON object describing the sorting required on the query results.} 11 | } 12 | \value{ 13 | An \code{elastic_sort} object. 14 | } 15 | \description{ 16 | Define Elasticsearch query sort. 17 | } 18 | \examples{ 19 | sort_by_key <- sort_on('[{"sort_key": {"order": "asc"}}]') 20 | } 21 | \seealso{ 22 | \url{https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-sort.html} 23 | } 24 | -------------------------------------------------------------------------------- /man/valid_connection.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{valid_connection} 4 | \alias{valid_connection} 5 | \title{Validate healthy Elasticsearch connection.} 6 | \usage{ 7 | valid_connection(url) 8 | } 9 | \arguments{ 10 | \item{url}{The URL to validate.} 11 | } 12 | \value{ 13 | Boolean 14 | } 15 | \description{ 16 | Validates healthy Elasticsearch connections by attempting to call the cluster healthcheck 17 | endpoint. In doing so, it defends against incorrect URLs to Elasticsearch clusters. Requires 18 | that URLs point directly to a master node - i.e. the endpoint that would return the default 19 | Elasticsearch message, "You Know, for Search", e.g. `http://localhost:9200`. 
20 | } 21 | \examples{ 22 | \dontrun{ 23 | url <- "http://localhost:9200" 24 | valid_connection(url) 25 | # TRUE 26 | 27 | url <- "http://localhost:9201" 28 | valid_connection(url) 29 | # Error: Failed to connect to localhost port 9201: Connection refused 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /man/valid_json.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils.R 3 | \name{valid_json} 4 | \alias{valid_json} 5 | \title{Valid JSON string predicate function} 6 | \usage{ 7 | valid_json(json) 8 | } 9 | \arguments{ 10 | \item{json}{Candidate JSON object as a string.} 11 | } 12 | \value{ 13 | Boolean. 14 | } 15 | \description{ 16 | Valid JSON string predicate function 17 | } 18 | \examples{ 19 | \dontrun{ 20 | good_json <- '{"id": 1}' 21 | valid_json(good_json) 22 | # TRUE 23 | 24 | bad_json <- '{"id": 1a}' 25 | valid_json(bad_json) 26 | # FALSE 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(elasticsearchr) 3 | 4 | test_check("elasticsearchr") 5 | -------------------------------------------------------------------------------- /tests/testthat/helper-elasticsearch_test_data.R: -------------------------------------------------------------------------------- 1 | # Copyright 2016-2019 Alex Ioannides 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | # ---- prepare test data frame ---- 17 | iris_data <- iris 18 | colnames(iris_data) <- c("sepal_length", "sepal_width", "petal_length", "petal_width", "species") 19 | iris_data["species"] <- as.character(iris_data$species) 20 | iris_data["sort_key"] <- 1:150 21 | 22 | 23 | # ---- compute expected aggregation results ---- 24 | iris_test_aggs_bucket <- data.frame( 25 | "key" = c("setosa", "versicolor", "virginica"), 26 | "doc_count" = c(50, 50, 50), 27 | "avg_sepal_width.value" = c(3.428, 2.770, 2.974), 28 | stringsAsFactors = FALSE 29 | ) 30 | 31 | 32 | iris_test_aggs_metric <- data.frame( 33 | "value" = 3.0573333358764647, 34 | stringsAsFactors = FALSE 35 | ) 36 | 37 | 38 | # ---- functions for loading and deleting test data from Elasticsearch on http://localhost:9200 ---- 39 | load_test_data <- function() { 40 | # ping Elasticsearch to check that it exists 41 | tryCatch( 42 | ping_es_cluster <- httr::GET("http://localhost:9200"), 43 | error = function(e) stop("can't find Elasticsearch at http://localhost:9200", call. 
= FALSE) 44 | ) 45 | 46 | # delete any index called 'iris' on localhost 47 | response <- httr::DELETE("http://localhost:9200/iris") 48 | 49 | # if testing on Elasticsearch 5.x then ensure fielddata: true as a default mapping for strings 50 | if (elastic_version("http://localhost:9200")$major >= 5) { 51 | default_iris_mapping <- '{"mappings":{"_default_":{"dynamic_templates":[{"strings":{ 52 | "match_mapping_type":"string","mapping":{"type":"text","fielddata":true}}}]}}}' 53 | response <- httr::PUT("http://localhost:9200/iris", body = default_iris_mapping, 54 | httr::add_headers("Content-Type" = "application/json")) 55 | 56 | check_http_code_throw_error(response) 57 | } 58 | 59 | # index the iris dataset from first principles (i.e. without using elasticsearchr) 60 | for (i in 1:150) { 61 | iris_json_data <- gsub("\\[|\\]", "", jsonlite::toJSON((iris_data[i, ]))) 62 | response <- httr::POST(paste0("http://localhost:9200/iris/data/", i), body = iris_json_data, 63 | encode = "json", httr::add_headers("Content-Type" = "application/json")) 64 | if (httr::status_code(response) != 201) { 65 | stop("cannot index data into Elasticsearch for running tests", call. 
= FALSE) 66 | } 67 | } 68 | 69 | # wait until all 150 documents have been indexed and are ready for searching before returning 70 | wait_finish_indexing("http://localhost:9200/iris/data/_search?size=150&q=*", 150) 71 | 72 | TRUE 73 | } 74 | 75 | 76 | wait_finish_indexing <- function(search_url, results_size) { 77 | waiting <- TRUE 78 | start_time <- Sys.time() 79 | while (waiting) { 80 | response <- httr::POST(search_url) 81 | available_data <- nrow(jsonlite::fromJSON(httr::content(response, as = 'text'))$hits$hits) 82 | 83 | if (!is.null(available_data)) { 84 | if (available_data == results_size) { 85 | waiting <- FALSE 86 | } else { 87 | # use explicit units - a bare difftime comparison can silently switch to minutes 88 | running_time <- difftime(Sys.time(), start_time, units = "secs") 89 | if (running_time > 60) stop("indexing Elasticsearch test data has timed out") 90 | } 91 | } 92 | 93 | } 94 | 95 | TRUE 96 | } 97 | 98 | 99 | wait_finish_delete <- function(search_url) { 100 | waiting <- TRUE 101 | start_time <- Sys.time() 102 | while (waiting) { 103 | response <- httr::POST(search_url) 104 | available_data <- nrow(jsonlite::fromJSON(httr::content(response, as = 'text'))$hits$hits) 105 | if (is.null(available_data)) { 106 | waiting <- FALSE 107 | } else { 108 | running_time <- difftime(Sys.time(), start_time, units = "secs") 109 | if (running_time > 30) stop("deleting Elasticsearch test data has timed out") 110 | } 111 | 112 | } 113 | 114 | TRUE 115 | } 116 | 117 | 118 | delete_test_data <- function() { 119 | tryCatch( 120 | response <- httr::DELETE("http://localhost:9200/iris"), 121 | error = function(e) stop("can't find iris index at http://localhost:9200/iris", call. 
= FALSE) 121 | ) 122 | 123 | NULL 124 | } 125 | -------------------------------------------------------------------------------- /tests/testthat/test-api.R: -------------------------------------------------------------------------------- 1 | # Copyright 2016-2019 Alex Ioannides 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | context('api') 17 | 18 | 19 | # ---- classes, methods and predicates ------------------------------------------------------------ 20 | 21 | 22 | test_that('elastic objects have the correct classes assigned to them', { 23 | # skip if on CRAN or Travis 24 | skip_on_travis() 25 | skip_on_cran() 26 | 27 | # arrange 28 | es_rescource <- elastic("http://localhost:9200", "iris", "data") 29 | 30 | # act 31 | elastic_classes <- class(es_rescource) 32 | 33 | # assert 34 | expect_identical(elastic_classes, c("elastic_rescource", "elastic")) 35 | }) 36 | 37 | 38 | test_that('elastic objects correctly assemble search URLs when doc_types are specified', { 39 | # skip if on CRAN or Travis 40 | skip_on_travis() 41 | skip_on_cran() 42 | 43 | # arrange 44 | es_rescource <- elastic("http://localhost:9200", "iris", "data") 45 | 46 | # act 47 | search_url <- es_rescource$search_url 48 | 49 | # assert 50 | expect_identical(search_url, "http://localhost:9200/iris/data/_search") 51 | }) 52 | 53 | 54 | test_that('elastic objects correctly assemble search URLs when doc_types are not specified', { 55 | # skip if on CRAN 
or Travis 56 | skip_on_travis() 57 | skip_on_cran() 58 | 59 | # arrange 60 | es_rescource <- elastic("http://localhost:9200", "iris") 61 | 62 | # act 63 | search_url <- es_rescource$search_url 64 | 65 | # assert 66 | expect_identical(search_url, "http://localhost:9200/iris/_search") 67 | }) 68 | 69 | 70 | test_that('query objects have the correct classes assigned to them', { 71 | # arrange 72 | everything <- '{"match_all": {}}' 73 | 74 | # act 75 | es_query <- query(everything) 76 | 77 | # assert 78 | expect_identical(class(es_query), c("elastic_query", "elastic_api", "elastic")) 79 | }) 80 | 81 | 82 | test_that('query objects will not accept invalid JSON', { 83 | # arrange 84 | bad_query_json <- '{"match_all": {}' 85 | 86 | # act & assert 87 | expect_error(query(bad_query_json)) 88 | }) 89 | 90 | 91 | test_that('query objects generate the correct search API call', { 92 | # arrange 93 | everything <- '{"match_all": {}}' 94 | 95 | # act 96 | es_query <- query(everything) 97 | 98 | # assert 99 | expect_identical(es_query$api_call, '"query":{"match_all": {}}') 100 | }) 101 | 102 | 103 | test_that('sort objects have the correct classes assigned to them', { 104 | # arrange 105 | by_sepal_width <- '{"sepal_width": {"order": "asc"}}' 106 | 107 | # act 108 | es_sort <- sort_on(by_sepal_width) 109 | 110 | # assert 111 | expect_identical(class(es_sort), c("elastic_sort", "elastic_api", "elastic")) 112 | }) 113 | 114 | 115 | test_that('sort objects will not accept invalid JSON', { 116 | # arrange 117 | bad_sort_json <- '{"sepal_width": {"order": "asc"}' 118 | 119 | # act & assert 120 | expect_error(sort_on(bad_sort_json)) 121 | }) 122 | 123 | 124 | test_that('sort objects generate the correct search API call', { 125 | # arrange 126 | by_sepal_width <- '{"sepal_width": {"order": "asc"}}' 127 | 128 | # act 129 | es_sort <- sort_on(by_sepal_width) 130 | 131 | # assert 132 | expect_identical(es_sort$api_call, '"sort":{"sepal_width": {"order": "asc"}}') 133 | }) 134 | 135 | 136 | 
test_that('select_fields objects have the correct classes assigned to them', { 137 | # arrange 138 | select <- '{"includes": ["field1", "obj1.*"], "excludes": ["field2", "obj2.*"]}' 139 | 140 | # act 141 | es_source_filter <- select_fields(select) 142 | 143 | # assert 144 | expect_identical(class(es_source_filter), c("elastic_source_filter", "elastic_api", "elastic")) 145 | }) 146 | 147 | 148 | test_that('select_fields objects will not accept invalid JSON', { 149 | # arrange 150 | bad_source_filter_json <- '{"includes": ["field1", "obj1.*"], "excludes": ["field2", "obj2.*"}' 151 | 152 | # act & assert 153 | expect_error(select_fields(bad_source_filter_json)) 154 | }) 155 | 156 | 157 | test_that('select_fields objects generate the correct search API call', { 158 | # arrange 159 | select <- '{"includes": ["field1", "obj1.*"], "excludes": ["field2"]}' 160 | 161 | # act 162 | es_source_filter <- select_fields(select) 163 | 164 | # assert 165 | expected_call <- '"_source": {"includes": ["field1", "obj1.*"], "excludes": ["field2"]}' 166 | expect_identical(es_source_filter$api_call, expected_call) 167 | }) 168 | 169 | 170 | test_that('aggs objects have the correct classes assigned to them', { 171 | # arrange 172 | avg_sepal_width_per_cat <- '{"avg_sepal_width_per_cat": { 173 | "terms": {"field": "species"}, 174 | "aggs": {"avg_sepal_width": {"avg": {"field": "sepal_width"}}}} 175 | }' 176 | 177 | # act 178 | es_agg <- aggs(avg_sepal_width_per_cat) 179 | 180 | # assert 181 | expect_identical(class(es_agg), c("elastic_aggs", "elastic_api", "elastic")) 182 | }) 183 | 184 | 185 | test_that('aggs objects will not accept invalid JSON', { 186 | # arrange 187 | bad_aggs_json <- '{"match_all": {}' 188 | 189 | # act & assert 190 | expect_error(aggs(bad_aggs_json)) 191 | }) 192 | 193 | 194 | test_that('aggs objects generate the correct search API call', { 195 | # arrange 196 | avg_sepal_width_per_cat <- '{"avg_sepal_width_per_cat": { 197 | "terms": {"field": "species"}, 198 | 
"aggs": {"avg_sepal_width": {"avg": {"field": "sepal_width"}}}} 199 | }' 200 | 201 | # act 202 | es_agg <- aggs(avg_sepal_width_per_cat) 203 | 204 | # assert 205 | expected_api_call <- '"aggs":{"avg_sepal_width_per_cat": { 206 | "terms": {"field": "species"}, 207 | "aggs": {"avg_sepal_width": {"avg": {"field": "sepal_width"}}}} 208 | }' 209 | expect_identical(es_agg$api_call, expected_api_call) 210 | }) 211 | 212 | 213 | test_that('list_indices objects have the correct classes assigned to them', { 214 | # act 215 | es_info <- list_indices() 216 | 217 | # assert 218 | expect_identical(class(es_info), c("elastic_info", "elastic_api", "elastic")) 219 | }) 220 | 221 | 222 | test_that('list_fields objects have the correct classes assigned to them', { 223 | # act 224 | es_info <- list_fields() 225 | 226 | # assert 227 | expect_identical(class(es_info), c("elastic_info", "elastic_api", "elastic")) 228 | }) 229 | 230 | 231 | # ---- operators ---------------------------------------------------------------------------------- 232 | 233 | 234 | test_that('%info% list_indices() returns a list of all available indices', { 235 | # skip if on CRAN or Travis 236 | skip_on_travis() 237 | skip_on_cran() 238 | 239 | # arrange 240 | load_test_data() 241 | 242 | # act 243 | info_results <- elastic("http://localhost:9200", "iris", "data") %info% list_indices() 244 | 245 | # assert 246 | expect_equal(info_results, "iris") 247 | delete_test_data() 248 | }) 249 | 250 | 251 | test_that('%info% list_fields() returns a list of all fields in an index', { 252 | # skip if on CRAN or Travis 253 | skip_on_travis() 254 | skip_on_cran() 255 | 256 | # arrange 257 | load_test_data() 258 | 259 | # act 260 | info_results <- elastic("http://localhost:9200", "iris", "data") %info% list_fields() 261 | 262 | # assert 263 | expected_fields <- c("petal_length", "petal_width", "sepal_length", "sepal_width", "sort_key", "species") 264 | expect_equal(info_results, expected_fields) 265 | delete_test_data() 266 | 
}) 267 | 268 | 269 | test_that('%index% correctly indexes a large (>10mb single chunk) data frame', { 270 | # skip if on CRAN or Travis 271 | skip_on_travis() 272 | skip_on_cran() 273 | 274 | # arrange 275 | delete_test_data() 276 | iris_data_bulk <- data.frame(do.call(cbind, lapply(1:40, FUN = function(x) iris_data))) 277 | iris_data_bulk <- do.call(rbind, lapply(1:50, FUN = function(x) iris_data_bulk)) 278 | iris_data_bulk['sort_key'] <- 1:nrow(iris_data_bulk) 279 | row.names(iris_data_bulk) <- 1:nrow(iris_data_bulk) 280 | colnames(iris_data_bulk) <- cleaned_field_names(colnames(iris_data_bulk)) 281 | 282 | # act 283 | elastic("http://localhost:9200", "iris", "data") %index% iris_data_bulk 284 | wait_finish_indexing("http://localhost:9200/iris/data/_search?size=7500&q=*", nrow(iris_data_bulk)) 285 | query_response <- httr::POST("http://localhost:9200/iris/data/_search?size=7500&q=*") 286 | query_results <- jsonlite::fromJSON(httr::content(query_response, as = 'text'))$hits$hits$`_source` 287 | query_results <- query_results[order(query_results$sort_key), ] 288 | row.names(query_results) <- query_results$sort_key 289 | 290 | # assert 291 | expect_equal(iris_data_bulk, query_results) 292 | delete_test_data() 293 | }) 294 | 295 | 296 | test_that('%create% can create an index with a custom mapping', { 297 | # skip if on CRAN or Travis 298 | skip_on_travis() 299 | skip_on_cran() 300 | 301 | # arrange 302 | delete_test_data() 303 | 304 | # act 305 | elastic("http://localhost:9200", "iris") %create% mapping_default_simple() 306 | get_mapping <- httr::GET("http://localhost:9200/iris/_mapping") 307 | get_mapping_status <- httr::status_code(get_mapping) 308 | 309 | # assert 310 | expect_equal(get_mapping_status, 200) 311 | delete_test_data() 312 | }) 313 | 314 | 315 | test_that('%delete% can delete all documents from an index', { 316 | # skip if on CRAN or Travis 317 | skip_on_travis() 318 | skip_on_cran() 319 | 320 | # arrange 321 | load_test_data() 322 | 323 | # act 324 
| elastic("http://localhost:9200", "iris") %delete% TRUE 325 | wait_finish_delete("http://localhost:9200/iris/data/_search?size=150&q=*") 326 | query_response <- httr::POST("http://localhost:9200/iris/data/_search?size=150&q=*") 327 | query_response_status <- httr::status_code(query_response) 328 | 329 | # assert 330 | expect_equal(query_response_status, 404) 331 | delete_test_data() 332 | }) 333 | 334 | 335 | test_that('%delete% can delete all documents from a type', { 336 | # skip if on CRAN or Travis 337 | skip_on_travis() 338 | skip_on_cran() 339 | 340 | # arrange 341 | load_test_data() 342 | 343 | # act 344 | elastic("http://localhost:9200", "iris", "data") %delete% TRUE 345 | wait_finish_delete("http://localhost:9200/iris/data/_search?size=150&q=*") 346 | query_response <- httr::POST("http://localhost:9200/iris/data/_search?size=150&q=*") 347 | query_results <- jsonlite::fromJSON(httr::content(query_response, as = 'text'))$hits$hits$`_source` 348 | get_mapping <- httr::GET("http://localhost:9200/iris/_mapping") 349 | get_mapping_status <- httr::status_code(get_mapping) 350 | 351 | # assert 352 | expect_null(query_results) 353 | expect_equal(get_mapping_status, 200) 354 | delete_test_data() 355 | }) 356 | 357 | 358 | test_that('%delete% can delete selected documents from a type', { 359 | # skip if on CRAN or Travis 360 | skip_on_travis() 361 | skip_on_cran() 362 | 363 | # arrange 364 | load_test_data() 365 | query_response <- httr::POST("http://localhost:9200/iris/data/_search?size=150&q=*") 366 | doc_ids <- jsonlite::fromJSON(httr::content(query_response, as = 'text'))$hits$hits$`_id` 367 | 368 | # act 369 | elastic("http://localhost:9200", "iris", "data") %delete% doc_ids 370 | wait_finish_delete("http://localhost:9200/iris/data/_search?size=150&q=*") 371 | query_response <- httr::POST("http://localhost:9200/iris/data/_search?size=150&q=*") 372 | query_results <- jsonlite::fromJSON(httr::content(query_response, as = 'text'))$hits$hits$`_source` 373 | 
get_mapping <- httr::GET("http://localhost:9200/iris/_mapping") 374 | get_mapping_status <- httr::status_code(get_mapping) 375 | 376 | # assert 377 | expect_null(query_results) 378 | expect_equal(get_mapping_status, 200) 379 | delete_test_data() 380 | }) 381 | 382 | 383 | test_that('we can query using the %search% operator and return all documents', { 384 | # skip if on CRAN or Travis 385 | skip_on_travis() 386 | skip_on_cran() 387 | 388 | # arrange 389 | load_test_data() 390 | everything <- '{"match_all": {}}' 391 | es_query <- query(everything) 392 | 393 | # act 394 | query_results <- elastic("http://localhost:9200", "iris", "data") %search% es_query 395 | 396 | query_results_sorted <- query_results[order(query_results["sort_key"]), ] 397 | rownames(query_results_sorted) <- query_results_sorted$sort_key 398 | 399 | # assert 400 | expect_equal(query_results_sorted, iris_data) 401 | delete_test_data() 402 | }) 403 | 404 | 405 | test_that('we can query using the %search% operator and return documents sorted', { 406 | # skip if on CRAN or Travis 407 | skip_on_travis() 408 | skip_on_cran() 409 | 410 | # arrange 411 | load_test_data() 412 | everything <- '{"match_all": {}}' 413 | by_key <- '{"sort_key": {"order": "asc"}}' 414 | es_query <- query(everything, size = 10) + sort_on(by_key) 415 | 416 | # act 417 | query_results <- elastic("http://localhost:9200", "iris", "data") %search% es_query 418 | 419 | query_results_sorted <- query_results[order(query_results["sort_key"]), ] 420 | rownames(query_results_sorted) <- query_results_sorted$sort_key 421 | 422 | # assert 423 | expect_equal(query_results_sorted, iris_data[1:10, ]) 424 | delete_test_data() 425 | }) 426 | 427 | 428 | test_that('we can query using the %search% operator and return a subset of fields', { 429 | # skip if on CRAN or Travis 430 | skip_on_travis() 431 | skip_on_cran() 432 | 433 | # arrange 434 | load_test_data() 435 | fields <- c("sort_key", "sepal_length", "species") 436 | everything <- 
'{"match_all": {}}' 437 | source_filter_JSON <- '{"includes": ["sepal_length", "species", "sort_key"]}' 438 | es_query <- query(everything) 439 | es_source_filter <- select_fields(source_filter_JSON) 440 | 441 | # act 442 | query_results <- 443 | elastic("http://localhost:9200", "iris", "data") %search% (es_query + es_source_filter) 444 | 445 | query_results_sorted <- query_results[order(query_results["sort_key"]), fields] 446 | rownames(query_results_sorted) <- query_results_sorted$sort_key 447 | 448 | # assert 449 | expect_equal(query_results_sorted, iris_data[, fields]) 450 | delete_test_data() 451 | }) 452 | 453 | 454 | test_that('we can query using the %search% operator and return a sorted subset of fields', { 455 | # skip if on CRAN or Travis 456 | skip_on_travis() 457 | skip_on_cran() 458 | 459 | # arrange 460 | load_test_data() 461 | everything <- '{"match_all": {}}' 462 | by_sepal_length <- '[{"sepal_length": {"order": "asc"}}, {"sort_key": {"order": "asc"}}]' 463 | source_filter_JSON <- '{"includes": ["sepal_length", "species", "sort_key"]}' 464 | 465 | es_query <- query(everything) 466 | es_sort <- sort_on(by_sepal_length) 467 | es_source_filter <- select_fields(source_filter_JSON) 468 | es_filter_and_sort <- es_source_filter + es_sort 469 | 470 | # act 471 | query_results <- 472 | elastic("http://localhost:9200", "iris", "data") %search% (es_query + es_filter_and_sort) 473 | 474 | rownames(query_results) <- 1:150 475 | 476 | # assert 477 | iris_data_sorted <- iris_data[order(iris_data$sepal_length), colnames(query_results)] 478 | rownames(iris_data_sorted) <- 1:150 479 | expect_equal(query_results, iris_data_sorted) 480 | delete_test_data() 481 | }) 482 | 483 | 484 | test_that('we can use bucket aggregations using the %search% operator', { 485 | # skip if on CRAN or Travis 486 | skip_on_travis() 487 | skip_on_cran() 488 | 489 | # arrange 490 | load_test_data() 491 | avg_sepal_width_per_cat <- '{"avg_sepal_width_per_cat": { 492 | "terms": {"field": 
"species"}, 493 | "aggs": {"avg_sepal_width": {"avg": {"field": "sepal_width"}}}} 494 | }' 495 | es_aggs <- aggs(avg_sepal_width_per_cat) 496 | 497 | # act 498 | aggs_results <- elastic("http://localhost:9200", "iris", "data") %search% es_aggs 499 | 500 | # assert 501 | expect_equal(aggs_results, iris_test_aggs_bucket) 502 | delete_test_data() 503 | }) 504 | 505 | 506 | test_that('we can use base-metric aggregations using the %search% operator', { 507 | # skip if on CRAN or Travis 508 | skip_on_travis() 509 | skip_on_cran() 510 | 511 | # arrange 512 | load_test_data() 513 | avg_sepal_width_per_cat <- '{"avg_sepal_width": {"avg": {"field": "sepal_width"}}}' 514 | es_aggs <- aggs(avg_sepal_width_per_cat) 515 | 516 | # act 517 | aggs_results <- elastic("http://localhost:9200", "iris", "data") %search% es_aggs 518 | 519 | # assert 520 | expect_equal(aggs_results, iris_test_aggs_metric) 521 | delete_test_data() 522 | }) 523 | 524 | 525 | test_that('we can query + sort using the %search% operator', { 526 | # skip if on CRAN or Travis 527 | skip_on_travis() 528 | skip_on_cran() 529 | 530 | # arrange 531 | load_test_data() 532 | 533 | everything <- '{"match_all": {}}' 534 | by_sepal_width <- '[{"sepal_width": {"order": "asc"}}, {"sort_key": {"order": "asc"}}]' 535 | 536 | es_query <- query(everything) 537 | es_sort <- sort_on(by_sepal_width) 538 | es_query_sorted <- es_query + es_sort 539 | 540 | # act 541 | query_results <- elastic("http://localhost:9200", "iris", "data") %search% es_query_sorted 542 | rownames(query_results) <- 1:150 543 | 544 | # assert 545 | iris_data_sorted <- iris_data[order(iris_data$sepal_width, iris_data$sort_key), ] 546 | rownames(iris_data_sorted) <- 1:150 547 | expect_equal(query_results, iris_data_sorted) 548 | delete_test_data() 549 | }) 550 | 551 | 552 | test_that('we can query + aggregate using the %search% operator', { 553 | # skip if on CRAN or Travis 554 | skip_on_travis() 555 | skip_on_cran() 556 | 557 | # arrange 558 | load_test_data() 
559 | 560 | everything <- '{"match_all": {}}' 561 | 562 | avg_sepal_width_per_cat <- '{"avg_sepal_width_per_cat": { 563 | "terms": {"field": "species"}, 564 | "aggs": {"avg_sepal_width": {"avg": {"field": "sepal_width"}}}} 565 | }' 566 | 567 | es_query <- query(everything) 568 | es_agg <- aggs(avg_sepal_width_per_cat) 569 | es_agg_on_query <- es_query + es_agg 570 | 571 | # act 572 | aggs_results <- elastic("http://localhost:9200", "iris", "data") %search% es_agg_on_query 573 | 574 | # assert 575 | expect_equal(aggs_results, iris_test_aggs_bucket) 576 | delete_test_data() 577 | }) 578 | 579 | 580 | test_that('adding a sort object to a query object results in a query object', { 581 | # arrange 582 | everything <- '{"match_all": {}}' 583 | by_sepal_width <- '{"sepal_width": {"order": "asc"}}' 584 | 585 | # act 586 | es_query <- query(everything) 587 | es_sort <- sort_on(by_sepal_width) 588 | es_query_sorted <- es_query + es_sort 589 | 590 | # assert 591 | expect_identical(class(es_query_sorted), c("elastic_query", "elastic_api", "elastic")) 592 | }) 593 | 594 | 595 | test_that('adding a sort object to a query object generates the correct search API call', { 596 | # arrange 597 | everything <- '{"match_all": {}}' 598 | by_sepal_width <- '{"sepal_width": {"order": "asc"}}' 599 | 600 | # act 601 | es_query <- query(everything) 602 | es_sort <- sort_on(by_sepal_width) 603 | es_query_sorted <- es_query + es_sort 604 | 605 | # assert 606 | expected_api_call <- '"query":{"match_all": {}},"sort":{"sepal_width": {"order": "asc"}}' 607 | 608 | expect_identical(es_query_sorted$api_call, expected_api_call) 609 | }) 610 | 611 | 612 | test_that('adding an aggs object to a query object results in an aggs object', { 613 | # arrange 614 | everything <- '{"match_all": {}}' 615 | 616 | avg_sepal_width_per_cat <- '{"avg_sepal_width_per_cat": { 617 | "terms": {"field": "species"}, 618 | "aggs": {"avg_sepal_width": {"avg": {"field": "sepal_width"}}}} 619 | }' 620 | 621 | # act 622 | 
es_query <- query(everything) 623 | es_agg <- aggs(avg_sepal_width_per_cat) 624 | es_agg_on_query <- es_query + es_agg 625 | 626 | # assert 627 | expect_identical(class(es_agg_on_query), c("elastic_aggs", "elastic_api", "elastic")) 628 | }) 629 | 630 | 631 | test_that('adding an aggs object to a query object generates the correct search API call', { 632 | # arrange 633 | everything <- '{"match_all": {}}' 634 | 635 | avg_sepal_width_per_cat <- '{"avg_sepal_width_per_cat": { 636 | "terms": {"field": "species"}, 637 | "aggs": {"avg_sepal_width": {"avg": {"field": "sepal_width"}}}} 638 | }' 639 | 640 | # act 641 | es_query <- query(everything) 642 | es_agg <- aggs(avg_sepal_width_per_cat) 643 | es_agg_on_query <- es_query + es_agg 644 | 645 | # assert 646 | expected_api_call <- '"query":{"match_all": {}},"aggs":{"avg_sepal_width_per_cat": { 647 | "terms": {"field": "species"}, 648 | "aggs": {"avg_sepal_width": {"avg": {"field": "sepal_width"}}}} 649 | }' 650 | 651 | expect_identical(es_agg_on_query$api_call, expected_api_call) 652 | }) 653 | -------------------------------------------------------------------------------- /tests/testthat/test-utils.R: -------------------------------------------------------------------------------- 1 | # Copyright 2016-2019 Alex Ioannides 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | 16 | context('elasticsearchr utils') 17 | 18 | 19 | test_that('valid_connection identifies valid URLs to Elasticsearch resources', { 20 | # skip if on CRAN or Travis 21 | skip_on_travis() 22 | skip_on_cran() 23 | 24 | # arrange 25 | url <- "http://localhost:9200" 26 | 27 | # act 28 | is_valid_connection <- valid_connection(url) 29 | 30 | # assert 31 | expect_true(is_valid_connection) 32 | }) 33 | 34 | 35 | test_that('valid_connection identifies invalid URLs to Elasticsearch resources', { 36 | # skip if on CRAN or Travis 37 | skip_on_travis() 38 | skip_on_cran() 39 | 40 | # arrange 41 | url <- "localhost:9201" 42 | 43 | # act & assert 44 | expect_error(valid_connection(url)) 45 | }) 46 | 47 | 48 | test_that('elastic_version returns the Elasticsearch version number', { 49 | # skip if on CRAN or Travis 50 | skip_on_travis() 51 | skip_on_cran() 52 | 53 | # arrange 54 | url <- "http://localhost:9200" 55 | 56 | # act 57 | version <- elastic_version(url) 58 | 59 | # assert 60 | expect_type(version$major, "integer") 61 | expect_type(version$minor, "integer") 62 | expect_type(version$build, "integer") 63 | }) 64 | 65 | 66 | test_that('cleaned_field_names removes periods from data.frame column names', { 67 | # arrange 68 | iris_colnames <- colnames(iris) 69 | 70 | # act 71 | cleaned_column_names <- cleaned_field_names(iris_colnames) 72 | 73 | # assert 74 | expect_true(length(grep("\\.", cleaned_column_names)) == 0) 75 | }) 76 | 77 | 78 | test_that('cleaned_field_names converts all column names to lowercase', { 79 | # arrange 80 | column_name <- "UPPER_and_lower" 81 | 82 | # act 83 | cleaned_column_name <- cleaned_field_names(column_name) 84 | 85 | # assert 86 | expect_equal(cleaned_column_name, "upper_and_lower") 87 | }) 88 | 89 | 90 | test_that('create_metadata creates Bulk API metadata when doc ids are given', { 91 | # arrange 92 | doc_ids <- c(1, 2) 93 | 94 | # act 95 | metadata <- create_metadata("index", "iris", "data", doc_ids) 96 | 97 | # assert 98 | 
expected_metadata <- c('{"index": {"_index": "iris", "_type": "data", "_id": "1"}}', 99 | '{"index": {"_index": "iris", "_type": "data", "_id": "2"}}') 100 | expect_equal(metadata, expected_metadata) 101 | }) 102 | 103 | 104 | test_that('create_metadata creates Bulk API metadata when no doc ids are given', { 105 | # arrange 106 | n <- 2 107 | 108 | # act 109 | metadata <- create_metadata("index", "iris", "data", n = 2) 110 | 111 | # assert 112 | expected_metadata <- c('{"index": {"_index": "iris", "_type": "data"}}', 113 | '{"index": {"_index": "iris", "_type": "data"}}') 114 | expect_equal(metadata, expected_metadata) 115 | }) 116 | 117 | 118 | test_that('create_bulk_upload_file produces the bulk_upload file for indexing data.frame data', { 119 | # arrange 120 | df <- iris[1:2,] 121 | metadata <- create_metadata("index", "iris", "data", n = 2) 122 | 123 | # act 124 | bulk_upload_file <- create_bulk_upload_file(metadata, df) 125 | 126 | # assert 127 | bulk_upload_file_contents <- readLines(bulk_upload_file) 128 | file.remove(bulk_upload_file) 129 | 130 | expected_upload_file <- c( 131 | '{\"index\": {\"_index\": \"iris\", \"_type\": \"data\"}}', 132 | '{\"Sepal.Length\":5.1,\"Sepal.Width\":3.5,\"Petal.Length\":1.4,\"Petal.Width\":0.2,\"Species\":\"setosa\"}', 133 | '{\"index\": {\"_index\": \"iris\", \"_type\": \"data\"}}', 134 | '{\"Sepal.Length\":4.9,\"Sepal.Width\":3.0,\"Petal.Length\":1.4,\"Petal.Width\":0.2,\"Species\":\"setosa\"}' 135 | ) 136 | 137 | expect_equal(expected_upload_file, bulk_upload_file_contents) 138 | }) 139 | 140 | 141 | test_that('create_bulk_delete_file produces bulk_delete file', { 142 | # arrange 143 | ids <- c(1, 2) 144 | metadata <- create_metadata("delete", "iris", "data", ids) 145 | 146 | # act 147 | bulk_delete_file <- create_bulk_delete_file(metadata) 148 | 149 | # assert 150 | bulk_delete_file_contents <- readLines(bulk_delete_file) 151 | file.remove(bulk_delete_file) 152 | 153 | expected_delete_file <- c('{"delete": {"_index": 
"iris", "_type": "data", "_id": "1"}}', 154 | '{"delete": {"_index": "iris", "_type": "data", "_id": "2"}}') 155 | 156 | expect_equal(expected_delete_file, bulk_delete_file_contents) 157 | }) 158 | 159 | 160 | test_that('index_bulk_dataframe correctly indexes a data frame', { 161 | # skip if on CRAN or Travis 162 | skip_on_travis() 163 | skip_on_cran() 164 | 165 | # arrange 166 | delete_test_data() 167 | 168 | # act 169 | index_bulk_dataframe(elastic("http://localhost:9200", "iris", "data"), iris_data) 170 | wait_finish_indexing("http://localhost:9200/iris/data/_search?size=150&q=*", 150) 171 | query_response <- httr::POST("http://localhost:9200/iris/data/_search?size=150&q=*") 172 | query_results <- jsonlite::fromJSON(httr::content(query_response, as = 'text'))$hits$hits$`_source` 173 | query_results <- query_results[order(query_results$sort_key), ] 174 | row.names(query_results) <- query_results$sort_key 175 | 176 | # assert 177 | expect_equal(iris_data, query_results) 178 | delete_test_data() 179 | }) 180 | 181 | 182 | test_that('index_bulk_dataframe correctly detects and assigns document ids', { 183 | # skip if on CRAN or Travis 184 | skip_on_travis() 185 | skip_on_cran() 186 | 187 | # arrange 188 | delete_test_data() 189 | iris_data_ids <- iris_data 190 | colnames(iris_data_ids) <- c(colnames(iris_data_ids)[1:5], "id") 191 | 192 | # act 193 | index_bulk_dataframe(elastic("http://localhost:9200", "iris", "data"), iris_data_ids) 194 | wait_finish_indexing("http://localhost:9200/iris/data/_search?size=150&q=*", 150) 195 | query_response <- httr::GET("http://localhost:9200/iris/data/150") 196 | query_results <- data.frame( 197 | jsonlite::fromJSON(httr::content(query_response, as = 'text'))$`_source`, 198 | stringsAsFactors = FALSE 199 | ) 200 | row.names(query_results) <- query_results$id 201 | 202 | # assert 203 | expect_equal(iris_data_ids[150,], query_results) 204 | delete_test_data() 205 | }) 206 | 207 | 208 | test_that('from_size_search retrieves query 
results from Elasticsearch', { 209 | # skip if on CRAN or Travis 210 | skip_on_travis() 211 | skip_on_cran() 212 | 213 | # arrange 214 | load_test_data() 215 | query <- '{"size": 150, "query": {"match_all": {}}}' 216 | 217 | # act 218 | query_results <- from_size_search(list("search_url" = "http://localhost:9200/iris/data/_search"), 219 | query) 220 | 221 | query_results_sorted <- query_results[order(query_results["sort_key"]), ] 222 | rownames(query_results_sorted) <- query_results_sorted$sort_key 223 | 224 | # assert 225 | expect_equal(query_results_sorted, iris_data) 226 | delete_test_data() 227 | }) 228 | 229 | 230 | test_that('from_size_search retrieves aggregation results from Elasticsearch', { 231 | # skip if on CRAN or Travis 232 | skip_on_travis() 233 | skip_on_cran() 234 | 235 | # arrange 236 | load_test_data() 237 | aggs <- '{"aggs": {"avg_sepal_width_per_species":{"terms":{"field":"species","size":3}, 238 | "aggs":{"avg_sepal_width":{"avg":{"field":"sepal_width"}}}}}}' 239 | 240 | # act 241 | aggs_results <- from_size_search(list("search_url" = "http://localhost:9200/iris/data/_search"), 242 | aggs) 243 | 244 | # assert 245 | expect_equal(aggs_results, iris_test_aggs_bucket) 246 | delete_test_data() 247 | }) 248 | 249 | 250 | test_that('scroll_search retrieves query results from Elasticsearch', { 251 | # skip if on CRAN or Travis 252 | skip_on_travis() 253 | skip_on_cran() 254 | 255 | # arrange 256 | load_test_data() 257 | query <- '{"query": {"match_all": {}}}' 258 | 259 | # act 260 | query_results <- scroll_search(list("cluster_url" = "http://localhost:9200", 261 | "search_url" = "http://localhost:9200/iris/data/_search"), 262 | query) 263 | 264 | query_results_sorted <- query_results[order(query_results["sort_key"]), ] 265 | rownames(query_results_sorted) <- query_results_sorted$sort_key 266 | 267 | # assert 268 | expect_equal(query_results_sorted, iris_data) 269 | delete_test_data() 270 | }) 271 | 272 | 
-------------------------------------------------------------------------------- /vignettes/quick_start.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "elasticsearchr: a Lightweight Elasticsearch Client for R" 3 | author: "Alex Ioannides" 4 | date: "`r Sys.Date()`" 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{Quick Start} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | [Elasticsearch][es] is a distributed [NoSQL][nosql] document store search-engine and [column-oriented database][es_column], whose **fast** (near real-time) reads and powerful aggregation engine make it an excellent choice as an 'analytics database' for R&D, production use, or both. Installation is simple; it ships with sensible default settings that allow it to work effectively out-of-the-box, and all interaction is made via a set of intuitive and extremely [well documented][es_docs] [RESTful][restful] APIs. I've been using it for two years now and I am evangelical. 13 | 14 | The `elasticsearchr` package implements a simple Domain-Specific Language (DSL) for indexing, deleting, querying, sorting and aggregating data in Elasticsearch, from within R. The main purpose of this package is to remove the labour involved in assembling HTTP requests to Elasticsearch's REST APIs and processing the responses. Instead, users of this package need only send and receive data frames to Elasticsearch resources. Users needing richer functionality are encouraged to investigate the excellent `elastic` package from the good people at [rOpenSci][ropensci]. 15 | 16 | This package is available on [CRAN][cran] or from [this GitHub repository][githubrepo].
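To install the released version from CRAN, execute the following on the R command line:

```r
install.packages("elasticsearchr")
```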
To install the latest development version from GitHub, make sure that you have the `devtools` package installed (this comes bundled with RStudio), and then execute the following on the R command line: 17 | 18 | ```r 19 | devtools::install_github("alexioannides/elasticsearchr") 20 | ``` 21 | 22 | ## Installing Elasticsearch 23 | 24 | Elasticsearch can be downloaded [here][es_download], where the instructions for installing and starting it can also be found. OS X users (such as myself) can also make use of [Homebrew][homebrew] to install it with the command, 25 | 26 | ```bash 27 | $ brew install elasticsearch 28 | ``` 29 | 30 | And then start it by executing `$ elasticsearch` from within any Terminal window. Successful installation and start-up can be checked by navigating any web browser to `http://localhost:9200`, where the following message should greet you (give or take the node name, which changes with every restart), 31 | 32 | ```js 33 | { 34 | "name" : "RF6t1Gr", 35 | "cluster_name" : "elasticsearch", 36 | "cluster_uuid" : "pag7iIG-TK271EahH0B0yA", 37 | "version" : { 38 | "number" : "6.1.1", 39 | "build_hash" : "bd92e7f", 40 | "build_date" : "2017-12-17T20:23:25.338Z", 41 | "build_snapshot" : false, 42 | "lucene_version" : "7.1.0", 43 | "minimum_wire_compatibility_version" : "5.6.0", 44 | "minimum_index_compatibility_version" : "5.0.0" 45 | }, 46 | "tagline" : "You Know, for Search" 47 | } 48 | ``` 49 | 50 | ## Elasticsearch 101 51 | 52 | If you followed the installation steps above, you have just installed a single Elasticsearch 'node'. When **not** testing on your laptop, Elasticsearch usually comes in clusters of nodes (typically there are at least 3). The easiest way to get access to a managed Elasticsearch cluster is by using the [Elastic Cloud][es_cloud] managed service provided by [Elastic][elastic] (note that Amazon Web Services offer something similar too).
For the rest of this brief tutorial I will assume you're running a single node on your laptop (a great way of working with data that is too big for memory). 53 | 54 | In Elasticsearch a 'row' of data is stored as a 'document'. A document is a [JSON][json] object - for example, the first row of R's `iris` dataset, 55 | 56 | ```r 57 | # sepal_length sepal_width petal_length petal_width species 58 | # 1 5.1 3.5 1.4 0.2 setosa 59 | ``` 60 | 61 | would be represented as follows using JSON, 62 | 63 | ```js 64 | { 65 | "sepal_length": 5.1, 66 | "sepal_width": 3.5, 67 | "petal_length": 1.4, 68 | "petal_width": 0.2, 69 | "species": "setosa" 70 | } 71 | ``` 72 | 73 | Documents are classified into 'types' and stored in an 'index'. In a crude (but often used) analogy with traditional SQL databases, we would associate an index with a database instance and document types with tables within that database. In practice this analogy is not accurate - it is better to think of all documents as residing in a single - possibly sparse - table (defined by the index), where the document types represent non-unique sub-sets of columns in the table. This is especially so as fields that occur in multiple document types (within the same index) must have the same data-type - for example, if `"name"` exists in document type `customer` as well as in document type `address`, then `"name"` will need to be a `string` in both. Note that 'types' are slowly being phased out and in Elasticsearch v7.x there will only be indices. 74 | 75 | Each document is considered a 'resource' that has a Uniform Resource Locator (URL) associated with it. Elasticsearch URLs all have the following format: `http://your_cluster:9200/your_index/your_doc_type/your_doc_id`. For example, the above `iris` document could be living at `http://localhost:9200/iris/data/1` - you could even point a web browser to this location and investigate the document's contents.
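The conversion between data frame rows and JSON documents is the kind of work handled by the `jsonlite` package, which `elasticsearchr` leans upon. As a minimal sketch (note that before indexing, field names are also cleaned - lowercased, with periods replaced - which is why the document above uses `sepal_length` rather than R's `Sepal.Length`):

```r
library(jsonlite)

# serialise the first row of iris as a single JSON document
doc <- toJSON(as.list(iris[1, ]), auto_unbox = TRUE)
cat(doc)
# {"Sepal.Length":5.1,"Sepal.Width":3.5,"Petal.Length":1.4,"Petal.Width":0.2,"Species":"setosa"}
```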
76 | 77 | Although Elasticsearch - like most NoSQL databases - is often referred to as being 'schema free', as we have already seen this is not entirely correct. What is true, however, is that the schema - or 'mapping' as it's called in Elasticsearch - does not _need_ to be declared up-front (although you certainly can do this). Elasticsearch is more than capable of guessing the types of fields based on new data indexed for the first time. 78 | 79 | For more information on any of these basic concepts take a look [here][basic_concepts]. 80 | 81 | ## `elasticsearchr`: a Quick Start 82 | 83 | `elasticsearchr` is a **lightweight** client - by this I mean that it only aims to do 'just enough' work to make using Elasticsearch with R easy and intuitive. You will still need to read the [Elasticsearch documentation][es_docs] to understand how to compose queries and aggregations. What follows is a quick summary of what is possible. 84 | 85 | ### Elasticsearch Data Resources 86 | 87 | Elasticsearch resources, as defined by the URLs described above, are defined as `elastic` objects in `elasticsearchr`. For example, 88 | 89 | ```r 90 | es <- elastic("http://localhost:9200", "iris", "data") 91 | ``` 92 | 93 | This refers to documents of type 'data' in the 'iris' index located on an Elasticsearch node on my laptop. Note that: 94 | - it is possible to leave the document type empty if you need to refer to all documents in an index; and, 95 | - `elastic` objects can be defined even if the underlying resources have yet to be brought into existence. 96 | 97 | ### Indexing New Data 98 | 99 | To index (insert) data from a data frame, use the `%index%` operator as follows: 100 | 101 | ```r 102 | elastic("http://localhost:9200", "iris", "data") %index% iris 103 | ``` 104 | 105 | In this example, the `iris` dataset is indexed into the 'iris' index and given a document type called 'data'. Note that I have not provided any document ids here.
**To explicitly specify document ids there must be a column in the data frame that is labelled `id`**, from which the document ids will be taken. 106 | 107 | ### Deleting Data 108 | 109 | Documents can be deleted in three different ways using the `%delete%` operator. Firstly, an entire index (including the mapping information) can be erased by referencing just the index in the resource - e.g., 110 | 111 | ```r 112 | elastic("http://localhost:9200", "iris") %delete% TRUE 113 | ``` 114 | 115 | Alternatively, documents can be deleted on a type-by-type basis leaving the index and its mappings untouched, by referencing both the index and the document type as the resource - e.g., 116 | 117 | ```r 118 | elastic("http://localhost:9200", "iris", "data") %delete% TRUE 119 | ``` 120 | 121 | Finally, specific documents can be deleted by referencing their ids directly - e.g., 122 | 123 | ```r 124 | elastic("http://localhost:9200", "iris", "data") %delete% c("1", "2", "3", "4", "5") 125 | ``` 126 | 127 | ### Queries 128 | 129 | Any type of query that Elasticsearch makes available can be defined in a `query` object using the native Elasticsearch JSON syntax - e.g. to match every document we could use the `match_all` query, 130 | 131 | ```r 132 | for_everything <- query('{ 133 | "match_all": {} 134 | }') 135 | ``` 136 | 137 | To execute this query we use the `%search%` operator on the appropriate resource - e.g., 138 | 139 | ```r 140 | elastic("http://localhost:9200", "iris", "data") %search% for_everything 141 | 142 | # sepal_length sepal_width petal_length petal_width species 143 | # 1 4.9 3.0 1.4 0.2 setosa 144 | # 2 4.9 3.1 1.5 0.1 setosa 145 | # 3 5.8 4.0 1.2 0.2 setosa 146 | # 4 5.4 3.9 1.3 0.4 setosa 147 | # 5 5.1 3.5 1.4 0.3 setosa 148 | # 6 5.4 3.4 1.7 0.2 setosa 149 | # ...
150 | ``` 151 | 152 | #### Selecting a Subset of Fields to Return 153 | 154 | Sometimes only a subset of the available fields needs to be returned, so it is much more efficient for Elasticsearch to return only the required data, as opposed to all of it. This can be achieved as follows, 155 | 156 | ```r 157 | selected_fields <- select_fields('{ 158 | "includes": ["sepal_length", "species"] 159 | }') 160 | 161 | elastic("http://localhost:9200", "iris", "data") %search% (for_everything + selected_fields) 162 | 163 | # species sepal_length 164 | # 1 setosa 4.3 165 | # 2 setosa 5.7 166 | # 3 setosa 5.1 167 | # 4 setosa 5.1 168 | # 5 setosa 4.8 169 | # 6 setosa 5.0 170 | ``` 171 | 172 | The selected fields are defined using Elasticsearch's [source filtering API][source_filtering]. 173 | 174 | #### Sorting Query Results 175 | 176 | Query results can be sorted on multiple fields by defining a `sort` object using the same Elasticsearch JSON syntax - e.g. to sort by `sepal_width` in ascending order the required `sort` object would be defined as, 177 | 178 | ```r 179 | by_sepal_width <- sort_on('{"sepal_width": {"order": "asc"}}') 180 | ``` 181 | 182 | This is then added to a `query` object whose results we want sorted and executed using the `%search%` operator as before - e.g., 183 | 184 | ```r 185 | elastic("http://localhost:9200", "iris", "data") %search% (for_everything + by_sepal_width) 186 | 187 | # sepal_length sepal_width petal_length petal_width species 188 | # 1 5.0 2.0 3.5 1.0 versicolor 189 | # 2 6.0 2.2 5.0 1.5 virginica 190 | # 3 6.0 2.2 4.0 1.0 versicolor 191 | # 4 6.2 2.2 4.5 1.5 versicolor 192 | # 5 4.5 2.3 1.3 0.3 setosa 193 | # 6 6.3 2.3 4.4 1.3 versicolor 194 | # ... 195 | ``` 196 | 197 | ### Aggregations 198 | 199 | Similarly, any type of aggregation that Elasticsearch makes available can be defined in an `aggs` object - e.g.
to compute the average `sepal_width` per species of flower we would specify the following aggregation, 200 | 201 | ```r 202 | avg_sepal_width <- aggs('{ 203 | "avg_sepal_width_per_species": { 204 | "terms": { 205 | "field": "species", 206 | "size": 3 207 | }, 208 | "aggs": { 209 | "avg_sepal_width": { 210 | "avg": { 211 | "field": "sepal_width" 212 | } 213 | } 214 | } 215 | } 216 | }') 217 | ``` 218 | 219 | _(Elasticsearch 5.x and 6.x users please note that when using the out-of-the-box mappings the above aggregation requires that `"field": "species"` be changed to `"field": "species.keyword"` - see [here][es_five_mappings] for more information as to why)_ 220 | 221 | This aggregation is also executed via the `%search%` operator on the appropriate resource - e.g., 222 | 223 | ```r 224 | elastic("http://localhost:9200", "iris", "data") %search% avg_sepal_width 225 | 226 | # key doc_count avg_sepal_width.value 227 | # 1 setosa 50 3.428 228 | # 2 versicolor 50 2.770 229 | # 3 virginica 50 2.974 230 | ``` 231 | 232 | Queries and aggregations can be combined such that the aggregations are computed on the results of the query.
For example, to execute the combination of the above query and aggregation, we would execute, 233 | 234 | ```r 235 | elastic("http://localhost:9200", "iris", "data") %search% (for_everything + avg_sepal_width) 236 | 237 | # key doc_count avg_sepal_width.value 238 | # 1 setosa 50 3.428 239 | # 2 versicolor 50 2.770 240 | # 3 virginica 50 2.974 241 | ``` 242 | 243 | where the combination yields, 244 | 245 | ```r 246 | print(for_everything + avg_sepal_width) 247 | 248 | # { 249 | # "size": 0, 250 | # "query": { 251 | # "match_all": { 252 | # 253 | # } 254 | # }, 255 | # "aggs": { 256 | # "avg_sepal_width_per_species": { 257 | # "terms": { 258 | # "field": "species", 259 | # "size": 3 260 | # }, 261 | # "aggs": { 262 | # "avg_sepal_width": { 263 | # "avg": { 264 | # "field": "sepal_width" 265 | # } 266 | # } 267 | # } 268 | # } 269 | # } 270 | # } 271 | ``` 272 | 273 | For comprehensive coverage of all query and aggregations types please refer to the rather excellent [official documentation][es_docs] (newcomers to Elasticsearch are advised to start with the 'Query String' query). 274 | 275 | ### Mappings 276 | 277 | We have also included the ability to create an empty index with a custom mapping, using the `%create%` operator - e.g., 278 | 279 | ```r 280 | elastic("http://localhost:9200", "iris") %create% mapping_default_simple() 281 | ``` 282 | 283 | In this instance, `mapping_default_simple()` is a default mapping that I have shipped with `elasticsearchr`. It switches off the text analyser for all fields of type 'string' (i.e. switches off free text search), allows all text search to work with case-insensitive lower-case terms, and maps any field with the name 'timestamp' to type 'date', so long as it has the appropriate string or long format. 284 | 285 | ### Cluster and Index Information 286 | 287 | We have also added the ability to retrieve basic information from the cluster, using the `%info%` operator.
For example, to retrieve a list of all available indices in the cluster, 288 | 289 | ```r 290 | elastic("http://localhost:9200", "*") %info% list_indices() 291 | 292 | # [1] "iris" 293 | ``` 294 | 295 | Or to list all of the available fields in an index, 296 | 297 | ```r 298 | elastic("http://localhost:9200", "iris") %info% list_fields() 299 | 300 | # [1] "petal_length" "petal_width" "sepal_length" "sepal_width" "species" 301 | ``` 302 | 303 | ## Acknowledgements 304 | 305 | A big thank you to Hadley Wickham and Jeroen Ooms, the authors of the `httr` and `jsonlite` packages that `elasticsearchr` leans upon _heavily_. And to the other contributors and supporters - your efforts are greatly appreciated! 306 | 307 | 308 | [esr_img]: https://alexioannides.github.io/images/r/elasticsearchr/elasticsearchr2.png "Elasticsearchr" 309 | 310 | [elastic]: https://www.elastic.co "Elastic corp." 311 | 312 | [es]: https://www.elastic.co/products/elasticsearch "Elasticsearch" 313 | 314 | [es_column]: https://www.elastic.co/blog/elasticsearch-as-a-column-store "Elasticsearch as a Column Store" 315 | 316 | [cran]: https://cran.r-project.org/package=elasticsearchr "elasticsearchr on CRAN" 317 | 318 | [githubrepo]: https://github.com/AlexIoannides/elasticsearchr "Alex's GitHub repository" 319 | 320 | [githubissues]: https://github.com/AlexIoannides/elasticsearchr/issues "elasticsearchr issues" 321 | 322 | [es_download]: https://www.elastic.co/downloads/elasticsearch "Download" 323 | 324 | [nosql]: https://en.wikipedia.org/wiki/NoSQL "What is NoSQL?" 325 | 326 | [es_docs]: https://www.elastic.co/guide/en/elasticsearch/reference/current/index.html "Elasticsearch documentation" 327 | 328 | [restful]: https://en.wikipedia.org/wiki/Representational_state_transfer "RESTful?"
329 | 330 | [ropensci]: https://github.com/ropensci/elastic "rOpenSci" 331 | 332 | [homebrew]: http://brew.sh/ "Homebrew for OS X" 333 | 334 | [es_cloud]: https://www.elastic.co/cloud/as-a-service "Elastic Cloud" 335 | 336 | [json]: https://en.wikipedia.org/wiki/JSON "JSON" 337 | 338 | [basic_concepts]: https://www.elastic.co/guide/en/elasticsearch/reference/current/elasticsearch-intro.html "Basic Concepts" 339 | 340 | [source_filtering]: https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-source-filtering.html "Source Filters" 341 | 342 | [es_five_mappings]: https://www.elastic.co/guide/en/elasticsearch/reference/5.0/breaking_50_mapping_changes.html "Text fields in Elasticsearch 5.x" 343 | --------------------------------------------------------------------------------