├── .Rbuildignore ├── .gitignore ├── tests ├── testthat │ ├── test-s3path.R │ ├── test-s3read.R │ └── test-s3store.R └── test-all.R ├── man ├── s3path.Rd ├── print.s3mpi_error.Rd ├── last_modified.Rd ├── s3delete.Rd ├── save_to_cache.Rd ├── s3mpi.Rd ├── s3cache.Rd ├── fetch_from_cache.Rd ├── s3exists.Rd ├── s3read.Rd ├── s3store.Rd ├── s3.get.Rd └── s3normalize.Rd ├── NAMESPACE ├── R ├── platform.R ├── s3delete.R ├── package.s3mpi.R ├── s3path.r ├── s3exists.R ├── s3read.r ├── s3store.r ├── s3normalize.R ├── s3.put.R ├── s3.get.R ├── utils.R └── s3cache.R ├── DESCRIPTION ├── LICENSE ├── .travis.yml ├── NEWS.md └── README.md /.Rbuildignore: -------------------------------------------------------------------------------- 1 | .travis.yml 2 | .git 3 | .gitignore 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ..Rcheck 2 | *.Rcheck 3 | *.tar.gz 4 | -------------------------------------------------------------------------------- /tests/testthat/test-s3path.R: -------------------------------------------------------------------------------- 1 | context('s3path') 2 | 3 | # TODO: (RK) Fill this in. 4 | -------------------------------------------------------------------------------- /tests/test-all.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(testthatsomemore) 3 | library(s3mpi) 4 | test_check("s3mpi") 5 | -------------------------------------------------------------------------------- /man/s3path.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/s3path.r 3 | \name{s3path} 4 | \alias{s3path} 5 | \title{Get your default s3path or error.} 6 | \usage{ 7 | s3path() 8 | } 9 | \description{ 10 | Get your default s3path or error. 
# Copied from https://github.com/rstudio/packrat/blob/master/R/platform.R

# Compare the OS name reported by Sys.info() against `os`.
# Returns a length-1 logical carrying the "sysname" name attribute,
# exactly like the inline expression `Sys.info()["sysname"] == os`.
platform_is <- function(os) {
  Sys.info()["sysname"] == os
}

# TRUE when running on Windows.
is.windows <- function() {
  platform_is("Windows")
}

# TRUE when running on macOS (Darwin).
is.mac <- function() {
  platform_is("Darwin")
}

# TRUE when running on Linux.
is.linux <- function() {
  platform_is("Linux")
}
#' Delete an R object from S3 by key
#'
#' @seealso \code{\link{s3store}}
#' @param key character. The key to delete from S3.
#' @param path character. The location of your S3 bucket as a prefix to \code{name},
#'    for example, \code{"s3://mybucket/"} or \code{"s3://mybucket/myprefix/"}.
#' @return the exit status of the S3 client invocation, as returned by
#'    \code{\link{system2}}.
#' @export
s3delete <- function(key, path = s3path()) {
  path <- add_ending_slash(path)
  ## `path` is guaranteed to end in "/" after add_ending_slash, so the key
  ## is appended directly in both branches. (The legacy branch previously
  ## inserted an extra "/", producing malformed keys like "s3://bucket//key",
  ## inconsistent with the non-legacy branch.)
  cmd <- if (use_legacy_api()) {
    paste0("del ", path, key)
  } else {
    paste0("s3 rm ", path, key)
  }
  system2(s3cmd(), cmd)
}
The location of your S3 bucket as a prefix to \code{name}, 13 | for example, \code{"s3://mybucket/"} or \code{"s3://mybucket/myprefix/"}.} 14 | } 15 | \description{ 16 | Delete an R object from S3 by key 17 | } 18 | \seealso{ 19 | \code{\link{s3store}} 20 | } 21 | 22 | -------------------------------------------------------------------------------- /R/package.s3mpi.R: -------------------------------------------------------------------------------- 1 | #' Bi-directional communication with R and AWS S3. 2 | #' 3 | #' This package provides an interface to read and store arbitrary 4 | #' objects from and to Amazon AWS's S3 cloud storage. 5 | #' 6 | #' The exported helpers \code{s3read} and \code{s3store} 7 | #' allow, upon correct configuration of your S3 credentials, 8 | #' uploading to and downloading from S3 using R's built-in support 9 | #' for serializing and deserializing arbitrary objects (see 10 | #' \code{\link{readRDS}} and \code{\link{saveRDS}}). 11 | #' 12 | #' @name s3mpi 13 | #' @docType package 14 | #' @import AWS.tools crayon cacher digest 15 | NULL 16 | -------------------------------------------------------------------------------- /man/save_to_cache.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/s3cache.R 3 | \name{save_to_cache} 4 | \alias{save_to_cache} 5 | \title{Helper function for saving a file to a cache directory.} 6 | \usage{ 7 | save_to_cache(key, value, cache_dir = cache_directory()) 8 | } 9 | \arguments{ 10 | \item{key}{character. The key under which the cache entry is stored.} 11 | 12 | \item{value}{ANY. The R object to save in the cache.} 13 | 14 | \item{cache_dir}{character. The cache directory. The default is 15 | \code{cache_directory()}.} 16 | } 17 | \description{ 18 | Helper function for saving a file to a cache directory. 
#' Get your default s3path or error.
#' @export
s3path <- function() {
  ## The default S3 prefix, for example, `s3://yourbucket/yourprefix/`.
  ## You should set this in everyone's `~/.Rprofile` if
  ## you are using s3mpi to collaborate in a data science team.
  ## System environment variables are also accepted.
  configured_path <- get_option("s3mpi.path")

  ## Return early when a non-empty path has been configured.
  if (!is.null(configured_path) && nzchar(configured_path)) {
    return(configured_path)
  }

  stop("s3mpi package: Please set your s3 path using `S3MPI_PATH` system environment variable or ",
       "options(s3mpi.path = 's3://your_bucket/your/path/'). ",
       "This is where all of your uploaded R objects will be stored.")
}
18 | } 19 | 20 | -------------------------------------------------------------------------------- /man/s3cache.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/s3cache.R 3 | \name{s3cache} 4 | \alias{s3cache} 5 | \title{A caching layer around s3mpi calls.} 6 | \usage{ 7 | s3cache(s3key, value) 8 | } 9 | \arguments{ 10 | \item{s3key}{character. The full S3 key to attempt to read or write 11 | to the cache.} 12 | 13 | \item{value}{ANY. The R object to save in the cache. If missing, 14 | a cache read will be performed instead.} 15 | } 16 | \description{ 17 | Fetching large files from the S3 MPI can be expensive when performed 18 | multiple times. This method allows one to add a caching layer 19 | around S3 fetching. The user should specify the configuration option 20 | \code{options(s3mpi.cache = "some/dir")}. The recommended cache 21 | directory (where files will be stored) is \code{"~/.s3cache"}. 22 | } 23 | 24 | -------------------------------------------------------------------------------- /man/fetch_from_cache.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/s3cache.R 3 | \name{fetch_from_cache} 4 | \alias{fetch_from_cache} 5 | \title{Helper function for fetching a file from a cache directory.} 6 | \usage{ 7 | fetch_from_cache(key, cache_dir) 8 | } 9 | \arguments{ 10 | \item{key}{character. The key under which the cache entry is stored.} 11 | 12 | \item{cache_dir}{character. The cache directory. The default is 13 | \code{cache_directory()}.} 14 | } 15 | \value{ 16 | the cached object if the cache has not invalidated. Otherwise, 17 | return \code{s3mpi::not_cached}. 18 | } 19 | \description{ 20 | This function will also test to determine whether the file has been 21 | modified on S3 since the last cache save. 
If the file has never been 22 | cached or the cache is invalidated, it will return \code{s3mpi::not_cached}. 23 | } 24 | 25 | -------------------------------------------------------------------------------- /man/s3exists.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/s3exists.R 3 | \name{s3exists} 4 | \alias{s3exists} 5 | \title{Determine whether object exists on S3.} 6 | \usage{ 7 | s3exists(name, path = s3path()) 8 | } 9 | \arguments{ 10 | \item{name}{string. Name of file to look for} 11 | 12 | \item{path}{string. Path to file. If missing, the entire s3 path must be provided in name.} 13 | } 14 | \description{ 15 | Test whether or not the given object exists at the 16 | give S3 path. 17 | } 18 | \examples{ 19 | \dontrun{ 20 | s3exists("my/key") # Will look in bucket given by getOption("s3mpi.path") or 21 | from a system environment variable. 22 | # For example, if this option is "s3://mybucket/", then this query 23 | # will check for existence of the \\code{s3://mybucket/my/key} S3 path. 24 | 25 | s3exists("my/key", "s3://anotherbucket/") # We can of course change the bucket. 26 | } 27 | } 28 | 29 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: s3mpi 2 | Type: Package 3 | Title: R message passing interface using S3 storage 4 | URL: https://github.com/robertzk/s3mpi 5 | BugReports: https://github.com/robertzk/s3mpi/issues 6 | Description: Easily pass objects like lists or dataframes between consoles. 
7 | Version: 0.2.47 8 | Author: Robert Krzyzanowski 9 | Maintainer: Robert Krzyzanowski 10 | Authors@R: c(person("Robert", "Krzyzanowski", email = "technoguyrob@gmail.com", 11 | role = c("aut", "cre"))) 12 | Depends: 13 | R (>= 3.0.0) 14 | Imports: 15 | AWS.tools, 16 | cacher, 17 | crayon, 18 | digest 19 | Suggests: 20 | knitr, 21 | withr, 22 | testthat, 23 | testthatsomemore 24 | Remotes: kirillseva/cacher, 25 | robertzk/testthatsomemore, 26 | avantcredit/AWS.tools 27 | License: MIT 28 | LazyData: true 29 | Roxygen: list(wrap = FALSE) 30 | VignetteBuilder: knitr 31 | RoxygenNote: 5.0.0 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014-2016 Robert Krzyzanowski 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included 12 | in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 17 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 18 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 19 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 20 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
21 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: c 2 | sudo: true 3 | before_install: 4 | - curl -OL http://raw.github.com/craigcitro/r-travis/master/scripts/travis-tool.sh 5 | - chmod 755 ./travis-tool.sh 6 | - "./travis-tool.sh bootstrap" 7 | sudo: required 8 | git: 9 | submodules: false 10 | env: 11 | - global: 12 | - WARNINGS_ARE_ERRORS=1 13 | - _R_CHECK_FORCE_SUGGESTS_=0 14 | - LINTR_COMMENT_BOT=false 15 | - R_LIBS_USER=~/.R/library 16 | install: 17 | - mkdir -p "$R_LIBS_USER" 18 | - "./travis-tool.sh install_r devtools" 19 | - "./travis-tool.sh install_r rcmdcheck" 20 | - "./travis-tool.sh github_package jimhester/covr robertzk/testthatsomemore" 21 | - "./travis-tool.sh install_deps" 22 | script: 23 | - Rscript -e 'try(devtools::install(".")); r <- rcmdcheck::rcmdcheck(".", args = c("--no-manual")); quit(save = "no", status = if (length(c(r$errors, r$warnings)) > 1 || grepl("FAILED", r$output$stdout)) { 1 } else { 0 }, runLast = FALSE)' 24 | after_success: 25 | - Rscript -e 'library(covr);codecov()' 26 | notifications: 27 | email: 28 | on_success: change 29 | on_failure: change 30 | hipchat: 31 | on_success: change 32 | on_failure: change 33 | template: 34 | - "%{repository}#%{build_number} (%{branch} - %{commit} : %{author}): %{message} 35 | | Details: %{build_url} | Changes: %{compare_url}" 36 | rooms: 37 | secure: SQirvWbQ9b0roApmf3gt6JTcWIra9NguGzR45azxVDaWw2n0w/sIufA/cxa2sTLLhKfIMNlJwwhQjNaWyHeZkTTxRb76tcHWQdPpMcNKTwfr3+C3/bXDkmQozvQkYNgGuRc2Iln5ms12fiHmwI6dp5aENACUo5fbV2SLJQvmt5w= 38 | -------------------------------------------------------------------------------- /man/s3read.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/s3read.r 3 | \name{s3read} 4 | \alias{s3read} 5 | \title{Read an R 
object in S3 by key} 6 | \usage{ 7 | s3read(name, path = s3path(), cache = TRUE, serialize = TRUE, ...) 8 | } 9 | \arguments{ 10 | \item{name}{character. The key to grab from S3.} 11 | 12 | \item{path}{character. The location of your S3 bucket as a prefix to \code{name}, 13 | for example, \code{"s3://mybucket/"} or \code{"s3://mybucket/myprefix/"}.} 14 | 15 | \item{cache}{logical. If true, use the local s3cache if available. 16 | If false, do not use cache. By default, \code{TRUE}. Note this will 17 | consume local disk space for objects that have been \code{\link{s3read}}.} 18 | 19 | \item{serialize}{logical. If true, use \code{s3normalize} to serialize the model object.} 20 | 21 | \item{...}{Can be used internally to pass more arguments to \code{\link{s3.get}}.} 22 | } 23 | \description{ 24 | Any type of object that can be serialized as an RDS file 25 | is capable of being read using this interface. 26 | } 27 | \details{ 28 | If you wish to read non-vanilla R objects, such as those 29 | containing external pointers to C structures, see 30 | \code{\link{s3normalize}}. 31 | } 32 | \examples{ 33 | \dontrun{ 34 | s3store(c(1,2,3), "test123") 35 | print(s3read("test123")) 36 | # [1] 1 2 3 37 | 38 | s3store(function(x, y) { x + 2 * y }, "myfunc") 39 | stopifnot(s3read("myfunc")(1, 2) == 5) # R can serialize closures! 40 | } 41 | } 42 | \seealso{ 43 | \code{\link{s3store}} 44 | } 45 | 46 | -------------------------------------------------------------------------------- /man/s3store.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/s3store.r 3 | \name{s3store} 4 | \alias{s3put} 5 | \alias{s3store} 6 | \title{Store an R object in S3 by key.} 7 | \usage{ 8 | s3store(obj, name = NULL, path = s3path(), safe = FALSE, ...) 9 | 10 | s3put(..., safe = TRUE) 11 | } 12 | \arguments{ 13 | \item{obj}{ANY. 
An R object to save to S3.} 14 | 15 | \item{name}{character. The S3 key to save to. If no key is provided, 16 | the expression passed as \code{obj} will be used.} 17 | 18 | \item{path}{character. The S3 prefix, e.g., "s3://yourbucket/some/path/".} 19 | 20 | \item{safe}{logical. Whether or not to overwrite existing fails by 21 | default or error if they exist.} 22 | 23 | \item{...}{additional arguments to \code{s3mpi:::s3.put}.} 24 | } 25 | \description{ 26 | Any type of object that can be serialized as an RDS file 27 | is capable of being retrieved using this interface. 28 | } 29 | \details{ 30 | If you wish to store non-vanilla R objects, such as those 31 | containing external pointers to C structures, see 32 | \code{\link{s3normalize}}. 33 | } 34 | \note{ 35 | \code{s3put} is equivalent to \code{s3store} except that 36 | it will fail by default if you try to overwrite an existing key. 37 | } 38 | \examples{ 39 | \dontrun{ 40 | s3store(c(1,2,3), 'test123') 41 | print(s3read('test123')) 42 | # [1] 1 2 3 43 | 44 | s3store(function(x, y) { x + 2 * y }, "myfunc") 45 | stopifnot(s3read("myfunc")(1, 2) == 5) # R can serialize closures! 46 | 47 | obj <- 1:5 48 | s3store(obj) # If we do not pass a key the path is inferred from 49 | # the expression using deparse(substitute(...)). 50 | stopifnot(all.equal(s3read("obj"), 1:5)) 51 | } 52 | } 53 | \seealso{ 54 | \code{\link{s3read}} 55 | } 56 | 57 | -------------------------------------------------------------------------------- /R/s3exists.R: -------------------------------------------------------------------------------- 1 | #' Determine whether object exists on S3. 2 | #' 3 | #' Test whether or not the given object exists at the 4 | #' give S3 path. 5 | #' 6 | #' @param name string. Name of file to look for 7 | #' @param path string. Path to file. If missing, the entire s3 path must be provided in name. 
#' @export
#' @examples \dontrun{
#' s3exists("my/key") # Will look in bucket given by getOption("s3mpi.path") or
#'   from a system environment variable.
#'   # For example, if this option is "s3://mybucket/", then this query
#'   # will check for existence of the \code{s3://mybucket/my/key} S3 path.
#'
#' s3exists("my/key", "s3://anotherbucket/") # We can of course change the bucket.
#' }
s3exists <- function(name, path = s3path()) {
  if (is.null(name)) return(FALSE) # https://github.com/robertzk/s3mpi/issues/22
  path <- add_ending_slash(path)
  s3key <- paste(path, name, sep = "")
  s3key <- gsub("/$", "", s3key) # strip terminal /
  if (!grepl("^s3://", s3key)) {
    stop("s3 paths must begin with \"s3://\"")
  }

  results <- system2(s3cmd(), s3cmd_exists_command(s3key), stdout = TRUE)

  check_exists_results(name, results)
}

## Build the listing command for the configured S3 command-line client.
s3cmd_exists_command <- function(s3key) {
  if (use_legacy_api()) {
    paste("ls", s3key)
  } else {
    paste("s3", "ls", s3key)
  }
}

## Escape regex metacharacters so a key name is matched literally when
## interpolated into a grepl pattern.
escape_regex <- function(string) {
  gsub("([][{}()|^$.*+?\\\\])", "\\\\\\1", string)
}

check_exists_results <- function(name, results) {
  ## We know that the key exists if a result was returned, i.e., the
  ## shown regex gives a match. The key name is regex-escaped first so
  ## that names containing metacharacters (e.g. "data.rds") are matched
  ## literally; previously "." matched any character, so "data.rds"
  ## spuriously matched listing entries like "dataXrds".
  if (use_legacy_api()) {
    matches <- grepl(paste0(escape_regex(name), "(/[0-9A-Za-z]+)*/?$"), results)
  } else {
    matches <- grepl(paste0(escape_regex(basename(name)), "$"), results)
  }
  sum(matches) > 0
}
#'
#' @seealso \code{\link{s3store}}
#' @param name character. The key to grab from S3.
#' @param path character. The location of your S3 bucket as a prefix to \code{name},
#'    for example, \code{"s3://mybucket/"} or \code{"s3://mybucket/myprefix/"}.
#' @param cache logical. If true, use the local s3cache if available.
#'    If false, do not use cache. By default, \code{TRUE}. Note this will
#'    consume local disk space for objects that have been \code{\link{s3read}}.
#' @param serialize logical. If true, use \code{s3normalize} to serialize the model object.
#' @param ... Can be used internally to pass more arguments to \code{\link{s3.get}}.
#' @export
#' @examples
#' \dontrun{
#' s3store(c(1,2,3), "test123")
#' print(s3read("test123"))
#' # [1] 1 2 3
#'
#' s3store(function(x, y) { x + 2 * y }, "myfunc")
#' stopifnot(s3read("myfunc")(1, 2) == 5) # R can serialize closures!
#' }
s3read <- function(name, path = s3path(), cache = TRUE, serialize = TRUE, ...) {
  ## `cache` must be a scalar TRUE or FALSE (never NA or non-logical).
  stopifnot(isTRUE(cache) || identical(cache, FALSE))

  s3key <- paste0(add_ending_slash(path), name)

  ## Caching is in effect only when requested AND a cache directory is
  ## configured via options(s3mpi.cache = ...).
  use_cache <- isTRUE(cache) && !is.null(get_option("s3mpi.cache"))

  if (!use_cache) {
    value <- s3.get(s3key, cache = FALSE, ...)
  } else {
    value <- s3cache(s3key)
    if (is.not_cached(value)) {
      value <- s3.get(s3key, cache = TRUE, ...)
      ## Store to the file system cache before returning the value.
      s3cache(s3key, value)
    }
  }

  if (isTRUE(serialize)) {
    s3normalize(value, TRUE)
  } else {
    value
  }
}
#'
#' Any type of object that can be serialized as an RDS file
#' is capable of being retrieved using this interface.
#'
#' If you wish to store non-vanilla R objects, such as those
#' containing external pointers to C structures, see
#' \code{\link{s3normalize}}.
#'
#' @export
#' @seealso \code{\link{s3read}}
#' @param obj ANY. An R object to save to S3.
#' @param name character. The S3 key to save to. If no key is provided,
#'    the expression passed as \code{obj} will be used.
#' @param path character. The S3 prefix, e.g., "s3://yourbucket/some/path/".
#' @param safe logical. If \code{TRUE}, error when the key already exists
#'    instead of overwriting the existing file. By default, \code{FALSE}.
#' @param ... additional arguments to \code{s3mpi:::s3.put}.
#' @return the full S3 key written to, invisibly.
#' @examples
#' \dontrun{
#' s3store(c(1,2,3), 'test123')
#' print(s3read('test123'))
#' # [1] 1 2 3
#'
#' s3store(function(x, y) { x + 2 * y }, "myfunc")
#' stopifnot(s3read("myfunc")(1, 2) == 5) # R can serialize closures!
#'
#' obj <- 1:5
#' s3store(obj) # If we do not pass a key the path is inferred from
#'   # the expression using deparse(substitute(...)).
#' stopifnot(all.equal(s3read("obj"), 1:5))
#' }
s3store <- function(obj, name = NULL, path = s3path(), safe = FALSE, ...) {
  ## Fall back to the deparsed calling expression as the key. Checking
  ## is.null (rather than missing) also covers an explicitly passed
  ## `name = NULL` (the documented default), which previously slipped
  ## through and produced a key equal to the bare path.
  if (is.null(name)) {
    name <- deparse(substitute(obj))
  }

  path <- add_ending_slash(path)

  s3key <- paste(path, name, sep = "")
  ## s3exists accepts no additional arguments, so `...` (intended for
  ## s3.put, e.g. storage_format) must not be forwarded here.
  if (isTRUE(safe) && s3exists(name, path = path)) {
    stop("An object with name ", name, " on path ", path,
         " already exists. Use `safe = FALSE` to overwrite\n",
         "-----------------------^")
  }

  obj4save <- s3normalize(obj, FALSE)
  s3.put(obj4save, path, name, ...)

  ## Mirror the stored value into the local file-system cache when enabled.
  if (!is.null(get_option("s3mpi.cache"))) {
    s3cache(s3key, obj4save)
  }

  if (is.environment(obj4save)) {
    s3normalize(obj4save) # Revert side effects
  }

  invisible(s3key)
}

#' @export
#' @rdname s3store
#' @note \code{s3put} is equivalent to \code{s3store} except that
#'   it will fail by default if you try to overwrite an existing key.
s3put <- function(..., safe = TRUE) {
  s3store(..., safe = safe)
}
R object to store to S3.} 34 | 35 | \item{name}{character.} 36 | 37 | \item{check_exists}{logical. Whether or not to check if an object already exists at the specificed location.} 38 | 39 | \item{num_retries}{numeric. the number of times to retry uploading.} 40 | 41 | \item{backoff}{numeric. Vector, with each element in seconds, describing the 42 | exponential backoff to be used in conjunction with the num_retries argument. 43 | Number of elements must equal num_retries. Defaults to 4, 8, 16, 32, etc.} 44 | 45 | \item{max_backoff}{numeric. Number describing the maximum seconds s3mpi will sleep 46 | prior to retrying an upload. Defaults to 128 seconds.} 47 | 48 | \item{row.names}{logical. Whether or not to write row names when writing CSV's or tables.} 49 | 50 | \item{storage_format}{character. What format to store files in. Defaults to RDS.} 51 | } 52 | \value{ 53 | For \code{s3.get}, the R object stored in RDS format on S3 in the \code{path}. 54 | For \code{s3.put}, the system exit code from running the \code{s3cmd} 55 | command line tool to perform the upload. 56 | } 57 | \description{ 58 | Fetch an R object from an S3 path. 59 | } 60 | 61 | -------------------------------------------------------------------------------- /man/s3normalize.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/s3normalize.R 3 | \name{s3normalize} 4 | \alias{s3normalize} 5 | \title{Convert a possibly non-serializable R object to a serializable R object.} 6 | \usage{ 7 | s3normalize(object, read = TRUE) 8 | } 9 | \arguments{ 10 | \item{object}{ANY. The R object to normalize. If it has an 11 | \code{"s3mpi.serialize"} attribute consisting of a list with 12 | \code{"read"} and \code{"write"} keys, these arity-1 functions 13 | will be called with the \code{object} prior to reading from and 14 | writing to S3, respectively.} 15 | 16 | \item{read}{logical. 
If \code{TRUE}, the \code{"read"} key of the 17 | \code{"s3mpi.serialize"} attribute, which should be a 1-argument 18 | function, will be invoked on the object. Otherwise, the \code{"write"} 19 | key will be invoked. By default, \code{read} is TRUE.} 20 | } 21 | \value{ 22 | A previously possibly non-vanilla R object (that is, 23 | an R object that may contain external pointers to non-R objects, 24 | such as vanilla C structs) converted to a totally vanilla R object 25 | (for example, by replacing the pointers with \code{\link{raw}} binary data). 26 | } 27 | \description{ 28 | R has good foreign function interface bindings to C code. As such, 29 | certain package authors may wish to optimize their code by keeping 30 | their objects in C structures instead of R SEXPs (the standard for 31 | object representation in the R interpreter). This also applies 32 | to bindings to external libraries. The speed advantage can be 33 | substantial, so this is not an uncommon use case. The \code{s3normalize} 34 | helper provides the ability to add an additional "preprocessor" 35 | layer prior to storing an object to S3 that converts a non-serializable 36 | object (such as a list with one of its entries pointing to an 37 | external C structure) to serialize object (such as that list with 38 | its C structure pointer entry replaced by a \code{\link{raw}} vector). 39 | } 40 | \details{ 41 | If the object being uploaded with \code{s3store} or downloaded wiht 42 | \code{s3read} has an attribute \code{"s3mpi.serialize"} which must 43 | be a list with keys \code{c("read", "write")}, these keys should 44 | hold functions requiring a single argument which are applied to 45 | the object prior to \emph{reading} from (\code{s3read}) and \emph{writing} 46 | to (\code{s3store}) S3, respectively. This allows s3mpi storage 47 | of not only vanilla R objects but \emph{arbitrary objects in memory} 48 | (whether they are internally represented by a C, Rust, Java, etc. process). 
49 | } 50 | 51 | -------------------------------------------------------------------------------- /tests/testthat/test-s3read.R: -------------------------------------------------------------------------------- 1 | context("s3read") 2 | library(testthatsomemore) 3 | 4 | withr::with_options(list( 5 | s3mpi.path = "s3://test/", 6 | s3mpi.cache = NULL 7 | ), { 8 | describe("cache parameter validation", { 9 | with_mock( 10 | `s3mpi:::s3.get` = function(...) "value", 11 | `s3mpi:::s3cache` = function(...) TRUE, { 12 | test_that("if cache is not TRUE or FALSE, it errors", { 13 | expect_error(s3read("key", cache = "pizza", serialize = FALSE)) 14 | expect_error(s3read("key", cache = 23, serialize = FALSE)) 15 | expect_error(s3read("key", cache = iris, serialize = FALSE)) 16 | expect_error(s3read("key", cache = NA, serialize = FALSE)) 17 | }) 18 | test_that("if cache is TRUE, it does not error", { 19 | expect_equal(s3read("key", cache = TRUE, serialize = FALSE), "value") 20 | }) 21 | test_that("if cache is FALSE, it does not error", { 22 | expect_equal(s3read("key", cache = FALSE, serialize = FALSE), "value") 23 | }) 24 | }) 25 | }) 26 | 27 | test_that("if the path does not end in a slash, the slash is added", { 28 | map <- list2env(list("s3://path/key" = "value")) 29 | with_mock( 30 | `s3mpi:::s3.get` = function(...) map[[..1]], { 31 | expect_equal(s3read("key", path = "s3://path"), "value") 32 | }) 33 | }) 34 | 35 | test_that("it can fetch raw values if the caching layer is disabled", { 36 | map <- list2env(list("s3://test/key" = "value")) 37 | with_mock(`s3mpi:::s3.get` = function(...) map[[..1]], { 38 | expect_equal(s3read("key", cache = FALSE), "value") 39 | map$`s3://test/key` <- "new_value" 40 | # Make sure we are not caching. 
41 | expect_equal(s3read("key", cache = FALSE), "new_value") 42 | }) 43 | }) 44 | 45 | test_that("it can fetch unraw values if the caching layer is enabled", { 46 | map <- list2env(list("s3://test/key" = "value")) 47 | cachedir <- tempdir() 48 | dir.create(cachedir, FALSE, TRUE) 49 | opts <- options(s3mpi.cache = cachedir) 50 | on.exit(options(opts), add = TRUE) 51 | 52 | with_mock( 53 | `s3mpi:::s3.get` = function(...) map[[..1]], 54 | `s3mpi:::s3cache` = function(...) "value", { 55 | expect_equal(s3read("key"), "value") 56 | map$`s3://test/key` <- "new_value" 57 | # Make sure we are caching. 58 | expect_equal(s3read("key"), "value") 59 | }) 60 | }) 61 | 62 | test_that("it can fetch unraw values if the caching layer is enabled but is uncached", { 63 | map <- list2env(list("s3://test/key" = "value")) 64 | cachedir <- tempdir() 65 | dir.create(cachedir, FALSE, TRUE) 66 | opts <- options(s3mpi.cache = cachedir) 67 | on.exit(options(opts), add = TRUE) 68 | 69 | with_mock( 70 | `s3mpi:::s3.get` = function(...) map[[..1]], 71 | `s3mpi:::s3cache` = function(...) not_cached, { 72 | expect_equal(s3read("key"), "value") 73 | map$`s3://test/key` <- "new_value" 74 | # Make sure we are not caching. 75 | expect_equal(s3read("key"), "new_value") 76 | }) 77 | }) 78 | }) 79 | 80 | -------------------------------------------------------------------------------- /R/s3normalize.R: -------------------------------------------------------------------------------- 1 | ## The roxygen documentation here is pretty thorough. In effect, if 2 | ## we wish to use s3mpi to store C, Java, etc. 
objects that are 3 | ## needed for our R code to run, we can do something like: 4 | ## 5 | ## ```r 6 | ## obj <- list(atomic_vector = 1:10, external_object = ptr_to_c_object) 7 | ## attr(obj, "s3mpi.serialize") <- list( 8 | ## "write" = function(object) { 9 | ## obj$external_object <- convert_ptr_to_raw_vector(obj$external_object) 10 | ## }, 11 | ## "read" = function(object) { 12 | ## obj$external_object <- convert_raw_vector_to_ptr(obj$external_object) 13 | ## }) 14 | ## 15 | ## s3store(obj, "some/key") # Will invoke the write function prior to 16 | ## # calling saveRDS and uploading the serialized object. 17 | ## s3read("some/key") # Will invoke the read function after downloading 18 | ## # the serialized object and calling readRDS. 19 | ## ``` 20 | #' Convert a possibly non-serializable R object to a serializable R object. 21 | #' 22 | #' R has good foreign function interface bindings to C code. As such, 23 | #' certain package authors may wish to optimize their code by keeping 24 | #' their objects in C structures instead of R SEXPs (the standard for 25 | #' object representation in the R interpreter). This also applies 26 | #' to bindings to external libraries. The speed advantage can be 27 | #' substantial, so this is not an uncommon use case. The \code{s3normalize} 28 | #' helper provides the ability to add an additional "preprocessor" 29 | #' layer prior to storing an object to S3 that converts a non-serializable 30 | #' object (such as a list with one of its entries pointing to an 31 | #' external C structure) to serialize object (such as that list with 32 | #' its C structure pointer entry replaced by a \code{\link{raw}} vector). 
33 | #' 34 | #' If the object being uploaded with \code{s3store} or downloaded with 35 | #' \code{s3read} has an attribute \code{"s3mpi.serialize"} which must 36 | #' be a list with keys \code{c("read", "write")}, these keys should 37 | #' hold functions requiring a single argument which are applied to 38 | #' the object prior to \emph{reading} from (\code{s3read}) and \emph{writing} 39 | #' to (\code{s3store}) S3, respectively. This allows s3mpi storage 40 | #' of not only vanilla R objects but \emph{arbitrary objects in memory} 41 | #' (whether they are internally represented by a C, Rust, Java, etc. process). 42 | #' 43 | #' @param object ANY. The R object to normalize. If it has an 44 | #' \code{"s3mpi.serialize"} attribute consisting of a list with 45 | #' \code{"read"} and \code{"write"} keys, these arity-1 functions 46 | #' will be called with the \code{object} prior to reading from and 47 | #' writing to S3, respectively. 48 | #' @param read logical. If \code{TRUE}, the \code{"read"} key of the 49 | #' \code{"s3mpi.serialize"} attribute, which should be a 1-argument 50 | #' function, will be invoked on the object. Otherwise, the \code{"write"} 51 | #' key will be invoked. By default, \code{read} is TRUE. 52 | #' @return A previously possibly non-vanilla R object (that is, 53 | #' an R object that may contain external pointers to non-R objects, 54 | #' such as vanilla C structs) converted to a totally vanilla R object 55 | #' (for example, by replacing the pointers with \code{\link{raw}} binary data). 56 | #' @export 57 | s3normalize <- function(object, read = TRUE) { 58 | if (utils::object.size(object) == 0) { 59 | warning("In s3mpi package: size-0 object is being normalized", call.
= TRUE) 60 | NULL 61 | } else if (read) { 62 | (attr(object, "s3mpi.serialize")$read %||% identity)(object) 63 | } else { 64 | (attr(object, "s3mpi.serialize")$write %||% identity)(object) 65 | } 66 | } 67 | 68 | -------------------------------------------------------------------------------- /R/s3.put.R: -------------------------------------------------------------------------------- 1 | #' @param x ANY. R object to store to S3. 2 | #' @param name character. 3 | #' @param check_exists logical. Whether or not to check if an object already exists at the specified location. 4 | #' @param num_retries numeric. The number of times to retry uploading. 5 | #' @param backoff numeric. Vector, with each element in seconds, describing the 6 | #' exponential backoff to be used in conjunction with the num_retries argument. 7 | #' Number of elements must equal num_retries. Defaults to 4, 8, 16, 32, etc. 8 | #' @param max_backoff numeric. Number describing the maximum seconds s3mpi will sleep 9 | #' prior to retrying an upload. Defaults to 128 seconds. 10 | #' @param storage_format character. What format to store files in. Defaults to RDS. 11 | #' @param row.names logical. Whether or not to write row names when writing CSV's or tables. 12 | #' @param ... additional arguments to pass to the saving function. 13 | #' @rdname s3.get 14 | s3.put <- function (x, path, name, bucket_location = "US", 15 | debug = FALSE, check_exists = TRUE, 16 | num_retries = get_option("s3mpi.num_retries", 0), backoff = 2 ^ seq(2, num_retries + 1), 17 | max_backoff = 128, storage_format = c("RDS", "CSV", "table"), row.names = FALSE, ...) 
{ 18 | storage_format <- match.arg(storage_format) 19 | 20 | if (!is.data.frame(x) && storage_format %in% c("CSV", "table")) { 21 | stop("You can't store an object in ", storage_format," format if it isn't a data.frame.") 22 | } 23 | 24 | s3key <- paste(path, name, sep = "") 25 | ## This inappropriately-named function actually checks existence 26 | ## of an entire *s3key*, not a bucket. 27 | AWS.tools:::check.bucket(s3key) 28 | 29 | ## Ensure backoff vector has correct number of elements and is capped 30 | if (num_retries > 0) { 31 | if (length(backoff) != num_retries) { 32 | stop("Your backoff vector length must match the number of retries.") 33 | } 34 | backoff <- pmin(backoff, max_backoff) 35 | } 36 | 37 | ## We create a temporary file, *write* the R object to the file, and then 38 | ## upload that file to S3. This magic works thanks to R's fantastic 39 | ## support for [arbitrary serialization](https://stat.ethz.ch/R-manual/R-patched/library/base/html/readRDS.html) 40 | ## (including closures!). 41 | x.serialized <- tempfile(); 42 | dir.create(dirname(x.serialized), showWarnings = FALSE, recursive = TRUE) 43 | on.exit(unlink(x.serialized, force = TRUE), add = TRUE) 44 | save_to_file <- get(paste0("save_as_", storage_format)) 45 | save_to_file(x, x.serialized, row.names, ...)
46 | 47 | cmd <- s3cmd_put_command(s3key, x.serialized, bucket_location_to_flag(bucket_location), debug) 48 | run_system_put(path, name, cmd, check_exists, num_retries, backoff) 49 | } 50 | 51 | run_system_put <- function(path, name, s3.cmd, check_exists, num_retries, backoff) { 52 | ret <- system2(s3cmd(), s3.cmd, stdout = TRUE) 53 | if (isTRUE(check_exists) && !s3exists(name, path)) { 54 | if (num_retries > 0) { 55 | Sys.sleep(backoff[length(backoff) - num_retries + 1]) 56 | Recall(path = path, name = name, s3.cmd = s3.cmd, 57 | check_exists = check_exists, 58 | num_retries = num_retries - 1, backoff = backoff) 59 | } else { 60 | stop("Object could not be successfully stored.") 61 | } 62 | } else { 63 | ret 64 | } 65 | } 66 | 67 | s3cmd_put_command <- function(s3key, file, bucket_flag, debug) { 68 | if (use_legacy_api()) { 69 | paste("put", file, paste0('"', s3key, '"'), 70 | bucket_flag, ifelse(debug, "--debug", ""), "--force") 71 | } else { 72 | paste("s3 cp", file, s3key) 73 | } 74 | } 75 | 76 | save_as_RDS <- function(x, filename, ...) { 77 | saveRDS(x, filename, ...) 78 | } 79 | 80 | 81 | save_as_CSV <- function(x, filename, row.names, ...) { 82 | write.csv(x, filename, row.names = row.names, ...) 83 | } 84 | 85 | save_as_table <- function(x, filename, row.names, ...) { 86 | write.table(x, filename, row.names = row.names, ...) 87 | } 88 | -------------------------------------------------------------------------------- /tests/testthat/test-s3store.R: -------------------------------------------------------------------------------- 1 | context("s3store") 2 | library(testthatsomemore) 3 | 4 | withr::with_options(list( 5 | s3mpi.path = "s3://test/", 6 | s3mpi.cache = NULL 7 | ), { 8 | test_that("it stops if safe is enabled and we overwrite", { 9 | testthatsomemore::package_stub("s3mpi", "s3exists", function(...) 
TRUE, { 10 | expect_error(s3store("foo", "bar", safe = TRUE), "already exists") 11 | }) 12 | }) 13 | 14 | test_that("it can store raw values if the caching layer is disabled", { 15 | map <- list2env(list("s3://test/key" = NULL)) 16 | testthatsomemore::package_stub("s3mpi", "s3.get", function(...) map[[..1]], { 17 | testthatsomemore::package_stub("s3mpi", "s3.put", function(...) map[[paste0(..2, ..3)]] <- ..1, { 18 | s3store("value", "key") 19 | expect_equal(s3read("key"), "value") 20 | map$`s3://test/key` <- "new_value" 21 | # Make sure we are not caching. 22 | expect_equal(s3read("key", cache = FALSE), "new_value") 23 | })}) 24 | }) 25 | 26 | test_that("it can store values if the caching layer is enabled", { 27 | map <- list2env(list("s3://test/key" = NULL)) 28 | map2 <- new.env(parent = map) 29 | testthatsomemore::package_stub("s3mpi", "s3.get", function(...) map2[[..1]], { 30 | testthatsomemore::package_stub("s3mpi", "s3.put", function(...) map2[[paste0(..2, ..3)]] <- ..1, { 31 | s3store("value", "key") 32 | expect_equal(s3read("key"), "value") 33 | map$`s3://test/key` <- "new_value" 34 | # Make sure we are not caching. 35 | expect_equal(s3read("key"), "value") 36 | })}) 37 | }) 38 | 39 | test_that("it denormalizes", { 40 | map <- list2env(list("s3://test/key" = "value")) 41 | 42 | testthatsomemore::package_stub("s3mpi", "s3normalize", function(a, b) { map$norm <- missing(b); a }, { 43 | testthatsomemore::package_stub("s3mpi", "s3.get", function(...) map[[..1]], { 44 | testthatsomemore::package_stub("s3mpi", "s3.put", function(...) map[[paste0(..2, ..3)]] <- ..1, { 45 | s3store("value", "key") 46 | expect_false(map$norm) 47 | s3store(new.env(), "key2") 48 | expect_true(map$norm) 49 | })})}) 50 | }) 51 | 52 | test_that("it can pick up missing key", { 53 | map <- list2env(list("s3://test/key" = NULL)) 54 | testthatsomemore::package_stub("s3mpi", "s3.get", function(...) map[[..1]], { 55 | testthatsomemore::package_stub("s3mpi", "s3.put", function(...) 
map[[paste0(..2, ..3)]] <- ..1, { 56 | key <- "value" 57 | s3store(key) 58 | expect_equal(s3read("key"), "value") 59 | })}) 60 | }) 61 | 62 | test_that("it produces an error when the object isn't found with an s3exists following the s3.put", { 63 | testthatsomemore::package_stub("base", "system2", function(...) TRUE, { 64 | testthatsomemore::package_stub("s3mpi", "s3exists", function(...) FALSE, { 65 | testthatsomemore::package_stub("s3mpi", "s3.put", function(...) run_system_put(..2, ..3, "", TRUE, 0), { 66 | expect_error(s3store("value", "key")) 67 | })})}) 68 | }) 69 | 70 | test_that("it does not produce an error when the object is found with an s3exists following the s3.put", { 71 | testthatsomemore::package_stub("base", "system2", function(...) TRUE, { 72 | testthatsomemore::package_stub("s3mpi", "s3exists", function(...) TRUE, { 73 | testthatsomemore::package_stub("s3mpi", "s3.put", function(...) run_system_put(..2, ..3, "", TRUE, 0), { 74 | expect_error(s3store("value", "key"), NA) 75 | })})}) 76 | }) 77 | 78 | calling_intervals <- NULL 79 | test_that("it can retry with the correct timing when an s3exists returns FALSE", { 80 | testthatsomemore::package_stub("base", "Sys.sleep", function(...) calling_intervals <<- c(calling_intervals, ..1), { 81 | testthatsomemore::package_stub("base", "system2", function(...) TRUE, { 82 | testthatsomemore::package_stub("s3mpi", "s3exists", function(...) FALSE, { 83 | testthatsomemore::package_stub("s3mpi", "s3.put", function(...) run_system_put(..2, ..3, "", TRUE, 3, c(1, 2, 100)), { 84 | try(run_system_put(,, "", TRUE, 3, c(1, 2, 100)), silent = TRUE) 85 | expect_identical(calling_intervals, c(1, 2, 100)) 86 | })})})}) 87 | }) 88 | }) 89 | 90 | 91 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # Version 0.2.47 2 | 3 | * Fix bug in `s3exists` when using `aws-cli`. 
4 | 5 | # Version 0.2.46 6 | 7 | * Fix typo in `last_modified`. 8 | 9 | # Version 0.2.45 10 | 11 | * Compatibility with `aws-cli`. 12 | 13 | # Version 0.2.44 14 | 15 | * Bugfixes and better compatibility with system environment variables. 16 | 17 | # Version 0.2.43 18 | 19 | * Allowing s3mpi.path and s3mpi.cache to be called via system environment variables. 20 | 21 | # Version 0.2.42 22 | 23 | * Better error output in `s3read` if path does not exist. (see [issue #72](https://github.com/robertzk/s3mpi/issues/72)) 24 | 25 | # Version 0.2.41 26 | 27 | * Fix incorrect `strptime` version specification. 28 | 29 | # Version 0.2.40 30 | 31 | * Remove overly complicated metaprogramming. 32 | 33 | # Version 0.2.33-9 34 | 35 | * Other fixes for s4cmd. 36 | 37 | # Version 0.2.32 38 | 39 | * allow choice of storage format in `s3read` and `s3store`. Defaults to `RDS`, 40 | and now you can choose `CSV` or `table` for data frames. 41 | 42 | # Version 0.2.31 43 | 44 | * Make `s3store` work with s4cmd. 45 | 46 | # Version 0.2.30 47 | 48 | * Don't set --bucket-location flag if `s4cmd` is detected as the `s3cmd` binary. 49 | 50 | # Version 0.2.29 51 | 52 | * Revert the change in 0.2.21 in favor of using `s3cmd info` over 53 | `s3cmd ls` to obtain updated_at information on files. 54 | 55 | # Version 0.2.28 56 | 57 | * Add exponential backoff logic to s3.put function. 58 | 59 | # Version 0.2.27 60 | * Turn off the LRU cache too when `cache = FALSE` in `s3read`. 61 | 62 | # Version 0.2.26 63 | * `options(s3mpi.num_retries)` now allows you to specify default number of retries globally. 64 | 65 | # Version 0.2.25 66 | * Automatically adds ending slashes to paths if they are missing when using 67 | `s3store`, `s3exists`, or `s3delete`. 68 | 69 | # Version 0.2.24 70 | 71 | * Add the ability to delete an object in s3 using `s3delete`. 72 | 73 | # Version 0.2.23 74 | 75 | * `s3path()` is exported. 76 | 77 | # Version 0.2.22 78 | 79 | * `s3read()` (with no arguments) is no longer supported. 
80 | 81 | # Version 0.2.21 82 | 83 | * Fixed an issue where reading files that have the same prefix as another file 84 | on the S3 bucket generates a warning. 85 | * Fix a more serious problem where writing and reading within the same minute 86 | produces incorrect results due to the s3cmd utility having *minute*-level 87 | rather than second-level granularity. 88 | 89 | # Version 0.2.20 90 | 91 | * Workaround for the silent but oh-so-deadly sporadic failure of s3cmd's put. 92 | By default we now check for the existence of the object when issuing a put, 93 | with the option to retry a number of times. 94 | 95 | # Version 0.2.19 96 | 97 | * Keep AWS.tools on a remote. 98 | 99 | # Version 0.2.18 100 | 101 | * Add remotes to DESCRIPTION. 102 | 103 | # Version 0.2.17 104 | 105 | * Explicitly create the directory of a file given by `tempfile()` to prevent 106 | rare errors wherein the directory does not exist and yields a 107 | file connection error. ([#41](https://github.com/robertzk/s3mpi/issues/41)) 108 | 109 | # Version 0.2.16 110 | 111 | * Introduce an `s3mpi.disable_lru_cache` option as well as 112 | silently fail if storage to LRU does not succeed. 113 | 114 | # Version 0.2.15 115 | 116 | * Switch to `system2`, which should be more windows friendly, and allow 117 | the user to specify path to executable of s3cmd, by setting `options(s3mpi.s3cmd_path = '/usr/local/bin/s3cmd')` 118 | 119 | # Version 0.2.13 120 | 121 | * Fixup LRU cache to actually use size parameter option. 122 | 123 | # Version 0.2.11 124 | 125 | * A stylistic refactor of the package. The `.path` argument 126 | has been deprecated in `s3read` and `s3store` in favor of 127 | simply `path`. 128 | 129 | # Version 0.2.9-10 130 | 131 | * Remove the need to type a trailing slash in `.path`. 132 | 133 | # Version 0.2.8 134 | 135 | * A hotfix for cache corruption, where data exists without metadata. 136 | It can happen if writing metadata ever fails. 
137 | 138 | # Version 0.2.7 139 | 140 | * Remove the `s3mpi.memoize_cache` global option, since it makes no sense. 141 | A user could have overwritten an S3 key in a different R session. 142 | 143 | * `s3exists(NULL)` now returns FALSE. Fixes issue #22. 144 | 145 | # Version 0.2.5-6 146 | 147 | * The `s3mpi.memoize_cache` global controls whether or not caching is 148 | [memoised](https://github.com/hadley/memoise). If set to `TRUE`, it would 149 | have the effect of keeping a common object in the R session instead of 150 | retrieving it from the cached file for each given s3 key. This can significantly 151 | speed up code that reads from the same S3 key multiple times within a 152 | single R session. 153 | 154 | # Version 0.2.4 155 | 156 | * The safety check on `s3store` uses `safe = FALSE` by default now. The new 157 | function `s3put` is equivalent to `s3store` and should be used going forward 158 | if one does not wish to overwrite existing keys. The other approach was causing 159 | too many breaking changes to existing codebases. 160 | 161 | # Version 0.2.2 162 | 163 | * Added a safety check for `s3store`. Now if you want to overwrite a key inside a bucket, 164 | you need to use `s3store(key, safe = FALSE)`. By default safe is set to `TRUE`. 165 | 166 | # Version 0.2.0 167 | 168 | * Added a caching mechanism that will keep copies of files downloaded and 169 | uploaded to S3. Useful if local storage constraints are not an issue. 170 | To enable, set `options(s3mpi.cache = '~/.s3cache')` in your `~/.Rprofile` 171 | (or replace `'~/.s3cache'` with a directory of your choice). 172 | -------------------------------------------------------------------------------- /R/s3.get.R: -------------------------------------------------------------------------------- 1 | #' Fetch an R object from an S3 path. 2 | #' 3 | #' @param path character. A full S3 path. 4 | #' @param bucket_location character. Usually \code{"US"}. 5 | #' @param verbose logical. 
If \code{TRUE}, the \code{s3cmd} 6 | #' utility verbose flag will be set. 7 | #' @param debug logical. If \code{TRUE}, the \code{s3cmd} 8 | #' utility debug flag will be set. 9 | #' @param cache logical. If \code{TRUE}, an LRU in-memory cache will be referenced. 10 | #' @param storage_format character. What format the object is stored in. Defaults to RDS. 11 | #' @aliases s3.put 12 | #' @return For \code{s3.get}, the R object stored in RDS format on S3 in the \code{path}. 13 | #' For \code{s3.put}, the system exit code from running the \code{s3cmd} 14 | #' command line tool to perform the upload. 15 | s3.get <- function (path, bucket_location = "US", verbose = FALSE, debug = FALSE, cache = TRUE, storage_format = c("RDS", "CSV", "table"), ...) { 16 | storage_format <- match.arg(storage_format) 17 | 18 | ## This inappropriately-named function actually checks existence 19 | ## of a *path*, not a bucket. 20 | AWS.tools:::check.bucket(path) 21 | 22 | # Helper function for fetching data from s3 23 | fetch <- function(path, storage_format, bucket_location, ...) { 24 | x.serialized <- tempfile() 25 | dir.create(dirname(x.serialized), showWarnings = FALSE, recursive = TRUE) 26 | ## We remove the file [when we exit the function](https://stat.ethz.ch/R-manual/R-patched/library/base/html/on.exit.html). 27 | on.exit(unlink(x.serialized), add = TRUE) 28 | 29 | if (file.exists(x.serialized)) { 30 | unlink(x.serialized, force = TRUE) 31 | } 32 | 33 | ## Run the s3cmd tool to fetch the file from S3. 34 | cmd <- s3cmd_get_command(path, x.serialized, bucket_location_to_flag(bucket_location), verbose, debug) 35 | status <- system2(s3cmd(), cmd) 36 | 37 | if (as.logical(status)) { 38 | warning("Nothing exists for key ", path) 39 | `attr<-`(`class<-`(data.frame(), c("s3mpi_error", status)), "key", path) 40 | } else { 41 | ## And then read it back in RDS format. 42 | load_from_file <- get(paste0("load_as_", storage_format)) 43 | load_from_file(x.serialized, ...) 
44 | } 45 | } 46 | 47 | ## Check for the path in the cache 48 | ## If it does not exist, create and return its entry. 49 | ## The `s3LRUcache` helper is defined in utils.R 50 | if (is.windows() || isTRUE(get_option("s3mpi.disable_lru_cache")) || !isTRUE(cache)) { 51 | ## We do not have awk, which we will need for the moment to 52 | ## extract the modified time of the S3 object. 53 | ans <- fetch(path, storage_format, bucket_location, ...) 54 | } else if (!s3LRUcache()$exists(path)) { 55 | ans <- fetch(path, storage_format, bucket_location, ...) 56 | ## We store the value of the R object in a *least recently used cache*, 57 | ## expecting the user to not think about optimizing their code and 58 | ## call `s3read` with the same key multiple times in one session. With 59 | ## this approach, we keep the latest 10 object in RAM and do not have 60 | ## to reload them into memory unnecessarily--a wise time-space trade-off! 61 | tryCatch(s3LRUcache()$set(path, ans), error = function(...) { 62 | warning("Failed to store object in LRU cache. Repeated calls to ", 63 | "s3read will not benefit from a performance speedup.") 64 | }) 65 | } else { 66 | # Check time on s3LRUcache's copy 67 | last_cached <- s3LRUcache()$last_accessed(path) # assumes a POSIXct object 68 | 69 | # Check time on s3 remote's copy using the `s3cmd info` command. 70 | s3.cmd <- paste("info ", path, "| head -n 3 | tail -n 1") 71 | result <- system2(s3cmd(), s3.cmd, stdout = TRUE, stderr = NULL) 72 | # The `s3cmd info` command produces the output 73 | # " Last mod: Tue, 16 Jun 2015 19:36:10 GMT" 74 | # in its third line, so we subset to the 20-39 index range 75 | # to extract "16 Jun 2015 19:36:10". 76 | result <- substring(result, 20, 39) 77 | last_updated <- strptime(result, format = "%d %b %Y %H:%M:%S", tz = "GMT") 78 | 79 | if (last_updated > last_cached) { 80 | ans <- fetch(path, storage_format, bucket_location, ...) 
81 | s3LRUcache()$set(path, ans) 82 | } else { 83 | ans <- s3LRUcache()$get(path) 84 | } 85 | } 86 | ans 87 | } 88 | 89 | s3cmd_get_command <- function(path, file, bucket_flag, verbose, debug) { 90 | if (use_legacy_api()) { 91 | paste("get", paste0('"', path, '"'), file, 92 | bucket_flag, 93 | if (verbose) "--verbose --progress" else "--no-progress", 94 | if (debug) "--debug" else "") 95 | } else { 96 | paste0("s3 cp ", path, " ", file) 97 | } 98 | } 99 | 100 | ## Given an s3cmd path and a bucket location, will construct a flag 101 | ## argument for s3cmd. If it looks like the s3cmd is actually 102 | ## pointing to an s4cmd, return empty string as s4cmd doesn't 103 | ## support bucket location. 104 | bucket_location_to_flag <- function(bucket_location) { 105 | if (grepl("s4cmd", s3cmd())) { 106 | if (bucket_location != "US") { 107 | warning(paste0("Ignoring non-default bucket location ('", 108 | bucket_location, 109 | "') in s3mpi::s3.get since s4cmd was detected", 110 | "-- this might be a little slower but is safe to ignore.")); 111 | } 112 | return("") 113 | } 114 | return(paste("--bucket_location", bucket_location)) 115 | } 116 | 117 | load_as_RDS <- function(filename, ...) { 118 | readRDS(filename, ...) 119 | } 120 | 121 | load_as_CSV <- function(filename, ...) { 122 | read.csv(filename, ..., stringsAsFactors = FALSE) 123 | } 124 | 125 | load_as_table <- function(filename, ...) { 126 | read.table(filename, ..., stringsAsFactors = FALSE) 127 | } 128 | 129 | #' Printing for s3mpi errors. 130 | #' 131 | #' @param x ANY. R object to print. 132 | #' @param ... additional objects to pass to print function. 133 | #' @export 134 | print.s3mpi_error <- function(x, ...) 
{ 135 | cat("Error reading from S3: key", crayon::white$bold(attr(x, "key")), "not found.\n") 136 | } 137 | -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | ## A standard helper: if `x` is null, `y` will be returned instead. 2 | `%||%` <- function(x, y) if (is.null(x)) y else x 3 | 4 | ## A package specific environment 5 | .s3mpienv <- new.env() 6 | 7 | ## path to shell util 8 | s3cmd <- function() { 9 | cmd <- if (use_legacy_api()) { 10 | if (isTRUE(nzchar(cmd <- get_option("s3mpi.s3cmd_path")))) { 11 | cmd 12 | } else { as.character(Sys.which("s3cmd")) } 13 | } else { 14 | if (isTRUE(nzchar(cmd <- get_option("s3mpi.aws_path")))) { 15 | cmd 16 | } else { as.character(Sys.which("aws")) } 17 | } 18 | if (is.null(cmd)) { stop("No s3mpi backend found on your system! Make sure you install either aws-cli or s3cmd or s4cmd") } 19 | cmd 20 | } 21 | 22 | use_legacy_api <- function() { 23 | isTRUE(get_option("s3mpi.legacy_api")) 24 | } 25 | 26 | ## Given an s3cmd path and a bucket location, will construct a flag 27 | ## argument for s3cmd. If it looks like the s3cmd is actually 28 | ## pointing to an s4cmd, return empty string as s4cmd doesn't 29 | ## support bucket location. 30 | bucket_location_to_flag <- function(bucket_location) { 31 | if (using_s4cmd()) { 32 | if (!identical(bucket_location, "US")) { 33 | warning(paste0("Ignoring non-default bucket location ('", 34 | bucket_location, 35 | "') in s3mpi::s3.get since s4cmd was detected", 36 | "-- this might be a little slower but is safe to ignore.")); 37 | } 38 | "" 39 | } else if (use_legacy_api()) { 40 | paste("--bucket-location", bucket_location) 41 | } 42 | } 43 | 44 | ## Given an s3cmd path and a bucket location, will construct a flag 45 | ## argument for s3cmd. 
If it looks like the s3cmd is actually 46 | ## pointing to an s4cmd, return empty string as s4cmd doesn't 47 | ## support bucket location. 48 | bucket_location_to_flag <- function(bucket_location) { 49 | if (using_s4cmd()) { 50 | if (!identical(bucket_location, "US")) { 51 | warning(paste0("Ignoring non-default bucket location ('", 52 | bucket_location, 53 | "') in s3mpi::s3.get since s4cmd was detected", 54 | "-- this might be a little slower but is safe to ignore.")); 55 | } 56 | "" 57 | } else { 58 | paste("--bucket-location", bucket_location) 59 | } 60 | } 61 | 62 | ## We use the [memoise](https://github.com/hadley/memoise) package to 63 | ## ensure this check only gets run once in a given R session. This 64 | ## means a user will have to restart R if they install s3cmd 65 | ## during a session, but we are comfortable with that! 66 | ensure_s3cmd_present <- memoise::memoise(function() { 67 | check <- try(system("s3cmd --help", intern = TRUE), silent = TRUE) 68 | if (is(check, "try-error")) { 69 | ## It is always preferable to make life as easy as possible for the user! 70 | ## If they have the [homebrew](https://brew.sh) package manager, we 71 | ## give them the fastest installation instructions. 72 | if (is.mac() && system2("which", "brew", stdout = FALSE) == 0) { 73 | stop("Please install the ", crayon::yellow("s3cmd"), " command-line ", 74 | "utility using by running ", crayon::green("brew install s3cmd"), 75 | " from your terminal and then configuring your S3 credentials ", 76 | "using ", crayon::yellow("s3cmd --configure"), call. = FALSE) 77 | } else { 78 | ## Otherwise, manual it is! 79 | stop("Please install s3cmd, the S3 command line utility: ", 80 | "http://s3tools.org/kb/item14.htm\nand then setup your S3 ", 81 | "credentials using ", crayon::yellow("s3cmd --configure"), 82 | call. 
= FALSE) 83 | } 84 | } 85 | }) 86 | 87 | cache_enabled <- function() { 88 | !is.null(tmp <- cache_directory()) && nzchar(tmp) 89 | } 90 | 91 | cache_directory <- function() { 92 | dir <- get_option("s3mpi.cache") 93 | if (!is.null(dir) && !(is.character(dir) && length(dir) == 1 && !is.na(dir))) { 94 | stop("Please set the ", sQuote("s3mpi.cache"), " option to a character ", 95 | "vector of length 1 giving a directory path.") 96 | } 97 | dir 98 | } 99 | 100 | ## We ping google.com to ensure the user has an internet connection. If not, 101 | ## we operate in "offline mode" for the whole session, that is, we read 102 | ## from the s3cache if the user has set their `s3mpi.s3cache` option 103 | ## but cannot store or read new keys. 104 | has_internet <- local({ 105 | has_internet_flag <- NULL 106 | function() { 107 | if (!is.null(get_option("s3mpi.skip_connection_check"))) return(FALSE) 108 | if (!is.null(has_internet_flag)) { return(has_internet_flag) } 109 | has_internet_flag <<- suppressWarnings({ 110 | internet_check <- try(file("http://google.com", "r")) 111 | if (!is(internet_check, "try-error") && is(internet_check, "connection")) { 112 | on.exit(close.connection(internet_check)) 113 | } 114 | !(is(internet_check, "try-error") && 115 | grepl("cannot open", internet_check$message)) 116 | }) 117 | } 118 | }) 119 | 120 | ## A sexy [least recently used cache](http://mcicpc.cs.atu.edu/archives/2012/mcpc2012/lru/lru.html) 121 | ## using [the cacher package](https://github.com/kirillseva/cacher). 122 | s3LRUcache <- function() { 123 | if (is.null(.s3mpienv$lrucache)) { 124 | .s3mpienv$lrucache <- cacher::LRUcache(get_option("s3mpi.cache_size", "2Gb")) 125 | } else { 126 | .s3mpienv$lrucache 127 | } 128 | } 129 | 130 | # All S3 paths need a slash at the end to work, but we don't need the user 131 | # to know that, so let's add a slash for them if they forget. 
## Append a trailing "/" to `path` unless it already ends with one.
add_ending_slash <- function(path) {
  ## Final character of str ("" for an empty string).
  last_character <- function(str) {
    substr(str, nchar(str), nchar(str))
  }
  if (last_character(path) != "/") { paste0(path, "/") } else { path }
}

## TRUE when the configured s3 command-line tool is actually s4cmd.
using_s4cmd <- function() {
  grepl("s4cmd", s3cmd())
}

## Look up option `x`, falling back to the upper-cased, underscored
## environment variable (e.g. "s3mpi.cache" -> "S3MPI_CACHE"), and
## finally to `default`.
get_option <- function(x, default = NULL) {
  result <- getOption(x)
  if (is.null(result)) {
    result <- Sys.getenv(toupper(gsub("\\.", "_", x)))
    if (!nzchar(result)) { result <- NULL }
  }
  result %||% default
}
# ------------------------------------------------------------------------------
# /R/s3cache.R:
# ------------------------------------------------------------------------------
## If we are frequently using `s3read` and `s3store` from within an
## active R session, it is likely that we will need to pull the stored
## object multiple times. For example, if we have the training data
## set for a model or a list with some summary statistics, we may be
## pulling this frequently when performing analysis during a week-long
## project.
##
## To facilitate this process and speed things up a bit, we keep a
## local *file system cache* of the objects downloaded from S3 using
## `s3read`. If the user has set their `s3mpi.cache` option or system
## environment variable to a
## directory path (by default `~/.s3cache`), we will use that directory
## to store downloaded R objects. The second time a user calls
## `s3read("some/key")` we will fetch it from the local file system
## instead of spending time re-downloading the object.
##
## This functionality should be disabled if we regularly are storing
## and pulling objects that in aggregate exceed the user's available disk space.
#' A caching layer around s3mpi calls.
#'
#' Fetching large files from the S3 MPI can be expensive when performed
#' multiple times. This method allows one to add a caching layer
#' around S3 fetching. The user should specify the configuration option
#' \code{options(s3mpi.cache = "some/dir")}. The recommended cache
#' directory (where files will be stored) is \code{"~/.s3cache"}.
#'
#' @param s3key character. The full S3 key to attempt to read or write
#'   to the cache.
#' @param value ANY. The R object to save in the cache. If missing,
#'   a cache read will be performed instead.
#' @return on a read, the cached object (or the \code{not_cached} sentinel);
#'   on a write, the (invisible) result of \code{save_to_cache}.
s3cache <- function(s3key, value) {
  if (!cache_enabled()) {
    stop("Cannot use s3mpi::s3cache until you set options(s3mpi.cache) ",
         "to a directory in which to place cache contents.")
  }

  d <- cache_directory()
  ## Name the logical arguments rather than passing them positionally.
  dir.create(d, showWarnings = FALSE, recursive = TRUE)
  ## We will hold the objects in the `data` subdirectory of the `s3mpi.cache`
  ## path and *metadata* about the objects (such as when it was last modified
  ## on S3, so we can perform cache invalidation) in the `info` directory.
  dir.create(file.path(d, "info"), showWarnings = FALSE, recursive = TRUE)
  dir.create(file.path(d, "data"), showWarnings = FALSE, recursive = TRUE)

  # If no value to store was provided, we assume we are reading from the cache.
  if (missing(value)) {
    fetch_from_cache(s3key, d)
  } else { # Otherwise, we are writing to it.
    save_to_cache(s3key, value, d)
  }
}

#' Helper function for fetching a file from a cache directory.
#'
#' This function will also test to determine whether the file has been
#' modified on S3 since the last cache save. If the file has never been
#' cached or the cache is invalidated, it will return \code{s3mpi::not_cached}.
#'
#' @param key character. The key under which the cache entry is stored.
#' @param cache_dir character. The cache directory. The default is
#'   \code{cache_directory()}.
#' @return the cached object if the cache has not invalidated. Otherwise,
#'   return \code{s3mpi::not_cached}.
## Fetch a cached value for `key`, returning `not_cached` on a miss or
## when the S3 object has been modified since the cache entry was saved.
## `cache_dir` now defaults to cache_directory(), matching the documented
## default in the roxygen block above (the old signature had no default).
fetch_from_cache <- function(key, cache_dir = cache_directory()) {
  ## We use an [MD5 hash](https://en.wikipedia.org/wiki/MD5) to convert an
  ## arbitrary R object to a 32-character string representation. We use this
  ## as an implicit hash table in the file system so we do not have to deal
  ## with keys that cause conflicts with the file system (such as "../blah").
  cache_key <- digest::digest(key)
  cache_file <- function(dir) file.path(cache_dir, dir, cache_key)

  if (!file.exists(cache_file("data"))) return(not_cached)

  if (!file.exists(cache_file("info"))) {
    # Somehow the cache became corrupt: data exists without accompanying
    # meta-data. In this case, simply wipe the cache.
    file.remove(cache_file("data"))
    return(not_cached)
  }

  info <- readRDS(cache_file("info"))
  # Check if cache is invalid.
  connected <- has_internet()
  if (!connected) {
    warning("Your network connection seems to be unavailable. s3mpi will ",
            "use the latest cache entries instead of pulling from S3.",
            call. = FALSE, immediate. = FALSE)
  }

  ## If the modification time has changed since we last cached the
  ## value, re-pull it from S3 and wipe the cache.
  if (connected && !identical(info$mtime, last_modified(key))) {
    not_cached
  } else {
    readRDS(cache_file("data"))
  }
}

#' Helper function for saving a file to a cache directory.
#'
#' @param key character. The key under which the cache entry is stored.
#' @param value ANY. The R object to save in the cache.
#' @param cache_dir character. The cache directory. The default is
#'   \code{cache_directory()}.
## Persist `value` in the cache under `key`, along with metadata used
## later for invalidation.
save_to_cache <- function(key, value, cache_dir = cache_directory()) {
  ## Keys are MD5-hashed so arbitrary strings map to safe file names.
  hashed_key <- digest::digest(key)
  entry_path <- function(subdir) file.path(cache_dir, subdir, hashed_key)

  saveRDS(value, entry_path("data"))
  ## Record the S3 modification time alongside the key so that reads can
  ## detect when the remote object has changed.
  saveRDS(list(mtime = last_modified(key), key = key), entry_path("info"))
  invisible(NULL)
}

#' Determine the last modified time of an S3 object.
#'
#' @param key character. The s3 key of the object.
#' @return the last modified time or \code{NULL} if it does not exist on S3.
last_modified <- function(key) {
  ## Without a connection we cannot tell whether the object changed, so
  ## return a fixed timestamp and let callers fall back to the cache.
  if (!has_internet()) {
    return(as.POSIXct(as.Date("2000-01-01")))
  }
  listing_cmd <- if (use_legacy_api()) {
    paste("ls", key)
  } else {
    paste("s3", "ls", key)
  }
  first_line <- system2(s3cmd(), listing_cmd, stdout = TRUE)[1L]
  if (is.character(first_line) && !is.na(first_line) && nzchar(first_line)) {
    ## The `s3cmd ls` output begins with "YYYY-MM-DD HH:MM"; parse exactly
    ## that 16-character prefix with strptime.
    strptime(substring(first_line, 1, 16), "%Y-%m-%d %H:%M")
  }
}

## A special sentinel object used to signify that a value is not cached.
## We assume no one will ever `s3store` an object with class "not_cached"!
not_cached <- local({
  sentinel <- list()
  class(sentinel) <- "not_cached"
  sentinel
})

## TRUE exactly when `x` is the `not_cached` sentinel object.
is.not_cached <- function(x) {
  identical(x, not_cached)
}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | R and AWS S3 [![Build Status](https://travis-ci.org/robertzk/s3mpi.svg?branch=master)](https://travis-ci.org/robertzk/s3mpi) [![Coverage Status](https://coveralls.io/repos/robertzk/s3mpi/badge.png)](https://coveralls.io/r/robertzk/s3mpi) [![Documentation](https://img.shields.io/badge/rocco--docs-%E2%9C%93-blue.svg)](http://robertzk.github.io/s3mpi/)
2 | =========
3 | 
4 | A common problem for data scientists is passing data or models to each
5 | other without interrupting their workflow. There are typically two approaches:
6 | 
7 | 1. Writing CSV and RDS files and passing them around using tools like
8 | email, Dropbox, or SFTP. Typically, these files are too large for
9 | inclusion in version control.
10 | 
11 | 2. Building an API infrastructure around some data backends, such as
12 | databases, data warehouses, and streaming providers like Kafka.
13 | 
14 | The former works well for small teams consisting of 1-3 people but soon
15 | becomes prohibitive. Additionally, tracking the array of files and outputs
16 | soon becomes cumbersome and interrupts the data scientist's workflow.
17 | 
18 | The second option is an inevitable progression for any sufficiently large data
19 | team, but requires major coordination with software or data engineers
20 | and may not be practical for small teams or experimental projects. It is
21 | also usually limited by well-defined specification of the formats that
22 | are being passed into consoles and outputted to data storage systems.
23 | 24 | On the other hand, S3mpi (S3 [*message passing interface*](https://en.wikipedia.org/wiki/Message_Passing_Interface), 25 | affectionately named after the distributed message passing library) 26 | allows for **storage and serialization of arbitrary R objects** and does 27 | not have the limits of the second approach, while providing **on-demand 28 | access to stored data and objects**, avoiding the need for large amounts of 29 | disk space locally. 30 | 31 | Here, S3 stands for [Amazon's cloud storage](https://aws.amazon.com/s3/) which 32 | you can think of as an infinite hard drive. You write an object to a path, 33 | and then it *remains there indefinitely and is accessible to anyone you wish 34 | to share it with*. For example, if you have several terabytes of datasets split 35 | into thousands of components, you can individually load small pieces and perform 36 | computation on them to avoid storing the entire dataset locally. This is the 37 | basis for distributed computing systems like [Hadoop](https://en.wikipedia.org/wiki/Apache_Hadoop). 38 | 39 | Assuming you have set up your [S3 configuration](http://s3tools.org/kb/item14.htm) 40 | correctly (see the tutorial below), you can immediately get started with: 41 | 42 | ```R 43 | library(s3mpi) 44 | s3store(obj, "s3key/for/your/object") 45 | ``` 46 | 47 | You can then read it back from S3 in any R session running on a machine with 48 | compatible S3 credentials: 49 | 50 | ```R 51 | s3read("s3key/for/your/object") 52 | ``` 53 | 54 | Paired with [chat-driven development](https://sameroom.io/blog/self-hosted-team-chat-options-and-alternatives/) 55 | this allows a team of data scientists to quickly generate team-global accessible 56 | objects like data sets and models and chat the key to teammates so they pull down 57 | the results within seconds for inspection, modification, or further analysis. 
58 | 59 | #### Installing the Package 60 | 61 | This package is not currently available on CRAN and has several non-CRAN 62 | dependencies. First, ensure you have the [s3cmd](http://s3tools.org/s3cmd) command-line 63 | tool installed. If you are on OS X, you can simply run `brew install s3cmd` if 64 | you have [homebrew](http://brew.sh/). Next, you will have to copy the [example 65 | `.s3cfg`](http://s3tools.org/kb/item14.htm) file and place it in `~/.s3cfg` (or 66 | generate it using `s3cmd --configure`) and then obtain 67 | [AWS access credentials](http://docs.aws.amazon.com/general/latest/gr/getting-aws-sec-creds.html) 68 | and fill out the `access_key` and `secret_key` sections of your `~/.s3cfg` file. 69 | Note that [S3 storage is pretty cheap](https://aws.amazon.com/s3/pricing/) 70 | and even the most intense data use is unlikely to exceed $100/month. 71 | 72 | To install the R package and its dependencies, run the following from the R console. 73 | 74 | ```R 75 | if (!require("devtools")) { install.packages("devtools") } 76 | devtools::install_github("avantcredit/AWS.tools") 77 | devtools::install_github("kirillseva/cacher") 78 | devtools::install_github("robertzk/s3mpi") 79 | ``` 80 | 81 | This package has been used on OSX and Linux systems in a production-facing 82 | environment, but **we have not tested it extensively on Windows**, 83 | so if you run into problems please [file an issue](https://github.com/robertzk/s3mpi/issues/new) 84 | immediately. 
85 | 
86 | Finally, put the name of a default [bucket](http://docs.aws.amazon.com/AmazonS3/latest/dev/UsingBucket.html)
87 | in your `~/.Rprofile`:
88 | 
89 | ```R
90 | options(s3mpi.path = "s3://yourS3Bucket/")
91 | ```
92 | 
93 | If you do not specify a default S3 path, you will have to include it
94 | manually as the second parameter:
95 | 
96 | ```R
97 | s3store(obj, "s3key/for/your/object", "s3://somebucket/")
98 | # From another R session
99 | s3read("s3key/for/your/object", "s3://somebucket/")
100 | ```
101 | 
102 | #### Potential uses
103 | 
104 | S3mpi has been used in production-facing environments for:
105 | 
106 | 1. Light-weight **mapreduce** for background jobs on medium data sets. One can
107 | partition a set of primary keys, perform a computation, and `s3store`
108 | the results in a separate S3 location for each partition.
109 | 
110 | 2. **Logging** supplementation. For example, if you alert your errors to
111 | a service like [honeybadger](http://honeybadger.io) it is possible to
112 | provide additional details by noting the S3 key with an R object containing
113 | further information in the notification. This also works with chat-driven
114 | development using [hipchat](http://hipchat.com) or [slack](http://slack.com)
115 | by adding an S3 key with "additional details" to failure notifications.
116 | 
117 | 3. **Caching** of functions that should have deterministic outputs or infrequent
118 | refresh intervals can be accomplished by wrapping them with an
119 | ["s3 memoise"](https://github.com/peterhurford/s3memoize) layer (compare to
120 | the totally in-memory [memoise](https://github.com/hadley/memoise)).
121 | 
122 | 4. For **debugging**, it is possible to `s3store` intermediate output during a complex
123 | computation for later inspection, especially if you do not wish to store
124 | this information on the local file system.
125 | 
126 | 5. 
**Collaboration** in data science teams can be massively improved by 127 | using `s3store` and `s3read` to quickly pass data sets under investigation 128 | between R sessions. ("Hey can you send me the IDs of the customers that 129 | had a messed up leads record?") This completely eliminates the error-prone 130 | email / dropbox alternative and leaves a paper trail since it is unlikely 131 | one would ever need to delete objects from S3. 132 | 133 | 6. Interfacing with **production environments** during background jobs, especially 134 | if a compatible [Ruby](https://github.com/robertzk/s3mpi-ruby) or 135 | [Python](https://github.com/robertzk/s3mpy) API is written. This can be used 136 | to ask an engineer to pull data from a production console, "ruby s3store" 137 | or "python s3store" it, and seamlessly read it from the R console as an 138 | R object such as a list or a data.frame. This **narrows the gap between 139 | analysts and engineers**. 140 | 141 | 7. **Reproducible reports** can be generated by storing all intermediate and 142 | final output in a pre-defined S3 convention. At [Avant](https://github.com/avantcredit), 143 | this approach is used to store all information about all trained models 144 | stretching to the beginning of time. 145 | 146 | The time required to store and read objects can be massively sped up by 147 | adopting a workflow where one **sshes into an [EC2 instance](https://aws.amazon.com/ec2/instance-types/)**. 
148 | 
149 | #### Alternative S3 key setup
150 | 
151 | Instead of setting up an `~/.s3cfg` file, you can also add the
152 | following environment variables to `.bash_profile` / `.zshrc`:
153 | 
154 | ```
155 | export AWS_ACCESS_KEY_ID=PUTYOURACCESSKEYHERE
156 | export AWS_SECRET_ACCESS_KEY=PUTYOURSECRETKEYHERE
157 | ```
158 | 
159 | #### Local Caching
160 | 
161 | You can enable local caching of downloaded and uploaded files by setting a
162 | global system environment variable or by using:
163 | 
164 | ```R
165 | options(s3mpi.cache = '~/.s3cache') # Or a directory of your choice
166 | ```
167 | 
168 | If you have the caching layer enabled in the above manner, the s3mpi package will
169 | check if you have a functioning connection to S3 before reading from the cache
170 | to determine whether the value is invalidated (i.e., if someone updated the object).
171 | If you wish to skip this check and read directly from the cache when you do not
172 | have an internet connection, set `options(s3mpi.skip_connection_check = TRUE)`.
173 | 
174 | 
175 | #### Ruby and Python Versions
176 | 
177 | You can also use S3MPI in [Ruby](https://github.com/robertzk/s3mpi-ruby) and in [Python](https://github.com/robertzk/s3mpy).
178 | 
179 | #### Command Line Accompaniment
180 | 
181 | One can find file size(s) and contents of the remote bucket
182 | using the [s3 command line tool](http://s3tools.org/s3cmd):
183 | 
184 | ```sh
185 | s3cmd ls s3://yourS3Bucket/some/key
186 | s3cmd ls -H s3://yourS3Bucket/some/key # Human readable
187 | ```
188 | 
189 | ### License
190 | 
191 | This project is licensed under the MIT License:
192 | 
193 | Copyright (c) 2015-2016 Robert Krzyzanowski
194 | 
195 | Permission is hereby granted, free of charge, to any person obtaining
196 | a copy of this software and associated documentation files (the
197 | "Software"), to deal in the Software without restriction, including
198 | without limitation the rights to use, copy, modify, merge, publish,
199 | distribute, sublicense, and/or sell copies of the Software, and to
200 | permit persons to whom the Software is furnished to do so, subject to
201 | the following conditions:
202 | 
203 | The above copyright notice and this permission notice shall be included
204 | in all copies or substantial portions of the Software.
205 | 
206 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
207 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
208 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
209 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
210 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
211 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
212 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
213 | 
214 | ### Authors
215 | 
216 | This package was originally created by Robert Krzyzanowski. Additional
217 | maintenance and improvement work was later done by Peter Hurford
218 | and Kirill Sevastyanenko.
219 | 
--------------------------------------------------------------------------------