├── .Rbuildignore ├── .gitignore ├── tests ├── testthat │ ├── test-s3path.R │ ├── test-s3read.R │ └── test-s3store.R └── test-all.R ├── man ├── s3path.Rd ├── print.s3mpi_error.Rd ├── last_modified.Rd ├── s3delete.Rd ├── save_to_cache.Rd ├── s3mpi.Rd ├── s3cache.Rd ├── fetch_from_cache.Rd ├── s3exists.Rd ├── s3read.Rd ├── s3store.Rd ├── s3.get.Rd └── s3normalize.Rd ├── NAMESPACE ├── R ├── platform.R ├── s3delete.R ├── package.s3mpi.R ├── s3path.r ├── s3exists.R ├── s3read.r ├── s3store.r ├── s3normalize.R ├── s3.put.R ├── s3.get.R ├── utils.R └── s3cache.R ├── DESCRIPTION ├── LICENSE ├── .travis.yml ├── NEWS.md └── README.md /.Rbuildignore: -------------------------------------------------------------------------------- 1 | .travis.yml 2 | .git 3 | .gitignore 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ..Rcheck 2 | *.Rcheck 3 | *.tar.gz 4 | -------------------------------------------------------------------------------- /tests/testthat/test-s3path.R: -------------------------------------------------------------------------------- 1 | context('s3path') 2 | 3 | # TODO: (RK) Fill this in. 4 | -------------------------------------------------------------------------------- /tests/test-all.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(testthatsomemore) 3 | library(s3mpi) 4 | test_check("s3mpi") 5 | -------------------------------------------------------------------------------- /man/s3path.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/s3path.r 3 | \name{s3path} 4 | \alias{s3path} 5 | \title{Get your default s3path or error.} 6 | \usage{ 7 | s3path() 8 | } 9 | \description{ 10 | Get your default s3path or error. 
# Copied from https://github.com/rstudio/packrat/blob/master/R/platform.R

# Compare the OS name reported by Sys.info() against `os`.
# Returns a length-1 logical carrying the "sysname" name attribute,
# exactly like the inline expression `Sys.info()["sysname"] == os`.
platform_is <- function(os) {
  Sys.info()["sysname"] == os
}

# TRUE when running on Windows.
is.windows <- function() {
  platform_is("Windows")
}

# TRUE when running on macOS (Darwin).
is.mac <- function() {
  platform_is("Darwin")
}

# TRUE when running on Linux.
is.linux <- function() {
  platform_is("Linux")
}
#' Delete an R object from S3 by key
#'
#' @seealso \code{\link{s3store}}
#' @param key character. The key to delete from S3.
#' @param path character. The location of your S3 bucket as a prefix to \code{name},
#'    for example, \code{"s3://mybucket/"} or \code{"s3://mybucket/myprefix/"}.
#' @return the exit status of the S3 client invocation, as returned by
#'    \code{\link{system2}}.
#' @export
s3delete <- function(key, path = s3path()) {
  path <- add_ending_slash(path)
  ## `path` is guaranteed to end in "/" after add_ending_slash, so the key
  ## is appended directly in both branches. (The legacy branch previously
  ## inserted an extra "/", producing malformed keys like "s3://bucket//key",
  ## inconsistent with the non-legacy branch.)
  cmd <- if (use_legacy_api()) {
    paste0("del ", path, key)
  } else {
    paste0("s3 rm ", path, key)
  }
  system2(s3cmd(), cmd)
}
The location of your S3 bucket as a prefix to \code{name}, 13 | for example, \code{"s3://mybucket/"} or \code{"s3://mybucket/myprefix/"}.} 14 | } 15 | \description{ 16 | Delete an R object from S3 by key 17 | } 18 | \seealso{ 19 | \code{\link{s3store}} 20 | } 21 | 22 | -------------------------------------------------------------------------------- /R/package.s3mpi.R: -------------------------------------------------------------------------------- 1 | #' Bi-directional communication with R and AWS S3. 2 | #' 3 | #' This package provides an interface to read and store arbitrary 4 | #' objects from and to Amazon AWS's S3 cloud storage. 5 | #' 6 | #' The exported helpers \code{s3read} and \code{s3store} 7 | #' allow, upon correct configuration of your S3 credentials, 8 | #' uploading to and downloading from S3 using R's built-in support 9 | #' for serializing and deserializing arbitrary objects (see 10 | #' \code{\link{readRDS}} and \code{\link{saveRDS}}). 11 | #' 12 | #' @name s3mpi 13 | #' @docType package 14 | #' @import AWS.tools crayon cacher digest 15 | NULL 16 | -------------------------------------------------------------------------------- /man/save_to_cache.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/s3cache.R 3 | \name{save_to_cache} 4 | \alias{save_to_cache} 5 | \title{Helper function for saving a file to a cache directory.} 6 | \usage{ 7 | save_to_cache(key, value, cache_dir = cache_directory()) 8 | } 9 | \arguments{ 10 | \item{key}{character. The key under which the cache entry is stored.} 11 | 12 | \item{value}{ANY. The R object to save in the cache.} 13 | 14 | \item{cache_dir}{character. The cache directory. The default is 15 | \code{cache_directory()}.} 16 | } 17 | \description{ 18 | Helper function for saving a file to a cache directory. 
#' Get your default s3path or error.
#' @export
s3path <- function() {
  ## The default S3 prefix, for example, `s3://yourbucket/yourprefix/`.
  ## You should set this in everyone's `~/.Rprofile` if
  ## you are using s3mpi to collaborate in a data science team.
  ## System environment variables are also accepted.
  configured_path <- get_option("s3mpi.path")

  ## Return early when a non-empty path has been configured.
  if (!is.null(configured_path) && nzchar(configured_path)) {
    return(configured_path)
  }

  stop("s3mpi package: Please set your s3 path using `S3MPI_PATH` system environment variable or ",
       "options(s3mpi.path = 's3://your_bucket/your/path/'). ",
       "This is where all of your uploaded R objects will be stored.")
}
18 | } 19 | 20 | -------------------------------------------------------------------------------- /man/s3cache.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/s3cache.R 3 | \name{s3cache} 4 | \alias{s3cache} 5 | \title{A caching layer around s3mpi calls.} 6 | \usage{ 7 | s3cache(s3key, value) 8 | } 9 | \arguments{ 10 | \item{s3key}{character. The full S3 key to attempt to read or write 11 | to the cache.} 12 | 13 | \item{value}{ANY. The R object to save in the cache. If missing, 14 | a cache read will be performed instead.} 15 | } 16 | \description{ 17 | Fetching large files from the S3 MPI can be expensive when performed 18 | multiple times. This method allows one to add a caching layer 19 | around S3 fetching. The user should specify the configuration option 20 | \code{options(s3mpi.cache = "some/dir")}. The recommended cache 21 | directory (where files will be stored) is \code{"~/.s3cache"}. 22 | } 23 | 24 | -------------------------------------------------------------------------------- /man/fetch_from_cache.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/s3cache.R 3 | \name{fetch_from_cache} 4 | \alias{fetch_from_cache} 5 | \title{Helper function for fetching a file from a cache directory.} 6 | \usage{ 7 | fetch_from_cache(key, cache_dir) 8 | } 9 | \arguments{ 10 | \item{key}{character. The key under which the cache entry is stored.} 11 | 12 | \item{cache_dir}{character. The cache directory. The default is 13 | \code{cache_directory()}.} 14 | } 15 | \value{ 16 | the cached object if the cache has not invalidated. Otherwise, 17 | return \code{s3mpi::not_cached}. 18 | } 19 | \description{ 20 | This function will also test to determine whether the file has been 21 | modified on S3 since the last cache save. 
If the file has never been 22 | cached or the cache is invalidated, it will return \code{s3mpi::not_cached}. 23 | } 24 | 25 | -------------------------------------------------------------------------------- /man/s3exists.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/s3exists.R 3 | \name{s3exists} 4 | \alias{s3exists} 5 | \title{Determine whether object exists on S3.} 6 | \usage{ 7 | s3exists(name, path = s3path()) 8 | } 9 | \arguments{ 10 | \item{name}{string. Name of file to look for} 11 | 12 | \item{path}{string. Path to file. If missing, the entire s3 path must be provided in name.} 13 | } 14 | \description{ 15 | Test whether or not the given object exists at the 16 | give S3 path. 17 | } 18 | \examples{ 19 | \dontrun{ 20 | s3exists("my/key") # Will look in bucket given by getOption("s3mpi.path") or 21 | from a system environment variable. 22 | # For example, if this option is "s3://mybucket/", then this query 23 | # will check for existence of the \\code{s3://mybucket/my/key} S3 path. 24 | 25 | s3exists("my/key", "s3://anotherbucket/") # We can of course change the bucket. 26 | } 27 | } 28 | 29 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: s3mpi 2 | Type: Package 3 | Title: R message passing interface using S3 storage 4 | URL: https://github.com/robertzk/s3mpi 5 | BugReports: https://github.com/robertzk/s3mpi/issues 6 | Description: Easily pass objects like lists or dataframes between consoles. 
7 | Version: 0.2.47 8 | Author: Robert Krzyzanowski 9 | Maintainer: Robert Krzyzanowski 10 | Authors@R: c(person("Robert", "Krzyzanowski", email = "technoguyrob@gmail.com", 11 | role = c("aut", "cre"))) 12 | Depends: 13 | R (>= 3.0.0) 14 | Imports: 15 | AWS.tools, 16 | cacher, 17 | crayon, 18 | digest 19 | Suggests: 20 | knitr, 21 | withr, 22 | testthat, 23 | testthatsomemore 24 | Remotes: kirillseva/cacher, 25 | robertzk/testthatsomemore, 26 | avantcredit/AWS.tools 27 | License: MIT 28 | LazyData: true 29 | Roxygen: list(wrap = FALSE) 30 | VignetteBuilder: knitr 31 | RoxygenNote: 5.0.0 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014-2016 Robert Krzyzanowski 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included 12 | in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 17 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 18 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 19 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 20 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
21 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: c 2 | sudo: true 3 | before_install: 4 | - curl -OL http://raw.github.com/craigcitro/r-travis/master/scripts/travis-tool.sh 5 | - chmod 755 ./travis-tool.sh 6 | - "./travis-tool.sh bootstrap" 7 | sudo: required 8 | git: 9 | submodules: false 10 | env: 11 | - global: 12 | - WARNINGS_ARE_ERRORS=1 13 | - _R_CHECK_FORCE_SUGGESTS_=0 14 | - LINTR_COMMENT_BOT=false 15 | - R_LIBS_USER=~/.R/library 16 | install: 17 | - mkdir -p "$R_LIBS_USER" 18 | - "./travis-tool.sh install_r devtools" 19 | - "./travis-tool.sh install_r rcmdcheck" 20 | - "./travis-tool.sh github_package jimhester/covr robertzk/testthatsomemore" 21 | - "./travis-tool.sh install_deps" 22 | script: 23 | - Rscript -e 'try(devtools::install(".")); r <- rcmdcheck::rcmdcheck(".", args = c("--no-manual")); quit(save = "no", status = if (length(c(r$errors, r$warnings)) > 1 || grepl("FAILED", r$output$stdout)) { 1 } else { 0 }, runLast = FALSE)' 24 | after_success: 25 | - Rscript -e 'library(covr);codecov()' 26 | notifications: 27 | email: 28 | on_success: change 29 | on_failure: change 30 | hipchat: 31 | on_success: change 32 | on_failure: change 33 | template: 34 | - "%{repository}#%{build_number} (%{branch} - %{commit} : %{author}): %{message} 35 | | Details: %{build_url} | Changes: %{compare_url}" 36 | rooms: 37 | secure: SQirvWbQ9b0roApmf3gt6JTcWIra9NguGzR45azxVDaWw2n0w/sIufA/cxa2sTLLhKfIMNlJwwhQjNaWyHeZkTTxRb76tcHWQdPpMcNKTwfr3+C3/bXDkmQozvQkYNgGuRc2Iln5ms12fiHmwI6dp5aENACUo5fbV2SLJQvmt5w= 38 | -------------------------------------------------------------------------------- /man/s3read.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/s3read.r 3 | \name{s3read} 4 | \alias{s3read} 5 | \title{Read an R 
object in S3 by key} 6 | \usage{ 7 | s3read(name, path = s3path(), cache = TRUE, serialize = TRUE, ...) 8 | } 9 | \arguments{ 10 | \item{name}{character. The key to grab from S3.} 11 | 12 | \item{path}{character. The location of your S3 bucket as a prefix to \code{name}, 13 | for example, \code{"s3://mybucket/"} or \code{"s3://mybucket/myprefix/"}.} 14 | 15 | \item{cache}{logical. If true, use the local s3cache if available. 16 | If false, do not use cache. By default, \code{TRUE}. Note this will 17 | consume local disk space for objects that have been \code{\link{s3read}}.} 18 | 19 | \item{serialize}{logical. If true, use \code{s3normalize} to serialize the model object.} 20 | 21 | \item{...}{Can be used internally to pass more arguments to \code{\link{s3.get}}.} 22 | } 23 | \description{ 24 | Any type of object that can be serialized as an RDS file 25 | is capable of being read using this interface. 26 | } 27 | \details{ 28 | If you wish to read non-vanilla R objects, such as those 29 | containing external pointers to C structures, see 30 | \code{\link{s3normalize}}. 31 | } 32 | \examples{ 33 | \dontrun{ 34 | s3store(c(1,2,3), "test123") 35 | print(s3read("test123")) 36 | # [1] 1 2 3 37 | 38 | s3store(function(x, y) { x + 2 * y }, "myfunc") 39 | stopifnot(s3read("myfunc")(1, 2) == 5) # R can serialize closures! 40 | } 41 | } 42 | \seealso{ 43 | \code{\link{s3store}} 44 | } 45 | 46 | -------------------------------------------------------------------------------- /man/s3store.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/s3store.r 3 | \name{s3store} 4 | \alias{s3put} 5 | \alias{s3store} 6 | \title{Store an R object in S3 by key.} 7 | \usage{ 8 | s3store(obj, name = NULL, path = s3path(), safe = FALSE, ...) 9 | 10 | s3put(..., safe = TRUE) 11 | } 12 | \arguments{ 13 | \item{obj}{ANY. 
An R object to save to S3.} 14 | 15 | \item{name}{character. The S3 key to save to. If no key is provided, 16 | the expression passed as \code{obj} will be used.} 17 | 18 | \item{path}{character. The S3 prefix, e.g., "s3://yourbucket/some/path/".} 19 | 20 | \item{safe}{logical. Whether or not to overwrite existing fails by 21 | default or error if they exist.} 22 | 23 | \item{...}{additional arguments to \code{s3mpi:::s3.put}.} 24 | } 25 | \description{ 26 | Any type of object that can be serialized as an RDS file 27 | is capable of being retrieved using this interface. 28 | } 29 | \details{ 30 | If you wish to store non-vanilla R objects, such as those 31 | containing external pointers to C structures, see 32 | \code{\link{s3normalize}}. 33 | } 34 | \note{ 35 | \code{s3put} is equivalent to \code{s3store} except that 36 | it will fail by default if you try to overwrite an existing key. 37 | } 38 | \examples{ 39 | \dontrun{ 40 | s3store(c(1,2,3), 'test123') 41 | print(s3read('test123')) 42 | # [1] 1 2 3 43 | 44 | s3store(function(x, y) { x + 2 * y }, "myfunc") 45 | stopifnot(s3read("myfunc")(1, 2) == 5) # R can serialize closures! 46 | 47 | obj <- 1:5 48 | s3store(obj) # If we do not pass a key the path is inferred from 49 | # the expression using deparse(substitute(...)). 50 | stopifnot(all.equal(s3read("obj"), 1:5)) 51 | } 52 | } 53 | \seealso{ 54 | \code{\link{s3read}} 55 | } 56 | 57 | -------------------------------------------------------------------------------- /R/s3exists.R: -------------------------------------------------------------------------------- 1 | #' Determine whether object exists on S3. 2 | #' 3 | #' Test whether or not the given object exists at the 4 | #' give S3 path. 5 | #' 6 | #' @param name string. Name of file to look for 7 | #' @param path string. Path to file. If missing, the entire s3 path must be provided in name. 
#' @export
#' @examples \dontrun{
#' s3exists("my/key") # Will look in bucket given by getOption("s3mpi.path") or
#'   from a system environment variable.
#'   # For example, if this option is "s3://mybucket/", then this query
#'   # will check for existence of the \code{s3://mybucket/my/key} S3 path.
#'
#' s3exists("my/key", "s3://anotherbucket/") # We can of course change the bucket.
#' }
s3exists <- function(name, path = s3path()) {
  if (is.null(name)) return(FALSE) # https://github.com/robertzk/s3mpi/issues/22
  path <- add_ending_slash(path)
  s3key <- paste(path, name, sep = "")
  s3key <- gsub("/$", "", s3key) # strip terminal /
  if (!grepl("^s3://", s3key)) {
    stop("s3 paths must begin with \"s3://\"")
  }

  results <- system2(s3cmd(), s3cmd_exists_command(s3key), stdout = TRUE)

  check_exists_results(name, results)
}

## Build the listing command for the configured S3 command-line client.
s3cmd_exists_command <- function(s3key) {
  if (use_legacy_api()) {
    paste("ls", s3key)
  } else {
    paste("s3", "ls", s3key)
  }
}

## Escape regex metacharacters so a key name is matched literally when
## interpolated into a grepl pattern.
escape_regex <- function(string) {
  gsub("([][{}()|^$.*+?\\\\])", "\\\\\\1", string)
}

check_exists_results <- function(name, results) {
  ## We know that the key exists if a result was returned, i.e., the
  ## shown regex gives a match. The key name is regex-escaped first so
  ## that names containing metacharacters (e.g. "data.rds") are matched
  ## literally; previously "." matched any character, so "data.rds"
  ## spuriously matched listing entries like "dataXrds".
  if (use_legacy_api()) {
    matches <- grepl(paste0(escape_regex(name), "(/[0-9A-Za-z]+)*/?$"), results)
  } else {
    matches <- grepl(paste0(escape_regex(basename(name)), "$"), results)
  }
  sum(matches) > 0
}
#'
#' @seealso \code{\link{s3store}}
#' @param name character. The key to grab from S3.
#' @param path character. The location of your S3 bucket as a prefix to \code{name},
#'    for example, \code{"s3://mybucket/"} or \code{"s3://mybucket/myprefix/"}.
#' @param cache logical. If true, use the local s3cache if available.
#'    If false, do not use cache. By default, \code{TRUE}. Note this will
#'    consume local disk space for objects that have been \code{\link{s3read}}.
#' @param serialize logical. If true, use \code{s3normalize} to serialize the model object.
#' @param ... Can be used internally to pass more arguments to \code{\link{s3.get}}.
#' @export
#' @examples
#' \dontrun{
#' s3store(c(1,2,3), "test123")
#' print(s3read("test123"))
#' # [1] 1 2 3
#'
#' s3store(function(x, y) { x + 2 * y }, "myfunc")
#' stopifnot(s3read("myfunc")(1, 2) == 5) # R can serialize closures!
#' }
s3read <- function(name, path = s3path(), cache = TRUE, serialize = TRUE, ...) {
  ## `cache` must be a scalar TRUE or FALSE (never NA or non-logical).
  stopifnot(isTRUE(cache) || identical(cache, FALSE))

  s3key <- paste0(add_ending_slash(path), name)

  ## Caching is in effect only when requested AND a cache directory is
  ## configured via options(s3mpi.cache = ...).
  use_cache <- isTRUE(cache) && !is.null(get_option("s3mpi.cache"))

  if (!use_cache) {
    value <- s3.get(s3key, cache = FALSE, ...)
  } else {
    value <- s3cache(s3key)
    if (is.not_cached(value)) {
      value <- s3.get(s3key, cache = TRUE, ...)
      ## Store to the file system cache before returning the value.
      s3cache(s3key, value)
    }
  }

  if (isTRUE(serialize)) {
    s3normalize(value, TRUE)
  } else {
    value
  }
}
#'
#' Any type of object that can be serialized as an RDS file
#' is capable of being retrieved using this interface.
#'
#' If you wish to store non-vanilla R objects, such as those
#' containing external pointers to C structures, see
#' \code{\link{s3normalize}}.
#'
#' @export
#' @seealso \code{\link{s3read}}
#' @param obj ANY. An R object to save to S3.
#' @param name character. The S3 key to save to. If no key is provided,
#'    the expression passed as \code{obj} will be used.
#' @param path character. The S3 prefix, e.g., "s3://yourbucket/some/path/".
#' @param safe logical. If \code{TRUE}, error when the key already exists
#'    instead of overwriting the existing file. By default, \code{FALSE}.
#' @param ... additional arguments to \code{s3mpi:::s3.put}.
#' @return the full S3 key written to, invisibly.
#' @examples
#' \dontrun{
#' s3store(c(1,2,3), 'test123')
#' print(s3read('test123'))
#' # [1] 1 2 3
#'
#' s3store(function(x, y) { x + 2 * y }, "myfunc")
#' stopifnot(s3read("myfunc")(1, 2) == 5) # R can serialize closures!
#'
#' obj <- 1:5
#' s3store(obj) # If we do not pass a key the path is inferred from
#'   # the expression using deparse(substitute(...)).
#' stopifnot(all.equal(s3read("obj"), 1:5))
#' }
s3store <- function(obj, name = NULL, path = s3path(), safe = FALSE, ...) {
  ## Fall back to the deparsed calling expression as the key. Checking
  ## is.null (rather than missing) also covers an explicitly passed
  ## `name = NULL` (the documented default), which previously slipped
  ## through and produced a key equal to the bare path.
  if (is.null(name)) {
    name <- deparse(substitute(obj))
  }

  path <- add_ending_slash(path)

  s3key <- paste(path, name, sep = "")
  ## s3exists accepts no additional arguments, so `...` (intended for
  ## s3.put, e.g. storage_format) must not be forwarded here.
  if (isTRUE(safe) && s3exists(name, path = path)) {
    stop("An object with name ", name, " on path ", path,
         " already exists. Use `safe = FALSE` to overwrite\n",
         "-----------------------^")
  }

  obj4save <- s3normalize(obj, FALSE)
  s3.put(obj4save, path, name, ...)

  ## Mirror the stored value into the local file-system cache when enabled.
  if (!is.null(get_option("s3mpi.cache"))) {
    s3cache(s3key, obj4save)
  }

  if (is.environment(obj4save)) {
    s3normalize(obj4save) # Revert side effects
  }

  invisible(s3key)
}

#' @export
#' @rdname s3store
#' @note \code{s3put} is equivalent to \code{s3store} except that
#'   it will fail by default if you try to overwrite an existing key.
s3put <- function(..., safe = TRUE) {
  s3store(..., safe = safe)
}
R object to store to S3.} 34 | 35 | \item{name}{character.} 36 | 37 | \item{check_exists}{logical. Whether or not to check if an object already exists at the specificed location.} 38 | 39 | \item{num_retries}{numeric. the number of times to retry uploading.} 40 | 41 | \item{backoff}{numeric. Vector, with each element in seconds, describing the 42 | exponential backoff to be used in conjunction with the num_retries argument. 43 | Number of elements must equal num_retries. Defaults to 4, 8, 16, 32, etc.} 44 | 45 | \item{max_backoff}{numeric. Number describing the maximum seconds s3mpi will sleep 46 | prior to retrying an upload. Defaults to 128 seconds.} 47 | 48 | \item{row.names}{logical. Whether or not to write row names when writing CSV's or tables.} 49 | 50 | \item{storage_format}{character. What format to store files in. Defaults to RDS.} 51 | } 52 | \value{ 53 | For \code{s3.get}, the R object stored in RDS format on S3 in the \code{path}. 54 | For \code{s3.put}, the system exit code from running the \code{s3cmd} 55 | command line tool to perform the upload. 56 | } 57 | \description{ 58 | Fetch an R object from an S3 path. 59 | } 60 | 61 | -------------------------------------------------------------------------------- /man/s3normalize.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/s3normalize.R 3 | \name{s3normalize} 4 | \alias{s3normalize} 5 | \title{Convert a possibly non-serializable R object to a serializable R object.} 6 | \usage{ 7 | s3normalize(object, read = TRUE) 8 | } 9 | \arguments{ 10 | \item{object}{ANY. The R object to normalize. If it has an 11 | \code{"s3mpi.serialize"} attribute consisting of a list with 12 | \code{"read"} and \code{"write"} keys, these arity-1 functions 13 | will be called with the \code{object} prior to reading from and 14 | writing to S3, respectively.} 15 | 16 | \item{read}{logical. 
If \code{TRUE}, the \code{"read"} key of the 17 | \code{"s3mpi.serialize"} attribute, which should be a 1-argument 18 | function, will be invoked on the object. Otherwise, the \code{"write"} 19 | key will be invoked. By default, \code{read} is TRUE.} 20 | } 21 | \value{ 22 | A previously possibly non-vanilla R object (that is, 23 | an R object that may contain external pointers to non-R objects, 24 | such as vanilla C structs) converted to a totally vanilla R object 25 | (for example, by replacing the pointers with \code{\link{raw}} binary data). 26 | } 27 | \description{ 28 | R has good foreign function interface bindings to C code. As such, 29 | certain package authors may wish to optimize their code by keeping 30 | their objects in C structures instead of R SEXPs (the standard for 31 | object representation in the R interpreter). This also applies 32 | to bindings to external libraries. The speed advantage can be 33 | substantial, so this is not an uncommon use case. The \code{s3normalize} 34 | helper provides the ability to add an additional "preprocessor" 35 | layer prior to storing an object to S3 that converts a non-serializable 36 | object (such as a list with one of its entries pointing to an 37 | external C structure) to serialize object (such as that list with 38 | its C structure pointer entry replaced by a \code{\link{raw}} vector). 39 | } 40 | \details{ 41 | If the object being uploaded with \code{s3store} or downloaded wiht 42 | \code{s3read} has an attribute \code{"s3mpi.serialize"} which must 43 | be a list with keys \code{c("read", "write")}, these keys should 44 | hold functions requiring a single argument which are applied to 45 | the object prior to \emph{reading} from (\code{s3read}) and \emph{writing} 46 | to (\code{s3store}) S3, respectively. This allows s3mpi storage 47 | of not only vanilla R objects but \emph{arbitrary objects in memory} 48 | (whether they are internally represented by a C, Rust, Java, etc. process). 
49 | } 50 | 51 | -------------------------------------------------------------------------------- /tests/testthat/test-s3read.R: -------------------------------------------------------------------------------- 1 | context("s3read") 2 | library(testthatsomemore) 3 | 4 | withr::with_options(list( 5 | s3mpi.path = "s3://test/", 6 | s3mpi.cache = NULL 7 | ), { 8 | describe("cache parameter validation", { 9 | with_mock( 10 | `s3mpi:::s3.get` = function(...) "value", 11 | `s3mpi:::s3cache` = function(...) TRUE, { 12 | test_that("if cache is not TRUE or FALSE, it errors", { 13 | expect_error(s3read("key", cache = "pizza", serialize = FALSE)) 14 | expect_error(s3read("key", cache = 23, serialize = FALSE)) 15 | expect_error(s3read("key", cache = iris, serialize = FALSE)) 16 | expect_error(s3read("key", cache = NA, serialize = FALSE)) 17 | }) 18 | test_that("if cache is TRUE, it does not error", { 19 | expect_equal(s3read("key", cache = TRUE, serialize = FALSE), "value") 20 | }) 21 | test_that("if cache is FALSE, it does not error", { 22 | expect_equal(s3read("key", cache = FALSE, serialize = FALSE), "value") 23 | }) 24 | }) 25 | }) 26 | 27 | test_that("if the path does not end in a slash, the slash is added", { 28 | map <- list2env(list("s3://path/key" = "value")) 29 | with_mock( 30 | `s3mpi:::s3.get` = function(...) map[[..1]], { 31 | expect_equal(s3read("key", path = "s3://path"), "value") 32 | }) 33 | }) 34 | 35 | test_that("it can fetch raw values if the caching layer is disabled", { 36 | map <- list2env(list("s3://test/key" = "value")) 37 | with_mock(`s3mpi:::s3.get` = function(...) map[[..1]], { 38 | expect_equal(s3read("key", cache = FALSE), "value") 39 | map$`s3://test/key` <- "new_value" 40 | # Make sure we are not caching. 
41 | expect_equal(s3read("key", cache = FALSE), "new_value") 42 | }) 43 | }) 44 | 45 | test_that("it can fetch unraw values if the caching layer is enabled", { 46 | map <- list2env(list("s3://test/key" = "value")) 47 | cachedir <- tempdir() 48 | dir.create(cachedir, FALSE, TRUE) 49 | opts <- options(s3mpi.cache = cachedir) 50 | on.exit(options(opts), add = TRUE) 51 | 52 | with_mock( 53 | `s3mpi:::s3.get` = function(...) map[[..1]], 54 | `s3mpi:::s3cache` = function(...) "value", { 55 | expect_equal(s3read("key"), "value") 56 | map$`s3://test/key` <- "new_value" 57 | # Make sure we are caching. 58 | expect_equal(s3read("key"), "value") 59 | }) 60 | }) 61 | 62 | test_that("it can fetch unraw values if the caching layer is enabled but is uncached", { 63 | map <- list2env(list("s3://test/key" = "value")) 64 | cachedir <- tempdir() 65 | dir.create(cachedir, FALSE, TRUE) 66 | opts <- options(s3mpi.cache = cachedir) 67 | on.exit(options(opts), add = TRUE) 68 | 69 | with_mock( 70 | `s3mpi:::s3.get` = function(...) map[[..1]], 71 | `s3mpi:::s3cache` = function(...) not_cached, { 72 | expect_equal(s3read("key"), "value") 73 | map$`s3://test/key` <- "new_value" 74 | # Make sure we are not caching. 75 | expect_equal(s3read("key"), "new_value") 76 | }) 77 | }) 78 | }) 79 | 80 | -------------------------------------------------------------------------------- /R/s3normalize.R: -------------------------------------------------------------------------------- 1 | ## The roxygen documentation here is pretty thorough. In effect, if 2 | ## we wish to use s3mpi to store C, Java, etc. 
objects that are 3 | ## needed for our R code to run, we can do something like: 4 | ## 5 | ## ```r 6 | ## obj <- list(atomic_vector = 1:10, external_object = ptr_to_c_object) 7 | ## attr(obj, "s3mpi.serialize") <- list( 8 | ## "write" = function(object) { 9 | ## obj$external_object <- convert_ptr_to_raw_vector(obj$external_object) 10 | ## }, 11 | ## "read" = function(object) { 12 | ## obj$external_object <- convert_raw_vector_to_ptr(obj$external_object) 13 | ## }) 14 | ## 15 | ## s3store(obj, "some/key") # Will invoke the write function prior to 16 | ## # calling saveRDS and uploading the serialized object. 17 | ## s3read("some/key") # Will invoke the read function after downloading 18 | ## # the serialized object and calling readRDS. 19 | ## ``` 20 | #' Convert a possibly non-serializable R object to a serializable R object. 21 | #' 22 | #' R has good foreign function interface bindings to C code. As such, 23 | #' certain package authors may wish to optimize their code by keeping 24 | #' their objects in C structures instead of R SEXPs (the standard for 25 | #' object representation in the R interpreter). This also applies 26 | #' to bindings to external libraries. The speed advantage can be 27 | #' substantial, so this is not an uncommon use case. The \code{s3normalize} 28 | #' helper provides the ability to add an additional "preprocessor" 29 | #' layer prior to storing an object to S3 that converts a non-serializable 30 | #' object (such as a list with one of its entries pointing to an 31 | #' external C structure) to serialize object (such as that list with 32 | #' its C structure pointer entry replaced by a \code{\link{raw}} vector). 
33 | #' 34 | #' If the object being uploaded with \code{s3store} or downloaded with 35 | #' \code{s3read} has an attribute \code{"s3mpi.serialize"} which must 36 | #' be a list with keys \code{c("read", "write")}, these keys should 37 | #' hold functions requiring a single argument which are applied to 38 | #' the object prior to \emph{reading} from (\code{s3read}) and \emph{writing} 39 | #' to (\code{s3store}) S3, respectively. This allows s3mpi storage 40 | #' of not only vanilla R objects but \emph{arbitrary objects in memory} 41 | #' (whether they are internally represented by a C, Rust, Java, etc. process). 42 | #' 43 | #' @param object ANY. The R object to normalize. If it has an 44 | #' \code{"s3mpi.serialize"} attribute consisting of a list with 45 | #' \code{"read"} and \code{"write"} keys, these arity-1 functions 46 | #' will be called with the \code{object} prior to reading from and 47 | #' writing to S3, respectively. 48 | #' @param read logical. If \code{TRUE}, the \code{"read"} key of the 49 | #' \code{"s3mpi.serialize"} attribute, which should be a 1-argument 50 | #' function, will be invoked on the object. Otherwise, the \code{"write"} 51 | #' key will be invoked. By default, \code{read} is TRUE. 52 | #' @return A previously possibly non-vanilla R object (that is, 53 | #' an R object that may contain external pointers to non-R objects, 54 | #' such as vanilla C structs) converted to a totally vanilla R object 55 | #' (for example, by replacing the pointers with \code{\link{raw}} binary data). 56 | #' @export 57 | s3normalize <- function(object, read = TRUE) { 58 | if (utils::object.size(object) == 0) { 59 | warning("In s3mpi package: size-0 object is being normalized", call.
= TRUE) 60 | NULL 61 | } else if (read) { 62 | (attr(object, "s3mpi.serialize")$read %||% identity)(object) 63 | } else { 64 | (attr(object, "s3mpi.serialize")$write %||% identity)(object) 65 | } 66 | } 67 | 68 | -------------------------------------------------------------------------------- /R/s3.put.R: -------------------------------------------------------------------------------- 1 | #' @param x ANY. R object to store to S3. 2 | #' @param name character. 3 | #' @param check_exists logical. Whether or not to check if an object already exists at the specified location. 4 | #' @param num_retries numeric. The number of times to retry uploading. 5 | #' @param backoff numeric. Vector, with each element in seconds, describing the 6 | #' exponential backoff to be used in conjunction with the num_retries argument. 7 | #' Number of elements must equal num_retries. Defaults to 4, 8, 16, 32, etc. 8 | #' @param max_backoff numeric. Number describing the maximum seconds s3mpi will sleep 9 | #' prior to retrying an upload. Defaults to 128 seconds. 10 | #' @param storage_format character. What format to store files in. Defaults to RDS. 11 | #' @param row.names logical. Whether or not to write row names when writing CSV's or tables. 12 | #' @param ... additional arguments to pass to the saving function. 13 | #' @rdname s3.get 14 | s3.put <- function (x, path, name, bucket_location = "US", 15 | debug = FALSE, check_exists = TRUE, 16 | num_retries = get_option("s3mpi.num_retries", 0), backoff = 2 ^ seq(2, num_retries + 1), 17 | max_backoff = 128, storage_format = c("RDS", "CSV", "table"), row.names = FALSE, ...) 
{ 18 | storage_format <- match.arg(storage_format) 19 | 20 | if (!is.data.frame(x) && storage_format %in% c("CSV", "table")) { 21 | stop("You can't store an object in ", storage_format," format if it isn't a data.frame.") 22 | } 23 | 24 | s3key <- paste(path, name, sep = "") 25 | ## This inappropriately-named function actually checks existence 26 | ## of an entire *s3key*, not a bucket. 27 | AWS.tools:::check.bucket(s3key) 28 | 29 | ## Ensure backoff vector has correct number of elements and is capped 30 | if (num_retries > 0) { 31 | if (length(backoff) != num_retries) { 32 | stop("Your backoff vector length must match the number of retries.") 33 | } 34 | backoff <- pmin(backoff, max_backoff) 35 | } 36 | 37 | ## We create a temporary file, *write* the R object to the file, and then 38 | ## upload that file to S3. This magic works thanks to R's fantastic 39 | ## support for [arbitrary serialization](https://stat.ethz.ch/R-manual/R-patched/library/base/html/readRDS.html) 40 | ## (including closures!). 41 | x.serialized <- tempfile(); 42 | dir.create(dirname(x.serialized), showWarnings = FALSE, recursive = TRUE) 43 | on.exit(unlink(x.serialized, force = TRUE), add = TRUE) 44 | save_to_file <- get(paste0("save_as_", storage_format)) 45 | save_to_file(x, x.serialized, row.names, ...)
46 | 47 | cmd <- s3cmd_put_command(s3key, x.serialized, bucket_location_to_flag(bucket_location), debug) 48 | run_system_put(path, name, cmd, check_exists, num_retries, backoff) 49 | } 50 | 51 | run_system_put <- function(path, name, s3.cmd, check_exists, num_retries, backoff) { 52 | ret <- system2(s3cmd(), s3.cmd, stdout = TRUE) 53 | if (isTRUE(check_exists) && !s3exists(name, path)) { 54 | if (num_retries > 0) { 55 | Sys.sleep(backoff[length(backoff) - num_retries + 1]) 56 | Recall(path = path, name = name, s3.cmd = s3.cmd, 57 | check_exists = check_exists, 58 | num_retries = num_retries - 1, backoff = backoff) 59 | } else { 60 | stop("Object could not be successfully stored.") 61 | } 62 | } else { 63 | ret 64 | } 65 | } 66 | 67 | s3cmd_put_command <- function(s3key, file, bucket_flag, debug) { 68 | if (use_legacy_api()) { 69 | paste("put", file, paste0('"', s3key, '"'), 70 | bucket_flag, ifelse(debug, "--debug", ""), "--force") 71 | } else { 72 | paste("s3 cp", file, s3key) 73 | } 74 | } 75 | 76 | save_as_RDS <- function(x, filename, ...) { 77 | saveRDS(x, filename, ...) 78 | } 79 | 80 | 81 | save_as_CSV <- function(x, filename, row.names, ...) { 82 | write.csv(x, filename, row.names = row.names, ...) 83 | } 84 | 85 | save_as_table <- function(x, filename, row.names, ...) { 86 | write.table(x, filename, row.names = row.names, ...) 87 | } 88 | -------------------------------------------------------------------------------- /tests/testthat/test-s3store.R: -------------------------------------------------------------------------------- 1 | context("s3store") 2 | library(testthatsomemore) 3 | 4 | withr::with_options(list( 5 | s3mpi.path = "s3://test/", 6 | s3mpi.cache = NULL 7 | ), { 8 | test_that("it stops if safe is enabled and we overwrite", { 9 | testthatsomemore::package_stub("s3mpi", "s3exists", function(...) 
TRUE, { 10 | expect_error(s3store("foo", "bar", safe = TRUE), "already exists") 11 | }) 12 | }) 13 | 14 | test_that("it can store raw values if the caching layer is disabled", { 15 | map <- list2env(list("s3://test/key" = NULL)) 16 | testthatsomemore::package_stub("s3mpi", "s3.get", function(...) map[[..1]], { 17 | testthatsomemore::package_stub("s3mpi", "s3.put", function(...) map[[paste0(..2, ..3)]] <- ..1, { 18 | s3store("value", "key") 19 | expect_equal(s3read("key"), "value") 20 | map$`s3://test/key` <- "new_value" 21 | # Make sure we are not caching. 22 | expect_equal(s3read("key", cache = FALSE), "new_value") 23 | })}) 24 | }) 25 | 26 | test_that("it can store values if the caching layer is enabled", { 27 | map <- list2env(list("s3://test/key" = NULL)) 28 | map2 <- new.env(parent = map) 29 | testthatsomemore::package_stub("s3mpi", "s3.get", function(...) map2[[..1]], { 30 | testthatsomemore::package_stub("s3mpi", "s3.put", function(...) map2[[paste0(..2, ..3)]] <- ..1, { 31 | s3store("value", "key") 32 | expect_equal(s3read("key"), "value") 33 | map$`s3://test/key` <- "new_value" 34 | # Make sure we are not caching. 35 | expect_equal(s3read("key"), "value") 36 | })}) 37 | }) 38 | 39 | test_that("it denormalizes", { 40 | map <- list2env(list("s3://test/key" = "value")) 41 | 42 | testthatsomemore::package_stub("s3mpi", "s3normalize", function(a, b) { map$norm <- missing(b); a }, { 43 | testthatsomemore::package_stub("s3mpi", "s3.get", function(...) map[[..1]], { 44 | testthatsomemore::package_stub("s3mpi", "s3.put", function(...) map[[paste0(..2, ..3)]] <- ..1, { 45 | s3store("value", "key") 46 | expect_false(map$norm) 47 | s3store(new.env(), "key2") 48 | expect_true(map$norm) 49 | })})}) 50 | }) 51 | 52 | test_that("it can pick up missing key", { 53 | map <- list2env(list("s3://test/key" = NULL)) 54 | testthatsomemore::package_stub("s3mpi", "s3.get", function(...) map[[..1]], { 55 | testthatsomemore::package_stub("s3mpi", "s3.put", function(...) 
map[[paste0(..2, ..3)]] <- ..1, { 56 | key <- "value" 57 | s3store(key) 58 | expect_equal(s3read("key"), "value") 59 | })}) 60 | }) 61 | 62 | test_that("it produces an error when the object isn't found with an s3exists following the s3.put", { 63 | testthatsomemore::package_stub("base", "system2", function(...) TRUE, { 64 | testthatsomemore::package_stub("s3mpi", "s3exists", function(...) FALSE, { 65 | testthatsomemore::package_stub("s3mpi", "s3.put", function(...) run_system_put(..2, ..3, "", TRUE, 0), { 66 | expect_error(s3store("value", "key")) 67 | })})}) 68 | }) 69 | 70 | test_that("it does not produce an error when the object is found with an s3exists following the s3.put", { 71 | testthatsomemore::package_stub("base", "system2", function(...) TRUE, { 72 | testthatsomemore::package_stub("s3mpi", "s3exists", function(...) TRUE, { 73 | testthatsomemore::package_stub("s3mpi", "s3.put", function(...) run_system_put(..2, ..3, "", TRUE, 0), { 74 | expect_error(s3store("value", "key"), NA) 75 | })})}) 76 | }) 77 | 78 | calling_intervals <- NULL 79 | test_that("it can retry with the correct timing when an s3exists returns FALSE", { 80 | testthatsomemore::package_stub("base", "Sys.sleep", function(...) calling_intervals <<- c(calling_intervals, ..1), { 81 | testthatsomemore::package_stub("base", "system2", function(...) TRUE, { 82 | testthatsomemore::package_stub("s3mpi", "s3exists", function(...) FALSE, { 83 | testthatsomemore::package_stub("s3mpi", "s3.put", function(...) run_system_put(..2, ..3, "", TRUE, 3, c(1, 2, 100)), { 84 | try(run_system_put(,, "", TRUE, 3, c(1, 2, 100)), silent = TRUE) 85 | expect_identical(calling_intervals, c(1, 2, 100)) 86 | })})})}) 87 | }) 88 | }) 89 | 90 | 91 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # Version 0.2.47 2 | 3 | * Fix bug in `s3exists` when using `aws-cli`. 
4 | 5 | # Version 0.2.46 6 | 7 | * Fix typo in `last_modified`. 8 | 9 | # Version 0.2.45 10 | 11 | * Compatibility with `aws-cli`. 12 | 13 | # Version 0.2.44 14 | 15 | * Bugfixes and better compatibility with system environment variables. 16 | 17 | # Version 0.2.43 18 | 19 | * Allowing s3mpi.path and s3mpi.cache to be called via system environment variables. 20 | 21 | # Version 0.2.42 22 | 23 | * Better error output in `s3read` if path does not exist. (see [issue #72](https://github.com/robertzk/s3mpi/issues/72)) 24 | 25 | # Version 0.2.41 26 | 27 | * Fix incorrect `strptime` version specification. 28 | 29 | # Version 0.2.40 30 | 31 | * Remove overly complicated metaprogramming. 32 | 33 | # Version 0.2.33-9 34 | 35 | * Other fixes for s4cmd. 36 | 37 | # Version 0.2.32 38 | 39 | * allow choice of storage format in `s3read` and `s3store`. Defaults to `RDS`, 40 | and now you can choose `CSV` or `table` for data frames. 41 | 42 | # Version 0.2.31 43 | 44 | * Make `s3store` work with s4cmd. 45 | 46 | # Version 0.2.30 47 | 48 | * Don't set --bucket-location flag if `s4cmd` is detected as the `s3cmd` binary. 49 | 50 | # Version 0.2.29 51 | 52 | * Revert the change in 0.2.21 in favor of using `s3cmd info` over 53 | `s3cmd ls` to obtain updated_at information on files. 54 | 55 | # Version 0.2.28 56 | 57 | * Add exponential backoff logic to s3.put function. 58 | 59 | # Version 0.2.27 60 | * Turn off the LRU cache too when `cache = FALSE` in `s3read`. 61 | 62 | # Version 0.2.26 63 | * `options(s3mpi.num_retries)` now allows you to specify default number of retries globally. 64 | 65 | # Version 0.2.25 66 | * Automatically adds ending slashes to paths if they are missing when using 67 | `s3store`, `s3exists`, or `s3delete`. 68 | 69 | # Version 0.2.24 70 | 71 | * Add the ability to delete an object in s3 using `s3delete`. 72 | 73 | # Version 0.2.23 74 | 75 | * `s3path()` is exported. 76 | 77 | # Version 0.2.22 78 | 79 | * `s3read()` (with no arguments) is no longer supported. 
80 | 81 | # Version 0.2.21 82 | 83 | * Fixed an issue where reading files that have the same prefix as another file 84 | on the S3 bucket generates a warning. 85 | * Fix a more serious problem where writing and reading within the same minute 86 | produces incorrect results due to the s3cmd utility having *minute*-level 87 | rather than second-level granularity. 88 | 89 | # Version 0.2.20 90 | 91 | * Workaround for the silent but oh-so-deadly sporadic failure of s3cmd's put. 92 | By default we now check for the existence of the object when issuing a put, 93 | with the option to retry a number of times. 94 | 95 | # Version 0.2.19 96 | 97 | * Keep AWS.tools on a remote. 98 | 99 | # Version 0.2.18 100 | 101 | * Add remotes to DESCRIPTION. 102 | 103 | # Version 0.2.17 104 | 105 | * Explicitly create the directory of a file given by `tempfile()` to prevent 106 | rare errors wherein the directory does not exist and yields a 107 | file connection error. ([#41](https://github.com/robertzk/s3mpi/issues/41)) 108 | 109 | # Version 0.2.16 110 | 111 | * Introduce an `s3mpi.disable_lru_cache` option as well as 112 | silently fail if storage to LRU does not succeed. 113 | 114 | # Version 0.2.15 115 | 116 | * Switch to `system2`, which should be more windows friendly, and allow 117 | the user to specify path to executable of s3cmd, by setting `options(s3mpi.s3cmd_path = '/usr/local/bin/s3cmd')` 118 | 119 | # Version 0.2.13 120 | 121 | * Fixup LRU cache to actually use size parameter option. 122 | 123 | # Version 0.2.11 124 | 125 | * A stylistic refactor of the package. The `.path` argument 126 | has been deprecated in `s3read` and `s3store` in favor of 127 | simply `path`. 128 | 129 | # Version 0.2.9-10 130 | 131 | * Remove the need to type a trailing slash in `.path`. 132 | 133 | # Version 0.2.8 134 | 135 | * A hotfix for cache corruption, where data exists without metadata. 136 | It can happen if writing metadata ever fails. 
137 | 138 | # Version 0.2.7 139 | 140 | * Remove the `s3mpi.memoize_cache` global option, since it makes no sense. 141 | A user could have overwritten an S3 key in a different R session. 142 | 143 | * `s3exists(NULL)` now returns FALSE. Fixes issue #22. 144 | 145 | # Version 0.2.5-6 146 | 147 | * The `s3mpi.memoize_cache` global controls whether or not caching is 148 | [memoised](https://github.com/hadley/memoise). If set to `TRUE`, it would 149 | have the effect of keeping a common object in the R session instead of 150 | retrieving it from the cached file for each given s3 key. This can significantly 151 | speed up code that reads from the same S3 key multiple times within a 152 | single R session. 153 | 154 | # Version 0.2.4 155 | 156 | * The safety check on `s3store` uses `safe = FALSE` by default now. The new 157 | function `s3put` is equivalent to `s3store` and should be used going forward 158 | if one does not wish to overwrite existing keys. The other approach was causing 159 | too many breaking changes to existing codebases. 160 | 161 | # Version 0.2.2 162 | 163 | * Added a safety check for `s3store`. Now if you want to overwrite a key inside a bucket, 164 | you need to use `s3store(key, safe = FALSE)`. By default safe is set to `TRUE`. 165 | 166 | # Version 0.2.0 167 | 168 | * Added a caching mechanism that will keep copies of files downloaded and 169 | uploaded to S3. Useful if local storage constraints are not an issue. 170 | To enable, set `options(s3mpi.cache = '~/.s3cache')` in your `~/.Rprofile` 171 | (or replace `'~/.s3cache'` with a directory of your choice). 172 | -------------------------------------------------------------------------------- /R/s3.get.R: -------------------------------------------------------------------------------- 1 | #' Fetch an R object from an S3 path. 2 | #' 3 | #' @param path character. A full S3 path. 4 | #' @param bucket_location character. Usually \code{"US"}. 5 | #' @param verbose logical. 
If \code{TRUE}, the \code{s3cmd} 6 | #' utility verbose flag will be set. 7 | #' @param debug logical. If \code{TRUE}, the \code{s3cmd} 8 | #' utility debug flag will be set. 9 | #' @param cache logical. If \code{TRUE}, an LRU in-memory cache will be referenced. 10 | #' @param storage_format character. What format the object is stored in. Defaults to RDS. 11 | #' @aliases s3.put 12 | #' @return For \code{s3.get}, the R object stored in RDS format on S3 in the \code{path}. 13 | #' For \code{s3.put}, the system exit code from running the \code{s3cmd} 14 | #' command line tool to perform the upload. 15 | s3.get <- function (path, bucket_location = "US", verbose = FALSE, debug = FALSE, cache = TRUE, storage_format = c("RDS", "CSV", "table"), ...) { 16 | storage_format <- match.arg(storage_format) 17 | 18 | ## This inappropriately-named function actually checks existence 19 | ## of a *path*, not a bucket. 20 | AWS.tools:::check.bucket(path) 21 | 22 | # Helper function for fetching data from s3 23 | fetch <- function(path, storage_format, bucket_location, ...) { 24 | x.serialized <- tempfile() 25 | dir.create(dirname(x.serialized), showWarnings = FALSE, recursive = TRUE) 26 | ## We remove the file [when we exit the function](https://stat.ethz.ch/R-manual/R-patched/library/base/html/on.exit.html). 27 | on.exit(unlink(x.serialized), add = TRUE) 28 | 29 | if (file.exists(x.serialized)) { 30 | unlink(x.serialized, force = TRUE) 31 | } 32 | 33 | ## Run the s3cmd tool to fetch the file from S3. 34 | cmd <- s3cmd_get_command(path, x.serialized, bucket_location_to_flag(bucket_location), verbose, debug) 35 | status <- system2(s3cmd(), cmd) 36 | 37 | if (as.logical(status)) { 38 | warning("Nothing exists for key ", path) 39 | `attr<-`(`class<-`(data.frame(), c("s3mpi_error", status)), "key", path) 40 | } else { 41 | ## And then read it back in RDS format. 42 | load_from_file <- get(paste0("load_as_", storage_format)) 43 | load_from_file(x.serialized, ...) 
44 | } 45 | } 46 | 47 | ## Check for the path in the cache 48 | ## If it does not exist, create and return its entry. 49 | ## The `s3LRUcache` helper is defined in utils.R 50 | if (is.windows() || isTRUE(get_option("s3mpi.disable_lru_cache")) || !isTRUE(cache)) { 51 | ## We do not have awk, which we will need for the moment to 52 | ## extract the modified time of the S3 object. 53 | ans <- fetch(path, storage_format, bucket_location, ...) 54 | } else if (!s3LRUcache()$exists(path)) { 55 | ans <- fetch(path, storage_format, bucket_location, ...) 56 | ## We store the value of the R object in a *least recently used cache*, 57 | ## expecting the user to not think about optimizing their code and 58 | ## call `s3read` with the same key multiple times in one session. With 59 | ## this approach, we keep the latest 10 object in RAM and do not have 60 | ## to reload them into memory unnecessarily--a wise time-space trade-off! 61 | tryCatch(s3LRUcache()$set(path, ans), error = function(...) { 62 | warning("Failed to store object in LRU cache. Repeated calls to ", 63 | "s3read will not benefit from a performance speedup.") 64 | }) 65 | } else { 66 | # Check time on s3LRUcache's copy 67 | last_cached <- s3LRUcache()$last_accessed(path) # assumes a POSIXct object 68 | 69 | # Check time on s3 remote's copy using the `s3cmd info` command. 70 | s3.cmd <- paste("info ", path, "| head -n 3 | tail -n 1") 71 | result <- system2(s3cmd(), s3.cmd, stdout = TRUE, stderr = NULL) 72 | # The `s3cmd info` command produces the output 73 | # " Last mod: Tue, 16 Jun 2015 19:36:10 GMT" 74 | # in its third line, so we subset to the 20-39 index range 75 | # to extract "16 Jun 2015 19:36:10". 76 | result <- substring(result, 20, 39) 77 | last_updated <- strptime(result, format = "%d %b %Y %H:%M:%S", tz = "GMT") 78 | 79 | if (last_updated > last_cached) { 80 | ans <- fetch(path, storage_format, bucket_location, ...) 
81 | s3LRUcache()$set(path, ans) 82 | } else { 83 | ans <- s3LRUcache()$get(path) 84 | } 85 | } 86 | ans 87 | } 88 | 89 | s3cmd_get_command <- function(path, file, bucket_flag, verbose, debug) { 90 | if (use_legacy_api()) { 91 | paste("get", paste0('"', path, '"'), file, 92 | bucket_flag, 93 | if (verbose) "--verbose --progress" else "--no-progress", 94 | if (debug) "--debug" else "") 95 | } else { 96 | paste0("s3 cp ", path, " ", file) 97 | } 98 | } 99 | 100 | ## Given an s3cmd path and a bucket location, will construct a flag 101 | ## argument for s3cmd. If it looks like the s3cmd is actually 102 | ## pointing to an s4cmd, return empty string as s4cmd doesn't 103 | ## support bucket location. 104 | bucket_location_to_flag <- function(bucket_location) { 105 | if (grepl("s4cmd", s3cmd())) { 106 | if (bucket_location != "US") { 107 | warning(paste0("Ignoring non-default bucket location ('", 108 | bucket_location, 109 | "') in s3mpi::s3.get since s4cmd was detected", 110 | "-- this might be a little slower but is safe to ignore.")); 111 | } 112 | return("") 113 | } 114 | return(paste("--bucket_location", bucket_location)) 115 | } 116 | 117 | load_as_RDS <- function(filename, ...) { 118 | readRDS(filename, ...) 119 | } 120 | 121 | load_as_CSV <- function(filename, ...) { 122 | read.csv(filename, ..., stringsAsFactors = FALSE) 123 | } 124 | 125 | load_as_table <- function(filename, ...) { 126 | read.table(filename, ..., stringsAsFactors = FALSE) 127 | } 128 | 129 | #' Printing for s3mpi errors. 130 | #' 131 | #' @param x ANY. R object to print. 132 | #' @param ... additional objects to pass to print function. 133 | #' @export 134 | print.s3mpi_error <- function(x, ...) 
{ 135 | cat("Error reading from S3: key", crayon::white$bold(attr(x, "key")), "not found.\n") 136 | } 137 | -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | ## A standard helper: if `x` is null, `y` will be returned instead. 2 | `%||%` <- function(x, y) if (is.null(x)) y else x 3 | 4 | ## A package specific environment 5 | .s3mpienv <- new.env() 6 | 7 | ## path to shell util 8 | s3cmd <- function() { 9 | cmd <- if (use_legacy_api()) { 10 | if (isTRUE(nzchar(cmd <- get_option("s3mpi.s3cmd_path")))) { 11 | cmd 12 | } else { as.character(Sys.which("s3cmd")) } 13 | } else { 14 | if (isTRUE(nzchar(cmd <- get_option("s3mpi.aws_path")))) { 15 | cmd 16 | } else { as.character(Sys.which("aws")) } 17 | } 18 | if (is.null(cmd)) { stop("No s3mpi backend found on your system! Make sure you install either aws-cli or s3cmd or s4cmd") } 19 | cmd 20 | } 21 | 22 | use_legacy_api <- function() { 23 | isTRUE(get_option("s3mpi.legacy_api")) 24 | } 25 | 26 | ## Given an s3cmd path and a bucket location, will construct a flag 27 | ## argument for s3cmd. If it looks like the s3cmd is actually 28 | ## pointing to an s4cmd, return empty string as s4cmd doesn't 29 | ## support bucket location. 30 | bucket_location_to_flag <- function(bucket_location) { 31 | if (using_s4cmd()) { 32 | if (!identical(bucket_location, "US")) { 33 | warning(paste0("Ignoring non-default bucket location ('", 34 | bucket_location, 35 | "') in s3mpi::s3.get since s4cmd was detected", 36 | "-- this might be a little slower but is safe to ignore.")); 37 | } 38 | "" 39 | } else if (use_legacy_api()) { 40 | paste("--bucket-location", bucket_location) 41 | } 42 | } 43 | 44 | ## Given an s3cmd path and a bucket location, will construct a flag 45 | ## argument for s3cmd. 
If it looks like the s3cmd is actually 46 | ## pointing to an s4cmd, return empty string as s4cmd doesn't 47 | ## support bucket location. 48 | bucket_location_to_flag <- function(bucket_location) { 49 | if (using_s4cmd()) { 50 | if (!identical(bucket_location, "US")) { 51 | warning(paste0("Ignoring non-default bucket location ('", 52 | bucket_location, 53 | "') in s3mpi::s3.get since s4cmd was detected", 54 | "-- this might be a little slower but is safe to ignore.")); 55 | } 56 | "" 57 | } else { 58 | paste("--bucket-location", bucket_location) 59 | } 60 | } 61 | 62 | ## We use the [memoise](https://github.com/hadley/memoise) package to 63 | ## ensure this check only gets run once in a given R session. This 64 | ## means a user will have to restart R if they install s3cmd 65 | ## during a session, but we are comfortable with that! 66 | ensure_s3cmd_present <- memoise::memoise(function() { 67 | check <- try(system("s3cmd --help", intern = TRUE), silent = TRUE) 68 | if (is(check, "try-error")) { 69 | ## It is always preferable to make life as easy as possible for the user! 70 | ## If they have the [homebrew](https://brew.sh) package manager, we 71 | ## give them the fastest installation instructions. 72 | if (is.mac() && system2("which", "brew", stdout = FALSE) == 0) { 73 | stop("Please install the ", crayon::yellow("s3cmd"), " command-line ", 74 | "utility using by running ", crayon::green("brew install s3cmd"), 75 | " from your terminal and then configuring your S3 credentials ", 76 | "using ", crayon::yellow("s3cmd --configure"), call. = FALSE) 77 | } else { 78 | ## Otherwise, manual it is! 79 | stop("Please install s3cmd, the S3 command line utility: ", 80 | "http://s3tools.org/kb/item14.htm\nand then setup your S3 ", 81 | "credentials using ", crayon::yellow("s3cmd --configure"), 82 | call. 
= FALSE) 83 | } 84 | } 85 | }) 86 | 87 | cache_enabled <- function() { 88 | !is.null(tmp <- cache_directory()) && nzchar(tmp) 89 | } 90 | 91 | cache_directory <- function() { 92 | dir <- get_option("s3mpi.cache") 93 | if (!is.null(dir) && !(is.character(dir) && length(dir) == 1 && !is.na(dir))) { 94 | stop("Please set the ", sQuote("s3mpi.cache"), " option to a character ", 95 | "vector of length 1 giving a directory path.") 96 | } 97 | dir 98 | } 99 | 100 | ## We ping google.com to ensure the user has an internet connection. If not, 101 | ## we operate in "offline mode" for the whole session, that is, we read 102 | ## from the s3cache if the user has set their `s3mpi.s3cache` option 103 | ## but cannot store or read new keys. 104 | has_internet <- local({ 105 | has_internet_flag <- NULL 106 | function() { 107 | if (!is.null(get_option("s3mpi.skip_connection_check"))) return(FALSE) 108 | if (!is.null(has_internet_flag)) { return(has_internet_flag) } 109 | has_internet_flag <<- suppressWarnings({ 110 | internet_check <- try(file("http://google.com", "r")) 111 | if (!is(internet_check, "try-error") && is(internet_check, "connection")) { 112 | on.exit(close.connection(internet_check)) 113 | } 114 | !(is(internet_check, "try-error") && 115 | grepl("cannot open", internet_check$message)) 116 | }) 117 | } 118 | }) 119 | 120 | ## A sexy [least recently used cache](http://mcicpc.cs.atu.edu/archives/2012/mcpc2012/lru/lru.html) 121 | ## using [the cacher package](https://github.com/kirillseva/cacher). 122 | s3LRUcache <- function() { 123 | if (is.null(.s3mpienv$lrucache)) { 124 | .s3mpienv$lrucache <- cacher::LRUcache(get_option("s3mpi.cache_size", "2Gb")) 125 | } else { 126 | .s3mpienv$lrucache 127 | } 128 | } 129 | 130 | # All S3 paths need a slash at the end to work, but we don't need the user 131 | # to know that, so let's add a slash for them if they forget. 
## Append a trailing "/" to `path` unless it already ends with one.
add_ending_slash <- function(path) {
  ## Final character of str ("" for an empty string).
  last_character <- function(str) {
    substr(str, nchar(str), nchar(str))
  }
  if (last_character(path) != "/") { paste0(path, "/") } else { path }
}

## TRUE when the configured s3 command-line tool is actually s4cmd.
using_s4cmd <- function() {
  grepl("s4cmd", s3cmd())
}

## Look up option `x`, falling back to the upper-cased, underscored
## environment variable (e.g. "s3mpi.cache" -> "S3MPI_CACHE"), and
## finally to `default`.
get_option <- function(x, default = NULL) {
  result <- getOption(x)
  if (is.null(result)) {
    result <- Sys.getenv(toupper(gsub("\\.", "_", x)))
    if (!nzchar(result)) { result <- NULL }
  }
  result %||% default
}
# ------------------------------------------------------------------------------
# /R/s3cache.R:
# ------------------------------------------------------------------------------
## If we are frequently using `s3read` and `s3store` from within an
## active R session, it is likely that we will need to pull the stored
## object multiple times. For example, if we have the training data
## set for a model or a list with some summary statistics, we may be
## pulling this frequently when performing analysis during a week-long
## project.
##
## To facilitate this process and speed things up a bit, we keep a
## local *file system cache* of the objects downloaded from S3 using
## `s3read`. If the user has set their `s3mpi.cache` option or system
## environment variable to a
## directory path (by default `~/.s3cache`), we will use that directory
## to store downloaded R objects. The second time a user calls
## `s3read("some/key")` we will fetch it from the local file system
## instead of spending time re-downloading the object.
##
## This functionality should be disabled if we regularly are storing
## and pulling objects that in aggregate exceed the user's available disk space.
#' A caching layer around s3mpi calls.
#'
#' Fetching large files from the S3 MPI can be expensive when performed
#' multiple times. This method allows one to add a caching layer
#' around S3 fetching. The user should specify the configuration option
#' \code{options(s3mpi.cache = "some/dir")}. The recommended cache
#' directory (where files will be stored) is \code{"~/.s3cache"}.
#'
#' @param s3key character. The full S3 key to attempt to read or write
#'   to the cache.
#' @param value ANY. The R object to save in the cache. If missing,
#'   a cache read will be performed instead.
#' @return on a read, the cached object (or the \code{not_cached} sentinel);
#'   on a write, the (invisible) result of \code{save_to_cache}.
s3cache <- function(s3key, value) {
  if (!cache_enabled()) {
    stop("Cannot use s3mpi::s3cache until you set options(s3mpi.cache) ",
         "to a directory in which to place cache contents.")
  }

  d <- cache_directory()
  ## Name the logical arguments rather than passing them positionally.
  dir.create(d, showWarnings = FALSE, recursive = TRUE)
  ## We will hold the objects in the `data` subdirectory of the `s3mpi.cache`
  ## path and *metadata* about the objects (such as when it was last modified
  ## on S3, so we can perform cache invalidation) in the `info` directory.
  dir.create(file.path(d, "info"), showWarnings = FALSE, recursive = TRUE)
  dir.create(file.path(d, "data"), showWarnings = FALSE, recursive = TRUE)

  # If no value to store was provided, we assume we are reading from the cache.
  if (missing(value)) {
    fetch_from_cache(s3key, d)
  } else { # Otherwise, we are writing to it.
    save_to_cache(s3key, value, d)
  }
}

#' Helper function for fetching a file from a cache directory.
#'
#' This function will also test to determine whether the file has been
#' modified on S3 since the last cache save. If the file has never been
#' cached or the cache is invalidated, it will return \code{s3mpi::not_cached}.
#'
#' @param key character. The key under which the cache entry is stored.
#' @param cache_dir character. The cache directory. The default is
#'   \code{cache_directory()}.
#' @return the cached object if the cache has not invalidated. Otherwise,
#'   return \code{s3mpi::not_cached}.
## Fetch a cached value for `key`, returning `not_cached` on a miss or
## when the S3 object has been modified since the cache entry was saved.
## `cache_dir` now defaults to cache_directory(), matching the documented
## default in the roxygen block above (the old signature had no default).
fetch_from_cache <- function(key, cache_dir = cache_directory()) {
  ## We use an [MD5 hash](https://en.wikipedia.org/wiki/MD5) to convert an
  ## arbitrary R object to a 32-character string representation. We use this
  ## as an implicit hash table in the file system so we do not have to deal
  ## with keys that cause conflicts with the file system (such as "../blah").
  cache_key <- digest::digest(key)
  cache_file <- function(dir) file.path(cache_dir, dir, cache_key)

  if (!file.exists(cache_file("data"))) return(not_cached)

  if (!file.exists(cache_file("info"))) {
    # Somehow the cache became corrupt: data exists without accompanying
    # meta-data. In this case, simply wipe the cache.
    file.remove(cache_file("data"))
    return(not_cached)
  }

  info <- readRDS(cache_file("info"))
  # Check if cache is invalid.
  connected <- has_internet()
  if (!connected) {
    warning("Your network connection seems to be unavailable. s3mpi will ",
            "use the latest cache entries instead of pulling from S3.",
            call. = FALSE, immediate. = FALSE)
  }

  ## If the modification time has changed since we last cached the
  ## value, re-pull it from S3 and wipe the cache.
  if (connected && !identical(info$mtime, last_modified(key))) {
    not_cached
  } else {
    readRDS(cache_file("data"))
  }
}

#' Helper function for saving a file to a cache directory.
#'
#' @param key character. The key under which the cache entry is stored.
#' @param value ANY. The R object to save in the cache.
#' @param cache_dir character. The cache directory. The default is
#'   \code{cache_directory()}.
## Persist `value` in the cache under `key`, along with metadata used
## later for invalidation.
save_to_cache <- function(key, value, cache_dir = cache_directory()) {
  ## Keys are MD5-hashed so arbitrary strings map to safe file names.
  hashed_key <- digest::digest(key)
  entry_path <- function(subdir) file.path(cache_dir, subdir, hashed_key)

  saveRDS(value, entry_path("data"))
  ## Record the S3 modification time alongside the key so that reads can
  ## detect when the remote object has changed.
  saveRDS(list(mtime = last_modified(key), key = key), entry_path("info"))
  invisible(NULL)
}

#' Determine the last modified time of an S3 object.
#'
#' @param key character. The s3 key of the object.
#' @return the last modified time or \code{NULL} if it does not exist on S3.
last_modified <- function(key) {
  ## Without a connection we cannot tell whether the object changed, so
  ## return a fixed timestamp and let callers fall back to the cache.
  if (!has_internet()) {
    return(as.POSIXct(as.Date("2000-01-01")))
  }
  listing_cmd <- if (use_legacy_api()) {
    paste("ls", key)
  } else {
    paste("s3", "ls", key)
  }
  first_line <- system2(s3cmd(), listing_cmd, stdout = TRUE)[1L]
  if (is.character(first_line) && !is.na(first_line) && nzchar(first_line)) {
    ## The `s3cmd ls` output begins with "YYYY-MM-DD HH:MM"; parse exactly
    ## that 16-character prefix with strptime.
    strptime(substring(first_line, 1, 16), "%Y-%m-%d %H:%M")
  }
}

## A special sentinel object used to signify that a value is not cached.
## We assume no one will ever `s3store` an object with class "not_cached"!
not_cached <- local({
  sentinel <- list()
  class(sentinel) <- "not_cached"
  sentinel
})

## TRUE exactly when `x` is the `not_cached` sentinel object.
is.not_cached <- function(x) {
  identical(x, not_cached)
}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | R and AWS S3 [![Build Status](https://travis-ci.org/robertzk/s3mpi.svg?branch=master)](https://travis-ci.org/robertzk/s3mpi) [![Coverage Status](https://coveralls.io/repos/robertzk/s3mpi/badge.png)](https://coveralls.io/r/robertzk/s3mpi) [![Documentation](https://img.shields.io/badge/rocco--docs-%E2%9C%93-blue.svg)](http://robertzk.github.io/s3mpi/)
2 | =========
3 | 
4 | A common problem for data scientists is passing data or models to each
5 | other without interrupting their workflow. There are typically two approaches:
6 | 
7 | 1. Writing CSV and RDS files and passing them around using tools like
8 | email, Dropbox, or SFTP. Typically, these files are too large for
9 | inclusion in version control.
10 | 
11 | 2. Building an API infrastructure around some data backends, such as
12 | databases, data warehouses, and streaming providers like Kafka.
13 | 
14 | The former works well for small teams consisting of 1-3 people but soon
15 | becomes prohibitive. Additionally, tracking the array of files and outputs
16 | soon becomes cumbersome and interrupts the data scientist's workflow.
17 | 
18 | The second option is an inevitable progression for any sufficiently large data
19 | team, but requires major coordination with software or data engineers
20 | and may not be practical for small teams or experimental projects. It is
21 | also usually limited by well-defined specification of the formats that
22 | are being passed into consoles and outputted to data storage systems.
23 | 24 | On the other hand, S3mpi (S3 [*message passing interface*](https://en.wikipedia.org/wiki/Message_Passing_Interface), 25 | affectionately named after the distributed message passing library) 26 | allows for **storage and serialization of arbitrary R objects** and does 27 | not have the limits of the second approach, while providing **on-demand 28 | access to stored data and objects**, avoiding the need for large amounts of 29 | disk space locally. 30 | 31 | Here, S3 stands for [Amazon's cloud storage](https://aws.amazon.com/s3/) which 32 | you can think of as an infinite hard drive. You write an object to a path, 33 | and then it *remains there indefinitely and is accessible to anyone you wish 34 | to share it with*. For example, if you have several terabytes of datasets split 35 | into thousands of components, you can individually load small pieces and perform 36 | computation on them to avoid storing the entire dataset locally. This is the 37 | basis for distributed computing systems like [Hadoop](https://en.wikipedia.org/wiki/Apache_Hadoop). 38 | 39 | Assuming you have set up your [S3 configuration](http://s3tools.org/kb/item14.htm) 40 | correctly (see the tutorial below), you can immediately get started with: 41 | 42 | ```R 43 | library(s3mpi) 44 | s3store(obj, "s3key/for/your/object") 45 | ``` 46 | 47 | You can then read it back from S3 in any R session running on a machine with 48 | compatible S3 credentials: 49 | 50 | ```R 51 | s3read("s3key/for/your/object") 52 | ``` 53 | 54 | Paired with [chat-driven development](https://sameroom.io/blog/self-hosted-team-chat-options-and-alternatives/) 55 | this allows a team of data scientists to quickly generate team-global accessible 56 | objects like data sets and models and chat the key to teammates so they pull down 57 | the results within seconds for inspection, modification, or further analysis. 
58 | 59 | #### Installing the Package 60 | 61 | This package is not currently available on CRAN and has several non-CRAN 62 | dependencies. First, ensure you have the [s3cmd](http://s3tools.org/s3cmd) command-line 63 | tool installed. If you are on OS X, you can simply run `brew install s3cmd` if 64 | you have [homebrew](http://brew.sh/). Next, you will have to copy the [example 65 | `.s3cfg`](http://s3tools.org/kb/item14.htm) file and place it in `~/.s3cfg` (or 66 | generate it using `s3cmd --configure`) and then obtain 67 | [AWS access credentials](http://docs.aws.amazon.com/general/latest/gr/getting-aws-sec-creds.html) 68 | and fill out the `access_key` and `secret_key` sections of your `~/.s3cfg` file. 69 | Note that [S3 storage is pretty cheap](https://aws.amazon.com/s3/pricing/) 70 | and even the most intense data use is unlikely to exceed $100/month. 71 | 72 | To install the R package and its dependencies, run the following from the R console. 73 | 74 | ```R 75 | if (!require("devtools")) { install.packages("devtools") } 76 | devtools::install_github("avantcredit/AWS.tools") 77 | devtools::install_github("kirillseva/cacher") 78 | devtools::install_github("robertzk/s3mpi") 79 | ``` 80 | 81 | This package has been used on OSX and Linux systems in a production-facing 82 | environment, but **we have not tested it extensively on Windows**, 83 | so if you run into problems please [file an issue](https://github.com/robertzk/s3mpi/issues/new) 84 | immediately. 
85 | 
86 | Finally, put the name of a default [bucket](http://docs.aws.amazon.com/AmazonS3/latest/dev/UsingBucket.html)
87 | in your `~/.Rprofile`:
88 | 
89 | ```R
90 | options(s3mpi.path = "s3://yourS3Bucket/")
91 | ```
92 | 
93 | If you do not specify a default S3 path, you will have to include it
94 | manually as the second parameter:
95 | 
96 | ```R
97 | s3store(obj, "s3key/for/your/object", "s3://somebucket/")
98 | # From another R session
99 | s3read("s3key/for/your/object", "s3://somebucket/")
100 | ```
101 | 
102 | #### Potential uses
103 | 
104 | S3mpi has been used in production-facing environments for:
105 | 
106 | 1. Light-weight **mapreduce** for background jobs on medium data sets. One can
107 | partition a set of primary keys, perform a computation, and `s3store`
108 | the results in a separate S3 location for each partition.
109 | 
110 | 2. **Logging** supplementation. For example, if you alert your errors to
111 | a service like [honeybadger](http://honeybadger.io) it is possible to
112 | provide additional details by noting the S3 key with an R object containing
113 | further information in the notification. This also works with chat-driven
114 | development using [hipchat](http://hipchat.com) or [slack](http://slack.com)
115 | by adding an S3 key with "additional details" to failure notifications.
116 | 
117 | 3. **Caching** of functions that should have deterministic outputs or infrequent
118 | refresh intervals can be accomplished by wrapping them with an
119 | ["s3 memoise"](https://github.com/peterhurford/s3memoize) layer (compare to
120 | the totally in-memory [memoise](https://github.com/hadley/memoise)).
121 | 
122 | 4. For **debugging**, it is possible to `s3store` intermediate output during a complex
123 | computation for later inspection, especially if you do not wish to store
124 | this information on the local file system.
125 | 
126 | 5. 
**Collaboration** in data science teams can be massively improved by 127 | using `s3store` and `s3read` to quickly pass data sets under investigation 128 | between R sessions. ("Hey can you send me the IDs of the customers that 129 | had a messed up leads record?") This completely eliminates the error-prone 130 | email / dropbox alternative and leaves a paper trail since it is unlikely 131 | one would ever need to delete objects from S3. 132 | 133 | 6. Interfacing with **production environments** during background jobs, especially 134 | if a compatible [Ruby](https://github.com/robertzk/s3mpi-ruby) or 135 | [Python](https://github.com/robertzk/s3mpy) API is written. This can be used 136 | to ask an engineer to pull data from a production console, "ruby s3store" 137 | or "python s3store" it, and seamlessly read it from the R console as an 138 | R object such as a list or a data.frame. This **narrows the gap between 139 | analysts and engineers**. 140 | 141 | 7. **Reproducible reports** can be generated by storing all intermediate and 142 | final output in a pre-defined S3 convention. At [Avant](https://github.com/avantcredit), 143 | this approach is used to store all information about all trained models 144 | stretching to the beginning of time. 145 | 146 | The time required to store and read objects can be massively sped up by 147 | adopting a workflow where one **sshes into an [EC2 instance](https://aws.amazon.com/ec2/instance-types/)**. 
148 | 
149 | #### Alternative S3 key setup
150 | 
151 | Instead of setting up an `~/.s3cfg` file, you can also add the
152 | following environment variables to `.bash_profile` / `.zshrc`:
153 | 
154 | ```
155 | export AWS_ACCESS_KEY_ID=PUTYOURACCESSKEYHERE
156 | export AWS_SECRET_ACCESS_KEY=PUTYOURSECRETKEYHERE
157 | ```
158 | 
159 | #### Local Caching
160 | 
161 | You can enable local caching of downloaded and uploaded files by setting a
162 | global system environment variable or by using:
163 | 
164 | ```R
165 | options(s3mpi.cache = '~/.s3cache') # Or a directory of your choice
166 | ```
167 | 
168 | If you have the caching layer enabled in the above manner, the s3mpi package will
169 | check if you have a functioning connection to S3 before reading from the cache
170 | to determine whether the value is invalidated (i.e., if someone updated the object).
171 | If you wish to skip this check and read directly from the cache when you do not
172 | have an internet connection, set `options(s3mpi.skip_connection_check = TRUE)`.
173 | 
174 | 
175 | #### Ruby and Python Versions
176 | 
177 | You can also use S3MPI in [Ruby](https://github.com/robertzk/s3mpi-ruby) and in [Python](https://github.com/robertzk/s3mpy).
178 | 
179 | #### Command Line Accompaniment
180 | 
181 | One can find file size(s) and contents of the remote bucket
182 | using the [s3 command line tool](http://s3tools.org/s3cmd):
183 | 
184 | ```sh
185 | s3cmd ls s3://yourS3Bucket/some/key
186 | s3cmd ls -H s3://yourS3Bucket/some/key # Human readable
187 | ```
188 | 
189 | ### License
190 | 
191 | This project is licensed under the MIT License:
192 | 
193 | Copyright (c) 2015-2016 Robert Krzyzanowski
194 | 
195 | Permission is hereby granted, free of charge, to any person obtaining
196 | a copy of this software and associated documentation files (the
197 | "Software"), to deal in the Software without restriction, including
198 | without limitation the rights to use, copy, modify, merge, publish,
199 | distribute, sublicense, and/or sell copies of the Software, and to
200 | permit persons to whom the Software is furnished to do so, subject to
201 | the following conditions:
202 | 
203 | The above copyright notice and this permission notice shall be included
204 | in all copies or substantial portions of the Software.
205 | 
206 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
207 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
208 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
209 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
210 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
211 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
212 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
213 | 
214 | ### Authors
215 | 
216 | This package was originally created by Robert Krzyzanowski. Additional
217 | maintenance and improvement work was later done by Peter Hurford
218 | and Kirill Sevastyanenko.
219 | 
--------------------------------------------------------------------------------