├── .Rbuildignore
├── .gitignore
├── .travis.yml
├── DESCRIPTION
├── LICENSE
├── Makefile
├── NAMESPACE
├── R
    ├── autogenerate.R
    ├── datastorr.R
    ├── download.R
    ├── github.R
    ├── github_api.R
    ├── github_auth.R
    ├── github_release.R
    ├── simple.R
    └── utils.R
├── README.md
├── inst
    └── template.whisker
├── man
    ├── autogenerate.Rd
    ├── datastorr.Rd
    ├── datastorr_auth.Rd
    ├── datastorr_path.Rd
    ├── github_release_create.Rd
    ├── github_release_del.Rd
    ├── github_release_get.Rd
    ├── github_release_info.Rd
    ├── github_release_versions.Rd
    └── release.Rd
├── tests
    ├── testthat.R
    └── testthat
    │   ├── example.csv
    │   ├── helper-dataverse.R
    │   ├── metadata.json
    │   ├── test-autogenerate.R
    │   ├── test-data-package.R
    │   ├── test-default.R
    │   ├── test-github-releases.R
    │   ├── test-multi-file.R
    │   └── test-simple.R
├── update_web.sh
└── vignettes
    ├── datastorr.Rmd
    └── src
        └── datastorr.R


/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | ^Makefile$
4 | ^README.Rmd$
5 | ^.travis.yml$
6 | ^appveyor.yml$
7 | ^tests/testthat/mtcars\.rds$
8 | ^update_web\.sh$
9 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | tests/testthat/mtcars.rds
5 | inst/doc
6 | inst/web
7 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: r
 2 | cache: packages
 3 | sudo: false
 4 | 
 5 | r_packages:
 6 |   - covr
 7 | 
 8 | after_success:
 9 |   - Rscript -e 'covr::codecov()'
10 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: datastorr
 2 | Title: Simple Data Versioning
 3 | Version: 0.0.4
 4 | Author: Rich FitzJohn
 5 | Maintainer: Rich FitzJohn <rich.fitzjohn@gmail.com>
 6 | Description: Simple data versioning using GitHub to store data.
 7 | License: MIT + file LICENSE
 8 | LazyData: true
 9 | URL: https://github.com/ropenscilabs/datastorr
10 | BugReports: https://github.com/ropenscilabs/datastorr/issues
11 | Imports:
12 |     httr,
13 |     jsonlite,
14 |     R6,
15 |     rappdirs,
16 |     storr (>= 1.0.0),
17 |     tibble,
18 |     stringr
19 | Suggests:
20 |     knitr,
21 |     rmarkdown,
22 |     testthat (>= 0.11.0),
23 |     whisker
24 | RoxygenNote: 7.0.2
25 | VignetteBuilder: knitr


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2015
2 | COPYRIGHT HOLDER: Rich FitzJohn
3 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | PACKAGE := $(shell grep '^Package:' DESCRIPTION | sed -E 's/^Package:[[:space:]]+//')
 2 | RSCRIPT = Rscript --no-init-file
 3 | 
 4 | all: install
 5 | 
 6 | test: # test_all with DATASTORR_SKIP_DOWNLOADS set
 7 | 	DATASTORR_SKIP_DOWNLOADS=true $(MAKE) test_all
 8 | 
 9 | test_all:
10 | 	${RSCRIPT} -e 'library(methods); devtools::test()'
11 | 
12 | roxygen:
13 | 	@mkdir -p man
14 | 	${RSCRIPT} -e "library(methods); devtools::document()"
15 | 
16 | install:
17 | 	R CMD INSTALL .
18 | 
19 | build:
20 | 	R CMD build .
21 | 
22 | check: # check_all with DATASTORR_SKIP_DOWNLOADS set
23 | 	DATASTORR_SKIP_DOWNLOADS=true $(MAKE) check_all
24 | 
25 | check_all: build
26 | 	_R_CHECK_CRAN_INCOMING_=FALSE R CMD check --as-cran --no-manual `ls -1tr ${PACKAGE}*gz | tail -n1`
27 | 	@rm -f `ls -1tr ${PACKAGE}*gz | tail -n1`
28 | 	@rm -rf ${PACKAGE}.Rcheck
29 | 
30 | vignettes/datastorr.Rmd: vignettes/src/datastorr.R
31 | 	${RSCRIPT} -e 'library(sowsear); sowsear("$<", output="$@")'
32 | vignettes: vignettes/datastorr.Rmd
33 | 	${RSCRIPT} -e 'library(methods); devtools::build_vignettes()'
34 | 
35 | staticdocs:
36 | 	@mkdir -p inst/staticdocs
37 | 	${RSCRIPT} -e "library(methods); staticdocs::build_site()"
38 | 	rm -f vignettes/*.html
39 | 	@rmdir inst/staticdocs
40 | website: staticdocs
41 | 	./update_web.sh
42 | 
43 | # Command-style targets: none of these names a file that its rule creates.
44 | .PHONY: all test test_all roxygen install build check check_all vignettes staticdocs website
45 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(autogenerate)
 4 | export(datastorr)
 5 | export(datastorr_auth)
 6 | export(datastorr_path)
 7 | export(datastorr_versions)
 8 | export(github_release_create)
 9 | export(github_release_del)
10 | export(github_release_get)
11 | export(github_release_info)
12 | export(github_release_version_current)
13 | export(github_release_versions)
14 | export(release)
15 | export(setup_github_token)
16 | 


--------------------------------------------------------------------------------
/R/autogenerate.R:
--------------------------------------------------------------------------------
 1 | ##' Autogenerate a datastorr interface for a package.  The idea is to
 2 | ##' run this function and save the resulting code in a file in your
 3 | ##' package.  Then users will be able to download data and you will be
 4 | ##' able to release data easily.
 5 | ##'
 6 | ##' In addition to running this, you will need to add \code{datastorr}
 7 | ##' to the \code{Imports:} section of your DESCRIPTION.  To upload
 8 | ##' files you will need to set your \code{GITHUB_TOKEN} environment
 9 | ##' variable.  These steps will be described more fully in a vignette.
10 | ##'
11 | ##' More complete instructions:
12 | ##'
13 | ##' Let \code{pkg} be \code{basename(repo)}; the name of the package
14 | ##' and of the GitHub repository.
15 | ##'
16 | ##' First, create a new R package, e.g.  \code{devtools::create(pkg)}.
17 | ##'
18 | ##' Then, copy the result of running \code{autogenerate} into a file
19 | ##' in the \code{R} directory of that package, e.g.
20 | ##'
21 | ##' \preformatted{   writeLines(autogenerate(repo, read),
22 | ##'              file.path(pkg, "R", "datastorr.R"))
23 | ##'   devtools::document(pkg)
24 | ##' }
25 | ##'
26 | ##' Create a new git repository for this package, and add all the
27 | ##' files in the package, and commit.
28 | ##'
29 | ##' On GitHub, create a repository for the package and push your code
30 | ##' there.
31 | ##'
32 | ##' At this point you are now ready to start making releases by
33 | ##' loading your package and running \code{pkg::<name>_release()}.
34 | ##'
35 | ##' @title Autogenerate a datastorr interface
36 | ##'
37 | ##' @param repo Name of the repo on github (in username/repo format)
38 | ##'
39 | ##' @param read \emph{name} of a function to read the data.  Do not
40 | ##'   give the function itself!
41 | ##'
42 | ##' @param filename Name of the file to read.  If not given, then the
43 | ##'   single file in a release will be read (but you will need to
44 | ##'   provide a filename on upload).  If given, you cannot change the
45 | ##'   filename ever as all releases will be assumed to have the same
46 | ##'   filename.
47 | ##'
48 | ##' @param name Name of the dataset, used in generating the functions.
49 | ##'   If omitted the repo name is used.
50 | ##'
51 | ##' @param roxygen Include roxygen headers for the functions?
52 | ##'
53 | ##' @return A character vector holding the generated R code, one
54 | ##'   element per line.
55 | ##'
56 | ##' @export
57 | ##' @examples
58 | ##' writeLines(autogenerate("richfitz/datastorr.example",
59 | ##'                         read = "readRDS", name = "mydata"))
60 | ##' writeLines(autogenerate("richfitz/datastorr.example",
61 | ##'                         read = "readRDS", name = "mydata",
62 | ##'                         roxygen = FALSE))
63 | autogenerate <- function(repo, read, filename = NULL, name = basename(repo),
64 |                          roxygen = TRUE) {
65 |   ## Reject a bad `read` before loading whisker or touching the template.
66 |   if (!is.character(read)) {
67 |     stop("Expected a string for the function")
68 |   }
69 |   loadNamespace("whisker")
70 |   ## mustWork: fail with a clear error if the template is not installed,
71 |   ## instead of passing "" on to readLines().
72 |   template <- readLines(system.file("template.whisker", package = "datastorr",
73 |                                     mustWork = TRUE))
74 |   ## The template receives R source text: a quoted string or a bare NULL.
75 |   if (is.null(filename)) {
76 |     filename <- "NULL"
77 |   } else {
78 |     filename <- sprintf('"%s"', filename)
79 |   }
80 |   data <- list(repo = repo, read = read, name = name, filename = filename)
81 |   x <- whisker::whisker.render(template, data)
82 |   x <- strsplit(x, "\n")[[1]]
83 |   if (!roxygen) {
84 |     ## Drop every roxygen comment line from the rendered code.
85 |     x <- x[!grepl("^##'", x)]
86 |   }
87 |   ## Part of a workaround around a whisker bug:
88 |   x <- gsub("{ ", "{", x, fixed = TRUE)
89 |   x <- gsub(" }", "}", x, fixed = TRUE)
90 |   x
91 | }
83 | 


--------------------------------------------------------------------------------
/R/datastorr.R:
--------------------------------------------------------------------------------
  1 | ## This is the core of the package - it holds all the facilities for
  2 | ## caching etc.
  3 | ##
  4 | ## Fields:
  5 | ##   storr  rds-backed storr rooted at `path`.  For each key built by
  6 | ##          create_version_filename(version, filename) the "file"
  7 | ##          namespace holds the stored file's name and the "hash"
  8 | ##          namespace its md5 sum.
  9 | ##   cache  environment-backed storr of objects that were already read.
 10 | ##   info   the release description given to `initialize`.
 11 | ##   path   `info$path`, the root directory of the local store.
 12 | R6_datastorr <- R6::R6Class(
 13 |   "datastorr",
 14 |   public = list(
 15 |     storr = NULL,
 16 |     cache = NULL,
 17 |     info = NULL,
 18 |     path = NULL,
 19 | 
 20 |     ## Open (creating if needed) the on-disk store described by `info`.
 21 |     initialize = function(info) {
 22 |       self$info <- info
 23 |       self$path <- info$path
 24 |       self$storr <- storr::storr_rds(self$path)
 25 |       self$cache <- storr::storr_environment()
 26 | 
 27 |       dir.create(file.path(self$path, "file"), FALSE, TRUE)
 28 |       dir.create(file.path(self$path, "workdir"), FALSE, TRUE)
 29 | 
 30 |       ## Neither field may be reassigned once the store is open.
 31 |       lockBinding("info", self)
 32 |       lockBinding("path", self)
 33 |     },
 34 | 
 35 |     ## Return the data for `version` (default: the current version),
 36 |     ## downloading any file that is not stored locally when `download` is
 37 |     ## TRUE.  One file is returned as read; several files are returned as
 38 |     ## a list named by filename.
 39 |     get = function(version = NULL, download = TRUE, verbose = TRUE,
 40 |                    reread = FALSE) {
 41 |       if (is.null(version)) {
 42 |         version <- self$version_current()
 43 |       }
 44 | 
 45 |       ## With no filenames configured the release's source archive is
 46 |       ## used, identified by the sentinel name "Source.zip".
 47 |       ## TODO: temporary solution; the sentinel is handled at the
 48 |       ## download stage.
 49 |       filenames <- self$info$filenames
 50 |       if (is.null(filenames)) {
 51 |         filenames <- "Source.zip"
 52 |       }
 53 | 
 54 |       for (target_file in filenames) {
 55 |         version_file <- create_version_filename(version, target_file)
 56 |         if (!self$storr$exists(version_file, "file") && download) {
 57 |           self$download(version, target_file, verbose)
 58 |         }
 59 |       }
 60 | 
 61 |       ## TODO: messy handling of single vs multiple files
 62 |       if (length(filenames) == 1L) {
 63 |         return(self$read(version, filenames[1], self$info$read[[1]], reread))
 64 |       }
 65 |       opened_files <- vector("list", length = length(filenames))
 66 |       ## seq_along() rather than 1:length(): an empty vector gives no
 67 |       ## iterations instead of the indices 1 and 0.
 68 |       for (index in seq_along(filenames)) {
 69 |         opened_files[[index]] <- self$read(version, filenames[index],
 70 |                                            self$info$read[[index]], reread)
 71 |       }
 72 |       names(opened_files) <- filenames
 73 |       opened_files
 74 |     },
 75 | 
 76 |     ## Return the object for one file of one version, reading it from
 77 |     ## disk with `read_function` unless it is already in the in-memory
 78 |     ## cache (`reread = TRUE` forces a fresh read).
 79 |     read = function(version, target_file, read_function, reread = FALSE) {
 80 |       version_file <- create_version_filename(version, target_file)
 81 |       if (!reread && self$cache$exists(version_file)) {
 82 |         return(self$cache$get(version_file, use_cache = FALSE))
 83 |       }
 84 |       file <- file.path(self$path, "file", self$storr$get(version_file, "file"))
 85 |       ret <- read_function(file)
 86 |       self$cache$set(version_file, ret, use_cache = FALSE)
 87 |       ret
 88 |     },
 89 | 
 90 |     ## Download one file of one version into <path>/file, named by its
 91 |     ## md5 sum plus the original extension, and record it in the storr.
 92 |     download = function(version, target_file, verbose = TRUE) {
 93 |       message(paste0("Downloading ", target_file))
 94 | 
 95 |       ## This check needs to be changed
 96 |       ## to enable backwards compatibilty
 97 |       is_source <- target_file == "Source.zip"
 98 |       if (is_source) {
 99 |         url <- github_api_source_url(version, self$info$repo, self$info$private)
100 |         filename <- "Source.zip"
101 |       } else {
102 |         url <- github_api_release_url(version, target_file, self$info$repo,
103 |                                       self$info$private)
104 |         ## Strip any query string before taking the file name.
105 |         filename <- basename(sub("\\?.*$", "", url))
106 |       }
107 | 
108 |       ## needs new handling when source code is being pulled
109 |       ext <- tools::file_ext(filename)
110 |       if (nzchar(ext)) {
111 |         ext <- paste0(".", ext)
112 |       }
113 | 
114 |       ## Download into the store's own workdir; the temporary file is
115 |       ## removed on exit if it was not moved into place.
116 |       tmp <- tempfile(tmpdir = file.path(self$path, "workdir"), fileext = ext)
117 |       on.exit(unlink(tmp))
118 |       download_file(url, dest = tmp, verbose = verbose, binary = !is_source)
119 | 
120 |       ## The md5 sum names the stored file, so identical content is kept
121 |       ## only once.
122 |       hash <- unname(tools::md5sum(tmp))
123 |       file <- paste0(hash, ext)
124 |       dest <- file.path(self$path, "file", file)
125 |       if (!file.exists(dest) && !file.rename(tmp, dest)) {
126 |         ## Do not register a key that would point at a missing file.
127 |         stop(sprintf("Failed to move downloaded file to %s", dest))
128 |       }
129 | 
130 |       version_filename <- create_version_filename(version, target_file)
131 |       self$storr$set(version_filename, hash, "hash")
132 |       self$storr$set(version_filename, file, "file")
133 |     },
134 | 
135 |     ## List known versions: those with locally stored files (`local =
136 |     ## TRUE`, sorted numerically) or those of the GitHub releases.
137 |     versions = function(local = TRUE) {
138 |       if (local) {
139 |         local_files <- self$storr$list("file")
140 |         # Captures versions formatted x.y.z with optional 4th sub version
141 |         version_re <- "^(\\d+\\.){2}\\d+(?:(\\.\\d+))?"
142 |         ## regmatches() is vectorised and drops keys without a match.
143 |         local_versions <- regmatches(local_files,
144 |                                      regexpr(version_re, local_files))
145 |         stringr::str_sort(unique(local_versions), numeric = TRUE)
146 |       } else {
147 |         rev(names(github_api_cache(self$info$private)$get(self$info$repo)))
148 |       }
149 |     },
150 | 
151 |     ## Last element of versions(local), falling back to the GitHub list
152 |     ## when nothing is stored locally; NULL if there are no versions.
153 |     version_current = function(local = TRUE) {
154 |       v <- self$versions(local)
155 |       if (length(v) == 0L && local) {
156 |         v <- self$versions(FALSE)
157 |       }
158 |       if (length(v) == 0L) {
159 |         NULL
160 |       } else {
161 |         v[[length(v)]]
162 |       }
163 |     },
164 | 
165 |     ## Delete the local files of `version`, or the whole local store
166 |     ## when `version` is NULL.
167 |     del = function(version) {
168 |       if (is.null(version)) {
169 |         # delete all files cached with associated repository
170 |         unlink(file.path(self$path, "file"), recursive = TRUE)
171 |         self$storr$destroy()
172 |       } else {
173 |         # delete all files from specified version
174 |         stored_keys <- self$storr$list("file")
175 |         file_list_keys <- stored_keys[grepl(get_version_regex(version),
176 |                                             stored_keys)]
177 |         if (length(file_list_keys) == 0L) {
178 |           stop(paste0("Version ", version, " is already deleted or does not exist"))
179 |         }
180 |         for (key in file_list_keys) {
181 |           file <- self$storr$get(key, "file")
182 |           unlink(file.path(self$path, "file", file))
183 |           self$storr$del(key, "file")
184 |           self$storr$del(key, "hash")
185 |           self$cache$del(key)
186 |         }
187 |       }
188 |     }
189 |   )
190 | )
157 | 


--------------------------------------------------------------------------------
/R/download.R:
--------------------------------------------------------------------------------
 1 | ## A file downloader that can (a) handle https and (b) actually fail
 2 | ## when the download fails: any HTTP status >= 300 raises a
 3 | ## DownloadError.  Returns `dest`; `...` is passed on to httr::GET.
 4 | ##
 5 | ## TODO: rewrite to use curl only, possibly with gabor's progress bar
 6 | ## package.
 7 | download_file <- function(url, ..., dest = tempfile(), overwrite = FALSE,
 8 |                           verbose = TRUE, binary = FALSE) {
 9 |   content <- httr::GET(url,
10 |                        httr::write_disk(dest, overwrite),
11 |                        if (verbose) httr::progress("down"),
12 |                        if (binary) httr::accept("application/octet-stream"),
13 |                        ...)
14 |   if (verbose) cat("\n") # end the progress bar's line; silent otherwise
15 |   code <- httr::status_code(content)
16 |   if (code >= 300L) {
17 |     stop(DownloadError(url, code))
18 |   }
19 |   dest
20 | }
21 | 
22 | DownloadError <- function(url, code) {
23 |   ## Condition object for a failed download; the caller signals it.
24 |   cond <- list(message = sprintf("Downloading %s failed with code %d", url, code),
25 |                call = NULL)
26 |   class(cond) <- c("DownloadError", "error", "condition")
27 |   cond
28 | }
27 | 


--------------------------------------------------------------------------------
/R/github.R:
--------------------------------------------------------------------------------
  1 | ##' Information to describe how to process github releases
  2 | ##'
  3 | ##' The simplest case is where the data are stored in a single file
  4 | ##' attached to the release (this is different to the zip/tar.gz files
  5 | ##' that the web interface displays).  For example, a single csv file.
  6 | ##' Several files can be described by passing a vector of filenames
  7 | ##' together with a list holding one read function per file.
  8 | ##'
  9 | ##' @title Github release information
 10 | ##' @param repo Name of the repo in \code{username/repo} format.
 11 | ##'
 12 | ##' @param read Function to read the file, or a list of such
 13 | ##'   functions, one per entry of \code{filename} and in the same
 14 | ##'   order.
 15 | ##'
 16 | ##' @param private Is the repository private?  If so authentication
 17 | ##'   will be required for all actions.  Setting this is optional but
 18 | ##'   will result in better error messages because of the way GitHub
 19 | ##'   returns not found/404 (rather than forbidden/403) errors when
 20 | ##'   accessing private repositories without authorisation.
 21 | ##'
 22 | ##' @param filename Optional character vector of filenames.  If
 23 | ##'   given, it must have the same length as \code{read}, and each
 24 | ##'   name must identify exactly one file attached to the release (no
 25 | ##'   glob expansion is performed).  If omitted,
 26 | ##'   \code{\link{github_release_get}} fetches the release's source
 27 | ##'   zip archive instead of an attached file.
 28 | ##'
 29 | ##' @param path Optional path in which to store the data.  If omitted
 30 | ##'   we use \code{\link{datastorr_path}} to generate a reasonable
 31 | ##'   path.
 32 | ##'
 33 | ##' @return A list of class \code{github_release_info} with elements
 34 | ##'   \code{path}, \code{repo}, \code{private}, \code{filenames} and
 35 | ##'   \code{read}.
 36 | ##'
 37 | ##' @export
 38 | github_release_info <- function(repo, read, private = FALSE, filename = NULL,
 39 |                                 path = NULL) {
 40 |   ## TODO: filename name argument
 41 |   if (is.null(path)) {
 42 |     path <- datastorr_path(repo)
 43 |   }
 44 |   ## A lone read function is wrapped (c() on a function gives a
 45 |   ## one-element list) so that `read` can always be indexed per file.
 46 |   if (!is.list(read)) {
 47 |     read <- c(read)
 48 |   }
 49 |   ## The length check only applies when filenames were given.
 50 |   if (!is.null(filename) && length(filename) != length(read)) {
 51 |     stop("Each file requires a respective read function")
 52 |   }
 53 |   for (read_function in read) {
 54 |     assert_function(read_function)
 55 |   }
 56 | 
 57 |   structure(list(path = path, repo = repo, private = private,
 58 |                  filenames = filename,
 59 |                  read = read),
 60 |             class = "github_release_info")
 61 | }
 53 | 
 54 | ##' Get release versions
 55 | ##' @title Get release versions
 56 | ##' @param info Result of running \code{github_release_info}
 57 | ##'
 58 | ##' @param local Should we return local (TRUE) or github (FALSE)
 59 | ##'   version numbers?  Github version numbers are pulled once per
 60 | ##'   session only.  The exception is for
 61 | ##'   \code{github_release_version_current} which when given
 62 | ##'   \code{local = TRUE} will fall back on trying github if there are
 63 | ##'   no local versions.
 64 | ##'
 65 | ##' @export
 66 | ##' @author Rich FitzJohn
 67 | github_release_versions <- function(info, local = TRUE) {
 68 |   ## A store object is opened for each call and asked for its versions.
 69 |   store <- R6_datastorr$new(info)
 70 |   store$versions(local)
 71 | }
 70 | 
 71 | 
 72 | ##' @rdname github_release_versions
 73 | ##' @export
 74 | github_release_version_current <- function(info, local = TRUE) {
 75 |   ## Delegates to the store's version_current method.
 76 |   store <- R6_datastorr$new(info)
 77 |   store$version_current(local)
 78 | }
 77 | 
 78 | ##' Get a version of a data set, downloading it if necessary.
 79 | ##' @title Get data
 80 | ##' @param info Result of running \code{github_release_info}
 81 | ##'
 82 | ##' @param version Version to fetch.  If \code{NULL} it will get the
 83 | ##'   current version as returned by
 84 | ##'   \code{github_release_version_current()}
 85 | ##'
 86 | ##' @export
 87 | github_release_get <- function(info, version = NULL) {
 88 |   ## The store's get method is called with its default download settings.
 89 |   store <- R6_datastorr$new(info)
 90 |   store$get(version)
 91 | }
 90 | 
 91 | 
 92 | ##' Delete a local copy of a version (or all local copies).  Note that
 93 | ##' this does not affect the actual github release in any way!
 94 | ##'
 95 | ##' @title Delete version
 96 | ##'
 97 | ##' @param info Result of running \code{github_release_info}
 98 | ##'
 99 | ##' @param version Version to delete.  If \code{NULL} it will delete
100 | ##'   the entire storr
101 | ##'
102 | ##' @export
103 | github_release_del <- function(info, version) {
104 |   ## Only the local store is touched; nothing is sent to GitHub here.
105 |   store <- R6_datastorr$new(info)
106 |   store$del(version)
107 | }
106 | 


--------------------------------------------------------------------------------
/R/github_api.R:
--------------------------------------------------------------------------------
  1 | ## Github API helpers.  There's a chance that some of this will port
  2 | ## to use the gh package once it's on CRAN.
  3 | 
  4 | ## Environment backing the release cache; shared by every storr that
  5 | ## github_api_cache() builds.
  6 | cache <- new.env(parent = emptyenv())
  7 | ## Build an external storr keyed by repo name.  The fetch function
  8 | ## queries the GitHub releases API and returns the releases as a list
  9 | ## named by tag with any leading "v" stripped; duplicated names are
 10 | ## dropped with a warning.
 11 | github_api_cache <- function(private) {
 12 |   force(private)
 13 |   fetch_releases <- function(key, namespace) {
 14 |     releases <- github_api_releases(list(repo = key, private = private))
 15 |     tags <- vcapply(releases, "[[", "tag_name")
 16 |     names(releases) <- strip_v(tags)
 17 |     dup <- duplicated(names(releases))
 18 |     if (!any(dup)) {
 19 |       return(releases)
 20 |     }
 21 |     warning("Removing duplicated tag names: ",
 22 |             paste(sprintf("%s (%s)", names(releases)[dup], tags[dup]),
 23 |                   collapse = ", "))
 24 |     releases[!dup]
 25 |   }
 26 |   storr::storr_external(storr::driver_environment(cache), fetch_releases)
 27 | }
 22 | 
 23 | ## Remove the cached release list of info$repo from the release cache.
 24 | github_api_cache_clear <- function(info) {
 25 |   st <- github_api_cache(info$private)
 26 |   st$del(info$repo)
 27 | }
 26 | 
 27 | ## Return the GitHub description of one release.  The cached release
 28 | ## list is consulted first; a version missing from it is looked up by
 29 | ## tag on GitHub, which also invalidates the cached list.
 30 | github_api_release_info <- function(info, version) {
 31 |   releases <- github_api_cache(info$private)$get(info$repo)
 32 |   key <- strip_v(version)
 33 |   if (key %in% names(releases)) {
 34 |     return(releases[[key]])
 35 |   }
 36 | 
 37 |   url <- sprintf("https://api.github.com/repos/%s/releases/tags/%s",
 38 |                  info$repo, add_v(version))
 39 |   r <- httr::GET(url, datastorr_auth(info$private))
 40 |   status <- httr::status_code(r)
 41 |   if (status >= 300L) {
 42 |     msg <- httr::content(r)$message
 43 |     if (is.null(msg)) {
 44 |       msg <- "(no message)"
 45 |     }
 46 |     stop(sprintf("No such release with error: %d, %s", status, msg))
 47 |   }
 48 |   ## Invalidate the cache as we're clearly out of date:
 49 |   github_api_cache_clear(info)
 50 |   httr::content(r)
 51 | }
 52 | 
 53 | ## Fetch every release of info$repo from the GitHub API and return
 54 | ## them as one list of parsed release descriptions.
 55 | github_api_releases <- function(info) {
 56 |   ## TODO: This will be more nicely handled with the pagnation
 57 |   ## feature of Gabor's gh package but I'd rather that hits CRAN
 58 |   ## before depending on it.  Replace the loop below with:
 59 |   ##   ret <- gh::gh("/repos/:repo/releases", repo = key)
 60 |   url <- sprintf("https://api.github.com/repos/%s/releases", info$repo)
 61 |   per_page <- 100L
 62 |   releases <- list()
 63 |   page <- 1L
 64 |   ## Request successive pages until a short page shows that the list
 65 |   ## is exhausted, so repositories with more than 100 releases are
 66 |   ## returned in full.
 67 |   repeat {
 68 |     dat <- httr::GET(url,
 69 |                      query = list(per_page = per_page, page = page),
 70 |                      datastorr_auth(info$private))
 71 |     httr::stop_for_status(dat)
 72 |     chunk <- httr::content(dat)
 73 |     releases <- c(releases, chunk)
 74 |     if (length(chunk) < per_page) {
 75 |       break
 76 |     }
 77 |     page <- page + 1L
 78 |   }
 79 |   releases
 80 | }
 65 | 
 66 | ## Delete a release and its tag from GitHub.  Unless `yes` is TRUE the
 67 | ## user is asked to confirm first.  Returns TRUE invisibly.
 68 | github_api_release_delete <- function(info, version, yes = FALSE) {
 69 |   message(sprintf("Deleting version %s from %s", version, info$repo))
 70 |   ## prompt_confirm() is only reached when `yes` is FALSE.
 71 |   confirmed <- yes || prompt_confirm()
 72 |   if (!confirmed) {
 73 |     stop("Not deleting release")
 74 |   }
 75 |   release <- github_api_release_info(info, version)
 76 | 
 77 |   response <- httr::DELETE(release$url, datastorr_auth(TRUE))
 78 |   httr::stop_for_status(response)
 79 |   github_api_cache_clear(info)
 80 |   ## Need to also delete the tag:
 81 |   github_api_tag_delete(info, release$tag_name)
 82 |   invisible(TRUE)
 83 | }
 80 | 
 81 | ## Delete the git tag `tag_name` from info$repo (authentication is
 82 | ## always required).  Returns the parsed response invisibly.
 83 | github_api_tag_delete <- function(info, tag_name) {
 84 |   endpoint <- sprintf("https://api.github.com/repos/%s/git/refs/tags/%s",
 85 |                       info$repo, tag_name)
 86 |   response <- httr::DELETE(endpoint, datastorr_auth(TRUE))
 87 |   httr::stop_for_status(response)
 88 |   invisible(httr::content(response))
 89 | }
 88 | 
 89 | ## Create a release tagged add_v(version) on GitHub.  `description`
 90 | ## and `target` are sent only when not NULL.  Returns the parsed
 91 | ## response invisibly.
 92 | github_api_release_create <- function(info, version,
 93 |                                       description = NULL, target = NULL) {
 94 |   endpoint <- sprintf("https://api.github.com/repos/%s/releases", info$repo)
 95 |   payload <- drop_null(list(tag_name = add_v(version),
 96 |                             body = description,
 97 |                             target_commitish = target))
 98 |   response <- httr::POST(endpoint, body = payload, encode = "json",
 99 |                          datastorr_auth(TRUE))
100 |   github_api_catch_error(response, "Failed to create release")
101 |   github_api_cache_clear(info)
102 |   invisible(httr::content(response))
103 | }
101 | 
102 | ## Upload the local file `filename` to the release `version`, where it
103 | ## is stored under `name`.  Returns the parsed response invisibly.
104 | github_api_release_upload <- function(info, version, filename, name) {
105 |   release <- github_api_release_info(info, version)
106 |   ## upload_url ends in a "{...}" template suffix, removed here.
107 |   endpoint <- sub("\\{.+$", "", release$upload_url)
108 |   response <- httr::POST(endpoint,
109 |                          query = list(name = name),
110 |                          body = httr::upload_file(filename),
111 |                          httr::progress("up"),
112 |                          datastorr_auth(TRUE))
113 |   cat("\n") # clean up after httr's progress bar :(
114 |   httr::stop_for_status(response)
115 |   github_api_cache_clear(info)
116 |   invisible(httr::content(response))
117 | }
114 | 
115 | ## Update the description and/or target of an existing release.
116 | ## `description` and `target` are sent only when not NULL.  Returns
117 | ## the parsed response invisibly.
118 | github_api_release_update <- function(info, version,
119 |                                       description = NULL, target = NULL) {
120 |   x <- github_api_release_info(info, version)
121 |   ## Send the release's existing tag back.  Releases are created with
122 |   ## add_v(version), so sending the bare `version` could move the
123 |   ## release onto a different tag.
124 |   data <- list(tag_name = x$tag_name,
125 |                body = description,
126 |                target_commitish = target)
127 |   r <- httr::PATCH(x$url, body = drop_null(data),
128 |                    datastorr_auth(TRUE), encode = "json")
129 |   httr::stop_for_status(r)
130 |   github_api_cache_clear(info)
131 |   invisible(httr::content(r))
132 | }
127 | 
128 | ## Fetch the GitHub description of the repository info$repo.
129 | github_api_repo <- function(info) {
130 |   endpoint <- sprintf("https://api.github.com/repos/%s", info$repo)
131 |   response <- httr::GET(endpoint, datastorr_auth(info$private))
132 |   httr::stop_for_status(response)
133 |   httr::content(response)
134 | }
134 | ## Fetch a git reference of info$repo; `type` is "heads" or "tags".
135 | github_api_ref <- function(info, ref, type = "heads") {
136 |   type <- match.arg(type, c("heads", "tags"))
137 |   endpoint <- sprintf("https://api.github.com/repos/%s/git/refs/%s/%s",
138 |                       info$repo, type, ref)
139 |   response <- httr::GET(endpoint, datastorr_auth(info$private))
140 |   httr::stop_for_status(response)
141 |   httr::content(response)
142 | }
142 | 
143 | ## Fetch the git commit object `sha` of info$repo.
144 | github_api_commit <- function(info, sha) {
145 |   endpoint <- sprintf("https://api.github.com/repos/%s/git/commits/%s",
146 |                       info$repo, sha)
147 |   response <- httr::GET(endpoint, datastorr_auth(info$private))
148 |   github_api_catch_error(response)
149 |   httr::content(response)
150 | }
150 | 
151 | ## Stop with an informative message when response `r` has an HTTP
152 | ## status of 300 or above; otherwise return NULL invisibly.  `message`,
153 | ## when given, is prefixed to the error text.
154 | github_api_catch_error <- function(r, message = NULL) {
155 |   code <- httr::status_code(r)
156 |   ## ">= 300" agrees with the checks in download_file and
157 |   ## github_api_release_info.
158 |   if (code < 300L) {
159 |     return(invisible(NULL))
160 |   }
161 |   x <- httr::content(r)
162 |   msg <- NULL
163 |   ## A 422 response is expected to carry a list of errors; fall back
164 |   ## to the generic status text when the body has none.
165 |   if (code == 422L && is.list(x) && length(x$errors) > 0L) {
166 |     e <- x$errors[[1]]
167 |     ## gsub: turn every underscore of the error code into a space.
168 |     msg <- paste(e$resource, gsub("_", " ", e$code, fixed = TRUE))
169 |     if (!is.null(x$message)) {
170 |       msg <- sprintf("%s (%s)", msg, x$message)
171 |     }
172 |   }
173 |   if (is.null(msg)) {
174 |     msg <- httr::http_status(r)$message
175 |   }
176 |   if (!is.null(message)) {
177 |     msg <- sprintf("%s: %s", message, msg)
178 |   }
179 |   stop(msg, call. = FALSE)
180 | }
170 | 
171 | ## Return the zipball URL recorded for release `version` of `repo`;
172 | ## stops if the release is not in the release list.
173 | github_api_source_url <- function(version, repo, private) {
174 |   releases <- github_api_cache(private)$get(repo)
175 |   release <- releases[[strip_v(version)]]
176 |   if (is.null(release)) {
177 |     stop("No such release ", version)
178 |   }
179 |   release$zipball_url
180 | }
180 | 
181 | ## Return the download URL of one file attached to release `version`.
182 | ##
183 | ## `filename` may be NULL when the release has exactly one attached
184 | ## file.  Otherwise an attached file with exactly that name is used;
185 | ## failing that, `filename` is treated as a fixed substring that must
186 | ## match exactly one attached file.  Stops if the release or the file
187 | ## cannot be identified.
188 | github_api_release_url <- function(version, filename, repo, private) {
189 |   dat <- github_api_cache(private)$get(repo)
190 |   x <- dat[[strip_v(version)]]
191 |   if (is.null(x)) {
192 |     stop("No such release ", version)
193 |   }
194 |   files <- vcapply(x$assets, "[[", "name")
195 |   if (is.null(filename)) {
196 |     if (length(files) == 1L) {
197 |       i <- 1L
198 |     } else {
199 |       stop("Multiple files not yet handled and no filename given")
200 |     }
201 |   } else {
202 |     ## An exact name wins, so "data.csv" is found even when
203 |     ## "data.csv.gz" is attached as well.
204 |     i <- which(files == filename)
205 |     if (length(i) == 0L) {
206 |       ## grepl() is vectorised and copes with an empty `files`.
207 |       i <- which(grepl(filename, files, fixed = TRUE))
208 |     }
209 |     if (length(i) == 0L) {
210 |       stop(sprintf("File %s not found in release (did find: %s)",
211 |                    filename, paste(files, collapse = ", ")))
212 |     }
213 |     if (length(i) > 1L) {
214 |       stop(sprintf("File %s could not be resolved in release (matches: %s)",
215 |                    filename, paste(files[i], collapse = ", ")))
216 |     }
217 |   }
218 | 
219 |   if (private) {
220 |     ## https://stackoverflow.com/a/35688093
221 |     ## NOTE(review): GitHub has deprecated passing access_token as a
222 |     ## query parameter - confirm this still works; header based
223 |     ## authentication would need support in download_file.
224 |     token <- datastorr_auth(private, token_only = TRUE)
225 |     url <- sprintf("%s?access_token=%s", x$assets[[i]]$url, token)
226 |   } else {
227 |     url <- x$assets[[i]]$browser_download_url
228 |   }
229 |   url
230 | }
223 | 
224 | 
225 | ## Consistently deal with a leading "v": it is removed wherever it
226 | ## directly precedes a version number, so that vx.y.z matches x.y.z
227 | ## and vice versa.  Pretty strict matching though.  Values wrapped in
228 | ## I() are returned untouched.
229 | strip_v <- function(x) {
230 |   if (!inherits(x, "AsIs")) {
231 |     x <- sub("^v([0-9]+([-_.][0-9]+){0,2})", "\\1", x)
232 |   }
233 |   x
234 | }
235 | ## Prefix "v" to every element that consists solely of a version
236 | ## number with one to three components; anything else, and any value
237 | ## wrapped in I(), is returned unchanged.
238 | add_v <- function(x) {
239 |   if (inherits(x, "AsIs")) {
240 |     return(x)
241 |   }
242 |   bare <- grepl("^([0-9]+([-_.][0-9]+){0,2})$", x)
243 |   x[bare] <- paste0("v", x[bare])
244 |   x
245 | }
242 | 
243 | ## Return `x` without its NULL elements; names and order are kept.
244 | drop_null <- function(x) {
245 |   Filter(Negate(is.null), x)
246 | }
246 | 


--------------------------------------------------------------------------------
/R/github_auth.R:
--------------------------------------------------------------------------------
  1 | ##' Authentication for accessing GitHub.  This will first look for a
  2 | ##' GitHub personal token (stored in the \code{DATASTORR_TOKEN},
  3 | ##' \code{GITHUB_TOKEN} or \code{GITHUB_PAT} environment variables,
  4 | ##' checked in that order), and then try authenticating with OAuth.
  5 | ##'
  6 | ##' Run \code{datastorr_auth(required = TRUE)} with none of those
  7 | ##' variables set to force setting up authentication with OAuth.
  8 | ##' Alternatively, run \code{setup_github_token} to set up a personal
  9 | ##' access token.  Either can be revoked at any time: see
 10 | ##' \url{https://github.com/settings/tokens} to revoke a personal
 11 | ##' access token and \url{https://github.com/settings/applications} to
 12 | ##' revoke the OAuth token.
 13 | ##'
 14 | ##' @title datastorr/GitHub authentication
 15 | ##' @param required Is authentication required?  Reading from public
 16 | ##'   repositories does not require authentication so there's no point
 17 | ##'   worrying if we can't get it.  datastorr will set this when
 18 | ##'   appropriate internally.
 19 | ##' @param key,secret The application key and secret.  If \code{NULL},
 20 | ##'   uses datastorr's key.  But if you have your own application feel
 21 | ##'   free to replace these with your own.
 22 | ##' @param cache Logical, indicating whether we should cache the
 23 | ##'   token.  If \code{TRUE}, the token will be cached at
 24 | ##'   \code{\link{datastorr_auth}()}, so that it is accessible to all
 25 | ##'   datastorr usages.  Note that this is affected by the
 26 | ##'   \code{datastorr.path} global option.  Alternatively, set
 27 | ##'   \code{FALSE} to do no caching and be prompted each session or a
 28 | ##'   string to choose your own filename.  Or set the
 29 | ##'   \code{GITHUB_TOKEN} or \code{GITHUB_PAT} environment variables
 30 | ##'   to use a token rather than OAuth.
 31 | ##' @param token_only Return the bare token string rather than an
 32 | ##'   object to pass to httr; OAuth is never attempted in this case.
 33 | ##' @return Invisibly, the authentication object (or the token string
 34 | ##'   if \code{token_only} is \code{TRUE}), or \code{NULL} if none was
 35 | ##'   found and authentication is not \code{required}.
 36 | ##' @export
 37 | datastorr_auth <- function(required = FALSE, key = NULL, secret = NULL,
 38 |                            cache = TRUE, token_only = FALSE) {
 39 |   token <- github_token(token_only)
 40 |   ## Only go out to OAuth if required:
 41 |   if (!token_only && required && is.null(token)) {
 42 |     token <- datastorr_oauth(key, secret, cache)
 43 |   }
 44 |   if (required && is.null(token)) {
 45 |     ## Point at this function's own help page, the one that exists.
 46 |     stop("GitHub token not found; please see ?datastorr_auth")
 47 |   }
 48 |   invisible(token)
 49 | }
 45 | 
 46 | ## NOTE: also using GITHUB_PAT because that's what devtools uses so
 47 | ## might be able to piggy back of that in some cases, but starting
 48 | ## with GITHUB_TOKEN because it's more self explanatory and Hadley
 49 | ## also uses that in the httr "Best practices" document.
 50 | ##
 51 | ## My token doesn't seem to have the right scope at present so
 52 | ## temporarily expandsing this a bit.
 53 | github_token <- function(token_only = FALSE) {
 54 |   token <- Sys.getenv("DATASTORR_TOKEN",
 55 |                       Sys.getenv("GITHUB_TOKEN",
 56 |                                  Sys.getenv("GITHUB_PAT", "")))
 57 |   if (token == "") {
 58 |     NULL
 59 |   } else if (token_only) {
 60 |     token
 61 |   } else {
 62 |     httr::authenticate(token, "x-oauth-basic", "basic")
 63 |   }
 64 | }
 65 | 
 66 | datastorr_oauth <- function(key = NULL, secret = NULL, cache = TRUE) {
 67 |   if (is.null(key)) {
 68 |     key <- "d6da716e8eabccb6e3db"
 69 |     secret <- "4e83b024b12bb249f1052cfb1c259bd3baa5e672"
 70 |   }
 71 |   if (isTRUE(unname(cache))) {
 72 |     ## Here, we might want to consider trying both the option with and
 73 |     ## without the options for datastorr.path because if an option is
 74 |     ## set we don't want to have to redo the auth just for that
 75 |     ## application?
 76 |     cache <- file.path(datastorr_path(), "httr-oath")
 77 |     dir.create(dirname(cache), FALSE, TRUE)
 78 |   }
 79 |   endpoint <- httr::oauth_endpoints("github")
 80 |   app <- httr::oauth_app("github/datastorr", key = key, secret = secret)
 81 |   token <- httr::oauth2.0_token(endpoint, app, scope = "repo", cache = cache)
 82 |   httr::config(token = token)
 83 | }
 84 | 
 85 | ##' @export
 86 | ##' @rdname datastorr_auth
 87 | ##' @param path Path to environment file; the default is the user
 88 | ##'   environment variable file which is usually a good choice.
 89 | setup_github_token <- function(path = "~/.Renviron") {
 90 |   if (file.exists(path)) {
 91 |     dat <- readLines(path)
 92 |     if (any(grepl("^\\s*GITHUB_TOKEN\\s*=[A-Za-z0-9]+\\s*$", dat))) {
 93 |       message("Your GitHub token is set!")
 94 |       return(invisible())
 95 |     } else {
 96 |       message("Did not find GitHub token in ", path)
 97 |     }
 98 |   }
 99 | 
100 |   message("In the page that will open:")
101 |   message("  1. add a description (e.g. your computer name)")
102 |   message("  2. click 'Generate token'")
103 |   message("  3. copy the token or click the 'copy' button")
104 |   message("  4. close the window and come back to R")
105 |   if (!prompt_confirm()) {
106 |     stop("Cancelling", call. = FALSE)
107 |   }
108 |   utils::browseURL("https://github.com/settings/tokens/new")
109 | 
110 |   message("  5. paste your token in below and press return")
111 |   token <- readline("GITHUB_TOKEN = ")
112 |   prompt <- sprintf("Add token %s to '%s'?",
113 |                     sub("^(...).*(...)$", "\\1...\\2", token), path)
114 |   if (nchar(token) == 0L || !prompt_confirm(prompt)) {
115 |     stop("Cancelling", call. = FALSE)
116 |   }
117 | 
118 |   environ <- c("# Added by datastorr:", paste0("GITHUB_TOKEN=", token))
119 |   if (file.exists(path)) {
120 |     environ <- c(readLines(path), environ)
121 |   }
122 |   writeLines(environ, path)
123 |   Sys.setenv(GITHUB_TOKEN = token)
124 | }
125 | 


--------------------------------------------------------------------------------
/R/github_release.R:
--------------------------------------------------------------------------------
  1 | ##' Create a github release for your package.  This tries very hard to
  2 | ##' do the right thing but it's not always straightforward.  It first
  3 | ##' looks for your package.  Then it will work out what your last
  4 | ##' commit was (if \code{target} is NULL), the version of the package
  5 | ##' (from the DESCRIPTION).  It then creates a release on GitHub with
  6 | ##' the appropriate version number and uploads the file
  7 | ##' \code{filename} to the release.  The version number in the
  8 | ##' DESCRIPTION must be greater than the highest version number on
  9 | ##' GitHub.
 10 | ##'
 11 | ##' This function requires a system git to be installed and on the
 12 | ##' path.  The version does not have to be particularly recent.
 13 | ##'
 14 | ##' This function also requires the \code{GITHUB_TOKEN} environment
 15 | ##' variable to be set, and for the token to be authorised to have
 16 | ##' write access to your repositories.
 17 | ##'
 18 | ##' @title Create a github release
 19 | ##'
 20 | ##' @param info Result of running \code{github_release_info}
 21 | ##'
 22 | ##' @param description Optional text description for the release.  If
 23 | ##'   this is omitted then GitHub will display the commit message from
 24 | ##'   the commit that the release points at.
 25 | ##'
 26 | ##' @param filenames Filename to upload; optional if in \code{info}.
 27 | ##'   If listed in \code{info}, \code{filename} can be different but
 28 | ##'   the file will be renamed to \code{info$filename} on uploading.
 29 | ##'   If given but not in \code{info}, the uploaded file will be
 30 | ##'   \code{basename(filename)} (i.e., the directory will be
 31 | ##'   stripped).
 32 | ##'
 33 | ##' @param target Target of the release.  This can be either the name
 34 | ##'   of a branch (e.g., \code{master}, \code{origin/master}),
 35 | ##'   existing tag \emph{without a current release} or an SHA of a
 36 | ##'   commit.  It is an error if the commit that this resolves to
 37 | ##'   locally is not present on GitHub (e.g., if your branch is ahead
 38 | ##'   of GitHub).  Push first!
 39 | ##'
 40 | ##' @param ignore_dirty Ignore non-checked in files?  By default, your
 41 | ##'   repository is expected to be in a clean state, though files not
 42 | ##'   known to git are ignored (as are files that are ignored by git).
 43 | ##'   But you must have no uncommited changes or staged but uncommited
 44 | ##'   files.
 45 | ##'   
 46 | ##' @param  binary Arguement to determine whether to upload binaries, 
 47 | ##'   or default to the source code generated in a version. If \code{binary}
 48 | ##'   is \code{FALSE}, users can only pull \code{Source.zip} for that
 49 | ##'   particular version.
 50 | ##'
 51 | ##' @param yes Skip the confirmation prompt?  Only prompts if
 52 | ##'   interactive.
 53 | ##'
 54 | ##' @export
 55 | github_release_create <- function(info, description = NULL, filenames = NULL,
 56 |                                   target = NULL, ignore_dirty = FALSE, binary = TRUE, 
 57 |                                   yes = !interactive()) {
 58 |   if(binary) {
 59 |     if (is.null(filenames)) {
 60 |       if (is.null(info$filenames)) {
 61 |         stop("list of filenames must be given")
 62 |       }
 63 |       filenames <- info$filenames
 64 |     }
 65 |     ## resolve abbreviated filenames 
 66 |     resolved_filenames <- verify_files(filenames)
 67 |     info$filenames <- resolved_filenames
 68 |     fill_info_files(info, resolved_filenames)
 69 |   }
 70 | 
 71 |   dat <- github_release_package_info(info, target)
 72 | 
 73 |   github_release_create_(info, dat, resolved_filenames, binary, version, description,
 74 |                          ignore_dirty, yes)
 75 | }
 76 | 
 77 | github_release_create_ <- function(info, dat, filename, binary, version, description,
 78 |                                    ignore_dirty, yes) {
 79 |   if(binary) {
 80 |     
 81 |     ftarget <- if (is.null(info$filename)) basename(filename) else info$filename
 82 |     
 83 |     ## TODO: will this fail in the case where info$filename is null?
 84 |     msg_file <- sprintf("  file: %s (as %s) %.2f KB", filename, ftarget,
 85 |                         file.info(filename)$size / 1024)
 86 |   }
 87 |   
 88 |   github_release_preflight(dat, ignore_dirty)
 89 | 
 90 |   ## This is the complicated bit of the message; enough context to
 91 |   ## know if the message looks good.
 92 |   msg_at <- c("  at:",
 93 |               paste0("    sha: ", dat$sha_remote$sha),
 94 |               paste0("    date: ", dat$sha_remote$committer$date),
 95 |               paste0("    message: ",
 96 |                      paste(dat$sha_remote$message, collapse = "\n")),
 97 |               paste0("    by: ",
 98 |                      sprintf("%s <%s>",
 99 |                      dat$sha_remote$committer$name,
100 |                      dat$sha_remote$committer$email)))
101 | 
102 |   version <- add_v(dat$version_local)
103 |   target <- dat$sha_local
104 | 
105 |   message("Will create release:")
106 |   message("  tag: ", version)
107 |   message(paste(msg_at, collapse = "\n"))
108 |   if(binary) message(msg_file)
109 |   message("  description: ",
110 |           if (is.null(description)) "(no description)" else description)
111 | 
112 |   if (!yes && !prompt_confirm()) {
113 |     stop("Not creating release")
114 |   }
115 | 
116 |   ret <- github_api_release_create(info, version, description, target)
117 |   
118 |   ## TODO: loop this to upload multiple files
119 |   if(binary) {
120 |     asset = list()
121 |     for(index in 1:length(filename)) {
122 |       asset[index] <- list(github_api_release_upload(info, version,filename[index], info$filename[index]))
123 |     }
124 |     ret$assets <- asset
125 |   } 
126 | 
127 |   message("Created release!")
128 |   message("Please check the page to make sure everything is OK:\n",
129 |           ret$html_url)
130 |   if (interactive() && !yes && prompt_confirm("Open in browser?")) {
131 |     utils::browseURL(ret$html_url)
132 |   }
133 |   invisible(ret)
134 | }
135 | 
## Gather everything needed to decide whether a release can be made:
## the local and remote versions (both as given and as numeric_version),
## the local sha and the matching commit on GitHub (NULL if it cannot be
## fetched), and the state of the working tree.
##
## If `version` is NULL it is read from the DESCRIPTION of the package
## containing the working directory; if `sha_local` is NULL the current
## HEAD is used, otherwise it is resolved with `git rev-parse`.
github_release_package_info <- function(info, sha_local = NULL,
                                        version = NULL) {
  ## A system git is used here; git2r would be the alternative.
  git <- Sys.which("git")
  if (git == "") {
    stop("Need a system git to create releases: http://git-scm.com")
  }

  version_local <- version
  if (is.null(version_local)) {
    toplevel <- system2(git, c("rev-parse", "--show-toplevel"), stdout = TRUE)
    pkg_dir <- find_package_root(toplevel)
    description <- as.list(read.dcf(file.path(pkg_dir, "DESCRIPTION"))[1,])
    version_local <- description$Version
  }
  version_remote <- github_release_version_current(info, FALSE)

  if (is.null(sha_local)) {
    sha_local <- system2(git, c("rev-parse", "HEAD"), stdout = TRUE)
  } else {
    ## Capture git's stderr so it can be included in the error message.
    stderr_file <- tempfile()
    on.exit(file.remove(stderr_file))
    resolved <- suppressWarnings(
      system2(git, c("rev-parse", sha_local), stdout = TRUE,
              stderr = stderr_file))
    exit_status <- attr(resolved, "status", exact = TRUE)
    failed <- !is.null(exit_status) && exit_status != 0L
    if (failed) {
      stop(paste(c("Did not find sha in local git tree: ",
                   readLines(stderr_file)),
                 collapse = "\n"))
    }
    sha_local <- as.character(resolved)
  }

  ## A failed lookup is reported later by the preflight check.
  sha_remote <- tryCatch(github_api_commit(info, sha_local),
                         error = function(e) NULL)

  status <- system2(git, c("status", "--porcelain", "--untracked-files=no"),
                    stdout = TRUE)
  dirty <- length(status) > 0L

  nversion_local <- numeric_version(version_local)
  nversion_remote <-
    if (is.null(version_remote)) NULL else
      numeric_version(strip_v(version_remote))

  list(version_local = version_local,
       version_remote = version_remote,
       nversion_local = nversion_local,
       nversion_remote = nversion_remote,
       sha_local = sha_local,
       sha_remote = sha_remote,
       status = status,
       dirty = dirty)
}
192 | 
## Final checks before creating a release.  Stops if the local commit
## could not be found on the remote, if the working tree is dirty (unless
## `ignore_dirty`), or if the local version is not strictly ahead of the
## remote one.  Returns NULL invisibly when everything is fine.
github_release_preflight <- function(dat, ignore_dirty = FALSE) {
  if (is.null(dat$sha_remote)) {
    stop(sprintf("Could not resolve sha %s on remote", dat$sha_local))
  }

  blocked_by_dirty_tree <- dat$dirty && !ignore_dirty
  if (blocked_by_dirty_tree) {
    lines <- c("Local git is dirty (untracked files ignored):", dat$status)
    stop(paste(lines, collapse = "\n"))
  }

  has_remote_version <- !is.null(dat$nversion_remote)
  if (has_remote_version && dat$nversion_remote >= dat$nversion_local) {
    stop(sprintf("Local version (%s) is not ahead of remote version (%s)",
                 dat$version_local, dat$version_remote))
  }

  invisible(NULL)
}
210 | 


--------------------------------------------------------------------------------
/R/simple.R:
--------------------------------------------------------------------------------
  1 | ## This is the simple interface.  The simplest thing to do is to
  2 | ## assume the same github interface for now.  I like that because it's
  3 | ## really simple but another, even simpler, approach would be to store
  4 | ## pointers somewhere and grab files from there.  To some degree that
  5 | ## can be done more efficiently just with storr though.
  6 | ##
  7 | ## TODO: having this support OKFN data packages would seem preferable.
  8 | ## But I don't know that they have enough of this information in it.
  9 | 
 10 | ##' Create a lightweight datastorr interface (rather than using the
 11 | ##' full package approach).  This approach is designed for the
 12 | ##' "files that don't fit in git" use-case.
 13 | ##'
 14 | ##' Note that the package approach is likely to scale better; in
 15 | ##' particular it allows for the reading function to be arbitrarily
 16 | ##' complicated, allows for package installation and loading, etc.
 17 | ##' With this simple interface you will need to document your
 18 | ##' dependencies carefully.  But it does remove the requirement for
 19 | ##' making a package and will likely work pretty well as part of an
 20 | ##' analysis pipeline where your dependencies are well documented
 21 | ##' anyway.
 22 | ##' @title Fetch data from a datastorr repository
 23 | ##' @param repo Either a github repo in the form
 24 | ##'   \code{<username>/<repo>} (e.g.,
 25 | ##'   \code{"richfitz/data"} or the path to a json file
 26 | ##'   on your filesystem.
 27 | ##' @param path The path to store the data at.  Using \code{NULL} will
 28 | ##' @param metadata The name of the metadata file within the repo (if
 29 | ##'   \code{repo} refers to a github repo.  The default is
 30 | ##'   \code{datastorr.json} at the root of the repository, but any
 31 | ##'   other filename can be used.
 32 | ##' @param branch The branch in the repo to use.  Default is
 33 | ##'   \code{master}.
 34 | ##' @param private A logical indicating if the repository is private
 35 | ##'   and therefor if authentication will be needed to access it.
 36 | ##' @param refetch Refetch the metadata file even if it has already
 37 | ##'   been downloaded previously.
 38 | ##' @param version Which version to download (if \code{extended} is
 39 | ##'   \code{FALSE} -- the default).  By default the most recent
 40 | ##'   version on the remote, or the current version locally will be
 41 | ##'   fetched.
 42 | ##' @param extended Don't fetch the data, but instead return an object
 43 | ##'   that can query data, versions, etc.
 44 | ##' @export
 45 | ##' @examples
 46 | ##' \dontrun{
 47 | ##' path <- tempfile()
 48 | ##' dat <- datastorr("richfitz/data", path, extended = TRUE)
 49 | ##' dat$list()
 50 | ##' dat()
 51 | ##' }
 52 | datastorr <- function(repo, path = NULL,
 53 |                       metadata = "datastorr.json", branch = "master",
 54 |                       private = FALSE, refetch = FALSE,
 55 |                       version = NULL, extended = FALSE) {
 56 |   info <- datastorr_info(repo, path, metadata, branch, private, refetch)
 57 |   obj <- R6_datastorr$new(info)
 58 |   if (extended) {
 59 |     if (!is.null(version)) {
 60 |       warning("Ignoring argument 'version'")
 61 |     }
 62 |     obj
 63 |   } else {
 64 |     version <- version %||% obj$version_current()
 65 |     if (is.null(version)) {
 66 |       stop(sprintf("No versions found at '%s'", info$repo))
 67 |     }
 68 |     obj$get(version)
 69 |   }
 70 | }
 71 | 
 72 | 
 73 | ##' @param ... Arguments passed through to \code{datastorr}
 74 | ##' @param local Return information on local versions?
 75 | ##' @export
 76 | ##' @rdname datastorr
 77 | datastorr_versions <- function(..., local = TRUE) {
 78 |   datastorr(..., extended = TRUE)$versions(local)
 79 | }
 80 | 
 81 | 
 82 | ##' Create a relase for a simple datastorr (i.e., non-package based).
 83 | ##'
 84 | ##' @title Release data to a datastorr repository
 85 | ##'
 86 | ##' @inheritParams datastorr
 87 | ##'
 88 | ##' @param version A version number for the new version.  Should be of
 89 | ##'   the form x.y.z, and may or may not contain a leading "v" (one
 90 | ##'   will be added in any case).
 91 | ##'
 92 | ##' @param description Optional text description for the release.  If
 93 | ##'   this is omitted then GitHub will display the commit message from
 94 | ##'   the commit that the release points at.
 95 | ##'
 96 | ##' @param filename Filename to upload; optional if in
 97 | ##'   \code{datastorr.json}.  If listed, \code{filename} can be
 98 | ##'   different but the file will be renamed on uploading.  If given
 99 | ##'   but not in \code{info}, the uploaded file will be
100 | ##'   \code{basename(filename)} (i.e., the directory will be
101 | ##'   stripped).
102 | ##'
103 | ##' @param target The SHA or tag to attach the release to.  By
104 | ##'   default, will use the current HEAD, which is typically what you
105 | ##'   want to do.
106 | ##'
107 | ##' @param ignore_dirty Ignore non-checked in files?  By default, your
108 | ##'   repository is expected to be in a clean state, though files not
109 | ##'   known to git are ignored (as are files that are ignored by git).
110 | ##'   But you must have no uncommited changes or staged but uncommited
111 | ##'   files.
112 | ##'
113 | ##' @param yes Skip the confirmation prompt?  Only prompts if
114 | ##'   interactive.
115 | ##' @export
116 | release <- function(repo, version, description = NULL, filename = NULL,
117 |                     path = NULL, metadata = "datastorr.json",
118 |                     branch = "master", private = FALSE, refetch = FALSE,
119 |                     target = NULL, ignore_dirty = FALSE,
120 |                     yes = !interactive()) {
121 |   info <- datastorr_info(repo, path, metadata, branch, private, refetch)
122 |   if (is.null(filename)) {
123 |     filename <- info$filename
124 |     if (is.null(filename)) {
125 |       stop("filename must be given (as is not included in json)")
126 |     }
127 |   }
128 | 
129 |   dat <- github_release_package_info(info, target, version)
130 |   github_release_create_(info, dat, filename, version, description,
131 |                          ignore_dirty, yes)
132 | }
133 | 
## Build the info object for a simple datastorr.
##
## If `repo` names an existing file it is read directly as the metadata
## file.  Otherwise `repo` is treated as a GitHub repository: the
## metadata file is downloaded from `branch` and the parsed result is
## cached in a storr at `path`, so later calls need no download unless
## `refetch` is TRUE.
datastorr_info <- function(repo, path = NULL, metadata = "datastorr.json",
                           branch = "master", private = FALSE,
                           refetch = FALSE) {
  if (file.exists(repo)) {
    info <- read_metadata(repo, NULL, path)
    if (private && is.null(info$private)) {
      info$private <- TRUE
    }
    return(info)
  }

  if (is.null(path)) {
    check_repo(repo)
    path <- datastorr_path(repo)
  }
  ## TODO: in the case of non-NULL path, consider stuffing the
  ## metadata into the storr above (so that things are self
  ## contained) but into a different namespace (e.g. metadata).
  ##
  ## TODO: add support for a options() path for storing file at.
  cache <- storr::storr_rds(path, default_namespace = "datastorr")
  if (cache$exists("info") && !refetch) {
    return(cache$get("info"))
  }

  url <- sprintf("https://raw.githubusercontent.com/%s/%s/%s",
                 repo, branch, metadata)
  tmp <- download_file(url, datastorr_auth(private))
  on.exit(file.remove(tmp))
  info <- read_metadata(tmp, repo, path)
  info$private <- private
  cache$set("info", info)
  info
}
167 | 
## Read and validate a datastorr metadata (json) file and turn it into
## an info object via github_release_info().
##
## `filename` is the json file to read.  `repo` is used when the
## metadata does not name a repository itself, and `path` is passed on
## as the storage location.
##
## The metadata must contain `read`; `repo`, `filename`, `private` and
## `args` are optional and anything else is an error.
read_metadata <- function(filename, repo = NULL, path = NULL) {
  req <- c("read")
  valid <- union(req, c("repo", "filename", "private", "args"))

  info <- jsonlite::fromJSON(filename)
  err <- setdiff(req, names(info))
  if (length(err) > 0L) {
    stop("Missing required fields in metadata file: ",
         paste(err, collapse = ", "))
  }
  err <- setdiff(names(info), valid)
  if (length(err) > 0L) {
    stop("Unexpected data in metadata file: ", paste(err, collapse = ", "))
  }

  if (is.null(info$repo)) {
    if (is.null(repo)) {
      stop("repo must be supplied if not present in metadata")
    }
    info$repo <- repo
  }

  if (is.null(info$private)) {
    info$private <- FALSE
  } else {
    p <- info$private
    if (!(length(p) == 1L && is.logical(p) && !is.na(p))) {
      stop("Expected non-NA scalar logical for private")
    }
  }

  ## SECURITY: this is fundamentally dangerous because it evaluates code
  ## that may have come straight from the internet.  The check below
  ## only restricts the *shape* of the expression (a symbol, a `::`
  ## reference or a function definition); it does not restrict what a
  ## function defined this way does once it is called.
  expr <- parse(text = info$read, keep.source = FALSE)
  fn_def <- function(x) {
    is.name(x) || (
      is.recursive(x) && (
        identical(x[[1L]], quote(`function`)) ||
        identical(x[[1L]], quote(`::`))))
  }
  ok <- length(expr) == 1L && fn_def(expr[[1L]])
  if (!ok) {
    stop("`read` must be a function definition or symbol")
  }
  read <- eval(expr, envir = .GlobalEnv)

  ## The other way of doing this is to store:
  ##
  ##   "read": "function(x) read.csv(x, stringsAsFactors = TRUE)"
  ##
  ## which evaluates to a function with all the right bits bound.
  if ("args" %in% names(info)) {
    ## Keep the underlying reader under its own name: the wrapper must
    ## call *that*, not `read`, which by the time it runs refers to the
    ## wrapper itself.
    read_fun <- read
    args <- info$args
    read <- function(x) do.call(read_fun, c(list(x), args))
  }

  github_release_info(info$repo, read, info$private,
                      info$filename, path)
}
228 | 
## Check that `repo` is a single string of the form <username>/<repo>:
## exactly one "/" with a non-empty part on either side.  Stops with an
## informative error otherwise; returns NULL invisibly on success.
check_repo <- function(repo) {
  if (length(repo) != 1L) {
    stop("Expected a scalar for 'repo'")
  }
  ## Non-character and missing values are rejected here rather than
  ## producing an opaque error from the string matching below.
  ok <- is.character(repo) && !is.na(repo) &&
    grepl("^[^/]+/[^/]+$", repo)
  if (!ok) {
    stop("Expected a string of form <username>/<repo> for 'repo'")
  }
  invisible(NULL)
}
238 | 
##' Location of datastorr files.  This is determined by
##' \code{rappdirs} using the \code{user_data_dir} function.
##' Alternatively, if the option \code{datastorr.path} is set, that is
##' used for the base path.  The path to data from an actual repo is
##' stored in a subdirectory under this directory.
##'
##' Files in this directory can be deleted at will (e.g., running
##' \code{unlink(datastorr_path(), recursive = TRUE)} will delete all
##' files that datastorr has ever downloaded).  The only issue here is
##' that the OAuth token (used to authenticate with GitHub) is also
##' stored in this directory.
##'
##' @title Location of datastorr files
##'
##' @param repo An optional repo (of the form \code{user/repo}, though
##'   this is not checked).
##'
##' @export
datastorr_path <- function(repo = NULL) {
  base <- getOption("datastorr.path")
  if (is.null(base)) {
    base <- rappdirs::user_data_dir("datastorr")
  }
  if (is.null(repo)) {
    return(base)
  }
  file.path(base, repo)
}
261 | 


--------------------------------------------------------------------------------
/R/utils.R:
--------------------------------------------------------------------------------
  1 | vcapply <- function(X, FUN, ...) {
  2 |   vapply(X, FUN, character(1), ...)
  3 | }
  4 | 
  5 | stop_quietly <- function() {
  6 |   opt <- options(show.error.messages = FALSE)
  7 |   on.exit(options(opt))
  8 |   stop()
  9 | }
 10 | 
 11 | assert_function <- function(x, name = deparse(substitute(x))) {
 12 |   if (!is.function(x)) {
 13 |     stop(sprintf("%s must be a function", name), call. = FALSE)
 14 |   }
 15 | }
 16 | 
 17 | assert_file <- function(filename) {
 18 |   if(!file.exists(filename)) {
 19 |     stop(sprintf("%s doesn't exist or cannot be found", filename, call. = FALSE))
 20 |   }
 21 | }
 22 | 
 23 | verify_files <- function(files) {
 24 |   ## Search through current working directory to resolve filename
 25 |   local_files_dir <- list.files(path=".")
 26 |   verified_filenames <- c()
 27 |  
 28 |   for(filename in files) {
 29 |     local_files_dir <- list.files(path=".")
 30 |     resolved_filename <- local_files_dir[grepl(filename, local_files_dir )]
 31 |     
 32 |     if (length(resolved_filename) != 1) {
 33 |       stop(paste0("Using file keyword \"", filename, "\" resolved none or multiple filenames.
 34 |                   Please ensure that your file keyword matches exactly ONE filename in your working directory."))
 35 |     } else { 
 36 |       message(paste0("Matched keyword ", filename, " to ", resolved_filename))
 37 |     } 
 38 |     
 39 |     if (interactive() && !prompt_confirm(paste0("Upload ", resolved_filename, "?"))) {
 40 |       message("Stopping release")
 41 |       stop_quietly()
 42 |     }
 43 |     
 44 |     assert_file(resolved_filename)
 45 |     verified_filenames <- c(verified_filenames, resolved_filename)
 46 |   }
 47 |   
 48 |   verified_filenames 
 49 | }
 50 | 
 51 | fill_info_files <- function(info, filenames) {
 52 |   info$filenames <- filenames
 53 |   
 54 |   for(filename in info$filenames) {
 55 |     if (grepl("/", filename, fixed = TRUE)) {
 56 |       stop("Expected path-less info$filename")
 57 |     }
 58 |   }
 59 | }
 60 | 
 61 | create_version_filename = function(version, filename) {
 62 |   paste0(version, "_", filename)
 63 | }
 64 | 
 65 | get_version_regex <- function(version) {
 66 |   version_values <- unlist(stringr::str_match_all(version, pattern="\\d"))
 67 |   paste0("^", version_values[1], "\\.", version_values[2], "\\.", version_values[3])
 68 | }
 69 | 
 70 | ## From dide-tools/encryptr:
 71 | prompt_confirm <- function(msg = "continue?", valid = c(n = FALSE, y = TRUE),
 72 |                            default = names(valid)[[1]]) {
 73 |   valid_values <- names(valid)
 74 |   msg <- sprintf("%s [%s]: ", msg,
 75 |                  paste(c(toupper(default), setdiff(valid_values, default)),
 76 |                        collapse = "/"))
 77 |   repeat {
 78 |     x <- trimws(tolower(readline(prompt = msg)))
 79 |     if (!nzchar(x)) {
 80 |       x <- default
 81 |     }
 82 |     if (x %in% valid_values) {
 83 |       return(valid[[x]])
 84 |     } else {
 85 |       cat("Invalid choice\n")
 86 |     }
 87 |   }
 88 | }
 89 | 
 90 | dquote <- function(x) {
 91 |   sprintf('"%s"', x)
 92 | }
 93 | 
 94 | ## Will this work on windows?
 95 | find_package_root <- function(stop_by = "/") {
 96 |   root <- normalizePath(stop_by, mustWork = TRUE)
 97 |   f <- function(path) {
 98 |     if (file.exists(file.path(path, "DESCRIPTION"))) {
 99 |       return(path)
100 |     }
101 |     if (normalizePath(path, mustWork = TRUE) == root) {
102 |       stop("Hit the root without finding a package")
103 |     }
104 |     Recall(file.path("..", path))
105 |   }
106 |   normalizePath(f("."), mustWork = TRUE)
107 | }
108 | 
109 | 
## Null-default operator: `a` unless it is NULL, in which case `b`.
`%||%` <- function(a, b) {
  if (!is.null(a)) {
    return(a)
  }
  b
}
113 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # datastorr
  2 | 
  3 | Simple data retrieval and versioning using GitHub
  4 | 
  5 | This project is described in a [paper](https://peerj.com/preprints/3401v1) by [Daniel Falster](https://github.com/dfalster/), [Rich FitzJohn](https://github.com/richfitz/), [Matt Pennell](https://github.com/mwpennell/), and [Will Cornwell](https://github.com/wcornwell/). Below we describe the motivation and general idea. Please see the paper for full details.
  6 | 
  7 | ## The problem
  8 | 
  9 | Over the last several years, there has been an increasing recognition that data is a first-class scientific product, and a tremendous number of repositories and platforms have been developed to facilitate the storage, sharing, and re-use of data. However, we think there is still an important gap in this ecosystem: platforms for data sharing offer limited functions for distributing and interacting with evolving datasets - those that continue to grow with time as more records are added, errors fixed, and new data structures are created. This is particularly the case for small to medium sized datasets that a typical scientific lab, or collection of labs, might produce.
 10 | 
 11 | In addition to enabling data creators to maintain and share a `living` dataset, ideally, such an infrastructure would enable data users to:
 12 | 
 13 | * Cache downloads, including across R sessions, to make things faster and to work offline
 14 | * Keep track of which versions are downloaded and available remotely
 15 | * Access multiple versions of the data at once; this would be especially helpful if trying to understand why results have changed with the version of the data.
 16 | 
 17 | ## How datastorr helps
 18 | 
 19 | This package can be used in two ways:
 20 | 
 21 | 1. Use data stored elsewhere in R efficiently (e.g., work with csv files that are too large to comfortably fit in git).
 22 | 2. Create another lightweight package designed to allow easy access to your data.
 23 | 
 24 | For both of these use-cases, `datastorr` will store your data using _GitHub releases_ which do not clog up your repository but allow up to 2GB files to be stored (future versions may support things like figshare).
 25 | 
 26 | `datastorr` is concerned about a simple versioning scheme for your data.  If you do not imagine the version changing that should not matter.  But if you work with data that changes (and everyone does eventually) this approach should make it easy to update files.
 27 | 
 28 | From the point of view of a user, using your data could be as simple as:
 29 | 
 30 | ```r
 31 | d <- datastorr::datastorr("richfitz/datastorr.example")
 32 | ```
 33 | 
 34 | (see below for details, how this works, and what it is doing).
 35 | 
 36 | ## End user interface
 37 | 
 38 | See [here](https://github.com/richfitz/datastorr.example) for the aim from the point of view for an end user.
 39 | 
 40 | They would install your package (which contains no data so is nice and
 41 | light and can be uploaded to CRAN).
 42 | 
 43 | ```r
 44 | devtools::install_github("richfitz/datastorr.example")
 45 | ```
 46 | 
 47 | The user can see what versions they have locally
 48 | 
 49 | ```r
 50 | datastorr.example::mydata_versions()
 51 | ```
 52 | 
 53 | and can see what versions are present on GitHub:
 54 | 
 55 | ```r
 56 | datastorr.example::mydata_versions(local=FALSE) # remote
 57 | ```
 58 | 
 59 | To download the most recent dataset:
 60 | 
 61 | ```r
 62 | d <- datastorr.example::mydata()
 63 | ```
 64 | 
 65 | Subsequent calls (even across R sessions) are cached so that the mydata() function is fast enough you can use it in place of the data.
 66 | 
 67 | To get a particular version:
 68 | 
 69 | ```r
 70 | d <- datastorr.example::mydata("0.0.1")
 71 | ```
 72 | 
 73 | Downloads are cached across sessions using `rappdirs`.
 74 | 
 75 | ## Package developer process
 76 | 
 77 | The simplest way is to run the (hidden) function `datastorr:::autogenerate`, as
 78 | 
 79 | 
 80 | ```r
 81 | datastorr:::autogenerate(repo="richfitz/datastorr.example", read="readRDS", name="mydata")
 82 | ```
 83 | 
 84 | which will print to the screen a bunch of code to add to your package.  There will be a vignette explaining this more fully soon.  A file generated in this way can be seen [here](https://github.com/richfitz/datastorr.example/blob/master/R/package.R).
 85 | 
 86 | Once set up, new releases can be made by running, within your package directory:
 87 | 
 88 | ```r
 89 | datastorr.example::mydata_release("description of release", "path/to/file")
 90 | ```
 91 | 
 92 | provided you have your `GITHUB_TOKEN` environment variable set appropriately.  See the vignette for more details.
 93 | 
 94 | ## Installation
 95 | 
 96 | ```r
 97 | devtools::install_github("ropenscilabs/datastorr")
 98 | ```
 99 | 
100 | ## License
101 | 
102 | MIT + file LICENSE © [Rich FitzJohn](https://github.com/richfitz).
103 | 


--------------------------------------------------------------------------------
/inst/template.whisker:
--------------------------------------------------------------------------------
 1 | ##' Download the example data set from {{{repo}}}
 2 | ##'  (\url{https://github.com/{{{repo}}}/})
 3 | ##' @title Download example data set
 4 | ##'
 5 | ##' @param version Version number.  The default will load the most
 6 | ##'   recent version on your computer or the most recent version known
 7 | ##'   to the package if you have never downloaded the data before.
 8 | ##'   With \code{ {{{name}}}_del}, specifying \code{version = NULL} will
 9 | ##'   delete \emph{all} data sets.
10 | ##'
11 | ##' @param path Path to store the data at.  If not given,
12 | ##'   \code{datastorr} will use \code{rappdirs} to find the best place
13 | ##'   to put persistent application data on your system.  You can
14 | ##'   delete the persistent data at any time by running
15 | ##'   \code{ {{{name}}}_del(NULL)} (or \code{ {{{name}}}_del(NULL, path)} if you
16 | ##'   use a different path).
17 | ##'
18 | ##' @export
19 | {{{name}}} <- function(version = NULL, path = NULL) {
20 |   datastorr::github_release_get({{{name}}}_info(path), version)
21 | }
22 | 
23 | ##' @export
24 | ##' @rdname {{{name}}}
25 | ##'
26 | ##' @param local Logical indicating if local or github versions should
27 | ##'   be polled.  With any luck, \code{local = FALSE} is a superset of
28 | ##'   \code{local = TRUE}.  For \code{ {{{name}}}_version_current}, if
29 | ##'   \code{TRUE}, but there are no local versions, then we do check
30 | ##'   for the most recent github version.
31 | ##'
32 | {{{name}}}_versions <- function(local = TRUE, path = NULL) {
33 |   datastorr::github_release_versions({{{name}}}_info(path), local)
34 | }
35 | 
36 | ##' @export
37 | ##' @rdname {{{name}}}
38 | {{{name}}}_version_current <- function(local = TRUE, path = NULL) {
39 |   datastorr::github_release_version_current({{{name}}}_info(path), local)
40 | }
41 | 
42 | ##' @export
43 | ##' @rdname {{{name}}}
44 | {{{name}}}_del <- function(version, path = NULL) {
45 |   datastorr::github_release_del({{{name}}}_info(path), version)
46 | }
47 | 
48 | ## Core data:
49 | {{{name}}}_info <- function(path) {
50 |   datastorr::github_release_info("{{{repo}}}",
51 |                                  filename = {{{filename}}},
52 |                                  read = {{{read}}},
53 |                                  path = path)
54 | }
55 | 
56 | ##' Maintainer-only function for releasing data.  This will look at
57 | ##' the version in the DESCRIPTION file and make a data release if the
58 | ##' GitHub repository contains the same version as we have locally.
59 | ##' Requires the \code{GITHUB_TOKEN} environment variable to be set.
60 | ##'
61 | ##' @title Make a data release.
62 | ##' @param ... Parameters passed through to \code{\link{github_release_create}}
63 | ##' @param path Path to the data (see \code{\link{ {{{name}}} }}).
64 | ##' @export
65 | {{{name}}}_release <- function(..., path = NULL) {
66 |   datastorr::github_release_create({{{name}}}_info(path), ...)
67 | }
68 | 


--------------------------------------------------------------------------------
/man/autogenerate.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/autogenerate.R
 3 | \name{autogenerate}
 4 | \alias{autogenerate}
 5 | \title{Autogenerate a datastorr interface}
 6 | \usage{
 7 | autogenerate(repo, read, filename = NULL, name = basename(repo),
 8 |   roxygen = TRUE)
 9 | }
10 | \arguments{
11 | \item{repo}{Name of the repo on github (in username/repo format)}
12 | 
13 | \item{read}{\emph{name} of a function to read the data.  Do not
14 | give the function itself!}
15 | 
16 | \item{filename}{Name of the file to read.  If not given, then the
17 | single file in a release will be read (but you will need to
18 | provide a filename on upload).  If given, you cannot change the
19 | filename ever as all releases will be assumed to have the same
20 | filename.}
21 | 
22 | \item{name}{Name of the dataset, used in generating the functions.
23 | If omitted the repo name is used.}
24 | 
25 | \item{roxygen}{Include roxygen headers for the functions?}
26 | }
27 | \description{
28 | Autogenerate a datastorr interface for a package.  The idea is to
29 | run this function and save the resulting code in a file in your
30 | package.  Then users will be able to download data and you will be
31 | able to release data easily.
32 | }
33 | \details{
34 | In addition to running this, you will need to add \code{datastorr}
35 | to the \code{Imports:} section of your DESCRIPTION.  To upload
36 | files you will need to set your \code{GITHUB_TOKEN} environment
37 | variable.  These steps will be described more fully in a vignette.
38 | 
39 | More complete instructions:
40 | 
41 | Let \code{pkg} be \code{basename(repo)}; the name of the package
42 | and of the GitHub repository.
43 | 
44 | First, create a new R package, e.g.  \code{devtools::create(pkg)}.
45 | 
46 | Then, copy the result of running \code{autogenerate} into a file
47 | in that package, e.g.
48 | 
49 | \preformatted{   writeLines(autogenerate(repo, read),
50 |              file.path(pkg, "datastorr.R"))
51 |   devtools::document(pkg)
52 | }
53 | 
54 | Create a new git repository for this package, and add all the
55 | files in the package, and commit.
56 | 
57 | On GitHub, create a repository for the package and push your code
58 | there.
59 | 
60 | At this point you are now ready to start making releases by
61 | loading your package and running \code{pkg::<name>_release()}.
62 | }
63 | \examples{
64 | writeLines(autogenerate("richfitz/datastorr.example",
65 |                         read = "readRDS", name = "mydata"))
66 | writeLines(autogenerate("richfitz/datastorr.example",
67 |                         read = "readRDS", name = "mydata",
68 |                         roxygen = FALSE))
69 | }
70 | 


--------------------------------------------------------------------------------
/man/datastorr.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/simple.R
 3 | \name{datastorr}
 4 | \alias{datastorr}
 5 | \alias{datastorr_versions}
 6 | \title{Fetch data from a datastorr repository}
 7 | \usage{
 8 | datastorr(repo, path = NULL, metadata = "datastorr.json",
 9 |   branch = "master", private = FALSE, refetch = FALSE,
10 |   version = NULL, extended = FALSE)
11 | 
12 | datastorr_versions(..., local = TRUE)
13 | }
14 | \arguments{
15 | \item{repo}{Either a github repo in the form
16 | \code{<username>/<repo>} (e.g.,
17 | \code{"richfitz/data"}) or the path to a json file
18 | on your filesystem.}
19 | 
20 | \item{path}{The path to store the data at.  Using \code{NULL} will use the default location given by \code{\link{datastorr_path}}.}
21 | 
22 | \item{metadata}{The name of the metadata file within the repo (if
23 | \code{repo} refers to a github repo).  The default is
24 | \code{datastorr.json} at the root of the repository, but any
25 | other filename can be used.}
26 | 
27 | \item{branch}{The branch in the repo to use.  Default is
28 | \code{master}.}
29 | 
30 | \item{private}{A logical indicating if the repository is private
31 | and therefore if authentication will be needed to access it.}
32 | 
33 | \item{refetch}{Refetch the metadata file even if it has already
34 | been downloaded previously.}
35 | 
36 | \item{version}{Which version to download (if \code{extended} is
37 | \code{FALSE} -- the default).  By default the most recent
38 | version on the remote, or the current version locally will be
39 | fetched.}
40 | 
41 | \item{extended}{Don't fetch the data, but instead return an object
42 | that can query data, versions, etc.}
43 | 
44 | \item{...}{Arguments passed through to \code{datastorr}}
45 | 
46 | \item{local}{Return information on local versions?}
47 | }
48 | \description{
49 | Create a lightweight datastorr interface (rather than using the
50 | full package approach).  This approach is designed for the
51 | "files that don't fit in git" use-case.
52 | }
53 | \details{
54 | Note that the package approach is likely to scale better; in
55 | particular it allows for the reading function to be arbitrarily
56 | complicated, allows for package installation and loading, etc.
57 | With this simple interface you will need to document your
58 | dependencies carefully.  But it does remove the requirement for
59 | making a package and will likely work pretty well as part of an
60 | analysis pipeline where your dependencies are well documented
61 | anyway.
62 | }
63 | \examples{
64 | \dontrun{
65 | path <- tempfile()
66 | dat <- datastorr("richfitz/data", path, extended = TRUE)
67 | dat$list()
68 | dat()
69 | }
70 | }
71 | 


--------------------------------------------------------------------------------
/man/datastorr_auth.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/github_auth.R
 3 | \name{datastorr_auth}
 4 | \alias{datastorr_auth}
 5 | \alias{setup_github_token}
 6 | \title{datastorr/GitHub authentication}
 7 | \usage{
 8 | datastorr_auth(required = FALSE, key = NULL, secret = NULL,
 9 |   cache = TRUE, token_only = FALSE)
10 | 
11 | setup_github_token(path = "~/.Renviron")
12 | }
13 | \arguments{
14 | \item{required}{Is authentication required?  Reading from public
15 | repositories does not require authentication so there's no point
16 | worrying if we can't get it.  datastorr will set this when
17 | appropriate internally.}
18 | 
19 | \item{key, secret}{The application key and secret.  If \code{NULL},
20 | uses datastorr's key.  But if you have your own application feel
21 | free to replace these with your own.}
22 | 
23 | \item{cache}{Logical, indicating whether we should cache the
24 | token.  If \code{TRUE}, the token will be cached at
25 | \code{\link{datastorr_path}()}, so that it is accessible to all
26 | datastorr usages.  Note that this is affected by the
27 | \code{datastorr.path} global option.  Alternatively, set
28 | \code{FALSE} to do no caching and be prompted each session or a
29 | string to choose your own filename.  Or set the
30 | \code{GITHUB_TOKEN} or \code{GITHUB_PAT} environment variables
31 | to use a token rather than OAuth.}
32 | 
33 | \item{token_only}{return the token only}
34 | 
35 | \item{path}{Path to environment file; the default is the user
36 | environment variable file which is usually a good choice.}
37 | }
38 | \description{
39 | Authentication for accessing GitHub.  This will first look for a
40 | GitHub personal token (stored in the \code{GITHUB_TOKEN} or
41 | \code{GITHUB_PAT} environment variables), and then try
42 | authenticating with OAuth.
43 | }
44 | \details{
45 | Run this \code{datastorr_auth} function to force setting up
46 | authentication with OAuth.  Alternatively, run
47 | \code{setup_github_token} to set up a personal access token.
48 | Either can be revoked at any time: visit
49 | \url{https://github.com/settings/tokens} to revoke a personal
50 | access token and \url{https://github.com/settings/applications} to
51 | revoke the OAuth token.
52 | }
53 | 


--------------------------------------------------------------------------------
/man/datastorr_path.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/simple.R
 3 | \name{datastorr_path}
 4 | \alias{datastorr_path}
 5 | \title{Location of datastorr files}
 6 | \usage{
 7 | datastorr_path(repo = NULL)
 8 | }
 9 | \arguments{
10 | \item{repo}{An optional repo (of the form \code{user/repo}, though
11 | this is not checked).}
12 | }
13 | \description{
14 | Location of datastorr files.  This is determined by
15 | \code{rappdirs} using the \code{user_data_dir} function.
16 | Alternatively, if the option \code{datastorr.path} is set, that is
17 | used for the base path.  The path to data from an actual repo is
18 | stored in a subdirectory under this directory.
19 | }
20 | \details{
21 | Files in this directory can be deleted at will (e.g., running
22 | \code{unlink(datastorr_path(), recursive = TRUE)} will delete all
23 | files that datastorr has ever downloaded).  The only issue here is
24 | that the OAuth token (used to authenticate with GitHub) is also
25 | stored in this directory.
26 | }
27 | 


--------------------------------------------------------------------------------
/man/github_release_create.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/github_release.R
 3 | \name{github_release_create}
 4 | \alias{github_release_create}
 5 | \title{Create a github release}
 6 | \usage{
 7 | github_release_create(info, description = NULL, filenames = NULL,
 8 |   target = NULL, ignore_dirty = FALSE, yes = !interactive())
 9 | }
10 | \arguments{
11 | \item{info}{Result of running \code{github_release_info}}
12 | 
13 | \item{description}{Optional text description for the release.  If
14 | this is omitted then GitHub will display the commit message from
15 | the commit that the release points at.}
16 | 
17 | \item{target}{Target of the release.  This can be either the name
18 | of a branch (e.g., \code{master}, \code{origin/master}),
19 | existing tag \emph{without a current release} or an SHA of a
20 | commit.  It is an error if the commit that this resolves to
21 | locally is not present on GitHub (e.g., if your branch is ahead
22 | of GitHub).  Push first!}
23 | 
24 | \item{ignore_dirty}{Ignore non-checked in files?  By default, your
25 | repository is expected to be in a clean state, though files not
26 | known to git are ignored (as are files that are ignored by git).
27 | But you must have no uncommitted changes or staged but uncommitted
28 | files.}
29 | 
30 | \item{yes}{Skip the confirmation prompt?  Only prompts if
31 | interactive.}
32 | 
33 | \item{filename}{Filename to upload; optional if in \code{info}.
34 | If listed in \code{info}, \code{filename} can be different but
35 | the file will be renamed to \code{info$filename} on uploading.
36 | If given but not in \code{info}, the uploaded file will be
37 | \code{basename(filename)} (i.e., the directory will be
38 | stripped).}
39 | }
40 | \description{
41 | Create a github release for your package.  This tries very hard to
42 | do the right thing but it's not always straightforward.  It first
43 | looks for your package.  Then it will work out what your last
44 | commit was (if \code{target} is NULL), the version of the package
45 | (from the DESCRIPTION).  It then creates a release on GitHub with
46 | the appropriate version number and uploads the file
47 | \code{filename} to the release.  The version number in the
48 | DESCRIPTION must be greater than the highest version number on
49 | GitHub.
50 | }
51 | \details{
52 | This function requires a system git to be installed and on the
53 | path.  The version does not have to be particularly recent.
54 | 
55 | This function also requires the \code{GITHUB_TOKEN} environment
56 | variable to be set, and for the token to be authorised to have
57 | write access to your repositories.
58 | }
59 | 


--------------------------------------------------------------------------------
/man/github_release_del.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/github.R
 3 | \name{github_release_del}
 4 | \alias{github_release_del}
 5 | \title{Delete version}
 6 | \usage{
 7 | github_release_del(info, version)
 8 | }
 9 | \arguments{
10 | \item{info}{Result of running \code{github_release_info}}
11 | 
12 | \item{version}{Version to delete.  If \code{NULL} it will delete
13 | the entire storr}
14 | }
15 | \description{
16 | Delete a local copy of a version (or all local copies).  Note that
17 | this does not affect the actual github release in any way!
18 | }
19 | 


--------------------------------------------------------------------------------
/man/github_release_get.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/github.R
 3 | \name{github_release_get}
 4 | \alias{github_release_get}
 5 | \title{Get data}
 6 | \usage{
 7 | github_release_get(info, version = NULL)
 8 | }
 9 | \arguments{
10 | \item{info}{Result of running \code{github_release_info}}
11 | 
12 | \item{version}{Version to fetch.  If \code{NULL} it will get the
13 | current version as returned by
14 | \code{github_release_version_current()}}
15 | }
16 | \description{
17 | Get a version of a data set, downloading it if necessary.
18 | }
19 | 


--------------------------------------------------------------------------------
/man/github_release_info.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/github.R
 3 | \name{github_release_info}
 4 | \alias{github_release_info}
 5 | \title{Github release information}
 6 | \usage{
 7 | github_release_info(repo, read, private = FALSE, filename = NULL,
 8 |   path = NULL)
 9 | }
10 | \arguments{
11 | \item{repo}{Name of the repo in \code{username/repo} format.}
12 | 
13 | \item{read}{Function to read the file.  See Details.}
14 | 
15 | \item{private}{Is the repository private?  If so authentication
16 | will be required for all actions.  Setting this is optional but
17 | will result in better error messages because of the way GitHub
18 | returns not found/404 (rather than forbidden/403) errors when
19 | accessing private repositories without authorisation.}
20 | 
21 | \item{filename}{Optional filename.  If omitted, all files in the
22 | release can be used.  If the filename contains a star ("*") it
23 | will be treated as a filename glob.  So you can do
24 | \code{filename = "*.csv"} to match all csv files (dynamically
25 | computed on each release).}
26 | 
27 | \item{path}{Optional path in which to store the data.  If omitted
28 | we use \code{\link{datastorr_path}} to generate a reasonable
29 | path.}
30 | }
31 | \description{
32 | Information to describe how to process github releases
33 | }
34 | \details{
35 | The simplest case is where the data are stored in a single file
36 | attached to the release (this is different to the zip/tar.gz files
37 | that the web interface displays).  For example, a single csv file.
38 | In that case the filename argument can be safely omitted and we'll
39 | work it out based on the filename.
40 | }
41 | 


--------------------------------------------------------------------------------
/man/github_release_versions.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/github.R
 3 | \name{github_release_versions}
 4 | \alias{github_release_versions}
 5 | \alias{github_release_version_current}
 6 | \title{Get release versions}
 7 | \usage{
 8 | github_release_versions(info, local = TRUE)
 9 | 
10 | github_release_version_current(info, local = TRUE)
11 | }
12 | \arguments{
13 | \item{info}{Result of running \code{github_release_info}}
14 | 
15 | \item{local}{Should we return local (TRUE) or github (FALSE)
16 | version numbers?  Github version numbers are pulled once per
17 | session only.  The exception is for
18 | \code{github_release_version_current} which when given
19 | \code{local = TRUE} will fall back on trying github if there are
20 | no local versions.}
21 | }
22 | \description{
23 | Get release versions
24 | }
25 | \author{
26 | Rich FitzJohn
27 | }
28 | 


--------------------------------------------------------------------------------
/man/release.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/simple.R
 3 | \name{release}
 4 | \alias{release}
 5 | \title{Release data to a datastorr repository}
 6 | \usage{
 7 | release(repo, version, description = NULL, filename = NULL,
 8 |   path = NULL, metadata = "datastorr.json", branch = "master",
 9 |   private = FALSE, refetch = FALSE, target = NULL,
10 |   ignore_dirty = FALSE, yes = !interactive())
11 | }
12 | \arguments{
13 | \item{repo}{Either a github repo in the form
14 | \code{<username>/<repo>} (e.g.,
15 | \code{"richfitz/data"}) or the path to a json file
16 | on your filesystem.}
17 | 
18 | \item{version}{A version number for the new version.  Should be of
19 | the form x.y.z, and may or may not contain a leading "v" (one
20 | will be added in any case).}
21 | 
22 | \item{description}{Optional text description for the release.  If
23 | this is omitted then GitHub will display the commit message from
24 | the commit that the release points at.}
25 | 
26 | \item{filename}{Filename to upload; optional if in
27 | \code{datastorr.json}.  If listed, \code{filename} can be
28 | different but the file will be renamed on uploading.  If given
29 | but not in \code{info}, the uploaded file will be
30 | \code{basename(filename)} (i.e., the directory will be
31 | stripped).}
32 | 
33 | \item{path}{The path to store the data at.  Using \code{NULL} will use the default location given by \code{\link{datastorr_path}}.}
34 | 
35 | \item{metadata}{The name of the metadata file within the repo (if
36 | \code{repo} refers to a github repo).  The default is
37 | \code{datastorr.json} at the root of the repository, but any
38 | other filename can be used.}
39 | 
40 | \item{branch}{The branch in the repo to use.  Default is
41 | \code{master}.}
42 | 
43 | \item{private}{A logical indicating if the repository is private
44 | and therefore if authentication will be needed to access it.}
45 | 
46 | \item{refetch}{Refetch the metadata file even if it has already
47 | been downloaded previously.}
48 | 
49 | \item{target}{The SHA or tag to attach the release to.  By
50 | default, will use the current HEAD, which is typically what you
51 | want to do.}
52 | 
53 | \item{ignore_dirty}{Ignore non-checked in files?  By default, your
54 | repository is expected to be in a clean state, though files not
55 | known to git are ignored (as are files that are ignored by git).
56 | But you must have no uncommitted changes or staged but uncommitted
57 | files.}
58 | 
59 | \item{yes}{Skip the confirmation prompt?  Only prompts if
60 | interactive.}
61 | }
62 | \description{
63 | Create a release for a simple datastorr (i.e., non-package based).
64 | }
65 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(datastorr)
3 | 
4 | test_check("datastorr")
5 | 


--------------------------------------------------------------------------------
/tests/testthat/example.csv:
--------------------------------------------------------------------------------
1 | target,version,dataset,description
2 | 84af21c,1.0.0,mtcars,"First release: have some cars!"
3 | c1ad272,1.0.1,iris,"Changed my mind: here are some flowers"
4 | master,NA,Nile,"Like a river, time flows on"
5 | 


--------------------------------------------------------------------------------
/tests/testthat/helper-dataverse.R:
--------------------------------------------------------------------------------
 1 | skip_if_no_downloads <- function() {  # skip when offline or when downloads are disabled
 2 |   skip_unless_internet()
 3 |   if (nzchar(Sys.getenv("DATASTORR_SKIP_DOWNLOADS"))) {  # any non-empty value disables downloads
 4 |     skip("Skipping downloads")
 5 |   }
 6 |   NULL
 7 | }
 8 | 
 9 | skip_unless_internet <- function() {  # skip the calling test when has_internet() is FALSE
10 |   if (!has_internet()) {
11 |     skip("No internet :(")
12 |   }
13 |   NULL
14 | }
15 | 
16 | skip_if_no_github_token <- function() {  # skip unless downloads are allowed and a token is available
17 |   skip_if_no_downloads()
18 |   if (!inherits(github_token(), "request")) {  # a usable token comes back as a "request" object
19 |     skip("No GITHUB_TOKEN set")
20 |   }
21 |   NULL
22 | }
23 | 
24 | has_internet <- function() {  # TRUE when the lookup of www.google.com returns something non-NULL
25 |   !is.null(suppressWarnings(nsl("www.google.com")))  # NOTE(review): presumably utils::nsl (Unix-only) -- confirm
26 | }
27 | 
28 | ## Test-only helper that deletes every release listed for `info`.  It is
29 | ## a bit savage, so it is kept out of the main package for now:
30 | github_api_delete_all_releases <- function(info, yes = !interactive()) {
31 |   releases <- github_api_releases(info)
32 |   for (i in seq_along(releases)) {
33 |     github_api_release_delete(info, I(releases[[i]]$tag_name), yes)
34 |   }
35 | }
36 | 


--------------------------------------------------------------------------------
/tests/testthat/metadata.json:
--------------------------------------------------------------------------------
1 | {
2 |     "repo": "richfitz/datastorr.example",
3 |     "filename": null,
4 |     "read": "base::readRDS"
5 | }
6 | 


--------------------------------------------------------------------------------
/tests/testthat/test-autogenerate.R:
--------------------------------------------------------------------------------
 1 | context("autogenerate")
 2 | 
 3 | test_that("basic", {
 4 |   with_docs <- autogenerate("richfitz/datastorr.example", "readRDS", name = "mydata")
 5 |   expect_is(with_docs, "character")
 6 |   ## Without roxygen headers the output is a strict, non-empty subset
 7 |   without_docs <- autogenerate("richfitz/datastorr.example", "readRDS",
 8 |                                name = "mydata", roxygen = FALSE)
 9 |   expect_lt(length(without_docs), length(with_docs))
10 |   expect_gt(length(without_docs), 0)
11 |   expect_true(all(without_docs %in% with_docs))
12 |   ## Compare against the file published in the example repository
13 |   skip_if_no_downloads()
14 |   reference <- tempfile()
15 |   download_file("https://raw.githubusercontent.com/richfitz/datastorr.example/master/R/package.R", dest = reference)
16 |   published <- readLines(reference)
17 |   expect_equal(with_docs, published)
18 | })
19 | 


--------------------------------------------------------------------------------
/tests/testthat/test-data-package.R:
--------------------------------------------------------------------------------
 1 | ## test-data-package.R 
 2 | 
 3 | test_that("datastorrtest", {
 4 |   ## Needs the optional companion package and network access; install with
 5 |   ## devtools::install_github("FabriceSamonte/datastorrtest")
 6 |   skip_if_not_installed("datastorrtest")
 7 |   skip_if_no_downloads()
 8 |   library(datastorrtest)
 9 |   path <- tempfile("test-cache")
10 |   on.exit(unlink(path, recursive = TRUE))
11 |   
12 |   expect_is(dataset_access_function(version="2.0.0", path=path), "list")
13 |   expect_is(dataset_access_function(version="1.0.0", path=path), "data.frame")
14 |   
15 |   expect_identical(dataset_versions(local=TRUE, path=path), c("1.0.0", "2.0.0"))
16 |   
17 |   expect_silent(datastorrtest::dataset_del("1.0.0", path=path))
18 |   expect_silent(datastorrtest::dataset_del("2.0.0", path=path))
19 |   
20 |   # can't delete something that doesn't exist
21 |   expect_error(datastorrtest::dataset_del("2.0.0", path=path))
22 |   
23 |   expect_identical(dataset_versions(local=TRUE, path=path), character(0))
24 |   
25 | })
26 | 
27 | test_that("taxonlookup", {
28 |   skip_if_not_installed("taxonlookup")
29 |   skip_if_no_downloads()
30 |   library(taxonlookup)
31 |   path <- tempfile("test-cache")
32 |   on.exit(unlink(path, recursive = TRUE))
33 |   
34 |   expect_is(plant_lookup(path=path), "data.frame")
35 |   ## Pass `path` so only the temporary cache is deleted, never the user's real one
36 |   expect_silent(plant_lookup_del(version=NULL, path=path))
37 | })
38 | 
39 |   
40 |   
41 | 


--------------------------------------------------------------------------------
/tests/testthat/test-default.R:
--------------------------------------------------------------------------------
 1 | # test-default.R 
 2 | 
 3 | test_that("Default Case", {
 4 |   ## Reader for the zipped release: hands its arguments to unzip() and returns the result
 5 |   unpack <- function(...) {
 6 |     unzip(...)
 7 |   }
 8 |   
 9 |   path <- tempfile("test-cache")
10 |   on.exit(unlink(path, recursive = TRUE))
11 |   info <- github_release_info("FabriceSamonte/datastorrtest", 
12 |                               c(length), 
13 |                               filename=NULL,
14 |                               path=path)
15 |   
16 |   st <- R6_datastorr$new(info)
17 |   
18 |   expect_identical(path, st$path)
19 |   expect_identical(st$storr$list("file"), character(0))
20 |   
21 |   ## Everything below fetches from GitHub, so honour the usual skip rules
22 |   skip_if_no_downloads()
23 |   dat <- st$get() 
24 |   
25 |   
26 |   info <- github_release_info("FabriceSamonte/datastorrtest", 
27 |                               read=unpack, 
28 |                               filename="Source.zip",
29 |                               path=path)
30 |   
31 |   st <- R6_datastorr$new(info) 
32 |   
33 |   expect_identical(github_release_version_current(info), "2.0.0")
34 |   expect_identical(github_release_version_current(info, local=FALSE), "2.0.0")
35 |   expect_identical(github_release_versions(info), "2.0.0")
36 |   expect_is(github_release_versions(info), "character")
37 |   
38 |   dat <- st$get()  
39 |   
40 |   
41 |   
42 | }
43 | )


--------------------------------------------------------------------------------
/tests/testthat/test-github-releases.R:
--------------------------------------------------------------------------------
  1 | context("github_release")
  2 | 
  3 | test_that("github_release", {
  4 |   read_csv <- function(...) {  # read.csv wrapper that keeps strings as character
  5 |     read.csv(..., stringsAsFactors = FALSE)
  6 |   }
  7 | 
  8 |   info <- github_release_info("wcornwell/taxonlookup", read_csv)
  9 | 
 10 |   path <- datastorr_path(info$repo)
 11 |   exists <- file.exists(path)
 12 | 
 13 |   expect_is(info, "github_release_info")
 14 |   expect_is(info$path, "character")
 15 |   expect_identical(info$path, path)  # with no explicit path, info uses datastorr_path()
 16 |   expect_identical(file.exists(info$path), exists)
 17 | 
 18 |   ## From here on, store data under a temporary path that is removed on exit
 19 |   path <- tempfile("datastorr_")
 20 |   on.exit(unlink(path, recursive = TRUE))
 21 |   info <- github_release_info("wcornwell/taxonlookup", read_csv, path = path)
 22 |   expect_identical(info$path, path)
 23 |   expect_false(file.exists(path))  # building the info object must not create the path
 24 | 
 25 |   st <- R6_datastorr$new(info)
 26 |   expect_true(file.exists(path))  # constructing the R6 object creates it
 27 |   expect_is(st$storr, "storr")
 28 |   expect_identical(st$storr$list("file"), character(0))
 29 | 
 30 |   expect_identical(github_release_versions(info), character(0))  # nothing stored locally yet
 31 | 
 32 |   skip_if_no_downloads()
 33 |   tmp <- github_release_versions(info, FALSE)  # FALSE = ask GitHub rather than the local store
 34 |   expect_gt(length(tmp), 9)
 35 | 
 36 |   tmp <- github_release_version_current(info)
 37 |   expect_true(numeric_version(tmp) >= numeric_version("1.0.0"))
 38 |   
 39 |   skip("skipping case where no filenames are passed into info structure")  # unconditional: nothing below runs
 40 |   expect_is(dat, "data.frame")  # NOTE(review): `dat` is never assigned in this test
 41 |   expect_identical(st$storr$list("file"), tmp)
 42 |   expect_identical(github_release_versions(info), tmp)
 43 | 
 44 |   github_release_del(info, tmp)
 45 |   expect_identical(github_release_versions(info, TRUE),
 46 |                    character(0))
 47 |   expect_true(file.exists(path))
 48 |   github_release_del(info, NULL)  # NULL version removes the whole local store
 49 |   expect_false(file.exists(path))
 50 | })
 51 | 
 52 | test_that("datastorr.example", {
 53 |   ## So, basically nothing here will work without the token, and as
 54 |   ## it's my repository, that's not ideal.  Happy for other solutions
 55 |   ## here.
 56 |   
 57 |   skip_if_no_github_token()
 58 |   path <- tempfile("datastorr_")
 59 |   url <- "https://github.com/richfitz/datastorr.example.git"
 60 |   system2("git", c("clone", url, path))
 61 |   ## Work inside the clone; on exit restore the directory and delete the clone.
 62 |   owd <- setwd(path)
 63 |   on.exit({
 64 |     setwd(owd)
 65 |     unlink(path, recursive = TRUE)
 66 |   })
 67 | 
 68 |   ## A fairly unconventional way of loading the package :)
 69 |   source("R/package.R", local = TRUE)
 70 | 
 71 |   info <- mydata_info(tempfile("datastorr_"))
 72 |   ## example.csv is read from the previous working directory (`owd`).
 73 |   d <- read.csv(file.path(owd, "example.csv"), stringsAsFactors = FALSE)
 74 |   dd_contents <- lapply(d$dataset, get, as.environment("package:datasets"))
 75 |   names(dd_contents) <- d$dataset
 76 | 
 77 |   ## Temporary place to stick data:
 78 |   tmp <- tempfile("datastorr_")
 79 |   dir.create(tmp)
 80 |   on.exit(unlink(tmp, recursive = TRUE), add = TRUE)
 81 |   tmp_data_path <- function(x) file.path(tmp, paste0(x, ".rds"))
 82 |   lapply(d$dataset, function(x)
 83 |     saveRDS(get(x, "package:datasets"), tmp_data_path(x)))
 84 |   dd <- tmp_data_path(d$dataset)
 85 | 
 86 |   ## Need to delete everything:
 87 |   github_api_delete_all_releases(info, yes = TRUE)
 88 |   ## `last` tracks the version of the most recent release made below.
 89 |   v_master <- numeric_version(read.dcf("DESCRIPTION")[, "Version"])
 90 |   last <- numeric_version("0.0.0")
 91 |   ## Check out the commit for row `i` and create a release from it.
 92 |   f <- function(i) {
 93 |     sha <- d$target[[i]]
 94 |     system2("git", c("checkout", sha))
 95 |     github_release_create(info, d$description[[i]], dd[[i]], sha, yes = TRUE)
 96 |   }
 97 |   ## Release each row in turn; the final row only when `do_last` is TRUE.
 98 |   for (i in seq_len(nrow(d))) {
 99 |     if (i == nrow(d)) {
100 |       do_last <- grepl("^https", url) && v_master > last
101 |       if (!do_last) {
102 |         system2("git", c("checkout", "master"))
103 |         break
104 |       }
105 |     }
106 | 
107 |     x <- f(i)
108 | 
109 |     expect_is(x, "list")
110 |     curr <- numeric_version(strip_v(x$tag_name))
111 |     expect_true(curr > last)
112 |     if (i < nrow(d)) {
113 |       expect_equal(curr, numeric_version(d$version[[i]]))
114 |     } else {
115 |       d$version[[i]] <- as.character(curr)
116 |     }
117 |     expect_equal(length(x$assets), 1)
118 |     expect_equal(x$assets[[1]]$name, paste0(d$dataset[[i]], ".rds"))
119 |     expect_equal(x$assets[[1]]$content_type, "application/octet-stream")
120 |     expect_equal(x$body, d$description[[i]])
121 |     expect_equal(x$target_commitish,
122 |                  system2("git", c("rev-parse", d$target[[i]]), stdout = TRUE))
123 | 
124 |     last <- curr
125 |   }
126 | 
127 |   ## Now, try and make a github release on top of the branch; this
128 |   ## should not be possible because the version will not have moved on
129 |   ## (especially if the previous version goes with master).  I don't
130 |   ## think this should generally run for a local clone though.
131 |   path_data <- file.path(tmp, "rock.rds")
132 |   saveRDS(rock, path_data)
133 |   expect_error(github_release_create(info, "should fail", path_data,
134 |                                      target = "master", yes = TRUE),
135 |                "is not ahead of remote version")
136 | 
137 |   ## Now, pull the data down and have a look:
138 |   j <- seq_len(nrow(d) - if (do_last) 0 else 1)
139 | 
140 |   vv <- mydata_versions(FALSE, info$path)
141 |   expect_equal(length(vv), length(j))
142 |   expect_equal(vv[j], d$version[j])
143 | 
144 |   for (i in j) {
145 |     data_i <- mydata(vv[[i]], info$path)
146 |     expect_identical(data_i, dd_contents[[i]])
147 |   }
148 | })
149 | 


--------------------------------------------------------------------------------
/tests/testthat/test-multi-file.R:
--------------------------------------------------------------------------------
 1 | # test-multi-file.R
 2 | 
 3 | test_that("Multi file test", {
 4 |   ## This test talks to GitHub and relies on two optional packages;
 5 |   ## skip (rather than error) when any of them is unavailable.
 6 |   skip_if_no_downloads()
 7 |   skip_if_not_installed("datastorrtest")
 8 |   skip_if_not_installed("readxl")
 9 |   library(datastorrtest)
10 | 
11 |   ## One reader per file, in the same order as `filename` below.
12 |   read_csv <- function(...) {
13 |     read.csv(...)
14 |   }
15 | 
16 |   read_spreadsheet <- function(...) {
17 |     readxl::read_xls(...)
18 |   }
19 | 
20 |   ## Use a throwaway cache directory, removed when the test finishes.
21 |   path <- tempfile("test-cache")
22 |   on.exit(unlink(path, recursive = TRUE))
23 |   info <- github_release_info("FabriceSamonte/datastorrtest",
24 |                               c(read_csv, read_spreadsheet),
25 |                               filename = c("baad_with_map.csv", "Globcover_Legend.xls"),
26 |                               path = path)
27 | 
28 |   exists <- file.exists(path)
29 | 
30 |   expect_is(info, "github_release_info")
31 |   expect_is(info$path, "character")
32 |   expect_identical(info$path, path)
33 |   expect_identical(file.exists(info$path), exists)
34 | 
35 |   ## Version queries through the functional interface, before any fetch.
36 |   expect_identical(github_release_version_current(info), "2.0.0")
37 |   expect_identical(github_release_version_current(info, local = FALSE), "2.0.0")
38 |   expect_identical(github_release_versions(info), character(0))
39 |   expect_is(github_release_versions(info), "character")
40 | 
41 |   ## The same checks through the R6 object.
42 |   st <- R6_datastorr$new(info)
43 | 
44 |   expect_identical(path, st$path)
45 |   expect_identical(st$storr$list("file"), character(0))
46 | 
47 |   expect_identical(st$version_current(), "2.0.0")
48 |   expect_identical(st$version_current(local = FALSE), "2.0.0")
49 |   expect_identical(st$versions(), character(0))
50 |   expect_is(st$versions(local = FALSE), "character")
51 | 
52 |   ## With several files the result is expected to be a list.
53 |   dat <- st$get(version = "2.0.0")
54 | 
55 |   expect_is(dat, "list")
56 | 
57 |   expect_identical(st$version_current(), "2.0.0")
58 |   expect_identical(st$versions(), "2.0.0")
59 |   expect_is(st$storr$list("file"), "character")
60 |   ## Deleting a version works once; a second attempt must fail.
61 |   expect_silent(st$del("2.0.0"))
62 |   expect_error(st$del("2.0.0"))
63 | 
64 |   ## Deleting with NULL is expected to remove the whole cache directory.
65 |   st$del(NULL)
66 |   expect_false(file.exists(path))
67 | })


--------------------------------------------------------------------------------
/tests/testthat/test-simple.R:
--------------------------------------------------------------------------------
 1 | context("simple")
 2 | 
 3 | test_that("basic usage", {
 4 |   skip("skipping case where no filenames are passed into info structure")
 5 |   ## Expected contents: one built-in dataset per version in example.csv.
 6 |   meta <- read.csv("example.csv", stringsAsFactors = FALSE)
 7 |   datasets_env <- as.environment("package:datasets")
 8 |   expected <- lapply(meta$dataset, get, datasets_env)
 9 |   names(expected) <- meta$version
10 |   n <- length(expected)
11 | 
12 |   ## Simple interface: should hand back the data for the last version.
13 |   cache <- tempfile()
14 |   latest <- datastorr("richfitz/datastorr.example", path = cache)
15 |   expect_equal(latest, expected[[n]])
16 | 
17 |   ## Extended interface: an object with versions/get/del methods.
18 |   store <- datastorr("richfitz/datastorr.example", path = cache, extended = TRUE)
19 |   vers <- store$versions(FALSE)
20 |   expect_equal(length(vers), n)
21 |   expect_equal(vers[-n], names(expected)[-n])
22 |   names(expected) <- vers
23 | 
24 |   for (version in vers) {
25 |     expect_equal(store$get(version), expected[[version]])
26 |   }
27 |   expect_equal(store$versions(), vers)
28 |   expect_equal(store$path, cache)
29 |   expect_equal(store$version_current(), vers[[n]])
30 |   expect_equal(store$version_current(FALSE), vers[[n]])
31 |   store$del(NULL)
32 |   expect_false(file.exists(cache))
33 | })
34 | 
35 | ## Just a smoke test for now:
36 | test_that("private", {
37 |   skip_if_no_github_token()
38 |   store <- datastorr("mrc-ide/data_private", tempfile(), private = TRUE, extended = TRUE)
39 |   dat <- store$get()
40 |   expect_is(dat, "data.frame")
41 | })
42 | 


--------------------------------------------------------------------------------
/update_web.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # Publish the generated documentation in inst/web to the gh-pages
 3 | # branch of this repository's origin remote.
 4 | #
 5 | # A fresh single-commit repository is created inside the docs
 6 | # directory on every run and force-pushed, so the pushed gh-pages
 7 | # branch carries no earlier history.
 8 | set -e
 9 | 
10 | DOCS_DIR=inst/web
11 | # Short hash of the current commit, recorded in the commit message.
12 | VERSION=$(git rev-parse --short HEAD)
13 | REMOTE_URL=$(git config --get remote.origin.url)
14 | 
15 | # Expansions are quoted so that a path or URL containing spaces or
16 | # glob characters reaches each command as a single argument.
17 | rm -rf "${DOCS_DIR}/.git"
18 | git init "${DOCS_DIR}"
19 | git -C "${DOCS_DIR}" checkout --orphan gh-pages
20 | git -C "${DOCS_DIR}" add .
21 | git -C "${DOCS_DIR}" commit --no-verify -m "Update docs for version ${VERSION}"
22 | git -C "${DOCS_DIR}" remote add origin -m "gh-pages" "${REMOTE_URL}"
23 | git -C "${DOCS_DIR}" push --force -u origin gh-pages
24 | 


--------------------------------------------------------------------------------
/vignettes/datastorr.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "datastorr"
  3 | author: "Rich FitzJohn"
  4 | date: "`r Sys.Date()`"
  5 | output: rmarkdown::html_vignette
  6 | vignette: >
  7 |   %\VignetteIndexEntry{datastorr}
  8 |   %\VignetteEngine{knitr::rmarkdown}
  9 |   %\VignetteEncoding{UTF-8}
 10 | ---
 11 | 
 12 | ## Scope
 13 | 
 14 | This package attempts to simultaneously solve a number of problems
 15 | around small-scale data versioning and distribution:
 16 | 
 17 | * Giving users access to your data in an easily machine-digestible
 18 |   format.
 19 | 
 20 | * Hosting and distributing the data somewhere fast and reliable
 21 |   without having to deal with creating websites.
 22 | 
 23 | * Protecting access to the data to collaborators, especially with
 24 |   the idea of later public release.
 25 | 
 26 | * Allowing a dataset to be downloaded once and reused for multiple
 27 |   projects on a single computer, without having to deal with
 28 |   pathnames within or between systems.
 29 | 
 30 | * Versioning the data so that:
 31 |     * fetching the current version is easy,
 32 |     * fetching a previous version is easy,
 33 |     * simultaneously looking at two versions is easy,
 34 |     * data versions are strongly associated with the code that created them,
 35 |     * end users do not have to use git,
 36 |     * large files do not end up clogging up your git repository.
 37 | 
 38 | * Allows publication of data packages on CRAN without causing
 39 |   problems of large package file downloads.
 40 | 
 41 | * Provides a common interface for storing and retrieving data that
 42 |   works across diverse underlying data formats (one or many csv
 43 |   files, binary data, phylogenetic trees, or a collection of all of
 44 |   these), so long as you have a way of reading the data into R.
 45 | 
 46 | The package is designed to be simple to use so that all that can be
 47 | done in a couple of lines of code, or (for more involved cases)
 48 | with a package that can be generated automatically.
 49 | 
 50 | ## Background
 51 | 
 52 | Data comes in all shapes and sizes, and a one-size fits all
 53 | solution will not fit everything.
 54 | 
 55 | * Too small: One-off data sets (e.g. a field experiment that will
 56 |   not be updated).  Put the data on data dryad, figshare, or
 57 |   wherever you fancy.  Stick a fork in it, it's done (though you
 58 |   can use this package you'll likely find it easier not to).
 59 | 
 60 | * Too big: Massively collaborative datasets with large end users
 61 |   communities, data sets that are so large they require access via
 62 |   APIs, data with access control requiring complex authentication
 63 |   layers, data with complex metadata where access is related to
 64 |   metadata.  There are more comprehensive solutions for your data
 65 |   but identifying the correct solution may depend on the data.
 66 | 
 67 | * Just right: A data set of medium size (say, under 100 MB), that
 68 |   is under moderate levels of change (either stabilising or a
 69 |   "living database" that is continually being updated).
 70 | 
 71 | Getting data into R is typically done with a [data
 72 | package](https://cran.r-project.org/doc/manuals/r-release/R-exts.html#Data-in-packages).
 73 | This works well for small data, but CRAN will [not generally
 74 | allow](https://cran.r-project.org/web/packages/policies.html)
 75 | distribution of "large" data sets.  The `data()` loading mechanism
 76 | of R always seemed a bit of a weird historical quirk in any case;
 77 | it operates in some additional namespace (`package:datasets`),
 78 | works by modifying an environment as a side-effect.  Plus if you
 79 | need to compare two versions of the data you have to do some
 80 | gymnastics to install two different versions of a package (or
 81 | create a package with all the different versions of the data in
 82 | it).
 83 | 
 84 | ## How `datastorr` works
 85 | 
 86 | GitHub has a "releases" feature for allowing (potentially large)
 87 | file uploads.  These files can be any format.  GitHub releases are
 88 | built off of git "tags"; they are *associated with a specific
 89 | version*.  So if you have code that creates or processes a dataset,
 90 | the dataset will be stored against the code used to create it,
 91 | which is nice.  GitHub releases *do not store the file in the
 92 | repository*.  This avoids issues with git slowing down on large
 93 | files, on lengthy clone times, and on distributing and installing
 94 | your package.  This could be an issue if you had 100 versions of a
 95 | 10 MB dataset; that could be 1GB of data to clone or install.  But
 96 | storing your data against GitHub releases will leave the data in
 97 | the cloud until it is needed.  And the files can be quite large;
 98 | [up to
 99 | 2GB](https://help.github.com/articles/distributing-large-binaries).
100 | 
101 | The releases will be numbered.  We recommend [semantic
102 | versioning](http://semver.org) mostly because it signals some
103 | intent about changes to the data (see below).  If the data is not
104 | going to change, that's not a problem - the version can just be
105 | `v1.0.0` forever (chances are it will change though!).
106 | 
107 | We will make the simplifying assumption that your data set will be
108 | stored in a single file.  In practice this is not a large
109 | limitation because that file could be a zip archive.  The file can
110 | be in any format; csv, rds (R's internal format), a SQLite
111 | database.  You, however, need to specify or provide a function that
112 | will read the data and convert it into an R object.  This is most
113 | easily done with `rds` files (R's serialisation format -- though
114 | note they say that it is not a great long-term archival format [see
115 | `?serialize`]).
116 | 
117 | To orchestrate getting the data from github to R we need to add a
118 | little metadata about what the file will be called and how it
119 | should be loaded into R.  This can be done most simply with a small
120 | [json](https://en.wikipedia.org/wiki/JSON) file at the root of the
121 | repository containing information like:
122 | 
123 | ```json
124 | {
125 |     "filename": "myfile.rds",
126 |     "read": "base::readRDS"
127 | }
128 | ```
129 | 
130 | Note that the function used here must take a filename as an
131 | argument and return an R object.  So functions like `read.csv`,
132 | `read.table` and functions from the
133 | [`rio`](https://github.com/leeper/rio) package may be good here.
134 | 
135 | Once your git repository is set up, the metadata file added to it,
136 | and a release with data has been created, it can be downloaded
137 | like:
138 | ``` {r }
139 | d <- datastorr::datastorr("richfitz/data")
140 | ```
141 | 
142 | though, with your username/repo pair instead of `richfitz/data`.
143 | 
144 | This function is designed to be *fast* for users, and so suitable
145 | for using in scripts.  It uses
146 | [`storr`](https://github.com/richfitz/storr) behind the scenes and
147 | looks in various places for the data:
148 | 
149 | 1. In memory; if it has been loaded within this session it is
150 | already in memory.  Takes on the order of microseconds.
151 | 
152 | 2. From disk; if the data has _ever_ been loaded datastorr will
153 | cache a copy on disk.  Takes on the order of milliseconds up to a
154 | second, depending on the size of the data.
155 | 
156 | 3. From GitHub; if the data has never been loaded, it will be
157 | downloaded from GitHub, saved to disk, and loaded to memory.  This
158 | will take several seconds or longer depending on the size of the
159 | dataset.
160 | 
161 | In addition, users can download specific versions of a dataset.
162 | This might be to synchronise data versions across different people
163 | in a project, to lock a project onto a specific version, etc:
164 | ``` {r }
165 | d_old <- datastorr::datastorr("richfitz/data", version="1.0.0")
166 | ```
167 | 
168 | (The same cascading lookup as above is used.)
169 | 
170 | Versions can be listed; those stored locally:
171 | ``` {r }
172 | datastorr::datastorr_versions("richfitz/data")
173 | ```
174 | 
175 | or available remotely:
176 | ``` {r }
177 | datastorr::datastorr_versions("richfitz/data", local=FALSE)
178 | ```
179 | 
180 | The versions that have been downloaded (here `d` and `d_old`) are
181 | just normal R objects.  Unlike use with `data()` there's no ambiguity
182 | about where they are stored, and modifying one acts like any other
183 | object.
184 | 
185 | Similarly, because these are ordinary R objects you can do things
186 | like use [`daff`](https://github.com/edwindj/daff) to compare them
187 | 
188 | ```r
189 | p <- daff::diff_data(d_old, d)
190 | daff::render_diff(p)
191 | ```
192 | 
193 | ## The package interface
194 | 
195 | Alternatively we can create a very small R package that exists at
196 | the repo that we store releases against.  This package can be
197 | autogenerated, and is a useful approach when there is a significant
198 | amount of work needed in processing the data, to simplify
199 | installation of dependencies used in reading or displaying the
200 | data, or to work with the data once it has been downloaded.  In our
201 | own use, the repository (but not the package) contains code for
202 | _building_ the data set (see
203 | [taxonlookup](https://github.com/traitecoevo/taxonlookup)).  The
204 | package approach will be described more fully later in the
205 | document.
206 | 
207 | Once your git repository is published and your data have been
208 | released, downloading it becomes a function within your package.  A
209 | user would run something like:
210 | 
211 | ```{r,eval=FALSE}
212 | d <- mypackage::mydata()
213 | ```
214 | 
215 | to fetch or load the data.
216 | 
217 | 
218 | ```{r,eval=FALSE}
219 | d <- mypackage::mydata("v1.0.0")
220 | ```
221 | 
222 | 
223 | 
224 | This approach extends to holding multiple versions of the data on a
225 | single computer (or in a single R session).  This might be useful
226 | when the dataset has changed and you want to see what has changed.
227 | 
228 | ```{r,eval=FALSE}
229 | d1 <- mypackage::mydata("v1.0.0")
230 | d2 <- mypackage::mydata("v1.1.0")
231 | ## ...compare d1 and d2 here...
232 | ```
233 | 
234 | ## Worked example
235 | 
236 | 
237 | 
238 | 
239 | 
240 | First, you will need a package.  Creating packages is not that
241 | hard, especially with tools like
242 | [devtools](https://github.com/hadley/devtools) and
243 | [mason](https://github.com/gaborcsardi/mason).  Packages make
244 | running R code on other machines much simpler than sourcing in
245 | files or copy and paste.  Packages are also nice because if your
246 | data require specific package to work with (e.g., `ape` for
247 | phylogenetic trees) you can declare them in your `DESCRIPTION` file
248 | and R will ensure that they are installed when your package is
249 | installed and loaded when your package is used.
250 | 
251 | However, you will need to come up with a few details:
252 | 
253 | * a package name
254 | * a name for the dataset (if different to the package name)
255 | * a _licence_ for your package (code) and data (not code)
256 | * ideally, documentation for your end users
257 | * the name of the file that you will store with each release
258 | 
259 | In addition you need to set up a GitHub token so that you can
260 | upload files to GitHub from R, or to access your private
261 | repositories; see the section on authentication below, or just do
262 | nothing as datastorr will prompt you at the appropriate time.
263 | 
264 | The core code can be autogenerated.  For example the package
265 | [datastorr.example](https://github.com/richfitz/datastorr.example)
266 | was generated using
267 | ``` {r eval=FALSE}
268 | datastorr::autogenerate("richfitz/datastorr.example", "readRDS",
269 |                         name="mydata", roxygen=FALSE)
270 | ```
271 | 
272 | ``` {r echo=FALSE, results="asis"}
273 | pkg <- datastorr::autogenerate("richfitz/datastorr.example", "readRDS",
274 |                                name="mydata", roxygen=FALSE)
275 | writeLines(c("```r", pkg,  "```"))
276 | ```
277 | 
278 | This code can be copied into a file within the package.  If you set
279 | `roxygen=TRUE` you'll get roxygen help that `devtools::document()`
280 | will convert into R help files and `NAMESPACE` declarations.
281 | 
282 | The package can then be loaded and data accessed with the `mydata`
283 | function.
284 | 
285 | To make the release:
286 | 
287 | 1. Increase the version number in your `DESCRIPTION` file
288 | 
289 | 2. Your local repo is all committed (no unstaged files etc).  This
290 | is important if you want to closely associate the release and your
291 | data and at the moment datastorr enforces it.
292 | 
293 | 3. Push your changes to GitHub and install your package
294 | 
295 | 4. Run `yourpackage::yourdata_release("A description here")`
296 | 
297 | 5. Check that it all worked by running `yourpackage::yourdata("new version")`
298 | 
299 | (you can get your new version by `read.dcf("DESCRIPTION")[,
300 | "Version"]`).
301 | 
302 | ## Access control
303 | 
304 | Because GitHub offers private repositories, this gives some
305 | primitive, but potentially useful, access control.  Because
306 | datastorr uses GitHub's authentication, GitHub knows if the user
307 | has access to private repositories.  Therefore for this to work you
308 | will need to authenticate datastorr to work with GitHub.
309 | 
310 | The simplest way to do this is to let datastorr prompt you when
311 | access is required.  Or run:
312 | 
313 | ```r
314 | datastorr::datastorr_auth()
315 | ```
316 | 
317 | to force the authentication process to run (no error and no output
318 | indicates success).  To force using personal access tokens rather
319 | than OAuth, run:
320 | 
321 | ```r
322 | setup_github_token()
323 | ```
324 | 
325 | which will walk you through the steps of setting a token up.
326 | 
327 | If you use a personal private repository, then you invite other
328 | users to "collaborate" on the repository.  Note that this gives the
329 | users push access to the repository; the access control is very
330 | coarse.
331 | 
332 | If you have an organisation account you can create groups of users
333 | that have read only access to particular repositories, which will
334 | likely scale better.
335 | 
336 | ## Semantic versioning of data
337 | 
338 | Some will argue that it is not possible and they are probably
339 | right.  But you need to go with some versioning system.  If the
340 | idea of semantically versioning data bothers you, use incrementing
341 | integers (`v1`, `v2`, `v<n>`) and read no further!
342 | 
343 | The idea with semantic versioning is that it formalises what people
344 | do already with versioning.  We feel this can be applied fairly
345 | successfully to data.
346 | 
347 | * **Update patch release**; small changes, backward compatible.
348 |     * adding new rows to the data set (more data)
349 |     * error correcting existing data
350 | 
351 | * **Update minor version**; medium changes, but generally backward
352 |   compatible.
353 |     * new columns
354 |     * substantial new data
355 |     * new tables
356 | 
357 | * **Update major version**; large (API) changes, likely to be backward
358 |   incompatible.
359 |     * renaming or deleting columns
360 |     * changing variable coding
361 |     * deleting large amounts of data
362 | 
363 | Forks make this a lot more complicated.  If two people are working
364 | in parallel how do they decide what version number to use?
365 | However, with our solution, the datasets are still sensibly named;
366 | we have:
367 | 
368 |    * `user1/dataset@v1.2.3`
369 |    * `user2/dataset@v1.3.5`
370 | 
371 | It's just not possible to know from the outside exactly what
372 | differs between the datasets but they are at least distinctly named
373 | (and you could download both of them).  When the fork is resolved
374 | and `user2` merges back into `user1` the two researchers can
375 | discuss what version number they would want to use.  Like resolving
376 | merge conflicts, we see this as a _social_ problem, not a
377 | _technological_ one and the solution will be social.
378 | 
379 | ## Beyond GitHub
380 | 
381 | Apart from the ease of use, mindshare and the explicit association
382 | between data and code, there is no strong reason to use GitHub
383 | here.  Certainly Bitbucket provides all the same functionality that
384 | is required to generalise our approach to work there.  And self
385 | hosting would work too, with more effort.  Over time we may develop
386 | support for alternative storage providers.
387 | 
388 | At the same time, the fast and generally reliable webserver, the
389 | access controls and the nice API make it a great first place to try
390 | this proof of concept.
391 | 
392 | ## How it _actually_ works
393 | 
394 | GitHub has an API that lets you programmatically query the state of
395 | basically everything on GitHub, as well as *create* things.  So the
396 | interaction with the website is straightforward; getting lists of
397 | releases for a repository, filenames associated with releases, etc.
398 | 
399 | With this information, `datastorr` uses a
400 | [`storr_external`](https://richfitz.github.io/storr/vignettes/external.html)
401 | object and stores data with versions as keys.  If a version is not
402 | found it is downloaded (using the information from GitHub) and read
403 | into R using the `read` function.  A copy of this R-readable
404 | version is saved to disk.
405 | 
406 | In order to save and load data repeatedly, especially across
407 | different projects on the same computer, `datastorr` uses the
408 | `rappdirs` package to find the "Right Place" to store "application
409 | data".  This varies by system and is documented in the
410 | `?rappdirs::user_data_dir` help page.  Using this directory means
411 | there is little chance of accidentally committing large data sets into
412 | the repository (which might be a problem if storing the data in a
413 | subdirectory of the project).
414 | 


--------------------------------------------------------------------------------
/vignettes/src/datastorr.R:
--------------------------------------------------------------------------------
  1 | ## ---
  2 | ## title: "datastorr"
  3 | ## author: "Rich FitzJohn"
  4 | ## date: "`r Sys.Date()`"
  5 | ## output: rmarkdown::html_vignette
  6 | ## vignette: >
  7 | ##   %\VignetteIndexEntry{datastorr}
  8 | ##   %\VignetteEngine{knitr::rmarkdown}
  9 | ##   %\VignetteEncoding{UTF-8}
 10 | ## ---
 11 | 
 12 | ## ## Scope
 13 | 
 14 | ## This package attempts to simultaneously solve a number of problems
 15 | ## around small-scale data versioning and distribution:
 16 | ##
 17 | ## * Giving users access to your data in an easily machine-digestible
 18 | ##   format.
 19 | ##
 20 | ## * Hosting and distributing the data somewhere fast and reliable
 21 | ##   without having to deal with creating websites.
 22 | ##
 23 | ## * Protecting access to the data to collaborators, especially with
 24 | ##   the idea of later public release.
 25 | ##
 26 | ## * Allowing a dataset to be downloaded once and reused for multiple
 27 | ##   projects on a single computer, without having to deal with
 28 | ##   pathnames within or between systems.
 29 | ##
 30 | ## * Versioning the data so that:
 31 | ##     * fetching the current version is easy,
 32 | ##     * fetching a previous version is easy,
 33 | ##     * simultaneously looking at two versions is easy,
 34 | ##     * data versions are strongly associated with the code that created them,
 35 | ##     * end users do not have to use git,
 36 | ##     * large files do not end up clogging up your git repository.
 37 | ##
 38 | ## * Allows publication of data packages on CRAN without causing
 39 | ##   problems of large package file downloads.
 40 | ##
 41 | ## * Provides a common interface for storing and retrieving data that
 42 | ##   works across diverse underlying data formats (one or many csv
 43 | ##   files, binary data, phylogenetic trees, or a collection of all of
 44 | ##   these), so long as you have a way of reading the data into R.
 45 | ##
 46 | ## The package is designed to be simple to use so that all that can be
 47 | ## done in a couple of lines of code, or (for more involved cases)
 48 | ## with a package that can be generated automatically.
 49 | 
 50 | ## ## Background
 51 | 
 52 | ## Data comes in all shapes and sizes, and a one-size fits all
 53 | ## solution will not fit everything.
 54 | 
 55 | ## * Too small: One-off data sets (e.g. a field experiment that will
 56 | ##   not be updated).  Put the data on data dryad, figshare, or
 57 | ##   wherever you fancy.  Stick a fork in it, it's done (though you
 58 | ##   can use this package you'll likely find it easier not to).
 59 | ##
 60 | ## * Too big: Massively collaborative datasets with large end users
 61 | ##   communities, data sets that are so large they require access via
 62 | ##   APIs, data with access control requiring complex authentication
 63 | ##   layers, data with complex metadata where access is related to
 64 | ##   metadata.  There are more comprehensive solutions for your data
 65 | ##   but identifying the correct solution may depend on the data.
 66 | ##
 67 | ## * Just right: A data set of medium size (say, under 100 MB), that
 68 | ##   is under moderate levels of change (either stabilising or a
 69 | ##   "living database" that is continually being updated).
 70 | 
 71 | ## Getting data into R is typically done with a [data
 72 | ## package](https://cran.r-project.org/doc/manuals/r-release/R-exts.html#Data-in-packages).
 73 | ## This works well for small data, but CRAN will [not generally
 74 | ## allow](https://cran.r-project.org/web/packages/policies.html)
 75 | ## distribution of "large" data sets.  The `data()` loading mechanism
 76 | ## of R always seemed a bit of a weird historical quirk in any case;
 77 | ## it operates in some additional namespace (`package:datasets`),
 78 | ## works by modifying an environment as a side-effect.  Plus if you
 79 | ## need to compare two versions of the data you have to do some
 80 | ## gymnastics to install two different versions of a package (or
 81 | ## create a package with all the different versions of the data in
 82 | ## it).
 83 | 
 84 | ## ## How `datastorr` works
 85 | 
 86 | ## GitHub has a "releases" feature for allowing (potentially large)
 87 | ## file uploads.  These files can be any format.  GitHub releases are
 88 | ## built off of git "tags"; they are *associated with a specific
 89 | ## version*.  So if you have code that creates or processes a dataset,
 90 | ## the dataset will be stored against the code used to create it,
 91 | ## which is nice.  GitHub releases *do not store the file in the
 92 | ## repository*.  This avoids issues with git slowing down on large
 93 | ## files, on lengthy clone times, and on distributing and installing
 94 | ## your package.  This could be an issue if you had 100 versions of a
 95 | ## 10 MB dataset; that could be 1GB of data to clone or install.  But
 96 | ## storing your data against GitHub releases will leave the data in
 97 | ## the cloud until it is needed.  And the files can be quite large;
 98 | ## [up to
 99 | ## 2GB](https://help.github.com/articles/distributing-large-binaries).
100 | 
101 | ## The releases will be numbered.  We recommend [semantic
102 | ## versioning](http://semver.org) mostly because it signals some
103 | ## intent about changes to the data (see below).  If the data is not
104 | ## going to change, that's not a problem - the version can just be
105 | ## `v1.0.0` forever (chances are it will change though!).
106 | 
107 | ## We will make the simplifying assumption that your data set will be
108 | ## stored in a single file.  In practice this is not a large
109 | ## limitation because that file could be a zip archive.  The file can
110 | ## be in any format; csv, rds (R's internal format), a SQLite
111 | ## database.  You, however, need to specify or provide a function that
112 | ## will read the data and convert it into an R object.  This is most
113 | ## easily done with `rds` files (R's serialisation format -- though
114 | ## note they say that it is not a great long-term archival format [see
115 | ## `?serialize`]).
116 | 
117 | ## To orchestrate getting the data from github to R we need to add a
118 | ## little metadata about what the file will be called and how it
119 | ## should be loaded into R.  This can be done most simply with a small
120 | ## [json](https://en.wikipedia.org/wiki/JSON) file at the root of the
121 | ## repository containing information like:
122 | ##
123 | ## ```json
124 | ## {
125 | ##     "filename": "myfile.rds",
126 | ##     "read": "base::readRDS"
127 | ## }
128 | ## ```
129 | ##
130 | ## Note that the function used here must take a filename as an
131 | ## argument and return an R object.  So functions like `read.csv`,
132 | ## `read.table` and functions from the
133 | ## [`rio`](https://github.com/leeper/rio) package may be good here.
134 | 
135 | ## Once your git repository is set up, the metadata file added to it,
136 | ## and a release with data has been created, it can be downloaded
137 | ## like:
138 | d <- datastorr::datastorr("richfitz/data")
139 | 
140 | ## though, with your username/repo pair instead of `richfitz/data`.
141 | 
142 | ## This function is designed to be *fast* for users, and so suitable
143 | ## for using in scripts.  It uses
144 | ## [`storr`](https://github.com/richfitz/storr) behind the scenes and
145 | ## looks in various places for the data:
146 | ##
147 | ## 1. In memory; if it has been loaded within this session it is
148 | ## already in memory.  Takes on the order of microseconds.
149 | ##
150 | ## 2. From disk; if the data has _ever_ been loaded datastorr will
151 | ## cache a copy on disk.  Takes on the order of milliseconds up to a
152 | ## second, depending on the size of the data.
153 | ##
154 | ## 3. From GitHub; if the data has never been loaded, it will be
155 | ## downloaded from GitHub, saved to disk, and loaded to memory.  This
156 | ## will take several seconds or longer depending on the size of the
157 | ## dataset.
158 | 
159 | ## In addition, users can download specific versions of a dataset.
160 | ## This might be to synchronise data versions across different people
161 | ## in a project, to lock a project onto a specific version, etc:
162 | d_old <- datastorr::datastorr("richfitz/data", version="1.0.0")
163 | 
164 | ## (The same cascading lookup as above is used.)
165 | 
166 | ## Versions can be listed; those stored locally:
167 | datastorr::datastorr_versions("richfitz/data")
168 | 
169 | ## or available remotely:
170 | datastorr::datastorr_versions("richfitz/data", local=FALSE)
171 | 
172 | ## The versions that have been downloaded (here `d` and `d_old`) are
173 | ## just normal R objects.  Unlike use with `data()` there's no ambiguity
174 | ## about where they are stored, and modifying one acts like any other
175 | ## object.
176 | 
177 | ## Similarly, because these are ordinary R objects you can do things
178 | ## like use [`daff`](https://github.com/edwindj/daff) to compare them
179 | ##
180 | ## ```r
181 | ## p <- daff::diff_data(d_old, d)
182 | ## daff::render_diff(p)
183 | ## ```
184 | 
185 | ## ## The package interface
186 | 
187 | ## Alternatively we can create a very small R package that exists at
188 | ## the repo that we store releases against.  This package can be
189 | ## autogenerated, and is a useful approach when there is a significant
190 | ## amount of work needed in processing the data, to simplify
191 | ## installation of dependencies used in reading or displaying the
192 | ## data, or to work with the data once it has been downloaded.  In our
193 | ## own use, the repository (but not the package) contains code for
194 | ## _building_ the data set (see
195 | ## [taxonlookup](https://github.com/traitecoevo/taxonlookup)).  The
196 | ## package approach will be described more fully later in the
197 | ## document.
198 | 
199 | ## Once your git repository is published and your data have been
200 | ## released, downloading it becomes a function within your package.  A
201 | ## user would run something like:
202 | ##
203 | ## ```{r,eval=FALSE}
204 | ## d <- mypackage::mydata()
205 | ## ```
206 | ##
207 | ## to fetch or load the data.  A specific version can be requested by
208 | ## passing its version string:
209 | ##
210 | ## ```{r,eval=FALSE}
211 | ## d <- mypackage::mydata("v1.0.0")
212 | ## ```
213 | ##
214 | 
215 | 
216 | ## This approach extends to holding multiple versions of the data on a
217 | ## single computer (or in a single R session).  This might be useful
218 | ## when the dataset has changed and you want to see what has changed.
219 | ##
220 | ## ```{r,eval=FALSE}
221 | ## d1 <- mypackage::mydata("v1.0.0")
222 | ## d2 <- mypackage::mydata("v1.1.0")
223 | ## ## ...compare d1 and d2 here...
224 | ## ```
225 | 
226 | ## ## Worked example
227 | 
228 | ### So for a csv file you'd provide a function that would be
229 | ### `read.csv` perhaps with a few arguments set (e.g.,
230 | ### `stringsAsFactors=FALSE`).  For an rds file you could just use
231 | ### `readRDS`.  And for a zip file you would need a more complicated
232 | ### function (see the worked example).
233 | 
234 | 
235 | 
236 | ### TODO: I should get a link to a cooking show and do a "Here's one I
237 | ### prepared earlier" thing.
238 | 
239 | ## First, you will need a package.  Creating packages is not that
240 | ## hard, especially with tools like
241 | ## [devtools](https://github.com/hadley/devtools) and
242 | ## [mason](https://github.com/gaborcsardi/mason).  Packages make
243 | ## running R code on other machines much simpler than sourcing in
244 | ## files or copy and paste.  Packages are also nice because if your
245 | ## data require specific packages to work with (e.g., `ape` for
246 | ## phylogenetic trees) you can declare them in your `DESCRIPTION` file
247 | ## and R will ensure that they are installed when your package is
248 | ## installed and loaded when your package is used.
249 | ##
250 | ## However, you will need to come up with a few details:
251 | ##
252 | ## * a package name
253 | ## * a name for the dataset (if different to the package name)
254 | ## * a _licence_ for your package (code) and data (not code)
255 | ## * ideally, documentation for your end users
256 | ## * the name of the file that you will store with each release
257 | 
258 | ## In addition you need to set up a GitHub token so that you can
259 | ## upload files to GitHub from R, or to access your private
260 | ## repositories; see the section on authentication below, or just do
261 | ## nothing as datastorr will prompt you at the appropriate time.
262 | 
263 | ## The core code can be autogenerated.  For example the package
264 | ## [datastorr.example](https://github.com/richfitz/datastorr.example)
265 | ## was generated using
266 | ##+ eval=FALSE
267 | datastorr::autogenerate("richfitz/datastorr.example", "readRDS",
268 |                         name="mydata", roxygen=FALSE)
269 | 
270 | ##+ echo=FALSE, results="asis"
271 | pkg <- datastorr::autogenerate("richfitz/datastorr.example", "readRDS",
272 |                                name="mydata", roxygen=FALSE)
273 | writeLines(c("```r", pkg,  "```"))
274 | 
275 | ## This code can be copied into a file within the package.  If you set
276 | ## `roxygen=TRUE` you'll get roxygen help that `devtools::document()`
277 | ## will convert into R help files and `NAMESPACE` declarations.
278 | 
279 | ## The package can then be loaded and data accessed with the `mydata`
280 | ## function.
281 | 
282 | ## To make the release:
283 | 
284 | ## 1. Increase the version number in your `DESCRIPTION` file
285 | ##
286 | ## 2. Make sure your local repo is all committed (no unstaged files etc).  This
287 | ## is important if you want to closely associate the release and your
288 | ## data and at the moment datastorr enforces it.
289 | ##
290 | ## 3. Push your changes to GitHub and install your package
291 | ##
292 | ## 4. Run `yourpackage::yourdata_release("A description here")`
293 | ##
294 | ## 5. Check that it all worked by running `yourpackage::yourdata("new version")`
295 | ##
296 | ## (you can get your new version by `read.dcf("DESCRIPTION")[,
297 | ## "Version"]`).
298 | 
299 | ## ## Access control
300 | 
301 | ## Because GitHub offers private repositories, this gives some
302 | ## primitive, but potentially useful, access control.  Because
303 | ## datastorr uses GitHub's authentication, GitHub knows if the user
304 | ## has access to private repositories.  Therefore for this to work you
305 | ## will need to authenticate datastorr to work with GitHub.
306 | 
307 | ## The simplest way to do this is to let datastorr prompt you when
308 | ## access is required.  Or run:
309 | ##
310 | ## ```r
311 | ## datastorr::datastorr_auth()
312 | ## ```
313 | ##
314 | ## to force the authentication process to run (no error and no output
315 | ## indicates success).  To force using personal access tokens rather
316 | ## than OAuth, run:
317 | ##
318 | ## ```r
319 | ## setup_github_token()
320 | ## ```
321 | ##
322 | ## which will walk you through the steps of setting a token up.
323 | 
324 | ## If you use a personal private repository, then you invite other
325 | ## users to "collaborate" on the repository.  Note that this gives the
326 | ## users push access to the repository; the access control is very
327 | ## coarse.
328 | ##
329 | ## If you have an organisation account you can create groups of users
330 | ## that have read only access to particular repositories, which will
331 | ## likely scale better.
332 | 
333 | ## ## Semantic versioning of data
334 | 
335 | ## Some will argue that it is not possible and they are probably
336 | ## right.  But you need to go with some versioning system.  If the
337 | ## idea of semantically versioning data bothers you, use incrementing
338 | ## integers (`v1`, `v2`, `v<n>`) and read no further!
339 | 
340 | ## The idea with semantic versioning is that it formalises what people
341 | ## do already with versioning.  We feel this can be applied fairly
342 | ## successfully to data.
343 | 
344 | ## * **Update patch release**; small changes, backward compatible.
345 | ##     * adding new rows to the data set (more data)
346 | ##     * error correcting existing data
347 | 
348 | ## * **Update minor version**; medium changes, but generally backward
349 | ##   compatible.
350 | ##     * new columns
351 | ##     * substantial new data
352 | ##     * new tables
353 | ##
354 | ## * **Update major version**; large (API) changes, likely to be backward
355 | ##   incompatible.
356 | ##     * renaming or deleting columns
357 | ##     * changing variable coding
358 | ##     * deleting large amounts of data
359 | 
360 | ## Forks make this a lot more complicated.  If two people are working
361 | ## in parallel how do they decide what version number to use?
362 | ## However, with our solution, the datasets are still sensibly named;
363 | ## we have:
364 | ##
365 | ##    * `user1/dataset@v1.2.3`
366 | ##    * `user2/dataset@v1.3.5`
367 | ##
368 | ## It's just not possible to know from the outside exactly what
369 | ## differs between the datasets but they are at least distinctly named
370 | ## (and you could download both of them).  When the fork is resolved
371 | ## and `user2` merges back into `user1` the two researchers can
372 | ## discuss what version number they would want to use.  Like resolving
373 | ## merge conflicts, we see this as a _social_ problem, not a
374 | ## _technological_ one and the solution will be social.
375 | 
376 | ## ## Beyond GitHub
377 | 
378 | ## Apart from the ease of use, mindshare and the explicit association
379 | ## between data and code, there is no strong reason to use GitHub
380 | ## here.  Certainly Bitbucket provides all the same functionality that
381 | ## is required to generalise our approach to work there.  And self
382 | ## hosting would work too, with more effort.  Over time we may develop
383 | ## support for alternative storage providers.
384 | 
385 | ## At the same time, the fast and generally reliable webserver, the
386 | ## access controls and the nice API make it a great first place to try
387 | ## this proof of concept.
388 | 
389 | ## ## How it _actually_ works
390 | 
391 | ## GitHub has an API that lets you programmatically query the state of
392 | ## basically everything on GitHub, as well as *create* things.  So the
393 | ## interaction with the website is straightforward; getting lists of
394 | ## releases for a repository, filenames associated with releases, etc.
395 | 
396 | ## With this information, `datastorr` uses a
397 | ## [`storr_external`](https://richfitz.github.io/storr/vignettes/external.html)
398 | ## object and stores data with versions as keys.  If a version is not
399 | ## found it is downloaded (using the information from GitHub) and read
400 | ## into R using the `read` function.  A copy of this R-readable
401 | ## version is saved to disk.
402 | 
403 | ## In order to save and load data repeatedly, especially across
404 | ## different projects on the same computer, `datastorr` uses the
405 | ## `rappdirs` package to find the "Right Place" to store "application
406 | ## data".  This varies by system and is documented in the
407 | ## `?rappdirs::user_data_dir` help page.  Using this directory means
408 | ## there is little chance of accidentally committing large data sets into
409 | ## the repository (which might be a problem if storing the data in a
410 | ## subdirectory of the project).
411 | 


--------------------------------------------------------------------------------