├── .Rbuildignore ├── .gitignore ├── .gitmodules ├── .travis.yml ├── DESCRIPTION ├── LICENSE ├── NAMESPACE ├── R ├── datapkg_read.R ├── datapkg_validate.R ├── datapkg_write.R └── old │ ├── datapkg_new.R │ └── print.R ├── README-NOT.md ├── README.md ├── appveyor.yml ├── datapkg.Rproj ├── inst └── tabular-data-package.json └── man └── datapackage.Rd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^appveyor\.yml$ 4 | ^\.travis\.yml$ 5 | ^data$ 6 | ^tests/testsuite-py$ 7 | ^datapackage.json$ 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | src/*.o 5 | src/*.so 6 | src/*.dll 7 | src/rexp.pb.cc 8 | src/rexp.pb.h 9 | src/Makevars 10 | inst/doc 11 | windows 12 | data/* 13 | datapackage.json 14 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ropensci-archive/datapkg/d08f68d8dd8533aa0a7f49fe5a590736e3923754/.gitmodules -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r 2 | 3 | language: R 4 | sudo: false 5 | cache: packages 6 | 7 | r_github_packages: 8 | - jimhester/covr 9 | 10 | warnings_are_errors: true 11 | #r_check_revdep: true 12 | 13 | # V8 is required for jsonvalidate 14 | addons: 15 | apt: 16 | packages: 17 | - libv8-dev 18 | 19 | notifications: 20 | email: 21 | on_success: change 22 | on_failure: change 23 | 24 | after_success: 25 | - Rscript -e 'covr::codecov(type = "all")' 26 | 
-------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: datapkg 2 | Type: Package 3 | Title: Read and Write Data Packages 4 | Version: 0.1 5 | Authors@R: c( 6 | person("Jeroen", "Ooms", email = "jeroen.ooms@stat.ucla.edu", role = c("aut", "cre")), 7 | person("Karthik", "Ram", email = "karthik.ram@gmail.com", role = "aut")) 8 | Description: Convenience functions for reading and writing datasets following 9 | the 'data packagist' format. 10 | URL: http://frictionlessdata.io/data-packages/, https://github.com/ropenscilabs/datapkg 11 | BugReports: https://github.com/ropenscilabs/datapkg/issues 12 | License: MIT + file LICENSE 13 | Imports: 14 | methods, 15 | readr, 16 | git2r, 17 | jsonlite, 18 | curl 19 | Suggests: 20 | jsonvalidate, 21 | ggplot2 22 | Remotes: 23 | ropenscilabs/jsonvalidate, 24 | hadley/readr 25 | RoxygenNote: 5.0.1 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2016 2 | COPYRIGHT HOLDER: Jeroen Ooms, Karthik Ram 3 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(print,datapkg_data) 4 | S3method(print,datapkg_resources) 5 | export(datapkg_read) 6 | export(datapkg_validate) 7 | export(datapkg_write) 8 | import(readr) 9 | -------------------------------------------------------------------------------- /R/datapkg_read.R: -------------------------------------------------------------------------------- 1 | #' Read/write data-package 2 | #' 3 | #' Read and write data frames to/from 'data-package' format. For reading 4 | #' supported paths are disk, http or git. For writing only disk is supported. 
5 | #' 6 | #' @import readr 7 | #' @param path file path or URL to the data package directory 8 | #' @rdname datapackage 9 | #' @name datapackage 10 | #' @aliases datapkg 11 | #' @references \url{http://frictionlessdata.io/data-packages}, \url{https://github.com/datasets} 12 | #' @export 13 | #' @examples # Create new data package 14 | #' pkgdir <- tempfile() 15 | #' datapkg_write(mtcars, path = pkgdir) 16 | #' datapkg_write(iris, path = pkgdir) 17 | #' 18 | #' # Read it back 19 | #' mypkg <- datapkg_read(pkgdir) 20 | #' print(mypkg$data$mtcars) 21 | #' 22 | #' # Clone package with git: 23 | #' cities <- datapkg_read("git://github.com/datasets/world-cities") 24 | #' 25 | #' # Read over http 26 | #' euribor <- datapkg_read("https://raw.githubusercontent.com/datasets/euribor/master") 27 | datapkg_read <- function(path = getwd()){ 28 | root <- sub("datapackage.json$", "", path) 29 | root <- sub("/$", "", root) 30 | if(is_git(root)){ 31 | newroot <- tempfile() 32 | git2r::clone(root, newroot) 33 | root <- newroot 34 | } 35 | json_path <- file.path(root, "datapackage.json") 36 | json <- if(is_url(root)){ 37 | con <- curl::curl(json_path, "r") 38 | on.exit(close(con)) 39 | readLines(con, warn = FALSE) 40 | } else { 41 | readLines(normalizePath(json_path, mustWork = TRUE), warn = FALSE) 42 | } 43 | pkg_info <- jsonlite::fromJSON(json, simplifyVector = TRUE) 44 | if(is.data.frame(pkg_info$resources)) 45 | class(pkg_info$resources) <- c("datapkg_resources", class(pkg_info$resources)) 46 | if(is.data.frame(pkg_info$sources)) 47 | class(pkg_info$sources) <- c("datapkg_sources", class(pkg_info$sources)) 48 | pkg_info$data <- list(rep(NA, nrow(pkg_info$resources))) 49 | data_names <- pkg_info$resources$name 50 | for(i in seq_len(nrow(pkg_info$resources))){ 51 | target <- as.list(pkg_info$resources[i, ]) 52 | if(!length(target$schema)) 53 | stop("Dataset ", i, "is missing a schema") 54 | if(!length(target$schema$fields)) 55 | stop("Dataset ", i, "is missing the schema.fields 
property") 56 | pkg_info$data[[i]] <- read_data_package(get_data_path(target, root), 57 | dialect = as.list(target$dialect), hash = target$hash, target$schema$fields[[1]]) 58 | } 59 | class(pkg_info$data) <- c("datapkg_data") 60 | if(length(data_names)) 61 | names(pkg_info$data) <- ifelse(is.na(data_names), "", data_names) 62 | pkg_info 63 | } 64 | 65 | get_data_path <- function(x, root){ 66 | if(length(x$path)){ 67 | data_path <- normalizePath(file.path(root, x$path), mustWork = FALSE) 68 | if(is_url(data_path) || file.exists(data_path)){ 69 | return(data_path) 70 | } else { 71 | if(length(x$url)){ 72 | message("File not found: ", data_path) 73 | return(x$url) 74 | } else { 75 | stop("File not found: ", data_path) 76 | } 77 | } 78 | } 79 | } 80 | 81 | is_git <- function(x){ 82 | grepl("^git://", x) 83 | } 84 | 85 | is_url <- function(x){ 86 | grepl("^[a-zA-Z]+://", x) 87 | } 88 | 89 | read_data_package <- function(path, dialect = list(), hash = NULL, fields = NULL) { 90 | if(!length(fields)) 91 | return(data.frame()) 92 | col_types <- list() 93 | for(i in seq_len(nrow(fields))) 94 | col_types[[i]] <- do.call(make_field, as.list(fields[i,])) 95 | do.call(parse_data_file, c(list(file = path, col_types = col_types), dialect)) 96 | } 97 | 98 | make_field <- function(name = "", type = "string", description = "", format = NULL, ...){ 99 | 100 | #datapkg prefixes strptime format with 'fmt:' 101 | if(length(format)) 102 | format <- sub("^fmt:", "", format) 103 | switch(type, 104 | string = col_character(), 105 | number = col_number(), 106 | integer = col_integer(), 107 | boolean = col_logical(), 108 | object = col_character(), 109 | array = col_character(), 110 | date = col_date(format), 111 | datetime = col_datetime(format), 112 | time = col_time(format), 113 | col_character() 114 | ) 115 | } 116 | 117 | ## Defaults from http://dataprotocols.org/csv-dialect/ 118 | parse_data_file <- function(file, col_types = NULL, delimiter = ",", doubleQuote = TRUE, 119 | 
lineTerminator = "\r\n", quoteChar = '"', escapeChar = "", skipInitialSpace = TRUE, 120 | header = TRUE, caseSensitiveHeader = FALSE){ 121 | # unused fields: lineTerminator, skipInitialSpace, caseSensitiveHeader 122 | message("Reading file ", file) 123 | readr::read_delim( 124 | col_types = col_types, 125 | file = file, 126 | delim = delimiter, 127 | escape_double = doubleQuote, 128 | quote = quoteChar, 129 | escape_backslash = identical(escapeChar, "\\"), 130 | col_names = header 131 | ) 132 | } 133 | 134 | #' @export 135 | print.datapkg_resources <- function(x, ...){ 136 | print_names <- names(x) %in% c("name", "path", "format") 137 | print(as.data.frame(x)[print_names]) 138 | } 139 | 140 | #' @export 141 | print.datapkg_data <- function(x, ...){ 142 | for(i in seq_along(x)){ 143 | data_name <- names(x[i]) 144 | if(length(data_name) && !is.na(data_name)){ 145 | cat(" $", data_name, "\n", sep = "") 146 | } else { 147 | cat(" [[", i, "]]\n", sep = "") 148 | } 149 | mydata <- x[[i]] 150 | for(j in seq_along(mydata)){ 151 | cat(" [", j, "] ", names(mydata)[j], " (", methods::is(mydata[[j]])[1], ")\n", sep = "") 152 | } 153 | } 154 | } 155 | -------------------------------------------------------------------------------- /R/datapkg_validate.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | #' @rdname datapackage 3 | datapkg_validate <- function(path = getwd()){ 4 | root <- sub("datapackage.json$", "", path) 5 | root <- sub("/$", "", root) 6 | json_path <- file.path(root, "datapackage.json") 7 | schema_path <- system.file("tabular-data-package.json", package = "datapkg") 8 | json <- paste(readLines(json_path), collapse = "\n") 9 | schema <- paste(readLines(schema_path), collapse = "\n") 10 | jsonvalidate::json_validate(json, schema, verbose =TRUE, greedy = TRUE) 11 | } 12 | -------------------------------------------------------------------------------- /R/datapkg_write.R: 
# --------------------------------------------------------------------------------
# R/datapkg_write.R

#' @rdname datapackage
#' @param data a data frame to be added to the package
#' @param name what to name this dataset (defaults to the deparsed name of
#'   the \code{data} argument)
#' @return The path to the updated \code{datapackage.json}, invisibly.
#' @export
datapkg_write <- function(data, name, path = getwd()){
  if(missing(name))
    name <- deparse(substitute(data))
  stopifnot(is.data.frame(data))
  # Accept either the package root or a path ending in datapackage.json,
  # mirroring the normalization done by datapkg_read()
  root <- sub("datapackage.json$", "", path)
  root <- sub("/$", "", root)
  dir.create(file.path(root, "data"), showWarnings = FALSE, recursive = TRUE)
  json_path <- file.path(root, "datapackage.json")
  csv_name <- file.path("data", paste0(name, ".csv"))
  csv_path <- file.path(root, csv_name)
  # Refuse to clobber an existing resource file
  if(file.exists(csv_path))
    stop("File already exists: ", csv_path, call. = FALSE)
  pkg_info <- if(file.exists(json_path)){
    message("Opening existing ", json_path)
    # FIX: fromJSON is exported by jsonlite; the original used the internal
    # accessor jsonlite:::fromJSON, which R CMD check flags and which may
    # break if jsonlite reorganizes its internals.
    jsonlite::fromJSON(json_path, simplifyVector = FALSE)
  } else {
    message("Creating new ", json_path)
    list(name = basename(path))
  }
  readr::write_csv(data, csv_path)
  # Append a resource entry describing the newly written csv file
  pkg_info$resources <- c(pkg_info$resources,
    list(list(
      path = csv_name,
      name = name,
      schema = make_schema(data)
    ))
  )
  json <- jsonlite::toJSON(pkg_info, pretty = TRUE, auto_unbox = TRUE)
  writeLines(json, json_path)
  invisible(json_path)
}

# Build a JSON table schema (list with a 'fields' member containing one
# name/type pair per column) for a data frame.
make_schema <- function(data){
  fields <- lapply(seq_along(data), function(i){
    list(
      name = names(data)[i],
      type = get_type(data[[i]])
    )
  })
  list(fields = fields)
}

# Map an R vector to a JSON table schema type string. Date/POSIXt are
# tested before the numeric checks because such vectors are also numeric;
# anything unrecognized falls back to "string".
get_type <- function(x){
  if(inherits(x, "Date")) return("date")
  if(inherits(x, "POSIXt")) return("datetime")
  if(is.character(x)) return("string")
  if(is.integer(x)) return("integer")
  if(is.numeric(x)) return("number")
  if(is.logical(x)) return("boolean")
  return("string")
}
# --------------------------------------------------------------------------------
/R/old/datapkg_new.R: -------------------------------------------------------------------------------- 1 | #' Data-package 2 | #' 3 | #' Load or initiate a \href{http://dataprotocols.org/data-packages}{data package} for 4 | #' reading / writing data and metadata. A data package can be an R package at the same 5 | #' time. The default format for storing data is 6 | #' \href{http://dataprotocols.org/linear-tsv}{linear-tsv} which is the least 7 | #' ambiguous format and natively supported by R via \code{\link{read.table}} 8 | #' or \code{\link[readr:read_tsv]{readr::read_tsv}}. 9 | #' 10 | #' @aliases datapackage 11 | #' @importFrom tools md5sum 12 | #' @param path root directory of the data package 13 | #' @param verbose emits some debugging messages 14 | #' @examples # Create a data package in a dir 15 | #' pkgdir <- tempfile() 16 | #' dir.create(pkgdir) 17 | #' pkg <- data_package(pkgdir) 18 | #' 19 | #' # Show methods 20 | #' print(pkg) 21 | #' 22 | #' # Examples 23 | #' pkg$author("Jerry", "jerry@gmail.com") 24 | #' pkg$resources$add(iris) 25 | #' pkg$sources$add("Fisher, R. A. (1936)") 26 | #' 27 | #' # View json file 28 | #' pkg$json() 29 | #' 30 | #' # Parse data 31 | #' pkg$resources$read("iris") 32 | datapkg_new <- function(path = ".", verbose = TRUE){ 33 | pkg_file <- function(x, exists = TRUE) { 34 | normalizePath(file.path(path, x), mustWork = exists && !is_url(x)) 35 | } 36 | 37 | pkg_json <- function(){ 38 | pkg_file("datapackage.json") 39 | } 40 | 41 | pkg_read <- function(){ 42 | from_json(pkg_json()) 43 | } 44 | 45 | pkg_update <- function(...){ 46 | meta <- pkg_read() 47 | args <- list(...) 
48 | for(i in seq_along(args)){ 49 | key <- names(args[i]) 50 | meta[[key]] = args[[i]] 51 | } 52 | writeLines(to_json(meta), pkg_json()) 53 | return(meta) 54 | } 55 | 56 | pkg_init <- function(){ 57 | if(file.exists(pkg_file("datapackage.json", FALSE))){ 58 | meta <- pkg_read() 59 | if(verbose) 60 | message("Opening existing datapackage: ", meta$name) 61 | } else { 62 | writeLines("{}", pkg_file("datapackage.json", FALSE)) 63 | pkg_update( 64 | name = basename(normalizePath(path)), 65 | resources = list() 66 | ) 67 | } 68 | } 69 | 70 | # Sources object 71 | pkg_contributors <- function(){ 72 | find <- function(name = "", exact = FALSE){ 73 | data <- Filter(function(x){ 74 | if(isTRUE(exact)){ 75 | return(x$name == name) 76 | } else { 77 | grepl(name, x$name, fixed = TRUE) 78 | } 79 | }, pkg_read()$contributors) 80 | jsonlite:::simplifyDataFrame(data, c("name", "email", "web"), flatten = FALSE, simplifyMatrix = FALSE) 81 | } 82 | add <- function(name, email, web){ 83 | out <- list(name = name) 84 | if(!missing(email)) 85 | out$email = email 86 | if(!missing(web)) 87 | out$web = web 88 | pkg_update(contributors = c(pkg_read()$contributors, list(out))) 89 | find() 90 | } 91 | remove <- function(name){ 92 | stopifnot(is_string(name)) 93 | all <- find(name, exact = TRUE) 94 | if(!nrow(all)) 95 | stop("No source found for: ", name) 96 | pkg_update(contributors = Filter(function(x){ 97 | (x$name != name) 98 | }, pkg_read()$contributors)) 99 | find() 100 | } 101 | lockEnvironment(environment(), TRUE) 102 | structure(environment(), class=c("dpkg-contributors", "jeroen", "environment")) 103 | } 104 | 105 | # Sources object 106 | pkg_sources <- function(){ 107 | find <- function(name = "", exact = FALSE){ 108 | data <- Filter(function(x){ 109 | if(isTRUE(exact)){ 110 | return(x$name == name) 111 | } else { 112 | grepl(name, x$name, fixed = TRUE) 113 | } 114 | }, pkg_read()$sources) 115 | jsonlite:::simplifyDataFrame(data, c("name", "email", "web"), flatten = FALSE, 
simplifyMatrix = FALSE) 116 | } 117 | add <- function(name, email, web){ 118 | out <- list(name = name) 119 | if(!missing(email)) 120 | out$email = email 121 | if(!missing(web)) 122 | out$web = web 123 | pkg_update(sources = c(pkg_read()$sources, list(out))) 124 | find() 125 | } 126 | remove <- function(name){ 127 | stopifnot(is_string(name)) 128 | all <- find(name, exact = TRUE) 129 | if(!nrow(all)) 130 | stop("No source found for: ", name) 131 | pkg_update(sources = Filter(function(x){ 132 | (x$name != name) 133 | }, pkg_read()$sources)) 134 | find() 135 | } 136 | lockEnvironment(environment(), TRUE) 137 | structure(environment(), class=c("datapkg-sources", "jeroen", "environment")) 138 | } 139 | 140 | # Resources object 141 | pkg_resources <- function(){ 142 | find <- function(name = "", folder = NULL){ 143 | data <- Filter(function(x){ 144 | res_path <- paste0("", x$path) 145 | res_name <- paste0("", x$name) 146 | if(length(folder) && !(grepl(paste0("^", folder, "/"), res_path))) 147 | return(FALSE) 148 | grepl(name, res_name, fixed = TRUE) 149 | }, pkg_read()$resources) 150 | for(i in seq_along(data)){ 151 | data[[i]]$read = function(){ 152 | target <- data[[i]] 153 | read_data_package(pkg_file(target$path), dialect = target$dialect, hash = target$hash, target$schema) 154 | } 155 | } 156 | jsonlite:::simplifyDataFrame(data, c("name", "path", "format", "read"), flatten = FALSE, simplifyMatrix = FALSE) 157 | } 158 | info <- function(name){ 159 | data <- Filter(function(x){ 160 | (x$name == name) 161 | }, pkg_read()$resources) 162 | if(!length(data)) 163 | stop("Resource not found: ", name) 164 | data[[1]] 165 | } 166 | add <- function(data, name, folder = "data", format = "csv"){ 167 | stopifnot(is.data.frame(data)) 168 | if(missing(name)) 169 | name <- deparse(substitute(data)) 170 | format <- match.arg(format) 171 | if(nrow(find(name))) 172 | stop("Resource with name '", name, "' already exists.") 173 | file_name <- paste(name, format, sep = ".") 174 | 
file_path <- file.path(folder, file_name) 175 | abs_path <- pkg_file(file_path, exists = FALSE) 176 | dir.create(pkg_file(folder, exists = FALSE), showWarnings = FALSE) 177 | write_data <- prepare_data(data) 178 | readr::write_delim(write_data, abs_path, delim = ";", col_names = TRUE) 179 | hash <- tools::md5sum(abs_path) 180 | rec <- base::list( 181 | name = name, 182 | path = file_path, 183 | format = "tsv", 184 | hash = unname(hash), 185 | schema = make_schema(data), 186 | dialect = base::list( 187 | header = TRUE, 188 | delimiter = ";" 189 | ) 190 | ) 191 | pkg_update(resources = c(pkg_read()$resources, base::list(rec))) 192 | find() 193 | } 194 | remove <- function(name, folder = "data"){ 195 | stopifnot(is_string(name)) 196 | target <- info(name) 197 | unlink(pkg_file(target$path)) 198 | pkg_update(resources = Filter(function(x){ 199 | (x$name != name) 200 | }, pkg_read()$resources)) 201 | find() 202 | } 203 | read <- function(name){ 204 | target <- info(name) 205 | data_path <- pkg_file(target$path) 206 | read_data_package(data_path, dialect = target$dialect, hash = target$hash, target$schema) 207 | } 208 | lockEnvironment(environment(), TRUE) 209 | structure(environment(), class=c("datapkg-resources", "jeroen", "environment")) 210 | } 211 | 212 | # Exported methods 213 | pkg_init() 214 | self <- local({ 215 | sources <- pkg_sources() 216 | resources <- pkg_resources() 217 | contributors <- pkg_contributors() 218 | name <- function(x){ 219 | if(!missing(x)) 220 | pkg_update(name = x) 221 | pkg_read()$name 222 | } 223 | license <- function(type, url){ 224 | if(!missing(type)){ 225 | if(!missing(url)){ 226 | pkg_update(license = list( 227 | type = type, 228 | url = url 229 | )) 230 | } else { 231 | pkg_update(license = type) 232 | } 233 | } 234 | pkg_read()$license 235 | } 236 | author <- function(name, email, web){ 237 | if(!missing(name)){ 238 | out <- list(name = name) 239 | if(!missing(email)) 240 | out$email = email 241 | if(!missing(web)) 242 | out$web = 
web 243 | pkg_update(author = out) 244 | } 245 | pkg_read()$author 246 | } 247 | description <- function(x){ 248 | if(!missing(x)) 249 | pkg_update(description = x) 250 | pkg_read()$description 251 | } 252 | homepage <- function(x){ 253 | if(!missing(x)) 254 | pkg_update(homepage = x) 255 | pkg_read()$homepage 256 | } 257 | version <- function(x){ 258 | if(!missing(x)) 259 | pkg_update(version = x) 260 | pkg_read()$version 261 | } 262 | json <- function(){ 263 | str <- paste(readLines(pkg_json()), collapse = "\n") 264 | structure(str, class = "json") 265 | } 266 | lockEnvironment(environment(), TRUE) 267 | structure(environment(), class=c("dpkg", "jeroen", "environment")) 268 | }) 269 | } 270 | 271 | prepare_data <- function(data){ 272 | for(i in seq_along(data)){ 273 | if(is.logical(data[[i]])){ 274 | out <- ifelse(data[[i]], "true", "false") 275 | out[is.na(data[[i]])] <- "" 276 | data[[i]] <- out 277 | } 278 | } 279 | data 280 | } 281 | 282 | make_schema <- function(data){ 283 | out <- as.list(rep(NA, length(data))) 284 | for(i in seq_along(data)){ 285 | out[[i]] <- list( 286 | name = names(data)[i], 287 | type = get_type(data[[i]]) 288 | ) 289 | } 290 | list(fields = out) 291 | } 292 | 293 | from_json <- function(path){ 294 | path <- normalizePath(path, mustWork = TRUE) 295 | jsonlite::fromJSON(readLines(path, warn = FALSE), simplifyVector = FALSE) 296 | } 297 | 298 | to_json <- function(x){ 299 | jsonlite::toJSON(x, auto_unbox = TRUE, pretty = TRUE) 300 | } 301 | 302 | is_string <- function(x){ 303 | is.character(x) && identical(length(x), 1L) 304 | } 305 | 306 | is_url <- function(x){ 307 | grepl("^[a-zA-Z]+://", x) 308 | } 309 | 310 | 311 | # Implements: http://dataprotocols.org/json-table-schema/#schema 312 | coerse_type <- function(x, type){ 313 | switch(type, 314 | string = as.character(x), 315 | number = as.numeric(x), 316 | integer = as.integer(x), 317 | boolean = parse_bool(x), 318 | object = lapply(x, from_json), 319 | array = lapply(x, from_json), 
320 | date = parse_date(x), 321 | datetime = parse_datetime(x), 322 | time = paste_time(x), 323 | as.character(x) 324 | ) 325 | } 326 | 327 | get_type <- function(x){ 328 | if(inherits(x, "Date")) return("date") 329 | if(inherits(x, "POSIXt")) return("datetime") 330 | if(is.character(x)) return("string") 331 | if(is.integer(x)) return("integer") 332 | if(is.numeric(x)) return("number") 333 | if(is.logical(x)) return("boolean") 334 | return("string") 335 | } 336 | 337 | parse_bool <- function(x){ 338 | is_true <- (x %in% c("yes", "y", "true", "t", "1")) 339 | is_false <- (x %in% c("no", "n", "false", "f", "0")) 340 | is_na <- is.na(x) | (x %in% c("NA", "na", "")) 341 | is_none <- (!is_true & !is_false & !is_na) 342 | if(any(is_none)) 343 | stop("Failed to parse boolean values: ", paste(head(x[is_none], 5), collapse = ", ")) 344 | out <- rep(FALSE, length(x)) 345 | out[is_na] <- NA 346 | out[is_true] <- TRUE 347 | out 348 | } 349 | 350 | parse_date <- function(x){ 351 | as.Date(x) 352 | } 353 | 354 | parse_datetime <- function(x){ 355 | as.POSIXct(x) 356 | } 357 | 358 | paste_time <- function(x){ 359 | as.POSIXct(x) 360 | } 361 | 362 | -------------------------------------------------------------------------------- /R/old/print.R: -------------------------------------------------------------------------------- 1 | # A poor man's oo system. 2 | 3 | #' @export 4 | print.jeroen <- function(x, title = paste0("<", is(x), ">"), indent = 0, ...){ 5 | ns <- ls(x) 6 | if(length(title)) cat(title, "\n") 7 | lapply(ns, function(fn){ 8 | if(is.function(x[[fn]])){ 9 | cat(format_function(x[[fn]], fn, indent = indent), sep = "\n") 10 | } else { 11 | cat(" $", fn, ":\n", sep = "") 12 | print(x[[fn]], title = NULL, indent = indent + 2L) 13 | } 14 | }) 15 | invisible(x) 16 | } 17 | 18 | #' @export 19 | `$.jeroen` <- function(x, y){ 20 | if(!exists(y, x, inherits = FALSE)){ 21 | stop("Class '", is(x), "' has no field '", y, "'", call. 
= FALSE) 22 | } 23 | get(y, x, inherits = FALSE) 24 | } 25 | 26 | #' @export 27 | `[[.jeroen` <- `$.jeroen` 28 | 29 | #' @export 30 | `[.jeroen` <- `$.jeroen` 31 | 32 | # Pretty format function headers 33 | format_function <- function(fun, name = deparse(substitute(fun)), indent = 0){ 34 | #header <- sub("\\{$", "", capture.output(fun)[1]) 35 | header <- head(deparse(args(fun)), -1) 36 | header <- sub("^[ ]*", " ", header) 37 | header[1] <- sub("^[ ]*function ?", paste0(" $", name), header[1]) 38 | paste(c(rep(" ", indent), header), collapse = "") 39 | } 40 | 41 | # Override default call argument. 42 | stop <- function(..., call. = FALSE){ 43 | base::stop(..., call. = call.) 44 | } 45 | 46 | # Override default call argument. 47 | warning <- function(..., call. = FALSE){ 48 | base::warning(..., call. = call.) 49 | } 50 | -------------------------------------------------------------------------------- /README-NOT.md: -------------------------------------------------------------------------------- 1 | ## Data Package in R 2 | 3 | [![Project Status: Inactive – The project has reached a stable, usable state but is no longer being actively developed; support/maintenance will be provided as time allows.](http://www.repostatus.org/badges/latest/inactive.svg)](http://www.repostatus.org/#inactive) 4 | 5 | Data-packages is a [standard format](http://frictionlessdata.io/data-packages/) for describing meta-data for a collection of datasets. The package `datapkg` provides convenience functions for retrieving and parsing data packages in R. To install in R: 6 | 7 | ```r 8 | library(devtools) 9 | install_github("hadley/readr") 10 | install_github("ropenscilabs/jsonvalidate") 11 | install_github("ropenscilabs/datapkg") 12 | ``` 13 | 14 | ## Reading data 15 | 16 | The `datapkg_read` function retrieves and parses data packages from a local or remote sources. 
A few example packages are available from the [datasets](https://github.com/datasets) and [testsuite-py](https://github.com/frictionlessdata/testsuite-py) repositories. The path needs to point to a directory on disk or git remote or URL containing the root of the data package. 17 | 18 | ```r 19 | # Load client 20 | library(datapkg) 21 | 22 | # Clone via git 23 | cities <- datapkg_read("git://github.com/datasets/world-cities") 24 | 25 | # Same data but download over http 26 | cities <- datapkg_read("https://raw.githubusercontent.com/datasets/world-cities/master") 27 | ``` 28 | 29 | The output object contains data and metadata from the data-package, with actual datasets inside the `$data` field. 30 | 31 | ```r 32 | # Package info 33 | print(cities) 34 | 35 | # Open actual data in RStudio Viewer 36 | View(cities$data[[1]]) 37 | ``` 38 | 39 | In the case of multiple datasets, each one is either referenced by index or, if available, by name (names are optional in data packages). 40 | 41 | ```r 42 | # Package with many datasets 43 | euribor <- datapkg_read("https://raw.githubusercontent.com/datasets/euribor/master") 44 | 45 | # List datasets in this package 46 | names(euribor$data) 47 | View(euribor$data[[1]]) 48 | ``` 49 | 50 | ## Writing data 51 | 52 | The package also has basic functionality to save a data frame into a data package and 53 | update the `datapackage.json` file accordingly. 54 | 55 | ```r 56 | # Create new data package 57 | pkgdir <- tempfile() 58 | datapkg_write(mtcars, path = pkgdir) 59 | datapkg_write(iris, path = pkgdir) 60 | 61 | # Read it back 62 | mypkg <- datapkg_read(pkgdir) 63 | print(mypkg$data$mtcars) 64 | ``` 65 | 66 | From here you can modify the `datapackage.json` file with other metadata. 67 | 68 | ## Status 69 | 70 | This package is work in progress. Current open issues: 71 | 72 | - Make `readr` parse `0`/`1` values for booleans: [PR#406](https://github.com/hadley/readr/pull/406) 73 | - Support "year only" dates (`%Y`). 
Not sure if this constitutes a valid date actually: [PR#407](https://github.com/hadley/readr/pull/407) 74 | - R and `readr` require specifying which strings are interpreted as missing values. Defaults are the empty string `""` and `NA`. A similar property needs to be defined in the spec. 75 | - It is unclear what to do with parsing errors, or if the fields in `datapackage.json` do not match the csv data. Examples: [s-and-p-500](https://github.com/datasets/s-and-p-500) and [currency-codes](https://raw.githubusercontent.com/frictionlessdata/testsuite-py/master/datasets/currency-codes) 76 | 77 | Features: 78 | 79 | - Writing data packages from data frames. 80 | 81 | [![rOpenSci](http://ropensci.org/public_images/github_footer.png)](http://ropensci.org) 82 | [![OKFN](http://assets.okfn.org/p/labs/img/logo.png)](https://okfn.org) 83 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # datapkg 2 | 3 | [![Project Status: Abandoned](https://www.repostatus.org/badges/latest/abandoned.svg)](https://www.repostatus.org/#abandoned) 4 | 5 | This repository has been archived. The former README is now in [README-NOT.md](README-NOT.md)
6 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | # DO NOT CHANGE the "init" and "install" sections below 2 | 3 | # Download script file from GitHub 4 | init: 5 | ps: | 6 | $ErrorActionPreference = "Stop" 7 | Invoke-WebRequest http://raw.github.com/krlmlr/r-appveyor/master/scripts/appveyor-tool.ps1 -OutFile "..\appveyor-tool.ps1" 8 | Import-Module '..\appveyor-tool.ps1' 9 | 10 | install: 11 | ps: Bootstrap 12 | 13 | # Adapt as necessary starting from here 14 | 15 | build_script: 16 | - travis-tool.sh install_deps 17 | 18 | test_script: 19 | - travis-tool.sh run_tests 20 | 21 | on_failure: 22 | - 7z a failure.zip *.Rcheck\* 23 | - appveyor PushArtifact failure.zip 24 | 25 | artifacts: 26 | - path: '*.Rcheck\**\*.log' 27 | name: Logs 28 | 29 | - path: '*.Rcheck\**\*.out' 30 | name: Logs 31 | 32 | - path: '*.Rcheck\**\*.fail' 33 | name: Logs 34 | 35 | - path: '*.Rcheck\**\*.Rout' 36 | name: Logs 37 | 38 | - path: '\*_*.tar.gz' 39 | name: Bits 40 | 41 | - path: '\*_*.zip' 42 | name: Bits 43 | -------------------------------------------------------------------------------- /datapkg.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace 22 | -------------------------------------------------------------------------------- /inst/tabular-data-package.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-04/schema#", 3 | "title": "Tabular Data Package", 4 | "description": "Tabular Data Package is a simple specification for data access and delivery of tabular data.", 5 | "type": "object", 6 | "required": [ "name", "resources" ], 7 | "properties": { 8 | "name": { 9 | "$ref": "definitions.json#/define/name", 10 | "propertyOrder": 10 11 | }, 12 | "title": { 13 | "$ref": "definitions.json#/define/title", 14 | "propertyOrder": 20 15 | }, 16 | "description": { 17 | "$ref": "definitions.json#/define/description", 18 | "format": "textarea", 19 | "propertyOrder": 30 20 | }, 21 | "homepage": { 22 | "$ref": "definitions.json#/define/homepage", 23 | "propertyOrder": 40 24 | }, 25 | "version": { 26 | "$ref": "definitions.json#/define/version", 27 | "propertyOrder": 50 28 | }, 29 | "license": { 30 | "$ref": "definitions.json#/define/license", 31 | "propertyOrder": 60 32 | }, 33 | "author": { 34 | "$ref": "definitions.json#/define/author", 35 | "propertyOrder": 70 36 | }, 37 | "contributors": { 38 | "$ref": "definitions.json#/define/contributors", 39 | "propertyOrder": 80, 40 | "options": { "hidden": true } 41 | }, 42 | "resources": { 43 | "title": "Resources", 44 | "description": "The data resources that this package describes.", 45 | "type": "array", 46 | "propertyOrder": 90, 47 | "minItems": 0, 48 | "items": { 49 | "type": "object", 50 | "properties": { 51 | "name": { 52 | "$ref": "definitions.json#/define/name", 53 | "propertyOrder": 10 54 | }, 55 | "title": { 56 | "$ref": "definitions.json#/define/title", 57 | "propertyOrder": 20 58 | }, 59 | "description": { 60 | "$ref": "definitions.json#/define/description", 61 | "propertyOrder": 30, 62 | "format": "textarea" 63 | }, 64 | "schema": { 65 | "$ref": "definitions.json#/define/schema", 66 | "propertyOrder": 40 67 | }, 68 | "url": { 69 | "$ref": "definitions.json#/define/url", 70 | "propertyOrder": 
50 71 | }, 72 | "path": { 73 | "$ref": "definitions.json#/define/path", 74 | "propertyOrder": 60 75 | }, 76 | "data": { 77 | "$ref": "definitions.json#/define/data", 78 | "propertyOrder": 70 79 | }, 80 | "format": { 81 | "$ref": "definitions.json#/define/format", 82 | "propertyOrder": 80 83 | }, 84 | "mediatype": { 85 | "$ref": "definitions.json#/define/mediatype", 86 | "propertyOrder": 90 87 | }, 88 | "encoding": { 89 | "$ref": "definitions.json#/define/encoding", 90 | "propertyOrder": 100 91 | }, 92 | "bytes": { 93 | "$ref": "definitions.json#/define/bytes", 94 | "propertyOrder": 110, 95 | "options": { "hidden": true } 96 | }, 97 | "hash": { 98 | "$ref": "definitions.json#/define/hash", 99 | "propertyOrder": 120, 100 | "options": { "hidden": true } 101 | }, 102 | "dialect": { 103 | "$ref": "definitions.json#/define/dialect", 104 | "propertyOrder": 130, 105 | "options": { "hidden": true } 106 | }, 107 | "sources": { 108 | "$ref": "definitions.json#/define/sources", 109 | "propertyOrder": 140, 110 | "options": { "hidden": true } 111 | }, 112 | "license": { 113 | "$ref": "definitions.json#/define/license", 114 | "description": "The license under which the resource is published.", 115 | "propertyOrder": 150, 116 | "options": { "hidden": true } 117 | } 118 | }, 119 | "anyOf": [ 120 | { "title": "url required", "required": ["url"] }, 121 | { "title": "path required", "required": ["path"] }, 122 | { "title": "data required", "required": ["data"] } 123 | ] 124 | } 125 | }, 126 | "keywords": { 127 | "$ref": "definitions.json#/define/keywords", 128 | "propertyOrder": 100 129 | }, 130 | "sources": { 131 | "$ref": "definitions.json#/define/sources", 132 | "propertyOrder": 110, 133 | "options": { "hidden": true } 134 | }, 135 | "image": { 136 | "$ref": "definitions.json#/define/image", 137 | "propertyOrder": 120, 138 | "options": { "hidden": true } 139 | }, 140 | "base": { 141 | "$ref": "definitions.json#/define/base", 142 | "propertyOrder": 130, 143 | "options": { "hidden": 
true } 144 | }, 145 | "dataDependencies": { 146 | "$ref": "definitions.json#/define/dataDependencies", 147 | "propertyOrder": 140, 148 | "options": { "hidden": true } 149 | } 150 | } 151 | } 152 | -------------------------------------------------------------------------------- /man/datapackage.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/datapkg_read.R, R/datapkg_validate.R, R/datapkg_write.R 3 | \name{datapackage} 4 | \alias{datapackage} 5 | \alias{datapkg} 6 | \alias{datapkg_read} 7 | \alias{datapkg_validate} 8 | \alias{datapkg_write} 9 | \title{Read/write data-package} 10 | \usage{ 11 | datapkg_read(path = getwd()) 12 | 13 | datapkg_validate(path = getwd()) 14 | 15 | datapkg_write(data, name, path = getwd()) 16 | } 17 | \arguments{ 18 | \item{path}{file path or URL to the data package directory} 19 | 20 | \item{data}{a data frame to be added to the package} 21 | 22 | \item{name}{what to name this dataset} 23 | } 24 | \description{ 25 | Read and write data frames to/from 'data-package' format. For reading 26 | supported paths are disk, http or git. For writing only disk is supported. 27 | } 28 | \examples{ 29 | # Create new data package 30 | pkgdir <- tempfile() 31 | datapkg_write(mtcars, path = pkgdir) 32 | datapkg_write(iris, path = pkgdir) 33 | 34 | # Read it back 35 | mypkg <- datapkg_read(pkgdir) 36 | print(mypkg$data$mtcars) 37 | 38 | # Clone package with git: 39 | cities <- datapkg_read("git://github.com/datasets/world-cities") 40 | 41 | # Read over http 42 | euribor <- datapkg_read("https://raw.githubusercontent.com/datasets/euribor/master") 43 | } 44 | \references{ 45 | \url{http://frictionlessdata.io/data-packages}, \url{https://github.com/datasets} 46 | } 47 | 48 | --------------------------------------------------------------------------------