├── .Rbuildignore ├── .all-contributorsrc ├── .dir-locals.el ├── .github ├── .gitignore └── workflows │ ├── main.yml │ └── pkgdown.yaml ├── .gitignore ├── .projectile ├── DESCRIPTION ├── LICENSE ├── LICENSE.md ├── NAMESPACE ├── R ├── clean.R ├── ons.R ├── util.R └── utils-pipe.R ├── README.md ├── THFOpenDataPipeline.Rproj ├── _pkgdown.yml ├── man ├── figures │ ├── logo.png │ ├── monstR_2.png │ └── monstR_sticker.png ├── generate_download_filename.Rd ├── monstr_clean.Rd ├── monstr_data.Rd ├── monstr_pipeline_defaults.Rd ├── monstr_read_file.Rd ├── monstr_write_clean.Rd ├── ons_api_call.Rd ├── ons_available_datasets.Rd ├── ons_available_editions.Rd ├── ons_available_versions.Rd ├── ons_dataset_by_id.Rd ├── ons_datasets_setup.Rd ├── ons_download.Rd ├── pipe.Rd ├── safe_download.Rd ├── write_csv.Rd ├── write_metadata.Rd ├── write_rds.Rd └── write_xlsx.Rd ├── pkgdown └── favicon │ ├── apple-touch-icon-120x120.png │ ├── apple-touch-icon-152x152.png │ ├── apple-touch-icon-180x180.png │ ├── apple-touch-icon-60x60.png │ ├── apple-touch-icon-76x76.png │ ├── apple-touch-icon.png │ ├── favicon-16x16.png │ ├── favicon-32x32.png │ └── favicon.ico └── vignettes ├── .gitignore ├── merged_table.PNG ├── mortality-rate-using-population-and-deaths.Rmd ├── mortality_by_region.png ├── pipeline.Rmd ├── pop_deaths.PNG ├── pop_deaths_aggregate.PNG └── pop_table.PNG /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^Meta$ 2 | ^doc$ 3 | ^LICENSE\.md$ 4 | ^\.dir-locals\.el$ 5 | ^\.github/.*$ 6 | ^.projectile$ 7 | ^.*\.Rproj$ 8 | ^\.Rproj\.user$ 9 | ^_pkgdown\.yml$ 10 | ^docs$ 11 | ^pkgdown$ 12 | ^\.github$ 13 | -------------------------------------------------------------------------------- /.all-contributorsrc: -------------------------------------------------------------------------------- 1 | { 2 | "files": [ 3 | "README.md" 4 | ], 5 | "imageSize": 100, 6 | "commit": false, 7 | "contributors": [ 8 | { 9 | "login": "emmavestesson", 10 | "name": "Emma Vestesson", 11 | "avatar_url": "https://avatars2.githubusercontent.com/u/31949401?v=4", 12 | "profile": "https://emmavestesson.netlify.com/", 13 | "contributions": [ 14 | "ideas", 15 | "content", 16 | "doc" 17 | ] 18 | }, 19 | { 20 | "login": "tomjemmett", 21 | "name": "Tom Jemmett", 22 | "avatar_url": "https://avatars1.githubusercontent.com/u/12023696?v=4", 23 | "profile": "https://www.strategyunitwm.nhs.uk/", 24 | "contributions": [ 25 | "bug" 26 | ] 27 | }, 28 | { 29 | "login": "JohnHC86", 30 | "name": "JohnHC86", 31 | "avatar_url": "https://avatars1.githubusercontent.com/u/12610020?v=4", 32 | "profile": "https://github.com/JohnHC86", 33 | "contributions": [ 34 | "bug" 35 | ] 36 | }, 37 | { 38 | "login": "sw1nn", 39 | "name": "Neale Swinnerton", 40 | "avatar_url": "https://avatars1.githubusercontent.com/u/373335?v=4", 41 | "profile": "http://sw1nn.com", 42 | "contributions": [ 43 | "code" 44 | ] 45 | }, 46 | { 47 | "login": "fiona-grimm", 48 | "name": "fiona-grimm", 49 | "avatar_url": "https://avatars1.githubusercontent.com/u/31844347?v=4", 50 | "profile": "https://github.com/fiona-grimm", 51 | "contributions": [ 52 | "ideas", 53 | "design" 54 | ] 55 | }, 56 | { 57 | "login": "SimonCRUK", 58 | "name": "SimonCRUK", 59 | "avatar_url": "https://avatars2.githubusercontent.com/u/58686505?v=4", 60 | "profile": "https://github.com/SimonCRUK", 61 | "contributions": [ 62 | "bug" 63 | ] 64 | }, 65 | { 66 | "login": "Lextuga007", 67 | "name": "Zoe Turner", 68 | "avatar_url": 
"https://avatars0.githubusercontent.com/u/39963221?v=4", 69 | "profile": "https://github.com/Lextuga007", 70 | "contributions": [ 71 | "bug" 72 | ] 73 | } 74 | ], 75 | "contributorsPerLine": 7, 76 | "projectName": "monstR", 77 | "projectOwner": "HFAnalyticsLab", 78 | "repoType": "github", 79 | "repoHost": "https://github.com", 80 | "skipCi": true 81 | } 82 | -------------------------------------------------------------------------------- /.dir-locals.el: -------------------------------------------------------------------------------- 1 | ;;; Directory Local Variables 2 | ;;; For more information see (info "(emacs) Directory Variables") 3 | 4 | ((ess-mode 5 | (comment-column . 0))) 6 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - master 5 | pull_request: 6 | branches: 7 | - master 8 | 9 | name: R-CMD-check 10 | 11 | jobs: 12 | R-CMD-check: 13 | runs-on: macOS-latest 14 | steps: 15 | - uses: actions/checkout@v2 16 | - uses: r-lib/actions/setup-r@master 17 | - name: Install dependencies 18 | run: | 19 | install.packages(c("remotes", "rcmdcheck")) 20 | remotes::install_deps(dependencies = TRUE) 21 | shell: Rscript {0} 22 | - name: Check 23 | run: rcmdcheck::rcmdcheck(args = "--no-manual", error_on = "error") 24 | shell: Rscript {0} 25 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - main 5 | - master 6 | 7 | name: pkgdown 8 | 9 | jobs: 10 | pkgdown: 11 | runs-on: macOS-latest 12 | env: 13 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 14 | steps: 15 | - uses: actions/checkout@v2 16 | 17 | - uses: r-lib/actions/setup-r@v1 18 | 19 | - uses: r-lib/actions/setup-pandoc@v1 20 | 21 | - name: Query dependencies 22 | run: | 23 | install.packages('remotes') 24 | saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) 25 | writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") 26 | shell: Rscript {0} 27 | 28 | - name: Cache R packages 29 | uses: actions/cache@v2 30 | with: 31 | path: ${{ env.R_LIBS_USER }} 32 | key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} 33 | restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- 34 | 35 | - name: Install dependencies 36 | run: | 37 | remotes::install_deps(dependencies = TRUE) 38 | install.packages("pkgdown", type = "binary") 39 | shell: Rscript {0} 40 | 41 | - name: Install package 42 | run: R CMD INSTALL . 
43 | 44 | - name: Deploy package 45 | run: | 46 | git config --local user.email "actions@github.com" 47 | git config --local user.name "GitHub Actions" 48 | Rscript -e 'pkgdown::deploy_to_branch(new_process = FALSE)' 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | Meta 2 | doc 3 | inst/doc 4 | # History files 5 | .Rhistory 6 | .Rapp.history 7 | 8 | # Session Data files 9 | .RData 10 | # User-specific files 11 | .Ruserdata 12 | # Example code in package build process 13 | *-Ex.R 14 | # Output files from R CMD build 15 | /*.tar.gz 16 | # Output files from R CMD check 17 | /*.Rcheck/ 18 | # RStudio files 19 | .Rproj.user/ 20 | # produced vignettes 21 | vignettes/*.html 22 | vignettes/*.pdf 23 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 24 | .httr-oauth 25 | # knitr and R markdown default cache directories 26 | *_cache/ 27 | /cache/ 28 | # Temporary files created by R markdown 29 | *.utf8.md 30 | *.knit.md 31 | # R Environment Variables 32 | .Renviron 33 | docs 34 | -------------------------------------------------------------------------------- /.projectile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HFAnalyticsLab/monstR/4f428e0ea5f896108e3ac78488d50a33edc8af65/.projectile -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: monstR 2 | Title: Download publically available data the ONS API 3 | Version: 0.0.0.9000 4 | Authors@R: 5 | c(person(given = "Neale", 6 | family = "Swinnerton", 7 | role = c("aut"), 8 | email = "neale@mastodonc.com"), 9 | person(given = "Emma", 10 | family = "Vestesson", 11 | role = c("cre", "ctb"), 12 | email = "emma.vestesson@gmail.com", 13 | comment = c(ORCID = "0000-0002-7284-9172"))) 14 | Description: Queries ONS API to download data. It can be used to retrieve publically available data and meta data from the ONS. 15 | License: MIT + file LICENSE 16 | Encoding: UTF-8 17 | LazyData: true 18 | Roxygen: list(markdown = TRUE) 19 | RoxygenNote: 7.1.1 20 | Imports: 21 | magrittr, 22 | logger, 23 | jsonlite, 24 | usethis, 25 | curl, 26 | purrr, 27 | dplyr, 28 | whisker, 29 | here, 30 | readr, 31 | janitor, 32 | readxl, 33 | writexl 34 | Suggests: 35 | knitr, 36 | rmarkdown 37 | VignetteBuilder: knitr 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2020 The Health Foundation Analytics Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2020 The Health Foundation Analytics Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export("%>%") 4 | export(monstr_clean) 5 | export(monstr_data) 6 | export(monstr_pipeline_defaults) 7 | export(monstr_read_file) 8 | export(monstr_write_clean) 9 | export(ons_available_datasets) 10 | export(ons_available_editions) 11 | export(ons_available_versions) 12 | export(ons_dataset_by_id) 13 | export(ons_datasets_setup) 14 | export(ons_download) 15 | import(dplyr) 16 | import(here) 17 | import(janitor) 18 | import(jsonlite) 19 | import(logger) 20 | import(readr) 21 | import(readxl) 22 | import(whisker) 23 | import(writexl) 24 | importFrom(curl,curl_download) 25 | importFrom(magrittr,"%>%") 26 | importFrom(readr,write_csv) 27 | importFrom(readr,write_rds) 28 | importFrom(writexl,write_xlsx) 29 | -------------------------------------------------------------------------------- /R/clean.R: -------------------------------------------------------------------------------- 1 | ##' @title Create the MONSTR defaults 2 | ##' @param download_root Root of directory hierarchy. 
3 | ##' @return an augmented metadata 4 | ##' @author Neale Swinnerton 5 | ##' @export 6 | ##' @import here 7 | monstr_pipeline_defaults <- function(download_root="") { 8 | basedir <- "{{download_root}}/data" 9 | filepath <- "{{datasource}}/{{dataset}}/{{edition}}/{{dataset}}-v{{version}}.{{format}}" 10 | 11 | metadata <- list() 12 | metadata$download_filename_template = sprintf("%s/raw/%s", 13 | basedir, 14 | filepath) 15 | metadata$clean_filename_template = sprintf("%s/clean/%s", 16 | basedir, 17 | filepath) 18 | metadata$create_latest_symlink <- FALSE 19 | if (missing(download_root)) { 20 | metadata$download_root = here::here() # TODO here supposedly for 21 | # interactive use? 22 | } 23 | metadata$download_root = download_root 24 | metadata 25 | } 26 | 27 | ##' @title Read the file described by the metadata 28 | ##' @param metadata description of the downloaded file. 29 | ##' @return a metadata incorporating the data. The actually data can then be 30 | ##' extracted with \code{\link{monstr_data}} 31 | ##' @author Neale Swinnerton 32 | ##' @export 33 | ##' @import readr 34 | ##' @import readxl 35 | monstr_read_file <- function(metadata) { 36 | monstr <- metadata$monstr 37 | 38 | if (monstr$format == "csv") { 39 | metadata$monstr_data <- readr::read_csv(metadata$monstr$destfile) 40 | } else if (monstr$format %in% c("xls", "xlsx")) { 41 | metadata$monstr_data <- readxl::read_excel(metadata$monstr$destfile) 42 | } 43 | metadata$monstr <- monstr 44 | metadata 45 | } 46 | 47 | ##' @title Clean the data according to MONSTR rules. 48 | ##' @param metadata description the downloaded file. 49 | ##' @return description of the cleaned data 50 | ##' @author Neale Swinnerton 51 | ##' @export 52 | ##' @import janitor 53 | monstr_clean <- function(metadata) { 54 | metadata$monstr_data <- janitor::clean_names(metadata$monstr_data) 55 | metadata$monstr$is_clean <- TRUE 56 | metadata 57 | } 58 | 59 | ##' Extract the tibble of the actual data 60 | ##' 61 | ##' @title Get the Data 62 | ##' @param metadata description of the downloaded data 63 | ##' @return a \code{\link[tibble]{dplyr::tibble}} of the data from the 64 | ##' described download 65 | ##' @author Neale Swinnerton 66 | ##' @export 67 | monstr_data <- function(metadata) { 68 | metadata$monstr_data 69 | } 70 | 71 | ##' @title Writes the data to the 'clean' area 72 | ##' @param metadata description of the data. 73 | ##' @param format any known format or "all" to save a copy as all 74 | ##' known formats 75 | ##' @param create_directory boolean indicating whether directories 76 | ##' should be created. 77 | ##' @return a boolean indicating success 78 | ##' @author Neale Swinnerton 79 | ##' @export 80 | ##' @import logger 81 | ##' @importFrom readr write_csv write_rds 82 | ##' @importFrom writexl write_xlsx 83 | monstr_write_clean <- function(metadata, 84 | format="csv", 85 | create_directory=TRUE) { 86 | success <- TRUE 87 | monstr <- metadata$monstr 88 | 89 | if (monstr$is_clean) { 90 | 91 | data <- metadata$monstr_data 92 | csv <- format == "csv" 93 | xls <- format %in% c("xls", "xlsx") 94 | rds <- format == "rds" 95 | 96 | if (format == "all") { 97 | csv <- TRUE 98 | xls <- TRUE 99 | rds <- TRUE 100 | } 101 | 102 | # TODO - should success be a logical vector indicating which 103 | # have succeeded? 
104 | if (csv) { 105 | success <- success && write_csv(data, monstr, create_directory) 106 | } 107 | 108 | if (xls) { 109 | success <- success && write_xlsx(data, monstr, create_directory) 110 | } 111 | 112 | if (rds) { 113 | success <- success && write_rds(data, monstr, create_directory) 114 | } 115 | } else { 116 | logger::log_warn("Data has not been cleaned. NOT writing") 117 | success <- FALSE 118 | } 119 | 120 | success 121 | } 122 | -------------------------------------------------------------------------------- /R/ons.R: -------------------------------------------------------------------------------- 1 | 2 | api_base_url <- "https://api.beta.ons.gov.uk/v1/datasets" 3 | 4 | ## START TODO - make these fns more general? 5 | ## Something like this (but this example doesn't work): 6 | ## ons_get_item_by <- function(df, name, value) { 7 | ## df$items[df$items[name] %>% detect_index(~ . == value)] 8 | ## } 9 | 10 | ## TODO - fix weirdness here - should be able to df$items %>% 11 | ## filter(...) rather than this detect_index but some type confusion 12 | 13 | 14 | ons_item_by_id <- function(df, id) { 15 | df$items[df$items$id %>% purrr::detect_index(~ . == id), ] 16 | } 17 | 18 | ons_edition_by_name <- function(df, edition) { 19 | df$items[df$items$edition %>% purrr::detect_index(~ . == edition), ] 20 | } 21 | 22 | ons_version_by_version <- function(df, version) { 23 | df$items[df$items$version %>% purrr::detect_index(~ . == version), ] 24 | } 25 | 26 | ## END TODO - make these fns more general? 27 | 28 | ons_download_by_format <- function(metadata, format) { 29 | download <- metadata$downloads[[format]] 30 | if (is.null(download)) { 31 | valid_formats <- names(metadata$downloads) 32 | logger::log_error(sprintf("Format '%s' not found, valid formats for this dataset are %s", format, toString(names(metadata$downloads)))) 33 | stop() 34 | } 35 | 36 | download 37 | } 38 | 39 | ## TODO - is there a std fn for this? 40 | ##' @import logger 41 | log_panic <- function(...) { 42 | logger::log_error(...) 43 | quit(status = 1) 44 | } 45 | 46 | ##' Make request to given url, which is assumed to be the ONS api. 47 | ##' 48 | ##' data retrieved is converted to tidyverse tibble if possible. 49 | ##' 50 | ##' @title Call the ONS API 51 | ##' @param url url to call @seeAlso \code{\link{[api_base_url]}} 52 | ##' @return a list contained the API call results 53 | ##' @author Neale Swinnerton 54 | ##' @import dplyr 55 | ons_api_call <- function(url) { 56 | df <- jsonlite::fromJSON(url) 57 | if ("items" %in% colnames(df)) { 58 | df$items <- dplyr::as_tibble(df$items) 59 | } 60 | df 61 | } 62 | 63 | 64 | ##' This returns a dataframe containing details that can be passed to 65 | ##' other fns in this package for further processing 66 | ##' @title Datasets Setup 67 | ##' @param defaults a list with folder system. 
Valid values from \code{monstr_pipeline_defaults(...)} 68 | ##' @return a list describing available datasets 69 | ##' @author Neale Swinnerton 70 | ##' @export 71 | ##' @import jsonlite 72 | ##' @import dplyr 73 | ##' @examples 74 | ##' \dontrun{ 75 | ##' monstr_pipeline_defaults() %>% 76 | ##' ons_datasets_setup() # rooted in current project 77 | ##' } 78 | ##' \dontrun{ 79 | ##' monstr_pipeline_defaults(download_root="/path/to/download/root/") %>% 80 | ##' ons_datasets_setup() 81 | ##' } 82 | ons_datasets_setup <- function(defaults) { 83 | results <- ons_api_call(api_base_url) 84 | results$monstr <- defaults 85 | results$monstr$src_url <- api_base_url 86 | 87 | results 88 | } 89 | 90 | ##' Retrieves a dataframe describing the datasets available from ONS via the API. 91 | ##' @title Available Datasets 92 | ##' @return list of available datasets and associated metadata 93 | ##' @author Neale Swinnerton 94 | ##' @export 95 | ##' @import dplyr 96 | ##' @examples 97 | ##' \dontrun{ 98 | ##' # return information on all available datasets and then filter on specific id 99 | ##' datasets <- ons_available_datasets() 100 | ##' 101 | ##' datasets %>% 102 | ##' filter(id='health-accounts') 103 | ##' } 104 | ##' \dontrun{ 105 | ##' # display just the ids 106 | ##' ons_available_datasets() %>% select(id) 107 | ##' } 108 | ons_available_datasets <- function() { 109 | desc <- ons_api_call(api_base_url)$items %>% 110 | dplyr::select(id, title, description, unit_of_measure, next_release, release_frequency, publications) 111 | return(desc) 112 | 113 | } 114 | 115 | #' Retrieve the metadata for the given dataset. 116 | #' 117 | #' Makes calls to the ONS API and retrieves the metadata for the 118 | #' datasets. The dataset selection can be refined via the edition and 119 | #' version parameters 120 | #' 121 | #' @title Dataset By Id 122 | #' @param metadata data describing the dataset 123 | #' @param id the identifier of the dataset. Valid values from \code{ons_available_datasets()} 124 | #' @param edition the edition of the dataset (if empty, select latest). Valid values from \code{ons_available_editions(...)} 125 | #' @param version the version of the dataset (if empty, select latest). Valid values from \code{ons_available_available(...)} 126 | #' @return a dataframe describing the dataset. 
127 | #' @author Neale Swinnerton 128 | #' @export 129 | ##' @import logger 130 | ons_dataset_by_id <- function(metadata, id, edition, version) { 131 | links <- ons_item_by_id(metadata, id)$links 132 | monstr <- metadata$monstr # save for later 133 | 134 | if (missing(edition)) { 135 | logger::log_info("Edition not specified, defaulting to latest version") 136 | link <- links$latest_version$href 137 | is_latest <- TRUE 138 | } else { 139 | metadata <- 140 | ons_api_call(links$editions$href) %>% 141 | ons_edition_by_name(edition) 142 | 143 | is_latest <- FALSE 144 | if (missing(version)) { 145 | logger::log_info("Version of ", edition, 146 | " edition not specified, defaulting to latest version") 147 | link <- metadata$links$latest_version$href 148 | is_latest <- TRUE 149 | } else { 150 | version_metadata <- 151 | ons_api_call(metadata$links$versions$href) %>% 152 | ons_version_by_version(version) 153 | 154 | if (nrow(version_metadata) == 0) { 155 | log_panic("Version ", version, " of ", edition, 156 | " is not available") 157 | } else { 158 | logger::log_info("Version ", version, " of ", edition, 159 | " edition selected") 160 | } 161 | 162 | link <- version_metadata$links$self$href 163 | 164 | ## TODO should we work out whether the specified version is the latest here? 165 | ## is 'latest' highest version or newest release-date ? 166 | } 167 | } 168 | 169 | logger::log_info(sprintf("Retrieving dataset metadata from %s", link)) 170 | dataset <- ons_api_call(link) 171 | 172 | dataset$monstr <- monstr 173 | dataset$monstr$is_latest <- is_latest 174 | dataset$monstr$datasource <- "ons" 175 | dataset$monstr$dataset <- id 176 | dataset$monstr$edition <- dataset$edition 177 | dataset$monstr$version <- dataset$version 178 | dataset 179 | } 180 | 181 | ##' @title Available Editions 182 | ##' @param id dataset identifier. Valid values from \code{ons_available_datasets(...)} 183 | ##' @return a list of edition identifiers 184 | ##' @author Neale Swinnerton 185 | ##' @export 186 | ##' @import dplyr 187 | ##' @examples 188 | ##' \dontrun{ 189 | #' ons_available_editions(id = 'mid-year-pop-est') 190 | #' } 191 | ons_available_editions <- function(id) { 192 | metadata <- ons_api_call(sprintf("%s/%s/editions", api_base_url, id)) 193 | 194 | metadata$items %>% 195 | dplyr::select(matches("edition")) 196 | } 197 | 198 | ##' @title Available Versions 199 | ##' @param id dataset identifier. Valid values from \code{ons_available_datasets(...)} 200 | ##' @param edition edition identifier. 
Valid values from \code{ons_available_editions(...)} 201 | ##' @return a list of version identifiers 202 | ##' @author Neale Swinnerton 203 | ##' @export 204 | ##' @import dplyr 205 | ##' @examples 206 | #' \dontrun{ 207 | #' ons_available_versions(id = "regional-gdp-by-quarter", edition = "time-series") 208 | #' } 209 | ons_available_versions <- function(id, edition) { 210 | metadata <- ons_api_call(sprintf("%s/%s/editions/%s/versions", api_base_url, id, edition)) 211 | 212 | metadata$items %>% 213 | dplyr::select(version) 214 | } 215 | 216 | ##' Download 217 | ##' 218 | ##' \code{ons_download} retrieves the data described by the given df 219 | ##' @param metadata data describing the download 220 | ##' @param format a valid format for the download 221 | ##' @export 222 | ##' @import logger 223 | ons_download <- function(metadata, 224 | format="csv" ) { 225 | validate_file <- function(f) { 226 | expected_size <- as.numeric(download$size) 227 | 228 | if (file.size(f) != expected_size) { 229 | log_panic(sprintf("Inconsistent file size expected %d, got %d", 230 | expected_size, 231 | file.size(f))) 232 | FALSE 233 | } else { 234 | TRUE 235 | } 236 | } 237 | try (if(!(format %in% c('csv', 'xls'))) stop('Format not allowed')) 238 | download <- 239 | metadata %>% 240 | ons_download_by_format(format) ## TODO - error if format not found? 241 | 242 | metadata$monstr$format <- format 243 | 244 | logger::log_info(sprintf("Downloading data from %s", download$href)) 245 | 246 | destfile <- generate_download_filename(template=metadata$monstr$download_filename_template, 247 | root=metadata$monstr$download_root, 248 | data=metadata$monstr) 249 | 250 | if (safe_download(url = c(download$href), 251 | destfile = destfile, 252 | fvalidate = validate_file)) { 253 | write_metadata(metadata, sprintf("%s.meta.json", destfile)) 254 | logger::log_info(sprintf("File created at %s ", destfile)) 255 | } 256 | 257 | if (metadata$monstr$create_latest_symlink && 258 | metadata$monstr$is_latest) { 259 | 260 | version <- metadata$monstr$version 261 | metadata$monstr$version <- "LATEST" 262 | 263 | linkfile <- generate_download_filename(template=metadata$monstr$download_filename_template, 264 | root=metadata$monstr$download_root, 265 | data=metadata$monstr) 266 | 267 | metadata$monstr$version <- version 268 | if (file.exists(linkfile)) { 269 | file.remove(linkfile) 270 | } 271 | 272 | file.symlink(destfile, 273 | linkfile) 274 | log_info("Create symlink to LATEST file") 275 | } 276 | 277 | metadata$monstr$destfile <- destfile 278 | metadata 279 | } 280 | -------------------------------------------------------------------------------- /R/util.R: -------------------------------------------------------------------------------- 1 | 2 | ##' @title Safe Download 3 | ##' 4 | ##' Downloads a file and tries hard to tidy up in the event of 5 | ##' errors. Since these files are typically large we don't want to 6 | ##' leave them in temp directories. 7 | ##' 8 | ##' The destfile should only appear if the download was successful. 9 | ##' 10 | ##' @param url src for the download 11 | ##' @param destfile destination filename 12 | ##' @param fvalidate a fn that is passed the filename after download 13 | ##' to validate it in some way. The fn should return TRUE if the 14 | ##' file is valid. 
15 | ##' @importFrom curl curl_download 16 | safe_download <- function(url, destfile, fvalidate) { 17 | success <- TRUE 18 | 19 | tryCatch({ 20 | tmp <- tempfile() 21 | curl::curl_download(url = url, 22 | destfile = tmp) 23 | 24 | if (!missing(fvalidate) && !fvalidate(tmp)) { 25 | success <- FALSE 26 | ## report the destfile name to not confuse user, although 27 | ## not strictly true 28 | log_panic("file ", destfile, " failed validation. Deleting it") 29 | } 30 | 31 | ## rename to final destination. This is generally an atomic 32 | ## operation, so we can assume the final file only appears if 33 | ## this succeeds. 34 | if (success && !file.rename(from = tmp, 35 | to = destfile)) { 36 | success <- FALSE 37 | log_panic("file ", destfile, " Not created!") 38 | } 39 | }, 40 | finally = if (file.exists(tmp)) file.remove(tmp)) 41 | 42 | success 43 | } 44 | 45 | #' @title Write Metadata 46 | #' 47 | #' \code{(write_metadata)} writes some metadata about where the file 48 | #' came from. TODO - could do this with fs xattr, but maybe that's 49 | #' not well known by users? 50 | #' 51 | #' @param metadata a dataframe containing metadata 52 | #' @param destfile filename into which the metadata should be written 53 | #' as JSON 54 | write_metadata <- function(metadata, destfile) { 55 | json <- jsonlite::toJSON(metadata, pretty = TRUE, flatten = TRUE) 56 | tryCatch({ 57 | f <- file(destfile) 58 | writeLines(c(json), con = f, sep = "") 59 | }, 60 | finally = close(f) 61 | ) 62 | } 63 | 64 | 65 | ##' @title generate a filename for a download 66 | ##' 67 | ##' @param template same as whisker template 68 | ##' @param root the root of the directory hierarchy 69 | ##' @param data data used to populate the template 70 | ##' @param create_directory boolean indicating whether to 71 | ##' (recursively) create the directory hierarchy. 72 | ##' @return a filename 73 | ##' @import whisker 74 | generate_download_filename <- function(template, root, data, create_directory=TRUE) { 75 | 76 | path <- whisker.render(template, 77 | data) 78 | 79 | dir <- dirname(path) 80 | 81 | if (create_directory && !dir.exists(dir)) { 82 | logger::log_info("Creating directory ", dir) 83 | dir.create(dir, recursive=TRUE) 84 | } 85 | 86 | path 87 | 88 | } 89 | 90 | ##' @title write the data as a csv. 91 | ##' @param data The actual data 92 | ##' @param monstr metadata dataframe created by the pipeline 93 | ##' @param create_directory boolean indicating whether to 94 | ##' (recursively) create the directory hierarchy. 95 | ##' @return boolean indicating success 96 | ##' @author Neale Swinnerton 126 | ##' @import writexl 127 | ##' @import logger 128 | write_xlsx <- function(data, monstr, create_directory) { 129 | success <- TRUE 130 | monstr$format <- "xlsx" 131 | destfile <- generate_download_filename(monstr$clean_filename_template, 132 | monstr$download_root, 133 | monstr, 134 | create_directory) 135 | logger::log_info(sprintf("Writing %s data to %s", monstr$format, destfile)) 136 | tryCatch ( 137 | writexl::write_xlsx(x=data, path=destfile), 138 | error = function(e) { 139 | logger::log_error("Problem writing xlsx") 140 | success <- FALSE 141 | }) 142 | 143 | success 144 | } 145 | 146 | ##' @title write the data as a RDS. 147 | ##' @param data The actual data 148 | ##' @param monstr metadata dataframe created by the pipeline 149 | ##' @param create_directory boolean indicating whether to 150 | ##' (recursively) create the directory hierarchy. 
151 | ##' @return boolean indicating success 152 | ##' @author Neale Swinnerton 153 | ##' @import logger 154 | write_rds <- function(data, monstr,create_directory) { 155 | success <- TRUE 156 | monstr$format <- "rds" 157 | destfile <- generate_download_filename(monstr$clean_filename_template, 158 | monstr$download_root, 159 | monstr, 160 | create_directory) 161 | logger::log_info(sprintf("Writing %s data to %s", monstr$format, destfile)) 162 | tryCatch ( 163 | saveRDS(object=data, file=destfile), 164 | 165 | error = function(e) { 166 | logger::log_error("Problem writing rds") 167 | success <- FALSE 168 | } 169 | ) 170 | 171 | success 172 | } 173 | -------------------------------------------------------------------------------- /R/utils-pipe.R: -------------------------------------------------------------------------------- 1 | #' Pipe operator 2 | #' 3 | #' See \code{magrittr::\link[magrittr]{\%>\%}} for details. 4 | #' 5 | #' @name %>% 6 | #' @rdname pipe 7 | #' @keywords internal 8 | #' @export 9 | #' @importFrom magrittr %>% 10 | #' @usage lhs \%>\% rhs 11 | NULL 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # monstR - making ONS tables readable 2 | 3 | [![All Contributors](https://img.shields.io/badge/all_contributors-7-orange.svg?style=flat-square)](#contributors-) 4 | 5 | 6 | ![R-CMD-check](https://github.com/HFAnalyticsLab/Open_data_pipelines/workflows/R-CMD-check/badge.svg) 7 | 8 | #### Project Status: in progress 9 | ## Project Description 10 | 11 | This package is a part of our open-source R pipeline to download and clean public data related to health and social care. The aim is to provide analysts, primarily at the Health Foundation, with clean and ready for analysis data. 12 | 13 | ## Overview 14 | 15 | monstR - making ONS tables readable is a package that queries the [Office for National Statistics (ONS) API](https://developer.beta.ons.gov.uk/) to download data. It can be used to retrieve publically available data and meta data from the ONS. 16 | 17 | - `ons_available_datasets()` returns information about available datasets 18 | - `ons_available_versions()` returns information about available dataset versions 19 | - `ons_available_editions()` returns information about available dataset editions 20 | - `ons_download()` downloads the specified data 21 | 22 | Please note that the ONS API that this package relies on is in beta and it might change. 23 | 24 | ## Installation 25 | 26 | 27 | If you have cloned a local copy of the repo, you should be able to load it using devtools 28 | 29 |
 30 | 
 31 | library(devtools)
 32 | setwd("{location of monstR repo}")
 33 | devtools::load_all()
 34 | 
35 | 36 | or to install direct from Github 37 | ``` 38 | remotes::install_github("HFAnalyticsLab/monstR", build_vignettes = TRUE ) 39 | ``` 40 | 41 | ## Examples 42 | 43 | This is an example of how to download weekly mortality data by region. Note that this will create folders and download data. 44 | 45 | ``` 46 | monstr_pipeline_defaults() %>% # Uses the monstr 'standards' for location and format 47 | ons_datasets_setup() %>% 48 | ons_dataset_by_id("weekly-deaths-region") %>% 49 | ons_download(format="csv") %>% 50 | monstr_read_file() %>% 51 | monstr_clean() %>% 52 | monstr_write_clean(format="all") 53 | 54 | ``` 55 | 56 | ## Resources 57 | 58 | ### Online documentation 59 | 60 | You can find the help pages at . 61 | 62 | ## Questions and bug reports 63 | 64 | This is a package under active development and we would love for you to contribute or flag any issues you might find. 65 | 66 | You can ask questions or flag a bug by [filing an issue](https://github.com/HFAnalyticsLab/monstR/issues). We are more likely to be able to help you if we can reproduce your issue. The `reprex` package is a good way of producing a minimal reproducible package and [So you've been asked to make a reprex](https://www.jessemaegan.com/post/so-you-ve-been-asked-to-make-a-reprex/) will help you get started. 67 | 68 | ### Contributing to the package development 69 | 70 | We aim to make the documentation as comprehensive as possible. Please contribute examples or suggest improvements to the 71 | documentation. 72 | 73 | If you have written a function that you think should be added to the package, or improved an existing function, please submit a pull request that includes: 74 | 75 | - the new/amended function(s) with code and roxygen tags (with examples) 76 | - a new section in the appropriate vignette that describes how to use 77 | the new function 78 | - corresponding tests in directory `tests/testthat`. 79 | 80 | ## Design Principles 81 | 82 | The monstrR Open Data Pipeline is designed to work well with tidyverse and in particular within pipelines created by the `%>%` pipe operator. With this in mind, most functions take a data structure in the first argument and return a data structure which has been augmented in some way. Typically this is metadata about the actual data, although once the data has been cleaned it can be accessed using `monstr_data(metadata)` to get at a tidyverse tibble of the data. 83 | 84 | 85 | ## Authors 86 | * **Neale Swinnerton** - [Github](https://github.com/sw1nn) 87 | * **Emma Vestesson** - [Github](https://github.com/emmavestesson) [Twitter](https://twitter.com/Gummifot) 88 | 89 | ## License 90 | 91 | This project is licensed under the [MIT License](https://github.com/HFAnalyticsLab/monstR/blob/master/LICENSE). 92 | 93 | ## Contributors ✨ 94 | 95 | Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)): 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 |

* Emma Vestesson 🤔 🖋 📖
* Tom Jemmett 🐛
* JohnHC86 🐛
* Neale Swinnerton 💻
* fiona-grimm 🤔 🎨
* SimonCRUK 🐛
* Zoe Turner 🐛
111 | 112 | 113 | 114 | 115 | 116 | This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind welcome! 117 | -------------------------------------------------------------------------------- /THFOpenDataPipeline.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --no-multiarch --with-keep.source 18 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | home: 2 | title: An R package to download publically available data from the ONS API 3 | description: Download publically available data from the ONS API 4 | 5 | template: 6 | opengraph: 7 | image: 8 | src: man/figures/monstR_sticker.png 9 | alt: "The logo for the monstR package - a grey monster" 10 | twitter: 11 | creator: "@gummifot" 12 | site: "@HealthFdn" 13 | card: summary_large_image -------------------------------------------------------------------------------- /man/figures/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HFAnalyticsLab/monstR/4f428e0ea5f896108e3ac78488d50a33edc8af65/man/figures/logo.png -------------------------------------------------------------------------------- /man/figures/monstR_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HFAnalyticsLab/monstR/4f428e0ea5f896108e3ac78488d50a33edc8af65/man/figures/monstR_2.png -------------------------------------------------------------------------------- /man/figures/monstR_sticker.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HFAnalyticsLab/monstR/4f428e0ea5f896108e3ac78488d50a33edc8af65/man/figures/monstR_sticker.png -------------------------------------------------------------------------------- /man/generate_download_filename.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/util.R 3 | \name{generate_download_filename} 4 | \alias{generate_download_filename} 5 | \title{generate a filename for a download} 6 | \usage{ 7 | generate_download_filename(template, root, data, create_directory = TRUE) 8 | } 9 | \arguments{ 10 | \item{template}{same as whisker template} 11 | 12 | \item{root}{the root of the directory hierarchy} 13 | 14 | \item{data}{data used to populate the template} 15 | 16 | \item{create_directory}{boolean indicating whether to 17 | (recursively) create the directory hierarchy.} 18 | } 19 | \value{ 20 | a filename 21 | } 22 | \description{ 23 | generate a filename for a download 24 | } 25 | -------------------------------------------------------------------------------- /man/monstr_clean.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/clean.R 3 | \name{monstr_clean} 4 | \alias{monstr_clean} 5 | \title{Clean the data according 
to MONSTR rules.} 6 | \usage{ 7 | monstr_clean(metadata) 8 | } 9 | \arguments{ 10 | \item{metadata}{description the downloaded file.} 11 | } 12 | \value{ 13 | description of the cleaned data 14 | } 15 | \description{ 16 | Clean the data according to MONSTR rules. 17 | } 18 | \author{ 19 | Neale Swinnerton \href{mailto:neale@mastodonc.com}{neale@mastodonc.com} 20 | } 21 | -------------------------------------------------------------------------------- /man/monstr_data.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/clean.R 3 | \name{monstr_data} 4 | \alias{monstr_data} 5 | \title{Get the Data} 6 | \usage{ 7 | monstr_data(metadata) 8 | } 9 | \arguments{ 10 | \item{metadata}{description of the downloaded data} 11 | } 12 | \value{ 13 | a \code{\link[tibble]{dplyr::tibble}} of the data from the 14 | described download 15 | } 16 | \description{ 17 | Extract the tibble of the actual data 18 | } 19 | \author{ 20 | Neale Swinnerton \href{mailto:neale@mastodonc.com}{neale@mastodonc.com} 21 | } 22 | -------------------------------------------------------------------------------- /man/monstr_pipeline_defaults.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/clean.R 3 | \name{monstr_pipeline_defaults} 4 | \alias{monstr_pipeline_defaults} 5 | \title{Create the MONSTR defaults} 6 | \usage{ 7 | monstr_pipeline_defaults(download_root = "") 8 | } 9 | \arguments{ 10 | \item{download_root}{Root of directory hierarchy.} 11 | } 12 | \value{ 13 | an augmented metadata 14 | } 15 | \description{ 16 | Create the MONSTR defaults 17 | } 18 | \author{ 19 | Neale Swinnerton \href{mailto:neale@mastodonc.com}{neale@mastodonc.com} 20 | } 21 | -------------------------------------------------------------------------------- /man/monstr_read_file.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/clean.R 3 | \name{monstr_read_file} 4 | \alias{monstr_read_file} 5 | \title{Read the file described by the metadata} 6 | \usage{ 7 | monstr_read_file(metadata) 8 | } 9 | \arguments{ 10 | \item{metadata}{description of the downloaded file.} 11 | } 12 | \value{ 13 | a metadata incorporating the data. 
The actually data can then be 14 | extracted with \code{\link{monstr_data}} 15 | } 16 | \description{ 17 | Read the file described by the metadata 18 | } 19 | \author{ 20 | Neale Swinnerton \href{mailto:neale@mastodonc.com}{neale@mastodonc.com} 21 | } 22 | -------------------------------------------------------------------------------- /man/monstr_write_clean.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/clean.R 3 | \name{monstr_write_clean} 4 | \alias{monstr_write_clean} 5 | \title{Writes the data to the 'clean' area} 6 | \usage{ 7 | monstr_write_clean(metadata, format = "csv", create_directory = TRUE) 8 | } 9 | \arguments{ 10 | \item{metadata}{description of the data.} 11 | 12 | \item{format}{any known format or "all" to save a copy as all 13 | known formats} 14 | 15 | \item{create_directory}{boolean indicating whether directories 16 | should be created.} 17 | } 18 | \value{ 19 | a boolean indicating success 20 | } 21 | \description{ 22 | Writes the data to the 'clean' area 23 | } 24 | \author{ 25 | Neale Swinnerton \href{mailto:neale@mastodonc.com}{neale@mastodonc.com} 26 | } 27 | -------------------------------------------------------------------------------- /man/ons_api_call.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ons.R 3 | \name{ons_api_call} 4 | \alias{ons_api_call} 5 | \title{Call the ONS API} 6 | \usage{ 7 | ons_api_call(url) 8 | } 9 | \arguments{ 10 | \item{url}{url to call @seeAlso \code{\link{[api_base_url]}}} 11 | } 12 | \value{ 13 | a list contained the API call results 14 | } 15 | \description{ 16 | Make request to given url, which is assumed to be the ONS api. 17 | } 18 | \details{ 19 | data retrieved is converted to tidyverse tibble if possible. 20 | } 21 | \author{ 22 | Neale Swinnerton \href{mailto:neale@mastodonc.com}{neale@mastodonc.com} 23 | } 24 | -------------------------------------------------------------------------------- /man/ons_available_datasets.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ons.R 3 | \name{ons_available_datasets} 4 | \alias{ons_available_datasets} 5 | \title{Available Datasets} 6 | \usage{ 7 | ons_available_datasets() 8 | } 9 | \value{ 10 | list of available datasets and associated metadata 11 | } 12 | \description{ 13 | Retrieves a dataframe describing the datasets available from ONS via the API. 
14 | } 15 | \examples{ 16 | \dontrun{ 17 | # return information on all available datasets and then filter on specific id 18 | datasets <- ons_available_datasets() 19 | 20 | datasets \%>\% 21 | filter(id='health-accounts') 22 | } 23 | \dontrun{ 24 | # display just the ids 25 | ons_available_datasets() \%>\% select(id) 26 | } 27 | } 28 | \author{ 29 | Neale Swinnerton \href{mailto:neale@mastodonc.com}{neale@mastodonc.com} 30 | } 31 | -------------------------------------------------------------------------------- /man/ons_available_editions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ons.R 3 | \name{ons_available_editions} 4 | \alias{ons_available_editions} 5 | \title{Available Editions} 6 | \usage{ 7 | ons_available_editions(id) 8 | } 9 | \arguments{ 10 | \item{id}{dataset identifier. Valid values from \code{ons_available_datasets(...)}} 11 | } 12 | \value{ 13 | a list of edition identifiers 14 | } 15 | \description{ 16 | Available Editions 17 | } 18 | \examples{ 19 | \dontrun{ 20 | ons_available_editions(id = 'mid-year-pop-est') 21 | } 22 | } 23 | \author{ 24 | Neale Swinnerton \href{mailto:neale@mastodonc.com}{neale@mastodonc.com} 25 | } 26 | -------------------------------------------------------------------------------- /man/ons_available_versions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ons.R 3 | \name{ons_available_versions} 4 | \alias{ons_available_versions} 5 | \title{Available Versions} 6 | \usage{ 7 | ons_available_versions(id, edition) 8 | } 9 | \arguments{ 10 | \item{id}{dataset identifier. Valid values from \code{ons_available_datasets(...)}} 11 | 12 | \item{edition}{edition identifier. Valid values from \code{ons_available_editions(...)}} 13 | } 14 | \value{ 15 | a list of version identifiers 16 | } 17 | \description{ 18 | Available Versions 19 | } 20 | \examples{ 21 | \dontrun{ 22 | ons_available_versions(id = "regional-gdp-by-quarter", edition = "time-series") 23 | } 24 | } 25 | \author{ 26 | Neale Swinnerton \href{mailto:neale@mastodonc.com}{neale@mastodonc.com} 27 | } 28 | -------------------------------------------------------------------------------- /man/ons_dataset_by_id.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ons.R 3 | \name{ons_dataset_by_id} 4 | \alias{ons_dataset_by_id} 5 | \title{Dataset By Id} 6 | \usage{ 7 | ons_dataset_by_id(metadata, id, edition, version) 8 | } 9 | \arguments{ 10 | \item{metadata}{data describing the dataset} 11 | 12 | \item{id}{the identifier of the dataset. Valid values from \code{ons_available_datasets()}} 13 | 14 | \item{edition}{the edition of the dataset (if empty, select latest). Valid values from \code{ons_available_editions(...)}} 15 | 16 | \item{version}{the version of the dataset (if empty, select latest). Valid values from \code{ons_available_available(...)}} 17 | } 18 | \value{ 19 | a dataframe describing the dataset. 20 | } 21 | \description{ 22 | Retrieve the metadata for the given dataset. 23 | } 24 | \details{ 25 | Makes calls to the ONS API and retrieves the metadata for the 26 | datasets. 
The dataset selection can be refined via the edition and 27 | version parameters 28 | } 29 | \author{ 30 | Neale Swinnerton \href{mailto:neale@mastodonc.com}{neale@mastodonc.com} 31 | } 32 | -------------------------------------------------------------------------------- /man/ons_datasets_setup.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ons.R 3 | \name{ons_datasets_setup} 4 | \alias{ons_datasets_setup} 5 | \title{Datasets Setup} 6 | \usage{ 7 | ons_datasets_setup(defaults) 8 | } 9 | \arguments{ 10 | \item{defaults}{a list with folder system. Valid values from \code{monstr_pipeline_defaults(...)}} 11 | } 12 | \value{ 13 | a list describing available datasets 14 | } 15 | \description{ 16 | This returns a dataframe containing details that can be passed to 17 | other fns in this package for further processing 18 | } 19 | \examples{ 20 | \dontrun{ 21 | monstr_pipeline_defaults() \%>\% 22 | ons_datasets_setup() # rooted in current project 23 | } 24 | \dontrun{ 25 | monstr_pipeline_defaults(download_root="/path/to/download/root/") \%>\% 26 | ons_datasets_setup() 27 | } 28 | } 29 | \author{ 30 | Neale Swinnerton \href{mailto:neale@mastodonc.com}{neale@mastodonc.com} 31 | } 32 | -------------------------------------------------------------------------------- /man/ons_download.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ons.R 3 | \name{ons_download} 4 | \alias{ons_download} 5 | \title{Download} 6 | \usage{ 7 | ons_download(metadata, format = "csv") 8 | } 9 | \arguments{ 10 | \item{metadata}{data describing the download} 11 | 12 | \item{format}{a valid format for the download} 13 | } 14 | \description{ 15 | \code{ons_download} retrieves the data described by the given df 16 | } 17 | -------------------------------------------------------------------------------- /man/pipe.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils-pipe.R 3 | \name{\%>\%} 4 | \alias{\%>\%} 5 | \title{Pipe operator} 6 | \usage{ 7 | lhs \%>\% rhs 8 | } 9 | \description{ 10 | See \code{magrittr::\link[magrittr]{\%>\%}} for details. 11 | } 12 | \keyword{internal} 13 | -------------------------------------------------------------------------------- /man/safe_download.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/util.R 3 | \name{safe_download} 4 | \alias{safe_download} 5 | \title{Safe Download 6 | 7 | Downloads a file and tries hard to tidy up in the event of 8 | errors. Since these files are typically large we don't want to 9 | leave them in temp directories. 10 | 11 | The destfile should only appear if the download was successful.} 12 | \usage{ 13 | safe_download(url, destfile, fvalidate) 14 | } 15 | \arguments{ 16 | \item{url}{src for the download} 17 | 18 | \item{destfile}{destination filename} 19 | 20 | \item{fvalidate}{a fn that is passed the filename after download 21 | to validate it in some way. The fn should return TRUE if the 22 | file is valid.} 23 | } 24 | \description{ 25 | Safe Download 26 | 27 | Downloads a file and tries hard to tidy up in the event of 28 | errors. 
Since these files are typically large we don't want to 29 | leave them in temp directories. 30 | 31 | The destfile should only appear if the download was successful. 32 | } 33 | -------------------------------------------------------------------------------- /man/write_csv.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/util.R 3 | \name{write_csv} 4 | \alias{write_csv} 5 | \title{write the data as a csv.} 6 | \usage{ 7 | write_csv(data, monstr, create_directory) 8 | } 9 | \arguments{ 10 | \item{data}{The actual data} 11 | 12 | \item{monstr}{metadata dataframe created by the pipeline} 13 | 14 | \item{create_directory}{boolean indicating whether to 15 | (recursively) create the directory hierarchy.} 16 | } 17 | \value{ 18 | boolean indicating success 19 | } 20 | \description{ 21 | write the data as a csv. 22 | } 23 | \author{ 24 | Neale Swinnerton 7 | %\VignetteIndexEntry{Mortality rate} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | ```{r setup, include = FALSE} 13 | knitr::opts_chunk$set( 14 | collapse = TRUE, 15 | comment = "#>" 16 | ) 17 | 18 | # install.packages("dplyr") 19 | # install.packages("data.table") 20 | # install.packages("ggplot2") 21 | # install.packages("here") 22 | 23 | # library(monstR) 24 | # library(dplyr) 25 | # library(data.table) 26 | library(here) 27 | ``` 28 | 29 | ## Vignette background 30 | 31 | The `monstR` package interacts with the UK's Office for National Statistics (ONS) API, enabling us to easily extract their datasets for analysis. In this vignette, we will demonstrate how to use `monstR` to download the following ONS tables: 32 | 33 | - Mid-year population estimates by region, age and gender 34 | - Deaths registered weekly in England and Wales by region 35 | 36 | We will then merge both of those datasets to compute a time-series of mortality rates in England and Wales by sub-region. We will also plot those statistics. 37 | 38 | ## Find identifiers for desired ONS datasets 39 | 40 | The function `ons_available_datasets()` is useful to find out what datasets are available through the ONS API. 41 | 42 | ```{r,eval=FALSE} 43 | datasets <- ons_available_datasets() 44 | ``` 45 | 46 | 47 | Each dataset is associated with an identifier, which we can retrieve by inspecting the output from `ons_available_datasets()`. First, we will extract the identifier for the desired population dataset ('Mid-year population estimates by region, age and gender'). 48 | 49 | ```{r,eval=FALSE} 50 | population_id <- datasets %>% 51 | filter(str_detect(tolower(title),'population estimates for uk')) %>% 52 | pull(id) 53 | 54 | population_id 55 | 56 | datasets %>% 57 | filter(id %in% population_id) %>% 58 | select(title) 59 | ``` 60 | 61 | Then, we will do the same for our desired dataset recording deaths by region. 62 | 63 | ```{r,eval=FALSE} 64 | mortality_id <- datasets %>% 65 | filter(str_detect(tolower(title),'deaths')) %>% 66 | filter(str_detect(tolower(title),'by region')) %>% 67 | pull(id) 68 | 69 | mortality_id 70 | 71 | datasets %>% 72 | filter(id %in% mortality_id) %>% 73 | select(title) 74 | ``` 75 | 76 | ## Browse editions and versions available for each dataset 77 | 78 | ONS datasets are usually associated with multiple editions and versions. Different editions may contain different variables or be presented in different formats, while versions usually refresh or update the content with new data points. 
Before downloading a dataset with `monstR`, it is recommended to know beforehand which edition and version you would like to download. 79 | 80 | The function `ons_available_editions` returns available editions, while `ons_available_versions` returns available versions. 81 | 82 | We will check the editions and versions associated with the population dataset. 83 | 84 | ```{r,eval=FALSE} 85 | ### Editions available for the population dataset 86 | ids_and_editions_pop <- map(population_id, ons_available_editions) %>% 87 | set_names(population_id) %>% 88 | bind_rows(.id='id') %>% 89 | mutate(.,id_edition=paste(id,edition,sep="-")) 90 | 91 | ids_and_editions_pop 92 | 93 | ### Versions available for each edition 94 | ids_and_editions_and_versions_pop <- mapply(id=ids_and_editions_pop$id, 95 | edition=ids_and_editions_pop$edition, 96 | ons_available_versions) 97 | names(ids_and_editions_and_versions_pop) <- ids_and_editions_pop$id_edition 98 | 99 | ids_and_editions_and_versions_pop 100 | ``` 101 | 102 | From the dataset with identifier `mid-year-pop-est` we would like to download edition `mid-2019-april-2020-geography` and version `1`. 103 | 104 | We will also explore editions and versions for the deaths dataset. 105 | 106 | ```{r,eval=FALSE} 107 | ### Editions available for the deaths dataset 108 | ids_and_editions_deaths <- map(mortality_id, ons_available_editions) %>% 109 | set_names(mortality_id) %>% 110 | bind_rows(.id='id') %>% 111 | mutate(.,id_edition=paste(id,edition,sep="-")) 112 | 113 | ids_and_editions_deaths 114 | 115 | ### Versions available for each edition 116 | ids_and_editions_and_versions_deaths <- mapply(id=ids_and_editions_deaths$id, 117 | edition=ids_and_editions_deaths$edition, 118 | ons_available_versions) 119 | names(ids_and_editions_and_versions_deaths) <- ids_and_editions_deaths$id_edition 120 | 121 | ids_and_editions_and_versions_deaths 122 | ``` 123 | 124 | From the dataset with identifier `weekly-deaths-region` we would like to download edition `2010-19` and version `1`. 125 | 126 | ## Download both ONS datasets 127 | 128 | After inspection in the previous step, we have decided which edition and version pair we would like to download for each dataset. 129 | 130 | The following command sets up the pipeline using the `monstR` default settings. 131 | 132 | ```{r,eval=FALSE} 133 | set_up_df <- monstr_pipeline_defaults() %>% 134 | ons_datasets_setup() 135 | ``` 136 | 137 | We feed the edition and version to the function `ons_dataset_by_id`, while 138 | `ons_download` downloads the data. Additional functions `monstr_read_file`, `monstr_clean` and `monstr_write_clean` read, clean and write the desired datasets. 139 | 140 | We can now download our population dataset. 141 | 142 | ```{r echo=TRUE, eval=FALSE,results='hide'} 143 | set_up_df %>% 144 | ons_dataset_by_id(id=population_id,edition="mid-2019-april-2020-geography",version=1) %>% 145 | ons_download(format="csv") %>% 146 | monstr_read_file() %>% 147 | monstr_clean() %>% 148 | monstr_write_clean(format="all") 149 | ``` 150 | 151 | And our deaths dataset. 152 | 153 | ```{r echo=TRUE, eval=FALSE,results='hide'} 154 | set_up_df %>% 155 | ons_dataset_by_id(id=mortality_id,edition="2010-19",version=1) %>% 156 | ons_download(format="csv") %>% 157 | monstr_read_file() %>% 158 | monstr_clean() %>% 159 | monstr_write_clean(format="all") 160 | ``` 161 | 162 | ## Import and clean population dataset 163 | 164 | The cleaned data can be found in your root project folder under `data/clean/ons/`.
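If you want to check exactly which files were created, a minimal sketch (assuming the default `monstr_pipeline_defaults()` locations, where cleaned files follow the pattern `<dataset>/<edition>/<dataset>-v<version>.<format>`) is to list the contents of that folder:

```{r,eval=FALSE}
# List every file written to the clean data area by monstr_write_clean()
list.files(here("data", "clean", "ons"), recursive = TRUE)
```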
165 | 166 | ```{r,eval=FALSE} 167 | ons_midyear_pop <- fread(here("data","clean","ons","mid-year-pop-est","mid-2019-april-2020-geography","mid-year-pop-est-v1.csv"), header=TRUE, sep=",", check.names=TRUE) %>% 168 | rename(.,population=v4_0) 169 | ``` 170 | 171 | For this analysis, we will keep the population estimates for all ages and both sexes in each year and region. 172 | 173 | ```{r,eval=FALSE} 174 | ons_midyear_pop_total <- filter(ons_midyear_pop,age=="Total"&sex=="All") %>% 175 | select(.,population,calendar_years,admin_geography) %>% 176 | arrange(.,admin_geography,desc(calendar_years)) 177 | 178 | knitr::kable(head(ons_midyear_pop_total, 10)) 179 | ``` 180 | 181 | ```{r,echo=FALSE,out.width = "50%", fig.pos="h"} 182 | knitr::include_graphics(here('vignettes','pop_table.PNG'),dpi=500) 183 | ``` 184 | 185 | ## Import and clean deaths dataset 186 | 187 | ```{r,eval=FALSE} 188 | ons_weekly_deaths_region <- fread(here("data","clean","ons","weekly-deaths-region","2010-19","weekly-deaths-region-v1.csv"), header=TRUE, sep=",", check.names=TRUE) %>% 189 | rename(.,nr_deaths=v4_1) %>% 190 | arrange(.,admin_geography,desc(calendar_years)) 191 | 192 | knitr::kable(head(ons_weekly_deaths_region, 10)) 193 | ``` 194 | 195 | ```{r,echo=FALSE,out.width = "50%", fig.pos="h"} 196 | knitr::include_graphics(here('vignettes','pop_deaths.PNG'),dpi=500) 197 | ``` 198 | 199 | This data is currently presented as a weekly time series, with one row per week. We will aggregate it into a yearly time series, thus matching the ONS population dataset. 200 | 201 | ```{r,eval=FALSE} 202 | ons_weekly_deaths_region <- as.data.table(ons_weekly_deaths_region) 203 | 204 | ons_weekly_deaths_region_annual <- ons_weekly_deaths_region[, list( 205 | geography = first(geography), 206 | nr_deaths=sum(nr_deaths,na.rm=TRUE)), 207 | by = list(calendar_years,admin_geography)] 208 | 209 | knitr::kable(head(ons_weekly_deaths_region_annual, 10)) 210 | ``` 211 | 212 | ```{r,echo=FALSE,out.width = "50%", fig.pos="h"} 213 | knitr::include_graphics(here('vignettes','pop_deaths_aggregate.PNG'),dpi=500) 214 | ``` 215 | 216 | ## Merge datasets 217 | 218 | We are now ready to merge our population data into our deaths data, which will allow us to compute a new variable: the death rate per 100,000 residents for a given region and year. 219 | 220 | ```{r,eval=FALSE} 221 | ons_weekly_deaths_region_annual <- left_join(ons_weekly_deaths_region_annual, 222 | ons_midyear_pop_total, 223 | by=c("calendar_years" = "calendar_years", 224 | "admin_geography" = "admin_geography")) 225 | 226 | ons_weekly_deaths_region_annual <- mutate(ons_weekly_deaths_region_annual, 227 | deaths_per_100K=nr_deaths/population*100000) 228 | 229 | knitr::kable(head(ons_weekly_deaths_region_annual, 10)) 230 | ``` 231 | 232 | ```{r,echo=FALSE,out.width = "50%", fig.pos="h"} 233 | knitr::include_graphics(here('vignettes','merged_table.PNG'),dpi=500) 234 | ``` 235 | 236 | ## Present results in a chart 237 | 238 | Having used the `monstR` package to download our required ONS tables and processed the data, we are ready to display the mortality rate by year according to region.
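Before plotting, it is worth a quick sanity check that every region and year in the aggregated deaths data found a matching population estimate in the join above. The sketch below simply lists rows where `population` came back as `NA`; ideally it returns no rows.

```{r,eval=FALSE}
# Rows where the left join failed to find a population estimate have NA in
# `population`; any such region/year combinations are counted here.
ons_weekly_deaths_region_annual %>%
  filter(is.na(population)) %>%
  count(geography, calendar_years)
```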
239 | 240 | ```{r, eval=FALSE} 241 | ons_weekly_deaths_region_annual$bold <- ifelse(ons_weekly_deaths_region_annual$geography=="England and Wales",1,0) 242 | 243 | ggplot(ons_weekly_deaths_region_annual, 244 | aes(x=factor(calendar_years), y=deaths_per_100K, group=geography)) + 245 | geom_line(aes(color=geography,size = factor(bold)))+ 246 | geom_point(aes(color=geography)) + 247 | theme(text = element_text(size = 10), 248 | panel.border = element_blank(), 249 | panel.grid.major = element_blank(), 250 | panel.grid.minor = element_blank(), 251 | panel.background = element_blank(), 252 | legend.key=element_blank()) + 253 | ggtitle("Mortality rate by region in England and Wales") + 254 | xlab("Year") + ylab("Number of deaths per 100,000") + 255 | scale_color_brewer(palette="Set3",name = "Region") + 256 | scale_size_manual(values = c(0.5,1.25),guide = "none") + 257 | scale_y_continuous(labels = function(x) format(x, big.mark = ",", 258 | scientific = FALSE)) 259 | ``` 260 | 261 | ```{r,echo=FALSE,out.width = "50%", fig.pos="h"} 262 | knitr::include_graphics(here('vignettes','mortality_by_region.png'),dpi=800) 263 | ``` -------------------------------------------------------------------------------- /vignettes/mortality_by_region.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HFAnalyticsLab/monstR/4f428e0ea5f896108e3ac78488d50a33edc8af65/vignettes/mortality_by_region.png -------------------------------------------------------------------------------- /vignettes/pipeline.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Quick start" 3 | author: "The Health Foundation Analytics Lab" 4 | date: "`r Sys.Date()`" 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{Quick start} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | 13 | 14 | ## Discover what is available 15 | 16 | There are a few helper functions to help you find out which datasets are available, as well as the corresponding editions and versions. The `ons_available_datasets()` function returns a dataframe with information about all available datasets. The `id` column is what you need to download a dataset. 17 | 18 | ```{r , eval = FALSE, include = TRUE} 19 | 20 | datasets <- ons_available_datasets() 21 | 22 | datasets %>% 23 | select(id) 24 | id 25 | 1 cpih01 26 | 2 mid-year-pop-est 27 | 3 ashe-table-7-hours 28 | 4 ashe-table-7-earnings 29 | 5 ashe-table-8-hours 30 | 6 ashe-table-8-earnings 31 | 7 opss-rates 32 | 8 opss-membership 33 | 9 wellbeing-year-ending 34 | 10 wellbeing-local-authority 35 | ... 36 | 37 | 38 | ``` 39 | 40 | Once you have picked a dataset, you need to pick the edition you want. This can be done using `ons_available_editions()`. 41 | ```{r, eval = FALSE, include = TRUE} 42 | 43 | # Discover the available editions for a particular dataset 44 | ons_available_editions(id = "mid-year-pop-est") 45 | 46 | edition 47 | 48 | 1 mid-2018-april-2019-geography 49 | 2 mid-2019-april-2020-geography 50 | 3 time-series 51 | 52 | 53 | 54 | ``` 55 | 56 |
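Because the editions listing comes back as a data frame (as the output above suggests), it can be narrowed down with ordinary data-frame tools when a dataset has many editions. This is just a sketch, assuming `dplyr` is loaded; it keeps the geography-based editions and drops the time series.

```{r, eval = FALSE, include = TRUE}
# Keep only the editions whose name mentions "geography"
ons_available_editions(id = "mid-year-pop-est") %>%
  filter(grepl("geography", edition))
```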
Finally, you need to find out which versions are available for a specific edition of a dataset. 57 | ```{r, eval = FALSE, include = TRUE} 58 | # Discover the available versions for a particular edition 59 | 60 | ons_available_versions("mid-year-pop-est", "time-series") 61 | 62 | version 63 | 1 1 64 | 2 2 65 | 3 3 66 | 4 4 67 | 68 | ``` 69 | 70 | ## Download the data 71 | 72 | You should now be ready to download the data. Start by specifying where you want the data to be downloaded to. The `monstr_pipeline_defaults()` function returns a default folder structure (without creating it). You can specify the base file path using the `download_root` argument. If you do not specify `download_root`, the base file path will be your project root if you are using RStudio projects, and your working directory otherwise. The output from `monstr_pipeline_defaults()` is then fed to `ons_datasets_setup()`, which queries the ONS API to get the relevant information to prepare for downloading the data. Finally, `ons_download()` downloads the data. The rest of the piped code reads in, cleans and saves a clean version of the data. 73 | ```{r, eval=FALSE, include=TRUE} 74 | monstr_pipeline_defaults(download_root="/path/to/download/root/") %>% 75 | ons_datasets_setup() %>% # Uses the monstr 'standards' for location and format 76 | ons_dataset_by_id("weekly-deaths-local-authority") %>% 77 | ons_download(format="csv") %>% 78 | monstr_read_file() %>% 79 | monstr_clean() %>% 80 | monstr_write_clean(format="all") 81 | 82 | ``` 83 | 84 | 85 | 86 | 87 | ## Further Examples 88 | 89 | ### Download the latest weekly-deaths-local-authority data as a csv 90 | 91 | 92 | ```{r , eval = FALSE, include = TRUE} 93 | ons_datasets_setup(monstr_pipeline_defaults()) %>% 94 | ons_dataset_by_id("weekly-deaths-local-authority") %>% 95 | ons_download(format="csv") 96 | 97 | # file will be in `{{root}}/data/raw/ons/weekly-deaths-local-authority/time-series/vN.csv` 98 | # metadata about the file will be in `{{root}}/data/raw/ons/weekly-deaths-local-authority/time-series/vN.csv.meta.json` 99 | ``` 100 | 101 | ### Similarly, it can be downloaded as an xls 102 | 103 | ```{r , eval = FALSE, include = TRUE} 104 | ons_datasets_setup(monstr_pipeline_defaults()) %>% 105 | ons_dataset_by_id("weekly-deaths-local-authority") %>% 106 | ons_download(format="xls") 107 | ``` 108 | 109 | 110 | ### Specific versions can be selected 111 | 112 | ```{r , eval = FALSE, include = TRUE} 113 | datasets <- ons_datasets_setup(monstr_pipeline_defaults()) 114 | ## Get the metadata about v4 of the time-series edition of the weekly-deaths-local-authority dataset. 115 | wdla4_meta <- datasets %>% ons_dataset_by_id("weekly-deaths-local-authority", edition="time-series", version=4) 116 | 117 | # download it 118 | wdla4_meta %>% 119 | ons_download(format="csv") 120 | 121 | 122 | # Or get the latest 123 | wdla_latest <- datasets %>% ons_dataset_by_id("weekly-deaths-local-authority", edition="time-series") 124 | 125 | 126 | # Download the latest as csv; metadata about the schema of the data is written alongside.
127 | wdla_latest %>% ons_download(format="csv") 128 | 129 | ``` 130 | -------------------------------------------------------------------------------- /vignettes/pop_deaths.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HFAnalyticsLab/monstR/4f428e0ea5f896108e3ac78488d50a33edc8af65/vignettes/pop_deaths.PNG -------------------------------------------------------------------------------- /vignettes/pop_deaths_aggregate.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HFAnalyticsLab/monstR/4f428e0ea5f896108e3ac78488d50a33edc8af65/vignettes/pop_deaths_aggregate.PNG -------------------------------------------------------------------------------- /vignettes/pop_table.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HFAnalyticsLab/monstR/4f428e0ea5f896108e3ac78488d50a33edc8af65/vignettes/pop_table.PNG --------------------------------------------------------------------------------