├── .Rbuildignore ├── .github └── workflows │ ├── R-build-test.yml │ └── py-build-test.yml ├── .gitignore ├── .travis.yml ├── DESCRIPTION ├── LICENCE.md ├── MANIFEST.in ├── NAMESPACE ├── R ├── Nomisweb.R └── UKCensusAPI.R ├── README.md ├── README.txt ├── UKCensusAPI.Rproj ├── doc ├── paper.bib └── paper.md ├── inst ├── examples │ ├── contextify.R │ ├── contextify.py │ ├── geoquery.R │ └── geoquery.py └── scripts │ ├── package.sh │ └── ukcensus-query ├── man ├── UKCensusAPI.Rd ├── contextify.Rd ├── geoCodeLookup.Rd ├── geoCodes.Rd ├── getData.Rd ├── getLADCodes.Rd ├── getMetadata.Rd ├── instance.Rd ├── queryInstance.Rd └── queryMetadata.Rd ├── pyproject.toml ├── requirements.txt ├── setup.py ├── tests ├── extended_scotland.py ├── test_all.py ├── testthat.R └── testthat │ └── test-all.R └── ukcensusapi ├── NISRA.py ├── NRScotland.py ├── Nomisweb.py ├── Query.py ├── __init__.py └── utils.py /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^.*\.tsv$ 4 | .eggs 5 | .travis.yml 6 | -------------------------------------------------------------------------------- /.github/workflows/R-build-test.yml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/master/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | branches: [main] 8 | 9 | name: R-CMD-check 10 | 11 | jobs: 12 | R-CMD-check: 13 | runs-on: ${{ matrix.config.os }} 14 | 15 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 16 | 17 | strategy: 18 | fail-fast: true 19 | matrix: 20 | config: 21 | # - {os: macOS-latest, r: 'release'} 22 | # - {os: windows-latest, r: 'release'} 23 | # - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 24 | - {os: ubuntu-latest, r: 'release'} 25 | # - {os: ubuntu-latest, r: 'oldrel-1'} 26 | 27 | env: 28 | NOMIS_API_KEY: DUMMY 29 | RETICULATE_PYTHON: /usr/bin/python3 30 | #GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 31 | R_KEEP_PKG_SOURCE: yes 32 | 33 | steps: 34 | - uses: actions/checkout@v2 35 | 36 | - name: "pip: Python 3.8" 37 | uses: actions/setup-python@v2 38 | with: 39 | python-version: 3.8 40 | - name: Install dependencies 41 | run: | 42 | python -m pip install --upgrade pip 43 | - name: Build 44 | run: | 45 | python -m pip install . 46 | 47 | - uses: r-lib/actions/setup-pandoc@v1 48 | 49 | - uses: r-lib/actions/setup-r@v1 50 | with: 51 | r-version: ${{ matrix.config.r }} 52 | http-user-agent: ${{ matrix.config.http-user-agent }} 53 | use-public-rspm: true 54 | 55 | - uses: r-lib/actions/setup-r-dependencies@v1 56 | with: 57 | extra-packages: rcmdcheck 58 | 59 | - uses: r-lib/actions/check-r-package@v1 60 | 61 | - name: Show testthat output 62 | if: always() 63 | run: find check -name 'testthat.Rout*' -exec cat '{}' \; || true 64 | shell: bash 65 | 66 | - name: Upload check results 67 | if: failure() 68 | uses: actions/upload-artifact@main 69 | with: 70 | name: ${{ runner.os }}-r${{ matrix.config.r }}-results 71 | path: check 72 | -------------------------------------------------------------------------------- /.github/workflows/py-build-test.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python (pip) build 5 | 6 | on: 7 | push: 8 | branches: [ main ] 9 | tags: '**' 10 | pull_request: 11 | branches: [ main ] 12 | schedule: 13 | - cron: '0 5 * * SAT' 14 | 15 | jobs: 16 | build: 17 | 18 | runs-on: ${{ matrix.os }} 19 | strategy: 20 | fail-fast: false 21 | matrix: 22 | python-version: ["3.8", "3.9", "3.10"] 23 | os: [ubuntu-latest, windows-latest, macos-latest] 24 | steps: 25 | - uses: actions/checkout@v2 26 | - name: "pip: Python ${{ matrix.python-version }} / ${{ matrix.os }}" 27 | uses: actions/setup-python@v2 28 | with: 29 | python-version: ${{ matrix.python-version }} 30 | - name: Install dependencies 31 | run: | 32 | python -m pip install --upgrade pip 33 | - name: Build 34 | run: | 35 | python -m pip install . 36 | # - name: Lint with flake8 37 | # run: | 38 | # # stop the build if there are Python syntax errors or undefined names 39 | # flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 40 | # # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 41 | # flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 42 | - name: Test with unittest 43 | run: | 44 | python setup.py test 45 | env: 46 | NOMIS_API_KEY: ${{ secrets.NOMIS_API_KEY }} 47 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | 5 | *.pyc 6 | /dist/ 7 | /*.egg-info 8 | .eggs/ 9 | build/ 10 | 11 | .vscode/ 12 | .venv*/ 13 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: focal 2 | cache: packages 3 | 4 | matrix: 5 | include: 6 | - language: r 7 | r: oldrel 8 | env: 9 | - NOMIS_API_KEY=DUMMY 10 | - RETICULATE_PYTHON=/usr/bin/python3 11 | # install the python package first 12 | before_install: 13 | - sudo apt-get update && sudo apt-get install -y python3 python3-setuptools python3-dev python3-pip 14 | - sudo python3 -m pip install --upgrade pip 15 | - sudo python3 -m pip install numpy pandas 16 | - python3 setup.py install --user 17 | 18 | - language: r 19 | r: release 20 | env: 21 | - NOMIS_API_KEY=DUMMY 22 | - RETICULATE_PYTHON=/usr/bin/python3 23 | # install the python package first 24 | before_install: 25 | - sudo apt-get update && sudo apt-get install -y python3 python3-setuptools python3-dev python3-pip 26 | - sudo python3 -m pip install --upgrade pip 27 | - sudo python3 -m pip install numpy pandas 28 | - python3 setup.py install --user 29 | 30 | - language: r 31 | r: devel 32 | env: 33 | - NOMIS_API_KEY=DUMMY 34 | - RETICULATE_PYTHON=/usr/bin/python3 35 | # install the python package first 36 | before_install: 37 | - sudo apt-get update && sudo apt-get install -y python3 python3-setuptools python3-dev python3-pip 38 | - sudo python3 -m pip install --upgrade pip 39 | - sudo python3 -m pip install numpy pandas 40 | - python3 setup.py install --user 41 | 42 | - language: python 43 | python: 3.7 44 | env: 45 | - NOMIS_API_KEY=DUMMY 46 | install: 47 | - pip install -r requirements.txt 48 | script: 49 | - python setup.py test 50 | 51 | - language: python 52 | python: 3.8 53 | env: 54 | - NOMIS_API_KEY=DUMMY 55 | install: 56 | - pip install -r requirements.txt 57 | script: 58 | - python setup.py test 59 | 60 | - language: python 61 | python: 3.9 62 | env: 63 | - NOMIS_API_KEY=DUMMY 64 | install: 65 | - pip install -r requirements.txt 66 | script: 67 | - python setup.py test 68 | 69 | # allow_failures: 70 | # - r: oldrel 71 | # - r: release 72 | # - r: devel 73 | 74 | warnings_are_errors: false 75 | 76 | notifications: 77 | email: 78 | on_success: change 79 | on_failure: change 80 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: UKCensusAPI 2 | Title: Automated Query and Download of UK Census Data 3 | Version: 1.1.6 4 | Authors@R: person("Andrew", "Smith", email = "a.p.smith@leeds.ac.uk", role = c("aut", "cre")) 5 | Description: This package provides an R interface to the www.nomisweb.co.uk census data API. It enables: 6 | - querying table metadata 7 | - autogenerating customised python and R query code for future use 8 | - automated cached data downloads 9 | - easily modifying the geography of existing queries 10 | - adding descriptive information to tables (from metadata) 11 | Depends: R (>= 3.3.3) 12 | Imports: 13 | reticulate, 14 | testthat, 15 | License: MIT + file LICENCE.md 16 | Encoding: UTF-8 17 | RoxygenNote: 7.0.2 18 | -------------------------------------------------------------------------------- /LICENCE.md: -------------------------------------------------------------------------------- 1 | # MIT Licence 2 | 3 | ### Copyright © 2017-2022 Andrew P Smith, Tom Russell, Luke Archer 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | **THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.** 10 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENCE.md 2 | include README.md 3 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | 2 | export(instance) 3 | export(queryInstance) 4 | export(geoCodeLookup) 5 | export(geoCodes) 6 | export(getLADCodes) 7 | export(getData) 8 | export(getMetadata) 9 | export(queryMetadata) 10 | export(contextify) 11 | 12 | importFrom("utils", "data", "read.csv") 13 | -------------------------------------------------------------------------------- /R/Nomisweb.R: -------------------------------------------------------------------------------- 1 | 2 | #' Interactive metadata query 3 | #' 4 | #' This function calls an interactive script where the user selects a table, a geography, and selects fields, optionally filtering by value. 5 | #' This script will not run in RStudio due to the way it handles standard input. Please run in a standalone R session (or call the python script directly) 6 | #' @examples 7 | #' \dontrun{queryMetadata()} 8 | #' @export 9 | queryMetadata = function() { 10 | # first check we are not running in RStudio (in which can we cannot run interatively, since RStudio redirects stdin from /dev/null) 11 | if (.Platform$GUI == "RStudio") { 12 | cat("This interactive code cannot be run from within RStudio due to the way RStudio handles stdin.\n") 13 | cat("Please either run it from a standalone R session, or call the python code (interactive.py) directly\n") 14 | } else { 15 | system("scripts/interactive.py") 16 | } 17 | } 18 | 19 | #' Geographic code lookup 20 | #' 21 | #' This function returns the nomisweb code for a particular geographic area type 22 | #' @param api an instance of the API (returned by instance()) 23 | #' @param geoCodeString the string representation of a geography, e.g MSOA11 24 | #' @return code - an integer 25 | #' @examples 26 | #' library(UKCensusAPI) 27 | #' censusapi = instance("/tmp/UKCensusAPI") 28 | #' geoCodeLookup(censusapi, "MSOA11") 29 | #' @export 30 | geoCodeLookup = function(api, geoCodeString) { 31 | return(as.character(api$GeoCodeLookup[geoCodeString])) 32 | } 33 | 34 | #' getMetadata() 35 | #' Fetch the metadata for a census table 36 | #' 37 | #' @param api the census provider api 38 | #' @param tableName the name of the census table 39 | #' @return metadata a list data structure containing the table metadata 40 | #' @examples 41 | #' \dontrun{ 42 | #' library(UKCensusAPI) 43 | #' censusapi = instance("/tmp/UKCensusAPI") 44 | #' getMetadata(censusapi, "KS001") 45 | #' } 46 | #' @export 47 | getMetadata = function(api, tableName) { 48 | return(api$get_metadata(tableName)) 49 | } 50 | 51 | #' getData() 52 | #' Fetch and cache census data using a predefined query 53 | #' 54 | #' Ensure all query numeric parameters are passed as strings (e.g. "0" not 0) 55 | #' This prevents conversion to floating-point which can makie queries fail 56 | #' @param api a predefined query 57 | #' @param tableName name of census table (e.g KS401EW) 58 | #' @param query query parameters 59 | #' @return a data.frame contraing the downloaded data 60 | #' @examples 61 | #' \dontrun{ 62 | #' library(UKCensusAPI) 63 | #' censusapi = instance("/tmp/UKCensusAPI") 64 | #' table = "KS102EW" 65 | #' meta=getMetadata(censusapi, table) 66 | #' # queryParams can be autogenerated using the interactive query functionality 67 | #' queryParams = list( 68 | #' geography = "1249902593...1249902596,1249934513...1249934514", 69 | #' MEASURES = "20100", 70 | #' select = "GEOGRAPHY_CODE,OBS_VALUE", 71 | #' CELL = "0", 72 | #' RURAL_URBAN = "0", 73 | #' date = "latest" 74 | #' ) 75 | #' getData(censusapi, table, queryParams) 76 | #' } 77 | #' @export 78 | getData = function(api, tableName, query) { 79 | # returned value is filename (or error) to avoid data frame compatibility issues 80 | filename = api$get_data(tableName, query, TRUE) 81 | # check that filename string isnt an error message! 82 | if (!file.exists(filename)) { 83 | print(paste("Error getting data:", filename)) 84 | return(data.frame(stringsAsFactors = F)) 85 | } 86 | data = read.csv(filename, sep="\t", stringsAsFactors = FALSE) 87 | if (nrow(data) == 1000000) { 88 | warning("Download has reached nomisweb's single query limit. Truncation is extremely likely") 89 | } 90 | return(data) 91 | } 92 | 93 | #' Map local authority names to nomisweb codes 94 | #' 95 | #' @param api an instance of the UKCensusData API. 96 | #' @param laNames a string vector of local authority names or ONS codes. 97 | #' @return an integer vector of nomisweb local authority codes 98 | #' @examples 99 | #' library(UKCensusAPI) 100 | #' censusapi = instance("/tmp/UKCensusAPI") 101 | #' codes = getLADCodes(censusapi, c("Leeds","Bradford")) 102 | #' @export 103 | getLADCodes = function(api, laNames) { 104 | return(api$get_lad_codes(laNames)) 105 | } 106 | 107 | #' geoCodes 108 | #' Get nomisweb geographical codes for a region 109 | #' 110 | #' @param api the instance of the an integer vector of nomisweb geographical codes 111 | #' @param coverage an integer vector of nomisweb geographical codes 112 | #' @param resolution the nomisweb code for a particular area type (e.g. 297 for MSOA) 113 | #' @return a compressed string (nomisweb format) containing nomisweb area codes 114 | #' @examples 115 | #' library(UKCensusAPI) 116 | #' censusapi = instance("/tmp/UKCensusAPI") 117 | #' coverage = getLADCodes(censusapi, c("City of London")) 118 | #' resolution = geoCodeLookup(censusapi, "LSOA11") 119 | #' codes = geoCodes(censusapi, coverage, resolution) 120 | #' @export 121 | geoCodes = function(api, coverage, resolution) { 122 | # force correct types 123 | return(api$get_geo_codes(as.integer(coverage), resolution)) 124 | } 125 | 126 | #' contextify 127 | #' 128 | #' Append table with a contextual column. 129 | #' 130 | #' @param api the instance of the an integer vector of nomisweb geographical codes 131 | #' @param tableName name of census table 132 | #' @param columnName name of column in the table 133 | #' @param table the table 134 | #' @return the table containing a new column with the contextual data 135 | #' @examples 136 | #' \dontrun{ 137 | #' library("UKCensusAPI") 138 | #' cacheDir = "/tmp/UKCensusAPI/" 139 | #' censusapi = UKCensusAPI::instance(cacheDir) 140 | #' table = "KS401EW" 141 | #' queryParams = list( 142 | #' date = "latest", 143 | #' CELL = "7...13", # dwelling type 144 | #' select = "GEOGRAPHY_CODE,CELL,OBS_VALUE", 145 | #' MEASURES = "20100", 146 | #' geography = "1245710558...1245710560", 147 | #' RURAL_URBAN = "0" 148 | #' ) 149 | #' KS401EW = UKCensusAPI::getData(censusapi, table, queryParams) 150 | #' annotated = contextify(censusapi, table, "CELL", KS401EW) 151 | #' } 152 | #' @export 153 | contextify = function(api, tableName, columnName, table) { 154 | metadata = api$load_metadata(tableName) 155 | # append a column using the value lookup provided by the metadata... 156 | # Look at R go! such exquisitely beautiful and intuitive syntax 157 | table[paste0(columnName, "_NAME")] = unlist(metadata$fields[columnName][[1]][as.character(table[[columnName]])]) 158 | return(table) 159 | } 160 | -------------------------------------------------------------------------------- /R/UKCensusAPI.R: -------------------------------------------------------------------------------- 1 | #' UKCensusAPI 2 | #' 3 | #' R package for creating, and modifying, automated downloads of UK census data. See below for an overview of the package. 4 | #' 5 | #' It requires that you register with www.nomisweb.co.uk and obtain an API key, 6 | #' whiBch should be stored in your .Renviron as "NOMIS_API_KEY", e.g. 7 | #' 8 | #' \samp{NOMIS_API_KEY=0x0123456789abcdef0123456789abcdef01234567} 9 | 10 | #' See README.md for detailed information and examples. 11 | #' 12 | #' @section Overview: 13 | #' Nomisweb, run by Durham University, provides online access to the most detailed and up-to-date statistics from official sources for local areas throughout the UK, including census data. 14 | #' This package provides both a python and an R wrapper around the nomisweb census data API, enabling: 15 | #' 16 | #' \itemize{ 17 | #' \item querying table metadata 18 | #' \item autogenerating customised python and R query code for future use 19 | #' \item automated cached data downloads 20 | #' \item modifying the geography of queries 21 | #' \item adding descriptive information to tables (from metadata) 22 | #'} 23 | #' 24 | #' Queries can be customised on geographical coverage, geographical resolution, and table fields, the latter can be filtered to include only the category values you require. 25 | #' The package generates reusable code snippets that can be inserted into applications. Such applications will work seamlessly for any user as long as they have installed this package, and possess their own nomisweb API key. 26 | #' Since census data is essentially static, it makes little sense to download the data every time it is requested: all data downloads are cached. 27 | 28 | #' @example inst/examples/geoquery.R 29 | #' @example inst/examples/contextify.R 30 | 31 | #' @section Functions: 32 | #' \code{\link{geoCodeLookup}} 33 | #' 34 | #' \code{\link{geoCodes}} 35 | #' 36 | #' \code{\link{getData}} 37 | #' 38 | #' \code{\link{getLADCodes}} 39 | #' 40 | #' \code{\link{getMetadata}} 41 | #' 42 | #' \code{\link{instance}} 43 | #' 44 | #' \code{\link{queryInstance}} 45 | #' 46 | #' \code{\link{queryMetadata}} 47 | #' 48 | #' \code{\link{contextify}} 49 | #' 50 | #' @docType package 51 | #' @name UKCensusAPI 52 | NULL 53 | 54 | NRScotland <- setRefClass( 55 | "NRScotland", 56 | fields=c("api"), 57 | methods=list( 58 | getGeog = function(region, resolution) { 59 | return (api$get_geog(region, resolution)) 60 | }, 61 | getMetadata = function(table, geog) { 62 | return (api$get_metadata(table, geog)) 63 | }, 64 | getData = function(table, coverage, resolution, category_filters=list()) { 65 | data = api$get_data(table, coverage, resolution, category_filters) 66 | # reassemble into R data frame 67 | df = data.frame(data$values) 68 | colnames(df)=data$columns 69 | return (df) 70 | } 71 | ) 72 | ) 73 | 74 | NISRA <- setRefClass( 75 | "NISRA", 76 | fields=c("api"), 77 | methods=list( 78 | getGeog = function(region, resolution) { 79 | return (api$get_geog(region, resolution)) 80 | }, 81 | getMetadata = function(table, geog) { 82 | return (api$get_metadata(table, geog)) 83 | }, 84 | getData = function(table, coverage, resolution, category_filters=list()) { 85 | data = api$get_data(table, coverage, resolution, category_filters) 86 | # reassemble into R data frame 87 | df = data.frame(data$values) 88 | colnames(df)=data$columns 89 | return (df) 90 | } 91 | ) 92 | ) 93 | 94 | Api <- NULL 95 | Query <- NULL 96 | 97 | .onLoad <- function(libname, pkgname) { 98 | Api <<- reticulate::import("ukcensusapi.Nomisweb", delay_load = TRUE) 99 | Query <<- reticulate::import("ukcensusapi.Query", delay_load = TRUE) 100 | } 101 | 102 | getApiSC = function(cacheDir) { 103 | module=reticulate::import("ukcensusapi.NRScotland") 104 | return (module$NRScotland(cacheDir)) 105 | } 106 | 107 | getApiNI = function(cacheDir) { 108 | module=reticulate::import("ukcensusapi.NISRA") 109 | return (module$NISRA(cacheDir)) 110 | } 111 | 112 | #' get an instance of the python API (required to call any of the functions) 113 | #' 114 | #' @param cacheDir directory to cache data 115 | #' @param country either "EW" (default, nomisweb API), "SC" (NRScotland bulk data), "NI" (NISRA bulk data) 116 | #' @return an instance of one of the python apis 117 | #' @export 118 | instance = function(cacheDir, country = "EW") { 119 | if (country == "NI") { 120 | api = NISRA$new(api=getApiNI(cacheDir)) 121 | } else if (country == "SC") { 122 | api = NRScotland$new(api=getApiSC(cacheDir)) 123 | } else { 124 | api = Api$Nomisweb(cacheDir) 125 | } 126 | return(api) 127 | } 128 | 129 | #' get an instance of the python query (required to call any of the functions) 130 | #' 131 | #' @param cacheDir directory to cache data 132 | #' @return an instance of the query module 133 | #' @export 134 | queryInstance = function(cacheDir) { 135 | # TODO can we have a function-static variable here? 136 | query = Query$Query(cacheDir) 137 | return(query) 138 | } 139 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # UK Census Data API 2 | 3 | [![PyPI version](https://badge.fury.io/py/ukcensusapi.svg)](https://badge.fury.io/py/ukcensusapi) [![Anaconda-Server Badge](https://anaconda.org/conda-forge/ukcensusapi/badges/version.svg)](https://anaconda.org/conda-forge/ukcensusapi) [![Anaconda-Server Badge](https://anaconda.org/conda-forge/ukcensusapi/badges/downloads.svg)](https://anaconda.org/conda-forge/ukcensusapi) 4 | 5 | [![License](https://img.shields.io/github/license/mashape/apistatus.svg)](https://opensource.org/licenses/MIT)[![JOSS status](http://joss.theoj.org/papers/40041a0ebb1364286d5eb144d333bb6a/status.svg)](http://joss.theoj.org/papers/40041a0ebb1364286d5eb144d333bb6a) 6 | [![DOI](https://zenodo.org/badge/99702514.svg)](https://zenodo.org/badge/latestdoi/99702514) 7 | 8 | [![Python (pip) build](https://github.com/virgesmith/UKCensusAPI/actions/workflows/py-build-test.yml/badge.svg)](https://github.com/virgesmith/UKCensusAPI/actions/workflows/py-build-test.yml) 9 | [![R-CMD-check](https://github.com/virgesmith/UKCensusAPI/actions/workflows/R-build-test.yml/badge.svg)](https://github.com/virgesmith/UKCensusAPI/actions/workflows/R-build-test.yml) 10 | 11 | > ## Update 12 | > This package has been something of a misnomer as it only used Nomisweb as its data source, which only provides full census data for England & Wales. (They do provide some UK key statistics and quick statistics tables). 13 | 14 | > Version 1.1.x of this package extends the 2011 census data coverage for Scotland and Northern Ireland. The aim is to make the data (and the metadata) consistent across all nations, but as neither country provide a web API for their data we have to resort to web scraping. This means the slicing-and-dicing and geographical query functionality may be more limited than it is for England & Wales. Note also that category values in equivalent tables may differ slightly. 15 | 16 | > ### Scotland 17 | > For Scotland, data can be downloaded at country or Council Area (~LAD) level, at geographical resolutions of Council Area, Data Zone (~LSOA) and Output Area. Intermediate Area (~MSOA) data can be aggregated (only) where the data is available at a higher geographical resolution. 18 | 19 | > The principal functions are `NRScotland.get_metadata()` for metadata, `NRScotland.get_data()` for the actual data, and `NRScotland.contextify()` to annotate the data using the metadata. 20 | 21 | > **NB The OA-level Scotland data is provided in a zip compression format (deflate64) that python cannot extract. If this data is requested, you'll get an error message containing instructions on how to fix the issue by manually extracting the file(s) using unzip or 7zip.** 22 | 23 | > ### Northern Ireland 24 | > For Northern Ireland, data can be downloaded at country or Local Government District (~LAD) level, at geographical resolutions of Super Output Area (~LSOA) and Small Area (OA). Ward (~MSOA) (~MSOA) data can be aggregated (only) where the data is available at higher geographical resolution. 25 | > The principal functions are `NISRA.get_metadata()` for metadata, `NISRA.get_data()` for the actual data, and `NISRA.contextify()` to annotate the data using the metadata. 26 | 27 | [Nomisweb](https://www.nomisweb.co.uk), run by Durham University, provides online access to the most detailed and up-to-date statistics from official sources for local areas throughout the UK, including census data. 28 | 29 | This package provides both a `python` and an `R` wrapper around the nomisweb census data API, the NRScotland and NISRA websites, enabling: 30 | 31 | - querying table metadata 32 | - autogenerating customised python and R query code for future use 33 | - automated cached data downloads 34 | - modifying the geography of queries 35 | - adding descriptive information to tables (from metadata) 36 | 37 | Queries can be customised on geographical coverage, geographical resolution, and table fields, the latter can be filtered to include only the category values you require. 38 | 39 | The package can generate reusable code snippets that can be inserted into applications. Such applications will work seamlessly for any user as long as they have installed this package, and possess their own nomisweb API key. 40 | 41 | Since census data is essentially static, it makes little sense to download the data every time it is requested: all data downloads are cached. 42 | 43 | Example code is also provided which: 44 | - shows how an existing query can easily be modified in terms of geographical coverage. 45 | - shows how raw data can be annotated with meaningful metadata 46 | 47 | ## Prerequisites 48 | 49 | ### Software 50 | 51 | - python3.4 or higher, with pip, numpy and pandas. The dependencies should install automatically. Python 2 is not supported. 52 | - R version 3.3.3 or higher (optional, if using the R interface) 53 | 54 | ### API key 55 | 56 | It is recommended that you register with [nomisweb](https://www.nomisweb.co.uk) before using this package and use the API key the supply you in all queries. Without a key, queries will be truncated (max 25000 rows). With a key, the row limit is 1000000 and this package will warn if a query generates data with this number of rows. 57 | 58 | Once registered, you will find your API key on [this page](https://www.nomisweb.co.uk/myaccount/webservice.asp). You should not divulge this key to others. 59 | 60 | This package will look for the key in the following places (in order): 61 | - locally: a file `NOMIS_API_KEY` in the cache directory defined at initialisation, e.g. 62 | ``` 63 | $ cat cache/NOMIS_API_KEY 64 | 0x0000000000000000000000000000000000000000 65 | ``` 66 | - globally: the environment variable NOMIS_API_KEY. R users can store the key in their `.Renviron` file: R will set the environment on startup, which will be visible to a python session instantiated from R. 67 | 68 | Initialisation will fail if the key is not defined in one of these locations. Note: if for some reason you cannot register with nomisweb, you must still define an API key - just set it to an obviously invalid value. 69 | 70 | ## Installation 71 | 72 | ### python release (from PyPI) 73 | 74 | ```sh 75 | pip install UKCensusAPI 76 | ``` 77 | 78 | (NB This will install only the core package without the examples.) 79 | 80 | ### python release (from conda-forge) 81 | 82 | ```sh 83 | conda install -c conda-forge ukcensusapi 84 | ``` 85 | (NB This will install only the core package without the examples.) 86 | 87 | ### python main branch (from github) 88 | 89 | ```sh 90 | pip install git+https://github.com/virgesmith/UKCensusAPI.git 91 | ``` 92 | 93 | or for local development, clone and separately install: 94 | 95 | ```sh 96 | git clone git+https://github.com//UKCensusAPI.git 97 | pip install -e . 98 | ``` 99 | 100 | To test: 101 | 102 | ```sh 103 | pytest 104 | ``` 105 | 106 | ### R 107 | 108 | ```R 109 | > devtools::install_github("virgesmith/UKCensusAPI") 110 | ``` 111 | 112 | Set the `RETICULATE_PYTHON` environment variable in your .Renviron file to the python3 interpreter, e.g. (for linux) 113 | 114 | ```sh 115 | RETICULATE_PYTHON=$(which python3) 116 | ``` 117 | 118 | ## Usage 119 | 120 | In your Python code import the package like e.g.: 121 | 122 | ```py 123 | import ukcensusapi.Nomisweb as census_api 124 | ``` 125 | And in R: 126 | 127 | ```R 128 | library(UKCensusAPI) 129 | ``` 130 | 131 | ### Queries 132 | 133 | Queries have three distinct subtypes: 134 | 135 | - metadata: query a table for the fields and categories it contains 136 | - geography: retrieve a list of area codes of a particular type within a given region of another (larger) type. 137 | - data: retrieve data from a table using a query built from the metadata and geography. 138 | 139 | Data and metadata are cached locally to minimise requests to the data providers. 140 | 141 | Using the interactive query builder, and a known table, you can construct a programmatically reusable query selecting categories, specific category values, and (optionally) geography, See example below. 142 | 143 | Queries can subsequently be programmatically modified to switched to a different geographical region and/or resolution. 144 | 145 | ### Interactive Query 146 | 147 | The first thing users may want to do is an interactive query. All you need to do is specify the name of a census table. The script will then iterate over the categories within the table, prompting you user to select the categories and values you're interested in. 148 | 149 | Once done you'll be prompted to (optionally) specify a geography for the data - a geographical region and a resolution. 150 | 151 | Finally, if you've specified the geography, the script will ask if you want to download (and cache) the data immediately. 152 | 153 | This can be run using this script: 154 | ```bash 155 | $ ukcensus-query [--no-api-key] 156 | ``` 157 | An API key must be specified (see [above](#api-key)) unless the `--no-api-key` flag has been set. 158 | 159 | The script will produce the following files (in the supplied cache directory): 160 | 161 | - a json file containing the table metadata 162 | - python and R code snippets that build the query and call this package to download the data 163 | - (optionally, depending on above selections) the data itself 164 | 165 | The code snippets are designed to be copy/pasted into user code. The (cached) data and metadata can simply be loaded by user code as required. 166 | 167 | Note for R users - there is no direct R script for the interactive query largely due to the fact it will not work from within RStudio (due to the way RStudio handles stdin). 168 | 169 | ### Data reuse 170 | 171 | Existing cached data is always used in preference to downloading. The data is stored locally using a filename based on the table name and md5 hash of the query used to download the data. This way, different queries on the same table can be stored. 172 | 173 | To force the data to be downloaded, just delete the cached data. 174 | 175 | ### Query Reuse 176 | 177 | The code snippets can simply be inserted into user code, and the metadata (json) can be used as a guide for modifying the query, either manually or automatically. 178 | 179 | ### Switching Geography 180 | 181 | Existing queries can easily be modified to switch to a different geographical area and/or a different geographical resolution. 182 | 183 | This allows, for example, users to write models where the geographical coverage and resolution can be user inputs. 184 | 185 | Examples of how to do this are in [`geoquery.py`](inst/examples/geoquery.py) and [`geoquery.R`](inst/examples/geoquery.R). 186 | 187 | ### Annotating Data 188 | 189 | Queries will download data with a minimal memory footprint, but also metadata that provides meaning. Whilst this makes manipulating and querying the data efficient, it means that the data itself lacks human-readability. For this reason the package provides a way of annotating tables with contextual data derived from the table metadata. 190 | 191 | Examples of how to do this are in [`contextify.py`](inst/examples/contextify.py) and [`contextify.R`](inst/examples/contextify.R). 192 | 193 | ## Interactive Query Builder 194 | 195 | This functionality requires that you already know the name of the census table of interest, and want to define a custom query on that table, for a specific geography at a specific resolution. 196 | 197 | If you're unsure about which table to query, Nomisweb provide a useful [table finder](https://www.nomisweb.co.uk/census/2011/data_finder). NB Not all census tables are available at all geographical resolutions, but the above link will enumerate the available resolutions for each table. 198 | 199 | ### Interactive Query - Example 200 | 201 | Run the script. You'll be prompted to enter the name of the census table of interest: 202 | 203 |
204 | $ ukcensus-query .
205 | Nomisweb census data interactive query builder
206 | See README.md for details on how to use this package
207 | Census table: KS401EW
208 | KS401EW - Dwellings, household spaces and accommodation type
209 | 
210 | 211 | The table description is displayed. The script then iterates through the available fields and you are prompted to select the categories you require. For the purposes of this example let's say we only want a subset of the fields: just some of the dwelling types. Required values should be comma separated, or where contiguous, separated by '...'. 212 | 213 |
214 | CELL:
215 |   0 (All categories: Dwelling type)
216 |   1 (Unshared dwelling)
217 |   2 (Shared dwelling: Two household spaces)
218 |   3 (Shared dwelling: Three or more household spaces)
219 |   4 (All categories: Household spaces)
220 |   5 (Household spaces with at least one usual resident)
221 |   6 (Household spaces with no usual residents)
222 |   7 (Whole house or bungalow: Detached)
223 |   8 (Whole house or bungalow: Semi-detached)
224 |   9 (Whole house or bungalow: Terraced (including end-terrace))
225 |   10 (Flat, maisonette or apartment: Purpose-built block of flats or tenement)
226 |   11 (Flat, maisonette or apartment: Part of a converted or shared house (including bed-sits))
227 |   12 (Flat, maisonette or apartment: In a commercial building)
228 |   13 (Caravan or other mobile or temporary structure)
229 | Select categories (default 0): 7...13
230 | 
231 | Select the output type you want (absolute values or percentages) 232 |
233 | MEASURES:
234 |   20100 (value)
235 |   20301 (percent)
236 | Select categories (default 0): 20100
237 | 
238 | For the purposes of this example we don't require the RURAL_URBAN field in our output, so we just hit return to accept the default selection. When the default is selected, the query builder will prompt you for whether you want to include this field in the output. (If something other than the default is not selected, the query builder will always assume that you want the field in the output.) 239 |
240 | RURAL_URBAN:
241 |   0 (Total)
242 |   1 (Urban city and town in a sparse setting)
243 |   2 (Urban major conurbation)
244 |   3 (Urban minor conurbation)
245 |   4 (Urban city and town)
246 |   101 (Rural (total))
247 |   6 (Rural village in a sparse setting)
248 |   7 (Rural hamlet and isolated dwellings in a sparse setting)
249 |   8 (Rural town and fringe)
250 |   9 (Rural village)
251 |   10 (Rural hamlet and isolated dwellings)
252 |   100 (Urban (total))
253 |   5 (Rural town and fringe in a sparse setting)
254 | Select categories (default 0): 
255 | include in output? (y/n) n
256 | 
257 | Now you can optionally select the geographical area(s) you want to cover. This can be a single local authority, multiple local authorities, England, England & Wales, GB or UK. If a local authority, you can specify it either by name or ONS code (e.g. E09000001) 258 |
259 | Add geography? (y/N): y
260 | 
261 | Geographical coverage
262 | E/EW/GB/UK or LA code(s)/name(s), comma separated: Leeds
263 | 
264 | All the available geographies for the data are displayed. Select the geographical resolution required. 265 |
266 | TYPE265 NHS area teams
267 | TYPE266 clinical commissioning groups
268 | TYPE267 built-up areas including subdivisions
269 | TYPE269 built-up areas
270 | TYPE273 national assembly for wales electoral regions 2010
271 | TYPE274 postcode areas
272 | TYPE275 postcode districts
273 | TYPE276 postcode sectors
274 | TYPE277 national assembly for wales constituencies 2010
275 | TYPE279 parishes 2011
276 | TYPE282 2011 local health boards
277 | TYPE283 2011 primary care trusts
278 | TYPE284 2011 strategic health authorities
279 | TYPE295 2011 wards
280 | TYPE297 2011 super output areas - middle layer
281 | TYPE298 2011 super output areas - lower layer
282 | TYPE299 2011 output areas
283 | TYPE459 local enterprise partnerships (as of April 2017)
284 | TYPE460 parliamentary constituencies 2010
285 | TYPE462 former metropolitan counties
286 | TYPE463 local authorities: county / unitary (prior to April 2015)
287 | TYPE464 local authorities: district / unitary (prior to April 2015)
288 | TYPE480 regions
289 | TYPE499 countries
290 | Select Resolution: TYPE297
291 | 
292 | 293 | You will then be prompted to choose whether to download the data immediately. If so, the query builder assembles the query and computes an md5 hash of it. It then checks the cache directory if a file with this name exists and will load the data from the file if so. If not, the query builder downloads the data and save the data in the cache directory. 294 | ``` 295 | Get data now? (y/N): y 296 | 297 | Getting data... 298 | 299 | Writing python code snippet to KS401EW.py 300 | 301 | Writing R code snippet to KS401EW.R 302 | $ 303 | ``` 304 | Regardless of whether you selected geography, or downloaded the data, the query builder will generate python and R code snippets for later use. 305 | 306 | The generated python code snippet is: 307 | 308 | ``` 309 | """ 310 | KS401EW - Dwellings, household spaces and accommodation type 311 | 312 | Code autogenerated by UKCensusAPI 313 | (https://github.com/virgesmith/UKCensusAPI) 314 | """ 315 | 316 | # This code requires an API key, see the README.md for details 317 | 318 | # Query url: 319 | # https://www.nomisweb.co.uk/api/v01/dataset/NM_618_1.data.tsv?CELL=7...13&MEASURES=20100&RURAL_URBAN=0&date=latest&geography=1245714681...1245714688&select=GEOGRAPHY_CODE%2CCELL%2COBS_VALUE 320 | 321 | import ukcensusapi.Nomisweb as CensusApi 322 | 323 | api = CensusApi.Nomisweb("/tmp/UKCensusAPI/") 324 | table = "KS401EW" 325 | table_internal = "NM_618_1" 326 | query_params = {} 327 | query_params["RURAL_URBAN"] = "0" 328 | query_params["select"] = "GEOGRAPHY_CODE,CELL,OBS_VALUE" 329 | query_params["date"] = "latest" 330 | query_params["geography"] = "1245714681...1245714688" 331 | query_params["MEASURES"] = "20100" 332 | query_params["CELL"] = "7...13" 333 | KS401EW = api.get_data(table, query_params) 334 | ``` 335 | The the R code: 336 | ``` 337 | # KS401EW - Dwellings, household spaces and accommodation type 338 | 339 | # Code autogenerated by UKCensusAPI 340 | #https://github.com/virgesmith/UKCensusAPI 341 | 342 | # This code requires an API key, see the README.md for details 343 | # Query url: https://www.nomisweb.co.uk/api/v01/dataset/NM_618_1.data.tsv?CELL=7...13&MEASURES=20100&RURAL_URBAN=0&date=latest&geography=1245714681...1245714688&select=GEOGRAPHY_CODE%2CCELL%2COBS_VALUE 344 | 345 | library("UKCensusAPI") 346 | cacheDir = "/tmp/UKCensusAPI/" 347 | api = UKCensusAPI::instance(cacheDir) 348 | table = "KS401EW" 349 | table_internal = "NM_618_1" 350 | queryParams = list( 351 | RURAL_URBAN = "0", 352 | select = "GEOGRAPHY_CODE,CELL,OBS_VALUE", 353 | date = "latest", 354 | geography = "1245714681...1245714688", 355 | MEASURES = "20100", 356 | CELL = "7...13" 357 | ) 358 | KS401EW = UKCensusAPI::getData(api, table, table_internal, queryParams) 359 | ``` 360 | Users can then copy and paste the generated code snippets into their models, modifying as necessary, to automate the download of the correct data. The metadata looks like this: 361 | 362 | ``` 363 | { 364 | "nomis_table": "NM_618_1", 365 | "description": "KS401EW - Dwellings, household spaces and accommodation type", 366 | "fields": { 367 | "GEOGRAPHY": { 368 | "2092957703": "England and Wales", 369 | "2092957699": "England", 370 | "2092957700": "Wales" 371 | }, 372 | "RURAL_URBAN": { 373 | "0": "Total", 374 | "100": "Urban (total)", 375 | "2": "Urban major conurbation", 376 | "3": "Urban minor conurbation", 377 | "4": "Urban city and town", 378 | "1": "Urban city and town in a sparse setting", 379 | "101": "Rural (total)", 380 | "8": "Rural town and fringe", 381 | "5": "Rural town and fringe in a sparse setting", 382 | "9": "Rural village", 383 | "6": "Rural village in a sparse setting", 384 | "10": "Rural hamlet and isolated dwellings", 385 | "7": "Rural hamlet and isolated dwellings in a sparse setting" 386 | }, 387 | "CELL": { 388 | "0": "All categories: Dwelling type", 389 | "1": "Unshared dwelling", 390 | "2": "Shared dwelling: Two household spaces", 391 | "3": "Shared dwelling: Three or more household spaces", 392 | "4": "All categories: Household spaces", 393 | "5": "Household spaces with at least one usual resident", 394 | "6": "Household spaces with no usual residents", 395 | "7": "Whole house or bungalow: Detached", 396 | "8": "Whole house or bungalow: Semi-detached", 397 | "9": "Whole house or bungalow: Terraced (including end-terrace)", 398 | "10": "Flat, maisonette or apartment: Purpose-built block of flats or tenement", 399 | "11": "Flat, maisonette or apartment: Part of a converted or shared house (including bed-sits)", 400 | "12": "Flat, maisonette or apartment: In a commercial building", 401 | "13": "Caravan or other mobile or temporary structure" 402 | }, 403 | "MEASURES": { 404 | "20100": "value", 405 | "20301": "percent" 406 | }, 407 | "FREQ": { 408 | "A": "Annually" 409 | } 410 | }, 411 | "geographies": { 412 | "TYPE265": "NHS area teams", 413 | "TYPE266": "clinical commissioning groups", 414 | "TYPE267": "built-up areas including subdivisions", 415 | "TYPE269": "built-up areas", 416 | "TYPE273": "national assembly for wales electoral regions 2010", 417 | "TYPE274": "postcode areas", 418 | "TYPE275": "postcode districts", 419 | "TYPE276": "postcode sectors", 420 | "TYPE277": "national assembly for wales constituencies 2010", 421 | "TYPE279": "parishes 2011", 422 | "TYPE282": "2011 local health boards", 423 | "TYPE283": "2011 primary care trusts", 424 | "TYPE284": "2011 strategic health authorities", 425 | "TYPE295": "2011 wards", 426 | "TYPE297": "2011 super output areas - middle layer", 427 | "TYPE298": "2011 super output areas - lower layer", 428 | "TYPE299": "2011 output areas", 429 | "TYPE459": "local enterprise partnerships (as of April 2017)", 430 | "TYPE460": "parliamentary constituencies 2010", 431 | "TYPE462": "former metropolitan counties", 432 | "TYPE463": "local authorities: county / unitary (prior to April 2015)", 433 | "TYPE464": "local authorities: district / unitary (prior to April 2015)", 434 | "TYPE480": "regions", 435 | "TYPE499": "countries" 436 | } 437 | } 438 | ``` 439 | If you've selected to download the data, a tsv file (like csv but with a tab separator) called `KS401EW_8a13b34bade69f230b62ce0875c47437.tsv` will be saved in the cache directory: 440 | 441 | ``` 442 | "GEOGRAPHY_CODE" "CELL" "OBS_VALUE" 443 | "E02002330" "7" 1736 444 | "E02002330" "8" 743 445 | "E02002330" "9" 224 446 | "E02002330" "10" 106 447 | "E02002330" "11" 13 448 | "E02002330" "12" 7 449 | "E02002330" "13" 0 450 | "E02002331" "7" 597 451 | "E02002331" "8" 797 452 | ... 453 | ``` 454 | 455 | The data in this table has (for brevity and efficiency) the values "7" to "13" in the cell column, which are obviously meaningless without context. Meaning can be conveyed using the metadata that is also downloaded and cached locally. It's probably best to leave this step until the result stage, but you can annotate a table, given a column name and the appropriate metadata, using the `contextify` function, like this: 456 | 457 | ``` 458 | "GEOGRAPHY_CODE" "CELL" "OBS_VALUE" "CELL_NAME" 459 | "E02002330" "7" 1736 "Whole house or bungalow: Detached" 460 | "E02002330" "8" 743 "Whole house or bungalow: Semi-detached" 461 | "E02002330" "9" 224 "Whole house or bungalow: Terraced (including end-terrace)" 462 | "E02002330" "10" 106 "Flat, maisonette or apartment: Purpose-built block of flats or tenement" 463 | "E02002330" "11" 13 "Flat, maisonette or apartment: Part of a converted or shared house (including bed-sits)" 464 | "E02002330" "12" 7 "Flat, maisonette or apartment: In a commercial building" 465 | "E02002330" "13" 0 "Caravan or other mobile or temporary structure" 466 | "E02002331" "7" 597 "Whole house or bungalow: Detached" 467 | "E02002331" "8" 797 "Whole house or bungalow: Semi-detached" 468 | ... 469 | ``` 470 | See the example code in [contextify.py](inst/examples/contextify.py) and/or [contextify.R](inst/examples/contextify.R) 471 | 472 | ## Detailed Help 473 | 474 | ### Public classes/methods (python) 475 | 476 | Use python's built-in help functionality, e.g. 477 | ``` 478 | >>> import ukcensusapi.Nomisweb as api 479 | >>> help(api) 480 | ... 481 | >>> import ukcensusapi.Query as query 482 | >>> help(query) 483 | ``` 484 | ### Public functions (R) 485 | 486 | See the man pages, which can be accessed from RStudio using the command `?UKCensusAPI` 487 | 488 | ## Support and Feature Requests 489 | 490 | Please use the issues section to report bugs, request features and see status of existing issues. Code contributions (by PR) are most welcome. 491 | 492 | 493 | 494 | -------------------------------------------------------------------------------- /README.txt: -------------------------------------------------------------------------------- 1 | See README.md 2 | -------------------------------------------------------------------------------- /UKCensusAPI.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageCheckArgs: --as-cran 22 | PackageRoxygenize: rd,collate,namespace 23 | -------------------------------------------------------------------------------- /doc/paper.bib: -------------------------------------------------------------------------------- 1 | 2 | @misc{noauthor_nomis_nodate, 3 | title = {Nomis - {Official} {Labour} {Market} {Statistics}}, 4 | url = {https://www.nomisweb.co.uk/}, 5 | urldate = {2017-09-06} 6 | } 7 | 8 | @misc{smith_ukcensusapi:_2017, 9 | title = {{UKCensusAPI}: {UK} {Census} {Data} queries and downloads from python or {R}}, 10 | copyright = {MIT}, 11 | shorttitle = {{UKCensusAPI}}, 12 | url = {https://github.com/virgesmith/UKCensusAPI}, 13 | urldate = {2017-09-06}, 14 | author = {Smith, Andrew}, 15 | month = sep, 16 | year = {2017}, 17 | note = {original-date: 2017-08-08T14:34:02Z}, 18 | keywords = {data-science, python, r}, 19 | } 20 | -------------------------------------------------------------------------------- /doc/paper.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'UKCensusAPI: python and R interfaces to the nomisweb UK census data API' 3 | tags: 4 | - python 5 | - r 6 | - data science 7 | authors: 8 | - name: Andrew P Smith 9 | orcid: 0000-0002-9951-6642 10 | affiliation: 1 11 | affiliations: 12 | - name: School of Geography and Leeds Institute for Data Analytics, University of Leeds 13 | index: 1 14 | date: 6 September 2017 15 | bibliography: paper.bib 16 | --- 17 | 18 | # Summary 19 | 20 | Nomisweb [@noauthor_nomis_nodate] provide an extremely useful API for querying and downloading UK census data. However, in practice data queries must be built manually and the query URL copied and pasted into user code. This makes modification of queries laborious and this is especially so when (re)defining the geographical coverage and resolution of a query. 21 | 22 | This package [@smith_ukcensusapi:_2017] provides both python and R interfaces around the nomisweb API that address these shortcomings. It contains functionality to: 23 | - query tables directly for their metadata 24 | - autogenerate customised python and R query code for reuse 25 | - automate and cache data and metadata downloads 26 | - easily modify the geographical coverage and resolution of existing queries 27 | - add descriptive information to downloaded tables (from metadata) 28 | 29 | This is particularly useful in applications such as microsimulation, where there are requirements to run the model for different geographical areas and/or different geographical resolutions with minimal user/developer intervention. 30 | 31 | # References 32 | -------------------------------------------------------------------------------- /inst/examples/contextify.R: -------------------------------------------------------------------------------- 1 | ############################################################### 2 | # Example: Annotating data from metadata 3 | # 4 | # shows how raw data can be annotated with meaningful metadata 5 | ############################################################### 6 | 7 | library("UKCensusAPI") 8 | 9 | cacheDir = "/tmp/UKCensusAPI" 10 | 11 | # Here's a predefined query, to which we add contextual data 12 | 13 | table = "KS401EW" 14 | queryParams = list( 15 | date = "latest", 16 | RURAL_URBAN = "0", 17 | MEASURES = "20100", 18 | CELL = "7...13", 19 | geography = "1245710558...1245710660", 20 | select = "GEOGRAPHY_CODE,CELL,OBS_VALUE" 21 | ) 22 | 23 | api = instance(cacheDir) 24 | 25 | # Fetch the data 26 | KS401EW = getData(api, table, queryParams) 27 | 28 | # Add the context... 29 | KS401EW = contextify(api, table, "CELL", KS401EW) 30 | head(KS401EW) 31 | 32 | # end of example 33 | 34 | -------------------------------------------------------------------------------- /inst/examples/contextify.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import pandas as pd 4 | 5 | """ 6 | Example of adding context to a table 7 | """ 8 | 9 | import ukcensusapi.Nomisweb as Api 10 | 11 | def main(): 12 | api = Api.Nomisweb("/tmp/UKCensusAPI") 13 | 14 | print("Nomisweb census data geographical query example") 15 | print("See README.md for details on how to use this package") 16 | 17 | # Heres predefined query on a small geographical area 18 | table = "KS401EW" 19 | query_params = {} 20 | query_params["CELL"] = "7...13" 21 | query_params["date"] = "latest" 22 | query_params["RURAL_URBAN"] = "0" 23 | query_params["select"] = "GEOGRAPHY_CODE,CELL,OBS_VALUE" 24 | query_params["geography"] = "1245710558...1245710560" 25 | query_params["MEASURES"] = "20100" 26 | 27 | ks401 = api.get_data(table, query_params) 28 | # display the first ten rows 29 | print(ks401.head(10)) 30 | 31 | # Now add context - the desriptions of the values (7 to 13) in the CELL column 32 | api.contextify(table, "CELL", ks401) 33 | print(ks401.head(10)) 34 | ks401.to_csv("/tmp/contextified", sep="\t") 35 | 36 | if __name__ == "__main__": 37 | main() 38 | -------------------------------------------------------------------------------- /inst/examples/geoquery.R: -------------------------------------------------------------------------------- 1 | ############################################################### 2 | # Example: Modifying the geography of a query: 3 | # 4 | # This file shows how an existing query can easily be modified 5 | # in terms of geographical coverage: 6 | ############################################################### 7 | library("UKCensusAPI") 8 | 9 | cacheDir = "/tmp/UKCensusAPI" 10 | 11 | # Here's a predefined query using Leeds at MSOA resolution, 12 | # but we want to change the geographical area and refine the resolution 13 | table = "KS401EW" 14 | queryParams = list( 15 | date = "latest", 16 | RURAL_URBAN = "0", 17 | MEASURES = "20100", 18 | CELL = "7...13", 19 | geography = "1245710558...1245710660,1245714998...1245714998,1245715007...1245715007,1245715021...1245715022", 20 | select = "GEOGRAPHY_CODE,CELL,OBS_VALUE" 21 | ) 22 | 23 | api = instance(cacheDir) 24 | 25 | # Define the new region and resolution 26 | coverage = c("City of London", "Westminster") 27 | resolution = geoCodeLookup(api, "OA11") # OA 2011 - see NomiswebApi.py 28 | 29 | # Modify the query 30 | coverageCodes = getLADCodes(api, coverage) 31 | queryParams["geography"] = geoCodes(api, coverageCodes, resolution) 32 | 33 | # Fetch the new data 34 | KS401EW = getData(api, table, queryParams) 35 | 36 | # End of example 37 | 38 | 39 | -------------------------------------------------------------------------------- /inst/examples/geoquery.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import ukcensusapi.Nomisweb as Api 4 | 5 | def main(): 6 | api = Api.Nomisweb("/tmp/UKCensusAPI") 7 | 8 | print("Nomisweb census data geographical query example") 9 | print("See README.md for details on how to use this package") 10 | 11 | # In the previous example we had a predefined query using Leeds at MSOA resolution, 12 | # but we want to expand the geographical area and refine the resolution 13 | table = "KS401EW" 14 | query_params = {} 15 | query_params["CELL"] = "7...13" 16 | query_params["date"] = "latest" 17 | query_params["RURAL_URBAN"] = "0" 18 | query_params["select"] = "GEOGRAPHY_CODE,CELL,OBS_VALUE" 19 | query_params["geography"] = "1245710558...1245710660,1245714998...1245714998,1245715007...1245715007,1245715021...1245715022" 20 | query_params["MEASURES"] = "20100" 21 | 22 | # Define the new coverage area in terms of local authorities 23 | coverage = ["Leeds", "Bradford"] 24 | # Define the new resolution 25 | resolution = Api.Nomisweb.GeoCodeLookup["OA11"] 26 | # Convert the coverage area into nomis codes 27 | coverage_codes = api.get_lad_codes(coverage) 28 | # replace the geography value in the query 29 | query_params["geography"] = api.get_geo_codes(coverage_codes, resolution) 30 | # get the data 31 | ks401fine = api.get_data(table, query_params) 32 | print(ks401fine.head(5)) 33 | 34 | # Now widen the coverage to England & Wales and coarsen the resolution to LA 35 | coverage_codes = [Api.Nomisweb.GeoCodeLookup["EnglandWales"]] 36 | resolution = Api.Nomisweb.GeoCodeLookup["LAD"] 37 | query_params["geography"] = api.get_geo_codes(coverage_codes, resolution) 38 | # get the data 39 | ks401broad = api.get_data(table, query_params) 40 | print(ks401broad.head(5)) 41 | 42 | if __name__ == "__main__": 43 | main() 44 | -------------------------------------------------------------------------------- /inst/scripts/package.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | version=1.1.6 4 | 5 | # package 6 | python3 setup.py sdist bdist_wheel 7 | # upload 8 | twine upload --repository-url https://test.pypi.org/legacy/ dist/ukcensusapi-$version* 9 | #twine upload --repository-url https://upload.pypi.org/legacy/ dist/ukcensusapi-$version* 10 | if [ "$?" -ne "0" ]; then 11 | echo "upload failed" 12 | exit 1 13 | fi 14 | 15 | # test package in tmp env 16 | # segregrated env PYTHONPATH="" to be certain 17 | virtualenv -p python3 --no-site-packages /tmp/env 18 | source /tmp/env/bin/activate 19 | 20 | # local wheel 21 | #python3 -m pip install ~/dev/UKCensusAPI/dist/ukcensusapi-$version-py3-none-any.whl 22 | # test pypi 23 | python3 -m pip install --index-url https://test.pypi.org/simple/ UKCensusAPI --user 24 | # real pypi 25 | #python3 -m pip install UKCensusAPI 26 | 27 | ukcensus-query 28 | 29 | # clean up 30 | deactivate 31 | rm -rf /tmp/env 32 | -------------------------------------------------------------------------------- /inst/scripts/ukcensus-query: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Disable "Invalid constant name" 4 | # pylint: disable=C0103 5 | 6 | # -*- coding: utf-8 -*- 7 | """ 8 | interactive census table query 9 | """ 10 | import os 11 | 12 | import argparse 13 | 14 | import ukcensusapi.Nomisweb as CensusApi 15 | import ukcensusapi.Query as Census 16 | 17 | 18 | def main(cache_dir): 19 | # intialise the API using current directory as the cache directory 20 | 21 | # initialise the census query 22 | census = Census.Query(cache_dir) 23 | 24 | # run the interactive query 25 | census.table() 26 | 27 | if __name__ == "__main__": 28 | parser = argparse.ArgumentParser(description="ukcensus interactive query builder") 29 | parser.add_argument("cache_dir", type=str, help="the directory in which to cache data (optionally containing API key") 30 | parser.add_argument("--no-api-key", action='store_const', const=True, default=False, help="use a dummy nomisweb API key") 31 | 32 | args = parser.parse_args() 33 | # set a dummy API key if requested 34 | if args.no_api_key: 35 | print("WARNING: Using a dummy nomisweb API key, data downloads are truncated at 25000 rows") 36 | os.environ["NOMIS_API_KEY"] = "DUMMY" 37 | main(args.cache_dir) -------------------------------------------------------------------------------- /man/UKCensusAPI.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/UKCensusAPI.R 3 | \docType{package} 4 | \name{UKCensusAPI} 5 | \alias{UKCensusAPI} 6 | \title{UKCensusAPI} 7 | \description{ 8 | R package for creating, and modifying, automated downloads of UK census data. See below for an overview of the package. 9 | } 10 | \details{ 11 | It requires that you register with www.nomisweb.co.uk and obtain an API key, 12 | whiBch should be stored in your .Renviron as "NOMIS_API_KEY", e.g. 13 | 14 | \samp{NOMIS_API_KEY=0x0123456789abcdef0123456789abcdef01234567} 15 | See README.md for detailed information and examples. 16 | } 17 | \section{Overview}{ 18 | 19 | Nomisweb, run by Durham University, provides online access to the most detailed and up-to-date statistics from official sources for local areas throughout the UK, including census data. 20 | This package provides both a python and an R wrapper around the nomisweb census data API, enabling: 21 | 22 | \itemize{ 23 | \item querying table metadata 24 | \item autogenerating customised python and R query code for future use 25 | \item automated cached data downloads 26 | \item modifying the geography of queries 27 | \item adding descriptive information to tables (from metadata) 28 | } 29 | 30 | Queries can be customised on geographical coverage, geographical resolution, and table fields, the latter can be filtered to include only the category values you require. 31 | The package generates reusable code snippets that can be inserted into applications. Such applications will work seamlessly for any user as long as they have installed this package, and possess their own nomisweb API key. 32 | Since census data is essentially static, it makes little sense to download the data every time it is requested: all data downloads are cached. 33 | } 34 | 35 | \section{Functions}{ 36 | 37 | \code{\link{geoCodeLookup}} 38 | 39 | \code{\link{geoCodes}} 40 | 41 | \code{\link{getData}} 42 | 43 | \code{\link{getLADCodes}} 44 | 45 | \code{\link{getMetadata}} 46 | 47 | \code{\link{instance}} 48 | 49 | \code{\link{queryInstance}} 50 | 51 | \code{\link{queryMetadata}} 52 | 53 | \code{\link{contextify}} 54 | } 55 | 56 | \examples{ 57 | ############################################################### 58 | # Example: Modifying the geography of a query: 59 | # 60 | # This file shows how an existing query can easily be modified 61 | # in terms of geographical coverage: 62 | ############################################################### 63 | library("UKCensusAPI") 64 | 65 | cacheDir = "/tmp/UKCensusAPI" 66 | 67 | # Here's a predefined query using Leeds at MSOA resolution, 68 | # but we want to change the geographical area and refine the resolution 69 | table = "KS401EW" 70 | queryParams = list( 71 | date = "latest", 72 | RURAL_URBAN = "0", 73 | MEASURES = "20100", 74 | CELL = "7...13", 75 | geography = "1245710558...1245710660,1245714998...1245714998,1245715007...1245715007,1245715021...1245715022", 76 | select = "GEOGRAPHY_CODE,CELL,OBS_VALUE" 77 | ) 78 | 79 | api = instance(cacheDir) 80 | 81 | # Define the new region and resolution 82 | coverage = c("City of London", "Westminster") 83 | resolution = geoCodeLookup(api, "OA11") # OA 2011 - see NomiswebApi.py 84 | 85 | # Modify the query 86 | coverageCodes = getLADCodes(api, coverage) 87 | queryParams["geography"] = geoCodes(api, coverageCodes, resolution) 88 | 89 | # Fetch the new data 90 | KS401EW = getData(api, table, queryParams) 91 | 92 | # End of example 93 | 94 | 95 | ############################################################### 96 | # Example: Annotating data from metadata 97 | # 98 | # shows how raw data can be annotated with meaningful metadata 99 | ############################################################### 100 | 101 | library("UKCensusAPI") 102 | 103 | cacheDir = "/tmp/UKCensusAPI" 104 | 105 | # Here's a predefined query, to which we add contextual data 106 | 107 | table = "KS401EW" 108 | queryParams = list( 109 | date = "latest", 110 | RURAL_URBAN = "0", 111 | MEASURES = "20100", 112 | CELL = "7...13", 113 | geography = "1245710558...1245710660", 114 | select = "GEOGRAPHY_CODE,CELL,OBS_VALUE" 115 | ) 116 | 117 | api = instance(cacheDir) 118 | 119 | # Fetch the data 120 | KS401EW = getData(api, table, queryParams) 121 | 122 | # Add the context... 123 | KS401EW = contextify(api, table, "CELL", KS401EW) 124 | head(KS401EW) 125 | 126 | # end of example 127 | 128 | } 129 | -------------------------------------------------------------------------------- /man/contextify.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Nomisweb.R 3 | \name{contextify} 4 | \alias{contextify} 5 | \title{contextify} 6 | \usage{ 7 | contextify(api, tableName, columnName, table) 8 | } 9 | \arguments{ 10 | \item{api}{the instance of the an integer vector of nomisweb geographical codes} 11 | 12 | \item{tableName}{name of census table} 13 | 14 | \item{columnName}{name of column in the table} 15 | 16 | \item{table}{the table} 17 | } 18 | \value{ 19 | the table containing a new column with the contextual data 20 | } 21 | \description{ 22 | Append table with a contextual column. 23 | } 24 | \examples{ 25 | \dontrun{ 26 | library("UKCensusAPI") 27 | cacheDir = "/tmp/UKCensusAPI/" 28 | censusapi = UKCensusAPI::instance(cacheDir) 29 | table = "KS401EW" 30 | queryParams = list( 31 | date = "latest", 32 | CELL = "7...13", # dwelling type 33 | select = "GEOGRAPHY_CODE,CELL,OBS_VALUE", 34 | MEASURES = "20100", 35 | geography = "1245710558...1245710560", 36 | RURAL_URBAN = "0" 37 | ) 38 | KS401EW = UKCensusAPI::getData(censusapi, table, queryParams) 39 | annotated = contextify(censusapi, table, "CELL", KS401EW) 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /man/geoCodeLookup.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Nomisweb.R 3 | \name{geoCodeLookup} 4 | \alias{geoCodeLookup} 5 | \title{Geographic code lookup} 6 | \usage{ 7 | geoCodeLookup(api, geoCodeString) 8 | } 9 | \arguments{ 10 | \item{api}{an instance of the API (returned by instance())} 11 | 12 | \item{geoCodeString}{the string representation of a geography, e.g MSOA11} 13 | } 14 | \value{ 15 | code - an integer 16 | } 17 | \description{ 18 | This function returns the nomisweb code for a particular geographic area type 19 | } 20 | \examples{ 21 | library(UKCensusAPI) 22 | censusapi = instance("/tmp/UKCensusAPI") 23 | geoCodeLookup(censusapi, "MSOA11") 24 | } 25 | -------------------------------------------------------------------------------- /man/geoCodes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Nomisweb.R 3 | \name{geoCodes} 4 | \alias{geoCodes} 5 | \title{geoCodes 6 | Get nomisweb geographical codes for a region} 7 | \usage{ 8 | geoCodes(api, coverage, resolution) 9 | } 10 | \arguments{ 11 | \item{api}{the instance of the an integer vector of nomisweb geographical codes} 12 | 13 | \item{coverage}{an integer vector of nomisweb geographical codes} 14 | 15 | \item{resolution}{the nomisweb code for a particular area type (e.g. 297 for MSOA)} 16 | } 17 | \value{ 18 | a compressed string (nomisweb format) containing nomisweb area codes 19 | } 20 | \description{ 21 | geoCodes 22 | Get nomisweb geographical codes for a region 23 | } 24 | \examples{ 25 | library(UKCensusAPI) 26 | censusapi = instance("/tmp/UKCensusAPI") 27 | coverage = getLADCodes(censusapi, c("City of London")) 28 | resolution = geoCodeLookup(censusapi, "LSOA11") 29 | codes = geoCodes(censusapi, coverage, resolution) 30 | } 31 | -------------------------------------------------------------------------------- /man/getData.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Nomisweb.R 3 | \name{getData} 4 | \alias{getData} 5 | \title{getData() 6 | Fetch and cache census data using a predefined query} 7 | \usage{ 8 | getData(api, tableName, query) 9 | } 10 | \arguments{ 11 | \item{api}{a predefined query} 12 | 13 | \item{tableName}{name of census table (e.g KS401EW)} 14 | 15 | \item{query}{query parameters} 16 | } 17 | \value{ 18 | a data.frame contraing the downloaded data 19 | } 20 | \description{ 21 | Ensure all query numeric parameters are passed as strings (e.g. "0" not 0) 22 | This prevents conversion to floating-point which can makie queries fail 23 | } 24 | \examples{ 25 | \dontrun{ 26 | library(UKCensusAPI) 27 | censusapi = instance("/tmp/UKCensusAPI") 28 | table = "KS102EW" 29 | meta=getMetadata(censusapi, table) 30 | # queryParams can be autogenerated using the interactive query functionality 31 | queryParams = list( 32 | geography = "1249902593...1249902596,1249934513...1249934514", 33 | MEASURES = "20100", 34 | select = "GEOGRAPHY_CODE,OBS_VALUE", 35 | CELL = "0", 36 | RURAL_URBAN = "0", 37 | date = "latest" 38 | ) 39 | getData(censusapi, table, queryParams) 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /man/getLADCodes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Nomisweb.R 3 | \name{getLADCodes} 4 | \alias{getLADCodes} 5 | \title{Map local authority names to nomisweb codes} 6 | \usage{ 7 | getLADCodes(api, laNames) 8 | } 9 | \arguments{ 10 | \item{api}{an instance of the UKCensusData API.} 11 | 12 | \item{laNames}{a string vector of local authority names or ONS codes.} 13 | } 14 | \value{ 15 | an integer vector of nomisweb local authority codes 16 | } 17 | \description{ 18 | Map local authority names to nomisweb codes 19 | } 20 | \examples{ 21 | library(UKCensusAPI) 22 | censusapi = instance("/tmp/UKCensusAPI") 23 | codes = getLADCodes(censusapi, c("Leeds","Bradford")) 24 | } 25 | -------------------------------------------------------------------------------- /man/getMetadata.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Nomisweb.R 3 | \name{getMetadata} 4 | \alias{getMetadata} 5 | \title{getMetadata() 6 | Fetch the metadata for a census table} 7 | \usage{ 8 | getMetadata(api, tableName) 9 | } 10 | \arguments{ 11 | \item{api}{the census provider api} 12 | 13 | \item{tableName}{the name of the census table} 14 | } 15 | \value{ 16 | metadata a list data structure containing the table metadata 17 | } 18 | \description{ 19 | getMetadata() 20 | Fetch the metadata for a census table 21 | } 22 | \examples{ 23 | \dontrun{ 24 | library(UKCensusAPI) 25 | censusapi = instance("/tmp/UKCensusAPI") 26 | getMetadata(censusapi, "KS001") 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /man/instance.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/UKCensusAPI.R 3 | \name{instance} 4 | \alias{instance} 5 | \title{get an instance of the python API (required to call any of the functions)} 6 | \usage{ 7 | instance(cacheDir, country = "EW") 8 | } 9 | \arguments{ 10 | \item{cacheDir}{directory to cache data} 11 | 12 | \item{country}{either "EW" (default, nomisweb API), "SC" (NRScotland bulk data), "NI" (NISRA bulk data)} 13 | } 14 | \value{ 15 | an instance of one of the python apis 16 | } 17 | \description{ 18 | get an instance of the python API (required to call any of the functions) 19 | } 20 | -------------------------------------------------------------------------------- /man/queryInstance.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/UKCensusAPI.R 3 | \name{queryInstance} 4 | \alias{queryInstance} 5 | \title{get an instance of the python query (required to call any of the functions)} 6 | \usage{ 7 | queryInstance(cacheDir) 8 | } 9 | \arguments{ 10 | \item{cacheDir}{directory to cache data} 11 | } 12 | \value{ 13 | an instance of the query module 14 | } 15 | \description{ 16 | get an instance of the python query (required to call any of the functions) 17 | } 18 | -------------------------------------------------------------------------------- /man/queryMetadata.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Nomisweb.R 3 | \name{queryMetadata} 4 | \alias{queryMetadata} 5 | \title{Interactive metadata query} 6 | \usage{ 7 | queryMetadata() 8 | } 9 | \description{ 10 | This function calls an interactive script where the user selects a table, a geography, and selects fields, optionally filtering by value. 11 | This script will not run in RStudio due to the way it handles standard input. Please run in a standalone R session (or call the python script directly) 12 | } 13 | \examples{ 14 | \dontrun{queryMetadata()} 15 | } 16 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.pytest.ini_options] 2 | minversion = "6.0" 3 | testpaths = ["tests"] 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | requests 4 | openpyxl 5 | xlrd -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import setuptools 4 | 5 | def readme(): 6 | with open('README.md') as f: 7 | return f.read() 8 | 9 | setuptools.setup(name='ukcensusapi', 10 | version='1.1.6', 11 | description='UK census data query automation', 12 | long_description=readme(), 13 | long_description_content_type="text/markdown", 14 | url='https://github.com/virgesmith/UKCensusAPI', 15 | author='Andrew P Smith', 16 | author_email='a.p.smith@leeds.ac.uk', 17 | packages=setuptools.find_packages(), 18 | install_requires=['numpy', 19 | 'pandas', 20 | 'requests', 21 | 'openpyxl', 22 | 'xlrd'], 23 | classifiers=( 24 | "Programming Language :: Python :: 3", 25 | "License :: OSI Approved :: MIT License", 26 | "Operating System :: OS Independent", 27 | ), 28 | scripts=['inst/scripts/ukcensus-query'], 29 | tests_require=['pytest'], 30 | ) 31 | -------------------------------------------------------------------------------- /tests/extended_scotland.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # These tests are inappropriate for travis as they require too much downloads 4 | # But they should be run locally before any commit 5 | 6 | import ukcensusapi.NRScotland as NRScotland 7 | 8 | census = NRScotland.NRScotland("~/.ukpopulation/cache") 9 | 10 | meta = census.get_metadata("KS401SC", "LAD") 11 | print(meta) 12 | 13 | # KS401 no data at LSOA level, yet there is data at OA? 14 | # compression format of OA data not supported by zipfile 15 | 16 | data = census.get_data("KS401SC", "LAD", "S92000003", category_filters={"KS401SC_0_CODE": 0}) 17 | print(data.head()) 18 | meta = census.get_metadata("DC1117SC", "LAD") 19 | print(meta) 20 | meta = census.get_metadata("DC2101SC", "LAD") 21 | print(meta) 22 | 23 | table = census.get_data("KS402SC", "MSOA11", "S12000033", category_filters={"KS402SC_0_CODE": 0}) 24 | print(len(table) == 49) 25 | table = census.get_data("KS402SC", "LSOA11", "S12000033", category_filters={"KS402SC_0_CODE": 0}) 26 | print(len(table) == 283) 27 | table = census.get_data("KS402SC", "OA11", "S12000033", category_filters={"KS402SC_0_CODE": 0}) 28 | print(len(table) == 1992) 29 | 30 | table = census.get_data("DC1117SC", "LAD", "S12000033") 31 | print(table) 32 | 33 | table = census.get_data("DC2101SC", "LAD", "S12000033", category_filters={ 34 | "DC2101SC_0_CODE": 4, # white irish 35 | "DC2101SC_1_CODE": [1,2], # male & female 36 | "DC2101SC_2_CODE": range(6,13) # 18-49 37 | }) 38 | print(table.shape) 39 | 40 | table = census.contextify(table, meta, "DC2101SC_2_CODE") 41 | print(table) 42 | 43 | -------------------------------------------------------------------------------- /tests/test_all.py: -------------------------------------------------------------------------------- 1 | """ Test harness """ 2 | 3 | import os 4 | from random import sample 5 | import numpy as np 6 | import sys 7 | import pytest 8 | 9 | from ukcensusapi import Nomisweb as Api_EW, NRScotland as Api_SC, NISRA as Api_NI, Query as Census 10 | 11 | CACHE_DIR = "/tmp/UKCensusAPI" 12 | 13 | @pytest.fixture(scope='session') 14 | def api_ew(): return Api_EW.Nomisweb(CACHE_DIR, verbose=True) 15 | 16 | 17 | @pytest.fixture(scope='session') 18 | def api_sc(): return Api_SC.NRScotland(CACHE_DIR) 19 | 20 | 21 | @pytest.fixture(scope='session') 22 | def api_ni(): return Api_NI.NISRA(CACHE_DIR) 23 | 24 | 25 | @pytest.fixture(scope='session') 26 | def query(): return Census.Query(CACHE_DIR) 27 | 28 | 29 | def test_get_lad_codes(api_ew): 30 | assert api_ew.get_lad_codes("Royston Vasey") == [] 31 | assert api_ew.get_lad_codes("Leeds") == [1946157127] 32 | assert api_ew.get_lad_codes(["Leeds", "Bradford"]) == [1946157127, 1946157124] 33 | 34 | 35 | def test_cache_dir_invalid(): 36 | with pytest.raises((OSError, PermissionError)): 37 | Api_EW.Nomisweb("/home/invalid") 38 | with pytest.raises((OSError, PermissionError)): 39 | Api_SC.NRScotland("/bin") 40 | with pytest.raises((OSError, PermissionError)): 41 | Api_NI.NISRA("/bin/ls") 42 | 43 | 44 | # This overlaps test_getGeographyFromCodes 45 | def test_geo_codes_ew(api_ew): 46 | result = api_ew.get_geo_codes([Api_EW.Nomisweb.GeoCodeLookup["EnglandWales"]], Api_EW.Nomisweb.GeoCodeLookup["LAD"]) 47 | assert result == '1946157057...1946157404' 48 | result = api_ew.get_geo_codes([1946157127], Api_EW.Nomisweb.GeoCodeLookup["OA11"]) 49 | assert result == '1254151943...1254154269,1254258198...1254258221,1254261711...1254261745,1254261853...1254261870,1254261894...1254261918,1254262125...1254262142,1254262341...1254262353,1254262394...1254262398,1254262498...1254262532,1254262620...1254262658,1254262922...1254262925' 50 | # test 2001 codes 51 | result = api_ew.get_geo_codes([1946157127], Api_EW.Nomisweb.GeoCodeLookup["MSOA01"]) 52 | assert result == '1279265050...1279265157' 53 | 54 | 55 | def test_geo_codes_sc(api_sc): 56 | lads = sorted(['S12000033', 'S12000034', 'S12000041', 'S12000035', 'S12000026', 'S12000005', 57 | 'S12000039', 'S12000006', 'S12000042', 'S12000008', 'S12000045', 'S12000010', 58 | 'S12000011', 'S12000036', 'S12000014', 'S12000015', 'S12000046', 'S12000017', 59 | 'S12000018', 'S12000019', 'S12000020', 'S12000021', 'S12000044', 'S12000023', 60 | 'S12000024', 'S12000038', 'S12000027', 'S12000028', 'S12000029', 'S12000030', 61 | 'S12000040', 'S12000013']) 62 | 63 | msoa_ab = sorted(['S02001275', 'S02001238', 'S02001237', 'S02001236', 'S02001278', 'S02001284', 64 | 'S02001247', 'S02001249', 'S02001246', 'S02001250', 'S02001261', 'S02001252', 65 | 'S02001251', 'S02001257', 'S02001258', 'S02001254', 'S02001256', 'S02001253', 66 | 'S02001248', 'S02001242', 'S02001240', 'S02001241', 'S02001239', 'S02001243', 67 | 'S02001259', 'S02001260', 'S02001264', 'S02001265', 'S02001268', 'S02001269', 68 | 'S02001267', 'S02001274', 'S02001266', 'S02001263', 'S02001262', 'S02001245', 69 | 'S02001270', 'S02001272', 'S02001273', 'S02001271', 'S02001244', 'S02001279', 70 | 'S02001282', 'S02001281', 'S02001280', 'S02001276', 'S02001277', 'S02001283', 71 | 'S02001255']) 72 | 73 | result = sorted(api_sc.get_geog("S92000003", "LAD")) 74 | assert np.array_equal(result, lads) 75 | result = sorted(api_sc.get_geog("S12000033", "MSOA11")) 76 | assert np.array_equal(result, msoa_ab) 77 | result = api_sc.get_geog("S12000033", "LSOA11") 78 | assert len(result) == 283 79 | result = api_sc.get_geog("S12000033", "OA11") 80 | assert len(result) == 1992 81 | 82 | 83 | def test_geo_codes_ni(api_ni): 84 | # NI data 85 | lads = ['95AA', '95BB', '95CC', '95DD', '95EE', '95FF', '95GG', '95HH', '95II', '95JJ', '95KK', '95LL', '95MM', 86 | '95NN', '95OO', '95PP', '95QQ', '95RR', '95SS', '95TT', '95UU', '95VV', '95WW', '95XX', '95YY', '95ZZ'] 87 | 88 | msoa_95aa = ['95AA01', '95AA02', '95AA03', '95AA04', '95AA05', '95AA06', '95AA07', '95AA08', '95AA09', '95AA10', '95AA11', '95AA12', 89 | '95AA13', '95AA14', '95AA15', '95AA16', '95AA17', '95AA18', '95AA19'] 90 | lsoa_95aa = ['95AA01S1', '95AA01S2', '95AA01S3', '95AA02W1', '95AA03W1', '95AA04W1', '95AA05W1', '95AA06S1', '95AA06S2', '95AA07W1', '95AA08W1', 91 | '95AA09W1', '95AA10W1', '95AA11S1', '95AA11S2', '95AA12W1', '95AA13S1', '95AA13S2', '95AA14W1', '95AA15S1', '95AA15S2', '95AA16W1', 92 | '95AA17W1', '95AA18W1', '95AA19W1'] 93 | 94 | result = sorted(api_ni.get_geog("N92000002", "LAD")) 95 | assert np.array_equal(result, lads) 96 | result = sorted(api_ni.get_geog("95AA", "MSOA11")) 97 | assert np.array_equal(result, msoa_95aa) 98 | result = sorted(api_ni.get_geog("95AA", "LSOA11")) 99 | assert np.array_equal(result, lsoa_95aa) 100 | result = sorted(api_ni.get_geog("95AA", "OA11")) 101 | assert len(result) == 129 102 | 103 | 104 | def test_get_metadata_ew(api_ew): 105 | meta = api_ew.get_metadata("NONEXISTENT") 106 | assert not meta 107 | meta = api_ew.get_metadata("KS401EW") 108 | assert meta["description"] == 'KS401EW - Dwellings, household spaces and accommodation type' 109 | assert meta["nomis_table"] == 'NM_618_1' 110 | # test 2001 table 111 | meta = api_ew.get_metadata("UV070") 112 | assert meta["description"] == 'UV070 - Communal Establishments' 113 | assert meta["nomis_table"] == 'NM_1686_1' 114 | 115 | 116 | def test_get_metadata_sc(api_sc): 117 | # Scotland 118 | meta = api_sc.get_metadata("KS401SC", "LAD") 119 | assert meta["table"] == 'KS401SC' 120 | assert meta["geography"] == 'LAD' 121 | assert 'KS401SC_0_CODE' in meta["fields"] 122 | 123 | 124 | def test_get_metadata_ni(api_ni): 125 | # NI 126 | meta = api_ni.get_metadata("QS401NI", "LSOA11") 127 | assert meta["table"] == 'QS401NI' 128 | assert meta["geography"] == 'SOA' 129 | assert 'QS401NI_0_CODE' in meta["fields"] 130 | assert len(meta["fields"]['QS401NI_0_CODE']) == 12 131 | 132 | 133 | def test_get_url(api_ew): 134 | table = "NM_618_1" 135 | query_params = { 136 | "CELL": "7...13", 137 | "date": "latest", 138 | "RURAL_URBAN": "0", 139 | "select": "GEOGRAPHY_CODE,CELL,OBS_VALUE", 140 | "geography": "1245710558...1245710660,1245714998...1245714998,1245715007...1245715007,1245715021...1245715022", 141 | "MEASURES": "20100" 142 | } 143 | assert api_ew.get_url(table, query_params) == "https://www.nomisweb.co.uk/api/v01/dataset/NM_618_1.data.tsv?CELL=7...13&MEASURES=20100&RURAL_URBAN=0&date=latest&geography=1245710558...1245710660%2C1245714998...1245714998%2C1245715007...1245715007%2C1245715021...1245715022&select=GEOGRAPHY_CODE%2CCELL%2COBS_VALUE" 144 | 145 | 146 | def test_get_data_ew(api_ew): 147 | table_name = "KS401EW" 148 | # table_internal = "NM_618_1" 149 | query_params = { 150 | "CELL": "7...13", 151 | "date": "latest", 152 | "RURAL_URBAN": "0", 153 | "select": "GEOGRAPHY_CODE,CELL,OBS_VALUE", 154 | "geography": "1245710558...1245710560", 155 | "MEASURES": "20100" 156 | } 157 | table = api_ew.get_data(table_name, query_params) 158 | assert table.shape == (21, 3) 159 | assert sum(table.OBS_VALUE) == 8214 160 | 161 | 162 | def test_get_data_sc(api_sc): 163 | table_name = "KS401SC" 164 | geography = "S12000033" # Aberdeen 165 | categories = { "KS401SC_0_CODE": range(8,15) } 166 | table = api_sc.get_data(table_name, geography, "LAD", categories) 167 | assert table.shape == (7, 3) 168 | assert sum(table.OBS_VALUE) == 108153 169 | 170 | table_name = "DC2101SC" 171 | geography = "S12000033" # Aberdeen 172 | categories = { "DC2101SC_0_CODE": 4, # White Irish 173 | "DC2101SC_1_CODE": [1,2], # M+F 174 | "DC2101SC_2_CODE": [6,7,8,9,10,11,12] } # 18-49 175 | table = api_sc.get_data(table_name, geography, "LAD", categories) 176 | assert table.shape == (14, 5) 177 | assert sum(table.OBS_VALUE) == 1732 178 | 179 | 180 | def test_get_data_ni(api_ni): 181 | table_name = "QS401NI" 182 | geography = "95AA" # Antrim 183 | categories = { "QS401NI_0_CODE": [1,6,8,9,10] } 184 | table = api_ni.get_data(table_name, geography, "LAD", categories) 185 | assert table.shape == (5, 3) 186 | assert sum(table.OBS_VALUE) == 52454 187 | 188 | table_name = "QS202NI" 189 | geography = "95ZZ" # Strabane 190 | categories = { "QS202NI_0_CODE": [1,2,3,4,5,6] } 191 | table = api_ni.get_data(table_name, geography, "MSOA11", categories) 192 | assert table.shape == (96, 3) 193 | assert sum(table.OBS_VALUE) == 14817 194 | 195 | #'table': 'QS202NI', 'description': '', 'geography': 'SOA', 'fields': {'QS202NI_0_CODE': {0: 'All Household Reference Persons (HRPs)', 1: 'Ethnic group of HRP: Black', 2: 'Ethnic group of HRP: Chinese', 3: 'Ethnic group of HRP: Mixed', 4: 'Ethnic group of HRP: Other', 5: 'Ethnic group of HRP: Other Asian', 6: 'Ethnic group of HRP: White'}}} 196 | 197 | # OD data is structured differently 198 | def test_get_od_data(api_ew): 199 | table = "WF01BEW" 200 | # table_internal = "NM_1228_1" 201 | query_params = { 202 | "date": "latest", 203 | "select": "currently_residing_in_code,place_of_work_code,OBS_VALUE", 204 | # OD are 5 LSOAs in central Leeds 205 | "currently_residing_in": "1249934756...1249934758,1249934760,1249934761", 206 | "place_of_work": "1249934756...1249934758,1249934760,1249934761", 207 | "MEASURES": "20100" 208 | } 209 | table = api_ew.get_data(table, query_params) 210 | assert table.shape == (25, 3) 211 | assert sum(table.OBS_VALUE) == 1791 212 | 213 | 214 | # Projection data doesnt explicitly have a table name - tests directly specifying nomis internal name 215 | def test_get_proj_data(api_ew): 216 | table_internal = "NM_2002_1" 217 | query_params = { 218 | "gender": "1,2", 219 | "c_age": "101...191", 220 | "MEASURES": "20100", 221 | "select": "geography_code,gender,c_age,obs_value", 222 | "geography": "1879048193...1879048194", 223 | "date": "latestMINUS15" # 2003 224 | } 225 | 226 | table = api_ew.get_data(table_internal, query_params) 227 | assert table.shape == (364, 4) 228 | assert sum(table.OBS_VALUE) == 597505 229 | 230 | 231 | def test_get_and_add_descriptive_column(api_ew): 232 | 233 | table_name = "KS401EW" 234 | 235 | query_params = { 236 | "CELL": "7...13", 237 | "date": "latest", 238 | "RURAL_URBAN": "0", 239 | "select": "GEOGRAPHY_CODE,CELL,OBS_VALUE", 240 | "geography": "1245710558...1245710560", 241 | "MEASURES": "20100" 242 | } 243 | table = api_ew.get_data(table_name, query_params) 244 | assert table.shape == (21, 3) 245 | assert sum(table.OBS_VALUE) == 8214 246 | 247 | # first ensure table is unmodified if column doesnt exist 248 | old_cols = len(table.columns) 249 | api_ew.contextify(table_name, "NOT_THERE", table) 250 | assert len(table.columns) == old_cols 251 | 252 | api_ew.contextify(table_name, "CELL", table) 253 | 254 | assert table.at[0, "CELL_NAME"] == "Whole house or bungalow: Detached" 255 | assert table.at[1, "CELL_NAME"] == "Whole house or bungalow: Semi-detached" 256 | assert table.at[2, "CELL_NAME"] == "Whole house or bungalow: Terraced (including end-terrace)" 257 | assert table.at[3, "CELL_NAME"] == "Flat, maisonette or apartment: Purpose-built block of flats or tenement" 258 | assert table.at[4, "CELL_NAME"] == "Flat, maisonette or apartment: Part of a converted or shared house (including bed-sits)" 259 | assert table.at[5, "CELL_NAME"] == "Flat, maisonette or apartment: In a commercial building" 260 | assert table.at[6, "CELL_NAME"] == "Caravan or other mobile or temporary structure" 261 | 262 | 263 | def test_get_geog_from_names(query): 264 | result = query.get_geog_from_names(["Leeds"], Api_EW.Nomisweb.GeoCodeLookup["OA11"]) 265 | assert result == '1254151943...1254154269,1254258198...1254258221,1254261711...1254261745,1254261853...1254261870,1254261894...1254261918,1254262125...1254262142,1254262341...1254262353,1254262394...1254262398,1254262498...1254262532,1254262620...1254262658,1254262922...1254262925' 266 | 267 | # same, but query with ONS code 268 | result = query.get_geog_from_names(["E08000035"], Api_EW.Nomisweb.GeoCodeLookup["OA11"]) 269 | assert result == '1254151943...1254154269,1254258198...1254258221,1254261711...1254261745,1254261853...1254261870,1254261894...1254261918,1254262125...1254262142,1254262341...1254262353,1254262394...1254262398,1254262498...1254262532,1254262620...1254262658,1254262922...1254262925' 270 | 271 | result = query.get_geog_from_names(["Newcastle upon Tyne"], Api_EW.Nomisweb.GeoCodeLookup["LSOA11"]) 272 | assert result == '1249910667...1249910832,1249935220...1249935228' 273 | 274 | result = query.get_geog_from_names(["Leeds", "Bradford"], Api_EW.Nomisweb.GeoCodeLookup["MSOA11"]) 275 | assert result == '1245710411...1245710471,1245710558...1245710660,1245714998...1245714998,1245715007...1245715007,1245715021...1245715022' 276 | 277 | def test_get_geog_from_codes(query): 278 | result = query.api.get_geo_codes([Api_EW.Nomisweb.GeoCodeLookup["EnglandWales"]], Api_EW.Nomisweb.GeoCodeLookup["LAD"]) 279 | assert result == '1946157057...1946157404' 280 | 281 | # test example code 282 | def test_geoquery(): 283 | import inst.examples.geoquery as eg_geo 284 | eg_geo.main() 285 | 286 | 287 | def test_contextify(): 288 | import inst.examples.contextify as eg_cont 289 | eg_cont.main() 290 | 291 | 292 | # just checks code snippet runs ok (i.e. returns 0) 293 | def test_code_snippet(api_ew, query): 294 | if not sys.platform.startswith("win"): 295 | table = "KS401EW" 296 | meta = api_ew.get_metadata(table) 297 | query_params = { 298 | "CELL": "7...13", 299 | "date": "latest", 300 | "RURAL_URBAN": "0", 301 | "select": "GEOGRAPHY_CODE,CELL,OBS_VALUE", 302 | "geography": "1245710558...1245710560", 303 | "MEASURES": "20100" 304 | } 305 | 306 | query.write_code_snippets(table, meta, query_params) 307 | assert os.system("python " + str(api_ew.cache_dir / (table + ".py"))) == 0 308 | 309 | 310 | # checks the logic to compress a list of nomis geo codes into a shorter form for url 311 | def test_shorten_codelist(): 312 | n = list(range(1,21)) 313 | 314 | for _ in range(0,100): 315 | short = Api_EW._shorten(sample(n, len(n))) 316 | assert short == "1...20" 317 | 318 | del(n[3]) 319 | for _ in range(0,100): 320 | short = Api_EW._shorten(sample(n, len(n))) 321 | assert short == "1...3,5...20" 322 | 323 | del(n[16]) 324 | for _ in range(0,100): 325 | short = Api_EW._shorten(sample(n, len(n))) 326 | assert short == "1...3,5...17,19...20" 327 | 328 | del(n[16]) 329 | for _ in range(0,100): 330 | short = Api_EW._shorten(sample(n, len(n))) 331 | assert short == "1...3,5...17,20" 332 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | # Hack to make snippet test work in R CMD CHECK 2 | # see https://github.com/hadley/testthat/issues/86 3 | Sys.setenv("R_TESTS" = "") 4 | 5 | library(testthat) 6 | library(UKCensusAPI) 7 | test_check("UKCensusAPI") 8 | -------------------------------------------------------------------------------- /tests/testthat/test-all.R: -------------------------------------------------------------------------------- 1 | # see http://kbroman.org/pkg_primer/pages/tests.html 2 | 3 | context("UKCensusAPI") 4 | library(reticulate) 5 | 6 | apiEW = UKCensusAPI::instance("/tmp/UKCensusAPI") 7 | apiSC = UKCensusAPI::instance("/tmp/UKCensusAPI", "SC") 8 | apiNI = UKCensusAPI::instance("/tmp/UKCensusAPI", "NI") 9 | 10 | # simply checks we can get nomis geo codes back 11 | test_that("geoCodeLookup", { 12 | expect_true(UKCensusAPI::geoCodeLookup(apiEW, "MSOA11") == "TYPE297") 13 | expect_true(UKCensusAPI::geoCodeLookup(apiEW, "LSOA01") == "TYPE304") 14 | expect_true(UKCensusAPI::geoCodeLookup(apiEW, "LAD") == "TYPE464") 15 | expect_true(UKCensusAPI::geoCodeLookup(apiEW, "EnglandWales") == "2092957703") 16 | }) 17 | 18 | # simply checks we get data back# simply checks we get data back 19 | test_that("getMetadata", { 20 | table = "KS401EW" 21 | expect_true(class(UKCensusAPI::getMetadata(apiEW, table)) == "list") 22 | }) 23 | 24 | # simply checks we get data back 25 | test_that("getMetadataSC", { 26 | table = "KS401SC" 27 | expect_true(class(apiSC$getMetadata(table, "LAD")) == "list") 28 | }) 29 | 30 | test_that("getMetadataNI", { 31 | table = "KS401NI" 32 | expect_true(class(apiNI$getMetadata(table, "LAD")) == "list") 33 | }) 34 | 35 | # simply checks we get a data frame back 36 | test_that("getData", { 37 | table = "KS401EW" 38 | query = list(date = "latest", 39 | geography = "1245714681...1245714688", 40 | CELL = "7...13", 41 | RURAL_URBAN="0", 42 | measures = "20100", 43 | select = "GEOGRAPHY_CODE,CELL,OBS_VALUE") 44 | expect_true(class(UKCensusAPI::getData(apiEW, table, query)) == "data.frame") 45 | }) 46 | 47 | # simply checks we get a data frame back 48 | test_that("getDataSC", { 49 | table = "KS401SC" 50 | region = "S12000033" 51 | resolution = "LAD" 52 | filter = list("KS401SC_0_CODE" = c(0,1,2,3)) 53 | expect_true(class(apiSC$getData(table, region, resolution, filter)) == "data.frame") 54 | }) 55 | 56 | # simply checks we get a data frame back 57 | test_that("getDataNI", { 58 | table = "KS401NI" 59 | region = "95AA" 60 | resolution = "LAD" 61 | filter = list("KS401NI_0_CODE" = c(0,1,2,3)) 62 | expect_true(class(apiNI$getData(table, region, resolution, filter)) == "data.frame") 63 | }) 64 | 65 | # simply checks we get a data frame back 66 | test_that("getOdData", { 67 | table = "WF01BEW" 68 | query = list(date = "latest", 69 | # OD are 5 LSOAs in central Leeds 70 | currently_residing_in = "1249934756...1249934758,1249934760,1249934761", 71 | place_of_work = "1249934756...1249934758,1249934760,1249934761", 72 | measures = "20100", 73 | select = "currently_residing_in_code,place_of_work_code,OBS_VALUE") 74 | expect_true(class(UKCensusAPI::getData(apiEW, table, query)) == "data.frame") 75 | }) 76 | 77 | test_that("getLADCodes", { 78 | expect_true(length(getLADCodes(apiEW, c())) == 0) 79 | expect_true(length(getLADCodes(apiEW, c("Framley"))) == 0) 80 | expect_true(getLADCodes(apiEW, c("Leeds")) == 1946157127) 81 | expect_true(getLADCodes(apiEW, c("Leeds")) == c(1946157127)) 82 | 83 | codes = getLADCodes(apiEW, c("Leeds", "Bradford", "Kirklees", "Wakefield", "Calderdale")) 84 | # == returns a bool vector, so check that its sum is its length 85 | expect_true(sum(codes == c(1946157127, 1946157124, 1946157126, 1946157128, 1946157125)) == length(codes)) 86 | 87 | codes = getLADCodes(apiEW, c("Leeds", "Bradford", "Skipdale", "Wakefield", "Calderdale")) 88 | # == returns a bool vector, so check that its sum is its length 89 | expect_true(sum(codes == c(1946157127, 1946157124, 1946157128, 1946157125)) == length(codes)) 90 | 91 | codes = getLADCodes(apiEW, c("Trumpton", "Camberwick Green", "Chigley")) 92 | # == returns a bool vector, so check that its sum is its length 93 | expect_true(length(codes) == 0) 94 | }) 95 | 96 | test_that("geoCodes empty", { 97 | expect_true(geoCodes(apiEW, c(), "TYPE999") == "") 98 | }) 99 | 100 | test_that("geoCodes invalid", { 101 | expect_true(geoCodes(apiEW, c(999), "TYPE999") == "") 102 | }) 103 | 104 | test_that("geoCodes single LA", { 105 | expect_true(geoCodes(apiEW, 1946157124, "TYPE464") == "1946157124") 106 | }) 107 | 108 | test_that("geoCodes multi MSOA", { 109 | expect_true(geoCodes(apiEW, c(1946157124, 1946157128), "TYPE297") == "1245710411...1245710471,1245710661...1245710705") 110 | }) 111 | 112 | test_that("geoCodes multi LSOA", { 113 | expect_true(geoCodes(apiEW, c(1946157124, 1946157128), "TYPE298") == "1249912854...1249913154,1249913980...1249914188,1249935357...1249935365") 114 | }) 115 | 116 | test_that("geoCodes single OA", { 117 | expect_true(geoCodes(apiEW, 1946157124, "TYPE299") == "1254148629...1254150034,1254267588...1254267709") 118 | }) 119 | 120 | test_that("geoCodes SC", { 121 | expect_true(length(apiSC$getGeog("S12000033", "MSOA11")) == 49) 122 | expect_true(length(apiSC$getGeog("S12000033", "LSOA11")) == 283) 123 | expect_true(length(apiSC$getGeog("S12000033", "OA11")) == 1992) 124 | }) 125 | 126 | test_that("geoCodes NI", { 127 | expect_true(length(apiNI$getGeog("95AA", "LAD")) == 1) 128 | expect_true(length(apiNI$getGeog("95AA", "MSOA11")) == 19) 129 | expect_true(length(apiNI$getGeog("95AA", "LSOA11")) == 25) 130 | expect_true(length(apiNI$getGeog("95AA", "OA11")) == 129) 131 | }) 132 | 133 | test_that("contextify", { 134 | table = "KS401EW" 135 | query = list(date = "latest", 136 | geography = "1245714681...1245714688", 137 | CELL = "7...13", 138 | RURAL_URBAN="0", 139 | measures = "20100", 140 | select = "GEOGRAPHY_CODE,CELL,OBS_VALUE") 141 | data = UKCensusAPI::getData(apiEW, table, query) 142 | column = "CELL" 143 | 144 | data = contextify(apiEW, table, column, data) 145 | # check table has column 146 | expect_true("CELL_NAME" %in% colnames(data)) 147 | # then check values 148 | expect_true(data$CELL_NAME[[1]] == "Whole house or bungalow: Detached") 149 | expect_true(data$CELL_NAME[[2]] == "Whole house or bungalow: Semi-detached") 150 | expect_true(data$CELL_NAME[[3]] == "Whole house or bungalow: Terraced (including end-terrace)") 151 | expect_true(data$CELL_NAME[[4]] == "Flat, maisonette or apartment: Purpose-built block of flats or tenement") 152 | expect_true(data$CELL_NAME[[5]] == "Flat, maisonette or apartment: Part of a converted or shared house (including bed-sits)") 153 | expect_true(data$CELL_NAME[[6]] == "Flat, maisonette or apartment: In a commercial building") 154 | expect_true(data$CELL_NAME[[7]] == "Caravan or other mobile or temporary structure") 155 | }) 156 | 157 | test_that("geoquery example", { 158 | # hack to get test to run as part of checks 159 | if (dir.exists("../../inst/examples")) { 160 | path = "../../inst/examples/" 161 | } else { 162 | path = "../../UKCensusAPI/examples/" 163 | } 164 | # # better to use system and Rscript? 165 | # ret = source(paste0(path, "geoquery.R")) 166 | # expect_true(class(ret) == "list") 167 | # run the R snippet in a separate process 168 | script = paste0(R.home("bin"), "/Rscript ", path, "geoquery.R") 169 | ret = system(script) 170 | expect_true(ret == 0) 171 | }) 172 | 173 | test_that("contextify example", { 174 | # hack to get test to run as part of checks 175 | if (dir.exists("../../inst/examples")) { 176 | path = "../../inst/examples/" 177 | } else { 178 | path = "../../UKCensusAPI/examples/" 179 | } 180 | # # better to use system and Rscript? 181 | # ret = source(paste0(path, "geoquery.R")) 182 | # expect_true(class(ret) == "list") 183 | # run the R snippet in a separate process 184 | script = paste0(R.home("bin"), "/Rscript ", path, "contextify.R") 185 | ret = system(script) 186 | expect_true(ret == 0) 187 | }) 188 | 189 | test_that("code snippet", { 190 | 191 | # generate a code snippet 192 | table = "KS401EW" 193 | meta = getMetadata(apiEW, table) 194 | queryParams = list( 195 | CELL = "7...13", 196 | geography = "1245710558...1245710560", 197 | select = "GEOGRAPHY_CODE,CELL,OBS_VALUE", 198 | date = "latest", 199 | RURAL_URBAN = "0", 200 | MEASURES = "20100" 201 | ) 202 | query = UKCensusAPI::queryInstance(apiEW$cache_dir) 203 | query$write_code_snippets(table, meta, queryParams) 204 | 205 | # run the R snippet in a separate process 206 | script = paste0(R.home("bin"), "/Rscript ", apiEW$cache_dir, "/", table, ".R") 207 | ret = system(script) 208 | expect_true(ret == 0) 209 | }) 210 | -------------------------------------------------------------------------------- /ukcensusapi/NISRA.py: -------------------------------------------------------------------------------- 1 | """ 2 | Northern Ireland 3 | """ 4 | 5 | import os.path 6 | from pathlib import Path 7 | import urllib.parse 8 | import zipfile 9 | import pandas as pd 10 | import requests 11 | 12 | import ukcensusapi.utils as utils 13 | 14 | # assumes all areas in coverage are the same type 15 | def _coverage_type(code): 16 | if isinstance(code, list): 17 | code = code[0] 18 | if code == "N92000002": 19 | return "ALL" 20 | # TODO regex? 21 | elif len(code) == 4: # e.g. 95AA 22 | return "LGD" 23 | elif len(code) == 6: # e.g. 95AA01 (ward) 24 | return "WARD" 25 | elif len(code) == 8: # e.g. 95AA01S1 26 | return "SOA" 27 | elif code[:3] == "N00": 28 | return "OA" 29 | else: 30 | raise ValueError("Invalid code: {}".format(code)) 31 | 32 | class NISRA: 33 | """ 34 | Scrapes and refomats NI 2011 census data from NISRA website 35 | """ 36 | # static constants 37 | URL = "http://www.ninis2.nisra.gov.uk/Download/Census%202011/" 38 | 39 | # timeout for http requests 40 | Timeout = 15 41 | 42 | data_sources = ["Detailed Characteristics Tables (statistical geographies).zip", 43 | "Key Statistics Tables (statistical geographies).zip", 44 | "Local Characteristic Tables (statistical geographies).zip", # note slight inconsistency in name 45 | "Quick Statistics Tables (statistical geographies).zip"] 46 | 47 | GeoCodeLookup = { 48 | "LAD": 0, # LGD 49 | "MSOA11": 1, # WARD 50 | "LSOA11": 2, # SOA 51 | "OA11": 3 # SA 52 | } 53 | 54 | NIGeoCodes = [ "LGD", "WARD", "SOA", "SA" ] 55 | 56 | source_map = { "LC": 2, "DC": 0, "KS": 1, "QS": 3 } 57 | 58 | res_map = { "SA": "SMALL AREAS", "SOA": "SUPER OUTPUT AREAS"} 59 | 60 | LADs = { 61 | "95AA": "Antrim", 62 | "95BB": "Ards", 63 | "95CC": "Armagh", 64 | "95DD": "Ballymena", 65 | "95EE": "Ballymoney", 66 | "95FF": "Banbridge", 67 | "95GG": "Belfast", 68 | "95HH": "Carrickfergus", 69 | "95II": "Castlereagh", 70 | "95JJ": "Coleraine", 71 | "95KK": "Cookstown", 72 | "95LL": "Craigavon", 73 | "95MM": "Derry", 74 | "95NN": "Down", 75 | "95OO": "Dungannon", 76 | "95PP": "Fermanagh", 77 | "95QQ": "Larne", 78 | "95RR": "Limavady", 79 | "95SS": "Lisburn", 80 | "95TT": "Magherafelt", 81 | "95UU": "Moyle", 82 | "95VV": "Newry and Mourne", 83 | "95WW": "Newtownabbey", 84 | "95XX": "North Down", 85 | "95YY": "Omagh", 86 | "95ZZ": "Strabane" 87 | } 88 | 89 | # initialise, supplying a location to cache downloads 90 | def __init__(self, cache_dir): 91 | """Constructor. 92 | Args: 93 | cache_dir: cache directory 94 | Returns: 95 | an instance. 96 | """ 97 | # checks exists and is writable, creates if necessary 98 | self.cache_dir = utils.init_cache_dir(cache_dir) 99 | 100 | self.offline_mode = not utils.check_online(self.URL) 101 | if self.offline_mode: 102 | print("Unable to contact %s, operating in offline mode - pre-cached data only" % self.URL) 103 | 104 | # download the lookup if not present 105 | lookup_file = self.cache_dir / "ni_lookup.csv" 106 | if not os.path.isfile(str(lookup_file)): 107 | z = zipfile.ZipFile(str(self.__source_to_zip(NISRA.data_sources[2]))) 108 | pd.read_csv(z.open("All_Geographies_Code_Files/NI_HIERARCHY.csv")) \ 109 | .drop(["NUTS3","HSCT","ELB","COUNTRY"], axis=1) \ 110 | .to_csv(str(lookup_file), index=False) 111 | 112 | # load the area lookup 113 | self.area_lookup = pd.read_csv(str(lookup_file)) 114 | 115 | # TODO this is very close to duplicating the code in NRScotland.py - refactor? 116 | def get_geog(self, coverage, resolution): 117 | """ 118 | Returns all areas at resolution in coverage 119 | """ 120 | 121 | resolution = _ni_resolution(resolution) 122 | 123 | # assumes all areas in coverage are the same type 124 | coverage_type = _coverage_type(coverage) 125 | if coverage_type == "ALL": 126 | return self.area_lookup[resolution].unique() 127 | 128 | # ensure list 129 | if isinstance(coverage, str): 130 | coverage = [coverage] 131 | 132 | return self.area_lookup[self.area_lookup[coverage_type].isin(coverage)][resolution].unique() 133 | 134 | def get_metadata(self, table, resolution): 135 | return self.__get_metadata_impl(table, resolution)[0] 136 | 137 | def __get_metadata_impl(self, table, resolution): 138 | 139 | resolution = _ni_resolution(resolution) 140 | 141 | # If request at LGD/WARD level we will need to aggregate finer data 142 | if resolution == "LGD" or resolution == "WARD": 143 | resolution = "SOA" 144 | 145 | z = zipfile.ZipFile(str(self.__source_to_zip(NISRA.data_sources[NISRA.source_map[table[:2]]]))) 146 | raw_meta = pd.read_csv(z.open(NISRA.res_map[resolution]+"/"+table+"DESC0.CSV")) \ 147 | .drop(["ColumnVariableMeasurementUnit", "ColumnVariableStatisticalUnit"], axis=1) 148 | # if every field has the same number of commas we split, otherwise assume number of categories 149 | # is the minimum. Warn that category names may be messed up 150 | commas = raw_meta["ColumnVariableDescription"].str.count(",").unique() 151 | min_categories = min(commas) 152 | if len(commas) > 1 and min_categories > 0: 153 | print("WARNING: it appears that {} is multivariate and some category descriptions contain a comma. ".format(table) + \ 154 | "This makes the individual category names ambiguous. Be aware that category names may have been be incorrectly interpreted.") 155 | 156 | # str.split interprets 0 as split on all instances 157 | if min_categories > 0: 158 | raw_meta = pd.concat([raw_meta["ColumnVariableCode"], raw_meta["ColumnVariableDescription"].str.split(", ", n=min_categories, expand=True)], axis=1) 159 | else: 160 | raw_meta.rename({"ColumnVariableDescription": 0}, axis=1, inplace=True) 161 | 162 | #raw_meta['ColumnVariableCode'] = raw_meta['ColumnVariableCode'].map(lambda x: int(x[-4:])) 163 | raw_meta = raw_meta.set_index("ColumnVariableCode", drop=True) 164 | 165 | meta = { "table": table, 166 | "description": "", 167 | "geography": resolution, 168 | "fields": {} } 169 | 170 | text_columns = range(0,len(raw_meta.columns)) 171 | for text_column in text_columns: 172 | raw_meta[text_column] = raw_meta[text_column].astype("category") 173 | code_column = table + "_" + str(text_column) + "_CODE" 174 | raw_meta[code_column] = raw_meta[text_column].cat.codes 175 | meta["fields"][code_column] = dict(enumerate(raw_meta[text_column].cat.categories)) 176 | 177 | # now remove text columns 178 | raw_meta.drop(text_columns, axis=1, inplace=True) 179 | 180 | return (meta, raw_meta) 181 | 182 | def get_data(self, table, region, resolution, category_filters={}, r_compat=False): 183 | 184 | resolution = _ni_resolution(resolution) 185 | 186 | # No data is available for Ward/LGD (~MSOA/LAD) so we get SOA (LSOA) then aggregate 187 | agg_workaround = False 188 | if resolution == "LGD" or resolution == "WARD": 189 | agg_workaround = True 190 | actual_resolution = resolution 191 | resolution = "SOA" 192 | 193 | (meta, raw_meta) = self.__get_metadata_impl(table, resolution) 194 | 195 | area_codes = self.get_geog(region, resolution) 196 | 197 | z = zipfile.ZipFile(str(self.__source_to_zip(NISRA.data_sources[NISRA.source_map[table[:2]]]))) 198 | id_vars = ["GeographyCode"] 199 | raw_data = pd.read_csv(z.open(NISRA.res_map[resolution]+"/"+table+"DATA0.CSV")) \ 200 | .melt(id_vars=id_vars) 201 | raw_data.columns = ["GEOGRAPHY_CODE", table, "OBS_VALUE"] 202 | 203 | # Filter by region 204 | raw_data = raw_data[raw_data["GEOGRAPHY_CODE"].isin(area_codes)] 205 | 206 | # join with raw metadata and drop the combo code 207 | data = raw_data.join(raw_meta, on=table).drop([table], axis=1) 208 | 209 | # If we actually requested MSOA-level data, aggregrate the LSOAs within each MSOA 210 | if agg_workaround: 211 | data = data.reset_index(drop=True) 212 | lookup = self.area_lookup[self.area_lookup[resolution].isin(data.GEOGRAPHY_CODE)] 213 | lookup = pd.Series(lookup[actual_resolution].values, index=lookup[resolution]).to_dict() 214 | data.GEOGRAPHY_CODE = data.GEOGRAPHY_CODE.map(lookup) 215 | cols = list(data.columns) 216 | # remove acts in-place and has no return value so can't chain it 217 | cols.remove("OBS_VALUE") 218 | data = data.groupby(cols).sum().reset_index() 219 | 220 | # Filter by category 221 | for category in category_filters: 222 | filter = category_filters[category] 223 | if isinstance(filter, int): 224 | filter = [filter] 225 | data = data[data[category].isin(filter)] 226 | 227 | # for R (which doesnt understand a pandas dataframe), we return np.arrays 228 | data.reset_index(drop=True, inplace=True) 229 | if r_compat: 230 | return {"columns": data.columns.values, "values": data.values} 231 | else: 232 | return data 233 | 234 | # TODO this is very close to duplicating the code in Nomisweb.py/NRScotland.py - refactor 235 | def contextify(self, table, meta, colname): 236 | """ 237 | Replaces the numeric category codes with the descriptive strings from the metadata 238 | """ 239 | lookup = meta["fields"][colname] 240 | # convert list into dict keyed on list index 241 | mapping = { k: v for k, v in enumerate(lookup)} 242 | category_name = colname.replace("_CODE", "_NAME") 243 | 244 | table[category_name] = table[colname].map(mapping) 245 | 246 | return table 247 | 248 | # TODO this could be merged with the Scottish version 249 | def __source_to_zip(self, source_name): 250 | """ 251 | Downloads if necessary and returns the name of the locally cached zip file of the source data (replacing spaces with _) 252 | """ 253 | zipfile = self.cache_dir / source_name.replace(" ", "_") 254 | if not os.path.isfile(str(zipfile)): 255 | # The URL must have %20 for space (only) 256 | ni_src = NISRA.URL + source_name.replace(" ", "%20") 257 | print(ni_src, " -> ", zipfile, "...", end="") 258 | response = requests.get(ni_src) 259 | response.raise_for_status() 260 | with open(str(zipfile), 'wb') as fd: 261 | for chunk in response.iter_content(chunk_size=1024): 262 | fd.write(chunk) 263 | print("OK") 264 | return zipfile 265 | 266 | def _ni_resolution(resolution): 267 | """ 268 | Maps E&W statistical geography codes to their closest NI equvalents 269 | """ 270 | # check if already an NI code 271 | if resolution in NISRA.NIGeoCodes: 272 | return resolution 273 | 274 | if not resolution in NISRA.GeoCodeLookup: 275 | raise ValueError("resolution '{}' is not available".format(resolution)) 276 | 277 | return NISRA.NIGeoCodes[NISRA.GeoCodeLookup[resolution]] 278 | 279 | -------------------------------------------------------------------------------- /ukcensusapi/NRScotland.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data scraper for Scottish 2011 census Data 3 | """ 4 | 5 | import os.path 6 | from pathlib import Path 7 | import urllib.parse 8 | import zipfile 9 | import pandas as pd 10 | import requests 11 | 12 | import ukcensusapi.utils as utils 13 | 14 | # workaround for apparent bug in later versions of openssl (e.g. 1.1.1f on ubuntu focal) 15 | # that causes this issue: https://github.com/virgesmith/UKCensusAPI/issues/48 16 | def _ssl_get_workaround(url, headers): 17 | import ssl 18 | from urllib3 import poolmanager 19 | import warnings 20 | # suppress ResourceWarning: unclosed "1...3,6,7...10" 37 | which can drastically reduce the length of the query url 38 | """ 39 | # empty evals to False 40 | if not code_list: 41 | return "" 42 | if len(code_list) == 1: 43 | return str(code_list[0]) 44 | 45 | code_list.sort() # assume this is a modifying operation 46 | short_string = "" 47 | index0 = 0 48 | index1 = 0 # appease lint 49 | for index1 in range(1, len(code_list)): 50 | if code_list[index1] != (code_list[index1-1] + 1): 51 | if index0 == index1: 52 | short_string += str(code_list[index0]) + "," 53 | else: 54 | short_string += str(code_list[index0]) + "..." + str(code_list[index1-1]) + "," 55 | index0 = index1 56 | if index0 == index1: 57 | short_string += str(code_list[index0]) 58 | else: 59 | short_string += str(code_list[index0]) + "..." + str(code_list[index1]) 60 | return short_string 61 | 62 | 63 | 64 | # The core functionality for accessing the www.nomisweb.co.uk API 65 | class Nomisweb: 66 | """ 67 | Nomisweb API methods and data. 68 | """ 69 | 70 | # static constants 71 | URL = "https://www.nomisweb.co.uk/" 72 | 73 | # timeout for http requests 74 | Timeout = 15 75 | 76 | # # Define Nomisweb geographic area codes, see e.g. 77 | # https://www.nomisweb.co.uk/api/v01/dataset/NM_144_1/geography/2092957703TYPE464.def.sdmx.json 78 | # https://www.nomisweb.co.uk/api/v01/dataset/NM_1_1/geography/2092957703TYPE464.def.sdmx.json 79 | GeoCodeLookup = { 80 | # give meaning to some common nomis geography types/codes 81 | "LAD": "TYPE464", 82 | "MSOA11": "TYPE297", 83 | "LSOA11": "TYPE298", 84 | "OA11": "TYPE299", 85 | "MSOA01": "TYPE305", 86 | "LSOA01": "TYPE304", 87 | "OA01": "TYPE310", 88 | "England": "2092957699", 89 | "EnglandWales": "2092957703", 90 | "GB": "2092957698", 91 | "UK": "2092957697" 92 | } 93 | 94 | # initialise, supplying a location to cache downloads 95 | def __init__(self, cache_dir, verbose=False): 96 | """Constructor. 97 | Args: 98 | cache_dir: cache directory 99 | Returns: 100 | an instance. 101 | """ 102 | self.cache_dir = utils.init_cache_dir(cache_dir) 103 | self.verbose = verbose 104 | self.offline_mode = True 105 | 106 | # how best to deal with site unavailable... 107 | self.offline_mode = not utils.check_online(self.URL, Nomisweb.Timeout) 108 | if self.offline_mode: 109 | print("Unable to contact %s, operating in offline mode - pre-cached data only" % self.URL) 110 | 111 | self.key = _get_api_key(self.cache_dir) 112 | if not self.offline_mode and self.key is None: 113 | raise RuntimeError("No API key found. Whilst downloads still work, they may be truncated,\n" \ 114 | "causing potentially unforseen problems in any modelling/analysis.\n" \ 115 | "Set the key value in the environment variable NOMIS_API_KEY.\n" \ 116 | "Register at www.nomisweb.co.uk to obtain a key") 117 | 118 | if self.verbose: print("Cache directory: ", self.cache_dir) 119 | 120 | # static member 121 | Nomisweb.cached_lad_codes = self.__cache_lad_codes() 122 | 123 | def get_geo_codes(self, la_codes, code_type): 124 | """Get nomis geographical codes. 125 | 126 | Args: 127 | la_codes: local authority codes for the region 128 | code_type: enumeration specifying the geographical resolution 129 | Returns: 130 | a string representation of the codes. 131 | """ 132 | # force input to be a list 133 | if not isinstance(la_codes, list): 134 | la_codes = [la_codes] 135 | 136 | geo_codes = [] 137 | for i in range(0, len(la_codes)): 138 | path = "api/v01/dataset/NM_144_1/geography/" + str(la_codes[i]) + code_type + ".def.sdmx.json?" 139 | rawdata = self.__fetch_json(path, {}) 140 | 141 | # use try-catch block to deal with any issues arising from the returned json 142 | # which are likely due to invalid/empty LA codes 143 | try: 144 | n_results = len(rawdata["structure"]["codelists"]["codelist"][0]["code"]) 145 | # seems a bit daft not to take advantage of the fact we know the length 146 | for j in range(0, n_results): 147 | geo_codes.append(rawdata["structure"]["codelists"]["codelist"][0]["code"][j]["value"]) 148 | except (KeyError, ValueError): 149 | print(la_codes[i], " does not appear to be a valid LA code") 150 | return _shorten(geo_codes) 151 | 152 | def get_lad_codes(self, la_names): 153 | """Convert local autority name(s) to nomisweb codes. 154 | Args: 155 | la_names: one or more local authorities (specify either the name or the ONS code) 156 | Returns: 157 | codes. 158 | """ 159 | if not isinstance(la_names, list): 160 | la_names = [la_names] 161 | codes = [] 162 | for la_name in la_names: 163 | if la_name in Nomisweb.cached_lad_codes: 164 | codes.append(Nomisweb.cached_lad_codes[la_name]) 165 | return codes 166 | 167 | def get_url(self, table_internal, query_params): 168 | """Constructs a query url given a nomisweb table code and a query. 169 | Args: 170 | table_internal: nomis table code. This can be found in the table metadata 171 | query_params: a dictionary of parameters and values 172 | Returns: 173 | the url that can be used to download the data 174 | """ 175 | 176 | # python dicts have nondeterministic order, see 177 | # https://stackoverflow.com/questions/14956313/why-is-dictionary-ordering-non-deterministic 178 | # this is problematic for the cacheing (md5 sum dependent on order), so we insert alphabetically 179 | # into an OrderedDict (which preserves insertion order) 180 | ordered = OrderedDict() 181 | for key in sorted(query_params): 182 | ordered[key] = query_params[key] 183 | 184 | return Nomisweb.URL + "api/v01/dataset/" + table_internal + ".data.tsv?" + str(urlencode(ordered)) 185 | 186 | # r_compat forces function to return strings (either cached filename, or error msg) 187 | # Two reasons for this: 188 | # - pandas/R dataframes conversion is done via matrix (which drops col names) 189 | # - reporting errors to R is useful (print statements aren't displayed in R(Studio)) 190 | def get_data(self, table, query_params, r_compat=False): 191 | """Downloads or retrieves data given a table and query parameters. 192 | Args: 193 | table: ONS table name, or nomisweb table code if no explicit ONS name 194 | query_params: table query parameters 195 | r_compat: return values suitable for R 196 | Returns: 197 | a dataframe containing the data. If downloaded, the data is also cached to a file 198 | """ 199 | 200 | # load the metadata 201 | metadata = self.load_metadata(table) 202 | 203 | query_params["uid"] = self.key 204 | query_string = self.get_url(metadata["nomis_table"], query_params) 205 | filename = self.cache_dir / (table + "_" + hashlib.md5(query_string.encode()).hexdigest()+".tsv") 206 | 207 | # retrieve if not in cache 208 | if not os.path.isfile(str(filename)): 209 | if self.verbose: print("Downloading and cacheing data: " + str(filename)) 210 | #'TODO migrate to requests package 211 | request.urlretrieve(query_string, str(filename)) #, timeout = Nomisweb.Timeout) 212 | 213 | # check for empty file, if so delete it and report error 214 | if os.stat(str(filename)).st_size == 0: 215 | os.remove(str(filename)) 216 | errormsg = "ERROR: Query returned no data. Check table and query parameters" 217 | if r_compat: 218 | return errormsg 219 | print(errormsg) 220 | return 221 | else: 222 | if self.verbose: print("Using cached data: " + str(filename)) 223 | 224 | # now load from cache and return 225 | if r_compat: 226 | return str(filename) # R expects a string not a Path 227 | data = pd.read_csv(str(filename), delimiter='\t') 228 | if len(data) == 1000000: 229 | warnings.warn("Data download has reached nomisweb's single-query row limit. Truncation is extremely likely") 230 | return data 231 | 232 | def get_metadata(self, table_name): 233 | """Downloads census table metadata. 234 | Args: 235 | table_name: the (ONS) table name, e.g. KS4402EW 236 | Returns: 237 | a dictionary containing information about the table contents including categories and category values. 238 | """ 239 | # see if already downloaded 240 | 241 | 242 | if not table_name.startswith("NM_"): 243 | path = "api/v01/dataset/def.sdmx.json?" 244 | query_params = {"search": "*"+table_name+"*"} 245 | else: 246 | path = "api/v01/" + table_name + ".def.sdmx.json?" 247 | query_params = {} 248 | 249 | data = self.__fetch_json(path, query_params) 250 | 251 | # return empty if no useful metadata returned (likely table doesnt exist) 252 | if not data["structure"]["keyfamilies"]: 253 | return 254 | 255 | # this is the nomis internal table name 256 | table = data["structure"]["keyfamilies"]["keyfamily"][0]["id"] 257 | 258 | rawfields = data["structure"]["keyfamilies"]["keyfamily"][0]["components"]["dimension"] 259 | fields = {} 260 | for rawfield in rawfields: 261 | field = rawfield["conceptref"] 262 | 263 | fields[field] = {} 264 | 265 | # ignore when too many categories (i.e. geograpical ones) 266 | if field.upper() == "CURRENTLY_RESIDING_IN" or field.upper() == "PLACE_OF_WORK": 267 | continue 268 | 269 | # further query to get categories 270 | path = "api/v01/dataset/"+table+"/"+field+".def.sdmx.json?" 271 | #print(path) 272 | 273 | try: 274 | fdata = self.__fetch_json(path, {}) 275 | except timeout: 276 | print("HTTP timeout requesting metadata for " + table_name) 277 | return {} 278 | except (HTTPError, URLError): 279 | print("HTTP error requesting metadata for " + table_name) 280 | return {} 281 | else: 282 | values = fdata["structure"]["codelists"]["codelist"][0]["code"] 283 | #print(field+":") 284 | for value in values: 285 | # KEYs are stored as strings for json compatibility 286 | fields[field][value["value"]] = value["description"]["value"] 287 | 288 | # Fetch the geographies available for this table 289 | geogs = {} 290 | path = "api/v01/dataset/"+table+"/geography/TYPE.def.sdmx.json?" 291 | try: 292 | fdata = self.__fetch_json(path, {}) 293 | except timeout: 294 | print("HTTP timeout requesting geography metadata for " + table_name) 295 | except (HTTPError, URLError): 296 | print("HTTP error requesting geography metadata for " + table_name) 297 | else: 298 | if fdata["structure"]["codelists"]: 299 | values = fdata["structure"]["codelists"]["codelist"][0]["code"] 300 | #print(values) 301 | for value in values: 302 | geogs[str(value["value"])] = value["description"]["value"] 303 | 304 | result = {"nomis_table": table, 305 | "description": data["structure"]["keyfamilies"]["keyfamily"][0]["name"]["value"], 306 | "fields": fields, 307 | "geographies": geogs} 308 | 309 | # save a copy 310 | self.write_metadata(table_name, result) 311 | 312 | return result 313 | 314 | # loads metadata from cached json if available, otherwises downloads from nomisweb. 315 | # NB category KEYs need to be converted from string to integer for this data to work properly, see convert_code 316 | def load_metadata(self, table_name): 317 | """Retrieves cached, or downloads census table metadata. Use this in preference to get_metadata. 318 | Args: 319 | table_name: the (ONS) table name, e.g. KS4402EW 320 | Returns: 321 | a dictionary containing information about the table contents including categories and category values. 322 | """ 323 | filename = self.cache_dir / (table_name + "_metadata.json") 324 | # if file not there, get from nomisweb 325 | if not os.path.isfile(str(filename)): 326 | if self.verbose: print(filename, "not found, downloading...") 327 | return self.get_metadata(table_name) 328 | else: 329 | if self.verbose: print(filename, "found, using cached metadata...") 330 | with open(str(filename)) as metafile: 331 | meta = json.load(metafile) 332 | 333 | return meta 334 | 335 | # private 336 | 337 | # download and cache the nomis codes for local authorities 338 | def __cache_lad_codes(self): 339 | 340 | filename = self.cache_dir / "lad_codes.json" 341 | 342 | if not os.path.isfile(str(filename)): 343 | if self.verbose: print(filename, "not found, downloading LAD codes...") 344 | 345 | data = self.__fetch_json("api/v01/dataset/NM_144_1/geography/" \ 346 | + str(Nomisweb.GeoCodeLookup["EnglandWales"]) + Nomisweb.GeoCodeLookup["LAD"] + ".def.sdmx.json?", {}) 347 | if data == {}: 348 | return [] 349 | 350 | rawfields = data["structure"]["codelists"]["codelist"][0]["code"] 351 | codes = {} 352 | for rawfield in rawfields: 353 | codes[rawfield["description"]["value"]] = rawfield["value"] 354 | codes[rawfield["annotations"]["annotation"][2]["annotationtext"]] = rawfield["value"] 355 | if self.verbose: print("Writing LAD codes to ", filename) 356 | 357 | # save LAD codes 358 | with open(str(filename), "w") as metafile: 359 | json.dump(codes, metafile, indent=2) 360 | 361 | else: 362 | if self.verbose: print("using cached LAD codes:", filename) 363 | with open(str(filename)) as cached_ladcodes: 364 | codes = json.load(cached_ladcodes) 365 | return codes 366 | 367 | # given a list of integer codes, generates a string using the nomisweb shortened form 368 | # (consecutive numbers represented by a range, non-consecutive are comma separated 369 | def __fetch_json(self, path, query_params): 370 | # add API KEY to params 371 | query_params["uid"] = self.key 372 | 373 | query_string = Nomisweb.URL + path + str(urlencode(query_params)) 374 | 375 | reply = {} 376 | try: 377 | response = request.urlopen(query_string, timeout=Nomisweb.Timeout) 378 | except (HTTPError, URLError) as error: 379 | print('ERROR: ', error, '\n', query_string) 380 | except timeout: 381 | print('ERROR: request timed out\n', query_string) 382 | else: 383 | reply = json.loads(response.read().decode("utf-8")) 384 | return reply 385 | 386 | # save metadata as JSON for future reference 387 | def write_metadata(self, table, meta): 388 | """method. 389 | Args: 390 | table: name of table 391 | meta: the metadata 392 | ... 393 | Returns: 394 | nothing. 395 | """ 396 | 397 | filename = self.cache_dir / (table + "_metadata.json") 398 | if self.verbose: print("Writing metadata to ", str(filename)) 399 | with open(str(filename), "w") as metafile: 400 | json.dump(meta, metafile, indent=2) 401 | 402 | # append numeric values with the string values from the metadata 403 | # NB the "numeric" values are stored as strings in both the table and the metadata 404 | # this doesnt need to be a member 405 | def contextify(self, table_name, column, table): 406 | """Adds context to a column in a table, as a separate column containing the meanings of each numerical value 407 | Args: 408 | table_name: name of census table 409 | column: name of column within the table (containing numeric values) 410 | table: 411 | Returns: 412 | a new table containing an extra column with descriptions of the numeric values. 413 | """ 414 | 415 | metadata = self.load_metadata(table_name) 416 | 417 | if not column in metadata["fields"]: 418 | print(column, " is not in metadata") 419 | return 420 | if not column in table.columns: 421 | print(column, " is not in table") 422 | return 423 | 424 | # convert KEYs on the fly to integers (if they've been loaded from json they will be strings) 425 | lookup = {int(k):v for k, v in metadata["fields"][column].items()} 426 | table[column + "_NAME"] = table[column].map(lookup) 427 | -------------------------------------------------------------------------------- /ukcensusapi/Query.py: -------------------------------------------------------------------------------- 1 | """ 2 | Nomisweb census data interactive query builder 3 | See README.md for details on how to use this package 4 | """ 5 | 6 | import ukcensusapi.Nomisweb as ApiEW 7 | import ukcensusapi.NRScotland as ApiSC 8 | import ukcensusapi.NISRA as ApiNI 9 | 10 | def _get_scni(table, api, codes): 11 | meta = { "geographies": {}} 12 | for k in codes: 13 | try: 14 | raw = api.get_metadata(table, k) 15 | meta["table"] = raw["table"] 16 | meta["description"] = raw["description"] 17 | #meta["geographies"] = {} 18 | meta["geographies"][raw["geography"]] = raw["fields"] 19 | except ValueError: 20 | pass 21 | return meta 22 | 23 | def _print_scni(meta): 24 | for k in meta["geographies"].keys(): 25 | print("Geography: %s" % k) 26 | for c in meta["geographies"][k]: 27 | print(" %s:" % c) 28 | for i, v in meta["geographies"][k][c].items(): 29 | print(" %3d: %s" %(i, v)) 30 | 31 | class Query: 32 | """ 33 | Census query functionality 34 | """ 35 | def __init__(self, cache_dir): 36 | self.cache_dir = cache_dir 37 | self.api = ApiEW.Nomisweb(cache_dir) 38 | 39 | def table(self): 40 | """ 41 | Interactive census table query 42 | """ 43 | 44 | print("Nomisweb census data interactive query builder") 45 | print("See README.md for details on how to use this package") 46 | 47 | table = input("Census table: ") 48 | 49 | # only init Sc/NI APIs if required (large initial download) 50 | if table.endswith("SC"): 51 | api_sc = ApiSC.NRScotland(self.cache_dir) 52 | print("Data source: NRScotland") 53 | _print_scni(_get_scni(table, api_sc, ApiSC.NRScotland.GeoCodeLookup.keys())) 54 | return 55 | elif table.endswith("NI"): 56 | api_ni = ApiNI.NISRA(self.cache_dir) 57 | print("Data source: NISRA") 58 | _print_scni(_get_scni(table, api_ni, ApiNI.NISRA.GeoCodeLookup.keys())) 59 | return 60 | print("Data source: nomisweb (default)") 61 | 62 | query_params = {} 63 | query_params["date"] = "latest" 64 | query_params["select"] = "GEOGRAPHY_CODE," 65 | 66 | # select fields/categories from table 67 | meta = self.api.get_metadata(table) 68 | print(meta["description"]) 69 | for field in meta["fields"]: 70 | if field != "GEOGRAPHY" and field != "FREQ": 71 | print(field + ":") 72 | for category in meta["fields"][field]: 73 | print(" " + str(category) + " (" + meta["fields"][field][category] + ")") 74 | categories = input("Select categories (default 0): ") 75 | include = True 76 | if categories == "" or categories == "0": 77 | include = input("include in output (y/n, default=n)? ") == "y" 78 | categories = "0" 79 | query_params[field] = categories 80 | if field != "MEASURES" and include: 81 | query_params["select"] += field + "," 82 | 83 | query_params["select"] += "OBS_VALUE" 84 | 85 | add_geog = input("Add geography? (y/N): ") == "y" 86 | if add_geog: 87 | query_params["geography"] = self.__add_geog(meta) 88 | #print(query_params) 89 | 90 | get_data = input("Get data now? (y/N): ") == "y" 91 | if get_data: 92 | print("\n\nGetting data...") 93 | 94 | # Fetch (and cache) data 95 | self.api.get_data(table, query_params) 96 | 97 | # Remove API key in example code (lest it be accidentally committed) 98 | if "uid" in query_params: 99 | del query_params["uid"] 100 | 101 | self.write_code_snippets(table, meta, query_params) 102 | 103 | # returns a geography string that can be inserted into an existing query 104 | def get_geog_from_names(self, coverage, resolution): 105 | """ 106 | Return a set of nomisweb geography codes for areas within the specified coverage at the specified resolution 107 | """ 108 | 109 | # Convert the coverage area into nomis codes 110 | coverage_codes = self.api.get_lad_codes(coverage) 111 | return self.api.get_geo_codes(coverage_codes, resolution) 112 | 113 | def __add_geog(self, metadata): 114 | 115 | coverage = input("\nGeographical coverage\nE/EW/GB/UK or LAD codes(s)/name(s), comma separated: ") 116 | 117 | if coverage == "E": 118 | coverage_codes = [ApiEW.Nomisweb.GeoCodeLookup["England"]] 119 | elif coverage == "EW": 120 | coverage_codes = [ApiEW.Nomisweb.GeoCodeLookup["EnglandWales"]] 121 | elif coverage == "GB": 122 | coverage_codes = [ApiEW.Nomisweb.GeoCodeLookup["GB"]] 123 | elif coverage == "UK": 124 | coverage_codes = [ApiEW.Nomisweb.GeoCodeLookup["UK"]] 125 | else: 126 | coverage_codes = self.api.get_lad_codes(coverage.split(",")) 127 | 128 | #print(metadata) 129 | for key in metadata["geographies"]: 130 | print(key, metadata["geographies"][key]) 131 | 132 | resolution_valid = False 133 | while not resolution_valid: 134 | resolution = input("Select Resolution: ") 135 | if resolution in metadata["geographies"].keys(): 136 | resolution_valid = True 137 | else: 138 | print(resolution + " is not valid") 139 | 140 | area_codes = self.api.get_geo_codes(coverage_codes, resolution) 141 | return area_codes 142 | 143 | def write_code_snippets(self, table, meta, query_params): 144 | """ 145 | Write out python and R code snippets, based on the supplied query, for later use 146 | """ 147 | snippet_file = self.api.cache_dir / (table + ".py") 148 | print("\nWriting python code snippet to " + str(snippet_file)) 149 | with open(str(snippet_file), "w") as py_file: 150 | py_file.write("\"\"\"\n" + meta["description"]) 151 | py_file.write("\n\nCode autogenerated by UKCensusAPI\n") 152 | py_file.write("(https://github.com/virgesmith/UKCensusAPI)\n\"\"\"") 153 | py_file.write("\n\n# This code requires an API key, see the README.md for details") 154 | py_file.write("\n\n# Query url:\n# " + self.api.get_url(meta["nomis_table"], query_params)) 155 | py_file.write("\n\nimport ukcensusapi.Nomisweb as CensusApi") 156 | py_file.write("\n\napi = CensusApi.Nomisweb(\"" + str(self.api.cache_dir) + "\")") 157 | py_file.write("\ntable = \"" + table + "\"") 158 | py_file.write("\ntable_internal = \"" + meta["nomis_table"] + "\"") 159 | py_file.write("\nquery_params = {}") 160 | for key in query_params: 161 | py_file.write("\nquery_params[\""+key+"\"] = \""+query_params[key]+"\"") 162 | if not "geography" in query_params: 163 | py_file.write("\n# TODO query_params[\"geography\"] = ...") 164 | py_file.write("\n" + table + " = api.get_data(table, query_params)\n") 165 | 166 | snippet_file = self.api.cache_dir / (table + ".R") 167 | print("\nWriting R code snippet to " + str(snippet_file)) 168 | with open(str(snippet_file), "w") as r_file: 169 | r_file.write("# " + meta["description"]) 170 | r_file.write("\n\n# Code autogenerated by UKCensusAPI") 171 | r_file.write("\n#https://github.com/virgesmith/UKCensusAPI") 172 | r_file.write("\n\n# This code requires an API key, see the README.md for details") 173 | r_file.write("\n# Query url: " + self.api.get_url(meta["nomis_table"], query_params)) 174 | r_file.write("\n\nlibrary(\"UKCensusAPI\")") 175 | r_file.write("\ncacheDir = \"" + str(self.api.cache_dir) + "\"") 176 | r_file.write("\napi = UKCensusAPI::instance(cacheDir)") 177 | r_file.write("\ntable = \"" + table + "\"") 178 | r_file.write("\ntable_internal = \"" + meta["nomis_table"] + "\"") 179 | r_file.write("\nqueryParams = list(") 180 | first = True 181 | for key in query_params: 182 | if first: 183 | r_file.write("\n "+key+" = \""+query_params[key] + "\"") 184 | first = False 185 | else: 186 | r_file.write(",\n "+key+" = \""+query_params[key] + "\"") 187 | if not "geography" in query_params: 188 | r_file.write("\n # TODO add geography parameter to this query...") 189 | r_file.write("\n)") 190 | r_file.write("\n" + table + " = UKCensusAPI::getData(api, table, queryParams)\n") 191 | -------------------------------------------------------------------------------- /ukcensusapi/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.1.6" 2 | 3 | -------------------------------------------------------------------------------- /ukcensusapi/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Common utility/helpers 3 | """ 4 | import os 5 | from pathlib import Path 6 | import requests 7 | 8 | def _expand_home(path): 9 | """ 10 | pathlib doesn't interpret ~/ as $HOME 11 | This doesnt deal with other user's homes e.g. ~another/dir is not changed 12 | """ 13 | return Path(str(path).replace("~/", str(Path.home()) + "/")) 14 | 15 | def init_cache_dir(directory): 16 | """ 17 | Checks path exists and is a writable directory 18 | Create if it doesnt exist 19 | Throw PermissionError if not 20 | """ 21 | directory = _expand_home(directory) 22 | 23 | if not os.path.exists(str(directory)): 24 | os.makedirs(str(directory)) 25 | 26 | if not os.path.isdir(str(directory)): 27 | raise PermissionError(str(directory) + " is not a directory") 28 | 29 | if not os.access(str(directory), os.W_OK): 30 | raise PermissionError(str(directory) + " is not writable") 31 | 32 | return directory 33 | 34 | def check_online(url, t=5): 35 | try: 36 | headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:92.0) Gecko/20100101 Firefox/92.0'} 37 | r = requests.get(url, timeout=t, headers=headers) 38 | r.raise_for_status() 39 | return True 40 | except (requests.exceptions.RequestException) as error: 41 | return False 42 | --------------------------------------------------------------------------------