├── .coveragerc ├── .github ├── issue_template.md └── pull_request_template.md ├── .gitignore ├── .travis.yml ├── CONTRIBUTING.md ├── LEAD.md ├── LICENSE.md ├── MANIFEST.in ├── Makefile ├── README.md ├── data_quality ├── VERSION ├── __init__.py ├── compat.py ├── datapackage.default.json ├── dq.default.json ├── exceptions.py ├── generators │ ├── __init__.py │ ├── base.py │ └── ckan.py ├── main.py ├── tasks │ ├── __init__.py │ ├── aggregate.py │ ├── assess_performance.py │ ├── base_task.py │ ├── check_datapackage.py │ ├── deploy.py │ ├── extract_relevance_period.py │ ├── generate.py │ └── initialize_datapackage.py └── utilities.py ├── dq-config.example.json ├── pylintrc ├── setup.py ├── tests ├── __init__.py ├── fixtures │ ├── datapackage.json │ ├── datapackage_missing_required.json │ ├── datapackage_sources_with_period.json │ ├── dq.json │ ├── fetched │ │ ├── empty_rows_multiple.csv │ │ └── valid.csv │ ├── performance.csv │ ├── publishers.csv │ ├── results.csv │ ├── runs.csv │ ├── sources.csv │ └── sources_with_period_id.csv ├── mock_generator.py ├── tasks │ ├── __init__.py │ ├── test_aggregate.py │ ├── test_assess_performance.py │ ├── test_extract_relevance_period.py │ ├── test_generate.py │ ├── test_initialize_datapackage.py │ ├── test_task.py │ └── tests_check_datapackage.py ├── test_interface.py └── test_utilities.py └── tox.ini /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | */docs* 4 | */tests* 5 | */examples* 6 | */requirements* 7 | setup.py 8 | 9 | [xml] 10 | output = shippable/codecoverage/coverage.xml 11 | 12 | [report] 13 | # Regexes for lines to exclude from consideration 14 | exclude_lines = 15 | # Don't complain about missing debug-only code: 16 | def __repr__ 17 | if self\.debug 18 | 19 | # Don't complain if tests don't hit defensive assertion code: 20 | raise AssertionError 21 | raise NotImplementedError 22 | 23 | # Don't complain if non-runnable code isn't run: 24 | if 0: 25 | if False: 26 | if __name__ == .__main__.: 27 | -------------------------------------------------------------------------------- /.github/issue_template.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | Please replace this line with full information about your idea or problem. If it's a bug share as much as possible to reproduce it 4 | 5 | --- 6 | 7 | Please preserve this line to notify @roll (lead of this repository) 8 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | Please replace this line with full information about your pull request. 
Make sure that tests pass before publishing it 4 | 5 | --- 6 | 7 | Please preserve this line to notify @roll (lead of this repository) 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | # Node 60 | node_modules/ 61 | 62 | # Virtualenv 63 | venv/ 64 | venv2/ 65 | venv3/ 66 | 67 | # Tmux 68 | .tmuxp.yml 69 | 70 | # Project 71 | tmp 72 | .projectile 73 | .idea 74 | *.sublime-project 75 | *.sublime-workspace 76 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: 2 | false 3 | 4 | language: 5 | python 6 | 7 | python: 8 | - 2.7 9 | - 3.3 10 | - 3.4 11 | - 3.5 12 | 13 | env: 14 | global: 15 | - TOXENV="py${PYTHON_VERSION//./}" 16 | 17 | install: 18 | - make develop 19 | - pip install coveralls 20 | 21 | script: 22 | # - make lint 23 | - make test 24 | 25 | after_success: 26 | - coveralls 27 | 28 | deploy: 29 | provider: pypi 30 | user: okfn 31 | on: 32 | tags: true 33 | password: 34 | secure: Iuf7V4+XHL6wwFYt4IyEe0vWLGO/uOpMJWQnO+1eUjmcQ1qi4E9vyEJvsJRzWKm5+/Lv9uFIRGlmpNWQzUPs5VnMc3LEBh7Clv/WIlRGvi+omCeWoEPAPUueF8qjBcvpT37QNzjB5QXJY074uAihmKh/DU2xA4K0yCB8YQefBHYeNBl0pNYVnELUW8BFmz0GE0lTwHOnM681vgR01LdPjrgIHVEvnTZkKYtDXc/cwkw610fqrFS10srnTX6KjjC/pgDm4WSuaUxbPycmriIhZR29QgAx24NO/wrdGdp5H8TIsvBFnNFlC4QuHfwiXdAKpjL6cMu2uMo639Sev/484XxTorg2QQvNhNAJtiESVAaqVviAlmUItGdmsw4xhZb0JK6NC8fOuOoccL4DBD6JtCyGurwSpznuGXh1DQUYZ7fTd5qaUDnzBuhYGc8XDvcj14XU4P5OKES4NdruRVJOwFiNSMOAT6wm8b2Ue6N+FvgsghjwUr9ESKBrPj0VoouC2+FGZWT65vt/3R9PhFuBdC6SgMLWHESBuU5GW9Bc2ucS3HUi+uUV1IGjpfIsc3qifojNJiaU7hSAggJs9QlXd7goH2fKhb9ro2klzcDKmpBLXmMk3uH0QRpv1dGUYFtgGeEFN93vP3cxYsXf8OvV+MuCxYYGgrGZu3h8fvbc5hY= 35 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | The project follows the [Open Knowledge International coding standards](https://github.com/okfn/coding-standards). 4 | 5 | ## Getting Started 6 | 7 | Recommended way to get started is to create and activate a project virtual environment. 
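For example, with the standard library `venv` module (any virtual environment tool will do):

```
$ python3 -m venv venv
$ source venv/bin/activate
```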
8 | To install package and development dependencies into active environment: 9 | 10 | ``` 11 | $ make develop 12 | ``` 13 | 14 | ## Linting 15 | 16 | To lint the project codebase: 17 | 18 | ``` 19 | $ make lint 20 | ``` 21 | 22 | Under the hood `pylint` configured in `pylintrc` is used. On this stage it's already 23 | installed into your environment and could be used separately with more fine-grained control 24 | as described in documentation - https://www.pylint.org/. 25 | 26 | For example to check only errors: 27 | 28 | ``` 29 | $ pylint -E 30 | ``` 31 | 32 | ## Testing 33 | 34 | To run tests with coverage: 35 | 36 | ``` 37 | $ make test 38 | ``` 39 | Under the hood `tox` powered by `py.test` and `coverage` configured in `tox.ini` is used. 40 | It's already installed into your environment and could be used separately with more fine-grained control 41 | as described in documentation - https://testrun.org/tox/latest/. 42 | 43 | For example to check subset of tests against Python 2 environment with increased verbosity. 44 | All positional arguments and options after `--` will be passed to `py.test`: 45 | 46 | ``` 47 | tox -e py27 tests/ -- -v 48 | ``` 49 | -------------------------------------------------------------------------------- /LEAD.md: -------------------------------------------------------------------------------- 1 | roll 2 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Open Knowledge 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | global-include VERSION 2 | include LICENSE.md 3 | include Makefile 4 | include pylintrc 5 | include README.md 6 | include tox.ini 7 | include datapackage.default.json 8 | include dq.default.json 9 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all develop list lint release test version 2 | 3 | 4 | PACKAGE := $(shell grep '^PACKAGE =' setup.py | cut -d "'" -f2) 5 | VERSION := $(shell head -n 1 $(PACKAGE)/VERSION) 6 | LEAD := $(shell head -n 1 LEAD.md) 7 | 8 | 9 | all: list 10 | 11 | develop: 12 | pip install --upgrade -e .[develop] 13 | 14 | list: 15 | @grep '^\.PHONY' Makefile | cut -d' ' -f2- | tr ' ' '\n' 16 | 17 | lint: 18 | pylint $(PACKAGE) 19 | 20 | readme: 21 | pip install md-toc 22 | md_toc -p README.md github --header-levels 3 23 | sed -i '/(#tableschema-spss-py)/,+2d' README.md 24 | 25 | release: 26 | git checkout master 27 | git pull origin 28 | git fetch -p 29 | git commit -a -m 'v$(VERSION)' 30 | git tag -a v$(VERSION) -m 'v$(VERSION)' 31 | git push --follow-tags 32 | 33 | templates: 34 | sed -i -E "s/@(\w*)/@$(LEAD)/" .github/issue_template.md 35 | sed -i -E "s/@(\w*)/@$(LEAD)/" .github/pull_request_template.md 36 | 37 | test: 38 | tox 39 | 40 | version: 41 | @echo $(VERSION) 42 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/frictionlessdata/data-quality-cli.svg)](https://travis-ci.org/frictionlessdata/data-quality-cli) 2 | [![Coverage Status](https://coveralls.io/repos/frictionlessdata/data-quality-cli/badge.svg)](https://coveralls.io/r/frictionlessdata/data-quality-cli) 3 | 4 | 5 | # Data Quality CLI 6 | 7 | A command line tool that assesses the quality of a set of data sources (e.g.: CSV files of open data published by a government). 8 | 9 | ## What's it about? 10 | 11 | The `dq` (alias: `dataquality`) CLI is a tool to create and manage a [Data Package](http://specs.frictionlessdata.io/data-package/) 12 | from a given source of data that can be used by [Data Quality Dashboard](https://github.com/frictionlessdata/data-quality-dashboard). 13 | The quality assessment is done using [GoodTables](http://goodtables.readthedocs.io/en/latest/index.html) and [can be configured](#quality-config). 14 | 15 | The proposed workflow is this: 16 | 17 | * An administrator creates a folder for a given project which will be equivalent to a data package. 18 | * The administrator runs the [`dq init`](#init) command to create templates for the configuration file 19 | and the `datapackage.json` file along with the folder structure. 20 | * The administrator updates the [configuration file](#config) to reflect the structure of the data package 21 | and optionally to [configure the quality assessment](#quality-config). 22 | * The administrator updates the `datapackage.json` file with information specific to the project 23 | and other customizations. 24 | * The administrator creates a `source_file` and a `publisher_file`: 25 | * By using the [generate command](#generate). 26 | * By using custom scripts ([see this example](https://github.com/okfn/data-quality-uk-25k-spend)). 
27 | * In any other way that is in sync with the [schema](#schema). 28 | * The administrator [runs the validation](#run) over the set of sources. 29 | * The data is managed in a git repository (or other version control system), which the administrator has locally 30 | * The administrator [deploys](#deploy) the data package to a central data repository (ex: GitHub) 31 | * The administrator [updates the configuration](https://github.com/frictionlessdata/data-quality-dashboard#configure-database) 32 | of the corresponding Data Quality Dashboard instance 33 | * The administrator, or possibly a content editor, occasionally updates 34 | the `source_file` file in the data directory with new data 35 | * Periodically (once a month, once a quarter), the administrator runs 36 | `dq run /path/to/config.json --deploy`. This builds a new set of results for the data, 37 | and deploys the updated data back to the central data repository 38 | * Since Data Quality Dashboard is a pure client-side application, as soon as updated 39 | data is deployed, the app will start working with the updated data. 40 | 41 | ## Install 42 | 43 | ``` 44 | pip install git+https://github.com/frictionlessdata/data-quality-cli.git 45 | ``` 46 | 47 | ## Use 48 | 49 | ``` 50 | dq --help 51 | ``` 52 | 53 | ### Init 54 | 55 | Before you start building the database, it is recommended that you run: 56 | 57 | 58 | ``` 59 | dq init --folder_path /path/to/future/datapackage 60 | ``` 61 | 62 | This command will potentially spare you some effort and create a `dq_config.json` file 63 | with the default configuration for Data Quality CLI, a `datapackage.json` with the default 64 | info about the data package and schemas for all the required resources, a `data` folder 65 | that will be used to store the database and a `fetched` folder that will store the 66 | fetched sources. If you'd like to change the names of these folders or other configuration 67 | options, you can make a `dq_config.json` file before running the command. The command will 68 | leave your config file as it is and create the others according to your configuration. 69 | 70 | After running it, you should review and update your `dq_config` and `datapackage.json` 71 | with values specific to your project. 72 | 73 | ### Generate 74 | 75 | Generic command: 76 | 77 | ``` 78 | dq generate generator_name http://endpoint_to_data_sources 79 | ``` 80 | 81 | There is currently one built-in generator, for [CKAN](http://ckan.org/) instances. 82 | For example, to generate a database from `data.qld.gov.au`: 83 | 84 | ``` 85 | dq generate ckan https://data.qld.gov.au/ 86 | ``` 87 | 88 | By default, it will include only `CSV` and `excel` (`XLS`, `XLSX`) files. If you want to change that, use 89 | the `--file_type` option. In the example below, we ask for `CSV` and `TXT`: 90 | 91 | ``` 92 | dq generate ckan https://data.qld.gov.au/ --file_type csv --file_type txt 93 | ``` 94 | 95 | If you want to build a custom generator, inherit from the [`data_quality.generators.BaseGenerator`](data_quality/generators/base.py) class and override its methods, as in the sketch below. 96 | To load your custom generator class you need to provide the path to it so that it can be imported via 97 | [importlib.import_module](https://docs.python.org/3/library/importlib.html#importlib.import_module).
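For illustration, here is a minimal sketch of a custom generator. The module and class names match the CLI example below; the endpoint's JSON layout and the field names read from it are invented for the example, while the CSV headers mirror the default `datapackage.json` schemas and `self.base_url` is set by the `BaseGenerator` constructor:

```
# mymodule.py - a sketch only; adapt the endpoint parsing to your own API
import csv

import requests

from data_quality import compat
from data_quality.generators import BaseGenerator


class MyGenerator(BaseGenerator):
    """Build source_file and publisher_file from a hypothetical JSON endpoint."""

    def generate_sources(self, sources_filepath, file_types=['csv', 'excel']):
        response = requests.get(self.base_url)
        response.raise_for_status()
        # Headers follow the default datapackage schema for `source_file`
        headers = ['id', 'publisher_id', 'title', 'data', 'format', 'created_at']
        with compat.UnicodeWriter(sources_filepath, quoting=csv.QUOTE_MINIMAL) as sources:
            sources.writerow(headers)
            for item in response.json():
                # Keep only the file types requested via `--file_type`
                if item.get('format', '').lower() not in file_types:
                    continue
                sources.writerow([item['id'], item['publisher'], item['title'],
                                  item['url'], item['format'], item['created']])

    def generate_publishers(self, publishers_filepath):
        # Headers follow the default datapackage schema for `publisher_file`
        with compat.UnicodeWriter(publishers_filepath, quoting=csv.QUOTE_MINIMAL) as publishers:
            publishers.writerow(['id', 'title'])
            publishers.writerow(['example-publisher', 'Example Publisher'])
```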
98 | You can either provide it in the config, or by using the `--generator_class_path` option: 99 | 100 | ``` 101 | dq generate custom_generator_name endpoint --generator_class_path mymodule.MyGenerator 102 | ``` 103 | 104 | If no config file is provided, the generator will use the [default configuration](###default-configuration) 105 | creating the files in the folder where the command is executed. If you want to change that, use the `--config_file_path` option: 106 | 107 | ``` 108 | dq generate generator_name endpoint --config_file_path path/to/config 109 | ``` 110 | 111 | ### Run 112 | 113 | 114 | ``` 115 | dq run /path/to/config.json --deploy 116 | ``` 117 | 118 | Runs a *data quality assessment* on all data sources in a data repository. 119 | 120 | * Writes aggregated results to the results.csv. 121 | * Writes run meta data to the run.csv. 122 | * If `--deploy` is passed, then also commits, tags and pushes the new changes back to the data repositories central repository. 123 | 124 | ### Deploy 125 | 126 | ``` 127 | dq deploy /path/to/config.json 128 | ``` 129 | 130 | 131 | ### Configuration 132 | 133 | 134 | #### Structure of json config 135 | 136 | ```json 137 | { 138 | # folder that contains the source_file and publisher_file 139 | "data_dir": "data", 140 | 141 | # folder that will store each source as local cache 142 | "cache_dir": "fetched", 143 | 144 | # file that will contain the result for each source 145 | "result_file": "results.csv", 146 | 147 | # file that will contain the report for each collection of sources 148 | "run_file": "runs.csv", 149 | 150 | # file containing the collection of sources that will be analyzed 151 | "source_file": "sources.csv", 152 | 153 | # file containing the publishers of the above mentioned sources 154 | "publisher_file": "publishers.csv", 155 | 156 | # will contain the results for each publisher 157 | "performance_file": "performance.csv", 158 | 159 | "remotes": ["origin"], 160 | "branch": "master", 161 | 162 | # name and path to custom generator (this name should be used when executing the generate command) 163 | "generator": {"my_generator_name": "my_module.MyGenerator" }, 164 | 165 | # whether or not to include timeliness as a dimension of quality assessment 166 | "assess_timeliness": false, 167 | 168 | # timeliness options: 169 | "timeliness": { 170 | 171 | # columns from source_file that should be checked for period detection 172 | "timeliness_strategy": ["column1", "column2"], 173 | 174 | # whether Data Quality CLI should detect period or expect it to be provided 175 | "extract_period": false, 176 | 177 | # maximum percent of sources with empty period allowed 178 | "max_empty_relevance_period": 10, 179 | 180 | # when date is ambiguous, which order should be preffered 181 | "date_order": "DMY", 182 | 183 | # how long after the period_id range is the data still considered timely (in months) 184 | "timeliness_period": 1 185 | } 186 | # options for GoodTables ("http://goodtables.readthedocs.org/en/latest/") 187 | "goodtables": { 188 | 189 | # set base url for the report links 190 | "goodtables_web": "http://goodtables.okfnlabs.org", 191 | 192 | "arguments": { 193 | 194 | # options for pipeline ("http://goodtables.readthedocs.org/en/latest/pipeline.html") 195 | "pipeline": { 196 | 197 | # what processors will analyze every pipeline 198 | "processors": ["structure", "schema"], 199 | 200 | # specify encoding for every pipeline 201 | (use this if all the files have the same encoding) 202 | "encoding": "ISO-8859-2", 203 | 204 | # pass options to 
procesors 205 | "options": { 206 | "schema": {"case_insensitive_headers": true} 207 | } 208 | }, 209 | 210 | # options for batch ("http://goodtables.readthedocs.org/en/latest/batch.html") 211 | "batch": { 212 | 213 | # column from source_file containing path/url to data source 214 | "data_key": "data", 215 | 216 | # column from source_file containing path/url to schema 217 | "schema_key": "schema", 218 | 219 | # column from source_file containing file format (csv, xls) 220 | "format_key": "format", 221 | 222 | # column from source_file containings file encoding 223 | (use this if you want to specify encoding for each source separately) 224 | "encoding_key": "encoding", 225 | 226 | # time in seconds to wait between pipelines 227 | "sleep": 2, 228 | 229 | # execute something after the analysis of a batch is finished 230 | "post_task": "", 231 | 232 | # execute something after the analysis of a pipeline is finished 233 | "pipeline_post_task": "", 234 | } 235 | } 236 | } 237 | } 238 | ``` 239 | 240 | 241 | #### Default config 242 | 243 | 244 | ```json 245 | { 246 | "data_dir": "current_working_directory/data", 247 | "cache_dir": "current_working_directory/fetched", 248 | "result_file": "results.csv", 249 | "run_file": "runs.csv", 250 | "source_file": "sources.csv", 251 | "publisher_file": "publishers.csv", 252 | "performance_file": "performance.csv", 253 | "remotes": ["origin"], 254 | "branch": "master", 255 | "assess_timeliness": false, 256 | "timeliness":{}, 257 | "goodtables": { 258 | "goodtables_web": "http://goodtables.okfnlabs.org", 259 | "arguments": { 260 | "pipeline": {}, 261 | "batch": { 262 | "data_key": "data" 263 | } 264 | } 265 | } 266 | } 267 | ``` 268 | 269 | 270 | #### Quality assessment configuration 271 | 272 | 273 | Currently, Data Quality CLI assesses the quality of a file based on its structure and 274 | by comparing its contents against a schema. This is done using the 275 | [built-in processors](http://goodtables.readthedocs.io/en/latest/cli.html) (a.k.a. validators) 276 | in [GoodTables](http://goodtables.readthedocs.io/en/latest/). 277 | 278 | *Note:* If the files are compressed, they cannot be found at the specified path or the path returns 279 | an HTML page, they will be scored 0. 280 | 281 | If you want to add other criteria for quality assessment, you can 282 | [create a custom processor for GoodTables](http://goodtables.readthedocs.io/en/latest/tutorial.html#implementing-a-custom-processor). 283 | Then include the name of your custom processor in the list passed to the `processors` parameter from [data quality config](###structure-of-json-config): 284 | `"processors": ["structure", "schema", "custom_processor"]`. 285 | You can also exclude processors that you don't want by removing them from the list. 286 | 287 | ##### Structure Processor: 288 | 289 | Checks the structure of a tabular file. 290 | 291 | Ex: blank or duplicate rows, rows that have more/less columns than the header, bad formatting etc. 292 | 293 | Options and their defaults: 294 | 295 | * `ignore_empty_rows: false` - Should empty rows be considered errors or just ignored? 296 | * `ignore_duplicate_rows: false` - Should duplicate rows be considered errors or just ignored? 297 | * `ignore_empty_columns: false` 298 | * `ignore_duplicate_columns: false` 299 | * `ignore_headerless_columns: false` - Should values in a row that don't correspond to a column be ignored? 
300 | * `empty_strings: None` - A list/set of values that should be considered empty strings; otherwise only `''` will be treated as empty 301 | 302 | 303 | ##### Schema Processor: 304 | 305 | Compares the content of a tabular file against a [JSON Table Schema](http://specs.frictionlessdata.io/table-schema/). 306 | You have the following options for the schema: 307 | 308 | 1. Provide a path to the schema for each source in `source_file` and [set the "schema_key"](#config) to the name 309 | of the column that contains it 310 | 2. Let GoodTables infer the schema for each file from its first few lines (less transparent). 311 | 312 | Options and defaults: 313 | 314 | * `ignore_field_order: true` - Should columns have the same order as in the schema? 315 | * `infer_schema: false` - Should the schema be inferred? (see above) 316 | * `process_extra_fields: false` - Should fields that are not present in the schema be inferred and checked? 317 | * `case_insensitive_headers: false` - Should headers be matched with the equivalent field names from schema regardless of case? 318 | 319 | *Note:* If you use the schema processor but you don't provide a schema to compare against, the files will be evaluated as having no errors. 320 | 321 | ##### Examples: 322 | To exemplify how using different processors influences the quality assessment, we set up several versions 323 | of the same dataset: UK public spend over £25000. 324 | 325 | [Here](https://uk-25k-structure-only.herokuapp.com/) is a dashboard whose 326 | data quality database is assessed only on `structure`. You can find the database and configuration 327 | [in this repository](https://github.com/georgiana-b/data-quality-uk-25k-spend/tree/uk-25k-spend-structure-only). 328 | 329 | [This alternative version](https://uk-25k-given-schema.herokuapp.com/) 330 | uses both `structure` and `schema` processors, comparing each file against the 331 | [spend publishing schema](https://raw.githubusercontent.com/okfn/goodtables/master/examples/hmt/spend-publishing-schema.json). 332 | It is the official configuration, with its corresponding repository [here](https://github.com/georgiana-b/data-quality-uk-25k-spend/tree/uk-25k-given-schema). 333 | 334 | Lastly, [here is the less predictable version](https://uk-25k-inferred-schema.herokuapp.com/) 335 | that uses both `structure` and `schema`, but it compares files against inferred schemas (i.e. using `infer_schema: true`). Corresponding 336 | database repository [here](https://github.com/georgiana-b/data-quality-uk-25k-spend/tree/uk-25k-spend-inferred-schema). 337 | 338 | ##### Timeliness 339 | An optional criterion for quality assessment is the timeliness of data publication. 340 | We define timeliness as the difference in months between when the data source _should have been published_ 341 | and _when it was published_. If you want to include timeliness in the quality assessment, 342 | set `assess_timeliness: true`. 343 | 344 | "When the data should have been published" is what we call `period_id` and refers to 345 | the period of time the data is relevant for.
There are two options for providing `period_id`: 346 | 347 | - You can provide it for each source and include the column name in the config: 348 | `"timeliness": {"timeliness_strategy": ["column_name"]}` 349 | - Let Data Quality CLI detect the period from certain fields in `source_file` 350 | that are likely to contain it: 351 | 352 | ``` 353 | "timeliness": { 354 | "extract_period": true, 355 | "timeliness_strategy": ["column1", "column2"] 356 | } 357 | ``` 358 | 359 | The order tells Data Quality CLI which field has priority. In this example, 360 | it will try to find something in `column1` and move to `column2` only if 361 | nothing was found. You can specify as many fields as you want. Please note that 362 | if the date is ambiguous, Data Quality CLI will prefer the format `dd-mm-yyyy`. 363 | You can change that with the `date_order` option. For example, 364 | `"timeliness": {"date_order": "MDY"}` will change the preferred order 365 | to `mm-dd-yyyy`. 366 | 367 | Regardless of the method you choose, Data Quality CLI will parse the fields you 368 | provided in `timeliness_strategy`, try to extract a period out of them and write 369 | it in the `source_file`. 370 | 371 | NOTE: If you provide a `period_id` it will be parsed and replaced by one with 372 | the same dates but a different format used throughout the CLI. 373 | 374 | If no `period_id` can be extracted for more than 10% of the sources, Data Quality CLI 375 | will abort timeliness assessment and raise an error. If you want to change that, 376 | set `max_empty_relevance_period` to the desired percent. If the percent of sources 377 | lacking `period_id` doesn't exceed `max_empty_relevance_period`, the value in the 378 | `created_at` column will be used as `period_id` for them. 379 | 380 | By default, a data source is considered timely if no more than a month has passed from 381 | the end of `period_id` until it was published (`created_at`). You can change that with the 382 | `timeliness_period` option by providing a different *number of months*. 383 | Ex: `"timeliness": {"timeliness_period": 3}` means that the data source is timely 384 | if no more than 3 months passed since the end of `period_id`. 385 | The quality score will decrease for every additional month after the period considered timely. 386 | 387 | ### Schema 388 | 389 | `Data Quality CLI` expects the following structure of the project folder, where 390 | the names of files and folders are the ones defined in the json config given to `dq run`: 391 | 392 | ``` 393 | project 394 | │ 395 | └──────data_dir 396 | │   source_file 397 | │   publisher_file 398 | │   run_file 399 | │   result_file 400 | │   performance_file 401 | │ 402 | └───cache_dir 403 | │ 404 | └───datapackage.json 405 | ``` 406 | 407 | The `datapackage.json` file is required in order to make the project 408 | a valid [Data Package](http://specs.frictionlessdata.io/data-package/). If you use 409 | the `dq init` command, it will be automatically generated for you from 410 | [the default datapackage](data_quality/datapackage.default.json). 411 | This file will be needed throughout the app, so you'll need to have it. 412 | Take a look over the [Data Package](http://specs.frictionlessdata.io/data-package/) 413 | specification if you'd like to customize it for your project. 414 | 415 | *Warning:* The `datapackage.json` file is extensively used throughout Data Quality CLI and 416 | the Data Quality Dashboard.
To make sure it is kept in sync with the database that it 417 | describes, several checks are performed at different steps. While you are free to customize 418 | your database by using custom generators and extra fields, 419 | you have to make sure that the fields required by Data Quality CLI to perform it's tasks are present. 420 | 421 | -------------------------------------------------------------------------------- /data_quality/VERSION: -------------------------------------------------------------------------------- 1 | 0.1.1 -------------------------------------------------------------------------------- /data_quality/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | import io 9 | from . import tasks 10 | from . import main 11 | from . import generators 12 | from . import compat 13 | from . import utilities 14 | 15 | def get_version(): 16 | version_path = os.path.join(os.path.dirname(__file__), 'VERSION') 17 | return io.open(version_path, encoding='utf-8').readline().strip() 18 | 19 | __version__ = get_version() 20 | 21 | __all__ = ['main', 'tasks', 'generators', 'compat'] 22 | 23 | -------------------------------------------------------------------------------- /data_quality/compat.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #pylint: skip-file 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | from __future__ import unicode_literals 7 | 8 | import sys 9 | import io 10 | import csv 11 | import os 12 | 13 | 14 | _ver = sys.version_info 15 | is_py2 = (_ver[0] == 2) 16 | is_py3 = (_ver[0] == 3) 17 | is_py33 = (is_py3 and _ver[1] == 3) 18 | is_py34 = (is_py3 and _ver[1] == 4) 19 | is_py27 = (is_py2 and _ver[1] == 7) 20 | 21 | 22 | if is_py2: 23 | from urlparse import urljoin 24 | 25 | builtin_str = str 26 | bytes = str 27 | str = unicode 28 | basestring = basestring 29 | numeric_types = (int, long, float) 30 | 31 | elif is_py3: 32 | from urllib.parse import urljoin 33 | 34 | builtin_str = str 35 | str = str 36 | bytes = bytes 37 | basestring = (str, bytes) 38 | numeric_types = (int, float) 39 | 40 | 41 | def to_bytes(str): 42 | """Convert a text string to a byte string""" 43 | return str.encode('utf-8') 44 | 45 | 46 | def to_builtin_str(str): 47 | """Convert a text string to the built-in `str` on the runtime.""" 48 | if is_py2: 49 | return str.encode('utf-8') 50 | else: 51 | return str 52 | 53 | class UnicodeWriter(object): 54 | """ 55 | This class provides functionality for writing CSV files 56 | in a given encoding, python 2 and 3 compatible 57 | It is a slight adaptation of the code here: 58 | http://python3porting.com/problems.html#csv-api-changes 59 | """ 60 | def __init__(self, filename, 61 | encoding='utf-8', **kw): 62 | self.filename = filename 63 | self.encoding = encoding 64 | self.kw = kw 65 | 66 | def __enter__(self): 67 | if is_py3: 68 | self.f = open(self.filename, 'w+t', 69 | encoding=self.encoding) 70 | else: 71 | self.f = open(self.filename, 'w+b') 72 | self.writer = csv.writer(self.f, lineterminator=os.linesep, **self.kw) 73 | return self 74 | 75 | def __exit__(self, type, value, traceback): 76 | self.f.close() 77 | 78 | def writerow(self, row): 79 | for index, val in 
enumerate(row): 80 | if type(val) not in [str, bytes, builtin_str]: 81 | if val is None: 82 | val = '' 83 | val = str(val) 84 | if is_py2: 85 | val = val.encode(self.encoding) 86 | row[index] = val 87 | self.writer.writerow(row) 88 | 89 | def writerows(self, rows): 90 | for row in rows: 91 | self.writerow(row) 92 | 93 | 94 | class UnicodeAppender(UnicodeWriter): 95 | """ 96 | This class provides functionality for appending to CSV files 97 | in a given encoding, python 2 and 3 compatible 98 | """ 99 | 100 | def __enter__(self): 101 | if is_py3: 102 | self.f = open(self.filename, 'at', 103 | encoding=self.encoding) 104 | else: 105 | self.f = open(self.filename, 'ab') 106 | self.writer = csv.writer(self.f, lineterminator=os.linesep, **self.kw) 107 | return self 108 | 109 | 110 | class UnicodeDictWriter(UnicodeWriter): 111 | """ 112 | This class provides functionality for writing CSV file rows from dicts 113 | in a given encoding, python 2 and 3 compatible 114 | """ 115 | def __init__(self, filename, fieldnames, encoding='utf-8', **kw): 116 | self.fieldnames = fieldnames 117 | super(UnicodeDictWriter, self).__init__(filename, encoding, **kw) 118 | 119 | def writerow(self, row): 120 | for key, val in row.items(): 121 | if type(val) not in [str, bytes, builtin_str]: 122 | if val is None: 123 | val = '' 124 | val = str(val) 125 | if is_py2: 126 | val = val.encode(self.encoding) 127 | row[key] = val 128 | self.writer.writerow([row.get(key, '') for key in self.fieldnames]) 129 | 130 | def writeheader(self): 131 | self.writer.writerow(self.fieldnames) 132 | 133 | 134 | class UnicodeReader(object): 135 | """ 136 | This class provides functionality to read from CSV files 137 | in a given encoding, python 2 and 3 compatible 138 | """ 139 | def __init__(self, filename, encoding='utf-8', **kw): 140 | self.filename = filename 141 | self.encoding = encoding 142 | self.kw = kw 143 | 144 | def __enter__(self): 145 | if is_py3: 146 | self.f = open(self.filename, 'rt', encoding=self.encoding) 147 | else: 148 | self.f = open(self.filename, 'rb') 149 | self.reader = csv.reader(self.f, **self.kw) 150 | return self 151 | 152 | def __exit__(self, type, value, traceback): 153 | self.f.close() 154 | 155 | def next(self): 156 | row = next(self.reader) 157 | if is_py3: 158 | return row 159 | return [s.decode('utf-8') for s in row] 160 | 161 | __next__ = next 162 | 163 | def __iter__(self): 164 | return self 165 | 166 | 167 | class UnicodeDictReader(UnicodeReader): 168 | """ 169 | This class provides functionality to read CSV file rows as dicts 170 | in a given encoding, python 2 and 3 compatible 171 | """ 172 | def __init__(self, filename, encoding='utf-8', **kw): 173 | super(UnicodeDictReader, self).__init__(filename, encoding, **kw) 174 | 175 | def __enter__(self): 176 | if is_py3: 177 | self.f = open(self.filename, 'rt', encoding=self.encoding) 178 | else: 179 | self.f = open(self.filename, 'rb') 180 | self.reader = csv.reader(self.f, **self.kw) 181 | self.header = next(self.reader) 182 | return self 183 | 184 | def next(self): 185 | row = next(self.reader) 186 | if is_py2: 187 | row= [s.decode('utf-8') for s in row] 188 | return {self.header[x]: row[x] for x in range(len(self.header))} 189 | 190 | __next__ = next 191 | 192 | def __iter__(self): 193 | return self 194 | 195 | -------------------------------------------------------------------------------- /data_quality/datapackage.default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "", 3 | "last_modified": 
"", 4 | "validator_url": "https://goodtables.okfnlabs.org/api/run", 5 | "admin": "", 6 | "pitch": "", 7 | "context": "", 8 | "sources": [{"name": "", "web": ""}], 9 | "resources": [ 10 | { 11 | "path": "publishers.csv", 12 | "name": "publisher_file", 13 | "schema": { 14 | "fields": [ 15 | { 16 | "name": "id", 17 | "title": "ID of the publisher", 18 | "type": "string", 19 | "constraints": { "required": true, "unique": true } 20 | }, 21 | { 22 | "name": "title", 23 | "title": "Title or official name of the publisher", 24 | "type": "string", 25 | "constraints": { "required": true, "unique": true } 26 | } 27 | ], 28 | "primaryKey": "id" 29 | } 30 | }, 31 | { 32 | "path": "sources.csv", 33 | "name": "source_file", 34 | "schema": { 35 | "fields": [ 36 | { 37 | "name": "id", 38 | "title": "ID of the source", 39 | "type": "string", 40 | "constraints": { "required": true, "unique": true } 41 | }, 42 | { 43 | "name": "publisher_id", 44 | "title": "ID of the source's publisher", 45 | "type": "string", 46 | "constraints": { "required": true, "unique": true } 47 | }, 48 | { 49 | "name": "title", 50 | "title": "Title or name of the source", 51 | "type": "string", 52 | "constraints": { "required": true } 53 | }, 54 | { 55 | "name": "data", 56 | "title": "Path/url to source", 57 | "type": "string", 58 | "constraints": { "required": true } 59 | }, 60 | { 61 | "name": "format", 62 | "title": "File format of the source", 63 | "type": "string" 64 | }, 65 | { 66 | "name": "created_at", 67 | "title": "Time of the source's creation.", 68 | "type": "string", 69 | "constraints": { "required": true } 70 | } 71 | ], 72 | "primaryKey": "id", 73 | "foreignKeys": [ 74 | { 75 | "fields": "publisher_id", 76 | "reference": { 77 | "resource": "publisher_file", 78 | "fields": "id" 79 | } 80 | } 81 | ] 82 | } 83 | }, 84 | { 85 | "path": "runs.csv", 86 | "name": "run_file", 87 | "schema": { 88 | "fields": [ 89 | { 90 | "name": "id", 91 | "title": "ID of the run", 92 | "type": "string", 93 | "constraints": { "required": true, "unique": true } 94 | }, 95 | { 96 | "name": "timestamp", 97 | "title": "Timestamp of the run execution", 98 | "type": "date", 99 | "format": "datetime", 100 | "constraints": { "required": true } 101 | }, 102 | { 103 | "name": "total_score", 104 | "title": "Rounded average score of results in this run", 105 | "type": "integer", 106 | "constraints": { "required": true} 107 | } 108 | ], 109 | "primaryKey": "id" 110 | } 111 | }, 112 | { 113 | "path": "results.csv", 114 | "name": "result_file", 115 | "schema": { 116 | "fields": [ 117 | { 118 | "name": "id", 119 | "title": "ID of the result", 120 | "type": "string", 121 | "constraints": { "required": true, "unique": true } 122 | }, 123 | { 124 | "name": "source_id", 125 | "title": "ID of the correspoding source", 126 | "type": "string", 127 | "constraints": { "required": true, "unique": true } 128 | }, 129 | { 130 | "name": "publisher_id", 131 | "title": "ID of the source's publisher", 132 | "type": "string", 133 | "constraints": { "required": true} 134 | }, 135 | { 136 | "name": "created_at", 137 | "title": "time of the source's creation.", 138 | "type": "date", 139 | "format": "date", 140 | "constraints": { "required": true } 141 | }, 142 | { 143 | "name": "data", 144 | "title": "Path/url to source", 145 | "type": "string", 146 | "constraints": { "required": true } 147 | }, 148 | { 149 | "name": "schema", 150 | "title": "Path/url to the source's schema", 151 | "type": "string" 152 | }, 153 | { 154 | "name": "score", 155 | "title": "Score of correctness given 
by GoodTables", 156 | "type": "integer", 157 | "contrains": { "required": true } 158 | }, 159 | { 160 | "name": "summary", 161 | "title": "Summary", 162 | "type": "string" 163 | }, 164 | { 165 | "name": "run_id", 166 | "title": "ID of the run in which the result was calculated", 167 | "type": "string", 168 | "constraints": { "required": true, "unique": true } 169 | }, 170 | { 171 | "name": "timestamp", 172 | "title": "Timestamp of the run execution", 173 | "type": "date", 174 | "format": "datetime", 175 | "constraints": { "required": true } 176 | }, 177 | { 178 | "name": "report", 179 | "title": "Path/url to the full GoodTabels report", 180 | "type": "string" 181 | } 182 | ], 183 | "primaryKey": "id", 184 | "foreignKeys": [ 185 | { 186 | "fields": "source_id", 187 | "reference": { 188 | "resource": "source_file", 189 | "fields": "id" 190 | } 191 | }, 192 | { 193 | "fields": "publisher_id", 194 | "reference": { 195 | "resource": "publisher_file", 196 | "fields": "id" 197 | } 198 | }, 199 | { 200 | "fields": "run_id", 201 | "reference": { 202 | "resource": "run_file", 203 | "fields": "id" 204 | } 205 | } 206 | ] 207 | } 208 | }, 209 | { 210 | "path": "performance.csv", 211 | "name": "performance_file", 212 | "schema": { 213 | "fields": [ 214 | { 215 | "name": "publisher_id", 216 | "title": "ID of the publisher", 217 | "type": "string", 218 | "constraints": { "required": true, "unique": true } 219 | }, 220 | { 221 | "name": "month_of_creation", 222 | "title": "Month when the source was created", 223 | "type": "date", 224 | "format": "date", 225 | "constraints": { "required": true } 226 | }, 227 | { 228 | "name": "files_count", 229 | "title": "Number of files published by the publisher during period", 230 | "type": "integer", 231 | "constraints": { "required": true } 232 | }, 233 | { 234 | "name": "score", 235 | "title": "Rounded average score of files published by the publisher during period", 236 | "type": "integer", 237 | "constraints": { "required": true } 238 | }, 239 | { 240 | "name": "valid", 241 | "title": "Number of valid files published by the publisher during period", 242 | "type": "integer", 243 | "constraints": { "required": true } 244 | }, 245 | { 246 | "name": "files_count_to_date", 247 | "title": "Number of files published by the publisher up to period", 248 | "type": "integer", 249 | "constraints": { "required": true } 250 | }, 251 | { 252 | "name": "score_to_date", 253 | "title": "Rounded average score of files published by the publisher up to period", 254 | "type": "integer", 255 | "constraints": { "required": true } 256 | }, 257 | { 258 | "name": "valid_to_date", 259 | "title": "Number of valid files published by the publisher up to period", 260 | "type": "integer", 261 | "constraints": { "required": true } 262 | } 263 | ], 264 | "foreignKeys": [ 265 | { 266 | "fields": "publisher_id", 267 | "reference": { 268 | "resource": "publisher_file", 269 | "fields": "id" 270 | } 271 | } 272 | ] 273 | } 274 | } 275 | ] 276 | } -------------------------------------------------------------------------------- /data_quality/dq.default.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_dir": "data", 3 | "cache_dir": "fetched", 4 | "result_file": "results.csv", 5 | "run_file": "runs.csv", 6 | "source_file": "sources.csv", 7 | "publisher_file": "publishers.csv", 8 | "performance_file": "performance.csv", 9 | "datapackage_file": "datapackage.json", 10 | "remotes": ["origin"], 11 | "branch": "master", 12 | "assess_timeliness": false, 13 | 
"timeliness":{}, 14 | "data_quality_spec": { 15 | "data_quality_spec_web": "https://cdn.rawgit.com/frictionlessdata/data-quality-spec/4d7140394f2d46c5d66f91d4be2bb41477e5f583/spec.json" 16 | }, 17 | "goodtables": { 18 | "goodtables_web": "http://goodtables.okfnlabs.org", 19 | "arguments": { 20 | "pipeline": { 21 | "break_on_invalid_processor": false 22 | }, 23 | "batch": { 24 | "data_key": "data" 25 | } 26 | } 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /data_quality/exceptions.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | 8 | class SourceNotFoundError(Exception): 9 | 10 | def __init__(self, msg=None, source=None): 11 | default_msg = 'The source {0} was not found in \'source_file\''.format(source) 12 | self.msg = msg or default_msg 13 | super(SourceNotFoundError, self).__init__(msg) 14 | 15 | 16 | class DuplicateDataSourceError(Exception): 17 | 18 | def __init__(self, msg=None, source=None): 19 | default_msg = 'Different sources with the same path {0} have been found \ 20 | in \'source_file\''.format(source) 21 | self.msg = msg or default_msg 22 | super(DuplicateDataSourceError, self).__init__(msg) 23 | 24 | class UnableToAssessTimeliness(Exception): 25 | 26 | def __init__(self, msg=None): 27 | default_msg = 'Timeliness cannot be assessed.' 28 | self.msg = msg or default_msg 29 | super(UnableToAssessTimeliness, self).__init__(msg) 30 | 31 | -------------------------------------------------------------------------------- /data_quality/generators/__init__.py: -------------------------------------------------------------------------------- 1 | # # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .ckan import CkanGenerator 8 | from .base import BaseGenerator 9 | 10 | __all__ = ['CkanGenerator', 'BaseGenerator'] 11 | 12 | _built_in_generators = {'ckan': CkanGenerator} 13 | -------------------------------------------------------------------------------- /data_quality/generators/base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | 8 | class BaseGenerator(object): 9 | """This is the base class for generators. 
All generators should inherit.""" 10 | 11 | def __init__(self, url=None, datapackage=None): 12 | 13 | self.base_url = url 14 | self.datapackage = datapackage 15 | 16 | if not self.base_url: 17 | raise TypeError('Cannot generate the database without the "url" parameter.') 18 | 19 | def generate_sources(self, sources_filepath, file_types=['csv', 'excel']): 20 | """Generate sources file for CSV database""" 21 | 22 | raise NotImplementedError('You must overwrite this method with your ' 23 | 'generator\'s specific logic.') 24 | 25 | def generate_publishers(self, publishers_filepath): 26 | """Generate publishers file for CSV database""" 27 | 28 | raise NotImplementedError('You must overwrite this method with your ' 29 | 'generator\'s specific logic.') 30 | -------------------------------------------------------------------------------- /data_quality/generators/ckan.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import csv 8 | from os import path 9 | import requests 10 | import jsontableschema 11 | from data_quality import compat, utilities 12 | from .base import BaseGenerator 13 | 14 | class CkanGenerator(BaseGenerator): 15 | """This class generates a csv database from a CKAN instance located at the given url""" 16 | 17 | def __init__(self, url=None, datapackage=None): 18 | """Create an instance if the source url is given. 19 | 20 | Args: 21 | url: the base url for the CKAN instance 22 | """ 23 | 24 | super(CkanGenerator, self).__init__(url, datapackage) 25 | self.default_publisher = None 26 | 27 | def generate_sources(self, sources_filepath, file_types=['csv', 'excel']): 28 | """Generates sources_file from the url""" 29 | 30 | file_types = [ftype.lower() for ftype in file_types] 31 | results = self.get_sources() 32 | sources = [] 33 | source_resource = utilities.get_datapackage_resource(sources_filepath, 34 | self.datapackage) 35 | source_schema = jsontableschema.model.SchemaModel(source_resource.descriptor['schema']) 36 | for result in results: 37 | sources += self.extract_sources(result, file_types) 38 | 39 | with compat.UnicodeWriter(sources_filepath, 40 | quoting=csv.QUOTE_MINIMAL) as sfile: 41 | sfile.writerow(source_schema.headers) 42 | for source in sources: 43 | try: 44 | values = [compat.str(source[key]) for key in source_schema.headers] 45 | sfile.writerow(list(source_schema.convert_row(*values))) 46 | except jsontableschema.exceptions.MultipleInvalid as e: 47 | for error in e.errors: 48 | raise error 49 | 50 | def get_sources(self): 51 | """Get all sources from CKAN API as a list""" 52 | 53 | extension = 'api/3/action/package_search' 54 | full_url = compat.urljoin(self.base_url, extension) 55 | response = requests.get(full_url) 56 | response.raise_for_status() 57 | data = response.json() 58 | count = data['result']['count'] 59 | all_packages = [] 60 | all_sources = [] 61 | for start in range(0, count, 500): 62 | payload = {'rows': 500, 'start': start} 63 | response = requests.get(full_url, params=payload) 64 | data = response.json() 65 | all_packages += [result['id'] for result in data['result']['results']] 66 | 67 | for package_id in all_packages: 68 | ext = 'api/3/action/package_show' 69 | full_package_url = compat.urljoin(self.base_url, ext) 70 | package_payload = {'use_default_schema': True, 'id': package_id} 71 | response = 
requests.get(full_package_url, params=package_payload) 72 | data = response.json() 73 | all_sources.append(data['result']) 74 | return all_sources 75 | 76 | def extract_sources(self, datum, file_types): 77 | """Extract all sources for one result""" 78 | 79 | resources = [] 80 | for resource in datum.get('resources', {}): 81 | new_resource = {} 82 | new_resource['data'] = resource['url'] 83 | ext = path.splitext(new_resource['data'])[1][1:].lower() 84 | new_resource['format'] = 'excel' if ext in ['xls', 'xlsx'] else ext 85 | file_types = ['excel' if ext in ['xls', 'xlsx'] else ext for ext in file_types] 86 | file_types.append('') 87 | if new_resource['format'] in file_types: 88 | publisher = datum.get('organization', None) 89 | if publisher: 90 | new_resource['publisher_id'] = publisher.get('name') 91 | else: 92 | self.default_publisher = {'name': 'no_organization', 93 | 'display_name': 'No Organization'} 94 | new_resource['publisher_id'] = self.default_publisher['name'] 95 | new_resource['id'] = resource['id'] 96 | new_resource['created_at'] = resource['created'] 97 | title = datum.get('title', '') 98 | name = resource.get('name', '') 99 | new_resource['title'] = ' / '.join(val for val in [title, name] if val) 100 | resources.append(new_resource) 101 | return resources 102 | 103 | def generate_publishers(self, publishers_filepath): 104 | """Generates publisher_file from the url""" 105 | 106 | results = self.get_publishers() 107 | if self.default_publisher: 108 | results.append(self.default_publisher) 109 | pub_resource = utilities.get_datapackage_resource(publishers_filepath, 110 | self.datapackage) 111 | pub_schema = jsontableschema.model.SchemaModel(pub_resource.descriptor['schema']) 112 | 113 | with compat.UnicodeWriter(publishers_filepath, 114 | quoting=csv.QUOTE_MINIMAL) as pfile: 115 | pfile.writerow(pub_schema.headers) 116 | for result in results: 117 | result = self.extract_publisher(result) 118 | try: 119 | values = [result[key] for key in pub_schema.headers] 120 | pfile.writerow(list(pub_schema.convert_row(*values))) 121 | except jsontableschema.exceptions.MultipleInvalid as e: 122 | for error in e.errors: 123 | raise error 124 | 125 | def get_publishers(self): 126 | """Retrieves the publishers from CKAN API as a list""" 127 | 128 | extension = "api/3/action/organization_list" 129 | payload = {'all_fields':True, 130 | 'include_groups': True, 131 | 'include_extras':True 132 | } 133 | full_url = compat.urljoin(self.base_url, extension) 134 | response = requests.get(full_url, params=payload) 135 | publishers = response.json()['result'] 136 | return publishers 137 | 138 | def extract_publisher(self, result): 139 | """Converts `result` into dict with standard compliant field names""" 140 | 141 | publisher = {} 142 | publisher['id'] = result.get('name', '') 143 | publisher['title'] = result.get('display_name', '') 144 | for extra in result.get('extras', []): 145 | key = extra.get('key') 146 | if key == 'contact-email': 147 | publisher['email'] = extra.get('value') 148 | if key == 'contact-name': 149 | publisher['contact'] = extra.get('value') 150 | if key == 'category': 151 | publisher['type'] = extra.get('value') 152 | return publisher 153 | -------------------------------------------------------------------------------- /data_quality/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from 
__future__ import unicode_literals 6 | 7 | import os 8 | import click 9 | from goodtables import pipeline 10 | from . import tasks, utilities, generators 11 | 12 | @click.group() 13 | def cli(): 14 | """The entry point into the CLI.""" 15 | 16 | @cli.command() 17 | @click.argument('config_file_path') 18 | @click.option('--encoding', default=None) 19 | @click.option('--deploy', is_flag=True) 20 | def run(config_file_path, deploy, encoding): 21 | """Process data sources for a Spend Publishing Dashboard instance.""" 22 | 23 | config = utilities.load_json_config(config_file_path) 24 | utilities.resolve_dir(config['cache_dir']) 25 | utilities.set_up_cache_dir(config['cache_dir']) 26 | source_filepath = os.path.join(config['data_dir'], config['source_file']) 27 | 28 | if config['assess_timeliness'] is True: 29 | extractor = tasks.extract_relevance_period.RelevancePeriodExtractor(config) 30 | extractor.run() 31 | 32 | aggregator = tasks.Aggregator(config) 33 | 34 | if deploy: 35 | 36 | def batch_handler(instance): 37 | aggregator.write_run() 38 | assesser = tasks.PerformanceAssessor(config) 39 | assesser.run() 40 | deployer = tasks.Deployer(config) 41 | deployer.run() 42 | 43 | else: 44 | 45 | def batch_handler(instance): 46 | aggregator.write_run() 47 | assesser = tasks.PerformanceAssessor(config) 48 | assesser.run() 49 | 50 | post_tasks = {'post_task': batch_handler, 'pipeline_post_task': aggregator.run} 51 | config['goodtables']['arguments']['batch'].update(post_tasks) 52 | batch_options = config['goodtables']['arguments']['batch'] 53 | batch_options['pipeline_options'] = config['goodtables']['arguments']['pipeline'] 54 | batch = pipeline.Batch(source_filepath, **batch_options) 55 | batch.run() 56 | 57 | 58 | @cli.command() 59 | @click.argument('config_file_path') 60 | def deploy(config_file_path): 61 | """Deploy data sources for a Spend Publishing Dashboard instance.""" 62 | 63 | config = utilities.load_json_config(config_file_path) 64 | deployer = tasks.Deployer(config) 65 | deployer.run() 66 | 67 | 68 | @cli.command() 69 | @click.argument('generator_name') 70 | @click.argument('endpoint') 71 | @click.option('-cf', '--config_file_path', type=click.Path(exists=True), default=None, 72 | help='Full path to the json config for data-quality-cli') 73 | @click.option('-gp', '--generator_class_path', default=None, 74 | help='Path to your custom generator (Ex: mymodule.CustomGenerator)') 75 | @click.option('-ft', '--file_type', multiple=True, default=['csv','excel'], 76 | help='File types that should be included in sources (default: csv and excel)') 77 | def generate(generator_name, endpoint, config_file_path, generator_class_path, file_type): 78 | """Generate a database from the given endpoint 79 | 80 | Args: 81 | generator_name: Name of the generator (ex: ckan) 82 | endpoint: Url where the generator should get the data from 83 | """ 84 | 85 | file_types = list(file_type) 86 | config = utilities.load_json_config(config_file_path) 87 | if not config_file_path: 88 | default_config_path = os.path.join(os.getcwd(), 'dq_config.json') 89 | config['data_dir'] = utilities.resolve_dir_name(default_config_path, 90 | config['data_dir']) 91 | utilities.resolve_dir(config['data_dir']) 92 | 93 | if generator_name not in generators._built_in_generators.keys(): 94 | generator_class_path = (generator_class_path or 95 | config.get('generator', {}).get(generator_name, None)) 96 | if not generator_class_path: 97 | msg = ('You need to provide the path for your custom generator using the' 98 | '`--generator_class_path` 
option or by providing it in the config:' 99 | 'Ex: {"generator":{"generator_name": "mymodule.CustomGenerator"}}') 100 | raise ValueError(msg) 101 | 102 | generator = tasks.GeneratorManager(config) 103 | generator.run(generator_name, endpoint, generator_class_path, file_types) 104 | generator.update_datapackage_sources() 105 | 106 | 107 | @cli.command() 108 | @click.option('-p', '--folder_path', type=click.Path(exists=True), default=None, 109 | help='Full path to the workspace folder') 110 | def init(folder_path): 111 | 112 | workspace_folder = folder_path 113 | if not workspace_folder: 114 | workspace_folder = os.getcwd() 115 | 116 | initializer = tasks.DataPackageInitializer(workspace_folder) 117 | initializer.run() 118 | 119 | if __name__ == '__main__': 120 | cli() 121 | -------------------------------------------------------------------------------- /data_quality/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .base_task import Task 8 | from .initialize_datapackage import DataPackageInitializer 9 | from .generate import GeneratorManager 10 | from .aggregate import Aggregator 11 | from .deploy import Deployer 12 | from .assess_performance import PerformanceAssessor 13 | 14 | __all__ = ['Task', 'DataPackageInitializer', 'GeneratorManager', 'Aggregator', 15 | 'PerformanceAssessor', 'Deployer'] 16 | -------------------------------------------------------------------------------- /data_quality/tasks/aggregate.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | import io 9 | import csv 10 | import uuid 11 | import pytz 12 | import jsontableschema 13 | from math import log 14 | from datetime import datetime, timedelta 15 | from data_quality import utilities, compat, exceptions 16 | from .base_task import Task 17 | from .check_datapackage import DataPackageChecker 18 | from .extract_relevance_period import RelevancePeriodExtractor 19 | 20 | 21 | class Aggregator(Task): 22 | 23 | """A Task runner to create results for data sources as they move 24 | through a processing pipeline. 
25 | """ 26 | 27 | def __init__(self, config, **kwargs): 28 | super(Aggregator, self).__init__(config, **kwargs) 29 | datapackage_check = DataPackageChecker(self.config) 30 | datapackage_check.run() 31 | run_resource = utilities.get_datapackage_resource(self.run_file, 32 | self.datapackage) 33 | result_resource = utilities.get_datapackage_resource(self.result_file, 34 | self.datapackage) 35 | self.run_schema = jsontableschema.model.SchemaModel(run_resource.descriptor['schema']) 36 | self.result_schema = jsontableschema.model.SchemaModel(result_resource.descriptor['schema']) 37 | self.initialize_file(self.result_file, self.result_schema.headers) 38 | self.initialize_file(self.run_file, self.run_schema.headers) 39 | self.run_id = compat.str(uuid.uuid4().hex) 40 | self.timestamp = datetime.now(pytz.utc) 41 | self.all_scores = [] 42 | self.assess_timeliness = self.config['assess_timeliness'] 43 | self.timeliness_period = self.config['timeliness'].get('timeliness_period', 1) 44 | self.max_score = 100 45 | required_resources = [self.result_file, self.source_file, 46 | self.publisher_file, self.run_file] 47 | datapackage_check.check_database_completeness(required_resources) 48 | self.lookup = self.get_lookup() 49 | 50 | def run(self, pipeline): 51 | """Run on a Pipeline instance.""" 52 | 53 | with compat.UnicodeAppender(self.result_file, quoting=csv.QUOTE_MINIMAL) as result_file: 54 | source = self.get_source(pipeline.data_source) 55 | result_id = compat.str(uuid.uuid4().hex) 56 | source['created_at'] = utilities.date_from_string(source['created_at']) 57 | if source['created_at'] is None: 58 | raise ValueError(('No date could be extracted from `created_at`' 59 | ' field in source: {0}.').format(source)) 60 | score = self.get_pipeline_score(pipeline, source) 61 | data_source = pipeline.data_source 62 | schema = '' 63 | summary = '' # TODO: how/what should a summary be? 64 | report = self.get_pipeline_report_url(pipeline) 65 | 66 | result = [result_id, source['id'], source['publisher_id'], 67 | source['created_at'], data_source, schema, score, 68 | summary, self.run_id, self.timestamp, report] 69 | try: 70 | result_file.writerow(list(self.result_schema.convert_row(*result))) 71 | except jsontableschema.exceptions.MultipleInvalid as e: 72 | for error in e.errors: 73 | raise error 74 | 75 | if pipeline.data: 76 | self.fetch_data(pipeline.data.stream, pipeline.data.encoding, source) 77 | 78 | def get_lookup(self): 79 | 80 | _keys = ['id', 'publisher_id', self.data_key, 'created_at', 'title', 81 | 'period_id'] 82 | lookup = [] 83 | 84 | with compat.UnicodeDictReader(self.source_file) as sources_file: 85 | for row in sources_file: 86 | lookup.append({k: v for k, v in row.items() if k in _keys}) 87 | 88 | return lookup 89 | 90 | def initialize_file(self, filepath, headers): 91 | """"Make sure a file exists and has headers before appending to it. 
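        Illustrative call (a hedged sketch; the constructor above does the
        equivalent): initialize_file(self.result_file, self.result_schema.headers)
        writes the header row only when the file does not exist yet.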
92 | 93 | Args: 94 | filepath: path to the file to be created 95 | headers: a tuple to write as header 96 | 97 | """ 98 | if not os.path.exists(filepath): 99 | with compat.UnicodeWriter(filepath, quoting=csv.QUOTE_MINIMAL) as a_file: 100 | a_file.writerow(headers) 101 | 102 | def write_run(self): 103 | """Write this run in the run file.""" 104 | 105 | with compat.UnicodeAppender(self.run_file, quoting=csv.QUOTE_MINIMAL) as run_file: 106 | entry = [self.run_id, self.timestamp, int(round(sum(self.all_scores) / len(self.lookup)))] 107 | try: 108 | run_file.writerow(list(self.run_schema.convert_row(*entry))) 109 | except jsontableschema.exceptions.MultipleInvalid as e: 110 | for error in e.errors: 111 | raise error 112 | 113 | return True 114 | 115 | def fetch_data(self, data_stream, encoding, source): 116 | """Cache the data source in the /fetched directory""" 117 | 118 | source_name = source.get('name', source[self.data_key].rsplit('/', 1)[-1]) 119 | source_name = source_name or source['id'] 120 | cached_file_name = os.path.join(self.cache_dir, source_name) 121 | data_stream.seek(0) 122 | 123 | with io.open(cached_file_name, mode='w+', encoding=encoding) as fetched_file: 124 | for line in data_stream: 125 | fetched_file.write(line) 126 | 127 | def get_source(self, data_src): 128 | """Find the entry correspoding to data_src from sources file""" 129 | 130 | matches = [match for match in self.lookup if match[self.data_key] == data_src] 131 | 132 | if len(matches) == 0: 133 | raise exceptions.SourceNotFoundError(source=data_src) 134 | elif len(matches) > 1: 135 | for pos in range(len(matches)-1): 136 | first_values = set(matches[pos].values()) 137 | second_values = set(matches[pos+1].values()) 138 | differences = first_values.symmetric_difference(second_values) 139 | if len(differences) != 0: 140 | raise exceptions.DuplicateDataSourceError(source=data_src) 141 | 142 | return matches[0] 143 | 144 | def get_pipeline_report_url(self, pipeline): 145 | """Return a URL to a report on this data.""" 146 | 147 | return self.config['goodtables']['goodtables_web'] 148 | 149 | def get_pipeline_score(self, pipeline, source): 150 | """Return a score for this pipeline run.""" 151 | 152 | score = self.max_score 153 | report = pipeline.report.generate() 154 | error_stats = self.get_error_stats(report) 155 | base_errors = {err: stats for err, stats in error_stats.items() 156 | if stats['processor'] == 'base'} 157 | if base_errors: 158 | score = 0 159 | else: 160 | score = self.score_by_error_occurences(error_stats) 161 | if self.assess_timeliness: 162 | publication_delay = self.get_publication_delay(source) 163 | score -= publication_delay 164 | score = round(score) 165 | if score < 0: 166 | score = 0 167 | self.all_scores.append(score) 168 | return score 169 | 170 | def get_publication_delay(self, source): 171 | """Determine how long the data source publication was delayed""" 172 | 173 | dates = {} 174 | relevance_period = source['period_id'].split('/') 175 | relevance_period = relevance_period + [None]*(2 - len(relevance_period)) 176 | dates['period_start'], dates['period_end'] = relevance_period 177 | dates = {k: utilities.date_from_string(v) for k, v in dates.items()} 178 | dates['period_end'] = dates['period_end'] or dates['period_start'] 179 | timely_until = dates['period_end'] + \ 180 | timedelta(days=(self.timeliness_period * 30)) 181 | if dates['period_start'] <= source['created_at'] <= timely_until: 182 | delay = 0 183 | else: 184 | delay = source['created_at'] - timely_until 185 | delay = delay.days 
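        # Hedged worked example (hypothetical dates): with a period ending
        # 2015-12-31 and timeliness_period=1, timely_until is 2016-01-30; a
        # source created on 2016-03-15 is 45 days late, so 45 / 30.0 = 1.5
        # points are later subtracted from the score in get_pipeline_score.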
186 | if delay < 0: 187 | delay = 0 188 | delay = delay / 30.00 189 | return delay 190 | 191 | def get_error_stats(self, report): 192 | """Return dict with stats on errors""" 193 | 194 | results = report['results'] 195 | dq_spec = utilities.get_data_quality_spec() 196 | error_stats = {} 197 | for result in results: 198 | if result['result_level'] == 'error': 199 | error = error_stats.get(result['result_id'], None) 200 | if not error: 201 | if result['processor'] == 'base': 202 | error_spec = {} 203 | else: 204 | error_number = result['result_id'].split('_')[-1] 205 | error_number = str(int(error_number) - 1) 206 | error_spec = dq_spec[result['processor']][error_number] 207 | new_stats = {'occurrences': 1, 'rows': [result['row_index']], 208 | 'weight': error_spec.get('weight', 1), 209 | 'processor': result['processor']} 210 | error_stats[result['result_id']] = new_stats 211 | else: 212 | error['occurrences'] += 1 213 | error['rows'].append(result['row_index']) 214 | return error_stats 215 | 216 | def score_by_error_occurences(self, error_stats): 217 | """Score data source based on based on number of occurrences of each error 218 | Algorithm: `total score - (error_weight * no_occurrences) / 219 | (Σ 1/no_occurrences )` 220 | 221 | Args: 222 | error_stats: dict with stats on each error 223 | """ 224 | 225 | score = self.max_score 226 | for error, stats in error_stats.items(): 227 | no_occurrences = stats['occurrences'] 228 | harmonic_mean_occ = no_occurrences / harmonic_number(no_occurrences) 229 | error_impact = stats['weight'] * harmonic_mean_occ 230 | score -= error_impact 231 | return score 232 | 233 | def harmonic_number(n): 234 | """Return an approximate value of n-th harmonic number, based on the 235 | Euler-Mascheroni constant by the formula: H(n)≈ln(n)+γ+1/2*n−1/12*n^2 236 | """ 237 | 238 | gamma = 0.57721566490153286 239 | return gamma + log(n) + 0.5/n - 1./(12*n**2) 240 | -------------------------------------------------------------------------------- /data_quality/tasks/assess_performance.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import pytz 8 | import dateutil 9 | import datetime 10 | import jsontableschema 11 | from data_quality import utilities, compat 12 | from .base_task import Task 13 | from .check_datapackage import DataPackageChecker 14 | 15 | 16 | class PerformanceAssessor(Task): 17 | 18 | """A Task runner to assess and write the performance of publishers for each 19 | period. 
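    Rough usage sketch (hedged; mirrors the call in main.run):

        assessor = PerformanceAssessor(config)
        assessor.run()  # rewrites performance_file from publishers, sources and results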
20 | """ 21 | 22 | def __init__(self, *args, **kwargs): 23 | super(PerformanceAssessor, self).__init__(*args, **kwargs) 24 | datapackage_check = DataPackageChecker(self.config) 25 | datapackage_check.run() 26 | required_resources = [self.result_file, self.source_file, 27 | self.publisher_file, self.run_file] 28 | datapackage_check.check_database_completeness(required_resources) 29 | 30 | def run(self): 31 | """Write the performance for all publishers.""" 32 | 33 | publisher_ids = self.get_publishers() 34 | performance_resource = utilities.get_datapackage_resource(self.performance_file, 35 | self.datapackage) 36 | performance_schema = jsontableschema.model.SchemaModel(performance_resource.descriptor['schema']) 37 | 38 | with compat.UnicodeWriter(self.performance_file) as performance_file: 39 | performance_file.writerow(performance_schema.headers) 40 | available_periods = [] 41 | 42 | for publisher_id in publisher_ids: 43 | sources = self.get_sources(publisher_id) 44 | periods = self.get_unique_periods(sources) 45 | available_periods += periods 46 | all_periods = self.get_all_periods(available_periods) 47 | 48 | publishers_performances = [] 49 | all_sources = [] 50 | 51 | for publisher_id in publisher_ids: 52 | sources = self.get_sources(publisher_id) 53 | performances = self.get_periods_data(publisher_id, all_periods, 54 | sources) 55 | publishers_performances += performances 56 | all_sources += sources 57 | for row in utilities.dicts_to_schema_rows(performances, 58 | performance_schema): 59 | performance_file.writerow(row) 60 | 61 | all_performances = self.get_periods_data('all', all_periods, all_sources) 62 | for row in utilities.dicts_to_schema_rows(all_performances, 63 | performance_schema): 64 | performance_file.writerow(row) 65 | 66 | def get_publishers(self): 67 | """Return list of publishers ids.""" 68 | 69 | publisher_ids = [] 70 | 71 | with compat.UnicodeDictReader(self.publisher_file) as publishers_file: 72 | for row in publishers_file: 73 | publisher_ids.append(row['id']) 74 | return publisher_ids 75 | 76 | def get_sources(self, publisher_id): 77 | """Return list of sources of a publisher with id, period and score. """ 78 | 79 | sources = [] 80 | 81 | with compat.UnicodeDictReader(self.source_file) as sources_file: 82 | for row in sources_file: 83 | source = {} 84 | if row['publisher_id'] == publisher_id: 85 | source['id'] = row['id'] 86 | source['created_at'] = utilities.date_from_string(row['created_at']) 87 | source['score'] = self.get_source_score(source['id']) 88 | sources.append(source) 89 | return sources 90 | 91 | def get_source_score(self, source_id): 92 | """Return latest score of a source from results. 93 | 94 | Args: 95 | source_id: id of the source whose score is wanted 96 | """ 97 | 98 | score = 0 99 | latest_timestamp = pytz.timezone('UTC').localize(datetime.datetime.min) 100 | 101 | with compat.UnicodeDictReader(self.result_file) as result_file: 102 | for row in result_file: 103 | if row['source_id'] == source_id: 104 | timestamp = dateutil.parser.parse(row['timestamp']) 105 | if timestamp > latest_timestamp: 106 | latest_timestamp = timestamp 107 | score = int(row['score']) 108 | return score 109 | 110 | def get_periods_data(self, publisher_id, periods, sources): 111 | """Return list of performances for a publisher, by period. 
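        Each item in the returned list is one performance row; a hedged sketch
        with hypothetical values:

            {'publisher_id': 'pub-1', 'month_of_creation': '2016-01-01',
             'files_count': 2, 'score': 87, 'valid': 50,
             'files_count_to_date': 5, 'score_to_date': 90, 'valid_to_date': 60}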
112 | 113 | Args: 114 | publisher_id: publisher in dicussion 115 | periods: list of all available_periods 116 | sources: list of publisher's sources 117 | 118 | """ 119 | 120 | performances = [] 121 | period_sources_to_date = [] 122 | 123 | for period in periods: 124 | period_sources = self.get_period_sources(period, sources) 125 | period_sources_to_date += period_sources 126 | performance = {} 127 | performance['publisher_id'] = publisher_id 128 | performance['month_of_creation'] = compat.str(period) 129 | performance['files_count'] = len(period_sources) 130 | performance['score'] = self.get_period_score(period_sources) 131 | performance['valid'] = self.get_period_valid(period_sources) 132 | performance['score_to_date'] = self.get_period_score(period_sources_to_date) 133 | performance['valid_to_date'] = self.get_period_valid(period_sources_to_date) 134 | performance['files_count_to_date'] = len(period_sources_to_date) 135 | performances.append(performance) 136 | return performances 137 | 138 | def get_period_sources(self, period, sources): 139 | """Return list of sources for a period. 140 | 141 | Args: 142 | period: a date object 143 | sources: list of sources 144 | 145 | """ 146 | 147 | period_sources = [] 148 | 149 | for source in sources: 150 | if period == source['created_at'].replace(day=1): 151 | period_sources.append(source) 152 | return period_sources 153 | 154 | def get_period_score(self, period_sources): 155 | """Return average score from list of sources. 156 | 157 | Args: 158 | period_sources: sources correspoding to a certain period 159 | """ 160 | 161 | score = 0 162 | 163 | if len(period_sources) > 0: 164 | total = 0 165 | for source in period_sources: 166 | total += int(source['score']) 167 | score = int(round(total / len(period_sources))) 168 | return score 169 | 170 | def get_period_valid(self, period_sources): 171 | """Return valid percentage from list of sources. 172 | 173 | Args: 174 | period_sources: sources correspoding to a certain period 175 | """ 176 | 177 | valid = 0 178 | if len(period_sources) > 0: 179 | valids = [] 180 | for source in period_sources: 181 | if int(source['score']) == 100: 182 | valids.append(source) 183 | if valids: 184 | valid = int(round(len(valids) / len(period_sources) * 100)) 185 | return valid 186 | 187 | def get_unique_periods(self, sources): 188 | """Return list of unique periods as date objects from sources. 189 | 190 | Args: 191 | sources: a list of sources 192 | 193 | """ 194 | 195 | periods = [] 196 | for source in sources: 197 | periods.append(source['created_at']) 198 | periods = list(set(periods)) 199 | return periods 200 | 201 | def get_all_periods(self, periods): 202 | """Return all periods from oldest in periods to now. 
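        Illustrative sketch (hypothetical input): for periods falling in
        2016-01 and 2016-03, the result is [date(2016, 1, 1), date(2016, 2, 1),
        ...] continuing month by month up to the current month.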
203 | 204 | Args: 205 | periods: list of date objects 206 | 207 | """ 208 | 209 | oldest_date = sorted(periods)[0] 210 | oldest_date = oldest_date.replace(day=1) 211 | current_date = datetime.date.today() 212 | delta = dateutil.relativedelta.relativedelta(months=1) 213 | relative_date = oldest_date 214 | all_periods = [] 215 | 216 | while relative_date <= current_date: 217 | all_periods.append(relative_date) 218 | relative_date += delta 219 | return all_periods 220 | -------------------------------------------------------------------------------- /data_quality/tasks/base_task.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | import datapackage 9 | 10 | 11 | class Task(object): 12 | 13 | """Base class for Data Quality CLI tasks.""" 14 | 15 | def __init__(self, config, **kwargs): 16 | self.config = config 17 | self.remotes = self.config['remotes'] 18 | self.branch = self.config['branch'] 19 | self.data_dir = self.config['data_dir'] 20 | self.result_file = os.path.join(self.data_dir, self.config['result_file']) 21 | self.run_file = os.path.join(self.data_dir, self.config['run_file']) 22 | self.source_file = os.path.join(self.data_dir, self.config['source_file']) 23 | self.performance_file = os.path.join(self.data_dir, 24 | self.config['performance_file']) 25 | self.publisher_file = os.path.join(self.data_dir, 26 | self.config['publisher_file']) 27 | self.cache_dir = self.config['cache_dir'] 28 | self.data_key = self.config['goodtables']['arguments']['batch']['data_key'] 29 | datapkg_file_path = self.config.get('datapackage_file', 'datapackage.json') 30 | if not os.path.isabs(datapkg_file_path): 31 | datapkg_file_path = os.path.join(os.path.dirname(self.data_dir), 32 | datapkg_file_path) 33 | try: 34 | self.datapackage = datapackage.DataPackage(datapkg_file_path) 35 | except datapackage.exceptions.DataPackageException as e: 36 | raise ValueError(('A datapackage couldn\'t be created because of the ' 37 | 'following error: "{0}". Make sure the file is not ' 38 | 'empty and use "dq init" command.').format(e)) 39 | self.all_scores = [] -------------------------------------------------------------------------------- /data_quality/tasks/check_datapackage.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | from jsontableschema.model import SchemaModel 9 | from goodtables import pipeline 10 | from data_quality import utilities 11 | from . 
import Task 12 | 13 | 14 | class DataPackageChecker(Task): 15 | 16 | """A task runner to check that the data package is correct""" 17 | 18 | def __init__(self, config, inflexible_resources=[]): 19 | super(DataPackageChecker, self).__init__(config) 20 | self.inflexible_resources = ['run_file', 'result_file', 'performance_file'] 21 | self.inflexible_resources.extend(inflexible_resources) 22 | self.inflexible_resources = set(inflexible_resources) 23 | 24 | def run(self): 25 | """Check user datapackage against default datapackage""" 26 | 27 | default_datapkg = utilities.get_default_datapackage() 28 | for default_resource in default_datapkg.resources: 29 | resource_path = os.path.join(self.config['data_dir'], 30 | self.config[default_resource.descriptor['name']]) 31 | resource = utilities.get_datapackage_resource(resource_path, 32 | self.datapackage) 33 | self.check_resource_schema(default_resource, resource) 34 | 35 | def check_resource_schema(self, default_resource, resource): 36 | """Check that user resource schema contains all the mandatory fields""" 37 | 38 | def get_uncustomizable_fields(schema): 39 | uncustomizable = ['constraints', 'format', 'name', 'type'] 40 | field_filter = lambda field: {key: val for key, val in field.items() 41 | if key in uncustomizable} 42 | fields = [field_filter(field) for field in schema.fields] 43 | fields = sorted(fields, key=lambda k: k['name']) 44 | 45 | resource_schema = SchemaModel(resource.descriptor['schema']) 46 | default_schema_dict = default_resource.descriptor['schema'] 47 | if default_resource.descriptor['name'] == 'source_file': 48 | for field in default_schema_dict['fields']: 49 | if field['name'] == 'data': 50 | field['name'] = self.data_key 51 | default_schema = SchemaModel(default_schema_dict) 52 | 53 | if default_resource.descriptor['name'] in self.inflexible_resources: 54 | if get_uncustomizable_fields(default_schema) != \ 55 | get_uncustomizable_fields(resource_schema): 56 | msg = ('The fields for "{0}" are not subject to' 57 | 'change').format(resource.local_data_path) 58 | raise ValueError(msg, resource.local_data_path) 59 | else: 60 | required_headers = set(default_schema.required_headers) 61 | resource_headers = set(resource_schema.headers) 62 | if not required_headers.issubset(resource_headers): 63 | missing_headers = required_headers.difference(resource_headers) 64 | msg = ('Fields [{0}] are needed for internal processing' 65 | 'but are missing from {1}.' 66 | ).format(','.join(missing_headers), resource.local_data_path) 67 | raise ValueError(msg, resource.local_data_path) 68 | 69 | def check_database_content(self): 70 | """Check that the database content is compliant with the datapackage""" 71 | 72 | self.run() 73 | for resource in self.datapackage.resources: 74 | resource_path = resource.local_data_path 75 | if os.path.exists(resource_path): 76 | options = {'schema': {'schema': resource.descriptor['schema']}} 77 | pipe = pipeline.Pipeline(resource_path, processors=['schema'], 78 | options=options) 79 | result, report = pipe.run() 80 | if result is False: 81 | issues = [res['result_message'] for res in report.generate()['results']] 82 | msg = ('The file {0} is not compliant with the schema ' 83 | 'you declared for it in "datapackage.json".' 
84 | 'Errors: {1}' 85 | ).format(resource_path, ';'.join(issues)) 86 | raise ValueError(msg) 87 | 88 | def check_database_completeness(self, required_resources=None): 89 | """Checks that 'required_resources', or all necessary ones exist in the database 90 | 91 | Args: 92 | required_resources: list of paths to required resources 93 | """ 94 | 95 | all_resources = [res.local_data_path for res in self.datapackage.resources] 96 | resources = required_resources or all_resources 97 | for resource_file in resources: 98 | if not os.path.exists(resource_file): 99 | msg = ('The file "{0}" is needed but it doesn\'t exist.' 100 | 'Please create it or use "dq generate".' 101 | ).format(resource_file) 102 | raise ValueError(msg) 103 | -------------------------------------------------------------------------------- /data_quality/tasks/deploy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | import io 9 | import subprocess 10 | import contextlib 11 | from time import strftime, gmtime 12 | import json 13 | from data_quality import compat 14 | from .base_task import Task 15 | from .check_datapackage import DataPackageChecker 16 | 17 | @contextlib.contextmanager 18 | def cd(path): 19 | """Move into a dir while the context is active.""" 20 | workpath = os.getcwd() 21 | os.chdir(path) 22 | yield 23 | os.chdir(workpath) 24 | 25 | 26 | class Deployer(Task): 27 | 28 | """A Task runner to deploy a Data Quality repository to a remote.""" 29 | 30 | commit_msg = 'New result and run data.' 31 | tag_msg = 'New result and run data.' 32 | tag_version = '' 33 | 34 | def run(self, simulate=False, *args): 35 | """Commit and deploy changes.""" 36 | 37 | datapackage_check = DataPackageChecker(self.config) 38 | datapackage_check.run() 39 | self._pull() 40 | self.update_last_modified() 41 | datapackage_check.check_database_completeness() 42 | datapackage_check.check_database_content() 43 | self._add() 44 | self._commit() 45 | if simulate: 46 | return True 47 | # self._tag() 48 | self._push() 49 | 50 | def _pull(self): 51 | """Pull in any changes from remotes.""" 52 | 53 | with cd(self.config['data_dir']): 54 | 55 | for remote in self.remotes: 56 | # fetch 57 | command = ['git', 'fetch', remote, self.branch] 58 | subprocess.call(command) 59 | # merge; prefer ours 60 | command = ['git', 'merge', '-s', 'recursive', '-X', 'ours', 61 | '{0}/{1}'.format(remote, self.branch)] 62 | subprocess.call(command) 63 | 64 | def _add(self): 65 | """Add the changed files to the git index.""" 66 | 67 | with cd(self.config['data_dir']): 68 | 69 | # add the changed files 70 | command = ['git', 'add', self.result_file] 71 | subprocess.call(command) 72 | command = ['git', 'add', self.run_file] 73 | subprocess.call(command) 74 | 75 | def _commit(self): 76 | 77 | with cd(self.config['data_dir']): 78 | command = ['git', 'commit', '-a', '-m', '{0}'.format(self.commit_msg)] 79 | subprocess.call(command) 80 | 81 | def _tag(self): 82 | with cd(self.config['data_dir']): 83 | command = ['git', 'tag', '-a', self.tag_version, '-m', '{0}'.format(self.tag_msg)] 84 | subprocess.call(command) 85 | 86 | def _push(self): 87 | 88 | with cd(self.config['data_dir']): 89 | command = ['git', 'push', '--follow-tags'] 90 | subprocess.call(command) 91 | 92 | def update_last_modified(self): 93 | """Update the 'last_modified' field 
in datapackage.json""" 94 | 95 | datapackage_path = os.path.join(self.datapackage.base_path, 96 | 'datapackage.json') 97 | 98 | with io.open(datapackage_path, mode='w+', encoding='utf-8') as datapkg_file: 99 | current_time = strftime("%Y-%m-%d %H:%M:%S %Z", gmtime()) 100 | self.datapackage.descriptor['last_modified'] = current_time 101 | updated_datapkg = json.dumps(self.datapackage.to_dict(), indent=4, 102 | sort_keys=True) 103 | datapkg_file.write(compat.str(updated_datapkg)) 104 | -------------------------------------------------------------------------------- /data_quality/tasks/extract_relevance_period.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import re 8 | import datetime 9 | from dateparser.date import DateDataParser 10 | from jsontableschema.model import SchemaModel 11 | from data_quality import utilities, compat, exceptions 12 | from .base_task import Task 13 | from .check_datapackage import DataPackageChecker 14 | 15 | class RelevancePeriodExtractor(Task): 16 | 17 | """A Task runner that extracts the period a sources's content reffers to 18 | (is relevant for). 19 | """ 20 | 21 | def __init__(self, config): 22 | super(RelevancePeriodExtractor, self).__init__(config) 23 | timeliness_params = self.config['timeliness'] 24 | self.extract_period = timeliness_params.get('extract_period', False) 25 | self.timeliness_strategy = timeliness_params.get('timeliness_strategy', []) 26 | self.date_order = timeliness_params.get('date_order', 'DMY') 27 | self.max_empty_relevance_period = timeliness_params.get('max_empty_relevance_period', 10) 28 | if not self.timeliness_strategy: 29 | raise ValueError('You need to provide values for "timeliness_strategy."') 30 | datapackage_check = DataPackageChecker(self.config) 31 | datapackage_check.check_database_completeness([self.source_file]) 32 | settings = {'RETURN_AS_TIMEZONE_AWARE': False, 33 | 'PREFER_DAY_OF_MONTH': 'last', 34 | 'PREFER_DATES_FROM': 'past', 35 | 'SKIP_TOKENS': ['to'], 36 | 'DATE_ORDER': self.date_order} 37 | self.date_parser = DateDataParser(allow_redetect_language=True, 38 | settings=settings) 39 | 40 | def run(self): 41 | """Try to indentify the relevance period of sources""" 42 | 43 | sources = self.extract_period_from_sources() 44 | empty_period_sources = [source for source in sources 45 | if source['period_id'] is None] 46 | empty_period_percent = (len(empty_period_sources) * 100) / len(sources) 47 | empty_period_percent = round(empty_period_percent) 48 | if empty_period_percent > int(self.max_empty_relevance_period): 49 | msg = ('The relevance period couldn\'t be identified for' 50 | ' {0}% of sources therefore timeliness cannot be' 51 | ' assessed. 
Please provide more fields for "timeliness_' 52 | 'strategy", set "assess_timeliness" to false or increase' 53 | ' "max_empty_relevance_period".').format(empty_period_percent) 54 | raise exceptions.UnableToAssessTimeliness(msg) 55 | 56 | for source in sources: 57 | if source['period_id'] is None: 58 | creation_date = utilities.date_from_string(source['created_at']) 59 | dates = [creation_date, creation_date] 60 | else: 61 | period_start, period_end = source['period_id'] 62 | dates = [period_start.date(), period_end.date()] 63 | dates = [date.strftime('%d-%m-%Y') if isinstance(date, datetime.date) 64 | else '' for date in dates] 65 | source['period_id'] = '/'.join(dates) 66 | self.update_sources_period(sources) 67 | 68 | def extract_period_from_sources(self): 69 | """Try to extract relevance period for each source or return None""" 70 | 71 | sources = [] 72 | with compat.UnicodeDictReader(self.source_file) as source_file: 73 | timeliness_set = set(self.timeliness_strategy) 74 | found_fields = timeliness_set.intersection(set(source_file.header)) 75 | if not found_fields: 76 | raise ValueError(('At least one of the "timeliness_strategy" ' 77 | 'fields must be present in your "source_file".')) 78 | if not found_fields.issuperset(timeliness_set): 79 | missing_fields = timeliness_set.difference(found_fields) 80 | print(('Fields "{0}" from "timeliness_strategy" were not found ' 81 | 'in your `source_file`').format(missing_fields)) 82 | 83 | for source in source_file: 84 | timeliness_fields = {field: val for field, val in source.items() 85 | if field in self.timeliness_strategy} 86 | extracted_period = self.identify_period(timeliness_fields) 87 | source['period_id'] = extracted_period 88 | sources.append(source) 89 | return sources 90 | 91 | def identify_period(self, source={}): 92 | """Try to indentify the period of a source based on timeliess strategy 93 | 94 | Args: 95 | source: a dict corresponding to a source_file row 96 | """ 97 | 98 | field_dates = {} 99 | for field in self.timeliness_strategy: 100 | value = source.get(field, '') 101 | if not value: 102 | continue 103 | field_dates[field] = self.extract_dates(value) 104 | 105 | for field in self.timeliness_strategy: 106 | dates = field_dates.get(field, []) 107 | if not dates: 108 | continue 109 | period = resolve_period(dates) 110 | if period: 111 | break 112 | else: 113 | # It means we have more than 2 dates 114 | other_fields = list(self.timeliness_strategy) 115 | other_fields.remove(field) 116 | other_values = [field_dates.get(other_field, []) 117 | for other_field in other_fields] 118 | for values in other_values: 119 | date_objects = set(date['date_obj'] for date in dates) 120 | common_values = [date for date in values 121 | if date['date_obj'] in date_objects] 122 | period = resolve_period(common_values) 123 | if period: 124 | break 125 | else: 126 | period = None 127 | return period 128 | 129 | def extract_dates(self, line=""): 130 | """Try to extract dates from a line 131 | 132 | Args: 133 | line: a string that could contain a date or time range 134 | """ 135 | 136 | dates = [] 137 | potential_dates = re.findall(r'[0-9]+[\W_][0-9]+[\W_][0-9]+', line) 138 | line_words = re.sub(r'[\W_]+', ' ', line).split() 139 | years = filter_years(line_words) 140 | for word in years: 141 | if re.search(r'[a-zA-Z]', word): 142 | potential_dates.append(word) 143 | break 144 | for index, entry in enumerate(line_words): 145 | if entry == word: 146 | date = self.scan_for_date(line_words, index) 147 | if date: 148 | potential_dates.append(date) 149 | # 
Try to find a range 150 | if date['period'] != 'year' and date['date_obj']: 151 | range_start = self.scan_for_range(line_words, index, date) 152 | if not range_start: 153 | continue 154 | if range_start['date_obj'] < date['date_obj']: 155 | potential_dates.append(range_start) 156 | 157 | for potential_date in potential_dates: 158 | try: 159 | dates.append(self.date_parser.get_date_data(potential_date)) 160 | except TypeError: 161 | if isinstance(potential_date, dict): 162 | dates.append(potential_date) 163 | except ValueError: 164 | potential_date = None 165 | dates = [date for date in dates if date['date_obj'] is not None] 166 | dates = list({date['date_obj']:date for date in dates}.values()) 167 | return dates 168 | 169 | def scan_for_date(self, line_words, year_index): 170 | """Scan around the year for a date as complete as possible 171 | 172 | Args: 173 | line_words: a list of words (strings) 174 | year_index: index of a string from line_word that contains a year 175 | """ 176 | 177 | date_parts = line_words[year_index-2:year_index+1] or \ 178 | line_words[:year_index+1] 179 | potential_date = self.create_date_from_parts(date_parts) 180 | if not potential_date or potential_date['period'] == 'year': 181 | new_parts = list(reversed(line_words[year_index:year_index+3])) 182 | new_potential_date = self.create_date_from_parts(new_parts) 183 | if new_potential_date: 184 | potential_date = new_potential_date 185 | return potential_date 186 | 187 | def scan_for_range(self, line_words, year_index, range_end): 188 | """Scan to the left of the year whose corresponding date has 189 | been extracted to see if there is a range. 190 | 191 | Args: 192 | line_words: a list of words (strings) 193 | year_index: index of a string from line_word that contains a year 194 | range_end: date that has already been extracted from the year at 195 | year_index, potentially end of range 196 | """ 197 | 198 | if range_end['period'] == 'month': 199 | scan_start = year_index-2 200 | scan_end = year_index-4 201 | else: 202 | scan_start = year_index-3 203 | scan_end = year_index-5 204 | range_start_parts = line_words[scan_end:scan_start+1] or \ 205 | line_words[:scan_start+1] 206 | range_start_parts = [part for part in range_start_parts 207 | if self.create_date_from_parts([part]) is not None] 208 | years = filter_years(range_start_parts) 209 | if years: 210 | range_start_parts = [] 211 | if range_start_parts: 212 | if len(range_start_parts) == 1 and range_end['period'] == 'day': 213 | range_start_parts.append(compat.str(range_end['date_obj'].month)) 214 | range_start_parts.append(compat.str(range_end['date_obj'].year)) 215 | range_start = self.create_date_from_parts(range_start_parts) 216 | if range_start and range_start['period'] != range_end['period']: 217 | range_start = None 218 | return range_start 219 | 220 | def create_date_from_parts(self, date_parts=None): 221 | """Try to create a date object with date_parser or return None.""" 222 | 223 | if not date_parts: 224 | return None 225 | for index, part in enumerate(date_parts): 226 | if len(date_parts) == 2: 227 | if False not in [el.isdigit() for el in date_parts]: 228 | date_parts.insert(index, '31') 229 | potential_date = ' '.join(date_parts[index:]) 230 | try: 231 | date = self.date_parser.get_date_data(potential_date) 232 | except (ValueError, TypeError): 233 | date = None 234 | if date and date.get('date_obj') is not None: 235 | break 236 | else: 237 | date = None 238 | return date 239 | 240 | def update_sources_period(self, new_sources): 241 | """Overwrite 
source_file with the identified period_id""" 242 | 243 | source_resource = utilities.get_datapackage_resource(self.source_file, 244 | self.datapackage) 245 | source_idx = self.datapackage.resources.index(source_resource) 246 | source_schema_dict = self.datapackage.resources[source_idx].descriptor['schema'] 247 | updates = {'fields':[{'name': 'period_id', 'type': 'string', 248 | 'title': 'The period source data is relevant for.'}]} 249 | utilities.deep_update_dict(source_schema_dict, updates) 250 | source_schema = SchemaModel(source_schema_dict) 251 | 252 | with compat.UnicodeWriter(self.source_file) as source_file: 253 | source_file.writerow(source_schema.headers) 254 | for row in utilities.dicts_to_schema_rows(new_sources, 255 | source_schema): 256 | source_file.writerow(row) 257 | 258 | def resolve_period(dates=None): 259 | """Given a list of dates, try to create a period tuple or return None""" 260 | 261 | if not dates: 262 | period = None 263 | elif len(dates) == 1: 264 | period = period_from_date(dates[0]) 265 | elif len(dates) == 2: 266 | date_objects = sorted([date['date_obj'] for date in dates]) 267 | if dates[0]['period'] == 'year': 268 | date_objects[0] = date_objects[0].replace(month=1, day=1) 269 | if dates[1]['period'] == 'year': 270 | date_objects[1] = date_objects[1].replace(month=12, day=31) 271 | if dates[0]['period'] == 'month': 272 | date_objects[0] = date_objects[0].replace(day=1) 273 | period = (date_objects[0], date_objects[1]) 274 | else: 275 | period = None 276 | return period 277 | 278 | def period_from_date(date={}): 279 | """Create a period from a `dateparser` date dict""" 280 | 281 | if date.get('date_obj', None) is None: 282 | return None 283 | if date['period'] == 'day': 284 | range_start = date['date_obj'] 285 | range_end = date['date_obj'].replace(hour=23, minute=59) 286 | elif date['period'] == 'month': 287 | range_start = date['date_obj'].replace(day=1) 288 | range_end = date['date_obj'] 289 | else: 290 | range_start = datetime.datetime(date['date_obj'].year, 1, 1) 291 | range_end = datetime.datetime(date['date_obj'].year, 12, 31) 292 | return (range_start, range_end) 293 | 294 | def filter_years(words_list): 295 | """Filter strings that could contain a year from a list of words""" 296 | 297 | condition = lambda x: re.search(r'(?:19|20)[0-9]{2}', x) 298 | filtered_list = [word for word in filter(condition, words_list)] 299 | return filtered_list 300 | -------------------------------------------------------------------------------- /data_quality/tasks/generate.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | import io 9 | import json 10 | import importlib 11 | from data_quality import generators, utilities, compat 12 | from .base_task import Task 13 | from .check_datapackage import DataPackageChecker 14 | 15 | 16 | class GeneratorManager(Task): 17 | 18 | """A Task runner that manages dataset generators (ex: CkanGenerator).""" 19 | 20 | def __init__(self, config): 21 | super(GeneratorManager, self).__init__(config) 22 | datapackage_check = DataPackageChecker(self.config) 23 | datapackage_check.run() 24 | 25 | def run(self, generator_name, endpoint, generator_path, file_types, simulate=False): 26 | """Delegate the generation processes to the chosen generator 27 | Args: 28 | generator_name: Name of the generator 
(ex: ckan) 29 | endpoint: Url where the generator should get the data from 30 | generator_path: Path to the custom generator class, if used 31 | file_types: List of file types that should be included in sources 32 | """ 33 | 34 | if generators._built_in_generators.get(generator_name, None): 35 | inflexible_resources = ['source_file', 'publisher_file'] 36 | datapackage_check = DataPackageChecker(self.config, inflexible_resources) 37 | try: 38 | datapackage_check.run() 39 | except ValueError as e: 40 | msg = ('Looks like you have a custom schema for "{0}". Generator ' 41 | '"{1}" only works with the default schema. Please use a ' 42 | 'custom generator or match your schema to the default one.' 43 | ).format(e[1], generator_name) 44 | raise ValueError(msg) 45 | 46 | generator_class = generators._built_in_generators[generator_name] 47 | else: 48 | try: 49 | _module, _class = generator_path.rsplit('.', 1) 50 | generator_class = getattr(importlib.import_module(_module), _class) 51 | except ValueError: 52 | raise ValueError(('The path you provided for the generator class is ' 53 | 'not valid. Should be of type `mymodule.MyGenerator`')) 54 | generator = generator_class(endpoint, self.datapackage) 55 | 56 | if simulate: 57 | return generator 58 | 59 | generator.generate_sources(self.source_file, file_types=file_types) 60 | generator.generate_publishers(self.publisher_file) 61 | 62 | def update_datapackage_sources(self): 63 | """Update the 'sources' property of datapackage with the new sources""" 64 | 65 | datapackage_check = DataPackageChecker(self.config) 66 | required_resources = [self.source_file, self.publisher_file] 67 | datapackage_check.check_database_completeness(required_resources) 68 | datapackage_check.run() 69 | self.datapackage.descriptor['sources'] = [] 70 | datapkg_path = os.path.join(self.datapackage.base_path, 'datapackage.json') 71 | 72 | with compat.UnicodeDictReader(self.source_file) as sources_file: 73 | for source in sources_file: 74 | src_info = {'name': source['title'], 'web': source[self.data_key]} 75 | self.datapackage.descriptor['sources'].append(src_info) 76 | 77 | with io.open(datapkg_path, mode='w+', encoding='utf-8') as datapkg_file: 78 | new_datapkg = json.dumps(self.datapackage.to_dict(), indent=4, 79 | sort_keys=True) 80 | datapkg_file.write(compat.str(new_datapkg)) 81 | -------------------------------------------------------------------------------- /data_quality/tasks/initialize_datapackage.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | import io 9 | import json 10 | import datapackage 11 | from data_quality import utilities, compat 12 | from .check_datapackage import DataPackageChecker 13 | 14 | 15 | class DataPackageInitializer(object): 16 | 17 | """A task runner that makes a data-quality style data package from a 18 | given workspace folder 19 | """ 20 | 21 | def __init__(self, workspace_path): 22 | self.workspace_path = workspace_path 23 | 24 | def run(self): 25 | """Initialize all necessary files and folders""" 26 | 27 | config = self.initialize_config() 28 | utilities.resolve_dir(config['data_dir']) 29 | utilities.resolve_dir(config['cache_dir']) 30 | self.initialize_datapackage(config) 31 | 32 | def initialize_config(self): 33 | """Create a config for this instance or use the existing one""" 34 | 35 | 
init_config_path = os.path.join(self.workspace_path, 'dq_config.json') 36 | 37 | if os.path.exists(init_config_path): 38 | config = utilities.load_json_config(init_config_path) 39 | else: 40 | config = utilities.load_json_config(None) 41 | 42 | with io.open(init_config_path, mode='w+', encoding='utf-8') as new_config: 43 | new_json_config = json.dumps(config, indent=4, sort_keys=True) 44 | new_config.write(compat.str(new_json_config)) 45 | print(('A new config file has been created at {0}. ' 46 | 'Please review and update it.'.format(init_config_path))) 47 | return config 48 | 49 | def initialize_datapackage(self, config): 50 | """Create a datapackage or return the existing one along with it's path""" 51 | 52 | datapkg_file_path = config.get('datapackage_file', '') 53 | if not datapkg_file_path or not os.path.isabs(datapkg_file_path): 54 | datapkg_file_path = os.path.join(self.workspace_path, 'datapackage.json') 55 | 56 | datapkg_file_path = os.path.abspath(datapkg_file_path) 57 | if not os.path.exists(datapkg_file_path): 58 | with io.open(datapkg_file_path, mode='w+', encoding='utf-8') as new_datapkg: 59 | default_datapkg = utilities.get_default_datapackage() 60 | for resource in default_datapkg.resources: 61 | resource_path = config.get(resource.descriptor['name'], 62 | resource.descriptor['path']) 63 | resource.descriptor['path'] = os.path.join(config['data_dir'], 64 | resource_path) 65 | json_datapkg = json.dumps(default_datapkg.to_dict(), indent=4) 66 | new_datapkg.write(compat.str(json_datapkg)) 67 | print(('A new "datapackage.json" file has been created at {0}. ' 68 | 'Please review and update it.'.format(datapkg_file_path))) 69 | return default_datapkg 70 | else: 71 | datapackage_check = DataPackageChecker(config) 72 | datapackage_check.run() 73 | return datapackage.DataPackage(datapkg_file_path) 74 | 75 | -------------------------------------------------------------------------------- /data_quality/utilities.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import os 9 | import json 10 | import shutil 11 | import dateutil 12 | import requests 13 | import collections 14 | import datapackage 15 | import jsontableschema 16 | import pkg_resources 17 | 18 | def set_up_cache_dir(cache_dir_path): 19 | """Reset /cache_dir before a new batch.""" 20 | 21 | if os.path.lexists(cache_dir_path): 22 | for root, dirs, files in os.walk(cache_dir_path): 23 | for contained_file in files: 24 | os.unlink(os.path.join(root, contained_file)) 25 | 26 | for directory in dirs: 27 | shutil.rmtree(os.path.join(root, directory)) 28 | 29 | def resolve_dir(dir_path): 30 | """ Make sure the dir_path given in the config exists 31 | 32 | Args: 33 | dir_path: path of directory from config that should be resolved 34 | """ 35 | 36 | try: 37 | os.makedirs(dir_path) 38 | except OSError: 39 | if not os.path.isdir(dir_path): 40 | raise 41 | return dir_path 42 | 43 | def resolve_dir_name(config_filepath, dir_path): 44 | """Create an absolute path from the file path and the path given in the config""" 45 | 46 | if not os.path.isabs(dir_path): 47 | config_path = os.path.abspath(os.path.dirname(config_filepath)) 48 | return os.path.join(config_path, dir_path) 49 | else: 50 | return dir_path 51 | 52 | def load_json_config(config_filepath): 53 | """Loads the json config into a 
dictionary, overwriting the defaults""" 54 | 55 | default_config = pkg_resources.resource_string('data_quality', 'dq.default.json') 56 | default_config = json.loads(default_config.decode('utf-8')) 57 | 58 | if not config_filepath: 59 | return default_config 60 | with io.open(config_filepath, mode='rt', encoding='utf-8') as config_file: 61 | user_config = json.loads(config_file.read()) 62 | config = deep_update_dict(default_config, user_config) 63 | config['data_dir'] = resolve_dir_name(config_filepath, config['data_dir']) 64 | config['cache_dir'] = resolve_dir_name(config_filepath, config['cache_dir']) 65 | return config 66 | 67 | def get_data_quality_spec(): 68 | """Downloads and loads the data quality spec json""" 69 | 70 | config = load_json_config(None) 71 | dq_spec_url = config['data_quality_spec']['data_quality_spec_web'] 72 | json_dq_spec = requests.get(dq_spec_url) 73 | return json_dq_spec.json() 74 | 75 | def get_default_datapackage(): 76 | """Return the default datapackage""" 77 | 78 | default_datapkg = pkg_resources.resource_string('data_quality', 79 | 'datapackage.default.json') 80 | datapkg = datapackage.DataPackage(json.loads(default_datapkg.decode('utf-8'))) 81 | return datapkg 82 | 83 | def get_datapackage_resource(resource_path, datapkg): 84 | """Return the resource correspondent to `resource_path` from datapackage or raise""" 85 | 86 | matching_resources = [res for res in datapkg.resources 87 | if res.local_data_path == resource_path] 88 | if len(matching_resources) > 1: 89 | raise ValueError(('The resource with path "{0}" appears multiple times ' 90 | 'in your datapackage.').format(resource_path)) 91 | elif not matching_resources: 92 | raise ValueError(('The resource with path "{0}" can\'t be found in ' 93 | 'your datapackage. Please include it or ' 94 | 'use the "dq init" command.').format(resource_path)) 95 | else: 96 | return matching_resources[0] 97 | 98 | def deep_update_dict(source_dict, new_dict): 99 | """Update a nested dictionary (modified in place) with another dictionary. 
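    A small worked example (hedged; values are hypothetical):

        deep_update_dict({'a': {'x': 1}, 'b': [1]}, {'a': {'y': 2}, 'b': [2]})
        # -> {'a': {'x': 1, 'y': 2}, 'b': [1, 2]}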
100 | 101 | Args: 102 | source_dict: dict to be updated 103 | new_dict: dict to update with 104 | 105 | """ 106 | 107 | for key, value in new_dict.items(): 108 | if isinstance(value, collections.Mapping) and value: 109 | returned = deep_update_dict(source_dict.get(key, {}), value) 110 | source_dict[key] = returned 111 | elif isinstance(value, list): 112 | source_dict[key] = (source_dict.get(key, []) + value) 113 | else: 114 | source_dict[key] = new_dict[key] 115 | return source_dict 116 | 117 | def date_from_string(date_string): 118 | """Return a date object from a string or None 119 | 120 | Args: 121 | date_string: a string that should contain a date 122 | """ 123 | 124 | if not date_string: 125 | date = None 126 | else: 127 | try: 128 | date = dateutil.parser.parse(date_string).date() 129 | except ValueError: 130 | date = None 131 | return date 132 | 133 | def dicts_to_schema_rows(rows, schema): 134 | """Convert a list of dicts in a generator for schema compliant rows""" 135 | 136 | for row in rows: 137 | try: 138 | values = [row[key] for key in schema.headers] 139 | converted_row = list(schema.convert_row(*values)) 140 | yield converted_row 141 | except jsontableschema.exceptions.MultipleInvalid as e: 142 | for error in e.errors: 143 | raise error 144 | -------------------------------------------------------------------------------- /dq-config.example.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_dir": "/PATH/TO/DATA/DIRECTORY", 3 | "cache_dir": "/PATH/TO/CACHE/DIRECTORY", 4 | "result_file": "results.csv", 5 | "run_file": "runs.csv", 6 | "source_file": "sources.csv", 7 | "publisher_file": "publishers.csv", 8 | "remotes": ["origin"], 9 | "branch": "master", 10 | "goodtables_web": "http://goodtables.okfnlabs.org" 11 | } 12 | -------------------------------------------------------------------------------- /pylintrc: -------------------------------------------------------------------------------- 1 | [BASIC] 2 | 3 | # List of builtins function names that should not be used, separated by a comma. 4 | bad-functions=map,filter,input,open 5 | 6 | [FORMAT] 7 | 8 | # Maximum number of characters on a single line. 9 | max-line-length=79 10 | 11 | [MESSAGES CONTROL] 12 | 13 | # Allow modules to be without docstrings. 
14 | disable=C0111 15 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | import os 7 | import io 8 | from setuptools import setup, find_packages 9 | 10 | 11 | def read(*paths): 12 | """Read a text file.""" 13 | basedir = os.path.dirname(__file__) 14 | fullpath = os.path.join(basedir, *paths) 15 | contents = io.open(fullpath, encoding='utf-8').read().strip() 16 | return contents 17 | 18 | 19 | PACKAGE = 'data_quality' 20 | INSTALL_REQUIRES = ['click>=6.2,<=7.0.0a', 'goodtables==0.7.6', 'pytz==2017.2', 'datapackage==0.8.1', 21 | 'jsontableschema==0.6.5', 'dateparser==0.4.0', 'tabulator==0.5.0'] 22 | TESTS_REQUIRE = ['tox'] 23 | README = read('README.md') 24 | VERSION = read(PACKAGE, 'VERSION') 25 | PACKAGES = find_packages(exclude=['examples', 'tests']) 26 | 27 | setup( 28 | name=PACKAGE, 29 | version=VERSION, 30 | packages=PACKAGES, 31 | include_package_data=True, 32 | install_requires=INSTALL_REQUIRES, 33 | tests_require=TESTS_REQUIRE, 34 | extras_require = {'develop': TESTS_REQUIRE + ['pylint']}, 35 | test_suite='tox', 36 | zip_safe=False, 37 | long_description=README, 38 | description='A CLI that builds a data quality assessment, for use in a Data Quality Dashboard.', 39 | author='Open Knowledge Foundation', 40 | author_email='info@okfn.org', 41 | url='https://github.com/okfn/data-quality-cli', 42 | license='MIT', 43 | keywords=['frictionless data', 'data quality'], 44 | package_data={ 45 | 'data_quality': ['datapackage.default.json', 'dq.default.json'], 46 | }, 47 | classifiers=[ 48 | 'Development Status :: 4 - Beta', 49 | 'Environment :: Web Environment', 50 | 'Intended Audience :: Developers', 51 | 'License :: OSI Approved :: MIT License', 52 | 'Operating System :: OS Independent', 53 | 'Programming Language :: Python :: 2', 54 | 'Programming Language :: Python :: 2.7', 55 | 'Programming Language :: Python :: 3', 56 | 'Programming Language :: Python :: 3.3', 57 | 'Programming Language :: Python :: 3.4', 58 | 'Programming Language :: Python :: 3.5', 59 | 'Topic :: Internet :: WWW/HTTP :: Dynamic Content', 60 | 'Topic :: Software Development :: Libraries :: Python Modules' 61 | ], 62 | entry_points={ 63 | 'console_scripts': [ 64 | 'dq = data_quality.main:cli', 65 | 'dataquality = data_quality.main:cli' 66 | ] 67 | }, 68 | ) 69 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/data-quality-cli/e9abc93b896ea59269d11cdc8f2d301f81be20ad/tests/__init__.py -------------------------------------------------------------------------------- /tests/fixtures/datapackage.json: -------------------------------------------------------------------------------- 1 | { 2 | "admin": "", 3 | "context": "", 4 | "last_modified": "", 5 | "name": "", 6 | "pitch": "", 7 | "resources": [ 8 | { 9 | "name": "publisher_file", 10 | "path": "publishers.csv", 11 | "schema": { 12 | "fields": [ 13 | { 14 | "constraints": { 15 | "required": true, 16 | "unique": true 17 | }, 18 | "name": "id", 19 | "title": "ID of the publisher", 20 | "type": "string" 21 | }, 22 | { 23 | "constraints": { 24 | "required": true, 25 | "unique": true 26 | }, 27 | "name": "title", 28 | 
"title": "Title or official name of the publisher", 29 | "type": "string" 30 | } 31 | ], 32 | "primaryKey": "id" 33 | } 34 | }, 35 | { 36 | "name": "source_file", 37 | "path": "sources.csv", 38 | "schema": { 39 | "fields": [ 40 | { 41 | "constraints": { 42 | "required": true, 43 | "unique": true 44 | }, 45 | "name": "id", 46 | "title": "ID of the source", 47 | "type": "string" 48 | }, 49 | { 50 | "constraints": { 51 | "required": true, 52 | "unique": true 53 | }, 54 | "name": "publisher_id", 55 | "title": "ID of the source's publisher", 56 | "type": "string" 57 | }, 58 | { 59 | "constraints": { 60 | "required": true 61 | }, 62 | "name": "title", 63 | "title": "Title of the source", 64 | "type": "string" 65 | }, 66 | { 67 | "constraints": { 68 | "required": true 69 | }, 70 | "name": "data", 71 | "title": "Path/url to source", 72 | "type": "string" 73 | }, 74 | { 75 | "name": "format", 76 | "title": "File format of the source", 77 | "type": "string" 78 | }, 79 | { 80 | "constraints": { 81 | "required": true 82 | }, 83 | "name": "created_at", 84 | "title": "Time of the source's creation.", 85 | "type": "string" 86 | } 87 | ], 88 | "foreignKeys": [ 89 | { 90 | "fields": "publisher_id", 91 | "reference": { 92 | "fields": "id", 93 | "resource": "publisher_file" 94 | } 95 | } 96 | ], 97 | "primaryKey": "id" 98 | } 99 | }, 100 | { 101 | "name": "run_file", 102 | "path": "runs.csv", 103 | "schema": { 104 | "fields": [ 105 | { 106 | "constraints": { 107 | "required": true, 108 | "unique": true 109 | }, 110 | "name": "id", 111 | "title": "ID of the run", 112 | "type": "string" 113 | }, 114 | { 115 | "constraints": { 116 | "required": true 117 | }, 118 | "format": "datetime", 119 | "name": "timestamp", 120 | "title": "Timestamp of the run execution", 121 | "type": "date" 122 | }, 123 | { 124 | "constraints": { 125 | "required": true 126 | }, 127 | "name": "total_score", 128 | "title": "Rounded average score of results in this run", 129 | "type": "integer" 130 | } 131 | ], 132 | "primaryKey": "id" 133 | } 134 | }, 135 | { 136 | "name": "result_file", 137 | "path": "results.csv", 138 | "schema": { 139 | "fields": [ 140 | { 141 | "constraints": { 142 | "required": true, 143 | "unique": true 144 | }, 145 | "name": "id", 146 | "title": "ID of the result", 147 | "type": "string" 148 | }, 149 | { 150 | "constraints": { 151 | "required": true, 152 | "unique": true 153 | }, 154 | "name": "source_id", 155 | "title": "ID of the correspoding source", 156 | "type": "string" 157 | }, 158 | { 159 | "constraints": { 160 | "required": true 161 | }, 162 | "name": "publisher_id", 163 | "title": "ID of the source's publisher", 164 | "type": "string" 165 | }, 166 | { 167 | "constraints": { 168 | "required": true 169 | }, 170 | "format": "date", 171 | "name": "created_at", 172 | "title": "Time of the source's creation.", 173 | "type": "date" 174 | }, 175 | { 176 | "constraints": { 177 | "required": true 178 | }, 179 | "name": "data", 180 | "title": "Path/url to source", 181 | "type": "string" 182 | }, 183 | { 184 | "name": "schema", 185 | "title": "Path/url to the source's schema", 186 | "type": "string" 187 | }, 188 | { 189 | "contrains": { 190 | "required": true 191 | }, 192 | "name": "score", 193 | "title": "Score of correctness given by GoodTables", 194 | "type": "integer" 195 | }, 196 | { 197 | "name": "summary", 198 | "title": "Summary", 199 | "type": "string" 200 | }, 201 | { 202 | "constraints": { 203 | "required": true, 204 | "unique": true 205 | }, 206 | "name": "run_id", 207 | "title": "ID of the run in which 
the result was calculated", 208 | "type": "string" 209 | }, 210 | { 211 | "constraints": { 212 | "required": true 213 | }, 214 | "format": "datetime", 215 | "name": "timestamp", 216 | "title": "Timestamp of the run execution", 217 | "type": "date" 218 | }, 219 | { 220 | "name": "report", 221 | "title": "Path/url to the full GoodTabels report", 222 | "type": "string" 223 | } 224 | ], 225 | "foreignKeys": [ 226 | { 227 | "fields": "source_id", 228 | "reference": { 229 | "fields": "id", 230 | "resource": "source_file" 231 | } 232 | }, 233 | { 234 | "fields": "publisher_id", 235 | "reference": { 236 | "fields": "id", 237 | "resource": "publisher_file" 238 | } 239 | }, 240 | { 241 | "fields": "run_id", 242 | "reference": { 243 | "fields": "id", 244 | "resource": "run_file" 245 | } 246 | } 247 | ], 248 | "primaryKey": "id" 249 | } 250 | }, 251 | { 252 | "name": "performance_file", 253 | "path": "performance.csv", 254 | "schema": { 255 | "fields": [ 256 | { 257 | "constraints": { 258 | "required": true, 259 | "unique": true 260 | }, 261 | "name": "publisher_id", 262 | "title": "ID of the publisher", 263 | "type": "string" 264 | }, 265 | { 266 | "constraints": { 267 | "required": true 268 | }, 269 | "format": "date", 270 | "name": "month_of_creation", 271 | "title": "Month when the source was created", 272 | "type": "date" 273 | }, 274 | { 275 | "constraints": { 276 | "required": true 277 | }, 278 | "name": "files_count", 279 | "title": "Number of files published by the publisher during period", 280 | "type": "integer" 281 | }, 282 | { 283 | "constraints": { 284 | "required": true 285 | }, 286 | "name": "score", 287 | "title": "Rounded average score of files published by the publisher during period", 288 | "type": "integer" 289 | }, 290 | { 291 | "constraints": { 292 | "required": true 293 | }, 294 | "name": "valid", 295 | "title": "Number of valid files published by the publisher during period", 296 | "type": "integer" 297 | }, 298 | { 299 | "constraints": { 300 | "required": true 301 | }, 302 | "name": "files_count_to_date", 303 | "title": "Number of files published by the publisher up to period", 304 | "type": "integer" 305 | }, 306 | { 307 | "constraints": { 308 | "required": true 309 | }, 310 | "name": "score_to_date", 311 | "title": "Rounded average score of files published by the publisher up to period", 312 | "type": "integer" 313 | }, 314 | { 315 | "constraints": { 316 | "required": true 317 | }, 318 | "name": "valid_to_date", 319 | "title": "Number of valid files published by the publisher up to period", 320 | "type": "integer" 321 | } 322 | ], 323 | "foreignKeys": [ 324 | { 325 | "fields": "publisher_id", 326 | "reference": { 327 | "fields": "id", 328 | "resource": "publisher_file" 329 | } 330 | } 331 | ] 332 | } 333 | } 334 | ], 335 | "sources": [], 336 | "validator_url": "https://goodtables.okfnlabs.org/api/run" 337 | } -------------------------------------------------------------------------------- /tests/fixtures/datapackage_missing_required.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "", 3 | "last_modified": "", 4 | "validator_url": "https://goodtables.okfnlabs.org/api/run", 5 | "admin": "", 6 | "pitch": "", 7 | "context": "", 8 | "sources": [{"name": "", "web": ""}], 9 | "resources": [ 10 | { 11 | "path": "publishers.csv", 12 | "name": "publisher_file", 13 | "schema": { 14 | "fields": [ 15 | { 16 | "name": "id", 17 | "title": "ID of the publisher", 18 | "type": "string", 19 | "constraints": { "required": true, "unique": 
true } 20 | }, 21 | ], 22 | "primaryKey": "id" 23 | } 24 | }, 25 | { 26 | "path": "sources.csv", 27 | "name": "source_file", 28 | "schema": { 29 | "fields": [ 30 | { 31 | "name": "id", 32 | "title": "ID of the source", 33 | "type": "string", 34 | "constraints": { "required": true, "unique": true } 35 | }, 36 | { 37 | "name": "publisher_id", 38 | "title": "ID of the source's publisher", 39 | "type": "string", 40 | "constraints": { "required": true, "unique": true } 41 | }, 42 | { 43 | "name": "title", 44 | "title": "Title of the source", 45 | "type": "string", 46 | "constraints": { "required": true } 47 | }, 48 | { 49 | "name": "data", 50 | "title": "Path/url to source", 51 | "type": "string", 52 | "constraints": { "required": true } 53 | }, 54 | { 55 | "name": "format", 56 | "title": "File format of the source", 57 | "type": "string" 58 | }, 59 | { 60 | "name": "created_at", 61 | "title": "Time covered by the source / of its creation", 62 | "type": "string", 63 | "constraints": { "required": true } 64 | } 65 | ], 66 | "primaryKey": "id", 67 | "foreignKeys": [ 68 | { 69 | "fields": "publisher_id", 70 | "reference": { 71 | "resource": "publisher_file", 72 | "fields": "id" 73 | } 74 | } 75 | ] 76 | } 77 | }, 78 | { 79 | "path": "runs.csv", 80 | "name": "run_file", 81 | "schema": { 82 | "fields": [ 83 | { 84 | "name": "id", 85 | "title": "ID of the run", 86 | "type": "string", 87 | "constraints": { "required": true, "unique": true } 88 | }, 89 | { 90 | "name": "timestamp", 91 | "title": "Timestamp of the run execution", 92 | "type": "date", 93 | "format": "datetime", 94 | "constraints": { "required": true } 95 | }, 96 | { 97 | "name": "total_score", 98 | "title": "Rounded average score of results in this run", 99 | "type": "integer", 100 | "constraints": { "required": true} 101 | } 102 | ], 103 | "primaryKey": "id" 104 | } 105 | }, 106 | { 107 | "path": "results.csv", 108 | "name": "result_file", 109 | "schema": { 110 | "fields": [ 111 | { 112 | "name": "id", 113 | "title": "ID of the result", 114 | "type": "string", 115 | "constraints": { "required": true, "unique": true } 116 | }, 117 | { 118 | "name": "source_id", 119 | "title": "ID of the correspoding source", 120 | "type": "string", 121 | "constraints": { "required": true, "unique": true } 122 | }, 123 | { 124 | "name": "publisher_id", 125 | "title": "ID of the source's publisher", 126 | "type": "string", 127 | "constraints": { "required": true} 128 | }, 129 | { 130 | "name": "created_at", 131 | "title": "Time covered by the source / of its creation", 132 | "type": "date", 133 | "format": "date", 134 | "constraints": { "required": true } 135 | }, 136 | { 137 | "name": "data", 138 | "title": "Path/url to source", 139 | "type": "string", 140 | "constraints": { "required": true } 141 | }, 142 | { 143 | "name": "schema", 144 | "title": "Path/url to the source's schema", 145 | "type": "string" 146 | }, 147 | { 148 | "name": "score", 149 | "title": "Score of correctness given by GoodTables", 150 | "type": "integer", 151 | "contrains": { "required": true } 152 | }, 153 | { 154 | "name": "summary", 155 | "title": "Summary", 156 | "type": "string" 157 | }, 158 | { 159 | "name": "run_id", 160 | "title": "ID of the run in which the result was calculated", 161 | "type": "string", 162 | "constraints": { "required": true, "unique": true } 163 | }, 164 | { 165 | "name": "timestamp", 166 | "title": "Timestamp of the run execution", 167 | "type": "date", 168 | "format": "datetime", 169 | "constraints": { "required": true } 170 | }, 171 | { 172 | "name": 
"report", 173 | "title": "Path/url to the full GoodTabels report", 174 | "type": "string" 175 | } 176 | ], 177 | "primaryKey": "id", 178 | "foreignKeys": [ 179 | { 180 | "fields": "source_id", 181 | "reference": { 182 | "resource": "source_file", 183 | "fields": "id" 184 | } 185 | }, 186 | { 187 | "fields": "publisher_id", 188 | "reference": { 189 | "resource": "publisher_file", 190 | "fields": "id" 191 | } 192 | }, 193 | { 194 | "fields": "run_id", 195 | "reference": { 196 | "resource": "run_file", 197 | "fields": "id" 198 | } 199 | } 200 | ] 201 | } 202 | } 203 | ] 204 | } -------------------------------------------------------------------------------- /tests/fixtures/datapackage_sources_with_period.json: -------------------------------------------------------------------------------- 1 | { 2 | "admin": "", 3 | "context": "", 4 | "last_modified": "", 5 | "name": "", 6 | "pitch": "", 7 | "resources": [ 8 | { 9 | "name": "publisher_file", 10 | "path": "publishers.csv", 11 | "schema": { 12 | "fields": [ 13 | { 14 | "constraints": { 15 | "required": true, 16 | "unique": true 17 | }, 18 | "name": "id", 19 | "title": "ID of the publisher", 20 | "type": "string" 21 | }, 22 | { 23 | "constraints": { 24 | "required": true, 25 | "unique": true 26 | }, 27 | "name": "title", 28 | "title": "Title or official name of the publisher", 29 | "type": "string" 30 | } 31 | ], 32 | "primaryKey": "id" 33 | } 34 | }, 35 | { 36 | "name": "source_file", 37 | "path": "sources_with_period_id.csv", 38 | "schema": { 39 | "fields": [ 40 | { 41 | "constraints": { 42 | "required": true, 43 | "unique": true 44 | }, 45 | "name": "id", 46 | "title": "ID of the source", 47 | "type": "string" 48 | }, 49 | { 50 | "constraints": { 51 | "required": true, 52 | "unique": true 53 | }, 54 | "name": "publisher_id", 55 | "title": "ID of the source's publisher", 56 | "type": "string" 57 | }, 58 | { 59 | "constraints": { 60 | "required": true 61 | }, 62 | "name": "title", 63 | "title": "Title of the source", 64 | "type": "string" 65 | }, 66 | { 67 | "constraints": { 68 | "required": true 69 | }, 70 | "name": "data", 71 | "title": "Path/url to source", 72 | "type": "string" 73 | }, 74 | { 75 | "name": "format", 76 | "title": "File format of the source", 77 | "type": "string" 78 | }, 79 | { 80 | "constraints": { 81 | "required": true 82 | }, 83 | "name": "created_at", 84 | "title": "Time of the source's creation.", 85 | "type": "string" 86 | } 87 | ], 88 | "foreignKeys": [ 89 | { 90 | "fields": "publisher_id", 91 | "reference": { 92 | "fields": "id", 93 | "resource": "publisher_file" 94 | } 95 | } 96 | ], 97 | "primaryKey": "id" 98 | } 99 | }, 100 | { 101 | "name": "run_file", 102 | "path": "runs.csv", 103 | "schema": { 104 | "fields": [ 105 | { 106 | "constraints": { 107 | "required": true, 108 | "unique": true 109 | }, 110 | "name": "id", 111 | "title": "ID of the run", 112 | "type": "string" 113 | }, 114 | { 115 | "constraints": { 116 | "required": true 117 | }, 118 | "format": "datetime", 119 | "name": "timestamp", 120 | "title": "Timestamp of the run execution", 121 | "type": "date" 122 | }, 123 | { 124 | "constraints": { 125 | "required": true 126 | }, 127 | "name": "total_score", 128 | "title": "Rounded average score of results in this run", 129 | "type": "integer" 130 | } 131 | ], 132 | "primaryKey": "id" 133 | } 134 | }, 135 | { 136 | "name": "result_file", 137 | "path": "results.csv", 138 | "schema": { 139 | "fields": [ 140 | { 141 | "constraints": { 142 | "required": true, 143 | "unique": true 144 | }, 145 | "name": "id", 146 
| "title": "ID of the result", 147 | "type": "string" 148 | }, 149 | { 150 | "constraints": { 151 | "required": true, 152 | "unique": true 153 | }, 154 | "name": "source_id", 155 | "title": "ID of the correspoding source", 156 | "type": "string" 157 | }, 158 | { 159 | "constraints": { 160 | "required": true 161 | }, 162 | "name": "publisher_id", 163 | "title": "ID of the source's publisher", 164 | "type": "string" 165 | }, 166 | { 167 | "constraints": { 168 | "required": true 169 | }, 170 | "name": "created_at", 171 | "title": "Time of the source's creation.", 172 | "type": "date", 173 | "format": "date" 174 | }, 175 | { 176 | "constraints": { 177 | "required": true 178 | }, 179 | "name": "data", 180 | "title": "Path/url to source", 181 | "type": "string" 182 | }, 183 | { 184 | "name": "schema", 185 | "title": "Path/url to the source's schema", 186 | "type": "string" 187 | }, 188 | { 189 | "contrains": { 190 | "required": true 191 | }, 192 | "name": "score", 193 | "title": "Score of correctness given by GoodTables", 194 | "type": "integer" 195 | }, 196 | { 197 | "name": "summary", 198 | "title": "Summary", 199 | "type": "string" 200 | }, 201 | { 202 | "constraints": { 203 | "required": true, 204 | "unique": true 205 | }, 206 | "name": "run_id", 207 | "title": "ID of the run in which the result was calculated", 208 | "type": "string" 209 | }, 210 | { 211 | "constraints": { 212 | "required": true 213 | }, 214 | "format": "datetime", 215 | "name": "timestamp", 216 | "title": "Timestamp of the run execution", 217 | "type": "date" 218 | }, 219 | { 220 | "name": "report", 221 | "title": "Path/url to the full GoodTabels report", 222 | "type": "string" 223 | } 224 | ], 225 | "foreignKeys": [ 226 | { 227 | "fields": "source_id", 228 | "reference": { 229 | "fields": "id", 230 | "resource": "source_file" 231 | } 232 | }, 233 | { 234 | "fields": "publisher_id", 235 | "reference": { 236 | "fields": "id", 237 | "resource": "publisher_file" 238 | } 239 | }, 240 | { 241 | "fields": "run_id", 242 | "reference": { 243 | "fields": "id", 244 | "resource": "run_file" 245 | } 246 | } 247 | ], 248 | "primaryKey": "id" 249 | } 250 | }, 251 | { 252 | "name": "performance_file", 253 | "path": "performance.csv", 254 | "schema": { 255 | "fields": [ 256 | { 257 | "constraints": { 258 | "required": true, 259 | "unique": true 260 | }, 261 | "name": "publisher_id", 262 | "title": "ID of the publisher", 263 | "type": "string" 264 | }, 265 | { 266 | "name": "month_of_creation", 267 | "title": "Month when the source was created", 268 | "type": "date", 269 | "format": "date", 270 | "constraints": { "required": true } 271 | }, 272 | { 273 | "constraints": { 274 | "required": true 275 | }, 276 | "name": "files_count", 277 | "title": "Number of files published by the publisher during period", 278 | "type": "integer" 279 | }, 280 | { 281 | "constraints": { 282 | "required": true 283 | }, 284 | "name": "score", 285 | "title": "Rounded average score of files published by the publisher during period", 286 | "type": "integer" 287 | }, 288 | { 289 | "constraints": { 290 | "required": true 291 | }, 292 | "name": "valid", 293 | "title": "Number of valid files published by the publisher during period", 294 | "type": "integer" 295 | }, 296 | { 297 | "constraints": { 298 | "required": true 299 | }, 300 | "name": "files_count_to_date", 301 | "title": "Number of files published by the publisher up to period", 302 | "type": "integer" 303 | }, 304 | { 305 | "constraints": { 306 | "required": true 307 | }, 308 | "name": "score_to_date", 309 | 
"title": "Rounded average score of files published by the publisher up to period", 310 | "type": "integer" 311 | }, 312 | { 313 | "constraints": { 314 | "required": true 315 | }, 316 | "name": "valid_to_date", 317 | "title": "Number of valid files published by the publisher up to period", 318 | "type": "integer" 319 | } 320 | ], 321 | "foreignKeys": [ 322 | { 323 | "fields": "publisher_id", 324 | "reference": { 325 | "fields": "id", 326 | "resource": "publisher_file" 327 | } 328 | } 329 | ] 330 | } 331 | } 332 | ], 333 | "sources": [], 334 | "validator_url": "https://goodtables.okfnlabs.org/api/run" 335 | } -------------------------------------------------------------------------------- /tests/fixtures/dq.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_dir": "", 3 | "cache_dir": "fetched", 4 | "result_file": "results.csv", 5 | "run_file": "runs.csv", 6 | "source_file": "sources.csv", 7 | "publisher_file": "publishers.csv", 8 | "performance_file": "performance.csv", 9 | "datapackage_file": "datapackage.json", 10 | "remotes": ["origin"], 11 | "branch": "master", 12 | "goodtables": { 13 | "goodtables_web": "http://goodtables.okfnlabs.org", 14 | "arguments": { 15 | "pipeline": { 16 | "processors": ["schema", "structure"], 17 | "encoding": "utf-8", 18 | "options": { 19 | "schema": {"case_insensitive_headers": true} 20 | }, 21 | "break_on_invalid_processor": false 22 | }, 23 | "batch": { 24 | "format_key": "format", 25 | "schema_key": "schema", 26 | "data_key": "data" 27 | } 28 | } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /tests/fixtures/fetched/empty_rows_multiple.csv: -------------------------------------------------------------------------------- 1 | id,name,age 2 | 1101,John,30 3 | 1102,Julie,26 4 | ,, 5 | ,, 6 | ,, 7 | ,, 8 | ,, 9 | ,, 10 | ,, 11 | ,, 12 | ,, 13 | ,, 14 | ,, 15 | -------------------------------------------------------------------------------- /tests/fixtures/fetched/valid.csv: -------------------------------------------------------------------------------- 1 | id,name 2 | 1,english 3 | 2,中国人 4 | -------------------------------------------------------------------------------- /tests/fixtures/performance.csv: -------------------------------------------------------------------------------- 1 | publisher_id,month_of_creation,files_count,score,valid,files_count_to_date,score_to_date,valid_to_date 2 | xx_dept1,2015-01-01,1,100,100,1,100,100 3 | xx_dept1,2015-02-01,0,0,0,1,100,100 4 | xx_dept1,2015-03-01,0,0,0,1,100,100 5 | xx_dept1,2015-04-01,0,0,0,1,100,100 6 | xx_dept1,2015-05-01,0,0,0,1,100,100 7 | xx_dept1,2015-06-01,0,0,0,1,100,100 8 | xx_dept1,2015-07-01,0,0,0,1,100,100 9 | xx_dept1,2015-08-01,0,0,0,1,100,100 10 | xx_dept1,2015-09-01,0,0,0,1,100,100 11 | xx_dept1,2015-10-01,0,0,0,1,100,100 12 | xx_dept1,2015-11-01,0,0,0,1,100,100 13 | xx_dept1,2015-12-01,0,0,0,1,100,100 14 | xx_dept1,2016-01-01,0,0,0,1,100,100 15 | xx_dept1,2016-02-01,0,0,0,1,100,100 16 | xx_dept1,2016-03-01,0,0,0,1,100,100 17 | xx_dept1,2016-04-01,0,0,0,1,100,100 18 | xx_dept1,2016-05-01,0,0,0,1,100,100 19 | xx_dept1,2016-06-01,0,0,0,1,100,100 20 | xx_dept1,2016-07-01,0,0,0,1,100,100 21 | xx_dept1,2016-08-01,0,0,0,1,100,100 22 | xx_dept1,2016-09-01,0,0,0,1,100,100 23 | xx_dept1,2016-10-01,0,0,0,1,100,100 24 | xx_dept1,2016-11-01,0,0,0,1,100,100 25 | xx_dept1,2016-12-01,0,0,0,1,100,100 26 | xx_dept1,2017-01-01,0,0,0,1,100,100 27 | xx_dept1,2017-02-01,0,0,0,1,100,100 28 | 
xx_dept1,2017-03-01,0,0,0,1,100,100 29 | xx_dept1,2017-04-01,0,0,0,1,100,100 30 | xx_dept2,2015-01-01,0,0,0,0,0,0 31 | xx_dept2,2015-02-01,0,0,0,0,0,0 32 | xx_dept2,2015-03-01,0,0,0,0,0,0 33 | xx_dept2,2015-04-01,0,0,0,0,0,0 34 | xx_dept2,2015-05-01,0,0,0,0,0,0 35 | xx_dept2,2015-06-01,0,0,0,0,0,0 36 | xx_dept2,2015-07-01,0,0,0,0,0,0 37 | xx_dept2,2015-08-01,0,0,0,0,0,0 38 | xx_dept2,2015-09-01,0,0,0,0,0,0 39 | xx_dept2,2015-10-01,0,0,0,0,0,0 40 | xx_dept2,2015-11-01,0,0,0,0,0,0 41 | xx_dept2,2015-12-01,0,0,0,0,0,0 42 | xx_dept2,2016-01-01,0,0,0,0,0,0 43 | xx_dept2,2016-02-01,0,0,0,0,0,0 44 | xx_dept2,2016-03-01,0,0,0,0,0,0 45 | xx_dept2,2016-04-01,0,0,0,0,0,0 46 | xx_dept2,2016-05-01,0,0,0,0,0,0 47 | xx_dept2,2016-06-01,0,0,0,0,0,0 48 | xx_dept2,2016-07-01,0,0,0,0,0,0 49 | xx_dept2,2016-08-01,0,0,0,0,0,0 50 | xx_dept2,2016-09-01,0,0,0,0,0,0 51 | xx_dept2,2016-10-01,0,0,0,0,0,0 52 | xx_dept2,2016-11-01,0,0,0,0,0,0 53 | xx_dept2,2016-12-01,0,0,0,0,0,0 54 | xx_dept2,2017-01-01,0,0,0,0,0,0 55 | xx_dept2,2017-02-01,0,0,0,0,0,0 56 | xx_dept2,2017-03-01,0,0,0,0,0,0 57 | xx_dept2,2017-04-01,0,0,0,0,0,0 58 | xx_dept3,2015-01-01,0,0,0,0,0,0 59 | xx_dept3,2015-02-01,0,0,0,0,0,0 60 | xx_dept3,2015-03-01,0,0,0,0,0,0 61 | xx_dept3,2015-04-01,0,0,0,0,0,0 62 | xx_dept3,2015-05-01,0,0,0,0,0,0 63 | xx_dept3,2015-06-01,0,0,0,0,0,0 64 | xx_dept3,2015-07-01,0,0,0,0,0,0 65 | xx_dept3,2015-08-01,0,0,0,0,0,0 66 | xx_dept3,2015-09-01,0,0,0,0,0,0 67 | xx_dept3,2015-10-01,0,0,0,0,0,0 68 | xx_dept3,2015-11-01,0,0,0,0,0,0 69 | xx_dept3,2015-12-01,0,0,0,0,0,0 70 | xx_dept3,2016-01-01,0,0,0,0,0,0 71 | xx_dept3,2016-02-01,0,0,0,0,0,0 72 | xx_dept3,2016-03-01,0,0,0,0,0,0 73 | xx_dept3,2016-04-01,0,0,0,0,0,0 74 | xx_dept3,2016-05-01,0,0,0,0,0,0 75 | xx_dept3,2016-06-01,0,0,0,0,0,0 76 | xx_dept3,2016-07-01,0,0,0,0,0,0 77 | xx_dept3,2016-08-01,0,0,0,0,0,0 78 | xx_dept3,2016-09-01,0,0,0,0,0,0 79 | xx_dept3,2016-10-01,0,0,0,0,0,0 80 | xx_dept3,2016-11-01,0,0,0,0,0,0 81 | xx_dept3,2016-12-01,0,0,0,0,0,0 82 | xx_dept3,2017-01-01,0,0,0,0,0,0 83 | xx_dept3,2017-02-01,0,0,0,0,0,0 84 | xx_dept3,2017-03-01,0,0,0,0,0,0 85 | xx_dept3,2017-04-01,0,0,0,0,0,0 86 | xx_dept4,2015-01-01,0,0,0,0,0,0 87 | xx_dept4,2015-02-01,0,0,0,0,0,0 88 | xx_dept4,2015-03-01,0,0,0,0,0,0 89 | xx_dept4,2015-04-01,0,0,0,0,0,0 90 | xx_dept4,2015-05-01,0,0,0,0,0,0 91 | xx_dept4,2015-06-01,0,0,0,0,0,0 92 | xx_dept4,2015-07-01,0,0,0,0,0,0 93 | xx_dept4,2015-08-01,0,0,0,0,0,0 94 | xx_dept4,2015-09-01,0,0,0,0,0,0 95 | xx_dept4,2015-10-01,0,0,0,0,0,0 96 | xx_dept4,2015-11-01,0,0,0,0,0,0 97 | xx_dept4,2015-12-01,0,0,0,0,0,0 98 | xx_dept4,2016-01-01,0,0,0,0,0,0 99 | xx_dept4,2016-02-01,0,0,0,0,0,0 100 | xx_dept4,2016-03-01,0,0,0,0,0,0 101 | xx_dept4,2016-04-01,0,0,0,0,0,0 102 | xx_dept4,2016-05-01,0,0,0,0,0,0 103 | xx_dept4,2016-06-01,0,0,0,0,0,0 104 | xx_dept4,2016-07-01,0,0,0,0,0,0 105 | xx_dept4,2016-08-01,0,0,0,0,0,0 106 | xx_dept4,2016-09-01,0,0,0,0,0,0 107 | xx_dept4,2016-10-01,0,0,0,0,0,0 108 | xx_dept4,2016-11-01,0,0,0,0,0,0 109 | xx_dept4,2016-12-01,0,0,0,0,0,0 110 | xx_dept4,2017-01-01,0,0,0,0,0,0 111 | xx_dept4,2017-02-01,0,0,0,0,0,0 112 | xx_dept4,2017-03-01,0,0,0,0,0,0 113 | xx_dept4,2017-04-01,0,0,0,0,0,0 114 | xx_dept5,2015-01-01,0,0,0,0,0,0 115 | xx_dept5,2015-02-01,0,0,0,0,0,0 116 | xx_dept5,2015-03-01,0,0,0,0,0,0 117 | xx_dept5,2015-04-01,0,0,0,0,0,0 118 | xx_dept5,2015-05-01,0,0,0,0,0,0 119 | xx_dept5,2015-06-01,0,0,0,0,0,0 120 | xx_dept5,2015-07-01,0,0,0,0,0,0 121 | xx_dept5,2015-08-01,0,0,0,0,0,0 122 | xx_dept5,2015-09-01,0,0,0,0,0,0 123 | 
xx_dept5,2015-10-01,0,0,0,0,0,0 124 | xx_dept5,2015-11-01,0,0,0,0,0,0 125 | xx_dept5,2015-12-01,0,0,0,0,0,0 126 | xx_dept5,2016-01-01,0,0,0,0,0,0 127 | xx_dept5,2016-02-01,0,0,0,0,0,0 128 | xx_dept5,2016-03-01,0,0,0,0,0,0 129 | xx_dept5,2016-04-01,0,0,0,0,0,0 130 | xx_dept5,2016-05-01,0,0,0,0,0,0 131 | xx_dept5,2016-06-01,0,0,0,0,0,0 132 | xx_dept5,2016-07-01,0,0,0,0,0,0 133 | xx_dept5,2016-08-01,0,0,0,0,0,0 134 | xx_dept5,2016-09-01,0,0,0,0,0,0 135 | xx_dept5,2016-10-01,0,0,0,0,0,0 136 | xx_dept5,2016-11-01,0,0,0,0,0,0 137 | xx_dept5,2016-12-01,0,0,0,0,0,0 138 | xx_dept5,2017-01-01,0,0,0,0,0,0 139 | xx_dept5,2017-02-01,0,0,0,0,0,0 140 | xx_dept5,2017-03-01,0,0,0,0,0,0 141 | xx_dept5,2017-04-01,0,0,0,0,0,0 142 | xx_dept6,2015-01-01,0,0,0,0,0,0 143 | xx_dept6,2015-02-01,0,0,0,0,0,0 144 | xx_dept6,2015-03-01,0,0,0,0,0,0 145 | xx_dept6,2015-04-01,0,0,0,0,0,0 146 | xx_dept6,2015-05-01,0,0,0,0,0,0 147 | xx_dept6,2015-06-01,0,0,0,0,0,0 148 | xx_dept6,2015-07-01,0,0,0,0,0,0 149 | xx_dept6,2015-08-01,0,0,0,0,0,0 150 | xx_dept6,2015-09-01,0,0,0,0,0,0 151 | xx_dept6,2015-10-01,0,0,0,0,0,0 152 | xx_dept6,2015-11-01,0,0,0,0,0,0 153 | xx_dept6,2015-12-01,0,0,0,0,0,0 154 | xx_dept6,2016-01-01,0,0,0,0,0,0 155 | xx_dept6,2016-02-01,0,0,0,0,0,0 156 | xx_dept6,2016-03-01,0,0,0,0,0,0 157 | xx_dept6,2016-04-01,0,0,0,0,0,0 158 | xx_dept6,2016-05-01,0,0,0,0,0,0 159 | xx_dept6,2016-06-01,0,0,0,0,0,0 160 | xx_dept6,2016-07-01,0,0,0,0,0,0 161 | xx_dept6,2016-08-01,0,0,0,0,0,0 162 | xx_dept6,2016-09-01,0,0,0,0,0,0 163 | xx_dept6,2016-10-01,0,0,0,0,0,0 164 | xx_dept6,2016-11-01,0,0,0,0,0,0 165 | xx_dept6,2016-12-01,0,0,0,0,0,0 166 | xx_dept6,2017-01-01,0,0,0,0,0,0 167 | xx_dept6,2017-02-01,0,0,0,0,0,0 168 | xx_dept6,2017-03-01,0,0,0,0,0,0 169 | xx_dept6,2017-04-01,0,0,0,0,0,0 170 | xx_dept7,2015-01-01,0,0,0,0,0,0 171 | xx_dept7,2015-02-01,0,0,0,0,0,0 172 | xx_dept7,2015-03-01,0,0,0,0,0,0 173 | xx_dept7,2015-04-01,0,0,0,0,0,0 174 | xx_dept7,2015-05-01,0,0,0,0,0,0 175 | xx_dept7,2015-06-01,0,0,0,0,0,0 176 | xx_dept7,2015-07-01,0,0,0,0,0,0 177 | xx_dept7,2015-08-01,0,0,0,0,0,0 178 | xx_dept7,2015-09-01,0,0,0,0,0,0 179 | xx_dept7,2015-10-01,0,0,0,0,0,0 180 | xx_dept7,2015-11-01,0,0,0,0,0,0 181 | xx_dept7,2015-12-01,0,0,0,0,0,0 182 | xx_dept7,2016-01-01,0,0,0,0,0,0 183 | xx_dept7,2016-02-01,0,0,0,0,0,0 184 | xx_dept7,2016-03-01,0,0,0,0,0,0 185 | xx_dept7,2016-04-01,0,0,0,0,0,0 186 | xx_dept7,2016-05-01,0,0,0,0,0,0 187 | xx_dept7,2016-06-01,0,0,0,0,0,0 188 | xx_dept7,2016-07-01,0,0,0,0,0,0 189 | xx_dept7,2016-08-01,0,0,0,0,0,0 190 | xx_dept7,2016-09-01,0,0,0,0,0,0 191 | xx_dept7,2016-10-01,0,0,0,0,0,0 192 | xx_dept7,2016-11-01,0,0,0,0,0,0 193 | xx_dept7,2016-12-01,0,0,0,0,0,0 194 | xx_dept7,2017-01-01,0,0,0,0,0,0 195 | xx_dept7,2017-02-01,0,0,0,0,0,0 196 | xx_dept7,2017-03-01,0,0,0,0,0,0 197 | xx_dept7,2017-04-01,0,0,0,0,0,0 198 | xx_dept8,2015-01-01,0,0,0,0,0,0 199 | xx_dept8,2015-02-01,0,0,0,0,0,0 200 | xx_dept8,2015-03-01,0,0,0,0,0,0 201 | xx_dept8,2015-04-01,0,0,0,0,0,0 202 | xx_dept8,2015-05-01,0,0,0,0,0,0 203 | xx_dept8,2015-06-01,0,0,0,0,0,0 204 | xx_dept8,2015-07-01,0,0,0,0,0,0 205 | xx_dept8,2015-08-01,0,0,0,0,0,0 206 | xx_dept8,2015-09-01,0,0,0,0,0,0 207 | xx_dept8,2015-10-01,0,0,0,0,0,0 208 | xx_dept8,2015-11-01,0,0,0,0,0,0 209 | xx_dept8,2015-12-01,0,0,0,0,0,0 210 | xx_dept8,2016-01-01,0,0,0,0,0,0 211 | xx_dept8,2016-02-01,0,0,0,0,0,0 212 | xx_dept8,2016-03-01,0,0,0,0,0,0 213 | xx_dept8,2016-04-01,0,0,0,0,0,0 214 | xx_dept8,2016-05-01,0,0,0,0,0,0 215 | xx_dept8,2016-06-01,0,0,0,0,0,0 216 | 
xx_dept8,2016-07-01,0,0,0,0,0,0 217 | xx_dept8,2016-08-01,0,0,0,0,0,0 218 | xx_dept8,2016-09-01,0,0,0,0,0,0 219 | xx_dept8,2016-10-01,0,0,0,0,0,0 220 | xx_dept8,2016-11-01,0,0,0,0,0,0 221 | xx_dept8,2016-12-01,0,0,0,0,0,0 222 | xx_dept8,2017-01-01,0,0,0,0,0,0 223 | xx_dept8,2017-02-01,0,0,0,0,0,0 224 | xx_dept8,2017-03-01,0,0,0,0,0,0 225 | xx_dept8,2017-04-01,0,0,0,0,0,0 226 | xx_dept9,2015-01-01,0,0,0,0,0,0 227 | xx_dept9,2015-02-01,0,0,0,0,0,0 228 | xx_dept9,2015-03-01,0,0,0,0,0,0 229 | xx_dept9,2015-04-01,0,0,0,0,0,0 230 | xx_dept9,2015-05-01,0,0,0,0,0,0 231 | xx_dept9,2015-06-01,0,0,0,0,0,0 232 | xx_dept9,2015-07-01,0,0,0,0,0,0 233 | xx_dept9,2015-08-01,0,0,0,0,0,0 234 | xx_dept9,2015-09-01,0,0,0,0,0,0 235 | xx_dept9,2015-10-01,0,0,0,0,0,0 236 | xx_dept9,2015-11-01,0,0,0,0,0,0 237 | xx_dept9,2015-12-01,0,0,0,0,0,0 238 | xx_dept9,2016-01-01,0,0,0,0,0,0 239 | xx_dept9,2016-02-01,0,0,0,0,0,0 240 | xx_dept9,2016-03-01,0,0,0,0,0,0 241 | xx_dept9,2016-04-01,0,0,0,0,0,0 242 | xx_dept9,2016-05-01,0,0,0,0,0,0 243 | xx_dept9,2016-06-01,0,0,0,0,0,0 244 | xx_dept9,2016-07-01,0,0,0,0,0,0 245 | xx_dept9,2016-08-01,0,0,0,0,0,0 246 | xx_dept9,2016-09-01,0,0,0,0,0,0 247 | xx_dept9,2016-10-01,0,0,0,0,0,0 248 | xx_dept9,2016-11-01,0,0,0,0,0,0 249 | xx_dept9,2016-12-01,0,0,0,0,0,0 250 | xx_dept9,2017-01-01,0,0,0,0,0,0 251 | xx_dept9,2017-02-01,0,0,0,0,0,0 252 | xx_dept9,2017-03-01,0,0,0,0,0,0 253 | xx_dept9,2017-04-01,0,0,0,0,0,0 254 | xx_dept10,2015-01-01,0,0,0,0,0,0 255 | xx_dept10,2015-02-01,0,0,0,0,0,0 256 | xx_dept10,2015-03-01,0,0,0,0,0,0 257 | xx_dept10,2015-04-01,0,0,0,0,0,0 258 | xx_dept10,2015-05-01,0,0,0,0,0,0 259 | xx_dept10,2015-06-01,0,0,0,0,0,0 260 | xx_dept10,2015-07-01,0,0,0,0,0,0 261 | xx_dept10,2015-08-01,0,0,0,0,0,0 262 | xx_dept10,2015-09-01,0,0,0,0,0,0 263 | xx_dept10,2015-10-01,0,0,0,0,0,0 264 | xx_dept10,2015-11-01,0,0,0,0,0,0 265 | xx_dept10,2015-12-01,0,0,0,0,0,0 266 | xx_dept10,2016-01-01,0,0,0,0,0,0 267 | xx_dept10,2016-02-01,0,0,0,0,0,0 268 | xx_dept10,2016-03-01,0,0,0,0,0,0 269 | xx_dept10,2016-04-01,0,0,0,0,0,0 270 | xx_dept10,2016-05-01,0,0,0,0,0,0 271 | xx_dept10,2016-06-01,0,0,0,0,0,0 272 | xx_dept10,2016-07-01,0,0,0,0,0,0 273 | xx_dept10,2016-08-01,0,0,0,0,0,0 274 | xx_dept10,2016-09-01,0,0,0,0,0,0 275 | xx_dept10,2016-10-01,0,0,0,0,0,0 276 | xx_dept10,2016-11-01,0,0,0,0,0,0 277 | xx_dept10,2016-12-01,0,0,0,0,0,0 278 | xx_dept10,2017-01-01,0,0,0,0,0,0 279 | xx_dept10,2017-02-01,0,0,0,0,0,0 280 | xx_dept10,2017-03-01,0,0,0,0,0,0 281 | xx_dept10,2017-04-01,0,0,0,0,0,0 282 | xx_dept11,2015-01-01,0,0,0,0,0,0 283 | xx_dept11,2015-02-01,0,0,0,0,0,0 284 | xx_dept11,2015-03-01,0,0,0,0,0,0 285 | xx_dept11,2015-04-01,0,0,0,0,0,0 286 | xx_dept11,2015-05-01,0,0,0,0,0,0 287 | xx_dept11,2015-06-01,0,0,0,0,0,0 288 | xx_dept11,2015-07-01,0,0,0,0,0,0 289 | xx_dept11,2015-08-01,0,0,0,0,0,0 290 | xx_dept11,2015-09-01,0,0,0,0,0,0 291 | xx_dept11,2015-10-01,0,0,0,0,0,0 292 | xx_dept11,2015-11-01,0,0,0,0,0,0 293 | xx_dept11,2015-12-01,0,0,0,0,0,0 294 | xx_dept11,2016-01-01,0,0,0,0,0,0 295 | xx_dept11,2016-02-01,0,0,0,0,0,0 296 | xx_dept11,2016-03-01,0,0,0,0,0,0 297 | xx_dept11,2016-04-01,0,0,0,0,0,0 298 | xx_dept11,2016-05-01,0,0,0,0,0,0 299 | xx_dept11,2016-06-01,0,0,0,0,0,0 300 | xx_dept11,2016-07-01,0,0,0,0,0,0 301 | xx_dept11,2016-08-01,0,0,0,0,0,0 302 | xx_dept11,2016-09-01,0,0,0,0,0,0 303 | xx_dept11,2016-10-01,0,0,0,0,0,0 304 | xx_dept11,2016-11-01,0,0,0,0,0,0 305 | xx_dept11,2016-12-01,0,0,0,0,0,0 306 | xx_dept11,2017-01-01,0,0,0,0,0,0 307 | xx_dept11,2017-02-01,0,0,0,0,0,0 308 | 
xx_dept11,2017-03-01,0,0,0,0,0,0 309 | xx_dept11,2017-04-01,0,0,0,0,0,0 310 | xx_dept12,2015-01-01,0,0,0,0,0,0 311 | xx_dept12,2015-02-01,0,0,0,0,0,0 312 | xx_dept12,2015-03-01,0,0,0,0,0,0 313 | xx_dept12,2015-04-01,0,0,0,0,0,0 314 | xx_dept12,2015-05-01,0,0,0,0,0,0 315 | xx_dept12,2015-06-01,0,0,0,0,0,0 316 | xx_dept12,2015-07-01,0,0,0,0,0,0 317 | xx_dept12,2015-08-01,0,0,0,0,0,0 318 | xx_dept12,2015-09-01,0,0,0,0,0,0 319 | xx_dept12,2015-10-01,0,0,0,0,0,0 320 | xx_dept12,2015-11-01,0,0,0,0,0,0 321 | xx_dept12,2015-12-01,0,0,0,0,0,0 322 | xx_dept12,2016-01-01,0,0,0,0,0,0 323 | xx_dept12,2016-02-01,0,0,0,0,0,0 324 | xx_dept12,2016-03-01,0,0,0,0,0,0 325 | xx_dept12,2016-04-01,0,0,0,0,0,0 326 | xx_dept12,2016-05-01,0,0,0,0,0,0 327 | xx_dept12,2016-06-01,0,0,0,0,0,0 328 | xx_dept12,2016-07-01,0,0,0,0,0,0 329 | xx_dept12,2016-08-01,0,0,0,0,0,0 330 | xx_dept12,2016-09-01,0,0,0,0,0,0 331 | xx_dept12,2016-10-01,0,0,0,0,0,0 332 | xx_dept12,2016-11-01,0,0,0,0,0,0 333 | xx_dept12,2016-12-01,0,0,0,0,0,0 334 | xx_dept12,2017-01-01,0,0,0,0,0,0 335 | xx_dept12,2017-02-01,0,0,0,0,0,0 336 | xx_dept12,2017-03-01,0,0,0,0,0,0 337 | xx_dept12,2017-04-01,0,0,0,0,0,0 338 | xx_dept13,2015-01-01,0,0,0,0,0,0 339 | xx_dept13,2015-02-01,0,0,0,0,0,0 340 | xx_dept13,2015-03-01,0,0,0,0,0,0 341 | xx_dept13,2015-04-01,0,0,0,0,0,0 342 | xx_dept13,2015-05-01,0,0,0,0,0,0 343 | xx_dept13,2015-06-01,0,0,0,0,0,0 344 | xx_dept13,2015-07-01,0,0,0,0,0,0 345 | xx_dept13,2015-08-01,0,0,0,0,0,0 346 | xx_dept13,2015-09-01,0,0,0,0,0,0 347 | xx_dept13,2015-10-01,0,0,0,0,0,0 348 | xx_dept13,2015-11-01,0,0,0,0,0,0 349 | xx_dept13,2015-12-01,0,0,0,0,0,0 350 | xx_dept13,2016-01-01,0,0,0,0,0,0 351 | xx_dept13,2016-02-01,0,0,0,0,0,0 352 | xx_dept13,2016-03-01,0,0,0,0,0,0 353 | xx_dept13,2016-04-01,0,0,0,0,0,0 354 | xx_dept13,2016-05-01,0,0,0,0,0,0 355 | xx_dept13,2016-06-01,0,0,0,0,0,0 356 | xx_dept13,2016-07-01,0,0,0,0,0,0 357 | xx_dept13,2016-08-01,0,0,0,0,0,0 358 | xx_dept13,2016-09-01,0,0,0,0,0,0 359 | xx_dept13,2016-10-01,0,0,0,0,0,0 360 | xx_dept13,2016-11-01,0,0,0,0,0,0 361 | xx_dept13,2016-12-01,0,0,0,0,0,0 362 | xx_dept13,2017-01-01,0,0,0,0,0,0 363 | xx_dept13,2017-02-01,0,0,0,0,0,0 364 | xx_dept13,2017-03-01,0,0,0,0,0,0 365 | xx_dept13,2017-04-01,0,0,0,0,0,0 366 | xx_dept14,2015-01-01,0,0,0,0,0,0 367 | xx_dept14,2015-02-01,0,0,0,0,0,0 368 | xx_dept14,2015-03-01,0,0,0,0,0,0 369 | xx_dept14,2015-04-01,0,0,0,0,0,0 370 | xx_dept14,2015-05-01,0,0,0,0,0,0 371 | xx_dept14,2015-06-01,0,0,0,0,0,0 372 | xx_dept14,2015-07-01,0,0,0,0,0,0 373 | xx_dept14,2015-08-01,0,0,0,0,0,0 374 | xx_dept14,2015-09-01,0,0,0,0,0,0 375 | xx_dept14,2015-10-01,0,0,0,0,0,0 376 | xx_dept14,2015-11-01,0,0,0,0,0,0 377 | xx_dept14,2015-12-01,0,0,0,0,0,0 378 | xx_dept14,2016-01-01,0,0,0,0,0,0 379 | xx_dept14,2016-02-01,0,0,0,0,0,0 380 | xx_dept14,2016-03-01,0,0,0,0,0,0 381 | xx_dept14,2016-04-01,0,0,0,0,0,0 382 | xx_dept14,2016-05-01,0,0,0,0,0,0 383 | xx_dept14,2016-06-01,0,0,0,0,0,0 384 | xx_dept14,2016-07-01,0,0,0,0,0,0 385 | xx_dept14,2016-08-01,0,0,0,0,0,0 386 | xx_dept14,2016-09-01,0,0,0,0,0,0 387 | xx_dept14,2016-10-01,0,0,0,0,0,0 388 | xx_dept14,2016-11-01,0,0,0,0,0,0 389 | xx_dept14,2016-12-01,0,0,0,0,0,0 390 | xx_dept14,2017-01-01,0,0,0,0,0,0 391 | xx_dept14,2017-02-01,0,0,0,0,0,0 392 | xx_dept14,2017-03-01,0,0,0,0,0,0 393 | xx_dept14,2017-04-01,0,0,0,0,0,0 394 | xx_dept15,2015-01-01,1,0,0,1,0,0 395 | xx_dept15,2015-02-01,0,0,0,1,0,0 396 | xx_dept15,2015-03-01,0,0,0,1,0,0 397 | xx_dept15,2015-04-01,0,0,0,1,0,0 398 | xx_dept15,2015-05-01,0,0,0,1,0,0 399 | 
xx_dept15,2015-06-01,0,0,0,1,0,0 400 | xx_dept15,2015-07-01,0,0,0,1,0,0 401 | xx_dept15,2015-08-01,0,0,0,1,0,0 402 | xx_dept15,2015-09-01,0,0,0,1,0,0 403 | xx_dept15,2015-10-01,0,0,0,1,0,0 404 | xx_dept15,2015-11-01,0,0,0,1,0,0 405 | xx_dept15,2015-12-01,0,0,0,1,0,0 406 | xx_dept15,2016-01-01,0,0,0,1,0,0 407 | xx_dept15,2016-02-01,0,0,0,1,0,0 408 | xx_dept15,2016-03-01,0,0,0,1,0,0 409 | xx_dept15,2016-04-01,0,0,0,1,0,0 410 | xx_dept15,2016-05-01,0,0,0,1,0,0 411 | xx_dept15,2016-06-01,0,0,0,1,0,0 412 | xx_dept15,2016-07-01,0,0,0,1,0,0 413 | xx_dept15,2016-08-01,0,0,0,1,0,0 414 | xx_dept15,2016-09-01,0,0,0,1,0,0 415 | xx_dept15,2016-10-01,0,0,0,1,0,0 416 | xx_dept15,2016-11-01,0,0,0,1,0,0 417 | xx_dept15,2016-12-01,0,0,0,1,0,0 418 | xx_dept15,2017-01-01,0,0,0,1,0,0 419 | xx_dept15,2017-02-01,0,0,0,1,0,0 420 | xx_dept15,2017-03-01,0,0,0,1,0,0 421 | xx_dept15,2017-04-01,0,0,0,1,0,0 422 | all,2015-01-01,2,50,50,2,50,50 423 | all,2015-02-01,0,0,0,2,50,50 424 | all,2015-03-01,0,0,0,2,50,50 425 | all,2015-04-01,0,0,0,2,50,50 426 | all,2015-05-01,0,0,0,2,50,50 427 | all,2015-06-01,0,0,0,2,50,50 428 | all,2015-07-01,0,0,0,2,50,50 429 | all,2015-08-01,0,0,0,2,50,50 430 | all,2015-09-01,0,0,0,2,50,50 431 | all,2015-10-01,0,0,0,2,50,50 432 | all,2015-11-01,0,0,0,2,50,50 433 | all,2015-12-01,0,0,0,2,50,50 434 | all,2016-01-01,0,0,0,2,50,50 435 | all,2016-02-01,0,0,0,2,50,50 436 | all,2016-03-01,0,0,0,2,50,50 437 | all,2016-04-01,0,0,0,2,50,50 438 | all,2016-05-01,0,0,0,2,50,50 439 | all,2016-06-01,0,0,0,2,50,50 440 | all,2016-07-01,0,0,0,2,50,50 441 | all,2016-08-01,0,0,0,2,50,50 442 | all,2016-09-01,0,0,0,2,50,50 443 | all,2016-10-01,0,0,0,2,50,50 444 | all,2016-11-01,0,0,0,2,50,50 445 | all,2016-12-01,0,0,0,2,50,50 446 | all,2017-01-01,0,0,0,2,50,50 447 | all,2017-02-01,0,0,0,2,50,50 448 | all,2017-03-01,0,0,0,2,50,50 449 | all,2017-04-01,0,0,0,2,50,50 450 | -------------------------------------------------------------------------------- /tests/fixtures/publishers.csv: -------------------------------------------------------------------------------- 1 | id,parent_id,name,description,url,jurisdiction_code,email,address,contact,score,source_count 2 | xx_dept1,,Department 1,,http://www.example.com/dept1,XX,dept1-admin@example.com,,Dept1 Admin,8,2 3 | xx_dept2,,Department 2,,http://www.example.com/dept2,XX,dept2-admin@example.com,,Dept2 Admin,8,2 4 | xx_dept3,,Department 3,,http://www.example.com/dept3,XX,dept3-admin@example.com,,Dept3 Admin,8,2 5 | xx_dept4,,Department 4,,http://www.example.com/dept4,XX,dept4-admin@example.com,,Dept4 Admin,8,2 6 | xx_dept5,,Department 5,,http://www.example.com/dept5,XX,dept5-admin@example.com,,Dept5 Admin,8,2 7 | xx_dept6,,Department 6,,http://www.example.com/dept6,XX,dept6-admin@example.com,,Dept6 Admin,8,1 8 | xx_dept7,,Department 7,,http://www.example.com/dept7,XX,dept7-admin@example.com,,Dept7 Admin,8,1 9 | xx_dept8,,Department 8,,http://www.example.com/dept8,XX,dept8-admin@example.com,,Dept8 Admin,8,1 10 | xx_dept9,,Department 9,,http://www.example.com/dept9,XX,dept9-admin@example.com,,Dept9 Admin,8,1 11 | xx_dept10,,Department 10,,http://www.example.com/dept10,XX,dept10-admin@example.com,,Dept10 Admin,8,1 12 | xx_dept11,xx_dept1,Department 11,,http://www.example.com/dept11,XX,dept-1-admin@example.com,,Dept11 Admin,8,1 13 | xx_dept12,xx_dept2,Department 12,,http://www.example.com/dept12,XX,dept-1-admin@example.com,,Dept12 Admin,8,1 14 | xx_dept13,xx_dept3,Department 13,,http://www.example.com/dept13,XX,dept-1-admin@example.com,,Dept13 Admin,8,1 15 | 
xx_dept14,xx_dept4,Department 14,,http://www.example.com/dept14,XX,dept-1-admin@example.com,,Dept14 Admin,8,1 16 | xx_dept15,xx_dept5,Department 15,,http://www.example.com/dept15,XX,dept-1-admin@example.com,,Dept15 Admin,8,1 17 | -------------------------------------------------------------------------------- /tests/fixtures/results.csv: -------------------------------------------------------------------------------- 1 | id,source_id,publisher_id,created_at,data,schema,score,summary,run_id,timestamp,report 2 | ce7752c9bd1a4f96a2459713687e9a72,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/okfn/tabular-validator/master/examples/valid.csv,,100,,45761052f48a4b158314b45d5ff08291,2016-08-08 17:42:12.141037+00:00,http://goodtables.okfnlabs.org 3 | 31190ecbda0744208ee447c14a4d3683,source3,xx_dept16,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables/master/examples/empty_rows_multiple.csv,,67,,45761052f48a4b158314b45d5ff08291,2016-08-08 17:42:12.141037+00:00,http://goodtables.okfnlabs.org 4 | e1463a9574d348f9abf13ea59a1e6c70,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/okfn/tabular-validator/master/examples/valid.csv,,98,,7841c39e40ba475dab4f34e554aeef1e,2016-08-08 17:42:13.978475+00:00,http://goodtables.okfnlabs.org 5 | 678512637ef044b9a298ea29eca902e4,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/okfn/tabular-validator/master/examples/valid.csv,,100,,e7d90595bfca4ef6b486ac4263b2c4e5,2016-08-08 17:42:14.369176+00:00,http://goodtables.okfnlabs.org 6 | fa8776d6871f4058858bec851ee27cb2,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/okfn/tabular-validator/master/examples/valid.csv,,100,,704ca77f1f5a48bba44dfb7d2e84bb77,2016-08-08 17:42:14.610112+00:00,http://goodtables.okfnlabs.org 7 | eb27fff6e6454040a0a26f388588d816,source3,xx_dept16,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables/master/examples/empty_rows_multiple.csv,,0,,9fd42bfeaa004af996d93530d616555a,2016-08-08 17:42:15.123895+00:00,http://goodtables.okfnlabs.org 8 | 2e0ce1e3811b49fc830e75bc800bd1b6,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/okfn/tabular-validator/master/examples/valid.csv,,100,,00baee32a3f44619b2febe224b198f64,2016-08-08 17:42:36.428362+00:00,http://goodtables.okfnlabs.org 9 | e4bf2c5bf1724c278898eb1d8f34f0bd,source3,xx_dept16,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables/master/examples/empty_rows_multiple.csv,,67,,00baee32a3f44619b2febe224b198f64,2016-08-08 17:42:36.428362+00:00,http://goodtables.okfnlabs.org 10 | 3822297dc83d4b6588c571e8d663d44a,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/okfn/tabular-validator/master/examples/valid.csv,,98,,318875d4fda648f3926bb70cfc2163e9,2016-08-08 17:42:37.366584+00:00,http://goodtables.okfnlabs.org 11 | 072c65bcdeda4219b2cbdb68695c57cc,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/okfn/tabular-validator/master/examples/valid.csv,,100,,fea262ac9dcb4ba3a2fd3e8d06eb5521,2016-08-08 17:42:37.584550+00:00,http://goodtables.okfnlabs.org 12 | 22c8597f93964838be3aded23827e295,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/okfn/tabular-validator/master/examples/valid.csv,,100,,7b2d9e44605849f28fb47cf7b8e9c36e,2016-08-08 17:42:37.785201+00:00,http://goodtables.okfnlabs.org 13 | f2a7bd1359c143bfb1e2df229338ad99,source3,xx_dept16,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables/master/examples/empty_rows_multiple.csv,,0,,516f62d0631d418691012dac89046c7c,2016-08-08 
17:42:38.215569+00:00,http://goodtables.okfnlabs.org 14 | fd27c6d4fb8142f38b43ea52cc16f2be,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/okfn/tabular-validator/master/examples/valid.csv,,100,,288631224a51428ab59159a440b5e909,2016-08-08 17:42:54.854697+00:00,http://goodtables.okfnlabs.org 15 | eb0e8a70775c43459c4b5a1a068b6011,source3,xx_dept16,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables/master/examples/empty_rows_multiple.csv,,67,,288631224a51428ab59159a440b5e909,2016-08-08 17:42:54.854697+00:00,http://goodtables.okfnlabs.org 16 | f674ea3ec1c04bfc987a3d65cc652c42,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/okfn/tabular-validator/master/examples/valid.csv,,98,,81032d4fb6ab4642afa8bb82e0e88ebd,2016-08-08 17:42:56.360466+00:00,http://goodtables.okfnlabs.org 17 | 92953faad075412ca490f31b81a29fcf,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/okfn/tabular-validator/master/examples/valid.csv,,100,,36aa26c18f0b4ae0894a7897ebe3f2d6,2016-08-08 17:42:56.620225+00:00,http://goodtables.okfnlabs.org 18 | f302ab531bc146d6929380b33afd4072,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/okfn/tabular-validator/master/examples/valid.csv,,100,,8ccd3cba19cc47bd9258c0e66f5dca98,2016-08-08 17:42:56.834069+00:00,http://goodtables.okfnlabs.org 19 | 91125b1297e0408d972d9e1b20010543,source3,xx_dept16,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables/master/examples/empty_rows_multiple.csv,,0,,22fc4915f50141259e717f2d6724c6d0,2016-08-08 17:42:57.284804+00:00,http://goodtables.okfnlabs.org 20 | 38aa93bd3b7540b4a383f72914a32e95,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv,,100,,a067e4c7d5d341a689c7dfda36d54049,2017-04-18 09:27:19.019507+00:00,http://goodtables.okfnlabs.org 21 | 3e1e824daa2f4a8ab08f1b5b781e35a8,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv,,98,,ad6a9ad5effd47bd8246adaa5d7fa06d,2017-04-18 09:27:20.827406+00:00,http://goodtables.okfnlabs.org 22 | ef384476e5764361b7fa2f0927cd7e46,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv,,100,,b3afebc6cd90495abd48651bf3468533,2017-04-18 09:27:21.314295+00:00,http://goodtables.okfnlabs.org 23 | c8b5879aa25e46899225d94b0cb4ae82,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv,,100,,66d21a37fe6d4149aaae0610afd377bf,2017-04-18 09:27:21.767780+00:00,http://goodtables.okfnlabs.org 24 | 515d2055c63c41689292b418945bd8e9,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv,,100,,0756e905014540ae84a870b61896e5a4,2017-04-18 09:33:46.645467+00:00,http://goodtables.okfnlabs.org 25 | e3674d1fb27a4b478d6eb7047a469507,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv,,98,,70ef71ff17f3480889c3dc9d0e6572f8,2017-04-18 09:33:48.489108+00:00,http://goodtables.okfnlabs.org 26 | 4bde14a9541546feafff7f53ab315493,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv,,100,,2822a2e498d747c897b9499b4dbf6dc2,2017-04-18 09:33:48.971573+00:00,http://goodtables.okfnlabs.org 27 | 
dcee66bb5eeb46d0a55bcf56d1f2d8b1,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv,,100,,6e3d4b85e9fb4a84b62d483ce354dd4d,2017-04-18 09:33:49.448480+00:00,http://goodtables.okfnlabs.org 28 | fbf1784269fc4511b4b6febbfa57a23e,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv,,100,,e800081ce79d4bb68d33e286e40a845d,2017-04-18 10:17:58.833972+00:00,http://goodtables.okfnlabs.org 29 | 90fa232cd63d4bfbb47873483b2dfa01,source3,xx_dept15,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/empty_rows_multiple.csv,,67,,e800081ce79d4bb68d33e286e40a845d,2017-04-18 10:17:58.833972+00:00,http://goodtables.okfnlabs.org 30 | b318a27f153b468e9af757e2d05a320c,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv,,98,,ae1edf8cf8dd4931aeada91629e9ce2f,2017-04-18 10:18:02.214709+00:00,http://goodtables.okfnlabs.org 31 | ce721fec98d443438e60e61492d87b76,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv,,100,,7fd67e1fb1fe4562839f56dd1996e56c,2017-04-18 10:18:02.693835+00:00,http://goodtables.okfnlabs.org 32 | c60eef935157433786ad8a08872014a1,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv,,100,,1feb92e96b514e3b9613ab189fecff0c,2017-04-18 10:18:03.173207+00:00,http://goodtables.okfnlabs.org 33 | fcd86a5eaea94c5b9c46f1043d634140,source3,xx_dept15,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/empty_rows_multiple.csv,,0,,17a0dbbce5fd4fddaa1365fb91ccfa5b,2017-04-18 10:18:04.337754+00:00,http://goodtables.okfnlabs.org 34 | e91bb0379a2d4fd294f2ed04cafd0e23,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv,,100,,b5e36109c3894c70a7e6898f03f3be3c,2017-04-18 15:15:41.856361+00:00,http://goodtables.okfnlabs.org 35 | 0100dcccdae940d4a41c42369cb40d9a,source3,xx_dept15,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/empty_rows_multiple.csv,,67,,b5e36109c3894c70a7e6898f03f3be3c,2017-04-18 15:15:41.856361+00:00,http://goodtables.okfnlabs.org 36 | f2753167cdba403f9d7b2616f09751fd,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv,,98,,a12154896fc7497c918405489b66017a,2017-04-18 15:15:44.465849+00:00,http://goodtables.okfnlabs.org 37 | 43e9f2c01b01446187b12cf4649fc48f,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv,,100,,dcb41983c603452cbd602122a80d4759,2017-04-18 15:15:45.090038+00:00,http://goodtables.okfnlabs.org 38 | 709175f155c545f3866daf5c4376e915,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv,,100,,6f6f0380302b4ab5bbda9df9e9df4ee9,2017-04-18 15:15:45.647236+00:00,http://goodtables.okfnlabs.org 39 | 1b1720b8d58d42b7847bf57196d4e8c4,source3,xx_dept15,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/empty_rows_multiple.csv,,0,,21057087b1484c46af6f89414656df62,2017-04-18 15:15:46.884910+00:00,http://goodtables.okfnlabs.org 40 | -------------------------------------------------------------------------------- 
/tests/fixtures/runs.csv: -------------------------------------------------------------------------------- 1 | id,timestamp,total_score 2 | 45761052f48a4b158314b45d5ff08291,2016-08-08 17:42:12.141037+00:00,84 3 | f0c77831c2ea4fd1801075eb4dffc625,2016-08-08 17:42:14.867729+00:00,0 4 | 00baee32a3f44619b2febe224b198f64,2016-08-08 17:42:36.428362+00:00,84 5 | 073aa1002d4242c4ac6f0078f593f1dd,2016-08-08 17:42:37.989954+00:00,0 6 | 288631224a51428ab59159a440b5e909,2016-08-08 17:42:54.854697+00:00,84 7 | 5e4824a7be2548b4a94507b825483a07,2016-08-08 17:42:57.050376+00:00,0 8 | 34bbd2146948465aa05b58603bce9767,2017-04-18 09:27:22.266260+00:00,0 9 | 6a1351e5ae5b454381d77d971bda12ca,2017-04-18 09:33:49.929597+00:00,0 10 | e228cdeb374a407ca14ee0ff34f7df8f,2017-04-18 09:45:06.245453+00:00,0 11 | e800081ce79d4bb68d33e286e40a845d,2017-04-18 10:17:58.833972+00:00,84 12 | 8c0d2b4ded574f24b649ad6a9fdcee54,2017-04-18 10:18:03.774295+00:00,0 13 | b5e36109c3894c70a7e6898f03f3be3c,2017-04-18 15:15:41.856361+00:00,84 14 | a6ac3e7c43e64f42a33d3d2c500bef06,2017-04-18 15:15:46.238074+00:00,0 15 | -------------------------------------------------------------------------------- /tests/fixtures/sources.csv: -------------------------------------------------------------------------------- 1 | id,publisher_id,title,data,score,revision,schema,created_at,timestamp,format 2 | source1,xx_dept1,Source 1,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv,8,1,,2015-01-01,2015-01-01,csv 3 | source3,xx_dept15,Source 16,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/empty_rows_multiple.csv,,1,,2015-01-01,2015-01-01,csv 4 | -------------------------------------------------------------------------------- /tests/fixtures/sources_with_period_id.csv: -------------------------------------------------------------------------------- 1 | id,publisher_id,title,data,format,created_at,period_id 2 | source1,xx_dept1,Source 1,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv,csv,2015-01-01,17-10-2014/17-10-2014 3 | source2,xx_dept15,Source 15,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/april-to-may-12th-2010.xls,excel,2010-05-01,01-04-2010/31-05-2010 4 | -------------------------------------------------------------------------------- /tests/mock_generator.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | from data_quality import generators 8 | 9 | class MockGenerator(generators.BaseGenerator): 10 | """This class deletes the current database and regenerates it""" 11 | 12 | def __init__(self, url=None, datapackage=None): 13 | """Create an instance 14 | 15 | Args: 16 | url: something to please the Base Generator 17 | """ 18 | 19 | super(MockGenerator, self).__init__(url) 20 | -------------------------------------------------------------------------------- /tests/tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/data-quality-cli/e9abc93b896ea59269d11cdc8f2d301f81be20ad/tests/tasks/__init__.py -------------------------------------------------------------------------------- /tests/tasks/test_aggregate.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import unittest 8 | import os 9 | from .test_task import TestTask 10 | from data_quality import tasks, utilities, compat 11 | from goodtables import pipeline 12 | 13 | 14 | class TestAggregatorTask(TestTask): 15 | """Test the Aggregator task""" 16 | 17 | def test_aggregator_run(self): 18 | """Test that Aggregator task runs as post task and updates results""" 19 | 20 | aggregator_task = tasks.Aggregator(self.config) 21 | url = 'https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv' 22 | pipeline_instance = pipeline.Pipeline(data=url, format='csv', 23 | post_task=aggregator_task.run) 24 | results_before_run = self.read_file_contents(aggregator_task.result_file) 25 | pipeline_instance.run() 26 | results_after_run = self.read_file_contents(aggregator_task.result_file) 27 | 28 | self.assertEqual(len(results_after_run), len(results_before_run) + 1) 29 | 30 | def test_agregator_batch_run(self): 31 | """Test that Aggregator task updates run file after each batch""" 32 | 33 | config = self.config 34 | aggregator_task = tasks.Aggregator(config) 35 | 36 | def mokup_function(instance): 37 | aggregator_task.write_run() 38 | batch_options = config['goodtables']['arguments']['batch'] 39 | batch_options['post_task'] = mokup_function 40 | batch_options['pipeline_options'] = config['goodtables']['arguments']['pipeline'] 41 | batch = pipeline.Batch(aggregator_task.source_file, **batch_options) 42 | runs_before_run = self.read_file_contents(aggregator_task.run_file) 43 | batch.run() 44 | runs_after_run = self.read_file_contents(aggregator_task.run_file) 45 | 46 | self.assertGreater(len(runs_after_run), len(runs_before_run)) 47 | 48 | def test_aggregator_fetch(self): 49 | """Test that Aggregator task fetches the source""" 50 | 51 | aggregator_task = tasks.Aggregator(self.config) 52 | url = 'https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv' 53 | utilities.set_up_cache_dir(aggregator_task.cache_dir) 54 | 55 | pipeline_instance = pipeline.Pipeline(data=url, format='csv', 56 | post_task=aggregator_task.run) 57 | pipeline_instance.run() 58 | file_names = [] 59 | for file_name in os.listdir(aggregator_task.cache_dir): 60 | file_names.append(file_name) 61 | self.assertEquals(file_names,['valid.csv']) 62 | 63 | def test_aggregator_assess_timeliness(self): 64 | """Test that Aggregator calls the RelevancePeriodExtractor""" 65 | 66 | self.config['source_file'] = 'sources_with_period_id.csv' 67 | self.config['datapackage_file'] = 'datapackage_sources_with_period.json' 68 | self.config['assess_timeliness'] = True 69 | self.config['timeliness']['timeliness_strategy'] = ['period_id'] 70 | extractor = tasks.extract_relevance_period.RelevancePeriodExtractor(self.config) 71 | extractor.run() 72 | aggregator_task = tasks.Aggregator(self.config) 73 | url = 'https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv' 74 | pipeline_instance = pipeline.Pipeline(data=url, format='csv', 75 | post_task=aggregator_task.run) 76 | pipeline_instance.run() 77 | updated_sources = self.read_file_contents(aggregator_task.result_file) 78 | result = updated_sources[-1] 79 | score = int(result['score']) 80 | self.assertEqual(98, score) 81 | 82 | def 
tests_aggreate_scoring(self): 83 | """Test Aggregator scoring""" 84 | 85 | aggregator_task = tasks.Aggregator(self.config) 86 | url = 'https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/empty_rows_multiple.csv' 87 | schema = 'https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/test_schema.json' 88 | pipeline_options = self.config['goodtables']['arguments']['pipeline'] 89 | pipeline_options['options']['schema']['schema'] = schema 90 | pipeline_instance = pipeline.Pipeline(data=url, format='csv', 91 | post_task=aggregator_task.run, 92 | **pipeline_options) 93 | pipeline_instance.run() 94 | result = self.read_file_contents(aggregator_task.result_file)[-1] 95 | 96 | self.assertEqual(int(result['score']), 0) 97 | 98 | def read_file_contents(self, file_name): 99 | """Return file contents as list of dicts""" 100 | 101 | contents = [] 102 | with compat.UnicodeDictReader(file_name) as src_file: 103 | for line in src_file: 104 | contents.append(line) 105 | return contents 106 | -------------------------------------------------------------------------------- /tests/tasks/test_assess_performance.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import unittest 8 | import os 9 | from .test_task import TestTask 10 | from data_quality import tasks, utilities, compat 11 | 12 | class TestPerformanceAssessorTask(TestTask): 13 | """Test the PerformanceAssessor task""" 14 | 15 | def test_performance_created(self): 16 | """Test that PerformanceAssessor task creates the performance file""" 17 | 18 | config = self.config 19 | assess_performance_task = tasks.PerformanceAssessor(config) 20 | assess_performance_task.run() 21 | self.assertTrue(os.path.exists(assess_performance_task.performance_file)) 22 | 23 | def test_performance_calculation(self): 24 | """Test that PerformanceAssessor task calculates performance correctly""" 25 | 26 | config = self.config 27 | assess_performance_task = tasks.PerformanceAssessor(config) 28 | assess_performance_task.run() 29 | test_dict = {'files_count_to_date': '1', 'valid_to_date': '100', 30 | 'score_to_date': '100', 'score': '100', 31 | 'month_of_creation': '2015-01-01', 'publisher_id': 'xx_dept1', 32 | 'valid': '100', 'files_count': '1'} 33 | with compat.UnicodeDictReader(assess_performance_task.performance_file) as pf: 34 | self.assertGreater(self.find_in_sequence(pf, test_dict), -1) 35 | 36 | def find_in_sequence(self, sequence, target): 37 | """Find `target` in `sequence`""" 38 | 39 | found = False 40 | for position, value in enumerate(sequence): 41 | if value == target: 42 | found = True 43 | break 44 | if not found: 45 | return -1 46 | return position 47 | -------------------------------------------------------------------------------- /tests/tasks/test_extract_relevance_period.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import unittest 8 | import datetime 9 | from data_quality import exceptions 10 | from data_quality.tasks.extract_relevance_period import RelevancePeriodExtractor 11 | from .test_task import TestTask 12 | 13 | class 
TestRelevancePeriodExtractor(TestTask): 14 | """Test the RelevancePeriodExtractor task""" 15 | 16 | def test_extract_dates(self): 17 | """Test the date extraction""" 18 | 19 | self.maxDiff = None 20 | examples = ['Transparency Data 1 to 30 April 2014', 21 | 'July 2011 return with descriptions', 22 | 'DH-May-2010-amnd4', 23 | 'April 2010 to December 2013', 24 | '2010 October Return', 25 | 'MOD\'s spending over £25,000 for August2014', 26 | 'jncc-spend-over-25k-2012-01', 27 | '12_03_15_data', 28 | 'Over_%C2%A325K_april_2014', 29 | 'Transparency_Sept2014_Final.csv', 30 | 'August - September 2015', 31 | '20-12-2015/21-01-2016', 32 | '17/07/2014 - 17/08/2014'] 33 | expected = [[datetime.datetime(2014,4,1), datetime.datetime(2014,4,30)], 34 | [datetime.datetime(2011,7,31)], 35 | [datetime.datetime(2010,5,31)], 36 | [datetime.datetime(2010,4,30), datetime.datetime(2013,12,31)], 37 | [datetime.datetime(2010,10,31)], 38 | [datetime.datetime(2014,8,31)], 39 | [datetime.datetime(2012,1,31)], 40 | [datetime.datetime(2015,3,12)], 41 | [datetime.datetime(2014,4,30)], 42 | [datetime.datetime(2014,9,30)], 43 | [datetime.datetime(2015,8,31), datetime.datetime(2015,9,30)], 44 | [datetime.datetime(2015,12,20), datetime.datetime(2016,1,21)], 45 | [datetime.datetime(2014,7,17), datetime.datetime(2014,8,17)]] 46 | 47 | self.config['timeliness']['timeliness_strategy'] = ['title', 'data'] 48 | results = [] 49 | extractor = RelevancePeriodExtractor(self.config) 50 | for line in examples: 51 | dates = extractor.extract_dates(line) 52 | results.append(dates) 53 | for index, result in enumerate(results): 54 | results[index] = sorted([extracted_date['date_obj'] 55 | for extracted_date in result]) 56 | 57 | self.assertSequenceEqual(results, expected) 58 | 59 | def test_resolve_period(self): 60 | """Test that a period is extracted and formated properly""" 61 | 62 | sources = [{ 63 | 'title': 'MOD spending over £500 on a GPC and spending over £25,000, April 2010 to December 2013/December 2012 MOD GPC spend', 64 | 'data': 'https://www.gov.uk/government/uploads/GPC_transparency_data_travel_stationery_contracts_dec2012.csv' 65 | }, 66 | { 67 | 'title': 'Spend over £25,000 in Natural England/July 2011 return', 68 | 'data': 'http://data.defra.gov.uk/ops/procurement/1107/ne-over-25k-1107.csv' 69 | }, 70 | { 71 | 'title': 'Spending over £25,000, April 2010 to December 2013/1 to 29 February 2012 GPC spend', 72 | 'data': 'https://www.gov.uk/government/uploads/attachment_data/file/28883/GPCTRANSPARENCYDATA1FEBRUARYTO29FEBRUARY2012includingdescriptions.csv' 73 | }] 74 | 75 | expected = [(datetime.datetime(2012,12,1), datetime.datetime(2012,12,31)), 76 | (datetime.datetime(2011,7,1), datetime.datetime(2011,7,31)), 77 | # This will not be found because the title is uncertain and the file name doesn't have delimitators 78 | None] 79 | 80 | self.config['timeliness']['timeliness_strategy'] = ['title', 'data'] 81 | results = [] 82 | extractor = RelevancePeriodExtractor(self.config) 83 | for source in sources: 84 | results.append(extractor.identify_period(source)) 85 | 86 | self.assertSequenceEqual(results, expected) 87 | 88 | def test_run_raises_if_field_not_provided(self): 89 | """Test that RelevancePeriodExtractor raises if the field in timeliness_strategy 90 | doesn't exist in source_file 91 | """ 92 | 93 | self.config['assess_timeliness'] = True 94 | self.config['timeliness']['timeliness_strategy'] = ['period_id'] 95 | extractor = RelevancePeriodExtractor(self.config) 96 | self.assertRaisesRegexp(ValueError, 'timeliness_strategy', 
extractor.run)
97 | 
98 | def test_run_raises_if_insufficient_period(self):
99 | """Tests that RelevancePeriodExtractor raises if sources without `period_id`
100 | make up over 10% of total sources
101 | """
102 | 
103 | self.config['assess_timeliness'] = True
104 | self.config['timeliness']['timeliness_strategy'] = ['title', 'data']
105 | extractor = RelevancePeriodExtractor(self.config)
106 | self.assertRaises(exceptions.UnableToAssessTimeliness, extractor.run)
107 | 
--------------------------------------------------------------------------------
/tests/tasks/test_generate.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | import unittest
8 | import os
9 | import io
10 | import json
11 | from data_quality import tasks, utilities, compat, generators
12 | from tests import mock_generator
13 | from .test_task import TestTask
14 | 
15 | class TestGeneratorManagerTask(TestTask):
16 | """Test the GeneratorManager task"""
17 | 
18 | def test_generate_built_in_generator(self):
19 | """Test that GeneratorManager task loads a built-in generator"""
20 | 
21 | generator = tasks.GeneratorManager(self.config)
22 | generator_class = generator.run('ckan', 'endpoint', '',
23 | file_types=['csv','excel'], simulate=True)
24 | 
25 | self.assertIsInstance(generator_class, generators.CkanGenerator)
26 | 
27 | def test_generate_custom_generator(self):
28 | """Test that GeneratorManager task loads a custom generator"""
29 | 
30 | generator = tasks.GeneratorManager(self.config)
31 | generator_path = 'tests.mock_generator.MockGenerator'
32 | generator_class = generator.run('mock', 'endpoint', generator_path,
33 | None, simulate=True)
34 | 
35 | self.assertIsInstance(generator_class, mock_generator.MockGenerator)
36 | 
37 | def test_generate_update_datapackage_sources(self):
38 | """Test that GeneratorManager task updates datapackage sources"""
39 | 
40 | def empty_datapackage_sources(datapkg_path, datapkg):
41 | with io.open(datapkg_path, mode='w+', encoding='utf-8') as datapkg_file:
42 | datapkg.descriptor['sources'] = []
43 | updated_json = json.dumps(datapkg.to_dict(), indent=4, sort_keys=True)
44 | datapkg_file.write(compat.str(updated_json))
45 | 
46 | generator = tasks.GeneratorManager(self.config)
47 | datapkg_path = os.path.join(generator.datapackage.base_path,
48 | 'datapackage.json')
49 | empty_datapackage_sources(datapkg_path, generator.datapackage)
50 | generator.update_datapackage_sources()
51 | second_generator = tasks.GeneratorManager(self.config)
52 | 
53 | self.assertEqual(generator.datapackage.descriptor['sources'],
54 | second_generator.datapackage.descriptor['sources'])
55 | self.assertGreater(len(generator.datapackage.descriptor['sources']), 0)
56 | empty_datapackage_sources(datapkg_path, generator.datapackage)
--------------------------------------------------------------------------------
/tests/tasks/test_initialize_datapackage.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | import unittest
8 | import os
9 | from data_quality import tasks, utilities, compat
10 | 
11 | 
12 | class TestDataPackageInitializer(unittest.TestCase):
13 | """Test the DataPackageInitializer task"""
14 | 
15 | def setUp(self):
16 | self.workspace_path = './tests/tmp_datapackage/'
17 | utilities.resolve_dir(self.workspace_path)
18 | 
19 | def tearDown(self):
20 | utilities.set_up_cache_dir(self.workspace_path)
21 | os.rmdir(self.workspace_path)
22 | 
23 | def test_config_initialized(self):
24 | """Test that DataPackageInitializer generates a config file if there isn't one"""
25 | 
26 | initializer = tasks.DataPackageInitializer(self.workspace_path)
27 | initializer.initialize_config()
28 | self.assertTrue(os.path.exists(os.path.join(self.workspace_path, 'dq_config.json')))
29 | 
30 | def test_run(self):
31 | """Test that DataPackageInitializer generates a 'datapackage.json' file"""
32 | 
33 | initializer = tasks.DataPackageInitializer(self.workspace_path)
34 | initializer.run()
35 | self.assertTrue(os.path.exists(os.path.join(self.workspace_path, 'datapackage.json')))
36 | 
37 | 
--------------------------------------------------------------------------------
/tests/tasks/test_task.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | import unittest
8 | import os
9 | from data_quality import utilities
10 | 
11 | class TestTask(unittest.TestCase):
12 | """Base class for task tests"""
13 | 
14 | def setUp(self):
15 | """Load the fixture config"""
16 | 
17 | config_filepath = os.path.join('tests', 'fixtures', 'dq.json')
18 | config = utilities.load_json_config(config_filepath)
19 | self.config = config
20 | 
--------------------------------------------------------------------------------
/tests/tasks/tests_check_datapackage.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | import unittest
8 | import os
9 | import datapackage
10 | from data_quality import tasks, utilities, compat
11 | from .test_task import TestTask
12 | 
13 | class TestDataPackageChecker(TestTask):
14 | """Test the DataPackageChecker task"""
15 | 
16 | def test_lacking_required_field(self):
17 | """Test that DataPackageChecker raises if required field is missing"""
18 | 
19 | filename = 'datapackage_missing_required.json'
20 | self.config['datapackage_file'] = os.path.join('tests', 'fixtures', filename)
21 | checker = tasks.check_datapackage.DataPackageChecker(self.config)
22 | default_datapkg = utilities.get_default_datapackage()
23 | self.assertRaisesRegexp(ValueError, 'miss', checker.check_resource_schema,
24 | default_datapkg.resources[0], checker.datapackage.resources[0])
25 | 
26 | def test_run(self):
27 | """Test that DataPackageChecker raises if required resource is missing"""
28 | 
29 | filename = 'datapackage_missing_required.json'
30 | self.config['datapackage_file'] = os.path.join('tests', 'fixtures', filename)
31 | checker = tasks.check_datapackage.DataPackageChecker(self.config)
32 | self.assertRaisesRegexp(ValueError, 'found', checker.run)
33 | 
34 | def test_database_content(self):
35 | """Test that DataPackageChecker raises if a required file from the database
36 | doesn't respect the schema described in datapackage
37 | """
38 | 
39 | checker = tasks.check_datapackage.DataPackageChecker(self.config)
40 | 
self.assertRaisesRegexp(ValueError, 'schema', checker.check_database_content) -------------------------------------------------------------------------------- /tests/test_interface.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | import unittest 9 | import subprocess 10 | import data_quality 11 | 12 | class TestDataQualityCLI(unittest.TestCase): 13 | 14 | def test_cli_run(self): 15 | config_path = os.path.join('tests', 'fixtures', 'dq.json') 16 | c = ['python', '-m', 'data_quality.main', 'run', config_path] 17 | subprocess.check_output(c) 18 | -------------------------------------------------------------------------------- /tests/test_utilities.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import unittest 8 | import os 9 | from data_quality import utilities 10 | import datapackage 11 | 12 | class TestUtilities(unittest.TestCase): 13 | 14 | def test_that_config_is_correctly_loaded(self): 15 | config_filepath = os.path.join('tests', 'fixtures', 'dq.json') 16 | config = utilities.load_json_config(config_filepath) 17 | self.assertTrue(os.path.isabs(config['data_dir'])) 18 | 19 | def test_default_datapackage_loaded(self): 20 | datapackage = utilities.get_default_datapackage() 21 | self.assertGreater(len(datapackage.resources), 0) 22 | 23 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | package=data_quality 3 | skip_missing_interpreters=true 4 | envlist= 5 | py27 6 | py33 7 | py34 8 | py35 9 | 10 | [testenv] 11 | deps= 12 | mock 13 | pytest 14 | pytest-cov 15 | coverage 16 | datapackage 17 | passenv= 18 | CI 19 | TRAVIS 20 | TRAVIS_JOB_ID 21 | TRAVIS_BRANCH 22 | commands= 23 | py.test \ 24 | --cov {[tox]package} \ 25 | --cov-config .coveragerc \ 26 | --cov-report term-missing \ 27 | {posargs:tests} 28 | 29 | [pytest] 30 | # pytest configuration here 31 | 32 | [report] 33 | # coverage configuration here 34 | --------------------------------------------------------------------------------