├── .coveragerc ├── .github ├── issue_template.md └── pull_request_template.md ├── .gitignore ├── .travis.yml ├── CONTRIBUTING.md ├── LEAD.md ├── LICENSE.md ├── MANIFEST.in ├── Makefile ├── README.md ├── data_quality ├── VERSION ├── __init__.py ├── compat.py ├── datapackage.default.json ├── dq.default.json ├── exceptions.py ├── generators │ ├── __init__.py │ ├── base.py │ └── ckan.py ├── main.py ├── tasks │ ├── __init__.py │ ├── aggregate.py │ ├── assess_performance.py │ ├── base_task.py │ ├── check_datapackage.py │ ├── deploy.py │ ├── extract_relevance_period.py │ ├── generate.py │ └── initialize_datapackage.py └── utilities.py ├── dq-config.example.json ├── pylintrc ├── setup.py ├── tests ├── __init__.py ├── fixtures │ ├── datapackage.json │ ├── datapackage_missing_required.json │ ├── datapackage_sources_with_period.json │ ├── dq.json │ ├── fetched │ │ ├── empty_rows_multiple.csv │ │ └── valid.csv │ ├── performance.csv │ ├── publishers.csv │ ├── results.csv │ ├── runs.csv │ ├── sources.csv │ └── sources_with_period_id.csv ├── mock_generator.py ├── tasks │ ├── __init__.py │ ├── test_aggregate.py │ ├── test_assess_performance.py │ ├── test_extract_relevance_period.py │ ├── test_generate.py │ ├── test_initialize_datapackage.py │ ├── test_task.py │ └── tests_check_datapackage.py ├── test_interface.py └── test_utilities.py └── tox.ini /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | */docs* 4 | */tests* 5 | */examples* 6 | */requirements* 7 | setup.py 8 | 9 | [xml] 10 | output = shippable/codecoverage/coverage.xml 11 | 12 | [report] 13 | # Regexes for lines to exclude from consideration 14 | exclude_lines = 15 | # Don't complain about missing debug-only code: 16 | def __repr__ 17 | if self\.debug 18 | 19 | # Don't complain if tests don't hit defensive assertion code: 20 | raise AssertionError 21 | raise NotImplementedError 22 | 23 | # Don't complain if non-runnable code isn't run: 24 | if 0: 25 | if False: 26 | if __name__ == .__main__.: 27 | -------------------------------------------------------------------------------- /.github/issue_template.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | Please replace this line with full information about your idea or problem. If it's a bug share as much as possible to reproduce it 4 | 5 | --- 6 | 7 | Please preserve this line to notify @roll (lead of this repository) 8 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | Please replace this line with full information about your pull request. 
Make sure that tests pass before publishing it 4 | 5 | --- 6 | 7 | Please preserve this line to notify @roll (lead of this repository) 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | # Node 60 | node_modules/ 61 | 62 | # Virtualenv 63 | venv/ 64 | venv2/ 65 | venv3/ 66 | 67 | # Tmux 68 | .tmuxp.yml 69 | 70 | # Project 71 | tmp 72 | .projectile 73 | .idea 74 | *.sublime-project 75 | *.sublime-workspace 76 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: 2 | false 3 | 4 | language: 5 | python 6 | 7 | python: 8 | - 2.7 9 | - 3.3 10 | - 3.4 11 | - 3.5 12 | 13 | env: 14 | global: 15 | - TOXENV="py${PYTHON_VERSION//./}" 16 | 17 | install: 18 | - make develop 19 | - pip install coveralls 20 | 21 | script: 22 | # - make lint 23 | - make test 24 | 25 | after_success: 26 | - coveralls 27 | 28 | deploy: 29 | provider: pypi 30 | user: okfn 31 | on: 32 | tags: true 33 | password: 34 | secure: Iuf7V4+XHL6wwFYt4IyEe0vWLGO/uOpMJWQnO+1eUjmcQ1qi4E9vyEJvsJRzWKm5+/Lv9uFIRGlmpNWQzUPs5VnMc3LEBh7Clv/WIlRGvi+omCeWoEPAPUueF8qjBcvpT37QNzjB5QXJY074uAihmKh/DU2xA4K0yCB8YQefBHYeNBl0pNYVnELUW8BFmz0GE0lTwHOnM681vgR01LdPjrgIHVEvnTZkKYtDXc/cwkw610fqrFS10srnTX6KjjC/pgDm4WSuaUxbPycmriIhZR29QgAx24NO/wrdGdp5H8TIsvBFnNFlC4QuHfwiXdAKpjL6cMu2uMo639Sev/484XxTorg2QQvNhNAJtiESVAaqVviAlmUItGdmsw4xhZb0JK6NC8fOuOoccL4DBD6JtCyGurwSpznuGXh1DQUYZ7fTd5qaUDnzBuhYGc8XDvcj14XU4P5OKES4NdruRVJOwFiNSMOAT6wm8b2Ue6N+FvgsghjwUr9ESKBrPj0VoouC2+FGZWT65vt/3R9PhFuBdC6SgMLWHESBuU5GW9Bc2ucS3HUi+uUV1IGjpfIsc3qifojNJiaU7hSAggJs9QlXd7goH2fKhb9ro2klzcDKmpBLXmMk3uH0QRpv1dGUYFtgGeEFN93vP3cxYsXf8OvV+MuCxYYGgrGZu3h8fvbc5hY= 35 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | The project follows the [Open Knowledge International coding standards](https://github.com/okfn/coding-standards). 4 | 5 | ## Getting Started 6 | 7 | Recommended way to get started is to create and activate a project virtual environment. 
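For example, with the standard library `venv` module (any virtual environment tool will do):

```
$ python3 -m venv venv
$ source venv/bin/activate
```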
8 | To install package and development dependencies into active environment: 9 | 10 | ``` 11 | $ make develop 12 | ``` 13 | 14 | ## Linting 15 | 16 | To lint the project codebase: 17 | 18 | ``` 19 | $ make lint 20 | ``` 21 | 22 | Under the hood `pylint` configured in `pylintrc` is used. On this stage it's already 23 | installed into your environment and could be used separately with more fine-grained control 24 | as described in documentation - https://www.pylint.org/. 25 | 26 | For example to check only errors: 27 | 28 | ``` 29 | $ pylint -E 30 | ``` 31 | 32 | ## Testing 33 | 34 | To run tests with coverage: 35 | 36 | ``` 37 | $ make test 38 | ``` 39 | Under the hood `tox` powered by `py.test` and `coverage` configured in `tox.ini` is used. 40 | It's already installed into your environment and could be used separately with more fine-grained control 41 | as described in documentation - https://testrun.org/tox/latest/. 42 | 43 | For example to check subset of tests against Python 2 environment with increased verbosity. 44 | All positional arguments and options after `--` will be passed to `py.test`: 45 | 46 | ``` 47 | tox -e py27 tests/ -- -v 48 | ``` 49 | -------------------------------------------------------------------------------- /LEAD.md: -------------------------------------------------------------------------------- 1 | roll 2 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Open Knowledge 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | global-include VERSION 2 | include LICENSE.md 3 | include Makefile 4 | include pylintrc 5 | include README.md 6 | include tox.ini 7 | include datapackage.default.json 8 | include dq.default.json 9 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all develop list lint release test version 2 | 3 | 4 | PACKAGE := $(shell grep '^PACKAGE =' setup.py | cut -d "'" -f2) 5 | VERSION := $(shell head -n 1 $(PACKAGE)/VERSION) 6 | LEAD := $(shell head -n 1 LEAD.md) 7 | 8 | 9 | all: list 10 | 11 | develop: 12 | pip install --upgrade -e .[develop] 13 | 14 | list: 15 | @grep '^\.PHONY' Makefile | cut -d' ' -f2- | tr ' ' '\n' 16 | 17 | lint: 18 | pylint $(PACKAGE) 19 | 20 | readme: 21 | pip install md-toc 22 | md_toc -p README.md github --header-levels 3 23 | sed -i '/(#tableschema-spss-py)/,+2d' README.md 24 | 25 | release: 26 | git checkout master 27 | git pull origin 28 | git fetch -p 29 | git commit -a -m 'v$(VERSION)' 30 | git tag -a v$(VERSION) -m 'v$(VERSION)' 31 | git push --follow-tags 32 | 33 | templates: 34 | sed -i -E "s/@(\w*)/@$(LEAD)/" .github/issue_template.md 35 | sed -i -E "s/@(\w*)/@$(LEAD)/" .github/pull_request_template.md 36 | 37 | test: 38 | tox 39 | 40 | version: 41 | @echo $(VERSION) 42 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/frictionlessdata/data-quality-cli.svg)](https://travis-ci.org/frictionlessdata/data-quality-cli) 2 | [![Coverage Status](https://coveralls.io/repos/frictionlessdata/data-quality-cli/badge.svg)](https://coveralls.io/r/frictionlessdata/data-quality-cli) 3 | 4 | 5 | # Data Quality CLI 6 | 7 | A command line tool that assesses the quality of a set of data sources (e.g.: CSV files of open data published by a government). 8 | 9 | ## What's it about? 10 | 11 | The `dq` (alias: `dataquality`) CLI is a tool to create and manage a [Data Package](http://specs.frictionlessdata.io/data-package/) 12 | from a given source of data that can be used by [Data Quality Dashboard](https://github.com/frictionlessdata/data-quality-dashboard). 13 | The quality assessment is done using [GoodTables](http://goodtables.readthedocs.io/en/latest/index.html) and [can be configured](#quality-config). 14 | 15 | The proposed workflow is this: 16 | 17 | * An administrator creates a folder for a given project which will be equivalent to a data package. 18 | * The administrator runs the [`dq init`](#init) command to create templates for the configuration file 19 | and the `datapackage.json` file along with the folder structure. 20 | * The administrator updates the [configuration file](#config) to reflect the structure of the data package 21 | and optionally to [configure the quality assessment](#quality-config). 22 | * The administrator updates the `datapackage.json` file with information specific to the project 23 | and other customizations. 24 | * The administrator creates a `source_file` and a `publisher_file`: 25 | * By using the [generate command](#generate). 26 | * By using custom scripts ([see this example](https://github.com/okfn/data-quality-uk-25k-spend)). 
27 | * In any other way that is in sync with the [schema](#schema). 28 | * The administrator [runs the validation](#run) over the set of sources. 29 | * The data is managed in a git repository (or other version control system), which the administrator has locally 30 | * The administrator [deploys](#deploy) the data package to a central data repository (ex: GitHub) 31 | * The administrator [updates the configuration](https://github.com/frictionlessdata/data-quality-dashboard#configure-database) 32 | of the corresponding Data Quality Dashboard instance 33 | * The administrator, or possibly a content editor, occasionally updates 34 | the `source_file` file in the data directory with new data 35 | * Periodically (once a month, once a quarter), the administrator runs 36 | `dq run /path/to/config.json --deploy`. This builds a new set of results for the data, 37 | and deploys the updated data back to the central data repository 38 | * Since Data Quality Dashboard is a pure client-side application, as soon as updated 39 | data is deployed, the app will start working with the updated data. 40 | 41 | ## Install 42 | 43 | ``` 44 | pip install git+https://github.com/frictionlessdata/data-quality-cli.git 45 | ``` 46 | 47 | ## Use 48 | 49 | ``` 50 | dq --help 51 | ``` 52 | 53 | ### Init 54 | 55 | Before you start building the database, it is recommended that you run: 56 | 57 | 58 | ``` 59 | dq init --folder_path /path/to/future/datapackage 60 | ``` 61 | 62 | This command will potentially spare you some effort and create a `dq_config.json` file 63 | with the default configuration for Data Quality CLI, a `datapackage.json` with the default 64 | info about the data package and schemas for all the required resources, a `data` folder 65 | that will be used to store the database and a `fetched` folder that will store the 66 | fetched sources. If you'd like to change the names of these folders or other configuration 67 | options, you can make a `dq_config.json` file before running the command. The command will 68 | leave your config file as it is and create the others according to your configuration. 69 | 70 | After running it, you should review and update your `dq_config` and `datapackage.json` 71 | with values specific to your project. 72 | 73 | ### Generate 74 | 75 | Generic command: 76 | 77 | ``` 78 | dq generate generator_name http://endpoint_to_data_sources 79 | ``` 80 | 81 | There is currently one built-in generator, for [CKAN](http://ckan.org/) instances. 82 | For example, to generate a database from `data.qld.gov.au`: 83 | 84 | ``` 85 | dq generate ckan https://data.qld.gov.au/ 86 | ``` 87 | 88 | By default, it will include only `CSV` and `excel` (`XLS`, `XLSX`) files. If you want to change that, use 89 | the `--file_type` option. In the example below, we ask for `CSV` and `TXT`: 90 | 91 | ``` 92 | dq generate ckan https://data.qld.gov.au/ --file_type csv --file_type txt 93 | ``` 94 | 95 | If you want to build a custom generator, inherit from the [`data_quality.generators.BaseGenerator`](data_quality/generators/base.py) class and override its methods, as in the sketch below. 96 | To load your custom generator class you need to provide the path to it so that it can be imported via 97 | [importlib.import_module](https://docs.python.org/3/library/importlib.html#importlib.import_module).
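For illustration, here is a minimal sketch of a custom generator. The module and class names match the CLI example below; the endpoint's JSON layout and the field names read from it are invented for the example, while the CSV headers mirror the default `datapackage.json` schemas and `self.base_url` is set by the `BaseGenerator` constructor:

```
# mymodule.py - a sketch only; adapt the endpoint parsing to your own API
import csv

import requests

from data_quality import compat
from data_quality.generators import BaseGenerator


class MyGenerator(BaseGenerator):
    """Build source_file and publisher_file from a hypothetical JSON endpoint."""

    def generate_sources(self, sources_filepath, file_types=['csv', 'excel']):
        response = requests.get(self.base_url)
        response.raise_for_status()
        # Headers follow the default datapackage schema for `source_file`
        headers = ['id', 'publisher_id', 'title', 'data', 'format', 'created_at']
        with compat.UnicodeWriter(sources_filepath, quoting=csv.QUOTE_MINIMAL) as sources:
            sources.writerow(headers)
            for item in response.json():
                # Keep only the file types requested via `--file_type`
                if item.get('format', '').lower() not in file_types:
                    continue
                sources.writerow([item['id'], item['publisher'], item['title'],
                                  item['url'], item['format'], item['created']])

    def generate_publishers(self, publishers_filepath):
        # Headers follow the default datapackage schema for `publisher_file`
        with compat.UnicodeWriter(publishers_filepath, quoting=csv.QUOTE_MINIMAL) as publishers:
            publishers.writerow(['id', 'title'])
            publishers.writerow(['example-publisher', 'Example Publisher'])
```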
98 | You can either provide it in the config, or by using the `--generator_class_path` option: 99 | 100 | ``` 101 | dq generate custom_generator_name endpoint --generator_class_path mymodule.MyGenerator 102 | ``` 103 | 104 | If no config file is provided, the generator will use the [default configuration](###default-configuration) 105 | creating the files in the folder where the command is executed. If you want to change that, use the `--config_file_path` option: 106 | 107 | ``` 108 | dq generate generator_name endpoint --config_file_path path/to/config 109 | ``` 110 | 111 | ### Run 112 | 113 | 114 | ``` 115 | dq run /path/to/config.json --deploy 116 | ``` 117 | 118 | Runs a *data quality assessment* on all data sources in a data repository. 119 | 120 | * Writes aggregated results to the results.csv. 121 | * Writes run meta data to the run.csv. 122 | * If `--deploy` is passed, then also commits, tags and pushes the new changes back to the data repositories central repository. 123 | 124 | ### Deploy 125 | 126 | ``` 127 | dq deploy /path/to/config.json 128 | ``` 129 | 130 | 131 | ### Configuration 132 | 133 | 134 | #### Structure of json config 135 | 136 | ```json 137 | { 138 | # folder that contains the source_file and publisher_file 139 | "data_dir": "data", 140 | 141 | # folder that will store each source as local cache 142 | "cache_dir": "fetched", 143 | 144 | # file that will contain the result for each source 145 | "result_file": "results.csv", 146 | 147 | # file that will contain the report for each collection of sources 148 | "run_file": "runs.csv", 149 | 150 | # file containing the collection of sources that will be analyzed 151 | "source_file": "sources.csv", 152 | 153 | # file containing the publishers of the above mentioned sources 154 | "publisher_file": "publishers.csv", 155 | 156 | # will contain the results for each publisher 157 | "performance_file": "performance.csv", 158 | 159 | "remotes": ["origin"], 160 | "branch": "master", 161 | 162 | # name and path to custom generator (this name should be used when executing the generate command) 163 | "generator": {"my_generator_name": "my_module.MyGenerator" }, 164 | 165 | # whether or not to include timeliness as a dimension of quality assessment 166 | "assess_timeliness": false, 167 | 168 | # timeliness options: 169 | "timeliness": { 170 | 171 | # columns from source_file that should be checked for period detection 172 | "timeliness_strategy": ["column1", "column2"], 173 | 174 | # whether Data Quality CLI should detect period or expect it to be provided 175 | "extract_period": false, 176 | 177 | # maximum percent of sources with empty period allowed 178 | "max_empty_relevance_period": 10, 179 | 180 | # when date is ambiguous, which order should be preffered 181 | "date_order": "DMY", 182 | 183 | # how long after the period_id range is the data still considered timely (in months) 184 | "timeliness_period": 1 185 | } 186 | # options for GoodTables ("http://goodtables.readthedocs.org/en/latest/") 187 | "goodtables": { 188 | 189 | # set base url for the report links 190 | "goodtables_web": "http://goodtables.okfnlabs.org", 191 | 192 | "arguments": { 193 | 194 | # options for pipeline ("http://goodtables.readthedocs.org/en/latest/pipeline.html") 195 | "pipeline": { 196 | 197 | # what processors will analyze every pipeline 198 | "processors": ["structure", "schema"], 199 | 200 | # specify encoding for every pipeline 201 | (use this if all the files have the same encoding) 202 | "encoding": "ISO-8859-2", 203 | 204 | # pass options to 
procesors 205 | "options": { 206 | "schema": {"case_insensitive_headers": true} 207 | } 208 | }, 209 | 210 | # options for batch ("http://goodtables.readthedocs.org/en/latest/batch.html") 211 | "batch": { 212 | 213 | # column from source_file containing path/url to data source 214 | "data_key": "data", 215 | 216 | # column from source_file containing path/url to schema 217 | "schema_key": "schema", 218 | 219 | # column from source_file containing file format (csv, xls) 220 | "format_key": "format", 221 | 222 | # column from source_file containings file encoding 223 | (use this if you want to specify encoding for each source separately) 224 | "encoding_key": "encoding", 225 | 226 | # time in seconds to wait between pipelines 227 | "sleep": 2, 228 | 229 | # execute something after the analysis of a batch is finished 230 | "post_task": "", 231 | 232 | # execute something after the analysis of a pipeline is finished 233 | "pipeline_post_task": "", 234 | } 235 | } 236 | } 237 | } 238 | ``` 239 | 240 | 241 | #### Default config 242 | 243 | 244 | ```json 245 | { 246 | "data_dir": "current_working_directory/data", 247 | "cache_dir": "current_working_directory/fetched", 248 | "result_file": "results.csv", 249 | "run_file": "runs.csv", 250 | "source_file": "sources.csv", 251 | "publisher_file": "publishers.csv", 252 | "performance_file": "performance.csv", 253 | "remotes": ["origin"], 254 | "branch": "master", 255 | "assess_timeliness": false, 256 | "timeliness":{}, 257 | "goodtables": { 258 | "goodtables_web": "http://goodtables.okfnlabs.org", 259 | "arguments": { 260 | "pipeline": {}, 261 | "batch": { 262 | "data_key": "data" 263 | } 264 | } 265 | } 266 | } 267 | ``` 268 | 269 | 270 | #### Quality assessment configuration 271 | 272 | 273 | Currently, Data Quality CLI assesses the quality of a file based on its structure and 274 | by comparing its contents against a schema. This is done using the 275 | [built-in processors](http://goodtables.readthedocs.io/en/latest/cli.html) (a.k.a. validators) 276 | in [GoodTables](http://goodtables.readthedocs.io/en/latest/). 277 | 278 | *Note:* If the files are compressed, they cannot be found at the specified path or the path returns 279 | an HTML page, they will be scored 0. 280 | 281 | If you want to add other criteria for quality assessment, you can 282 | [create a custom processor for GoodTables](http://goodtables.readthedocs.io/en/latest/tutorial.html#implementing-a-custom-processor). 283 | Then include the name of your custom processor in the list passed to the `processors` parameter from [data quality config](###structure-of-json-config): 284 | `"processors": ["structure", "schema", "custom_processor"]`. 285 | You can also exclude processors that you don't want by removing them from the list. 286 | 287 | ##### Structure Processor: 288 | 289 | Checks the structure of a tabular file. 290 | 291 | Ex: blank or duplicate rows, rows that have more/less columns than the header, bad formatting etc. 292 | 293 | Options and their defaults: 294 | 295 | * `ignore_empty_rows: false` - Should empty rows be considered errors or just ignored? 296 | * `ignore_duplicate_rows: false` - Should duplicate rows be considered errors or just ignored? 297 | * `ignore_empty_columns: false` 298 | * `ignore_duplicate_columns: false` 299 | * `ignore_headerless_columns: false` - Should values in a row that don't correspond to a column be ignored? 
300 | * `empty_strings: None` - A list/set of values that should be considered empty strings; otherwise only `''` will be treated as empty 301 | 302 | 303 | ##### Schema Processor: 304 | 305 | Compares the content of a tabular file against a [JSON Table Schema](http://specs.frictionlessdata.io/table-schema/). 306 | You have the following options for the schema: 307 | 308 | 1. Provide a path to the schema for each source in `source_file` and [set the "schema_key"](#config) to the name 309 | of the column that contains it 310 | 2. Let GoodTables infer the schema for each file from its first few lines (less transparent). 311 | 312 | Options and defaults: 313 | 314 | * `ignore_field_order: true` - Should columns have the same order as in the schema? 315 | * `infer_schema: false` - Should the schema be inferred? (see above) 316 | * `process_extra_fields: false` - Should fields that are not present in the schema be inferred and checked? 317 | * `case_insensitive_headers: false` - Should headers be matched with the equivalent field names from schema regardless of case? 318 | 319 | *Note:* If you use the schema processor but you don't provide a schema to compare against, the files will be evaluated as having no errors. 320 | 321 | ##### Examples: 322 | To exemplify how using different processors influences the quality assessment, we set up several versions 323 | of the same dataset: UK public spend over £25000. 324 | 325 | [Here](https://uk-25k-structure-only.herokuapp.com/) is a dashboard whose 326 | data quality database is assessed only on `structure`. You can find the database and configuration 327 | [in this repository](https://github.com/georgiana-b/data-quality-uk-25k-spend/tree/uk-25k-spend-structure-only). 328 | 329 | [This alternative version](https://uk-25k-given-schema.herokuapp.com/) 330 | uses both `structure` and `schema` processors, comparing each file against the 331 | [spend publishing schema](https://raw.githubusercontent.com/okfn/goodtables/master/examples/hmt/spend-publishing-schema.json). 332 | It is the official configuration, with its corresponding repository [here](https://github.com/georgiana-b/data-quality-uk-25k-spend/tree/uk-25k-given-schema). 333 | 334 | Lastly, [here is the less predictable version](https://uk-25k-inferred-schema.herokuapp.com/) 335 | that uses both `structure` and `schema`, but it compares files against inferred schemas (i.e. using `infer_schema: true`). Corresponding 336 | database repository [here](https://github.com/georgiana-b/data-quality-uk-25k-spend/tree/uk-25k-spend-inferred-schema). 337 | 338 | ##### Timeliness 339 | An optional criterion for quality assessment is the timeliness of data publication. 340 | We define timeliness as the difference in months between when the data source _should have been published_ 341 | and _when it was published_. If you want to include timeliness in the quality assessment, 342 | set `assess_timeliness: true`. 343 | 344 | "When the data should have been published" is what we call `period_id` and refers to 345 | the period of time the data is relevant for.
There are two options for providing `period_id`: 346 | 347 | - You can provide it for each source and include the column name in the config: 348 | `"timeliness": {"timeliness_strategy": ["column_name"]}` 349 | - Let Data Quality CLI detect the period from certain fields in `source_file` 350 | that are likely to contain it: 351 | 352 | ``` 353 | "timeliness": { 354 | "extract_period": true, 355 | "timeliness_strategy": ["column1", "column2"] 356 | } 357 | ``` 358 | 359 | The order tells Data Quality CLI which field has priority. In this example, 360 | it will try to find something in `column1` and move to `column2` only if 361 | nothing was found. You can specify as many fields as you want. Please note that 362 | if the date is ambiguous, Data Quality CLI will prefer the format `dd-mm-yyyy`. 363 | You can change that with the `date_order` option. For example, 364 | `"timeliness": {"date_order": "MDY"}` will change the preferred order 365 | to `mm-dd-yyyy`. 366 | 367 | Regardless of the method you choose, Data Quality CLI will parse the fields you 368 | provided in `timeliness_strategy`, try to extract a period out of them and write 369 | it in the `source_file`. 370 | 371 | NOTE: If you provide a `period_id` it will be parsed and replaced by one with 372 | the same dates but a different format used throughout the CLI. 373 | 374 | If no `period_id` can be extracted for more than 10% of the sources, Data Quality CLI 375 | will abort timeliness assessment and raise an error. If you want to change that, 376 | set `max_empty_relevance_period` to the desired percent. If the percent of sources 377 | lacking `period_id` doesn't exceed `max_empty_relevance_period`, the value in the 378 | `created_at` column will be used as `period_id` for them. 379 | 380 | By default, a data source is considered timely if no more than a month has passed from 381 | the end of `period_id` until it was published (`created_at`). You can change that with the 382 | `timeliness_period` option by providing a different *number of months*. 383 | Ex: `"timeliness": {"timeliness_period": 3}` means that the data source is timely 384 | if no more than 3 months passed since the end of `period_id`. 385 | The quality score will decrease for every additional month after the period considered timely. 386 | 387 | ### Schema 388 | 389 | `Data Quality CLI` expects the following structure of the project folder, where 390 | the names of files and folders are the ones defined in the json config given to `dq run`: 391 | 392 | ``` 393 | project 394 | │ 395 | └──────data_dir 396 | │   source_file 397 | │   publisher_file 398 | │   run_file 399 | │   result_file 400 | │   performance_file 401 | │ 402 | └───cache_dir 403 | │ 404 | └───datapackage.json 405 | ``` 406 | 407 | The `datapackage.json` file is required in order to make the project 408 | a valid [Data Package](http://specs.frictionlessdata.io/data-package/). If you use 409 | the `dq init` command, it will be automatically generated for you from 410 | [the default datapackage](data_quality/datapackage.default.json). 411 | This file will be needed throughout the app, so you'll need to have it. 412 | Take a look over the [Data Package](http://specs.frictionlessdata.io/data-package/) 413 | specification if you'd like to customize it for your project. 414 | 415 | *Warning:* The `datapackage.json` file is extensively used throughout Data Quality CLI and 416 | the Data Quality Dashboard.
To make sure it is kept in sync with the database that it 417 | describes, several checks are performed at different steps. While you are free to customize 418 | your database by using custom generators and extra fields, 419 | you have to make sure that the fields required by Data Quality CLI to perform it's tasks are present. 420 | 421 | -------------------------------------------------------------------------------- /data_quality/VERSION: -------------------------------------------------------------------------------- 1 | 0.1.1 -------------------------------------------------------------------------------- /data_quality/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | import io 9 | from . import tasks 10 | from . import main 11 | from . import generators 12 | from . import compat 13 | from . import utilities 14 | 15 | def get_version(): 16 | version_path = os.path.join(os.path.dirname(__file__), 'VERSION') 17 | return io.open(version_path, encoding='utf-8').readline().strip() 18 | 19 | __version__ = get_version() 20 | 21 | __all__ = ['main', 'tasks', 'generators', 'compat'] 22 | 23 | -------------------------------------------------------------------------------- /data_quality/compat.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #pylint: skip-file 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | from __future__ import unicode_literals 7 | 8 | import sys 9 | import io 10 | import csv 11 | import os 12 | 13 | 14 | _ver = sys.version_info 15 | is_py2 = (_ver[0] == 2) 16 | is_py3 = (_ver[0] == 3) 17 | is_py33 = (is_py3 and _ver[1] == 3) 18 | is_py34 = (is_py3 and _ver[1] == 4) 19 | is_py27 = (is_py2 and _ver[1] == 7) 20 | 21 | 22 | if is_py2: 23 | from urlparse import urljoin 24 | 25 | builtin_str = str 26 | bytes = str 27 | str = unicode 28 | basestring = basestring 29 | numeric_types = (int, long, float) 30 | 31 | elif is_py3: 32 | from urllib.parse import urljoin 33 | 34 | builtin_str = str 35 | str = str 36 | bytes = bytes 37 | basestring = (str, bytes) 38 | numeric_types = (int, float) 39 | 40 | 41 | def to_bytes(str): 42 | """Convert a text string to a byte string""" 43 | return str.encode('utf-8') 44 | 45 | 46 | def to_builtin_str(str): 47 | """Convert a text string to the built-in `str` on the runtime.""" 48 | if is_py2: 49 | return str.encode('utf-8') 50 | else: 51 | return str 52 | 53 | class UnicodeWriter(object): 54 | """ 55 | This class provides functionality for writing CSV files 56 | in a given encoding, python 2 and 3 compatible 57 | It is a slight adaptation of the code here: 58 | http://python3porting.com/problems.html#csv-api-changes 59 | """ 60 | def __init__(self, filename, 61 | encoding='utf-8', **kw): 62 | self.filename = filename 63 | self.encoding = encoding 64 | self.kw = kw 65 | 66 | def __enter__(self): 67 | if is_py3: 68 | self.f = open(self.filename, 'w+t', 69 | encoding=self.encoding) 70 | else: 71 | self.f = open(self.filename, 'w+b') 72 | self.writer = csv.writer(self.f, lineterminator=os.linesep, **self.kw) 73 | return self 74 | 75 | def __exit__(self, type, value, traceback): 76 | self.f.close() 77 | 78 | def writerow(self, row): 79 | for index, val in 
enumerate(row): 80 | if type(val) not in [str, bytes, builtin_str]: 81 | if val is None: 82 | val = '' 83 | val = str(val) 84 | if is_py2: 85 | val = val.encode(self.encoding) 86 | row[index] = val 87 | self.writer.writerow(row) 88 | 89 | def writerows(self, rows): 90 | for row in rows: 91 | self.writerow(row) 92 | 93 | 94 | class UnicodeAppender(UnicodeWriter): 95 | """ 96 | This class provides functionality for appending to CSV files 97 | in a given encoding, python 2 and 3 compatible 98 | """ 99 | 100 | def __enter__(self): 101 | if is_py3: 102 | self.f = open(self.filename, 'at', 103 | encoding=self.encoding) 104 | else: 105 | self.f = open(self.filename, 'ab') 106 | self.writer = csv.writer(self.f, lineterminator=os.linesep, **self.kw) 107 | return self 108 | 109 | 110 | class UnicodeDictWriter(UnicodeWriter): 111 | """ 112 | This class provides functionality for writing CSV file rows from dicts 113 | in a given encoding, python 2 and 3 compatible 114 | """ 115 | def __init__(self, filename, fieldnames, encoding='utf-8', **kw): 116 | self.fieldnames = fieldnames 117 | super(UnicodeDictWriter, self).__init__(filename, encoding, **kw) 118 | 119 | def writerow(self, row): 120 | for key, val in row.items(): 121 | if type(val) not in [str, bytes, builtin_str]: 122 | if val is None: 123 | val = '' 124 | val = str(val) 125 | if is_py2: 126 | val = val.encode(self.encoding) 127 | row[key] = val 128 | self.writer.writerow([row.get(key, '') for key in self.fieldnames]) 129 | 130 | def writeheader(self): 131 | self.writer.writerow(self.fieldnames) 132 | 133 | 134 | class UnicodeReader(object): 135 | """ 136 | This class provides functionality to read from CSV files 137 | in a given encoding, python 2 and 3 compatible 138 | """ 139 | def __init__(self, filename, encoding='utf-8', **kw): 140 | self.filename = filename 141 | self.encoding = encoding 142 | self.kw = kw 143 | 144 | def __enter__(self): 145 | if is_py3: 146 | self.f = open(self.filename, 'rt', encoding=self.encoding) 147 | else: 148 | self.f = open(self.filename, 'rb') 149 | self.reader = csv.reader(self.f, **self.kw) 150 | return self 151 | 152 | def __exit__(self, type, value, traceback): 153 | self.f.close() 154 | 155 | def next(self): 156 | row = next(self.reader) 157 | if is_py3: 158 | return row 159 | return [s.decode('utf-8') for s in row] 160 | 161 | __next__ = next 162 | 163 | def __iter__(self): 164 | return self 165 | 166 | 167 | class UnicodeDictReader(UnicodeReader): 168 | """ 169 | This class provides functionality to read CSV file rows as dicts 170 | in a given encoding, python 2 and 3 compatible 171 | """ 172 | def __init__(self, filename, encoding='utf-8', **kw): 173 | super(UnicodeDictReader, self).__init__(filename, encoding, **kw) 174 | 175 | def __enter__(self): 176 | if is_py3: 177 | self.f = open(self.filename, 'rt', encoding=self.encoding) 178 | else: 179 | self.f = open(self.filename, 'rb') 180 | self.reader = csv.reader(self.f, **self.kw) 181 | self.header = next(self.reader) 182 | return self 183 | 184 | def next(self): 185 | row = next(self.reader) 186 | if is_py2: 187 | row= [s.decode('utf-8') for s in row] 188 | return {self.header[x]: row[x] for x in range(len(self.header))} 189 | 190 | __next__ = next 191 | 192 | def __iter__(self): 193 | return self 194 | 195 | -------------------------------------------------------------------------------- /data_quality/datapackage.default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "", 3 | "last_modified": 
"", 4 | "validator_url": "https://goodtables.okfnlabs.org/api/run", 5 | "admin": "", 6 | "pitch": "", 7 | "context": "", 8 | "sources": [{"name": "", "web": ""}], 9 | "resources": [ 10 | { 11 | "path": "publishers.csv", 12 | "name": "publisher_file", 13 | "schema": { 14 | "fields": [ 15 | { 16 | "name": "id", 17 | "title": "ID of the publisher", 18 | "type": "string", 19 | "constraints": { "required": true, "unique": true } 20 | }, 21 | { 22 | "name": "title", 23 | "title": "Title or official name of the publisher", 24 | "type": "string", 25 | "constraints": { "required": true, "unique": true } 26 | } 27 | ], 28 | "primaryKey": "id" 29 | } 30 | }, 31 | { 32 | "path": "sources.csv", 33 | "name": "source_file", 34 | "schema": { 35 | "fields": [ 36 | { 37 | "name": "id", 38 | "title": "ID of the source", 39 | "type": "string", 40 | "constraints": { "required": true, "unique": true } 41 | }, 42 | { 43 | "name": "publisher_id", 44 | "title": "ID of the source's publisher", 45 | "type": "string", 46 | "constraints": { "required": true, "unique": true } 47 | }, 48 | { 49 | "name": "title", 50 | "title": "Title or name of the source", 51 | "type": "string", 52 | "constraints": { "required": true } 53 | }, 54 | { 55 | "name": "data", 56 | "title": "Path/url to source", 57 | "type": "string", 58 | "constraints": { "required": true } 59 | }, 60 | { 61 | "name": "format", 62 | "title": "File format of the source", 63 | "type": "string" 64 | }, 65 | { 66 | "name": "created_at", 67 | "title": "Time of the source's creation.", 68 | "type": "string", 69 | "constraints": { "required": true } 70 | } 71 | ], 72 | "primaryKey": "id", 73 | "foreignKeys": [ 74 | { 75 | "fields": "publisher_id", 76 | "reference": { 77 | "resource": "publisher_file", 78 | "fields": "id" 79 | } 80 | } 81 | ] 82 | } 83 | }, 84 | { 85 | "path": "runs.csv", 86 | "name": "run_file", 87 | "schema": { 88 | "fields": [ 89 | { 90 | "name": "id", 91 | "title": "ID of the run", 92 | "type": "string", 93 | "constraints": { "required": true, "unique": true } 94 | }, 95 | { 96 | "name": "timestamp", 97 | "title": "Timestamp of the run execution", 98 | "type": "date", 99 | "format": "datetime", 100 | "constraints": { "required": true } 101 | }, 102 | { 103 | "name": "total_score", 104 | "title": "Rounded average score of results in this run", 105 | "type": "integer", 106 | "constraints": { "required": true} 107 | } 108 | ], 109 | "primaryKey": "id" 110 | } 111 | }, 112 | { 113 | "path": "results.csv", 114 | "name": "result_file", 115 | "schema": { 116 | "fields": [ 117 | { 118 | "name": "id", 119 | "title": "ID of the result", 120 | "type": "string", 121 | "constraints": { "required": true, "unique": true } 122 | }, 123 | { 124 | "name": "source_id", 125 | "title": "ID of the correspoding source", 126 | "type": "string", 127 | "constraints": { "required": true, "unique": true } 128 | }, 129 | { 130 | "name": "publisher_id", 131 | "title": "ID of the source's publisher", 132 | "type": "string", 133 | "constraints": { "required": true} 134 | }, 135 | { 136 | "name": "created_at", 137 | "title": "time of the source's creation.", 138 | "type": "date", 139 | "format": "date", 140 | "constraints": { "required": true } 141 | }, 142 | { 143 | "name": "data", 144 | "title": "Path/url to source", 145 | "type": "string", 146 | "constraints": { "required": true } 147 | }, 148 | { 149 | "name": "schema", 150 | "title": "Path/url to the source's schema", 151 | "type": "string" 152 | }, 153 | { 154 | "name": "score", 155 | "title": "Score of correctness given 
by GoodTables", 156 | "type": "integer", 157 | "contrains": { "required": true } 158 | }, 159 | { 160 | "name": "summary", 161 | "title": "Summary", 162 | "type": "string" 163 | }, 164 | { 165 | "name": "run_id", 166 | "title": "ID of the run in which the result was calculated", 167 | "type": "string", 168 | "constraints": { "required": true, "unique": true } 169 | }, 170 | { 171 | "name": "timestamp", 172 | "title": "Timestamp of the run execution", 173 | "type": "date", 174 | "format": "datetime", 175 | "constraints": { "required": true } 176 | }, 177 | { 178 | "name": "report", 179 | "title": "Path/url to the full GoodTabels report", 180 | "type": "string" 181 | } 182 | ], 183 | "primaryKey": "id", 184 | "foreignKeys": [ 185 | { 186 | "fields": "source_id", 187 | "reference": { 188 | "resource": "source_file", 189 | "fields": "id" 190 | } 191 | }, 192 | { 193 | "fields": "publisher_id", 194 | "reference": { 195 | "resource": "publisher_file", 196 | "fields": "id" 197 | } 198 | }, 199 | { 200 | "fields": "run_id", 201 | "reference": { 202 | "resource": "run_file", 203 | "fields": "id" 204 | } 205 | } 206 | ] 207 | } 208 | }, 209 | { 210 | "path": "performance.csv", 211 | "name": "performance_file", 212 | "schema": { 213 | "fields": [ 214 | { 215 | "name": "publisher_id", 216 | "title": "ID of the publisher", 217 | "type": "string", 218 | "constraints": { "required": true, "unique": true } 219 | }, 220 | { 221 | "name": "month_of_creation", 222 | "title": "Month when the source was created", 223 | "type": "date", 224 | "format": "date", 225 | "constraints": { "required": true } 226 | }, 227 | { 228 | "name": "files_count", 229 | "title": "Number of files published by the publisher during period", 230 | "type": "integer", 231 | "constraints": { "required": true } 232 | }, 233 | { 234 | "name": "score", 235 | "title": "Rounded average score of files published by the publisher during period", 236 | "type": "integer", 237 | "constraints": { "required": true } 238 | }, 239 | { 240 | "name": "valid", 241 | "title": "Number of valid files published by the publisher during period", 242 | "type": "integer", 243 | "constraints": { "required": true } 244 | }, 245 | { 246 | "name": "files_count_to_date", 247 | "title": "Number of files published by the publisher up to period", 248 | "type": "integer", 249 | "constraints": { "required": true } 250 | }, 251 | { 252 | "name": "score_to_date", 253 | "title": "Rounded average score of files published by the publisher up to period", 254 | "type": "integer", 255 | "constraints": { "required": true } 256 | }, 257 | { 258 | "name": "valid_to_date", 259 | "title": "Number of valid files published by the publisher up to period", 260 | "type": "integer", 261 | "constraints": { "required": true } 262 | } 263 | ], 264 | "foreignKeys": [ 265 | { 266 | "fields": "publisher_id", 267 | "reference": { 268 | "resource": "publisher_file", 269 | "fields": "id" 270 | } 271 | } 272 | ] 273 | } 274 | } 275 | ] 276 | } -------------------------------------------------------------------------------- /data_quality/dq.default.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_dir": "data", 3 | "cache_dir": "fetched", 4 | "result_file": "results.csv", 5 | "run_file": "runs.csv", 6 | "source_file": "sources.csv", 7 | "publisher_file": "publishers.csv", 8 | "performance_file": "performance.csv", 9 | "datapackage_file": "datapackage.json", 10 | "remotes": ["origin"], 11 | "branch": "master", 12 | "assess_timeliness": false, 13 | 
"timeliness":{}, 14 | "data_quality_spec": { 15 | "data_quality_spec_web": "https://cdn.rawgit.com/frictionlessdata/data-quality-spec/4d7140394f2d46c5d66f91d4be2bb41477e5f583/spec.json" 16 | }, 17 | "goodtables": { 18 | "goodtables_web": "http://goodtables.okfnlabs.org", 19 | "arguments": { 20 | "pipeline": { 21 | "break_on_invalid_processor": false 22 | }, 23 | "batch": { 24 | "data_key": "data" 25 | } 26 | } 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /data_quality/exceptions.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | 8 | class SourceNotFoundError(Exception): 9 | 10 | def __init__(self, msg=None, source=None): 11 | default_msg = 'The source {0} was not found in \'source_file\''.format(source) 12 | self.msg = msg or default_msg 13 | super(SourceNotFoundError, self).__init__(msg) 14 | 15 | 16 | class DuplicateDataSourceError(Exception): 17 | 18 | def __init__(self, msg=None, source=None): 19 | default_msg = 'Different sources with the same path {0} have been found \ 20 | in \'source_file\''.format(source) 21 | self.msg = msg or default_msg 22 | super(DuplicateDataSourceError, self).__init__(msg) 23 | 24 | class UnableToAssessTimeliness(Exception): 25 | 26 | def __init__(self, msg=None): 27 | default_msg = 'Timeliness cannot be assessed.' 28 | self.msg = msg or default_msg 29 | super(UnableToAssessTimeliness, self).__init__(msg) 30 | 31 | -------------------------------------------------------------------------------- /data_quality/generators/__init__.py: -------------------------------------------------------------------------------- 1 | # # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .ckan import CkanGenerator 8 | from .base import BaseGenerator 9 | 10 | __all__ = ['CkanGenerator', 'BaseGenerator'] 11 | 12 | _built_in_generators = {'ckan': CkanGenerator} 13 | -------------------------------------------------------------------------------- /data_quality/generators/base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | 8 | class BaseGenerator(object): 9 | """This is the base class for generators. 
All generators should inherit.""" 10 | 11 | def __init__(self, url=None, datapackage=None): 12 | 13 | self.base_url = url 14 | self.datapackage = datapackage 15 | 16 | if not self.base_url: 17 | raise TypeError('Cannot generate the database without the "url" parameter.') 18 | 19 | def generate_sources(self, sources_filepath, file_types=['csv', 'excel']): 20 | """Generate sources file for CSV database""" 21 | 22 | raise NotImplementedError('You must overwrite this method with your ' 23 | 'generator\'s specific logic.') 24 | 25 | def generate_publishers(self, publishers_filepath): 26 | """Generate publishers file for CSV database""" 27 | 28 | raise NotImplementedError('You must overwrite this method with your ' 29 | 'generator\'s specific logic.') 30 | -------------------------------------------------------------------------------- /data_quality/generators/ckan.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | import csv 8 | from os import path 9 | import requests 10 | import jsontableschema 11 | from data_quality import compat, utilities 12 | from .base import BaseGenerator 13 | 14 | class CkanGenerator(BaseGenerator): 15 | """This class generates a csv database from a CKAN instance located at the given url""" 16 | 17 | def __init__(self, url=None, datapackage=None): 18 | """Create an instance if the source url is given. 19 | 20 | Args: 21 | url: the base url for the CKAN instance 22 | """ 23 | 24 | super(CkanGenerator, self).__init__(url, datapackage) 25 | self.default_publisher = None 26 | 27 | def generate_sources(self, sources_filepath, file_types=['csv', 'excel']): 28 | """Generates sources_file from the url""" 29 | 30 | file_types = [ftype.lower() for ftype in file_types] 31 | results = self.get_sources() 32 | sources = [] 33 | source_resource = utilities.get_datapackage_resource(sources_filepath, 34 | self.datapackage) 35 | source_schema = jsontableschema.model.SchemaModel(source_resource.descriptor['schema']) 36 | for result in results: 37 | sources += self.extract_sources(result, file_types) 38 | 39 | with compat.UnicodeWriter(sources_filepath, 40 | quoting=csv.QUOTE_MINIMAL) as sfile: 41 | sfile.writerow(source_schema.headers) 42 | for source in sources: 43 | try: 44 | values = [compat.str(source[key]) for key in source_schema.headers] 45 | sfile.writerow(list(source_schema.convert_row(*values))) 46 | except jsontableschema.exceptions.MultipleInvalid as e: 47 | for error in e.errors: 48 | raise error 49 | 50 | def get_sources(self): 51 | """Get all sources from CKAN API as a list""" 52 | 53 | extension = 'api/3/action/package_search' 54 | full_url = compat.urljoin(self.base_url, extension) 55 | response = requests.get(full_url) 56 | response.raise_for_status() 57 | data = response.json() 58 | count = data['result']['count'] 59 | all_packages = [] 60 | all_sources = [] 61 | for start in range(0, count, 500): 62 | payload = {'rows': 500, 'start': start} 63 | response = requests.get(full_url, params=payload) 64 | data = response.json() 65 | all_packages += [result['id'] for result in data['result']['results']] 66 | 67 | for package_id in all_packages: 68 | ext = 'api/3/action/package_show' 69 | full_package_url = compat.urljoin(self.base_url, ext) 70 | package_payload = {'use_default_schema': True, 'id': package_id} 71 | response = 
requests.get(full_package_url, params=package_payload) 72 | data = response.json() 73 | all_sources.append(data['result']) 74 | return all_sources 75 | 76 | def extract_sources(self, datum, file_types): 77 | """Extract all sources for one result""" 78 | 79 | resources = [] 80 | for resource in datum.get('resources', {}): 81 | new_resource = {} 82 | new_resource['data'] = resource['url'] 83 | ext = path.splitext(new_resource['data'])[1][1:].lower() 84 | new_resource['format'] = 'excel' if ext in ['xls', 'xlsx'] else ext 85 | file_types = ['excel' if ext in ['xls', 'xlsx'] else ext for ext in file_types] 86 | file_types.append('') 87 | if new_resource['format'] in file_types: 88 | publisher = datum.get('organization', None) 89 | if publisher: 90 | new_resource['publisher_id'] = publisher.get('name') 91 | else: 92 | self.default_publisher = {'name': 'no_organization', 93 | 'display_name': 'No Organization'} 94 | new_resource['publisher_id'] = self.default_publisher['name'] 95 | new_resource['id'] = resource['id'] 96 | new_resource['created_at'] = resource['created'] 97 | title = datum.get('title', '') 98 | name = resource.get('name', '') 99 | new_resource['title'] = ' / '.join(val for val in [title, name] if val) 100 | resources.append(new_resource) 101 | return resources 102 | 103 | def generate_publishers(self, publishers_filepath): 104 | """Generates publisher_file from the url""" 105 | 106 | results = self.get_publishers() 107 | if self.default_publisher: 108 | results.append(self.default_publisher) 109 | pub_resource = utilities.get_datapackage_resource(publishers_filepath, 110 | self.datapackage) 111 | pub_schema = jsontableschema.model.SchemaModel(pub_resource.descriptor['schema']) 112 | 113 | with compat.UnicodeWriter(publishers_filepath, 114 | quoting=csv.QUOTE_MINIMAL) as pfile: 115 | pfile.writerow(pub_schema.headers) 116 | for result in results: 117 | result = self.extract_publisher(result) 118 | try: 119 | values = [result[key] for key in pub_schema.headers] 120 | pfile.writerow(list(pub_schema.convert_row(*values))) 121 | except jsontableschema.exceptions.MultipleInvalid as e: 122 | for error in e.errors: 123 | raise error 124 | 125 | def get_publishers(self): 126 | """Retrieves the publishers from CKAN API as a list""" 127 | 128 | extension = "api/3/action/organization_list" 129 | payload = {'all_fields':True, 130 | 'include_groups': True, 131 | 'include_extras':True 132 | } 133 | full_url = compat.urljoin(self.base_url, extension) 134 | response = requests.get(full_url, params=payload) 135 | publishers = response.json()['result'] 136 | return publishers 137 | 138 | def extract_publisher(self, result): 139 | """Converts `result` into dict with standard compliant field names""" 140 | 141 | publisher = {} 142 | publisher['id'] = result.get('name', '') 143 | publisher['title'] = result.get('display_name', '') 144 | for extra in result.get('extras', []): 145 | key = extra.get('key') 146 | if key == 'contact-email': 147 | publisher['email'] = extra.get('value') 148 | if key == 'contact-name': 149 | publisher['contact'] = extra.get('value') 150 | if key == 'category': 151 | publisher['type'] = extra.get('value') 152 | return publisher 153 | -------------------------------------------------------------------------------- /data_quality/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from 
__future__ import unicode_literals 6 | 7 | import os 8 | import click 9 | from goodtables import pipeline 10 | from . import tasks, utilities, generators 11 | 12 | @click.group() 13 | def cli(): 14 | """The entry point into the CLI.""" 15 | 16 | @cli.command() 17 | @click.argument('config_file_path') 18 | @click.option('--encoding', default=None) 19 | @click.option('--deploy', is_flag=True) 20 | def run(config_file_path, deploy, encoding): 21 | """Process data sources for a Spend Publishing Dashboard instance.""" 22 | 23 | config = utilities.load_json_config(config_file_path) 24 | utilities.resolve_dir(config['cache_dir']) 25 | utilities.set_up_cache_dir(config['cache_dir']) 26 | source_filepath = os.path.join(config['data_dir'], config['source_file']) 27 | 28 | if config['assess_timeliness'] is True: 29 | extractor = tasks.extract_relevance_period.RelevancePeriodExtractor(config) 30 | extractor.run() 31 | 32 | aggregator = tasks.Aggregator(config) 33 | 34 | if deploy: 35 | 36 | def batch_handler(instance): 37 | aggregator.write_run() 38 | assesser = tasks.PerformanceAssessor(config) 39 | assesser.run() 40 | deployer = tasks.Deployer(config) 41 | deployer.run() 42 | 43 | else: 44 | 45 | def batch_handler(instance): 46 | aggregator.write_run() 47 | assesser = tasks.PerformanceAssessor(config) 48 | assesser.run() 49 | 50 | post_tasks = {'post_task': batch_handler, 'pipeline_post_task': aggregator.run} 51 | config['goodtables']['arguments']['batch'].update(post_tasks) 52 | batch_options = config['goodtables']['arguments']['batch'] 53 | batch_options['pipeline_options'] = config['goodtables']['arguments']['pipeline'] 54 | batch = pipeline.Batch(source_filepath, **batch_options) 55 | batch.run() 56 | 57 | 58 | @cli.command() 59 | @click.argument('config_file_path') 60 | def deploy(config_file_path): 61 | """Deploy data sources for a Spend Publishing Dashboard instance.""" 62 | 63 | config = utilities.load_json_config(config_file_path) 64 | deployer = tasks.Deployer(config) 65 | deployer.run() 66 | 67 | 68 | @cli.command() 69 | @click.argument('generator_name') 70 | @click.argument('endpoint') 71 | @click.option('-cf', '--config_file_path', type=click.Path(exists=True), default=None, 72 | help='Full path to the json config for data-quality-cli') 73 | @click.option('-gp', '--generator_class_path', default=None, 74 | help='Path to your custom generator (Ex: mymodule.CustomGenerator)') 75 | @click.option('-ft', '--file_type', multiple=True, default=['csv','excel'], 76 | help='File types that should be included in sources (default: csv and excel)') 77 | def generate(generator_name, endpoint, config_file_path, generator_class_path, file_type): 78 | """Generate a database from the given endpoint 79 | 80 | Args: 81 | generator_name: Name of the generator (ex: ckan) 82 | endpoint: Url where the generator should get the data from 83 | """ 84 | 85 | file_types = list(file_type) 86 | config = utilities.load_json_config(config_file_path) 87 | if not config_file_path: 88 | default_config_path = os.path.join(os.getcwd(), 'dq_config.json') 89 | config['data_dir'] = utilities.resolve_dir_name(default_config_path, 90 | config['data_dir']) 91 | utilities.resolve_dir(config['data_dir']) 92 | 93 | if generator_name not in generators._built_in_generators.keys(): 94 | generator_class_path = (generator_class_path or 95 | config.get('generator', {}).get(generator_name, None)) 96 | if not generator_class_path: 97 | msg = ('You need to provide the path for your custom generator using the' 98 | '`--generator_class_path` 
option or by providing it in the config:' 99 | 'Ex: {"generator":{"generator_name": "mymodule.CustomGenerator"}}') 100 | raise ValueError(msg) 101 | 102 | generator = tasks.GeneratorManager(config) 103 | generator.run(generator_name, endpoint, generator_class_path, file_types) 104 | generator.update_datapackage_sources() 105 | 106 | 107 | @cli.command() 108 | @click.option('-p', '--folder_path', type=click.Path(exists=True), default=None, 109 | help='Full path to the workspace folder') 110 | def init(folder_path): 111 | 112 | workspace_folder = folder_path 113 | if not workspace_folder: 114 | workspace_folder = os.getcwd() 115 | 116 | initializer = tasks.DataPackageInitializer(workspace_folder) 117 | initializer.run() 118 | 119 | if __name__ == '__main__': 120 | cli() 121 | -------------------------------------------------------------------------------- /data_quality/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .base_task import Task 8 | from .initialize_datapackage import DataPackageInitializer 9 | from .generate import GeneratorManager 10 | from .aggregate import Aggregator 11 | from .deploy import Deployer 12 | from .assess_performance import PerformanceAssessor 13 | 14 | __all__ = ['Task', 'DataPackageInitializer', 'GeneratorManager', 'Aggregator', 15 | 'PerformanceAssessor', 'Deployer'] 16 | -------------------------------------------------------------------------------- /data_quality/tasks/aggregate.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | import io 9 | import csv 10 | import uuid 11 | import pytz 12 | import jsontableschema 13 | from math import log 14 | from datetime import datetime, timedelta 15 | from data_quality import utilities, compat, exceptions 16 | from .base_task import Task 17 | from .check_datapackage import DataPackageChecker 18 | from .extract_relevance_period import RelevancePeriodExtractor 19 | 20 | 21 | class Aggregator(Task): 22 | 23 | """A Task runner to create results for data sources as they move 24 | through a processing pipeline. 
25 | """ 26 | 27 | def __init__(self, config, **kwargs): 28 | super(Aggregator, self).__init__(config, **kwargs) 29 | datapackage_check = DataPackageChecker(self.config) 30 | datapackage_check.run() 31 | run_resource = utilities.get_datapackage_resource(self.run_file, 32 | self.datapackage) 33 | result_resource = utilities.get_datapackage_resource(self.result_file, 34 | self.datapackage) 35 | self.run_schema = jsontableschema.model.SchemaModel(run_resource.descriptor['schema']) 36 | self.result_schema = jsontableschema.model.SchemaModel(result_resource.descriptor['schema']) 37 | self.initialize_file(self.result_file, self.result_schema.headers) 38 | self.initialize_file(self.run_file, self.run_schema.headers) 39 | self.run_id = compat.str(uuid.uuid4().hex) 40 | self.timestamp = datetime.now(pytz.utc) 41 | self.all_scores = [] 42 | self.assess_timeliness = self.config['assess_timeliness'] 43 | self.timeliness_period = self.config['timeliness'].get('timeliness_period', 1) 44 | self.max_score = 100 45 | required_resources = [self.result_file, self.source_file, 46 | self.publisher_file, self.run_file] 47 | datapackage_check.check_database_completeness(required_resources) 48 | self.lookup = self.get_lookup() 49 | 50 | def run(self, pipeline): 51 | """Run on a Pipeline instance.""" 52 | 53 | with compat.UnicodeAppender(self.result_file, quoting=csv.QUOTE_MINIMAL) as result_file: 54 | source = self.get_source(pipeline.data_source) 55 | result_id = compat.str(uuid.uuid4().hex) 56 | source['created_at'] = utilities.date_from_string(source['created_at']) 57 | if source['created_at'] is None: 58 | raise ValueError(('No date could be extracted from `created_at`' 59 | ' field in source: {0}.').format(source)) 60 | score = self.get_pipeline_score(pipeline, source) 61 | data_source = pipeline.data_source 62 | schema = '' 63 | summary = '' # TODO: how/what should a summary be? 64 | report = self.get_pipeline_report_url(pipeline) 65 | 66 | result = [result_id, source['id'], source['publisher_id'], 67 | source['created_at'], data_source, schema, score, 68 | summary, self.run_id, self.timestamp, report] 69 | try: 70 | result_file.writerow(list(self.result_schema.convert_row(*result))) 71 | except jsontableschema.exceptions.MultipleInvalid as e: 72 | for error in e.errors: 73 | raise error 74 | 75 | if pipeline.data: 76 | self.fetch_data(pipeline.data.stream, pipeline.data.encoding, source) 77 | 78 | def get_lookup(self): 79 | 80 | _keys = ['id', 'publisher_id', self.data_key, 'created_at', 'title', 81 | 'period_id'] 82 | lookup = [] 83 | 84 | with compat.UnicodeDictReader(self.source_file) as sources_file: 85 | for row in sources_file: 86 | lookup.append({k: v for k, v in row.items() if k in _keys}) 87 | 88 | return lookup 89 | 90 | def initialize_file(self, filepath, headers): 91 | """"Make sure a file exists and has headers before appending to it. 
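        Illustrative call (a hedged sketch; the constructor above does the
        equivalent): initialize_file(self.result_file, self.result_schema.headers)
        writes the header row only when the file does not exist yet.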
92 | 93 | Args: 94 | filepath: path to the file to be created 95 | headers: a tuple to write as header 96 | 97 | """ 98 | if not os.path.exists(filepath): 99 | with compat.UnicodeWriter(filepath, quoting=csv.QUOTE_MINIMAL) as a_file: 100 | a_file.writerow(headers) 101 | 102 | def write_run(self): 103 | """Write this run in the run file.""" 104 | 105 | with compat.UnicodeAppender(self.run_file, quoting=csv.QUOTE_MINIMAL) as run_file: 106 | entry = [self.run_id, self.timestamp, int(round(sum(self.all_scores) / len(self.lookup)))] 107 | try: 108 | run_file.writerow(list(self.run_schema.convert_row(*entry))) 109 | except jsontableschema.exceptions.MultipleInvalid as e: 110 | for error in e.errors: 111 | raise error 112 | 113 | return True 114 | 115 | def fetch_data(self, data_stream, encoding, source): 116 | """Cache the data source in the /fetched directory""" 117 | 118 | source_name = source.get('name', source[self.data_key].rsplit('/', 1)[-1]) 119 | source_name = source_name or source['id'] 120 | cached_file_name = os.path.join(self.cache_dir, source_name) 121 | data_stream.seek(0) 122 | 123 | with io.open(cached_file_name, mode='w+', encoding=encoding) as fetched_file: 124 | for line in data_stream: 125 | fetched_file.write(line) 126 | 127 | def get_source(self, data_src): 128 | """Find the entry correspoding to data_src from sources file""" 129 | 130 | matches = [match for match in self.lookup if match[self.data_key] == data_src] 131 | 132 | if len(matches) == 0: 133 | raise exceptions.SourceNotFoundError(source=data_src) 134 | elif len(matches) > 1: 135 | for pos in range(len(matches)-1): 136 | first_values = set(matches[pos].values()) 137 | second_values = set(matches[pos+1].values()) 138 | differences = first_values.symmetric_difference(second_values) 139 | if len(differences) != 0: 140 | raise exceptions.DuplicateDataSourceError(source=data_src) 141 | 142 | return matches[0] 143 | 144 | def get_pipeline_report_url(self, pipeline): 145 | """Return a URL to a report on this data.""" 146 | 147 | return self.config['goodtables']['goodtables_web'] 148 | 149 | def get_pipeline_score(self, pipeline, source): 150 | """Return a score for this pipeline run.""" 151 | 152 | score = self.max_score 153 | report = pipeline.report.generate() 154 | error_stats = self.get_error_stats(report) 155 | base_errors = {err: stats for err, stats in error_stats.items() 156 | if stats['processor'] == 'base'} 157 | if base_errors: 158 | score = 0 159 | else: 160 | score = self.score_by_error_occurences(error_stats) 161 | if self.assess_timeliness: 162 | publication_delay = self.get_publication_delay(source) 163 | score -= publication_delay 164 | score = round(score) 165 | if score < 0: 166 | score = 0 167 | self.all_scores.append(score) 168 | return score 169 | 170 | def get_publication_delay(self, source): 171 | """Determine how long the data source publication was delayed""" 172 | 173 | dates = {} 174 | relevance_period = source['period_id'].split('/') 175 | relevance_period = relevance_period + [None]*(2 - len(relevance_period)) 176 | dates['period_start'], dates['period_end'] = relevance_period 177 | dates = {k: utilities.date_from_string(v) for k, v in dates.items()} 178 | dates['period_end'] = dates['period_end'] or dates['period_start'] 179 | timely_until = dates['period_end'] + \ 180 | timedelta(days=(self.timeliness_period * 30)) 181 | if dates['period_start'] <= source['created_at'] <= timely_until: 182 | delay = 0 183 | else: 184 | delay = source['created_at'] - timely_until 185 | delay = delay.days 
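        # Hedged worked example (hypothetical dates): with a period ending
        # 2015-12-31 and timeliness_period=1, timely_until is 2016-01-30; a
        # source created on 2016-03-15 is 45 days late, so 45 / 30.0 = 1.5
        # points are later subtracted from the score in get_pipeline_score.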
186 | if delay < 0: 187 | delay = 0 188 | delay = delay / 30.00 189 | return delay 190 | 191 | def get_error_stats(self, report): 192 | """Return dict with stats on errors""" 193 | 194 | results = report['results'] 195 | dq_spec = utilities.get_data_quality_spec() 196 | error_stats = {} 197 | for result in results: 198 | if result['result_level'] == 'error': 199 | error = error_stats.get(result['result_id'], None) 200 | if not error: 201 | if result['processor'] == 'base': 202 | error_spec = {} 203 | else: 204 | error_number = result['result_id'].split('_')[-1] 205 | error_number = str(int(error_number) - 1) 206 | error_spec = dq_spec[result['processor']][error_number] 207 | new_stats = {'occurrences': 1, 'rows': [result['row_index']], 208 | 'weight': error_spec.get('weight', 1), 209 | 'processor': result['processor']} 210 | error_stats[result['result_id']] = new_stats 211 | else: 212 | error['occurrences'] += 1 213 | error['rows'].append(result['row_index']) 214 | return error_stats 215 | 216 | def score_by_error_occurences(self, error_stats): 217 | """Score data source based on based on number of occurrences of each error 218 | Algorithm: `total score - (error_weight * no_occurrences) / 219 | (Σ 1/no_occurrences )` 220 | 221 | Args: 222 | error_stats: dict with stats on each error 223 | """ 224 | 225 | score = self.max_score 226 | for error, stats in error_stats.items(): 227 | no_occurrences = stats['occurrences'] 228 | harmonic_mean_occ = no_occurrences / harmonic_number(no_occurrences) 229 | error_impact = stats['weight'] * harmonic_mean_occ 230 | score -= error_impact 231 | return score 232 | 233 | def harmonic_number(n): 234 | """Return an approximate value of n-th harmonic number, based on the 235 | Euler-Mascheroni constant by the formula: H(n)≈ln(n)+γ+1/2*n−1/12*n^2 236 | """ 237 | 238 | gamma = 0.57721566490153286 239 | return gamma + log(n) + 0.5/n - 1./(12*n**2) 240 | -------------------------------------------------------------------------------- /data_quality/tasks/assess_performance.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import pytz 8 | import dateutil 9 | import datetime 10 | import jsontableschema 11 | from data_quality import utilities, compat 12 | from .base_task import Task 13 | from .check_datapackage import DataPackageChecker 14 | 15 | 16 | class PerformanceAssessor(Task): 17 | 18 | """A Task runner to assess and write the performance of publishers for each 19 | period. 
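    Rough usage sketch (hedged; mirrors the call in main.run):

        assessor = PerformanceAssessor(config)
        assessor.run()  # rewrites performance_file from publishers, sources and results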
20 | """ 21 | 22 | def __init__(self, *args, **kwargs): 23 | super(PerformanceAssessor, self).__init__(*args, **kwargs) 24 | datapackage_check = DataPackageChecker(self.config) 25 | datapackage_check.run() 26 | required_resources = [self.result_file, self.source_file, 27 | self.publisher_file, self.run_file] 28 | datapackage_check.check_database_completeness(required_resources) 29 | 30 | def run(self): 31 | """Write the performance for all publishers.""" 32 | 33 | publisher_ids = self.get_publishers() 34 | performance_resource = utilities.get_datapackage_resource(self.performance_file, 35 | self.datapackage) 36 | performance_schema = jsontableschema.model.SchemaModel(performance_resource.descriptor['schema']) 37 | 38 | with compat.UnicodeWriter(self.performance_file) as performance_file: 39 | performance_file.writerow(performance_schema.headers) 40 | available_periods = [] 41 | 42 | for publisher_id in publisher_ids: 43 | sources = self.get_sources(publisher_id) 44 | periods = self.get_unique_periods(sources) 45 | available_periods += periods 46 | all_periods = self.get_all_periods(available_periods) 47 | 48 | publishers_performances = [] 49 | all_sources = [] 50 | 51 | for publisher_id in publisher_ids: 52 | sources = self.get_sources(publisher_id) 53 | performances = self.get_periods_data(publisher_id, all_periods, 54 | sources) 55 | publishers_performances += performances 56 | all_sources += sources 57 | for row in utilities.dicts_to_schema_rows(performances, 58 | performance_schema): 59 | performance_file.writerow(row) 60 | 61 | all_performances = self.get_periods_data('all', all_periods, all_sources) 62 | for row in utilities.dicts_to_schema_rows(all_performances, 63 | performance_schema): 64 | performance_file.writerow(row) 65 | 66 | def get_publishers(self): 67 | """Return list of publishers ids.""" 68 | 69 | publisher_ids = [] 70 | 71 | with compat.UnicodeDictReader(self.publisher_file) as publishers_file: 72 | for row in publishers_file: 73 | publisher_ids.append(row['id']) 74 | return publisher_ids 75 | 76 | def get_sources(self, publisher_id): 77 | """Return list of sources of a publisher with id, period and score. """ 78 | 79 | sources = [] 80 | 81 | with compat.UnicodeDictReader(self.source_file) as sources_file: 82 | for row in sources_file: 83 | source = {} 84 | if row['publisher_id'] == publisher_id: 85 | source['id'] = row['id'] 86 | source['created_at'] = utilities.date_from_string(row['created_at']) 87 | source['score'] = self.get_source_score(source['id']) 88 | sources.append(source) 89 | return sources 90 | 91 | def get_source_score(self, source_id): 92 | """Return latest score of a source from results. 93 | 94 | Args: 95 | source_id: id of the source whose score is wanted 96 | """ 97 | 98 | score = 0 99 | latest_timestamp = pytz.timezone('UTC').localize(datetime.datetime.min) 100 | 101 | with compat.UnicodeDictReader(self.result_file) as result_file: 102 | for row in result_file: 103 | if row['source_id'] == source_id: 104 | timestamp = dateutil.parser.parse(row['timestamp']) 105 | if timestamp > latest_timestamp: 106 | latest_timestamp = timestamp 107 | score = int(row['score']) 108 | return score 109 | 110 | def get_periods_data(self, publisher_id, periods, sources): 111 | """Return list of performances for a publisher, by period. 
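        Each item in the returned list is one performance row; a hedged sketch
        with hypothetical values:

            {'publisher_id': 'pub-1', 'month_of_creation': '2016-01-01',
             'files_count': 2, 'score': 87, 'valid': 50,
             'files_count_to_date': 5, 'score_to_date': 90, 'valid_to_date': 60}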
112 | 113 | Args: 114 | publisher_id: publisher in dicussion 115 | periods: list of all available_periods 116 | sources: list of publisher's sources 117 | 118 | """ 119 | 120 | performances = [] 121 | period_sources_to_date = [] 122 | 123 | for period in periods: 124 | period_sources = self.get_period_sources(period, sources) 125 | period_sources_to_date += period_sources 126 | performance = {} 127 | performance['publisher_id'] = publisher_id 128 | performance['month_of_creation'] = compat.str(period) 129 | performance['files_count'] = len(period_sources) 130 | performance['score'] = self.get_period_score(period_sources) 131 | performance['valid'] = self.get_period_valid(period_sources) 132 | performance['score_to_date'] = self.get_period_score(period_sources_to_date) 133 | performance['valid_to_date'] = self.get_period_valid(period_sources_to_date) 134 | performance['files_count_to_date'] = len(period_sources_to_date) 135 | performances.append(performance) 136 | return performances 137 | 138 | def get_period_sources(self, period, sources): 139 | """Return list of sources for a period. 140 | 141 | Args: 142 | period: a date object 143 | sources: list of sources 144 | 145 | """ 146 | 147 | period_sources = [] 148 | 149 | for source in sources: 150 | if period == source['created_at'].replace(day=1): 151 | period_sources.append(source) 152 | return period_sources 153 | 154 | def get_period_score(self, period_sources): 155 | """Return average score from list of sources. 156 | 157 | Args: 158 | period_sources: sources correspoding to a certain period 159 | """ 160 | 161 | score = 0 162 | 163 | if len(period_sources) > 0: 164 | total = 0 165 | for source in period_sources: 166 | total += int(source['score']) 167 | score = int(round(total / len(period_sources))) 168 | return score 169 | 170 | def get_period_valid(self, period_sources): 171 | """Return valid percentage from list of sources. 172 | 173 | Args: 174 | period_sources: sources correspoding to a certain period 175 | """ 176 | 177 | valid = 0 178 | if len(period_sources) > 0: 179 | valids = [] 180 | for source in period_sources: 181 | if int(source['score']) == 100: 182 | valids.append(source) 183 | if valids: 184 | valid = int(round(len(valids) / len(period_sources) * 100)) 185 | return valid 186 | 187 | def get_unique_periods(self, sources): 188 | """Return list of unique periods as date objects from sources. 189 | 190 | Args: 191 | sources: a list of sources 192 | 193 | """ 194 | 195 | periods = [] 196 | for source in sources: 197 | periods.append(source['created_at']) 198 | periods = list(set(periods)) 199 | return periods 200 | 201 | def get_all_periods(self, periods): 202 | """Return all periods from oldest in periods to now. 
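        Illustrative sketch (hypothetical input): for periods falling in
        2016-01 and 2016-03, the result is [date(2016, 1, 1), date(2016, 2, 1),
        ...] continuing month by month up to the current month.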
203 | 204 | Args: 205 | periods: list of date objects 206 | 207 | """ 208 | 209 | oldest_date = sorted(periods)[0] 210 | oldest_date = oldest_date.replace(day=1) 211 | current_date = datetime.date.today() 212 | delta = dateutil.relativedelta.relativedelta(months=1) 213 | relative_date = oldest_date 214 | all_periods = [] 215 | 216 | while relative_date <= current_date: 217 | all_periods.append(relative_date) 218 | relative_date += delta 219 | return all_periods 220 | -------------------------------------------------------------------------------- /data_quality/tasks/base_task.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | import datapackage 9 | 10 | 11 | class Task(object): 12 | 13 | """Base class for Data Quality CLI tasks.""" 14 | 15 | def __init__(self, config, **kwargs): 16 | self.config = config 17 | self.remotes = self.config['remotes'] 18 | self.branch = self.config['branch'] 19 | self.data_dir = self.config['data_dir'] 20 | self.result_file = os.path.join(self.data_dir, self.config['result_file']) 21 | self.run_file = os.path.join(self.data_dir, self.config['run_file']) 22 | self.source_file = os.path.join(self.data_dir, self.config['source_file']) 23 | self.performance_file = os.path.join(self.data_dir, 24 | self.config['performance_file']) 25 | self.publisher_file = os.path.join(self.data_dir, 26 | self.config['publisher_file']) 27 | self.cache_dir = self.config['cache_dir'] 28 | self.data_key = self.config['goodtables']['arguments']['batch']['data_key'] 29 | datapkg_file_path = self.config.get('datapackage_file', 'datapackage.json') 30 | if not os.path.isabs(datapkg_file_path): 31 | datapkg_file_path = os.path.join(os.path.dirname(self.data_dir), 32 | datapkg_file_path) 33 | try: 34 | self.datapackage = datapackage.DataPackage(datapkg_file_path) 35 | except datapackage.exceptions.DataPackageException as e: 36 | raise ValueError(('A datapackage couldn\'t be created because of the ' 37 | 'following error: "{0}". Make sure the file is not ' 38 | 'empty and use "dq init" command.').format(e)) 39 | self.all_scores = [] -------------------------------------------------------------------------------- /data_quality/tasks/check_datapackage.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | from jsontableschema.model import SchemaModel 9 | from goodtables import pipeline 10 | from data_quality import utilities 11 | from . 
import Task 12 | 13 | 14 | class DataPackageChecker(Task): 15 | 16 | """A task runner to check that the data package is correct""" 17 | 18 | def __init__(self, config, inflexible_resources=[]): 19 | super(DataPackageChecker, self).__init__(config) 20 | self.inflexible_resources = ['run_file', 'result_file', 'performance_file'] 21 | self.inflexible_resources.extend(inflexible_resources) 22 | self.inflexible_resources = set(inflexible_resources) 23 | 24 | def run(self): 25 | """Check user datapackage against default datapackage""" 26 | 27 | default_datapkg = utilities.get_default_datapackage() 28 | for default_resource in default_datapkg.resources: 29 | resource_path = os.path.join(self.config['data_dir'], 30 | self.config[default_resource.descriptor['name']]) 31 | resource = utilities.get_datapackage_resource(resource_path, 32 | self.datapackage) 33 | self.check_resource_schema(default_resource, resource) 34 | 35 | def check_resource_schema(self, default_resource, resource): 36 | """Check that user resource schema contains all the mandatory fields""" 37 | 38 | def get_uncustomizable_fields(schema): 39 | uncustomizable = ['constraints', 'format', 'name', 'type'] 40 | field_filter = lambda field: {key: val for key, val in field.items() 41 | if key in uncustomizable} 42 | fields = [field_filter(field) for field in schema.fields] 43 | fields = sorted(fields, key=lambda k: k['name']) 44 | 45 | resource_schema = SchemaModel(resource.descriptor['schema']) 46 | default_schema_dict = default_resource.descriptor['schema'] 47 | if default_resource.descriptor['name'] == 'source_file': 48 | for field in default_schema_dict['fields']: 49 | if field['name'] == 'data': 50 | field['name'] = self.data_key 51 | default_schema = SchemaModel(default_schema_dict) 52 | 53 | if default_resource.descriptor['name'] in self.inflexible_resources: 54 | if get_uncustomizable_fields(default_schema) != \ 55 | get_uncustomizable_fields(resource_schema): 56 | msg = ('The fields for "{0}" are not subject to' 57 | 'change').format(resource.local_data_path) 58 | raise ValueError(msg, resource.local_data_path) 59 | else: 60 | required_headers = set(default_schema.required_headers) 61 | resource_headers = set(resource_schema.headers) 62 | if not required_headers.issubset(resource_headers): 63 | missing_headers = required_headers.difference(resource_headers) 64 | msg = ('Fields [{0}] are needed for internal processing' 65 | 'but are missing from {1}.' 66 | ).format(','.join(missing_headers), resource.local_data_path) 67 | raise ValueError(msg, resource.local_data_path) 68 | 69 | def check_database_content(self): 70 | """Check that the database content is compliant with the datapackage""" 71 | 72 | self.run() 73 | for resource in self.datapackage.resources: 74 | resource_path = resource.local_data_path 75 | if os.path.exists(resource_path): 76 | options = {'schema': {'schema': resource.descriptor['schema']}} 77 | pipe = pipeline.Pipeline(resource_path, processors=['schema'], 78 | options=options) 79 | result, report = pipe.run() 80 | if result is False: 81 | issues = [res['result_message'] for res in report.generate()['results']] 82 | msg = ('The file {0} is not compliant with the schema ' 83 | 'you declared for it in "datapackage.json".' 
84 | 'Errors: {1}' 85 | ).format(resource_path, ';'.join(issues)) 86 | raise ValueError(msg) 87 | 88 | def check_database_completeness(self, required_resources=None): 89 | """Checks that 'required_resources', or all necessary ones exist in the database 90 | 91 | Args: 92 | required_resources: list of paths to required resources 93 | """ 94 | 95 | all_resources = [res.local_data_path for res in self.datapackage.resources] 96 | resources = required_resources or all_resources 97 | for resource_file in resources: 98 | if not os.path.exists(resource_file): 99 | msg = ('The file "{0}" is needed but it doesn\'t exist.' 100 | 'Please create it or use "dq generate".' 101 | ).format(resource_file) 102 | raise ValueError(msg) 103 | -------------------------------------------------------------------------------- /data_quality/tasks/deploy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | import io 9 | import subprocess 10 | import contextlib 11 | from time import strftime, gmtime 12 | import json 13 | from data_quality import compat 14 | from .base_task import Task 15 | from .check_datapackage import DataPackageChecker 16 | 17 | @contextlib.contextmanager 18 | def cd(path): 19 | """Move into a dir while the context is active.""" 20 | workpath = os.getcwd() 21 | os.chdir(path) 22 | yield 23 | os.chdir(workpath) 24 | 25 | 26 | class Deployer(Task): 27 | 28 | """A Task runner to deploy a Data Quality repository to a remote.""" 29 | 30 | commit_msg = 'New result and run data.' 31 | tag_msg = 'New result and run data.' 32 | tag_version = '' 33 | 34 | def run(self, simulate=False, *args): 35 | """Commit and deploy changes.""" 36 | 37 | datapackage_check = DataPackageChecker(self.config) 38 | datapackage_check.run() 39 | self._pull() 40 | self.update_last_modified() 41 | datapackage_check.check_database_completeness() 42 | datapackage_check.check_database_content() 43 | self._add() 44 | self._commit() 45 | if simulate: 46 | return True 47 | # self._tag() 48 | self._push() 49 | 50 | def _pull(self): 51 | """Pull in any changes from remotes.""" 52 | 53 | with cd(self.config['data_dir']): 54 | 55 | for remote in self.remotes: 56 | # fetch 57 | command = ['git', 'fetch', remote, self.branch] 58 | subprocess.call(command) 59 | # merge; prefer ours 60 | command = ['git', 'merge', '-s', 'recursive', '-X', 'ours', 61 | '{0}/{1}'.format(remote, self.branch)] 62 | subprocess.call(command) 63 | 64 | def _add(self): 65 | """Add the changed files to the git index.""" 66 | 67 | with cd(self.config['data_dir']): 68 | 69 | # add the changed files 70 | command = ['git', 'add', self.result_file] 71 | subprocess.call(command) 72 | command = ['git', 'add', self.run_file] 73 | subprocess.call(command) 74 | 75 | def _commit(self): 76 | 77 | with cd(self.config['data_dir']): 78 | command = ['git', 'commit', '-a', '-m', '{0}'.format(self.commit_msg)] 79 | subprocess.call(command) 80 | 81 | def _tag(self): 82 | with cd(self.config['data_dir']): 83 | command = ['git', 'tag', '-a', self.tag_version, '-m', '{0}'.format(self.tag_msg)] 84 | subprocess.call(command) 85 | 86 | def _push(self): 87 | 88 | with cd(self.config['data_dir']): 89 | command = ['git', 'push', '--follow-tags'] 90 | subprocess.call(command) 91 | 92 | def update_last_modified(self): 93 | """Update the 'last_modified' field 
in datapackage.json""" 94 | 95 | datapackage_path = os.path.join(self.datapackage.base_path, 96 | 'datapackage.json') 97 | 98 | with io.open(datapackage_path, mode='w+', encoding='utf-8') as datapkg_file: 99 | current_time = strftime("%Y-%m-%d %H:%M:%S %Z", gmtime()) 100 | self.datapackage.descriptor['last_modified'] = current_time 101 | updated_datapkg = json.dumps(self.datapackage.to_dict(), indent=4, 102 | sort_keys=True) 103 | datapkg_file.write(compat.str(updated_datapkg)) 104 | -------------------------------------------------------------------------------- /data_quality/tasks/extract_relevance_period.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import re 8 | import datetime 9 | from dateparser.date import DateDataParser 10 | from jsontableschema.model import SchemaModel 11 | from data_quality import utilities, compat, exceptions 12 | from .base_task import Task 13 | from .check_datapackage import DataPackageChecker 14 | 15 | class RelevancePeriodExtractor(Task): 16 | 17 | """A Task runner that extracts the period a sources's content reffers to 18 | (is relevant for). 19 | """ 20 | 21 | def __init__(self, config): 22 | super(RelevancePeriodExtractor, self).__init__(config) 23 | timeliness_params = self.config['timeliness'] 24 | self.extract_period = timeliness_params.get('extract_period', False) 25 | self.timeliness_strategy = timeliness_params.get('timeliness_strategy', []) 26 | self.date_order = timeliness_params.get('date_order', 'DMY') 27 | self.max_empty_relevance_period = timeliness_params.get('max_empty_relevance_period', 10) 28 | if not self.timeliness_strategy: 29 | raise ValueError('You need to provide values for "timeliness_strategy."') 30 | datapackage_check = DataPackageChecker(self.config) 31 | datapackage_check.check_database_completeness([self.source_file]) 32 | settings = {'RETURN_AS_TIMEZONE_AWARE': False, 33 | 'PREFER_DAY_OF_MONTH': 'last', 34 | 'PREFER_DATES_FROM': 'past', 35 | 'SKIP_TOKENS': ['to'], 36 | 'DATE_ORDER': self.date_order} 37 | self.date_parser = DateDataParser(allow_redetect_language=True, 38 | settings=settings) 39 | 40 | def run(self): 41 | """Try to indentify the relevance period of sources""" 42 | 43 | sources = self.extract_period_from_sources() 44 | empty_period_sources = [source for source in sources 45 | if source['period_id'] is None] 46 | empty_period_percent = (len(empty_period_sources) * 100) / len(sources) 47 | empty_period_percent = round(empty_period_percent) 48 | if empty_period_percent > int(self.max_empty_relevance_period): 49 | msg = ('The relevance period couldn\'t be identified for' 50 | ' {0}% of sources therefore timeliness cannot be' 51 | ' assessed. 
Please provide more fields for "timeliness_' 52 | 'strategy", set "assess_timeliness" to false or increase' 53 | ' "max_empty_relevance_period".').format(empty_period_percent) 54 | raise exceptions.UnableToAssessTimeliness(msg) 55 | 56 | for source in sources: 57 | if source['period_id'] is None: 58 | creation_date = utilities.date_from_string(source['created_at']) 59 | dates = [creation_date, creation_date] 60 | else: 61 | period_start, period_end = source['period_id'] 62 | dates = [period_start.date(), period_end.date()] 63 | dates = [date.strftime('%d-%m-%Y') if isinstance(date, datetime.date) 64 | else '' for date in dates] 65 | source['period_id'] = '/'.join(dates) 66 | self.update_sources_period(sources) 67 | 68 | def extract_period_from_sources(self): 69 | """Try to extract relevance period for each source or return None""" 70 | 71 | sources = [] 72 | with compat.UnicodeDictReader(self.source_file) as source_file: 73 | timeliness_set = set(self.timeliness_strategy) 74 | found_fields = timeliness_set.intersection(set(source_file.header)) 75 | if not found_fields: 76 | raise ValueError(('At least one of the "timeliness_strategy" ' 77 | 'fields must be present in your "source_file".')) 78 | if not found_fields.issuperset(timeliness_set): 79 | missing_fields = timeliness_set.difference(found_fields) 80 | print(('Fields "{0}" from "timeliness_strategy" were not found ' 81 | 'in your `source_file`').format(missing_fields)) 82 | 83 | for source in source_file: 84 | timeliness_fields = {field: val for field, val in source.items() 85 | if field in self.timeliness_strategy} 86 | extracted_period = self.identify_period(timeliness_fields) 87 | source['period_id'] = extracted_period 88 | sources.append(source) 89 | return sources 90 | 91 | def identify_period(self, source={}): 92 | """Try to indentify the period of a source based on timeliess strategy 93 | 94 | Args: 95 | source: a dict corresponding to a source_file row 96 | """ 97 | 98 | field_dates = {} 99 | for field in self.timeliness_strategy: 100 | value = source.get(field, '') 101 | if not value: 102 | continue 103 | field_dates[field] = self.extract_dates(value) 104 | 105 | for field in self.timeliness_strategy: 106 | dates = field_dates.get(field, []) 107 | if not dates: 108 | continue 109 | period = resolve_period(dates) 110 | if period: 111 | break 112 | else: 113 | # It means we have more than 2 dates 114 | other_fields = list(self.timeliness_strategy) 115 | other_fields.remove(field) 116 | other_values = [field_dates.get(other_field, []) 117 | for other_field in other_fields] 118 | for values in other_values: 119 | date_objects = set(date['date_obj'] for date in dates) 120 | common_values = [date for date in values 121 | if date['date_obj'] in date_objects] 122 | period = resolve_period(common_values) 123 | if period: 124 | break 125 | else: 126 | period = None 127 | return period 128 | 129 | def extract_dates(self, line=""): 130 | """Try to extract dates from a line 131 | 132 | Args: 133 | line: a string that could contain a date or time range 134 | """ 135 | 136 | dates = [] 137 | potential_dates = re.findall(r'[0-9]+[\W_][0-9]+[\W_][0-9]+', line) 138 | line_words = re.sub(r'[\W_]+', ' ', line).split() 139 | years = filter_years(line_words) 140 | for word in years: 141 | if re.search(r'[a-zA-Z]', word): 142 | potential_dates.append(word) 143 | break 144 | for index, entry in enumerate(line_words): 145 | if entry == word: 146 | date = self.scan_for_date(line_words, index) 147 | if date: 148 | potential_dates.append(date) 149 | # 
Try to find a range 150 | if date['period'] != 'year' and date['date_obj']: 151 | range_start = self.scan_for_range(line_words, index, date) 152 | if not range_start: 153 | continue 154 | if range_start['date_obj'] < date['date_obj']: 155 | potential_dates.append(range_start) 156 | 157 | for potential_date in potential_dates: 158 | try: 159 | dates.append(self.date_parser.get_date_data(potential_date)) 160 | except TypeError: 161 | if isinstance(potential_date, dict): 162 | dates.append(potential_date) 163 | except ValueError: 164 | potential_date = None 165 | dates = [date for date in dates if date['date_obj'] is not None] 166 | dates = list({date['date_obj']:date for date in dates}.values()) 167 | return dates 168 | 169 | def scan_for_date(self, line_words, year_index): 170 | """Scan around the year for a date as complete as possible 171 | 172 | Args: 173 | line_words: a list of words (strings) 174 | year_index: index of a string from line_word that contains a year 175 | """ 176 | 177 | date_parts = line_words[year_index-2:year_index+1] or \ 178 | line_words[:year_index+1] 179 | potential_date = self.create_date_from_parts(date_parts) 180 | if not potential_date or potential_date['period'] == 'year': 181 | new_parts = list(reversed(line_words[year_index:year_index+3])) 182 | new_potential_date = self.create_date_from_parts(new_parts) 183 | if new_potential_date: 184 | potential_date = new_potential_date 185 | return potential_date 186 | 187 | def scan_for_range(self, line_words, year_index, range_end): 188 | """Scan to the left of the year whose corresponding date has 189 | been extracted to see if there is a range. 190 | 191 | Args: 192 | line_words: a list of words (strings) 193 | year_index: index of a string from line_word that contains a year 194 | range_end: date that has already been extracted from the year at 195 | year_index, potentially end of range 196 | """ 197 | 198 | if range_end['period'] == 'month': 199 | scan_start = year_index-2 200 | scan_end = year_index-4 201 | else: 202 | scan_start = year_index-3 203 | scan_end = year_index-5 204 | range_start_parts = line_words[scan_end:scan_start+1] or \ 205 | line_words[:scan_start+1] 206 | range_start_parts = [part for part in range_start_parts 207 | if self.create_date_from_parts([part]) is not None] 208 | years = filter_years(range_start_parts) 209 | if years: 210 | range_start_parts = [] 211 | if range_start_parts: 212 | if len(range_start_parts) == 1 and range_end['period'] == 'day': 213 | range_start_parts.append(compat.str(range_end['date_obj'].month)) 214 | range_start_parts.append(compat.str(range_end['date_obj'].year)) 215 | range_start = self.create_date_from_parts(range_start_parts) 216 | if range_start and range_start['period'] != range_end['period']: 217 | range_start = None 218 | return range_start 219 | 220 | def create_date_from_parts(self, date_parts=None): 221 | """Try to create a date object with date_parser or return None.""" 222 | 223 | if not date_parts: 224 | return None 225 | for index, part in enumerate(date_parts): 226 | if len(date_parts) == 2: 227 | if False not in [el.isdigit() for el in date_parts]: 228 | date_parts.insert(index, '31') 229 | potential_date = ' '.join(date_parts[index:]) 230 | try: 231 | date = self.date_parser.get_date_data(potential_date) 232 | except (ValueError, TypeError): 233 | date = None 234 | if date and date.get('date_obj') is not None: 235 | break 236 | else: 237 | date = None 238 | return date 239 | 240 | def update_sources_period(self, new_sources): 241 | """Overwrite 
source_file with the identified period_id""" 242 | 243 | source_resource = utilities.get_datapackage_resource(self.source_file, 244 | self.datapackage) 245 | source_idx = self.datapackage.resources.index(source_resource) 246 | source_schema_dict = self.datapackage.resources[source_idx].descriptor['schema'] 247 | updates = {'fields':[{'name': 'period_id', 'type': 'string', 248 | 'title': 'The period source data is relevant for.'}]} 249 | utilities.deep_update_dict(source_schema_dict, updates) 250 | source_schema = SchemaModel(source_schema_dict) 251 | 252 | with compat.UnicodeWriter(self.source_file) as source_file: 253 | source_file.writerow(source_schema.headers) 254 | for row in utilities.dicts_to_schema_rows(new_sources, 255 | source_schema): 256 | source_file.writerow(row) 257 | 258 | def resolve_period(dates=None): 259 | """Given a list of dates, try to create a period tuple or return None""" 260 | 261 | if not dates: 262 | period = None 263 | elif len(dates) == 1: 264 | period = period_from_date(dates[0]) 265 | elif len(dates) == 2: 266 | date_objects = sorted([date['date_obj'] for date in dates]) 267 | if dates[0]['period'] == 'year': 268 | date_objects[0] = date_objects[0].replace(month=1, day=1) 269 | if dates[1]['period'] == 'year': 270 | date_objects[1] = date_objects[1].replace(month=12, day=31) 271 | if dates[0]['period'] == 'month': 272 | date_objects[0] = date_objects[0].replace(day=1) 273 | period = (date_objects[0], date_objects[1]) 274 | else: 275 | period = None 276 | return period 277 | 278 | def period_from_date(date={}): 279 | """Create a period from a `dateparser` date dict""" 280 | 281 | if date.get('date_obj', None) is None: 282 | return None 283 | if date['period'] == 'day': 284 | range_start = date['date_obj'] 285 | range_end = date['date_obj'].replace(hour=23, minute=59) 286 | elif date['period'] == 'month': 287 | range_start = date['date_obj'].replace(day=1) 288 | range_end = date['date_obj'] 289 | else: 290 | range_start = datetime.datetime(date['date_obj'].year, 1, 1) 291 | range_end = datetime.datetime(date['date_obj'].year, 12, 31) 292 | return (range_start, range_end) 293 | 294 | def filter_years(words_list): 295 | """Filter strings that could contain a year from a list of words""" 296 | 297 | condition = lambda x: re.search(r'(?:19|20)[0-9]{2}', x) 298 | filtered_list = [word for word in filter(condition, words_list)] 299 | return filtered_list 300 | -------------------------------------------------------------------------------- /data_quality/tasks/generate.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | import io 9 | import json 10 | import importlib 11 | from data_quality import generators, utilities, compat 12 | from .base_task import Task 13 | from .check_datapackage import DataPackageChecker 14 | 15 | 16 | class GeneratorManager(Task): 17 | 18 | """A Task runner that manages dataset generators (ex: CkanGenerator).""" 19 | 20 | def __init__(self, config): 21 | super(GeneratorManager, self).__init__(config) 22 | datapackage_check = DataPackageChecker(self.config) 23 | datapackage_check.run() 24 | 25 | def run(self, generator_name, endpoint, generator_path, file_types, simulate=False): 26 | """Delegate the generation processes to the chosen generator 27 | Args: 28 | generator_name: Name of the generator 
(ex: ckan) 29 | endpoint: Url where the generator should get the data from 30 | generator_path: Path to the custom generator class, if used 31 | file_types: List of file types that should be included in sources 32 | """ 33 | 34 | if generators._built_in_generators.get(generator_name, None): 35 | inflexible_resources = ['source_file', 'publisher_file'] 36 | datapackage_check = DataPackageChecker(self.config, inflexible_resources) 37 | try: 38 | datapackage_check.run() 39 | except ValueError as e: 40 | msg = ('Looks like you have a custom schema for "{0}". Generator ' 41 | '"{1}" only works with the default schema. Please use a ' 42 | 'custom generator or match your schema to the default one.' 43 | ).format(e[1], generator_name) 44 | raise ValueError(msg) 45 | 46 | generator_class = generators._built_in_generators[generator_name] 47 | else: 48 | try: 49 | _module, _class = generator_path.rsplit('.', 1) 50 | generator_class = getattr(importlib.import_module(_module), _class) 51 | except ValueError: 52 | raise ValueError(('The path you provided for the generator class is ' 53 | 'not valid. Should be of type `mymodule.MyGenerator`')) 54 | generator = generator_class(endpoint, self.datapackage) 55 | 56 | if simulate: 57 | return generator 58 | 59 | generator.generate_sources(self.source_file, file_types=file_types) 60 | generator.generate_publishers(self.publisher_file) 61 | 62 | def update_datapackage_sources(self): 63 | """Update the 'sources' property of datapackage with the new sources""" 64 | 65 | datapackage_check = DataPackageChecker(self.config) 66 | required_resources = [self.source_file, self.publisher_file] 67 | datapackage_check.check_database_completeness(required_resources) 68 | datapackage_check.run() 69 | self.datapackage.descriptor['sources'] = [] 70 | datapkg_path = os.path.join(self.datapackage.base_path, 'datapackage.json') 71 | 72 | with compat.UnicodeDictReader(self.source_file) as sources_file: 73 | for source in sources_file: 74 | src_info = {'name': source['title'], 'web': source[self.data_key]} 75 | self.datapackage.descriptor['sources'].append(src_info) 76 | 77 | with io.open(datapkg_path, mode='w+', encoding='utf-8') as datapkg_file: 78 | new_datapkg = json.dumps(self.datapackage.to_dict(), indent=4, 79 | sort_keys=True) 80 | datapkg_file.write(compat.str(new_datapkg)) 81 | -------------------------------------------------------------------------------- /data_quality/tasks/initialize_datapackage.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | import io 9 | import json 10 | import datapackage 11 | from data_quality import utilities, compat 12 | from .check_datapackage import DataPackageChecker 13 | 14 | 15 | class DataPackageInitializer(object): 16 | 17 | """A task runner that makes a data-quality style data package from a 18 | given workspace folder 19 | """ 20 | 21 | def __init__(self, workspace_path): 22 | self.workspace_path = workspace_path 23 | 24 | def run(self): 25 | """Initialize all necessary files and folders""" 26 | 27 | config = self.initialize_config() 28 | utilities.resolve_dir(config['data_dir']) 29 | utilities.resolve_dir(config['cache_dir']) 30 | self.initialize_datapackage(config) 31 | 32 | def initialize_config(self): 33 | """Create a config for this instance or use the existing one""" 34 | 35 | 
init_config_path = os.path.join(self.workspace_path, 'dq_config.json') 36 | 37 | if os.path.exists(init_config_path): 38 | config = utilities.load_json_config(init_config_path) 39 | else: 40 | config = utilities.load_json_config(None) 41 | 42 | with io.open(init_config_path, mode='w+', encoding='utf-8') as new_config: 43 | new_json_config = json.dumps(config, indent=4, sort_keys=True) 44 | new_config.write(compat.str(new_json_config)) 45 | print(('A new config file has been created at {0}. ' 46 | 'Please review and update it.'.format(init_config_path))) 47 | return config 48 | 49 | def initialize_datapackage(self, config): 50 | """Create a datapackage or return the existing one along with it's path""" 51 | 52 | datapkg_file_path = config.get('datapackage_file', '') 53 | if not datapkg_file_path or not os.path.isabs(datapkg_file_path): 54 | datapkg_file_path = os.path.join(self.workspace_path, 'datapackage.json') 55 | 56 | datapkg_file_path = os.path.abspath(datapkg_file_path) 57 | if not os.path.exists(datapkg_file_path): 58 | with io.open(datapkg_file_path, mode='w+', encoding='utf-8') as new_datapkg: 59 | default_datapkg = utilities.get_default_datapackage() 60 | for resource in default_datapkg.resources: 61 | resource_path = config.get(resource.descriptor['name'], 62 | resource.descriptor['path']) 63 | resource.descriptor['path'] = os.path.join(config['data_dir'], 64 | resource_path) 65 | json_datapkg = json.dumps(default_datapkg.to_dict(), indent=4) 66 | new_datapkg.write(compat.str(json_datapkg)) 67 | print(('A new "datapackage.json" file has been created at {0}. ' 68 | 'Please review and update it.'.format(datapkg_file_path))) 69 | return default_datapkg 70 | else: 71 | datapackage_check = DataPackageChecker(config) 72 | datapackage_check.run() 73 | return datapackage.DataPackage(datapkg_file_path) 74 | 75 | -------------------------------------------------------------------------------- /data_quality/utilities.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import os 9 | import json 10 | import shutil 11 | import dateutil 12 | import requests 13 | import collections 14 | import datapackage 15 | import jsontableschema 16 | import pkg_resources 17 | 18 | def set_up_cache_dir(cache_dir_path): 19 | """Reset /cache_dir before a new batch.""" 20 | 21 | if os.path.lexists(cache_dir_path): 22 | for root, dirs, files in os.walk(cache_dir_path): 23 | for contained_file in files: 24 | os.unlink(os.path.join(root, contained_file)) 25 | 26 | for directory in dirs: 27 | shutil.rmtree(os.path.join(root, directory)) 28 | 29 | def resolve_dir(dir_path): 30 | """ Make sure the dir_path given in the config exists 31 | 32 | Args: 33 | dir_path: path of directory from config that should be resolved 34 | """ 35 | 36 | try: 37 | os.makedirs(dir_path) 38 | except OSError: 39 | if not os.path.isdir(dir_path): 40 | raise 41 | return dir_path 42 | 43 | def resolve_dir_name(config_filepath, dir_path): 44 | """Create an absolute path from the file path and the path given in the config""" 45 | 46 | if not os.path.isabs(dir_path): 47 | config_path = os.path.abspath(os.path.dirname(config_filepath)) 48 | return os.path.join(config_path, dir_path) 49 | else: 50 | return dir_path 51 | 52 | def load_json_config(config_filepath): 53 | """Loads the json config into a 
dictionary, overwriting the defaults""" 54 | 55 | default_config = pkg_resources.resource_string('data_quality', 'dq.default.json') 56 | default_config = json.loads(default_config.decode('utf-8')) 57 | 58 | if not config_filepath: 59 | return default_config 60 | with io.open(config_filepath, mode='rt', encoding='utf-8') as config_file: 61 | user_config = json.loads(config_file.read()) 62 | config = deep_update_dict(default_config, user_config) 63 | config['data_dir'] = resolve_dir_name(config_filepath, config['data_dir']) 64 | config['cache_dir'] = resolve_dir_name(config_filepath, config['cache_dir']) 65 | return config 66 | 67 | def get_data_quality_spec(): 68 | """Downloads and loads the data quality spec json""" 69 | 70 | config = load_json_config(None) 71 | dq_spec_url = config['data_quality_spec']['data_quality_spec_web'] 72 | json_dq_spec = requests.get(dq_spec_url) 73 | return json_dq_spec.json() 74 | 75 | def get_default_datapackage(): 76 | """Return the default datapackage""" 77 | 78 | default_datapkg = pkg_resources.resource_string('data_quality', 79 | 'datapackage.default.json') 80 | datapkg = datapackage.DataPackage(json.loads(default_datapkg.decode('utf-8'))) 81 | return datapkg 82 | 83 | def get_datapackage_resource(resource_path, datapkg): 84 | """Return the resource correspondent to `resource_path` from datapackage or raise""" 85 | 86 | matching_resources = [res for res in datapkg.resources 87 | if res.local_data_path == resource_path] 88 | if len(matching_resources) > 1: 89 | raise ValueError(('The resource with path "{0}" appears multiple times ' 90 | 'in your datapackage.').format(resource_path)) 91 | elif not matching_resources: 92 | raise ValueError(('The resource with path "{0}" can\'t be found in ' 93 | 'your datapackage. Please include it or ' 94 | 'use the "dq init" command.').format(resource_path)) 95 | else: 96 | return matching_resources[0] 97 | 98 | def deep_update_dict(source_dict, new_dict): 99 | """Update a nested dictionary (modified in place) with another dictionary. 
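    A small worked example (hedged; values are hypothetical):

        deep_update_dict({'a': {'x': 1}, 'b': [1]}, {'a': {'y': 2}, 'b': [2]})
        # -> {'a': {'x': 1, 'y': 2}, 'b': [1, 2]}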
100 | 101 | Args: 102 | source_dict: dict to be updated 103 | new_dict: dict to update with 104 | 105 | """ 106 | 107 | for key, value in new_dict.items(): 108 | if isinstance(value, collections.Mapping) and value: 109 | returned = deep_update_dict(source_dict.get(key, {}), value) 110 | source_dict[key] = returned 111 | elif isinstance(value, list): 112 | source_dict[key] = (source_dict.get(key, []) + value) 113 | else: 114 | source_dict[key] = new_dict[key] 115 | return source_dict 116 | 117 | def date_from_string(date_string): 118 | """Return a date object from a string or None 119 | 120 | Args: 121 | date_string: a string that should contain a date 122 | """ 123 | 124 | if not date_string: 125 | date = None 126 | else: 127 | try: 128 | date = dateutil.parser.parse(date_string).date() 129 | except ValueError: 130 | date = None 131 | return date 132 | 133 | def dicts_to_schema_rows(rows, schema): 134 | """Convert a list of dicts in a generator for schema compliant rows""" 135 | 136 | for row in rows: 137 | try: 138 | values = [row[key] for key in schema.headers] 139 | converted_row = list(schema.convert_row(*values)) 140 | yield converted_row 141 | except jsontableschema.exceptions.MultipleInvalid as e: 142 | for error in e.errors: 143 | raise error 144 | -------------------------------------------------------------------------------- /dq-config.example.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_dir": "/PATH/TO/DATA/DIRECTORY", 3 | "cache_dir": "/PATH/TO/CACHE/DIRECTORY", 4 | "result_file": "results.csv", 5 | "run_file": "runs.csv", 6 | "source_file": "sources.csv", 7 | "publisher_file": "publishers.csv", 8 | "remotes": ["origin"], 9 | "branch": "master", 10 | "goodtables_web": "http://goodtables.okfnlabs.org" 11 | } 12 | -------------------------------------------------------------------------------- /pylintrc: -------------------------------------------------------------------------------- 1 | [BASIC] 2 | 3 | # List of builtins function names that should not be used, separated by a comma. 4 | bad-functions=map,filter,input,open 5 | 6 | [FORMAT] 7 | 8 | # Maximum number of characters on a single line. 9 | max-line-length=79 10 | 11 | [MESSAGES CONTROL] 12 | 13 | # Allow modules to be without docstrings. 
14 | disable=C0111 15 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | import os 7 | import io 8 | from setuptools import setup, find_packages 9 | 10 | 11 | def read(*paths): 12 | """Read a text file.""" 13 | basedir = os.path.dirname(__file__) 14 | fullpath = os.path.join(basedir, *paths) 15 | contents = io.open(fullpath, encoding='utf-8').read().strip() 16 | return contents 17 | 18 | 19 | PACKAGE = 'data_quality' 20 | INSTALL_REQUIRES = ['click>=6.2,<=7.0.0a', 'goodtables==0.7.6', 'pytz==2017.2', 'datapackage==0.8.1', 21 | 'jsontableschema==0.6.5', 'dateparser==0.4.0', 'tabulator==0.5.0'] 22 | TESTS_REQUIRE = ['tox'] 23 | README = read('README.md') 24 | VERSION = read(PACKAGE, 'VERSION') 25 | PACKAGES = find_packages(exclude=['examples', 'tests']) 26 | 27 | setup( 28 | name=PACKAGE, 29 | version=VERSION, 30 | packages=PACKAGES, 31 | include_package_data=True, 32 | install_requires=INSTALL_REQUIRES, 33 | tests_require=TESTS_REQUIRE, 34 | extras_require = {'develop': TESTS_REQUIRE + ['pylint']}, 35 | test_suite='tox', 36 | zip_safe=False, 37 | long_description=README, 38 | description='A CLI that builds a data quality assessment, for use in a Data Quality Dashboard.', 39 | author='Open Knowledge Foundation', 40 | author_email='info@okfn.org', 41 | url='https://github.com/okfn/data-quality-cli', 42 | license='MIT', 43 | keywords=['frictionless data', 'data quality'], 44 | package_data={ 45 | 'data_quality': ['datapackage.default.json', 'dq.default.json'], 46 | }, 47 | classifiers=[ 48 | 'Development Status :: 4 - Beta', 49 | 'Environment :: Web Environment', 50 | 'Intended Audience :: Developers', 51 | 'License :: OSI Approved :: MIT License', 52 | 'Operating System :: OS Independent', 53 | 'Programming Language :: Python :: 2', 54 | 'Programming Language :: Python :: 2.7', 55 | 'Programming Language :: Python :: 3', 56 | 'Programming Language :: Python :: 3.3', 57 | 'Programming Language :: Python :: 3.4', 58 | 'Programming Language :: Python :: 3.5', 59 | 'Topic :: Internet :: WWW/HTTP :: Dynamic Content', 60 | 'Topic :: Software Development :: Libraries :: Python Modules' 61 | ], 62 | entry_points={ 63 | 'console_scripts': [ 64 | 'dq = data_quality.main:cli', 65 | 'dataquality = data_quality.main:cli' 66 | ] 67 | }, 68 | ) 69 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/data-quality-cli/e9abc93b896ea59269d11cdc8f2d301f81be20ad/tests/__init__.py -------------------------------------------------------------------------------- /tests/fixtures/datapackage.json: -------------------------------------------------------------------------------- 1 | { 2 | "admin": "", 3 | "context": "", 4 | "last_modified": "", 5 | "name": "", 6 | "pitch": "", 7 | "resources": [ 8 | { 9 | "name": "publisher_file", 10 | "path": "publishers.csv", 11 | "schema": { 12 | "fields": [ 13 | { 14 | "constraints": { 15 | "required": true, 16 | "unique": true 17 | }, 18 | "name": "id", 19 | "title": "ID of the publisher", 20 | "type": "string" 21 | }, 22 | { 23 | "constraints": { 24 | "required": true, 25 | "unique": true 26 | }, 27 | "name": "title", 28 | 
"title": "Title or official name of the publisher", 29 | "type": "string" 30 | } 31 | ], 32 | "primaryKey": "id" 33 | } 34 | }, 35 | { 36 | "name": "source_file", 37 | "path": "sources.csv", 38 | "schema": { 39 | "fields": [ 40 | { 41 | "constraints": { 42 | "required": true, 43 | "unique": true 44 | }, 45 | "name": "id", 46 | "title": "ID of the source", 47 | "type": "string" 48 | }, 49 | { 50 | "constraints": { 51 | "required": true, 52 | "unique": true 53 | }, 54 | "name": "publisher_id", 55 | "title": "ID of the source's publisher", 56 | "type": "string" 57 | }, 58 | { 59 | "constraints": { 60 | "required": true 61 | }, 62 | "name": "title", 63 | "title": "Title of the source", 64 | "type": "string" 65 | }, 66 | { 67 | "constraints": { 68 | "required": true 69 | }, 70 | "name": "data", 71 | "title": "Path/url to source", 72 | "type": "string" 73 | }, 74 | { 75 | "name": "format", 76 | "title": "File format of the source", 77 | "type": "string" 78 | }, 79 | { 80 | "constraints": { 81 | "required": true 82 | }, 83 | "name": "created_at", 84 | "title": "Time of the source's creation.", 85 | "type": "string" 86 | } 87 | ], 88 | "foreignKeys": [ 89 | { 90 | "fields": "publisher_id", 91 | "reference": { 92 | "fields": "id", 93 | "resource": "publisher_file" 94 | } 95 | } 96 | ], 97 | "primaryKey": "id" 98 | } 99 | }, 100 | { 101 | "name": "run_file", 102 | "path": "runs.csv", 103 | "schema": { 104 | "fields": [ 105 | { 106 | "constraints": { 107 | "required": true, 108 | "unique": true 109 | }, 110 | "name": "id", 111 | "title": "ID of the run", 112 | "type": "string" 113 | }, 114 | { 115 | "constraints": { 116 | "required": true 117 | }, 118 | "format": "datetime", 119 | "name": "timestamp", 120 | "title": "Timestamp of the run execution", 121 | "type": "date" 122 | }, 123 | { 124 | "constraints": { 125 | "required": true 126 | }, 127 | "name": "total_score", 128 | "title": "Rounded average score of results in this run", 129 | "type": "integer" 130 | } 131 | ], 132 | "primaryKey": "id" 133 | } 134 | }, 135 | { 136 | "name": "result_file", 137 | "path": "results.csv", 138 | "schema": { 139 | "fields": [ 140 | { 141 | "constraints": { 142 | "required": true, 143 | "unique": true 144 | }, 145 | "name": "id", 146 | "title": "ID of the result", 147 | "type": "string" 148 | }, 149 | { 150 | "constraints": { 151 | "required": true, 152 | "unique": true 153 | }, 154 | "name": "source_id", 155 | "title": "ID of the correspoding source", 156 | "type": "string" 157 | }, 158 | { 159 | "constraints": { 160 | "required": true 161 | }, 162 | "name": "publisher_id", 163 | "title": "ID of the source's publisher", 164 | "type": "string" 165 | }, 166 | { 167 | "constraints": { 168 | "required": true 169 | }, 170 | "format": "date", 171 | "name": "created_at", 172 | "title": "Time of the source's creation.", 173 | "type": "date" 174 | }, 175 | { 176 | "constraints": { 177 | "required": true 178 | }, 179 | "name": "data", 180 | "title": "Path/url to source", 181 | "type": "string" 182 | }, 183 | { 184 | "name": "schema", 185 | "title": "Path/url to the source's schema", 186 | "type": "string" 187 | }, 188 | { 189 | "contrains": { 190 | "required": true 191 | }, 192 | "name": "score", 193 | "title": "Score of correctness given by GoodTables", 194 | "type": "integer" 195 | }, 196 | { 197 | "name": "summary", 198 | "title": "Summary", 199 | "type": "string" 200 | }, 201 | { 202 | "constraints": { 203 | "required": true, 204 | "unique": true 205 | }, 206 | "name": "run_id", 207 | "title": "ID of the run in which 
the result was calculated", 208 | "type": "string" 209 | }, 210 | { 211 | "constraints": { 212 | "required": true 213 | }, 214 | "format": "datetime", 215 | "name": "timestamp", 216 | "title": "Timestamp of the run execution", 217 | "type": "date" 218 | }, 219 | { 220 | "name": "report", 221 | "title": "Path/url to the full GoodTabels report", 222 | "type": "string" 223 | } 224 | ], 225 | "foreignKeys": [ 226 | { 227 | "fields": "source_id", 228 | "reference": { 229 | "fields": "id", 230 | "resource": "source_file" 231 | } 232 | }, 233 | { 234 | "fields": "publisher_id", 235 | "reference": { 236 | "fields": "id", 237 | "resource": "publisher_file" 238 | } 239 | }, 240 | { 241 | "fields": "run_id", 242 | "reference": { 243 | "fields": "id", 244 | "resource": "run_file" 245 | } 246 | } 247 | ], 248 | "primaryKey": "id" 249 | } 250 | }, 251 | { 252 | "name": "performance_file", 253 | "path": "performance.csv", 254 | "schema": { 255 | "fields": [ 256 | { 257 | "constraints": { 258 | "required": true, 259 | "unique": true 260 | }, 261 | "name": "publisher_id", 262 | "title": "ID of the publisher", 263 | "type": "string" 264 | }, 265 | { 266 | "constraints": { 267 | "required": true 268 | }, 269 | "format": "date", 270 | "name": "month_of_creation", 271 | "title": "Month when the source was created", 272 | "type": "date" 273 | }, 274 | { 275 | "constraints": { 276 | "required": true 277 | }, 278 | "name": "files_count", 279 | "title": "Number of files published by the publisher during period", 280 | "type": "integer" 281 | }, 282 | { 283 | "constraints": { 284 | "required": true 285 | }, 286 | "name": "score", 287 | "title": "Rounded average score of files published by the publisher during period", 288 | "type": "integer" 289 | }, 290 | { 291 | "constraints": { 292 | "required": true 293 | }, 294 | "name": "valid", 295 | "title": "Number of valid files published by the publisher during period", 296 | "type": "integer" 297 | }, 298 | { 299 | "constraints": { 300 | "required": true 301 | }, 302 | "name": "files_count_to_date", 303 | "title": "Number of files published by the publisher up to period", 304 | "type": "integer" 305 | }, 306 | { 307 | "constraints": { 308 | "required": true 309 | }, 310 | "name": "score_to_date", 311 | "title": "Rounded average score of files published by the publisher up to period", 312 | "type": "integer" 313 | }, 314 | { 315 | "constraints": { 316 | "required": true 317 | }, 318 | "name": "valid_to_date", 319 | "title": "Number of valid files published by the publisher up to period", 320 | "type": "integer" 321 | } 322 | ], 323 | "foreignKeys": [ 324 | { 325 | "fields": "publisher_id", 326 | "reference": { 327 | "fields": "id", 328 | "resource": "publisher_file" 329 | } 330 | } 331 | ] 332 | } 333 | } 334 | ], 335 | "sources": [], 336 | "validator_url": "https://goodtables.okfnlabs.org/api/run" 337 | } -------------------------------------------------------------------------------- /tests/fixtures/datapackage_missing_required.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "", 3 | "last_modified": "", 4 | "validator_url": "https://goodtables.okfnlabs.org/api/run", 5 | "admin": "", 6 | "pitch": "", 7 | "context": "", 8 | "sources": [{"name": "", "web": ""}], 9 | "resources": [ 10 | { 11 | "path": "publishers.csv", 12 | "name": "publisher_file", 13 | "schema": { 14 | "fields": [ 15 | { 16 | "name": "id", 17 | "title": "ID of the publisher", 18 | "type": "string", 19 | "constraints": { "required": true, "unique": 
true } 20 | }, 21 | ], 22 | "primaryKey": "id" 23 | } 24 | }, 25 | { 26 | "path": "sources.csv", 27 | "name": "source_file", 28 | "schema": { 29 | "fields": [ 30 | { 31 | "name": "id", 32 | "title": "ID of the source", 33 | "type": "string", 34 | "constraints": { "required": true, "unique": true } 35 | }, 36 | { 37 | "name": "publisher_id", 38 | "title": "ID of the source's publisher", 39 | "type": "string", 40 | "constraints": { "required": true, "unique": true } 41 | }, 42 | { 43 | "name": "title", 44 | "title": "Title of the source", 45 | "type": "string", 46 | "constraints": { "required": true } 47 | }, 48 | { 49 | "name": "data", 50 | "title": "Path/url to source", 51 | "type": "string", 52 | "constraints": { "required": true } 53 | }, 54 | { 55 | "name": "format", 56 | "title": "File format of the source", 57 | "type": "string" 58 | }, 59 | { 60 | "name": "created_at", 61 | "title": "Time covered by the source / of its creation", 62 | "type": "string", 63 | "constraints": { "required": true } 64 | } 65 | ], 66 | "primaryKey": "id", 67 | "foreignKeys": [ 68 | { 69 | "fields": "publisher_id", 70 | "reference": { 71 | "resource": "publisher_file", 72 | "fields": "id" 73 | } 74 | } 75 | ] 76 | } 77 | }, 78 | { 79 | "path": "runs.csv", 80 | "name": "run_file", 81 | "schema": { 82 | "fields": [ 83 | { 84 | "name": "id", 85 | "title": "ID of the run", 86 | "type": "string", 87 | "constraints": { "required": true, "unique": true } 88 | }, 89 | { 90 | "name": "timestamp", 91 | "title": "Timestamp of the run execution", 92 | "type": "date", 93 | "format": "datetime", 94 | "constraints": { "required": true } 95 | }, 96 | { 97 | "name": "total_score", 98 | "title": "Rounded average score of results in this run", 99 | "type": "integer", 100 | "constraints": { "required": true} 101 | } 102 | ], 103 | "primaryKey": "id" 104 | } 105 | }, 106 | { 107 | "path": "results.csv", 108 | "name": "result_file", 109 | "schema": { 110 | "fields": [ 111 | { 112 | "name": "id", 113 | "title": "ID of the result", 114 | "type": "string", 115 | "constraints": { "required": true, "unique": true } 116 | }, 117 | { 118 | "name": "source_id", 119 | "title": "ID of the correspoding source", 120 | "type": "string", 121 | "constraints": { "required": true, "unique": true } 122 | }, 123 | { 124 | "name": "publisher_id", 125 | "title": "ID of the source's publisher", 126 | "type": "string", 127 | "constraints": { "required": true} 128 | }, 129 | { 130 | "name": "created_at", 131 | "title": "Time covered by the source / of its creation", 132 | "type": "date", 133 | "format": "date", 134 | "constraints": { "required": true } 135 | }, 136 | { 137 | "name": "data", 138 | "title": "Path/url to source", 139 | "type": "string", 140 | "constraints": { "required": true } 141 | }, 142 | { 143 | "name": "schema", 144 | "title": "Path/url to the source's schema", 145 | "type": "string" 146 | }, 147 | { 148 | "name": "score", 149 | "title": "Score of correctness given by GoodTables", 150 | "type": "integer", 151 | "contrains": { "required": true } 152 | }, 153 | { 154 | "name": "summary", 155 | "title": "Summary", 156 | "type": "string" 157 | }, 158 | { 159 | "name": "run_id", 160 | "title": "ID of the run in which the result was calculated", 161 | "type": "string", 162 | "constraints": { "required": true, "unique": true } 163 | }, 164 | { 165 | "name": "timestamp", 166 | "title": "Timestamp of the run execution", 167 | "type": "date", 168 | "format": "datetime", 169 | "constraints": { "required": true } 170 | }, 171 | { 172 | "name": 
"report", 173 | "title": "Path/url to the full GoodTabels report", 174 | "type": "string" 175 | } 176 | ], 177 | "primaryKey": "id", 178 | "foreignKeys": [ 179 | { 180 | "fields": "source_id", 181 | "reference": { 182 | "resource": "source_file", 183 | "fields": "id" 184 | } 185 | }, 186 | { 187 | "fields": "publisher_id", 188 | "reference": { 189 | "resource": "publisher_file", 190 | "fields": "id" 191 | } 192 | }, 193 | { 194 | "fields": "run_id", 195 | "reference": { 196 | "resource": "run_file", 197 | "fields": "id" 198 | } 199 | } 200 | ] 201 | } 202 | } 203 | ] 204 | } -------------------------------------------------------------------------------- /tests/fixtures/datapackage_sources_with_period.json: -------------------------------------------------------------------------------- 1 | { 2 | "admin": "", 3 | "context": "", 4 | "last_modified": "", 5 | "name": "", 6 | "pitch": "", 7 | "resources": [ 8 | { 9 | "name": "publisher_file", 10 | "path": "publishers.csv", 11 | "schema": { 12 | "fields": [ 13 | { 14 | "constraints": { 15 | "required": true, 16 | "unique": true 17 | }, 18 | "name": "id", 19 | "title": "ID of the publisher", 20 | "type": "string" 21 | }, 22 | { 23 | "constraints": { 24 | "required": true, 25 | "unique": true 26 | }, 27 | "name": "title", 28 | "title": "Title or official name of the publisher", 29 | "type": "string" 30 | } 31 | ], 32 | "primaryKey": "id" 33 | } 34 | }, 35 | { 36 | "name": "source_file", 37 | "path": "sources_with_period_id.csv", 38 | "schema": { 39 | "fields": [ 40 | { 41 | "constraints": { 42 | "required": true, 43 | "unique": true 44 | }, 45 | "name": "id", 46 | "title": "ID of the source", 47 | "type": "string" 48 | }, 49 | { 50 | "constraints": { 51 | "required": true, 52 | "unique": true 53 | }, 54 | "name": "publisher_id", 55 | "title": "ID of the source's publisher", 56 | "type": "string" 57 | }, 58 | { 59 | "constraints": { 60 | "required": true 61 | }, 62 | "name": "title", 63 | "title": "Title of the source", 64 | "type": "string" 65 | }, 66 | { 67 | "constraints": { 68 | "required": true 69 | }, 70 | "name": "data", 71 | "title": "Path/url to source", 72 | "type": "string" 73 | }, 74 | { 75 | "name": "format", 76 | "title": "File format of the source", 77 | "type": "string" 78 | }, 79 | { 80 | "constraints": { 81 | "required": true 82 | }, 83 | "name": "created_at", 84 | "title": "Time of the source's creation.", 85 | "type": "string" 86 | } 87 | ], 88 | "foreignKeys": [ 89 | { 90 | "fields": "publisher_id", 91 | "reference": { 92 | "fields": "id", 93 | "resource": "publisher_file" 94 | } 95 | } 96 | ], 97 | "primaryKey": "id" 98 | } 99 | }, 100 | { 101 | "name": "run_file", 102 | "path": "runs.csv", 103 | "schema": { 104 | "fields": [ 105 | { 106 | "constraints": { 107 | "required": true, 108 | "unique": true 109 | }, 110 | "name": "id", 111 | "title": "ID of the run", 112 | "type": "string" 113 | }, 114 | { 115 | "constraints": { 116 | "required": true 117 | }, 118 | "format": "datetime", 119 | "name": "timestamp", 120 | "title": "Timestamp of the run execution", 121 | "type": "date" 122 | }, 123 | { 124 | "constraints": { 125 | "required": true 126 | }, 127 | "name": "total_score", 128 | "title": "Rounded average score of results in this run", 129 | "type": "integer" 130 | } 131 | ], 132 | "primaryKey": "id" 133 | } 134 | }, 135 | { 136 | "name": "result_file", 137 | "path": "results.csv", 138 | "schema": { 139 | "fields": [ 140 | { 141 | "constraints": { 142 | "required": true, 143 | "unique": true 144 | }, 145 | "name": "id", 146 
| "title": "ID of the result", 147 | "type": "string" 148 | }, 149 | { 150 | "constraints": { 151 | "required": true, 152 | "unique": true 153 | }, 154 | "name": "source_id", 155 | "title": "ID of the correspoding source", 156 | "type": "string" 157 | }, 158 | { 159 | "constraints": { 160 | "required": true 161 | }, 162 | "name": "publisher_id", 163 | "title": "ID of the source's publisher", 164 | "type": "string" 165 | }, 166 | { 167 | "constraints": { 168 | "required": true 169 | }, 170 | "name": "created_at", 171 | "title": "Time of the source's creation.", 172 | "type": "date", 173 | "format": "date" 174 | }, 175 | { 176 | "constraints": { 177 | "required": true 178 | }, 179 | "name": "data", 180 | "title": "Path/url to source", 181 | "type": "string" 182 | }, 183 | { 184 | "name": "schema", 185 | "title": "Path/url to the source's schema", 186 | "type": "string" 187 | }, 188 | { 189 | "contrains": { 190 | "required": true 191 | }, 192 | "name": "score", 193 | "title": "Score of correctness given by GoodTables", 194 | "type": "integer" 195 | }, 196 | { 197 | "name": "summary", 198 | "title": "Summary", 199 | "type": "string" 200 | }, 201 | { 202 | "constraints": { 203 | "required": true, 204 | "unique": true 205 | }, 206 | "name": "run_id", 207 | "title": "ID of the run in which the result was calculated", 208 | "type": "string" 209 | }, 210 | { 211 | "constraints": { 212 | "required": true 213 | }, 214 | "format": "datetime", 215 | "name": "timestamp", 216 | "title": "Timestamp of the run execution", 217 | "type": "date" 218 | }, 219 | { 220 | "name": "report", 221 | "title": "Path/url to the full GoodTabels report", 222 | "type": "string" 223 | } 224 | ], 225 | "foreignKeys": [ 226 | { 227 | "fields": "source_id", 228 | "reference": { 229 | "fields": "id", 230 | "resource": "source_file" 231 | } 232 | }, 233 | { 234 | "fields": "publisher_id", 235 | "reference": { 236 | "fields": "id", 237 | "resource": "publisher_file" 238 | } 239 | }, 240 | { 241 | "fields": "run_id", 242 | "reference": { 243 | "fields": "id", 244 | "resource": "run_file" 245 | } 246 | } 247 | ], 248 | "primaryKey": "id" 249 | } 250 | }, 251 | { 252 | "name": "performance_file", 253 | "path": "performance.csv", 254 | "schema": { 255 | "fields": [ 256 | { 257 | "constraints": { 258 | "required": true, 259 | "unique": true 260 | }, 261 | "name": "publisher_id", 262 | "title": "ID of the publisher", 263 | "type": "string" 264 | }, 265 | { 266 | "name": "month_of_creation", 267 | "title": "Month when the source was created", 268 | "type": "date", 269 | "format": "date", 270 | "constraints": { "required": true } 271 | }, 272 | { 273 | "constraints": { 274 | "required": true 275 | }, 276 | "name": "files_count", 277 | "title": "Number of files published by the publisher during period", 278 | "type": "integer" 279 | }, 280 | { 281 | "constraints": { 282 | "required": true 283 | }, 284 | "name": "score", 285 | "title": "Rounded average score of files published by the publisher during period", 286 | "type": "integer" 287 | }, 288 | { 289 | "constraints": { 290 | "required": true 291 | }, 292 | "name": "valid", 293 | "title": "Number of valid files published by the publisher during period", 294 | "type": "integer" 295 | }, 296 | { 297 | "constraints": { 298 | "required": true 299 | }, 300 | "name": "files_count_to_date", 301 | "title": "Number of files published by the publisher up to period", 302 | "type": "integer" 303 | }, 304 | { 305 | "constraints": { 306 | "required": true 307 | }, 308 | "name": "score_to_date", 309 | 
"title": "Rounded average score of files published by the publisher up to period", 310 | "type": "integer" 311 | }, 312 | { 313 | "constraints": { 314 | "required": true 315 | }, 316 | "name": "valid_to_date", 317 | "title": "Number of valid files published by the publisher up to period", 318 | "type": "integer" 319 | } 320 | ], 321 | "foreignKeys": [ 322 | { 323 | "fields": "publisher_id", 324 | "reference": { 325 | "fields": "id", 326 | "resource": "publisher_file" 327 | } 328 | } 329 | ] 330 | } 331 | } 332 | ], 333 | "sources": [], 334 | "validator_url": "https://goodtables.okfnlabs.org/api/run" 335 | } -------------------------------------------------------------------------------- /tests/fixtures/dq.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_dir": "", 3 | "cache_dir": "fetched", 4 | "result_file": "results.csv", 5 | "run_file": "runs.csv", 6 | "source_file": "sources.csv", 7 | "publisher_file": "publishers.csv", 8 | "performance_file": "performance.csv", 9 | "datapackage_file": "datapackage.json", 10 | "remotes": ["origin"], 11 | "branch": "master", 12 | "goodtables": { 13 | "goodtables_web": "http://goodtables.okfnlabs.org", 14 | "arguments": { 15 | "pipeline": { 16 | "processors": ["schema", "structure"], 17 | "encoding": "utf-8", 18 | "options": { 19 | "schema": {"case_insensitive_headers": true} 20 | }, 21 | "break_on_invalid_processor": false 22 | }, 23 | "batch": { 24 | "format_key": "format", 25 | "schema_key": "schema", 26 | "data_key": "data" 27 | } 28 | } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /tests/fixtures/fetched/empty_rows_multiple.csv: -------------------------------------------------------------------------------- 1 | id,name,age 2 | 1101,John,30 3 | 1102,Julie,26 4 | ,, 5 | ,, 6 | ,, 7 | ,, 8 | ,, 9 | ,, 10 | ,, 11 | ,, 12 | ,, 13 | ,, 14 | ,, 15 | -------------------------------------------------------------------------------- /tests/fixtures/fetched/valid.csv: -------------------------------------------------------------------------------- 1 | id,name 2 | 1,english 3 | 2,中国人 4 | -------------------------------------------------------------------------------- /tests/fixtures/performance.csv: -------------------------------------------------------------------------------- 1 | publisher_id,month_of_creation,files_count,score,valid,files_count_to_date,score_to_date,valid_to_date 2 | xx_dept1,2015-01-01,1,100,100,1,100,100 3 | xx_dept1,2015-02-01,0,0,0,1,100,100 4 | xx_dept1,2015-03-01,0,0,0,1,100,100 5 | xx_dept1,2015-04-01,0,0,0,1,100,100 6 | xx_dept1,2015-05-01,0,0,0,1,100,100 7 | xx_dept1,2015-06-01,0,0,0,1,100,100 8 | xx_dept1,2015-07-01,0,0,0,1,100,100 9 | xx_dept1,2015-08-01,0,0,0,1,100,100 10 | xx_dept1,2015-09-01,0,0,0,1,100,100 11 | xx_dept1,2015-10-01,0,0,0,1,100,100 12 | xx_dept1,2015-11-01,0,0,0,1,100,100 13 | xx_dept1,2015-12-01,0,0,0,1,100,100 14 | xx_dept1,2016-01-01,0,0,0,1,100,100 15 | xx_dept1,2016-02-01,0,0,0,1,100,100 16 | xx_dept1,2016-03-01,0,0,0,1,100,100 17 | xx_dept1,2016-04-01,0,0,0,1,100,100 18 | xx_dept1,2016-05-01,0,0,0,1,100,100 19 | xx_dept1,2016-06-01,0,0,0,1,100,100 20 | xx_dept1,2016-07-01,0,0,0,1,100,100 21 | xx_dept1,2016-08-01,0,0,0,1,100,100 22 | xx_dept1,2016-09-01,0,0,0,1,100,100 23 | xx_dept1,2016-10-01,0,0,0,1,100,100 24 | xx_dept1,2016-11-01,0,0,0,1,100,100 25 | xx_dept1,2016-12-01,0,0,0,1,100,100 26 | xx_dept1,2017-01-01,0,0,0,1,100,100 27 | xx_dept1,2017-02-01,0,0,0,1,100,100 28 | 
xx_dept1,2017-03-01,0,0,0,1,100,100 29 | xx_dept1,2017-04-01,0,0,0,1,100,100 30 | xx_dept2,2015-01-01,0,0,0,0,0,0 31 | xx_dept2,2015-02-01,0,0,0,0,0,0 32 | xx_dept2,2015-03-01,0,0,0,0,0,0 33 | xx_dept2,2015-04-01,0,0,0,0,0,0 34 | xx_dept2,2015-05-01,0,0,0,0,0,0 35 | xx_dept2,2015-06-01,0,0,0,0,0,0 36 | xx_dept2,2015-07-01,0,0,0,0,0,0 37 | xx_dept2,2015-08-01,0,0,0,0,0,0 38 | xx_dept2,2015-09-01,0,0,0,0,0,0 39 | xx_dept2,2015-10-01,0,0,0,0,0,0 40 | xx_dept2,2015-11-01,0,0,0,0,0,0 41 | xx_dept2,2015-12-01,0,0,0,0,0,0 42 | xx_dept2,2016-01-01,0,0,0,0,0,0 43 | xx_dept2,2016-02-01,0,0,0,0,0,0 44 | xx_dept2,2016-03-01,0,0,0,0,0,0 45 | xx_dept2,2016-04-01,0,0,0,0,0,0 46 | xx_dept2,2016-05-01,0,0,0,0,0,0 47 | xx_dept2,2016-06-01,0,0,0,0,0,0 48 | xx_dept2,2016-07-01,0,0,0,0,0,0 49 | xx_dept2,2016-08-01,0,0,0,0,0,0 50 | xx_dept2,2016-09-01,0,0,0,0,0,0 51 | xx_dept2,2016-10-01,0,0,0,0,0,0 52 | xx_dept2,2016-11-01,0,0,0,0,0,0 53 | xx_dept2,2016-12-01,0,0,0,0,0,0 54 | xx_dept2,2017-01-01,0,0,0,0,0,0 55 | xx_dept2,2017-02-01,0,0,0,0,0,0 56 | xx_dept2,2017-03-01,0,0,0,0,0,0 57 | xx_dept2,2017-04-01,0,0,0,0,0,0 58 | xx_dept3,2015-01-01,0,0,0,0,0,0 59 | xx_dept3,2015-02-01,0,0,0,0,0,0 60 | xx_dept3,2015-03-01,0,0,0,0,0,0 61 | xx_dept3,2015-04-01,0,0,0,0,0,0 62 | xx_dept3,2015-05-01,0,0,0,0,0,0 63 | xx_dept3,2015-06-01,0,0,0,0,0,0 64 | xx_dept3,2015-07-01,0,0,0,0,0,0 65 | xx_dept3,2015-08-01,0,0,0,0,0,0 66 | xx_dept3,2015-09-01,0,0,0,0,0,0 67 | xx_dept3,2015-10-01,0,0,0,0,0,0 68 | xx_dept3,2015-11-01,0,0,0,0,0,0 69 | xx_dept3,2015-12-01,0,0,0,0,0,0 70 | xx_dept3,2016-01-01,0,0,0,0,0,0 71 | xx_dept3,2016-02-01,0,0,0,0,0,0 72 | xx_dept3,2016-03-01,0,0,0,0,0,0 73 | xx_dept3,2016-04-01,0,0,0,0,0,0 74 | xx_dept3,2016-05-01,0,0,0,0,0,0 75 | xx_dept3,2016-06-01,0,0,0,0,0,0 76 | xx_dept3,2016-07-01,0,0,0,0,0,0 77 | xx_dept3,2016-08-01,0,0,0,0,0,0 78 | xx_dept3,2016-09-01,0,0,0,0,0,0 79 | xx_dept3,2016-10-01,0,0,0,0,0,0 80 | xx_dept3,2016-11-01,0,0,0,0,0,0 81 | xx_dept3,2016-12-01,0,0,0,0,0,0 82 | xx_dept3,2017-01-01,0,0,0,0,0,0 83 | xx_dept3,2017-02-01,0,0,0,0,0,0 84 | xx_dept3,2017-03-01,0,0,0,0,0,0 85 | xx_dept3,2017-04-01,0,0,0,0,0,0 86 | xx_dept4,2015-01-01,0,0,0,0,0,0 87 | xx_dept4,2015-02-01,0,0,0,0,0,0 88 | xx_dept4,2015-03-01,0,0,0,0,0,0 89 | xx_dept4,2015-04-01,0,0,0,0,0,0 90 | xx_dept4,2015-05-01,0,0,0,0,0,0 91 | xx_dept4,2015-06-01,0,0,0,0,0,0 92 | xx_dept4,2015-07-01,0,0,0,0,0,0 93 | xx_dept4,2015-08-01,0,0,0,0,0,0 94 | xx_dept4,2015-09-01,0,0,0,0,0,0 95 | xx_dept4,2015-10-01,0,0,0,0,0,0 96 | xx_dept4,2015-11-01,0,0,0,0,0,0 97 | xx_dept4,2015-12-01,0,0,0,0,0,0 98 | xx_dept4,2016-01-01,0,0,0,0,0,0 99 | xx_dept4,2016-02-01,0,0,0,0,0,0 100 | xx_dept4,2016-03-01,0,0,0,0,0,0 101 | xx_dept4,2016-04-01,0,0,0,0,0,0 102 | xx_dept4,2016-05-01,0,0,0,0,0,0 103 | xx_dept4,2016-06-01,0,0,0,0,0,0 104 | xx_dept4,2016-07-01,0,0,0,0,0,0 105 | xx_dept4,2016-08-01,0,0,0,0,0,0 106 | xx_dept4,2016-09-01,0,0,0,0,0,0 107 | xx_dept4,2016-10-01,0,0,0,0,0,0 108 | xx_dept4,2016-11-01,0,0,0,0,0,0 109 | xx_dept4,2016-12-01,0,0,0,0,0,0 110 | xx_dept4,2017-01-01,0,0,0,0,0,0 111 | xx_dept4,2017-02-01,0,0,0,0,0,0 112 | xx_dept4,2017-03-01,0,0,0,0,0,0 113 | xx_dept4,2017-04-01,0,0,0,0,0,0 114 | xx_dept5,2015-01-01,0,0,0,0,0,0 115 | xx_dept5,2015-02-01,0,0,0,0,0,0 116 | xx_dept5,2015-03-01,0,0,0,0,0,0 117 | xx_dept5,2015-04-01,0,0,0,0,0,0 118 | xx_dept5,2015-05-01,0,0,0,0,0,0 119 | xx_dept5,2015-06-01,0,0,0,0,0,0 120 | xx_dept5,2015-07-01,0,0,0,0,0,0 121 | xx_dept5,2015-08-01,0,0,0,0,0,0 122 | xx_dept5,2015-09-01,0,0,0,0,0,0 123 | 
xx_dept5,2015-10-01,0,0,0,0,0,0 124 | xx_dept5,2015-11-01,0,0,0,0,0,0 125 | xx_dept5,2015-12-01,0,0,0,0,0,0 126 | xx_dept5,2016-01-01,0,0,0,0,0,0 127 | xx_dept5,2016-02-01,0,0,0,0,0,0 128 | xx_dept5,2016-03-01,0,0,0,0,0,0 129 | xx_dept5,2016-04-01,0,0,0,0,0,0 130 | xx_dept5,2016-05-01,0,0,0,0,0,0 131 | xx_dept5,2016-06-01,0,0,0,0,0,0 132 | xx_dept5,2016-07-01,0,0,0,0,0,0 133 | xx_dept5,2016-08-01,0,0,0,0,0,0 134 | xx_dept5,2016-09-01,0,0,0,0,0,0 135 | xx_dept5,2016-10-01,0,0,0,0,0,0 136 | xx_dept5,2016-11-01,0,0,0,0,0,0 137 | xx_dept5,2016-12-01,0,0,0,0,0,0 138 | xx_dept5,2017-01-01,0,0,0,0,0,0 139 | xx_dept5,2017-02-01,0,0,0,0,0,0 140 | xx_dept5,2017-03-01,0,0,0,0,0,0 141 | xx_dept5,2017-04-01,0,0,0,0,0,0 142 | xx_dept6,2015-01-01,0,0,0,0,0,0 143 | xx_dept6,2015-02-01,0,0,0,0,0,0 144 | xx_dept6,2015-03-01,0,0,0,0,0,0 145 | xx_dept6,2015-04-01,0,0,0,0,0,0 146 | xx_dept6,2015-05-01,0,0,0,0,0,0 147 | xx_dept6,2015-06-01,0,0,0,0,0,0 148 | xx_dept6,2015-07-01,0,0,0,0,0,0 149 | xx_dept6,2015-08-01,0,0,0,0,0,0 150 | xx_dept6,2015-09-01,0,0,0,0,0,0 151 | xx_dept6,2015-10-01,0,0,0,0,0,0 152 | xx_dept6,2015-11-01,0,0,0,0,0,0 153 | xx_dept6,2015-12-01,0,0,0,0,0,0 154 | xx_dept6,2016-01-01,0,0,0,0,0,0 155 | xx_dept6,2016-02-01,0,0,0,0,0,0 156 | xx_dept6,2016-03-01,0,0,0,0,0,0 157 | xx_dept6,2016-04-01,0,0,0,0,0,0 158 | xx_dept6,2016-05-01,0,0,0,0,0,0 159 | xx_dept6,2016-06-01,0,0,0,0,0,0 160 | xx_dept6,2016-07-01,0,0,0,0,0,0 161 | xx_dept6,2016-08-01,0,0,0,0,0,0 162 | xx_dept6,2016-09-01,0,0,0,0,0,0 163 | xx_dept6,2016-10-01,0,0,0,0,0,0 164 | xx_dept6,2016-11-01,0,0,0,0,0,0 165 | xx_dept6,2016-12-01,0,0,0,0,0,0 166 | xx_dept6,2017-01-01,0,0,0,0,0,0 167 | xx_dept6,2017-02-01,0,0,0,0,0,0 168 | xx_dept6,2017-03-01,0,0,0,0,0,0 169 | xx_dept6,2017-04-01,0,0,0,0,0,0 170 | xx_dept7,2015-01-01,0,0,0,0,0,0 171 | xx_dept7,2015-02-01,0,0,0,0,0,0 172 | xx_dept7,2015-03-01,0,0,0,0,0,0 173 | xx_dept7,2015-04-01,0,0,0,0,0,0 174 | xx_dept7,2015-05-01,0,0,0,0,0,0 175 | xx_dept7,2015-06-01,0,0,0,0,0,0 176 | xx_dept7,2015-07-01,0,0,0,0,0,0 177 | xx_dept7,2015-08-01,0,0,0,0,0,0 178 | xx_dept7,2015-09-01,0,0,0,0,0,0 179 | xx_dept7,2015-10-01,0,0,0,0,0,0 180 | xx_dept7,2015-11-01,0,0,0,0,0,0 181 | xx_dept7,2015-12-01,0,0,0,0,0,0 182 | xx_dept7,2016-01-01,0,0,0,0,0,0 183 | xx_dept7,2016-02-01,0,0,0,0,0,0 184 | xx_dept7,2016-03-01,0,0,0,0,0,0 185 | xx_dept7,2016-04-01,0,0,0,0,0,0 186 | xx_dept7,2016-05-01,0,0,0,0,0,0 187 | xx_dept7,2016-06-01,0,0,0,0,0,0 188 | xx_dept7,2016-07-01,0,0,0,0,0,0 189 | xx_dept7,2016-08-01,0,0,0,0,0,0 190 | xx_dept7,2016-09-01,0,0,0,0,0,0 191 | xx_dept7,2016-10-01,0,0,0,0,0,0 192 | xx_dept7,2016-11-01,0,0,0,0,0,0 193 | xx_dept7,2016-12-01,0,0,0,0,0,0 194 | xx_dept7,2017-01-01,0,0,0,0,0,0 195 | xx_dept7,2017-02-01,0,0,0,0,0,0 196 | xx_dept7,2017-03-01,0,0,0,0,0,0 197 | xx_dept7,2017-04-01,0,0,0,0,0,0 198 | xx_dept8,2015-01-01,0,0,0,0,0,0 199 | xx_dept8,2015-02-01,0,0,0,0,0,0 200 | xx_dept8,2015-03-01,0,0,0,0,0,0 201 | xx_dept8,2015-04-01,0,0,0,0,0,0 202 | xx_dept8,2015-05-01,0,0,0,0,0,0 203 | xx_dept8,2015-06-01,0,0,0,0,0,0 204 | xx_dept8,2015-07-01,0,0,0,0,0,0 205 | xx_dept8,2015-08-01,0,0,0,0,0,0 206 | xx_dept8,2015-09-01,0,0,0,0,0,0 207 | xx_dept8,2015-10-01,0,0,0,0,0,0 208 | xx_dept8,2015-11-01,0,0,0,0,0,0 209 | xx_dept8,2015-12-01,0,0,0,0,0,0 210 | xx_dept8,2016-01-01,0,0,0,0,0,0 211 | xx_dept8,2016-02-01,0,0,0,0,0,0 212 | xx_dept8,2016-03-01,0,0,0,0,0,0 213 | xx_dept8,2016-04-01,0,0,0,0,0,0 214 | xx_dept8,2016-05-01,0,0,0,0,0,0 215 | xx_dept8,2016-06-01,0,0,0,0,0,0 216 | 
xx_dept8,2016-07-01,0,0,0,0,0,0 217 | xx_dept8,2016-08-01,0,0,0,0,0,0 218 | xx_dept8,2016-09-01,0,0,0,0,0,0 219 | xx_dept8,2016-10-01,0,0,0,0,0,0 220 | xx_dept8,2016-11-01,0,0,0,0,0,0 221 | xx_dept8,2016-12-01,0,0,0,0,0,0 222 | xx_dept8,2017-01-01,0,0,0,0,0,0 223 | xx_dept8,2017-02-01,0,0,0,0,0,0 224 | xx_dept8,2017-03-01,0,0,0,0,0,0 225 | xx_dept8,2017-04-01,0,0,0,0,0,0 226 | xx_dept9,2015-01-01,0,0,0,0,0,0 227 | xx_dept9,2015-02-01,0,0,0,0,0,0 228 | xx_dept9,2015-03-01,0,0,0,0,0,0 229 | xx_dept9,2015-04-01,0,0,0,0,0,0 230 | xx_dept9,2015-05-01,0,0,0,0,0,0 231 | xx_dept9,2015-06-01,0,0,0,0,0,0 232 | xx_dept9,2015-07-01,0,0,0,0,0,0 233 | xx_dept9,2015-08-01,0,0,0,0,0,0 234 | xx_dept9,2015-09-01,0,0,0,0,0,0 235 | xx_dept9,2015-10-01,0,0,0,0,0,0 236 | xx_dept9,2015-11-01,0,0,0,0,0,0 237 | xx_dept9,2015-12-01,0,0,0,0,0,0 238 | xx_dept9,2016-01-01,0,0,0,0,0,0 239 | xx_dept9,2016-02-01,0,0,0,0,0,0 240 | xx_dept9,2016-03-01,0,0,0,0,0,0 241 | xx_dept9,2016-04-01,0,0,0,0,0,0 242 | xx_dept9,2016-05-01,0,0,0,0,0,0 243 | xx_dept9,2016-06-01,0,0,0,0,0,0 244 | xx_dept9,2016-07-01,0,0,0,0,0,0 245 | xx_dept9,2016-08-01,0,0,0,0,0,0 246 | xx_dept9,2016-09-01,0,0,0,0,0,0 247 | xx_dept9,2016-10-01,0,0,0,0,0,0 248 | xx_dept9,2016-11-01,0,0,0,0,0,0 249 | xx_dept9,2016-12-01,0,0,0,0,0,0 250 | xx_dept9,2017-01-01,0,0,0,0,0,0 251 | xx_dept9,2017-02-01,0,0,0,0,0,0 252 | xx_dept9,2017-03-01,0,0,0,0,0,0 253 | xx_dept9,2017-04-01,0,0,0,0,0,0 254 | xx_dept10,2015-01-01,0,0,0,0,0,0 255 | xx_dept10,2015-02-01,0,0,0,0,0,0 256 | xx_dept10,2015-03-01,0,0,0,0,0,0 257 | xx_dept10,2015-04-01,0,0,0,0,0,0 258 | xx_dept10,2015-05-01,0,0,0,0,0,0 259 | xx_dept10,2015-06-01,0,0,0,0,0,0 260 | xx_dept10,2015-07-01,0,0,0,0,0,0 261 | xx_dept10,2015-08-01,0,0,0,0,0,0 262 | xx_dept10,2015-09-01,0,0,0,0,0,0 263 | xx_dept10,2015-10-01,0,0,0,0,0,0 264 | xx_dept10,2015-11-01,0,0,0,0,0,0 265 | xx_dept10,2015-12-01,0,0,0,0,0,0 266 | xx_dept10,2016-01-01,0,0,0,0,0,0 267 | xx_dept10,2016-02-01,0,0,0,0,0,0 268 | xx_dept10,2016-03-01,0,0,0,0,0,0 269 | xx_dept10,2016-04-01,0,0,0,0,0,0 270 | xx_dept10,2016-05-01,0,0,0,0,0,0 271 | xx_dept10,2016-06-01,0,0,0,0,0,0 272 | xx_dept10,2016-07-01,0,0,0,0,0,0 273 | xx_dept10,2016-08-01,0,0,0,0,0,0 274 | xx_dept10,2016-09-01,0,0,0,0,0,0 275 | xx_dept10,2016-10-01,0,0,0,0,0,0 276 | xx_dept10,2016-11-01,0,0,0,0,0,0 277 | xx_dept10,2016-12-01,0,0,0,0,0,0 278 | xx_dept10,2017-01-01,0,0,0,0,0,0 279 | xx_dept10,2017-02-01,0,0,0,0,0,0 280 | xx_dept10,2017-03-01,0,0,0,0,0,0 281 | xx_dept10,2017-04-01,0,0,0,0,0,0 282 | xx_dept11,2015-01-01,0,0,0,0,0,0 283 | xx_dept11,2015-02-01,0,0,0,0,0,0 284 | xx_dept11,2015-03-01,0,0,0,0,0,0 285 | xx_dept11,2015-04-01,0,0,0,0,0,0 286 | xx_dept11,2015-05-01,0,0,0,0,0,0 287 | xx_dept11,2015-06-01,0,0,0,0,0,0 288 | xx_dept11,2015-07-01,0,0,0,0,0,0 289 | xx_dept11,2015-08-01,0,0,0,0,0,0 290 | xx_dept11,2015-09-01,0,0,0,0,0,0 291 | xx_dept11,2015-10-01,0,0,0,0,0,0 292 | xx_dept11,2015-11-01,0,0,0,0,0,0 293 | xx_dept11,2015-12-01,0,0,0,0,0,0 294 | xx_dept11,2016-01-01,0,0,0,0,0,0 295 | xx_dept11,2016-02-01,0,0,0,0,0,0 296 | xx_dept11,2016-03-01,0,0,0,0,0,0 297 | xx_dept11,2016-04-01,0,0,0,0,0,0 298 | xx_dept11,2016-05-01,0,0,0,0,0,0 299 | xx_dept11,2016-06-01,0,0,0,0,0,0 300 | xx_dept11,2016-07-01,0,0,0,0,0,0 301 | xx_dept11,2016-08-01,0,0,0,0,0,0 302 | xx_dept11,2016-09-01,0,0,0,0,0,0 303 | xx_dept11,2016-10-01,0,0,0,0,0,0 304 | xx_dept11,2016-11-01,0,0,0,0,0,0 305 | xx_dept11,2016-12-01,0,0,0,0,0,0 306 | xx_dept11,2017-01-01,0,0,0,0,0,0 307 | xx_dept11,2017-02-01,0,0,0,0,0,0 308 | 
xx_dept11,2017-03-01,0,0,0,0,0,0 309 | xx_dept11,2017-04-01,0,0,0,0,0,0 310 | xx_dept12,2015-01-01,0,0,0,0,0,0 311 | xx_dept12,2015-02-01,0,0,0,0,0,0 312 | xx_dept12,2015-03-01,0,0,0,0,0,0 313 | xx_dept12,2015-04-01,0,0,0,0,0,0 314 | xx_dept12,2015-05-01,0,0,0,0,0,0 315 | xx_dept12,2015-06-01,0,0,0,0,0,0 316 | xx_dept12,2015-07-01,0,0,0,0,0,0 317 | xx_dept12,2015-08-01,0,0,0,0,0,0 318 | xx_dept12,2015-09-01,0,0,0,0,0,0 319 | xx_dept12,2015-10-01,0,0,0,0,0,0 320 | xx_dept12,2015-11-01,0,0,0,0,0,0 321 | xx_dept12,2015-12-01,0,0,0,0,0,0 322 | xx_dept12,2016-01-01,0,0,0,0,0,0 323 | xx_dept12,2016-02-01,0,0,0,0,0,0 324 | xx_dept12,2016-03-01,0,0,0,0,0,0 325 | xx_dept12,2016-04-01,0,0,0,0,0,0 326 | xx_dept12,2016-05-01,0,0,0,0,0,0 327 | xx_dept12,2016-06-01,0,0,0,0,0,0 328 | xx_dept12,2016-07-01,0,0,0,0,0,0 329 | xx_dept12,2016-08-01,0,0,0,0,0,0 330 | xx_dept12,2016-09-01,0,0,0,0,0,0 331 | xx_dept12,2016-10-01,0,0,0,0,0,0 332 | xx_dept12,2016-11-01,0,0,0,0,0,0 333 | xx_dept12,2016-12-01,0,0,0,0,0,0 334 | xx_dept12,2017-01-01,0,0,0,0,0,0 335 | xx_dept12,2017-02-01,0,0,0,0,0,0 336 | xx_dept12,2017-03-01,0,0,0,0,0,0 337 | xx_dept12,2017-04-01,0,0,0,0,0,0 338 | xx_dept13,2015-01-01,0,0,0,0,0,0 339 | xx_dept13,2015-02-01,0,0,0,0,0,0 340 | xx_dept13,2015-03-01,0,0,0,0,0,0 341 | xx_dept13,2015-04-01,0,0,0,0,0,0 342 | xx_dept13,2015-05-01,0,0,0,0,0,0 343 | xx_dept13,2015-06-01,0,0,0,0,0,0 344 | xx_dept13,2015-07-01,0,0,0,0,0,0 345 | xx_dept13,2015-08-01,0,0,0,0,0,0 346 | xx_dept13,2015-09-01,0,0,0,0,0,0 347 | xx_dept13,2015-10-01,0,0,0,0,0,0 348 | xx_dept13,2015-11-01,0,0,0,0,0,0 349 | xx_dept13,2015-12-01,0,0,0,0,0,0 350 | xx_dept13,2016-01-01,0,0,0,0,0,0 351 | xx_dept13,2016-02-01,0,0,0,0,0,0 352 | xx_dept13,2016-03-01,0,0,0,0,0,0 353 | xx_dept13,2016-04-01,0,0,0,0,0,0 354 | xx_dept13,2016-05-01,0,0,0,0,0,0 355 | xx_dept13,2016-06-01,0,0,0,0,0,0 356 | xx_dept13,2016-07-01,0,0,0,0,0,0 357 | xx_dept13,2016-08-01,0,0,0,0,0,0 358 | xx_dept13,2016-09-01,0,0,0,0,0,0 359 | xx_dept13,2016-10-01,0,0,0,0,0,0 360 | xx_dept13,2016-11-01,0,0,0,0,0,0 361 | xx_dept13,2016-12-01,0,0,0,0,0,0 362 | xx_dept13,2017-01-01,0,0,0,0,0,0 363 | xx_dept13,2017-02-01,0,0,0,0,0,0 364 | xx_dept13,2017-03-01,0,0,0,0,0,0 365 | xx_dept13,2017-04-01,0,0,0,0,0,0 366 | xx_dept14,2015-01-01,0,0,0,0,0,0 367 | xx_dept14,2015-02-01,0,0,0,0,0,0 368 | xx_dept14,2015-03-01,0,0,0,0,0,0 369 | xx_dept14,2015-04-01,0,0,0,0,0,0 370 | xx_dept14,2015-05-01,0,0,0,0,0,0 371 | xx_dept14,2015-06-01,0,0,0,0,0,0 372 | xx_dept14,2015-07-01,0,0,0,0,0,0 373 | xx_dept14,2015-08-01,0,0,0,0,0,0 374 | xx_dept14,2015-09-01,0,0,0,0,0,0 375 | xx_dept14,2015-10-01,0,0,0,0,0,0 376 | xx_dept14,2015-11-01,0,0,0,0,0,0 377 | xx_dept14,2015-12-01,0,0,0,0,0,0 378 | xx_dept14,2016-01-01,0,0,0,0,0,0 379 | xx_dept14,2016-02-01,0,0,0,0,0,0 380 | xx_dept14,2016-03-01,0,0,0,0,0,0 381 | xx_dept14,2016-04-01,0,0,0,0,0,0 382 | xx_dept14,2016-05-01,0,0,0,0,0,0 383 | xx_dept14,2016-06-01,0,0,0,0,0,0 384 | xx_dept14,2016-07-01,0,0,0,0,0,0 385 | xx_dept14,2016-08-01,0,0,0,0,0,0 386 | xx_dept14,2016-09-01,0,0,0,0,0,0 387 | xx_dept14,2016-10-01,0,0,0,0,0,0 388 | xx_dept14,2016-11-01,0,0,0,0,0,0 389 | xx_dept14,2016-12-01,0,0,0,0,0,0 390 | xx_dept14,2017-01-01,0,0,0,0,0,0 391 | xx_dept14,2017-02-01,0,0,0,0,0,0 392 | xx_dept14,2017-03-01,0,0,0,0,0,0 393 | xx_dept14,2017-04-01,0,0,0,0,0,0 394 | xx_dept15,2015-01-01,1,0,0,1,0,0 395 | xx_dept15,2015-02-01,0,0,0,1,0,0 396 | xx_dept15,2015-03-01,0,0,0,1,0,0 397 | xx_dept15,2015-04-01,0,0,0,1,0,0 398 | xx_dept15,2015-05-01,0,0,0,1,0,0 399 | 
xx_dept15,2015-06-01,0,0,0,1,0,0 400 | xx_dept15,2015-07-01,0,0,0,1,0,0 401 | xx_dept15,2015-08-01,0,0,0,1,0,0 402 | xx_dept15,2015-09-01,0,0,0,1,0,0 403 | xx_dept15,2015-10-01,0,0,0,1,0,0 404 | xx_dept15,2015-11-01,0,0,0,1,0,0 405 | xx_dept15,2015-12-01,0,0,0,1,0,0 406 | xx_dept15,2016-01-01,0,0,0,1,0,0 407 | xx_dept15,2016-02-01,0,0,0,1,0,0 408 | xx_dept15,2016-03-01,0,0,0,1,0,0 409 | xx_dept15,2016-04-01,0,0,0,1,0,0 410 | xx_dept15,2016-05-01,0,0,0,1,0,0 411 | xx_dept15,2016-06-01,0,0,0,1,0,0 412 | xx_dept15,2016-07-01,0,0,0,1,0,0 413 | xx_dept15,2016-08-01,0,0,0,1,0,0 414 | xx_dept15,2016-09-01,0,0,0,1,0,0 415 | xx_dept15,2016-10-01,0,0,0,1,0,0 416 | xx_dept15,2016-11-01,0,0,0,1,0,0 417 | xx_dept15,2016-12-01,0,0,0,1,0,0 418 | xx_dept15,2017-01-01,0,0,0,1,0,0 419 | xx_dept15,2017-02-01,0,0,0,1,0,0 420 | xx_dept15,2017-03-01,0,0,0,1,0,0 421 | xx_dept15,2017-04-01,0,0,0,1,0,0 422 | all,2015-01-01,2,50,50,2,50,50 423 | all,2015-02-01,0,0,0,2,50,50 424 | all,2015-03-01,0,0,0,2,50,50 425 | all,2015-04-01,0,0,0,2,50,50 426 | all,2015-05-01,0,0,0,2,50,50 427 | all,2015-06-01,0,0,0,2,50,50 428 | all,2015-07-01,0,0,0,2,50,50 429 | all,2015-08-01,0,0,0,2,50,50 430 | all,2015-09-01,0,0,0,2,50,50 431 | all,2015-10-01,0,0,0,2,50,50 432 | all,2015-11-01,0,0,0,2,50,50 433 | all,2015-12-01,0,0,0,2,50,50 434 | all,2016-01-01,0,0,0,2,50,50 435 | all,2016-02-01,0,0,0,2,50,50 436 | all,2016-03-01,0,0,0,2,50,50 437 | all,2016-04-01,0,0,0,2,50,50 438 | all,2016-05-01,0,0,0,2,50,50 439 | all,2016-06-01,0,0,0,2,50,50 440 | all,2016-07-01,0,0,0,2,50,50 441 | all,2016-08-01,0,0,0,2,50,50 442 | all,2016-09-01,0,0,0,2,50,50 443 | all,2016-10-01,0,0,0,2,50,50 444 | all,2016-11-01,0,0,0,2,50,50 445 | all,2016-12-01,0,0,0,2,50,50 446 | all,2017-01-01,0,0,0,2,50,50 447 | all,2017-02-01,0,0,0,2,50,50 448 | all,2017-03-01,0,0,0,2,50,50 449 | all,2017-04-01,0,0,0,2,50,50 450 | -------------------------------------------------------------------------------- /tests/fixtures/publishers.csv: -------------------------------------------------------------------------------- 1 | id,parent_id,name,description,url,jurisdiction_code,email,address,contact,score,source_count 2 | xx_dept1,,Department 1,,http://www.example.com/dept1,XX,dept1-admin@example.com,,Dept1 Admin,8,2 3 | xx_dept2,,Department 2,,http://www.example.com/dept2,XX,dept2-admin@example.com,,Dept2 Admin,8,2 4 | xx_dept3,,Department 3,,http://www.example.com/dept3,XX,dept3-admin@example.com,,Dept3 Admin,8,2 5 | xx_dept4,,Department 4,,http://www.example.com/dept4,XX,dept4-admin@example.com,,Dept4 Admin,8,2 6 | xx_dept5,,Department 5,,http://www.example.com/dept5,XX,dept5-admin@example.com,,Dept5 Admin,8,2 7 | xx_dept6,,Department 6,,http://www.example.com/dept6,XX,dept6-admin@example.com,,Dept6 Admin,8,1 8 | xx_dept7,,Department 7,,http://www.example.com/dept7,XX,dept7-admin@example.com,,Dept7 Admin,8,1 9 | xx_dept8,,Department 8,,http://www.example.com/dept8,XX,dept8-admin@example.com,,Dept8 Admin,8,1 10 | xx_dept9,,Department 9,,http://www.example.com/dept9,XX,dept9-admin@example.com,,Dept9 Admin,8,1 11 | xx_dept10,,Department 10,,http://www.example.com/dept10,XX,dept10-admin@example.com,,Dept10 Admin,8,1 12 | xx_dept11,xx_dept1,Department 11,,http://www.example.com/dept11,XX,dept-1-admin@example.com,,Dept11 Admin,8,1 13 | xx_dept12,xx_dept2,Department 12,,http://www.example.com/dept12,XX,dept-1-admin@example.com,,Dept12 Admin,8,1 14 | xx_dept13,xx_dept3,Department 13,,http://www.example.com/dept13,XX,dept-1-admin@example.com,,Dept13 Admin,8,1 15 | 
xx_dept14,xx_dept4,Department 14,,http://www.example.com/dept14,XX,dept-1-admin@example.com,,Dept14 Admin,8,1 16 | xx_dept15,xx_dept5,Department 15,,http://www.example.com/dept15,XX,dept-1-admin@example.com,,Dept15 Admin,8,1 17 | -------------------------------------------------------------------------------- /tests/fixtures/results.csv: -------------------------------------------------------------------------------- 1 | id,source_id,publisher_id,created_at,data,schema,score,summary,run_id,timestamp,report 2 | ce7752c9bd1a4f96a2459713687e9a72,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/okfn/tabular-validator/master/examples/valid.csv,,100,,45761052f48a4b158314b45d5ff08291,2016-08-08 17:42:12.141037+00:00,http://goodtables.okfnlabs.org 3 | 31190ecbda0744208ee447c14a4d3683,source3,xx_dept16,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables/master/examples/empty_rows_multiple.csv,,67,,45761052f48a4b158314b45d5ff08291,2016-08-08 17:42:12.141037+00:00,http://goodtables.okfnlabs.org 4 | e1463a9574d348f9abf13ea59a1e6c70,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/okfn/tabular-validator/master/examples/valid.csv,,98,,7841c39e40ba475dab4f34e554aeef1e,2016-08-08 17:42:13.978475+00:00,http://goodtables.okfnlabs.org 5 | 678512637ef044b9a298ea29eca902e4,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/okfn/tabular-validator/master/examples/valid.csv,,100,,e7d90595bfca4ef6b486ac4263b2c4e5,2016-08-08 17:42:14.369176+00:00,http://goodtables.okfnlabs.org 6 | fa8776d6871f4058858bec851ee27cb2,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/okfn/tabular-validator/master/examples/valid.csv,,100,,704ca77f1f5a48bba44dfb7d2e84bb77,2016-08-08 17:42:14.610112+00:00,http://goodtables.okfnlabs.org 7 | eb27fff6e6454040a0a26f388588d816,source3,xx_dept16,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables/master/examples/empty_rows_multiple.csv,,0,,9fd42bfeaa004af996d93530d616555a,2016-08-08 17:42:15.123895+00:00,http://goodtables.okfnlabs.org 8 | 2e0ce1e3811b49fc830e75bc800bd1b6,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/okfn/tabular-validator/master/examples/valid.csv,,100,,00baee32a3f44619b2febe224b198f64,2016-08-08 17:42:36.428362+00:00,http://goodtables.okfnlabs.org 9 | e4bf2c5bf1724c278898eb1d8f34f0bd,source3,xx_dept16,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables/master/examples/empty_rows_multiple.csv,,67,,00baee32a3f44619b2febe224b198f64,2016-08-08 17:42:36.428362+00:00,http://goodtables.okfnlabs.org 10 | 3822297dc83d4b6588c571e8d663d44a,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/okfn/tabular-validator/master/examples/valid.csv,,98,,318875d4fda648f3926bb70cfc2163e9,2016-08-08 17:42:37.366584+00:00,http://goodtables.okfnlabs.org 11 | 072c65bcdeda4219b2cbdb68695c57cc,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/okfn/tabular-validator/master/examples/valid.csv,,100,,fea262ac9dcb4ba3a2fd3e8d06eb5521,2016-08-08 17:42:37.584550+00:00,http://goodtables.okfnlabs.org 12 | 22c8597f93964838be3aded23827e295,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/okfn/tabular-validator/master/examples/valid.csv,,100,,7b2d9e44605849f28fb47cf7b8e9c36e,2016-08-08 17:42:37.785201+00:00,http://goodtables.okfnlabs.org 13 | f2a7bd1359c143bfb1e2df229338ad99,source3,xx_dept16,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables/master/examples/empty_rows_multiple.csv,,0,,516f62d0631d418691012dac89046c7c,2016-08-08 
17:42:38.215569+00:00,http://goodtables.okfnlabs.org 14 | fd27c6d4fb8142f38b43ea52cc16f2be,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/okfn/tabular-validator/master/examples/valid.csv,,100,,288631224a51428ab59159a440b5e909,2016-08-08 17:42:54.854697+00:00,http://goodtables.okfnlabs.org 15 | eb0e8a70775c43459c4b5a1a068b6011,source3,xx_dept16,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables/master/examples/empty_rows_multiple.csv,,67,,288631224a51428ab59159a440b5e909,2016-08-08 17:42:54.854697+00:00,http://goodtables.okfnlabs.org 16 | f674ea3ec1c04bfc987a3d65cc652c42,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/okfn/tabular-validator/master/examples/valid.csv,,98,,81032d4fb6ab4642afa8bb82e0e88ebd,2016-08-08 17:42:56.360466+00:00,http://goodtables.okfnlabs.org 17 | 92953faad075412ca490f31b81a29fcf,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/okfn/tabular-validator/master/examples/valid.csv,,100,,36aa26c18f0b4ae0894a7897ebe3f2d6,2016-08-08 17:42:56.620225+00:00,http://goodtables.okfnlabs.org 18 | f302ab531bc146d6929380b33afd4072,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/okfn/tabular-validator/master/examples/valid.csv,,100,,8ccd3cba19cc47bd9258c0e66f5dca98,2016-08-08 17:42:56.834069+00:00,http://goodtables.okfnlabs.org 19 | 91125b1297e0408d972d9e1b20010543,source3,xx_dept16,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables/master/examples/empty_rows_multiple.csv,,0,,22fc4915f50141259e717f2d6724c6d0,2016-08-08 17:42:57.284804+00:00,http://goodtables.okfnlabs.org 20 | 38aa93bd3b7540b4a383f72914a32e95,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv,,100,,a067e4c7d5d341a689c7dfda36d54049,2017-04-18 09:27:19.019507+00:00,http://goodtables.okfnlabs.org 21 | 3e1e824daa2f4a8ab08f1b5b781e35a8,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv,,98,,ad6a9ad5effd47bd8246adaa5d7fa06d,2017-04-18 09:27:20.827406+00:00,http://goodtables.okfnlabs.org 22 | ef384476e5764361b7fa2f0927cd7e46,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv,,100,,b3afebc6cd90495abd48651bf3468533,2017-04-18 09:27:21.314295+00:00,http://goodtables.okfnlabs.org 23 | c8b5879aa25e46899225d94b0cb4ae82,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv,,100,,66d21a37fe6d4149aaae0610afd377bf,2017-04-18 09:27:21.767780+00:00,http://goodtables.okfnlabs.org 24 | 515d2055c63c41689292b418945bd8e9,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv,,100,,0756e905014540ae84a870b61896e5a4,2017-04-18 09:33:46.645467+00:00,http://goodtables.okfnlabs.org 25 | e3674d1fb27a4b478d6eb7047a469507,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv,,98,,70ef71ff17f3480889c3dc9d0e6572f8,2017-04-18 09:33:48.489108+00:00,http://goodtables.okfnlabs.org 26 | 4bde14a9541546feafff7f53ab315493,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv,,100,,2822a2e498d747c897b9499b4dbf6dc2,2017-04-18 09:33:48.971573+00:00,http://goodtables.okfnlabs.org 27 | 
dcee66bb5eeb46d0a55bcf56d1f2d8b1,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv,,100,,6e3d4b85e9fb4a84b62d483ce354dd4d,2017-04-18 09:33:49.448480+00:00,http://goodtables.okfnlabs.org 28 | fbf1784269fc4511b4b6febbfa57a23e,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv,,100,,e800081ce79d4bb68d33e286e40a845d,2017-04-18 10:17:58.833972+00:00,http://goodtables.okfnlabs.org 29 | 90fa232cd63d4bfbb47873483b2dfa01,source3,xx_dept15,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/empty_rows_multiple.csv,,67,,e800081ce79d4bb68d33e286e40a845d,2017-04-18 10:17:58.833972+00:00,http://goodtables.okfnlabs.org 30 | b318a27f153b468e9af757e2d05a320c,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv,,98,,ae1edf8cf8dd4931aeada91629e9ce2f,2017-04-18 10:18:02.214709+00:00,http://goodtables.okfnlabs.org 31 | ce721fec98d443438e60e61492d87b76,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv,,100,,7fd67e1fb1fe4562839f56dd1996e56c,2017-04-18 10:18:02.693835+00:00,http://goodtables.okfnlabs.org 32 | c60eef935157433786ad8a08872014a1,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv,,100,,1feb92e96b514e3b9613ab189fecff0c,2017-04-18 10:18:03.173207+00:00,http://goodtables.okfnlabs.org 33 | fcd86a5eaea94c5b9c46f1043d634140,source3,xx_dept15,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/empty_rows_multiple.csv,,0,,17a0dbbce5fd4fddaa1365fb91ccfa5b,2017-04-18 10:18:04.337754+00:00,http://goodtables.okfnlabs.org 34 | e91bb0379a2d4fd294f2ed04cafd0e23,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv,,100,,b5e36109c3894c70a7e6898f03f3be3c,2017-04-18 15:15:41.856361+00:00,http://goodtables.okfnlabs.org 35 | 0100dcccdae940d4a41c42369cb40d9a,source3,xx_dept15,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/empty_rows_multiple.csv,,67,,b5e36109c3894c70a7e6898f03f3be3c,2017-04-18 15:15:41.856361+00:00,http://goodtables.okfnlabs.org 36 | f2753167cdba403f9d7b2616f09751fd,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv,,98,,a12154896fc7497c918405489b66017a,2017-04-18 15:15:44.465849+00:00,http://goodtables.okfnlabs.org 37 | 43e9f2c01b01446187b12cf4649fc48f,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv,,100,,dcb41983c603452cbd602122a80d4759,2017-04-18 15:15:45.090038+00:00,http://goodtables.okfnlabs.org 38 | 709175f155c545f3866daf5c4376e915,source1,xx_dept1,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv,,100,,6f6f0380302b4ab5bbda9df9e9df4ee9,2017-04-18 15:15:45.647236+00:00,http://goodtables.okfnlabs.org 39 | 1b1720b8d58d42b7847bf57196d4e8c4,source3,xx_dept15,2015-01-01,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/empty_rows_multiple.csv,,0,,21057087b1484c46af6f89414656df62,2017-04-18 15:15:46.884910+00:00,http://goodtables.okfnlabs.org 40 | -------------------------------------------------------------------------------- 
/tests/fixtures/runs.csv: -------------------------------------------------------------------------------- 1 | id,timestamp,total_score 2 | 45761052f48a4b158314b45d5ff08291,2016-08-08 17:42:12.141037+00:00,84 3 | f0c77831c2ea4fd1801075eb4dffc625,2016-08-08 17:42:14.867729+00:00,0 4 | 00baee32a3f44619b2febe224b198f64,2016-08-08 17:42:36.428362+00:00,84 5 | 073aa1002d4242c4ac6f0078f593f1dd,2016-08-08 17:42:37.989954+00:00,0 6 | 288631224a51428ab59159a440b5e909,2016-08-08 17:42:54.854697+00:00,84 7 | 5e4824a7be2548b4a94507b825483a07,2016-08-08 17:42:57.050376+00:00,0 8 | 34bbd2146948465aa05b58603bce9767,2017-04-18 09:27:22.266260+00:00,0 9 | 6a1351e5ae5b454381d77d971bda12ca,2017-04-18 09:33:49.929597+00:00,0 10 | e228cdeb374a407ca14ee0ff34f7df8f,2017-04-18 09:45:06.245453+00:00,0 11 | e800081ce79d4bb68d33e286e40a845d,2017-04-18 10:17:58.833972+00:00,84 12 | 8c0d2b4ded574f24b649ad6a9fdcee54,2017-04-18 10:18:03.774295+00:00,0 13 | b5e36109c3894c70a7e6898f03f3be3c,2017-04-18 15:15:41.856361+00:00,84 14 | a6ac3e7c43e64f42a33d3d2c500bef06,2017-04-18 15:15:46.238074+00:00,0 15 | -------------------------------------------------------------------------------- /tests/fixtures/sources.csv: -------------------------------------------------------------------------------- 1 | id,publisher_id,title,data,score,revision,schema,created_at,timestamp,format 2 | source1,xx_dept1,Source 1,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv,8,1,,2015-01-01,2015-01-01,csv 3 | source3,xx_dept15,Source 16,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/empty_rows_multiple.csv,,1,,2015-01-01,2015-01-01,csv 4 | -------------------------------------------------------------------------------- /tests/fixtures/sources_with_period_id.csv: -------------------------------------------------------------------------------- 1 | id,publisher_id,title,data,format,created_at,period_id 2 | source1,xx_dept1,Source 1,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv,csv,2015-01-01,17-10-2014/17-10-2014 3 | source2,xx_dept15,Source 15,https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/april-to-may-12th-2010.xls,excel,2010-05-01,01-04-2010/31-05-2010 4 | -------------------------------------------------------------------------------- /tests/mock_generator.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | from __future__ import unicode_literals 6 | 7 | from data_quality import generators 8 | 9 | class MockGenerator(generators.BaseGenerator): 10 | """This class deletes the current database and regenerates it""" 11 | 12 | def __init__(self, url=None, datapackage=None): 13 | """Create an instance 14 | 15 | Args: 16 | url: something to please the Base Generator 17 | """ 18 | 19 | super(MockGenerator, self).__init__(url) 20 | -------------------------------------------------------------------------------- /tests/tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/data-quality-cli/e9abc93b896ea59269d11cdc8f2d301f81be20ad/tests/tasks/__init__.py -------------------------------------------------------------------------------- /tests/tasks/test_aggregate.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import unittest 8 | import os 9 | from .test_task import TestTask 10 | from data_quality import tasks, utilities, compat 11 | from goodtables import pipeline 12 | 13 | 14 | class TestAggregatorTask(TestTask): 15 | """Test the Aggregator task""" 16 | 17 | def test_aggregator_run(self): 18 | """Test that Aggregator task runs as post task and updates results""" 19 | 20 | aggregator_task = tasks.Aggregator(self.config) 21 | url = 'https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv' 22 | pipeline_instance = pipeline.Pipeline(data=url, format='csv', 23 | post_task=aggregator_task.run) 24 | results_before_run = self.read_file_contents(aggregator_task.result_file) 25 | pipeline_instance.run() 26 | results_after_run = self.read_file_contents(aggregator_task.result_file) 27 | 28 | self.assertEqual(len(results_after_run), len(results_before_run) + 1) 29 | 30 | def test_agregator_batch_run(self): 31 | """Test that Aggregator task updates run file after each batch""" 32 | 33 | config = self.config 34 | aggregator_task = tasks.Aggregator(config) 35 | 36 | def mokup_function(instance): 37 | aggregator_task.write_run() 38 | batch_options = config['goodtables']['arguments']['batch'] 39 | batch_options['post_task'] = mokup_function 40 | batch_options['pipeline_options'] = config['goodtables']['arguments']['pipeline'] 41 | batch = pipeline.Batch(aggregator_task.source_file, **batch_options) 42 | runs_before_run = self.read_file_contents(aggregator_task.run_file) 43 | batch.run() 44 | runs_after_run = self.read_file_contents(aggregator_task.run_file) 45 | 46 | self.assertGreater(len(runs_after_run), len(runs_before_run)) 47 | 48 | def test_aggregator_fetch(self): 49 | """Test that Aggregator task fetches the source""" 50 | 51 | aggregator_task = tasks.Aggregator(self.config) 52 | url = 'https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv' 53 | utilities.set_up_cache_dir(aggregator_task.cache_dir) 54 | 55 | pipeline_instance = pipeline.Pipeline(data=url, format='csv', 56 | post_task=aggregator_task.run) 57 | pipeline_instance.run() 58 | file_names = [] 59 | for file_name in os.listdir(aggregator_task.cache_dir): 60 | file_names.append(file_name) 61 | self.assertEquals(file_names,['valid.csv']) 62 | 63 | def test_aggregator_assess_timeliness(self): 64 | """Test that Aggregator calls the RelevancePeriodExtractor""" 65 | 66 | self.config['source_file'] = 'sources_with_period_id.csv' 67 | self.config['datapackage_file'] = 'datapackage_sources_with_period.json' 68 | self.config['assess_timeliness'] = True 69 | self.config['timeliness']['timeliness_strategy'] = ['period_id'] 70 | extractor = tasks.extract_relevance_period.RelevancePeriodExtractor(self.config) 71 | extractor.run() 72 | aggregator_task = tasks.Aggregator(self.config) 73 | url = 'https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/valid.csv' 74 | pipeline_instance = pipeline.Pipeline(data=url, format='csv', 75 | post_task=aggregator_task.run) 76 | pipeline_instance.run() 77 | updated_sources = self.read_file_contents(aggregator_task.result_file) 78 | result = updated_sources[-1] 79 | score = int(result['score']) 80 | self.assertEqual(98, score) 81 | 82 | def 
tests_aggreate_scoring(self): 83 | """Test Aggregator scoring""" 84 | 85 | aggregator_task = tasks.Aggregator(self.config) 86 | url = 'https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/empty_rows_multiple.csv' 87 | schema = 'https://raw.githubusercontent.com/frictionlessdata/goodtables-py/v1.0.0-alpha8/data/test_schema.json' 88 | pipeline_options = self.config['goodtables']['arguments']['pipeline'] 89 | pipeline_options['options']['schema']['schema'] = schema 90 | pipeline_instance = pipeline.Pipeline(data=url, format='csv', 91 | post_task=aggregator_task.run, 92 | **pipeline_options) 93 | pipeline_instance.run() 94 | result = self.read_file_contents(aggregator_task.result_file)[-1] 95 | 96 | self.assertEqual(int(result['score']), 0) 97 | 98 | def read_file_contents(self, file_name): 99 | """Return file contents as list of dicts""" 100 | 101 | contents = [] 102 | with compat.UnicodeDictReader(file_name) as src_file: 103 | for line in src_file: 104 | contents.append(line) 105 | return contents 106 | -------------------------------------------------------------------------------- /tests/tasks/test_assess_performance.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import unittest 8 | import os 9 | from .test_task import TestTask 10 | from data_quality import tasks, utilities, compat 11 | 12 | class TestPerformanceAssessorTask(TestTask): 13 | """Test the PerformanceAssessor task""" 14 | 15 | def test_performance_created(self): 16 | """Test that PerformanceAssessor task creates the performance file""" 17 | 18 | config = self.config 19 | assess_performance_task = tasks.PerformanceAssessor(config) 20 | assess_performance_task.run() 21 | self.assertTrue(os.path.exists(assess_performance_task.performance_file)) 22 | 23 | def test_performance_calculation(self): 24 | """Test that PerformanceAssessor task calculates performance correctly""" 25 | 26 | config = self.config 27 | assess_performance_task = tasks.PerformanceAssessor(config) 28 | assess_performance_task.run() 29 | test_dict = {'files_count_to_date': '1', 'valid_to_date': '100', 30 | 'score_to_date': '100', 'score': '100', 31 | 'month_of_creation': '2015-01-01', 'publisher_id': 'xx_dept1', 32 | 'valid': '100', 'files_count': '1'} 33 | with compat.UnicodeDictReader(assess_performance_task.performance_file) as pf: 34 | self.assertGreater(self.find_in_sequence(pf, test_dict), -1) 35 | 36 | def find_in_sequence(self, sequence, target): 37 | """Find `target` in `sequence`""" 38 | 39 | found = False 40 | for position, value in enumerate(sequence): 41 | if value == target: 42 | found = True 43 | break 44 | if not found: 45 | return -1 46 | return position 47 | -------------------------------------------------------------------------------- /tests/tasks/test_extract_relevance_period.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import unittest 8 | import datetime 9 | from data_quality import exceptions 10 | from data_quality.tasks.extract_relevance_period import RelevancePeriodExtractor 11 | from .test_task import TestTask 12 | 13 | class 
TestRelevancePeriodExtractor(TestTask): 14 | """Test the RelevancePeriodExtractor task""" 15 | 16 | def test_extract_dates(self): 17 | """Test the date extraction""" 18 | 19 | self.maxDiff = None 20 | examples = ['Transparency Data 1 to 30 April 2014', 21 | 'July 2011 return with descriptions', 22 | 'DH-May-2010-amnd4', 23 | 'April 2010 to December 2013', 24 | '2010 October Return', 25 | 'MOD\'s spending over £25,000 for August2014', 26 | 'jncc-spend-over-25k-2012-01', 27 | '12_03_15_data', 28 | 'Over_%C2%A325K_april_2014', 29 | 'Transparency_Sept2014_Final.csv', 30 | 'August - September 2015', 31 | '20-12-2015/21-01-2016', 32 | '17/07/2014 - 17/08/2014'] 33 | expected = [[datetime.datetime(2014,4,1), datetime.datetime(2014,4,30)], 34 | [datetime.datetime(2011,7,31)], 35 | [datetime.datetime(2010,5,31)], 36 | [datetime.datetime(2010,4,30), datetime.datetime(2013,12,31)], 37 | [datetime.datetime(2010,10,31)], 38 | [datetime.datetime(2014,8,31)], 39 | [datetime.datetime(2012,1,31)], 40 | [datetime.datetime(2015,3,12)], 41 | [datetime.datetime(2014,4,30)], 42 | [datetime.datetime(2014,9,30)], 43 | [datetime.datetime(2015,8,31), datetime.datetime(2015,9,30)], 44 | [datetime.datetime(2015,12,20), datetime.datetime(2016,1,21)], 45 | [datetime.datetime(2014,7,17), datetime.datetime(2014,8,17)]] 46 | 47 | self.config['timeliness']['timeliness_strategy'] = ['title', 'data'] 48 | results = [] 49 | extractor = RelevancePeriodExtractor(self.config) 50 | for line in examples: 51 | dates = extractor.extract_dates(line) 52 | results.append(dates) 53 | for index, result in enumerate(results): 54 | results[index] = sorted([extracted_date['date_obj'] 55 | for extracted_date in result]) 56 | 57 | self.assertSequenceEqual(results, expected) 58 | 59 | def test_resolve_period(self): 60 | """Test that a period is extracted and formated properly""" 61 | 62 | sources = [{ 63 | 'title': 'MOD spending over £500 on a GPC and spending over £25,000, April 2010 to December 2013/December 2012 MOD GPC spend', 64 | 'data': 'https://www.gov.uk/government/uploads/GPC_transparency_data_travel_stationery_contracts_dec2012.csv' 65 | }, 66 | { 67 | 'title': 'Spend over £25,000 in Natural England/July 2011 return', 68 | 'data': 'http://data.defra.gov.uk/ops/procurement/1107/ne-over-25k-1107.csv' 69 | }, 70 | { 71 | 'title': 'Spending over £25,000, April 2010 to December 2013/1 to 29 February 2012 GPC spend', 72 | 'data': 'https://www.gov.uk/government/uploads/attachment_data/file/28883/GPCTRANSPARENCYDATA1FEBRUARYTO29FEBRUARY2012includingdescriptions.csv' 73 | }] 74 | 75 | expected = [(datetime.datetime(2012,12,1), datetime.datetime(2012,12,31)), 76 | (datetime.datetime(2011,7,1), datetime.datetime(2011,7,31)), 77 | # This will not be found because the title is uncertain and the file name doesn't have delimitators 78 | None] 79 | 80 | self.config['timeliness']['timeliness_strategy'] = ['title', 'data'] 81 | results = [] 82 | extractor = RelevancePeriodExtractor(self.config) 83 | for source in sources: 84 | results.append(extractor.identify_period(source)) 85 | 86 | self.assertSequenceEqual(results, expected) 87 | 88 | def test_run_raises_if_field_not_provided(self): 89 | """Test that RelevancePeriodExtractor raises if the field in timeliness_strategy 90 | doesn't exist in source_file 91 | """ 92 | 93 | self.config['assess_timeliness'] = True 94 | self.config['timeliness']['timeliness_strategy'] = ['period_id'] 95 | extractor = RelevancePeriodExtractor(self.config) 96 | self.assertRaisesRegexp(ValueError, 'timeliness_strategy', 
extractor.run)
97 | 
98 | def test_run_raises_if_insufficient_period(self):
99 | """Tests that RelevancePeriodExtractor raises if sources without `period_id`
100 | make up over 10% of total sources
101 | """
102 | 
103 | self.config['assess_timeliness'] = True
104 | self.config['timeliness']['timeliness_strategy'] = ['title', 'data']
105 | extractor = RelevancePeriodExtractor(self.config)
106 | self.assertRaises(exceptions.UnableToAssessTimeliness, extractor.run)
107 | 
--------------------------------------------------------------------------------
/tests/tasks/test_generate.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | import unittest
8 | import os
9 | import io
10 | import json
11 | from data_quality import tasks, utilities, compat, generators
12 | from tests import mock_generator
13 | from .test_task import TestTask
14 | 
15 | class TestGeneratorManagerTask(TestTask):
16 | """Test the GeneratorManager task"""
17 | 
18 | def test_generate_built_in_generator(self):
19 | """Test that GeneratorManager task loads a built-in generator"""
20 | 
21 | generator = tasks.GeneratorManager(self.config)
22 | generator_class = generator.run('ckan', 'endpoint', '',
23 | file_types=['csv','excel'], simulate=True)
24 | 
25 | self.assertIsInstance(generator_class, generators.CkanGenerator)
26 | 
27 | def test_generate_custom_generator(self):
28 | """Test that GeneratorManager task loads a custom generator"""
29 | 
30 | generator = tasks.GeneratorManager(self.config)
31 | generator_path = 'tests.mock_generator.MockGenerator'
32 | generator_class = generator.run('mock', 'endpoint', generator_path,
33 | None, simulate=True)
34 | 
35 | self.assertIsInstance(generator_class, mock_generator.MockGenerator)
36 | 
37 | def test_generate_update_datapackage_sources(self):
38 | """Test that GeneratorManager task updates datapackage sources"""
39 | 
40 | def empty_datapackage_sources(datapkg_path, datapkg):
41 | with io.open(datapkg_path, mode='w+', encoding='utf-8') as datapkg_file:
42 | datapkg.descriptor['sources'] = []
43 | updated_json = json.dumps(datapkg.to_dict(), indent=4, sort_keys=True)
44 | datapkg_file.write(compat.str(updated_json))
45 | 
46 | generator = tasks.GeneratorManager(self.config)
47 | datapkg_path = os.path.join(generator.datapackage.base_path,
48 | 'datapackage.json')
49 | empty_datapackage_sources(datapkg_path, generator.datapackage)
50 | generator.update_datapackage_sources()
51 | second_generator = tasks.GeneratorManager(self.config)
52 | 
53 | self.assertEqual(generator.datapackage.descriptor['sources'],
54 | second_generator.datapackage.descriptor['sources'])
55 | self.assertGreater(len(generator.datapackage.descriptor['sources']), 0)
56 | empty_datapackage_sources(datapkg_path, generator.datapackage)
--------------------------------------------------------------------------------
/tests/tasks/test_initialize_datapackage.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | import unittest
8 | import os
9 | from data_quality import tasks, utilities, compat
10 | 
11 | 
12 | class TestDataPackageInitializer(unittest.TestCase):
13 | """Test the DataPackageInitializer task"""
14 | 
15 | def setUp(self):
16 | self.workspace_path = './tests/tmp_datapackage/'
17 | utilities.resolve_dir(self.workspace_path)
18 | 
19 | def tearDown(self):
20 | utilities.set_up_cache_dir(self.workspace_path)
21 | os.rmdir(self.workspace_path)
22 | 
23 | def test_config_initialized(self):
24 | """Test that DataPackageInitializer generates a config file if there isn't one"""
25 | 
26 | initializer = tasks.DataPackageInitializer(self.workspace_path)
27 | initializer.initialize_config()
28 | self.assertTrue(os.path.exists(os.path.join(self.workspace_path, 'dq_config.json')))
29 | 
30 | def test_run(self):
31 | """Test that DataPackageInitializer generates a 'datapackage.json' file"""
32 | 
33 | initializer = tasks.DataPackageInitializer(self.workspace_path)
34 | initializer.run()
35 | self.assertTrue(os.path.exists(os.path.join(self.workspace_path, 'datapackage.json')))
36 | 
37 | 
--------------------------------------------------------------------------------
/tests/tasks/test_task.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | import unittest
8 | import os
9 | from data_quality import utilities
10 | 
11 | class TestTask(unittest.TestCase):
12 | """Base class for task tests"""
13 | 
14 | def setUp(self):
15 | """Load the fixture config"""
16 | 
17 | config_filepath = os.path.join('tests', 'fixtures', 'dq.json')
18 | config = utilities.load_json_config(config_filepath)
19 | self.config = config
20 | 
--------------------------------------------------------------------------------
/tests/tasks/tests_check_datapackage.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | import unittest
8 | import os
9 | import datapackage
10 | from data_quality import tasks, utilities, compat
11 | from .test_task import TestTask
12 | 
13 | class TestDataPackageChecker(TestTask):
14 | """Test the DataPackageChecker task"""
15 | 
16 | def test_lacking_required_field(self):
17 | """Test that DataPackageChecker raises if required field is missing"""
18 | 
19 | filename = 'datapackage_missing_required.json'
20 | self.config['datapackage_file'] = os.path.join('tests', 'fixtures', filename)
21 | checker = tasks.check_datapackage.DataPackageChecker(self.config)
22 | default_datapkg = utilities.get_default_datapackage()
23 | self.assertRaisesRegexp(ValueError, 'miss', checker.check_resource_schema,
24 | default_datapkg.resources[0], checker.datapackage.resources[0])
25 | 
26 | def test_run(self):
27 | """Test that DataPackageChecker raises if required resource is missing"""
28 | 
29 | filename = 'datapackage_missing_required.json'
30 | self.config['datapackage_file'] = os.path.join('tests', 'fixtures', filename)
31 | checker = tasks.check_datapackage.DataPackageChecker(self.config)
32 | self.assertRaisesRegexp(ValueError, 'found', checker.run)
33 | 
34 | def test_database_content(self):
35 | """Test that DataPackageChecker raises if a required file from the database
36 | doesn't respect the schema described in datapackage
37 | """
38 | 
39 | checker = tasks.check_datapackage.DataPackageChecker(self.config)
40 | 
self.assertRaisesRegexp(ValueError, 'schema', checker.check_database_content) -------------------------------------------------------------------------------- /tests/test_interface.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | import unittest 9 | import subprocess 10 | import data_quality 11 | 12 | class TestDataQualityCLI(unittest.TestCase): 13 | 14 | def test_cli_run(self): 15 | config_path = os.path.join('tests', 'fixtures', 'dq.json') 16 | c = ['python', '-m', 'data_quality.main', 'run', config_path] 17 | subprocess.check_output(c) 18 | -------------------------------------------------------------------------------- /tests/test_utilities.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import unittest 8 | import os 9 | from data_quality import utilities 10 | import datapackage 11 | 12 | class TestUtilities(unittest.TestCase): 13 | 14 | def test_that_config_is_correctly_loaded(self): 15 | config_filepath = os.path.join('tests', 'fixtures', 'dq.json') 16 | config = utilities.load_json_config(config_filepath) 17 | self.assertTrue(os.path.isabs(config['data_dir'])) 18 | 19 | def test_default_datapackage_loaded(self): 20 | datapackage = utilities.get_default_datapackage() 21 | self.assertGreater(len(datapackage.resources), 0) 22 | 23 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | package=data_quality 3 | skip_missing_interpreters=true 4 | envlist= 5 | py27 6 | py33 7 | py34 8 | py35 9 | 10 | [testenv] 11 | deps= 12 | mock 13 | pytest 14 | pytest-cov 15 | coverage 16 | datapackage 17 | passenv= 18 | CI 19 | TRAVIS 20 | TRAVIS_JOB_ID 21 | TRAVIS_BRANCH 22 | commands= 23 | py.test \ 24 | --cov {[tox]package} \ 25 | --cov-config .coveragerc \ 26 | --cov-report term-missing \ 27 | {posargs:tests} 28 | 29 | [pytest] 30 | # pytest configuration here 31 | 32 | [report] 33 | # coverage configuration here 34 | --------------------------------------------------------------------------------