├── .dockerignore ├── .github ├── ISSUE_TEMPLATE.md ├── PULL_REQUEST_TEMPLATE.md └── workflows │ └── main.yml ├── .gitignore ├── CONTRIBUTING.md ├── Dockerfile ├── Dockerfile.slim ├── LICENSE.md ├── MANIFEST.in ├── Makefile ├── README.md ├── TUTORIAL.ipynb ├── datapackage_pipelines ├── VERSION ├── __init__.py ├── app.py ├── celery_tasks │ ├── __init__.py │ ├── celery_app.py │ ├── celery_common.py │ ├── celery_tasks.py │ └── dependency_manager.py ├── cli.py ├── generators │ ├── __init__.py │ ├── generator_base.py │ ├── schedules.py │ └── utilities.py ├── lib │ ├── __init__.py │ ├── add_computed_field.py │ ├── add_metadata.py │ ├── add_resource.py │ ├── cache_loader.py │ ├── concatenate.py │ ├── deduplicate.py │ ├── delete_fields.py │ ├── dump │ │ ├── __init__.py │ │ ├── dumper_base.py │ │ ├── file_formats.py │ │ ├── to_path.py │ │ ├── to_sql.py │ │ └── to_zip.py │ ├── dump_to_path.py │ ├── dump_to_sql.py │ ├── dump_to_zip.py │ ├── duplicate.py │ ├── filter.py │ ├── find_replace.py │ ├── flow.py │ ├── internal │ │ ├── __init__.py │ │ └── sink.py │ ├── join.py │ ├── load.py │ ├── load_metadata.py │ ├── load_resource.py │ ├── printer.py │ ├── set_types.py │ ├── sort.py │ ├── stream_remote_resources.py │ ├── unpivot.py │ ├── update_package.py │ └── update_resource.py ├── manager │ ├── __init__.py │ ├── logging_config.py │ ├── runner.py │ ├── runners │ │ ├── __init__.py │ │ ├── base_runner.py │ │ ├── local_python.py │ │ └── runner_config.py │ └── tasks.py ├── specs │ ├── __init__.py │ ├── errors.py │ ├── hashers │ │ ├── __init__.py │ │ ├── dependency_resolver.py │ │ └── hash_calculator.py │ ├── parsers │ │ ├── __init__.py │ │ ├── base_parser.py │ │ ├── basic_pipeline.py │ │ └── source_spec_pipeline.py │ ├── resolver.py │ ├── schemas │ │ ├── __init__.py │ │ ├── pipeline-spec.schema.json │ │ └── validator.py │ └── specs.py ├── status │ ├── __init__.py │ ├── backend_filesystem.py │ ├── backend_redis.py │ ├── backend_sqlite.py │ ├── hook_sender.py │ ├── pipeline_execution.py │ ├── pipeline_status.py │ └── status_manager.py ├── utilities │ ├── __init__.py │ ├── dirtools.py │ ├── execution_id.py │ ├── extended_json.py │ ├── flow_utils.py │ ├── lazy_dict.py │ ├── lib_test_helpers.py │ ├── resources.py │ ├── stat_utils.py │ └── tabulator_txt_parser.py ├── web │ ├── __init__.py │ ├── server.py │ └── templates │ │ └── dashboard.html └── wrapper │ ├── __init__.py │ ├── input_processor.py │ └── wrapper.py ├── docker ├── github_config.py └── run.sh ├── pylama.ini ├── samples ├── add_constant.py ├── co2-information-cdiac.zip └── pipeline-spec.yaml ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── cli │ ├── custom_formatters │ │ ├── __init__.py │ │ └── xlsx_format.py │ ├── expected_flow_data.csv │ ├── pipeline-spec.yaml │ ├── setup.py │ ├── test_cli_exit_codes.sh │ ├── test_cli_logs.sh │ ├── test_custom_formatters.sh │ ├── test_exclude_dirnames.sh │ ├── test_flow.py │ └── test_flow.sh ├── data │ ├── datapackage.json │ ├── datapackage2.json │ ├── datapackage3.json │ ├── sample.csv │ ├── sample.dups.csv │ ├── sample.txt │ ├── sample.zip │ └── sample2.csv ├── docker │ ├── .gitignore │ ├── lib │ │ ├── dpp_docker_test.py │ │ └── setup.py │ ├── pipeline-spec.yaml │ ├── test.py │ └── test.sh ├── env │ ├── common │ │ └── pipeline-common.py │ ├── dummy │ │ ├── big-outputs.py │ │ ├── pipeline-spec.yaml │ │ ├── pipeline-test-supplier-titleize.py │ │ └── types.csv │ └── extract-year.py ├── serve │ ├── html_output.py │ └── pipeline-spec.yaml ├── sitecustomize.py ├── stdlib │ ├── README.md │ ├── __init__.py │ ├── 
fixtures │ │ ├── add_resource_existent_env │ │ ├── dump_to_sql_update_mode__insert │ │ ├── dump_to_sql_update_mode__update │ │ ├── dump_to_sql_with_updated_data │ │ ├── load_existent_env │ │ ├── obj_fix_dump_to_sql │ │ ├── reverse_sort │ │ ├── simple_add_computed_field │ │ ├── simple_add_resource │ │ ├── simple_concat │ │ ├── simple_deduplicate │ │ ├── simple_delete_fields │ │ ├── simple_dump_dot_to_zip │ │ ├── simple_dump_dot_to_zip_with_hash │ │ ├── simple_dump_dot_to_zip_with_hash_and_pretty_descriptor │ │ ├── simple_dump_to_sql │ │ ├── simple_dump_to_zip │ │ ├── simple_dump_to_zip_with_hash │ │ ├── simple_dump_to_zip_with_hash_and_pretty_descriptor │ │ ├── simple_filter │ │ ├── simple_find_replace │ │ ├── simple_join │ │ ├── simple_load │ │ ├── simple_load_metadata │ │ ├── simple_load_resource │ │ ├── simple_load_resource_dups │ │ ├── simple_load_resource_index │ │ ├── simple_load_resource_limit_rows │ │ ├── simple_load_resource_list │ │ ├── simple_load_resource_multi │ │ ├── simple_load_resource_required │ │ ├── simple_load_resource_resources │ │ ├── simple_load_resource_resources_required │ │ ├── simple_resource_duplication │ │ ├── simple_set_types │ │ ├── simple_sort │ │ ├── simple_stream_remote_resources │ │ ├── simple_stream_remote_resources_limit_rows │ │ ├── simple_stream_remote_resources_zip │ │ ├── simple_unpivot │ │ ├── simple_update_package │ │ ├── simple_update_resource │ │ ├── sort_with_duplicate_keys │ │ └── stream_remote_resources_txt_format │ └── test_stdlib.py ├── test_main.py └── wrapper │ └── test_wrapper.py └── tox.ini /.dockerignore: -------------------------------------------------------------------------------- 1 | .tox/ 2 | .git/ 3 | .cache/ 4 | .dpp.db 5 | .dppdb 6 | .github/ 7 | .idea/ 8 | build/ 9 | datapackage_pipelines.egg-info/ 10 | dist/ 11 | samples/ 12 | tests/ 13 | 14 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | In order to submit an issue, please ensure you can check the following. Thanks! 2 | 3 | * [ ] Declare which version of Python you are using (`python --version`) 4 | * [ ] Declare which operating system you are using 5 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | This pull request fixes # . 
2 | 3 | * [ ] I've added tests to cover the proposed changes 4 | 5 | Changes proposed in this pull request: 6 | 7 | - 8 | - 9 | - 10 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | push: 4 | branches: [ master ] 5 | tags: [ '*' ] 6 | workflow_dispatch: 7 | jobs: 8 | build-server: 9 | runs-on: ubuntu-22.04 10 | steps: 11 | - uses: actions/checkout@v2 12 | - uses: actions/setup-python@v2 13 | with: 14 | python-version: '3.9' 15 | # - name: install 16 | # run: | 17 | # sudo apt-get install libleveldb-dev libleveldb1d 18 | # make install-speedup 19 | - name: build 20 | run: | 21 | make build 22 | tests/docker/test.sh 23 | - name: version 24 | if: github.event_name == 'push' && contains(github.ref, 'refs/tags/') 25 | env: 26 | DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }} 27 | DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} 28 | TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }} 29 | TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }} 30 | run: | 31 | make deploy-pip 32 | make deploy-tags 33 | - name: master branch 34 | env: 35 | DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }} 36 | DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} 37 | if: github.event_name == 'push' && contains(github.ref, '/heads/master') 38 | run: | 39 | make deploy-latest 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .coverage.* 40 | .cache 41 | nosetests.xml 42 | coverage.xml 43 | *,cover 44 | .hypothesis/ 45 | .pytest_cache 46 | 47 | # Translations 48 | *.mo 49 | *.pot 50 | 51 | # Django stuff: 52 | *.log 53 | local_settings.py 54 | 55 | # Flask instance folder 56 | instance/ 57 | 58 | # Scrapy stuff: 59 | .scrapy 60 | 61 | # Sphinx documentation 62 | docs/_build/ 63 | 64 | # PyBuilder 65 | target/ 66 | 67 | # IPython Notebook 68 | .ipynb_checkpoints 69 | 70 | # pyenv 71 | .python-version 72 | 73 | # dotenv 74 | .env 75 | 76 | # Spyder project settings 77 | .spyderproject 78 | 79 | # Extras 80 | .projectile 81 | .idea/ 82 | datapackage-pipelines.iml 83 | celerybeat-schedule 84 | 85 | # Datapackage Pipeline DB 86 | .dpp.db 87 | 88 | # Resources created by our tests 89 | my-spiffy-resource.zip 90 | tests/env/dummy/dump.zip 91 | tests/env/dummy/hooks-outputs 92 | tests/env/dummy/nulls-test 93 | tests/env/dummy/type-tests-output 94 | tests/env/dummy/type-tests-output2 95 | tests/cli/.code 96 | .dpp 97 | .coverage.* 98 | .code/ 99 | .vscode/ 100 | 101 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | The datapackage-pipelines project accepts contributions via GitHub pull requests. This document outlines the process to help get your contribution accepted. 4 | 5 | The project follows the [Open Knowledge International coding standards](https://github.com/okfn/coding-standards). 6 | 7 | We welcome new processors for the standard library; the following guidelines will improve the chances of your processor being accepted: 8 | 9 | * The processor has practical and common use-cases. 10 | * Minimal new dependencies - preferably, no new dependencies. 11 | 12 | ## Getting Started 13 | 14 | The recommended way to get started is to create and activate a project virtual environment. 15 | 16 | You should ensure you are using a supported Python version; check `.github/workflows/main.yml` to see which versions we use for CI. 17 | 18 | * [Pythonz](https://github.com/saghul/pythonz#installation) can be used to install a specific Python version. 19 | * [Virtualenvwrapper](http://virtualenvwrapper.readthedocs.io/en/latest/install.html#basic-installation) can help with setting up and managing virtualenvs. 20 | 21 | To install package and development dependencies into the active environment: 22 | 23 | ``` 24 | $ make install 25 | ``` 26 | 27 | ## Lint & Test 28 | 29 | Before pushing code, make sure lint and tests pass; otherwise the build will fail and your pull request won't be merged :( 30 | 31 | You can use the following snippet to ensure everything works: 32 | 33 | ``` 34 | make install && make lint && make test 35 | ``` 36 | 37 | 38 | ## Linting 39 | 40 | To lint the project codebase: 41 | 42 | ``` 43 | $ make lint 44 | ``` 45 | 46 | Under the hood `pylama`, configured in `pylama.ini`, is used. At this stage it's already 47 | installed into your environment and can be used separately with more fine-grained control 48 | as described in its documentation - https://github.com/klen/pylama.
49 | 50 | For example, to check only errors: 51 | 52 | ``` 53 | $ pylama 54 | ``` 55 | 56 | ## Testing 57 | 58 | To run tests with coverage: 59 | 60 | ``` 61 | $ make test 62 | ``` 63 | Under the hood, `tox` (powered by `py.test` and `coverage`, configured in `tox.ini`) is used. 64 | It's already installed into your environment and can be used separately with more fine-grained control, 65 | as described in the documentation - https://testrun.org/tox/latest/. 66 | 67 | For example, to run a subset of tests against a Python 3 environment with increased verbosity; 68 | all positional arguments and options after `--` will be passed to `py.test`: 69 | 70 | ``` 71 | tox -e py35 -- -v tests/ 72 | ``` 73 | 74 | ## Testing with other databases 75 | 76 | By default the tests run with an in-memory SQLite database, which doesn't require any setup. 77 | However, most projects will want to use a real DB, like PostgreSQL. 78 | 79 | To run the tests with a different DB, you need to supply the connection string via an environment variable. 80 | For example, to run with a local PostgreSQL database: 81 | 82 | `OVERRIDE_TEST_DB=postgresql://postgres:123456@localhost:5432/postgres py.test` 83 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-alpine 2 | 3 | RUN apk --update --no-cache --virtual=build-dependencies add \ 4 | build-base python3-dev \libxml2-dev libxslt-dev postgresql-dev leveldb leveldb-dev && \ 5 | apk --update --no-cache add libstdc++ redis libpq && \ 6 | mkdir -p /run/redis && mkdir -p /var/run/dpp && \ 7 | pip install psycopg2 datapackage-pipelines-github datapackage-pipelines-aws datapackage-pipelines-sourcespec-registry 8 | 9 | ADD . /dpp/ 10 | 11 | RUN pip install -U /dpp/[speedup] && \ 12 | mkdir -p /var/redis && chmod 775 /var/redis && chown redis:redis /var/redis 13 | 14 | ENV DPP_NUM_WORKERS=4 15 | ENV DPP_REDIS_HOST=127.0.0.1 16 | ENV DPP_CELERY_BROKER=redis://localhost:6379/6 17 | 18 | EXPOSE 5000 19 | WORKDIR /pipelines/ 20 | ENTRYPOINT ["/dpp/docker/run.sh"] 21 | 22 | 23 | -------------------------------------------------------------------------------- /Dockerfile.slim: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim 2 | 3 | RUN apt-get update && apt-get install --no-install-recommends -y redis libleveldb1d libleveldb-dev build-essential libpq-dev && \ 4 | update-ca-certificates && mkdir -p /run/redis && mkdir -p /var/run/dpp && \ 5 | apt-get clean && rm -rf /var/lib/apt/lists/* 6 | 7 | RUN pip install psycopg2 datapackage-pipelines-github datapackage-pipelines-sourcespec-registry datapackage-pipelines-aws 8 | 9 | ADD .
/dpp/ 10 | 11 | RUN pip install -U /dpp/[speedup] && \ 12 | mkdir -p /var/redis && chmod 775 /var/redis && chown redis.redis /var/redis && \ 13 | mkdir -p /var/log/redis && cd /etc && ln -s redis/redis.conf 14 | 15 | ENV DPP_NUM_WORKERS=4 16 | ENV DPP_REDIS_HOST=127.0.0.1 17 | ENV DPP_CELERY_BROKER=redis://localhost:6379/6 18 | 19 | EXPOSE 5000 20 | WORKDIR /pipelines/ 21 | ENTRYPOINT ["/dpp/docker/run.sh"] 22 | 23 | 24 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Open Knowledge 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | global-include *.json 2 | global-include *.yml 3 | global-include *.txt 4 | global-include *.html 5 | global-include VERSION 6 | include LICENSE.md 7 | include Makefile 8 | include pylintrc 9 | include README.md 10 | include tox.ini 11 | prune .tox 12 | 13 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all install list lint release test version build 2 | 3 | 4 | PACKAGE := $(shell grep '^PACKAGE =' setup.py | cut -d "'" -f2) 5 | VERSION := $(shell head -n 1 $(PACKAGE)/VERSION) 6 | 7 | 8 | all: list 9 | 10 | install: 11 | pip install --upgrade -e .[develop] 12 | 13 | install-speedup: 14 | pip install --upgrade -e .[develop,speedup] 15 | 16 | list: 17 | @grep '^\.PHONY' Makefile | cut -d' ' -f2- | tr ' ' '\n' 18 | 19 | lint: 20 | pylama $(PACKAGE) 21 | 22 | release: 23 | bash -c '[[ -z `git status -s` ]]' 24 | git tag -a -m release $(VERSION) 25 | git push --tags 26 | 27 | test: 28 | tox &&\ 29 | tests/cli/test_cli_exit_codes.sh &&\ 30 | tests/cli/test_cli_logs.sh &&\ 31 | tests/cli/test_custom_formatters.sh &&\ 32 | tests/cli/test_exclude_dirnames.sh &&\ 33 | tests/cli/test_flow.sh 34 | 35 | version: 36 | @echo $(VERSION) 37 | 38 | build: 39 | docker login -u "${DOCKER_USERNAME}" -p "${DOCKER_PASSWORD}" 40 | docker pull frictionlessdata/datapackage-pipelines:latest &&\ 41 | docker build -t frictionlessdata/datapackage-pipelines:latest --cache-from frictionlessdata/datapackage-pipelines . 
&&\ 42 | docker build -t frictionlessdata/datapackage-pipelines:latest-alpine --cache-from frictionlessdata/datapackage-pipelines . &&\ 43 | docker build -t frictionlessdata/datapackage-pipelines:${VERSION} --cache-from frictionlessdata/datapackage-pipelines . &&\ 44 | docker build -t frictionlessdata/datapackage-pipelines:${VERSION}-alpine --cache-from frictionlessdata/datapackage-pipelines . &&\ 45 | docker pull frictionlessdata/datapackage-pipelines:latest-slim &&\ 46 | docker build -t frictionlessdata/datapackage-pipelines:latest-slim -f Dockerfile.slim --cache-from frictionlessdata/datapackage-pipelines:latest-slim . &&\ 47 | docker build -t frictionlessdata/datapackage-pipelines:${VERSION}-slim -f Dockerfile.slim --cache-from frictionlessdata/datapackage-pipelines:latest-slim . 48 | 49 | 50 | deploy-latest: 51 | docker login -u "${DOCKER_USERNAME}" -p "${DOCKER_PASSWORD}" &&\ 52 | docker push frictionlessdata/datapackage-pipelines:latest &&\ 53 | docker push frictionlessdata/datapackage-pipelines:latest-alpine &&\ 54 | docker push frictionlessdata/datapackage-pipelines:latest-slim 55 | 56 | deploy-tags: 57 | docker login -u "${DOCKER_USERNAME}" -p "${DOCKER_PASSWORD}" &&\ 58 | docker push frictionlessdata/datapackage-pipelines:${VERSION} &&\ 59 | docker push frictionlessdata/datapackage-pipelines:${VERSION}-alpine &&\ 60 | docker push frictionlessdata/datapackage-pipelines:${VERSION}-slim 61 | 62 | deploy-pip: 63 | rm -rf dist/ || true 64 | pip install wheel twine 65 | python setup.py sdist bdist_wheel 66 | python -m twine upload dist/* -------------------------------------------------------------------------------- /datapackage_pipelines/VERSION: -------------------------------------------------------------------------------- 1 | 2.2.11 -------------------------------------------------------------------------------- /datapackage_pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import io 3 | import os 4 | 5 | from .specs import pipelines 6 | from .manager import execute_pipeline 7 | 8 | VERSION_FILE = os.path.join(os.path.dirname(__file__), 'VERSION') 9 | 10 | __version__ = io.open(VERSION_FILE, encoding='utf-8').readline().strip() 11 | -------------------------------------------------------------------------------- /datapackage_pipelines/app.py: -------------------------------------------------------------------------------- 1 | # pylama:ignore=W0611 2 | from .celery_tasks.celery_app import celery_app 3 | from .manager.logging_config import logging 4 | -------------------------------------------------------------------------------- /datapackage_pipelines/celery_tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/datapackage-pipelines/06c8ccbd44be420233b73563cfb5bd3eb37de7cc/datapackage_pipelines/celery_tasks/__init__.py -------------------------------------------------------------------------------- /datapackage_pipelines/celery_tasks/celery_app.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from celery.schedules import crontab 4 | 5 | from .celery_common import get_celery_app, MANAGEMENT_TASK_NAME, SCHEDULED_TASK_NAME 6 | from .celery_tasks import build_dependents 7 | from datapackage_pipelines.specs import pipelines 8 | from datapackage_pipelines.status import status_mgr 9 | 10 | import logging 11 | 12 | kw = {} 13 | if 
os.environ.get('SCHEDULER'): 14 | CELERY_SCHEDULE = { 15 | '/management': { 16 | 'task': MANAGEMENT_TASK_NAME, 17 | 'schedule': crontab(), 18 | 'args': ('update', None, None), 19 | 'options': {'queue': 'datapackage-pipelines-management'} 20 | } 21 | } 22 | 23 | for spec in pipelines(): 24 | if spec.schedule is not None: 25 | entry = { 26 | 'task': SCHEDULED_TASK_NAME, 27 | 'schedule': crontab(*spec.schedule), 28 | 'args': (spec.pipeline_id,), 29 | 'options': {'queue': 'datapackage-pipelines-management'} 30 | } 31 | CELERY_SCHEDULE[spec.pipeline_id] = entry 32 | logging.info('SCHEDULING task %r: %r', spec.pipeline_id, spec.schedule) 33 | 34 | ps = status_mgr().get(spec.pipeline_id) 35 | ex = ps.get_last_execution() 36 | if ex is not None and not ex.finish_time: 37 | ex.invalidate() 38 | ex.finish_execution(False, {}, ['Cancelled']) 39 | 40 | kw = dict(CELERYBEAT_SCHEDULE=CELERY_SCHEDULE) 41 | 42 | logging.error('CELERY INITIALIZING') 43 | celery_app = get_celery_app(**kw) 44 | 45 | if os.environ.get('SCHEDULER'): 46 | build_dependents() 47 | celery_app.send_task(MANAGEMENT_TASK_NAME, ('init', None, None)) 48 | -------------------------------------------------------------------------------- /datapackage_pipelines/celery_tasks/celery_common.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from celery import Celery 4 | 5 | 6 | REGULAR_TASK_NAME = 'datapackage_pipelines.celery_tasks.celery_tasks' + \ 7 | '.execute_pipeline_task' 8 | SCHEDULED_TASK_NAME = 'datapackage_pipelines.celery_tasks.celery_tasks' + \ 9 | '.execute_scheduled_pipeline' 10 | MANAGEMENT_TASK_NAME = 'datapackage_pipelines.celery_tasks.celery_tasks' + \ 11 | '.update_pipelines' 12 | 13 | 14 | def get_celery_app(**kwargs): 15 | celery_app = Celery('dpp') 16 | 17 | broker = os.environ.get('DPP_CELERY_BROKER', 'redis://localhost:6379/6') 18 | 19 | conf = dict( 20 | CELERY_TIMEZONE='UTC', 21 | CELERY_REDIRECT_STDOUTS=False, 22 | BROKER_URL=broker, 23 | CELERY_RESULT_BACKEND=broker, 24 | CELERYD_LOG_LEVEL="DEBUG", 25 | CELERY_TASK_SERIALIZER='json', 26 | CELERY_RESULT_SERIALIZER='json', 27 | CELERY_ACCEPT_CONTENT=['json'], 28 | CELERYD_LOG_FORMAT='[%(asctime)s: %(levelname)s/%(processName)s(%(process)d)] %(message)s', 29 | CELERY_ROUTES={ 30 | REGULAR_TASK_NAME: {'queue': 'datapackage-pipelines'}, 31 | SCHEDULED_TASK_NAME: {'queue': 'datapackage-pipelines-management'}, 32 | MANAGEMENT_TASK_NAME: {'queue': 'datapackage-pipelines-management'}, 33 | } 34 | ) 35 | conf.update(kwargs) 36 | 37 | celery_app.conf.update(**conf) 38 | 39 | return celery_app 40 | -------------------------------------------------------------------------------- /datapackage_pipelines/celery_tasks/dependency_manager.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import redis 5 | 6 | 7 | class DependencyManager(object): 8 | 9 | def __init__(self, host=os.environ.get('DPP_REDIS_HOST'), port=6379): 10 | self.redis = None 11 | if host is not None and len(host) > 0: 12 | conn = redis.StrictRedis(host=host, port=port, db=5) 13 | try: 14 | conn.ping() 15 | self.redis = conn 16 | except redis.exceptions.ConnectionError: 17 | logging.warning('Failed to connect to Redis, host:%s, port:%s', 18 | host, port) 19 | else: 20 | logging.info('Skipping redis connection, host:%s, port:%s', 21 | host, port) 22 | 23 | @staticmethod 24 | def dependents_key(x): 25 | return 'Dependents:%s' % x 26 | 27 | @staticmethod 28 | def dependencies_key(x): 29 | 
return 'Dependencies:%s' % x 30 | 31 | @staticmethod 32 | def encode(x): 33 | if isinstance(x, str): 34 | return x.encode('utf8') 35 | if isinstance(x, list): 36 | return [y.encode('utf8') for y in x] 37 | 38 | @staticmethod 39 | def decode(x): 40 | if isinstance(x, bytes): 41 | return x.decode('utf8') 42 | if isinstance(x, (list, set)): 43 | return [y.decode('utf8') for y in x] 44 | assert False, "Unknown type for x: %r" % x 45 | 46 | def is_init(self): 47 | return self.redis is not None 48 | 49 | def update(self, spec): 50 | if self.is_init(): 51 | for dep in spec.dependencies: 52 | self.redis.sadd(self.dependents_key(dep), self.encode(spec.pipeline_id)) 53 | self.redis.delete(self.dependencies_key(spec.pipeline_id)) 54 | for dep in self.encode(spec.dependencies): 55 | self.redis.sadd(self.dependencies_key(spec.pipeline_id), dep) 56 | 57 | def get_dependencies(self, pipeline_id): 58 | if self.is_init(): 59 | members = self.redis.smembers(self.dependencies_key(pipeline_id)) 60 | if members is not None: 61 | return self.decode(members) 62 | return [] 63 | 64 | def get_dependents(self, pipeline_id): 65 | if self.is_init(): 66 | members = self.redis.smembers(self.dependents_key(pipeline_id)) 67 | if members is not None: 68 | return self.decode(members) 69 | return [] 70 | 71 | def remove(self, pipeline_id): 72 | if self.is_init(): 73 | dependencies = self.get_dependencies(pipeline_id) 74 | dependents = self.get_dependents(pipeline_id) 75 | 76 | for p in dependencies: 77 | self.redis.srem(self.dependents_key(p), self.encode(pipeline_id)) 78 | for p in dependents: 79 | self.redis.srem(self.dependencies_key(p), self.encode(pipeline_id)) 80 | self.redis.delete(self.dependents_key(pipeline_id)) 81 | self.redis.delete(self.dependencies_key(pipeline_id)) 82 | -------------------------------------------------------------------------------- /datapackage_pipelines/generators/__init__.py: -------------------------------------------------------------------------------- 1 | from slugify import slugify 2 | 3 | from .schedules import * # noqa 4 | from .generator_base import GeneratorBase 5 | from .utilities import steps 6 | -------------------------------------------------------------------------------- /datapackage_pipelines/generators/generator_base.py: -------------------------------------------------------------------------------- 1 | import jsonschema 2 | 3 | 4 | class GeneratorBase(object): 5 | 6 | def __init__(self): 7 | self.schema = None 8 | 9 | def _get_schema(self): 10 | if self.schema is not None: 11 | return self.schema 12 | self.schema = self.get_schema() 13 | validator = jsonschema.validators.validator_for(self.schema) 14 | self.schema = validator(self.schema) 15 | return self.schema 16 | 17 | def internal_validate(self, source): 18 | schema = self._get_schema() 19 | try: 20 | schema.validate(source) 21 | except jsonschema.ValidationError: 22 | return False 23 | return True 24 | 25 | def internal_generate(self, source, base): 26 | if not self.internal_validate(source): 27 | return None 28 | return self.generate_pipeline(source, base) 29 | 30 | @classmethod 31 | def get_schema(cls): 32 | raise NotImplementedError() 33 | 34 | @classmethod 35 | def generate_pipeline(cls, source, base): 36 | raise NotImplementedError() 37 | -------------------------------------------------------------------------------- /datapackage_pipelines/generators/schedules.py: -------------------------------------------------------------------------------- 1 | SCHEDULE_NONE = None 2 | SCHEDULE_HOURLY = '0 * * * *' 3 | 
SCHEDULE_DAILY = '0 0 * * *' 4 | SCHEDULE_WEEKLY = '0 0 * * 0' 5 | SCHEDULE_MONTHLY = '0 0 1 * *' 6 | SCHEDULE_YEARLY = '0 0 1 1 *' 7 | -------------------------------------------------------------------------------- /datapackage_pipelines/generators/utilities.py: -------------------------------------------------------------------------------- 1 | def arg_to_step(arg): 2 | if isinstance(arg, str): 3 | return {'run': arg} 4 | else: 5 | return dict(zip(['run', 'parameters', 'cache'], arg)) 6 | 7 | 8 | def steps(*args): 9 | return [arg_to_step(arg) for arg in args] 10 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/datapackage-pipelines/06c8ccbd44be420233b73563cfb5bd3eb37de7cc/datapackage_pipelines/lib/__init__.py -------------------------------------------------------------------------------- /datapackage_pipelines/lib/add_computed_field.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow, add_computed_field 2 | from datapackage_pipelines.wrapper import ingest 3 | from datapackage_pipelines.utilities.flow_utils import spew_flow 4 | 5 | 6 | def flow(parameters): 7 | return Flow( 8 | add_computed_field( 9 | parameters.get('fields', []), 10 | resources=parameters.get('resources') 11 | ), 12 | ) 13 | 14 | 15 | if __name__ == '__main__': 16 | with ingest() as ctx: 17 | spew_flow(flow(ctx.parameters), ctx) 18 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/add_metadata.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | from datapackage_pipelines.wrapper import ingest 4 | from datapackage_pipelines.utilities.flow_utils import spew_flow 5 | 6 | from datapackage_pipelines.lib.update_package import flow 7 | 8 | 9 | if __name__ == '__main__': 10 | warnings.warn( 11 | 'add_metadata will be removed in the future, use "update_package" instead', 12 | DeprecationWarning 13 | ) 14 | with ingest() as ctx: 15 | spew_flow(flow(ctx.parameters), ctx) 16 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/add_resource.py: -------------------------------------------------------------------------------- 1 | from datapackage_pipelines.wrapper import ingest, spew 2 | import os 3 | 4 | from datapackage_pipelines.utilities.resources import PATH_PLACEHOLDER, PROP_STREAMED_FROM 5 | 6 | parameters, datapackage, res_iter = ingest() 7 | 8 | 9 | if datapackage is None: 10 | datapackage = {} 11 | 12 | datapackage.setdefault('resources', []) 13 | 14 | for param in ['url', 'name']: 15 | assert param in parameters, \ 16 | "You must define {} in your parameters".format(param) 17 | 18 | url = parameters.pop('url') 19 | if url.startswith('env://'): 20 | env_var = url[6:] 21 | env_url = os.environ.get(env_var) 22 | assert env_url is not None, \ 23 | "Missing Value - " \ 24 | "Please set your '%s' environment variable" % env_var 25 | 26 | url = env_url 27 | 28 | if 'path' not in parameters: 29 | parameters['path'] = PATH_PLACEHOLDER 30 | parameters[PROP_STREAMED_FROM] = url 31 | 32 | datapackage['resources'].append(parameters) 33 | 34 | spew(datapackage, res_iter) 35 | -------------------------------------------------------------------------------- 
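The processors above (`add_resource.py` in particular) show the low-level wrapper pattern used throughout the standard library: `ingest()` yields the step's parameters, the incoming datapackage descriptor and the resource iterator, and `spew()` writes the (possibly modified) descriptor and rows back out. A minimal custom processor along the same lines might look like the following sketch — the module itself and the `column-name`/`value` parameters are hypothetical, not part of the repository:

```python
# Hypothetical custom processor (not part of this repository): appends a
# constant column to every row, using the same ingest()/spew() wrapper as
# add_resource.py above.
from datapackage_pipelines.wrapper import ingest, spew

parameters, datapackage, res_iter = ingest()

column_name = parameters.get('column-name', 'constant')   # assumed parameter
value = parameters.get('value')                            # assumed parameter

# Declare the new field on every tabular resource in the descriptor.
for resource in datapackage.get('resources', []):
    if 'schema' in resource:
        resource['schema']['fields'].append({'name': column_name, 'type': 'string'})


def process_resource(rows):
    for row in rows:
        row[column_name] = value
        yield row


spew(datapackage, (process_resource(rows) for rows in res_iter))
```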
/datapackage_pipelines/lib/cache_loader.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import shutil 3 | import gzip 4 | 5 | from datapackage_pipelines.wrapper import ingest 6 | 7 | params, _, _ = ingest() 8 | 9 | load_from = params['load-from'] 10 | 11 | shutil.copyfileobj(gzip.open(load_from, "rt"), sys.stdout) 12 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/concatenate.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow, concatenate, update_resource 2 | from datapackage_pipelines.wrapper import ingest 3 | from datapackage_pipelines.utilities.resources import PROP_STREAMING 4 | from datapackage_pipelines.utilities.flow_utils import spew_flow 5 | 6 | 7 | def flow(parameters): 8 | return Flow( 9 | concatenate( 10 | parameters.get('fields', {}), 11 | parameters.get('target', {}), 12 | parameters.get('sources') 13 | ), 14 | update_resource( 15 | parameters.get('target', {}).get('name', 'concat'), 16 | **{ 17 | PROP_STREAMING: True 18 | } 19 | ) 20 | ) 21 | 22 | 23 | if __name__ == '__main__': 24 | with ingest() as ctx: 25 | spew_flow(flow(ctx.parameters), ctx) 26 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/deduplicate.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow, deduplicate 2 | from datapackage_pipelines.wrapper import ingest 3 | from datapackage_pipelines.utilities.flow_utils import spew_flow 4 | 5 | 6 | def flow(parameters): 7 | return Flow( 8 | deduplicate( 9 | resources=parameters.get('resources'), 10 | ) 11 | ) 12 | 13 | 14 | if __name__ == '__main__': 15 | with ingest() as ctx: 16 | spew_flow(flow(ctx.parameters), ctx) 17 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/delete_fields.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow, delete_fields 2 | from datapackage_pipelines.wrapper import ingest 3 | from datapackage_pipelines.utilities.flow_utils import spew_flow 4 | 5 | 6 | def flow(parameters): 7 | resources = parameters.get('resources') 8 | regex = parameters.get('regex', True) 9 | return Flow( 10 | delete_fields( 11 | parameters.get('fields', []), 12 | resources=resources, 13 | regex=regex, 14 | ) 15 | ) 16 | 17 | 18 | if __name__ == '__main__': 19 | with ingest() as ctx: 20 | spew_flow(flow(ctx.parameters), ctx) 21 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/dump/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/datapackage-pipelines/06c8ccbd44be420233b73563cfb5bd3eb37de7cc/datapackage_pipelines/lib/dump/__init__.py -------------------------------------------------------------------------------- /datapackage_pipelines/lib/dump/to_path.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import warnings 4 | 5 | from datapackage_pipelines.lib.dump.dumper_base import FileDumper 6 | 7 | 8 | class PathDumper(FileDumper): 9 | 10 | def initialize(self, params): 11 | super(PathDumper, self).initialize(params) 12 | self.out_path = params.get('out-path', '.') 13 | self.add_filehash_to_path = 
params.get('add-filehash-to-path', False) 14 | PathDumper.__makedirs(self.out_path) 15 | 16 | def write_file_to_output(self, filename, path): 17 | path = os.path.join(self.out_path, path) 18 | # Avoid rewriting existing files 19 | if self.add_filehash_to_path and os.path.exists(path): 20 | return 21 | path_part = os.path.dirname(path) 22 | PathDumper.__makedirs(path_part) 23 | shutil.copy(filename, path) 24 | os.chmod(path, 0o666) 25 | return path 26 | 27 | @staticmethod 28 | def __makedirs(path): 29 | os.makedirs(path, exist_ok=True) 30 | 31 | 32 | if __name__ == '__main__': 33 | warnings.warn( 34 | 'dump.to_path will be removed in the future, use "dump_to_path" instead', 35 | DeprecationWarning 36 | ) 37 | PathDumper()() 38 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/dump/to_sql.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | from datapackage_pipelines.wrapper import ingest 4 | from datapackage_pipelines.utilities.flow_utils import spew_flow 5 | 6 | from datapackage_pipelines.lib.dump_to_sql import flow 7 | 8 | 9 | if __name__ == '__main__': 10 | warnings.warn( 11 | 'dump.to_sql will be removed in the future, use "dump_to_sql" instead', 12 | DeprecationWarning 13 | ) 14 | with ingest() as ctx: 15 | spew_flow(flow(ctx.parameters), ctx) 16 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/dump/to_zip.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import zipfile 3 | 4 | from datapackage_pipelines.lib.dump.dumper_base import FileDumper 5 | 6 | 7 | class ZipDumper(FileDumper): 8 | 9 | def initialize(self, params): 10 | super(ZipDumper, self).initialize(params) 11 | out_filename = open(params['out-file'], 'wb') 12 | self.zip_file = zipfile.ZipFile(out_filename, 'w') 13 | 14 | def write_file_to_output(self, filename, path): 15 | self.zip_file.write(filename, arcname=path, 16 | compress_type=zipfile.ZIP_DEFLATED) 17 | 18 | def finalize(self): 19 | self.zip_file.close() 20 | super(ZipDumper, self).finalize() 21 | 22 | 23 | if __name__ == '__main__': 24 | warnings.warn( 25 | 'dump.to_zip will be removed in the future, use "dump_to_zip" instead', 26 | DeprecationWarning 27 | ) 28 | ZipDumper()() 29 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/dump_to_path.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from dataflows import Flow, dump_to_path 4 | from datapackage_pipelines.wrapper import ingest 5 | from datapackage_pipelines.utilities.flow_utils import spew_flow 6 | 7 | from datapackage_pipelines.utilities.stat_utils import STATS_DPP_KEY, STATS_OUT_DP_URL_KEY 8 | 9 | 10 | def flow(parameters: dict, stats: dict): 11 | out_path = parameters.pop('out-path', '.') 12 | stats.setdefault(STATS_DPP_KEY, {})[STATS_OUT_DP_URL_KEY] = os.path.join(out_path, 'datapackage.json') 13 | return Flow( 14 | dump_to_path( 15 | out_path, 16 | **parameters 17 | ) 18 | ) 19 | 20 | 21 | if __name__ == '__main__': 22 | with ingest() as ctx: 23 | spew_flow(flow(ctx.parameters, ctx.stats), ctx) 24 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/dump_to_sql.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow, dump_to_sql 2 | from 
datapackage_pipelines.wrapper import ingest 3 | from datapackage_pipelines.utilities.flow_utils import spew_flow 4 | 5 | 6 | def flow(parameters): 7 | return Flow( 8 | dump_to_sql( 9 | parameters['tables'], 10 | engine=parameters.get('engine', 'env://DPP_DB_ENGINE'), 11 | updated_column=parameters.get("updated_column"), 12 | updated_id_column=parameters.get("updated_id_column") 13 | ) 14 | ) 15 | 16 | 17 | if __name__ == '__main__': 18 | with ingest() as ctx: 19 | spew_flow(flow(ctx.parameters), ctx) 20 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/dump_to_zip.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow, dump_to_zip 2 | from datapackage_pipelines.wrapper import ingest 3 | from datapackage_pipelines.utilities.flow_utils import spew_flow 4 | 5 | 6 | def flow(parameters: dict): 7 | out_file = parameters.pop('out-file') 8 | return Flow( 9 | dump_to_zip( 10 | out_file, 11 | **parameters 12 | ) 13 | ) 14 | 15 | 16 | if __name__ == '__main__': 17 | with ingest() as ctx: 18 | spew_flow(flow(ctx.parameters), ctx) 19 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/duplicate.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow, duplicate 2 | from datapackage_pipelines.wrapper import ingest 3 | from datapackage_pipelines.utilities.flow_utils import spew_flow, load_lazy_json 4 | 5 | 6 | def flow(parameters): 7 | return Flow( 8 | load_lazy_json(parameters.get('source')), 9 | duplicate( 10 | parameters.get('source'), 11 | parameters.get('target-name'), 12 | parameters.get('target-path'), 13 | parameters.get('batch_size', 1000), 14 | parameters.get('duplicate_to_end', False) 15 | ) 16 | ) 17 | 18 | 19 | if __name__ == '__main__': 20 | with ingest() as ctx: 21 | spew_flow(flow(ctx.parameters), ctx) 22 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/filter.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow, filter_rows 2 | from datapackage_pipelines.wrapper import ingest 3 | from datapackage_pipelines.utilities.flow_utils import spew_flow 4 | 5 | 6 | def flow(parameters): 7 | return Flow( 8 | filter_rows( 9 | equals=parameters.get('in', []), 10 | not_equals=parameters.get('out', []), 11 | resources=parameters.get('resources'), 12 | ) 13 | ) 14 | 15 | 16 | if __name__ == '__main__': 17 | with ingest() as ctx: 18 | spew_flow(flow(ctx.parameters), ctx) 19 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/find_replace.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow, find_replace 2 | from datapackage_pipelines.wrapper import ingest 3 | from datapackage_pipelines.utilities.flow_utils import spew_flow 4 | 5 | 6 | def flow(parameters): 7 | return Flow( 8 | find_replace( 9 | parameters.get('fields', []), 10 | resources=parameters.get('resources') 11 | ) 12 | ) 13 | 14 | 15 | if __name__ == '__main__': 16 | with ingest() as ctx: 17 | spew_flow(flow(ctx.parameters), ctx) 18 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/flow.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from importlib import 
import_module 3 | from datapackage_pipelines.wrapper import ingest 4 | from datapackage_pipelines.utilities.flow_utils import spew_flow 5 | 6 | 7 | with ingest() as ctx: 8 | parameters, datapackage, resources = ctx 9 | stats = {} 10 | 11 | sys.path.append(parameters.pop('__path')) 12 | flow_module = import_module(parameters.pop('__flow')) 13 | flow = flow_module.flow(parameters, datapackage, resources, ctx.stats) 14 | 15 | spew_flow(flow, ctx) 16 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/internal/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/datapackage-pipelines/06c8ccbd44be420233b73563cfb5bd3eb37de7cc/datapackage_pipelines/lib/internal/__init__.py -------------------------------------------------------------------------------- /datapackage_pipelines/lib/internal/sink.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from datapackage_pipelines.wrapper import ingest, spew 4 | 5 | SINK_MAGIC = '>>> PROCESSED ROWS: ' 6 | 7 | 8 | def sink(res_iter_): 9 | count = 0 10 | for res in res_iter_: 11 | for row in res: 12 | count += 1 13 | if count % 100 == 0: 14 | sys.stderr.write('%s%d\n' % (SINK_MAGIC, count)) 15 | sys.stderr.flush() 16 | sys.stderr.write('%s%d\n' % (SINK_MAGIC, count)) 17 | sys.stderr.flush() 18 | yield from () 19 | 20 | 21 | if __name__ == '__main__': 22 | sys.stderr.write('%s%d\n' % (SINK_MAGIC, 0)) 23 | sys.stderr.flush() 24 | params, dp, res_iter = ingest() 25 | spew({'name': 'boop', 'resources': []}, 26 | sink(res_iter)) 27 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/join.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow, join, update_resource 2 | from datapackage_pipelines.wrapper import ingest 3 | from datapackage_pipelines.utilities.resources import PROP_STREAMING 4 | from datapackage_pipelines.utilities.flow_utils import spew_flow, load_lazy_json 5 | 6 | 7 | def flow(parameters): 8 | source = parameters['source'] 9 | target = parameters['target'] 10 | return Flow( 11 | load_lazy_json(source['name']), 12 | join( 13 | source['name'], 14 | source['key'], 15 | target['name'], 16 | target['key'], 17 | parameters['fields'], 18 | parameters.get('full', None), 19 | parameters.get('mode', 'half-outer'), 20 | source.get('delete', False) 21 | ), 22 | update_resource( 23 | target['name'], 24 | **{ 25 | PROP_STREAMING: True 26 | } 27 | ) 28 | ) 29 | 30 | 31 | if __name__ == '__main__': 32 | with ingest() as ctx: 33 | spew_flow(flow(ctx.parameters), ctx) 34 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/load.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow, load 2 | from datapackage_pipelines.wrapper import ingest 3 | from datapackage_pipelines.utilities.flow_utils import spew_flow 4 | from datapackage_pipelines.utilities.resources import PROP_STREAMING, PROP_STREAMED_FROM 5 | 6 | 7 | def flow(parameters): 8 | _from = parameters.pop('from') 9 | 10 | num_resources = 0 11 | 12 | def count_resources(): 13 | def func(package): 14 | global num_resources 15 | num_resources = len(package.pkg.resources) 16 | yield package.pkg 17 | yield from package 18 | return func 19 | 20 | def mark_streaming(_from): 21 | def 
func(package): 22 | for i in range(num_resources, len(package.pkg.resources)): 23 | package.pkg.descriptor['resources'][i].setdefault(PROP_STREAMING, True) 24 | package.pkg.descriptor['resources'][i].setdefault(PROP_STREAMED_FROM, _from) 25 | yield package.pkg 26 | yield from package 27 | return func 28 | 29 | return Flow( 30 | count_resources(), 31 | load(_from, **parameters), 32 | mark_streaming(_from), 33 | ) 34 | 35 | 36 | if __name__ == '__main__': 37 | with ingest() as ctx: 38 | spew_flow(flow(ctx.parameters), ctx) 39 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/load_metadata.py: -------------------------------------------------------------------------------- 1 | import datapackage 2 | 3 | from datapackage_pipelines.wrapper import ingest, spew, get_dependency_datapackage_url 4 | 5 | dep_prefix = 'dependency://' 6 | 7 | parameters, dp, res_iter = ingest() 8 | 9 | url = parameters['url'] 10 | if url.startswith(dep_prefix): 11 | dependency = url[len(dep_prefix):].strip() 12 | url = get_dependency_datapackage_url(dependency) 13 | assert url is not None, "Failed to fetch output datapackage for dependency '%s'" % dependency 14 | 15 | datapackage = datapackage.DataPackage(url) 16 | for k, v in datapackage.descriptor.items(): 17 | if k != 'resources': 18 | dp[k] = v 19 | 20 | spew(dp, res_iter) 21 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/load_resource.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import copy 3 | import logging 4 | 5 | import datapackage 6 | 7 | from dataflows.helpers.resource_matcher import ResourceMatcher 8 | 9 | from datapackage_pipelines.wrapper import ingest, spew, get_dependency_datapackage_url 10 | from datapackage_pipelines.utilities.resources import tabular, PROP_STREAMING, \ 11 | PROP_STREAMED_FROM 12 | 13 | 14 | def progress_logger(iter, log_progress_rows): 15 | for i, row in enumerate(iter, 1): 16 | yield row 17 | if i % log_progress_rows == 0: 18 | logging.info('loaded {} rows'.format(i)) 19 | 20 | 21 | class ResourceLoader(object): 22 | 23 | def __init__(self): 24 | self.parameters, self.dp, self.res_iter = ingest() 25 | 26 | def __call__(self): 27 | url = self.parameters['url'] 28 | limit_rows = self.parameters.get('limit-rows') 29 | log_progress_rows = self.parameters.get('log-progress-rows') 30 | dep_prefix = 'dependency://' 31 | if url.startswith(dep_prefix): 32 | dependency = url[len(dep_prefix):].strip() 33 | url = get_dependency_datapackage_url(dependency) 34 | assert url is not None, "Failed to fetch output datapackage for dependency '%s'" % dependency 35 | stream = self.parameters.get('stream', True) 36 | required = self.parameters.get('required', True) 37 | resource = self.parameters.get('resource') 38 | resources = self.parameters.get('resources') 39 | if resource is not None: 40 | assert not resources 41 | resource_index = resource if isinstance(resource, int) else None 42 | else: 43 | assert resources 44 | resource_index = None 45 | resource = list(resources.keys()) 46 | name_matcher = ( 47 | ResourceMatcher(resource, self.dp) 48 | if isinstance(resource, (str, list)) 49 | else None 50 | ) 51 | 52 | selected_resources = [] 53 | found = False 54 | try: 55 | dp = datapackage.DataPackage(url) 56 | except Exception: 57 | if required: 58 | raise 59 | else: 60 | dp = None 61 | if dp: 62 | dp = self.process_datapackage(dp) 63 | for i, orig_res in 
enumerate(dp.resources): 64 | if resource_index == i or \ 65 | (name_matcher is not None and name_matcher.match(orig_res.descriptor.get('name'))): 66 | found = True 67 | desc = copy.deepcopy(orig_res.descriptor) 68 | if 'primaryKey' in desc.get('schema', {}): 69 | # Avoid duplication checks 70 | del orig_res.descriptor['schema']['primaryKey'] 71 | orig_res.commit() 72 | desc[PROP_STREAMED_FROM] = orig_res.source 73 | if resources: 74 | desc.update(resources[desc['name']]) 75 | self.dp['resources'].append(desc) 76 | if tabular(desc) and stream: 77 | desc[PROP_STREAMING] = True 78 | orig_res_iter = orig_res.iter(keyed=True) 79 | if limit_rows: 80 | orig_res_iter = itertools.islice(orig_res_iter, limit_rows) 81 | if log_progress_rows: 82 | orig_res_iter = progress_logger(orig_res_iter, log_progress_rows) 83 | selected_resources.append(orig_res_iter) 84 | else: 85 | desc[PROP_STREAMING] = False 86 | 87 | assert found or not required, "Failed to find resource with index or name matching %r" % resource 88 | spew(self.dp, itertools.chain(self.res_iter, selected_resources)) 89 | 90 | def process_datapackage(self, dp_): 91 | return dp_ 92 | 93 | 94 | if __name__ == '__main__': 95 | ResourceLoader()() 96 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/printer.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow, printer 2 | from datapackage_pipelines.wrapper import ingest 3 | from datapackage_pipelines.utilities.flow_utils import spew_flow 4 | 5 | 6 | def flow(parameters): 7 | return Flow( 8 | printer(), 9 | ) 10 | 11 | 12 | if __name__ == '__main__': 13 | with ingest() as ctx: 14 | spew_flow(flow(ctx.parameters), ctx) 15 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/set_types.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow, set_type, validate, delete_fields 2 | from datapackage_pipelines.wrapper import ingest 3 | from datapackage_pipelines.utilities.flow_utils import spew_flow 4 | 5 | 6 | def flow(parameters): 7 | resources = parameters.get('resources') 8 | regex = parameters.get('regex', True) 9 | if 'types' in parameters: 10 | return Flow( 11 | *[ 12 | set_type(name, resources=resources, regex=regex, **options) 13 | if options is not None else 14 | delete_fields([name], resources=resources) 15 | for name, options in parameters['types'].items() 16 | ] 17 | ) 18 | else: 19 | return Flow( 20 | validate() 21 | ) 22 | 23 | 24 | if __name__ == '__main__': 25 | with ingest() as ctx: 26 | print(flow(ctx.parameters).chain) 27 | spew_flow(flow(ctx.parameters), ctx) 28 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/sort.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow, sort_rows 2 | from datapackage_pipelines.wrapper import ingest 3 | from datapackage_pipelines.utilities.flow_utils import spew_flow, load_lazy_json 4 | 5 | 6 | def flow(parameters): 7 | return Flow( 8 | load_lazy_json(parameters.get('resources')), 9 | sort_rows( 10 | parameters['sort-by'], 11 | resources=parameters.get('resources'), 12 | reverse=parameters.get('reverse') 13 | ) 14 | ) 15 | 16 | 17 | if __name__ == '__main__': 18 | with ingest() as ctx: 19 | spew_flow(flow(ctx.parameters), ctx) 20 | 
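Most of the stdlib processors above are thin wrappers that build a dataflows `Flow` from their parameters and hand it to `spew_flow`. The `flow.py` processor shown earlier goes one step further: it imports a user module and calls its `flow(parameters, datapackage, resources, stats)` function. A user flow module compatible with that call could look roughly like this sketch (the module name, the `factor` parameter and the `value` field are all invented for illustration):

```python
# my_flow.py - hypothetical user flow module (not part of this repository).
# The stdlib `flow` processor imports a module like this one and calls
# flow(parameters, datapackage, resources, stats), expecting a dataflows Flow back.
from dataflows import Flow, add_field


def flow(parameters, datapackage, resources, stats):
    factor = parameters.get('factor', 2)  # assumed parameter

    def double_value(row):
        # Assumes the incoming rows carry a numeric 'value' field.
        row['doubled'] = row.get('value', 0) * factor
        stats['rows-doubled'] = stats.get('rows-doubled', 0) + 1

    return Flow(
        add_field('doubled', 'number'),
        double_value,
    )
```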
-------------------------------------------------------------------------------- /datapackage_pipelines/lib/unpivot.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow, unpivot 2 | from datapackage_pipelines.wrapper import ingest 3 | from datapackage_pipelines.utilities.flow_utils import spew_flow 4 | 5 | 6 | def flow(parameters): 7 | return Flow( 8 | unpivot( 9 | parameters.get('unpivot'), 10 | parameters.get('extraKeyFields'), 11 | parameters.get('extraValueField'), 12 | resources=parameters.get('resources') 13 | ) 14 | ) 15 | 16 | 17 | if __name__ == '__main__': 18 | with ingest() as ctx: 19 | spew_flow(flow(ctx.parameters), ctx) 20 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/update_package.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow, update_package 2 | from datapackage_pipelines.wrapper import ingest 3 | from datapackage_pipelines.utilities.flow_utils import spew_flow 4 | 5 | 6 | def flow(parameters): 7 | return Flow( 8 | update_package(**parameters) 9 | ) 10 | 11 | 12 | if __name__ == '__main__': 13 | with ingest() as ctx: 14 | spew_flow(flow(ctx.parameters), ctx) 15 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/update_resource.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow, update_resource 2 | from datapackage_pipelines.wrapper import ingest 3 | from datapackage_pipelines.utilities.flow_utils import spew_flow 4 | 5 | 6 | def flow(parameters): 7 | resources = parameters.get('resources', None) 8 | metadata = parameters.pop('metadata', {}) 9 | return Flow( 10 | update_resource(resources, **metadata), 11 | ) 12 | 13 | 14 | if __name__ == '__main__': 15 | with ingest() as ctx: 16 | spew_flow(flow(ctx.parameters), ctx) 17 | -------------------------------------------------------------------------------- /datapackage_pipelines/manager/__init__.py: -------------------------------------------------------------------------------- 1 | from .tasks import execute_pipeline, finalize 2 | from .runner import run_pipelines, ExecutionResult, ProgressReport 3 | -------------------------------------------------------------------------------- /datapackage_pipelines/manager/logging_config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | own_name = '%-32s' % 'Main' 4 | logging.basicConfig(level=logging.INFO, 5 | format="%(levelname)-8s:"+own_name+":%(message)s") 6 | logging.root.setLevel(logging.INFO) 7 | -------------------------------------------------------------------------------- /datapackage_pipelines/manager/runners/__init__.py: -------------------------------------------------------------------------------- 1 | from .runner_config import RunnerConfiguration 2 | 3 | runner_config = RunnerConfiguration() 4 | -------------------------------------------------------------------------------- /datapackage_pipelines/manager/runners/base_runner.py: -------------------------------------------------------------------------------- 1 | class BaseRunner(object): 2 | 3 | def __init__(self, name, parameters): 4 | self.name = name 5 | self.parameters = parameters 6 | 7 | def get_execution_args(self, step, cwd, idx): 8 | raise NotImplementedError() 9 | 
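`BaseRunner` above defines the whole contract a runner has to satisfy: it is constructed with a name and a parameters dict, and `get_execution_args(step, cwd, idx)` must return the argv list used to launch a step. As a rough illustration (hypothetical, not part of the repository), a custom runner could wrap every step in `nice`; compare it with the real `LocalPythonRunner` in the next file, whose argv layout it mirrors:

```python
# Hypothetical runner (not part of this repository) that runs every step
# under `nice`. The argv layout mirrors LocalPythonRunner below.
import sys

from datapackage_pipelines.manager.runners.base_runner import BaseRunner
from datapackage_pipelines.utilities.extended_json import json


class NicePythonRunner(BaseRunner):

    def get_execution_args(self, step, cwd, idx):
        niceness = str((self.parameters or {}).get('niceness', 10))  # assumed parameter
        return [
            'nice', '-n', niceness,
            sys.executable,
            step['executor'],
            str(idx),
            json.dumps(step.get('parameters', {})),
            str(step.get('validate', False)),
            step.get('_cache_hash') if step.get('cache') else ''
        ]
```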
-------------------------------------------------------------------------------- /datapackage_pipelines/manager/runners/local_python.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import shlex 4 | 5 | from ...utilities.extended_json import json 6 | from .base_runner import BaseRunner 7 | 8 | 9 | class LocalPythonRunner(BaseRunner): 10 | 11 | def get_execution_args(self, step, _, idx): 12 | return [ 13 | sys.executable, 14 | step['executor'], 15 | str(idx), 16 | json.dumps(step.get('parameters', {})), 17 | str(step.get('validate', False)), 18 | step.get('_cache_hash') if step.get('cache') else '' 19 | ] 20 | 21 | 22 | class WrappedPythonRunner(LocalPythonRunner): 23 | 24 | def get_execution_args(self, step, cwd, idx): 25 | args = super(WrappedPythonRunner, self).get_execution_args(step, cwd, idx) 26 | for i in range(len(args)): 27 | args[i] = '\\\"' + args[i].replace('"', '\\\\\\\"') + '\\\"' 28 | cmd = " ".join(args) 29 | abspath = os.path.abspath(cwd) 30 | cmd = self.parameters['wrapper'].format(path=cwd, 31 | abspath=abspath, 32 | cmd=cmd, 33 | env=os.environ) 34 | args = shlex.split(cmd) 35 | return args 36 | -------------------------------------------------------------------------------- /datapackage_pipelines/manager/runners/runner_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | 4 | from .local_python import LocalPythonRunner, WrappedPythonRunner 5 | 6 | 7 | class RunnerConfiguration(object): 8 | 9 | ENV_VAR = 'DPP_RUNNER_CONFIG' 10 | DEFAULT_RUNNER_CONFIG = 'dpp-runners.yaml' 11 | 12 | def __init__(self): 13 | 14 | config_fn = os.environ.get(self.ENV_VAR, self.DEFAULT_RUNNER_CONFIG) 15 | if os.path.exists(config_fn): 16 | self.config = yaml.load(open(config_fn), Loader=yaml.Loader) 17 | else: 18 | self.config = {} 19 | 20 | def get_runner_class(self, kind): 21 | return { 22 | 'local-python': LocalPythonRunner, 23 | 'wrapped-python': WrappedPythonRunner, 24 | }.get(kind, LocalPythonRunner) 25 | 26 | def get_runner(self, name): 27 | runner_config = self.config.get(name, {}) 28 | kind = runner_config.get('kind') 29 | parameters = runner_config.get('parameters') 30 | return self.get_runner_class(kind)(name, parameters) 31 | -------------------------------------------------------------------------------- /datapackage_pipelines/specs/__init__.py: -------------------------------------------------------------------------------- 1 | from .specs import pipelines, register_all_pipelines 2 | from .parsers.base_parser import PipelineSpec 3 | -------------------------------------------------------------------------------- /datapackage_pipelines/specs/errors.py: -------------------------------------------------------------------------------- 1 | # pylama:skip=1 2 | from typing import NamedTuple 3 | 4 | 5 | class SpecError(NamedTuple): 6 | short_msg: str 7 | long_msg: str 8 | 9 | def __str__(self): 10 | return '{}: {}'.format(self.short_msg, self.long_msg) 11 | -------------------------------------------------------------------------------- /datapackage_pipelines/specs/hashers/__init__.py: -------------------------------------------------------------------------------- 1 | from .hash_calculator import HashCalculator 2 | from .dependency_resolver import resolve_dependencies, DependencyMissingException 3 | -------------------------------------------------------------------------------- /datapackage_pipelines/specs/hashers/dependency_resolver.py: 
-------------------------------------------------------------------------------- 1 | import datapackage 2 | from datapackage.exceptions import DataPackageException 3 | from ..parsers.base_parser import PipelineSpec 4 | 5 | from ..errors import SpecError 6 | 7 | 8 | class DependencyMissingException(Exception): 9 | 10 | def __init__(self, spec, missing): 11 | self.spec = spec 12 | self.missing = missing 13 | 14 | 15 | def resolve_dependencies(spec: PipelineSpec, all_pipeline_ids, status_mgr): 16 | 17 | cache_hash = '' 18 | dependencies = spec.pipeline_details.get('dependencies', ()) 19 | for dependency in dependencies: 20 | if 'pipeline' in dependency: 21 | pipeline_id = dependency['pipeline'] 22 | if pipeline_id not in all_pipeline_ids: 23 | raise DependencyMissingException(spec, pipeline_id) 24 | 25 | for dependency in dependencies: 26 | if 'pipeline' in dependency: 27 | pipeline_id = dependency['pipeline'] 28 | ps = status_mgr.get(pipeline_id) 29 | if not ps.runnable(): 30 | spec.validation_errors.append( 31 | SpecError('Invalid dependency', 32 | 'Cannot run until dependency passes validation: {}'.format(pipeline_id)) 33 | ) 34 | elif ps.dirty(): 35 | spec.validation_errors.append( 36 | SpecError('Dirty dependency', 37 | 'Cannot run until dependency is executed: {}'.format(pipeline_id)) 38 | ) 39 | elif ps.get_last_execution() is not None and not ps.get_last_execution().success: 40 | spec.validation_errors.append( 41 | SpecError('Dependency unsuccessful', 42 | 'Cannot run until dependency "{}" is successfully ' 43 | 'executed'.format(pipeline_id)) 44 | ) 45 | 46 | for dep_err in ps.validation_errors: 47 | spec.validation_errors.append( 48 | SpecError('From {}'.format(pipeline_id), dep_err) 49 | ) 50 | 51 | pipeline_hash = all_pipeline_ids.get(pipeline_id).cache_hash 52 | assert pipeline_hash is not None 53 | cache_hash += pipeline_hash 54 | 55 | spec.dependencies.append(pipeline_id) 56 | 57 | elif 'datapackage' in dependency: 58 | dp_id = dependency['datapackage'] 59 | try: 60 | dp = datapackage.DataPackage(dp_id) 61 | if 'hash' in dp.descriptor: 62 | cache_hash += dp.descriptor['hash'] 63 | else: 64 | spec.validation_errors.append( 65 | SpecError('Missing dependency', 66 | "Couldn't get data from datapackage %s" 67 | % dp_id)) 68 | except DataPackageException: 69 | spec.validation_errors.append( 70 | SpecError('Missing dependency', 71 | "Couldn't open datapackage %s" 72 | % dp_id)) 73 | 74 | else: 75 | spec.validation_errors.append( 76 | SpecError('Missing dependency', 77 | 'Unknown dependency provided (%r)' % dependency)) 78 | 79 | return cache_hash 80 | -------------------------------------------------------------------------------- /datapackage_pipelines/specs/hashers/hash_calculator.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | 3 | from ...utilities.extended_json import json 4 | from ..parsers.base_parser import PipelineSpec 5 | 6 | from ..errors import SpecError 7 | from .dependency_resolver import resolve_dependencies 8 | 9 | 10 | class HashCalculator(object): 11 | 12 | def __init__(self): 13 | self.all_pipeline_ids = {} 14 | 15 | def calculate_hash(self, spec: PipelineSpec, status_mgr, ignore_missing_deps=False): 16 | 17 | cache_hash = None 18 | if spec.pipeline_id in self.all_pipeline_ids: 19 | message = 'Duplicate key {0} in {1}' \ 20 | .format(spec.pipeline_id, spec.path) 21 | spec.validation_errors.append(SpecError('Duplicate Pipeline Id', message)) 22 | 23 | else: 24 | if ignore_missing_deps: 25 | cache_hash = '' 
26 | else: 27 | cache_hash = resolve_dependencies(spec, self.all_pipeline_ids, status_mgr) 28 | 29 | self.all_pipeline_ids[spec.pipeline_id] = spec 30 | if len(spec.validation_errors) > 0: 31 | return cache_hash 32 | 33 | for step in spec.pipeline_details['pipeline']: 34 | m = hashlib.md5() 35 | m.update(cache_hash.encode('ascii')) 36 | with open(step['executor'], 'rb') as f: 37 | m.update(f.read()) 38 | m.update(json.dumps(step, ensure_ascii=True, sort_keys=True) 39 | .encode('ascii')) 40 | cache_hash = m.hexdigest() 41 | step['_cache_hash'] = cache_hash 42 | 43 | spec.cache_hash = cache_hash 44 | -------------------------------------------------------------------------------- /datapackage_pipelines/specs/parsers/__init__.py: -------------------------------------------------------------------------------- 1 | from .basic_pipeline import BasicPipelineParser 2 | from .source_spec_pipeline import SourceSpecPipelineParser 3 | -------------------------------------------------------------------------------- /datapackage_pipelines/specs/parsers/base_parser.py: -------------------------------------------------------------------------------- 1 | class PipelineSpec(object): 2 | def __init__(self, 3 | path=None, 4 | pipeline_id=None, 5 | pipeline_details=None, 6 | source_details=None, 7 | validation_errors=None, 8 | dependencies=None, 9 | cache_hash='', 10 | schedule=None, 11 | environment=None): 12 | self.path = path 13 | self.pipeline_id = pipeline_id 14 | self.pipeline_details = pipeline_details 15 | self.source_details = source_details 16 | self.validation_errors = [] if validation_errors is None else validation_errors 17 | self.dependencies = [] if dependencies is None else dependencies 18 | self.cache_hash = cache_hash 19 | self.schedule = schedule 20 | self.environment = environment 21 | 22 | def __str__(self): 23 | return 'PipelineSpec({}, validation_errors={}, ' \ 24 | 'dependencies={}, cache_hash={})'\ 25 | .format(self.pipeline_id, self.validation_errors, 26 | self.dependencies, self.cache_hash) 27 | 28 | def __repr__(self): 29 | return str(self) 30 | 31 | 32 | class BaseParser(object): 33 | 34 | class InvalidFileException(Exception): 35 | def __init__(self, short_msg, long_msg): 36 | self.short_msg = short_msg 37 | self.long_msg = long_msg 38 | 39 | @classmethod 40 | def check_filename(cls, filename): 41 | raise NotImplementedError() 42 | 43 | @classmethod 44 | def to_pipeline(cls, spec, fullpath): 45 | raise NotImplementedError() 46 | 47 | @staticmethod 48 | def replace_root_dir(path, root_dir): 49 | if root_dir.endswith('/'): 50 | root_dir = root_dir[:-1] 51 | if path.startswith(root_dir): 52 | path = '.' 
+ path[len(root_dir):] 53 | return path 54 | -------------------------------------------------------------------------------- /datapackage_pipelines/specs/parsers/basic_pipeline.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Iterator 3 | 4 | from .base_parser import BaseParser, PipelineSpec 5 | 6 | 7 | class BasicPipelineParser(BaseParser): 8 | 9 | SPEC_FILENAME = 'pipeline-spec.yaml' 10 | 11 | @classmethod 12 | def check_filename(cls, filename): 13 | return filename == cls.SPEC_FILENAME 14 | 15 | @classmethod 16 | def to_pipeline(cls, spec, fullpath, root_dir='.') -> Iterator[PipelineSpec]: 17 | dirpath = os.path.dirname(fullpath) 18 | 19 | for pipeline_id, pipeline_details in spec.items(): 20 | pipeline_id = os.path.join(dirpath, pipeline_id) 21 | pipeline_id = cls.replace_root_dir(pipeline_id, root_dir) 22 | yield PipelineSpec(path=dirpath, 23 | pipeline_id=pipeline_id, 24 | pipeline_details=pipeline_details) 25 | -------------------------------------------------------------------------------- /datapackage_pipelines/specs/parsers/source_spec_pipeline.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from ..resolver import resolve_generator 4 | from ..errors import SpecError 5 | from .base_parser import BaseParser, PipelineSpec 6 | 7 | 8 | class SourceSpecPipelineParser(BaseParser): 9 | 10 | SOURCE_FILENAME_SUFFIX = '.source-spec.yaml' 11 | 12 | @classmethod 13 | def check_filename(cls, filename): 14 | return filename.endswith(cls.SOURCE_FILENAME_SUFFIX) 15 | 16 | @classmethod 17 | def fix_dependency(cls, dep, dirpath, root_dir): 18 | if dep.startswith('./'): 19 | dep = dep[2:] 20 | return os.path.join(cls.replace_root_dir(dirpath, root_dir), dep) 21 | 22 | @classmethod 23 | def to_pipeline(cls, source_spec, fullpath, root_dir='.'): 24 | filename = os.path.basename(fullpath) 25 | dirpath = os.path.dirname(fullpath) 26 | 27 | module_name = filename[:-len(cls.SOURCE_FILENAME_SUFFIX)] 28 | pipeline_id = os.path.join(dirpath, module_name) 29 | generator = resolve_generator(module_name) 30 | 31 | if generator is None: 32 | message = 'Unknown source description kind "{}" in {}' \ 33 | .format(module_name, fullpath) 34 | error = SpecError('Unknown source kind', message) 35 | yield PipelineSpec(pipeline_id=module_name, 36 | path=dirpath, 37 | validation_errors=[error], 38 | pipeline_details={'pipeline': []}) 39 | return 40 | 41 | base = cls.replace_root_dir(dirpath, root_dir) 42 | if generator.internal_validate(source_spec): 43 | try: 44 | spec = generator.internal_generate(source_spec, base) 45 | for pipeline_id, pipeline_details in spec: 46 | if pipeline_id[0] == ':' and pipeline_id[-1] == ':': 47 | module = pipeline_id[1:-1] 48 | filename = module + cls.SOURCE_FILENAME_SUFFIX 49 | yield from cls.to_pipeline(pipeline_details, 50 | os.path.join(dirpath, filename)) 51 | else: 52 | yield PipelineSpec(path=pipeline_details.get('__path', dirpath), 53 | pipeline_id=pipeline_id, 54 | pipeline_details=pipeline_details, 55 | source_details=source_spec) 56 | except Exception as e: 57 | message = '"{}" in {}' \ 58 | .format(e, fullpath) 59 | error = SpecError('Error converting source', message) 60 | yield PipelineSpec(pipeline_id=pipeline_id, 61 | path=dirpath, validation_errors=[error], 62 | pipeline_details={'pipeline': []}) 63 | else: 64 | message = 'Invalid source description for "{}" in {}' \ 65 | .format(module_name, fullpath) 66 | error = SpecError('Invalid Source', 
message) 67 | yield PipelineSpec(pipeline_id=pipeline_id, 68 | path=dirpath, 69 | validation_errors=[error], 70 | pipeline_details={'pipeline': []}) 71 | -------------------------------------------------------------------------------- /datapackage_pipelines/specs/schemas/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/datapackage-pipelines/06c8ccbd44be420233b73563cfb5bd3eb37de7cc/datapackage_pipelines/specs/schemas/__init__.py -------------------------------------------------------------------------------- /datapackage_pipelines/specs/schemas/pipeline-spec.schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-04/schema#", 3 | "type": "object", 4 | "required": [ 5 | "pipeline" 6 | ], 7 | "properties": { 8 | "title": { 9 | "type": "string" 10 | }, 11 | "description": { 12 | "type": "string" 13 | }, 14 | "environment": { 15 | "type": "object" 16 | }, 17 | "schedule": { 18 | "type": "object", 19 | "properties": { 20 | "crontab": { 21 | "type": "string" 22 | } 23 | } 24 | }, 25 | "pipeline": { 26 | "type": "array", 27 | "minItems": 1, 28 | "items": { 29 | "type": "object", 30 | "oneOf": [ 31 | { 32 | "required": [ 33 | "run" 34 | ] 35 | }, 36 | { 37 | "required": [ 38 | "flow" 39 | ] 40 | } 41 | ], 42 | "properties": { 43 | "run": { 44 | "type": "string" 45 | }, 46 | "parameters": { 47 | "type": "object" 48 | }, 49 | "cache": { 50 | "type": "boolean" 51 | }, 52 | "validate": { 53 | "type": "boolean" 54 | } 55 | } 56 | } 57 | }, 58 | "dependencies": { 59 | "type": "array", 60 | "items": { 61 | "type": "object", 62 | "maxProperties": 1, 63 | "properties": { 64 | "datapackage": { 65 | "type": "string", 66 | "format": "uri" 67 | }, 68 | "pipeline": { 69 | "type": "string" 70 | } 71 | } 72 | } 73 | } 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /datapackage_pipelines/specs/schemas/validator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import jsonschema 4 | 5 | from ..errors import SpecError 6 | 7 | 8 | schema_filename = 'pipeline-spec.schema.json' 9 | schema_filename = os.path.join(os.path.dirname(__file__), 10 | schema_filename) 11 | schema = json.load(open(schema_filename)) 12 | validator = jsonschema.validators.validator_for(schema) 13 | schema = validator(schema) 14 | 15 | 16 | def validate_pipeline(pipeline_details, errors): 17 | try: 18 | schema.validate(pipeline_details) 19 | except jsonschema.ValidationError as e: 20 | errors.append(SpecError('Invalid Pipeline', str(e))) 21 | return False 22 | return True 23 | -------------------------------------------------------------------------------- /datapackage_pipelines/status/__init__.py: -------------------------------------------------------------------------------- 1 | from .status_manager import status_mgr 2 | from .hook_sender import hook_sender 3 | -------------------------------------------------------------------------------- /datapackage_pipelines/status/backend_filesystem.py: -------------------------------------------------------------------------------- 1 | import os 2 | import codecs 3 | import ujson 4 | 5 | 6 | class FilesystemBackend(object): 7 | 8 | KIND = 'filesystem' 9 | 10 | def __init__(self, root_dir='.'): 11 | dpp_dirname = os.environ.get('DPP_DB_DIRNAME', '.dpp') 12 | self.base_dir = os.path.join(root_dir,
dpp_dirname) 13 | os.makedirs(self.base_dir, exist_ok=True) 14 | 15 | def fn(self, pipeline_id): 16 | pipeline_id = codecs.encode(pipeline_id.encode('utf8'), 'base64').decode('ascii').replace('\n', '') 17 | return os.path.join(self.base_dir, pipeline_id) 18 | 19 | def get_status(self, pipeline_id): 20 | try: 21 | with open(self.fn(pipeline_id)) as f: 22 | return ujson.load(f) 23 | except FileNotFoundError: 24 | pass 25 | except ValueError: 26 | pass 27 | 28 | def set_status(self, pipeline_id, status): 29 | fn = self.fn(pipeline_id) 30 | with open(fn+'.tmp', 'w') as f: 31 | ujson.dump(status, f) 32 | os.rename(fn+'.tmp', fn) 33 | 34 | def del_status(self, pipeline_id): 35 | try: 36 | os.unlink(self.fn(pipeline_id)) 37 | except FileNotFoundError: 38 | pass 39 | 40 | def register_pipeline_id(self, pipeline_id): 41 | pass 42 | 43 | def deregister_pipeline_id(self, pipeline_id): 44 | self.del_status(pipeline_id) 45 | 46 | def reset(self): 47 | for p in self.all_pipeline_ids(): 48 | self.del_status(p) 49 | 50 | def all_pipeline_ids(self): 51 | # Decoding encoded identifiers 52 | dec_ids = [] 53 | enc_ids = sorted(os.listdir(self.base_dir)) 54 | for enc_id in enc_ids: 55 | dec_id = codecs.decode(enc_id.encode('utf8'), 'base64').decode('utf8') 56 | if dec_id.startswith('PipelineStatus:'): 57 | dec_id = dec_id.replace('PipelineStatus:', '') 58 | dec_ids.append(dec_id) 59 | return dec_ids 60 | 61 | def all_statuses(self): 62 | return [self.get_status(_id) 63 | for _id in self.all_pipeline_ids()] 64 | -------------------------------------------------------------------------------- /datapackage_pipelines/status/backend_redis.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import redis 4 | 5 | from datapackage_pipelines.utilities.extended_json import json 6 | 7 | 8 | class RedisBackend(object): 9 | 10 | KIND = 'redis' 11 | 12 | def __init__(self, host=None, port=6379): 13 | self.redis = None 14 | if host is not None and len(host) > 0: 15 | conn = redis.StrictRedis(host=host, port=port, db=5) 16 | try: 17 | conn.ping() 18 | self.redis = conn 19 | except redis.exceptions.ConnectionError: 20 | logging.warning('Failed to connect to Redis, host:%s, port:%s', 21 | host, port) 22 | 23 | def is_init(self): 24 | return self.redis is not None 25 | 26 | def get_status(self, pipeline_id): 27 | if self.is_init(): 28 | status = self.redis.get(pipeline_id) 29 | if status is not None: 30 | status = json.loads(status.decode('ascii')) 31 | return status 32 | 33 | def set_status(self, pipeline_id, status): 34 | if self.is_init(): 35 | self.redis.set(pipeline_id, json.dumps(status, ensure_ascii=True)) 36 | 37 | def del_status(self, pipeline_id): 38 | if self.is_init(): 39 | self.redis.delete(pipeline_id) 40 | 41 | def register_pipeline_id(self, pipeline_id): 42 | if self.is_init(): 43 | self.redis.sadd('all-pipelines', pipeline_id.strip()) 44 | 45 | def deregister_pipeline_id(self, pipeline_id): 46 | if self.is_init(): 47 | self.redis.srem('all-pipelines', pipeline_id.strip()) 48 | 49 | def reset(self): 50 | if self.is_init(): 51 | self.redis.delete('all-pipelines') 52 | 53 | def all_pipeline_ids(self): 54 | if self.is_init(): 55 | return [x.decode('utf-8') for x in self.redis.smembers('all-pipelines')] 56 | return [] 57 | 58 | def all_statuses(self): 59 | if self.is_init(): 60 | all_ids = self.redis.smembers('all-pipelines') 61 | pipe = self.redis.pipeline() 62 | for _id in sorted(all_ids): 63 | pipe.get(_id) 64 | return [json.loads(sts.decode('ascii')) for 
sts in pipe.execute()] 65 | return [] 66 | -------------------------------------------------------------------------------- /datapackage_pipelines/status/backend_sqlite.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sqlite3 3 | from datapackage_pipelines.utilities.extended_json import json 4 | 5 | DPP_DB_FILENAME = os.environ.get('DPP_DB_FILENAME', '.dpp.db') 6 | 7 | 8 | class Sqlite3Dict(object): 9 | def __init__(self, filename): 10 | self.filename = filename 11 | conn = sqlite3.connect(self.filename) 12 | cursor = conn.cursor() 13 | cursor.execute( 14 | '''CREATE TABLE IF NOT EXISTS d (_key text, _value text)''' 15 | ) 16 | conn.commit() 17 | conn.close() 18 | 19 | def __getitem__(self, key): 20 | conn = sqlite3.connect(self.filename) 21 | cursor = conn.cursor() 22 | result = cursor.execute( 23 | 'SELECT _value from d where _key=?', 24 | (key,) 25 | ).fetchone() 26 | conn.close() 27 | if result is not None: 28 | return json.loads(result[0]) 29 | return None 30 | 31 | def __setitem__(self, key, value): 32 | conn = sqlite3.connect(self.filename) 33 | value = json.dumps(value) 34 | cursor = conn.cursor() 35 | cursor.execute('DELETE FROM d where _key=?', (key,)) 36 | cursor.execute('INSERT INTO d VALUES (?,?)', (key, value)) 37 | conn.commit() 38 | conn.close() 39 | 40 | def __delitem__(self, key): 41 | conn = sqlite3.connect(self.filename) 42 | cursor = conn.cursor() 43 | cursor.execute('DELETE FROM d where _key=?', (key,)) 44 | conn.commit() 45 | conn.close() 46 | 47 | 48 | class SqliteBackend(object): 49 | 50 | KIND = 'sqlite3' 51 | ALL_PIPELINES_KEY = 'all-pipelines' 52 | 53 | def __init__(self): 54 | self.db = Sqlite3Dict(DPP_DB_FILENAME) 55 | 56 | def get_status(self, pipeline_id): 57 | return self.db[pipeline_id] 58 | 59 | def set_status(self, pipeline_id, status): 60 | self.db[pipeline_id] = status 61 | 62 | def del_status(self, pipeline_id): 63 | del self.db[pipeline_id] 64 | 65 | def register_pipeline_id(self, pipeline_id): 66 | all_pipelines = self.db[self.ALL_PIPELINES_KEY] 67 | if all_pipelines is None: 68 | all_pipelines = [] 69 | if pipeline_id not in all_pipelines: 70 | all_pipelines.append(pipeline_id) 71 | self.db[self.ALL_PIPELINES_KEY] = all_pipelines 72 | 73 | def deregister_pipeline_id(self, pipeline_id): 74 | all_pipelines = self.db[self.ALL_PIPELINES_KEY] 75 | if all_pipelines is None: 76 | all_pipelines = [] 77 | if pipeline_id in all_pipelines: 78 | all_pipelines = list(filter(lambda x: x != pipeline_id, all_pipelines)) 79 | self.db[self.ALL_PIPELINES_KEY] = all_pipelines 80 | 81 | def reset(self): 82 | self.db[self.ALL_PIPELINES_KEY] = [] 83 | 84 | def all_pipeline_ids(self): 85 | all_ids = sorted(self.db[self.ALL_PIPELINES_KEY]) 86 | return all_ids 87 | 88 | def all_statuses(self): 89 | all_ids = sorted(self.db[self.ALL_PIPELINES_KEY]) 90 | return [self.db[_id] for _id in all_ids] 91 | -------------------------------------------------------------------------------- /datapackage_pipelines/status/hook_sender.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from concurrent.futures import ThreadPoolExecutor 3 | 4 | import requests 5 | from requests.exceptions import RequestException 6 | 7 | tpe = ThreadPoolExecutor(max_workers=1) 8 | 9 | 10 | def _send(hook, payload): 11 | try: 12 | response = requests.post(hook, json=payload) 13 | if response.status_code != 200: 14 | logging.warning('Server returned %s, hook %s with payload %r ', 15 |
response.status_code, hook, payload) 16 | except RequestException as e: 17 | logging.warning('Failed to call hook %s with payload %r (%s)', 18 | hook, payload, e) 19 | 20 | 21 | class HookSender(): 22 | def send(self, hook, payload, blocking=False): 23 | if blocking: 24 | _send(hook, payload) 25 | else: 26 | tpe.submit(_send, hook, payload) 27 | 28 | 29 | hook_sender = HookSender() 30 | -------------------------------------------------------------------------------- /datapackage_pipelines/status/status_manager.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from .backend_redis import RedisBackend 4 | from .backend_filesystem import FilesystemBackend 5 | from .pipeline_status import PipelineStatus 6 | 7 | 8 | class StatusManager(object): 9 | 10 | def __init__(self, *, host=None, port=6379, root_dir='.'): 11 | self._host = host 12 | self._port = port 13 | self._backend = None 14 | self._root_dir = root_dir 15 | 16 | @property 17 | def backend(self): 18 | if self._backend is None: 19 | redis = RedisBackend(self._host, self._port) 20 | self._backend = redis if redis.is_init() else FilesystemBackend(self._root_dir) 21 | return self._backend 22 | 23 | def get_errors(self, _id): 24 | ex = self.get(_id).get_last_execution() 25 | if ex is not None: 26 | return ex.error_log 27 | return [] 28 | 29 | def initialize(self): 30 | self.backend.reset() 31 | 32 | def get(self, _id) -> PipelineStatus: 33 | return PipelineStatus(self.backend, _id) 34 | 35 | def all_statuses(self): 36 | return self.backend.all_statuses() 37 | 38 | def all_pipeline_ids(self): 39 | return self.backend.all_pipeline_ids() 40 | 41 | def deregister(self, pipeline_id): 42 | return self.get(pipeline_id).deregister() 43 | 44 | 45 | _status = None 46 | _root_dir = None 47 | 48 | 49 | def status_mgr(root_dir='.') -> StatusManager: 50 | global _status 51 | global _root_dir 52 | 53 | if _status is not None and _root_dir == root_dir: 54 | return _status 55 | _root_dir = root_dir 56 | _status = StatusManager(host=os.environ.get('DPP_REDIS_HOST'), root_dir=root_dir) 57 | return _status 58 | -------------------------------------------------------------------------------- /datapackage_pipelines/utilities/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/datapackage-pipelines/06c8ccbd44be420233b73563cfb5bd3eb37de7cc/datapackage_pipelines/utilities/__init__.py -------------------------------------------------------------------------------- /datapackage_pipelines/utilities/execution_id.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | 3 | 4 | def gen_execution_id(): 5 | return str(uuid.uuid4()) 6 | -------------------------------------------------------------------------------- /datapackage_pipelines/utilities/flow_utils.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow, load, update_package 2 | from dataflows.helpers.resource_matcher import ResourceMatcher 3 | 4 | from datapackage_pipelines.wrapper import ProcessorContext 5 | from datapackage_pipelines.utilities.extended_json import LazyJsonLine 6 | 7 | 8 | def load_lazy_json(resources): 9 | 10 | def func(package): 11 | matcher = ResourceMatcher(resources, package.pkg) 12 | yield package.pkg 13 | for rows in package: 14 | if matcher.match(rows.res.name): 15 | yield ( 16 | row.inner 17 | if isinstance(row, LazyJsonLine) 18 | 
else row 19 | for row in rows 20 | ) 21 | else: 22 | yield rows 23 | 24 | return func 25 | 26 | 27 | class MergeableStats(): 28 | def __init__(self, ds_stats, ctx_stats): 29 | self.ds_stats = ds_stats 30 | self.ctx_stats = ctx_stats 31 | 32 | def __iter__(self): 33 | if self.ds_stats is not None: 34 | for x in self.ds_stats: 35 | yield from x.items() 36 | if self.ctx_stats is not None: 37 | yield from self.ctx_stats.items() 38 | 39 | 40 | def spew_flow(flow, ctx: ProcessorContext): 41 | flow = Flow( 42 | update_package(**ctx.datapackage), 43 | load((ctx.datapackage, ctx.resource_iterator)), 44 | flow, 45 | ) 46 | datastream = flow.datastream() 47 | ctx.datapackage = datastream.dp.descriptor 48 | ctx.resource_iterator = datastream.res_iter 49 | ctx.stats = MergeableStats(datastream.stats, ctx.stats) 50 | -------------------------------------------------------------------------------- /datapackage_pipelines/utilities/lazy_dict.py: -------------------------------------------------------------------------------- 1 | import collections 2 | 3 | try: 4 | MutableMapping = collections.MutableMapping 5 | except: 6 | MutableMapping = collections.abc.MutableMapping 7 | 8 | class LazyDict(MutableMapping): 9 | 10 | def __init__(self): 11 | self._inner = None 12 | self._dirty = False 13 | 14 | @property 15 | def dirty(self): 16 | return self._dirty 17 | 18 | @property 19 | def inner(self): 20 | self.__ensure() 21 | return self._inner 22 | 23 | def _evaluate(self): 24 | raise NotImplementedError() 25 | 26 | def __ensure(self): 27 | if self._inner is None: 28 | self._inner = self._evaluate() 29 | 30 | def __len__(self): 31 | self.__ensure() 32 | return len(self._inner) 33 | 34 | def __getitem__(self, item): 35 | self.__ensure() 36 | return self._inner.__getitem__(item) 37 | 38 | def __setitem__(self, key, value): 39 | self.__ensure() 40 | self._inner.__setitem__(key, value) 41 | self._dirty = True 42 | 43 | def __delitem__(self, key): 44 | self.__ensure() 45 | self._inner.__delitem__(key) 46 | self._dirty = True 47 | 48 | def __iter__(self): 49 | self.__ensure() 50 | return self._inner.__iter__() 51 | -------------------------------------------------------------------------------- /datapackage_pipelines/utilities/resources.py: -------------------------------------------------------------------------------- 1 | def is_a_url(path): 2 | return (path is not None and isinstance(path, str) and 3 | (path.startswith('http://') or 4 | path.startswith('https://')) 5 | ) 6 | 7 | 8 | def tabular(descriptor): 9 | return 'schema' in descriptor 10 | 11 | 12 | def streaming(descriptor): 13 | return descriptor.get(PROP_STREAMING) 14 | 15 | 16 | def streamable(descriptor): 17 | return PROP_STREAMED_FROM in descriptor and \ 18 | not streaming(descriptor) 19 | 20 | 21 | def get_path(descriptor): 22 | path = descriptor.get('path') 23 | if isinstance(path, str): 24 | return path 25 | if isinstance(path, list): 26 | if len(path) > 0: 27 | return path.pop(0) 28 | else: 29 | return None 30 | assert path is None, '%r' % path 31 | return None 32 | 33 | 34 | PATH_PLACEHOLDER = '_' 35 | PROP_STREAMED_FROM = 'dpp:streamedFrom' 36 | PROP_STREAMING = 'dpp:streaming' 37 | -------------------------------------------------------------------------------- /datapackage_pipelines/utilities/stat_utils.py: -------------------------------------------------------------------------------- 1 | STATS_DPP_KEY = '.dpp' 2 | STATS_OUT_DP_URL_KEY = 'out-datapackage-url' 3 | 4 | 5 | def user_facing_stats(stats): 6 | if stats is not None and 
isinstance(stats, dict): 7 | return dict((k, v) for k, v in stats.items() if k != STATS_DPP_KEY) 8 | return None 9 | -------------------------------------------------------------------------------- /datapackage_pipelines/utilities/tabulator_txt_parser.py: -------------------------------------------------------------------------------- 1 | from tabulator.parser import Parser 2 | from tabulator.helpers import reset_stream 3 | 4 | 5 | class TXTParser(Parser): 6 | """Parser to parse TXT data format. 7 | """ 8 | 9 | # Public 10 | 11 | options = [] 12 | 13 | def __init__(self, loader, **options): 14 | super(TXTParser, self).__init__(loader, **options) 15 | 16 | # Set attributes 17 | self.__options = options 18 | self.__extended_rows = None 19 | self.__loader = loader 20 | self.__chars = None 21 | self.__encoding = None 22 | 23 | @property 24 | def closed(self): 25 | return self.__chars is None or self.__chars.closed 26 | 27 | def open(self, source, encoding=None, force_parse=False): 28 | self.close() 29 | self.__chars = self.__loader.load(source, encoding) 30 | self.__encoding = getattr(self.__chars, 'encoding', encoding) 31 | if self.__encoding: 32 | self.__encoding = self.__encoding.lower() 33 | self.reset() 34 | 35 | def close(self): 36 | if not self.closed: 37 | self.__chars.close() 38 | 39 | def reset(self): 40 | reset_stream(self.__chars) 41 | self.__extended_rows = self.__iter_extended_rows() 42 | 43 | @property 44 | def extended_rows(self): 45 | return self.__extended_rows 46 | 47 | @property 48 | def encoding(self): 49 | return self.__encoding 50 | 51 | # Private 52 | 53 | def __iter_extended_rows(self): 54 | for number, line in enumerate(self.__chars, start=1): 55 | if line.endswith('\n'): 56 | line = line[:-1] 57 | yield (number, None, [line]) 58 | -------------------------------------------------------------------------------- /datapackage_pipelines/web/__init__.py: -------------------------------------------------------------------------------- 1 | from .server import app 2 | -------------------------------------------------------------------------------- /datapackage_pipelines/wrapper/__init__.py: -------------------------------------------------------------------------------- 1 | from .wrapper import ingest, spew, process, \ 2 | get_dependency_datapackage_url, ProcessorContext 3 | -------------------------------------------------------------------------------- /docker/github_config.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import os 3 | 4 | if __name__ == "__main__": 5 | repos = os.environ.get('DPP_GITHUB_REPOSITORIES') 6 | if repos is not None: 7 | repos = repos.split(';') 8 | 9 | config = {} 10 | for repo in repos: 11 | repo = repo.split(':') 12 | if len(repo) > 1: 13 | repo, path = repo 14 | else: 15 | repo = repo[0] 16 | path = None 17 | config[repo] = { 18 | 'repository': repo, 19 | } 20 | if path is not None: 21 | config[repo]['base-path'] = path 22 | with open('github.source-spec.yaml', 'w') as source_spec: 23 | yaml.dump(config, source_spec) 24 | -------------------------------------------------------------------------------- /docker/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | list_descendants() { 4 | local root_pid=$1 5 | local children=$(for PID in `ps -o pid,ppid | grep " $root_pid"'$'`; do [ "$PID" != "$root_pid" ] && echo $PID; done) 6 | for PID in $children; do list_descendants "$PID"; done 7 | [ "$children" != "" ] && echo "$children" 8 | } 9 | 10 | if [
"$1" = "server" ]; then 11 | echo "Starting Server" 12 | redis-server /etc/redis.conf --daemonize yes --dir /var/redis 13 | until [ `redis-cli ping | grep -c PONG` = 1 ]; do echo "Waiting 1s for Redis to load"; sleep 1; done 14 | rm -f /var/run/dpp/dpp-celerybeat.pid /var/run/dpp/dpp-celeryd-management.pid /var/run/dpp/dpp-celeryd-worker.pid 15 | python /dpp/docker/github_config.py 16 | dpp init 17 | 18 | echo "Deleting `redis-cli -n 6 KEYS '*' | wc -l` keys" 19 | redis-cli -n 6 FLUSHDB 20 | echo "Remaining `redis-cli -n 6 KEYS '*' | wc -l` keys" 21 | 22 | SCHEDULER=1 python3 -m celery -b $DPP_CELERY_BROKER -A datapackage_pipelines.app -l INFO --pidfile=/var/run/dpp/dpp-celerybeat.pid beat & 23 | python3 -m celery -b $DPP_CELERY_BROKER --concurrency=1 -A datapackage_pipelines.app -Q datapackage-pipelines-management -l INFO --pidfile=/var/run/dpp/dpp-celeryd-management.pid worker & 24 | python3 -m celery -b $DPP_CELERY_BROKER --concurrency=$DPP_NUM_WORKERS -A datapackage_pipelines.app -Q datapackage-pipelines -l INFO --pidfile=/var/run/dpp/dpp-celeryd-worker.pid worker & 25 | dpp serve & 26 | DPP_SERVE_PID=$! 27 | sleep 5 28 | echo $DPP_SERVE_PID > /var/run/dpp/dpp-serve.pid 29 | wait $DPP_SERVE_PID 30 | rm -f /var/run/dpp/dpp-serve.pid 31 | exit 0 32 | elif [ "$1" = "server-reload" ]; then 33 | trap 'echo reloading...; while ! /dpp/docker/run.sh stop-server; do echo .; sleep 1; done' HUP 34 | while true; do 35 | /dpp/docker/run.sh server & 36 | wait $! 37 | done 38 | elif [ "$1" == "stop-server" ]; then 39 | DPP_SERVE_PID=`cat /var/run/dpp/dpp-serve.pid 2>/dev/null` && rm /var/run/dpp/dpp-serve.pid 40 | [ "$?" != "0" ] && echo missing dpp-serve.pid && exit 1 41 | DPP_SERVE_PIDS="$(list_descendants $DPP_SERVE_PID) $DPP_SERVE_PID" 42 | pstree -p 43 | echo collecting pids to terminate 44 | PIDS="" 45 | for PIDFILE in dpp-celeryd-worker dpp-celeryd-management dpp-celerybeat redis; do 46 | PID=`cat /var/run/dpp/$PIDFILE.pid 2>/dev/null` \ 47 | && PIDS="$PIDS $(list_descendants $PID) $PID" 48 | done 49 | if [ "$PIDS" != "" ]; then 50 | echo sending TERM signal for pids: ${PIDS} 51 | for PID in $PIDS; do kill $PID; done 52 | echo sleeping ${DPP_RELOAD_GRACE_PERIOD:-5} seconds before sending KILL signal 53 | sleep ${DPP_RELOAD_GRACE_PERIOD:-5} 54 | for PID in $PIDS; do kill -9 $PID 2>/dev/null; done 55 | echo ensuring all PIDS were terminated 56 | for PID in $PIDS; do kill -0 $PID 2>/dev/null \ 57 | && kill -9 $PID 2>/dev/null \ 58 | && echo sleeping ${DPP_RELOAD_TERMINATE_PERIOD:-2} seconds to allow process $PID to be KILLed \ 59 | && sleep ${DPP_RELOAD_TERMINATE_PERIOD:-2} \ 60 | && kill -0 $PID 2>/dev/null && echo $PID not killed && exit 1; done 61 | fi 62 | for PIDFILE in dpp-celeryd-worker dpp-celeryd-management dpp-celerybeat redis; do 63 | rm -f /var/run/dpp/$PIDFILE.pid 64 | done 65 | echo sending TERM signal to dpp-serve and descendats 66 | kill $DPP_SERVE_PIDS 2>/dev/null 67 | kill -0 $DPP_SERVE_PID 2>/dev/null && echo waiting up to 5 seconds to let dpp-serve to be killed peacefully \ 68 | && for i in 0 1 2 3 4 5; do ! 
kill -0 $DPP_SERVE_PID 2>/dev/null || sleep 1; done 69 | kill -9 $DPP_SERVE_PIDS 70 | sleep ${DPP_RELOAD_TERMINATE_PERIOD:-2} && kill -0 $DPP_SERVE_PID 2>/dev/null && echo dpp serve not killed && exit 1 71 | echo killed server PID $DPP_SERVE_PID 72 | pstree -p 73 | exit 0 74 | else 75 | /usr/local/bin/dpp "$@" 76 | fi; 77 | -------------------------------------------------------------------------------- /pylama.ini: -------------------------------------------------------------------------------- 1 | [pylama] 2 | linters = pyflakes,pep8 3 | ignore = E128,E301,E741 4 | 5 | [pylama:pep8] 6 | max_line_length = 120 7 | 8 | [pylama:*/__init__.py] 9 | ignore = W0611 10 | -------------------------------------------------------------------------------- /samples/add_constant.py: -------------------------------------------------------------------------------- 1 | # Add new column with constant value to first resource 2 | # Column name and value are taken from the processor's parameters 3 | from datapackage_pipelines.wrapper import process 4 | 5 | 6 | def modify_datapackage(datapackage, parameters, _): 7 | datapackage['resources'][0]['schema']['fields'].append({ 8 | 'name': parameters['column-name'], 9 | 'type': 'string' 10 | }) 11 | return datapackage 12 | 13 | 14 | def process_row(row, _1, _2, resource_index, parameters, _): 15 | if resource_index == 0: 16 | row[parameters['column-name']] = parameters['value'] 17 | return row 18 | 19 | 20 | process(modify_datapackage=modify_datapackage, 21 | process_row=process_row) 22 | -------------------------------------------------------------------------------- /samples/co2-information-cdiac.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/datapackage-pipelines/06c8ccbd44be420233b73563cfb5bd3eb37de7cc/samples/co2-information-cdiac.zip -------------------------------------------------------------------------------- /samples/pipeline-spec.yaml: -------------------------------------------------------------------------------- 1 | worldbank-co2-emissions: 2 | schedule: 3 | crontab: '0 * * * *' 4 | pipeline: 5 | - 6 | run: update_package 7 | parameters: 8 | name: 'co2-emissions' 9 | title: 'CO2 emissions [metric tons per capita]' 10 | homepage: 'http://worldbank.org/' 11 | - 12 | run: load 13 | parameters: 14 | from: "http://api.worldbank.org/v2/en/indicator/EN.ATM.CO2E.PC?downloadformat=excel" 15 | name: 'global-data' 16 | headers: 4 17 | format: xls 18 | - 19 | run: set_types 20 | parameters: 21 | resources: global-data 22 | types: 23 | "[12][0-9]{3}": 24 | type: number 25 | - 26 | run: dump_to_zip 27 | parameters: 28 | out-file: co2-emissions-wb.zip 29 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal=1 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | 6 | import os 7 | import io 8 | from setuptools import setup, find_packages 9 | 10 | 11 | # Helpers 12 | def read(*paths): 13 | """Read a text file.""" 14 | basedir = os.path.dirname(__file__) 15 | fullpath = os.path.join(basedir, *paths) 16 | contents = io.open(fullpath, 
encoding='utf-8').read().strip() 17 | return contents 18 | 19 | 20 | # Prepare 21 | PACKAGE = 'datapackage_pipelines' 22 | NAME = PACKAGE.replace('_', '-') 23 | INSTALL_REQUIRES = [ 24 | 'celery<5', 25 | 'requests', 26 | 'datapackage>=1.14.0', 27 | 'tableschema>=1.2.5', 28 | 'tableschema-sql>=0.10.4', 29 | 'pyyaml', 30 | 'ujson', 31 | 'mistune<2', 32 | 'markupsafe==2.0.1', 33 | 'redis>=3,<4', 34 | 'click<8.0', 35 | 'awesome-slugify', 36 | 'flask<2.0.0', 37 | 'flask-cors', 38 | 'flask-jsonpify', 39 | 'flask-basicauth', 40 | 'cachetools', 41 | 'tabulator>=1.50.0', 42 | 'globster>=0.1.0', 43 | 'dataflows>=0.2.11', 44 | 'python-dateutil<2.8.1', 45 | 'werkzeug<1.0', 46 | ] 47 | SPEEDUP_REQUIRES = [ 48 | 'dataflows[speedup]', 49 | ] 50 | LINT_REQUIRES = [ 51 | 'pylama', 52 | ] 53 | TESTS_REQUIRE = [ 54 | 'tox', 55 | 'sqlalchemy', 56 | ] 57 | README = read('README.md') 58 | VERSION = read(PACKAGE, 'VERSION') 59 | PACKAGES = find_packages(exclude=['examples', 'tests', '.tox']) 60 | 61 | # Run 62 | setup( 63 | name=NAME, 64 | version=VERSION, 65 | packages=PACKAGES, 66 | include_package_data=True, 67 | install_requires=INSTALL_REQUIRES, 68 | tests_require=TESTS_REQUIRE, 69 | extras_require={ 70 | 'develop': LINT_REQUIRES + TESTS_REQUIRE, 71 | 'speedup': SPEEDUP_REQUIRES, 72 | }, 73 | zip_safe=False, 74 | long_description=README, 75 | long_description_content_type='text/markdown', 76 | description='{{ DESCRIPTION }}', 77 | author='Open Knowledge Foundation', 78 | author_email='info@okfn.org', 79 | url='https://github.com/frictionlessdata/datapackage-pipelines', 80 | license='MIT', 81 | keywords=[ 82 | 'data', 83 | ], 84 | classifiers=[ 85 | 'Development Status :: 4 - Beta', 86 | 'Intended Audience :: Developers', 87 | 'License :: OSI Approved :: MIT License', 88 | 'Operating System :: OS Independent', 89 | 'Programming Language :: Python :: 3.6', 90 | 'Topic :: Software Development :: Libraries :: Python Modules', 91 | ], 92 | entry_points={ 93 | 'console_scripts': [ 94 | 'dpp = datapackage_pipelines.cli:cli', 95 | ] 96 | }, 97 | ) 98 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | from datapackage_pipelines.manager.logging_config import logging 2 | -------------------------------------------------------------------------------- /tests/cli/custom_formatters/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/datapackage-pipelines/06c8ccbd44be420233b73563cfb5bd3eb37de7cc/tests/cli/custom_formatters/__init__.py -------------------------------------------------------------------------------- /tests/cli/custom_formatters/xlsx_format.py: -------------------------------------------------------------------------------- 1 | from datapackage_pipelines.lib.dump.file_formats import CSVFormat, get_path 2 | import os 3 | import openpyxl 4 | 5 | 6 | class XLSXFormat(CSVFormat): 7 | 8 | def prepare_resource(self, resource): 9 | super(XLSXFormat, self).prepare_resource(resource) 10 | basename, _ = os.path.splitext(get_path(resource)) 11 | resource['path'] = basename + '.xlsx' 12 | resource['format'] = 'xlsx' 13 | 14 | def initialize_file(self, file, headers): 15 | self.file = file 16 | self.headers = headers 17 | wb = openpyxl.Workbook() 18 | wb.active.append(self.headers) 19 | return wb 20 | 21 | def write_transformed_row(self, writer, transformed_row, fields): 22 | 
writer.active.append([transformed_row[k] for k in self.headers]) 23 | 24 | def finalize_file(self, writer): 25 | writer.save(self.file.name) 26 | -------------------------------------------------------------------------------- /tests/cli/expected_flow_data.csv: -------------------------------------------------------------------------------- 1 | first_name,last_name,house,age,foo 2 | Tyrion,Lannister,Lannister,27,foo 3 | Jaime,Lannister,Lannister,34,foo 4 | Cersei,Lannister,Lannister,34,foo 5 | Jon,Snow,Stark,17,foo 6 | Sansa,Stark,Stark,14,foo 7 | Arya,Stark,Stark,11,foo 8 | Bran,Stark,Stark,10,foo 9 | Rickon,Stark,Stark,5,foo 10 | Daenerys,Targaryen,Targaryen,16,foo 11 | -------------------------------------------------------------------------------- /tests/cli/pipeline-spec.yaml: -------------------------------------------------------------------------------- 1 | raise-exception: 2 | pipeline: 3 | - run: raise_exception 4 | code: raise Exception() 5 | 6 | failure-no-errors: 7 | pipeline: 8 | - run: success 9 | code: "" 10 | 11 | success: 12 | pipeline: 13 | - run: success 14 | code: | 15 | from datapackage_pipelines.wrapper import ingest, spew 16 | parameters, datapackage, resources = ingest() 17 | spew(datapackage, []) 18 | 19 | verbose-logs-with-sleep: 20 | pipeline: 21 | - run: code 22 | code: | 23 | from datapackage_pipelines.wrapper import ingest, spew 24 | import logging, itertools, time 25 | log_numbers = itertools.count() 26 | def log_line(): 27 | logging.info('log line {}'.format(next(log_numbers))) 28 | log_line() 29 | time.sleep(.1) 30 | log_line() 31 | time.sleep(.1) 32 | log_line() 33 | time.sleep(.1) 34 | parameters, datapackage, resources = ingest() 35 | log_line() 36 | time.sleep(.1) 37 | log_line() 38 | time.sleep(.1) 39 | log_line() 40 | time.sleep(.1) 41 | spew(datapackage, []) 42 | log_line() 43 | time.sleep(.1) 44 | log_line() 45 | time.sleep(.1) 46 | log_line() 47 | 48 | load-resource-progress-log: 49 | pipeline: 50 | - run: load_resource 51 | parameters: 52 | url: ../data/datapackage.json 53 | resource: my-spiffy-resource 54 | log-progress-rows: 2 55 | 56 | custom-formatters: 57 | pipeline: 58 | - run: load_resource 59 | parameters: 60 | url: ../data/datapackage.json 61 | resource: my-spiffy-resource 62 | - run: duplicate 63 | parameters: 64 | source: my-spiffy-resource 65 | target-name: my-spiffy-xlsx-resource 66 | target-path: my-spiffy-resource.xlsx 67 | - run: dump.to_path 68 | parameters: 69 | out-path: custom_formatters 70 | force-format: false 71 | counters: 72 | resource-hash: '' 73 | file-formatters: 74 | xlsx: custom_formatters.xlsx_format.XLSXFormat 75 | - run: dump.to_zip 76 | parameters: 77 | out-file: custom_formatters/datapackage.zip 78 | force-format: false 79 | counters: 80 | resource-hash: '' 81 | file-formatters: 82 | xlsx: custom_formatters.xlsx_format.XLSXFormat 83 | 84 | dataflows: 85 | pipeline: 86 | - run: load_resource 87 | parameters: 88 | url: ../data/datapackage.json 89 | resource: my-spiffy-resource 90 | - flow: test_flow 91 | parameters: 92 | attr: foo 93 | - run: dump_to_path 94 | parameters: 95 | out-path: test_flow_data 96 | -------------------------------------------------------------------------------- /tests/cli/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name="dpp_tests_cli", 5 | packages=["custom_formatters"] 6 | ) 7 | -------------------------------------------------------------------------------- 
/tests/cli/test_cli_exit_codes.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ! dpp \ 4 | && echo "test failed: dpp returned with non-zero exit code $?" && exit 1 5 | 6 | dpp run ./tests/cli/raise-exception \ 7 | && echo "test failed: exception in pipeline returned successful exit code" && exit 1 8 | 9 | dpp run ./tests/cli/failure-no-errors \ 10 | && echo "test failed: pipeline that failed without errors returned successful exit code" && exit 1 11 | 12 | ! dpp run ./tests/cli/success \ 13 | && echo "test failed: success pipeline returned with non-zero exit code $?" && exit 1 14 | 15 | dpp run --concurrency 4 \ 16 | ./tests/cli/raise-exception,./tests/env/dummy/pipeline-test-data%,./tests/cli/failure-no-errors \ 17 | && echo "test failed: concurrent run with failures returned successful exit code" && exit 1 18 | 19 | ! dpp run --concurrency 2 \ 20 | ./tests/cli/success,./tests/cli/verbose-logs-with-sleep,./tests/env/dummy/pipeline-test-data% \ 21 | && echo "test failed: concurrent run without failures returned non-zero exit code $?" && exit 1 22 | 23 | echo "Great Success" 24 | exit 0 25 | -------------------------------------------------------------------------------- /tests/cli/test_cli_logs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | TEMPDIR=`mktemp -d` 4 | 5 | ! script -ec "dpp run --verbose ./tests/cli/verbose-logs-with-sleep" $TEMPDIR/verbose_log && echo failed to run with --verbose && exit 1 6 | cat -v $TEMPDIR/verbose_log | grep '\^\[\[[0-9][0-9]*A' && echo running with --verbose - found terminal escape sequences && exit 1 7 | 8 | ! script -ec "dpp run ./tests/cli/verbose-logs-with-sleep" $TEMPDIR/log && echo failed to run without verbose && exit 1 9 | ! cat -v $TEMPDIR/log | grep '\^\[\[[0-9][0-9]*A' && echo running without verbose - did not find terminal escape sequences && exit 1 10 | 11 | ! OUTPUT=`dpp run --verbose ./tests/cli/load-resource-progress-log 2>&1` && echo failed to run load-resource-progress && exit 1 12 | for i in 2 4 6 8; do 13 | ! echo $OUTPUT | grep -q "loaded $i rows" && echo failed to detect load resource log && exit 1 14 | done 15 | 16 | rm -rf "${TEMPDIR}" 17 | 18 | echo Great Success 19 | exit 0 20 | -------------------------------------------------------------------------------- /tests/cli/test_custom_formatters.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | pip install -e tests/cli 4 | pip install openpyxl 5 | 6 | OUTPUT_FILES="tests/cli/custom_formatters/my-spiffy-resource.xlsx 7 | tests/cli/custom_formatters/sample.csv 8 | tests/cli/custom_formatters/datapackage.json 9 | tests/cli/custom_formatters/datapackage.zip" 10 | 11 | rm -f $OUTPUT_FILES 12 | 13 | ! dpp run ./tests/cli/custom-formatters && echo failed to run custom formatters pipeline && exit 1 14 | 15 | ! ls -lah $OUTPUT_FILES && echo missing custom formatters output files && exit 1 16 | 17 | validate_lannisters() { 18 | NUM_LANNISTERS=$(python - </dev/null 29 | ! validate_lannisters && exit 1 30 | popd >/dev/null 31 | 32 | DATAPACKAGE_ZIP=`pwd`/tests/cli/custom_formatters/datapackage.zip 33 | TEMP_DIR=`mktemp -d` 34 | pushd $TEMP_DIR >/dev/null 35 | unzip "${DATAPACKAGE_ZIP}" 36 | ! 
validate_lannisters && exit 1 37 | popd >/dev/null 38 | 39 | rm -rf $TEMP_DIR 40 | rm -f $OUTPUT_FILES 41 | 42 | echo Great Success 43 | exit 0 44 | -------------------------------------------------------------------------------- /tests/cli/test_exclude_dirnames.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ! dpp | grep ./tests/cli && echo missing tests/cli pipelines && exit 1 4 | ! dpp | grep ./samples/worldbank && echo missing samples pipelines && exit 1 5 | ! dpp | grep ./tests/env/ && echo missing tests/env pipelines && exit 1 6 | ! dpp | grep ./tests/docker/ && echo missing tests/docker pipelines && exit 1 7 | 8 | echo "env 9 | /samples 10 | /tests/cli" > .dpp_spec_ignore 11 | 12 | dpp | grep ./tests/cli && echo tests/cli pipelines not excluded && exit 1 13 | dpp | grep ./samples/worldbank && echo samples pipelines not excluded && exit 1 14 | dpp | grep ./tests/env/ && echo tests/env pipelines not excluded && exit 1 15 | ! dpp | grep ./tests/docker/ && echo missing tests/docker pipelines && exit 1 16 | 17 | rm .dpp_spec_ignore 18 | 19 | echo Great Success 20 | exit 0 21 | -------------------------------------------------------------------------------- /tests/cli/test_flow.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow, dump_to_path, PackageWrapper, load, update_package 2 | 3 | 4 | def hello_dataflows(package: PackageWrapper): 5 | print('hello dataflows') 6 | yield package.pkg 7 | yield from package 8 | 9 | 10 | def flow(parameters, datapackage, resources, stats): 11 | stats['foo_values'] = 0 12 | 13 | def add_foo_field(package: PackageWrapper): 14 | package.pkg.descriptor['resources'][0]['schema']['fields'] += [ 15 | {'name': parameters['attr'], 'type': 'string'}] 16 | yield package.pkg 17 | yield from package 18 | 19 | def add_foo_value(row): 20 | row[parameters['attr']] = 'foo' 21 | stats['foo_values'] += 1 22 | 23 | return Flow(update_package(name='_'), 24 | hello_dataflows, 25 | add_foo_field, 26 | add_foo_value) 27 | -------------------------------------------------------------------------------- /tests/cli/test_flow.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cd tests/cli 4 | 5 | rm -rf test_flow_data 6 | 7 | TEMPFILE=`mktemp` 8 | 9 | set -o pipefail 10 | ! dpp run --verbose ./dataflows >/dev/stdout 2>&1 | tee $TEMPFILE && echo failed to run dataflows pipeline && exit 1 11 | set +o pipefail 12 | ! cat "${TEMPFILE}" | grep "hello dataflows" && echo dataflows output is missing && exit 1 13 | ! cat "${TEMPFILE}" | grep "'foo_values': 9" && echo dataflows output is missing stats && exit 1 14 | rm $TEMPFILE 15 | 16 | ! 
diff test_flow_data/sample.csv expected_flow_data.csv && echo unexpected output data && exit 1 17 | 18 | echo Great Success 19 | exit 0 20 | -------------------------------------------------------------------------------- /tests/data/datapackage.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "my-spiffy-datapackage", 3 | "my-prop": "the-props-value", 4 | "resources": [ 5 | { 6 | "name": "my-spiffy-resource", 7 | "path": "sample.csv", 8 | "schema": { 9 | "fields": [ 10 | {"name": "first_name", "type": "string"}, 11 | {"name": "last_name", "type": "string"}, 12 | {"name": "house", "type": "string"}, 13 | {"name": "age", "type": "integer"} 14 | ], 15 | "primaryKey": [ 16 | "first_name", "last_name" 17 | ] 18 | } 19 | } 20 | ] 21 | } 22 | -------------------------------------------------------------------------------- /tests/data/datapackage2.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "my-spiffy-datapackage", 3 | "resources": [ 4 | { 5 | "name": "my-spiffy-resource", 6 | "path": "sample.csv", 7 | "schema": { 8 | "fields": [ 9 | {"name": "first_name", "type": "string"}, 10 | {"name": "last_name", "type": "string"}, 11 | {"name": "house", "type": "string"}, 12 | {"name": "age", "type": "integer"} 13 | ] 14 | } 15 | }, 16 | { 17 | "name": "the-spiffy-resource", 18 | "path": "sample.csv", 19 | "schema": { 20 | "fields": [ 21 | {"name": "first_name", "type": "string"}, 22 | {"name": "last_name", "type": "string"}, 23 | {"name": "house", "type": "string"}, 24 | {"name": "age", "type": "integer"} 25 | ] 26 | } 27 | }, 28 | { 29 | "name": "the-other-spiffy-resource", 30 | "path": "sample2.csv", 31 | "schema": { 32 | "fields": [ 33 | {"name": "first_name", "type": "string"}, 34 | {"name": "last_name", "type": "string"}, 35 | {"name": "house", "type": "string"}, 36 | {"name": "age", "type": "integer"} 37 | ] 38 | } 39 | } 40 | ] 41 | } 42 | -------------------------------------------------------------------------------- /tests/data/datapackage3.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "my-spiffy-datapackage", 3 | "my-prop": "the-props-value", 4 | "resources": [ 5 | { 6 | "name": "my-spiffy-resource", 7 | "path": "sample.dups.csv", 8 | "schema": { 9 | "fields": [ 10 | {"name": "first_name", "type": "string"}, 11 | {"name": "last_name", "type": "string"}, 12 | {"name": "house", "type": "string"}, 13 | {"name": "age", "type": "integer"} 14 | ], 15 | "primaryKey": [ 16 | "first_name", "last_name" 17 | ] 18 | } 19 | } 20 | ] 21 | } 22 | -------------------------------------------------------------------------------- /tests/data/sample.csv: -------------------------------------------------------------------------------- 1 | first_name,last_name,house,age 2 | Tyrion,Lannister,Lannister,27 3 | Jaime,Lannister,Lannister,34 4 | Cersei,Lannister,Lannister,34 5 | Jon,Snow,Stark,17 6 | Sansa,Stark,Stark,14 7 | Arya,Stark,Stark,11 8 | Bran,Stark,Stark,10 9 | Rickon,Stark,Stark,5 10 | Daenerys,Targaryen,Targaryen,16 11 | -------------------------------------------------------------------------------- /tests/data/sample.dups.csv: -------------------------------------------------------------------------------- 1 | first_name,last_name,house,age 2 | Tyrion,Lannister,Lannister,27 3 | Jaime,Lannister,Lannister,34 4 | Cersei,Lannister,Lannister,34 5 | Jon,Snow,Stark,17 6 | Sansa,Stark,Stark,14 7 | Sansa,Stark,Stark,14 8 | Arya,Stark,Stark,11 9 
| Bran,Stark,Stark,10 10 | Rickon,Stark,Stark,5 11 | Daenerys,Targaryen,Targaryen,16 12 | -------------------------------------------------------------------------------- /tests/data/sample.txt: -------------------------------------------------------------------------------- 1 | <<< tabulator has html decection, keeping that causes the failure which we want to test 2 | This is a plain text file - not a CSV file! 3 | testing 4 | one two three -------------------------------------------------------------------------------- /tests/data/sample.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/datapackage-pipelines/06c8ccbd44be420233b73563cfb5bd3eb37de7cc/tests/data/sample.zip -------------------------------------------------------------------------------- /tests/data/sample2.csv: -------------------------------------------------------------------------------- 1 | first_name,last_name,house,age 2 | Tyrion,Lannister,Lannister,27 3 | Jaime,Lannister,Lannister,34 4 | Cersei,Lannister,Lannister,34 5 | -------------------------------------------------------------------------------- /tests/docker/.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | -------------------------------------------------------------------------------- /tests/docker/lib/dpp_docker_test.py: -------------------------------------------------------------------------------- 1 | DPP_DOCKER_TEST=True 2 | -------------------------------------------------------------------------------- /tests/docker/lib/setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | setup( 4 | name="dpp_docker_test", 5 | py_modules=['dpp_docker_test'] 6 | ) 7 | -------------------------------------------------------------------------------- /tests/docker/pipeline-spec.yaml: -------------------------------------------------------------------------------- 1 | test: 2 | pipeline: 3 | - run: test 4 | - run: dump_to_path 5 | parameters: 6 | out-path: data 7 | 8 | test-sleep: 9 | pipeline: 10 | - run: sleep 11 | code: | 12 | import os 13 | os.system('sleep 86400') 14 | 15 | test-package: 16 | pipeline: 17 | - run: test 18 | parameters: 19 | test-package: true 20 | - run: dump_to_path 21 | parameters: 22 | out-path: data/test_package 23 | -------------------------------------------------------------------------------- /tests/docker/test.py: -------------------------------------------------------------------------------- 1 | from datapackage_pipelines.wrapper import ingest, spew 2 | from datapackage_pipelines.utilities.resources import PROP_STREAMING 3 | import datetime 4 | 5 | parameters, datapackage, resources, stats = tuple(ingest()) + ({},) 6 | 7 | 8 | if parameters.get('test-package'): 9 | from dpp_docker_test import DPP_DOCKER_TEST 10 | assert DPP_DOCKER_TEST 11 | 12 | 13 | datapackage['resources'] = [{'name': 'test', 'path': 'test.csv', 14 | PROP_STREAMING: True, 15 | 'schema': {'fields': [{'name': 'a', 'type': 'string'}]}}] 16 | 17 | 18 | spew(datapackage, [({'a': 'foo'}, {'a': 'bar'})], {'last_run_time': str(datetime.datetime.now())}) 19 | -------------------------------------------------------------------------------- /tests/docker/test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | sudo rm -rf tests/docker/data 4 | 5 | ! 
docker run -v `pwd`/tests/docker:/pipelines:rw frictionlessdata/datapackage-pipelines run ./test \ 6 | && echo failed to run docker && exit 1 7 | 8 | ! ls -lah tests/docker/data/datapackage.json tests/docker/data/test.csv \ 9 | && echo failed to find output files from docker run && exit 1 10 | 11 | sudo rm -rf tests/docker/data 12 | 13 | ! docker run -d --name dpp -v `pwd`/tests/docker:/pipelines:rw frictionlessdata/datapackage-pipelines server-reload \ 14 | && echo failed to start daemonized docker container && exit 1 15 | 16 | for i in 1 2 3 4 5 6 7 8 9; do 17 | sleep 10 18 | ls -lah tests/docker/data/test.csv 2>/dev/null && break 19 | echo . 20 | done 21 | 22 | ! ls -lah tests/docker/data/datapackage.json tests/docker/data/test.csv \ 23 | && docker logs dpp && echo Failed to detect output data from daemonized docker container && exit 1 24 | 25 | ls -lah tests/docker/data/test_package 2>/dev/null \ 26 | && docker logs dpp && echo detected test_package data && exit 1 27 | 28 | ! docker exec dpp sh -c "cd lib; python3 setup.py install" \ 29 | && echo failed to install docker test package && exit 1 30 | 31 | ! docker kill -s HUP dpp \ 32 | && docker logs && echo failed to send HUP to docker && exit 1 33 | 34 | for i in 1 2 3 4 5 6 7 8 9; do 35 | sleep 10 36 | ls -lah tests/docker/data/test_package/test.csv 2>/dev/null && break 37 | echo . 38 | done 39 | 40 | ! ls -lah tests/docker/data/test_package/datapackage.json tests/docker/data/test_package/test.csv \ 41 | && docker logs dpp && echo Failed to detect test package output data from daemonized docker container && exit 1 42 | 43 | docker logs dpp 44 | 45 | docker rm --force dpp 46 | 47 | sudo rm -rf tests/docker 48 | 49 | echo Great Success 50 | exit 0 51 | -------------------------------------------------------------------------------- /tests/env/common/pipeline-common.py: -------------------------------------------------------------------------------- 1 | from datapackage_pipelines.wrapper import ingest, spew 2 | 3 | params, datapackage, res_iter = ingest() 4 | for res in datapackage['resources']: 5 | res['profile'] = 'tabular-data-resource' 6 | spew(datapackage, res_iter) 7 | -------------------------------------------------------------------------------- /tests/env/dummy/big-outputs.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import itertools 3 | import os 4 | 5 | from datapackage_pipelines.wrapper import ingest, spew 6 | from datapackage_pipelines.utilities.resources import PROP_STREAMING 7 | 8 | params, dp, res_iter = ingest() 9 | 10 | big_string = 'z'*64*1024 11 | 12 | logging.info('Look at me %s', big_string) 13 | 14 | dp['name'] = 'a' 15 | dp['resources'].append({ 16 | 'name': 'aa%f' % os.getpid(), 17 | 'path': 'data/bla.csv', 18 | 'schema': { 19 | 'fields': [ 20 | {'name': 'a', 'type': 'string'} 21 | ] 22 | }, 23 | 'very-large-prop': '*' * 100 * 1024, 24 | PROP_STREAMING: True 25 | }) 26 | 27 | res = iter([{'a': big_string}]) 28 | 29 | spew(dp, itertools.chain(res_iter, [res])) 30 | -------------------------------------------------------------------------------- /tests/env/dummy/pipeline-test-supplier-titleize.py: -------------------------------------------------------------------------------- 1 | from datapackage_pipelines.wrapper import ingest, spew 2 | 3 | params, datapackage, res_iter = ingest() 4 | 5 | key = params['key'] 6 | 7 | 8 | def process_resources(_res_iter): 9 | for res in _res_iter: 10 | def process_res(_res): 11 | for line in _res: 12 | if key in line: 
13 | line[key] = line[key].title() 14 | yield line 15 | yield process_res(res) 16 | 17 | spew(datapackage, process_resources(res_iter)) 18 | -------------------------------------------------------------------------------- /tests/env/dummy/types.csv: -------------------------------------------------------------------------------- 1 | string,number,integer,boolean,object,array,date,time,datetime,year,yearmonth,duration,geopoint,geojson 2 | "My name is Josef",1.23,10,true,{},"[1,2,3]",2015-01-31,03:00:10,2015-01-31T03:00:10Z,2015,2015-12,P3Y6M4DT12H30M5S,"90, 45","{""type"": ""Feature"",""geometry"": {""type"": ""Point"",""coordinates"": [125.6, 10.1]},""properties"": {""name"": ""Dinagat Islands""}}" 3 | "",NaN,0,True,{},[],2015-02-28,13:34:39,2015-02-28T13:34:39Z,2525,1982-01,"P3,5Y","180, -90", 4 | "",inf,0,FALSE,"{""a"":1}","[""a"",""b""]",1970-01-01,23:59:59,1970-01-01T23:59:59Z,9999,9999-09,P300YT5.2S,"12.2, 12.3", 5 | "",-inF,0,0,{},[],1900-01-01,00:00:00,1900-01-01T00:00:00Z,0000,0000-01,PT0S,"27.0, -90.0", 6 | "",1.2e2,0,1,{},[],1405-12-30,12:34:56,1405-12-30T12:34:56Z,1786,1786-09,P999Y9999M9999DT9999H9999M9999S,"0,0", -------------------------------------------------------------------------------- /tests/env/extract-year.py: -------------------------------------------------------------------------------- 1 | from datapackage_pipelines.wrapper import ingest, spew 2 | 3 | params, datapackage, res_iter = ingest() 4 | 5 | from_key = params['from-key'] 6 | to_key = params['to-key'] 7 | 8 | 9 | def process_resources(_res_iter): 10 | for res in _res_iter: 11 | def process_res(_res): 12 | for line in _res: 13 | if from_key in line: 14 | line[to_key] = line[from_key].year 15 | yield line 16 | yield process_res(res) 17 | 18 | 19 | for resource in datapackage['resources']: 20 | if len(list(filter(lambda field: field['name'] == from_key, resource.get('schema',{}).get('fields',[])))) > 0: 21 | resource['schema']['fields'].append({ 22 | 'name': to_key, 23 | 'osType': 'date:fiscal-year', 24 | 'type': 'integer' 25 | }) 26 | 27 | spew(datapackage, process_resources(res_iter)) 28 | -------------------------------------------------------------------------------- /tests/serve/html_output.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow 2 | import logging 3 | 4 | 5 | class MyClass(): 6 | pass 7 | 8 | 9 | def flow(*_): 10 | logging.info('my_object=' + str(MyClass())) 11 | return Flow() 12 | -------------------------------------------------------------------------------- /tests/serve/pipeline-spec.yaml: -------------------------------------------------------------------------------- 1 | html-output: 2 | pipeline: 3 | - flow: html_output 4 | -------------------------------------------------------------------------------- /tests/sitecustomize.py: -------------------------------------------------------------------------------- 1 | import os 2 | import coverage 3 | 4 | os.environ['COVERAGE_PROCESS_START']= os.path.join(os.environ["PWD"], 'tox.ini') 5 | coverage.process_startup() 6 | 7 | -------------------------------------------------------------------------------- /tests/stdlib/README.md: -------------------------------------------------------------------------------- 1 | # tests for the pipelines standard library 2 | 3 | ## fixtures 4 | 5 | Each file in the fixtures sub-directory corresponds to the parameters of a single test to run.
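For illustration, a minimal made-up fixture for the `sort` processor could look like the sketch below (the resource and field names are invented for this example; the section order and the `--` separators it relies on are described right after it):

```
sort
--
{
    "resources": ["my-resource"],
    "sort-by": "{id}"
}
--
{
    "name": "test",
    "resources": [
        {
            "name": "my-resource",
            "dpp:streaming": true,
            "path": "my-resource.csv",
            "schema": {"fields": [{"name": "id", "type": "integer"}]}
        }
    ]
}
--
{"id": 2}
{"id": 1}
--
{
    "name": "test",
    "profile": "data-package",
    "resources": [
        {
            "name": "my-resource",
            "dpp:streaming": true,
            "path": "my-resource.csv",
            "profile": "data-resource",
            "schema": {"fields": [{"name": "id", "type": "integer"}]}
        }
    ]
}
--
{"id": 1}
{"id": 2}

{}
```

As in the existing fixtures, the expected output rows are followed by a blank line and the expected stats object (here just `{}`).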
6 | 7 | The parameters are laid out in the file, separated by `\n--\n` 8 | 9 | This is the order of parameters: 10 | 11 | * `processor` - name of the processor to run 12 | * `params` - parameters 13 | * `dp_in` - input datapackage 14 | * `data_in` - input data 15 | * `dp_out` - expected output datapackage 16 | * `data_out` - expected output data 17 | 18 | ## setting up the test environment and running a specific test 19 | 20 | * `pip install -e .[develop]` 21 | * `py.test -svk name-of-the-fixture` 22 | -------------------------------------------------------------------------------- /tests/stdlib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/datapackage-pipelines/06c8ccbd44be420233b73563cfb5bd3eb37de7cc/tests/stdlib/__init__.py -------------------------------------------------------------------------------- /tests/stdlib/fixtures/add_resource_existent_env: -------------------------------------------------------------------------------- 1 | add_resource 2 | -- 3 | { 4 | "name": "my-env-resource", 5 | "url": "env://EXISTENT_ENV" 6 | } 7 | -- 8 | { 9 | "name": "test", 10 | "resources": [] 11 | } 12 | -- 13 | -- 14 | { 15 | "name": "test", 16 | "resources": [ 17 | { 18 | "name": "my-env-resource", 19 | "dpp:streamedFrom": "tests/data/sample.csv", 20 | "path": "_" 21 | } 22 | ] 23 | } 24 | -- 25 | {} 26 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/dump_to_sql_update_mode__insert: -------------------------------------------------------------------------------- 1 | dump_to_sql 2 | -- 3 | { 4 | "tables": { 5 | "test": { 6 | "resource-name": "my-spiffy-resource", 7 | "mode": "update" 8 | } 9 | } 10 | } 11 | -- 12 | { 13 | "name": "test", 14 | "resources": [ 15 | { 16 | "name": "my-spiffy-resource", 17 | "dpp:streaming": true, 18 | "path": "data/my-data.csv", 19 | "schema": { 20 | "fields": [ 21 | {"name": "id", "type": "integer"}, 22 | {"name": "mystring", "type": "string"}, 23 | {"name": "mynumber", "type": "number"}, 24 | {"name": "mydate", "type": "date"} 25 | ], 26 | "primaryKey": ["id"] 27 | } 28 | } 29 | ] 30 | } 31 | -- 32 | {"id": 1, "mystring":"a", "mynumber": 2.0, "mydate": {"type{date}": "2016-12-31"}} 33 | -- 34 | { 35 | "name": "test", 36 | "profile": "data-package", 37 | "resources": [ 38 | { 39 | "name": "my-spiffy-resource", 40 | "dpp:streaming": true, 41 | "path": "data/my-data.csv", 42 | "profile": "data-resource", 43 | "schema": { 44 | "fields": [ 45 | {"name": "id", "type": "integer"}, 46 | {"name": "mystring", "type": "string"}, 47 | {"name": "mynumber", "type": "number"}, 48 | {"name": "mydate", "type": "date"} 49 | ], 50 | "primaryKey": ["id"] 51 | } 52 | } 53 | ] 54 | } 55 | -- 56 | {"id": 1, "mystring":"a", "mynumber": {"type{decimal}": "2.0"}, "mydate": {"type{date}": "2016-12-31"}} 57 | 58 | {"bytes": null, "count_of_rows": 1, "dataset_name": "test", "hash": "5dad5b7c7fb3fecb7478b4f34fabbd23"} 59 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/dump_to_sql_update_mode__update: -------------------------------------------------------------------------------- 1 | dump_to_sql 2 | -- 3 | { 4 | "tables": { 5 | "test": { 6 | "resource-name": "my-spiffy-resource", 7 | "mode": "update" 8 | } 9 | } 10 | } 11 | -- 12 | { 13 | "name": "test", 14 | "resources": [ 15 | { 16 | "name": "my-spiffy-resource", 17 | "dpp:streaming": true, 18 | "path": "data/my-data.csv", 19 | 
"schema": { 20 | "fields": [ 21 | {"name": "id", "type": "integer"}, 22 | {"name": "mystring", "type": "string"}, 23 | {"name": "mynumber", "type": "number"}, 24 | {"name": "mydate", "type": "date"} 25 | ], 26 | "primaryKey": ["id"] 27 | } 28 | } 29 | ] 30 | } 31 | -- 32 | {"id": 1, "mystring":"a", "mynumber": 2.0, "mydate": {"type{date}": "2016-12-31"}} 33 | -- 34 | { 35 | "name": "test", 36 | "profile": "data-package", 37 | "resources": [ 38 | { 39 | "name": "my-spiffy-resource", 40 | "dpp:streaming": true, 41 | "path": "data/my-data.csv", 42 | "profile": "data-resource", 43 | "schema": { 44 | "fields": [ 45 | {"name": "id", "type": "integer"}, 46 | {"name": "mystring", "type": "string"}, 47 | {"name": "mynumber", "type": "number"}, 48 | {"name": "mydate", "type": "date"} 49 | ], 50 | "primaryKey": ["id"] 51 | } 52 | } 53 | ] 54 | } 55 | -- 56 | {"id": 1, "mystring":"a", "mynumber": {"type{decimal}": "2.0"}, "mydate": {"type{date}": "2016-12-31"}} 57 | 58 | {"bytes": null, "count_of_rows": 1, "dataset_name": "test", "hash": "5dad5b7c7fb3fecb7478b4f34fabbd23"} 59 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/dump_to_sql_with_updated_data: -------------------------------------------------------------------------------- 1 | dump_to_sql 2 | -- 3 | { 4 | "tables": { 5 | "test": { 6 | "resource-name": "my-spiffy-resource" 7 | } 8 | }, 9 | "updated_column": "updated", 10 | "updated_id_column": "updated_id" 11 | } 12 | -- 13 | { 14 | "name": "test", 15 | "resources": [ 16 | { 17 | "name": "my-spiffy-resource", 18 | "dpp:streaming": true, 19 | "path": "data/my-data.csv", 20 | "schema": { 21 | "fields": [ 22 | {"name": "mystring", "type": "string"}, 23 | {"name": "myinteger", "type": "integer"}, 24 | {"name": "mynumber", "type": "number"}, 25 | {"name": "mydate", "type": "date"} 26 | ] 27 | } 28 | } 29 | ] 30 | } 31 | -- 32 | {"mystring":"a", "mynumber": 2.0, "mydate": {"type{date}": "2016-12-31"}} 33 | -- 34 | { 35 | "name": "test", 36 | "profile": "data-package", 37 | "resources": [ 38 | { 39 | "name": "my-spiffy-resource", 40 | "dpp:streaming": true, 41 | "path": "data/my-data.csv", 42 | "profile": "data-resource", 43 | "schema": { 44 | "fields": [ 45 | {"name": "mystring", "type": "string"}, 46 | {"name": "myinteger", "type": "integer"}, 47 | {"name": "mynumber", "type": "number"}, 48 | {"name": "mydate", "type": "date"} 49 | ] 50 | } 51 | } 52 | ] 53 | } 54 | -- 55 | {"mystring":"a", "mynumber": {"type{decimal}": "2.0"}, "mydate": {"type{date}": "2016-12-31"}, "myinteger": null, "updated": false, "updated_id": null} 56 | 57 | {"bytes": null, "count_of_rows": 1, "dataset_name": "test", "hash": "c1c867cd9711aedd5c94a16ce4590ece"} 58 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/load_existent_env: -------------------------------------------------------------------------------- 1 | load 2 | -- 3 | { 4 | "from": "env://EXISTENT_ENV", 5 | "name": "my-env-resource", 6 | "validate": true 7 | } 8 | -- 9 | { 10 | "name": "test", 11 | "resources": [] 12 | } 13 | -- 14 | -- 15 | { 16 | "name": "test", 17 | "profile": "data-package", 18 | "resources": [ 19 | { 20 | "dpp:streamedFrom": "env://EXISTENT_ENV", 21 | "dpp:streaming": true, 22 | "format": "csv", 23 | "name": "my-env-resource", 24 | "path": "my-env-resource.csv", 25 | "profile": "tabular-data-resource", 26 | "schema": { 27 | "fields": [ 28 | {"format": "default", "name": "first_name", "type": "string"}, 29 | {"format": 
"default", "name": "last_name", "type": "string"}, 30 | {"format": "default", "name": "house", "type": "string"}, 31 | {"format": "default", "name": "age", "type": "integer"} 32 | ], 33 | "missingValues": [""] 34 | } 35 | } 36 | ] 37 | } 38 | -- 39 | {"age": 27, "first_name": "Tyrion", "house": "Lannister", "last_name": "Lannister"} 40 | {"age": 34, "first_name": "Jaime", "house": "Lannister", "last_name": "Lannister"} 41 | {"age": 34, "first_name": "Cersei", "house": "Lannister", "last_name": "Lannister"} 42 | {"age": 17, "first_name": "Jon", "house": "Stark", "last_name": "Snow"} 43 | {"age": 14, "first_name": "Sansa", "house": "Stark", "last_name": "Stark"} 44 | {"age": 11, "first_name": "Arya", "house": "Stark", "last_name": "Stark"} 45 | {"age": 10, "first_name": "Bran", "house": "Stark", "last_name": "Stark"} 46 | {"age": 5, "first_name": "Rickon", "house": "Stark", "last_name": "Stark"} 47 | {"age": 16, "first_name": "Daenerys", "house": "Targaryen", "last_name": "Targaryen"} 48 | 49 | {} 50 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/obj_fix_dump_to_sql: -------------------------------------------------------------------------------- 1 | dump_to_sql 2 | -- 3 | { 4 | "comment": [ 5 | "this test involves data types which work differently in sqlite and postgresql", 6 | "so, forcing sqlite engine here" 7 | ], 8 | "engine": "sqlite://", 9 | "tables": { 10 | "test": { 11 | "resource-name": "my-spiffy-resource" 12 | } 13 | } 14 | } 15 | -- 16 | { 17 | "name": "test", 18 | "resources": [ 19 | { 20 | "name": "my-spiffy-resource", 21 | "dpp:streaming": true, 22 | "path": "data/my-data.csv", 23 | "schema": { 24 | "fields": [ 25 | {"name": "myarray", "type": "array"}, 26 | {"name": "myobject", "type": "object"}, 27 | {"name": "mynumber", "type": "number"}, 28 | {"name": "mydate", "type": "date"} 29 | ] 30 | } 31 | } 32 | ] 33 | } 34 | -- 35 | {"myarray":[{"type{date}": "2016-12-31"}, {"type{datetime}": "2016-11-10 12:34:56"}], "myobject": {"n1": {"n2": {"type{decimal}": "78.99"}}}, "mynumber": 2.0, "mydate": {"type{date}": "2016-12-31"}} 36 | -- 37 | { 38 | "name": "test", 39 | "profile": "data-package", 40 | "resources": [ 41 | { 42 | "name": "my-spiffy-resource", 43 | "dpp:streaming": true, 44 | "path": "data/my-data.csv", 45 | "profile": "data-resource", 46 | "schema": { 47 | "fields": [ 48 | {"name": "myarray", "type": "array"}, 49 | {"name": "myobject", "type": "object"}, 50 | {"name": "mynumber", "type": "number"}, 51 | {"name": "mydate", "type": "date"} 52 | ] 53 | } 54 | } 55 | ] 56 | } 57 | -- 58 | {"myarray": "[\"2016-12-31\", \"2016-11-10T12:34:56\"]", "mydate": {"type{date}": "2016-12-31"}, "mynumber": {"type{decimal}": "2.0"}, "myobject": "{\"n1\": {\"n2\": 78.99}}"} 59 | 60 | {"bytes": null, "count_of_rows": 1, "dataset_name": "test", "hash": "bed26992ae39b43e8b58c0190e8a52e5"} 61 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/reverse_sort: -------------------------------------------------------------------------------- 1 | sort 2 | -- 3 | { 4 | "resources": ["concat-a1", "concat-a2"], 5 | "sort-by": "{a3} {a2} {a1}", 6 | "reverse": true 7 | } 8 | -- 9 | { 10 | "name": "test", 11 | "resources": [ 12 | { 13 | "name": "concat-a1", 14 | "dpp:streaming": true, 15 | "path": "concat-a1.csv", 16 | "schema": { "fields": [ 17 | {"name": "a1", "type": "string"}, 18 | {"name": "a2", "type": "string"}, 19 | {"name": "a3", "type": "string"} 20 | ]} 21 | }, 22 | { 23 | 
"name": "concat-a2", 24 | "dpp:streaming": true, 25 | "path": "concat-a2.csv", 26 | "schema": { "fields": [ 27 | {"name": "a1", "type": "string"}, 28 | {"name": "a2", "type": "string"}, 29 | {"name": "a3", "type": "string"} 30 | ]} 31 | }, 32 | { 33 | "name": "concat-c", 34 | "dpp:streaming": true, 35 | "path": "concat-c.csv", 36 | "schema": { "fields": [ 37 | {"name": "c1", "type": "string"}, 38 | {"name": "c2", "type": "string"}, 39 | {"name": "c3", "type": "string"} 40 | ]} 41 | } 42 | ] 43 | } 44 | -- 45 | {"a1":"a1","a2":"a1","a3":"a2"} 46 | {"a1":"a2","a2":"a1","a3":"a1"} 47 | {"a1":"a3","a2":"a2","a3":"a2"} 48 | {"a1":"a4","a2":"a2","a3":"a1"} 49 | 50 | {"a1":"a1","a2":"a3","a3":"a2"} 51 | {"a1":"a2","a2":"a3","a3":"a1"} 52 | {"a1":"a3","a2":"a4","a3":"a2"} 53 | {"a1":"a4","a2":"a4","a3":"a1"} 54 | 55 | {"c1":"c13","c2":"c23","c3":"c33"} 56 | {"c1":"c12","c2":"c22","c3":"c32"} 57 | {"c1":"c11","c2":"c21","c3":"c31"} 58 | -- 59 | { 60 | "name": "test", 61 | "profile": "data-package", 62 | "resources": [ 63 | { 64 | "name": "concat-a1", 65 | "dpp:streaming": true, 66 | "path": "concat-a1.csv", 67 | "profile": "data-resource", 68 | "schema": { "fields": [ 69 | {"name": "a1", "type": "string"}, 70 | {"name": "a2", "type": "string"}, 71 | {"name": "a3", "type": "string"} 72 | ]} 73 | }, 74 | { 75 | "name": "concat-a2", 76 | "dpp:streaming": true, 77 | "path": "concat-a2.csv", 78 | "profile": "data-resource", 79 | "schema": { "fields": [ 80 | {"name": "a1", "type": "string"}, 81 | {"name": "a2", "type": "string"}, 82 | {"name": "a3", "type": "string"} 83 | ]} 84 | }, 85 | { 86 | "name": "concat-c", 87 | "dpp:streaming": true, 88 | "path": "concat-c.csv", 89 | "profile": "data-resource", 90 | "schema": { "fields": [ 91 | {"name": "c1", "type": "string"}, 92 | {"name": "c2", "type": "string"}, 93 | {"name": "c3", "type": "string"} 94 | ]} 95 | } 96 | ] 97 | } 98 | -- 99 | {"a1":"a3","a2":"a2","a3":"a2"} 100 | {"a1":"a1","a2":"a1","a3":"a2"} 101 | {"a1":"a4","a2":"a2","a3":"a1"} 102 | {"a1":"a2","a2":"a1","a3":"a1"} 103 | 104 | {"a1":"a3","a2":"a4","a3":"a2"} 105 | {"a1":"a1","a2":"a3","a3":"a2"} 106 | {"a1":"a4","a2":"a4","a3":"a1"} 107 | {"a1":"a2","a2":"a3","a3":"a1"} 108 | 109 | {"c1":"c13","c2":"c23","c3":"c33"} 110 | {"c1":"c12","c2":"c22","c3":"c32"} 111 | {"c1":"c11","c2":"c21","c3":"c31"} 112 | 113 | {} 114 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_add_resource: -------------------------------------------------------------------------------- 1 | add_resource 2 | -- 3 | { 4 | "name": "my-spiffy-resource", 5 | "url": "http://not.existent.com" 6 | } 7 | -- 8 | { 9 | "name": "test", 10 | "resources": [] 11 | } 12 | -- 13 | -- 14 | { 15 | "name": "test", 16 | "resources": [ 17 | { 18 | "name": "my-spiffy-resource", 19 | "dpp:streamedFrom": "http://not.existent.com", 20 | "path": "_" 21 | } 22 | ] 23 | } 24 | -- 25 | {} 26 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_concat: -------------------------------------------------------------------------------- 1 | concatenate 2 | -- 3 | { 4 | "sources": ["concat-a", "concat-b", "concat-c"], 5 | "target": {"name": "target"}, 6 | "fields": { 7 | "t1": ["a1", "b1", "c1"], 8 | "t2": ["a2", "b2", "c2"], 9 | "c3": ["a3", "b3"], 10 | "d4": null, 11 | "e5": [] 12 | } 13 | } 14 | -- 15 | { 16 | "name": "test", 17 | "resources": [ 18 | { 19 | "name": "concat-a", 20 | "dpp:streaming": true, 21 | "path": "concat-a.csv", 
22 | "schema": { "fields": [ 23 | {"name": "a1", "type": "string"}, 24 | {"name": "a2", "type": "string"}, 25 | {"name": "a3", "type": "string"} 26 | ]} 27 | }, 28 | { 29 | "name": "concat-b", 30 | "dpp:streaming": true, 31 | "path": "concat-b.csv", 32 | "schema": { "fields": [ 33 | {"name": "b1", "type": "string"}, 34 | {"name": "b2", "type": "string"}, 35 | {"name": "b3", "type": "string"} 36 | ]} 37 | }, 38 | { 39 | "name": "concat-c", 40 | "dpp:streaming": true, 41 | "path": "concat-c.csv", 42 | "schema": { "fields": [ 43 | {"name": "c1", "type": "string"}, 44 | {"name": "c2", "type": "string"}, 45 | {"name": "c3", "type": "string"} 46 | ]} 47 | } 48 | ] 49 | } 50 | -- 51 | {"a1":"a11","a2":"a21","a3":"a31"} 52 | {"a1":"a12","a2":"a22","a3":"a32"} 53 | {"a1":"a13","a2":"a23","a3":"a33"} 54 | 55 | {"b1":"b11","b2":"b21","b3":"b31"} 56 | {"b1":"b12","b2":"b22","b3":"b32"} 57 | {"b1":"b13","b2":"b23","b3":"b33"} 58 | 59 | {"c1":"c11","c2":"c21","c3":"c31"} 60 | {"c1":"c12","c2":"c22","c3":"c32"} 61 | {"c1":"c13","c2":"c23","c3":"c33"} 62 | -- 63 | { 64 | "name": "test", 65 | "profile": "data-package", 66 | "resources": [ 67 | { 68 | "name": "target", 69 | "dpp:streaming": true, 70 | "path": "data/target.csv", 71 | "mediatype": "text/csv", 72 | "profile": "tabular-data-resource", 73 | "schema": { "fields": [ 74 | {"name": "t1", "format": "default", "type": "string"}, 75 | {"name": "t2", "format": "default", "type": "string"}, 76 | {"name": "c3", "format": "default", "type": "string"}, 77 | {"name": "d4", "format": "default", "type": "string"}, 78 | {"name": "e5", "format": "default", "type": "string"} 79 | ], 80 | "missingValues": [""]} 81 | } 82 | ] 83 | } 84 | -- 85 | {"t1":"a11","t2":"a21","c3":"a31","d4":null,"e5":null} 86 | {"t1":"a12","t2":"a22","c3":"a32","d4":null,"e5":null} 87 | {"t1":"a13","t2":"a23","c3":"a33","d4":null,"e5":null} 88 | {"t1":"b11","t2":"b21","c3":"b31","d4":null,"e5":null} 89 | {"t1":"b12","t2":"b22","c3":"b32","d4":null,"e5":null} 90 | {"t1":"b13","t2":"b23","c3":"b33","d4":null,"e5":null} 91 | {"t1":"c11","t2":"c21","c3":"c31","d4":null,"e5":null} 92 | {"t1":"c12","t2":"c22","c3":"c32","d4":null,"e5":null} 93 | {"t1":"c13","t2":"c23","c3":"c33","d4":null,"e5":null} 94 | 95 | {} 96 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_deduplicate: -------------------------------------------------------------------------------- 1 | deduplicate 2 | -- 3 | { 4 | "resources": ["concat-a1", "concat-a2"] 5 | } 6 | -- 7 | { 8 | "name": "test", 9 | "resources": [ 10 | { 11 | "name": "concat-a1", 12 | "dpp:streaming": true, 13 | "path": "concat-a1.csv", 14 | "schema": { "fields": [ 15 | {"name": "a1", "type": "string"}, 16 | {"name": "a2", "type": "string"}, 17 | {"name": "a3", "type": "string"} 18 | ], "primaryKey": ["a1", "a2"]} 19 | }, 20 | { 21 | "name": "concat-a2", 22 | "dpp:streaming": true, 23 | "path": "concat-a2.csv", 24 | "schema": { "fields": [ 25 | {"name": "a1", "type": "string"}, 26 | {"name": "a2", "type": "string"}, 27 | {"name": "a3", "type": "string"} 28 | ]} 29 | }, 30 | { 31 | "name": "concat-c", 32 | "dpp:streaming": true, 33 | "path": "concat-c.csv", 34 | "schema": { "fields": [ 35 | {"name": "c1", "type": "string"}, 36 | {"name": "c2", "type": "string"}, 37 | {"name": "c3", "type": "string"} 38 | ]} 39 | } 40 | ] 41 | } 42 | -- 43 | {"a1":"a1","a2":"a1","a3":"a2"} 44 | {"a1":"a2","a2":"a1","a3":"a1"} 45 | {"a1":"a1","a2":"a1","a3":"a2"} 46 | {"a1":"a2","a2":"a1","a3":"a1"} 47 | 48 | 
{"a1":"a1","a2":"a3","a3":"a2"} 49 | {"a1":"a2","a2":"a3","a3":"a1"} 50 | {"a1":"a3","a2":"a4","a3":"a2"} 51 | {"a1":"a4","a2":"a4","a3":"a1"} 52 | 53 | {"c1":"c11","c2":"c21","c3":"c31"} 54 | {"c1":"c12","c2":"c22","c3":"c32"} 55 | {"c1":"c13","c2":"c23","c3":"c33"} 56 | -- 57 | { 58 | "name": "test", 59 | "profile": "data-package", 60 | "resources": [ 61 | { 62 | "name": "concat-a1", 63 | "dpp:streaming": true, 64 | "path": "concat-a1.csv", 65 | "profile": "data-resource", 66 | "schema": { "fields": [ 67 | {"name": "a1", "type": "string"}, 68 | {"name": "a2", "type": "string"}, 69 | {"name": "a3", "type": "string"} 70 | ], "primaryKey": ["a1", "a2"]} 71 | }, 72 | { 73 | "name": "concat-a2", 74 | "dpp:streaming": true, 75 | "path": "concat-a2.csv", 76 | "profile": "data-resource", 77 | "schema": { "fields": [ 78 | {"name": "a1", "type": "string"}, 79 | {"name": "a2", "type": "string"}, 80 | {"name": "a3", "type": "string"} 81 | ]} 82 | }, 83 | { 84 | "name": "concat-c", 85 | "dpp:streaming": true, 86 | "path": "concat-c.csv", 87 | "profile": "data-resource", 88 | "schema": { "fields": [ 89 | {"name": "c1", "type": "string"}, 90 | {"name": "c2", "type": "string"}, 91 | {"name": "c3", "type": "string"} 92 | ]} 93 | } 94 | ] 95 | } 96 | -- 97 | {"a1":"a1","a2":"a1","a3":"a2"} 98 | {"a1":"a2","a2":"a1","a3":"a1"} 99 | 100 | {"a1":"a1","a2":"a3","a3":"a2"} 101 | {"a1":"a2","a2":"a3","a3":"a1"} 102 | {"a1":"a3","a2":"a4","a3":"a2"} 103 | {"a1":"a4","a2":"a4","a3":"a1"} 104 | 105 | {"c1":"c11","c2":"c21","c3":"c31"} 106 | {"c1":"c12","c2":"c22","c3":"c32"} 107 | {"c1":"c13","c2":"c23","c3":"c33"} 108 | 109 | {} 110 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_delete_fields: -------------------------------------------------------------------------------- 1 | delete_fields 2 | -- 3 | { 4 | "fields": ["last_name", "age"], 5 | "resources": "got-characters" 6 | } 7 | -- 8 | { 9 | "name": "test", 10 | "resources": [ 11 | { 12 | "name": "got-characters", 13 | "dpp:streaming": true, 14 | "path": "characters.csv", 15 | "schema": { 16 | "fields": [ 17 | {"name": "first_name", "type": "string"}, 18 | {"name": "last_name", "type": "string"}, 19 | {"name": "house", "type": "string"}, 20 | {"name": "age", "type": "number", "units": "Westerosian Years"} 21 | ] 22 | } 23 | }, 24 | { 25 | "name": "got-houses", 26 | "dpp:streaming": true, 27 | "path": "houses.csv", 28 | "schema": { "fields": [ 29 | {"name": "house", "type": "string"} 30 | ]} 31 | } 32 | ] 33 | } 34 | -- 35 | {"first_name": "Tyrion", "house": "Lannister", "last_name": "Lannister", "age": 27} 36 | {"first_name": "Jaime", "house": "Lannister", "last_name": "Lannister", "age": 34} 37 | {"first_name": "Cersei", "house": "Lannister", "last_name": "Lannister", "age": 34} 38 | {"first_name": "Jon", "house": "Stark", "last_name": "Snow", "age": 17} 39 | {"first_name": "Sansa", "house": "Stark", "last_name": "Stark", "age": 14} 40 | {"first_name": "Arya", "house": "Stark", "last_name": "Stark", "age": 11} 41 | {"first_name": "Bran", "house": "Stark", "last_name": "Stark", "age": 10} 42 | {"first_name": "Rickon", "house": "Stark", "last_name": "Stark", "age": 5} 43 | {"first_name": "Daenerys", "house": "Targaryen", "last_name": "Targaryen", "age": 16} 44 | 45 | {"house": "House of Lannister"} 46 | {"house": "House of Greyjoy"} 47 | {"house": "House of Stark"} 48 | {"house": "House of Targaryen"} 49 | {"house": "House of Martell"} 50 | {"house": "House of Tyrell"} 51 | -- 52 | { 53 | "name": 
"test", 54 | "profile": "data-package", 55 | "resources": [ 56 | { 57 | "name": "got-characters", 58 | "dpp:streaming": true, 59 | "path": "characters.csv", 60 | "profile": "data-resource", 61 | "schema": { "fields": [ 62 | {"name": "first_name", "type": "string"}, 63 | {"name": "house", "type": "string"} 64 | ]} 65 | }, 66 | { 67 | "name": "got-houses", 68 | "dpp:streaming": true, 69 | "path": "houses.csv", 70 | "profile": "data-resource", 71 | "schema": { "fields": [ 72 | {"name": "house", "type": "string"} 73 | ]} 74 | } 75 | ] 76 | } 77 | -- 78 | {"first_name": "Tyrion", "house": "Lannister"} 79 | {"first_name": "Jaime", "house": "Lannister"} 80 | {"first_name": "Cersei", "house": "Lannister"} 81 | {"first_name": "Jon", "house": "Stark"} 82 | {"first_name": "Sansa", "house": "Stark"} 83 | {"first_name": "Arya", "house": "Stark"} 84 | {"first_name": "Bran", "house": "Stark"} 85 | {"first_name": "Rickon", "house": "Stark"} 86 | {"first_name": "Daenerys", "house": "Targaryen"} 87 | 88 | {"house": "House of Lannister"} 89 | {"house": "House of Greyjoy"} 90 | {"house": "House of Stark"} 91 | {"house": "House of Targaryen"} 92 | {"house": "House of Martell"} 93 | {"house": "House of Tyrell"} 94 | 95 | {} 96 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_dump_dot_to_zip: -------------------------------------------------------------------------------- 1 | dump.to_zip 2 | -- 3 | { 4 | "out-file": "my-spiffy-resource.zip" 5 | } 6 | -- 7 | { 8 | "name": "test", 9 | "resources": [ 10 | { 11 | "name": "my-spiffy-resource", 12 | "dpp:streaming": true, 13 | "path": "data/my-data.csv", 14 | "schema": { 15 | "fields": [ 16 | {"name": "mystring", "type": "string"}, 17 | {"name": "myinteger", "type": "integer"}, 18 | {"name": "mynumber", "type": "number"}, 19 | {"name": "mydate", "type": "date"} 20 | ] 21 | } 22 | } 23 | ] 24 | } 25 | -- 26 | {"mystring":"a", "mynumber": 2.0, "mydate": {"type{date}": "2016-12-31"}} 27 | -- 28 | { 29 | "name": "test", 30 | "resources": [ 31 | { 32 | "name": "my-spiffy-resource", 33 | "dpp:streaming": true, 34 | "path": "data/my-data.csv", 35 | "encoding": "utf-8", 36 | "format": "csv", 37 | "dialect": { 38 | "delimiter": ",", 39 | "doubleQuote": true, 40 | "lineTerminator": "\r\n", 41 | "quoteChar": "\"", 42 | "skipInitialSpace": false 43 | }, 44 | "schema": { 45 | "fields": [ 46 | {"name": "mystring", "type": "string"}, 47 | {"name": "myinteger", "type": "integer"}, 48 | {"name": "mynumber", "type": "number", "groupChar": "", "decimalChar": "."}, 49 | {"format": "%Y-%m-%d", "name": "mydate", "type": "date"} 50 | ] 51 | } 52 | } 53 | ] 54 | } 55 | -- 56 | {"mystring":"a", "mynumber": 2.0, "mydate": {"type{date}": "2016-12-31"}} 57 | 58 | {"bytes": 703, "count_of_rows": 1, "dataset_name": "test", "hash": "a730863e99517930eab15f55036d309f"} 59 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_dump_dot_to_zip_with_hash: -------------------------------------------------------------------------------- 1 | dump.to_zip 2 | -- 3 | { 4 | "out-file": "my-spiffy-resource.zip", 5 | "add-filehash-to-path": true 6 | } 7 | -- 8 | { 9 | "name": "test", 10 | "resources": [ 11 | { 12 | "name": "my-spiffy-resource", 13 | "dpp:streaming": true, 14 | "path": "data/my-data.csv", 15 | "schema": { 16 | "fields": [ 17 | {"name": "mystring", "type": "string"}, 18 | {"name": "myinteger", "type": "integer"}, 19 | {"name": "mynumber", "type": "number"}, 20 | {"name": 
"mydate", "type": "date"} 21 | ] 22 | } 23 | } 24 | ] 25 | } 26 | -- 27 | {"mystring":"a", "mynumber": 2.0, "mydate": {"type{date}": "2016-12-31"}} 28 | -- 29 | { 30 | "name": "test", 31 | "resources": [ 32 | { 33 | "name": "my-spiffy-resource", 34 | "dpp:streaming": true, 35 | "path": "data/my-data.csv", 36 | "encoding": "utf-8", 37 | "format": "csv", 38 | "dialect": { 39 | "delimiter": ",", 40 | "doubleQuote": true, 41 | "lineTerminator": "\r\n", 42 | "quoteChar": "\"", 43 | "skipInitialSpace": false 44 | }, 45 | "schema": { 46 | "fields": [ 47 | {"name": "mystring", "type": "string"}, 48 | {"name": "myinteger", "type": "integer"}, 49 | {"name": "mynumber", "type": "number", "groupChar": "", "decimalChar": "."}, 50 | {"format": "%Y-%m-%d", "name": "mydate", "type": "date"} 51 | ] 52 | } 53 | } 54 | ] 55 | } 56 | -- 57 | {"mystring":"a", "mynumber": 2.0, "mydate": {"type{date}": "2016-12-31"}} 58 | 59 | {"bytes": 736, "count_of_rows": 1, "dataset_name": "test", "hash": "24b55bb6b0ecacdadbc8a1dc1fd9dab9"} 60 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_dump_dot_to_zip_with_hash_and_pretty_descriptor: -------------------------------------------------------------------------------- 1 | dump.to_zip 2 | -- 3 | { 4 | "out-file": "my-spiffy-resource.zip", 5 | "add-filehash-to-path": true, 6 | "pretty-descriptor": true 7 | } 8 | -- 9 | { 10 | "name": "test", 11 | "resources": [ 12 | { 13 | "name": "my-spiffy-resource", 14 | "dpp:streaming": true, 15 | "path": "data/my-data.csv", 16 | "schema": { 17 | "fields": [ 18 | {"name": "mystring", "type": "string"}, 19 | {"name": "myinteger", "type": "integer"}, 20 | {"name": "mynumber", "type": "number"}, 21 | {"name": "mydate", "type": "date"} 22 | ] 23 | } 24 | } 25 | ] 26 | } 27 | -- 28 | {"mystring":"a", "mynumber": 2.0, "mydate": {"type{date}": "2016-12-31"}} 29 | -- 30 | { 31 | "name": "test", 32 | "resources": [ 33 | { 34 | "name": "my-spiffy-resource", 35 | "dpp:streaming": true, 36 | "path": "data/my-data.csv", 37 | "encoding": "utf-8", 38 | "format": "csv", 39 | "dialect": { 40 | "delimiter": ",", 41 | "doubleQuote": true, 42 | "lineTerminator": "\r\n", 43 | "quoteChar": "\"", 44 | "skipInitialSpace": false 45 | }, 46 | "schema": { 47 | "fields": [ 48 | {"name": "mystring", "type": "string"}, 49 | {"name": "myinteger", "type": "integer"}, 50 | {"name": "mynumber", "type": "number", "groupChar": "", "decimalChar": "."}, 51 | {"format": "%Y-%m-%d", "name": "mydate", "type": "date"} 52 | ] 53 | } 54 | } 55 | ] 56 | } 57 | -- 58 | {"mystring":"a", "mynumber": 2.0, "mydate": {"type{date}": "2016-12-31"}} 59 | 60 | {"bytes": 1110, "count_of_rows": 1, "dataset_name": "test", "hash": "174d14a56ce3c798b369d1716488ca75"} 61 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_dump_to_sql: -------------------------------------------------------------------------------- 1 | dump_to_sql 2 | -- 3 | { 4 | "tables": { 5 | "test": { 6 | "resource-name": "my-spiffy-resource" 7 | } 8 | } 9 | } 10 | -- 11 | { 12 | "name": "test", 13 | "resources": [ 14 | { 15 | "name": "my-spiffy-resource", 16 | "dpp:streaming": true, 17 | "path": "data/my-data.csv", 18 | "schema": { 19 | "fields": [ 20 | {"name": "mystring", "type": "string"}, 21 | {"name": "myinteger", "type": "integer"}, 22 | {"name": "mynumber", "type": "number"}, 23 | {"name": "mydate", "type": "date"} 24 | ] 25 | } 26 | } 27 | ] 28 | } 29 | -- 30 | {"mystring":"a", "mynumber": 2.0, 
"mydate": {"type{date}": "2016-12-31"}} 31 | -- 32 | { 33 | "name": "test", 34 | "profile": "data-package", 35 | "resources": [ 36 | { 37 | "name": "my-spiffy-resource", 38 | "dpp:streaming": true, 39 | "path": "data/my-data.csv", 40 | "profile": "data-resource", 41 | "schema": { 42 | "fields": [ 43 | {"name": "mystring", "type": "string"}, 44 | {"name": "myinteger", "type": "integer"}, 45 | {"name": "mynumber", "type": "number"}, 46 | {"name": "mydate", "type": "date"} 47 | ] 48 | } 49 | } 50 | ] 51 | } 52 | -- 53 | {"mystring":"a", "mynumber": {"type{decimal}": "2.0"}, "mydate": {"type{date}": "2016-12-31"}, "myinteger": null} 54 | 55 | {"bytes": null, "count_of_rows": 1, "dataset_name": "test", "hash": "c1c867cd9711aedd5c94a16ce4590ece"} 56 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_dump_to_zip: -------------------------------------------------------------------------------- 1 | dump_to_zip 2 | -- 3 | { 4 | "out-file": "my-spiffy-resource.zip" 5 | } 6 | -- 7 | { 8 | "name": "test", 9 | "resources": [ 10 | { 11 | "name": "my-spiffy-resource", 12 | "dpp:streaming": true, 13 | "path": "data/my-data.csv", 14 | "schema": { 15 | "fields": [ 16 | {"name": "mystring", "type": "string"}, 17 | {"name": "myinteger", "type": "integer"}, 18 | {"name": "mynumber", "type": "number"}, 19 | {"name": "mydate", "type": "date"} 20 | ] 21 | } 22 | } 23 | ] 24 | } 25 | -- 26 | {"mystring":"a", "mynumber": 2.0, "mydate": {"type{date}": "2016-12-31"}} 27 | -- 28 | { 29 | "name": "test", 30 | "profile": "data-package", 31 | "resources": [ 32 | { 33 | "name": "my-spiffy-resource", 34 | "dialect": { 35 | "delimiter": ",", 36 | "doubleQuote": true, 37 | "lineTerminator": "\r\n", 38 | "quoteChar": "\"", 39 | "skipInitialSpace": false 40 | }, 41 | "encoding": "utf-8", 42 | "format": "csv", 43 | "dpp:streaming": true, 44 | "path": "data/my-data.csv", 45 | "profile": "data-resource", 46 | "schema": { 47 | "fields": [ 48 | {"name": "mystring", "type": "string"}, 49 | {"name": "myinteger", "type": "integer"}, 50 | {"name": "mynumber", "type": "number", "decimalChar": ".", "groupChar": ""}, 51 | {"name": "mydate", "type": "date", "format": "%Y-%m-%d"} 52 | ] 53 | } 54 | } 55 | ] 56 | } 57 | -- 58 | {"mystring":"a", "myinteger": null, "mynumber": {"type{decimal}": "2.0"}, "mydate": {"type{date}": "2016-12-31"}} 59 | 60 | {"bytes": 1143, "count_of_rows": 1, "dataset_name": "test", "hash": "c68a5400c197333d75d34f4c198fea0b"} 61 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_dump_to_zip_with_hash: -------------------------------------------------------------------------------- 1 | dump_to_zip 2 | -- 3 | { 4 | "out-file": "my-spiffy-resource.zip", 5 | "add-filehash-to-path": true 6 | } 7 | -- 8 | { 9 | "name": "test", 10 | "resources": [ 11 | { 12 | "name": "my-spiffy-resource", 13 | "dpp:streaming": true, 14 | "path": "data/my-data.csv", 15 | "schema": { 16 | "fields": [ 17 | {"name": "mystring", "type": "string"}, 18 | {"name": "myinteger", "type": "integer"}, 19 | {"name": "mynumber", "type": "number"}, 20 | {"name": "mydate", "type": "date"} 21 | ] 22 | } 23 | } 24 | ] 25 | } 26 | -- 27 | {"mystring":"a", "mynumber": 2.0, "mydate": {"type{date}": "2016-12-31"}} 28 | -- 29 | { 30 | "name": "test", 31 | "profile": "data-package", 32 | "resources": [ 33 | { 34 | "name": "my-spiffy-resource", 35 | "dialect": { 36 | "delimiter": ",", 37 | "doubleQuote": true, 38 | "lineTerminator": "\r\n", 39 
| "quoteChar": "\"", 40 | "skipInitialSpace": false 41 | }, 42 | "encoding": "utf-8", 43 | "format": "csv", 44 | "dpp:streaming": true, 45 | "path": "data/my-data.csv", 46 | "profile": "data-resource", 47 | "schema": { 48 | "fields": [ 49 | {"name": "mystring", "type": "string"}, 50 | {"name": "myinteger", "type": "integer"}, 51 | {"name": "mynumber", "type": "number", "decimalChar": ".", "groupChar": ""}, 52 | {"name": "mydate", "type": "date", "format": "%Y-%m-%d"} 53 | ] 54 | } 55 | } 56 | ] 57 | } 58 | -- 59 | {"mystring":"a", "myinteger": null, "mynumber": {"type{decimal}": "2.0"}, "mydate": {"type{date}": "2016-12-31"}} 60 | 61 | {"bytes": 1143, "count_of_rows": 1, "dataset_name": "test", "hash": "c68a5400c197333d75d34f4c198fea0b"} 62 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_dump_to_zip_with_hash_and_pretty_descriptor: -------------------------------------------------------------------------------- 1 | dump_to_zip 2 | -- 3 | { 4 | "out-file": "my-spiffy-resource.zip", 5 | "add-filehash-to-path": true, 6 | "pretty-descriptor": true 7 | } 8 | -- 9 | { 10 | "name": "test", 11 | "profile": "data-package", 12 | "resources": [ 13 | { 14 | "name": "my-spiffy-resource", 15 | "dpp:streaming": true, 16 | "path": "data/my-data.csv", 17 | "profile": "data-resource", 18 | "schema": { 19 | "fields": [ 20 | {"name": "mystring", "type": "string"}, 21 | {"name": "myinteger", "type": "integer"}, 22 | {"name": "mynumber", "type": "number"}, 23 | {"name": "mydate", "type": "date"} 24 | ] 25 | } 26 | } 27 | ] 28 | } 29 | -- 30 | {"mystring":"a", "mynumber": 2.0, "mydate": {"type{date}": "2016-12-31"}} 31 | -- 32 | { 33 | "name": "test", 34 | "profile": "data-package", 35 | "resources": [ 36 | { 37 | "name": "my-spiffy-resource", 38 | "dialect": { 39 | "delimiter": ",", 40 | "doubleQuote": true, 41 | "lineTerminator": "\r\n", 42 | "quoteChar": "\"", 43 | "skipInitialSpace": false 44 | }, 45 | "encoding": "utf-8", 46 | "format": "csv", 47 | "dpp:streaming": true, 48 | "path": "data/my-data.csv", 49 | "profile": "data-resource", 50 | "schema": { 51 | "fields": [ 52 | {"name": "mystring", "type": "string"}, 53 | {"name": "myinteger", "type": "integer"}, 54 | {"name": "mynumber", "type": "number", "decimalChar": ".", "groupChar": ""}, 55 | {"name": "mydate", "type": "date", "format": "%Y-%m-%d"} 56 | ] 57 | } 58 | } 59 | ] 60 | } 61 | -- 62 | {"mystring":"a", "myinteger": null, "mynumber": {"type{decimal}": "2.0"}, "mydate": {"type{date}": "2016-12-31"}} 63 | 64 | {"bytes": 1143, "count_of_rows": 1, "dataset_name": "test", "hash": "c68a5400c197333d75d34f4c198fea0b"} 65 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_filter: -------------------------------------------------------------------------------- 1 | filter 2 | -- 3 | { 4 | "resources": ["concat-a1", "concat-a2"], 5 | "in": [ 6 | {"a1": "a1", 7 | "a2": "a2"}, 8 | {"a1": "a2"} 9 | ], 10 | "out": [{"a3": "a1"}] 11 | } 12 | -- 13 | { 14 | "name": "test", 15 | "resources": [ 16 | { 17 | "name": "concat-a1", 18 | "dpp:streaming": true, 19 | "path": "concat-a1.csv", 20 | "schema": { "fields": [ 21 | {"name": "a1", "type": "string"}, 22 | {"name": "a2", "type": "string"}, 23 | {"name": "a3", "type": "string"} 24 | ]} 25 | }, 26 | { 27 | "name": "concat-a2", 28 | "dpp:streaming": true, 29 | "path": "concat-a2.csv", 30 | "schema": { "fields": [ 31 | {"name": "a1", "type": "string"}, 32 | {"name": "a2", "type": "string"}, 33 | 
{"name": "a3", "type": "string"} 34 | ]} 35 | }, 36 | { 37 | "name": "concat-c", 38 | "dpp:streaming": true, 39 | "path": "concat-c.csv", 40 | "schema": { "fields": [ 41 | {"name": "c1", "type": "string"}, 42 | {"name": "c2", "type": "string"}, 43 | {"name": "c3", "type": "string"} 44 | ]} 45 | } 46 | ] 47 | } 48 | -- 49 | {"a1":"a1","a2":"a1","a3":"a2"} 50 | {"a1":"a2","a2":"a1","a3":"a1"} 51 | {"a1":"a3","a2":"a2","a3":"a2"} 52 | {"a1":"a4","a2":"a2","a3":"a1"} 53 | 54 | {"a1":"a1","a2":"a3","a3":"a2"} 55 | {"a1":"a2","a2":"a3","a3":"a1"} 56 | {"a1":"a3","a2":"a4","a3":"a2"} 57 | {"a1":"a4","a2":"a4","a3":"a1"} 58 | 59 | {"c1":"c11","c2":"c21","c3":"c31"} 60 | {"c1":"c12","c2":"c22","c3":"c32"} 61 | {"c1":"c13","c2":"c23","c3":"c33"} 62 | -- 63 | { 64 | "name": "test", 65 | "profile": "data-package", 66 | "resources": [ 67 | { 68 | "name": "concat-a1", 69 | "dpp:streaming": true, 70 | "path": "concat-a1.csv", 71 | "profile": "data-resource", 72 | "schema": { "fields": [ 73 | {"name": "a1", "type": "string"}, 74 | {"name": "a2", "type": "string"}, 75 | {"name": "a3", "type": "string"} 76 | ]} 77 | }, 78 | { 79 | "name": "concat-a2", 80 | "dpp:streaming": true, 81 | "path": "concat-a2.csv", 82 | "profile": "data-resource", 83 | "schema": { "fields": [ 84 | {"name": "a1", "type": "string"}, 85 | {"name": "a2", "type": "string"}, 86 | {"name": "a3", "type": "string"} 87 | ]} 88 | }, 89 | { 90 | "name": "concat-c", 91 | "dpp:streaming": true, 92 | "path": "concat-c.csv", 93 | "profile": "data-resource", 94 | "schema": { "fields": [ 95 | {"name": "c1", "type": "string"}, 96 | {"name": "c2", "type": "string"}, 97 | {"name": "c3", "type": "string"} 98 | ]} 99 | } 100 | ] 101 | } 102 | -- 103 | {"a1":"a1","a2":"a1","a3":"a2"} 104 | {"a1":"a2","a2":"a1","a3":"a1"} 105 | {"a1":"a3","a2":"a2","a3":"a2"} 106 | {"a1":"a4","a2":"a2","a3":"a1"} 107 | 108 | {"a1":"a1","a2":"a3","a3":"a2"} 109 | {"a1":"a2","a2":"a3","a3":"a1"} 110 | {"a1":"a3","a2":"a4","a3":"a2"} 111 | 112 | {"c1":"c11","c2":"c21","c3":"c31"} 113 | {"c1":"c12","c2":"c22","c3":"c32"} 114 | {"c1":"c13","c2":"c23","c3":"c33"} 115 | 116 | {} 117 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_find_replace: -------------------------------------------------------------------------------- 1 | find_replace 2 | -- 3 | { 4 | "resources": "dates", 5 | "fields": [ 6 | { 7 | "name": "year", 8 | "patterns": [ 9 | { 10 | "find": "([0-9]{4})( \\(\\w+\\))", 11 | "replace": "\\1" 12 | } 13 | ] 14 | }, 15 | { 16 | "name": "quarter", 17 | "patterns": [ 18 | { 19 | "find": "Q1", 20 | "replace": "03-31" 21 | }, 22 | { 23 | "find": "Q2", 24 | "replace": "06-30" 25 | }, 26 | { 27 | "find": "Q3", 28 | "replace": "09-30" 29 | }, 30 | { 31 | "find": "Q4", 32 | "replace": "12-31" 33 | } 34 | ] 35 | } 36 | ] 37 | } 38 | -- 39 | { 40 | "name": "test", 41 | "resources": [ 42 | { 43 | "name": "dates", 44 | "dpp:streaming": true, 45 | "path": "dates.csv", 46 | "schema": { 47 | "fields": [ 48 | {"name": "year", "type": "string"}, 49 | {"name": "quarter", "type": "string"}, 50 | {"name": "char", "type": "string"} 51 | ] 52 | } 53 | } 54 | ] 55 | } 56 | -- 57 | {"year": "2001", "quarter": "2001-Q1", "char": "testing"} 58 | {"year": "2002", "quarter": "2002-Q2", "char": "testing"} 59 | {"year": "2003 (4)", "quarter": "2003-Q3", "char": "testing"} 60 | {"year": "2004", "quarter": "2004-Q1", "char": "testing"} 61 | {"year": "2005 (1)", "quarter": "2005-Q4", "char": "testing"} 62 | {"year": "2006 (Note)", 
"quarter": "2006-Q1", "char": "testing"} 63 | {"year": "2007 (2)", "quarter": "2007-Q2", "char": "testing"} 64 | {"year": "2008", "quarter": "2008-Q1", "char": "testing"} 65 | {"year": "2009 (10)", "quarter": "2009-Q3", "char": "testing"} 66 | -- 67 | { 68 | "name": "test", 69 | "profile": "data-package", 70 | "resources": [ 71 | { 72 | "name": "dates", 73 | "dpp:streaming": true, 74 | "path": "dates.csv", 75 | "profile": "data-resource", 76 | "schema": { 77 | "fields": [ 78 | {"name": "year", "type": "string"}, 79 | {"name": "quarter", "type": "string"}, 80 | {"name": "char", "type": "string"} 81 | ] 82 | } 83 | } 84 | ] 85 | } 86 | -- 87 | {"year": "2001", "quarter": "2001-03-31", "char": "testing"} 88 | {"year": "2002", "quarter": "2002-06-30", "char": "testing"} 89 | {"year": "2003", "quarter": "2003-09-30", "char": "testing"} 90 | {"year": "2004", "quarter": "2004-03-31", "char": "testing"} 91 | {"year": "2005", "quarter": "2005-12-31", "char": "testing"} 92 | {"year": "2006", "quarter": "2006-03-31", "char": "testing"} 93 | {"year": "2007", "quarter": "2007-06-30", "char": "testing"} 94 | {"year": "2008", "quarter": "2008-03-31", "char": "testing"} 95 | {"year": "2009", "quarter": "2009-09-30", "char": "testing"} 96 | 97 | {} 98 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_join: -------------------------------------------------------------------------------- 1 | join 2 | -- 3 | { 4 | "source": { 5 | "name": "got-characters", 6 | "key": "House of {house}", 7 | "delete": true 8 | }, 9 | "target": { 10 | "name": "got-houses", 11 | "key": "{house}" 12 | }, 13 | "fields": { 14 | "max_age": { 15 | "name": "age", 16 | "aggregate": "max" 17 | }, 18 | "avg_age": { 19 | "name": "age", 20 | "aggregate": "avg" 21 | }, 22 | "representative": { 23 | "name": "first_name", 24 | "aggregate": "last" 25 | }, 26 | "representative_age": { 27 | "name": "age" 28 | }, 29 | "number_of_characters": { 30 | "aggregate": "count" 31 | }, 32 | "last_names": { 33 | "name": "last_name", 34 | "aggregate": "counters" 35 | } 36 | }, 37 | "full": false 38 | } 39 | -- 40 | { 41 | "name": "test", 42 | "resources": [ 43 | { 44 | "name": "got-characters", 45 | "dpp:streaming": true, 46 | "path": "characters.csv", 47 | "schema": { 48 | "fields": [ 49 | {"name": "first_name", "type": "string"}, 50 | {"name": "last_name", "type": "string"}, 51 | {"name": "house", "type": "string"}, 52 | {"name": "age", "type": "number", "units": "Westerosian Years"} 53 | ] 54 | } 55 | }, 56 | { 57 | "name": "got-houses", 58 | "dpp:streaming": true, 59 | "path": "houses.csv", 60 | "schema": { "fields": [ 61 | {"name": "house", "type": "string"} 62 | ]} 63 | } 64 | ] 65 | } 66 | -- 67 | {"first_name": "Tyrion", "house": "Lannister", "last_name": "Lannister", "age": 27} 68 | {"first_name": "Jaime", "house": "Lannister", "last_name": "Lannister", "age": 34} 69 | {"first_name": "Cersei", "house": "Lannister", "last_name": "Lannister", "age": 34} 70 | {"first_name": "Jon", "house": "Stark", "last_name": "Snow", "age": 17} 71 | {"first_name": "Sansa", "house": "Stark", "last_name": "Stark", "age": 14} 72 | {"first_name": "Arya", "house": "Stark", "last_name": "Stark", "age": 11} 73 | {"first_name": "Bran", "house": "Stark", "last_name": "Stark", "age": 10} 74 | {"first_name": "Rickon", "house": "Stark", "last_name": "Stark", "age": 5} 75 | {"first_name": "Daenerys", "house": "Targaryen", "last_name": "Targaryen", "age": 16} 76 | 77 | {"house": "House of Lannister"} 78 | {"house": 
"House of Greyjoy"} 79 | {"house": "House of Stark"} 80 | {"house": "House of Targaryen"} 81 | {"house": "House of Martell"} 82 | {"house": "House of Tyrell"} 83 | -- 84 | { 85 | "name": "test", 86 | "profile": "data-package", 87 | "resources": [ 88 | { 89 | "name": "got-houses", 90 | "dpp:streaming": true, 91 | "path": "houses.csv", 92 | "profile": "data-resource", 93 | "schema": { "fields": [ 94 | {"name": "house", "type": "string"}, 95 | {"name": "avg_age", "type": "number"}, 96 | {"name": "last_names", "type": "array"}, 97 | {"name": "max_age", "type": "number"}, 98 | {"name": "number_of_characters", "type": "integer"}, 99 | {"name": "representative", "type": "string"}, 100 | {"name": "representative_age", "type": "number", "units": "Westerosian Years"} 101 | ]} 102 | } 103 | ] 104 | } 105 | -- 106 | {"avg_age": 31.666666666666668, "house": "House of Lannister", "max_age": 34, "number_of_characters": 3, "representative": "Cersei", "representative_age": 34, "last_names": [["Lannister", 3]]} 107 | {"avg_age": 11.4, "house": "House of Stark", "max_age": 17, "number_of_characters": 5, "representative": "Rickon", "representative_age": 5, "last_names": [["Stark", 4], ["Snow", 1]]} 108 | {"avg_age": 16.0, "house": "House of Targaryen", "max_age": 16, "number_of_characters": 1, "representative": "Daenerys", "representative_age": 16, "last_names": [["Targaryen", 1]]} 109 | 110 | {} 111 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_load: -------------------------------------------------------------------------------- 1 | load 2 | -- 3 | { 4 | "from": "https://raw.githubusercontent.com/frictionlessdata/datapackage-pipelines/master/tests/data/sample.csv", 5 | "name": "my-spiffy-resource", 6 | "validate": true 7 | } 8 | -- 9 | { 10 | "name": "test", 11 | "resources": [] 12 | } 13 | -- 14 | -- 15 | { 16 | "name": "test", 17 | "profile": "data-package", 18 | "resources": [ 19 | { 20 | "dpp:streaming": true, 21 | "format": "csv", 22 | "name": "my-spiffy-resource", 23 | "path": "my-spiffy-resource.csv", 24 | "profile": "tabular-data-resource", 25 | "dpp:streamedFrom": "https://raw.githubusercontent.com/frictionlessdata/datapackage-pipelines/master/tests/data/sample.csv", 26 | "schema": { 27 | "fields": [ 28 | {"format": "default", "name": "first_name", "type": "string"}, 29 | {"format": "default", "name": "last_name", "type": "string"}, 30 | {"format": "default", "name": "house", "type": "string"}, 31 | {"format": "default", "name": "age", "type": "integer"} 32 | ], 33 | "missingValues": [""] 34 | } 35 | } 36 | ] 37 | } 38 | -- 39 | {"age": 27, "first_name": "Tyrion", "house": "Lannister", "last_name": "Lannister"} 40 | {"age": 34, "first_name": "Jaime", "house": "Lannister", "last_name": "Lannister"} 41 | {"age": 34, "first_name": "Cersei", "house": "Lannister", "last_name": "Lannister"} 42 | {"age": 17, "first_name": "Jon", "house": "Stark", "last_name": "Snow"} 43 | {"age": 14, "first_name": "Sansa", "house": "Stark", "last_name": "Stark"} 44 | {"age": 11, "first_name": "Arya", "house": "Stark", "last_name": "Stark"} 45 | {"age": 10, "first_name": "Bran", "house": "Stark", "last_name": "Stark"} 46 | {"age": 5, "first_name": "Rickon", "house": "Stark", "last_name": "Stark"} 47 | {"age": 16, "first_name": "Daenerys", "house": "Targaryen", "last_name": "Targaryen"} 48 | 49 | {} 50 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_load_metadata: 
-------------------------------------------------------------------------------- 1 | load_metadata 2 | -- 3 | { 4 | "url": "tests/data/datapackage.json" 5 | } 6 | -- 7 | { 8 | "name": "test", 9 | "resources": [] 10 | } 11 | -- 12 | -- 13 | { 14 | "name": "my-spiffy-datapackage", 15 | "my-prop": "the-props-value", 16 | "profile": "data-package", 17 | "resources": [] 18 | } 19 | -- 20 | {} 21 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_load_resource: -------------------------------------------------------------------------------- 1 | load_resource 2 | -- 3 | { 4 | "resource": "my-spiffy-resource", 5 | "url": "tests/data/datapackage.json" 6 | } 7 | -- 8 | { 9 | "name": "test", 10 | "resources": [] 11 | } 12 | -- 13 | -- 14 | { 15 | "name": "test", 16 | "resources": [ 17 | { 18 | "dpp:streamedFrom": "%(base)s/tests/data/sample.csv", 19 | "name": "my-spiffy-resource", 20 | "dpp:streaming": true, 21 | "profile": "data-resource", 22 | "path": "sample.csv", 23 | "schema": { 24 | "fields": [ 25 | {"name": "first_name", "type": "string"}, 26 | {"name": "last_name", "type": "string"}, 27 | {"name": "house", "type": "string"}, 28 | {"name": "age", "type": "integer"} 29 | ], 30 | "primaryKey": [ 31 | "first_name", "last_name" 32 | ] 33 | } 34 | } 35 | ] 36 | } 37 | -- 38 | {"first_name": "Tyrion", "house": "Lannister", "last_name": "Lannister", "age": 27} 39 | {"first_name": "Jaime", "house": "Lannister", "last_name": "Lannister", "age": 34} 40 | {"first_name": "Cersei", "house": "Lannister", "last_name": "Lannister", "age": 34} 41 | {"first_name": "Jon", "house": "Stark", "last_name": "Snow", "age": 17} 42 | {"first_name": "Sansa", "house": "Stark", "last_name": "Stark", "age": 14} 43 | {"first_name": "Arya", "house": "Stark", "last_name": "Stark", "age": 11} 44 | {"first_name": "Bran", "house": "Stark", "last_name": "Stark", "age": 10} 45 | {"first_name": "Rickon", "house": "Stark", "last_name": "Stark", "age": 5} 46 | {"first_name": "Daenerys", "house": "Targaryen", "last_name": "Targaryen", "age": 16} 47 | 48 | {} 49 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_load_resource_dups: -------------------------------------------------------------------------------- 1 | load_resource 2 | -- 3 | { 4 | "resource": "my-spiffy-resource", 5 | "url": "tests/data/datapackage3.json" 6 | } 7 | -- 8 | { 9 | "name": "test", 10 | "resources": [] 11 | } 12 | -- 13 | -- 14 | { 15 | "name": "test", 16 | "resources": [ 17 | { 18 | "dpp:streamedFrom": "%(base)s/tests/data/sample.dups.csv", 19 | "name": "my-spiffy-resource", 20 | "dpp:streaming": true, 21 | "profile": "data-resource", 22 | "path": "sample.dups.csv", 23 | "schema": { 24 | "fields": [ 25 | {"name": "first_name", "type": "string"}, 26 | {"name": "last_name", "type": "string"}, 27 | {"name": "house", "type": "string"}, 28 | {"name": "age", "type": "integer"} 29 | ], 30 | "primaryKey": [ 31 | "first_name", "last_name" 32 | ] 33 | } 34 | } 35 | ] 36 | } 37 | -- 38 | {"first_name": "Tyrion", "house": "Lannister", "last_name": "Lannister", "age": 27} 39 | {"first_name": "Jaime", "house": "Lannister", "last_name": "Lannister", "age": 34} 40 | {"first_name": "Cersei", "house": "Lannister", "last_name": "Lannister", "age": 34} 41 | {"first_name": "Jon", "house": "Stark", "last_name": "Snow", "age": 17} 42 | {"first_name": "Sansa", "house": "Stark", "last_name": "Stark", "age": 14} 43 | {"first_name": "Sansa", "house": "Stark", 
"last_name": "Stark", "age": 14} 44 | {"first_name": "Arya", "house": "Stark", "last_name": "Stark", "age": 11} 45 | {"first_name": "Bran", "house": "Stark", "last_name": "Stark", "age": 10} 46 | {"first_name": "Rickon", "house": "Stark", "last_name": "Stark", "age": 5} 47 | {"first_name": "Daenerys", "house": "Targaryen", "last_name": "Targaryen", "age": 16} 48 | 49 | {} 50 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_load_resource_index: -------------------------------------------------------------------------------- 1 | load_resource 2 | -- 3 | { 4 | "resource": 0, 5 | "url": "tests/data/datapackage.json" 6 | } 7 | -- 8 | { 9 | "name": "test", 10 | "resources": [] 11 | } 12 | -- 13 | -- 14 | { 15 | "name": "test", 16 | "resources": [ 17 | { 18 | "dpp:streamedFrom": "%(base)s/tests/data/sample.csv", 19 | "name": "my-spiffy-resource", 20 | "dpp:streaming": true, 21 | "profile": "data-resource", 22 | "path": "sample.csv", 23 | "schema": { 24 | "fields": [ 25 | {"name": "first_name", "type": "string"}, 26 | {"name": "last_name", "type": "string"}, 27 | {"name": "house", "type": "string"}, 28 | {"name": "age", "type": "integer"} 29 | ], 30 | "primaryKey": [ 31 | "first_name", "last_name" 32 | ] 33 | } 34 | } 35 | ] 36 | } 37 | -- 38 | {"first_name": "Tyrion", "house": "Lannister", "last_name": "Lannister", "age": 27} 39 | {"first_name": "Jaime", "house": "Lannister", "last_name": "Lannister", "age": 34} 40 | {"first_name": "Cersei", "house": "Lannister", "last_name": "Lannister", "age": 34} 41 | {"first_name": "Jon", "house": "Stark", "last_name": "Snow", "age": 17} 42 | {"first_name": "Sansa", "house": "Stark", "last_name": "Stark", "age": 14} 43 | {"first_name": "Arya", "house": "Stark", "last_name": "Stark", "age": 11} 44 | {"first_name": "Bran", "house": "Stark", "last_name": "Stark", "age": 10} 45 | {"first_name": "Rickon", "house": "Stark", "last_name": "Stark", "age": 5} 46 | {"first_name": "Daenerys", "house": "Targaryen", "last_name": "Targaryen", "age": 16} 47 | 48 | {} 49 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_load_resource_limit_rows: -------------------------------------------------------------------------------- 1 | load_resource 2 | -- 3 | { 4 | "resource": "my-spiffy-resource", 5 | "url": "tests/data/datapackage.json", 6 | "limit-rows": 5 7 | } 8 | -- 9 | { 10 | "name": "test", 11 | "resources": [] 12 | } 13 | -- 14 | -- 15 | { 16 | "name": "test", 17 | "resources": [ 18 | { 19 | "dpp:streamedFrom": "%(base)s/tests/data/sample.csv", 20 | "name": "my-spiffy-resource", 21 | "dpp:streaming": true, 22 | "profile": "data-resource", 23 | "path": "sample.csv", 24 | "schema": { 25 | "fields": [ 26 | {"name": "first_name", "type": "string"}, 27 | {"name": "last_name", "type": "string"}, 28 | {"name": "house", "type": "string"}, 29 | {"name": "age", "type": "integer"} 30 | ], 31 | "primaryKey": [ 32 | "first_name", "last_name" 33 | ] 34 | } 35 | } 36 | ] 37 | } 38 | -- 39 | {"first_name": "Tyrion", "house": "Lannister", "last_name": "Lannister", "age": 27} 40 | {"first_name": "Jaime", "house": "Lannister", "last_name": "Lannister", "age": 34} 41 | {"first_name": "Cersei", "house": "Lannister", "last_name": "Lannister", "age": 34} 42 | {"first_name": "Jon", "house": "Stark", "last_name": "Snow", "age": 17} 43 | {"first_name": "Sansa", "house": "Stark", "last_name": "Stark", "age": 14} 44 | 45 | {} 46 | 
-------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_load_resource_list: -------------------------------------------------------------------------------- 1 | load_resource 2 | -- 3 | { 4 | "resource": ["my-spiffy-resource", "the-spiffy-resource"], 5 | "url": "tests/data/datapackage2.json" 6 | } 7 | -- 8 | { 9 | "name": "test", 10 | "resources": [] 11 | } 12 | -- 13 | -- 14 | { 15 | "name": "test", 16 | "resources": [ 17 | { 18 | "dpp:streamedFrom": "%(base)s/tests/data/sample.csv", 19 | "name": "my-spiffy-resource", 20 | "dpp:streaming": true, 21 | "profile": "data-resource", 22 | "path": "sample.csv", 23 | "schema": { 24 | "fields": [ 25 | {"name": "first_name", "type": "string"}, 26 | {"name": "last_name", "type": "string"}, 27 | {"name": "house", "type": "string"}, 28 | {"name": "age", "type": "integer"} 29 | ] 30 | } 31 | }, 32 | { 33 | "dpp:streamedFrom": "%(base)s/tests/data/sample.csv", 34 | "name": "the-spiffy-resource", 35 | "dpp:streaming": true, 36 | "profile": "data-resource", 37 | "path": "sample.csv", 38 | "schema": { 39 | "fields": [ 40 | {"name": "first_name", "type": "string"}, 41 | {"name": "last_name", "type": "string"}, 42 | {"name": "house", "type": "string"}, 43 | {"name": "age", "type": "integer"} 44 | ] 45 | } 46 | } 47 | ] 48 | } 49 | -- 50 | {"first_name": "Tyrion", "house": "Lannister", "last_name": "Lannister", "age": 27} 51 | {"first_name": "Jaime", "house": "Lannister", "last_name": "Lannister", "age": 34} 52 | {"first_name": "Cersei", "house": "Lannister", "last_name": "Lannister", "age": 34} 53 | {"first_name": "Jon", "house": "Stark", "last_name": "Snow", "age": 17} 54 | {"first_name": "Sansa", "house": "Stark", "last_name": "Stark", "age": 14} 55 | {"first_name": "Arya", "house": "Stark", "last_name": "Stark", "age": 11} 56 | {"first_name": "Bran", "house": "Stark", "last_name": "Stark", "age": 10} 57 | {"first_name": "Rickon", "house": "Stark", "last_name": "Stark", "age": 5} 58 | {"first_name": "Daenerys", "house": "Targaryen", "last_name": "Targaryen", "age": 16} 59 | 60 | {"first_name": "Tyrion", "house": "Lannister", "last_name": "Lannister", "age": 27} 61 | {"first_name": "Jaime", "house": "Lannister", "last_name": "Lannister", "age": 34} 62 | {"first_name": "Cersei", "house": "Lannister", "last_name": "Lannister", "age": 34} 63 | {"first_name": "Jon", "house": "Stark", "last_name": "Snow", "age": 17} 64 | {"first_name": "Sansa", "house": "Stark", "last_name": "Stark", "age": 14} 65 | {"first_name": "Arya", "house": "Stark", "last_name": "Stark", "age": 11} 66 | {"first_name": "Bran", "house": "Stark", "last_name": "Stark", "age": 10} 67 | {"first_name": "Rickon", "house": "Stark", "last_name": "Stark", "age": 5} 68 | {"first_name": "Daenerys", "house": "Targaryen", "last_name": "Targaryen", "age": 16} 69 | 70 | {} 71 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_load_resource_multi: -------------------------------------------------------------------------------- 1 | load_resource 2 | -- 3 | { 4 | "resource": "t.+", 5 | "url": "tests/data/datapackage2.json" 6 | } 7 | -- 8 | { 9 | "name": "test", 10 | "resources": [] 11 | } 12 | -- 13 | -- 14 | { 15 | "name": "test", 16 | "resources": [ 17 | { 18 | "dpp:streamedFrom": "%(base)s/tests/data/sample.csv", 19 | "name": "the-spiffy-resource", 20 | "dpp:streaming": true, 21 | "profile": "data-resource", 22 | "path": "sample.csv", 23 | "schema": { 24 | "fields": [ 25 | {"name": 
"first_name", "type": "string"}, 26 | {"name": "last_name", "type": "string"}, 27 | {"name": "house", "type": "string"}, 28 | {"name": "age", "type": "integer"} 29 | ] 30 | } 31 | }, 32 | { 33 | "dpp:streamedFrom": "%(base)s/tests/data/sample2.csv", 34 | "name": "the-other-spiffy-resource", 35 | "dpp:streaming": true, 36 | "profile": "data-resource", 37 | "path": "sample2.csv", 38 | "schema": { 39 | "fields": [ 40 | {"name": "first_name", "type": "string"}, 41 | {"name": "last_name", "type": "string"}, 42 | {"name": "house", "type": "string"}, 43 | {"name": "age", "type": "integer"} 44 | ] 45 | } 46 | } 47 | ] 48 | } 49 | -- 50 | {"first_name": "Tyrion", "house": "Lannister", "last_name": "Lannister", "age": 27} 51 | {"first_name": "Jaime", "house": "Lannister", "last_name": "Lannister", "age": 34} 52 | {"first_name": "Cersei", "house": "Lannister", "last_name": "Lannister", "age": 34} 53 | {"first_name": "Jon", "house": "Stark", "last_name": "Snow", "age": 17} 54 | {"first_name": "Sansa", "house": "Stark", "last_name": "Stark", "age": 14} 55 | {"first_name": "Arya", "house": "Stark", "last_name": "Stark", "age": 11} 56 | {"first_name": "Bran", "house": "Stark", "last_name": "Stark", "age": 10} 57 | {"first_name": "Rickon", "house": "Stark", "last_name": "Stark", "age": 5} 58 | {"first_name": "Daenerys", "house": "Targaryen", "last_name": "Targaryen", "age": 16} 59 | 60 | {"first_name": "Tyrion", "house": "Lannister", "last_name": "Lannister", "age": 27} 61 | {"first_name": "Jaime", "house": "Lannister", "last_name": "Lannister", "age": 34} 62 | {"first_name": "Cersei", "house": "Lannister", "last_name": "Lannister", "age": 34} 63 | 64 | {} 65 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_load_resource_required: -------------------------------------------------------------------------------- 1 | load_resource 2 | -- 3 | { 4 | "resource": "foobar", 5 | "url": "foo/bar/datapackage.json", 6 | "required": false 7 | } 8 | -- 9 | { 10 | "name": "test", 11 | "resources": [] 12 | } 13 | -- 14 | -- 15 | { 16 | "name": "test", 17 | "resources": [] 18 | } 19 | -- 20 | {} 21 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_load_resource_resources: -------------------------------------------------------------------------------- 1 | load_resource 2 | -- 3 | { 4 | "resources": { 5 | "my-spiffy-resource": {}, 6 | "the-spiffy-resource": { 7 | "name": "renamed-spiffy-resource", 8 | "path": "renamed-spiffy-resource.csv" 9 | } 10 | }, 11 | "url": "tests/data/datapackage2.json" 12 | } 13 | -- 14 | { 15 | "name": "test", 16 | "resources": [] 17 | } 18 | -- 19 | -- 20 | { 21 | "name": "test", 22 | "resources": [ 23 | { 24 | "dpp:streamedFrom": "%(base)s/tests/data/sample.csv", 25 | "name": "my-spiffy-resource", 26 | "dpp:streaming": true, 27 | "profile": "data-resource", 28 | "path": "sample.csv", 29 | "schema": { 30 | "fields": [ 31 | {"name": "first_name", "type": "string"}, 32 | {"name": "last_name", "type": "string"}, 33 | {"name": "house", "type": "string"}, 34 | {"name": "age", "type": "integer"} 35 | ] 36 | } 37 | }, 38 | { 39 | "dpp:streamedFrom": "%(base)s/tests/data/sample.csv", 40 | "name": "renamed-spiffy-resource", 41 | "dpp:streaming": true, 42 | "profile": "data-resource", 43 | "path": "renamed-spiffy-resource.csv", 44 | "schema": { 45 | "fields": [ 46 | {"name": "first_name", "type": "string"}, 47 | {"name": "last_name", "type": "string"}, 48 | {"name": "house", 
"type": "string"}, 49 | {"name": "age", "type": "integer"} 50 | ] 51 | } 52 | } 53 | ] 54 | } 55 | -- 56 | {"first_name": "Tyrion", "house": "Lannister", "last_name": "Lannister", "age": 27} 57 | {"first_name": "Jaime", "house": "Lannister", "last_name": "Lannister", "age": 34} 58 | {"first_name": "Cersei", "house": "Lannister", "last_name": "Lannister", "age": 34} 59 | {"first_name": "Jon", "house": "Stark", "last_name": "Snow", "age": 17} 60 | {"first_name": "Sansa", "house": "Stark", "last_name": "Stark", "age": 14} 61 | {"first_name": "Arya", "house": "Stark", "last_name": "Stark", "age": 11} 62 | {"first_name": "Bran", "house": "Stark", "last_name": "Stark", "age": 10} 63 | {"first_name": "Rickon", "house": "Stark", "last_name": "Stark", "age": 5} 64 | {"first_name": "Daenerys", "house": "Targaryen", "last_name": "Targaryen", "age": 16} 65 | 66 | {"first_name": "Tyrion", "house": "Lannister", "last_name": "Lannister", "age": 27} 67 | {"first_name": "Jaime", "house": "Lannister", "last_name": "Lannister", "age": 34} 68 | {"first_name": "Cersei", "house": "Lannister", "last_name": "Lannister", "age": 34} 69 | {"first_name": "Jon", "house": "Stark", "last_name": "Snow", "age": 17} 70 | {"first_name": "Sansa", "house": "Stark", "last_name": "Stark", "age": 14} 71 | {"first_name": "Arya", "house": "Stark", "last_name": "Stark", "age": 11} 72 | {"first_name": "Bran", "house": "Stark", "last_name": "Stark", "age": 10} 73 | {"first_name": "Rickon", "house": "Stark", "last_name": "Stark", "age": 5} 74 | {"first_name": "Daenerys", "house": "Targaryen", "last_name": "Targaryen", "age": 16} 75 | 76 | {} 77 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_load_resource_resources_required: -------------------------------------------------------------------------------- 1 | load_resource 2 | -- 3 | { 4 | "resources": { 5 | "my-spiffy-resource": {}, 6 | "nonexistent-spiffy-resource": { 7 | "name": "renamed-spiffy-resource", 8 | "path": "renamed-spiffy-resource.csv" 9 | } 10 | }, 11 | "url": "tests/data/datapackage2.json", 12 | "required": false 13 | } 14 | -- 15 | { 16 | "name": "test", 17 | "resources": [] 18 | } 19 | -- 20 | -- 21 | { 22 | "name": "test", 23 | "resources": [ 24 | { 25 | "dpp:streamedFrom": "%(base)s/tests/data/sample.csv", 26 | "name": "my-spiffy-resource", 27 | "dpp:streaming": true, 28 | "profile": "data-resource", 29 | "path": "sample.csv", 30 | "schema": { 31 | "fields": [ 32 | {"name": "first_name", "type": "string"}, 33 | {"name": "last_name", "type": "string"}, 34 | {"name": "house", "type": "string"}, 35 | {"name": "age", "type": "integer"} 36 | ] 37 | } 38 | } 39 | ] 40 | } 41 | -- 42 | {"first_name": "Tyrion", "house": "Lannister", "last_name": "Lannister", "age": 27} 43 | {"first_name": "Jaime", "house": "Lannister", "last_name": "Lannister", "age": 34} 44 | {"first_name": "Cersei", "house": "Lannister", "last_name": "Lannister", "age": 34} 45 | {"first_name": "Jon", "house": "Stark", "last_name": "Snow", "age": 17} 46 | {"first_name": "Sansa", "house": "Stark", "last_name": "Stark", "age": 14} 47 | {"first_name": "Arya", "house": "Stark", "last_name": "Stark", "age": 11} 48 | {"first_name": "Bran", "house": "Stark", "last_name": "Stark", "age": 10} 49 | {"first_name": "Rickon", "house": "Stark", "last_name": "Stark", "age": 5} 50 | {"first_name": "Daenerys", "house": "Targaryen", "last_name": "Targaryen", "age": 16} 51 | 52 | {} 53 | 
-------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_resource_duplication: -------------------------------------------------------------------------------- 1 | duplicate 2 | -- 3 | { 4 | "source": "original", 5 | "target-name": "the-dup", 6 | "target-path": "the-dup.csv" 7 | } 8 | -- 9 | { 10 | "name": "test", 11 | "resources": [ 12 | { 13 | "name": "original", 14 | "dpp:streaming": true, 15 | "path": "data.csv", 16 | "schema": { "fields": [ 17 | {"name": "year", "type": "integer"}, 18 | {"name": "data", "type": "string"} 19 | ]} 20 | } 21 | ] 22 | } 23 | -- 24 | {"year":"2016","data":"foo","i":0} 25 | {"year":"2017","data":"baz","i":1} 26 | {"year":"2017","data":"bax","i":2} 27 | {"year":"2015","data":"","i":3} 28 | {"year":"2015","data":"","i":4} 29 | {"year":"2015","data":"","i":5} 30 | {"year":"2015","data":"","i":6} 31 | {"year":"2015","data":"","i":7} 32 | {"year":"2015","data":"","i":8} 33 | {"year":"2015","data":"","i":9} 34 | {"year":"2015","data":"","i":10} 35 | {"year":"2015","data":"","i":11} 36 | -- 37 | { 38 | "name": "test", 39 | "profile": "data-package", 40 | "resources": [ 41 | { 42 | "name": "original", 43 | "dpp:streaming": true, 44 | "path": "data.csv", 45 | "profile": "data-resource", 46 | "schema": { "fields": [ 47 | {"name": "year", "type": "integer"}, 48 | {"name": "data", "type": "string"} 49 | ]} 50 | }, 51 | { 52 | "name": "the-dup", 53 | "dpp:streaming": true, 54 | "path": "the-dup.csv", 55 | "profile": "data-resource", 56 | "schema": { "fields": [ 57 | {"name": "year", "type": "integer"}, 58 | {"name": "data", "type": "string"} 59 | ]} 60 | } 61 | ] 62 | } 63 | -- 64 | {"year":"2016","data":"foo","i":0} 65 | {"year":"2017","data":"baz","i":1} 66 | {"year":"2017","data":"bax","i":2} 67 | {"year":"2015","data":"","i":3} 68 | {"year":"2015","data":"","i":4} 69 | {"year":"2015","data":"","i":5} 70 | {"year":"2015","data":"","i":6} 71 | {"year":"2015","data":"","i":7} 72 | {"year":"2015","data":"","i":8} 73 | {"year":"2015","data":"","i":9} 74 | {"year":"2015","data":"","i":10} 75 | {"year":"2015","data":"","i":11} 76 | 77 | {"year":"2016","data":"foo","i":0} 78 | {"year":"2017","data":"baz","i":1} 79 | {"year":"2017","data":"bax","i":2} 80 | {"year":"2015","data":"","i":3} 81 | {"year":"2015","data":"","i":4} 82 | {"year":"2015","data":"","i":5} 83 | {"year":"2015","data":"","i":6} 84 | {"year":"2015","data":"","i":7} 85 | {"year":"2015","data":"","i":8} 86 | {"year":"2015","data":"","i":9} 87 | {"year":"2015","data":"","i":10} 88 | {"year":"2015","data":"","i":11} 89 | 90 | {} 91 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_set_types: -------------------------------------------------------------------------------- 1 | set_types 2 | -- 3 | { 4 | "types": { 5 | "t1": {"type": "number", "groupChar": ","}, 6 | "t2": null 7 | } 8 | } 9 | -- 10 | { 11 | "name": "test", 12 | "resources": [ 13 | { 14 | "name": "concat-a", 15 | "dpp:streaming": true, 16 | "path": "concat-a.csv", 17 | "schema": { "fields": [ 18 | {"name": "t1", "type": "string"}, 19 | {"name": "t2", "type": "string"} 20 | ]} 21 | } 22 | ] 23 | } 24 | -- 25 | {"t1": "123,456", "t2": "to-remove"} 26 | {"t1": "456,123", "t2": "to-remove"} 27 | -- 28 | { 29 | "name": "test", 30 | "profile": "data-package", 31 | "resources": [ 32 | { 33 | "name": "concat-a", 34 | "dpp:streaming": true, 35 | "path": "concat-a.csv", 36 | "profile": "data-resource", 37 | "schema": { "fields": [ 38 | 
{"name": "t1", "type": "number", "groupChar": ","} 39 | ]} 40 | } 41 | ] 42 | } 43 | -- 44 | {"t1": {"type{decimal}": "123456"}} 45 | {"t1": {"type{decimal}": "456123"}} 46 | 47 | {} 48 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_sort: -------------------------------------------------------------------------------- 1 | sort 2 | -- 3 | { 4 | "resources": ["concat-a1", "concat-a2"], 5 | "sort-by": "{a3} {a2} {a1}" 6 | } 7 | -- 8 | { 9 | "name": "test", 10 | "resources": [ 11 | { 12 | "name": "concat-a1", 13 | "dpp:streaming": true, 14 | "path": "concat-a1.csv", 15 | "schema": { "fields": [ 16 | {"name": "a1", "type": "string"}, 17 | {"name": "a2", "type": "string"}, 18 | {"name": "a3", "type": "string"} 19 | ]} 20 | }, 21 | { 22 | "name": "concat-a2", 23 | "dpp:streaming": true, 24 | "path": "concat-a2.csv", 25 | "schema": { "fields": [ 26 | {"name": "a1", "type": "string"}, 27 | {"name": "a2", "type": "string"}, 28 | {"name": "a3", "type": "string"} 29 | ]} 30 | }, 31 | { 32 | "name": "concat-c", 33 | "dpp:streaming": true, 34 | "path": "concat-c.csv", 35 | "schema": { "fields": [ 36 | {"name": "c1", "type": "string"}, 37 | {"name": "c2", "type": "string"}, 38 | {"name": "c3", "type": "string"} 39 | ]} 40 | } 41 | ] 42 | } 43 | -- 44 | {"a1":"a1","a2":"a1","a3":"a2"} 45 | {"a1":"a2","a2":"a1","a3":"a1"} 46 | {"a1":"a3","a2":"a2","a3":"a2"} 47 | {"a1":"a4","a2":"a2","a3":"a1"} 48 | 49 | {"a1":"a1","a2":"a3","a3":"a2"} 50 | {"a1":"a2","a2":"a3","a3":"a1"} 51 | {"a1":"a3","a2":"a4","a3":"a2"} 52 | {"a1":"a4","a2":"a4","a3":"a1"} 53 | 54 | {"c1":"c13","c2":"c23","c3":"c33"} 55 | {"c1":"c12","c2":"c22","c3":"c32"} 56 | {"c1":"c11","c2":"c21","c3":"c31"} 57 | -- 58 | { 59 | "name": "test", 60 | "profile": "data-package", 61 | "resources": [ 62 | { 63 | "name": "concat-a1", 64 | "dpp:streaming": true, 65 | "path": "concat-a1.csv", 66 | "profile": "data-resource", 67 | "schema": { "fields": [ 68 | {"name": "a1", "type": "string"}, 69 | {"name": "a2", "type": "string"}, 70 | {"name": "a3", "type": "string"} 71 | ]} 72 | }, 73 | { 74 | "name": "concat-a2", 75 | "dpp:streaming": true, 76 | "path": "concat-a2.csv", 77 | "profile": "data-resource", 78 | "schema": { "fields": [ 79 | {"name": "a1", "type": "string"}, 80 | {"name": "a2", "type": "string"}, 81 | {"name": "a3", "type": "string"} 82 | ]} 83 | }, 84 | { 85 | "name": "concat-c", 86 | "dpp:streaming": true, 87 | "path": "concat-c.csv", 88 | "profile": "data-resource", 89 | "schema": { "fields": [ 90 | {"name": "c1", "type": "string"}, 91 | {"name": "c2", "type": "string"}, 92 | {"name": "c3", "type": "string"} 93 | ]} 94 | } 95 | ] 96 | } 97 | -- 98 | {"a1":"a2","a2":"a1","a3":"a1"} 99 | {"a1":"a4","a2":"a2","a3":"a1"} 100 | {"a1":"a1","a2":"a1","a3":"a2"} 101 | {"a1":"a3","a2":"a2","a3":"a2"} 102 | 103 | {"a1":"a2","a2":"a3","a3":"a1"} 104 | {"a1":"a4","a2":"a4","a3":"a1"} 105 | {"a1":"a1","a2":"a3","a3":"a2"} 106 | {"a1":"a3","a2":"a4","a3":"a2"} 107 | 108 | {"c1":"c13","c2":"c23","c3":"c33"} 109 | {"c1":"c12","c2":"c22","c3":"c32"} 110 | {"c1":"c11","c2":"c21","c3":"c31"} 111 | 112 | {} 113 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_stream_remote_resources: -------------------------------------------------------------------------------- 1 | stream_remote_resources 2 | -- 3 | { 4 | } 5 | -- 6 | { 7 | "name": "test", 8 | "resources": [ 9 | { 10 | "name": "my-remote-resource", 11 | "dpp:streamedFrom": 
"file://tests/data/sample.csv", 12 | "path": "_" 13 | } 14 | ] 15 | } 16 | -- 17 | -- 18 | { 19 | "name": "test", 20 | "resources": [ 21 | { 22 | "name": "my-remote-resource", 23 | "path": "data/my-remote-resource.csv", 24 | "dpp:streamedFrom": "file://tests/data/sample.csv", 25 | "dpp:streaming": true, 26 | "schema": { 27 | "fields": [ 28 | {"name": "first_name", "type": "string"}, 29 | {"name": "last_name", "type": "string"}, 30 | {"name": "house", "type": "string"}, 31 | {"name": "age", "type": "string"} 32 | ] 33 | } 34 | } 35 | ] 36 | } 37 | -- 38 | {"first_name": "Tyrion", "house": "Lannister", "last_name": "Lannister", "age": "27"} 39 | {"first_name": "Jaime", "house": "Lannister", "last_name": "Lannister", "age": "34"} 40 | {"first_name": "Cersei", "house": "Lannister", "last_name": "Lannister", "age": "34"} 41 | {"first_name": "Jon", "house": "Stark", "last_name": "Snow", "age": "17"} 42 | {"first_name": "Sansa", "house": "Stark", "last_name": "Stark", "age": "14"} 43 | {"first_name": "Arya", "house": "Stark", "last_name": "Stark", "age": "11"} 44 | {"first_name": "Bran", "house": "Stark", "last_name": "Stark", "age": "10"} 45 | {"first_name": "Rickon", "house": "Stark", "last_name": "Stark", "age": "5"} 46 | {"first_name": "Daenerys", "house": "Targaryen", "last_name": "Targaryen", "age": "16"} 47 | 48 | {} 49 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_stream_remote_resources_limit_rows: -------------------------------------------------------------------------------- 1 | stream_remote_resources 2 | -- 3 | { 4 | "limit-rows": 5 5 | } 6 | -- 7 | { 8 | "name": "test", 9 | "resources": [ 10 | { 11 | "name": "my-remote-resource", 12 | "dpp:streamedFrom": "file://tests/data/sample.csv", 13 | "path": "_" 14 | } 15 | ] 16 | } 17 | -- 18 | -- 19 | { 20 | "name": "test", 21 | "resources": [ 22 | { 23 | "name": "my-remote-resource", 24 | "path": "data/my-remote-resource.csv", 25 | "dpp:streamedFrom": "file://tests/data/sample.csv", 26 | "dpp:streaming": true, 27 | "schema": { 28 | "fields": [ 29 | {"name": "first_name", "type": "string"}, 30 | {"name": "last_name", "type": "string"}, 31 | {"name": "house", "type": "string"}, 32 | {"name": "age", "type": "string"} 33 | ] 34 | } 35 | } 36 | ] 37 | } 38 | -- 39 | {"first_name": "Tyrion", "house": "Lannister", "last_name": "Lannister", "age": "27"} 40 | {"first_name": "Jaime", "house": "Lannister", "last_name": "Lannister", "age": "34"} 41 | {"first_name": "Cersei", "house": "Lannister", "last_name": "Lannister", "age": "34"} 42 | {"first_name": "Jon", "house": "Stark", "last_name": "Snow", "age": "17"} 43 | {"first_name": "Sansa", "house": "Stark", "last_name": "Stark", "age": "14"} 44 | 45 | {} 46 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_stream_remote_resources_zip: -------------------------------------------------------------------------------- 1 | stream_remote_resources 2 | -- 3 | { 4 | } 5 | -- 6 | { 7 | "name": "test", 8 | "resources": [ 9 | { 10 | "name": "my-remote-resource", 11 | "dpp:streamedFrom": "file://tests/data/sample.zip", 12 | "path": "_", 13 | "compression": "zip", 14 | "format": "csv" 15 | } 16 | ] 17 | } 18 | -- 19 | -- 20 | { 21 | "name": "test", 22 | "resources": [ 23 | { 24 | "name": "my-remote-resource", 25 | "path": "data/my-remote-resource.csv", 26 | "dpp:streamedFrom": "file://tests/data/sample.zip", 27 | "dpp:streaming": true, 28 | "compression": "zip", 29 | "format": "csv", 
30 | "schema": { 31 | "fields": [ 32 | {"name": "first_name", "type": "string"}, 33 | {"name": "last_name", "type": "string"}, 34 | {"name": "house", "type": "string"}, 35 | {"name": "age", "type": "string"} 36 | ] 37 | } 38 | } 39 | ] 40 | } 41 | -- 42 | {"first_name": "Tyrion", "house": "Lannister", "last_name": "Lannister", "age": "27"} 43 | {"first_name": "Jaime", "house": "Lannister", "last_name": "Lannister", "age": "34"} 44 | {"first_name": "Cersei", "house": "Lannister", "last_name": "Lannister", "age": "34"} 45 | {"first_name": "Jon", "house": "Stark", "last_name": "Snow", "age": "17"} 46 | {"first_name": "Sansa", "house": "Stark", "last_name": "Stark", "age": "14"} 47 | {"first_name": "Arya", "house": "Stark", "last_name": "Stark", "age": "11"} 48 | {"first_name": "Bran", "house": "Stark", "last_name": "Stark", "age": "10"} 49 | {"first_name": "Rickon", "house": "Stark", "last_name": "Stark", "age": "5"} 50 | {"first_name": "Daenerys", "house": "Targaryen", "last_name": "Targaryen", "age": "16"} 51 | 52 | {} 53 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_unpivot: -------------------------------------------------------------------------------- 1 | unpivot 2 | -- 3 | { 4 | "resources": "balance", 5 | "extraKeyFields": [ 6 | {"name": "year", "type": "integer"}, 7 | {"name": "direction", "type": "string", "constraints": {"enum": ["In", "Out"]}} 8 | ], 9 | "extraValueField": { 10 | "name": "amount", 11 | "type": "number" 12 | }, 13 | "unpivot": [ 14 | { 15 | "name": "2015_incomes", 16 | "keys": { 17 | "year": 2015, 18 | "direction": "In" 19 | } 20 | }, 21 | { 22 | "name": "2015_expenses", 23 | "keys": { 24 | "year": 2015, 25 | "direction": "Out" 26 | } 27 | }, 28 | { 29 | "name": "2016_incomes", 30 | "keys": { 31 | "year": 2016, 32 | "direction": "In" 33 | } 34 | }, 35 | { 36 | "name": "2016_expenses", 37 | "keys": { 38 | "year": 2016, 39 | "direction": "Out" 40 | } 41 | }, 42 | { 43 | "name": "([0-9]{4}) (\\w+)", 44 | "keys": { 45 | "year": "\\1", 46 | "direction": "\\2" 47 | } 48 | } 49 | ] 50 | } 51 | -- 52 | { 53 | "name": "test", 54 | "resources": [ 55 | { 56 | "name": "balance", 57 | "dpp:streaming": true, 58 | "path": "balance.csv", 59 | "schema": { "fields": [ 60 | {"name": "company", "type": "string"}, 61 | {"name": "2015_incomes", "type": "number"}, 62 | {"name": "2015_expenses", "type": "number"}, 63 | {"name": "2016_incomes", "type": "number"}, 64 | {"name": "2016_expenses", "type": "number"}, 65 | {"name": "2017 In", "type": "number"}, 66 | {"name": "2017 Out", "type": "number"} 67 | ]} 68 | } 69 | ] 70 | } 71 | -- 72 | {"company": "his-company", "2015_incomes": 100000, "2015_expenses": 80000, "2016_incomes": 150000, "2016_expenses": 120000, "2017 In": 100000, "2017 Out": 120000} 73 | {"company": "her-company", "2015_incomes": 150000, "2015_expenses": 160000, "2016_incomes": 300000, "2016_expenses": 200000, "2017 In": 100000, "2017 Out": 120000} 74 | -- 75 | { 76 | "name": "test", 77 | "profile": "data-package", 78 | "resources": [ 79 | { 80 | "name": "balance", 81 | "dpp:streaming": true, 82 | "path": "balance.csv", 83 | "profile": "data-resource", 84 | "schema": { "fields": [ 85 | {"name": "company", "type": "string"}, 86 | {"name": "year", "type": "integer"}, 87 | {"name": "direction", "type": "string", "constraints": {"enum": ["In", "Out"]}}, 88 | {"name": "amount", "type": "number"} 89 | ]} 90 | } 91 | ] 92 | } 93 | -- 94 | {"company": "his-company", "year": 2015, "direction": "In", "amount": 
100000} 95 | {"company": "his-company", "year": 2015, "direction": "Out", "amount": 80000} 96 | {"company": "his-company", "year": 2016, "direction": "In", "amount": 150000} 97 | {"company": "his-company", "year": 2016, "direction": "Out", "amount": 120000} 98 | {"company": "his-company", "year": "2017", "direction": "In", "amount": 100000} 99 | {"company": "his-company", "year": "2017", "direction": "Out", "amount": 120000} 100 | {"company": "her-company", "year": 2015, "direction": "In", "amount": 150000} 101 | {"company": "her-company", "year": 2015, "direction": "Out", "amount": 160000} 102 | {"company": "her-company", "year": 2016, "direction": "In", "amount": 300000} 103 | {"company": "her-company", "year": 2016, "direction": "Out", "amount": 200000} 104 | {"company": "her-company", "year": "2017", "direction": "In", "amount": 100000} 105 | {"company": "her-company", "year": "2017", "direction": "Out", "amount": 120000} 106 | 107 | {} 108 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_update_package: -------------------------------------------------------------------------------- 1 | update_package 2 | -- 3 | { 4 | "title": "Moshe", 5 | "sources": { 6 | "web": "http://google.com" 7 | } 8 | } 9 | -- 10 | { 11 | "name": "test", 12 | "resources": [] 13 | } 14 | -- 15 | -- 16 | { 17 | "name": "test", 18 | "title": "Moshe", 19 | "profile": "data-package", 20 | "sources": { 21 | "web": "http://google.com" 22 | }, 23 | "resources": [] 24 | } 25 | -- 26 | {} 27 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_update_resource: -------------------------------------------------------------------------------- 1 | update_resource 2 | -- 3 | { 4 | "resources": ["name1"], 5 | "metadata": { 6 | "path": "path1-new" 7 | } 8 | } 9 | -- 10 | { 11 | "title": "Test", 12 | "resources": [ 13 | {"name": "name1", "path": "path1"}, 14 | {"name": "name2", "path": "path2"} 15 | ] 16 | } 17 | -- 18 | -- 19 | { 20 | "title": "Test", 21 | "resources": [ 22 | {"name": "name1", "path": "path1-new", "profile": "data-resource"}, 23 | {"name": "name2", "path": "path2", "profile": "data-resource"} 24 | ], 25 | "profile": "data-package" 26 | } 27 | -- 28 | {} 29 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/sort_with_duplicate_keys: -------------------------------------------------------------------------------- 1 | sort 2 | -- 3 | { 4 | "resources": ["data"], 5 | "sort-by": "{year}" 6 | } 7 | -- 8 | { 9 | "name": "test", 10 | "resources": [ 11 | { 12 | "name": "data", 13 | "dpp:streaming": true, 14 | "path": "data.csv", 15 | "schema": { "fields": [ 16 | {"name": "year", "type": "integer"}, 17 | {"name": "data", "type": "string"} 18 | ]} 19 | } 20 | ] 21 | } 22 | -- 23 | {"year":"2016","data":"foo","i":0} 24 | {"year":"2017","data":"baz","i":1} 25 | {"year":"2017","data":"bax","i":2} 26 | {"year":"2015","data":"","i":3} 27 | {"year":"2015","data":"","i":4} 28 | {"year":"2015","data":"","i":5} 29 | {"year":"2015","data":"","i":6} 30 | {"year":"2015","data":"","i":7} 31 | {"year":"2015","data":"","i":8} 32 | {"year":"2015","data":"","i":9} 33 | {"year":"2015","data":"","i":10} 34 | {"year":"2015","data":"","i":11} 35 | -- 36 | { 37 | "name": "test", 38 | "profile": "data-package", 39 | "resources": [ 40 | { 41 | "name": "data", 42 | "dpp:streaming": true, 43 | "path": "data.csv", 44 | "profile": "data-resource", 45 | "schema": { 
"fields": [ 46 | {"name": "year", "type": "integer"}, 47 | {"name": "data", "type": "string"} 48 | ]} 49 | } 50 | ] 51 | } 52 | -- 53 | {"year":"2015","data":"","i":3} 54 | {"year":"2015","data":"","i":4} 55 | {"year":"2015","data":"","i":5} 56 | {"year":"2015","data":"","i":6} 57 | {"year":"2015","data":"","i":7} 58 | {"year":"2015","data":"","i":8} 59 | {"year":"2015","data":"","i":9} 60 | {"year":"2015","data":"","i":10} 61 | {"year":"2015","data":"","i":11} 62 | {"year":"2016","data":"foo","i":0} 63 | {"year":"2017","data":"baz","i":1} 64 | {"year":"2017","data":"bax","i":2} 65 | 66 | {} 67 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/stream_remote_resources_txt_format: -------------------------------------------------------------------------------- 1 | stream_remote_resources 2 | -- 3 | {} 4 | -- 5 | { 6 | "name": "test-stream-remote-resources-txt-format", 7 | "resources": [ 8 | { 9 | "name": "my-remote-txt-format-resource", 10 | "dpp:streamedFrom": "file://tests/data/sample.txt", 11 | "path": "_", 12 | "format": "txt" 13 | } 14 | ] 15 | } 16 | -- 17 | -- 18 | { 19 | "name": "test-stream-remote-resources-txt-format", 20 | "resources": [ 21 | { 22 | "name": "my-remote-txt-format-resource", 23 | "dpp:streamedFrom": "file://tests/data/sample.txt", 24 | "dpp:streaming": true, 25 | "path": "data/my-remote-txt-format-resource.csv", 26 | "format": "txt", 27 | "schema": { 28 | "fields": [ 29 | {"name": "data", "type": "string"} 30 | ] 31 | } 32 | } 33 | ] 34 | } 35 | -- 36 | {"data": "<<< tabulator has html decection, keeping that causes the failure which we want to test"} 37 | {"data": "This is a plain text file - not a CSV file!"} 38 | {"data": "testing"} 39 | {"data": "one two three"} 40 | 41 | {} 42 | -------------------------------------------------------------------------------- /tests/stdlib/test_stdlib.py: -------------------------------------------------------------------------------- 1 | import os, logging 2 | from datapackage_pipelines.utilities.lib_test_helpers import ProcessorFixtureTestsBase 3 | from sqlalchemy.orm import sessionmaker 4 | from sqlalchemy import create_engine, text 5 | 6 | ROOT_PATH = os.path.join(os.path.dirname(__file__), '..', '..') 7 | ENV = os.environ.copy() 8 | ENV['PYTHONPATH'] = ROOT_PATH 9 | 10 | ENV['EXISTENT_ENV'] = 'tests/data/sample.csv' 11 | 12 | DEFAULT_TEST_DB = "sqlite://" 13 | ENV['DPP_DB_ENGINE'] = os.environ.get("OVERRIDE_TEST_DB", DEFAULT_TEST_DB) 14 | 15 | 16 | class StdlibfixtureTests(ProcessorFixtureTestsBase): 17 | 18 | def _get_procesor_env(self, filename): 19 | if ENV['DPP_DB_ENGINE'] != DEFAULT_TEST_DB: 20 | engine = create_engine(ENV['DPP_DB_ENGINE']) 21 | conn = engine.connect() 22 | conn.execute(text("DROP TABLE IF EXISTS test;")) 23 | if filename == "dump_to_sql_update_mode__update": 24 | engine = create_engine(ENV['DPP_DB_ENGINE']) 25 | conn = engine.connect() 26 | conn.execute(text(""" 27 | CREATE TABLE test ( 28 | id integer not null primary key, 29 | mystring text, 30 | mynumber double precision, 31 | mydate date 32 | ) 33 | """)) 34 | conn.execute(text(""" 35 | INSERT INTO test VALUES (1, 'foo', 5.6, null); 36 | """)) 37 | return ENV 38 | 39 | def _get_processor_file(self, processor): 40 | processor = processor.replace('.', '/') 41 | return os.path.join(ROOT_PATH, 'datapackage_pipelines', 'lib', processor.strip() + '.py') 42 | 43 | 44 | for filename, _func in StdlibfixtureTests(os.path.join(os.path.dirname(__file__), 'fixtures')).get_tests(): 45 | 
globals()['test_stdlib_%s' % filename] = _func 46 | 47 | -------------------------------------------------------------------------------- /tests/test_main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import threading 4 | import time 5 | from http.server import HTTPServer, BaseHTTPRequestHandler 6 | 7 | from datapackage_pipelines.manager import execute_pipeline, run_pipelines 8 | from datapackage_pipelines.specs.specs import pipelines 9 | from datapackage_pipelines.utilities.execution_id import gen_execution_id 10 | from datapackage_pipelines.status import status_mgr 11 | 12 | 13 | called_hooks = [] 14 | progresses = 0 15 | status = status_mgr() 16 | 17 | class SaveHooks(BaseHTTPRequestHandler): 18 | 19 | def do_POST(self): 20 | global progresses 21 | content_len = int(self.headers.get('content-length', 0)) 22 | post_body = self.rfile.read(content_len) 23 | hook = json.loads(post_body) 24 | if hook['event'] != 'progress': 25 | called_hooks.append(hook) 26 | else: 27 | progresses += 1 28 | self.send_response(200) 29 | self.end_headers() 30 | return 31 | 32 | 33 | def test_pipeline(): 34 | '''Tests a few pipelines.''' 35 | global progresses 36 | 37 | server = HTTPServer(('', 9000), SaveHooks) 38 | thread = threading.Thread(target = server.serve_forever, daemon=True) 39 | thread.start() 40 | 41 | results = run_pipelines('./tests/env/dummy/pipeline-test%', '.', 42 | use_cache=False, 43 | dirty=False, 44 | force=False, 45 | concurrency=1, 46 | verbose_logs=True) 47 | failed_results = list(filter(lambda r: not r.success, results)) 48 | assert len(failed_results) == 0, "Failed results: {}".format(["{} {}".format(result.pipeline_id, ", ".join(result.errors)) 49 | for result in failed_results]) 50 | assert len(called_hooks) == 3 51 | assert called_hooks == [ 52 | {"pipeline_id": "./tests/env/dummy/pipeline-test-hooks", "event": "queue"}, 53 | {"pipeline_id": "./tests/env/dummy/pipeline-test-hooks", "event": "start"}, 54 | {"pipeline_id": "./tests/env/dummy/pipeline-test-hooks", "event": "finish", "success": True, 55 | 'stats': {'.dpp': {'out-datapackage-url': 'hooks-outputs/datapackage.json'}, 56 | 'bytes': 15787, 'count_of_rows': 40, 57 | 'dataset_name': 'hook-tests', 'hash': '9fc202087094c7becf98228a1327b21c'}} 58 | ] 59 | assert progresses >= 1 -------------------------------------------------------------------------------- /tests/wrapper/test_wrapper.py: -------------------------------------------------------------------------------- 1 | import unittest.mock as mock 2 | from datapackage_pipelines.wrapper import spew 3 | 4 | 5 | class TestWrapper(object): 6 | def test_spew_finalizer_runs_before_we_signal_that_were_done(self): 7 | '''Assert that the finalizer param is executed before spew is finished. 8 | 9 | We signal to other processors that we're done by writing an empty line 10 | to STDOUT. The finalizer parameter to spew() must be executed before that, 11 | as there can be processors that depend on us finishing our processing 12 | before they're able to run. For example, a processor that depends on 13 | `dump_to_zip` must wait until it has finished writing to the local 14 | filesystem. 
15 | ''' 16 | datapackage = {} 17 | resources_iterator = iter([]) 18 | 19 | with mock.patch('datapackage_pipelines.wrapper.wrapper.stdout') as stdout_mock: 20 | def finalizer(): 21 | last_call_args = stdout_mock.write.call_args_list[-1] 22 | assert last_call_args != mock.call('\n') 23 | 24 | spew(datapackage, resources_iterator, finalizer=finalizer) 25 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | package=datapackage_pipelines 3 | skip_missing_interpreters=true 4 | envlist= 5 | py3{7,8,9}-{sqlite,plyvel} 6 | 7 | [testenv] 8 | deps= 9 | mock 10 | pytest 11 | pytest-cov 12 | coverage 13 | pyyaml 14 | py37-plyvel: plyvel 15 | py38-plyvel: plyvel 16 | py39-plyvel: plyvel 17 | passenv= 18 | PWD 19 | CI 20 | TRAVIS 21 | TRAVIS_JOB_ID 22 | TRAVIS_BRANCH 23 | commands= 24 | cp tests/sitecustomize.py {envsitepackagesdir} 25 | py.test -s \ 26 | --cov {[tox]package} \ 27 | --cov-config tox.ini \ 28 | --cov-report term-missing \ 29 | {posargs} 30 | allowlist_externals= 31 | cp 32 | [pytest] 33 | # pytest.ini configuration here 34 | testpaths = tests 35 | 36 | [report] 37 | # .coveragerc configuration here 38 | 39 | [run] 40 | omit= 41 | .tox/* 42 | parallel=True 43 | --------------------------------------------------------------------------------
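For reference, the finalizer behaviour exercised by tests/wrapper/test_wrapper.py above corresponds to the following minimal processor sketch. This is an illustrative, assumed example rather than a file in this repository; it uses only the public ingest()/spew() helpers from datapackage_pipelines.wrapper, and the no-op finalizer shown here is hypothetical.

from datapackage_pipelines.wrapper import ingest, spew

# Read the parameters, datapackage and row iterators handed over by the
# previous processor in the pipeline (delivered over STDIN).
parameters, datapackage, resource_iterator = ingest()

def finalizer():
    # Invoked by spew() after all rows have been written to STDOUT but
    # before the empty line that signals completion, so any side effects
    # (e.g. files written to disk) are finished before dependent
    # processors start running.
    pass

# Stream the (unmodified) datapackage and rows onward; spew() calls the
# finalizer prior to emitting the final completion marker.
spew(datapackage, resource_iterator, finalizer=finalizer)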