├── .dockerignore ├── .github ├── ISSUE_TEMPLATE.md ├── PULL_REQUEST_TEMPLATE.md └── workflows │ └── main.yml ├── .gitignore ├── CONTRIBUTING.md ├── Dockerfile ├── Dockerfile.slim ├── LICENSE.md ├── MANIFEST.in ├── Makefile ├── README.md ├── TUTORIAL.ipynb ├── datapackage_pipelines ├── VERSION ├── __init__.py ├── app.py ├── celery_tasks │ ├── __init__.py │ ├── celery_app.py │ ├── celery_common.py │ ├── celery_tasks.py │ └── dependency_manager.py ├── cli.py ├── generators │ ├── __init__.py │ ├── generator_base.py │ ├── schedules.py │ └── utilities.py ├── lib │ ├── __init__.py │ ├── add_computed_field.py │ ├── add_metadata.py │ ├── add_resource.py │ ├── cache_loader.py │ ├── concatenate.py │ ├── deduplicate.py │ ├── delete_fields.py │ ├── dump │ │ ├── __init__.py │ │ ├── dumper_base.py │ │ ├── file_formats.py │ │ ├── to_path.py │ │ ├── to_sql.py │ │ └── to_zip.py │ ├── dump_to_path.py │ ├── dump_to_sql.py │ ├── dump_to_zip.py │ ├── duplicate.py │ ├── filter.py │ ├── find_replace.py │ ├── flow.py │ ├── internal │ │ ├── __init__.py │ │ └── sink.py │ ├── join.py │ ├── load.py │ ├── load_metadata.py │ ├── load_resource.py │ ├── printer.py │ ├── set_types.py │ ├── sort.py │ ├── stream_remote_resources.py │ ├── unpivot.py │ ├── update_package.py │ └── update_resource.py ├── manager │ ├── __init__.py │ ├── logging_config.py │ ├── runner.py │ ├── runners │ │ ├── __init__.py │ │ ├── base_runner.py │ │ ├── local_python.py │ │ └── runner_config.py │ └── tasks.py ├── specs │ ├── __init__.py │ ├── errors.py │ ├── hashers │ │ ├── __init__.py │ │ ├── dependency_resolver.py │ │ └── hash_calculator.py │ ├── parsers │ │ ├── __init__.py │ │ ├── base_parser.py │ │ ├── basic_pipeline.py │ │ └── source_spec_pipeline.py │ ├── resolver.py │ ├── schemas │ │ ├── __init__.py │ │ ├── pipeline-spec.schema.json │ │ └── validator.py │ └── specs.py ├── status │ ├── __init__.py │ ├── backend_filesystem.py │ ├── backend_redis.py │ ├── backend_sqlite.py │ ├── hook_sender.py │ ├── pipeline_execution.py │ ├── pipeline_status.py │ └── status_manager.py ├── utilities │ ├── __init__.py │ ├── dirtools.py │ ├── execution_id.py │ ├── extended_json.py │ ├── flow_utils.py │ ├── lazy_dict.py │ ├── lib_test_helpers.py │ ├── resources.py │ ├── stat_utils.py │ └── tabulator_txt_parser.py ├── web │ ├── __init__.py │ ├── server.py │ └── templates │ │ └── dashboard.html └── wrapper │ ├── __init__.py │ ├── input_processor.py │ └── wrapper.py ├── docker ├── github_config.py └── run.sh ├── pylama.ini ├── samples ├── add_constant.py ├── co2-information-cdiac.zip └── pipeline-spec.yaml ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── cli │ ├── custom_formatters │ │ ├── __init__.py │ │ └── xlsx_format.py │ ├── expected_flow_data.csv │ ├── pipeline-spec.yaml │ ├── setup.py │ ├── test_cli_exit_codes.sh │ ├── test_cli_logs.sh │ ├── test_custom_formatters.sh │ ├── test_exclude_dirnames.sh │ ├── test_flow.py │ └── test_flow.sh ├── data │ ├── datapackage.json │ ├── datapackage2.json │ ├── datapackage3.json │ ├── sample.csv │ ├── sample.dups.csv │ ├── sample.txt │ ├── sample.zip │ └── sample2.csv ├── docker │ ├── .gitignore │ ├── lib │ │ ├── dpp_docker_test.py │ │ └── setup.py │ ├── pipeline-spec.yaml │ ├── test.py │ └── test.sh ├── env │ ├── common │ │ └── pipeline-common.py │ ├── dummy │ │ ├── big-outputs.py │ │ ├── pipeline-spec.yaml │ │ ├── pipeline-test-supplier-titleize.py │ │ └── types.csv │ └── extract-year.py ├── serve │ ├── html_output.py │ └── pipeline-spec.yaml ├── sitecustomize.py ├── stdlib │ ├── README.md │ ├── __init__.py │ ├── 
fixtures │ │ ├── add_resource_existent_env │ │ ├── dump_to_sql_update_mode__insert │ │ ├── dump_to_sql_update_mode__update │ │ ├── dump_to_sql_with_updated_data │ │ ├── load_existent_env │ │ ├── obj_fix_dump_to_sql │ │ ├── reverse_sort │ │ ├── simple_add_computed_field │ │ ├── simple_add_resource │ │ ├── simple_concat │ │ ├── simple_deduplicate │ │ ├── simple_delete_fields │ │ ├── simple_dump_dot_to_zip │ │ ├── simple_dump_dot_to_zip_with_hash │ │ ├── simple_dump_dot_to_zip_with_hash_and_pretty_descriptor │ │ ├── simple_dump_to_sql │ │ ├── simple_dump_to_zip │ │ ├── simple_dump_to_zip_with_hash │ │ ├── simple_dump_to_zip_with_hash_and_pretty_descriptor │ │ ├── simple_filter │ │ ├── simple_find_replace │ │ ├── simple_join │ │ ├── simple_load │ │ ├── simple_load_metadata │ │ ├── simple_load_resource │ │ ├── simple_load_resource_dups │ │ ├── simple_load_resource_index │ │ ├── simple_load_resource_limit_rows │ │ ├── simple_load_resource_list │ │ ├── simple_load_resource_multi │ │ ├── simple_load_resource_required │ │ ├── simple_load_resource_resources │ │ ├── simple_load_resource_resources_required │ │ ├── simple_resource_duplication │ │ ├── simple_set_types │ │ ├── simple_sort │ │ ├── simple_stream_remote_resources │ │ ├── simple_stream_remote_resources_limit_rows │ │ ├── simple_stream_remote_resources_zip │ │ ├── simple_unpivot │ │ ├── simple_update_package │ │ ├── simple_update_resource │ │ ├── sort_with_duplicate_keys │ │ └── stream_remote_resources_txt_format │ └── test_stdlib.py ├── test_main.py └── wrapper │ └── test_wrapper.py └── tox.ini /.dockerignore: -------------------------------------------------------------------------------- 1 | .tox/ 2 | .git/ 3 | .cache/ 4 | .dpp.db 5 | .dppdb 6 | .github/ 7 | .idea/ 8 | build/ 9 | datapackage_pipelines.egg-info/ 10 | dist/ 11 | samples/ 12 | tests/ 13 | 14 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | In order to submit an issue, please ensure you can check the following. Thanks! 2 | 3 | * [ ] Declare which version of Python you are using (`python --version`) 4 | * [ ] Declare which operating system you are using 5 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | This pull request fixes # . 
2 | 3 | * [ ] I've added tests to cover the proposed changes 4 | 5 | Changes proposed in this pull request: 6 | 7 | - 8 | - 9 | - 10 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | push: 4 | branches: [ master ] 5 | tags: [ '*' ] 6 | workflow_dispatch: 7 | jobs: 8 | build-server: 9 | runs-on: ubuntu-22.04 10 | steps: 11 | - uses: actions/checkout@v2 12 | - uses: actions/setup-python@v2 13 | with: 14 | python-version: '3.9' 15 | # - name: install 16 | # run: | 17 | # sudo apt-get install libleveldb-dev libleveldb1d 18 | # make install-speedup 19 | - name: build 20 | run: | 21 | make build 22 | tests/docker/test.sh 23 | - name: version 24 | if: github.event_name == 'push' && contains(github.ref, 'refs/tags/') 25 | env: 26 | DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }} 27 | DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} 28 | TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }} 29 | TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }} 30 | run: | 31 | make deploy-pip 32 | make deploy-tags 33 | - name: master branch 34 | env: 35 | DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }} 36 | DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} 37 | if: github.event_name == 'push' && contains(github.ref, '/heads/master') 38 | run: | 39 | make deploy-latest 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .coverage.* 40 | .cache 41 | nosetests.xml 42 | coverage.xml 43 | *,cover 44 | .hypothesis/ 45 | .pytest_cache 46 | 47 | # Translations 48 | *.mo 49 | *.pot 50 | 51 | # Django stuff: 52 | *.log 53 | local_settings.py 54 | 55 | # Flask instance folder 56 | instance/ 57 | 58 | # Scrapy stuff: 59 | .scrapy 60 | 61 | # Sphinx documentation 62 | docs/_build/ 63 | 64 | # PyBuilder 65 | target/ 66 | 67 | # IPython Notebook 68 | .ipynb_checkpoints 69 | 70 | # pyenv 71 | .python-version 72 | 73 | # dotenv 74 | .env 75 | 76 | # Spyder project settings 77 | .spyderproject 78 | 79 | # Extras 80 | .projectile 81 | .idea/ 82 | datapackage-pipelines.iml 83 | celerybeat-schedule 84 | 85 | # Datapackage Pipeline DB 86 | .dpp.db 87 | 88 | # Resources created by our tests 89 | my-spiffy-resource.zip 90 | tests/env/dummy/dump.zip 91 | tests/env/dummy/hooks-outputs 92 | tests/env/dummy/nulls-test 93 | tests/env/dummy/type-tests-output 94 | tests/env/dummy/type-tests-output2 95 | tests/cli/.code 96 | .dpp 97 | .coverage.* 98 | .code/ 99 | .vscode/ 100 | 101 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | The datapackage-pipelines project accepts contributions via GitHub pull requests. This document outlines the process to help get your contribution accepted. 4 | 5 | The project follows the [Open Knowledge International coding standards](https://github.com/okfn/coding-standards). 6 | 7 | We welcome new processors for the standard library; the following guidelines will improve the chances of your processor being accepted: 8 | 9 | * The processor has practical and common use-cases. 10 | * Minimal new dependencies - preferably, no new dependencies. 11 | 12 | ## Getting Started 13 | 14 | The recommended way to get started is to create and activate a project virtual environment. 15 | 16 | You should ensure you are using a supported Python version; check `.github/workflows/main.yml` to see which versions we use for CI. 17 | 18 | * [Pythonz](https://github.com/saghul/pythonz#installation) can be used to install a specific Python version. 19 | * [Virtualenvwrapper](http://virtualenvwrapper.readthedocs.io/en/latest/install.html#basic-installation) can help with setting up and managing virtualenvs. 20 | 21 | To install package and development dependencies into the active environment: 22 | 23 | ``` 24 | $ make install 25 | ``` 26 | 27 | ## Lint & Test 28 | 29 | Before pushing code, make sure lint and tests pass; otherwise the build will fail and your pull request won't be merged :( 30 | 31 | You can use the following snippet to ensure everything works: 32 | 33 | ``` 34 | make install && make lint && make test 35 | ``` 36 | 37 | 38 | ## Linting 39 | 40 | To lint the project codebase: 41 | 42 | ``` 43 | $ make lint 44 | ``` 45 | 46 | Under the hood `pylama`, configured in `pylama.ini`, is used. At this stage it's already 47 | installed into your environment and can be used separately with more fine-grained control 48 | as described in its documentation - https://github.com/klen/pylama.
49 | 50 | For example, to check only errors: 51 | 52 | ``` 53 | $ pylama 54 | ``` 55 | 56 | ## Testing 57 | 58 | To run tests with coverage: 59 | 60 | ``` 61 | $ make test 62 | ``` 63 | Under the hood, `tox` (powered by `py.test` and `coverage`, configured in `tox.ini`) is used. 64 | It's already installed into your environment and can be used separately with more fine-grained control, 65 | as described in the documentation - https://testrun.org/tox/latest/. 66 | 67 | For example, to run a subset of tests against a Python 3 environment with increased verbosity; 68 | all positional arguments and options after `--` will be passed to `py.test`: 69 | 70 | ``` 71 | tox -e py35 -- -v tests/ 72 | ``` 73 | 74 | ## Testing with other databases 75 | 76 | By default the tests run with an in-memory SQLite database, which doesn't require any setup. 77 | However, most projects will want to use a real DB, like PostgreSQL. 78 | 79 | To run the tests with a different DB, you need to supply the connection string via an environment variable. 80 | For example, to run with a local PostgreSQL database: 81 | 82 | `OVERRIDE_TEST_DB=postgresql://postgres:123456@localhost:5432/postgres py.test` 83 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-alpine 2 | 3 | RUN apk --update --no-cache --virtual=build-dependencies add \ 4 | build-base python3-dev \libxml2-dev libxslt-dev postgresql-dev leveldb leveldb-dev && \ 5 | apk --update --no-cache add libstdc++ redis libpq && \ 6 | mkdir -p /run/redis && mkdir -p /var/run/dpp && \ 7 | pip install psycopg2 datapackage-pipelines-github datapackage-pipelines-aws datapackage-pipelines-sourcespec-registry 8 | 9 | ADD . /dpp/ 10 | 11 | RUN pip install -U /dpp/[speedup] && \ 12 | mkdir -p /var/redis && chmod 775 /var/redis && chown redis:redis /var/redis 13 | 14 | ENV DPP_NUM_WORKERS=4 15 | ENV DPP_REDIS_HOST=127.0.0.1 16 | ENV DPP_CELERY_BROKER=redis://localhost:6379/6 17 | 18 | EXPOSE 5000 19 | WORKDIR /pipelines/ 20 | ENTRYPOINT ["/dpp/docker/run.sh"] 21 | 22 | 23 | -------------------------------------------------------------------------------- /Dockerfile.slim: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim 2 | 3 | RUN apt-get update && apt-get install --no-install-recommends -y redis libleveldb1d libleveldb-dev build-essential libpq-dev && \ 4 | update-ca-certificates && mkdir -p /run/redis && mkdir -p /var/run/dpp && \ 5 | apt-get clean && rm -rf /var/lib/apt/lists/* 6 | 7 | RUN pip install psycopg2 datapackage-pipelines-github datapackage-pipelines-sourcespec-registry datapackage-pipelines-aws 8 | 9 | ADD .
/dpp/ 10 | 11 | RUN pip install -U /dpp/[speedup] && \ 12 | mkdir -p /var/redis && chmod 775 /var/redis && chown redis.redis /var/redis && \ 13 | mkdir -p /var/log/redis && cd /etc && ln -s redis/redis.conf 14 | 15 | ENV DPP_NUM_WORKERS=4 16 | ENV DPP_REDIS_HOST=127.0.0.1 17 | ENV DPP_CELERY_BROKER=redis://localhost:6379/6 18 | 19 | EXPOSE 5000 20 | WORKDIR /pipelines/ 21 | ENTRYPOINT ["/dpp/docker/run.sh"] 22 | 23 | 24 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Open Knowledge 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | global-include *.json 2 | global-include *.yml 3 | global-include *.txt 4 | global-include *.html 5 | global-include VERSION 6 | include LICENSE.md 7 | include Makefile 8 | include pylintrc 9 | include README.md 10 | include tox.ini 11 | prune .tox 12 | 13 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all install list lint release test version build 2 | 3 | 4 | PACKAGE := $(shell grep '^PACKAGE =' setup.py | cut -d "'" -f2) 5 | VERSION := $(shell head -n 1 $(PACKAGE)/VERSION) 6 | 7 | 8 | all: list 9 | 10 | install: 11 | pip install --upgrade -e .[develop] 12 | 13 | install-speedup: 14 | pip install --upgrade -e .[develop,speedup] 15 | 16 | list: 17 | @grep '^\.PHONY' Makefile | cut -d' ' -f2- | tr ' ' '\n' 18 | 19 | lint: 20 | pylama $(PACKAGE) 21 | 22 | release: 23 | bash -c '[[ -z `git status -s` ]]' 24 | git tag -a -m release $(VERSION) 25 | git push --tags 26 | 27 | test: 28 | tox &&\ 29 | tests/cli/test_cli_exit_codes.sh &&\ 30 | tests/cli/test_cli_logs.sh &&\ 31 | tests/cli/test_custom_formatters.sh &&\ 32 | tests/cli/test_exclude_dirnames.sh &&\ 33 | tests/cli/test_flow.sh 34 | 35 | version: 36 | @echo $(VERSION) 37 | 38 | build: 39 | docker login -u "${DOCKER_USERNAME}" -p "${DOCKER_PASSWORD}" 40 | docker pull frictionlessdata/datapackage-pipelines:latest &&\ 41 | docker build -t frictionlessdata/datapackage-pipelines:latest --cache-from frictionlessdata/datapackage-pipelines . 
&&\ 42 | docker build -t frictionlessdata/datapackage-pipelines:latest-alpine --cache-from frictionlessdata/datapackage-pipelines . &&\ 43 | docker build -t frictionlessdata/datapackage-pipelines:${VERSION} --cache-from frictionlessdata/datapackage-pipelines . &&\ 44 | docker build -t frictionlessdata/datapackage-pipelines:${VERSION}-alpine --cache-from frictionlessdata/datapackage-pipelines . &&\ 45 | docker pull frictionlessdata/datapackage-pipelines:latest-slim &&\ 46 | docker build -t frictionlessdata/datapackage-pipelines:latest-slim -f Dockerfile.slim --cache-from frictionlessdata/datapackage-pipelines:latest-slim . &&\ 47 | docker build -t frictionlessdata/datapackage-pipelines:${VERSION}-slim -f Dockerfile.slim --cache-from frictionlessdata/datapackage-pipelines:latest-slim . 48 | 49 | 50 | deploy-latest: 51 | docker login -u "${DOCKER_USERNAME}" -p "${DOCKER_PASSWORD}" &&\ 52 | docker push frictionlessdata/datapackage-pipelines:latest &&\ 53 | docker push frictionlessdata/datapackage-pipelines:latest-alpine &&\ 54 | docker push frictionlessdata/datapackage-pipelines:latest-slim 55 | 56 | deploy-tags: 57 | docker login -u "${DOCKER_USERNAME}" -p "${DOCKER_PASSWORD}" &&\ 58 | docker push frictionlessdata/datapackage-pipelines:${VERSION} &&\ 59 | docker push frictionlessdata/datapackage-pipelines:${VERSION}-alpine &&\ 60 | docker push frictionlessdata/datapackage-pipelines:${VERSION}-slim 61 | 62 | deploy-pip: 63 | rm -rf dist/ || true 64 | pip install wheel twine 65 | python setup.py sdist bdist_wheel 66 | python -m twine upload dist/* -------------------------------------------------------------------------------- /datapackage_pipelines/VERSION: -------------------------------------------------------------------------------- 1 | 2.2.11 -------------------------------------------------------------------------------- /datapackage_pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import io 3 | import os 4 | 5 | from .specs import pipelines 6 | from .manager import execute_pipeline 7 | 8 | VERSION_FILE = os.path.join(os.path.dirname(__file__), 'VERSION') 9 | 10 | __version__ = io.open(VERSION_FILE, encoding='utf-8').readline().strip() 11 | -------------------------------------------------------------------------------- /datapackage_pipelines/app.py: -------------------------------------------------------------------------------- 1 | # pylama:ignore=W0611 2 | from .celery_tasks.celery_app import celery_app 3 | from .manager.logging_config import logging 4 | -------------------------------------------------------------------------------- /datapackage_pipelines/celery_tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/datapackage-pipelines/06c8ccbd44be420233b73563cfb5bd3eb37de7cc/datapackage_pipelines/celery_tasks/__init__.py -------------------------------------------------------------------------------- /datapackage_pipelines/celery_tasks/celery_app.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from celery.schedules import crontab 4 | 5 | from .celery_common import get_celery_app, MANAGEMENT_TASK_NAME, SCHEDULED_TASK_NAME 6 | from .celery_tasks import build_dependents 7 | from datapackage_pipelines.specs import pipelines 8 | from datapackage_pipelines.status import status_mgr 9 | 10 | import logging 11 | 12 | kw = {} 13 | if 
os.environ.get('SCHEDULER'): 14 | CELERY_SCHEDULE = { 15 | '/management': { 16 | 'task': MANAGEMENT_TASK_NAME, 17 | 'schedule': crontab(), 18 | 'args': ('update', None, None), 19 | 'options': {'queue': 'datapackage-pipelines-management'} 20 | } 21 | } 22 | 23 | for spec in pipelines(): 24 | if spec.schedule is not None: 25 | entry = { 26 | 'task': SCHEDULED_TASK_NAME, 27 | 'schedule': crontab(*spec.schedule), 28 | 'args': (spec.pipeline_id,), 29 | 'options': {'queue': 'datapackage-pipelines-management'} 30 | } 31 | CELERY_SCHEDULE[spec.pipeline_id] = entry 32 | logging.info('SCHEDULING task %r: %r', spec.pipeline_id, spec.schedule) 33 | 34 | ps = status_mgr().get(spec.pipeline_id) 35 | ex = ps.get_last_execution() 36 | if ex is not None and not ex.finish_time: 37 | ex.invalidate() 38 | ex.finish_execution(False, {}, ['Cancelled']) 39 | 40 | kw = dict(CELERYBEAT_SCHEDULE=CELERY_SCHEDULE) 41 | 42 | logging.error('CELERY INITIALIZING') 43 | celery_app = get_celery_app(**kw) 44 | 45 | if os.environ.get('SCHEDULER'): 46 | build_dependents() 47 | celery_app.send_task(MANAGEMENT_TASK_NAME, ('init', None, None)) 48 | -------------------------------------------------------------------------------- /datapackage_pipelines/celery_tasks/celery_common.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from celery import Celery 4 | 5 | 6 | REGULAR_TASK_NAME = 'datapackage_pipelines.celery_tasks.celery_tasks' + \ 7 | '.execute_pipeline_task' 8 | SCHEDULED_TASK_NAME = 'datapackage_pipelines.celery_tasks.celery_tasks' + \ 9 | '.execute_scheduled_pipeline' 10 | MANAGEMENT_TASK_NAME = 'datapackage_pipelines.celery_tasks.celery_tasks' + \ 11 | '.update_pipelines' 12 | 13 | 14 | def get_celery_app(**kwargs): 15 | celery_app = Celery('dpp') 16 | 17 | broker = os.environ.get('DPP_CELERY_BROKER', 'redis://localhost:6379/6') 18 | 19 | conf = dict( 20 | CELERY_TIMEZONE='UTC', 21 | CELERY_REDIRECT_STDOUTS=False, 22 | BROKER_URL=broker, 23 | CELERY_RESULT_BACKEND=broker, 24 | CELERYD_LOG_LEVEL="DEBUG", 25 | CELERY_TASK_SERIALIZER='json', 26 | CELERY_RESULT_SERIALIZER='json', 27 | CELERY_ACCEPT_CONTENT=['json'], 28 | CELERYD_LOG_FORMAT='[%(asctime)s: %(levelname)s/%(processName)s(%(process)d)] %(message)s', 29 | CELERY_ROUTES={ 30 | REGULAR_TASK_NAME: {'queue': 'datapackage-pipelines'}, 31 | SCHEDULED_TASK_NAME: {'queue': 'datapackage-pipelines-management'}, 32 | MANAGEMENT_TASK_NAME: {'queue': 'datapackage-pipelines-management'}, 33 | } 34 | ) 35 | conf.update(kwargs) 36 | 37 | celery_app.conf.update(**conf) 38 | 39 | return celery_app 40 | -------------------------------------------------------------------------------- /datapackage_pipelines/celery_tasks/dependency_manager.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import redis 5 | 6 | 7 | class DependencyManager(object): 8 | 9 | def __init__(self, host=os.environ.get('DPP_REDIS_HOST'), port=6379): 10 | self.redis = None 11 | if host is not None and len(host) > 0: 12 | conn = redis.StrictRedis(host=host, port=port, db=5) 13 | try: 14 | conn.ping() 15 | self.redis = conn 16 | except redis.exceptions.ConnectionError: 17 | logging.warning('Failed to connect to Redis, host:%s, port:%s', 18 | host, port) 19 | else: 20 | logging.info('Skipping redis connection, host:%s, port:%s', 21 | host, port) 22 | 23 | @staticmethod 24 | def dependents_key(x): 25 | return 'Dependents:%s' % x 26 | 27 | @staticmethod 28 | def dependencies_key(x): 29 | 
return 'Dependencies:%s' % x 30 | 31 | @staticmethod 32 | def encode(x): 33 | if isinstance(x, str): 34 | return x.encode('utf8') 35 | if isinstance(x, list): 36 | return [y.encode('utf8') for y in x] 37 | 38 | @staticmethod 39 | def decode(x): 40 | if isinstance(x, bytes): 41 | return x.decode('utf8') 42 | if isinstance(x, (list, set)): 43 | return [y.decode('utf8') for y in x] 44 | assert False, "Unknown type for x: %r" % x 45 | 46 | def is_init(self): 47 | return self.redis is not None 48 | 49 | def update(self, spec): 50 | if self.is_init(): 51 | for dep in spec.dependencies: 52 | self.redis.sadd(self.dependents_key(dep), self.encode(spec.pipeline_id)) 53 | self.redis.delete(self.dependencies_key(spec.pipeline_id)) 54 | for dep in self.encode(spec.dependencies): 55 | self.redis.sadd(self.dependencies_key(spec.pipeline_id), dep) 56 | 57 | def get_dependencies(self, pipeline_id): 58 | if self.is_init(): 59 | members = self.redis.smembers(self.dependencies_key(pipeline_id)) 60 | if members is not None: 61 | return self.decode(members) 62 | return [] 63 | 64 | def get_dependents(self, pipeline_id): 65 | if self.is_init(): 66 | members = self.redis.smembers(self.dependents_key(pipeline_id)) 67 | if members is not None: 68 | return self.decode(members) 69 | return [] 70 | 71 | def remove(self, pipeline_id): 72 | if self.is_init(): 73 | dependencies = self.get_dependencies(pipeline_id) 74 | dependents = self.get_dependents(pipeline_id) 75 | 76 | for p in dependencies: 77 | self.redis.srem(self.dependents_key(p), self.encode(pipeline_id)) 78 | for p in dependents: 79 | self.redis.srem(self.dependencies_key(p), self.encode(pipeline_id)) 80 | self.redis.delete(self.dependents_key(pipeline_id)) 81 | self.redis.delete(self.dependencies_key(pipeline_id)) 82 | -------------------------------------------------------------------------------- /datapackage_pipelines/generators/__init__.py: -------------------------------------------------------------------------------- 1 | from slugify import slugify 2 | 3 | from .schedules import * # noqa 4 | from .generator_base import GeneratorBase 5 | from .utilities import steps 6 | -------------------------------------------------------------------------------- /datapackage_pipelines/generators/generator_base.py: -------------------------------------------------------------------------------- 1 | import jsonschema 2 | 3 | 4 | class GeneratorBase(object): 5 | 6 | def __init__(self): 7 | self.schema = None 8 | 9 | def _get_schema(self): 10 | if self.schema is not None: 11 | return self.schema 12 | self.schema = self.get_schema() 13 | validator = jsonschema.validators.validator_for(self.schema) 14 | self.schema = validator(self.schema) 15 | return self.schema 16 | 17 | def internal_validate(self, source): 18 | schema = self._get_schema() 19 | try: 20 | schema.validate(source) 21 | except jsonschema.ValidationError: 22 | return False 23 | return True 24 | 25 | def internal_generate(self, source, base): 26 | if not self.internal_validate(source): 27 | return None 28 | return self.generate_pipeline(source, base) 29 | 30 | @classmethod 31 | def get_schema(cls): 32 | raise NotImplementedError() 33 | 34 | @classmethod 35 | def generate_pipeline(cls, source, base): 36 | raise NotImplementedError() 37 | -------------------------------------------------------------------------------- /datapackage_pipelines/generators/schedules.py: -------------------------------------------------------------------------------- 1 | SCHEDULE_NONE = None 2 | SCHEDULE_HOURLY = '0 * * * *' 3 | 
SCHEDULE_DAILY = '0 0 * * *' 4 | SCHEDULE_WEEKLY = '0 0 * * 0' 5 | SCHEDULE_MONTHLY = '0 0 1 * *' 6 | SCHEDULE_YEARLY = '0 0 1 1 *' 7 | -------------------------------------------------------------------------------- /datapackage_pipelines/generators/utilities.py: -------------------------------------------------------------------------------- 1 | def arg_to_step(arg): 2 | if isinstance(arg, str): 3 | return {'run': arg} 4 | else: 5 | return dict(zip(['run', 'parameters', 'cache'], arg)) 6 | 7 | 8 | def steps(*args): 9 | return [arg_to_step(arg) for arg in args] 10 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/datapackage-pipelines/06c8ccbd44be420233b73563cfb5bd3eb37de7cc/datapackage_pipelines/lib/__init__.py -------------------------------------------------------------------------------- /datapackage_pipelines/lib/add_computed_field.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow, add_computed_field 2 | from datapackage_pipelines.wrapper import ingest 3 | from datapackage_pipelines.utilities.flow_utils import spew_flow 4 | 5 | 6 | def flow(parameters): 7 | return Flow( 8 | add_computed_field( 9 | parameters.get('fields', []), 10 | resources=parameters.get('resources') 11 | ), 12 | ) 13 | 14 | 15 | if __name__ == '__main__': 16 | with ingest() as ctx: 17 | spew_flow(flow(ctx.parameters), ctx) 18 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/add_metadata.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | from datapackage_pipelines.wrapper import ingest 4 | from datapackage_pipelines.utilities.flow_utils import spew_flow 5 | 6 | from datapackage_pipelines.lib.update_package import flow 7 | 8 | 9 | if __name__ == '__main__': 10 | warnings.warn( 11 | 'add_metadata will be removed in the future, use "update_package" instead', 12 | DeprecationWarning 13 | ) 14 | with ingest() as ctx: 15 | spew_flow(flow(ctx.parameters), ctx) 16 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/add_resource.py: -------------------------------------------------------------------------------- 1 | from datapackage_pipelines.wrapper import ingest, spew 2 | import os 3 | 4 | from datapackage_pipelines.utilities.resources import PATH_PLACEHOLDER, PROP_STREAMED_FROM 5 | 6 | parameters, datapackage, res_iter = ingest() 7 | 8 | 9 | if datapackage is None: 10 | datapackage = {} 11 | 12 | datapackage.setdefault('resources', []) 13 | 14 | for param in ['url', 'name']: 15 | assert param in parameters, \ 16 | "You must define {} in your parameters".format(param) 17 | 18 | url = parameters.pop('url') 19 | if url.startswith('env://'): 20 | env_var = url[6:] 21 | env_url = os.environ.get(env_var) 22 | assert env_url is not None, \ 23 | "Missing Value - " \ 24 | "Please set your '%s' environment variable" % env_var 25 | 26 | url = env_url 27 | 28 | if 'path' not in parameters: 29 | parameters['path'] = PATH_PLACEHOLDER 30 | parameters[PROP_STREAMED_FROM] = url 31 | 32 | datapackage['resources'].append(parameters) 33 | 34 | spew(datapackage, res_iter) 35 | -------------------------------------------------------------------------------- 
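The processors above (`add_resource.py` in particular) show the low-level wrapper pattern used throughout the standard library: `ingest()` yields the step's parameters, the incoming datapackage descriptor and the resource iterator, and `spew()` writes the (possibly modified) descriptor and rows back out. A minimal custom processor along the same lines might look like the following sketch — the module itself and the `column-name`/`value` parameters are hypothetical, not part of the repository:

```python
# Hypothetical custom processor (not part of this repository): appends a
# constant column to every row, using the same ingest()/spew() wrapper as
# add_resource.py above.
from datapackage_pipelines.wrapper import ingest, spew

parameters, datapackage, res_iter = ingest()

column_name = parameters.get('column-name', 'constant')   # assumed parameter
value = parameters.get('value')                            # assumed parameter

# Declare the new field on every tabular resource in the descriptor.
for resource in datapackage.get('resources', []):
    if 'schema' in resource:
        resource['schema']['fields'].append({'name': column_name, 'type': 'string'})


def process_resource(rows):
    for row in rows:
        row[column_name] = value
        yield row


spew(datapackage, (process_resource(rows) for rows in res_iter))
```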
/datapackage_pipelines/lib/cache_loader.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import shutil 3 | import gzip 4 | 5 | from datapackage_pipelines.wrapper import ingest 6 | 7 | params, _, _ = ingest() 8 | 9 | load_from = params['load-from'] 10 | 11 | shutil.copyfileobj(gzip.open(load_from, "rt"), sys.stdout) 12 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/concatenate.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow, concatenate, update_resource 2 | from datapackage_pipelines.wrapper import ingest 3 | from datapackage_pipelines.utilities.resources import PROP_STREAMING 4 | from datapackage_pipelines.utilities.flow_utils import spew_flow 5 | 6 | 7 | def flow(parameters): 8 | return Flow( 9 | concatenate( 10 | parameters.get('fields', {}), 11 | parameters.get('target', {}), 12 | parameters.get('sources') 13 | ), 14 | update_resource( 15 | parameters.get('target', {}).get('name', 'concat'), 16 | **{ 17 | PROP_STREAMING: True 18 | } 19 | ) 20 | ) 21 | 22 | 23 | if __name__ == '__main__': 24 | with ingest() as ctx: 25 | spew_flow(flow(ctx.parameters), ctx) 26 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/deduplicate.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow, deduplicate 2 | from datapackage_pipelines.wrapper import ingest 3 | from datapackage_pipelines.utilities.flow_utils import spew_flow 4 | 5 | 6 | def flow(parameters): 7 | return Flow( 8 | deduplicate( 9 | resources=parameters.get('resources'), 10 | ) 11 | ) 12 | 13 | 14 | if __name__ == '__main__': 15 | with ingest() as ctx: 16 | spew_flow(flow(ctx.parameters), ctx) 17 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/delete_fields.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow, delete_fields 2 | from datapackage_pipelines.wrapper import ingest 3 | from datapackage_pipelines.utilities.flow_utils import spew_flow 4 | 5 | 6 | def flow(parameters): 7 | resources = parameters.get('resources') 8 | regex = parameters.get('regex', True) 9 | return Flow( 10 | delete_fields( 11 | parameters.get('fields', []), 12 | resources=resources, 13 | regex=regex, 14 | ) 15 | ) 16 | 17 | 18 | if __name__ == '__main__': 19 | with ingest() as ctx: 20 | spew_flow(flow(ctx.parameters), ctx) 21 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/dump/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/datapackage-pipelines/06c8ccbd44be420233b73563cfb5bd3eb37de7cc/datapackage_pipelines/lib/dump/__init__.py -------------------------------------------------------------------------------- /datapackage_pipelines/lib/dump/to_path.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import warnings 4 | 5 | from datapackage_pipelines.lib.dump.dumper_base import FileDumper 6 | 7 | 8 | class PathDumper(FileDumper): 9 | 10 | def initialize(self, params): 11 | super(PathDumper, self).initialize(params) 12 | self.out_path = params.get('out-path', '.') 13 | self.add_filehash_to_path = 
params.get('add-filehash-to-path', False) 14 | PathDumper.__makedirs(self.out_path) 15 | 16 | def write_file_to_output(self, filename, path): 17 | path = os.path.join(self.out_path, path) 18 | # Avoid rewriting existing files 19 | if self.add_filehash_to_path and os.path.exists(path): 20 | return 21 | path_part = os.path.dirname(path) 22 | PathDumper.__makedirs(path_part) 23 | shutil.copy(filename, path) 24 | os.chmod(path, 0o666) 25 | return path 26 | 27 | @staticmethod 28 | def __makedirs(path): 29 | os.makedirs(path, exist_ok=True) 30 | 31 | 32 | if __name__ == '__main__': 33 | warnings.warn( 34 | 'dump.to_path will be removed in the future, use "dump_to_path" instead', 35 | DeprecationWarning 36 | ) 37 | PathDumper()() 38 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/dump/to_sql.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | from datapackage_pipelines.wrapper import ingest 4 | from datapackage_pipelines.utilities.flow_utils import spew_flow 5 | 6 | from datapackage_pipelines.lib.dump_to_sql import flow 7 | 8 | 9 | if __name__ == '__main__': 10 | warnings.warn( 11 | 'dump.to_sql will be removed in the future, use "dump_to_sql" instead', 12 | DeprecationWarning 13 | ) 14 | with ingest() as ctx: 15 | spew_flow(flow(ctx.parameters), ctx) 16 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/dump/to_zip.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import zipfile 3 | 4 | from datapackage_pipelines.lib.dump.dumper_base import FileDumper 5 | 6 | 7 | class ZipDumper(FileDumper): 8 | 9 | def initialize(self, params): 10 | super(ZipDumper, self).initialize(params) 11 | out_filename = open(params['out-file'], 'wb') 12 | self.zip_file = zipfile.ZipFile(out_filename, 'w') 13 | 14 | def write_file_to_output(self, filename, path): 15 | self.zip_file.write(filename, arcname=path, 16 | compress_type=zipfile.ZIP_DEFLATED) 17 | 18 | def finalize(self): 19 | self.zip_file.close() 20 | super(ZipDumper, self).finalize() 21 | 22 | 23 | if __name__ == '__main__': 24 | warnings.warn( 25 | 'dump.to_zip will be removed in the future, use "dump_to_zip" instead', 26 | DeprecationWarning 27 | ) 28 | ZipDumper()() 29 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/dump_to_path.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from dataflows import Flow, dump_to_path 4 | from datapackage_pipelines.wrapper import ingest 5 | from datapackage_pipelines.utilities.flow_utils import spew_flow 6 | 7 | from datapackage_pipelines.utilities.stat_utils import STATS_DPP_KEY, STATS_OUT_DP_URL_KEY 8 | 9 | 10 | def flow(parameters: dict, stats: dict): 11 | out_path = parameters.pop('out-path', '.') 12 | stats.setdefault(STATS_DPP_KEY, {})[STATS_OUT_DP_URL_KEY] = os.path.join(out_path, 'datapackage.json') 13 | return Flow( 14 | dump_to_path( 15 | out_path, 16 | **parameters 17 | ) 18 | ) 19 | 20 | 21 | if __name__ == '__main__': 22 | with ingest() as ctx: 23 | spew_flow(flow(ctx.parameters, ctx.stats), ctx) 24 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/dump_to_sql.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow, dump_to_sql 2 | from 
datapackage_pipelines.wrapper import ingest 3 | from datapackage_pipelines.utilities.flow_utils import spew_flow 4 | 5 | 6 | def flow(parameters): 7 | return Flow( 8 | dump_to_sql( 9 | parameters['tables'], 10 | engine=parameters.get('engine', 'env://DPP_DB_ENGINE'), 11 | updated_column=parameters.get("updated_column"), 12 | updated_id_column=parameters.get("updated_id_column") 13 | ) 14 | ) 15 | 16 | 17 | if __name__ == '__main__': 18 | with ingest() as ctx: 19 | spew_flow(flow(ctx.parameters), ctx) 20 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/dump_to_zip.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow, dump_to_zip 2 | from datapackage_pipelines.wrapper import ingest 3 | from datapackage_pipelines.utilities.flow_utils import spew_flow 4 | 5 | 6 | def flow(parameters: dict): 7 | out_file = parameters.pop('out-file') 8 | return Flow( 9 | dump_to_zip( 10 | out_file, 11 | **parameters 12 | ) 13 | ) 14 | 15 | 16 | if __name__ == '__main__': 17 | with ingest() as ctx: 18 | spew_flow(flow(ctx.parameters), ctx) 19 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/duplicate.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow, duplicate 2 | from datapackage_pipelines.wrapper import ingest 3 | from datapackage_pipelines.utilities.flow_utils import spew_flow, load_lazy_json 4 | 5 | 6 | def flow(parameters): 7 | return Flow( 8 | load_lazy_json(parameters.get('source')), 9 | duplicate( 10 | parameters.get('source'), 11 | parameters.get('target-name'), 12 | parameters.get('target-path'), 13 | parameters.get('batch_size', 1000), 14 | parameters.get('duplicate_to_end', False) 15 | ) 16 | ) 17 | 18 | 19 | if __name__ == '__main__': 20 | with ingest() as ctx: 21 | spew_flow(flow(ctx.parameters), ctx) 22 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/filter.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow, filter_rows 2 | from datapackage_pipelines.wrapper import ingest 3 | from datapackage_pipelines.utilities.flow_utils import spew_flow 4 | 5 | 6 | def flow(parameters): 7 | return Flow( 8 | filter_rows( 9 | equals=parameters.get('in', []), 10 | not_equals=parameters.get('out', []), 11 | resources=parameters.get('resources'), 12 | ) 13 | ) 14 | 15 | 16 | if __name__ == '__main__': 17 | with ingest() as ctx: 18 | spew_flow(flow(ctx.parameters), ctx) 19 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/find_replace.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow, find_replace 2 | from datapackage_pipelines.wrapper import ingest 3 | from datapackage_pipelines.utilities.flow_utils import spew_flow 4 | 5 | 6 | def flow(parameters): 7 | return Flow( 8 | find_replace( 9 | parameters.get('fields', []), 10 | resources=parameters.get('resources') 11 | ) 12 | ) 13 | 14 | 15 | if __name__ == '__main__': 16 | with ingest() as ctx: 17 | spew_flow(flow(ctx.parameters), ctx) 18 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/flow.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from importlib import 
import_module 3 | from datapackage_pipelines.wrapper import ingest 4 | from datapackage_pipelines.utilities.flow_utils import spew_flow 5 | 6 | 7 | with ingest() as ctx: 8 | parameters, datapackage, resources = ctx 9 | stats = {} 10 | 11 | sys.path.append(parameters.pop('__path')) 12 | flow_module = import_module(parameters.pop('__flow')) 13 | flow = flow_module.flow(parameters, datapackage, resources, ctx.stats) 14 | 15 | spew_flow(flow, ctx) 16 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/internal/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/datapackage-pipelines/06c8ccbd44be420233b73563cfb5bd3eb37de7cc/datapackage_pipelines/lib/internal/__init__.py -------------------------------------------------------------------------------- /datapackage_pipelines/lib/internal/sink.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from datapackage_pipelines.wrapper import ingest, spew 4 | 5 | SINK_MAGIC = '>>> PROCESSED ROWS: ' 6 | 7 | 8 | def sink(res_iter_): 9 | count = 0 10 | for res in res_iter_: 11 | for row in res: 12 | count += 1 13 | if count % 100 == 0: 14 | sys.stderr.write('%s%d\n' % (SINK_MAGIC, count)) 15 | sys.stderr.flush() 16 | sys.stderr.write('%s%d\n' % (SINK_MAGIC, count)) 17 | sys.stderr.flush() 18 | yield from () 19 | 20 | 21 | if __name__ == '__main__': 22 | sys.stderr.write('%s%d\n' % (SINK_MAGIC, 0)) 23 | sys.stderr.flush() 24 | params, dp, res_iter = ingest() 25 | spew({'name': 'boop', 'resources': []}, 26 | sink(res_iter)) 27 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/join.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow, join, update_resource 2 | from datapackage_pipelines.wrapper import ingest 3 | from datapackage_pipelines.utilities.resources import PROP_STREAMING 4 | from datapackage_pipelines.utilities.flow_utils import spew_flow, load_lazy_json 5 | 6 | 7 | def flow(parameters): 8 | source = parameters['source'] 9 | target = parameters['target'] 10 | return Flow( 11 | load_lazy_json(source['name']), 12 | join( 13 | source['name'], 14 | source['key'], 15 | target['name'], 16 | target['key'], 17 | parameters['fields'], 18 | parameters.get('full', None), 19 | parameters.get('mode', 'half-outer'), 20 | source.get('delete', False) 21 | ), 22 | update_resource( 23 | target['name'], 24 | **{ 25 | PROP_STREAMING: True 26 | } 27 | ) 28 | ) 29 | 30 | 31 | if __name__ == '__main__': 32 | with ingest() as ctx: 33 | spew_flow(flow(ctx.parameters), ctx) 34 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/load.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow, load 2 | from datapackage_pipelines.wrapper import ingest 3 | from datapackage_pipelines.utilities.flow_utils import spew_flow 4 | from datapackage_pipelines.utilities.resources import PROP_STREAMING, PROP_STREAMED_FROM 5 | 6 | 7 | def flow(parameters): 8 | _from = parameters.pop('from') 9 | 10 | num_resources = 0 11 | 12 | def count_resources(): 13 | def func(package): 14 | global num_resources 15 | num_resources = len(package.pkg.resources) 16 | yield package.pkg 17 | yield from package 18 | return func 19 | 20 | def mark_streaming(_from): 21 | def 
func(package): 22 | for i in range(num_resources, len(package.pkg.resources)): 23 | package.pkg.descriptor['resources'][i].setdefault(PROP_STREAMING, True) 24 | package.pkg.descriptor['resources'][i].setdefault(PROP_STREAMED_FROM, _from) 25 | yield package.pkg 26 | yield from package 27 | return func 28 | 29 | return Flow( 30 | count_resources(), 31 | load(_from, **parameters), 32 | mark_streaming(_from), 33 | ) 34 | 35 | 36 | if __name__ == '__main__': 37 | with ingest() as ctx: 38 | spew_flow(flow(ctx.parameters), ctx) 39 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/load_metadata.py: -------------------------------------------------------------------------------- 1 | import datapackage 2 | 3 | from datapackage_pipelines.wrapper import ingest, spew, get_dependency_datapackage_url 4 | 5 | dep_prefix = 'dependency://' 6 | 7 | parameters, dp, res_iter = ingest() 8 | 9 | url = parameters['url'] 10 | if url.startswith(dep_prefix): 11 | dependency = url[len(dep_prefix):].strip() 12 | url = get_dependency_datapackage_url(dependency) 13 | assert url is not None, "Failed to fetch output datapackage for dependency '%s'" % dependency 14 | 15 | datapackage = datapackage.DataPackage(url) 16 | for k, v in datapackage.descriptor.items(): 17 | if k != 'resources': 18 | dp[k] = v 19 | 20 | spew(dp, res_iter) 21 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/load_resource.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import copy 3 | import logging 4 | 5 | import datapackage 6 | 7 | from dataflows.helpers.resource_matcher import ResourceMatcher 8 | 9 | from datapackage_pipelines.wrapper import ingest, spew, get_dependency_datapackage_url 10 | from datapackage_pipelines.utilities.resources import tabular, PROP_STREAMING, \ 11 | PROP_STREAMED_FROM 12 | 13 | 14 | def progress_logger(iter, log_progress_rows): 15 | for i, row in enumerate(iter, 1): 16 | yield row 17 | if i % log_progress_rows == 0: 18 | logging.info('loaded {} rows'.format(i)) 19 | 20 | 21 | class ResourceLoader(object): 22 | 23 | def __init__(self): 24 | self.parameters, self.dp, self.res_iter = ingest() 25 | 26 | def __call__(self): 27 | url = self.parameters['url'] 28 | limit_rows = self.parameters.get('limit-rows') 29 | log_progress_rows = self.parameters.get('log-progress-rows') 30 | dep_prefix = 'dependency://' 31 | if url.startswith(dep_prefix): 32 | dependency = url[len(dep_prefix):].strip() 33 | url = get_dependency_datapackage_url(dependency) 34 | assert url is not None, "Failed to fetch output datapackage for dependency '%s'" % dependency 35 | stream = self.parameters.get('stream', True) 36 | required = self.parameters.get('required', True) 37 | resource = self.parameters.get('resource') 38 | resources = self.parameters.get('resources') 39 | if resource is not None: 40 | assert not resources 41 | resource_index = resource if isinstance(resource, int) else None 42 | else: 43 | assert resources 44 | resource_index = None 45 | resource = list(resources.keys()) 46 | name_matcher = ( 47 | ResourceMatcher(resource, self.dp) 48 | if isinstance(resource, (str, list)) 49 | else None 50 | ) 51 | 52 | selected_resources = [] 53 | found = False 54 | try: 55 | dp = datapackage.DataPackage(url) 56 | except Exception: 57 | if required: 58 | raise 59 | else: 60 | dp = None 61 | if dp: 62 | dp = self.process_datapackage(dp) 63 | for i, orig_res in 
enumerate(dp.resources): 64 | if resource_index == i or \ 65 | (name_matcher is not None and name_matcher.match(orig_res.descriptor.get('name'))): 66 | found = True 67 | desc = copy.deepcopy(orig_res.descriptor) 68 | if 'primaryKey' in desc.get('schema', {}): 69 | # Avoid duplication checks 70 | del orig_res.descriptor['schema']['primaryKey'] 71 | orig_res.commit() 72 | desc[PROP_STREAMED_FROM] = orig_res.source 73 | if resources: 74 | desc.update(resources[desc['name']]) 75 | self.dp['resources'].append(desc) 76 | if tabular(desc) and stream: 77 | desc[PROP_STREAMING] = True 78 | orig_res_iter = orig_res.iter(keyed=True) 79 | if limit_rows: 80 | orig_res_iter = itertools.islice(orig_res_iter, limit_rows) 81 | if log_progress_rows: 82 | orig_res_iter = progress_logger(orig_res_iter, log_progress_rows) 83 | selected_resources.append(orig_res_iter) 84 | else: 85 | desc[PROP_STREAMING] = False 86 | 87 | assert found or not required, "Failed to find resource with index or name matching %r" % resource 88 | spew(self.dp, itertools.chain(self.res_iter, selected_resources)) 89 | 90 | def process_datapackage(self, dp_): 91 | return dp_ 92 | 93 | 94 | if __name__ == '__main__': 95 | ResourceLoader()() 96 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/printer.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow, printer 2 | from datapackage_pipelines.wrapper import ingest 3 | from datapackage_pipelines.utilities.flow_utils import spew_flow 4 | 5 | 6 | def flow(parameters): 7 | return Flow( 8 | printer(), 9 | ) 10 | 11 | 12 | if __name__ == '__main__': 13 | with ingest() as ctx: 14 | spew_flow(flow(ctx.parameters), ctx) 15 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/set_types.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow, set_type, validate, delete_fields 2 | from datapackage_pipelines.wrapper import ingest 3 | from datapackage_pipelines.utilities.flow_utils import spew_flow 4 | 5 | 6 | def flow(parameters): 7 | resources = parameters.get('resources') 8 | regex = parameters.get('regex', True) 9 | if 'types' in parameters: 10 | return Flow( 11 | *[ 12 | set_type(name, resources=resources, regex=regex, **options) 13 | if options is not None else 14 | delete_fields([name], resources=resources) 15 | for name, options in parameters['types'].items() 16 | ] 17 | ) 18 | else: 19 | return Flow( 20 | validate() 21 | ) 22 | 23 | 24 | if __name__ == '__main__': 25 | with ingest() as ctx: 26 | print(flow(ctx.parameters).chain) 27 | spew_flow(flow(ctx.parameters), ctx) 28 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/sort.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow, sort_rows 2 | from datapackage_pipelines.wrapper import ingest 3 | from datapackage_pipelines.utilities.flow_utils import spew_flow, load_lazy_json 4 | 5 | 6 | def flow(parameters): 7 | return Flow( 8 | load_lazy_json(parameters.get('resources')), 9 | sort_rows( 10 | parameters['sort-by'], 11 | resources=parameters.get('resources'), 12 | reverse=parameters.get('reverse') 13 | ) 14 | ) 15 | 16 | 17 | if __name__ == '__main__': 18 | with ingest() as ctx: 19 | spew_flow(flow(ctx.parameters), ctx) 20 | 
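Most of the stdlib processors above are thin wrappers that build a dataflows `Flow` from their parameters and hand it to `spew_flow`. The `flow.py` processor shown earlier goes one step further: it imports a user module and calls its `flow(parameters, datapackage, resources, stats)` function. A user flow module compatible with that call could look roughly like this sketch (the module name, the `factor` parameter and the `value` field are all invented for illustration):

```python
# my_flow.py - hypothetical user flow module (not part of this repository).
# The stdlib `flow` processor imports a module like this one and calls
# flow(parameters, datapackage, resources, stats), expecting a dataflows Flow back.
from dataflows import Flow, add_field


def flow(parameters, datapackage, resources, stats):
    factor = parameters.get('factor', 2)  # assumed parameter

    def double_value(row):
        # Assumes the incoming rows carry a numeric 'value' field.
        row['doubled'] = row.get('value', 0) * factor
        stats['rows-doubled'] = stats.get('rows-doubled', 0) + 1

    return Flow(
        add_field('doubled', 'number'),
        double_value,
    )
```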
-------------------------------------------------------------------------------- /datapackage_pipelines/lib/unpivot.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow, unpivot 2 | from datapackage_pipelines.wrapper import ingest 3 | from datapackage_pipelines.utilities.flow_utils import spew_flow 4 | 5 | 6 | def flow(parameters): 7 | return Flow( 8 | unpivot( 9 | parameters.get('unpivot'), 10 | parameters.get('extraKeyFields'), 11 | parameters.get('extraValueField'), 12 | resources=parameters.get('resources') 13 | ) 14 | ) 15 | 16 | 17 | if __name__ == '__main__': 18 | with ingest() as ctx: 19 | spew_flow(flow(ctx.parameters), ctx) 20 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/update_package.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow, update_package 2 | from datapackage_pipelines.wrapper import ingest 3 | from datapackage_pipelines.utilities.flow_utils import spew_flow 4 | 5 | 6 | def flow(parameters): 7 | return Flow( 8 | update_package(**parameters) 9 | ) 10 | 11 | 12 | if __name__ == '__main__': 13 | with ingest() as ctx: 14 | spew_flow(flow(ctx.parameters), ctx) 15 | -------------------------------------------------------------------------------- /datapackage_pipelines/lib/update_resource.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow, update_resource 2 | from datapackage_pipelines.wrapper import ingest 3 | from datapackage_pipelines.utilities.flow_utils import spew_flow 4 | 5 | 6 | def flow(parameters): 7 | resources = parameters.get('resources', None) 8 | metadata = parameters.pop('metadata', {}) 9 | return Flow( 10 | update_resource(resources, **metadata), 11 | ) 12 | 13 | 14 | if __name__ == '__main__': 15 | with ingest() as ctx: 16 | spew_flow(flow(ctx.parameters), ctx) 17 | -------------------------------------------------------------------------------- /datapackage_pipelines/manager/__init__.py: -------------------------------------------------------------------------------- 1 | from .tasks import execute_pipeline, finalize 2 | from .runner import run_pipelines, ExecutionResult, ProgressReport 3 | -------------------------------------------------------------------------------- /datapackage_pipelines/manager/logging_config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | own_name = '%-32s' % 'Main' 4 | logging.basicConfig(level=logging.INFO, 5 | format="%(levelname)-8s:"+own_name+":%(message)s") 6 | logging.root.setLevel(logging.INFO) 7 | -------------------------------------------------------------------------------- /datapackage_pipelines/manager/runners/__init__.py: -------------------------------------------------------------------------------- 1 | from .runner_config import RunnerConfiguration 2 | 3 | runner_config = RunnerConfiguration() 4 | -------------------------------------------------------------------------------- /datapackage_pipelines/manager/runners/base_runner.py: -------------------------------------------------------------------------------- 1 | class BaseRunner(object): 2 | 3 | def __init__(self, name, parameters): 4 | self.name = name 5 | self.parameters = parameters 6 | 7 | def get_execution_args(self, step, cwd, idx): 8 | raise NotImplementedError() 9 | 
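`BaseRunner` above defines the whole contract a runner has to satisfy: it is constructed with a name and a parameters dict, and `get_execution_args(step, cwd, idx)` must return the argv list used to launch a step. As a rough illustration (hypothetical, not part of the repository), a custom runner could wrap every step in `nice`; compare it with the real `LocalPythonRunner` in the next file, whose argv layout it mirrors:

```python
# Hypothetical runner (not part of this repository) that runs every step
# under `nice`. The argv layout mirrors LocalPythonRunner below.
import sys

from datapackage_pipelines.manager.runners.base_runner import BaseRunner
from datapackage_pipelines.utilities.extended_json import json


class NicePythonRunner(BaseRunner):

    def get_execution_args(self, step, cwd, idx):
        niceness = str((self.parameters or {}).get('niceness', 10))  # assumed parameter
        return [
            'nice', '-n', niceness,
            sys.executable,
            step['executor'],
            str(idx),
            json.dumps(step.get('parameters', {})),
            str(step.get('validate', False)),
            step.get('_cache_hash') if step.get('cache') else ''
        ]
```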
-------------------------------------------------------------------------------- /datapackage_pipelines/manager/runners/local_python.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import shlex 4 | 5 | from ...utilities.extended_json import json 6 | from .base_runner import BaseRunner 7 | 8 | 9 | class LocalPythonRunner(BaseRunner): 10 | 11 | def get_execution_args(self, step, _, idx): 12 | return [ 13 | sys.executable, 14 | step['executor'], 15 | str(idx), 16 | json.dumps(step.get('parameters', {})), 17 | str(step.get('validate', False)), 18 | step.get('_cache_hash') if step.get('cache') else '' 19 | ] 20 | 21 | 22 | class WrappedPythonRunner(LocalPythonRunner): 23 | 24 | def get_execution_args(self, step, cwd, idx): 25 | args = super(WrappedPythonRunner, self).get_execution_args(step, cwd, idx) 26 | for i in range(len(args)): 27 | args[i] = '\\\"' + args[i].replace('"', '\\\\\\\"') + '\\\"' 28 | cmd = " ".join(args) 29 | abspath = os.path.abspath(cwd) 30 | cmd = self.parameters['wrapper'].format(path=cwd, 31 | abspath=abspath, 32 | cmd=cmd, 33 | env=os.environ) 34 | args = shlex.split(cmd) 35 | return args 36 | -------------------------------------------------------------------------------- /datapackage_pipelines/manager/runners/runner_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | 4 | from .local_python import LocalPythonRunner, WrappedPythonRunner 5 | 6 | 7 | class RunnerConfiguration(object): 8 | 9 | ENV_VAR = 'DPP_RUNNER_CONFIG' 10 | DEFAULT_RUNNER_CONFIG = 'dpp-runners.yaml' 11 | 12 | def __init__(self): 13 | 14 | config_fn = os.environ.get(self.ENV_VAR, self.DEFAULT_RUNNER_CONFIG) 15 | if os.path.exists(config_fn): 16 | self.config = yaml.load(open(config_fn), Loader=yaml.Loader) 17 | else: 18 | self.config = {} 19 | 20 | def get_runner_class(self, kind): 21 | return { 22 | 'local-python': LocalPythonRunner, 23 | 'wrapped-python': WrappedPythonRunner, 24 | }.get(kind, LocalPythonRunner) 25 | 26 | def get_runner(self, name): 27 | runner_config = self.config.get(name, {}) 28 | kind = runner_config.get('kind') 29 | parameters = runner_config.get('parameters') 30 | return self.get_runner_class(kind)(name, parameters) 31 | -------------------------------------------------------------------------------- /datapackage_pipelines/specs/__init__.py: -------------------------------------------------------------------------------- 1 | from .specs import pipelines, register_all_pipelines 2 | from .parsers.base_parser import PipelineSpec 3 | -------------------------------------------------------------------------------- /datapackage_pipelines/specs/errors.py: -------------------------------------------------------------------------------- 1 | # pylama:skip=1 2 | from typing import NamedTuple 3 | 4 | 5 | class SpecError(NamedTuple): 6 | short_msg: str 7 | long_msg: str 8 | 9 | def __str__(self): 10 | return '{}: {}'.format(self.short_msg, self.long_msg) 11 | -------------------------------------------------------------------------------- /datapackage_pipelines/specs/hashers/__init__.py: -------------------------------------------------------------------------------- 1 | from .hash_calculator import HashCalculator 2 | from .dependency_resolver import resolve_dependencies, DependencyMissingException 3 | -------------------------------------------------------------------------------- /datapackage_pipelines/specs/hashers/dependency_resolver.py: 
-------------------------------------------------------------------------------- 1 | import datapackage 2 | from datapackage.exceptions import DataPackageException 3 | from ..parsers.base_parser import PipelineSpec 4 | 5 | from ..errors import SpecError 6 | 7 | 8 | class DependencyMissingException(Exception): 9 | 10 | def __init__(self, spec, missing): 11 | self.spec = spec 12 | self.missing = missing 13 | 14 | 15 | def resolve_dependencies(spec: PipelineSpec, all_pipeline_ids, status_mgr): 16 | 17 | cache_hash = '' 18 | dependencies = spec.pipeline_details.get('dependencies', ()) 19 | for dependency in dependencies: 20 | if 'pipeline' in dependency: 21 | pipeline_id = dependency['pipeline'] 22 | if pipeline_id not in all_pipeline_ids: 23 | raise DependencyMissingException(spec, pipeline_id) 24 | 25 | for dependency in dependencies: 26 | if 'pipeline' in dependency: 27 | pipeline_id = dependency['pipeline'] 28 | ps = status_mgr.get(pipeline_id) 29 | if not ps.runnable(): 30 | spec.validation_errors.append( 31 | SpecError('Invalid dependency', 32 | 'Cannot run until dependency passes validation: {}'.format(pipeline_id)) 33 | ) 34 | elif ps.dirty(): 35 | spec.validation_errors.append( 36 | SpecError('Dirty dependency', 37 | 'Cannot run until dependency is executed: {}'.format(pipeline_id)) 38 | ) 39 | elif ps.get_last_execution() is not None and not ps.get_last_execution().success: 40 | spec.validation_errors.append( 41 | SpecError('Dependency unsuccessful', 42 | 'Cannot run until dependency "{}" is successfully ' 43 | 'executed'.format(pipeline_id)) 44 | ) 45 | 46 | for dep_err in ps.validation_errors: 47 | spec.validation_errors.append( 48 | SpecError('From {}'.format(pipeline_id), dep_err) 49 | ) 50 | 51 | pipeline_hash = all_pipeline_ids.get(pipeline_id).cache_hash 52 | assert pipeline_hash is not None 53 | cache_hash += pipeline_hash 54 | 55 | spec.dependencies.append(pipeline_id) 56 | 57 | elif 'datapackage' in dependency: 58 | dp_id = dependency['datapackage'] 59 | try: 60 | dp = datapackage.DataPackage(dp_id) 61 | if 'hash' in dp.descriptor: 62 | cache_hash += dp.descriptor['hash'] 63 | else: 64 | spec.validation_errors.append( 65 | SpecError('Missing dependency', 66 | "Couldn't get data from datapackage %s" 67 | % dp_id)) 68 | except DataPackageException: 69 | spec.validation_errors.append( 70 | SpecError('Missing dependency', 71 | "Couldn't open datapackage %s" 72 | % dp_id)) 73 | 74 | else: 75 | spec.validation_errors.append( 76 | SpecError('Missing dependency', 77 | 'Unknown dependency provided (%r)' % dependency)) 78 | 79 | return cache_hash 80 | -------------------------------------------------------------------------------- /datapackage_pipelines/specs/hashers/hash_calculator.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | 3 | from ...utilities.extended_json import json 4 | from ..parsers.base_parser import PipelineSpec 5 | 6 | from ..errors import SpecError 7 | from .dependency_resolver import resolve_dependencies 8 | 9 | 10 | class HashCalculator(object): 11 | 12 | def __init__(self): 13 | self.all_pipeline_ids = {} 14 | 15 | def calculate_hash(self, spec: PipelineSpec, status_mgr, ignore_missing_deps=False): 16 | 17 | cache_hash = None 18 | if spec.pipeline_id in self.all_pipeline_ids: 19 | message = 'Duplicate key {0} in {1}' \ 20 | .format(spec.pipeline_id, spec.path) 21 | spec.validation_errors.append(SpecError('Duplicate Pipeline Id', message)) 22 | 23 | else: 24 | if ignore_missing_deps: 25 | cache_hash = '' 
26 | else: 27 | cache_hash = resolve_dependencies(spec, self.all_pipeline_ids, status_mgr) 28 | 29 | self.all_pipeline_ids[spec.pipeline_id] = spec 30 | if len(spec.validation_errors) > 0: 31 | return cache_hash 32 | 33 | for step in spec.pipeline_details['pipeline']: 34 | m = hashlib.md5() 35 | m.update(cache_hash.encode('ascii')) 36 | with open(step['executor'], 'rb') as f: 37 | m.update(f.read()) 38 | m.update(json.dumps(step, ensure_ascii=True, sort_keys=True) 39 | .encode('ascii')) 40 | cache_hash = m.hexdigest() 41 | step['_cache_hash'] = cache_hash 42 | 43 | spec.cache_hash = cache_hash 44 | -------------------------------------------------------------------------------- /datapackage_pipelines/specs/parsers/__init__.py: -------------------------------------------------------------------------------- 1 | from .basic_pipeline import BasicPipelineParser 2 | from .source_spec_pipeline import SourceSpecPipelineParser 3 | -------------------------------------------------------------------------------- /datapackage_pipelines/specs/parsers/base_parser.py: -------------------------------------------------------------------------------- 1 | class PipelineSpec(object): 2 | def __init__(self, 3 | path=None, 4 | pipeline_id=None, 5 | pipeline_details=None, 6 | source_details=None, 7 | validation_errors=None, 8 | dependencies=None, 9 | cache_hash='', 10 | schedule=None, 11 | environment=None): 12 | self.path = path 13 | self.pipeline_id = pipeline_id 14 | self.pipeline_details = pipeline_details 15 | self.source_details = source_details 16 | self.validation_errors = [] if validation_errors is None else validation_errors 17 | self.dependencies = [] if dependencies is None else dependencies 18 | self.cache_hash = cache_hash 19 | self.schedule = schedule 20 | self.environment = environment 21 | 22 | def __str__(self): 23 | return 'PipelineSpec({}, validation_errors={}, ' \ 24 | 'dependencies={}, cache_hash={})'\ 25 | .format(self.pipeline_id, self.validation_errors, 26 | self.dependencies, self.cache_hash) 27 | 28 | def __repr__(self): 29 | return str(self) 30 | 31 | 32 | class BaseParser(object): 33 | 34 | class InvalidFileException(Exception): 35 | def __init__(self, short_msg, long_msg): 36 | self.short_msg = short_msg 37 | self.long_msg = long_msg 38 | 39 | @classmethod 40 | def check_filename(cls, filename): 41 | raise NotImplementedError() 42 | 43 | @classmethod 44 | def to_pipeline(cls, spec, fullpath): 45 | raise NotImplementedError() 46 | 47 | @staticmethod 48 | def replace_root_dir(path, root_dir): 49 | if root_dir.endswith('/'): 50 | root_dir = root_dir[:-1] 51 | if path.startswith(root_dir): 52 | path = '.' 
+ path[len(root_dir):] 53 | return path 54 | -------------------------------------------------------------------------------- /datapackage_pipelines/specs/parsers/basic_pipeline.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Iterator 3 | 4 | from .base_parser import BaseParser, PipelineSpec 5 | 6 | 7 | class BasicPipelineParser(BaseParser): 8 | 9 | SPEC_FILENAME = 'pipeline-spec.yaml' 10 | 11 | @classmethod 12 | def check_filename(cls, filename): 13 | return filename == cls.SPEC_FILENAME 14 | 15 | @classmethod 16 | def to_pipeline(cls, spec, fullpath, root_dir='.') -> Iterator[PipelineSpec]: 17 | dirpath = os.path.dirname(fullpath) 18 | 19 | for pipeline_id, pipeline_details in spec.items(): 20 | pipeline_id = os.path.join(dirpath, pipeline_id) 21 | pipeline_id = cls.replace_root_dir(pipeline_id, root_dir) 22 | yield PipelineSpec(path=dirpath, 23 | pipeline_id=pipeline_id, 24 | pipeline_details=pipeline_details) 25 | -------------------------------------------------------------------------------- /datapackage_pipelines/specs/parsers/source_spec_pipeline.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from ..resolver import resolve_generator 4 | from ..errors import SpecError 5 | from .base_parser import BaseParser, PipelineSpec 6 | 7 | 8 | class SourceSpecPipelineParser(BaseParser): 9 | 10 | SOURCE_FILENAME_SUFFIX = '.source-spec.yaml' 11 | 12 | @classmethod 13 | def check_filename(cls, filename): 14 | return filename.endswith(cls.SOURCE_FILENAME_SUFFIX) 15 | 16 | @classmethod 17 | def fix_dependency(cls, dep, dirpath, root_dir): 18 | if dep.startswith('./'): 19 | dep = dep[2:] 20 | return os.path.join(cls.replace_root_dir(dirpath, root_dir), dep) 21 | 22 | @classmethod 23 | def to_pipeline(cls, source_spec, fullpath, root_dir='.'): 24 | filename = os.path.basename(fullpath) 25 | dirpath = os.path.dirname(fullpath) 26 | 27 | module_name = filename[:-len(cls.SOURCE_FILENAME_SUFFIX)] 28 | pipeline_id = os.path.join(dirpath, module_name) 29 | generator = resolve_generator(module_name) 30 | 31 | if generator is None: 32 | message = 'Unknown source description kind "{}" in {}' \ 33 | .format(module_name, fullpath) 34 | error = SpecError('Unknown source kind', message) 35 | yield PipelineSpec(pipeline_id=module_name, 36 | path=dirpath, 37 | validation_errors=[error], 38 | pipeline_details={'pipeline': []}) 39 | return 40 | 41 | base = cls.replace_root_dir(dirpath, root_dir) 42 | if generator.internal_validate(source_spec): 43 | try: 44 | spec = generator.internal_generate(source_spec, base) 45 | for pipeline_id, pipeline_details in spec: 46 | if pipeline_id[0] == ':' and pipeline_id[-1] == ':': 47 | module = pipeline_id[1:-1] 48 | filename = module + cls.SOURCE_FILENAME_SUFFIX 49 | yield from cls.to_pipeline(pipeline_details, 50 | os.path.join(dirpath, filename)) 51 | else: 52 | yield PipelineSpec(path=pipeline_details.get('__path', dirpath), 53 | pipeline_id=pipeline_id, 54 | pipeline_details=pipeline_details, 55 | source_details=source_spec) 56 | except Exception as e: 57 | message = '"{}" in {}' \ 58 | .format(e, fullpath) 59 | error = SpecError('Error converting source', message) 60 | yield PipelineSpec(pipeline_id=pipeline_id, 61 | path=dirpath, validation_errors=[error], 62 | pipeline_details={'pipeline': []}) 63 | else: 64 | message = 'Invalid source description for "{}" in {}' \ 65 | .format(module_name, fullpath) 66 | error = SpecError('Invalid Source', 
message) 67 | yield PipelineSpec(pipeline_id=pipeline_id, 68 | path=dirpath, 69 | validation_errors=[error], 70 | pipeline_details={'pipeline': []}) 71 | -------------------------------------------------------------------------------- /datapackage_pipelines/specs/schemas/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/datapackage-pipelines/06c8ccbd44be420233b73563cfb5bd3eb37de7cc/datapackage_pipelines/specs/schemas/__init__.py -------------------------------------------------------------------------------- /datapackage_pipelines/specs/schemas/pipeline-spec.schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-04/schema#", 3 | "type": "object", 4 | "required": [ 5 | "pipeline" 6 | ], 7 | "properties": { 8 | "title": { 9 | "type": "string" 10 | }, 11 | "description": { 12 | "type": "string" 13 | }, 14 | "environment": { 15 | "type": "object" 16 | }, 17 | "schedule": { 18 | "type": "object", 19 | "properties": { 20 | "crontab": { 21 | "type": "string" 22 | } 23 | } 24 | }, 25 | "pipeline": { 26 | "type": "array", 27 | "minItems": 1, 28 | "items": { 29 | "type": "object", 30 | "oneOf": [ 31 | { 32 | "required": [ 33 | "run" 34 | ] 35 | }, 36 | { 37 | "required": [ 38 | "flow" 39 | ] 40 | } 41 | ], 42 | "properties": { 43 | "run": { 44 | "type": "string" 45 | }, 46 | "parameters": { 47 | "type": "object" 48 | }, 49 | "cache": { 50 | "type": "boolean" 51 | }, 52 | "validate": { 53 | "type": "boolean" 54 | } 55 | } 56 | } 57 | }, 58 | "dependencies": { 59 | "type": "array", 60 | "items": { 61 | "type": "object", 62 | "maxProperties": 1, 63 | "properties": { 64 | "datapackage": { 65 | "type": "string", 66 | "format": "uri" 67 | }, 68 | "pipeline": { 69 | "type": "string" 70 | } 71 | } 72 | } 73 | } 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /datapackage_pipelines/specs/schemas/validator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import jsonschema 4 | 5 | from ..errors import SpecError 6 | 7 | 8 | schema_filename = 'pipeline-spec.schema.json' 9 | schema_filename = os.path.join(os.path.dirname(__file__), 10 | schema_filename) 11 | schema = json.load(open(schema_filename)) 12 | validator = jsonschema.validators.validator_for(schema) 13 | schema = validator(schema) 14 | 15 | 16 | def validate_pipeline(pipeline_details, errors): 17 | try: 18 | schema.validate(pipeline_details) 19 | except jsonschema.ValidationError as e: 20 | errors.append(SpecError('Invalid Pipeline', str(e))) 21 | return False 22 | return True 23 | -------------------------------------------------------------------------------- /datapackage_pipelines/status/__init__.py: -------------------------------------------------------------------------------- 1 | from .status_manager import status_mgr 2 | from .hook_sender import hook_sender 3 | -------------------------------------------------------------------------------- /datapackage_pipelines/status/backend_filesystem.py: -------------------------------------------------------------------------------- 1 | import os 2 | import codecs 3 | import ujson 4 | 5 | 6 | class FilesystemBackend(object): 7 | 8 | KIND = 'filesystem' 9 | 10 | def __init__(self, root_dir='.'): 11 | dpp_dirname = os.environ.get('DPP_DB_DIRNAME', '.dpp') 12 | self.base_dir = os.path.join(root_dir,
dpp_dirname) 13 | os.makedirs(self.base_dir, exist_ok=True) 14 | 15 | def fn(self, pipeline_id): 16 | pipeline_id = codecs.encode(pipeline_id.encode('utf8'), 'base64').decode('ascii').replace('\n', '') 17 | return os.path.join(self.base_dir, pipeline_id) 18 | 19 | def get_status(self, pipeline_id): 20 | try: 21 | with open(self.fn(pipeline_id)) as f: 22 | return ujson.load(f) 23 | except FileNotFoundError: 24 | pass 25 | except ValueError: 26 | pass 27 | 28 | def set_status(self, pipeline_id, status): 29 | fn = self.fn(pipeline_id) 30 | with open(fn+'.tmp', 'w') as f: 31 | ujson.dump(status, f) 32 | os.rename(fn+'.tmp', fn) 33 | 34 | def del_status(self, pipeline_id): 35 | try: 36 | os.unlink(self.fn(pipeline_id)) 37 | except FileNotFoundError: 38 | pass 39 | 40 | def register_pipeline_id(self, pipeline_id): 41 | pass 42 | 43 | def deregister_pipeline_id(self, pipeline_id): 44 | self.del_status(pipeline_id) 45 | 46 | def reset(self): 47 | for p in self.all_pipeline_ids(): 48 | self.del_status(p) 49 | 50 | def all_pipeline_ids(self): 51 | # Decoding encoded identifiers 52 | dec_ids = [] 53 | enc_ids = sorted(os.listdir(self.base_dir)) 54 | for enc_id in enc_ids: 55 | dec_id = codecs.decode(enc_id.encode('utf8'), 'base64').decode('utf8') 56 | if dec_id.startswith('PipelineStatus:'): 57 | dec_id = dec_id.replace('PipelineStatus:', '') 58 | dec_ids.append(dec_id) 59 | return dec_ids 60 | 61 | def all_statuses(self): 62 | return [self.get_status(_id) 63 | for _id in self.all_pipeline_ids()] 64 | -------------------------------------------------------------------------------- /datapackage_pipelines/status/backend_redis.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import redis 4 | 5 | from datapackage_pipelines.utilities.extended_json import json 6 | 7 | 8 | class RedisBackend(object): 9 | 10 | KIND = 'redis' 11 | 12 | def __init__(self, host=None, port=6379): 13 | self.redis = None 14 | if host is not None and len(host) > 0: 15 | conn = redis.StrictRedis(host=host, port=port, db=5) 16 | try: 17 | conn.ping() 18 | self.redis = conn 19 | except redis.exceptions.ConnectionError: 20 | logging.warning('Failed to connect to Redis, host:%s, port:%s', 21 | host, port) 22 | 23 | def is_init(self): 24 | return self.redis is not None 25 | 26 | def get_status(self, pipeline_id): 27 | if self.is_init(): 28 | status = self.redis.get(pipeline_id) 29 | if status is not None: 30 | status = json.loads(status.decode('ascii')) 31 | return status 32 | 33 | def set_status(self, pipeline_id, status): 34 | if self.is_init(): 35 | self.redis.set(pipeline_id, json.dumps(status, ensure_ascii=True)) 36 | 37 | def del_status(self, pipeline_id): 38 | if self.is_init(): 39 | self.redis.delete(pipeline_id) 40 | 41 | def register_pipeline_id(self, pipeline_id): 42 | if self.is_init(): 43 | self.redis.sadd('all-pipelines', pipeline_id.strip()) 44 | 45 | def deregister_pipeline_id(self, pipeline_id): 46 | if self.is_init(): 47 | self.redis.srem('all-pipelines', pipeline_id.strip()) 48 | 49 | def reset(self): 50 | if self.is_init(): 51 | self.redis.delete('all-pipelines') 52 | 53 | def all_pipeline_ids(self): 54 | if self.is_init(): 55 | return [x.decode('utf-8') for x in self.redis.smembers('all-pipelines')] 56 | return [] 57 | 58 | def all_statuses(self): 59 | if self.is_init(): 60 | all_ids = self.redis.smembers('all-pipelines') 61 | pipe = self.redis.pipeline() 62 | for _id in sorted(all_ids): 63 | pipe.get(_id) 64 | return [json.loads(sts.decode('ascii')) for 
sts in pipe.execute()] 65 | return [] 66 | -------------------------------------------------------------------------------- /datapackage_pipelines/status/backend_sqlite.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sqlite3 3 | from datapackage_pipelines.utilities.extended_json import json 4 | 5 | DPP_DB_FILENAME = os.environ.get('DPP_DB_FILENAME', '.dpp.db') 6 | 7 | 8 | class Sqlite3Dict(object): 9 | def __init__(self, filename): 10 | self.filename = filename 11 | conn = sqlite3.connect(self.filename) 12 | cursor = conn.cursor() 13 | cursor.execute( 14 | '''CREATE TABLE IF NOT EXISTS d (_key text, _value text)''' 15 | ) 16 | conn.commit() 17 | conn.close() 18 | 19 | def __getitem__(self, key): 20 | conn = sqlite3.connect(self.filename) 21 | cursor = conn.cursor() 22 | result = cursor.execute( 23 | 'SELECT _value from d where _key=?', 24 | (key,) 25 | ).fetchone() 26 | conn.close() 27 | if result is not None: 28 | return json.loads(result[0]) 29 | return None 30 | 31 | def __setitem__(self, key, value): 32 | conn = sqlite3.connect(self.filename) 33 | value = json.dumps(value) 34 | cursor = conn.cursor() 35 | cursor.execute('DELETE FROM d where _key=?', (key,)) 36 | cursor.execute('INSERT INTO d VALUES (?,?)', (key, value)) 37 | conn.commit() 38 | conn.close() 39 | 40 | def __delitem__(self, key): 41 | conn = sqlite3.connect(self.filename) 42 | cursor = conn.cursor() 43 | cursor.execute('DELETE FROM d where _key=?', (key,)) 44 | conn.commit() 45 | conn.close() 46 | 47 | 48 | class SqliteBackend(object): 49 | 50 | KIND = 'sqlite3' 51 | ALL_PIPELINES_KEY = 'all-pipelines' 52 | 53 | def __init__(self): 54 | self.db = Sqlite3Dict(DPP_DB_FILENAME) 55 | 56 | def get_status(self, pipeline_id): 57 | return self.db[pipeline_id] 58 | 59 | def set_status(self, pipeline_id, status): 60 | self.db[pipeline_id] = status 61 | 62 | def del_status(self, pipeline_id): 63 | del self.db[pipeline_id] 64 | 65 | def register_pipeline_id(self, pipeline_id): 66 | all_pipelines = self.db[self.ALL_PIPELINES_KEY] 67 | if all_pipelines is None: 68 | all_pipelines = [] 69 | if pipeline_id not in all_pipelines: 70 | all_pipelines.append(pipeline_id) 71 | self.db[self.ALL_PIPELINES_KEY] = all_pipelines 72 | 73 | def deregister_pipeline_id(self, pipeline_id): 74 | all_pipelines = self.db[self.ALL_PIPELINES_KEY] 75 | if all_pipelines is None: 76 | all_pipelines = [] 77 | if pipeline_id in all_pipelines: 78 | all_pipelines = list(filter(lambda x: x != pipeline_id, all_pipelines)) 79 | self.db[self.ALL_PIPELINES_KEY] = all_pipelines 80 | 81 | def reset(self): 82 | self.db[self.ALL_PIPELINES_KEY] = [] 83 | 84 | def all_pipeline_ids(self): 85 | all_ids = sorted(self.db[self.ALL_PIPELINES_KEY]) 86 | return all_ids 87 | 88 | def all_statuses(self): 89 | all_ids = sorted(self.db[self.ALL_PIPELINES_KEY]) 90 | return [self.db[_id] for _id in all_ids] 91 | -------------------------------------------------------------------------------- /datapackage_pipelines/status/hook_sender.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from concurrent.futures import ThreadPoolExecutor 3 | 4 | import requests 5 | from requests.exceptions import RequestException 6 | 7 | tpe = ThreadPoolExecutor(max_workers=1) 8 | 9 | 10 | def _send(hook, payload): 11 | try: 12 | response = requests.post(hook, json=payload) 13 | if response.status_code != 200: 14 | logging.warning('Server returned %s, hook %s with payload %r ', 15 |
response.status_code, hook, payload) 16 | except RequestException as e: 17 | logging.warning('Failed to call hook %s with payload %r (%s)', 18 | hook, payload, e) 19 | 20 | 21 | class HookSender(): 22 | def send(self, hook, payload, blocking=False): 23 | if blocking: 24 | _send(hook, payload) 25 | else: 26 | tpe.submit(_send, hook, payload) 27 | 28 | 29 | hook_sender = HookSender() 30 | -------------------------------------------------------------------------------- /datapackage_pipelines/status/status_manager.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from .backend_redis import RedisBackend 4 | from .backend_filesystem import FilesystemBackend 5 | from .pipeline_status import PipelineStatus 6 | 7 | 8 | class StatusManager(object): 9 | 10 | def __init__(self, *, host=None, port=6379, root_dir='.'): 11 | self._host = host 12 | self._port = port 13 | self._backend = None 14 | self._root_dir = root_dir 15 | 16 | @property 17 | def backend(self): 18 | if self._backend is None: 19 | redis = RedisBackend(self._host, self._port) 20 | self._backend = redis if redis.is_init() else FilesystemBackend(self._root_dir) 21 | return self._backend 22 | 23 | def get_errors(self, _id): 24 | ex = self.get(_id).get_last_execution() 25 | if ex is not None: 26 | return ex.error_log 27 | return [] 28 | 29 | def initialize(self): 30 | self.backend.reset() 31 | 32 | def get(self, _id) -> PipelineStatus: 33 | return PipelineStatus(self.backend, _id) 34 | 35 | def all_statuses(self): 36 | return self.backend.all_statuses() 37 | 38 | def all_pipeline_ids(self): 39 | return self.backend.all_pipeline_ids() 40 | 41 | def deregister(self, pipeline_id): 42 | return self.get(pipeline_id).deregister() 43 | 44 | 45 | _status = None 46 | _root_dir = None 47 | 48 | 49 | def status_mgr(root_dir='.') -> StatusManager: 50 | global _status 51 | global _root_dir 52 | 53 | if _status is not None and _root_dir == root_dir: 54 | return _status 55 | _root_dir = root_dir 56 | _status = StatusManager(host=os.environ.get('DPP_REDIS_HOST'), root_dir=root_dir) 57 | return _status 58 | -------------------------------------------------------------------------------- /datapackage_pipelines/utilities/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/datapackage-pipelines/06c8ccbd44be420233b73563cfb5bd3eb37de7cc/datapackage_pipelines/utilities/__init__.py -------------------------------------------------------------------------------- /datapackage_pipelines/utilities/execution_id.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | 3 | 4 | def gen_execution_id(): 5 | return str(uuid.uuid4()) 6 | -------------------------------------------------------------------------------- /datapackage_pipelines/utilities/flow_utils.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow, load, update_package 2 | from dataflows.helpers.resource_matcher import ResourceMatcher 3 | 4 | from datapackage_pipelines.wrapper import ProcessorContext 5 | from datapackage_pipelines.utilities.extended_json import LazyJsonLine 6 | 7 | 8 | def load_lazy_json(resources): 9 | 10 | def func(package): 11 | matcher = ResourceMatcher(resources, package.pkg) 12 | yield package.pkg 13 | for rows in package: 14 | if matcher.match(rows.res.name): 15 | yield ( 16 | row.inner 17 | if isinstance(row, LazyJsonLine) 18 | 
else row 19 | for row in rows 20 | ) 21 | else: 22 | yield rows 23 | 24 | return func 25 | 26 | 27 | class MergeableStats(): 28 | def __init__(self, ds_stats, ctx_stats): 29 | self.ds_stats = ds_stats 30 | self.ctx_stats = ctx_stats 31 | 32 | def __iter__(self): 33 | if self.ds_stats is not None: 34 | for x in self.ds_stats: 35 | yield from x.items() 36 | if self.ctx_stats is not None: 37 | yield from self.ctx_stats.items() 38 | 39 | 40 | def spew_flow(flow, ctx: ProcessorContext): 41 | flow = Flow( 42 | update_package(**ctx.datapackage), 43 | load((ctx.datapackage, ctx.resource_iterator)), 44 | flow, 45 | ) 46 | datastream = flow.datastream() 47 | ctx.datapackage = datastream.dp.descriptor 48 | ctx.resource_iterator = datastream.res_iter 49 | ctx.stats = MergeableStats(datastream.stats, ctx.stats) 50 | -------------------------------------------------------------------------------- /datapackage_pipelines/utilities/lazy_dict.py: -------------------------------------------------------------------------------- 1 | import collections 2 | 3 | try: 4 | MutableMapping = collections.MutableMapping 5 | except: 6 | MutableMapping = collections.abc.MutableMapping 7 | 8 | class LazyDict(MutableMapping): 9 | 10 | def __init__(self): 11 | self._inner = None 12 | self._dirty = False 13 | 14 | @property 15 | def dirty(self): 16 | return self._dirty 17 | 18 | @property 19 | def inner(self): 20 | self.__ensure() 21 | return self._inner 22 | 23 | def _evaluate(self): 24 | raise NotImplementedError() 25 | 26 | def __ensure(self): 27 | if self._inner is None: 28 | self._inner = self._evaluate() 29 | 30 | def __len__(self): 31 | self.__ensure() 32 | return len(self._inner) 33 | 34 | def __getitem__(self, item): 35 | self.__ensure() 36 | return self._inner.__getitem__(item) 37 | 38 | def __setitem__(self, key, value): 39 | self.__ensure() 40 | self._inner.__setitem__(key, value) 41 | self._dirty = True 42 | 43 | def __delitem__(self, key): 44 | self.__ensure() 45 | self._inner.__delitem__(key) 46 | self._dirty = True 47 | 48 | def __iter__(self): 49 | self.__ensure() 50 | return self._inner.__iter__() 51 | -------------------------------------------------------------------------------- /datapackage_pipelines/utilities/resources.py: -------------------------------------------------------------------------------- 1 | def is_a_url(path): 2 | return (path is not None and isinstance(path, str) and 3 | (path.startswith('http://') or 4 | path.startswith('https://')) 5 | ) 6 | 7 | 8 | def tabular(descriptor): 9 | return 'schema' in descriptor 10 | 11 | 12 | def streaming(descriptor): 13 | return descriptor.get(PROP_STREAMING) 14 | 15 | 16 | def streamable(descriptor): 17 | return PROP_STREAMED_FROM in descriptor and \ 18 | not streaming(descriptor) 19 | 20 | 21 | def get_path(descriptor): 22 | path = descriptor.get('path') 23 | if isinstance(path, str): 24 | return path 25 | if isinstance(path, list): 26 | if len(path) > 0: 27 | return path.pop(0) 28 | else: 29 | return None 30 | assert path is None, '%r' % path 31 | return None 32 | 33 | 34 | PATH_PLACEHOLDER = '_' 35 | PROP_STREAMED_FROM = 'dpp:streamedFrom' 36 | PROP_STREAMING = 'dpp:streaming' 37 | -------------------------------------------------------------------------------- /datapackage_pipelines/utilities/stat_utils.py: -------------------------------------------------------------------------------- 1 | STATS_DPP_KEY = '.dpp' 2 | STATS_OUT_DP_URL_KEY = 'out-datapackage-url' 3 | 4 | 5 | def user_facing_stats(stats): 6 | if stats is not None and 
isinstance(stats, dict): 7 | return dict((k, v) for k, v in stats.items() if k != STATS_DPP_KEY) 8 | return None 9 | -------------------------------------------------------------------------------- /datapackage_pipelines/utilities/tabulator_txt_parser.py: -------------------------------------------------------------------------------- 1 | from tabulator.parser import Parser 2 | from tabulator.helpers import reset_stream 3 | 4 | 5 | class TXTParser(Parser): 6 | """Parser to parse TXT data format. 7 | """ 8 | 9 | # Public 10 | 11 | options = [] 12 | 13 | def __init__(self, loader, **options): 14 | super(TXTParser, self).__init__(loader, **options) 15 | 16 | # Set attributes 17 | self.__options = options 18 | self.__extended_rows = None 19 | self.__loader = loader 20 | self.__chars = None 21 | self.__encoding = None 22 | 23 | @property 24 | def closed(self): 25 | return self.__chars is None or self.__chars.closed 26 | 27 | def open(self, source, encoding=None, force_parse=False): 28 | self.close() 29 | self.__chars = self.__loader.load(source, encoding) 30 | self.__encoding = getattr(self.__chars, 'encoding', encoding) 31 | if self.__encoding: 32 | self.__encoding = self.__encoding.lower() 33 | self.reset() 34 | 35 | def close(self): 36 | if not self.closed: 37 | self.__chars.close() 38 | 39 | def reset(self): 40 | reset_stream(self.__chars) 41 | self.__extended_rows = self.__iter_extended_rows() 42 | 43 | @property 44 | def extended_rows(self): 45 | return self.__extended_rows 46 | 47 | @property 48 | def encoding(self): 49 | return self.__encoding 50 | 51 | # Private 52 | 53 | def __iter_extended_rows(self): 54 | for number, line in enumerate(self.__chars, start=1): 55 | if line.endswith('\n'): 56 | line = line[:-1] 57 | yield (number, None, [line]) 58 | -------------------------------------------------------------------------------- /datapackage_pipelines/web/__init__.py: -------------------------------------------------------------------------------- 1 | from .server import app 2 | -------------------------------------------------------------------------------- /datapackage_pipelines/wrapper/__init__.py: -------------------------------------------------------------------------------- 1 | from .wrapper import ingest, spew, process, \ 2 | get_dependency_datapackage_url, ProcessorContext 3 | -------------------------------------------------------------------------------- /docker/github_config.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import os 3 | 4 | if __name__ == "__main__": 5 | repos = os.environ.get('DPP_GITHUB_REPOSITORIES') 6 | if repos is not None: 7 | repos = repos.split(';') 8 | 9 | config = {} 10 | for repo in repos: 11 | repo = repo.split(':') 12 | if len(repo) > 1: 13 | repo, path = repo 14 | else: 15 | repo = repo[0] 16 | path = None 17 | config[repo] = { 18 | 'repository': repo, 19 | } 20 | if path is not None: 21 | config[repo]['base-path'] = path 22 | with open('github.source-spec.yaml', 'w') as source_spec: 23 | yaml.dump(config, source_spec) 24 | -------------------------------------------------------------------------------- /docker/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | list_descendants() { 4 | local root_pid=$1 5 | local children=$(for PID in `ps -o pid,ppid | grep " $root_pid"'$'`; do [ "$PID" != "$root_pid" ] && echo $PID; done) 6 | for PID in $children; do list_descendants "$PID"; done 7 | [ "$children" != "" ] && echo "$children" 8 | } 9 | 10 | if [
"$1" = "server" ]; then 11 | echo "Starting Server" 12 | redis-server /etc/redis.conf --daemonize yes --dir /var/redis 13 | until [ `redis-cli ping | grep -c PONG` = 1 ]; do echo "Waiting 1s for Redis to load"; sleep 1; done 14 | rm -f /var/run/dpp/dpp-celerybeat.pid /var/run/dpp/dpp-celeryd-management.pid /var/run/dpp/dpp-celeryd-worker.pid 15 | python /dpp/docker/github_config.py 16 | dpp init 17 | 18 | echo "Deleting `redis-cli -n 6 KEYS '*' | wc -l` keys" 19 | redis-cli -n 6 FLUSHDB 20 | echo "Remaining `redis-cli -n 6 KEYS '*' | wc -l` keys" 21 | 22 | SCHEDULER=1 python3 -m celery -b $DPP_CELERY_BROKER -A datapackage_pipelines.app -l INFO --pidfile=/var/run/dpp/dpp-celerybeat.pid beat & 23 | python3 -m celery -b $DPP_CELERY_BROKER --concurrency=1 -A datapackage_pipelines.app -Q datapackage-pipelines-management -l INFO --pidfile=/var/run/dpp/dpp-celeryd-management.pid worker & 24 | python3 -m celery -b $DPP_CELERY_BROKER --concurrency=$DPP_NUM_WORKERS -A datapackage_pipelines.app -Q datapackage-pipelines -l INFO --pidfile=/var/run/dpp/dpp-celeryd-worker.pid worker & 25 | dpp serve & 26 | DPP_SERVE_PID=$! 27 | sleep 5 28 | echo $DPP_SERVE_PID > /var/run/dpp/dpp-serve.pid 29 | wait $DPP_SERVE_PID 30 | rm -f /var/run/dpp/dpp-serve.pid 31 | exit 0 32 | elif [ "$1" = "server-reload" ]; then 33 | trap 'echo reloading...; while ! /dpp/docker/run.sh stop-server; do echo .; sleep 1; done' HUP 34 | while true; do 35 | /dpp/docker/run.sh server & 36 | wait $! 37 | done 38 | elif [ "$1" == "stop-server" ]; then 39 | DPP_SERVE_PID=`cat /var/run/dpp/dpp-serve.pid 2>/dev/null` && rm /var/run/dpp/dpp-serve.pid 40 | [ "$?" != "0" ] && echo missing dpp-serve.pid && exit 1 41 | DPP_SERVE_PIDS="$(list_descendants $DPP_SERVE_PID) $DPP_SERVE_PID" 42 | pstree -p 43 | echo collecting pids to terminate 44 | PIDS="" 45 | for PIDFILE in dpp-celeryd-worker dpp-celeryd-management dpp-celerybeat redis; do 46 | PID=`cat /var/run/dpp/$PIDFILE.pid 2>/dev/null` \ 47 | && PIDS="$PIDS $(list_descendants $PID) $PID" 48 | done 49 | if [ "$PIDS" != "" ]; then 50 | echo sending TERM signal for pids: ${PIDS} 51 | for PID in $PIDS; do kill $PID; done 52 | echo sleeping ${DPP_RELOAD_GRACE_PERIOD:-5} seconds before sending KILL signal 53 | sleep ${DPP_RELOAD_GRACE_PERIOD:-5} 54 | for PID in $PIDS; do kill -9 $PID 2>/dev/null; done 55 | echo ensuring all PIDS were terminated 56 | for PID in $PIDS; do kill -0 $PID 2>/dev/null \ 57 | && kill -9 $PID 2>/dev/null \ 58 | && echo sleeping ${DPP_RELOAD_TERMINATE_PERIOD:-2} seconds to allow process $PID to be KILLed \ 59 | && sleep ${DPP_RELOAD_TERMINATE_PERIOD:-2} \ 60 | && kill -0 $PID 2>/dev/null && echo $PID not killed && exit 1; done 61 | fi 62 | for PIDFILE in dpp-celeryd-worker dpp-celeryd-management dpp-celerybeat redis; do 63 | rm -f /var/run/dpp/$PIDFILE.pid 64 | done 65 | echo sending TERM signal to dpp-serve and descendats 66 | kill $DPP_SERVE_PIDS 2>/dev/null 67 | kill -0 $DPP_SERVE_PID 2>/dev/null && echo waiting up to 5 seconds to let dpp-serve to be killed peacefully \ 68 | && for i in 0 1 2 3 4 5; do ! 
kill -0 $DPP_SERVE_PID 2>/dev/null || sleep 1; done 69 | kill -9 $DPP_SERVE_PIDS 70 | sleep ${DPP_RELOAD_TERMINATE_PERIOD:-2} && kill -0 $DPP_SERVE_PID 2>/dev/null && echo dpp serve not killed && exit 1 71 | echo killed server PID $DPP_SERVE_PID 72 | pstree -p 73 | exit 0 74 | else 75 | /usr/local/bin/dpp "$@" 76 | fi; 77 | -------------------------------------------------------------------------------- /pylama.ini: -------------------------------------------------------------------------------- 1 | [pylama] 2 | linters = pyflakes,pep8 3 | ignore = E128,E301,E741 4 | 5 | [pylama:pep8] 6 | max_line_length = 120 7 | 8 | [pylama:*/__init__.py] 9 | ignore = W0611 10 | -------------------------------------------------------------------------------- /samples/add_constant.py: -------------------------------------------------------------------------------- 1 | # Add new column with constant value to first resource 2 | # Column name and value are taken from the processor's parameters 3 | from datapackage_pipelines.wrapper import process 4 | 5 | 6 | def modify_datapackage(datapackage, parameters, _): 7 | datapackage['resources'][0]['schema']['fields'].append({ 8 | 'name': parameters['column-name'], 9 | 'type': 'string' 10 | }) 11 | return datapackage 12 | 13 | 14 | def process_row(row, _1, _2, resource_index, parameters, _): 15 | if resource_index == 0: 16 | row[parameters['column-name']] = parameters['value'] 17 | return row 18 | 19 | 20 | process(modify_datapackage=modify_datapackage, 21 | process_row=process_row) 22 | -------------------------------------------------------------------------------- /samples/co2-information-cdiac.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/datapackage-pipelines/06c8ccbd44be420233b73563cfb5bd3eb37de7cc/samples/co2-information-cdiac.zip -------------------------------------------------------------------------------- /samples/pipeline-spec.yaml: -------------------------------------------------------------------------------- 1 | worldbank-co2-emissions: 2 | schedule: 3 | crontab: '0 * * * *' 4 | pipeline: 5 | - 6 | run: update_package 7 | parameters: 8 | name: 'co2-emissions' 9 | title: 'CO2 emissions [metric tons per capita]' 10 | homepage: 'http://worldbank.org/' 11 | - 12 | run: load 13 | parameters: 14 | from: "http://api.worldbank.org/v2/en/indicator/EN.ATM.CO2E.PC?downloadformat=excel" 15 | name: 'global-data' 16 | headers: 4 17 | format: xls 18 | - 19 | run: set_types 20 | parameters: 21 | resources: global-data 22 | types: 23 | "[12][0-9]{3}": 24 | type: number 25 | - 26 | run: dump_to_zip 27 | parameters: 28 | out-file: co2-emissions-wb.zip 29 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal=1 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import absolute_import 5 | 6 | import os 7 | import io 8 | from setuptools import setup, find_packages 9 | 10 | 11 | # Helpers 12 | def read(*paths): 13 | """Read a text file.""" 14 | basedir = os.path.dirname(__file__) 15 | fullpath = os.path.join(basedir, *paths) 16 | contents = io.open(fullpath, 
encoding='utf-8').read().strip() 17 | return contents 18 | 19 | 20 | # Prepare 21 | PACKAGE = 'datapackage_pipelines' 22 | NAME = PACKAGE.replace('_', '-') 23 | INSTALL_REQUIRES = [ 24 | 'celery<5', 25 | 'requests', 26 | 'datapackage>=1.14.0', 27 | 'tableschema>=1.2.5', 28 | 'tableschema-sql>=0.10.4', 29 | 'pyyaml', 30 | 'ujson', 31 | 'mistune<2', 32 | 'markupsafe==2.0.1', 33 | 'redis>=3,<4', 34 | 'click<8.0', 35 | 'awesome-slugify', 36 | 'flask<2.0.0', 37 | 'flask-cors', 38 | 'flask-jsonpify', 39 | 'flask-basicauth', 40 | 'cachetools', 41 | 'tabulator>=1.50.0', 42 | 'globster>=0.1.0', 43 | 'dataflows>=0.2.11', 44 | 'python-dateutil<2.8.1', 45 | 'werkzeug<1.0', 46 | ] 47 | SPEEDUP_REQUIRES = [ 48 | 'dataflows[speedup]', 49 | ] 50 | LINT_REQUIRES = [ 51 | 'pylama', 52 | ] 53 | TESTS_REQUIRE = [ 54 | 'tox', 55 | 'sqlalchemy', 56 | ] 57 | README = read('README.md') 58 | VERSION = read(PACKAGE, 'VERSION') 59 | PACKAGES = find_packages(exclude=['examples', 'tests', '.tox']) 60 | 61 | # Run 62 | setup( 63 | name=NAME, 64 | version=VERSION, 65 | packages=PACKAGES, 66 | include_package_data=True, 67 | install_requires=INSTALL_REQUIRES, 68 | tests_require=TESTS_REQUIRE, 69 | extras_require={ 70 | 'develop': LINT_REQUIRES + TESTS_REQUIRE, 71 | 'speedup': SPEEDUP_REQUIRES, 72 | }, 73 | zip_safe=False, 74 | long_description=README, 75 | long_description_content_type='text/markdown', 76 | description='{{ DESCRIPTION }}', 77 | author='Open Knowledge Foundation', 78 | author_email='info@okfn.org', 79 | url='https://github.com/frictionlessdata/datapackage-pipelines', 80 | license='MIT', 81 | keywords=[ 82 | 'data', 83 | ], 84 | classifiers=[ 85 | 'Development Status :: 4 - Beta', 86 | 'Intended Audience :: Developers', 87 | 'License :: OSI Approved :: MIT License', 88 | 'Operating System :: OS Independent', 89 | 'Programming Language :: Python :: 3.6', 90 | 'Topic :: Software Development :: Libraries :: Python Modules', 91 | ], 92 | entry_points={ 93 | 'console_scripts': [ 94 | 'dpp = datapackage_pipelines.cli:cli', 95 | ] 96 | }, 97 | ) 98 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | from datapackage_pipelines.manager.logging_config import logging 2 | -------------------------------------------------------------------------------- /tests/cli/custom_formatters/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/datapackage-pipelines/06c8ccbd44be420233b73563cfb5bd3eb37de7cc/tests/cli/custom_formatters/__init__.py -------------------------------------------------------------------------------- /tests/cli/custom_formatters/xlsx_format.py: -------------------------------------------------------------------------------- 1 | from datapackage_pipelines.lib.dump.file_formats import CSVFormat, get_path 2 | import os 3 | import openpyxl 4 | 5 | 6 | class XLSXFormat(CSVFormat): 7 | 8 | def prepare_resource(self, resource): 9 | super(XLSXFormat, self).prepare_resource(resource) 10 | basename, _ = os.path.splitext(get_path(resource)) 11 | resource['path'] = basename + '.xlsx' 12 | resource['format'] = 'xlsx' 13 | 14 | def initialize_file(self, file, headers): 15 | self.file = file 16 | self.headers = headers 17 | wb = openpyxl.Workbook() 18 | wb.active.append(self.headers) 19 | return wb 20 | 21 | def write_transformed_row(self, writer, transformed_row, fields): 22 | 
writer.active.append([transformed_row[k] for k in self.headers]) 23 | 24 | def finalize_file(self, writer): 25 | writer.save(self.file.name) 26 | -------------------------------------------------------------------------------- /tests/cli/expected_flow_data.csv: -------------------------------------------------------------------------------- 1 | first_name,last_name,house,age,foo 2 | Tyrion,Lannister,Lannister,27,foo 3 | Jaime,Lannister,Lannister,34,foo 4 | Cersei,Lannister,Lannister,34,foo 5 | Jon,Snow,Stark,17,foo 6 | Sansa,Stark,Stark,14,foo 7 | Arya,Stark,Stark,11,foo 8 | Bran,Stark,Stark,10,foo 9 | Rickon,Stark,Stark,5,foo 10 | Daenerys,Targaryen,Targaryen,16,foo 11 | -------------------------------------------------------------------------------- /tests/cli/pipeline-spec.yaml: -------------------------------------------------------------------------------- 1 | raise-exception: 2 | pipeline: 3 | - run: raise_exception 4 | code: raise Exception() 5 | 6 | failure-no-errors: 7 | pipeline: 8 | - run: success 9 | code: "" 10 | 11 | success: 12 | pipeline: 13 | - run: success 14 | code: | 15 | from datapackage_pipelines.wrapper import ingest, spew 16 | parameters, datapackage, resources = ingest() 17 | spew(datapackage, []) 18 | 19 | verbose-logs-with-sleep: 20 | pipeline: 21 | - run: code 22 | code: | 23 | from datapackage_pipelines.wrapper import ingest, spew 24 | import logging, itertools, time 25 | log_numbers = itertools.count() 26 | def log_line(): 27 | logging.info('log line {}'.format(next(log_numbers))) 28 | log_line() 29 | time.sleep(.1) 30 | log_line() 31 | time.sleep(.1) 32 | log_line() 33 | time.sleep(.1) 34 | parameters, datapackage, resources = ingest() 35 | log_line() 36 | time.sleep(.1) 37 | log_line() 38 | time.sleep(.1) 39 | log_line() 40 | time.sleep(.1) 41 | spew(datapackage, []) 42 | log_line() 43 | time.sleep(.1) 44 | log_line() 45 | time.sleep(.1) 46 | log_line() 47 | 48 | load-resource-progress-log: 49 | pipeline: 50 | - run: load_resource 51 | parameters: 52 | url: ../data/datapackage.json 53 | resource: my-spiffy-resource 54 | log-progress-rows: 2 55 | 56 | custom-formatters: 57 | pipeline: 58 | - run: load_resource 59 | parameters: 60 | url: ../data/datapackage.json 61 | resource: my-spiffy-resource 62 | - run: duplicate 63 | parameters: 64 | source: my-spiffy-resource 65 | target-name: my-spiffy-xlsx-resource 66 | target-path: my-spiffy-resource.xlsx 67 | - run: dump.to_path 68 | parameters: 69 | out-path: custom_formatters 70 | force-format: false 71 | counters: 72 | resource-hash: '' 73 | file-formatters: 74 | xlsx: custom_formatters.xlsx_format.XLSXFormat 75 | - run: dump.to_zip 76 | parameters: 77 | out-file: custom_formatters/datapackage.zip 78 | force-format: false 79 | counters: 80 | resource-hash: '' 81 | file-formatters: 82 | xlsx: custom_formatters.xlsx_format.XLSXFormat 83 | 84 | dataflows: 85 | pipeline: 86 | - run: load_resource 87 | parameters: 88 | url: ../data/datapackage.json 89 | resource: my-spiffy-resource 90 | - flow: test_flow 91 | parameters: 92 | attr: foo 93 | - run: dump_to_path 94 | parameters: 95 | out-path: test_flow_data 96 | -------------------------------------------------------------------------------- /tests/cli/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name="dpp_tests_cli", 5 | packages=["custom_formatters"] 6 | ) 7 | -------------------------------------------------------------------------------- 
/tests/cli/test_cli_exit_codes.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ! dpp \ 4 | && echo "test failed: dpp returned with non-zero exit code $?" && exit 1 5 | 6 | dpp run ./tests/cli/raise-exception \ 7 | && echo "test failed: exception in pipeline returned successful exit code" && exit 1 8 | 9 | dpp run ./tests/cli/failure-no-errors \ 10 | && echo "test failed: pipeline that failed without errors returned successful exit code" && exit 1 11 | 12 | ! dpp run ./tests/cli/success \ 13 | && echo "test failed: success pipeline returned with non-zero exit code $?" && exit 1 14 | 15 | dpp run --concurrency 4 \ 16 | ./tests/cli/raise-exception,./tests/env/dummy/pipeline-test-data%,./tests/cli/failure-no-errors \ 17 | && echo "test failed: concurrent run with failures returned successful exit code" && exit 1 18 | 19 | ! dpp run --concurrency 2 \ 20 | ./tests/cli/success,./tests/cli/verbose-logs-with-sleep,./tests/env/dummy/pipeline-test-data% \ 21 | && echo "test failed: concurrent run without failures returned non-zero exit code $?" && exit 1 22 | 23 | echo "Great Success" 24 | exit 0 25 | -------------------------------------------------------------------------------- /tests/cli/test_cli_logs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | TEMPDIR=`mktemp -d` 4 | 5 | ! script -ec "dpp run --verbose ./tests/cli/verbose-logs-with-sleep" $TEMPDIR/verbose_log && echo failed to run with --verbose && exit 1 6 | cat -v $TEMPDIR/verbose_log | grep '\^\[\[[0-9][0-9]*A' && echo running with --verbose - found terminal escape sequences && exit 1 7 | 8 | ! script -ec "dpp run ./tests/cli/verbose-logs-with-sleep" $TEMPDIR/log && echo failed to run without verbose && exit 1 9 | ! cat -v $TEMPDIR/log | grep '\^\[\[[0-9][0-9]*A' && echo running without verbose - did not find terminal escape sequences && exit 1 10 | 11 | ! OUTPUT=`dpp run --verbose ./tests/cli/load-resource-progress-log 2>&1` && echo failed to run load-resource-progress && exit 1 12 | for i in 2 4 6 8; do 13 | ! echo $OUTPUT | grep -q "loaded $i rows" && echo failed to detect load resource log && exit 1 14 | done 15 | 16 | rm -rf "${TEMPDIR}" 17 | 18 | echo Great Success 19 | exit 0 20 | -------------------------------------------------------------------------------- /tests/cli/test_custom_formatters.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | pip install -e tests/cli 4 | pip install openpyxl 5 | 6 | OUTPUT_FILES="tests/cli/custom_formatters/my-spiffy-resource.xlsx 7 | tests/cli/custom_formatters/sample.csv 8 | tests/cli/custom_formatters/datapackage.json 9 | tests/cli/custom_formatters/datapackage.zip" 10 | 11 | rm -f $OUTPUT_FILES 12 | 13 | ! dpp run ./tests/cli/custom-formatters && echo failed to run custom formatters pipeline && exit 1 14 | 15 | ! ls -lah $OUTPUT_FILES && echo missing custom formatters output files && exit 1 16 | 17 | validate_lannisters() { 18 | NUM_LANNISTERS=$(python - </dev/null 29 | ! validate_lannisters && exit 1 30 | popd >/dev/null 31 | 32 | DATAPACKAGE_ZIP=`pwd`/tests/cli/custom_formatters/datapackage.zip 33 | TEMP_DIR=`mktemp -d` 34 | pushd $TEMP_DIR >/dev/null 35 | unzip "${DATAPACKAGE_ZIP}" 36 | ! 
validate_lannisters && exit 1 37 | popd >/dev/null 38 | 39 | rm -rf $TEMP_DIR 40 | rm -f $OUTPUT_FILES 41 | 42 | echo Great Success 43 | exit 0 44 | -------------------------------------------------------------------------------- /tests/cli/test_exclude_dirnames.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ! dpp | grep ./tests/cli && echo missing tests/cli pipelines && exit 1 4 | ! dpp | grep ./samples/worldbank && echo missing samples pipelines && exit 1 5 | ! dpp | grep ./tests/env/ && echo missing tests/env pipelines && exit 1 6 | ! dpp | grep ./tests/docker/ && echo missing tests/docker pipelines && exit 1 7 | 8 | echo "env 9 | /samples 10 | /tests/cli" > .dpp_spec_ignore 11 | 12 | dpp | grep ./tests/cli && echo tests/cli pipelines not excluded && exit 1 13 | dpp | grep ./samples/worldbank && echo samples pipelines not excluded && exit 1 14 | dpp | grep ./tests/env/ && echo tests/env pipelines not excluded && exit 1 15 | ! dpp | grep ./tests/docker/ && echo missing tests/docker pipelines && exit 1 16 | 17 | rm .dpp_spec_ignore 18 | 19 | echo Great Success 20 | exit 0 21 | -------------------------------------------------------------------------------- /tests/cli/test_flow.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow, dump_to_path, PackageWrapper, load, update_package 2 | 3 | 4 | def hello_dataflows(package: PackageWrapper): 5 | print('hello dataflows') 6 | yield package.pkg 7 | yield from package 8 | 9 | 10 | def flow(parameters, datapackage, resources, stats): 11 | stats['foo_values'] = 0 12 | 13 | def add_foo_field(package: PackageWrapper): 14 | package.pkg.descriptor['resources'][0]['schema']['fields'] += [ 15 | {'name': parameters['attr'], 'type': 'string'}] 16 | yield package.pkg 17 | yield from package 18 | 19 | def add_foo_value(row): 20 | row[parameters['attr']] = 'foo' 21 | stats['foo_values'] += 1 22 | 23 | return Flow(update_package(name='_'), 24 | hello_dataflows, 25 | add_foo_field, 26 | add_foo_value) 27 | -------------------------------------------------------------------------------- /tests/cli/test_flow.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cd tests/cli 4 | 5 | rm -rf test_flow_data 6 | 7 | TEMPFILE=`mktemp` 8 | 9 | set -o pipefail 10 | ! dpp run --verbose ./dataflows >/dev/stdout 2>&1 | tee $TEMPFILE && echo failed to run dataflows pipeline && exit 1 11 | set +o pipefail 12 | ! cat "${TEMPFILE}" | grep "hello dataflows" && echo dataflows output is missing && exit 1 13 | ! cat "${TEMPFILE}" | grep "'foo_values': 9" && echo dataflows output is missing stats && exit 1 14 | rm $TEMPFILE 15 | 16 | ! 
diff test_flow_data/sample.csv expected_flow_data.csv && echo unexpected output data && exit 1 17 | 18 | echo Great Success 19 | exit 0 20 | -------------------------------------------------------------------------------- /tests/data/datapackage.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "my-spiffy-datapackage", 3 | "my-prop": "the-props-value", 4 | "resources": [ 5 | { 6 | "name": "my-spiffy-resource", 7 | "path": "sample.csv", 8 | "schema": { 9 | "fields": [ 10 | {"name": "first_name", "type": "string"}, 11 | {"name": "last_name", "type": "string"}, 12 | {"name": "house", "type": "string"}, 13 | {"name": "age", "type": "integer"} 14 | ], 15 | "primaryKey": [ 16 | "first_name", "last_name" 17 | ] 18 | } 19 | } 20 | ] 21 | } 22 | -------------------------------------------------------------------------------- /tests/data/datapackage2.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "my-spiffy-datapackage", 3 | "resources": [ 4 | { 5 | "name": "my-spiffy-resource", 6 | "path": "sample.csv", 7 | "schema": { 8 | "fields": [ 9 | {"name": "first_name", "type": "string"}, 10 | {"name": "last_name", "type": "string"}, 11 | {"name": "house", "type": "string"}, 12 | {"name": "age", "type": "integer"} 13 | ] 14 | } 15 | }, 16 | { 17 | "name": "the-spiffy-resource", 18 | "path": "sample.csv", 19 | "schema": { 20 | "fields": [ 21 | {"name": "first_name", "type": "string"}, 22 | {"name": "last_name", "type": "string"}, 23 | {"name": "house", "type": "string"}, 24 | {"name": "age", "type": "integer"} 25 | ] 26 | } 27 | }, 28 | { 29 | "name": "the-other-spiffy-resource", 30 | "path": "sample2.csv", 31 | "schema": { 32 | "fields": [ 33 | {"name": "first_name", "type": "string"}, 34 | {"name": "last_name", "type": "string"}, 35 | {"name": "house", "type": "string"}, 36 | {"name": "age", "type": "integer"} 37 | ] 38 | } 39 | } 40 | ] 41 | } 42 | -------------------------------------------------------------------------------- /tests/data/datapackage3.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "my-spiffy-datapackage", 3 | "my-prop": "the-props-value", 4 | "resources": [ 5 | { 6 | "name": "my-spiffy-resource", 7 | "path": "sample.dups.csv", 8 | "schema": { 9 | "fields": [ 10 | {"name": "first_name", "type": "string"}, 11 | {"name": "last_name", "type": "string"}, 12 | {"name": "house", "type": "string"}, 13 | {"name": "age", "type": "integer"} 14 | ], 15 | "primaryKey": [ 16 | "first_name", "last_name" 17 | ] 18 | } 19 | } 20 | ] 21 | } 22 | -------------------------------------------------------------------------------- /tests/data/sample.csv: -------------------------------------------------------------------------------- 1 | first_name,last_name,house,age 2 | Tyrion,Lannister,Lannister,27 3 | Jaime,Lannister,Lannister,34 4 | Cersei,Lannister,Lannister,34 5 | Jon,Snow,Stark,17 6 | Sansa,Stark,Stark,14 7 | Arya,Stark,Stark,11 8 | Bran,Stark,Stark,10 9 | Rickon,Stark,Stark,5 10 | Daenerys,Targaryen,Targaryen,16 11 | -------------------------------------------------------------------------------- /tests/data/sample.dups.csv: -------------------------------------------------------------------------------- 1 | first_name,last_name,house,age 2 | Tyrion,Lannister,Lannister,27 3 | Jaime,Lannister,Lannister,34 4 | Cersei,Lannister,Lannister,34 5 | Jon,Snow,Stark,17 6 | Sansa,Stark,Stark,14 7 | Sansa,Stark,Stark,14 8 | Arya,Stark,Stark,11 9 
| Bran,Stark,Stark,10 10 | Rickon,Stark,Stark,5 11 | Daenerys,Targaryen,Targaryen,16 12 | -------------------------------------------------------------------------------- /tests/data/sample.txt: -------------------------------------------------------------------------------- 1 | <<< tabulator has html decection, keeping that causes the failure which we want to test 2 | This is a plain text file - not a CSV file! 3 | testing 4 | one two three -------------------------------------------------------------------------------- /tests/data/sample.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/datapackage-pipelines/06c8ccbd44be420233b73563cfb5bd3eb37de7cc/tests/data/sample.zip -------------------------------------------------------------------------------- /tests/data/sample2.csv: -------------------------------------------------------------------------------- 1 | first_name,last_name,house,age 2 | Tyrion,Lannister,Lannister,27 3 | Jaime,Lannister,Lannister,34 4 | Cersei,Lannister,Lannister,34 5 | -------------------------------------------------------------------------------- /tests/docker/.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | -------------------------------------------------------------------------------- /tests/docker/lib/dpp_docker_test.py: -------------------------------------------------------------------------------- 1 | DPP_DOCKER_TEST=True 2 | -------------------------------------------------------------------------------- /tests/docker/lib/setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | setup( 4 | name="dpp_docker_test", 5 | py_modules=['dpp_docker_test'] 6 | ) 7 | -------------------------------------------------------------------------------- /tests/docker/pipeline-spec.yaml: -------------------------------------------------------------------------------- 1 | test: 2 | pipeline: 3 | - run: test 4 | - run: dump_to_path 5 | parameters: 6 | out-path: data 7 | 8 | test-sleep: 9 | pipeline: 10 | - run: sleep 11 | code: | 12 | import os 13 | os.system('sleep 86400') 14 | 15 | test-package: 16 | pipeline: 17 | - run: test 18 | parameters: 19 | test-package: true 20 | - run: dump_to_path 21 | parameters: 22 | out-path: data/test_package 23 | -------------------------------------------------------------------------------- /tests/docker/test.py: -------------------------------------------------------------------------------- 1 | from datapackage_pipelines.wrapper import ingest, spew 2 | from datapackage_pipelines.utilities.resources import PROP_STREAMING 3 | import datetime 4 | 5 | parameters, datapackage, resources, stats = tuple(ingest()) + ({},) 6 | 7 | 8 | if parameters.get('test-package'): 9 | from dpp_docker_test import DPP_DOCKER_TEST 10 | assert DPP_DOCKER_TEST 11 | 12 | 13 | datapackage['resources'] = [{'name': 'test', 'path': 'test.csv', 14 | PROP_STREAMING: True, 15 | 'schema': {'fields': [{'name': 'a', 'type': 'string'}]}}] 16 | 17 | 18 | spew(datapackage, [({'a': 'foo'}, {'a': 'bar'})], {'last_run_time': str(datetime.datetime.now())}) 19 | -------------------------------------------------------------------------------- /tests/docker/test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | sudo rm -rf tests/docker/data 4 | 5 | ! 
docker run -v `pwd`/tests/docker:/pipelines:rw frictionlessdata/datapackage-pipelines run ./test \ 6 | && echo failed to run docker && exit 1 7 | 8 | ! ls -lah tests/docker/data/datapackage.json tests/docker/data/test.csv \ 9 | && echo failed to find output files from docker run && exit 1 10 | 11 | sudo rm -rf tests/docker/data 12 | 13 | ! docker run -d --name dpp -v `pwd`/tests/docker:/pipelines:rw frictionlessdata/datapackage-pipelines server-reload \ 14 | && echo failed to start daemonized docker container && exit 1 15 | 16 | for i in 1 2 3 4 5 6 7 8 9; do 17 | sleep 10 18 | ls -lah tests/docker/data/test.csv 2>/dev/null && break 19 | echo . 20 | done 21 | 22 | ! ls -lah tests/docker/data/datapackage.json tests/docker/data/test.csv \ 23 | && docker logs dpp && echo Failed to detect output data from daemonized docker container && exit 1 24 | 25 | ls -lah tests/docker/data/test_package 2>/dev/null \ 26 | && docker logs dpp && echo detected test_package data && exit 1 27 | 28 | ! docker exec dpp sh -c "cd lib; python3 setup.py install" \ 29 | && echo failed to install docker test package && exit 1 30 | 31 | ! docker kill -s HUP dpp \ 32 | && docker logs && echo failed to send HUP to docker && exit 1 33 | 34 | for i in 1 2 3 4 5 6 7 8 9; do 35 | sleep 10 36 | ls -lah tests/docker/data/test_package/test.csv 2>/dev/null && break 37 | echo . 38 | done 39 | 40 | ! ls -lah tests/docker/data/test_package/datapackage.json tests/docker/data/test_package/test.csv \ 41 | && docker logs dpp && echo Failed to detect test package output data from daemonized docker container && exit 1 42 | 43 | docker logs dpp 44 | 45 | docker rm --force dpp 46 | 47 | sudo rm -rf tests/docker 48 | 49 | echo Great Success 50 | exit 0 51 | -------------------------------------------------------------------------------- /tests/env/common/pipeline-common.py: -------------------------------------------------------------------------------- 1 | from datapackage_pipelines.wrapper import ingest, spew 2 | 3 | params, datapackage, res_iter = ingest() 4 | for res in datapackage['resources']: 5 | res['profile'] = 'tabular-data-resource' 6 | spew(datapackage, res_iter) 7 | -------------------------------------------------------------------------------- /tests/env/dummy/big-outputs.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import itertools 3 | import os 4 | 5 | from datapackage_pipelines.wrapper import ingest, spew 6 | from datapackage_pipelines.utilities.resources import PROP_STREAMING 7 | 8 | params, dp, res_iter = ingest() 9 | 10 | big_string = 'z'*64*1024 11 | 12 | logging.info('Look at me %s', big_string) 13 | 14 | dp['name'] = 'a' 15 | dp['resources'].append({ 16 | 'name': 'aa%f' % os.getpid(), 17 | 'path': 'data/bla.csv', 18 | 'schema': { 19 | 'fields': [ 20 | {'name': 'a', 'type': 'string'} 21 | ] 22 | }, 23 | 'very-large-prop': '*' * 100 * 1024, 24 | PROP_STREAMING: True 25 | }) 26 | 27 | res = iter([{'a': big_string}]) 28 | 29 | spew(dp, itertools.chain(res_iter, [res])) 30 | -------------------------------------------------------------------------------- /tests/env/dummy/pipeline-test-supplier-titleize.py: -------------------------------------------------------------------------------- 1 | from datapackage_pipelines.wrapper import ingest, spew 2 | 3 | params, datapackage, res_iter = ingest() 4 | 5 | key = params['key'] 6 | 7 | 8 | def process_resources(_res_iter): 9 | for res in _res_iter: 10 | def process_res(_res): 11 | for line in _res: 12 | if key in line: 
13 | line[key] = line[key].title() 14 | yield line 15 | yield process_res(res) 16 | 17 | spew(datapackage, process_resources(res_iter)) 18 | -------------------------------------------------------------------------------- /tests/env/dummy/types.csv: -------------------------------------------------------------------------------- 1 | string,number,integer,boolean,object,array,date,time,datetime,year,yearmonth,duration,geopoint,geojson 2 | "My name is Josef",1.23,10,true,{},"[1,2,3]",2015-01-31,03:00:10,2015-01-31T03:00:10Z,2015,2015-12,P3Y6M4DT12H30M5S,"90, 45","{""type"": ""Feature"",""geometry"": {""type"": ""Point"",""coordinates"": [125.6, 10.1]},""properties"": {""name"": ""Dinagat Islands""}}" 3 | "",NaN,0,True,{},[],2015-02-28,13:34:39,2015-02-28T13:34:39Z,2525,1982-01,"P3,5Y","180, -90", 4 | "",inf,0,FALSE,"{""a"":1}","[""a"",""b""]",1970-01-01,23:59:59,1970-01-01T23:59:59Z,9999,9999-09,P300YT5.2S,"12.2, 12.3", 5 | "",-inF,0,0,{},[],1900-01-01,00:00:00,1900-01-01T00:00:00Z,0000,0000-01,PT0S,"27.0, -90.0", 6 | "",1.2e2,0,1,{},[],1405-12-30,12:34:56,1405-12-30T12:34:56Z,1786,1786-09,P999Y9999M9999DT9999H9999M9999S,"0,0", -------------------------------------------------------------------------------- /tests/env/extract-year.py: -------------------------------------------------------------------------------- 1 | from datapackage_pipelines.wrapper import ingest, spew 2 | 3 | params, datapackage, res_iter = ingest() 4 | 5 | from_key = params['from-key'] 6 | to_key = params['to-key'] 7 | 8 | 9 | def process_resources(_res_iter): 10 | for res in _res_iter: 11 | def process_res(_res): 12 | for line in _res: 13 | if from_key in line: 14 | line[to_key] = line[from_key].year 15 | yield line 16 | yield process_res(res) 17 | 18 | 19 | for resource in datapackage['resources']: 20 | if len(list(filter(lambda field: field['name'] == from_key, resource.get('schema',{}).get('fields',[])))) > 0: 21 | resource['schema']['fields'].append({ 22 | 'name': to_key, 23 | 'osType': 'date:fiscal-year', 24 | 'type': 'integer' 25 | }) 26 | 27 | spew(datapackage, process_resources(res_iter)) 28 | -------------------------------------------------------------------------------- /tests/serve/html_output.py: -------------------------------------------------------------------------------- 1 | from dataflows import Flow 2 | import logging 3 | 4 | 5 | class MyClass(): 6 | pass 7 | 8 | 9 | def flow(*_): 10 | logging.info('my_object=' + str(MyClass())) 11 | return Flow() 12 | -------------------------------------------------------------------------------- /tests/serve/pipeline-spec.yaml: -------------------------------------------------------------------------------- 1 | html-output: 2 | pipeline: 3 | - flow: html_output 4 | -------------------------------------------------------------------------------- /tests/sitecustomize.py: -------------------------------------------------------------------------------- 1 | import os 2 | import coverage 3 | 4 | os.environ['COVERAGE_PROCESS_START']= os.path.join(os.environ["PWD"], 'tox.ini') 5 | coverage.process_startup() 6 | 7 | -------------------------------------------------------------------------------- /tests/stdlib/README.md: -------------------------------------------------------------------------------- 1 | # tests for the pipelines standard library 2 | 3 | ## fixtures 4 | 5 | Each file in the fixtures sub-directory corresponds to the parameters of a single test to run.
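For illustration, a minimal made-up fixture for the `sort` processor could look like the sketch below (the resource and field names are invented for this example; the section order and the `--` separators it relies on are described right after it):

```
sort
--
{
    "resources": ["my-resource"],
    "sort-by": "{id}"
}
--
{
    "name": "test",
    "resources": [
        {
            "name": "my-resource",
            "dpp:streaming": true,
            "path": "my-resource.csv",
            "schema": {"fields": [{"name": "id", "type": "integer"}]}
        }
    ]
}
--
{"id": 2}
{"id": 1}
--
{
    "name": "test",
    "profile": "data-package",
    "resources": [
        {
            "name": "my-resource",
            "dpp:streaming": true,
            "path": "my-resource.csv",
            "profile": "data-resource",
            "schema": {"fields": [{"name": "id", "type": "integer"}]}
        }
    ]
}
--
{"id": 1}
{"id": 2}

{}
```

As in the existing fixtures, the expected output rows are followed by a blank line and the expected stats object (here just `{}`).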
6 | 7 | The parameters are laid out in the file, separated by `\n--\n` 8 | 9 | This is the order of parameters: 10 | 11 | * `processor` - name of the processor to run 12 | * `params` - parameters 13 | * `dp_in` - input datapackage 14 | * `data_in` - input data 15 | * `dp_out` - expected output datapackage 16 | * `data_out` - expected output data 17 | 18 | ## setting up the test environment and running a specific test 19 | 20 | * `pip install -e .[develop]` 21 | * `py.test -svk name-of-the-fixture` 22 | -------------------------------------------------------------------------------- /tests/stdlib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/frictionlessdata/datapackage-pipelines/06c8ccbd44be420233b73563cfb5bd3eb37de7cc/tests/stdlib/__init__.py -------------------------------------------------------------------------------- /tests/stdlib/fixtures/add_resource_existent_env: -------------------------------------------------------------------------------- 1 | add_resource 2 | -- 3 | { 4 | "name": "my-env-resource", 5 | "url": "env://EXISTENT_ENV" 6 | } 7 | -- 8 | { 9 | "name": "test", 10 | "resources": [] 11 | } 12 | -- 13 | -- 14 | { 15 | "name": "test", 16 | "resources": [ 17 | { 18 | "name": "my-env-resource", 19 | "dpp:streamedFrom": "tests/data/sample.csv", 20 | "path": "_" 21 | } 22 | ] 23 | } 24 | -- 25 | {} 26 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/dump_to_sql_update_mode__insert: -------------------------------------------------------------------------------- 1 | dump_to_sql 2 | -- 3 | { 4 | "tables": { 5 | "test": { 6 | "resource-name": "my-spiffy-resource", 7 | "mode": "update" 8 | } 9 | } 10 | } 11 | -- 12 | { 13 | "name": "test", 14 | "resources": [ 15 | { 16 | "name": "my-spiffy-resource", 17 | "dpp:streaming": true, 18 | "path": "data/my-data.csv", 19 | "schema": { 20 | "fields": [ 21 | {"name": "id", "type": "integer"}, 22 | {"name": "mystring", "type": "string"}, 23 | {"name": "mynumber", "type": "number"}, 24 | {"name": "mydate", "type": "date"} 25 | ], 26 | "primaryKey": ["id"] 27 | } 28 | } 29 | ] 30 | } 31 | -- 32 | {"id": 1, "mystring":"a", "mynumber": 2.0, "mydate": {"type{date}": "2016-12-31"}} 33 | -- 34 | { 35 | "name": "test", 36 | "profile": "data-package", 37 | "resources": [ 38 | { 39 | "name": "my-spiffy-resource", 40 | "dpp:streaming": true, 41 | "path": "data/my-data.csv", 42 | "profile": "data-resource", 43 | "schema": { 44 | "fields": [ 45 | {"name": "id", "type": "integer"}, 46 | {"name": "mystring", "type": "string"}, 47 | {"name": "mynumber", "type": "number"}, 48 | {"name": "mydate", "type": "date"} 49 | ], 50 | "primaryKey": ["id"] 51 | } 52 | } 53 | ] 54 | } 55 | -- 56 | {"id": 1, "mystring":"a", "mynumber": {"type{decimal}": "2.0"}, "mydate": {"type{date}": "2016-12-31"}} 57 | 58 | {"bytes": null, "count_of_rows": 1, "dataset_name": "test", "hash": "5dad5b7c7fb3fecb7478b4f34fabbd23"} 59 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/dump_to_sql_update_mode__update: -------------------------------------------------------------------------------- 1 | dump_to_sql 2 | -- 3 | { 4 | "tables": { 5 | "test": { 6 | "resource-name": "my-spiffy-resource", 7 | "mode": "update" 8 | } 9 | } 10 | } 11 | -- 12 | { 13 | "name": "test", 14 | "resources": [ 15 | { 16 | "name": "my-spiffy-resource", 17 | "dpp:streaming": true, 18 | "path": "data/my-data.csv", 19 | 
"schema": { 20 | "fields": [ 21 | {"name": "id", "type": "integer"}, 22 | {"name": "mystring", "type": "string"}, 23 | {"name": "mynumber", "type": "number"}, 24 | {"name": "mydate", "type": "date"} 25 | ], 26 | "primaryKey": ["id"] 27 | } 28 | } 29 | ] 30 | } 31 | -- 32 | {"id": 1, "mystring":"a", "mynumber": 2.0, "mydate": {"type{date}": "2016-12-31"}} 33 | -- 34 | { 35 | "name": "test", 36 | "profile": "data-package", 37 | "resources": [ 38 | { 39 | "name": "my-spiffy-resource", 40 | "dpp:streaming": true, 41 | "path": "data/my-data.csv", 42 | "profile": "data-resource", 43 | "schema": { 44 | "fields": [ 45 | {"name": "id", "type": "integer"}, 46 | {"name": "mystring", "type": "string"}, 47 | {"name": "mynumber", "type": "number"}, 48 | {"name": "mydate", "type": "date"} 49 | ], 50 | "primaryKey": ["id"] 51 | } 52 | } 53 | ] 54 | } 55 | -- 56 | {"id": 1, "mystring":"a", "mynumber": {"type{decimal}": "2.0"}, "mydate": {"type{date}": "2016-12-31"}} 57 | 58 | {"bytes": null, "count_of_rows": 1, "dataset_name": "test", "hash": "5dad5b7c7fb3fecb7478b4f34fabbd23"} 59 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/dump_to_sql_with_updated_data: -------------------------------------------------------------------------------- 1 | dump_to_sql 2 | -- 3 | { 4 | "tables": { 5 | "test": { 6 | "resource-name": "my-spiffy-resource" 7 | } 8 | }, 9 | "updated_column": "updated", 10 | "updated_id_column": "updated_id" 11 | } 12 | -- 13 | { 14 | "name": "test", 15 | "resources": [ 16 | { 17 | "name": "my-spiffy-resource", 18 | "dpp:streaming": true, 19 | "path": "data/my-data.csv", 20 | "schema": { 21 | "fields": [ 22 | {"name": "mystring", "type": "string"}, 23 | {"name": "myinteger", "type": "integer"}, 24 | {"name": "mynumber", "type": "number"}, 25 | {"name": "mydate", "type": "date"} 26 | ] 27 | } 28 | } 29 | ] 30 | } 31 | -- 32 | {"mystring":"a", "mynumber": 2.0, "mydate": {"type{date}": "2016-12-31"}} 33 | -- 34 | { 35 | "name": "test", 36 | "profile": "data-package", 37 | "resources": [ 38 | { 39 | "name": "my-spiffy-resource", 40 | "dpp:streaming": true, 41 | "path": "data/my-data.csv", 42 | "profile": "data-resource", 43 | "schema": { 44 | "fields": [ 45 | {"name": "mystring", "type": "string"}, 46 | {"name": "myinteger", "type": "integer"}, 47 | {"name": "mynumber", "type": "number"}, 48 | {"name": "mydate", "type": "date"} 49 | ] 50 | } 51 | } 52 | ] 53 | } 54 | -- 55 | {"mystring":"a", "mynumber": {"type{decimal}": "2.0"}, "mydate": {"type{date}": "2016-12-31"}, "myinteger": null, "updated": false, "updated_id": null} 56 | 57 | {"bytes": null, "count_of_rows": 1, "dataset_name": "test", "hash": "c1c867cd9711aedd5c94a16ce4590ece"} 58 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/load_existent_env: -------------------------------------------------------------------------------- 1 | load 2 | -- 3 | { 4 | "from": "env://EXISTENT_ENV", 5 | "name": "my-env-resource", 6 | "validate": true 7 | } 8 | -- 9 | { 10 | "name": "test", 11 | "resources": [] 12 | } 13 | -- 14 | -- 15 | { 16 | "name": "test", 17 | "profile": "data-package", 18 | "resources": [ 19 | { 20 | "dpp:streamedFrom": "env://EXISTENT_ENV", 21 | "dpp:streaming": true, 22 | "format": "csv", 23 | "name": "my-env-resource", 24 | "path": "my-env-resource.csv", 25 | "profile": "tabular-data-resource", 26 | "schema": { 27 | "fields": [ 28 | {"format": "default", "name": "first_name", "type": "string"}, 29 | {"format": 
"default", "name": "last_name", "type": "string"}, 30 | {"format": "default", "name": "house", "type": "string"}, 31 | {"format": "default", "name": "age", "type": "integer"} 32 | ], 33 | "missingValues": [""] 34 | } 35 | } 36 | ] 37 | } 38 | -- 39 | {"age": 27, "first_name": "Tyrion", "house": "Lannister", "last_name": "Lannister"} 40 | {"age": 34, "first_name": "Jaime", "house": "Lannister", "last_name": "Lannister"} 41 | {"age": 34, "first_name": "Cersei", "house": "Lannister", "last_name": "Lannister"} 42 | {"age": 17, "first_name": "Jon", "house": "Stark", "last_name": "Snow"} 43 | {"age": 14, "first_name": "Sansa", "house": "Stark", "last_name": "Stark"} 44 | {"age": 11, "first_name": "Arya", "house": "Stark", "last_name": "Stark"} 45 | {"age": 10, "first_name": "Bran", "house": "Stark", "last_name": "Stark"} 46 | {"age": 5, "first_name": "Rickon", "house": "Stark", "last_name": "Stark"} 47 | {"age": 16, "first_name": "Daenerys", "house": "Targaryen", "last_name": "Targaryen"} 48 | 49 | {} 50 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/obj_fix_dump_to_sql: -------------------------------------------------------------------------------- 1 | dump_to_sql 2 | -- 3 | { 4 | "comment": [ 5 | "this test involves data types which work differently in sqlite and postgresql", 6 | "so, forcing sqlite engine here" 7 | ], 8 | "engine": "sqlite://", 9 | "tables": { 10 | "test": { 11 | "resource-name": "my-spiffy-resource" 12 | } 13 | } 14 | } 15 | -- 16 | { 17 | "name": "test", 18 | "resources": [ 19 | { 20 | "name": "my-spiffy-resource", 21 | "dpp:streaming": true, 22 | "path": "data/my-data.csv", 23 | "schema": { 24 | "fields": [ 25 | {"name": "myarray", "type": "array"}, 26 | {"name": "myobject", "type": "object"}, 27 | {"name": "mynumber", "type": "number"}, 28 | {"name": "mydate", "type": "date"} 29 | ] 30 | } 31 | } 32 | ] 33 | } 34 | -- 35 | {"myarray":[{"type{date}": "2016-12-31"}, {"type{datetime}": "2016-11-10 12:34:56"}], "myobject": {"n1": {"n2": {"type{decimal}": "78.99"}}}, "mynumber": 2.0, "mydate": {"type{date}": "2016-12-31"}} 36 | -- 37 | { 38 | "name": "test", 39 | "profile": "data-package", 40 | "resources": [ 41 | { 42 | "name": "my-spiffy-resource", 43 | "dpp:streaming": true, 44 | "path": "data/my-data.csv", 45 | "profile": "data-resource", 46 | "schema": { 47 | "fields": [ 48 | {"name": "myarray", "type": "array"}, 49 | {"name": "myobject", "type": "object"}, 50 | {"name": "mynumber", "type": "number"}, 51 | {"name": "mydate", "type": "date"} 52 | ] 53 | } 54 | } 55 | ] 56 | } 57 | -- 58 | {"myarray": "[\"2016-12-31\", \"2016-11-10T12:34:56\"]", "mydate": {"type{date}": "2016-12-31"}, "mynumber": {"type{decimal}": "2.0"}, "myobject": "{\"n1\": {\"n2\": 78.99}}"} 59 | 60 | {"bytes": null, "count_of_rows": 1, "dataset_name": "test", "hash": "bed26992ae39b43e8b58c0190e8a52e5"} 61 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/reverse_sort: -------------------------------------------------------------------------------- 1 | sort 2 | -- 3 | { 4 | "resources": ["concat-a1", "concat-a2"], 5 | "sort-by": "{a3} {a2} {a1}", 6 | "reverse": true 7 | } 8 | -- 9 | { 10 | "name": "test", 11 | "resources": [ 12 | { 13 | "name": "concat-a1", 14 | "dpp:streaming": true, 15 | "path": "concat-a1.csv", 16 | "schema": { "fields": [ 17 | {"name": "a1", "type": "string"}, 18 | {"name": "a2", "type": "string"}, 19 | {"name": "a3", "type": "string"} 20 | ]} 21 | }, 22 | { 23 | 
"name": "concat-a2", 24 | "dpp:streaming": true, 25 | "path": "concat-a2.csv", 26 | "schema": { "fields": [ 27 | {"name": "a1", "type": "string"}, 28 | {"name": "a2", "type": "string"}, 29 | {"name": "a3", "type": "string"} 30 | ]} 31 | }, 32 | { 33 | "name": "concat-c", 34 | "dpp:streaming": true, 35 | "path": "concat-c.csv", 36 | "schema": { "fields": [ 37 | {"name": "c1", "type": "string"}, 38 | {"name": "c2", "type": "string"}, 39 | {"name": "c3", "type": "string"} 40 | ]} 41 | } 42 | ] 43 | } 44 | -- 45 | {"a1":"a1","a2":"a1","a3":"a2"} 46 | {"a1":"a2","a2":"a1","a3":"a1"} 47 | {"a1":"a3","a2":"a2","a3":"a2"} 48 | {"a1":"a4","a2":"a2","a3":"a1"} 49 | 50 | {"a1":"a1","a2":"a3","a3":"a2"} 51 | {"a1":"a2","a2":"a3","a3":"a1"} 52 | {"a1":"a3","a2":"a4","a3":"a2"} 53 | {"a1":"a4","a2":"a4","a3":"a1"} 54 | 55 | {"c1":"c13","c2":"c23","c3":"c33"} 56 | {"c1":"c12","c2":"c22","c3":"c32"} 57 | {"c1":"c11","c2":"c21","c3":"c31"} 58 | -- 59 | { 60 | "name": "test", 61 | "profile": "data-package", 62 | "resources": [ 63 | { 64 | "name": "concat-a1", 65 | "dpp:streaming": true, 66 | "path": "concat-a1.csv", 67 | "profile": "data-resource", 68 | "schema": { "fields": [ 69 | {"name": "a1", "type": "string"}, 70 | {"name": "a2", "type": "string"}, 71 | {"name": "a3", "type": "string"} 72 | ]} 73 | }, 74 | { 75 | "name": "concat-a2", 76 | "dpp:streaming": true, 77 | "path": "concat-a2.csv", 78 | "profile": "data-resource", 79 | "schema": { "fields": [ 80 | {"name": "a1", "type": "string"}, 81 | {"name": "a2", "type": "string"}, 82 | {"name": "a3", "type": "string"} 83 | ]} 84 | }, 85 | { 86 | "name": "concat-c", 87 | "dpp:streaming": true, 88 | "path": "concat-c.csv", 89 | "profile": "data-resource", 90 | "schema": { "fields": [ 91 | {"name": "c1", "type": "string"}, 92 | {"name": "c2", "type": "string"}, 93 | {"name": "c3", "type": "string"} 94 | ]} 95 | } 96 | ] 97 | } 98 | -- 99 | {"a1":"a3","a2":"a2","a3":"a2"} 100 | {"a1":"a1","a2":"a1","a3":"a2"} 101 | {"a1":"a4","a2":"a2","a3":"a1"} 102 | {"a1":"a2","a2":"a1","a3":"a1"} 103 | 104 | {"a1":"a3","a2":"a4","a3":"a2"} 105 | {"a1":"a1","a2":"a3","a3":"a2"} 106 | {"a1":"a4","a2":"a4","a3":"a1"} 107 | {"a1":"a2","a2":"a3","a3":"a1"} 108 | 109 | {"c1":"c13","c2":"c23","c3":"c33"} 110 | {"c1":"c12","c2":"c22","c3":"c32"} 111 | {"c1":"c11","c2":"c21","c3":"c31"} 112 | 113 | {} 114 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_add_resource: -------------------------------------------------------------------------------- 1 | add_resource 2 | -- 3 | { 4 | "name": "my-spiffy-resource", 5 | "url": "http://not.existent.com" 6 | } 7 | -- 8 | { 9 | "name": "test", 10 | "resources": [] 11 | } 12 | -- 13 | -- 14 | { 15 | "name": "test", 16 | "resources": [ 17 | { 18 | "name": "my-spiffy-resource", 19 | "dpp:streamedFrom": "http://not.existent.com", 20 | "path": "_" 21 | } 22 | ] 23 | } 24 | -- 25 | {} 26 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_concat: -------------------------------------------------------------------------------- 1 | concatenate 2 | -- 3 | { 4 | "sources": ["concat-a", "concat-b", "concat-c"], 5 | "target": {"name": "target"}, 6 | "fields": { 7 | "t1": ["a1", "b1", "c1"], 8 | "t2": ["a2", "b2", "c2"], 9 | "c3": ["a3", "b3"], 10 | "d4": null, 11 | "e5": [] 12 | } 13 | } 14 | -- 15 | { 16 | "name": "test", 17 | "resources": [ 18 | { 19 | "name": "concat-a", 20 | "dpp:streaming": true, 21 | "path": "concat-a.csv", 
22 | "schema": { "fields": [ 23 | {"name": "a1", "type": "string"}, 24 | {"name": "a2", "type": "string"}, 25 | {"name": "a3", "type": "string"} 26 | ]} 27 | }, 28 | { 29 | "name": "concat-b", 30 | "dpp:streaming": true, 31 | "path": "concat-b.csv", 32 | "schema": { "fields": [ 33 | {"name": "b1", "type": "string"}, 34 | {"name": "b2", "type": "string"}, 35 | {"name": "b3", "type": "string"} 36 | ]} 37 | }, 38 | { 39 | "name": "concat-c", 40 | "dpp:streaming": true, 41 | "path": "concat-c.csv", 42 | "schema": { "fields": [ 43 | {"name": "c1", "type": "string"}, 44 | {"name": "c2", "type": "string"}, 45 | {"name": "c3", "type": "string"} 46 | ]} 47 | } 48 | ] 49 | } 50 | -- 51 | {"a1":"a11","a2":"a21","a3":"a31"} 52 | {"a1":"a12","a2":"a22","a3":"a32"} 53 | {"a1":"a13","a2":"a23","a3":"a33"} 54 | 55 | {"b1":"b11","b2":"b21","b3":"b31"} 56 | {"b1":"b12","b2":"b22","b3":"b32"} 57 | {"b1":"b13","b2":"b23","b3":"b33"} 58 | 59 | {"c1":"c11","c2":"c21","c3":"c31"} 60 | {"c1":"c12","c2":"c22","c3":"c32"} 61 | {"c1":"c13","c2":"c23","c3":"c33"} 62 | -- 63 | { 64 | "name": "test", 65 | "profile": "data-package", 66 | "resources": [ 67 | { 68 | "name": "target", 69 | "dpp:streaming": true, 70 | "path": "data/target.csv", 71 | "mediatype": "text/csv", 72 | "profile": "tabular-data-resource", 73 | "schema": { "fields": [ 74 | {"name": "t1", "format": "default", "type": "string"}, 75 | {"name": "t2", "format": "default", "type": "string"}, 76 | {"name": "c3", "format": "default", "type": "string"}, 77 | {"name": "d4", "format": "default", "type": "string"}, 78 | {"name": "e5", "format": "default", "type": "string"} 79 | ], 80 | "missingValues": [""]} 81 | } 82 | ] 83 | } 84 | -- 85 | {"t1":"a11","t2":"a21","c3":"a31","d4":null,"e5":null} 86 | {"t1":"a12","t2":"a22","c3":"a32","d4":null,"e5":null} 87 | {"t1":"a13","t2":"a23","c3":"a33","d4":null,"e5":null} 88 | {"t1":"b11","t2":"b21","c3":"b31","d4":null,"e5":null} 89 | {"t1":"b12","t2":"b22","c3":"b32","d4":null,"e5":null} 90 | {"t1":"b13","t2":"b23","c3":"b33","d4":null,"e5":null} 91 | {"t1":"c11","t2":"c21","c3":"c31","d4":null,"e5":null} 92 | {"t1":"c12","t2":"c22","c3":"c32","d4":null,"e5":null} 93 | {"t1":"c13","t2":"c23","c3":"c33","d4":null,"e5":null} 94 | 95 | {} 96 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_deduplicate: -------------------------------------------------------------------------------- 1 | deduplicate 2 | -- 3 | { 4 | "resources": ["concat-a1", "concat-a2"] 5 | } 6 | -- 7 | { 8 | "name": "test", 9 | "resources": [ 10 | { 11 | "name": "concat-a1", 12 | "dpp:streaming": true, 13 | "path": "concat-a1.csv", 14 | "schema": { "fields": [ 15 | {"name": "a1", "type": "string"}, 16 | {"name": "a2", "type": "string"}, 17 | {"name": "a3", "type": "string"} 18 | ], "primaryKey": ["a1", "a2"]} 19 | }, 20 | { 21 | "name": "concat-a2", 22 | "dpp:streaming": true, 23 | "path": "concat-a2.csv", 24 | "schema": { "fields": [ 25 | {"name": "a1", "type": "string"}, 26 | {"name": "a2", "type": "string"}, 27 | {"name": "a3", "type": "string"} 28 | ]} 29 | }, 30 | { 31 | "name": "concat-c", 32 | "dpp:streaming": true, 33 | "path": "concat-c.csv", 34 | "schema": { "fields": [ 35 | {"name": "c1", "type": "string"}, 36 | {"name": "c2", "type": "string"}, 37 | {"name": "c3", "type": "string"} 38 | ]} 39 | } 40 | ] 41 | } 42 | -- 43 | {"a1":"a1","a2":"a1","a3":"a2"} 44 | {"a1":"a2","a2":"a1","a3":"a1"} 45 | {"a1":"a1","a2":"a1","a3":"a2"} 46 | {"a1":"a2","a2":"a1","a3":"a1"} 47 | 48 | 
{"a1":"a1","a2":"a3","a3":"a2"} 49 | {"a1":"a2","a2":"a3","a3":"a1"} 50 | {"a1":"a3","a2":"a4","a3":"a2"} 51 | {"a1":"a4","a2":"a4","a3":"a1"} 52 | 53 | {"c1":"c11","c2":"c21","c3":"c31"} 54 | {"c1":"c12","c2":"c22","c3":"c32"} 55 | {"c1":"c13","c2":"c23","c3":"c33"} 56 | -- 57 | { 58 | "name": "test", 59 | "profile": "data-package", 60 | "resources": [ 61 | { 62 | "name": "concat-a1", 63 | "dpp:streaming": true, 64 | "path": "concat-a1.csv", 65 | "profile": "data-resource", 66 | "schema": { "fields": [ 67 | {"name": "a1", "type": "string"}, 68 | {"name": "a2", "type": "string"}, 69 | {"name": "a3", "type": "string"} 70 | ], "primaryKey": ["a1", "a2"]} 71 | }, 72 | { 73 | "name": "concat-a2", 74 | "dpp:streaming": true, 75 | "path": "concat-a2.csv", 76 | "profile": "data-resource", 77 | "schema": { "fields": [ 78 | {"name": "a1", "type": "string"}, 79 | {"name": "a2", "type": "string"}, 80 | {"name": "a3", "type": "string"} 81 | ]} 82 | }, 83 | { 84 | "name": "concat-c", 85 | "dpp:streaming": true, 86 | "path": "concat-c.csv", 87 | "profile": "data-resource", 88 | "schema": { "fields": [ 89 | {"name": "c1", "type": "string"}, 90 | {"name": "c2", "type": "string"}, 91 | {"name": "c3", "type": "string"} 92 | ]} 93 | } 94 | ] 95 | } 96 | -- 97 | {"a1":"a1","a2":"a1","a3":"a2"} 98 | {"a1":"a2","a2":"a1","a3":"a1"} 99 | 100 | {"a1":"a1","a2":"a3","a3":"a2"} 101 | {"a1":"a2","a2":"a3","a3":"a1"} 102 | {"a1":"a3","a2":"a4","a3":"a2"} 103 | {"a1":"a4","a2":"a4","a3":"a1"} 104 | 105 | {"c1":"c11","c2":"c21","c3":"c31"} 106 | {"c1":"c12","c2":"c22","c3":"c32"} 107 | {"c1":"c13","c2":"c23","c3":"c33"} 108 | 109 | {} 110 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_delete_fields: -------------------------------------------------------------------------------- 1 | delete_fields 2 | -- 3 | { 4 | "fields": ["last_name", "age"], 5 | "resources": "got-characters" 6 | } 7 | -- 8 | { 9 | "name": "test", 10 | "resources": [ 11 | { 12 | "name": "got-characters", 13 | "dpp:streaming": true, 14 | "path": "characters.csv", 15 | "schema": { 16 | "fields": [ 17 | {"name": "first_name", "type": "string"}, 18 | {"name": "last_name", "type": "string"}, 19 | {"name": "house", "type": "string"}, 20 | {"name": "age", "type": "number", "units": "Westerosian Years"} 21 | ] 22 | } 23 | }, 24 | { 25 | "name": "got-houses", 26 | "dpp:streaming": true, 27 | "path": "houses.csv", 28 | "schema": { "fields": [ 29 | {"name": "house", "type": "string"} 30 | ]} 31 | } 32 | ] 33 | } 34 | -- 35 | {"first_name": "Tyrion", "house": "Lannister", "last_name": "Lannister", "age": 27} 36 | {"first_name": "Jaime", "house": "Lannister", "last_name": "Lannister", "age": 34} 37 | {"first_name": "Cersei", "house": "Lannister", "last_name": "Lannister", "age": 34} 38 | {"first_name": "Jon", "house": "Stark", "last_name": "Snow", "age": 17} 39 | {"first_name": "Sansa", "house": "Stark", "last_name": "Stark", "age": 14} 40 | {"first_name": "Arya", "house": "Stark", "last_name": "Stark", "age": 11} 41 | {"first_name": "Bran", "house": "Stark", "last_name": "Stark", "age": 10} 42 | {"first_name": "Rickon", "house": "Stark", "last_name": "Stark", "age": 5} 43 | {"first_name": "Daenerys", "house": "Targaryen", "last_name": "Targaryen", "age": 16} 44 | 45 | {"house": "House of Lannister"} 46 | {"house": "House of Greyjoy"} 47 | {"house": "House of Stark"} 48 | {"house": "House of Targaryen"} 49 | {"house": "House of Martell"} 50 | {"house": "House of Tyrell"} 51 | -- 52 | { 53 | "name": 
"test", 54 | "profile": "data-package", 55 | "resources": [ 56 | { 57 | "name": "got-characters", 58 | "dpp:streaming": true, 59 | "path": "characters.csv", 60 | "profile": "data-resource", 61 | "schema": { "fields": [ 62 | {"name": "first_name", "type": "string"}, 63 | {"name": "house", "type": "string"} 64 | ]} 65 | }, 66 | { 67 | "name": "got-houses", 68 | "dpp:streaming": true, 69 | "path": "houses.csv", 70 | "profile": "data-resource", 71 | "schema": { "fields": [ 72 | {"name": "house", "type": "string"} 73 | ]} 74 | } 75 | ] 76 | } 77 | -- 78 | {"first_name": "Tyrion", "house": "Lannister"} 79 | {"first_name": "Jaime", "house": "Lannister"} 80 | {"first_name": "Cersei", "house": "Lannister"} 81 | {"first_name": "Jon", "house": "Stark"} 82 | {"first_name": "Sansa", "house": "Stark"} 83 | {"first_name": "Arya", "house": "Stark"} 84 | {"first_name": "Bran", "house": "Stark"} 85 | {"first_name": "Rickon", "house": "Stark"} 86 | {"first_name": "Daenerys", "house": "Targaryen"} 87 | 88 | {"house": "House of Lannister"} 89 | {"house": "House of Greyjoy"} 90 | {"house": "House of Stark"} 91 | {"house": "House of Targaryen"} 92 | {"house": "House of Martell"} 93 | {"house": "House of Tyrell"} 94 | 95 | {} 96 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_dump_dot_to_zip: -------------------------------------------------------------------------------- 1 | dump.to_zip 2 | -- 3 | { 4 | "out-file": "my-spiffy-resource.zip" 5 | } 6 | -- 7 | { 8 | "name": "test", 9 | "resources": [ 10 | { 11 | "name": "my-spiffy-resource", 12 | "dpp:streaming": true, 13 | "path": "data/my-data.csv", 14 | "schema": { 15 | "fields": [ 16 | {"name": "mystring", "type": "string"}, 17 | {"name": "myinteger", "type": "integer"}, 18 | {"name": "mynumber", "type": "number"}, 19 | {"name": "mydate", "type": "date"} 20 | ] 21 | } 22 | } 23 | ] 24 | } 25 | -- 26 | {"mystring":"a", "mynumber": 2.0, "mydate": {"type{date}": "2016-12-31"}} 27 | -- 28 | { 29 | "name": "test", 30 | "resources": [ 31 | { 32 | "name": "my-spiffy-resource", 33 | "dpp:streaming": true, 34 | "path": "data/my-data.csv", 35 | "encoding": "utf-8", 36 | "format": "csv", 37 | "dialect": { 38 | "delimiter": ",", 39 | "doubleQuote": true, 40 | "lineTerminator": "\r\n", 41 | "quoteChar": "\"", 42 | "skipInitialSpace": false 43 | }, 44 | "schema": { 45 | "fields": [ 46 | {"name": "mystring", "type": "string"}, 47 | {"name": "myinteger", "type": "integer"}, 48 | {"name": "mynumber", "type": "number", "groupChar": "", "decimalChar": "."}, 49 | {"format": "%Y-%m-%d", "name": "mydate", "type": "date"} 50 | ] 51 | } 52 | } 53 | ] 54 | } 55 | -- 56 | {"mystring":"a", "mynumber": 2.0, "mydate": {"type{date}": "2016-12-31"}} 57 | 58 | {"bytes": 703, "count_of_rows": 1, "dataset_name": "test", "hash": "a730863e99517930eab15f55036d309f"} 59 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_dump_dot_to_zip_with_hash: -------------------------------------------------------------------------------- 1 | dump.to_zip 2 | -- 3 | { 4 | "out-file": "my-spiffy-resource.zip", 5 | "add-filehash-to-path": true 6 | } 7 | -- 8 | { 9 | "name": "test", 10 | "resources": [ 11 | { 12 | "name": "my-spiffy-resource", 13 | "dpp:streaming": true, 14 | "path": "data/my-data.csv", 15 | "schema": { 16 | "fields": [ 17 | {"name": "mystring", "type": "string"}, 18 | {"name": "myinteger", "type": "integer"}, 19 | {"name": "mynumber", "type": "number"}, 20 | {"name": 
"mydate", "type": "date"} 21 | ] 22 | } 23 | } 24 | ] 25 | } 26 | -- 27 | {"mystring":"a", "mynumber": 2.0, "mydate": {"type{date}": "2016-12-31"}} 28 | -- 29 | { 30 | "name": "test", 31 | "resources": [ 32 | { 33 | "name": "my-spiffy-resource", 34 | "dpp:streaming": true, 35 | "path": "data/my-data.csv", 36 | "encoding": "utf-8", 37 | "format": "csv", 38 | "dialect": { 39 | "delimiter": ",", 40 | "doubleQuote": true, 41 | "lineTerminator": "\r\n", 42 | "quoteChar": "\"", 43 | "skipInitialSpace": false 44 | }, 45 | "schema": { 46 | "fields": [ 47 | {"name": "mystring", "type": "string"}, 48 | {"name": "myinteger", "type": "integer"}, 49 | {"name": "mynumber", "type": "number", "groupChar": "", "decimalChar": "."}, 50 | {"format": "%Y-%m-%d", "name": "mydate", "type": "date"} 51 | ] 52 | } 53 | } 54 | ] 55 | } 56 | -- 57 | {"mystring":"a", "mynumber": 2.0, "mydate": {"type{date}": "2016-12-31"}} 58 | 59 | {"bytes": 736, "count_of_rows": 1, "dataset_name": "test", "hash": "24b55bb6b0ecacdadbc8a1dc1fd9dab9"} 60 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_dump_dot_to_zip_with_hash_and_pretty_descriptor: -------------------------------------------------------------------------------- 1 | dump.to_zip 2 | -- 3 | { 4 | "out-file": "my-spiffy-resource.zip", 5 | "add-filehash-to-path": true, 6 | "pretty-descriptor": true 7 | } 8 | -- 9 | { 10 | "name": "test", 11 | "resources": [ 12 | { 13 | "name": "my-spiffy-resource", 14 | "dpp:streaming": true, 15 | "path": "data/my-data.csv", 16 | "schema": { 17 | "fields": [ 18 | {"name": "mystring", "type": "string"}, 19 | {"name": "myinteger", "type": "integer"}, 20 | {"name": "mynumber", "type": "number"}, 21 | {"name": "mydate", "type": "date"} 22 | ] 23 | } 24 | } 25 | ] 26 | } 27 | -- 28 | {"mystring":"a", "mynumber": 2.0, "mydate": {"type{date}": "2016-12-31"}} 29 | -- 30 | { 31 | "name": "test", 32 | "resources": [ 33 | { 34 | "name": "my-spiffy-resource", 35 | "dpp:streaming": true, 36 | "path": "data/my-data.csv", 37 | "encoding": "utf-8", 38 | "format": "csv", 39 | "dialect": { 40 | "delimiter": ",", 41 | "doubleQuote": true, 42 | "lineTerminator": "\r\n", 43 | "quoteChar": "\"", 44 | "skipInitialSpace": false 45 | }, 46 | "schema": { 47 | "fields": [ 48 | {"name": "mystring", "type": "string"}, 49 | {"name": "myinteger", "type": "integer"}, 50 | {"name": "mynumber", "type": "number", "groupChar": "", "decimalChar": "."}, 51 | {"format": "%Y-%m-%d", "name": "mydate", "type": "date"} 52 | ] 53 | } 54 | } 55 | ] 56 | } 57 | -- 58 | {"mystring":"a", "mynumber": 2.0, "mydate": {"type{date}": "2016-12-31"}} 59 | 60 | {"bytes": 1110, "count_of_rows": 1, "dataset_name": "test", "hash": "174d14a56ce3c798b369d1716488ca75"} 61 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_dump_to_sql: -------------------------------------------------------------------------------- 1 | dump_to_sql 2 | -- 3 | { 4 | "tables": { 5 | "test": { 6 | "resource-name": "my-spiffy-resource" 7 | } 8 | } 9 | } 10 | -- 11 | { 12 | "name": "test", 13 | "resources": [ 14 | { 15 | "name": "my-spiffy-resource", 16 | "dpp:streaming": true, 17 | "path": "data/my-data.csv", 18 | "schema": { 19 | "fields": [ 20 | {"name": "mystring", "type": "string"}, 21 | {"name": "myinteger", "type": "integer"}, 22 | {"name": "mynumber", "type": "number"}, 23 | {"name": "mydate", "type": "date"} 24 | ] 25 | } 26 | } 27 | ] 28 | } 29 | -- 30 | {"mystring":"a", "mynumber": 2.0, 
"mydate": {"type{date}": "2016-12-31"}} 31 | -- 32 | { 33 | "name": "test", 34 | "profile": "data-package", 35 | "resources": [ 36 | { 37 | "name": "my-spiffy-resource", 38 | "dpp:streaming": true, 39 | "path": "data/my-data.csv", 40 | "profile": "data-resource", 41 | "schema": { 42 | "fields": [ 43 | {"name": "mystring", "type": "string"}, 44 | {"name": "myinteger", "type": "integer"}, 45 | {"name": "mynumber", "type": "number"}, 46 | {"name": "mydate", "type": "date"} 47 | ] 48 | } 49 | } 50 | ] 51 | } 52 | -- 53 | {"mystring":"a", "mynumber": {"type{decimal}": "2.0"}, "mydate": {"type{date}": "2016-12-31"}, "myinteger": null} 54 | 55 | {"bytes": null, "count_of_rows": 1, "dataset_name": "test", "hash": "c1c867cd9711aedd5c94a16ce4590ece"} 56 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_dump_to_zip: -------------------------------------------------------------------------------- 1 | dump_to_zip 2 | -- 3 | { 4 | "out-file": "my-spiffy-resource.zip" 5 | } 6 | -- 7 | { 8 | "name": "test", 9 | "resources": [ 10 | { 11 | "name": "my-spiffy-resource", 12 | "dpp:streaming": true, 13 | "path": "data/my-data.csv", 14 | "schema": { 15 | "fields": [ 16 | {"name": "mystring", "type": "string"}, 17 | {"name": "myinteger", "type": "integer"}, 18 | {"name": "mynumber", "type": "number"}, 19 | {"name": "mydate", "type": "date"} 20 | ] 21 | } 22 | } 23 | ] 24 | } 25 | -- 26 | {"mystring":"a", "mynumber": 2.0, "mydate": {"type{date}": "2016-12-31"}} 27 | -- 28 | { 29 | "name": "test", 30 | "profile": "data-package", 31 | "resources": [ 32 | { 33 | "name": "my-spiffy-resource", 34 | "dialect": { 35 | "delimiter": ",", 36 | "doubleQuote": true, 37 | "lineTerminator": "\r\n", 38 | "quoteChar": "\"", 39 | "skipInitialSpace": false 40 | }, 41 | "encoding": "utf-8", 42 | "format": "csv", 43 | "dpp:streaming": true, 44 | "path": "data/my-data.csv", 45 | "profile": "data-resource", 46 | "schema": { 47 | "fields": [ 48 | {"name": "mystring", "type": "string"}, 49 | {"name": "myinteger", "type": "integer"}, 50 | {"name": "mynumber", "type": "number", "decimalChar": ".", "groupChar": ""}, 51 | {"name": "mydate", "type": "date", "format": "%Y-%m-%d"} 52 | ] 53 | } 54 | } 55 | ] 56 | } 57 | -- 58 | {"mystring":"a", "myinteger": null, "mynumber": {"type{decimal}": "2.0"}, "mydate": {"type{date}": "2016-12-31"}} 59 | 60 | {"bytes": 1143, "count_of_rows": 1, "dataset_name": "test", "hash": "c68a5400c197333d75d34f4c198fea0b"} 61 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_dump_to_zip_with_hash: -------------------------------------------------------------------------------- 1 | dump_to_zip 2 | -- 3 | { 4 | "out-file": "my-spiffy-resource.zip", 5 | "add-filehash-to-path": true 6 | } 7 | -- 8 | { 9 | "name": "test", 10 | "resources": [ 11 | { 12 | "name": "my-spiffy-resource", 13 | "dpp:streaming": true, 14 | "path": "data/my-data.csv", 15 | "schema": { 16 | "fields": [ 17 | {"name": "mystring", "type": "string"}, 18 | {"name": "myinteger", "type": "integer"}, 19 | {"name": "mynumber", "type": "number"}, 20 | {"name": "mydate", "type": "date"} 21 | ] 22 | } 23 | } 24 | ] 25 | } 26 | -- 27 | {"mystring":"a", "mynumber": 2.0, "mydate": {"type{date}": "2016-12-31"}} 28 | -- 29 | { 30 | "name": "test", 31 | "profile": "data-package", 32 | "resources": [ 33 | { 34 | "name": "my-spiffy-resource", 35 | "dialect": { 36 | "delimiter": ",", 37 | "doubleQuote": true, 38 | "lineTerminator": "\r\n", 39 
| "quoteChar": "\"", 40 | "skipInitialSpace": false 41 | }, 42 | "encoding": "utf-8", 43 | "format": "csv", 44 | "dpp:streaming": true, 45 | "path": "data/my-data.csv", 46 | "profile": "data-resource", 47 | "schema": { 48 | "fields": [ 49 | {"name": "mystring", "type": "string"}, 50 | {"name": "myinteger", "type": "integer"}, 51 | {"name": "mynumber", "type": "number", "decimalChar": ".", "groupChar": ""}, 52 | {"name": "mydate", "type": "date", "format": "%Y-%m-%d"} 53 | ] 54 | } 55 | } 56 | ] 57 | } 58 | -- 59 | {"mystring":"a", "myinteger": null, "mynumber": {"type{decimal}": "2.0"}, "mydate": {"type{date}": "2016-12-31"}} 60 | 61 | {"bytes": 1143, "count_of_rows": 1, "dataset_name": "test", "hash": "c68a5400c197333d75d34f4c198fea0b"} 62 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_dump_to_zip_with_hash_and_pretty_descriptor: -------------------------------------------------------------------------------- 1 | dump_to_zip 2 | -- 3 | { 4 | "out-file": "my-spiffy-resource.zip", 5 | "add-filehash-to-path": true, 6 | "pretty-descriptor": true 7 | } 8 | -- 9 | { 10 | "name": "test", 11 | "profile": "data-package", 12 | "resources": [ 13 | { 14 | "name": "my-spiffy-resource", 15 | "dpp:streaming": true, 16 | "path": "data/my-data.csv", 17 | "profile": "data-resource", 18 | "schema": { 19 | "fields": [ 20 | {"name": "mystring", "type": "string"}, 21 | {"name": "myinteger", "type": "integer"}, 22 | {"name": "mynumber", "type": "number"}, 23 | {"name": "mydate", "type": "date"} 24 | ] 25 | } 26 | } 27 | ] 28 | } 29 | -- 30 | {"mystring":"a", "mynumber": 2.0, "mydate": {"type{date}": "2016-12-31"}} 31 | -- 32 | { 33 | "name": "test", 34 | "profile": "data-package", 35 | "resources": [ 36 | { 37 | "name": "my-spiffy-resource", 38 | "dialect": { 39 | "delimiter": ",", 40 | "doubleQuote": true, 41 | "lineTerminator": "\r\n", 42 | "quoteChar": "\"", 43 | "skipInitialSpace": false 44 | }, 45 | "encoding": "utf-8", 46 | "format": "csv", 47 | "dpp:streaming": true, 48 | "path": "data/my-data.csv", 49 | "profile": "data-resource", 50 | "schema": { 51 | "fields": [ 52 | {"name": "mystring", "type": "string"}, 53 | {"name": "myinteger", "type": "integer"}, 54 | {"name": "mynumber", "type": "number", "decimalChar": ".", "groupChar": ""}, 55 | {"name": "mydate", "type": "date", "format": "%Y-%m-%d"} 56 | ] 57 | } 58 | } 59 | ] 60 | } 61 | -- 62 | {"mystring":"a", "myinteger": null, "mynumber": {"type{decimal}": "2.0"}, "mydate": {"type{date}": "2016-12-31"}} 63 | 64 | {"bytes": 1143, "count_of_rows": 1, "dataset_name": "test", "hash": "c68a5400c197333d75d34f4c198fea0b"} 65 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_filter: -------------------------------------------------------------------------------- 1 | filter 2 | -- 3 | { 4 | "resources": ["concat-a1", "concat-a2"], 5 | "in": [ 6 | {"a1": "a1", 7 | "a2": "a2"}, 8 | {"a1": "a2"} 9 | ], 10 | "out": [{"a3": "a1"}] 11 | } 12 | -- 13 | { 14 | "name": "test", 15 | "resources": [ 16 | { 17 | "name": "concat-a1", 18 | "dpp:streaming": true, 19 | "path": "concat-a1.csv", 20 | "schema": { "fields": [ 21 | {"name": "a1", "type": "string"}, 22 | {"name": "a2", "type": "string"}, 23 | {"name": "a3", "type": "string"} 24 | ]} 25 | }, 26 | { 27 | "name": "concat-a2", 28 | "dpp:streaming": true, 29 | "path": "concat-a2.csv", 30 | "schema": { "fields": [ 31 | {"name": "a1", "type": "string"}, 32 | {"name": "a2", "type": "string"}, 33 | 
{"name": "a3", "type": "string"} 34 | ]} 35 | }, 36 | { 37 | "name": "concat-c", 38 | "dpp:streaming": true, 39 | "path": "concat-c.csv", 40 | "schema": { "fields": [ 41 | {"name": "c1", "type": "string"}, 42 | {"name": "c2", "type": "string"}, 43 | {"name": "c3", "type": "string"} 44 | ]} 45 | } 46 | ] 47 | } 48 | -- 49 | {"a1":"a1","a2":"a1","a3":"a2"} 50 | {"a1":"a2","a2":"a1","a3":"a1"} 51 | {"a1":"a3","a2":"a2","a3":"a2"} 52 | {"a1":"a4","a2":"a2","a3":"a1"} 53 | 54 | {"a1":"a1","a2":"a3","a3":"a2"} 55 | {"a1":"a2","a2":"a3","a3":"a1"} 56 | {"a1":"a3","a2":"a4","a3":"a2"} 57 | {"a1":"a4","a2":"a4","a3":"a1"} 58 | 59 | {"c1":"c11","c2":"c21","c3":"c31"} 60 | {"c1":"c12","c2":"c22","c3":"c32"} 61 | {"c1":"c13","c2":"c23","c3":"c33"} 62 | -- 63 | { 64 | "name": "test", 65 | "profile": "data-package", 66 | "resources": [ 67 | { 68 | "name": "concat-a1", 69 | "dpp:streaming": true, 70 | "path": "concat-a1.csv", 71 | "profile": "data-resource", 72 | "schema": { "fields": [ 73 | {"name": "a1", "type": "string"}, 74 | {"name": "a2", "type": "string"}, 75 | {"name": "a3", "type": "string"} 76 | ]} 77 | }, 78 | { 79 | "name": "concat-a2", 80 | "dpp:streaming": true, 81 | "path": "concat-a2.csv", 82 | "profile": "data-resource", 83 | "schema": { "fields": [ 84 | {"name": "a1", "type": "string"}, 85 | {"name": "a2", "type": "string"}, 86 | {"name": "a3", "type": "string"} 87 | ]} 88 | }, 89 | { 90 | "name": "concat-c", 91 | "dpp:streaming": true, 92 | "path": "concat-c.csv", 93 | "profile": "data-resource", 94 | "schema": { "fields": [ 95 | {"name": "c1", "type": "string"}, 96 | {"name": "c2", "type": "string"}, 97 | {"name": "c3", "type": "string"} 98 | ]} 99 | } 100 | ] 101 | } 102 | -- 103 | {"a1":"a1","a2":"a1","a3":"a2"} 104 | {"a1":"a2","a2":"a1","a3":"a1"} 105 | {"a1":"a3","a2":"a2","a3":"a2"} 106 | {"a1":"a4","a2":"a2","a3":"a1"} 107 | 108 | {"a1":"a1","a2":"a3","a3":"a2"} 109 | {"a1":"a2","a2":"a3","a3":"a1"} 110 | {"a1":"a3","a2":"a4","a3":"a2"} 111 | 112 | {"c1":"c11","c2":"c21","c3":"c31"} 113 | {"c1":"c12","c2":"c22","c3":"c32"} 114 | {"c1":"c13","c2":"c23","c3":"c33"} 115 | 116 | {} 117 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_find_replace: -------------------------------------------------------------------------------- 1 | find_replace 2 | -- 3 | { 4 | "resources": "dates", 5 | "fields": [ 6 | { 7 | "name": "year", 8 | "patterns": [ 9 | { 10 | "find": "([0-9]{4})( \\(\\w+\\))", 11 | "replace": "\\1" 12 | } 13 | ] 14 | }, 15 | { 16 | "name": "quarter", 17 | "patterns": [ 18 | { 19 | "find": "Q1", 20 | "replace": "03-31" 21 | }, 22 | { 23 | "find": "Q2", 24 | "replace": "06-30" 25 | }, 26 | { 27 | "find": "Q3", 28 | "replace": "09-30" 29 | }, 30 | { 31 | "find": "Q4", 32 | "replace": "12-31" 33 | } 34 | ] 35 | } 36 | ] 37 | } 38 | -- 39 | { 40 | "name": "test", 41 | "resources": [ 42 | { 43 | "name": "dates", 44 | "dpp:streaming": true, 45 | "path": "dates.csv", 46 | "schema": { 47 | "fields": [ 48 | {"name": "year", "type": "string"}, 49 | {"name": "quarter", "type": "string"}, 50 | {"name": "char", "type": "string"} 51 | ] 52 | } 53 | } 54 | ] 55 | } 56 | -- 57 | {"year": "2001", "quarter": "2001-Q1", "char": "testing"} 58 | {"year": "2002", "quarter": "2002-Q2", "char": "testing"} 59 | {"year": "2003 (4)", "quarter": "2003-Q3", "char": "testing"} 60 | {"year": "2004", "quarter": "2004-Q1", "char": "testing"} 61 | {"year": "2005 (1)", "quarter": "2005-Q4", "char": "testing"} 62 | {"year": "2006 (Note)", 
"quarter": "2006-Q1", "char": "testing"} 63 | {"year": "2007 (2)", "quarter": "2007-Q2", "char": "testing"} 64 | {"year": "2008", "quarter": "2008-Q1", "char": "testing"} 65 | {"year": "2009 (10)", "quarter": "2009-Q3", "char": "testing"} 66 | -- 67 | { 68 | "name": "test", 69 | "profile": "data-package", 70 | "resources": [ 71 | { 72 | "name": "dates", 73 | "dpp:streaming": true, 74 | "path": "dates.csv", 75 | "profile": "data-resource", 76 | "schema": { 77 | "fields": [ 78 | {"name": "year", "type": "string"}, 79 | {"name": "quarter", "type": "string"}, 80 | {"name": "char", "type": "string"} 81 | ] 82 | } 83 | } 84 | ] 85 | } 86 | -- 87 | {"year": "2001", "quarter": "2001-03-31", "char": "testing"} 88 | {"year": "2002", "quarter": "2002-06-30", "char": "testing"} 89 | {"year": "2003", "quarter": "2003-09-30", "char": "testing"} 90 | {"year": "2004", "quarter": "2004-03-31", "char": "testing"} 91 | {"year": "2005", "quarter": "2005-12-31", "char": "testing"} 92 | {"year": "2006", "quarter": "2006-03-31", "char": "testing"} 93 | {"year": "2007", "quarter": "2007-06-30", "char": "testing"} 94 | {"year": "2008", "quarter": "2008-03-31", "char": "testing"} 95 | {"year": "2009", "quarter": "2009-09-30", "char": "testing"} 96 | 97 | {} 98 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_join: -------------------------------------------------------------------------------- 1 | join 2 | -- 3 | { 4 | "source": { 5 | "name": "got-characters", 6 | "key": "House of {house}", 7 | "delete": true 8 | }, 9 | "target": { 10 | "name": "got-houses", 11 | "key": "{house}" 12 | }, 13 | "fields": { 14 | "max_age": { 15 | "name": "age", 16 | "aggregate": "max" 17 | }, 18 | "avg_age": { 19 | "name": "age", 20 | "aggregate": "avg" 21 | }, 22 | "representative": { 23 | "name": "first_name", 24 | "aggregate": "last" 25 | }, 26 | "representative_age": { 27 | "name": "age" 28 | }, 29 | "number_of_characters": { 30 | "aggregate": "count" 31 | }, 32 | "last_names": { 33 | "name": "last_name", 34 | "aggregate": "counters" 35 | } 36 | }, 37 | "full": false 38 | } 39 | -- 40 | { 41 | "name": "test", 42 | "resources": [ 43 | { 44 | "name": "got-characters", 45 | "dpp:streaming": true, 46 | "path": "characters.csv", 47 | "schema": { 48 | "fields": [ 49 | {"name": "first_name", "type": "string"}, 50 | {"name": "last_name", "type": "string"}, 51 | {"name": "house", "type": "string"}, 52 | {"name": "age", "type": "number", "units": "Westerosian Years"} 53 | ] 54 | } 55 | }, 56 | { 57 | "name": "got-houses", 58 | "dpp:streaming": true, 59 | "path": "houses.csv", 60 | "schema": { "fields": [ 61 | {"name": "house", "type": "string"} 62 | ]} 63 | } 64 | ] 65 | } 66 | -- 67 | {"first_name": "Tyrion", "house": "Lannister", "last_name": "Lannister", "age": 27} 68 | {"first_name": "Jaime", "house": "Lannister", "last_name": "Lannister", "age": 34} 69 | {"first_name": "Cersei", "house": "Lannister", "last_name": "Lannister", "age": 34} 70 | {"first_name": "Jon", "house": "Stark", "last_name": "Snow", "age": 17} 71 | {"first_name": "Sansa", "house": "Stark", "last_name": "Stark", "age": 14} 72 | {"first_name": "Arya", "house": "Stark", "last_name": "Stark", "age": 11} 73 | {"first_name": "Bran", "house": "Stark", "last_name": "Stark", "age": 10} 74 | {"first_name": "Rickon", "house": "Stark", "last_name": "Stark", "age": 5} 75 | {"first_name": "Daenerys", "house": "Targaryen", "last_name": "Targaryen", "age": 16} 76 | 77 | {"house": "House of Lannister"} 78 | {"house": 
"House of Greyjoy"} 79 | {"house": "House of Stark"} 80 | {"house": "House of Targaryen"} 81 | {"house": "House of Martell"} 82 | {"house": "House of Tyrell"} 83 | -- 84 | { 85 | "name": "test", 86 | "profile": "data-package", 87 | "resources": [ 88 | { 89 | "name": "got-houses", 90 | "dpp:streaming": true, 91 | "path": "houses.csv", 92 | "profile": "data-resource", 93 | "schema": { "fields": [ 94 | {"name": "house", "type": "string"}, 95 | {"name": "avg_age", "type": "number"}, 96 | {"name": "last_names", "type": "array"}, 97 | {"name": "max_age", "type": "number"}, 98 | {"name": "number_of_characters", "type": "integer"}, 99 | {"name": "representative", "type": "string"}, 100 | {"name": "representative_age", "type": "number", "units": "Westerosian Years"} 101 | ]} 102 | } 103 | ] 104 | } 105 | -- 106 | {"avg_age": 31.666666666666668, "house": "House of Lannister", "max_age": 34, "number_of_characters": 3, "representative": "Cersei", "representative_age": 34, "last_names": [["Lannister", 3]]} 107 | {"avg_age": 11.4, "house": "House of Stark", "max_age": 17, "number_of_characters": 5, "representative": "Rickon", "representative_age": 5, "last_names": [["Stark", 4], ["Snow", 1]]} 108 | {"avg_age": 16.0, "house": "House of Targaryen", "max_age": 16, "number_of_characters": 1, "representative": "Daenerys", "representative_age": 16, "last_names": [["Targaryen", 1]]} 109 | 110 | {} 111 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_load: -------------------------------------------------------------------------------- 1 | load 2 | -- 3 | { 4 | "from": "https://raw.githubusercontent.com/frictionlessdata/datapackage-pipelines/master/tests/data/sample.csv", 5 | "name": "my-spiffy-resource", 6 | "validate": true 7 | } 8 | -- 9 | { 10 | "name": "test", 11 | "resources": [] 12 | } 13 | -- 14 | -- 15 | { 16 | "name": "test", 17 | "profile": "data-package", 18 | "resources": [ 19 | { 20 | "dpp:streaming": true, 21 | "format": "csv", 22 | "name": "my-spiffy-resource", 23 | "path": "my-spiffy-resource.csv", 24 | "profile": "tabular-data-resource", 25 | "dpp:streamedFrom": "https://raw.githubusercontent.com/frictionlessdata/datapackage-pipelines/master/tests/data/sample.csv", 26 | "schema": { 27 | "fields": [ 28 | {"format": "default", "name": "first_name", "type": "string"}, 29 | {"format": "default", "name": "last_name", "type": "string"}, 30 | {"format": "default", "name": "house", "type": "string"}, 31 | {"format": "default", "name": "age", "type": "integer"} 32 | ], 33 | "missingValues": [""] 34 | } 35 | } 36 | ] 37 | } 38 | -- 39 | {"age": 27, "first_name": "Tyrion", "house": "Lannister", "last_name": "Lannister"} 40 | {"age": 34, "first_name": "Jaime", "house": "Lannister", "last_name": "Lannister"} 41 | {"age": 34, "first_name": "Cersei", "house": "Lannister", "last_name": "Lannister"} 42 | {"age": 17, "first_name": "Jon", "house": "Stark", "last_name": "Snow"} 43 | {"age": 14, "first_name": "Sansa", "house": "Stark", "last_name": "Stark"} 44 | {"age": 11, "first_name": "Arya", "house": "Stark", "last_name": "Stark"} 45 | {"age": 10, "first_name": "Bran", "house": "Stark", "last_name": "Stark"} 46 | {"age": 5, "first_name": "Rickon", "house": "Stark", "last_name": "Stark"} 47 | {"age": 16, "first_name": "Daenerys", "house": "Targaryen", "last_name": "Targaryen"} 48 | 49 | {} 50 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_load_metadata: 
-------------------------------------------------------------------------------- 1 | load_metadata 2 | -- 3 | { 4 | "url": "tests/data/datapackage.json" 5 | } 6 | -- 7 | { 8 | "name": "test", 9 | "resources": [] 10 | } 11 | -- 12 | -- 13 | { 14 | "name": "my-spiffy-datapackage", 15 | "my-prop": "the-props-value", 16 | "profile": "data-package", 17 | "resources": [] 18 | } 19 | -- 20 | {} 21 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_load_resource: -------------------------------------------------------------------------------- 1 | load_resource 2 | -- 3 | { 4 | "resource": "my-spiffy-resource", 5 | "url": "tests/data/datapackage.json" 6 | } 7 | -- 8 | { 9 | "name": "test", 10 | "resources": [] 11 | } 12 | -- 13 | -- 14 | { 15 | "name": "test", 16 | "resources": [ 17 | { 18 | "dpp:streamedFrom": "%(base)s/tests/data/sample.csv", 19 | "name": "my-spiffy-resource", 20 | "dpp:streaming": true, 21 | "profile": "data-resource", 22 | "path": "sample.csv", 23 | "schema": { 24 | "fields": [ 25 | {"name": "first_name", "type": "string"}, 26 | {"name": "last_name", "type": "string"}, 27 | {"name": "house", "type": "string"}, 28 | {"name": "age", "type": "integer"} 29 | ], 30 | "primaryKey": [ 31 | "first_name", "last_name" 32 | ] 33 | } 34 | } 35 | ] 36 | } 37 | -- 38 | {"first_name": "Tyrion", "house": "Lannister", "last_name": "Lannister", "age": 27} 39 | {"first_name": "Jaime", "house": "Lannister", "last_name": "Lannister", "age": 34} 40 | {"first_name": "Cersei", "house": "Lannister", "last_name": "Lannister", "age": 34} 41 | {"first_name": "Jon", "house": "Stark", "last_name": "Snow", "age": 17} 42 | {"first_name": "Sansa", "house": "Stark", "last_name": "Stark", "age": 14} 43 | {"first_name": "Arya", "house": "Stark", "last_name": "Stark", "age": 11} 44 | {"first_name": "Bran", "house": "Stark", "last_name": "Stark", "age": 10} 45 | {"first_name": "Rickon", "house": "Stark", "last_name": "Stark", "age": 5} 46 | {"first_name": "Daenerys", "house": "Targaryen", "last_name": "Targaryen", "age": 16} 47 | 48 | {} 49 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_load_resource_dups: -------------------------------------------------------------------------------- 1 | load_resource 2 | -- 3 | { 4 | "resource": "my-spiffy-resource", 5 | "url": "tests/data/datapackage3.json" 6 | } 7 | -- 8 | { 9 | "name": "test", 10 | "resources": [] 11 | } 12 | -- 13 | -- 14 | { 15 | "name": "test", 16 | "resources": [ 17 | { 18 | "dpp:streamedFrom": "%(base)s/tests/data/sample.dups.csv", 19 | "name": "my-spiffy-resource", 20 | "dpp:streaming": true, 21 | "profile": "data-resource", 22 | "path": "sample.dups.csv", 23 | "schema": { 24 | "fields": [ 25 | {"name": "first_name", "type": "string"}, 26 | {"name": "last_name", "type": "string"}, 27 | {"name": "house", "type": "string"}, 28 | {"name": "age", "type": "integer"} 29 | ], 30 | "primaryKey": [ 31 | "first_name", "last_name" 32 | ] 33 | } 34 | } 35 | ] 36 | } 37 | -- 38 | {"first_name": "Tyrion", "house": "Lannister", "last_name": "Lannister", "age": 27} 39 | {"first_name": "Jaime", "house": "Lannister", "last_name": "Lannister", "age": 34} 40 | {"first_name": "Cersei", "house": "Lannister", "last_name": "Lannister", "age": 34} 41 | {"first_name": "Jon", "house": "Stark", "last_name": "Snow", "age": 17} 42 | {"first_name": "Sansa", "house": "Stark", "last_name": "Stark", "age": 14} 43 | {"first_name": "Sansa", "house": "Stark", 
"last_name": "Stark", "age": 14} 44 | {"first_name": "Arya", "house": "Stark", "last_name": "Stark", "age": 11} 45 | {"first_name": "Bran", "house": "Stark", "last_name": "Stark", "age": 10} 46 | {"first_name": "Rickon", "house": "Stark", "last_name": "Stark", "age": 5} 47 | {"first_name": "Daenerys", "house": "Targaryen", "last_name": "Targaryen", "age": 16} 48 | 49 | {} 50 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_load_resource_index: -------------------------------------------------------------------------------- 1 | load_resource 2 | -- 3 | { 4 | "resource": 0, 5 | "url": "tests/data/datapackage.json" 6 | } 7 | -- 8 | { 9 | "name": "test", 10 | "resources": [] 11 | } 12 | -- 13 | -- 14 | { 15 | "name": "test", 16 | "resources": [ 17 | { 18 | "dpp:streamedFrom": "%(base)s/tests/data/sample.csv", 19 | "name": "my-spiffy-resource", 20 | "dpp:streaming": true, 21 | "profile": "data-resource", 22 | "path": "sample.csv", 23 | "schema": { 24 | "fields": [ 25 | {"name": "first_name", "type": "string"}, 26 | {"name": "last_name", "type": "string"}, 27 | {"name": "house", "type": "string"}, 28 | {"name": "age", "type": "integer"} 29 | ], 30 | "primaryKey": [ 31 | "first_name", "last_name" 32 | ] 33 | } 34 | } 35 | ] 36 | } 37 | -- 38 | {"first_name": "Tyrion", "house": "Lannister", "last_name": "Lannister", "age": 27} 39 | {"first_name": "Jaime", "house": "Lannister", "last_name": "Lannister", "age": 34} 40 | {"first_name": "Cersei", "house": "Lannister", "last_name": "Lannister", "age": 34} 41 | {"first_name": "Jon", "house": "Stark", "last_name": "Snow", "age": 17} 42 | {"first_name": "Sansa", "house": "Stark", "last_name": "Stark", "age": 14} 43 | {"first_name": "Arya", "house": "Stark", "last_name": "Stark", "age": 11} 44 | {"first_name": "Bran", "house": "Stark", "last_name": "Stark", "age": 10} 45 | {"first_name": "Rickon", "house": "Stark", "last_name": "Stark", "age": 5} 46 | {"first_name": "Daenerys", "house": "Targaryen", "last_name": "Targaryen", "age": 16} 47 | 48 | {} 49 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_load_resource_limit_rows: -------------------------------------------------------------------------------- 1 | load_resource 2 | -- 3 | { 4 | "resource": "my-spiffy-resource", 5 | "url": "tests/data/datapackage.json", 6 | "limit-rows": 5 7 | } 8 | -- 9 | { 10 | "name": "test", 11 | "resources": [] 12 | } 13 | -- 14 | -- 15 | { 16 | "name": "test", 17 | "resources": [ 18 | { 19 | "dpp:streamedFrom": "%(base)s/tests/data/sample.csv", 20 | "name": "my-spiffy-resource", 21 | "dpp:streaming": true, 22 | "profile": "data-resource", 23 | "path": "sample.csv", 24 | "schema": { 25 | "fields": [ 26 | {"name": "first_name", "type": "string"}, 27 | {"name": "last_name", "type": "string"}, 28 | {"name": "house", "type": "string"}, 29 | {"name": "age", "type": "integer"} 30 | ], 31 | "primaryKey": [ 32 | "first_name", "last_name" 33 | ] 34 | } 35 | } 36 | ] 37 | } 38 | -- 39 | {"first_name": "Tyrion", "house": "Lannister", "last_name": "Lannister", "age": 27} 40 | {"first_name": "Jaime", "house": "Lannister", "last_name": "Lannister", "age": 34} 41 | {"first_name": "Cersei", "house": "Lannister", "last_name": "Lannister", "age": 34} 42 | {"first_name": "Jon", "house": "Stark", "last_name": "Snow", "age": 17} 43 | {"first_name": "Sansa", "house": "Stark", "last_name": "Stark", "age": 14} 44 | 45 | {} 46 | 
-------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_load_resource_list: -------------------------------------------------------------------------------- 1 | load_resource 2 | -- 3 | { 4 | "resource": ["my-spiffy-resource", "the-spiffy-resource"], 5 | "url": "tests/data/datapackage2.json" 6 | } 7 | -- 8 | { 9 | "name": "test", 10 | "resources": [] 11 | } 12 | -- 13 | -- 14 | { 15 | "name": "test", 16 | "resources": [ 17 | { 18 | "dpp:streamedFrom": "%(base)s/tests/data/sample.csv", 19 | "name": "my-spiffy-resource", 20 | "dpp:streaming": true, 21 | "profile": "data-resource", 22 | "path": "sample.csv", 23 | "schema": { 24 | "fields": [ 25 | {"name": "first_name", "type": "string"}, 26 | {"name": "last_name", "type": "string"}, 27 | {"name": "house", "type": "string"}, 28 | {"name": "age", "type": "integer"} 29 | ] 30 | } 31 | }, 32 | { 33 | "dpp:streamedFrom": "%(base)s/tests/data/sample.csv", 34 | "name": "the-spiffy-resource", 35 | "dpp:streaming": true, 36 | "profile": "data-resource", 37 | "path": "sample.csv", 38 | "schema": { 39 | "fields": [ 40 | {"name": "first_name", "type": "string"}, 41 | {"name": "last_name", "type": "string"}, 42 | {"name": "house", "type": "string"}, 43 | {"name": "age", "type": "integer"} 44 | ] 45 | } 46 | } 47 | ] 48 | } 49 | -- 50 | {"first_name": "Tyrion", "house": "Lannister", "last_name": "Lannister", "age": 27} 51 | {"first_name": "Jaime", "house": "Lannister", "last_name": "Lannister", "age": 34} 52 | {"first_name": "Cersei", "house": "Lannister", "last_name": "Lannister", "age": 34} 53 | {"first_name": "Jon", "house": "Stark", "last_name": "Snow", "age": 17} 54 | {"first_name": "Sansa", "house": "Stark", "last_name": "Stark", "age": 14} 55 | {"first_name": "Arya", "house": "Stark", "last_name": "Stark", "age": 11} 56 | {"first_name": "Bran", "house": "Stark", "last_name": "Stark", "age": 10} 57 | {"first_name": "Rickon", "house": "Stark", "last_name": "Stark", "age": 5} 58 | {"first_name": "Daenerys", "house": "Targaryen", "last_name": "Targaryen", "age": 16} 59 | 60 | {"first_name": "Tyrion", "house": "Lannister", "last_name": "Lannister", "age": 27} 61 | {"first_name": "Jaime", "house": "Lannister", "last_name": "Lannister", "age": 34} 62 | {"first_name": "Cersei", "house": "Lannister", "last_name": "Lannister", "age": 34} 63 | {"first_name": "Jon", "house": "Stark", "last_name": "Snow", "age": 17} 64 | {"first_name": "Sansa", "house": "Stark", "last_name": "Stark", "age": 14} 65 | {"first_name": "Arya", "house": "Stark", "last_name": "Stark", "age": 11} 66 | {"first_name": "Bran", "house": "Stark", "last_name": "Stark", "age": 10} 67 | {"first_name": "Rickon", "house": "Stark", "last_name": "Stark", "age": 5} 68 | {"first_name": "Daenerys", "house": "Targaryen", "last_name": "Targaryen", "age": 16} 69 | 70 | {} 71 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_load_resource_multi: -------------------------------------------------------------------------------- 1 | load_resource 2 | -- 3 | { 4 | "resource": "t.+", 5 | "url": "tests/data/datapackage2.json" 6 | } 7 | -- 8 | { 9 | "name": "test", 10 | "resources": [] 11 | } 12 | -- 13 | -- 14 | { 15 | "name": "test", 16 | "resources": [ 17 | { 18 | "dpp:streamedFrom": "%(base)s/tests/data/sample.csv", 19 | "name": "the-spiffy-resource", 20 | "dpp:streaming": true, 21 | "profile": "data-resource", 22 | "path": "sample.csv", 23 | "schema": { 24 | "fields": [ 25 | {"name": 
"first_name", "type": "string"}, 26 | {"name": "last_name", "type": "string"}, 27 | {"name": "house", "type": "string"}, 28 | {"name": "age", "type": "integer"} 29 | ] 30 | } 31 | }, 32 | { 33 | "dpp:streamedFrom": "%(base)s/tests/data/sample2.csv", 34 | "name": "the-other-spiffy-resource", 35 | "dpp:streaming": true, 36 | "profile": "data-resource", 37 | "path": "sample2.csv", 38 | "schema": { 39 | "fields": [ 40 | {"name": "first_name", "type": "string"}, 41 | {"name": "last_name", "type": "string"}, 42 | {"name": "house", "type": "string"}, 43 | {"name": "age", "type": "integer"} 44 | ] 45 | } 46 | } 47 | ] 48 | } 49 | -- 50 | {"first_name": "Tyrion", "house": "Lannister", "last_name": "Lannister", "age": 27} 51 | {"first_name": "Jaime", "house": "Lannister", "last_name": "Lannister", "age": 34} 52 | {"first_name": "Cersei", "house": "Lannister", "last_name": "Lannister", "age": 34} 53 | {"first_name": "Jon", "house": "Stark", "last_name": "Snow", "age": 17} 54 | {"first_name": "Sansa", "house": "Stark", "last_name": "Stark", "age": 14} 55 | {"first_name": "Arya", "house": "Stark", "last_name": "Stark", "age": 11} 56 | {"first_name": "Bran", "house": "Stark", "last_name": "Stark", "age": 10} 57 | {"first_name": "Rickon", "house": "Stark", "last_name": "Stark", "age": 5} 58 | {"first_name": "Daenerys", "house": "Targaryen", "last_name": "Targaryen", "age": 16} 59 | 60 | {"first_name": "Tyrion", "house": "Lannister", "last_name": "Lannister", "age": 27} 61 | {"first_name": "Jaime", "house": "Lannister", "last_name": "Lannister", "age": 34} 62 | {"first_name": "Cersei", "house": "Lannister", "last_name": "Lannister", "age": 34} 63 | 64 | {} 65 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_load_resource_required: -------------------------------------------------------------------------------- 1 | load_resource 2 | -- 3 | { 4 | "resource": "foobar", 5 | "url": "foo/bar/datapackage.json", 6 | "required": false 7 | } 8 | -- 9 | { 10 | "name": "test", 11 | "resources": [] 12 | } 13 | -- 14 | -- 15 | { 16 | "name": "test", 17 | "resources": [] 18 | } 19 | -- 20 | {} 21 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_load_resource_resources: -------------------------------------------------------------------------------- 1 | load_resource 2 | -- 3 | { 4 | "resources": { 5 | "my-spiffy-resource": {}, 6 | "the-spiffy-resource": { 7 | "name": "renamed-spiffy-resource", 8 | "path": "renamed-spiffy-resource.csv" 9 | } 10 | }, 11 | "url": "tests/data/datapackage2.json" 12 | } 13 | -- 14 | { 15 | "name": "test", 16 | "resources": [] 17 | } 18 | -- 19 | -- 20 | { 21 | "name": "test", 22 | "resources": [ 23 | { 24 | "dpp:streamedFrom": "%(base)s/tests/data/sample.csv", 25 | "name": "my-spiffy-resource", 26 | "dpp:streaming": true, 27 | "profile": "data-resource", 28 | "path": "sample.csv", 29 | "schema": { 30 | "fields": [ 31 | {"name": "first_name", "type": "string"}, 32 | {"name": "last_name", "type": "string"}, 33 | {"name": "house", "type": "string"}, 34 | {"name": "age", "type": "integer"} 35 | ] 36 | } 37 | }, 38 | { 39 | "dpp:streamedFrom": "%(base)s/tests/data/sample.csv", 40 | "name": "renamed-spiffy-resource", 41 | "dpp:streaming": true, 42 | "profile": "data-resource", 43 | "path": "renamed-spiffy-resource.csv", 44 | "schema": { 45 | "fields": [ 46 | {"name": "first_name", "type": "string"}, 47 | {"name": "last_name", "type": "string"}, 48 | {"name": "house", 
"type": "string"}, 49 | {"name": "age", "type": "integer"} 50 | ] 51 | } 52 | } 53 | ] 54 | } 55 | -- 56 | {"first_name": "Tyrion", "house": "Lannister", "last_name": "Lannister", "age": 27} 57 | {"first_name": "Jaime", "house": "Lannister", "last_name": "Lannister", "age": 34} 58 | {"first_name": "Cersei", "house": "Lannister", "last_name": "Lannister", "age": 34} 59 | {"first_name": "Jon", "house": "Stark", "last_name": "Snow", "age": 17} 60 | {"first_name": "Sansa", "house": "Stark", "last_name": "Stark", "age": 14} 61 | {"first_name": "Arya", "house": "Stark", "last_name": "Stark", "age": 11} 62 | {"first_name": "Bran", "house": "Stark", "last_name": "Stark", "age": 10} 63 | {"first_name": "Rickon", "house": "Stark", "last_name": "Stark", "age": 5} 64 | {"first_name": "Daenerys", "house": "Targaryen", "last_name": "Targaryen", "age": 16} 65 | 66 | {"first_name": "Tyrion", "house": "Lannister", "last_name": "Lannister", "age": 27} 67 | {"first_name": "Jaime", "house": "Lannister", "last_name": "Lannister", "age": 34} 68 | {"first_name": "Cersei", "house": "Lannister", "last_name": "Lannister", "age": 34} 69 | {"first_name": "Jon", "house": "Stark", "last_name": "Snow", "age": 17} 70 | {"first_name": "Sansa", "house": "Stark", "last_name": "Stark", "age": 14} 71 | {"first_name": "Arya", "house": "Stark", "last_name": "Stark", "age": 11} 72 | {"first_name": "Bran", "house": "Stark", "last_name": "Stark", "age": 10} 73 | {"first_name": "Rickon", "house": "Stark", "last_name": "Stark", "age": 5} 74 | {"first_name": "Daenerys", "house": "Targaryen", "last_name": "Targaryen", "age": 16} 75 | 76 | {} 77 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_load_resource_resources_required: -------------------------------------------------------------------------------- 1 | load_resource 2 | -- 3 | { 4 | "resources": { 5 | "my-spiffy-resource": {}, 6 | "nonexistent-spiffy-resource": { 7 | "name": "renamed-spiffy-resource", 8 | "path": "renamed-spiffy-resource.csv" 9 | } 10 | }, 11 | "url": "tests/data/datapackage2.json", 12 | "required": false 13 | } 14 | -- 15 | { 16 | "name": "test", 17 | "resources": [] 18 | } 19 | -- 20 | -- 21 | { 22 | "name": "test", 23 | "resources": [ 24 | { 25 | "dpp:streamedFrom": "%(base)s/tests/data/sample.csv", 26 | "name": "my-spiffy-resource", 27 | "dpp:streaming": true, 28 | "profile": "data-resource", 29 | "path": "sample.csv", 30 | "schema": { 31 | "fields": [ 32 | {"name": "first_name", "type": "string"}, 33 | {"name": "last_name", "type": "string"}, 34 | {"name": "house", "type": "string"}, 35 | {"name": "age", "type": "integer"} 36 | ] 37 | } 38 | } 39 | ] 40 | } 41 | -- 42 | {"first_name": "Tyrion", "house": "Lannister", "last_name": "Lannister", "age": 27} 43 | {"first_name": "Jaime", "house": "Lannister", "last_name": "Lannister", "age": 34} 44 | {"first_name": "Cersei", "house": "Lannister", "last_name": "Lannister", "age": 34} 45 | {"first_name": "Jon", "house": "Stark", "last_name": "Snow", "age": 17} 46 | {"first_name": "Sansa", "house": "Stark", "last_name": "Stark", "age": 14} 47 | {"first_name": "Arya", "house": "Stark", "last_name": "Stark", "age": 11} 48 | {"first_name": "Bran", "house": "Stark", "last_name": "Stark", "age": 10} 49 | {"first_name": "Rickon", "house": "Stark", "last_name": "Stark", "age": 5} 50 | {"first_name": "Daenerys", "house": "Targaryen", "last_name": "Targaryen", "age": 16} 51 | 52 | {} 53 | 
-------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_resource_duplication: -------------------------------------------------------------------------------- 1 | duplicate 2 | -- 3 | { 4 | "source": "original", 5 | "target-name": "the-dup", 6 | "target-path": "the-dup.csv" 7 | } 8 | -- 9 | { 10 | "name": "test", 11 | "resources": [ 12 | { 13 | "name": "original", 14 | "dpp:streaming": true, 15 | "path": "data.csv", 16 | "schema": { "fields": [ 17 | {"name": "year", "type": "integer"}, 18 | {"name": "data", "type": "string"} 19 | ]} 20 | } 21 | ] 22 | } 23 | -- 24 | {"year":"2016","data":"foo","i":0} 25 | {"year":"2017","data":"baz","i":1} 26 | {"year":"2017","data":"bax","i":2} 27 | {"year":"2015","data":"","i":3} 28 | {"year":"2015","data":"","i":4} 29 | {"year":"2015","data":"","i":5} 30 | {"year":"2015","data":"","i":6} 31 | {"year":"2015","data":"","i":7} 32 | {"year":"2015","data":"","i":8} 33 | {"year":"2015","data":"","i":9} 34 | {"year":"2015","data":"","i":10} 35 | {"year":"2015","data":"","i":11} 36 | -- 37 | { 38 | "name": "test", 39 | "profile": "data-package", 40 | "resources": [ 41 | { 42 | "name": "original", 43 | "dpp:streaming": true, 44 | "path": "data.csv", 45 | "profile": "data-resource", 46 | "schema": { "fields": [ 47 | {"name": "year", "type": "integer"}, 48 | {"name": "data", "type": "string"} 49 | ]} 50 | }, 51 | { 52 | "name": "the-dup", 53 | "dpp:streaming": true, 54 | "path": "the-dup.csv", 55 | "profile": "data-resource", 56 | "schema": { "fields": [ 57 | {"name": "year", "type": "integer"}, 58 | {"name": "data", "type": "string"} 59 | ]} 60 | } 61 | ] 62 | } 63 | -- 64 | {"year":"2016","data":"foo","i":0} 65 | {"year":"2017","data":"baz","i":1} 66 | {"year":"2017","data":"bax","i":2} 67 | {"year":"2015","data":"","i":3} 68 | {"year":"2015","data":"","i":4} 69 | {"year":"2015","data":"","i":5} 70 | {"year":"2015","data":"","i":6} 71 | {"year":"2015","data":"","i":7} 72 | {"year":"2015","data":"","i":8} 73 | {"year":"2015","data":"","i":9} 74 | {"year":"2015","data":"","i":10} 75 | {"year":"2015","data":"","i":11} 76 | 77 | {"year":"2016","data":"foo","i":0} 78 | {"year":"2017","data":"baz","i":1} 79 | {"year":"2017","data":"bax","i":2} 80 | {"year":"2015","data":"","i":3} 81 | {"year":"2015","data":"","i":4} 82 | {"year":"2015","data":"","i":5} 83 | {"year":"2015","data":"","i":6} 84 | {"year":"2015","data":"","i":7} 85 | {"year":"2015","data":"","i":8} 86 | {"year":"2015","data":"","i":9} 87 | {"year":"2015","data":"","i":10} 88 | {"year":"2015","data":"","i":11} 89 | 90 | {} 91 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_set_types: -------------------------------------------------------------------------------- 1 | set_types 2 | -- 3 | { 4 | "types": { 5 | "t1": {"type": "number", "groupChar": ","}, 6 | "t2": null 7 | } 8 | } 9 | -- 10 | { 11 | "name": "test", 12 | "resources": [ 13 | { 14 | "name": "concat-a", 15 | "dpp:streaming": true, 16 | "path": "concat-a.csv", 17 | "schema": { "fields": [ 18 | {"name": "t1", "type": "string"}, 19 | {"name": "t2", "type": "string"} 20 | ]} 21 | } 22 | ] 23 | } 24 | -- 25 | {"t1": "123,456", "t2": "to-remove"} 26 | {"t1": "456,123", "t2": "to-remove"} 27 | -- 28 | { 29 | "name": "test", 30 | "profile": "data-package", 31 | "resources": [ 32 | { 33 | "name": "concat-a", 34 | "dpp:streaming": true, 35 | "path": "concat-a.csv", 36 | "profile": "data-resource", 37 | "schema": { "fields": [ 38 | 
{"name": "t1", "type": "number", "groupChar": ","} 39 | ]} 40 | } 41 | ] 42 | } 43 | -- 44 | {"t1": {"type{decimal}": "123456"}} 45 | {"t1": {"type{decimal}": "456123"}} 46 | 47 | {} 48 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_sort: -------------------------------------------------------------------------------- 1 | sort 2 | -- 3 | { 4 | "resources": ["concat-a1", "concat-a2"], 5 | "sort-by": "{a3} {a2} {a1}" 6 | } 7 | -- 8 | { 9 | "name": "test", 10 | "resources": [ 11 | { 12 | "name": "concat-a1", 13 | "dpp:streaming": true, 14 | "path": "concat-a1.csv", 15 | "schema": { "fields": [ 16 | {"name": "a1", "type": "string"}, 17 | {"name": "a2", "type": "string"}, 18 | {"name": "a3", "type": "string"} 19 | ]} 20 | }, 21 | { 22 | "name": "concat-a2", 23 | "dpp:streaming": true, 24 | "path": "concat-a2.csv", 25 | "schema": { "fields": [ 26 | {"name": "a1", "type": "string"}, 27 | {"name": "a2", "type": "string"}, 28 | {"name": "a3", "type": "string"} 29 | ]} 30 | }, 31 | { 32 | "name": "concat-c", 33 | "dpp:streaming": true, 34 | "path": "concat-c.csv", 35 | "schema": { "fields": [ 36 | {"name": "c1", "type": "string"}, 37 | {"name": "c2", "type": "string"}, 38 | {"name": "c3", "type": "string"} 39 | ]} 40 | } 41 | ] 42 | } 43 | -- 44 | {"a1":"a1","a2":"a1","a3":"a2"} 45 | {"a1":"a2","a2":"a1","a3":"a1"} 46 | {"a1":"a3","a2":"a2","a3":"a2"} 47 | {"a1":"a4","a2":"a2","a3":"a1"} 48 | 49 | {"a1":"a1","a2":"a3","a3":"a2"} 50 | {"a1":"a2","a2":"a3","a3":"a1"} 51 | {"a1":"a3","a2":"a4","a3":"a2"} 52 | {"a1":"a4","a2":"a4","a3":"a1"} 53 | 54 | {"c1":"c13","c2":"c23","c3":"c33"} 55 | {"c1":"c12","c2":"c22","c3":"c32"} 56 | {"c1":"c11","c2":"c21","c3":"c31"} 57 | -- 58 | { 59 | "name": "test", 60 | "profile": "data-package", 61 | "resources": [ 62 | { 63 | "name": "concat-a1", 64 | "dpp:streaming": true, 65 | "path": "concat-a1.csv", 66 | "profile": "data-resource", 67 | "schema": { "fields": [ 68 | {"name": "a1", "type": "string"}, 69 | {"name": "a2", "type": "string"}, 70 | {"name": "a3", "type": "string"} 71 | ]} 72 | }, 73 | { 74 | "name": "concat-a2", 75 | "dpp:streaming": true, 76 | "path": "concat-a2.csv", 77 | "profile": "data-resource", 78 | "schema": { "fields": [ 79 | {"name": "a1", "type": "string"}, 80 | {"name": "a2", "type": "string"}, 81 | {"name": "a3", "type": "string"} 82 | ]} 83 | }, 84 | { 85 | "name": "concat-c", 86 | "dpp:streaming": true, 87 | "path": "concat-c.csv", 88 | "profile": "data-resource", 89 | "schema": { "fields": [ 90 | {"name": "c1", "type": "string"}, 91 | {"name": "c2", "type": "string"}, 92 | {"name": "c3", "type": "string"} 93 | ]} 94 | } 95 | ] 96 | } 97 | -- 98 | {"a1":"a2","a2":"a1","a3":"a1"} 99 | {"a1":"a4","a2":"a2","a3":"a1"} 100 | {"a1":"a1","a2":"a1","a3":"a2"} 101 | {"a1":"a3","a2":"a2","a3":"a2"} 102 | 103 | {"a1":"a2","a2":"a3","a3":"a1"} 104 | {"a1":"a4","a2":"a4","a3":"a1"} 105 | {"a1":"a1","a2":"a3","a3":"a2"} 106 | {"a1":"a3","a2":"a4","a3":"a2"} 107 | 108 | {"c1":"c13","c2":"c23","c3":"c33"} 109 | {"c1":"c12","c2":"c22","c3":"c32"} 110 | {"c1":"c11","c2":"c21","c3":"c31"} 111 | 112 | {} 113 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_stream_remote_resources: -------------------------------------------------------------------------------- 1 | stream_remote_resources 2 | -- 3 | { 4 | } 5 | -- 6 | { 7 | "name": "test", 8 | "resources": [ 9 | { 10 | "name": "my-remote-resource", 11 | "dpp:streamedFrom": 
"file://tests/data/sample.csv", 12 | "path": "_" 13 | } 14 | ] 15 | } 16 | -- 17 | -- 18 | { 19 | "name": "test", 20 | "resources": [ 21 | { 22 | "name": "my-remote-resource", 23 | "path": "data/my-remote-resource.csv", 24 | "dpp:streamedFrom": "file://tests/data/sample.csv", 25 | "dpp:streaming": true, 26 | "schema": { 27 | "fields": [ 28 | {"name": "first_name", "type": "string"}, 29 | {"name": "last_name", "type": "string"}, 30 | {"name": "house", "type": "string"}, 31 | {"name": "age", "type": "string"} 32 | ] 33 | } 34 | } 35 | ] 36 | } 37 | -- 38 | {"first_name": "Tyrion", "house": "Lannister", "last_name": "Lannister", "age": "27"} 39 | {"first_name": "Jaime", "house": "Lannister", "last_name": "Lannister", "age": "34"} 40 | {"first_name": "Cersei", "house": "Lannister", "last_name": "Lannister", "age": "34"} 41 | {"first_name": "Jon", "house": "Stark", "last_name": "Snow", "age": "17"} 42 | {"first_name": "Sansa", "house": "Stark", "last_name": "Stark", "age": "14"} 43 | {"first_name": "Arya", "house": "Stark", "last_name": "Stark", "age": "11"} 44 | {"first_name": "Bran", "house": "Stark", "last_name": "Stark", "age": "10"} 45 | {"first_name": "Rickon", "house": "Stark", "last_name": "Stark", "age": "5"} 46 | {"first_name": "Daenerys", "house": "Targaryen", "last_name": "Targaryen", "age": "16"} 47 | 48 | {} 49 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_stream_remote_resources_limit_rows: -------------------------------------------------------------------------------- 1 | stream_remote_resources 2 | -- 3 | { 4 | "limit-rows": 5 5 | } 6 | -- 7 | { 8 | "name": "test", 9 | "resources": [ 10 | { 11 | "name": "my-remote-resource", 12 | "dpp:streamedFrom": "file://tests/data/sample.csv", 13 | "path": "_" 14 | } 15 | ] 16 | } 17 | -- 18 | -- 19 | { 20 | "name": "test", 21 | "resources": [ 22 | { 23 | "name": "my-remote-resource", 24 | "path": "data/my-remote-resource.csv", 25 | "dpp:streamedFrom": "file://tests/data/sample.csv", 26 | "dpp:streaming": true, 27 | "schema": { 28 | "fields": [ 29 | {"name": "first_name", "type": "string"}, 30 | {"name": "last_name", "type": "string"}, 31 | {"name": "house", "type": "string"}, 32 | {"name": "age", "type": "string"} 33 | ] 34 | } 35 | } 36 | ] 37 | } 38 | -- 39 | {"first_name": "Tyrion", "house": "Lannister", "last_name": "Lannister", "age": "27"} 40 | {"first_name": "Jaime", "house": "Lannister", "last_name": "Lannister", "age": "34"} 41 | {"first_name": "Cersei", "house": "Lannister", "last_name": "Lannister", "age": "34"} 42 | {"first_name": "Jon", "house": "Stark", "last_name": "Snow", "age": "17"} 43 | {"first_name": "Sansa", "house": "Stark", "last_name": "Stark", "age": "14"} 44 | 45 | {} 46 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_stream_remote_resources_zip: -------------------------------------------------------------------------------- 1 | stream_remote_resources 2 | -- 3 | { 4 | } 5 | -- 6 | { 7 | "name": "test", 8 | "resources": [ 9 | { 10 | "name": "my-remote-resource", 11 | "dpp:streamedFrom": "file://tests/data/sample.zip", 12 | "path": "_", 13 | "compression": "zip", 14 | "format": "csv" 15 | } 16 | ] 17 | } 18 | -- 19 | -- 20 | { 21 | "name": "test", 22 | "resources": [ 23 | { 24 | "name": "my-remote-resource", 25 | "path": "data/my-remote-resource.csv", 26 | "dpp:streamedFrom": "file://tests/data/sample.zip", 27 | "dpp:streaming": true, 28 | "compression": "zip", 29 | "format": "csv", 
30 | "schema": { 31 | "fields": [ 32 | {"name": "first_name", "type": "string"}, 33 | {"name": "last_name", "type": "string"}, 34 | {"name": "house", "type": "string"}, 35 | {"name": "age", "type": "string"} 36 | ] 37 | } 38 | } 39 | ] 40 | } 41 | -- 42 | {"first_name": "Tyrion", "house": "Lannister", "last_name": "Lannister", "age": "27"} 43 | {"first_name": "Jaime", "house": "Lannister", "last_name": "Lannister", "age": "34"} 44 | {"first_name": "Cersei", "house": "Lannister", "last_name": "Lannister", "age": "34"} 45 | {"first_name": "Jon", "house": "Stark", "last_name": "Snow", "age": "17"} 46 | {"first_name": "Sansa", "house": "Stark", "last_name": "Stark", "age": "14"} 47 | {"first_name": "Arya", "house": "Stark", "last_name": "Stark", "age": "11"} 48 | {"first_name": "Bran", "house": "Stark", "last_name": "Stark", "age": "10"} 49 | {"first_name": "Rickon", "house": "Stark", "last_name": "Stark", "age": "5"} 50 | {"first_name": "Daenerys", "house": "Targaryen", "last_name": "Targaryen", "age": "16"} 51 | 52 | {} 53 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_unpivot: -------------------------------------------------------------------------------- 1 | unpivot 2 | -- 3 | { 4 | "resources": "balance", 5 | "extraKeyFields": [ 6 | {"name": "year", "type": "integer"}, 7 | {"name": "direction", "type": "string", "constraints": {"enum": ["In", "Out"]}} 8 | ], 9 | "extraValueField": { 10 | "name": "amount", 11 | "type": "number" 12 | }, 13 | "unpivot": [ 14 | { 15 | "name": "2015_incomes", 16 | "keys": { 17 | "year": 2015, 18 | "direction": "In" 19 | } 20 | }, 21 | { 22 | "name": "2015_expenses", 23 | "keys": { 24 | "year": 2015, 25 | "direction": "Out" 26 | } 27 | }, 28 | { 29 | "name": "2016_incomes", 30 | "keys": { 31 | "year": 2016, 32 | "direction": "In" 33 | } 34 | }, 35 | { 36 | "name": "2016_expenses", 37 | "keys": { 38 | "year": 2016, 39 | "direction": "Out" 40 | } 41 | }, 42 | { 43 | "name": "([0-9]{4}) (\\w+)", 44 | "keys": { 45 | "year": "\\1", 46 | "direction": "\\2" 47 | } 48 | } 49 | ] 50 | } 51 | -- 52 | { 53 | "name": "test", 54 | "resources": [ 55 | { 56 | "name": "balance", 57 | "dpp:streaming": true, 58 | "path": "balance.csv", 59 | "schema": { "fields": [ 60 | {"name": "company", "type": "string"}, 61 | {"name": "2015_incomes", "type": "number"}, 62 | {"name": "2015_expenses", "type": "number"}, 63 | {"name": "2016_incomes", "type": "number"}, 64 | {"name": "2016_expenses", "type": "number"}, 65 | {"name": "2017 In", "type": "number"}, 66 | {"name": "2017 Out", "type": "number"} 67 | ]} 68 | } 69 | ] 70 | } 71 | -- 72 | {"company": "his-company", "2015_incomes": 100000, "2015_expenses": 80000, "2016_incomes": 150000, "2016_expenses": 120000, "2017 In": 100000, "2017 Out": 120000} 73 | {"company": "her-company", "2015_incomes": 150000, "2015_expenses": 160000, "2016_incomes": 300000, "2016_expenses": 200000, "2017 In": 100000, "2017 Out": 120000} 74 | -- 75 | { 76 | "name": "test", 77 | "profile": "data-package", 78 | "resources": [ 79 | { 80 | "name": "balance", 81 | "dpp:streaming": true, 82 | "path": "balance.csv", 83 | "profile": "data-resource", 84 | "schema": { "fields": [ 85 | {"name": "company", "type": "string"}, 86 | {"name": "year", "type": "integer"}, 87 | {"name": "direction", "type": "string", "constraints": {"enum": ["In", "Out"]}}, 88 | {"name": "amount", "type": "number"} 89 | ]} 90 | } 91 | ] 92 | } 93 | -- 94 | {"company": "his-company", "year": 2015, "direction": "In", "amount": 
100000} 95 | {"company": "his-company", "year": 2015, "direction": "Out", "amount": 80000} 96 | {"company": "his-company", "year": 2016, "direction": "In", "amount": 150000} 97 | {"company": "his-company", "year": 2016, "direction": "Out", "amount": 120000} 98 | {"company": "his-company", "year": "2017", "direction": "In", "amount": 100000} 99 | {"company": "his-company", "year": "2017", "direction": "Out", "amount": 120000} 100 | {"company": "her-company", "year": 2015, "direction": "In", "amount": 150000} 101 | {"company": "her-company", "year": 2015, "direction": "Out", "amount": 160000} 102 | {"company": "her-company", "year": 2016, "direction": "In", "amount": 300000} 103 | {"company": "her-company", "year": 2016, "direction": "Out", "amount": 200000} 104 | {"company": "her-company", "year": "2017", "direction": "In", "amount": 100000} 105 | {"company": "her-company", "year": "2017", "direction": "Out", "amount": 120000} 106 | 107 | {} 108 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_update_package: -------------------------------------------------------------------------------- 1 | update_package 2 | -- 3 | { 4 | "title": "Moshe", 5 | "sources": { 6 | "web": "http://google.com" 7 | } 8 | } 9 | -- 10 | { 11 | "name": "test", 12 | "resources": [] 13 | } 14 | -- 15 | -- 16 | { 17 | "name": "test", 18 | "title": "Moshe", 19 | "profile": "data-package", 20 | "sources": { 21 | "web": "http://google.com" 22 | }, 23 | "resources": [] 24 | } 25 | -- 26 | {} 27 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/simple_update_resource: -------------------------------------------------------------------------------- 1 | update_resource 2 | -- 3 | { 4 | "resources": ["name1"], 5 | "metadata": { 6 | "path": "path1-new" 7 | } 8 | } 9 | -- 10 | { 11 | "title": "Test", 12 | "resources": [ 13 | {"name": "name1", "path": "path1"}, 14 | {"name": "name2", "path": "path2"} 15 | ] 16 | } 17 | -- 18 | -- 19 | { 20 | "title": "Test", 21 | "resources": [ 22 | {"name": "name1", "path": "path1-new", "profile": "data-resource"}, 23 | {"name": "name2", "path": "path2", "profile": "data-resource"} 24 | ], 25 | "profile": "data-package" 26 | } 27 | -- 28 | {} 29 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/sort_with_duplicate_keys: -------------------------------------------------------------------------------- 1 | sort 2 | -- 3 | { 4 | "resources": ["data"], 5 | "sort-by": "{year}" 6 | } 7 | -- 8 | { 9 | "name": "test", 10 | "resources": [ 11 | { 12 | "name": "data", 13 | "dpp:streaming": true, 14 | "path": "data.csv", 15 | "schema": { "fields": [ 16 | {"name": "year", "type": "integer"}, 17 | {"name": "data", "type": "string"} 18 | ]} 19 | } 20 | ] 21 | } 22 | -- 23 | {"year":"2016","data":"foo","i":0} 24 | {"year":"2017","data":"baz","i":1} 25 | {"year":"2017","data":"bax","i":2} 26 | {"year":"2015","data":"","i":3} 27 | {"year":"2015","data":"","i":4} 28 | {"year":"2015","data":"","i":5} 29 | {"year":"2015","data":"","i":6} 30 | {"year":"2015","data":"","i":7} 31 | {"year":"2015","data":"","i":8} 32 | {"year":"2015","data":"","i":9} 33 | {"year":"2015","data":"","i":10} 34 | {"year":"2015","data":"","i":11} 35 | -- 36 | { 37 | "name": "test", 38 | "profile": "data-package", 39 | "resources": [ 40 | { 41 | "name": "data", 42 | "dpp:streaming": true, 43 | "path": "data.csv", 44 | "profile": "data-resource", 45 | "schema": { 
"fields": [ 46 | {"name": "year", "type": "integer"}, 47 | {"name": "data", "type": "string"} 48 | ]} 49 | } 50 | ] 51 | } 52 | -- 53 | {"year":"2015","data":"","i":3} 54 | {"year":"2015","data":"","i":4} 55 | {"year":"2015","data":"","i":5} 56 | {"year":"2015","data":"","i":6} 57 | {"year":"2015","data":"","i":7} 58 | {"year":"2015","data":"","i":8} 59 | {"year":"2015","data":"","i":9} 60 | {"year":"2015","data":"","i":10} 61 | {"year":"2015","data":"","i":11} 62 | {"year":"2016","data":"foo","i":0} 63 | {"year":"2017","data":"baz","i":1} 64 | {"year":"2017","data":"bax","i":2} 65 | 66 | {} 67 | -------------------------------------------------------------------------------- /tests/stdlib/fixtures/stream_remote_resources_txt_format: -------------------------------------------------------------------------------- 1 | stream_remote_resources 2 | -- 3 | {} 4 | -- 5 | { 6 | "name": "test-stream-remote-resources-txt-format", 7 | "resources": [ 8 | { 9 | "name": "my-remote-txt-format-resource", 10 | "dpp:streamedFrom": "file://tests/data/sample.txt", 11 | "path": "_", 12 | "format": "txt" 13 | } 14 | ] 15 | } 16 | -- 17 | -- 18 | { 19 | "name": "test-stream-remote-resources-txt-format", 20 | "resources": [ 21 | { 22 | "name": "my-remote-txt-format-resource", 23 | "dpp:streamedFrom": "file://tests/data/sample.txt", 24 | "dpp:streaming": true, 25 | "path": "data/my-remote-txt-format-resource.csv", 26 | "format": "txt", 27 | "schema": { 28 | "fields": [ 29 | {"name": "data", "type": "string"} 30 | ] 31 | } 32 | } 33 | ] 34 | } 35 | -- 36 | {"data": "<<< tabulator has html decection, keeping that causes the failure which we want to test"} 37 | {"data": "This is a plain text file - not a CSV file!"} 38 | {"data": "testing"} 39 | {"data": "one two three"} 40 | 41 | {} 42 | -------------------------------------------------------------------------------- /tests/stdlib/test_stdlib.py: -------------------------------------------------------------------------------- 1 | import os, logging 2 | from datapackage_pipelines.utilities.lib_test_helpers import ProcessorFixtureTestsBase 3 | from sqlalchemy.orm import sessionmaker 4 | from sqlalchemy import create_engine, text 5 | 6 | ROOT_PATH = os.path.join(os.path.dirname(__file__), '..', '..') 7 | ENV = os.environ.copy() 8 | ENV['PYTHONPATH'] = ROOT_PATH 9 | 10 | ENV['EXISTENT_ENV'] = 'tests/data/sample.csv' 11 | 12 | DEFAULT_TEST_DB = "sqlite://" 13 | ENV['DPP_DB_ENGINE'] = os.environ.get("OVERRIDE_TEST_DB", DEFAULT_TEST_DB) 14 | 15 | 16 | class StdlibfixtureTests(ProcessorFixtureTestsBase): 17 | 18 | def _get_procesor_env(self, filename): 19 | if ENV['DPP_DB_ENGINE'] != DEFAULT_TEST_DB: 20 | engine = create_engine(ENV['DPP_DB_ENGINE']) 21 | conn = engine.connect() 22 | conn.execute(text("DROP TABLE IF EXISTS test;")) 23 | if filename == "dump_to_sql_update_mode__update": 24 | engine = create_engine(ENV['DPP_DB_ENGINE']) 25 | conn = engine.connect() 26 | conn.execute(text(""" 27 | CREATE TABLE test ( 28 | id integer not null primary key, 29 | mystring text, 30 | mynumber double precision, 31 | mydate date 32 | ) 33 | """)) 34 | conn.execute(text(""" 35 | INSERT INTO test VALUES (1, 'foo', 5.6, null); 36 | """)) 37 | return ENV 38 | 39 | def _get_processor_file(self, processor): 40 | processor = processor.replace('.', '/') 41 | return os.path.join(ROOT_PATH, 'datapackage_pipelines', 'lib', processor.strip() + '.py') 42 | 43 | 44 | for filename, _func in StdlibfixtureTests(os.path.join(os.path.dirname(__file__), 'fixtures')).get_tests(): 45 | 
globals()['test_stdlib_%s' % filename] = _func 46 | 47 | -------------------------------------------------------------------------------- /tests/test_main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import threading 4 | import time 5 | from http.server import HTTPServer, BaseHTTPRequestHandler 6 | 7 | from datapackage_pipelines.manager import execute_pipeline, run_pipelines 8 | from datapackage_pipelines.specs.specs import pipelines 9 | from datapackage_pipelines.utilities.execution_id import gen_execution_id 10 | from datapackage_pipelines.status import status_mgr 11 | 12 | 13 | called_hooks = [] 14 | progresses = 0 15 | status = status_mgr() 16 | 17 | class SaveHooks(BaseHTTPRequestHandler): 18 | 19 | def do_POST(self): 20 | global progresses 21 | content_len = int(self.headers.get('content-length', 0)) 22 | post_body = self.rfile.read(content_len) 23 | hook = json.loads(post_body) 24 | if hook['event'] != 'progress': 25 | called_hooks.append(hook) 26 | else: 27 | progresses += 1 28 | self.send_response(200) 29 | self.end_headers() 30 | return 31 | 32 | 33 | def test_pipeline(): 34 | '''Tests a few pipelines.''' 35 | global progresses 36 | 37 | server = HTTPServer(('', 9000), SaveHooks) 38 | thread = threading.Thread(target = server.serve_forever, daemon=True) 39 | thread.start() 40 | 41 | results = run_pipelines('./tests/env/dummy/pipeline-test%', '.', 42 | use_cache=False, 43 | dirty=False, 44 | force=False, 45 | concurrency=1, 46 | verbose_logs=True) 47 | failed_results = list(filter(lambda r: not r.success, results)) 48 | assert len(failed_results) == 0, "Failed results: {}".format(["{} {}".format(result.pipeline_id, ", ".join(result.errors)) 49 | for result in failed_results]) 50 | assert len(called_hooks) == 3 51 | assert called_hooks == [ 52 | {"pipeline_id": "./tests/env/dummy/pipeline-test-hooks", "event": "queue"}, 53 | {"pipeline_id": "./tests/env/dummy/pipeline-test-hooks", "event": "start"}, 54 | {"pipeline_id": "./tests/env/dummy/pipeline-test-hooks", "event": "finish", "success": True, 55 | 'stats': {'.dpp': {'out-datapackage-url': 'hooks-outputs/datapackage.json'}, 56 | 'bytes': 15787, 'count_of_rows': 40, 57 | 'dataset_name': 'hook-tests', 'hash': '9fc202087094c7becf98228a1327b21c'}} 58 | ] 59 | assert progresses >= 1 -------------------------------------------------------------------------------- /tests/wrapper/test_wrapper.py: -------------------------------------------------------------------------------- 1 | import unittest.mock as mock 2 | from datapackage_pipelines.wrapper import spew 3 | 4 | 5 | class TestWrapper(object): 6 | def test_spew_finalizer_runs_before_we_signal_that_were_done(self): 7 | '''Assert that the finalizer param is executed before spew is finished. 8 | 9 | We signal to other processors that we're done by writing an empty line 10 | to STDOUT. The finalizer parameter to spew() must be executed before that, 11 | as there can be processors that depend on us finishing our processing 12 | before they're able to run. For example, a processor that depends on 13 | `dump_to_zip` must wait until it has finished writing to the local 14 | filesystem. 
15 | ''' 16 | datapackage = {} 17 | resources_iterator = iter([]) 18 | 19 | with mock.patch('datapackage_pipelines.wrapper.wrapper.stdout') as stdout_mock: 20 | def finalizer(): 21 | last_call_args = stdout_mock.write.call_args_list[-1] 22 | assert last_call_args != mock.call('\n') 23 | 24 | spew(datapackage, resources_iterator, finalizer=finalizer) 25 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | package=datapackage_pipelines 3 | skip_missing_interpreters=true 4 | envlist= 5 | py3{7,8,9}-{sqlite,plyvel} 6 | 7 | [testenv] 8 | deps= 9 | mock 10 | pytest 11 | pytest-cov 12 | coverage 13 | pyyaml 14 | py37-plyvel: plyvel 15 | py38-plyvel: plyvel 16 | py39-plyvel: plyvel 17 | passenv= 18 | PWD 19 | CI 20 | TRAVIS 21 | TRAVIS_JOB_ID 22 | TRAVIS_BRANCH 23 | commands= 24 | cp tests/sitecustomize.py {envsitepackagesdir} 25 | py.test -s \ 26 | --cov {[tox]package} \ 27 | --cov-config tox.ini \ 28 | --cov-report term-missing \ 29 | {posargs} 30 | allowlist_externals= 31 | cp 32 | [pytest] 33 | # pytest.ini configuration here 34 | testpaths = tests 35 | 36 | [report] 37 | # .coveragerc configuration here 38 | 39 | [run] 40 | omit= 41 | .tox/* 42 | parallel=True 43 | --------------------------------------------------------------------------------
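For reference, the finalizer behaviour exercised by tests/wrapper/test_wrapper.py above corresponds to the following minimal processor sketch. This is an illustrative, assumed example rather than a file in this repository; it uses only the public ingest()/spew() helpers from datapackage_pipelines.wrapper, and the no-op finalizer shown here is hypothetical.

from datapackage_pipelines.wrapper import ingest, spew

# Read the parameters, datapackage and row iterators handed over by the
# previous processor in the pipeline (delivered over STDIN).
parameters, datapackage, resource_iterator = ingest()

def finalizer():
    # Invoked by spew() after all rows have been written to STDOUT but
    # before the empty line that signals completion, so any side effects
    # (e.g. files written to disk) are finished before dependent
    # processors start running.
    pass

# Stream the (unmodified) datapackage and rows onward; spew() calls the
# finalizer prior to emitting the final completion marker.
spew(datapackage, resource_iterator, finalizer=finalizer)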