├── .codeclimate.yml ├── .github ├── pull_request_template.md └── workflows │ ├── prepare-release.yml │ ├── publish.yml │ └── python-package.yml ├── .gitignore ├── .isort.cfg ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── .vscode └── settings.json ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── data_pipelines_cli ├── __init__.py ├── __main__.py ├── airbyte_utils.py ├── bi_utils.py ├── cli.py ├── cli_commands │ ├── __init__.py │ ├── clean.py │ ├── compile.py │ ├── create.py │ ├── deploy.py │ ├── docs.py │ ├── generate │ │ ├── __init__.py │ │ ├── databricks_job.py │ │ ├── generate.py │ │ ├── model_yaml.py │ │ ├── source_sql.py │ │ ├── source_yaml.py │ │ └── utils.py │ ├── init.py │ ├── prepare_env.py │ ├── publish.py │ ├── run.py │ ├── seed.py │ ├── template.py │ ├── test.py │ └── update.py ├── cli_configs.py ├── cli_constants.py ├── cli_utils.py ├── config_generation.py ├── data_structures.py ├── dbt_utils.py ├── docker_response_reader.py ├── errors.py ├── filesystem_utils.py ├── io_utils.py ├── jinja.py ├── looker_utils.py └── vcs_utils.py ├── docs ├── Makefile ├── api.rst ├── changelog.rst ├── cli.rst ├── conf.py ├── configuration.rst ├── images │ ├── created.png │ ├── creating.png │ ├── init.png │ ├── integration.png │ ├── railsroad.png │ ├── run.png │ └── test.png ├── index.rst ├── installation.rst ├── integration.rst ├── setup_environment.rst ├── source │ ├── data_pipelines_cli.cli_commands.generate.rst │ ├── data_pipelines_cli.cli_commands.rst │ └── data_pipelines_cli.rst └── usage.rst ├── pyproject.toml ├── requirements-dev.txt ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── cli_commands │ ├── __init__.py │ ├── test_clean.py │ ├── test_compile.py │ ├── test_create.py │ ├── test_deploy.py │ ├── test_generate.py │ ├── test_init.py │ ├── test_prepare_env.py │ ├── test_publish.py │ ├── test_run_test.py │ ├── test_template.py │ └── test_update.py ├── goldens │ ├── config │ │ ├── airbyte │ │ │ └── airbyte.yml │ │ ├── base │ │ │ ├── airflow.yml │ │ │ ├── bi.yml │ │ │ ├── bigquery.yml │ │ │ ├── datahub.yml │ │ │ ├── dbt.yml │ │ │ ├── execution_env.yml │ │ │ ├── k8s.yml │ │ │ ├── looker.yml │ │ │ └── publish.yml │ │ ├── datahub │ │ │ └── dbt.yml │ │ ├── dev │ │ │ └── ingestion.yml │ │ ├── image_tag │ │ │ └── execution_env.yml │ │ ├── local │ │ │ ├── bi.yml │ │ │ ├── dbt.yml │ │ │ └── snowflake.yml │ │ ├── prod │ │ │ └── ingestion.yml │ │ └── staging │ │ │ ├── airflow.yml │ │ │ ├── bi.yml │ │ │ ├── bigquery.yml │ │ │ └── dbt.yml │ ├── config_template │ │ ├── copier.yml │ │ └── dp.yml.jinja │ ├── dag │ │ ├── a.txt │ │ └── b.txt │ ├── dbt_project.yml │ ├── example_config.yml │ ├── example_profiles │ │ ├── dev_bigquery.yml │ │ ├── local_snowflake.yml │ │ └── staging_bigquery.yml │ ├── lookml │ │ ├── model1.model.lkml │ │ └── view1.view.lkml │ ├── source_yaml.yml │ ├── target │ │ └── manifest.json │ ├── test_sync_2nd_directory │ │ ├── a │ │ │ └── b │ │ │ │ └── c │ │ │ │ └── xyz │ │ └── test2.txt │ └── test_sync_directory │ │ ├── a │ │ └── b │ │ │ └── c │ │ │ └── xyz │ │ ├── test1.txt │ │ └── test2.txt ├── manifest_generation_tutorial.md ├── test_airbyte_utils.py ├── test_bi_utils.py ├── test_cli_utils.py ├── test_config_generation.py ├── test_data_structures.py ├── test_dbt_utils.py ├── test_docker_response_reader.py ├── test_filesystem_utils.py ├── test_io_utils.py ├── test_looker_utils.py └── test_vcs_utils.py └── tox.ini /.codeclimate.yml: -------------------------------------------------------------------------------- 1 | version: "2" # 
required to adjust maintainability checks 2 | 3 | checks: 4 | argument-count: 5 | enabled: true 6 | config: 7 | threshold: 4 8 | complex-logic: 9 | enabled: true 10 | config: 11 | threshold: 4 12 | file-lines: 13 | enabled: true 14 | config: 15 | threshold: 250 16 | method-complexity: 17 | enabled: true 18 | config: 19 | threshold: 5 20 | method-count: 21 | enabled: true 22 | config: 23 | threshold: 20 24 | method-lines: 25 | enabled: true 26 | config: 27 | threshold: 25 28 | nested-control-flow: 29 | enabled: true 30 | config: 31 | threshold: 4 32 | return-statements: 33 | enabled: true 34 | config: 35 | threshold: 4 36 | similar-code: 37 | enabled: true 38 | config: 39 | threshold: #language-specific defaults. overrides affect all languages. 40 | identical-code: 41 | enabled: true 42 | config: 43 | threshold: #language-specific defaults. overrides affect all languages. 44 | 45 | plugins: 46 | pylint: 47 | enabled: true 48 | checks: 49 | import-error: 50 | enabled: false 51 | bad-continuation: 52 | enabled: false 53 | 54 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | `` 2 | 3 | Resolves `` 4 | 5 | --- 6 | Keep in mind: 7 | - [ ] Documentation updates 8 | - [ ] [Changelog](CHANGELOG.md) updates -------------------------------------------------------------------------------- /.github/workflows/prepare-release.yml: -------------------------------------------------------------------------------- 1 | name: Prepare release 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | version_part: 7 | description: The part of the version to update (patch, minor or major) 8 | required: true 9 | default: 'minor' 10 | 11 | jobs: 12 | prepare-release: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | python-version: [3.9] 17 | env: 18 | PYTHON_PACKAGE: data_pipelines_cli 19 | steps: 20 | - uses: actions/checkout@v2 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v1 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Validate inputs 26 | run: | 27 | echo "INPUT_VERSION_PART: ${{ github.event.inputs.version_part }}" 28 | python -c "if '${{ github.event.inputs.version_part }}' not in ['patch', 'minor', 'major']: raise ValueError(\"'${{ github.event.inputs.version_part }}' must be one of ['patch', 'minor', 'major'])\")" 29 | - name: Bump the version number # bump2version is a maintained fork of original bumpversion 30 | id: bump_version 31 | run: | 32 | pip install bump2version 33 | bump2version ${{ github.event.inputs.version_part }} 34 | echo "::set-output name=package_version::$(cat $PYTHON_PACKAGE/__init__.py | grep -Po '\d+\.\d+\.\d+')" 35 | - name: Update the CHANGELOG according to 'Keep a Changelog' guidelines 36 | uses: thomaseizinger/keep-a-changelog-new-release@v1 37 | with: 38 | version: ${{ steps.bump_version.outputs.package_version }} 39 | - name: Create a new release branch 40 | run: | 41 | git config user.name github-actions 42 | git config user.email github-actions@github.com 43 | git checkout -b release-${{ steps.bump_version.outputs.package_version }} 44 | git push -u origin release-${{ steps.bump_version.outputs.package_version }} 45 | - name: Open a PR to merge the release to main 46 | id: open_pr 47 | uses: vsoch/pull-request-action@1.1.0 48 | env: 49 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 50 | PULL_REQUEST_BRANCH: main 51 | PULL_REQUEST_FROM_BRANCH: release-${{ 
steps.bump_version.outputs.package_version }} 52 | PULL_REQUEST_TITLE: "Release ${{ steps.bump_version.outputs.package_version }}" 53 | PULL_REQUEST_BODY: "Bump version and CHANGELOG for next release." 54 | PULL_REQUEST_ASSIGNEES: " p-pekala" 55 | - name: Commit the changes 56 | run: | 57 | git add docs 58 | git commit -am "FIX #${{ steps.open_pr.outputs.pull_request_number }} - Bump version and CHANGELOG for release ${{ steps.bump_version.outputs.package_version }}" 59 | git push 60 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | deploy: 10 | runs-on: ubuntu-latest 11 | strategy: 12 | matrix: 13 | python-version: [3.7] 14 | env: 15 | PYTHON_PACKAGE: data_pipelines_cli 16 | steps: 17 | - name: Checkout the repo 18 | uses: actions/checkout@v2 19 | with: 20 | fetch-depth: 0 # necessary to enable merging, all the history is needed 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v1 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Build package dist from source # A better way will be : https://packaging.python.org/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/ but pep 517 is still marked as experimental 26 | run: | 27 | python setup.py sdist 28 | - name: Merge back to develop # we have to set the config first on a fresh machine 29 | run: | 30 | git config user.name github-actions 31 | git config user.email github-actions@github.com 32 | git checkout -b develop --track origin/develop 33 | git merge main 34 | git push 35 | - name: Set dynamically package version as output variable # see https://github.com/actions/create-release/issues/39 36 | # see https://docs.github.com/en/free-pro-team@latest/actions/reference/workflow-commands-for-github-actions#setting-an-environment-variable 37 | id: set_package_version 38 | run: | 39 | echo "::set-output name=PACKAGE_VERSION::$(cat $PYTHON_PACKAGE/__init__.py | grep -Po '\d+\.\d+\.\d+')" 40 | - name: Create temporary file with the body content for the release 41 | run: | 42 | grep -Poz "## \[${{steps.set_package_version.outputs.PACKAGE_VERSION}}] - \d{4}-\d{2}-\d{2}[\S\s]+?(?=## \[\d+\.\d+\.\d+\]|\[.+\]:)" CHANGELOG.md > release_body.md 43 | - name: Create Release # https://github.com/actions/create-release 44 | id: create_release 45 | uses: actions/create-release@v1.1.4 46 | env: 47 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # This token is provided by Actions, you do not need to create your own token 48 | with: 49 | tag_name: ${{ steps.set_package_version.outputs.PACKAGE_VERSION }} 50 | release_name: Release ${{ steps.set_package_version.outputs.PACKAGE_VERSION }} 51 | body_path: ./release_body.md 52 | draft: false 53 | prerelease: false 54 | - name: Rollback Release in case of run failure 55 | if: failure() && steps.create_release.outputs.id != '' 56 | uses: author/action-rollback@stable 57 | with: 58 | # Using a known release ID 59 | release_id: ${{ steps.create_release.outputs.id }} 60 | env: 61 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 62 | - name: Publish distribution to PyPI # official action from python maintainers 63 | uses: pypa/gh-action-pypi-publish@master 64 | with: 65 | user: __token__ 66 | password: ${{ secrets.PYPI_PASSWORD }} 67 | verbose: true # trace if the upload fails 68 | 
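Both workflows above read the package version by grepping `$PYTHON_PACKAGE/__init__.py` for a semver-looking string. Below is a minimal Python sketch of that extraction step — illustrative only and not part of the repository, which relies on the `grep -Po '\d+\.\d+\.\d+'` one-liner shown in the workflows:

```python
# Illustrative sketch: the same version lookup the workflows perform with
# `grep -Po '\d+\.\d+\.\d+'` on data_pipelines_cli/__init__.py (e.g. `version = "0.30.0"`).
import re
from pathlib import Path


def read_package_version(init_file: Path = Path("data_pipelines_cli/__init__.py")) -> str:
    # Return the first semantic-version-looking string found in the module.
    match = re.search(r"\d+\.\d+\.\d+", init_file.read_text())
    if match is None:
        raise ValueError(f"No version string found in {init_file}")
    return match.group(0)
```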
-------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - develop 8 | pull_request: 9 | 10 | jobs: 11 | build: 12 | runs-on: ubuntu-latest 13 | strategy: 14 | matrix: 15 | python-version: ["3.9", "3.10"] 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | 20 | - name: Setup python 21 | uses: actions/setup-python@v4 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | 25 | - name: Setup virtualenv 26 | run: | 27 | python -V 28 | python -m pip install virtualenv pipdeptree 29 | virtualenv venv 30 | source venv/bin/activate 31 | pip install --upgrade pip 32 | 33 | - name: Check pre-commit status 34 | run: | 35 | pip install .[tests,databricks] 36 | pip freeze 37 | pipdeptree 38 | pre-commit run --all-files 39 | 40 | - name: Test with tox 41 | run: | 42 | tox 43 | 44 | - name: Report coverage 45 | uses: paambaati/codeclimate-action@v5.0.0 46 | env: 47 | CC_TEST_REPORTER_ID: ${{ secrets.CODE_CLIMATE }} 48 | with: 49 | coverageCommand: coverage xml 50 | debug: true 51 | coverageLocations: coverage.xml:coverage.py 52 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # IntelliJ 2 | .idea/ 3 | *.iml 4 | out/ 5 | .idea_modules/ 6 | 7 | ### macOS 8 | *.DS_Store 9 | .AppleDouble 10 | .LSOverride 11 | .Trashes 12 | 13 | # Vim 14 | *~ 15 | .*.swo 16 | .*.swp 17 | 18 | # emacs 19 | *~ 20 | \#*\# 21 | /.emacs.desktop 22 | /.emacs.desktop.lock 23 | *.elc 24 | 25 | ### Python template 26 | # Byte-compiled / optimized / DLL files 27 | __pycache__/ 28 | *.py[cod] 29 | *$py.class 30 | 31 | # Distribution / packaging 32 | .Python 33 | build/ 34 | develop-eggs/ 35 | dist/ 36 | downloads/ 37 | eggs/ 38 | .eggs/ 39 | lib/ 40 | lib64/ 41 | parts/ 42 | sdist/ 43 | var/ 44 | wheels/ 45 | *.egg-info/ 46 | .installed.cfg 47 | *.egg 48 | MANIFEST 49 | 50 | # PyInstaller 51 | # Usually these files are written by a python script from a template 52 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
53 | *.manifest 54 | *.spec 55 | 56 | # Installer logs 57 | pip-log.txt 58 | pip-delete-this-directory.txt 59 | 60 | # Unit test / coverage reports 61 | htmlcov/ 62 | .tox/ 63 | .coverage 64 | .coverage.* 65 | .cache 66 | nosetests.xml 67 | coverage.xml 68 | *.cover 69 | .hypothesis/ 70 | 71 | # Translations 72 | *.mo 73 | *.pot 74 | 75 | # Django stuff: 76 | *.log 77 | .static_storage/ 78 | .media/ 79 | local_settings.py 80 | 81 | # Flask stuff: 82 | instance/ 83 | .webassets-cache 84 | 85 | # Scrapy stuff: 86 | .scrapy 87 | 88 | # Sphinx documentation 89 | docs/_build/ 90 | 91 | # PyBuilder 92 | target/ 93 | 94 | # Jupyter Notebook 95 | .ipynb_checkpoints 96 | 97 | # IPython 98 | .ipython/profile_default/history.sqlite 99 | .ipython/profile_default/startup/README 100 | 101 | # pyenv 102 | .python-version 103 | 104 | # celery beat schedule file 105 | celerybeat-schedule 106 | 107 | 108 | # SageMath parsed files 109 | *.sage.py 110 | 111 | # Environments 112 | .env 113 | .venv 114 | env/ 115 | venv/ 116 | ENV/ 117 | env.bak/ 118 | venv.bak/ 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | 126 | docs/_build 127 | 128 | dp-testing -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | profile = black 3 | known_first_party = data_pipelines_cli 4 | default_section = THIRDPARTY 5 | known_third_party = setuptools 6 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | 3 | - repo: https://github.com/PyCQA/isort 4 | rev: 5.12.0 5 | hooks: 6 | - id: isort 7 | 8 | - repo: https://github.com/psf/black 9 | rev: 22.3.0 10 | hooks: 11 | - id: black 12 | 13 | - repo: https://github.com/pre-commit/pre-commit-hooks 14 | rev: v4.3.0 15 | hooks: 16 | - id: trailing-whitespace 17 | args: [--markdown-linebreak-ext=md] 18 | - id: check-merge-conflict 19 | - id: debug-statements 20 | - id: name-tests-test 21 | args: ['--django'] # PyCharm requires tests to have 'test' as prefix, not a suffix 22 | - id: check-json 23 | - id: check-yaml 24 | 25 | - repo: https://github.com/pycqa/flake8 26 | rev: 4.0.1 27 | hooks: 28 | - id: flake8 29 | additional_dependencies: [ 30 | 'flake8-blind-except', 31 | 'flake8-comprehensions', 32 | 'flake8-pep3101', 33 | ] 34 | 35 | - repo: https://github.com/pre-commit/mirrors-mypy 36 | rev: v0.961 37 | hooks: 38 | - id: mypy 39 | additional_dependencies: 40 | - 'types-PyYAML' 41 | - 'types-requests' 42 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.10" 13 | 14 | # Build documentation in the docs/ directory with Sphinx 15 | sphinx: 16 | configuration: docs/conf.py 17 | 18 | # Optionally declare the Python requirements required to build your docs 19 | python: 20 | install: 21 | - method: pip 22 | path: . 
23 | extra_requirements: 24 | - docs 25 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "makefile.extensionOutputFolder": "./.vscode" 3 | } -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## PR Guidelines 2 | 1. Fork a branch from `develop`. 3 | 2. Provide unit tests for new functionality. 4 | 3. Install dev requirements: `pip install -r requirements-dev.txt` and set up a hook: `pre-commit install`. 5 | 4. Update documentation accordingly. 6 | 5. Update [changelog](CHANGELOG.md) according to ["Keep a changelog"](https://keepachangelog.com/en/1.0.0/) guidelines. 7 | 6. Squash changes into a single commit where possible and give the PR a descriptive name. 8 | 7. Open a PR against the `develop` branch. 9 | 10 | *We reserve the right to take over and modify or abandon PRs that do not match the workflow or are abandoned.* 11 | 12 | ## Release workflow 13 | 14 | 1. Create the release candidate: 15 | - Go to the [Prepare release](https://github.com/getindata/data-pipelines-cli/actions?query=workflow%3A%22Prepare+release%22) action. 16 | - Click "Run workflow" 17 | - Enter the part of the version to bump (one of `..`). Minor (x.**x**.x) is the default. 18 | 2. If the workflow has run successfully: 19 | - Go to the newly opened PR named `Release candidate ` 20 | - Check that the changelog and version have been properly updated. If not, pull the branch and apply manual changes if necessary. 21 | - Merge the PR to main 22 | 3. Check the [Publish](https://github.com/getindata/data-pipelines-cli/actions?query=workflow%3APublish) workflow to see whether: 23 | - The package has been uploaded to PyPI successfully 24 | - The changes have been merged back to develop 25 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getindata/data-pipelines-cli/f3813edf8757cb98be5c2f1e073cd628df770b25/MANIFEST.in -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # data-pipelines-cli 2 | 3 | [![Python Version](https://img.shields.io/badge/python-3.9%20%7C%203.10-blue.svg)](https://github.com/getindata/data-pipelines-cli) 4 | [![PyPI Version](https://badge.fury.io/py/data-pipelines-cli.svg)](https://pypi.org/project/data-pipelines-cli/) 5 | [![Downloads](https://pepy.tech/badge/data-pipelines-cli)](https://pepy.tech/project/data-pipelines-cli) 6 | [![Maintainability](https://api.codeclimate.com/v1/badges/e44ed9383a42b59984f6/maintainability)](https://codeclimate.com/github/getindata/data-pipelines-cli/maintainability) 7 | [![Test Coverage](https://api.codeclimate.com/v1/badges/e44ed9383a42b59984f6/test_coverage)](https://codeclimate.com/github/getindata/data-pipelines-cli/test_coverage) 8 | [![Documentation Status](https://readthedocs.org/projects/data-pipelines-cli/badge/?version=latest)](https://data-pipelines-cli.readthedocs.io/en/latest/?badge=latest) 9 | 10 | CLI for data platform 11 | 12 | ## Documentation 13 | 14 | Read the full documentation at [https://data-pipelines-cli.readthedocs.io/](https://data-pipelines-cli.readthedocs.io/en/latest/index.html) 15
| 16 | ## Installation 17 | Use the package manager [pip](https://pip.pypa.io/en/stable/) to install [dp (data-pipelines-cli)](https://pypi.org/project/data-pipelines-cli/): 18 | 19 | ```bash 20 | pip install data-pipelines-cli[bigquery,docker,datahub,gcs] 21 | ``` 22 | 23 | ## Usage 24 | First, create a repository with a global configuration file that you or your organization will be using. The repository 25 | should contain a `dp.yml.tmpl` file looking similar to this: 26 | ```yaml 27 | _templates_suffix: ".tmpl" 28 | _envops: 29 | autoescape: false 30 | block_end_string: "%]" 31 | block_start_string: "[%" 32 | comment_end_string: "#]" 33 | comment_start_string: "[#" 34 | keep_trailing_newline: true 35 | variable_end_string: "]]" 36 | variable_start_string: "[[" 37 | 38 | templates: 39 | my-first-template: 40 | template_name: my-first-template 41 | template_path: https://github.com//.git 42 | 43 | vars: 44 | username: [[ YOUR_USERNAME ]] 45 | ``` 46 | Thanks to [copier](https://copier.readthedocs.io/en/stable/), you can leverage the tmpl template syntax to create 47 | easily modifiable configuration templates. Just create a `copier.yml` file next to the `dp.yml.tmpl` one and configure 48 | the template questions (read more in the [copier documentation](https://copier.readthedocs.io/en/stable/configuring/)). 49 | 50 | Then, run `dp init ` to initialize **dp**. You can also drop the `` argument; 51 | **dp** will then be initialized with an empty config. 52 | 53 | ### Project creation 54 | 55 | You can use `dp create ` to choose one of the previously added templates and create the project in the 56 | `` directory. You can also use `dp create ` to point 57 | directly to a template repository. If `` matches the name of a template defined in 58 | **dp**'s config file, `dp create` will choose that template by name instead of trying to download the repository. 59 | 60 | `dp template-list` lists all added templates. 61 | 62 | ### Project update 63 | 64 | To update your pipeline project, use `dp update `. It will sync your existing project with the updated 65 | template version selected by the `--vcs-ref` option (default `HEAD`). 66 | 67 | ### Project deployment 68 | 69 | `dp deploy` will sync with your bucket provider. The provider will be chosen automatically based on the remote URL. 70 | Usually, it is worth pointing `dp deploy` to a JSON or YAML file with provider-specific data like access tokens or project 71 | names. E.g., to connect with Google Cloud Storage, one should run: 72 | ```bash 73 | echo '{"token": "", "project_name": ""}' > gs_args.json 74 | dp deploy --dags-path "gs://" --blob-args gs_args.json 75 | ``` 76 | However, in some cases you do not need to do so, e.g. when using `gcloud` with properly set local credentials. In such 77 | a case, you can try running just the `dp deploy --dags-path "gs://"` command. Please refer to 78 | the [documentation](https://data-pipelines-cli.readthedocs.io/en/latest/usage.html#project-deployment) for more information. 79 | 80 | When finished, call `dp clean` to remove compilation-related directories. 81 | 82 | ### Variables 83 | You can put a dictionary of variables to be passed to `dbt` in your `config//dbt.yml` file, following the convention 84 | presented in [the guide at the dbt site](https://docs.getdbt.com/docs/building-a-dbt-project/building-models/using-variables#defining-variables-in-dbt_projectyml).
85 | E.g., if one of the fields of `config//snowflake.yml` looks like this: 86 | ```yaml 87 | schema: "{{ var('snowflake_schema') }}" 88 | ``` 89 | you should put the following in your `config//dbt.yml` file: 90 | ```yaml 91 | vars: 92 | snowflake_schema: EXAMPLE_SCHEMA 93 | ``` 94 | and then run your `dp run --env ` (or any similar command). 95 | 96 | ## Contributing 97 | Pull requests are welcome. For major changes, please open an issue first to discuss what you would like to change. 98 | 99 | Please make sure to update tests as appropriate. 100 | -------------------------------------------------------------------------------- /data_pipelines_cli/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | data-pipelines-cli (dp) is a CLI tool designed for data platform. 3 | 4 | dp helps data analysts to create, maintain and make full use of their data 5 | pipelines. 6 | """ 7 | 8 | version = "0.30.0" 9 | -------------------------------------------------------------------------------- /data_pipelines_cli/__main__.py: -------------------------------------------------------------------------------- 1 | from .cli import cli 2 | 3 | if __name__ == "__main__": 4 | cli() 5 | -------------------------------------------------------------------------------- /data_pipelines_cli/airbyte_utils.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import copy 3 | import os 4 | import pathlib 5 | from typing import Any, Dict, Iterable, Optional, Union 6 | 7 | import requests 8 | import yaml 9 | 10 | from .cli_constants import BUILD_DIR 11 | from .cli_utils import echo_error, echo_info, echo_warning 12 | 13 | 14 | class AirbyteError(Exception): 15 | pass 16 | 17 | 18 | class AirbyteNoWorkspaceConfiguredError(AirbyteError): 19 | pass 20 | 21 | 22 | class AirbyteFactory: 23 | """A class used to create and update Airbyte connections defined in config yaml file""" 24 | 25 | airbyte_config_path: pathlib.Path 26 | """Path to config yaml file containing connections definitions""" 27 | auth_token: Optional[str] 28 | """Authorization OIDC ID token for a service account to communication with Airbyte instance""" 29 | 30 | def __init__(self, airbyte_config_path: pathlib.Path, auth_token: Optional[str]) -> None: 31 | self.airbyte_config_path = airbyte_config_path 32 | self.auth_token = auth_token 33 | 34 | with open(self.airbyte_config_path, "r") as airbyte_config_file: 35 | self.airbyte_config = yaml.safe_load(airbyte_config_file) 36 | self.airbyte_url = self.airbyte_config["airbyte_url"] 37 | 38 | @staticmethod 39 | def find_config_file(env: str, config_name: str = "airbyte") -> pathlib.Path: 40 | if BUILD_DIR.joinpath("dag", "config", env, f"{config_name}.yml").is_file(): 41 | return BUILD_DIR.joinpath("dag", "config", env, f"{config_name}.yml") 42 | return BUILD_DIR.joinpath("dag", "config", "base", f"{config_name}.yml") 43 | 44 | @staticmethod 45 | def env_replacer(config: Dict[str, Any]) -> Dict[str, Any]: 46 | return ast.literal_eval(os.path.expandvars(f"{config}")) 47 | 48 | def get_default_workspace_id(self) -> str: 49 | workspaces = self.request_handler("workspaces/list").get("workspaces") 50 | if not workspaces: 51 | raise AirbyteNoWorkspaceConfiguredError( 52 | f"No workspaces found in {self.airbyte_url} instance." 
53 | ) 54 | 55 | return workspaces[0].get("workspaceId") 56 | 57 | def create_update_connections(self) -> None: 58 | """Create and update Airbyte connections defined in config yaml file""" 59 | if not self.airbyte_config["connections"]: 60 | return 61 | 62 | workspace_id = self.airbyte_config.get("workspace_id") 63 | if workspace_id is None: 64 | echo_warning( 65 | "workspace_id was not provided in the configuration file - " 66 | "fetching the default one from Airbyte deployment" 67 | ) 68 | workspace_id = self.get_default_workspace_id() 69 | 70 | for connection in self.airbyte_config["connections"]: 71 | self.create_update_connection( 72 | connection_config=self.airbyte_config["connections"][connection], 73 | workspace_id=workspace_id, 74 | ) 75 | 76 | for task in self.airbyte_config["tasks"]: 77 | task.update(self.env_replacer(task)) 78 | 79 | self.update_file(self.airbyte_config) 80 | 81 | def create_update_connection(self, connection_config: Dict[str, Any], workspace_id: str) -> Any: 82 | def configs_equal( 83 | conf_a: Dict[str, Any], conf_b: Dict[str, Any], equality_fields: Iterable[str] 84 | ) -> bool: 85 | conn_a = {k: v for k, v in conf_a.items() if k in equality_fields} 86 | conn_b = {k: v for k, v in conf_b.items() if k in equality_fields} 87 | return conn_a == conn_b 88 | 89 | connection_config_copy = copy.deepcopy(connection_config) 90 | 91 | response_search = self.request_handler( 92 | "connections/list", data={"workspaceId": workspace_id} 93 | ) 94 | 95 | equality_fields = [ 96 | "sourceId", 97 | "destinationId", 98 | "namespaceDefinition", 99 | "namespaceFormat", 100 | ] 101 | 102 | matching_connections = [ 103 | connection 104 | for connection in response_search["connections"] 105 | if configs_equal(connection_config_copy, connection, equality_fields) 106 | ] 107 | 108 | if not matching_connections: 109 | echo_info(f"Creating connection config for {connection_config_copy['name']}") 110 | response_create = self.request_handler( 111 | "connections/create", 112 | connection_config_copy, 113 | ) 114 | os.environ[response_create["name"]] = response_create["connectionId"] 115 | return 116 | 117 | echo_info(f"Updating connection config for {connection_config_copy['name']}") 118 | connection_config_copy.pop("sourceId", None) 119 | connection_config_copy.pop("destinationId", None) 120 | connection_config_copy["connectionId"] = matching_connections[0]["connectionId"] 121 | response_update = self.request_handler( 122 | "connections/update", 123 | connection_config_copy, 124 | ) 125 | os.environ[response_update["name"]] = response_update["connectionId"] 126 | 127 | def update_file(self, updated_config: Dict[str, Any]) -> None: 128 | with open(self.airbyte_config_path, "w") as airbyte_config_file: 129 | yaml.safe_dump(updated_config, airbyte_config_file) 130 | 131 | def request_handler( 132 | self, endpoint: str, data: Optional[Dict[str, Any]] = None 133 | ) -> Union[Dict[str, Any], Any]: 134 | url = f"{self.airbyte_url}/api/v1/{endpoint}" 135 | headers = { 136 | "Accept": "application/json", 137 | "Content-Type": "application/json", 138 | } 139 | if self.auth_token is not None: 140 | headers["Authorization"] = f"Bearer {self.auth_token}" 141 | 142 | try: 143 | response = requests.post(url=url, headers=headers, json=data) 144 | response.raise_for_status() 145 | data = response.json() 146 | return data 147 | except requests.exceptions.HTTPError as e: 148 | echo_error(e.response.text) # type: ignore 149 | return None 150 | 
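A minimal, standalone usage sketch of `AirbyteFactory` follows. The environment name and the missing auth token are placeholder assumptions; running it requires a compiled build directory and a reachable Airbyte instance.

```python
# Usage sketch only: "dev" and auth_token=None are placeholder assumptions.
from data_pipelines_cli.airbyte_utils import AirbyteFactory

# Resolves <BUILD_DIR>/dag/config/dev/airbyte.yml, falling back to the "base" config.
config_path = AirbyteFactory.find_config_file(env="dev")
factory = AirbyteFactory(airbyte_config_path=config_path, auth_token=None)
# Creates or updates every connection listed under "connections" in the YAML file.
factory.create_update_connections()
```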
-------------------------------------------------------------------------------- /data_pipelines_cli/bi_utils.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from typing import Any, Dict, Optional, Tuple 3 | 4 | from .cli_constants import BUILD_DIR 5 | from .cli_utils import echo_info 6 | from .config_generation import read_dictionary_from_config_directory 7 | from .errors import DataPipelinesError, NotSuppertedBIError 8 | from .looker_utils import deploy_lookML_model, generate_lookML_model 9 | 10 | 11 | class BiAction(Enum): 12 | COMPILE = 1 13 | DEPLOY = 2 14 | 15 | 16 | def read_bi_config(env: str) -> Dict[str, Any]: 17 | """ 18 | Read BI configuration. 19 | 20 | :param env: Name of the environment 21 | :type env: str 22 | :return: Compiled dictionary 23 | :rtype: Dict[str, Any] 24 | """ 25 | return read_dictionary_from_config_directory(BUILD_DIR.joinpath("dag"), env, "bi.yml") 26 | 27 | 28 | def _bi_looker( 29 | env: str, generate_code: bool, deploy: bool = False, key_path: Optional[str] = None 30 | ) -> None: 31 | if generate_code: 32 | echo_info("Generating Looker codes") 33 | generate_lookML_model() 34 | 35 | if deploy: 36 | echo_info("Deploying Looker project") 37 | if key_path is None: 38 | raise DataPipelinesError( 39 | "Error raised when pushing Looker code. No repository key provided. " 40 | "Provide key using '--bi-git-key-path' option or disable BI in bi.yml" 41 | ) 42 | deploy_lookML_model(key_path, env) 43 | 44 | 45 | def bi(env: str, bi_action: BiAction, key_path: Optional[str] = None) -> None: 46 | """ 47 | Generate and deploy BI code using dbt-compiled data. 48 | 49 | :param env: Name of the environment 50 | :type env: str 51 | :param bi_action: Action to be run [COMPILE, DEPLOY] 52 | :type bi_action: BiAction 53 | :param key_path: Path to the key with write access to the git repository 54 | :type key_path: Optional[str] 55 | :raises NotSuppertedBIError: the BI target set in the bi.yml configuration is not supported 56 | """ 57 | bi_config = read_bi_config(env) 58 | 59 | if not bi_config.get("is_bi_enabled", False): 60 | echo_info("BI is disabled") 61 | return 62 | 63 | if bi_config["bi_target"] == "looker": 64 | echo_info("Running BI...") 65 | compile, deploy = _prepare_bi_parameters(bi_action, bi_config) 66 | _bi_looker(env, compile, deploy, key_path) 67 | else: 68 | raise NotSuppertedBIError() 69 | 70 | 71 | def _prepare_bi_parameters(bi_action: BiAction, bi_config: Dict[str, Any]) -> Tuple[bool, bool]: 72 | if bi_action == BiAction.COMPILE: 73 | return bi_config["is_bi_compile"], False 74 | elif bi_action == BiAction.DEPLOY: 75 | return False, bi_config["is_bi_deploy"] 76 | else: 77 | return False, False 78 | -------------------------------------------------------------------------------- /data_pipelines_cli/cli.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import click 4 | 5 | from .cli_commands.clean import clean_command 6 | from .cli_commands.compile import compile_project_command 7 | from .cli_commands.create import create_command 8 | from .cli_commands.deploy import deploy_command 9 | from .cli_commands.docs import docs_command 10 | from .cli_commands.generate.generate import generate_group 11 | from .cli_commands.init import init_command 12 | from .cli_commands.prepare_env import prepare_env_command 13 | from .cli_commands.publish import publish_command 14 | from .cli_commands.run import run_command 15 | from .cli_commands.seed import seed_command 16 | from .cli_commands.template
import list_templates_command 17 | from .cli_commands.test import test_command 18 | from .cli_commands.update import update_command 19 | from .cli_utils import echo_error, echo_suberror 20 | from .errors import DataPipelinesError 21 | 22 | 23 | @click.group() 24 | @click.version_option(prog_name="dp") 25 | def _cli() -> None: 26 | pass 27 | 28 | 29 | def cli() -> None: 30 | try: 31 | _cli() 32 | except DataPipelinesError as err: 33 | echo_error(f"CLI Error: {err.message}") 34 | if err.submessage: 35 | echo_suberror(err.submessage) 36 | sys.exit(1) 37 | 38 | 39 | _cli.add_command(clean_command) 40 | _cli.add_command(compile_project_command) 41 | _cli.add_command(create_command) 42 | _cli.add_command(deploy_command) 43 | _cli.add_command(docs_command) 44 | _cli.add_command(generate_group) 45 | _cli.add_command(init_command) 46 | _cli.add_command(list_templates_command) 47 | _cli.add_command(prepare_env_command) 48 | _cli.add_command(publish_command) 49 | _cli.add_command(run_command) 50 | _cli.add_command(seed_command) 51 | _cli.add_command(test_command) 52 | _cli.add_command(update_command) 53 | -------------------------------------------------------------------------------- /data_pipelines_cli/cli_commands/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getindata/data-pipelines-cli/f3813edf8757cb98be5c2f1e073cd628df770b25/data_pipelines_cli/cli_commands/__init__.py -------------------------------------------------------------------------------- /data_pipelines_cli/cli_commands/clean.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | 3 | import click 4 | 5 | from ..cli_constants import BUILD_DIR 6 | from ..cli_utils import echo_info, echo_subinfo, subprocess_run 7 | 8 | 9 | def _dbt_clean() -> None: 10 | echo_info("dbt clean") 11 | subprocess_run(["dbt", "clean"]) 12 | 13 | 14 | def _remove_build_dir() -> None: 15 | if BUILD_DIR.exists(): 16 | echo_info(f"Removing {BUILD_DIR}") 17 | shutil.rmtree(BUILD_DIR) 18 | echo_subinfo(f"{BUILD_DIR} removed") 19 | 20 | 21 | def clean() -> None: 22 | """Delete local working directories.""" 23 | _dbt_clean() 24 | _remove_build_dir() 25 | 26 | 27 | @click.command(name="clean", help="Delete local working directories") 28 | def clean_command() -> None: 29 | clean() 30 | -------------------------------------------------------------------------------- /data_pipelines_cli/cli_commands/compile.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pathlib 3 | import shutil 4 | from typing import Dict, Optional 5 | 6 | import click 7 | import yaml 8 | 9 | from ..bi_utils import BiAction, bi 10 | from ..cli_configs import find_datahub_config_file 11 | from ..cli_constants import BUILD_DIR, IMAGE_TAG_TO_REPLACE 12 | from ..cli_utils import echo_info, echo_warning 13 | from ..config_generation import ( 14 | copy_config_dir_to_build_dir, 15 | copy_dag_dir_to_build_dir, 16 | generate_profiles_yml, 17 | ) 18 | from ..data_structures import DockerArgs 19 | from ..dbt_utils import read_dbt_vars_from_configs, run_dbt_command 20 | from ..docker_response_reader import DockerResponseReader 21 | from ..errors import DockerErrorResponseError, DockerNotInstalledError 22 | from ..io_utils import replace 23 | from ..jinja import replace_vars_with_values 24 | 25 | 26 | def _docker_build(docker_args: DockerArgs) -> None: 27 | """ 28 | :param docker_args: Arguments required by the Docker to 
make a push to \ 29 | the repository 30 | :raises DataPipelinesError: Docker not installed 31 | """ 32 | try: 33 | import docker 34 | import docker.errors 35 | except ModuleNotFoundError: 36 | raise DockerNotInstalledError() 37 | 38 | echo_info("Building Docker image") 39 | docker_client = docker.from_env() 40 | docker_tag = docker_args.docker_build_tag() 41 | try: 42 | _, logs_generator = docker_client.images.build( 43 | path=".", tag=docker_tag, buildargs=docker_args.build_args 44 | ) 45 | DockerResponseReader(logs_generator).click_echo_ok_responses() 46 | except docker.errors.BuildError as err: 47 | build_log = "\n".join([str(log) for log in err.build_log]) 48 | raise DockerErrorResponseError(f"{err.msg}\n{build_log}") 49 | 50 | 51 | def _dbt_compile(env: str) -> None: 52 | profiles_path = generate_profiles_yml(env, False) 53 | echo_info("Running dbt commands:") 54 | run_dbt_command(("deps",), env, profiles_path) 55 | run_dbt_command(("compile",), env, profiles_path) 56 | run_dbt_command(("docs", "generate"), env, profiles_path) 57 | run_dbt_command(("source", "freshness"), env, profiles_path) 58 | 59 | 60 | def _copy_dbt_manifest() -> None: 61 | echo_info("Copying DBT manifest") 62 | shutil.copyfile( 63 | pathlib.Path.cwd().joinpath("target", "manifest.json"), 64 | BUILD_DIR.joinpath("dag", "manifest.json"), 65 | ) 66 | 67 | 68 | def replace_image_settings(image_tag: str) -> None: 69 | k8s_config = BUILD_DIR.joinpath("dag", "config", "base", "execution_env.yml") 70 | echo_info(f"Replacing {IMAGE_TAG_TO_REPLACE} with image tag = {image_tag}") 71 | replace(k8s_config, IMAGE_TAG_TO_REPLACE, image_tag) 72 | 73 | 74 | def _replace_datahub_with_jinja_vars(env: str) -> None: 75 | datahub_config_path: pathlib.Path = find_datahub_config_file(env) 76 | 77 | if not datahub_config_path.is_file(): 78 | echo_warning( 79 | f"File config/base/datahub.yml does not exist in {BUILD_DIR}. " 80 | "Content will not be replaced." 81 | ) 82 | return 83 | 84 | echo_info(f"Replacing Jinja variables in {datahub_config_path}.") 85 | with open(datahub_config_path, "r") as datahub_config_file: 86 | updated_config = replace_vars_with_values( 87 | yaml.safe_load(datahub_config_file), read_dbt_vars_from_configs(env) 88 | ) 89 | with open(datahub_config_path, "w") as datahub_config_file: 90 | yaml.dump(updated_config, datahub_config_file) 91 | 92 | 93 | def compile_project( 94 | env: str, 95 | docker_tag: Optional[str] = None, 96 | docker_build: bool = False, 97 | docker_build_args: Optional[Dict[str, str]] = None, 98 | ) -> None: 99 | """ 100 | Create local working directories and build artifacts. 
101 | 102 | :param env: Name of the environment 103 | :type env: str 104 | :param docker_tag: Image tag of a Docker image to create 105 | :type docker_tag: Optional[str] 106 | :param docker_build: Whether to build a Docker image 107 | :type docker_build: bool 108 | :param bi_build: Whether to generate a BI codes 109 | :raises DataPipelinesError: 110 | """ 111 | copy_dag_dir_to_build_dir() 112 | copy_config_dir_to_build_dir() 113 | 114 | docker_args = DockerArgs(env, docker_tag, docker_build_args or {}) 115 | 116 | replace_image_settings(docker_args.image_tag or "Empty") 117 | 118 | _replace_datahub_with_jinja_vars(env) 119 | 120 | _dbt_compile(env) 121 | _copy_dbt_manifest() 122 | 123 | if docker_build: 124 | _docker_build(docker_args) 125 | 126 | bi(env, BiAction.COMPILE) 127 | 128 | 129 | @click.command( 130 | name="compile", 131 | help="Create local working directories and build artifacts", 132 | ) 133 | @click.option( 134 | "--env", 135 | default="local", 136 | type=str, 137 | show_default=True, 138 | required=True, 139 | help="Name of the environment", 140 | ) 141 | @click.option( 142 | "--docker-build", 143 | is_flag=True, 144 | default=False, 145 | help="Whether to build a Docker image", 146 | ) 147 | @click.option( 148 | "--docker-tag", type=str, required=False, help="Image tag of a Docker image to create" 149 | ) 150 | @click.option( 151 | "--docker-args", type=str, required=False, help="Args required to build project in json format" 152 | ) 153 | def compile_project_command( 154 | env: str, 155 | docker_build: bool, 156 | docker_tag: Optional[str], 157 | docker_args: Optional[str], 158 | ) -> None: 159 | compile_project(env, docker_tag, docker_build, json.loads(docker_args or "{}")) 160 | -------------------------------------------------------------------------------- /data_pipelines_cli/cli_commands/create.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional, Sequence 2 | 3 | import click 4 | import copier 5 | import questionary 6 | 7 | from data_pipelines_cli.cli_utils import echo_warning 8 | from data_pipelines_cli.data_structures import TemplateConfig, read_env_config 9 | from data_pipelines_cli.errors import DataPipelinesError 10 | from data_pipelines_cli.vcs_utils import add_suffix_to_git_template_path 11 | 12 | 13 | def _choose_template(config_templates: Dict[str, TemplateConfig]) -> TemplateConfig: 14 | """ 15 | :raises DataPipelinesError: no template found in *config_templates* 16 | """ 17 | if len(config_templates) == 0: 18 | raise DataPipelinesError( 19 | "No template provided. 
Either run 'dp create " 20 | "' to use template from the link, or add template " 21 | "to `~/.dp.yml` file", 22 | ) 23 | 24 | template_name = questionary.select("", choices=list(config_templates.keys())).ask() 25 | template_config = config_templates[template_name] 26 | 27 | return template_config 28 | 29 | 30 | def _get_template_path( 31 | config_templates: Dict[str, TemplateConfig], template_path: Optional[str] 32 | ) -> str: 33 | """:raises DataPipelinesError: no template found in *config_templates*""" 34 | if template_path: 35 | if template_path in config_templates.keys(): 36 | to_return = config_templates[template_path]["template_path"] 37 | else: 38 | to_return = add_suffix_to_git_template_path(template_path) 39 | else: 40 | to_return = _choose_template(config_templates)["template_path"] 41 | return to_return 42 | 43 | 44 | def create(project_path: str, template_path: Optional[str], vcs_ref: str) -> None: 45 | """ 46 | Create a new project using a template. 47 | 48 | :param project_path: Path to a directory to create 49 | :type project_path: str 50 | :param template_path: Path or URI to the repository of the project template 51 | :type template_path: Optional[str] 52 | :raises DataPipelinesError: no template found in `.dp.yml` config file 53 | """ 54 | config = read_env_config() 55 | config_templates = config["templates"] 56 | src_template_path = _get_template_path(config_templates, template_path) 57 | copier.run_auto(src_path=src_template_path, dst_path=project_path, vcs_ref=vcs_ref) 58 | 59 | 60 | @click.command(name="create", help="Create a new project using a template") 61 | @click.argument( 62 | "project-path", 63 | type=click.Path(writable=True, path_type=str, dir_okay=True, file_okay=False), 64 | ) 65 | @click.argument("template-path", nargs=-1) 66 | @click.option("--vcs-ref", default="HEAD", type=str, help="Git reference to checkout") 67 | def create_command(project_path: str, template_path: Sequence[str], vcs_ref: str) -> None: 68 | if template_path and len(template_path) > 1: 69 | echo_warning("dp create expects at most two arguments -- project-path and template-path") 70 | create(project_path, template_path[0] if template_path else None, vcs_ref) 71 | -------------------------------------------------------------------------------- /data_pipelines_cli/cli_commands/docs.py: -------------------------------------------------------------------------------- 1 | import click 2 | 3 | from ..config_generation import get_profiles_dir_build_path 4 | from ..dbt_utils import run_dbt_command 5 | from .compile import compile_project 6 | 7 | 8 | def docs(env: str, port: int) -> None: 9 | """ 10 | Generate and serve dbt documentation. 11 | 12 | :param env: Name of the environment 13 | :type env: str 14 | :param port: Port to serve dbt documentation on. 
15 | :type port: int 16 | """ 17 | compile_project(env) 18 | profiles_path = get_profiles_dir_build_path(env) 19 | run_dbt_command(("docs", "serve", "--port", str(port)), env, profiles_path) 20 | 21 | 22 | @click.command(name="docs-serve", help="Generate and serve dbt documentation.") 23 | @click.option( 24 | "--env", 25 | default="local", 26 | type=str, 27 | show_default=True, 28 | help="Name of the environment", 29 | ) 30 | @click.option( 31 | "--port", 32 | default=9328, 33 | type=int, 34 | show_default=True, 35 | help="Port to be used by the 'dbt docs serve' command", 36 | ) 37 | def docs_command(env: str, port: int) -> None: 38 | docs(env, port) 39 | -------------------------------------------------------------------------------- /data_pipelines_cli/cli_commands/generate/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getindata/data-pipelines-cli/f3813edf8757cb98be5c2f1e073cd628df770b25/data_pipelines_cli/cli_commands/generate/__init__.py -------------------------------------------------------------------------------- /data_pipelines_cli/cli_commands/generate/databricks_job.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import click 4 | from dbt_databricks_factory.cli import create_job_cli 5 | from dbt_databricks_factory.config import GitProvider 6 | 7 | 8 | @click.command("databricks-job", help="Generate a Databricks job") 9 | @click.argument( 10 | "manifest-file", 11 | type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True), 12 | ) 13 | @click.option("--job-name", required=True, help="Name of the job to create.") 14 | @click.option("--project-dir", required=True, help="Path to dbt project directory.") 15 | @click.option("--profiles-dir", required=True, help="Path to dbt profiles directory.") 16 | @click.option("--cron-schedule", help="Cron schedule for the job.") 17 | @click.option( 18 | "--job-cluster", multiple=True, type=click.Tuple([str, str]), help="Job cluster config." 
19 | ) 20 | @click.option( 21 | "--task-cluster", 22 | multiple=True, 23 | type=click.Tuple([str, str]), 24 | help="Job cluster name or existing cluster id.", 25 | ) 26 | @click.option("--default-task-cluster", help="Default task cluster name or existing cluster id.") 27 | @click.option("--library", multiple=True, type=str, help="Libraries config.") 28 | @click.option("--git-url", required=True, help="Git url.") 29 | @click.option("--git-branch", help="Git branch.") 30 | @click.option("--git-commit", help="Git commit.") 31 | @click.option("--git-tag", help="Git tag.") 32 | @click.option( 33 | "--git-provider", 34 | required=True, 35 | help="Git provider.", 36 | type=click.Choice([provider.value for provider in GitProvider]), 37 | ) 38 | @click.option("--pretty", is_flag=True, help="Pretty print the output.") 39 | @click.option( 40 | "--output-file", 41 | help="Output file path.", 42 | type=click.Path(file_okay=True, dir_okay=False, writable=True), 43 | ) 44 | def generate_databricks_job_command( 45 | job_name: str, 46 | manifest_file: str, 47 | project_dir: str, 48 | profiles_dir: str, 49 | cron_schedule: str | None, 50 | job_cluster: list[tuple[str, str]], 51 | task_cluster: list[tuple[str, str]], 52 | default_task_cluster: str | None, 53 | library: list[str], 54 | git_url: str, 55 | git_branch: str | None, 56 | git_commit: str | None, 57 | git_tag: str | None, 58 | git_provider: str, 59 | pretty: bool, 60 | output_file: str, 61 | ) -> None: 62 | """Generate a Databricks job.""" 63 | create_job_cli( 64 | job_name, 65 | manifest_file, 66 | project_dir, 67 | profiles_dir, 68 | cron_schedule, 69 | job_cluster, 70 | task_cluster, 71 | default_task_cluster, 72 | library, 73 | git_url, 74 | git_branch, 75 | git_commit, 76 | git_tag, 77 | git_provider, 78 | pretty, 79 | output_file, 80 | ) 81 | -------------------------------------------------------------------------------- /data_pipelines_cli/cli_commands/generate/generate.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import click 4 | 5 | from .model_yaml import generate_model_yamls_command 6 | from .source_sql import generate_source_sqls_command 7 | from .source_yaml import generate_source_yamls_command 8 | 9 | 10 | @click.group(name="generate", help="Generate additional dbt files") 11 | def generate_group() -> None: 12 | pass 13 | 14 | 15 | generate_group.add_command(generate_model_yamls_command) 16 | generate_group.add_command(generate_source_sqls_command) 17 | generate_group.add_command(generate_source_yamls_command) 18 | 19 | try: 20 | from .databricks_job import generate_databricks_job_command 21 | 22 | generate_group.add_command(generate_databricks_job_command) 23 | except ImportError: 24 | logging.info("Databricks CLI not installed") 25 | -------------------------------------------------------------------------------- /data_pipelines_cli/cli_commands/generate/model_yaml.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pathlib 3 | import sys 4 | from typing import Any, Dict, Sequence 5 | 6 | import click 7 | import yaml 8 | 9 | from ...cli_utils import echo_info, echo_warning 10 | from ...config_generation import get_profiles_dir_build_path 11 | from ...errors import DataPipelinesError, SubprocessNonZeroExitError 12 | from ..compile import compile_project 13 | from .utils import ( 14 | generate_models_or_sources_from_single_table, 15 | get_output_file_or_warn_if_exists, 16 | ) 17 | 18 | if sys.version_info >= 
(3, 8): 19 | from typing import TypedDict # pylint: disable=no-name-in-module 20 | else: 21 | from typing_extensions import TypedDict 22 | 23 | 24 | class MacroArgName(TypedDict): 25 | deps_name: str 26 | macro_name: str 27 | arg_name: str 28 | 29 | 30 | def _get_deps_macro_and_arg_name(with_meta: bool) -> MacroArgName: 31 | return ( 32 | MacroArgName( 33 | deps_name="dbt_profiler", macro_name="print_profile_schema", arg_name="relation_name" 34 | ) 35 | if with_meta 36 | else MacroArgName( 37 | deps_name="codegen", macro_name="generate_model_yaml", arg_name="model_name" 38 | ) 39 | ) 40 | 41 | 42 | def _is_ephemeral_model(manifest: Dict[str, Any], model_name: str) -> bool: 43 | for node in manifest["nodes"].values(): 44 | if node["name"] == model_name: 45 | return node["config"]["materialized"] == "ephemeral" 46 | raise DataPipelinesError(f"Could not find {model_name} in project's 'manifest.json' file.") 47 | 48 | 49 | def _generate_model_yamls_for_directory( 50 | directory: pathlib.Path, 51 | env: str, 52 | overwrite: bool, 53 | macro_arg_name: MacroArgName, 54 | profiles_path: pathlib.Path, 55 | ) -> None: 56 | output_path = get_output_file_or_warn_if_exists(directory, overwrite, "yml") 57 | if output_path is None: 58 | return 59 | 60 | click.echo(f"Generating schema file for directory: {str(directory)}") 61 | with open(pathlib.Path.cwd().joinpath("target", "manifest.json"), "r") as manifest_json: 62 | manifest = json.load(manifest_json) 63 | models = [ 64 | model 65 | for file in directory.glob("*.sql") 66 | if not _is_ephemeral_model(manifest, file.stem) 67 | for model in generate_models_or_sources_from_single_table( 68 | env, 69 | macro_arg_name["macro_name"], 70 | {macro_arg_name["arg_name"]: file.stem}, 71 | profiles_path, 72 | )["models"] 73 | ] 74 | if len(models) == 0: 75 | echo_warning( 76 | f"{str(directory)} does not have any models. Schema file will not be generated." 77 | ) 78 | else: 79 | with open(output_path, "w") as output_file: 80 | yaml.dump( 81 | {"version": 2, "models": models}, 82 | output_file, 83 | default_flow_style=False, 84 | sort_keys=False, 85 | ) 86 | echo_info(f"Generated source schema file and saved in {output_path}") 87 | 88 | 89 | def generate_model_yamls( 90 | env: str, with_meta: bool, overwrite: bool, model_paths: Sequence[pathlib.Path] 91 | ) -> None: 92 | compile_project(env) 93 | profiles_path = get_profiles_dir_build_path(env) 94 | 95 | macro_arg_name = _get_deps_macro_and_arg_name(with_meta) 96 | echo_info(f"Generating schema files for directories: {' '.join(map(str, model_paths))}") 97 | try: 98 | for paths in model_paths: 99 | for subdir in paths.glob("**/"): 100 | _generate_model_yamls_for_directory( 101 | subdir, env, overwrite, macro_arg_name, profiles_path 102 | ) 103 | except SubprocessNonZeroExitError as err: 104 | raise DataPipelinesError( 105 | "Error while running dbt command. 
Ensure that you have " 106 | f"{macro_arg_name['deps_name']} installed and you have chosen correct models to " 107 | "generate schema.yml out of.\n" + err.message, 108 | submessage=err.submessage, 109 | ) 110 | 111 | 112 | @click.command(name="model-yaml", help="Generate schema YAML using codegen or dbt-profiler") 113 | @click.option("--env", default="local", type=str, help="Name of the environment", show_default=True) 114 | @click.option( 115 | "--with-meta", type=bool, is_flag=True, help="Whether to generate dbt-profiler metadata" 116 | ) 117 | @click.option( 118 | "--overwrite", type=bool, is_flag=True, help="Whether to overwrite existing YAML files" 119 | ) 120 | @click.argument( 121 | "model-path", 122 | type=click.Path(exists=True, path_type=pathlib.Path, file_okay=False, dir_okay=True), 123 | nargs=-1, 124 | ) 125 | def generate_model_yamls_command( 126 | env: str, with_meta: bool, overwrite: bool, model_path: Sequence[pathlib.Path] 127 | ) -> None: 128 | if len(model_path) == 0: 129 | raise DataPipelinesError("Command expects at least one 'model-path' argument") 130 | generate_model_yamls(env, with_meta, overwrite, model_path) 131 | -------------------------------------------------------------------------------- /data_pipelines_cli/cli_commands/generate/source_sql.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | import click 4 | import yaml 5 | 6 | from ...config_generation import generate_profiles_yml 7 | from ...errors import DataPipelinesError, SubprocessNonZeroExitError 8 | from .utils import get_macro_run_output, get_output_file_or_warn_if_exists 9 | 10 | 11 | def generate_source_sqls( 12 | env: str, source_yaml_path: pathlib.Path, staging_path: pathlib.Path, overwrite: bool 13 | ) -> None: 14 | profiles_path = generate_profiles_yml(env) 15 | staging_path.mkdir(parents=True, exist_ok=True) 16 | with open(source_yaml_path, "r") as source_yaml: 17 | source_dict = yaml.safe_load(source_yaml) 18 | tables_by_source = [ 19 | (source["name"], table["name"]) 20 | for source in source_dict["sources"] 21 | for table in source["tables"] 22 | ] 23 | try: 24 | for source_name, table_name in tables_by_source: 25 | output_path = get_output_file_or_warn_if_exists( 26 | staging_path.joinpath(source_name), overwrite, "sql", f"stg_{table_name}" 27 | ) 28 | if output_path is None: 29 | continue 30 | table_sql = get_macro_run_output( 31 | env, 32 | "generate_base_model", 33 | {"source_name": source_name, "table_name": table_name}, 34 | profiles_path, 35 | ) 36 | output_path.parent.mkdir(parents=True, exist_ok=True) 37 | with open(output_path, "w") as output: 38 | output.write(table_sql) 39 | except SubprocessNonZeroExitError as err: 40 | raise DataPipelinesError( 41 | "Error while running dbt command. 
Ensure that you have codegen " 42 | "installed and you have chosen correct existing sources to " 43 | "generate table sqls out of.\n" + err.message, 44 | submessage=err.submessage, 45 | ) 46 | 47 | 48 | @click.command(name="source-sql", help="Generate SQLs that represents tables in given dataset") 49 | @click.option("--env", default="local", type=str, help="Name of the environment", show_default=True) 50 | @click.option( 51 | "--source-yaml-path", 52 | type=click.Path(exists=True, path_type=pathlib.Path, file_okay=True), 53 | default=pathlib.Path.cwd().joinpath("models", "source", "source.yml"), 54 | show_default=True, 55 | help="Path to the 'source.yml' schema file", 56 | required=True, 57 | ) 58 | @click.option( 59 | "--staging-path", 60 | default=pathlib.Path.cwd().joinpath("models", "staging"), 61 | show_default=True, 62 | type=pathlib.Path, 63 | help="Path to the 'staging' directory", 64 | required=True, 65 | ) 66 | @click.option( 67 | "--overwrite", type=bool, is_flag=True, help="Whether to overwrite existing SQL files" 68 | ) 69 | def generate_source_sqls_command( 70 | env: str, source_yaml_path: pathlib.Path, staging_path: pathlib.Path, overwrite: bool 71 | ) -> None: 72 | generate_source_sqls(env, source_yaml_path, staging_path, overwrite) 73 | -------------------------------------------------------------------------------- /data_pipelines_cli/cli_commands/generate/source_yaml.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from typing import Sequence 3 | 4 | import click 5 | import yaml 6 | 7 | from ...cli_utils import echo_info 8 | from ...config_generation import generate_profiles_yml 9 | from ...errors import DataPipelinesError, SubprocessNonZeroExitError 10 | from ..generate.utils import ( 11 | generate_models_or_sources_from_single_table, 12 | get_output_file_or_warn_if_exists, 13 | ) 14 | 15 | 16 | def generate_source_yamls( 17 | env: str, source_path: pathlib.Path, overwrite: bool, schema_names: Sequence[str] 18 | ) -> None: 19 | profiles_path = generate_profiles_yml(env) 20 | output_path = get_output_file_or_warn_if_exists(source_path, overwrite, "yml") 21 | if output_path is None: 22 | return 23 | source_path.mkdir(parents=True, exist_ok=True) 24 | 25 | try: 26 | sources = [ 27 | source 28 | for schema in schema_names 29 | for source in generate_models_or_sources_from_single_table( 30 | env, 31 | "generate_source", 32 | {"schema_name": schema, "generate_columns": True, "include_descriptions": True}, 33 | profiles_path, 34 | )["sources"] 35 | ] 36 | with open(output_path, "w") as output_file: 37 | yaml.dump( 38 | {"version": 2, "sources": sources}, 39 | output_file, 40 | default_flow_style=False, 41 | sort_keys=False, 42 | ) 43 | echo_info(f"Generated source schema file and saved in {output_path}") 44 | except SubprocessNonZeroExitError as err: 45 | raise DataPipelinesError( 46 | "Error while running dbt command. 
Ensure that you have codegen " 47 | "installed and you have chosen correct existing datasets (schemas) to " 48 | "generate source.yml out of.\n" + err.message, 49 | submessage=err.submessage, 50 | ) 51 | 52 | 53 | @click.command(name="source-yaml", help="Generate source YAML using codegen") 54 | @click.option("--env", default="local", type=str, help="Name of the environment", show_default=True) 55 | @click.option( 56 | "--source-path", 57 | default=pathlib.Path.cwd().joinpath("models", "source"), 58 | show_default=True, 59 | type=pathlib.Path, 60 | help="Path to the 'source' directory", 61 | required=True, 62 | ) 63 | @click.option( 64 | "--overwrite", type=bool, is_flag=True, help="Whether to overwrite an existing YAML file" 65 | ) 66 | @click.argument("schema-name", type=str, nargs=-1) 67 | def generate_source_yamls_command( 68 | env: str, source_path: pathlib.Path, overwrite: bool, schema_name: Sequence[str] 69 | ) -> None: 70 | if len(schema_name) == 0: 71 | raise DataPipelinesError("Command expects at least one 'schema-name' argument") 72 | generate_source_yamls(env, source_path, overwrite, schema_name) 73 | -------------------------------------------------------------------------------- /data_pipelines_cli/cli_commands/generate/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pathlib 3 | import sys 4 | from typing import Any, Dict, Optional 5 | 6 | import yaml 7 | 8 | from ...cli_utils import echo_warning 9 | from ...dbt_utils import run_dbt_command 10 | from ...errors import DataPipelinesError 11 | 12 | 13 | def get_macro_run_output( 14 | env: str, macro_name: str, macro_args: Dict[str, str], profiles_path: pathlib.Path 15 | ) -> str: 16 | print_args = yaml.dump(macro_args, default_flow_style=True, width=sys.maxsize).rstrip() 17 | dbt_command_result_bytes = run_dbt_command( 18 | ("run-operation", macro_name, "--args", print_args), 19 | env, 20 | profiles_path, 21 | log_format_json=True, 22 | capture_output=True, 23 | ) 24 | decoded_output = dbt_command_result_bytes.stdout.decode(encoding=sys.stdout.encoding or "utf-8") 25 | for line in map(json.loads, decoded_output.splitlines()): 26 | if line.get("code") == "M011": 27 | return line["msg"] 28 | raise DataPipelinesError(f"No macro output found in the dbt output:\n{decoded_output}") 29 | 30 | 31 | def generate_models_or_sources_from_single_table( 32 | env: str, macro_name: str, macro_args: Dict[str, Any], profiles_path: pathlib.Path 33 | ) -> Dict[str, Any]: 34 | return yaml.safe_load(get_macro_run_output(env, macro_name, macro_args, profiles_path)) 35 | 36 | 37 | def get_output_file_or_warn_if_exists( 38 | directory: pathlib.Path, overwrite: bool, file_extension: str, filename: Optional[str] = None 39 | ) -> Optional[pathlib.Path]: 40 | output_path = directory.joinpath(f"{filename or directory.name}.{file_extension}") 41 | if output_path.exists(): 42 | if not overwrite: 43 | echo_warning( 44 | f"{str(output_path)} in directory {str(directory)} exists, it " 45 | "will not be overwritten. If you want to overwrite it, pass " 46 | "'--overwrite' flag." 47 | ) 48 | return None 49 | else: 50 | echo_warning( 51 | f"{str(output_path)} in directory {str(directory)} exists, it gets overwritten." 
52 | ) 53 | return output_path 54 | -------------------------------------------------------------------------------- /data_pipelines_cli/cli_commands/init.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import tempfile 3 | from typing import Optional, Sequence 4 | 5 | import click 6 | import copier 7 | import questionary 8 | import yaml 9 | 10 | from ..cli_constants import DEFAULT_GLOBAL_CONFIG, ENV_CONFIGURATION_PATH 11 | from ..data_structures import DataPipelinesConfig 12 | from ..errors import DataPipelinesError 13 | from ..vcs_utils import add_suffix_to_git_template_path 14 | 15 | 16 | def _download_global_config(config_path: str) -> DataPipelinesConfig: 17 | config_path = add_suffix_to_git_template_path(config_path) 18 | 19 | with tempfile.TemporaryDirectory() as tmp: 20 | copier.run_auto(config_path, tmp, quiet=True) 21 | with open(pathlib.Path(tmp).joinpath("dp.yml")) as config_file: 22 | config = yaml.safe_load(config_file) 23 | return config 24 | 25 | 26 | def init(config_path: Optional[str]) -> None: 27 | """ 28 | Configure the tool for the first time. 29 | 30 | :param config_path: URI of the repository with a template of the config file 31 | :type config_path: Optional[str] 32 | :raises DataPipelinesError: user do not want to overwrite existing config file 33 | """ 34 | if ENV_CONFIGURATION_PATH.is_file(): 35 | overwrite_confirm = questionary.confirm( 36 | "dp config already exists. Do you want to overwrite it?", 37 | default=False, 38 | ).ask() 39 | if not overwrite_confirm: 40 | raise DataPipelinesError("Could not overwrite existing config") 41 | 42 | if config_path: 43 | config = _download_global_config(config_path) 44 | else: 45 | config = DEFAULT_GLOBAL_CONFIG 46 | 47 | with open(ENV_CONFIGURATION_PATH, "w") as config_file: 48 | yaml.dump(config, config_file, default_flow_style=False) 49 | 50 | 51 | @click.command(name="init", help="Configure the tool for the first time") 52 | @click.argument("config_path", nargs=-1) 53 | def init_command(config_path: Sequence[str]) -> None: 54 | init(config_path[0] if config_path else None) 55 | -------------------------------------------------------------------------------- /data_pipelines_cli/cli_commands/prepare_env.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from typing import Dict 3 | 4 | import click 5 | import yaml 6 | 7 | from ..cli_utils import echo_subinfo 8 | from ..config_generation import DbtProfile, generate_profiles_dict 9 | from ..dbt_utils import read_dbt_vars_from_configs, run_dbt_command 10 | from ..jinja import replace_vars_with_values 11 | 12 | 13 | def prepare_env(env: str) -> None: 14 | """ 15 | Prepare local environment for use with dbt-related applications. 16 | 17 | Prepare local environment for use with applications expecting a "traditional" 18 | dbt structure, such as plugins to VS Code. If in doubt, use ``dp run`` and 19 | ``dp test`` instead. 
20 | 21 | :param env: Name of the environment 22 | :type env: str 23 | """ 24 | profile: Dict[str, DbtProfile] = replace_vars_with_values( 25 | generate_profiles_dict(env, True), read_dbt_vars_from_configs(env) 26 | ) 27 | 28 | home_profiles_path = pathlib.Path.home().joinpath(".dbt", "profiles.yml") 29 | home_profiles_path.parent.mkdir(parents=True, exist_ok=True) 30 | with open(home_profiles_path, "w") as profiles: 31 | yaml.dump(profile, profiles, default_flow_style=False) 32 | 33 | echo_subinfo(f"Saved profiles.yml in {home_profiles_path.parent}") 34 | run_dbt_command(("deps",), env, home_profiles_path.parent) 35 | 36 | 37 | @click.command( 38 | name="prepare-env", 39 | help="Prepare local environment for apps interfacing with dbt", 40 | ) 41 | @click.option("--env", default="local", type=str, help="Name of the environment") 42 | def prepare_env_command(env: str) -> None: 43 | prepare_env(env) 44 | -------------------------------------------------------------------------------- /data_pipelines_cli/cli_commands/run.py: -------------------------------------------------------------------------------- 1 | import click 2 | 3 | from ..config_generation import get_profiles_dir_build_path 4 | from ..dbt_utils import run_dbt_command 5 | from .compile import compile_project 6 | 7 | 8 | def run(env: str) -> None: 9 | """ 10 | Run the project on the local machine. 11 | 12 | :param env: Name of the environment 13 | :type env: str 14 | """ 15 | compile_project(env) 16 | profiles_path = get_profiles_dir_build_path(env) 17 | run_dbt_command(("run",), env, profiles_path) 18 | 19 | 20 | @click.command(name="run", help="Run the project on the local machine") 21 | @click.option( 22 | "--env", 23 | default="local", 24 | type=str, 25 | show_default=True, 26 | help="Name of the environment", 27 | ) 28 | def run_command(env: str) -> None: 29 | run(env) 30 | -------------------------------------------------------------------------------- /data_pipelines_cli/cli_commands/seed.py: -------------------------------------------------------------------------------- 1 | import click 2 | 3 | from ..config_generation import get_profiles_dir_build_path 4 | from ..dbt_utils import run_dbt_command 5 | from .compile import compile_project 6 | 7 | 8 | def seed(env: str) -> None: 9 | """ 10 | Run 'dbt seed' on the local machine.
11 | 12 | :param env: Name of the environment 13 | :type env: str 14 | """ 15 | compile_project(env) 16 | profiles_path = get_profiles_dir_build_path(env) 17 | run_dbt_command(("seed",), env, profiles_path) 18 | 19 | 20 | @click.command(name="seed", help="Run 'dbt seed'") 21 | @click.option( 22 | "--env", 23 | default="local", 24 | type=str, 25 | show_default=True, 26 | help="Name of the environment", 27 | ) 28 | def seed_command(env: str) -> None: 29 | seed(env) 30 | -------------------------------------------------------------------------------- /data_pipelines_cli/cli_commands/template.py: -------------------------------------------------------------------------------- 1 | import click 2 | import yaml 3 | 4 | from data_pipelines_cli.data_structures import read_env_config 5 | 6 | 7 | def list_templates() -> None: 8 | """Print a list of all templates saved in the config file.""" 9 | config = read_env_config() 10 | 11 | click.echo("AVAILABLE TEMPLATES:\n") 12 | for tc in config["templates"].values(): 13 | click.echo(yaml.dump(tc)) 14 | 15 | 16 | @click.command(name="template-list", help="Print a list of all templates saved in the config file") 17 | def list_templates_command() -> None: 18 | list_templates() 19 | -------------------------------------------------------------------------------- /data_pipelines_cli/cli_commands/test.py: -------------------------------------------------------------------------------- 1 | import click 2 | 3 | from ..config_generation import get_profiles_dir_build_path 4 | from ..dbt_utils import run_dbt_command 5 | from .compile import compile_project 6 | 7 | 8 | def test(env: str) -> None: 9 | """ 10 | Run tests of the project on the local machine. 11 | 12 | :param env: Name of the environment 13 | :type env: str 14 | """ 15 | compile_project(env) 16 | profiles_path = get_profiles_dir_build_path(env) 17 | run_dbt_command(("test",), env, profiles_path) 18 | 19 | 20 | @click.command(name="test", help="Run tests of the project on the local machine") 21 | @click.option( 22 | "--env", 23 | default="local", 24 | type=str, 25 | show_default=True, 26 | help="Name of the environment", 27 | ) 28 | def test_command(env: str) -> None: 29 | test(env) 30 | -------------------------------------------------------------------------------- /data_pipelines_cli/cli_commands/update.py: -------------------------------------------------------------------------------- 1 | import click 2 | import copier 3 | 4 | from data_pipelines_cli.cli_utils import echo_warning 5 | from data_pipelines_cli.errors import NotAProjectDirectoryError 6 | 7 | 8 | def update(project_path: str, vcs_ref: str) -> None: 9 | """ 10 | Update an existing project from its template. 
11 | 12 | :param project_path: Path to the project directory to update 13 | :type project_path: str 14 | :param vcs_ref: Git reference to check out in the project's template 15 | :type vcs_ref: str 16 | """ 17 | try: 18 | copier.run_auto(dst_path=project_path, vcs_ref=vcs_ref) 19 | except ValueError: 20 | raise NotAProjectDirectoryError(project_path) 21 | 22 | 23 | @click.command(name="update", help="Update project from its template") 24 | @click.argument("project-path", nargs=-1) 25 | @click.option("--vcs-ref", default="HEAD", type=str, help="Git reference to checkout") 26 | def update_command(project_path: str, vcs_ref: str) -> None: 27 | if len(project_path) > 1: 28 | echo_warning("dp expects at most 1 argument -- project-path") 29 | update(project_path[0] if project_path else ".", vcs_ref) 30 | -------------------------------------------------------------------------------- /data_pipelines_cli/cli_configs.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | from data_pipelines_cli.cli_constants import BUILD_DIR 4 | 5 | 6 | def find_datahub_config_file(env: str) -> pathlib.Path: 7 | if BUILD_DIR.joinpath("dag", "config", env, "datahub.yml").is_file(): 8 | return BUILD_DIR.joinpath("dag", "config", env, "datahub.yml") 9 | return BUILD_DIR.joinpath("dag", "config", "base", "datahub.yml") 10 | -------------------------------------------------------------------------------- /data_pipelines_cli/cli_constants.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | from data_pipelines_cli.data_structures import DataPipelinesConfig 4 | 5 | #: 6 | IMAGE_TAG_TO_REPLACE: str = "" 7 | #: Name of the environment and dbt target to use for a local machine 8 | PROFILE_NAME_LOCAL_ENVIRONMENT = "local" 9 | #: Name of the dbt target to use for a remote machine 10 | PROFILE_NAME_ENV_EXECUTION = "env_execution" 11 | AVAILABLE_ENVS = [PROFILE_NAME_LOCAL_ENVIRONMENT, PROFILE_NAME_ENV_EXECUTION] 12 | 13 | #: Content of the config file created by `dp init` command if no template path 14 | #: is provided 15 | DEFAULT_GLOBAL_CONFIG: DataPipelinesConfig = { 16 | "templates": {}, 17 | "vars": {}, 18 | } 19 | 20 | ENV_CONFIGURATION_PATH: pathlib.Path = pathlib.Path.home().joinpath(".dp.yml") 21 | BUILD_DIR: pathlib.Path = pathlib.Path.cwd().joinpath("build") 22 | 23 | 24 | def get_dbt_profiles_env_name(env: str) -> str: 25 | """ 26 | Given a name of the environment, returns one of the target names expected by 27 | the `profiles.yml` file. 28 | 29 | :param env: Name of the environment 30 | :type env: str 31 | :return: Name of the `target` to be used in `profiles.yml` 32 | """ 33 | return ( 34 | PROFILE_NAME_LOCAL_ENVIRONMENT 35 | if env == PROFILE_NAME_LOCAL_ENVIRONMENT 36 | else PROFILE_NAME_ENV_EXECUTION 37 | ) 38 | -------------------------------------------------------------------------------- /data_pipelines_cli/cli_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import subprocess 5 | import sys 6 | from typing import Any, List, Optional 7 | 8 | import click 9 | 10 | from data_pipelines_cli.errors import ( 11 | DataPipelinesError, 12 | SubprocessNonZeroExitError, 13 | SubprocessNotFound, 14 | ) 15 | 16 | 17 | def echo_error(text: str, **kwargs: Any) -> None: 18 | """ 19 | Print an error message to stderr using click-specific print function.
20 | 21 | :param text: Message to print 22 | :type text: str 23 | :param kwargs: 24 | """ 25 | click.secho(text, file=sys.stderr, fg="red", bold=True, **kwargs) 26 | 27 | 28 | def echo_suberror(text: str, **kwargs: Any) -> None: 29 | """ 30 | Print a suberror message to stderr using click-specific print function. 31 | 32 | :param text: Message to print 33 | :type text: str 34 | :param kwargs: 35 | """ 36 | click.secho(text, file=sys.stderr, fg="bright_red", **kwargs) 37 | 38 | 39 | def echo_warning(text: str, **kwargs: Any) -> None: 40 | """ 41 | Print a warning message to stderr using click-specific print function. 42 | 43 | :param text: Message to print 44 | :type text: str 45 | :param kwargs: 46 | """ 47 | click.secho(text, file=sys.stderr, fg="yellow", **kwargs) 48 | 49 | 50 | def echo_info(text: str, **kwargs: Any) -> None: 51 | """ 52 | Print a message to stdout using click-specific print function. 53 | 54 | :param text: Message to print 55 | :type text: str 56 | :param kwargs: 57 | """ 58 | click.secho(text, fg="blue", bold=True, **kwargs) 59 | 60 | 61 | def echo_subinfo(text: str, **kwargs: Any) -> None: 62 | """ 63 | Print a subinfo message to stdout using click-specific print function. 64 | 65 | :param text: Message to print 66 | :type text: str 67 | :param kwargs: 68 | """ 69 | click.secho(text, fg="bright_blue", **kwargs) 70 | 71 | 72 | def get_argument_or_environment_variable( 73 | argument: Optional[str], argument_name: str, environment_variable_name: str 74 | ) -> str: 75 | """ 76 | Given *argument* is not ``None``, return its value. Otherwise, search 77 | for *environment_variable_name* amongst environment variables and return 78 | it. If such a variable is not set, raise :exc:`.DataPipelinesError`. 79 | 80 | :param argument: Optional value passed to the CLI as the *argument_name* 81 | :type argument: Optional[str] 82 | :param argument_name: Name of the CLI's argument 83 | :type argument_name: str 84 | :param environment_variable_name: Name of the environment variable to search for 85 | :type environment_variable_name: str 86 | :return: Value of the *argument* or specified environment variable 87 | :raises DataPipelinesError: *argument* is ``None`` and \ 88 | *environment_variable_name* is not set 89 | """ 90 | result = argument or os.environ.get(environment_variable_name) 91 | if not result: 92 | raise DataPipelinesError( 93 | f"Could not get {environment_variable_name}. Either set it as an " 94 | f"environment variable {environment_variable_name} or pass as a " 95 | f"`--{argument_name}` CLI argument." 96 | ) 97 | return result 98 | 99 | 100 | def subprocess_run( 101 | args: List[str], capture_output: bool = False 102 | ) -> subprocess.CompletedProcess[bytes]: 103 | """ 104 | Run subprocess and return its state if completed with a success. If not, 105 | raise :exc:`.SubprocessNonZeroExitError`. 106 | 107 | :param args: List of strings representing subprocess and its arguments 108 | :type args: List[str] 109 | :param capture_output: Whether to capture output of subprocess. 
110 | :type capture_output: bool 111 | :return: State of the completed process 112 | :rtype: subprocess.CompletedProcess[bytes] 113 | :raises SubprocessNonZeroExitError: subprocess exited with non-zero exit code 114 | """ 115 | try: 116 | return subprocess.run(args, check=True, capture_output=capture_output) 117 | except FileNotFoundError: 118 | raise SubprocessNotFound(args[0]) 119 | except subprocess.CalledProcessError as err: 120 | raise SubprocessNonZeroExitError( 121 | args[0], 122 | err.returncode, 123 | err.output.decode(encoding=sys.stdout.encoding or "utf-8") if err.output else None, 124 | ) 125 | -------------------------------------------------------------------------------- /data_pipelines_cli/data_structures.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from typing import Any, Dict, List, Optional 3 | 4 | import yaml 5 | 6 | from data_pipelines_cli.cli_utils import echo_warning 7 | from data_pipelines_cli.errors import DataPipelinesError, NoConfigFileError 8 | from data_pipelines_cli.io_utils import git_revision_hash 9 | 10 | if sys.version_info >= (3, 8): 11 | from typing import TypedDict # pylint: disable=no-name-in-module 12 | else: 13 | from typing_extensions import TypedDict 14 | 15 | 16 | class TemplateConfig(TypedDict): 17 | """ 18 | POD representing value referenced in the `templates` section of 19 | the `.dp.yml` config file. 20 | """ 21 | 22 | template_name: str 23 | """Name of the template""" 24 | template_path: str 25 | """Local path or Git URI to the template repository""" 26 | 27 | 28 | class DataPipelinesConfig(TypedDict): 29 | """POD representing `.dp.yml` config file.""" 30 | 31 | templates: Dict[str, TemplateConfig] 32 | """Dictionary of saved templates to use in `dp create` command""" 33 | vars: Dict[str, str] 34 | """Variables to be passed to dbt as `--vars` argument""" 35 | 36 | 37 | def read_env_config() -> DataPipelinesConfig: 38 | """ 39 | Parse `.dp.yml` config file, if it exists. Otherwise, raises 40 | :exc:`.NoConfigFileError`. 41 | 42 | :return: POD representing `.dp.yml` config file, if it exists 43 | :rtype: DataPipelinesConfig 44 | :raises NoConfigFileError: `.dp.yml` file not found 45 | """ 46 | # Avoiding a dependency loop between `cli_constants` and `data_structures` 47 | from data_pipelines_cli.cli_constants import ENV_CONFIGURATION_PATH 48 | 49 | if not ENV_CONFIGURATION_PATH.is_file(): 50 | echo_warning( 51 | "No configuration file found. Run 'dp init' to create it.", 52 | ) 53 | raise NoConfigFileError() 54 | 55 | with open(ENV_CONFIGURATION_PATH, "r") as f: 56 | return yaml.safe_load(f) 57 | 58 | 59 | class DockerArgs: 60 | """Arguments required by the Docker to make a push to the repository. 61 | 62 | :raises DataPipelinesError: *repository* variable not set or git hash not found 63 | """ 64 | 65 | repository: str 66 | """URI of the Docker images repository""" 67 | image_tag: str 68 | """An image tag""" 69 | build_args: Dict[str, str] 70 | 71 | def __init__(self, env: str, image_tag: Optional[str], build_args: Dict[str, str]) -> None: 72 | self.repository = self._get_docker_repository_uri_from_k8s_config(env) 73 | self.image_tag = self._get_image_tag_from_k8s_config(env, image_tag) 74 | self.build_args = build_args 75 | 76 | def docker_build_tag(self) -> str: 77 | """ 78 | Prepare a tag for Docker Python API build command. 
79 | 80 | :return: Tag for Docker Python API build command 81 | :rtype: str 82 | """ 83 | return f"{self.repository}:{self.image_tag}" 84 | 85 | def _get_docker_repository_uri_from_k8s_config(self, env: str) -> str: 86 | return self._get_docker_image_variable_from_k8s_config("repository", env) 87 | 88 | def _get_image_tag_from_k8s_config(self, env: str, image_tag: Optional[str]) -> str: 89 | from data_pipelines_cli.cli_constants import IMAGE_TAG_TO_REPLACE 90 | 91 | config_tag = image_tag or self._get_docker_image_variable_from_k8s_config("tag", env) 92 | if config_tag != IMAGE_TAG_TO_REPLACE: 93 | return config_tag 94 | 95 | commit_sha = git_revision_hash() 96 | if not commit_sha: 97 | echo_warning("Could not get git revision hash.") 98 | commit_sha = "None" 99 | return commit_sha 100 | 101 | @staticmethod 102 | def _get_docker_image_variable_from_k8s_config(key: str, env: str) -> str: 103 | # Avoiding a dependency loop between `cli_constants` and `data_structures` 104 | from data_pipelines_cli.cli_constants import BUILD_DIR 105 | from data_pipelines_cli.config_generation import ( 106 | read_dictionary_from_config_directory, 107 | ) 108 | 109 | execution_env_config = read_dictionary_from_config_directory( 110 | BUILD_DIR.joinpath("dag"), env, "execution_env.yml" 111 | ) 112 | try: 113 | return execution_env_config["image"][key] 114 | except KeyError as key_error: 115 | raise DataPipelinesError( 116 | f"Could not find '{key}' variable in build/config/{env}/execution_env.yml." 117 | ) from key_error 118 | 119 | 120 | class DbtTableColumn(TypedDict, total=False): 121 | """POD representing a single column from 'schema.yml' file.""" 122 | 123 | name: str 124 | description: str 125 | meta: Dict[str, Any] 126 | quote: bool 127 | tests: List[str] 128 | tags: List[str] 129 | 130 | 131 | class DbtModel(TypedDict, total=False): 132 | """POD representing a single model from 'schema.yml' file.""" 133 | 134 | name: str 135 | description: str 136 | meta: Dict[str, Any] 137 | identifier: str 138 | tests: List[str] 139 | tags: List[str] 140 | columns: List[DbtTableColumn] 141 | 142 | 143 | class DbtSource(TypedDict, total=False): 144 | """POD representing a single source from 'schema.yml' file.""" 145 | 146 | name: str 147 | description: str 148 | database: str 149 | schema: str 150 | meta: Dict[str, Any] 151 | tags: List[str] 152 | tables: List[DbtModel] 153 | -------------------------------------------------------------------------------- /data_pipelines_cli/dbt_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pathlib 4 | import subprocess 5 | import sys 6 | from typing import Any, Dict, Tuple 7 | 8 | import yaml 9 | 10 | from .cli_constants import BUILD_DIR, get_dbt_profiles_env_name 11 | from .cli_utils import echo_subinfo, subprocess_run 12 | from .config_generation import read_dictionary_from_config_directory 13 | from .data_structures import DataPipelinesConfig, read_env_config 14 | from .errors import NoConfigFileError 15 | 16 | 17 | def read_dbt_vars_from_configs(env: str) -> Dict[str, Any]: 18 | """Read `vars` field from dp configuration file (``$HOME/.dp.yml``), base 19 | ``dbt.yml`` config (``config/base/dbt.yml``) and environment-specific config 20 | (``config/{env}/dbt.yml``) and compile into one dictionary. 
21 | 22 | :param env: Name of the environment 23 | :type env: str 24 | :return: Dictionary with `vars` and their keys 25 | :rtype: Dict[str, Any] 26 | """ 27 | dbt_env_config = read_dictionary_from_config_directory( 28 | BUILD_DIR.joinpath("dag"), env, "dbt.yml" 29 | ) 30 | 31 | try: 32 | dp_config = read_env_config() 33 | except NoConfigFileError: 34 | dp_config = DataPipelinesConfig(templates={}, vars={}) 35 | dp_vars = dp_config.get("vars", {}) 36 | dbt_vars: Dict[str, str] = dbt_env_config.get("vars", {}) 37 | 38 | return dict(dbt_vars, **dp_vars) 39 | 40 | 41 | def _dump_dbt_vars_from_configs_to_string(env: str) -> str: 42 | dbt_vars = read_dbt_vars_from_configs(env) 43 | return yaml.dump(dbt_vars, default_flow_style=True, width=sys.maxsize) 44 | 45 | 46 | def run_dbt_command( 47 | command: Tuple[str, ...], 48 | env: str, 49 | profiles_path: pathlib.Path, 50 | log_format_json: bool = False, 51 | capture_output: bool = False, 52 | ) -> subprocess.CompletedProcess[bytes]: 53 | """ 54 | Run dbt subprocess in a context of specified *env*. 55 | 56 | :param command: Tuple representing dbt command and its optional arguments 57 | :type command: Tuple[str, ...] 58 | :param env: Name of the environment 59 | :type env: str 60 | :param profiles_path: Path to the directory containing `profiles.yml` file 61 | :type profiles_path: pathlib.Path 62 | :param log_format_json: Whether to run dbt command with `--log-format=json` flag 63 | :type log_format_json: bool 64 | :param capture_output: Whether to capture stdout of subprocess. 65 | :type capture_output: bool 66 | :return: State of the completed process 67 | :rtype: subprocess.CompletedProcess[bytes] 68 | :raises SubprocessNotFound: dbt not installed 69 | :raises SubprocessNonZeroExitError: dbt exited with error 70 | """ 71 | command_str = " ".join(list(command)) 72 | echo_subinfo(f"dbt {command_str}") 73 | 74 | dbt_env_config = read_dictionary_from_config_directory( 75 | BUILD_DIR.joinpath("dag"), env, "dbt.yml" 76 | ) 77 | dbt_vars = _dump_dbt_vars_from_configs_to_string(env) 78 | return subprocess_run( 79 | [ 80 | "dbt", 81 | *(["--log-format=json"] if log_format_json else []), 82 | *command, 83 | "--profile", 84 | dbt_env_config["target_type"], 85 | "--profiles-dir", 86 | str(profiles_path), 87 | "--target", 88 | get_dbt_profiles_env_name(env), 89 | "--vars", 90 | dbt_vars, 91 | ], 92 | capture_output=capture_output, 93 | ) 94 | -------------------------------------------------------------------------------- /data_pipelines_cli/docker_response_reader.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Dict, Iterable, List, Optional, Union, cast 3 | 4 | import click 5 | 6 | from data_pipelines_cli.errors import DockerErrorResponseError 7 | 8 | 9 | class DockerReadResponse: 10 | """POD representing Docker response processed by :class:`DockerResponseReader`.""" 11 | 12 | msg: str 13 | """Read and processed message""" 14 | is_error: bool 15 | """Whether response is error or not""" 16 | 17 | def __init__(self, msg: str, is_error: bool) -> None: 18 | self.msg = msg 19 | self.is_error = is_error 20 | 21 | def __str__(self) -> str: 22 | return self.msg 23 | 24 | 25 | class DockerResponseReader: 26 | """ 27 | Read and process Docker response. 28 | 29 | Docker response turns into processed strings instead of plain dictionaries. 
30 | """ 31 | 32 | logs_generator: Iterable[Union[str, Dict[str, Union[str, Dict[str, str]]]]] 33 | """Iterable representing Docker response""" 34 | cached_read_response: Optional[List[DockerReadResponse]] 35 | """Internal cache of already processed response""" 36 | 37 | def __init__( 38 | self, 39 | logs_generator: Iterable[Union[str, Dict[str, Union[str, Dict[str, str]]]]], 40 | ): 41 | self.logs_generator = logs_generator 42 | self.cached_read_response = None 43 | 44 | def read_response(self) -> List[DockerReadResponse]: 45 | """ 46 | Read and process Docker response. 47 | 48 | :return: List of processed lines of response 49 | :rtype: List[DockerReadResponse] 50 | """ 51 | to_return = [] 52 | 53 | for log in self.logs_generator: 54 | if isinstance(log, str): 55 | log = json.loads(log) 56 | log = cast(Dict[str, Union[str, Dict[str, str]]], log) 57 | 58 | if "status" in log: 59 | to_return.append(self._prepare_status(log)) 60 | if "stream" in log: 61 | to_return += self._prepare_stream(log) 62 | if "aux" in log: 63 | to_return += self._prepare_aux(log) 64 | 65 | if "errorDetail" in log: 66 | to_return.append(self._prepare_error_detail(log)) 67 | elif "error" in log: 68 | to_return.append(self._prepare_error(log)) 69 | 70 | self.cached_read_response = to_return 71 | return to_return 72 | 73 | def click_echo_ok_responses(self) -> None: 74 | """Read, process and print positive Docker updates. 75 | 76 | :raises DockerErrorResponseError: Came across error update in Docker response. 77 | """ 78 | read_response = self.cached_read_response or self.read_response() 79 | 80 | for response in read_response: 81 | if response.is_error: 82 | raise DockerErrorResponseError(response.msg) 83 | click.echo(response.msg) 84 | 85 | @staticmethod 86 | def _prepare_status(log: Dict[str, Union[str, Dict[str, str]]]) -> DockerReadResponse: 87 | status_message = cast(str, log["status"]) 88 | progress_detail = cast(str, log.get("progressDetail", "")) 89 | status_id = cast(str, log.get("id", "")) 90 | message = ( 91 | status_message 92 | + (f" ({status_id})" if status_id else "") 93 | + (f": {progress_detail}" if progress_detail else "") 94 | ) 95 | 96 | return DockerReadResponse(message, False) 97 | 98 | @staticmethod 99 | def _prepare_stream(log: Dict[str, Union[str, Dict[str, str]]]) -> List[DockerReadResponse]: 100 | stream = cast(str, log["stream"]) 101 | return [ 102 | DockerReadResponse(line, False) for line in filter(lambda x: x, stream.splitlines()) 103 | ] 104 | 105 | @staticmethod 106 | def _prepare_aux(log: Dict[str, Union[str, Dict[str, str]]]) -> List[DockerReadResponse]: 107 | aux = cast(Dict[str, str], log["aux"]) 108 | to_return = [] 109 | if "Digest" in aux: 110 | to_return.append(DockerReadResponse(f"Digest: {aux['Digest']}", False)) 111 | if "ID" in aux: 112 | to_return.append(DockerReadResponse(f"ID: {aux['ID']}", False)) 113 | return to_return 114 | 115 | @staticmethod 116 | def _prepare_error_detail(log: Dict[str, Union[str, Dict[str, str]]]) -> DockerReadResponse: 117 | error_detail = cast(Dict[str, str], log["errorDetail"]) 118 | error_message = error_detail.get("message", "") 119 | error_code = error_detail.get("code", None) 120 | return DockerReadResponse( 121 | "ERROR: " + error_message + (f"\nError code: {error_code}" if error_code else ""), 122 | True, 123 | ) 124 | 125 | @staticmethod 126 | def _prepare_error(log: Dict[str, Union[str, Dict[str, str]]]) -> DockerReadResponse: 127 | return DockerReadResponse("ERROR: " + cast(str, log["error"]), True) 128 | 
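The class above is typically fed with the log stream produced by the Docker client during an image build or push. The following is a minimal illustrative sketch (not part of the original module): it assumes a docker-py low-level ``APIClient`` whose ``build`` call yields decoded JSON log dictionaries; the client construction, build path, and image tag are hypothetical placeholder values.

    import docker

    from data_pipelines_cli.docker_response_reader import DockerResponseReader
    from data_pipelines_cli.errors import DockerErrorResponseError

    api_client = docker.APIClient()  # hypothetical: talks to the local Docker daemon
    logs = api_client.build(path=".", tag="registry.example.com/project:dev", decode=True)
    try:
        # Echo processed status/stream/aux lines; raises on an error response
        DockerResponseReader(logs).click_echo_ok_responses()
    except DockerErrorResponseError as err:
        print(err.message)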
-------------------------------------------------------------------------------- /data_pipelines_cli/errors.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | 4 | class DataPipelinesError(Exception): 5 | """Base class for all exceptions in data_pipelines_cli module""" 6 | 7 | message: str 8 | """explanation of the error""" 9 | submessage: Optional[str] 10 | """additional information about the error""" 11 | 12 | def __init__(self, message: str, submessage: Optional[str] = None) -> None: 13 | self.message = message 14 | self.submessage = submessage 15 | 16 | 17 | class DependencyNotInstalledError(DataPipelinesError): 18 | """Exception raised if a certain dependency is not installed""" 19 | 20 | def __init__(self, program_name: str) -> None: 21 | super().__init__( 22 | f"'{program_name}' not installed. Run 'pip install " 23 | f"data-pipelines-cli[{program_name}]'" 24 | ) 25 | 26 | 27 | class NoConfigFileError(DataPipelinesError): 28 | """Exception raised if `.dp.yml` does not exist""" 29 | 30 | def __init__(self) -> None: 31 | super().__init__("`.dp.yml` config file does not exist. Run 'dp init' to create it.") 32 | 33 | 34 | class NotAProjectDirectoryError(DataPipelinesError): 35 | """Exception raised if the `.copier-answers.yml` file does not exist in the given directory""" 36 | 37 | def __init__(self, project_path: str) -> None: 38 | super().__init__( 39 | f"Given path {project_path} is not a data-pipelines project directory." 40 | " Run 'dp create' first to create a project" 41 | ) 42 | 43 | 44 | class SubprocessNonZeroExitError(DataPipelinesError): 45 | """Exception raised if a subprocess exits with a non-zero exit code""" 46 | 47 | def __init__( 48 | self, subprocess_name: str, exit_code: int, subprocess_output: Optional[str] = None 49 | ) -> None: 50 | super().__init__( 51 | f"{subprocess_name} has exited with non-zero exit code: {exit_code}", 52 | submessage=subprocess_output, 53 | ) 54 | 55 | 56 | class SubprocessNotFound(DataPipelinesError): 57 | """Exception raised if a subprocess cannot be found""" 58 | 59 | def __init__(self, subprocess_name: str) -> None: 60 | super().__init__( 61 | f"{subprocess_name} cannot be found. " 62 | "Ensure it is installed and listed in your $PATH." 63 | ) 64 | 65 | 66 | class DockerNotInstalledError(DependencyNotInstalledError): 67 | """Exception raised if 'docker' is not installed""" 68 | 69 | def __init__(self) -> None: 70 | super().__init__("docker") 71 | 72 | 73 | class JinjaVarKeyError(DataPipelinesError): 74 | def __init__(self, key: str) -> None: 75 | super().__init__( 76 | f"Variable '{key}' cannot be found in 'dbt.yml' vars, '$HOME/.dp.yml' " 77 | "vars, or environment variables, causing Jinja " 78 | "template rendering to fail."
79 | ) 80 | 81 | 82 | class AirflowDagsPathKeyError(DataPipelinesError): 83 | """Exception raised if there is no ``dags_path`` in `airflow.yml` file.""" 84 | 85 | def __init__(self) -> None: 86 | super().__init__("Variable 'dags_path' cannot be found in 'airflow.yml' config file.") 87 | 88 | 89 | class DockerErrorResponseError(DataPipelinesError): 90 | """Exception raised if there is an error response from Docker client.""" 91 | 92 | def __init__(self, error_msg: str) -> None: 93 | super().__init__("Error raised when using Docker.\n" + error_msg) 94 | 95 | 96 | class NotSuppertedBIError(DataPipelinesError): 97 | """Exception raised if there is no ``target_id`` in `bi.yml`""" 98 | 99 | def __init__(self) -> None: 100 | super().__init__( 101 | "Variable 'target_id' cannot be found in 'bi.yml' " 102 | "config file, or its value does not match any supported solution." 103 | ) 104 | -------------------------------------------------------------------------------- /data_pipelines_cli/filesystem_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import pathlib 5 | from typing import Dict, Set, Union 6 | 7 | import fsspec 8 | from fsspec import AbstractFileSystem 9 | 10 | from .cli_utils import echo_subinfo 11 | from .errors import DataPipelinesError 12 | 13 | 14 | class LocalRemoteSync: 15 | """Synchronizes a local directory with its cloud storage counterpart.""" 16 | 17 | local_fs: AbstractFileSystem 18 | """FS representing local directory""" 19 | local_path_str: str 20 | """Path to local directory""" 21 | remote_path_str: str 22 | """Path/URI of the cloud storage directory""" 23 | _local_directory_suffixes: Set[str] 24 | 25 | def __init__( 26 | self, 27 | local_path: Union[str, os.PathLike[str]], 28 | remote_path: str, 29 | remote_kwargs: Dict[str, str], 30 | ) -> None: 31 | if not pathlib.Path(local_path).exists(): 32 | raise DataPipelinesError(f"{local_path} does not exist. Run 'dp compile' first.") 33 | 34 | self.local_path_str = str(local_path).rstrip("/") 35 | self.local_fs = fsspec.filesystem("file") 36 | self.remote_fs, self.remote_path_str = fsspec.core.url_to_fs( 37 | remote_path.rstrip("/"), **remote_kwargs 38 | ) 39 | self._local_directory_suffixes = set() 40 | 41 | def sync(self, delete: bool = True) -> None: 42 | """ 43 | Send local files to the remote directory and (optionally) delete 44 | unnecessary ones. 45 | 46 | :param delete: Whether to delete remote files that are \ 47 | no longer present in the local directory 48 | :type delete: bool 49 | """ 50 | self._push_sync() 51 | if delete: 52 | self._delete() 53 | 54 | def _push_sync(self) -> None: 55 | """Push every file to the remote.""" 56 | 57 | # TODO: Is it "lazy" (checking what to update) or not?
58 | local_directory = self.local_fs.find(self.local_path_str) 59 | self._local_directory_suffixes = set() 60 | for local_file in local_directory: 61 | local_file_suffix = local_file[len(self.local_path_str) :] 62 | self._local_directory_suffixes.add(local_file_suffix) 63 | remote_path_with_suffix = self.remote_path_str + local_file_suffix 64 | echo_subinfo(f"- Pushing {str(local_file)} to {remote_path_with_suffix}") 65 | self.remote_fs.put_file(local_file, remote_path_with_suffix) 66 | 67 | def _delete(self) -> None: 68 | """Remove every file from remote that's not local.""" 69 | remote_directory = self.remote_fs.find(self.remote_path_str) 70 | for remote_file in remote_directory: 71 | remote_file_suffix = remote_file[len(self.remote_path_str) :] 72 | if remote_file_suffix not in self._local_directory_suffixes: 73 | self.remote_fs.rm(remote_file) 74 | -------------------------------------------------------------------------------- /data_pipelines_cli/io_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import re 5 | import shutil 6 | import subprocess 7 | import sys 8 | import tempfile 9 | from typing import Optional, Union 10 | 11 | import click 12 | 13 | 14 | # Python's `sed` equivalent, based on the following answer: 15 | # https://stackoverflow.com/a/31499114 16 | def replace(filename: Union[str, os.PathLike[str]], pattern: str, replacement: str) -> None: 17 | """ 18 | Perform the pure-Python equivalent of in-place `sed` substitution: e.g., 19 | ``sed -i -e 's/'${pattern}'/'${replacement}' "${filename}"``. 20 | 21 | Beware however, it uses Python regex dialect instead of `sed`'s one. 22 | It can introduce regex-related bugs. 23 | """ 24 | 25 | # For efficiency, precompile the passed regular expression. 26 | pattern_compiled = re.compile(pattern) 27 | 28 | # For portability, NamedTemporaryFile() defaults to mode "w+b" (i.e., 29 | # binary writing with updating). This is usually a good thing. In this 30 | # case, however, binary writing imposes non-trivial encoding constraints 31 | # trivially resolved by switching to text writing. Let's do that. 32 | with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp_file: 33 | with open(filename) as src_file: 34 | for line in src_file: 35 | tmp_file.write(pattern_compiled.sub(replacement, line)) 36 | 37 | # Overwrite the original file with the munged temporary file in a 38 | # manner preserving file attributes (e.g., permissions). 39 | shutil.copystat(filename, tmp_file.name) 40 | shutil.move(tmp_file.name, filename) 41 | 42 | 43 | def git_revision_hash() -> Optional[str]: 44 | """ 45 | Get current Git revision hash, if Git is installed and any revision exists. 46 | 47 | :return: Git revision hash, if possible. 48 | :rtype: Optional[str] 49 | """ 50 | try: 51 | rev_process = subprocess.run(["git", "rev-parse", "HEAD"], check=True, capture_output=True) 52 | return rev_process.stdout.decode("ascii").strip() 53 | except FileNotFoundError: 54 | click.echo( 55 | "The tool has run across an error when trying to get Git " 56 | "revision hash. Ensure you have `git` installed", 57 | file=sys.stderr, 58 | ) 59 | return None 60 | except subprocess.CalledProcessError as err: 61 | click.echo( 62 | "The tool has run across a following error when trying to get Git " 63 | "revision hash. 
Ensure your project is a Git repository (run 'git " 64 | "init', if not).", 65 | file=sys.stderr, 66 | ) 67 | click.echo(err.stderr, file=sys.stderr) 68 | return None 69 | -------------------------------------------------------------------------------- /data_pipelines_cli/jinja.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Any, Dict 3 | 4 | from jinja2 import BaseLoader, DictLoader 5 | from jinja2.nativetypes import NativeEnvironment 6 | 7 | from data_pipelines_cli.errors import JinjaVarKeyError 8 | 9 | 10 | def _prepare_jinja_replace_environment( 11 | jinja_loader: BaseLoader, dbt_vars: Dict[str, Any] 12 | ) -> NativeEnvironment: 13 | def _jinja_vars(var_name: str) -> Any: 14 | return dbt_vars[var_name] 15 | 16 | def _jinja_env_vars(var_name: str) -> Any: 17 | return os.environ[var_name] 18 | 19 | jinja_env = NativeEnvironment(loader=jinja_loader) 20 | # Hacking Jinja to use our functions, following: 21 | # https://stackoverflow.com/a/6038550 22 | jinja_env.globals["var"] = _jinja_vars 23 | jinja_env.globals["env_var"] = _jinja_env_vars 24 | 25 | return jinja_env 26 | 27 | 28 | def replace_vars_with_values( 29 | templated_dictionary: Dict[str, Any], dbt_vars: Dict[str, Any] 30 | ) -> Dict[str, Any]: 31 | """ 32 | Replace variables in given dictionary using Jinja template in its values. 33 | 34 | :param templated_dictionary: Dictionary with Jinja-templated values 35 | :type templated_dictionary: Dict[str, Any] 36 | :param dbt_vars: Variables to replace 37 | :type dbt_vars: Dict[str, Any] 38 | :return: Dictionary with replaced variables 39 | :rtype: Dict[str, Any] 40 | :raises JinjaVarKeyError: Variable referenced in Jinja template does not exist 41 | """ 42 | jinja_loader = DictLoader(templated_dictionary) 43 | jinja_env = _prepare_jinja_replace_environment(jinja_loader, dbt_vars) 44 | 45 | rendered_settings = {} 46 | for setting_key, setting_old_value in templated_dictionary.items(): 47 | if isinstance(setting_old_value, dict): 48 | rendered_settings[setting_key] = replace_vars_with_values(setting_old_value, dbt_vars) 49 | else: 50 | try: 51 | rendered_settings[setting_key] = jinja_env.get_template(setting_key).render() 52 | except TypeError: 53 | # Jinja is accepting only str or Template and fails on int, etc. 54 | rendered_settings[setting_key] = setting_old_value 55 | except KeyError as key_error: 56 | # Variable does not exist and _jinja_vars or _jinja_env_vars thrown 57 | raise JinjaVarKeyError(key_error.args[0]) 58 | return rendered_settings 59 | -------------------------------------------------------------------------------- /data_pipelines_cli/looker_utils.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import pathlib 4 | from shutil import copy, rmtree 5 | from typing import Any, Dict, Tuple 6 | 7 | import requests 8 | import yaml 9 | from git import Repo 10 | 11 | from .cli_constants import BUILD_DIR 12 | from .cli_utils import echo_info, subprocess_run 13 | from .config_generation import ( 14 | generate_profiles_yml, 15 | read_dictionary_from_config_directory, 16 | ) 17 | from .dbt_utils import run_dbt_command 18 | 19 | LOOKML_DEST_PATH: pathlib.Path = BUILD_DIR.joinpath("lookml") 20 | LOOKML_VIEWS_SUBDIR: str = "views" 21 | 22 | 23 | def read_looker_config(env: str) -> Dict[str, Any]: 24 | """ 25 | Read Looker configuration. 
26 | 27 | :param env: Name of the environment 28 | :type env: str 29 | :return: Compiled dictionary 30 | :rtype: Dict[str, Any] 31 | """ 32 | return read_dictionary_from_config_directory(BUILD_DIR.joinpath("dag"), env, "looker.yml") 33 | 34 | 35 | def generate_lookML_model() -> None: 36 | """ 37 | Generate lookML code based on the compiled dbt project. 38 | """ 39 | subprocess_run(["dbt2looker", "--output-dir", str(LOOKML_DEST_PATH)]) 40 | 41 | 42 | def deploy_lookML_model(key_path: str, env: str) -> None: 43 | """ 44 | Write compiled lookML to Looker's repository and deploy the project to production. 45 | 46 | :param key_path: Path to the key with write access to the git repository 47 | :type key_path: str 48 | :param env: Name of the environment 49 | :type env: str 50 | """ 51 | profiles_path = generate_profiles_yml(env, False) 52 | run_dbt_command(("docs", "generate"), env, profiles_path) 53 | 54 | looker_config = read_looker_config(env) 55 | local_repo_path = BUILD_DIR.joinpath("looker_project_repo") 56 | 57 | if local_repo_path.exists(): 58 | echo_info(f"Removing {local_repo_path}") 59 | rmtree(local_repo_path) 60 | 61 | ssh_command_with_key = f"ssh -i {key_path}" 62 | repo = Repo.clone_from( 63 | looker_config["looker_repository"], 64 | local_repo_path, 65 | branch=looker_config["looker_repository_branch"], 66 | env={"GIT_SSH_COMMAND": ssh_command_with_key}, 67 | ) 68 | 69 | project_name, project_version = _get_project_name_and_version() 70 | with repo.git.custom_environment(GIT_SSH_COMMAND=ssh_command_with_key): 71 | _prepare_repo_changes(LOOKML_DEST_PATH, local_repo_path) 72 | _configure_git_env(repo, looker_config) 73 | _commit_and_push_changes(repo, project_name, project_version) 74 | _deploy_looker_project_to_production( 75 | looker_config["looker_instance_url"], 76 | looker_config["looker_project_id"], 77 | looker_config["looker_repository_branch"], 78 | looker_config["looker_webhook_secret"], 79 | ) 80 | 81 | 82 | def _prepare_repo_changes(src: pathlib.Path, local_repo_gen_path: pathlib.Path) -> None: 83 | _clear_repo_before_writing_lookml(local_repo_gen_path) 84 | _copy_all_files_by_extention(src, local_repo_gen_path, "model.lkml") 85 | _copy_all_files_by_extention( 86 | src, local_repo_gen_path.joinpath(LOOKML_VIEWS_SUBDIR), "view.lkml" 87 | ) 88 | 89 | with open(f"{local_repo_gen_path}/readme.txt", "w") as readme: 90 | readme.write( 91 | """models and views with extension '.dp.[view|model].lkml' are generated by data-pipelines-cli. 92 | Do not edit manually! Your changes could be overwritten!
93 | """ 94 | ) 95 | 96 | 97 | def _clear_repo_before_writing_lookml(local_repo_gen_path: pathlib.Path) -> None: 98 | if local_repo_gen_path.exists(): 99 | _remove_dp_files_from_repo(local_repo_gen_path, ".dp.model.lkml") 100 | 101 | if local_repo_gen_path.joinpath(LOOKML_VIEWS_SUBDIR).exists(): 102 | _remove_dp_files_from_repo( 103 | local_repo_gen_path.joinpath(LOOKML_VIEWS_SUBDIR), ".dp.view.lkml" 104 | ) 105 | 106 | 107 | def _remove_dp_files_from_repo(dir_path: pathlib.Path, files_extention: str) -> None: 108 | for file in os.listdir(dir_path): 109 | if file.endswith(files_extention): 110 | os.remove(dir_path.joinpath(file)) 111 | 112 | 113 | def _configure_git_env(repo: Repo, config: Dict[str, Any]) -> None: 114 | repo.config_writer().set_value("user", "name", config["looker_repository_username"]).release() 115 | repo.config_writer().set_value("user", "email", config["looker_repository_email"]).release() 116 | 117 | 118 | def _copy_all_files_by_extention( 119 | src: pathlib.Path, dest: pathlib.Path, files_extention: str 120 | ) -> None: 121 | os.makedirs(dest, exist_ok=True) 122 | for file_path in glob.glob(os.path.join(src, "**", "*." + files_extention), recursive=True): 123 | file_path_with_dp_ext = "{0}.dp.{1}.{2}".format(*file_path.rsplit(".", 2)) 124 | new_path = os.path.join(dest, os.path.basename(file_path_with_dp_ext)) 125 | copy(file_path, new_path) 126 | 127 | 128 | def _get_project_name_and_version() -> Tuple[str, str]: 129 | with open(pathlib.Path.cwd().joinpath("dbt_project.yml"), "r") as f: 130 | dbt_project_config = yaml.safe_load(f) 131 | return dbt_project_config["name"], dbt_project_config["version"] 132 | 133 | 134 | def _commit_and_push_changes(repo: Repo, project_name: str, project_version: str) -> None: 135 | echo_info("Publishing BI codes to Looker repository") 136 | repo.git.add(all=True) 137 | repo.index.commit(f"Publication from project {project_name}, version: {project_version}") 138 | origin = repo.remote(name="origin") 139 | origin.push() 140 | 141 | 142 | def _deploy_looker_project_to_production( 143 | looker_instance_url: str, project_id: str, branch: str, webhook_secret: str 144 | ) -> None: 145 | echo_info("Deploying Looker project to production") 146 | headers = {"X-Looker-Deploy-Secret": webhook_secret} 147 | requests.post( 148 | url=f"{looker_instance_url}/webhooks/projects/{project_id}/deploy/branch/{branch}", 149 | headers=headers, 150 | ) 151 | -------------------------------------------------------------------------------- /data_pipelines_cli/vcs_utils.py: -------------------------------------------------------------------------------- 1 | """Utilities related to VCS.""" 2 | 3 | from copier.vcs import GIT_PREFIX 4 | 5 | 6 | def add_suffix_to_git_template_path(template_path: str) -> str: 7 | """Add ``.git`` suffix to *template_path*, if necessary. 8 | 9 | Check if *template_path* starts with Git-specific prefix (e.g. `git://`), 10 | or `http://` or `https://` protocol. If so, then add ``.git`` suffix if 11 | not present. Does nothing otherwise (as *template_path* probably points to 12 | a local directory). 
13 | 14 | :param template_path: Path or URI to Git-based repository 15 | :type template_path: str 16 | :return: *template_path* with ``.git`` as suffix, if necessary 17 | :rtype: str 18 | """ 19 | if not template_path.startswith(GIT_PREFIX) and not template_path.startswith( 20 | ("http://", "https://") 21 | ): 22 | return template_path 23 | return template_path + ("" if template_path.endswith(".git") else ".git") 24 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= -Ea 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | apidoc: 16 | @sphinx-apidoc ../data_pipelines_cli -o source -fMT 17 | 18 | .PHONY: help apidoc Makefile 19 | 20 | # Catch-all target: route all unknown targets to Sphinx using the new 21 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 22 | %: Makefile 23 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 24 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | API Reference 2 | ============= 3 | 4 | If you are looking for information on a specific function, class, or 5 | method, this part of the documentation is for you. 6 | 7 | .. toctree:: 8 | :maxdepth: 1 9 | 10 | source/data_pipelines_cli 11 | -------------------------------------------------------------------------------- /docs/changelog.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../CHANGELOG.md 2 | :parser: myst_parser.docutils_ -------------------------------------------------------------------------------- /docs/cli.rst: -------------------------------------------------------------------------------- 1 | CLI Commands Reference 2 | ====================== 3 | 4 | If you are looking for extensive information on a specific CLI command, 5 | this part of the documentation is for you. 6 | 7 | .. click:: data_pipelines_cli.cli:_cli 8 | :prog: dp 9 | :nested: full 10 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. 
For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | import os 10 | import sys 11 | 12 | sys.path.insert(0, os.path.abspath(".")) 13 | sys.path.insert(0, os.path.abspath("../")) 14 | sys.path.insert(0, os.path.abspath("../data_pipelines_cli")) 15 | 16 | 17 | # -- Project information ----------------------------------------------------- 18 | 19 | project = "data-pipelines-cli" 20 | copyright = "2021, GetInData" 21 | author = "GetInData" 22 | 23 | 24 | # -- General configuration --------------------------------------------------- 25 | 26 | # Add any Sphinx extension module names here, as strings. They can be 27 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 28 | # ones. 29 | extensions = [ 30 | "sphinx.ext.autodoc", 31 | "sphinx.ext.viewcode", 32 | "sphinx_click", 33 | "myst_parser", 34 | ] 35 | 36 | # Add any paths that contain templates here, relative to this directory. 37 | templates_path = ["_templates"] 38 | 39 | # List of patterns, relative to source directory, that match files and 40 | # directories to ignore when looking for source files. 41 | # This pattern also affects html_static_path and html_extra_path. 42 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 43 | 44 | add_module_names = False 45 | 46 | 47 | # -- Options for HTML output ------------------------------------------------- 48 | 49 | # The theme to use for HTML and HTML Help pages. See the documentation for 50 | # a list of builtin themes. 51 | # 52 | html_theme = "sphinx_rtd_theme" 53 | 54 | # Add any paths that contain custom static files (such as style sheets) here, 55 | # relative to this directory. They are copied after the builtin static files, 56 | # so a file named "default.css" will overwrite the builtin "default.css". 
57 | html_static_path = ["_static"] 58 | -------------------------------------------------------------------------------- /docs/images/created.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getindata/data-pipelines-cli/f3813edf8757cb98be5c2f1e073cd628df770b25/docs/images/created.png -------------------------------------------------------------------------------- /docs/images/creating.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getindata/data-pipelines-cli/f3813edf8757cb98be5c2f1e073cd628df770b25/docs/images/creating.png -------------------------------------------------------------------------------- /docs/images/init.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getindata/data-pipelines-cli/f3813edf8757cb98be5c2f1e073cd628df770b25/docs/images/init.png -------------------------------------------------------------------------------- /docs/images/integration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getindata/data-pipelines-cli/f3813edf8757cb98be5c2f1e073cd628df770b25/docs/images/integration.png -------------------------------------------------------------------------------- /docs/images/railsroad.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getindata/data-pipelines-cli/f3813edf8757cb98be5c2f1e073cd628df770b25/docs/images/railsroad.png -------------------------------------------------------------------------------- /docs/images/run.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getindata/data-pipelines-cli/f3813edf8757cb98be5c2f1e073cd628df770b25/docs/images/run.png -------------------------------------------------------------------------------- /docs/images/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getindata/data-pipelines-cli/f3813edf8757cb98be5c2f1e073cd628df770b25/docs/images/test.png -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ``Data Pipelines CLI``: CLI for data platform 2 | ============================================== 3 | 4 | .. image:: https://img.shields.io/badge/python-3.9%20%7C%203.10-blue.svg 5 | :target: https://github.com/getindata/data-pipelines-cli 6 | :alt: Python Version 7 | 8 | .. image:: https://badge.fury.io/py/data-pipelines-cli.svg 9 | :target: https://pypi.org/project/data-pipelines-cli/ 10 | :alt: PyPI Version 11 | 12 | .. image:: https://pepy.tech/badge/data-pipelines-cli 13 | :target: https://pepy.tech/project/data-pipelines-cli 14 | :alt: Downloads 15 | 16 | .. image:: https://api.codeclimate.com/v1/badges/e44ed9383a42b59984f6/maintainability 17 | :target: https://codeclimate.com/github/getindata/data-pipelines-cli/maintainability 18 | :alt: Maintainability 19 | 20 | .. 
image:: https://api.codeclimate.com/v1/badges/e44ed9383a42b59984f6/test_coverage 21 | :target: https://codeclimate.com/github/getindata/data-pipelines-cli/test_coverage 22 | :alt: Test Coverage 23 | 24 | Introduction 25 | ------------ 26 | **Data Pipelines CLI**, also called **DP tool**, is a command-line tool providing an easy way to build and manage data pipelines based on **dbt** 27 | in an environment with GIT, Airflow, DataHub, VSCode, etc. 28 | 29 | The tool can be used in any environment with access to a shell and ``Python`` installed. 30 | 31 | **data-pipelines-cli**'s main task is to cover technical complexities and provide an abstraction over all components 32 | that take part in Data Pipelines creation and execution. Thanks to its integration with a templating engine, it allows Analytics 33 | Engineers to create and configure new projects. The tool also simplifies automation as it handles deployments and 34 | publications of created transformations. 35 | 36 | Community 37 | ------------ 38 | Although the tool was created by `GetInData `_ and is used in their projects, it is open-sourced and everyone is welcome 39 | to use it and contribute to making it better and more powerful. 40 | 41 | .. toctree:: 42 | :maxdepth: 1 43 | :caption: Contents: 44 | 45 | installation 46 | setup_environment 47 | usage 48 | configuration 49 | integration 50 | cli 51 | api 52 | changelog 53 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ------------ 3 | Use the package manager `pip `_ to 4 | install `data-pipelines-cli `_: 5 | 6 | 7 | .. code-block:: bash 8 | 9 | pip install data-pipelines-cli[] 10 | 11 | Depending on the systems that you want to integrate with, you need to provide different flags in square brackets. You can provide a comma-separated list of flags, for example: 12 | 13 | .. code-block:: bash 14 | 15 | pip install data-pipelines-cli[gcs,git,bigquery] 16 | 17 | 18 | Depending on the data storage you have, you can use: 19 | 20 | * bigquery 21 | * snowflake 22 | * redshift 23 | * postgres 24 | 25 | If you need Git integration for loading packages published by other projects or publishing them yourself, you will need: 26 | 27 | * git 28 | 29 | If you want to deploy created artifacts (Docker images and DataHub metadata), add the following flags: 30 | 31 | * docker 32 | * datahub 33 | 34 | These are not usually needed by an individual user. 35 | 36 | If you need Business Intelligence integration, you can use the following options: 37 | 38 | * looker 39 | -------------------------------------------------------------------------------- /docs/integration.rst: -------------------------------------------------------------------------------- 1 | Integration with environment 2 | ============================ 3 | 4 | **Data Pipelines CLI** provides an abstraction over multiple other components that take part in Data Pipeline 5 | processes. The following picture presents the whole environment handled by our tool. 6 | 7 | .. image:: images/integration.png 8 | :width: 700 9 | 10 | dbt 11 | ++++++++++++++++++++++++++++++++++++++++++++++ 12 | 13 | `dbt `_ is currently the main tool that **DP** integrates with. The purpose of the **DP** tool is to cover **dbt** technicalities, 14 | including configuration, and to generate it on the fly whenever needed. At the same time, it gives more control over **dbt** 15 | process management by chaining commands, interpolating configuration, and providing easy portability between environments.
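A typical day-to-day interaction with **dbt** through **DP** might look like the sketch below. The commands are real ``dp`` commands, but the ``dev`` environment name is only an illustrative assumption and has to match an environment configured in your project.

.. code-block:: bash

   # generate the dbt configuration for the chosen environment and run the models
   dp run --env dev

   # execute dbt tests against the same environment
   dp test --env dev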
16 | 17 | Copier 18 | ++++++++++++++++++++++++++++++++++++++++++++++ 19 | 20 | **DP** relies heavily on `Copier `_ as a templating tool. It makes it possible to easily create new projects that are configured 21 | automatically after a series of questions. It is also used to configure the working environment with the required environment 22 | variables. 23 | 24 | Docker 25 | ++++++++++++++++++++++++++++++++++++++++++++++ 26 | 27 | Among the artifacts produced while building and publishing Data Pipelines are `Docker `_ images. Each 28 | created image contains **dbt** with its transformations and scripts to run. Created images are environment agnostic and 29 | can be deployed in any external configuration. Images are pushed to the selected Container Registry, whose configuration 30 | should be taken from the environment (a Docker client should be configured). 31 | 32 | Git 33 | ++++++++++++++++++++++++++++++++++++++++++++++ 34 | 35 | The `Data Pipelines CLI` can also publish created **dbt** packages for downstream usage into a configured 36 | `GIT `_ repository. It uses key-based authentication, where the key is provided with the `--key-path` parameter. 37 | 38 | Airflow 39 | ++++++++++++++++++++++++++++++++++++++++++++++ 40 | 41 | **DP** doesn't communicate directly with Airflow; rather, it sends artifacts to the Object storage managed by Airflow, and the 42 | `dbt-airflow-factory `_ library handles the rest. Created 43 | projects keep the DAG and the configuration required for execution on the Airflow side. 44 | 45 | Object storage 46 | ++++++++++++++++++++++++++++++++++++++++++++++ 47 | 48 | The configuration, the Airflow DAG, and the **dbt** manifest.json file are stored in Object storage for Airflow to pick up and execute. 49 | **DP** uses `fsspec `_, which gives a good abstraction over different 50 | object storage providers. Currently, the tool has been tested with GCS and S3. 51 | 52 | DataHub 53 | ++++++++++++++++++++++++++++++++++++++++++++++ 54 | 55 | The `Data Pipelines CLI` is able to send data to `DataHub `_ based on a recipe in the configuration. 56 | The tool uses the DataHub CLI under the hood. 57 | 58 | Visual Studio Code 59 | ++++++++++++++++++++++++++++++++++++++++++++++ 60 | 61 | `VS Code `_ is one of the tools we recommend for working with **dbt**. The **DP** tool simplifies 62 | integration of the created project with the `VS Code` plugin for **dbt** management. 63 | 64 | Airbyte 65 | ++++++++++++++++++++++++++++++++++++++++++++++ 66 | 67 | `Data Pipelines CLI` can manage Airbyte connections and execute their syncs in Airflow tasks preceding the dbt build. 68 | 69 | Looker 70 | ++++++++++++++++++++++++++++++++++++++++++++++ 71 | 72 | **dp** can generate LookML code for your models and views, and publish and deploy your `Looker `_ project. 73 | 74 | -------------------------------------------------------------------------------- /docs/setup_environment.rst: -------------------------------------------------------------------------------- 1 | Set up an environment 2 | ===================== 3 | 4 | This section is for Data Engineers who will be preparing and administering the whole environment. 5 | It describes the steps needed to prepare the DP tool to be used in an organization to its full potential. 6 | 7 | Create a Data Pipeline project template 8 | ---------------------------------------- 9 | 10 | The first thing that you need to do is to create a Git repository with a project template used later to create multiple 11 | projects. The template should contain the whole directory structure and files used in your projects. 12 | Additionally, it should have a connection configuration for all components in your environment, CI/CD, and all other 13 | aspects specific to your company. Here you can find template examples that you can adjust to your needs: 14 | https://github.com/getindata/data-pipelines-template-example . Based on the template, the Data Pipelines CLI will ask a user 15 | a series of questions to build the final project. 16 | 17 | Thanks to ``copier``, you can leverage Jinja template syntax to create easily modifiable configuration templates. 18 | Just create a ``copier.yml`` and configure the template questions (read more at 19 | `copier documentation `_).
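For instance, a minimal ``copier.yml`` asking for a single value could look like the sketch below; the ``username`` question is only an illustrative assumption and should be replaced with the questions your template actually needs.

.. code-block:: yaml

   username:
     type: str
     help: User name used to parametrize the generated project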
20 | 21 | Create a template to set up a local environment 22 | ----------------------------------------------- 23 | 24 | Working with Data Pipelines usually requires local variables to be set so you can run and test without interfering with shared environments (DEV, STAGE, PROD). To simplify working environment preparation, we also 25 | decided to use templates that ask a series of questions and generate a local configuration in the home directory. 26 | 27 | It requires a repository with a global configuration template file that you or your organization will be using. 28 | The repository should contain a ``dp.yml.tmpl`` file looking similar to this: 29 | 30 | .. code-block:: yaml 31 | 32 | _templates_suffix: ".tmpl" 33 | _envops: 34 | autoescape: false 35 | block_end_string: "%]" 36 | block_start_string: "[%" 37 | comment_end_string: "#]" 38 | comment_start_string: "[#" 39 | keep_trailing_newline: true 40 | variable_end_string: "]]" 41 | variable_start_string: "[[" 42 | 43 | templates: 44 | my-first-template: 45 | template_name: my-first-template 46 | template_path: https://github.com//.git 47 | vars: 48 | username: [[ YOUR_USERNAME ]] 49 | 50 | The file must contain a list of available templates. The templates will be displayed and available for selection in 51 | the Data Pipelines CLI. The next section contains variables that will be passed to the project whenever it runs in the configured environment. The 52 | same rules apply to template creation as for project templates. 53 |
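Once such a repository exists, a user can point the **DP** tool at it to generate their local configuration. The URL below is only a hypothetical placeholder for your organization's configuration repository and assumes it is reachable from the user's machine:

.. code-block:: bash

   dp init https://github.com/your-company/dp-config-template.git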
-------------------------------------------------------------------------------- /docs/source/data_pipelines_cli.cli_commands.generate.rst: -------------------------------------------------------------------------------- 1 | data\_pipelines\_cli.cli\_commands.generate package 2 | =================================================== 3 | 4 | .. automodule:: data_pipelines_cli.cli_commands.generate 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | data\_pipelines\_cli.cli\_commands.generate.generate module 13 | ----------------------------------------------------------- 14 | 15 | .. automodule:: data_pipelines_cli.cli_commands.generate.generate 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | data\_pipelines\_cli.cli\_commands.generate.model\_yaml module 21 | -------------------------------------------------------------- 22 | 23 | .. automodule:: data_pipelines_cli.cli_commands.generate.model_yaml 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | data\_pipelines\_cli.cli\_commands.generate.source\_sql module 29 | -------------------------------------------------------------- 30 | 31 | .. automodule:: data_pipelines_cli.cli_commands.generate.source_sql 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | 36 | data\_pipelines\_cli.cli\_commands.generate.source\_yaml module 37 | --------------------------------------------------------------- 38 | 39 | .. automodule:: data_pipelines_cli.cli_commands.generate.source_yaml 40 | :members: 41 | :undoc-members: 42 | :show-inheritance: 43 | 44 | data\_pipelines\_cli.cli\_commands.generate.databricks\_job module 45 | ------------------------------------------------------------------ 46 | 47 | .. automodule:: data_pipelines_cli.cli_commands.generate.databricks_job 48 | :members: 49 | :undoc-members: 50 | :show-inheritance: 51 | 52 | data\_pipelines\_cli.cli\_commands.generate.utils module 53 | -------------------------------------------------------- 54 | 55 | .. automodule:: data_pipelines_cli.cli_commands.generate.utils 56 | :members: 57 | :undoc-members: 58 | :show-inheritance: 59 | -------------------------------------------------------------------------------- /docs/source/data_pipelines_cli.cli_commands.rst: -------------------------------------------------------------------------------- 1 | data\_pipelines\_cli.cli\_commands package 2 | ========================================== 3 | 4 | .. automodule:: data_pipelines_cli.cli_commands 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Subpackages 10 | ----------- 11 | 12 | .. toctree:: 13 | 14 | data_pipelines_cli.cli_commands.generate 15 | 16 | Submodules 17 | ---------- 18 | 19 | data\_pipelines\_cli.cli\_commands.clean module 20 | ----------------------------------------------- 21 | 22 | .. automodule:: data_pipelines_cli.cli_commands.clean 23 | :members: 24 | :undoc-members: 25 | :show-inheritance: 26 | 27 | data\_pipelines\_cli.cli\_commands.compile module 28 | ------------------------------------------------- 29 | 30 | .. automodule:: data_pipelines_cli.cli_commands.compile 31 | :members: 32 | :undoc-members: 33 | :show-inheritance: 34 | 35 | data\_pipelines\_cli.cli\_commands.create module 36 | ------------------------------------------------ 37 | 38 | .. automodule:: data_pipelines_cli.cli_commands.create 39 | :members: 40 | :undoc-members: 41 | :show-inheritance: 42 | 43 | data\_pipelines\_cli.cli\_commands.deploy module 44 | ------------------------------------------------ 45 | 46 | .. automodule:: data_pipelines_cli.cli_commands.deploy 47 | :members: 48 | :undoc-members: 49 | :show-inheritance: 50 | 51 | data\_pipelines\_cli.cli\_commands.docs module 52 | ---------------------------------------------- 53 | 54 | .. automodule:: data_pipelines_cli.cli_commands.docs 55 | :members: 56 | :undoc-members: 57 | :show-inheritance: 58 | 59 | data\_pipelines\_cli.cli\_commands.init module 60 | ---------------------------------------------- 61 | 62 | .. automodule:: data_pipelines_cli.cli_commands.init 63 | :members: 64 | :undoc-members: 65 | :show-inheritance: 66 | 67 | data\_pipelines\_cli.cli\_commands.prepare\_env module 68 | ------------------------------------------------------ 69 | 70 | ..
automodule:: data_pipelines_cli.cli_commands.prepare_env 71 | :members: 72 | :undoc-members: 73 | :show-inheritance: 74 | 75 | data\_pipelines\_cli.cli\_commands.publish module 76 | ------------------------------------------------- 77 | 78 | .. automodule:: data_pipelines_cli.cli_commands.publish 79 | :members: 80 | :undoc-members: 81 | :show-inheritance: 82 | 83 | data\_pipelines\_cli.cli\_commands.run module 84 | --------------------------------------------- 85 | 86 | .. automodule:: data_pipelines_cli.cli_commands.run 87 | :members: 88 | :undoc-members: 89 | :show-inheritance: 90 | 91 | data\_pipelines\_cli.cli\_commands.seed module 92 | ---------------------------------------------- 93 | 94 | .. automodule:: data_pipelines_cli.cli_commands.seed 95 | :members: 96 | :undoc-members: 97 | :show-inheritance: 98 | 99 | data\_pipelines\_cli.cli\_commands.template module 100 | -------------------------------------------------- 101 | 102 | .. automodule:: data_pipelines_cli.cli_commands.template 103 | :members: 104 | :undoc-members: 105 | :show-inheritance: 106 | 107 | data\_pipelines\_cli.cli\_commands.test module 108 | ---------------------------------------------- 109 | 110 | .. automodule:: data_pipelines_cli.cli_commands.test 111 | :members: 112 | :undoc-members: 113 | :show-inheritance: 114 | 115 | data\_pipelines\_cli.cli\_commands.update module 116 | ------------------------------------------------ 117 | 118 | .. automodule:: data_pipelines_cli.cli_commands.update 119 | :members: 120 | :undoc-members: 121 | :show-inheritance: 122 | 123 | -------------------------------------------------------------------------------- /docs/source/data_pipelines_cli.rst: -------------------------------------------------------------------------------- 1 | data\_pipelines\_cli package 2 | ============================ 3 | 4 | .. automodule:: data_pipelines_cli 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Subpackages 10 | ----------- 11 | 12 | .. toctree:: 13 | 14 | data_pipelines_cli.cli_commands 15 | 16 | Submodules 17 | ---------- 18 | 19 | data\_pipelines\_cli.airbyte\_utils module 20 | ------------------------------------------ 21 | 22 | .. automodule:: data_pipelines_cli.airbyte_utils 23 | :members: 24 | :undoc-members: 25 | :show-inheritance: 26 | 27 | data\_pipelines\_cli.bi\_utils module 28 | ------------------------------------- 29 | 30 | .. automodule:: data_pipelines_cli.bi_utils 31 | :members: 32 | :undoc-members: 33 | :show-inheritance: 34 | 35 | data\_pipelines\_cli.cli module 36 | ------------------------------- 37 | 38 | .. automodule:: data_pipelines_cli.cli 39 | :members: 40 | :undoc-members: 41 | :show-inheritance: 42 | 43 | data\_pipelines\_cli.cli\_configs module 44 | ---------------------------------------- 45 | 46 | .. automodule:: data_pipelines_cli.cli_configs 47 | :members: 48 | :undoc-members: 49 | :show-inheritance: 50 | 51 | data\_pipelines\_cli.cli\_constants module 52 | ------------------------------------------ 53 | 54 | .. automodule:: data_pipelines_cli.cli_constants 55 | :members: 56 | :undoc-members: 57 | :show-inheritance: 58 | 59 | data\_pipelines\_cli.cli\_utils module 60 | -------------------------------------- 61 | 62 | .. automodule:: data_pipelines_cli.cli_utils 63 | :members: 64 | :undoc-members: 65 | :show-inheritance: 66 | 67 | data\_pipelines\_cli.config\_generation module 68 | ---------------------------------------------- 69 | 70 | .. 
automodule:: data_pipelines_cli.config_generation 71 | :members: 72 | :undoc-members: 73 | :show-inheritance: 74 | 75 | data\_pipelines\_cli.data\_structures module 76 | -------------------------------------------- 77 | 78 | .. automodule:: data_pipelines_cli.data_structures 79 | :members: 80 | :undoc-members: 81 | :show-inheritance: 82 | 83 | data\_pipelines\_cli.dbt\_utils module 84 | -------------------------------------- 85 | 86 | .. automodule:: data_pipelines_cli.dbt_utils 87 | :members: 88 | :undoc-members: 89 | :show-inheritance: 90 | 91 | data\_pipelines\_cli.docker\_response\_reader module 92 | ---------------------------------------------------- 93 | 94 | .. automodule:: data_pipelines_cli.docker_response_reader 95 | :members: 96 | :undoc-members: 97 | :show-inheritance: 98 | 99 | data\_pipelines\_cli.errors module 100 | ---------------------------------- 101 | 102 | .. automodule:: data_pipelines_cli.errors 103 | :members: 104 | :undoc-members: 105 | :show-inheritance: 106 | 107 | data\_pipelines\_cli.filesystem\_utils module 108 | --------------------------------------------- 109 | 110 | .. automodule:: data_pipelines_cli.filesystem_utils 111 | :members: 112 | :undoc-members: 113 | :show-inheritance: 114 | 115 | data\_pipelines\_cli.io\_utils module 116 | ------------------------------------- 117 | 118 | .. automodule:: data_pipelines_cli.io_utils 119 | :members: 120 | :undoc-members: 121 | :show-inheritance: 122 | 123 | data\_pipelines\_cli.jinja module 124 | --------------------------------- 125 | 126 | .. automodule:: data_pipelines_cli.jinja 127 | :members: 128 | :undoc-members: 129 | :show-inheritance: 130 | 131 | data\_pipelines\_cli.looker\_utils module 132 | ----------------------------------------- 133 | 134 | .. automodule:: data_pipelines_cli.looker_utils 135 | :members: 136 | :undoc-members: 137 | :show-inheritance: 138 | 139 | data\_pipelines\_cli.vcs\_utils module 140 | -------------------------------------- 141 | 142 | .. 
automodule:: data_pipelines_cli.vcs_utils 143 | :members: 144 | :undoc-members: 145 | :show-inheritance: 146 | 147 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 100 -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pre-commit==2.15.0 2 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.30.0 3 | 4 | [bumpversion:file:setup.py] 5 | 6 | [bumpversion:file:data_pipelines_cli/__init__.py] 7 | 8 | [flake8] 9 | exclude = .git,__pycache__,build,dist,docs/source/conf.py 10 | max-line-length = 100 11 | extend-ignore = E203 12 | 13 | [mypy] 14 | ignore_missing_imports = True 15 | follow_imports = silent 16 | strict_optional = True 17 | no_implicit_optional = True 18 | warn_redundant_casts = True 19 | warn_unused_ignores = True 20 | disallow_any_generics = True 21 | check_untyped_defs = True 22 | no_implicit_reexport = True 23 | disallow_untyped_defs = True 24 | 25 | [mypy-tests.*] 26 | ignore_errors = True 27 | 28 | [mypy-requests.*] 29 | ignore_missing_imports = True 30 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """data_pipelines_cli module.""" 2 | 3 | from setuptools import find_packages, setup 4 | 5 | with open("README.md") as f: 6 | README = f.read() 7 | 8 | INSTALL_REQUIREMENTS = [ 9 | "MarkupSafe==2.1.1", 10 | "Werkzeug==2.2.3", 11 | "click==8.1.3", 12 | "pyyaml==6.0.1", 13 | "types-PyYAML==6.0.12.2", 14 | "copier==7.0.1", 15 | "Jinja2==3.1.2", 16 | "fsspec==2023.12.1", 17 | "packaging==21.3", 18 | "colorama==0.4.5", 19 | "dbt-core==1.7.3", 20 | "pydantic<2", 21 | ] 22 | 23 | EXTRA_FILESYSTEMS_REQUIRE = { 24 | "gcs": ["gcsfs==2023.12.1"], 25 | "s3": ["s3fs==2023.12.1"], 26 | } 27 | 28 | EXTRA_REQUIRE = { 29 | # DBT adapters 30 | "bigquery": ["dbt-bigquery==1.7.2"], 31 | "postgres": ["dbt-postgres==1.7.3"], 32 | "snowflake": ["dbt-snowflake==1.7.1"], 33 | "redshift": ["dbt-redshift==1.7.1"], 34 | "glue": ["dbt-glue==1.7.0", "dbt-spark[session]==1.7.1"], 35 | "databricks": ["dbt-databricks-factory>=0.1.1"], 36 | "dbt-all": [ 37 | "dbt-bigquery==1.7.2", 38 | "dbt-postgres==1.7.3", 39 | "dbt-snowflake==1.7.1", 40 | "dbt-redshift==1.7.1", 41 | "dbt-glue==1.7.0", 42 | ], 43 | # --- 44 | "docker": ["docker==6.0.1"], 45 | "datahub": ["acryl-datahub[dbt]==0.12.0.5"], 46 | "git": ["GitPython==3.1.29"], 47 | "looker": ["dbt2looker==0.11.0"], 48 | "tests": [ 49 | "pytest==7.2.0", 50 | "pytest-cov==4.0.0", 51 | "pre-commit==2.20.0", 52 | "tox==3.27.1", 53 | "tox-gh-actions==2.12.0", 54 | "moto[s3]==4.0.11", 55 | "gcp-storage-emulator==2022.6.11", 56 | "GitPython==3.1.29", 57 | "types-requests==2.28.11.5", 58 | "gcsfs==2023.12.1", 59 | "s3fs==2023.12.1", 60 | ], 61 | "docs": [ 62 | "sphinx==5.3.0", 63 | "sphinx-rtd-theme==1.1.1", 64 | "sphinx-click==4.4.0", 65 | "myst-parser==0.18.1", 66 | "GitPython==3.1.29", 67 | "colorama==0.4.5", 68 | "pytz==2023.3", 69 | ], 70 | **EXTRA_FILESYSTEMS_REQUIRE, 71 | } 72 | 73 | setup( 74 | name="data_pipelines_cli", 75 | version="0.30.0", 76 | description="CLI for data 
platform", 77 | long_description=README, 78 | long_description_content_type="text/markdown", 79 | license="Apache Software License (Apache 2.0)", 80 | license_files=("LICENSE",), 81 | python_requires=">=3.9", 82 | classifiers=[ 83 | "Development Status :: 1 - Planning", 84 | "Programming Language :: Python :: 3.9", 85 | "Programming Language :: Python :: 3.10", 86 | ], 87 | keywords="dbt airflow cli", 88 | author="Andrzej Swatowski", 89 | author_email="andrzej.swatowski@getindata.com", 90 | url="https://github.com/getindata/data-pipelines-cli/", 91 | packages=find_packages(exclude=["docs", "tests"]), 92 | include_package_data=True, 93 | install_requires=INSTALL_REQUIREMENTS, 94 | extras_require=EXTRA_REQUIRE, 95 | entry_points={"console_scripts": ["dp=data_pipelines_cli.cli:cli"]}, 96 | ) 97 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getindata/data-pipelines-cli/f3813edf8757cb98be5c2f1e073cd628df770b25/tests/__init__.py -------------------------------------------------------------------------------- /tests/cli_commands/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/getindata/data-pipelines-cli/f3813edf8757cb98be5c2f1e073cd628df770b25/tests/cli_commands/__init__.py -------------------------------------------------------------------------------- /tests/cli_commands/test_clean.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import tempfile 3 | import unittest 4 | from typing import List 5 | from unittest.mock import patch 6 | 7 | from click.testing import CliRunner 8 | 9 | from data_pipelines_cli.cli import _cli 10 | 11 | 12 | class CleanCommandTestCase(unittest.TestCase): 13 | def setUp(self) -> None: 14 | self.subprocess_run_args = [] 15 | 16 | def _mock_run(self, args: List[str]): 17 | self.subprocess_run_args = args 18 | 19 | def test_clean(self): 20 | with patch("data_pipelines_cli.cli_commands.clean.subprocess_run", self._mock_run): 21 | runner = CliRunner() 22 | result = runner.invoke(_cli, ["clean"]) 23 | self.assertEqual(0, result.exit_code, msg=result.exception) 24 | self.assertListEqual(["dbt", "clean"], self.subprocess_run_args) 25 | 26 | def test_clean_remove_dir(self): 27 | with patch( 28 | "data_pipelines_cli.cli_commands.clean.subprocess_run", self._mock_run 29 | ), tempfile.TemporaryDirectory() as tmp_dir, patch( 30 | "data_pipelines_cli.cli_commands.clean.BUILD_DIR", pathlib.Path(tmp_dir) 31 | ): 32 | assert pathlib.Path(tmp_dir).exists() 33 | runner = CliRunner() 34 | result = runner.invoke(_cli, ["clean"]) 35 | self.assertEqual(0, result.exit_code, msg=result.exception) 36 | self.assertListEqual(["dbt", "clean"], self.subprocess_run_args) 37 | assert not pathlib.Path(tmp_dir).exists() 38 | 39 | # Fix for Python 3.7. 40 | # Otherwise it throws, as there is no directory to remove. 
41 | pathlib.Path(tmp_dir).mkdir(parents=True, exist_ok=True) 42 | -------------------------------------------------------------------------------- /tests/cli_commands/test_create.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import tempfile 3 | import unittest 4 | from unittest.mock import MagicMock, patch 5 | 6 | import yaml 7 | from click.testing import CliRunner 8 | 9 | from data_pipelines_cli.cli import _cli 10 | from data_pipelines_cli.errors import DataPipelinesError, NoConfigFileError 11 | 12 | 13 | class CreateCommandTestCase(unittest.TestCase): 14 | copier_src_path = "source_path" 15 | copier_dst_path = "destination_path" 16 | vcs_ref = "0ffedb3edc9dc588e6f466cbcea953ad26fbc037" 17 | goldens_dir_path = pathlib.Path(__file__).parent.parent.joinpath("goldens") 18 | 19 | def _mock_copier(self, src_path: str, dst_path: str, vcs_ref: str): 20 | self.assertEqual(self.copier_src_path, src_path) 21 | self.assertEqual(self.copier_dst_path, dst_path) 22 | # self.assertEqual(self.vcs_ref, vcs_ref) 23 | 24 | def test_create_no_config(self): 25 | with tempfile.TemporaryDirectory() as tmp_dir, patch( 26 | "data_pipelines_cli.cli_constants.ENV_CONFIGURATION_PATH", 27 | pathlib.Path(tmp_dir).joinpath("non_existing_file"), 28 | ): 29 | runner = CliRunner(mix_stderr=False) 30 | result = runner.invoke(_cli, ["create", "some_path"]) 31 | self.assertEqual(1, result.exit_code) 32 | self.assertIsInstance(result.exception, NoConfigFileError) 33 | 34 | @patch( 35 | "data_pipelines_cli.cli_constants.ENV_CONFIGURATION_PATH", 36 | goldens_dir_path.joinpath("example_config.yml"), 37 | ) 38 | def test_create_with_template_path(self): 39 | with patch("copier.run_auto", self._mock_copier): 40 | runner = CliRunner(mix_stderr=False) 41 | result = runner.invoke(_cli, ["create", self.copier_dst_path, self.copier_src_path]) 42 | self.assertEqual(0, result.exit_code, msg=result.exception) 43 | 44 | @patch( 45 | "data_pipelines_cli.cli_constants.ENV_CONFIGURATION_PATH", 46 | goldens_dir_path.joinpath("example_config.yml"), 47 | ) 48 | def test_create_with_template_name(self): 49 | with patch("copier.run_auto", self._mock_copier): 50 | runner = CliRunner(mix_stderr=False) 51 | result = runner.invoke(_cli, ["create", self.copier_dst_path, "create_test"]) 52 | self.assertEqual(0, result.exit_code, msg=result.exception) 53 | 54 | @patch( 55 | "data_pipelines_cli.cli_constants.ENV_CONFIGURATION_PATH", 56 | goldens_dir_path.joinpath("example_config.yml"), 57 | ) 58 | @patch("questionary.select") 59 | def test_create_without_template_path(self, mock_select): 60 | magic_mock = MagicMock() 61 | magic_mock.configure_mock(**{"ask": lambda: "create_test"}) 62 | mock_select.return_value = magic_mock 63 | 64 | with patch("copier.run_auto", self._mock_copier): 65 | runner = CliRunner(mix_stderr=False) 66 | result = runner.invoke(_cli, ["create", self.copier_dst_path]) 67 | self.assertEqual(0, result.exit_code, msg=result.exception) 68 | 69 | def test_no_templates(self): 70 | with tempfile.NamedTemporaryFile() as tmp_file, patch( 71 | "data_pipelines_cli.cli_constants.ENV_CONFIGURATION_PATH", 72 | pathlib.Path(tmp_file.name), 73 | ): 74 | with open(tmp_file.name, "w") as f: 75 | yaml.dump({"vars": {}, "templates": {}}, f) 76 | runner = CliRunner(mix_stderr=False) 77 | result = runner.invoke(_cli, ["create", "some_path"]) 78 | self.assertEqual(1, result.exit_code) 79 | self.assertIsInstance(result.exception, DataPipelinesError) 80 | 
self.assertRegex(result.exception.message, r"^No template provided\..*") 81 | -------------------------------------------------------------------------------- /tests/cli_commands/test_init.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import tempfile 3 | import unittest 4 | from unittest.mock import MagicMock, patch 5 | 6 | import yaml 7 | from click.testing import CliRunner 8 | 9 | from data_pipelines_cli.cli import _cli 10 | from data_pipelines_cli.cli_commands.init import init 11 | from data_pipelines_cli.cli_constants import DEFAULT_GLOBAL_CONFIG 12 | from data_pipelines_cli.data_structures import ( 13 | DataPipelinesConfig, 14 | TemplateConfig, 15 | read_env_config, 16 | ) 17 | from data_pipelines_cli.errors import DataPipelinesError 18 | 19 | 20 | class InitCommandTestCase(unittest.TestCase): 21 | test_config_template_path = pathlib.Path(__file__).parent.parent.joinpath( 22 | "goldens", "config_template" 23 | ) 24 | example_config_dict = DataPipelinesConfig( 25 | templates={ 26 | "my-template": TemplateConfig( 27 | template_name="my-template", 28 | template_path="https://example.com/git/example.git", 29 | ), 30 | "local-template": TemplateConfig( 31 | template_name="local-template", 32 | template_path="/Users/test_user/Documents/project-template", 33 | ), 34 | }, 35 | vars={ 36 | "username": "test_user", 37 | }, 38 | ) 39 | 40 | @patch("data_pipelines_cli.cli_commands.init._download_global_config") 41 | def test_init(self, mock_download): 42 | runner = CliRunner() 43 | with tempfile.TemporaryDirectory() as tmp_dir, patch( 44 | "data_pipelines_cli.cli_commands.init.ENV_CONFIGURATION_PATH", 45 | pathlib.Path(tmp_dir).joinpath(".dp.yml"), 46 | ), patch( 47 | "data_pipelines_cli.cli_constants.ENV_CONFIGURATION_PATH", 48 | pathlib.Path(tmp_dir).joinpath(".dp.yml"), 49 | ): 50 | mock_download.return_value = self.example_config_dict 51 | result = runner.invoke(_cli, ["init", str(self.test_config_template_path)]) 52 | self.assertEqual(0, result.exit_code, msg=result.exception) 53 | self.assertEqual(self.example_config_dict, read_env_config()) 54 | 55 | def test_global_config(self): 56 | runner = CliRunner() 57 | with tempfile.TemporaryDirectory() as tmp_dir, patch( 58 | "data_pipelines_cli.cli_commands.init.ENV_CONFIGURATION_PATH", 59 | pathlib.Path(tmp_dir).joinpath(".dp.yml"), 60 | ), patch( 61 | "data_pipelines_cli.cli_constants.ENV_CONFIGURATION_PATH", 62 | pathlib.Path(tmp_dir).joinpath(".dp.yml"), 63 | ): 64 | result = runner.invoke(_cli, ["init"]) 65 | self.assertEqual(0, result.exit_code, msg=result.exception) 66 | self.assertEqual(DEFAULT_GLOBAL_CONFIG, read_env_config()) 67 | 68 | @patch("questionary.confirm") 69 | def test_overwrite_yes(self, mock_questionary): 70 | magic_mock = MagicMock() 71 | magic_mock.configure_mock(**{"ask": lambda: True}) 72 | mock_questionary.return_value = magic_mock 73 | 74 | with tempfile.TemporaryDirectory() as tmp_dir: 75 | tmp_dp_path = pathlib.Path(tmp_dir).joinpath(".dp.yml") 76 | with patch( 77 | "data_pipelines_cli.cli_commands.init.ENV_CONFIGURATION_PATH", 78 | tmp_dp_path, 79 | ), patch( 80 | "data_pipelines_cli.cli_constants.ENV_CONFIGURATION_PATH", 81 | tmp_dp_path, 82 | ): 83 | with open(tmp_dp_path, "w") as tmp_file: 84 | yaml.dump(self.example_config_dict, tmp_file) 85 | self.assertEqual(self.example_config_dict, read_env_config()) 86 | init(None) 87 | self.assertEqual(DEFAULT_GLOBAL_CONFIG, read_env_config()) 88 | 89 | @patch("questionary.confirm") 90 | def 
test_overwrite_no(self, mock_questionary): 91 | magic_mock = MagicMock() 92 | magic_mock.configure_mock(**{"ask": lambda: False}) 93 | mock_questionary.return_value = magic_mock 94 | 95 | with tempfile.TemporaryDirectory() as tmp_dir: 96 | tmp_dp_path = pathlib.Path(tmp_dir).joinpath(".dp.yml") 97 | with patch( 98 | "data_pipelines_cli.cli_commands.init.ENV_CONFIGURATION_PATH", 99 | tmp_dp_path, 100 | ), patch( 101 | "data_pipelines_cli.cli_constants.ENV_CONFIGURATION_PATH", 102 | tmp_dp_path, 103 | ): 104 | with open(tmp_dp_path, "w") as tmp_file: 105 | yaml.dump(self.example_config_dict, tmp_file) 106 | self.assertEqual(self.example_config_dict, read_env_config()) 107 | 108 | with self.assertRaises(DataPipelinesError): 109 | init(None) 110 | self.assertEqual(self.example_config_dict, read_env_config()) 111 | -------------------------------------------------------------------------------- /tests/cli_commands/test_prepare_env.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import tempfile 3 | import unittest 4 | from unittest.mock import patch 5 | 6 | import yaml 7 | from click.testing import CliRunner 8 | 9 | from data_pipelines_cli.cli import _cli 10 | from data_pipelines_cli.cli_commands.prepare_env import prepare_env 11 | from data_pipelines_cli.errors import JinjaVarKeyError 12 | 13 | 14 | class GenHomeProfilesCommandTestCase(unittest.TestCase): 15 | goldens_dir_path = pathlib.Path(__file__).parent.parent.joinpath("goldens") 16 | rendered_from_vars_profile = { 17 | "bigquery": { 18 | "target": "env_execution", 19 | "outputs": { 20 | "env_execution": { 21 | "method": "service-account", 22 | "project": "exampleproject", 23 | "dataset": "var21-dataset", 24 | "keyfile": "/tmp/a/b/c/d.json", 25 | "timeout_seconds": 150, 26 | "priority": "interactive", 27 | "location": "us-west1", 28 | "threads": 1337, 29 | "retries": 1, 30 | "type": "bigquery", 31 | } 32 | }, 33 | } 34 | } 35 | 36 | def setUp(self) -> None: 37 | self.maxDiff = None 38 | 39 | def test_no_var_profiles_generation(self): 40 | runner = CliRunner() 41 | 42 | with tempfile.TemporaryDirectory() as tmp_dir, patch( 43 | "data_pipelines_cli.cli_constants.BUILD_DIR", pathlib.Path(tmp_dir) 44 | ), patch("data_pipelines_cli.config_generation.BUILD_DIR", pathlib.Path(tmp_dir),), patch( 45 | "data_pipelines_cli.dbt_utils.BUILD_DIR", 46 | pathlib.Path(tmp_dir), 47 | ), patch( 48 | "pathlib.Path.cwd", lambda: self.goldens_dir_path 49 | ), tempfile.TemporaryDirectory() as tmp_dir2, patch( 50 | "pathlib.Path.home", lambda: pathlib.Path(tmp_dir2) 51 | ), patch( 52 | "data_pipelines_cli.dbt_utils.subprocess_run", lambda _args: None 53 | ): 54 | runner.invoke(_cli, ["prepare-env"]) 55 | with open( 56 | pathlib.Path(tmp_dir2).joinpath(".dbt", "profiles.yml"), "r" 57 | ) as generated, open( 58 | self.goldens_dir_path.joinpath("example_profiles", "local_snowflake.yml"), 59 | "r", 60 | ) as prepared: 61 | self.assertDictEqual(yaml.safe_load(prepared), yaml.safe_load(generated)) 62 | 63 | def test_vars_profiles_generation(self): 64 | with tempfile.TemporaryDirectory() as tmp_dir, patch( 65 | "data_pipelines_cli.cli_constants.BUILD_DIR", pathlib.Path(tmp_dir) 66 | ), patch("data_pipelines_cli.config_generation.BUILD_DIR", pathlib.Path(tmp_dir),), patch( 67 | "data_pipelines_cli.dbt_utils.BUILD_DIR", 68 | pathlib.Path(tmp_dir), 69 | ), patch.dict( 70 | "os.environ", BIGQUERY_KEYFILE="/tmp/a/b/c/d.json" 71 | ), patch( 72 | "pathlib.Path.cwd", lambda: self.goldens_dir_path 73 | ), 
tempfile.TemporaryDirectory() as tmp_dir2, patch( 74 | "pathlib.Path.home", lambda: pathlib.Path(tmp_dir2) 75 | ), patch( 76 | "data_pipelines_cli.dbt_utils.subprocess_run", lambda _args, **_kwargs: None 77 | ): 78 | prepare_env("staging") 79 | 80 | with open(pathlib.Path(tmp_dir2).joinpath(".dbt", "profiles.yml"), "r") as generated: 81 | self.assertDictEqual(self.rendered_from_vars_profile, yaml.safe_load(generated)) 82 | 83 | def test_raise_missing_variable(self): 84 | with tempfile.TemporaryDirectory() as tmp_dir, patch( 85 | "data_pipelines_cli.cli_constants.BUILD_DIR", pathlib.Path(tmp_dir) 86 | ), patch("data_pipelines_cli.config_generation.BUILD_DIR", pathlib.Path(tmp_dir),), patch( 87 | "data_pipelines_cli.cli_commands.prepare_env.read_dbt_vars_from_configs", 88 | lambda _env: {}, 89 | ), patch.dict( 90 | "os.environ", BIGQUERY_KEYFILE="/tmp/a/b/c/d.json" 91 | ), patch( 92 | "pathlib.Path.cwd", lambda: self.goldens_dir_path 93 | ), tempfile.TemporaryDirectory() as tmp_dir2, patch( 94 | "pathlib.Path.home", lambda: pathlib.Path(tmp_dir2) 95 | ), patch( 96 | "data_pipelines_cli.dbt_utils.subprocess_run", lambda _args: None 97 | ): 98 | with self.assertRaises(JinjaVarKeyError): 99 | prepare_env("staging") 100 | 101 | def test_raise_missing_environment_variable(self): 102 | with tempfile.TemporaryDirectory() as tmp_dir, patch( 103 | "data_pipelines_cli.cli_constants.BUILD_DIR", pathlib.Path(tmp_dir) 104 | ), patch("data_pipelines_cli.config_generation.BUILD_DIR", pathlib.Path(tmp_dir),), patch( 105 | "data_pipelines_cli.dbt_utils.BUILD_DIR", 106 | pathlib.Path(tmp_dir), 107 | ), patch.dict( 108 | "os.environ", {} 109 | ), patch( 110 | "pathlib.Path.cwd", lambda: self.goldens_dir_path 111 | ), tempfile.TemporaryDirectory() as tmp_dir2, patch( 112 | "pathlib.Path.home", lambda: pathlib.Path(tmp_dir2) 113 | ), patch( 114 | "data_pipelines_cli.dbt_utils.subprocess_run", lambda _args: None 115 | ): 116 | with self.assertRaises(JinjaVarKeyError): 117 | prepare_env("staging") 118 | -------------------------------------------------------------------------------- /tests/cli_commands/test_run_test.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import tempfile 3 | import unittest 4 | from typing import List 5 | from unittest.mock import patch 6 | 7 | from click.testing import CliRunner 8 | 9 | from data_pipelines_cli.cli import _cli 10 | 11 | 12 | class RunTestCommandTestCase(unittest.TestCase): 13 | commands_to_test = ["run", "test", "seed", "docs-serve"] 14 | goldens_dir_path = pathlib.Path(__file__).parent.parent.joinpath("goldens") 15 | 16 | def setUp(self) -> None: 17 | self.subprocess_run_args = [] 18 | 19 | def _mock_run(self, args: List[str], **_kwargs): 20 | self.subprocess_run_args = args 21 | 22 | @patch( 23 | "data_pipelines_cli.cli_constants.ENV_CONFIGURATION_PATH", 24 | goldens_dir_path.joinpath("example_config.yml"), 25 | ) 26 | def test_no_arg(self): 27 | for cmd in self.commands_to_test: 28 | with self.subTest(command=cmd), patch( 29 | "pathlib.Path.cwd", lambda: self.goldens_dir_path 30 | ), tempfile.TemporaryDirectory() as tmp_dir, patch( 31 | "data_pipelines_cli.config_generation.BUILD_DIR", pathlib.Path(tmp_dir) 32 | ), patch( 33 | "data_pipelines_cli.cli_commands.compile.BUILD_DIR", 34 | pathlib.Path(tmp_dir), 35 | ), patch( 36 | "data_pipelines_cli.dbt_utils.BUILD_DIR", pathlib.Path(tmp_dir) 37 | ), patch( 38 | "data_pipelines_cli.cli_constants.BUILD_DIR", pathlib.Path(tmp_dir) 39 | ), patch( 40 | 
"data_pipelines_cli.dbt_utils.subprocess_run", self._mock_run 41 | ), patch( 42 | "data_pipelines_cli.cli_commands.compile.bi" 43 | ): 44 | runner = CliRunner() 45 | result = runner.invoke(_cli, [cmd]) 46 | self.assertEqual(0, result.exit_code, msg=result.exception) 47 | 48 | self.assertEqual("dbt", self.subprocess_run_args[0]) 49 | split_cmd = cmd.split("-") 50 | self.assertEqual(split_cmd, self.subprocess_run_args[1 : 1 + len(split_cmd)]) 51 | args_str = " ".join(self.subprocess_run_args) 52 | self.assertIn("--profile snowflake", args_str) 53 | self.assertIn("--target local", args_str) 54 | 55 | @patch( 56 | "data_pipelines_cli.cli_constants.ENV_CONFIGURATION_PATH", 57 | goldens_dir_path.joinpath("example_config.yml"), 58 | ) 59 | def test_dev_arg(self): 60 | for cmd in self.commands_to_test: 61 | for env in ["dev", "staging"]: 62 | with self.subTest(command=cmd, environment=env), patch( 63 | "pathlib.Path.cwd", lambda: self.goldens_dir_path 64 | ), tempfile.TemporaryDirectory() as tmp_dir, patch( 65 | "data_pipelines_cli.config_generation.BUILD_DIR", 66 | pathlib.Path(tmp_dir), 67 | ), patch( 68 | "data_pipelines_cli.cli_commands.compile.BUILD_DIR", 69 | pathlib.Path(tmp_dir), 70 | ), patch( 71 | "data_pipelines_cli.dbt_utils.BUILD_DIR", pathlib.Path(tmp_dir) 72 | ), patch( 73 | "data_pipelines_cli.cli_constants.BUILD_DIR", pathlib.Path(tmp_dir) 74 | ), patch( 75 | "data_pipelines_cli.dbt_utils.subprocess_run", self._mock_run 76 | ), patch( 77 | "data_pipelines_cli.cli_commands.compile.bi" 78 | ): 79 | runner = CliRunner() 80 | result = runner.invoke(_cli, [cmd, "--env", env]) 81 | self.assertEqual(0, result.exit_code, msg=result.exception) 82 | 83 | self.assertEqual("dbt", self.subprocess_run_args[0]) 84 | split_cmd = cmd.split("-") 85 | self.assertEqual(split_cmd, self.subprocess_run_args[1 : 1 + len(split_cmd)]) 86 | args_str = " ".join(self.subprocess_run_args) 87 | self.assertIn("--profile bigquery", args_str) 88 | self.assertIn("--target env_execution", args_str) 89 | -------------------------------------------------------------------------------- /tests/cli_commands/test_template.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import unittest 3 | from unittest.mock import patch 4 | 5 | from click.testing import CliRunner 6 | 7 | from data_pipelines_cli.cli import _cli 8 | 9 | 10 | class TemplateCommandTestCase(unittest.TestCase): 11 | example_config_path = pathlib.Path(__file__).parent.parent.joinpath( 12 | "goldens", "example_config.yml" 13 | ) 14 | 15 | def test_list_templates(self): 16 | runner = CliRunner() 17 | with patch( 18 | "data_pipelines_cli.cli_constants.ENV_CONFIGURATION_PATH", 19 | self.example_config_path, 20 | ): 21 | result = runner.invoke(_cli, ["template-list"]) 22 | self.assertEqual(0, result.exit_code, msg=result.exception) 23 | self.assertEqual( 24 | "AVAILABLE TEMPLATES:\n\n" 25 | "template_name: template1\n" 26 | "template_path: https://example.com/xyz/abcd.git\n\n" 27 | "template_name: template2\n" 28 | "template_path: https://example.com/git/example.git\n\n" 29 | "template_name: create_test\n" 30 | "template_path: source_path\n\n", 31 | result.output, 32 | ) 33 | -------------------------------------------------------------------------------- /tests/cli_commands/test_update.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import patch 3 | 4 | from click.testing import CliRunner 5 | from copier.errors import 
UserMessageError 6 | 7 | from data_pipelines_cli.cli import _cli 8 | from data_pipelines_cli.cli_commands import update 9 | 10 | 11 | class UpdateCommandTestCase(unittest.TestCase): 12 | def setUp(self) -> None: 13 | self.dst_path = None 14 | self.vcs_ref = "" 15 | 16 | def _mock_copier(self, dst_path: str, vcs_ref: str): 17 | self.dst_path = dst_path 18 | self.vcs_ref = vcs_ref 19 | 20 | def test_update_with_dst_path(self): 21 | with patch("copier.run_auto", self._mock_copier): 22 | runner = CliRunner(mix_stderr=False) 23 | result = runner.invoke(_cli, ["update", "/some_path/", "/other_path/"]) 24 | self.assertEqual(0, result.exit_code, msg=result.exception) 25 | self.assertEqual("HEAD", self.vcs_ref) 26 | self.assertEqual("/some_path/", self.dst_path) 27 | 28 | def test_update_with_dst_path_and_vcs_ref(self): 29 | with patch("copier.run_auto", self._mock_copier): 30 | runner = CliRunner(mix_stderr=False) 31 | result = runner.invoke( 32 | _cli, 33 | [ 34 | "update", 35 | "/some_path/", 36 | "--vcs-ref", 37 | "2514ef4ca5929e0f5e7a2b9c702a4cd58a6d2ecf", 38 | ], 39 | ) 40 | self.assertEqual(0, result.exit_code, msg=result.exception) 41 | self.assertEqual("2514ef4ca5929e0f5e7a2b9c702a4cd58a6d2ecf", self.vcs_ref) 42 | self.assertEqual("/some_path/", self.dst_path) 43 | 44 | def test_update_no_copier_answers(self): 45 | with self.assertRaises(UserMessageError): 46 | update.update("not_existing_path", "HEAD") 47 | -------------------------------------------------------------------------------- /tests/goldens/config/airbyte/airbyte.yml: -------------------------------------------------------------------------------- 1 | airbyte_connection_id: airbyte_connection_id 2 | airbyte_url: https://airbyte.dataops-dev.getindata.dev 3 | workspace_id: 35ac8060-b4da-4742-b5ba-16ce29dcf526 4 | connections: 5 | POSTGRES_BQ_CONNECTION: 6 | destinationId: b3696ac3-93b2-4039-9021-e1f884b03a95 7 | name: POSTGRES_BQ_CONNECTION 8 | namespaceDefinition: customformat 9 | namespaceFormat: jaffle_shop 10 | operationIds: [] 11 | operations: 12 | - name: Normalization 13 | operatorConfiguration: 14 | normalization: 15 | option: basic 16 | operatorType: normalization 17 | workspaceId: 2e1fabea-1066-4094-82e5-8dd0e9d09dd3 18 | sourceId: 06a6f19f-b747-4672-a191-80b96f67c36e 19 | status: inactive 20 | syncCatalog: 21 | streams: 22 | - config: 23 | aliasName: raw_orders 24 | cursorField: [] 25 | destinationSyncMode: append 26 | primaryKey: [] 27 | selected: true 28 | syncMode: full_refresh 29 | stream: 30 | defaultCursorField: [] 31 | jsonSchema: 32 | properties: 33 | id: 34 | airbyte_type: integer 35 | type: number 36 | order_date: 37 | format: date 38 | type: string 39 | status: 40 | type: string 41 | user_id: 42 | airbyte_type: integer 43 | type: number 44 | type: object 45 | name: raw_orders 46 | namespace: public 47 | sourceDefinedPrimaryKey: [] 48 | supportedSyncModes: 49 | - full_refresh 50 | - config: 51 | aliasName: raw_payments 52 | cursorField: [] 53 | destinationSyncMode: append 54 | primaryKey: [] 55 | selected: true 56 | syncMode: full_refresh 57 | stream: 58 | defaultCursorField: [] 59 | jsonSchema: 60 | properties: 61 | amount: 62 | airbyte_type: integer 63 | type: number 64 | id: 65 | airbyte_type: integer 66 | type: number 67 | order_id: 68 | airbyte_type: integer 69 | type: number 70 | payment_method: 71 | type: string 72 | type: object 73 | name: raw_payments 74 | namespace: public 75 | sourceDefinedPrimaryKey: [] 76 | supportedSyncModes: 77 | - full_refresh 78 | - config: 79 | aliasName: raw_customers 80 | 
cursorField: [] 81 | destinationSyncMode: append 82 | primaryKey: [] 83 | selected: true 84 | syncMode: full_refresh 85 | stream: 86 | defaultCursorField: [] 87 | jsonSchema: 88 | properties: 89 | first_name: 90 | type: string 91 | id: 92 | airbyte_type: integer 93 | type: number 94 | last_name: 95 | type: string 96 | type: object 97 | name: raw_customers 98 | namespace: public 99 | sourceDefinedPrimaryKey: [] 100 | supportedSyncModes: 101 | - full_refresh 102 | destinations: {} 103 | sources: {} 104 | tasks: 105 | - api_version: v1 106 | asyncrounous: false 107 | connection_id: ${POSTGRES_BQ_CONNECTION} 108 | task_id: postgres_bq_connection_task 109 | timeout: 110.0 110 | wait_seconds: 3 111 | -------------------------------------------------------------------------------- /tests/goldens/config/base/airflow.yml: -------------------------------------------------------------------------------- 1 | default_args: 2 | owner: Test Team 3 | depends_on_past: False 4 | start_date: 2021-10-31T01:23:45 5 | email_on_failure: False 6 | email_on_retry: False 7 | retries: 0 8 | retry_delay: 5m 9 | 10 | dag: 11 | dag_id: experimental-dag 12 | description: 'Some DAG' 13 | schedule_interval: '0 12 * * *' 14 | catchup: False 15 | max_active_runs: 1 16 | concurrency: 2 17 | 18 | dags_path: gcs://test-sync-project/sync-dir/dags/my-project-name 19 | 20 | use_task_group: True 21 | manifest_file_name: manifest.json 22 | -------------------------------------------------------------------------------- /tests/goldens/config/base/bi.yml: -------------------------------------------------------------------------------- 1 | is_bi_enabled: True 2 | bi_target: looker 3 | is_bi_compile: True 4 | is_bi_deploy: True -------------------------------------------------------------------------------- /tests/goldens/config/base/bigquery.yml: -------------------------------------------------------------------------------- 1 | method: service-account 2 | project: exampleproject 3 | dataset: example-dataset 4 | threads: 1 5 | keyfile: /var/keyfile.json 6 | timeout_seconds: 150 7 | priority: interactive 8 | location: us-west1 9 | retries: 1 10 | -------------------------------------------------------------------------------- /tests/goldens/config/base/datahub.yml: -------------------------------------------------------------------------------- 1 | sink: 2 | config: 3 | server: "https://ingest.some-datahub-endpoint.co.uk:8080" 4 | -------------------------------------------------------------------------------- /tests/goldens/config/base/dbt.yml: -------------------------------------------------------------------------------- 1 | target: env_execution 2 | target_type: bigquery 3 | vars: 4 | variable_1: 123 5 | variable_2: "var2" 6 | -------------------------------------------------------------------------------- /tests/goldens/config/base/execution_env.yml: -------------------------------------------------------------------------------- 1 | image: 2 | repository: my_docker_repository_uri 3 | tag: 4 | 5 | type: k8s 6 | -------------------------------------------------------------------------------- /tests/goldens/config/base/k8s.yml: -------------------------------------------------------------------------------- 1 | variable1: 1337 2 | var2: "Hello, world!" 
3 | envs: 4 | SOME_BOOLEAN: true 5 | -------------------------------------------------------------------------------- /tests/goldens/config/base/looker.yml: -------------------------------------------------------------------------------- 1 | looker_repository: https://gitlab.com/getindata/dataops/some_looker_repo.git 2 | looker_repository_username: DataOps Team 3 | looker_repository_email: DataOps-Team@getindata.com 4 | looker_project_id: dbt-looker-test 5 | looker_webhook_secret: 9809k8909k345testTest 6 | looker_repository_branch: master 7 | looker_instance_url: https://test.looker.com -------------------------------------------------------------------------------- /tests/goldens/config/base/publish.yml: -------------------------------------------------------------------------------- 1 | repository: https://gitlab.com/getindata/dataops/some_repo.git 2 | branch: main 3 | username: "DataOps Team" 4 | email: dataops@getindata.com -------------------------------------------------------------------------------- /tests/goldens/config/datahub/dbt.yml: -------------------------------------------------------------------------------- 1 | vars: 2 | datahub_path: "http://example.com/datahub/RaNdOmTe$T__PAHT" 3 | -------------------------------------------------------------------------------- /tests/goldens/config/dev/ingestion.yml: -------------------------------------------------------------------------------- 1 | enable: True 2 | engine: airbyte -------------------------------------------------------------------------------- /tests/goldens/config/image_tag/execution_env.yml: -------------------------------------------------------------------------------- 1 | image: 2 | repository: my_docker_repository_uri 3 | tag: some_test_tag_a1s2d3f 4 | -------------------------------------------------------------------------------- /tests/goldens/config/local/bi.yml: -------------------------------------------------------------------------------- 1 | is_bi_enabled: True 2 | bi_target: looker 3 | is_bi_compile: True 4 | is_bi_deploy: True -------------------------------------------------------------------------------- /tests/goldens/config/local/dbt.yml: -------------------------------------------------------------------------------- 1 | target: local 2 | target_type: snowflake 3 | -------------------------------------------------------------------------------- /tests/goldens/config/local/snowflake.yml: -------------------------------------------------------------------------------- 1 | account: account_id 2 | 3 | user: EXAMPLE_USERNAME 4 | password: EXAMPLE_PASSWORD 5 | 6 | role: DBT_ROLE 7 | database: DBT_EXAMPLE_DB 8 | warehouse: EXAMPLE_WAREHOUSE 9 | schema: EXAMPLE_SCHEMA 10 | threads: 1 11 | client_session_keep_alive: False 12 | -------------------------------------------------------------------------------- /tests/goldens/config/prod/ingestion.yml: -------------------------------------------------------------------------------- 1 | enable: False 2 | -------------------------------------------------------------------------------- /tests/goldens/config/staging/airflow.yml: -------------------------------------------------------------------------------- 1 | dags_path: "gcs://test/jinja/path/com/my/project/name" 2 | -------------------------------------------------------------------------------- /tests/goldens/config/staging/bi.yml: -------------------------------------------------------------------------------- 1 | is_bi_enabled: True 2 | bi_target: looker 3 | is_bi_compile: True 4 | is_bi_deploy: True 
-------------------------------------------------------------------------------- /tests/goldens/config/staging/bigquery.yml: -------------------------------------------------------------------------------- 1 | method: service-account 2 | project: exampleproject 3 | dataset: "{{ var('variable_2') }}-dataset" 4 | threads: "{{ var('variable_1') }}" 5 | keyfile: "{{ env_var('BIGQUERY_KEYFILE') }}" 6 | timeout_seconds: 150 7 | priority: interactive 8 | location: us-west1 9 | retries: 1 10 | -------------------------------------------------------------------------------- /tests/goldens/config/staging/dbt.yml: -------------------------------------------------------------------------------- 1 | target: env_execution 2 | target_type: bigquery 3 | vars: 4 | variable_1: 1337 5 | variable_2: "var21" 6 | -------------------------------------------------------------------------------- /tests/goldens/config_template/copier.yml: -------------------------------------------------------------------------------- 1 | username: 2 | type: str 3 | -------------------------------------------------------------------------------- /tests/goldens/config_template/dp.yml.jinja: -------------------------------------------------------------------------------- 1 | templates: 2 | local-template: 3 | template_name: local-template 4 | template_path: /Users/{{ username }}/Documents/project-template 5 | my-template: 6 | template_name: my-template 7 | template_path: https://example.com/git/example.git 8 | vars: 9 | username: {{ username }} 10 | -------------------------------------------------------------------------------- /tests/goldens/dag/a.txt: -------------------------------------------------------------------------------- 1 | abcdef -------------------------------------------------------------------------------- /tests/goldens/dag/b.txt: -------------------------------------------------------------------------------- 1 | 123456 -------------------------------------------------------------------------------- /tests/goldens/dbt_project.yml: -------------------------------------------------------------------------------- 1 | name: 'my_test_project_1337' 2 | version: '1.2.3' 3 | config-version: 2 4 | 5 | profile: 'bigquery' 6 | 7 | source-paths: ["models"] 8 | analysis-paths: ["analysis"] 9 | test-paths: ["tests"] 10 | data-paths: ["data"] 11 | macro-paths: ["macros"] 12 | snapshot-paths: ["snapshots"] 13 | 14 | target-path: "target" 15 | clean-targets: 16 | - "target" 17 | - "dbt_modules" 18 | 19 | models: 20 | my_test_project_1337: 21 | +materialized: view 22 | -------------------------------------------------------------------------------- /tests/goldens/example_config.yml: -------------------------------------------------------------------------------- 1 | vars: 2 | username: testuser 3 | templates: 4 | template1: 5 | template_name: template1 6 | template_path: https://example.com/xyz/abcd.git 7 | template2: 8 | template_name: template2 9 | template_path: https://example.com/git/example.git 10 | create_test: 11 | template_name: create_test 12 | template_path: source_path 13 | -------------------------------------------------------------------------------- /tests/goldens/example_profiles/dev_bigquery.yml: -------------------------------------------------------------------------------- 1 | bigquery: 2 | outputs: 3 | env_execution: 4 | method: service-account 5 | project: exampleproject 6 | dataset: example-dataset 7 | keyfile: /var/keyfile.json 8 | timeout_seconds: 150 9 | priority: interactive 10 | location: us-west1 11 | 
threads: 1 12 | retries: 1 13 | type: bigquery 14 | target: env_execution 15 | -------------------------------------------------------------------------------- /tests/goldens/example_profiles/local_snowflake.yml: -------------------------------------------------------------------------------- 1 | snowflake: 2 | outputs: 3 | local: 4 | account: account_id 5 | database: DBT_EXAMPLE_DB 6 | warehouse: EXAMPLE_WAREHOUSE 7 | schema: EXAMPLE_SCHEMA 8 | 9 | role: DBT_ROLE 10 | user: EXAMPLE_USERNAME 11 | password: EXAMPLE_PASSWORD 12 | 13 | threads: 1 14 | client_session_keep_alive: False 15 | type: snowflake 16 | target: local 17 | -------------------------------------------------------------------------------- /tests/goldens/example_profiles/staging_bigquery.yml: -------------------------------------------------------------------------------- 1 | bigquery: 2 | outputs: 3 | env_execution: 4 | method: service-account 5 | project: exampleproject 6 | dataset: "{{ var('variable_2') }}-dataset" 7 | keyfile: "{{ env_var('BIGQUERY_KEYFILE') }}" 8 | timeout_seconds: 150 9 | priority: interactive 10 | location: us-west1 11 | threads: "{{ var('variable_1') }}" 12 | retries: 1 13 | type: bigquery 14 | target: env_execution 15 | -------------------------------------------------------------------------------- /tests/goldens/lookml/model1.model.lkml: -------------------------------------------------------------------------------- 1 | asdasd -------------------------------------------------------------------------------- /tests/goldens/lookml/view1.view.lkml: -------------------------------------------------------------------------------- 1 | hsjkadhds -------------------------------------------------------------------------------- /tests/goldens/source_yaml.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | sources: 3 | - name: source1 4 | tables: 5 | - name: table1 6 | columns: 7 | - name: column1 8 | - name: column2 9 | - name: table2 10 | columns: 11 | - name: column1 12 | - name: column2 13 | - name: source2 14 | tables: 15 | - name: table1 16 | columns: 17 | - name: column1 18 | -------------------------------------------------------------------------------- /tests/goldens/test_sync_2nd_directory/a/b/c/xyz: -------------------------------------------------------------------------------- 1 | Lorem ipsum -------------------------------------------------------------------------------- /tests/goldens/test_sync_2nd_directory/test2.txt: -------------------------------------------------------------------------------- 1 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent commodo blandit justo. Maecenas tempor massa sit amet ante aliquet, sit amet condimentum eros vehicula. Proin consectetur ex at consectetur tristique. Morbi hendrerit sed ipsum eget elementum. Phasellus volutpat egestas congue. Sed ultrices rutrum mollis. Proin ac metus sed ligula congue blandit sit amet in erat. Ut pretium quam non malesuada blandit. Sed commodo lacus a neque fringilla, ac imperdiet lectus feugiat. Nam pulvinar, tortor vitae ullamcorper luctus, justo eros lobortis sem, ut ullamcorper turpis sapien nec quam. 2 | 3 | Fusce congue velit sit amet cursus ultrices. In hac habitasse platea dictumst. Nulla maximus efficitur quam at mattis. Phasellus nisl purus, consectetur in euismod et, finibus vel libero. Nulla facilisi. Quisque vehicula risus vitae purus cursus, in varius arcu pulvinar. Nam in neque congue, euismod felis id, volutpat lacus. 
4 | 5 | Quisque vitae purus sit amet tortor dictum euismod nec non odio. Nam eu nulla augue. Donec vitae ipsum in libero mollis sagittis vel in turpis. Duis vel tempus nulla. Aenean in ullamcorper augue, eu volutpat nibh. Praesent interdum odio nisi, a faucibus diam finibus nec. Mauris ut lectus aliquet, dapibus leo vitae, pellentesque orci. Mauris neque arcu, condimentum nec lacus sed, mollis porta mi. Integer rhoncus congue risus, sed fermentum purus fringilla non. Duis pretium, odio vel vestibulum auctor, massa arcu iaculis enim, eu viverra nibh urna id massa. Mauris sagittis non justo ac tempor. Suspendisse non lacinia diam. Nullam in ante a risus consequat varius. 6 | 7 | Fusce vitae tempor ex. Duis tempus ligula a porttitor consequat. Praesent congue quam faucibus porta semper. Proin at pulvinar justo. Integer suscipit placerat nunc eget gravida. Nullam scelerisque, nisi nec gravida sollicitudin, orci metus fermentum ex, lacinia tempor risus lectus et metus. Sed et sapien at nisl feugiat malesuada. Praesent porttitor venenatis sem in ultrices. 8 | 9 | Aliquam at risus nec nulla ultricies laoreet. In pretium mattis libero, sed porta felis faucibus ac. Quisque eu tortor venenatis, pharetra eros volutpat, tempor nisi. Mauris sit amet massa ullamcorper, dapibus nisi et, ullamcorper mauris. Vivamus suscipit augue sapien, nec molestie ipsum venenatis quis. Donec ornare vehicula magna ut placerat. Aenean fermentum mi ac nunc consequat aliquet. Praesent mattis leo ut malesuada pulvinar. Aenean in libero leo. Etiam nec massa ut lectus efficitur ultrices id vel tellus. Vivamus lacinia interdum nisi eget volutpat. Ut elementum vehicula nunc, quis cursus eros ullamcorper et. 10 | -------------------------------------------------------------------------------- /tests/goldens/test_sync_directory/a/b/c/xyz: -------------------------------------------------------------------------------- 1 | Lorem ipsum -------------------------------------------------------------------------------- /tests/goldens/test_sync_directory/test1.txt: -------------------------------------------------------------------------------- 1 | abcdef -------------------------------------------------------------------------------- /tests/goldens/test_sync_directory/test2.txt: -------------------------------------------------------------------------------- 1 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Praesent commodo blandit justo. Maecenas tempor massa sit amet ante aliquet, sit amet condimentum eros vehicula. Proin consectetur ex at consectetur tristique. Morbi hendrerit sed ipsum eget elementum. Phasellus volutpat egestas congue. Sed ultrices rutrum mollis. Proin ac metus sed ligula congue blandit sit amet in erat. Ut pretium quam non malesuada blandit. Sed commodo lacus a neque fringilla, ac imperdiet lectus feugiat. Nam pulvinar, tortor vitae ullamcorper luctus, justo eros lobortis sem, ut ullamcorper turpis sapien nec quam. 2 | 3 | Fusce congue velit sit amet cursus ultrices. In hac habitasse platea dictumst. Nulla maximus efficitur quam at mattis. Phasellus nisl purus, consectetur in euismod et, finibus vel libero. Nulla facilisi. Quisque vehicula risus vitae purus cursus, in varius arcu pulvinar. Nam in neque congue, euismod felis id, volutpat lacus. 4 | 5 | Quisque vitae purus sit amet tortor dictum euismod nec non odio. Nam eu nulla augue. Donec vitae ipsum in libero mollis sagittis vel in turpis. Duis vel tempus nulla. Aenean in ullamcorper augue, eu volutpat nibh. 
Praesent interdum odio nisi, a faucibus diam finibus nec. Mauris ut lectus aliquet, dapibus leo vitae, pellentesque orci. Mauris neque arcu, condimentum nec lacus sed, mollis porta mi. Integer rhoncus congue risus, sed fermentum purus fringilla non. Duis pretium, odio vel vestibulum auctor, massa arcu iaculis enim, eu viverra nibh urna id massa. Mauris sagittis non justo ac tempor. Suspendisse non lacinia diam. Nullam in ante a risus consequat varius. 6 | 7 | Fusce vitae tempor ex. Duis tempus ligula a porttitor consequat. Praesent congue quam faucibus porta semper. Proin at pulvinar justo. Integer suscipit placerat nunc eget gravida. Nullam scelerisque, nisi nec gravida sollicitudin, orci metus fermentum ex, lacinia tempor risus lectus et metus. Sed et sapien at nisl feugiat malesuada. Praesent porttitor venenatis sem in ultrices. 8 | 9 | Aliquam at risus nec nulla ultricies laoreet. In pretium mattis libero, sed porta felis faucibus ac. Quisque eu tortor venenatis, pharetra eros volutpat, tempor nisi. Mauris sit amet massa ullamcorper, dapibus nisi et, ullamcorper mauris. Vivamus suscipit augue sapien, nec molestie ipsum venenatis quis. Donec ornare vehicula magna ut placerat. Aenean fermentum mi ac nunc consequat aliquet. Praesent mattis leo ut malesuada pulvinar. Aenean in libero leo. Etiam nec massa ut lectus efficitur ultrices id vel tellus. Vivamus lacinia interdum nisi eget volutpat. Ut elementum vehicula nunc, quis cursus eros ullamcorper et. 10 | -------------------------------------------------------------------------------- /tests/manifest_generation_tutorial.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | This document instructs how to generate `tests/goldens/manifest.json` for a particular dbt and manifest schema version. 4 | 5 | # Tutorial 6 | 7 | dbt version to Manifest version mapping is documented here: https://docs.getdbt.com/reference/artifacts/manifest-json 8 | 9 | 10 | ## dbt Core v.1.5.4 (Manifest version v9) 11 | 12 | ### 1. Prepare environment 13 | 14 | ``` 15 | virtualenv manifest_test --python=python3.9 16 | source manifest_test/bin/activate 17 | (manifest_test) % pip install -r requirements.txt 18 | ``` 19 | Where `requirements.txt` contains the following packages (please check which plugin version is compatible with a corresponding dbt version). 20 | 21 | ``` 22 | acryl-datahub[dbt]==0.10.4 23 | dbt-core==1.5.4 24 | dbt-spark==1.5.2 25 | dbt-bigquery==1.5.5 26 | dbt-postgres==1.5.4 27 | dbt-snowflake==1.5.2 28 | dbt-redshift==1.5.9 29 | ``` 30 | 31 | ### 2. Init dbt 32 | 33 | ```commandline 34 | (manifest_test) % mkdir manifest_test_project 35 | (manifest_test) % cd manifest_test_project 36 | (manifest_test) % dbt init 37 | ``` 38 | Fill in the information in the wizard as below. 39 | 40 | ``` 41 | Running with dbt=1.5.4 42 | Enter a name for your project (letters, digits, underscore): my_new_project 43 | Which database would you like to use? 
44 | [1] bigquery 45 | [2] snowflake 46 | [3] redshift 47 | [4] postgres 48 | [5] spark 49 | Enter a number: 1 50 | [1] oauth 51 | [2] service_account 52 | Desired authentication method option (enter a number): 1 53 | project (GCP project id): exampleproject 54 | dataset (the name of your dbt dataset): username_private_working_dataset 55 | threads (1 or more): 1 56 | job_execution_timeout_seconds [300]: 150 57 | [1] US 58 | [2] EU 59 | Desired location option (enter a number): 1 60 | Profile my_new_project written to /Users/your-user/.dbt/profiles.yml using target's profile_template.yml and your supplied values. Run 'dbt debug' to validate the connection. 61 | 62 | Your new dbt project "my_new_project" was created! 63 | ``` 64 | 65 | ### 3. Compile your dbt project 66 | 67 | Navigate to your newly created project and run `dbt compile`: 68 | 69 | ``` 70 | cd my_new_project 71 | dbt compile 72 | ``` 73 | 74 | You can find the generated `manifest.json` file in the `target` folder. 75 | 76 | ### 4. Copy manifest to your data-pipelines-cli branch 77 | 78 | Overwrite `tests/goldens/manifest.json` on your local branch of the `data-pipelines-cli` repository with the generated `manifest.json`. Verify that the tests run successfully; if they do not, adjust the code to the new version of the manifest schema. 79 | 80 | Navigate to the `my_new_project/target` folder where `manifest.json` has just been generated and copy it from there. (A short schema sanity-check sketch is included at the end of this listing.) -------------------------------------------------------------------------------- /tests/test_bi_utils.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import MagicMock, patch 3 | 4 | from data_pipelines_cli.bi_utils import BiAction, _bi_looker, bi 5 | from data_pipelines_cli.errors import DataPipelinesError, NotSuppertedBIError 6 | 7 | 8 | class BiUtilsTestCase(unittest.TestCase): 9 | def test_bi_compile_looker(self): 10 | bi_config = { 11 | "is_bi_enabled": True, 12 | "bi_target": "looker", 13 | "is_bi_compile": True, 14 | "is_bi_deploy": False, 15 | } 16 | 17 | _bi_looker_mock = MagicMock() 18 | 19 | with patch("data_pipelines_cli.bi_utils.read_bi_config", return_value=bi_config), patch( 20 | "data_pipelines_cli.bi_utils._bi_looker", _bi_looker_mock 21 | ): 22 | bi("env", BiAction.COMPILE) 23 | _bi_looker_mock.assert_called_with("env", True, False, None) 24 | 25 | def test_bi_deploy_looker(self): 26 | bi_config = { 27 | "is_bi_enabled": True, 28 | "bi_target": "looker", 29 | "is_bi_compile": True, 30 | "is_bi_deploy": True, 31 | } 32 | 33 | _bi_looker_mock = MagicMock() 34 | 35 | with patch("data_pipelines_cli.bi_utils.read_bi_config", return_value=bi_config), patch( 36 | "data_pipelines_cli.bi_utils._bi_looker", _bi_looker_mock 37 | ): 38 | bi("env", BiAction.DEPLOY) 39 | _bi_looker_mock.assert_called_with("env", False, True, None) 40 | 41 | def test_bi_disabled(self): 42 | bi_config = { 43 | "is_bi_enabled": False, 44 | "bi_target": "looker", 45 | "is_bi_compile": True, 46 | "is_bi_deploy": False, 47 | } 48 | 49 | _bi_looker_mock = MagicMock() 50 | 51 | with patch("data_pipelines_cli.bi_utils.read_bi_config", return_value=bi_config), patch( 52 | "data_pipelines_cli.bi_utils._bi_looker", _bi_looker_mock 53 | ): 54 | bi("env", BiAction.COMPILE) 55 | _bi_looker_mock.assert_not_called() 56 | 57 | @patch("data_pipelines_cli.bi_utils._bi_looker") 58 | @patch("data_pipelines_cli.bi_utils.read_bi_config") 59 | def test_bi_disabled_when_config_not_exists(self, mock_read_bi_config, mock__bi_looker): 60 | mock_read_bi_config.return_value = {} 61 |
bi("non_existent_env", BiAction.COMPILE) 62 | mock__bi_looker.assert_not_called() 63 | 64 | def test_bi_not_supported_bi(self): 65 | bi_config = { 66 | "is_bi_enabled": True, 67 | "bi_target": "superset", 68 | "is_bi_compile": True, 69 | "is_bi_deploy": False, 70 | } 71 | 72 | with patch("data_pipelines_cli.bi_utils.read_bi_config", return_value=bi_config): 73 | self.assertRaises(NotSuppertedBIError, bi, "env", BiAction.COMPILE) 74 | 75 | def test_bi_looker_compile(self): 76 | generate_lookML_model_mock = MagicMock() 77 | deploy_lookML_model_mock = MagicMock() 78 | with patch( 79 | "data_pipelines_cli.bi_utils.generate_lookML_model", generate_lookML_model_mock 80 | ), patch("data_pipelines_cli.bi_utils.deploy_lookML_model", deploy_lookML_model_mock): 81 | _bi_looker("env", True) 82 | 83 | generate_lookML_model_mock.assert_called_once() 84 | deploy_lookML_model_mock.assert_not_called() 85 | 86 | def test_bi_looker_deploy(self): 87 | generate_lookML_model_mock = MagicMock() 88 | deploy_lookML_model_mock = MagicMock() 89 | with patch( 90 | "data_pipelines_cli.bi_utils.generate_lookML_model", generate_lookML_model_mock 91 | ), patch("data_pipelines_cli.bi_utils.deploy_lookML_model", deploy_lookML_model_mock): 92 | _bi_looker("env", False, True, "/path/to/git/key") 93 | 94 | generate_lookML_model_mock.assert_not_called() 95 | deploy_lookML_model_mock.assert_called_once() 96 | 97 | def test_bi_looker_deploy_no_key_provided(self): 98 | generate_lookML_model_mock = MagicMock() 99 | deploy_lookML_model_mock = MagicMock() 100 | with patch( 101 | "data_pipelines_cli.bi_utils.generate_lookML_model", generate_lookML_model_mock 102 | ), patch("data_pipelines_cli.bi_utils.deploy_lookML_model", deploy_lookML_model_mock): 103 | self.assertRaises(DataPipelinesError, _bi_looker, "env", False, True) 104 | 105 | generate_lookML_model_mock.assert_not_called() 106 | 107 | def test_bi_not_supported_action(self): 108 | bi_config = { 109 | "is_bi_enabled": True, 110 | "bi_target": "looker", 111 | "is_bi_compile": True, 112 | "is_bi_deploy": False, 113 | } 114 | 115 | _bi_looker_mock = MagicMock() 116 | 117 | with patch("data_pipelines_cli.bi_utils.read_bi_config", return_value=bi_config), patch( 118 | "data_pipelines_cli.bi_utils._bi_looker", _bi_looker_mock 119 | ): 120 | bi("env", 2) 121 | _bi_looker_mock.assert_called_with("env", False, False, None) 122 | -------------------------------------------------------------------------------- /tests/test_cli_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import unittest 4 | from io import StringIO 5 | from unittest.mock import patch 6 | 7 | from data_pipelines_cli.cli_utils import ( 8 | echo_error, 9 | echo_info, 10 | echo_suberror, 11 | echo_subinfo, 12 | echo_warning, 13 | get_argument_or_environment_variable, 14 | subprocess_run, 15 | ) 16 | from data_pipelines_cli.errors import ( 17 | DataPipelinesError, 18 | SubprocessNonZeroExitError, 19 | SubprocessNotFound, 20 | ) 21 | 22 | 23 | class CliUtilsTest(unittest.TestCase): 24 | echo_is_printing_to_out = [ 25 | (echo_error, False), 26 | (echo_suberror, False), 27 | (echo_warning, False), 28 | (echo_info, True), 29 | (echo_subinfo, True), 30 | ] 31 | 32 | def test_echoes_to_proper_streams(self): 33 | test_string = "Hello world!" 
34 | endlined_test_string = test_string + "\n" 35 | 36 | for echo_fun, is_stdout in self.echo_is_printing_to_out: 37 | with self.subTest(function=echo_fun.__name__), patch( 38 | "sys.stdout", new=StringIO() 39 | ) as fake_out, patch("sys.stderr", new=StringIO()) as fake_err: 40 | echo_fun(test_string) 41 | self.assertEqual(endlined_test_string if is_stdout else "", fake_out.getvalue()) 42 | self.assertEqual(endlined_test_string if not is_stdout else "", fake_err.getvalue()) 43 | 44 | some_env_variable_key = "SOME_VARIABLE" 45 | some_env_variable_value = "some_value" 46 | 47 | @patch.dict(os.environ, {some_env_variable_key: some_env_variable_value}) 48 | def test_get_argument_from_argument(self): 49 | argument = "argument" 50 | self.assertEqual( 51 | argument, 52 | get_argument_or_environment_variable(argument, "arg", self.some_env_variable_key), 53 | ) 54 | 55 | @patch.dict(os.environ, {some_env_variable_key: some_env_variable_value}) 56 | def test_get_argument_from_env_var(self): 57 | self.assertEqual( 58 | self.some_env_variable_value, 59 | get_argument_or_environment_variable(None, "arg", self.some_env_variable_key), 60 | ) 61 | 62 | @patch.dict(os.environ, {}) 63 | def test_get_argument_throw(self): 64 | with self.assertRaises(DataPipelinesError): 65 | get_argument_or_environment_variable(None, "arg", self.some_env_variable_key) 66 | 67 | @patch("data_pipelines_cli.cli_utils.subprocess.run") 68 | def test_subprocess_run_return_code(self, mock_run): 69 | mock_run.return_value.returncode = 0 70 | result = subprocess_run(["testproc", "--arg"]) 71 | self.assertEqual(0, result.returncode) 72 | 73 | @patch("data_pipelines_cli.cli_utils.subprocess.run") 74 | def test_subprocess_run_not_exist(self, mock_run): 75 | mock_run.side_effect = FileNotFoundError 76 | with self.assertRaises(SubprocessNotFound) as exc: 77 | _ = subprocess_run(["testproc", "--arg"]) 78 | self.assertRegex(exc.exception.message, r"^testproc.*$") 79 | 80 | @patch("data_pipelines_cli.cli_utils.subprocess.run") 81 | def test_subprocess_run_nonzero_throws(self, mock_run): 82 | mock_run.side_effect = subprocess.CalledProcessError(21, cmd="") 83 | with self.assertRaises(SubprocessNonZeroExitError) as exc: 84 | _ = subprocess_run(["testproc", "--arg"]) 85 | self.assertRegex(exc.exception.message, r"^testproc.*21$") 86 | -------------------------------------------------------------------------------- /tests/test_config_generation.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | import tempfile 4 | import unittest 5 | from unittest.mock import patch 6 | 7 | import yaml 8 | 9 | import data_pipelines_cli.config_generation as cgen 10 | 11 | 12 | def _noop(): 13 | pass 14 | 15 | 16 | class TestConfigGeneration(unittest.TestCase): 17 | envs_to_test = [ 18 | ("dev", "bigquery"), 19 | ("staging", "bigquery"), 20 | ("local", "snowflake"), 21 | ] 22 | configs_to_test = { 23 | "dev": { 24 | "target": "env_execution", 25 | "target_type": "bigquery", 26 | "vars": { 27 | "variable_1": 123, 28 | "variable_2": "var2", 29 | }, 30 | }, 31 | "local": { 32 | "target": "local", 33 | "target_type": "snowflake", 34 | "vars": { 35 | "variable_1": 123, 36 | "variable_2": "var2", 37 | }, 38 | }, 39 | "staging": { 40 | "target": "env_execution", 41 | "target_type": "bigquery", 42 | "vars": { 43 | "variable_1": 1337, 44 | "variable_2": "var21", 45 | }, 46 | }, 47 | } 48 | 49 | def setUp(self) -> None: 50 | self.maxDiff = None 51 | self.goldens_dir_path = 
pathlib.Path(__file__).parent.joinpath("goldens") 52 | 53 | def test_copy_dag_dir(self): 54 | with tempfile.TemporaryDirectory() as tmp_dir, patch( 55 | "data_pipelines_cli.config_generation.BUILD_DIR", pathlib.Path(tmp_dir) 56 | ), patch("pathlib.Path.cwd", lambda: self.goldens_dir_path): 57 | cgen.copy_dag_dir_to_build_dir() 58 | tmp_dag_dir = pathlib.Path(tmp_dir).joinpath("dag") 59 | self.assertEqual(2, len(os.listdir(tmp_dag_dir))) 60 | with open(tmp_dag_dir.joinpath("a.txt"), "r") as f: 61 | self.assertEqual("abcdef", f.read()) 62 | with open(tmp_dag_dir.joinpath("b.txt"), "r") as f: 63 | self.assertEqual("123456", f.read()) 64 | 65 | def test_copy_dir_rmdir_if_exists(self): 66 | with tempfile.TemporaryDirectory() as tmp_dir1, tempfile.TemporaryDirectory() as tmp_dir2: # noqa: E501 67 | path1 = pathlib.Path(tmp_dir2) 68 | with open(path1.joinpath("a.txt"), "w") as f: 69 | f.write("qwerty987") 70 | path1.joinpath("b.txt").touch() 71 | self.assertEqual(2, len(os.listdir(path1))) 72 | 73 | path2 = pathlib.Path(tmp_dir1) 74 | with open(path2.joinpath("a.txt"), "w") as f: 75 | f.write("abc1234") 76 | path2.joinpath("b.txt").touch() 77 | path2.joinpath("c.txt").touch() 78 | self.assertEqual(3, len(os.listdir(path2))) 79 | with open(path2.joinpath("a.txt"), "r") as f: 80 | self.assertEqual("abc1234", f.read()) 81 | 82 | cgen._copy_src_dir_to_dst_dir(path1, path2) 83 | 84 | self.assertEqual(2, len(os.listdir(path2))) 85 | with open(path2.joinpath("a.txt"), "r") as f: 86 | self.assertEqual("qwerty987", f.read()) 87 | 88 | def test_read_from_config_dir(self): 89 | for env, _ in self.envs_to_test: 90 | with self.subTest(env=env): 91 | self.assertDictEqual( 92 | self.configs_to_test[env], 93 | cgen.read_dictionary_from_config_directory( 94 | self.goldens_dir_path, env, "dbt.yml" 95 | ), 96 | ) 97 | 98 | def test_generation(self): 99 | for env, profile_type in self.envs_to_test: 100 | with self.subTest( 101 | env=env, profile_type=profile_type 102 | ), tempfile.TemporaryDirectory() as tmp_dir, patch( 103 | "data_pipelines_cli.cli_constants.BUILD_DIR", pathlib.Path(tmp_dir) 104 | ), patch( 105 | "data_pipelines_cli.config_generation.BUILD_DIR", 106 | pathlib.Path(tmp_dir), 107 | ), patch( 108 | "pathlib.Path.cwd", lambda: self.goldens_dir_path 109 | ): 110 | self.profiles_path = cgen.generate_profiles_yml(env).joinpath("profiles.yml") 111 | with open(self.profiles_path, "r") as generated, open( 112 | self.goldens_dir_path.joinpath("example_profiles", f"{env}_{profile_type}.yml"), 113 | "r", 114 | ) as prepared: 115 | self.assertDictEqual(yaml.safe_load(prepared), yaml.safe_load(generated)) 116 | 117 | os.remove(self.profiles_path) 118 | os.rmdir(self.profiles_path.parent) 119 | os.rmdir(self.profiles_path.parent.parent) 120 | -------------------------------------------------------------------------------- /tests/test_data_structures.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import shutil 3 | import tempfile 4 | import unittest 5 | from unittest.mock import patch 6 | 7 | from data_pipelines_cli.data_structures import ( 8 | DataPipelinesConfig, 9 | DockerArgs, 10 | TemplateConfig, 11 | read_env_config, 12 | ) 13 | from data_pipelines_cli.errors import DataPipelinesError, NoConfigFileError 14 | 15 | 16 | class DataStructuresTestCase(unittest.TestCase): 17 | example_config_dict = DataPipelinesConfig( 18 | templates={ 19 | "template1": TemplateConfig( 20 | template_name="template1", 21 | template_path="https://example.com/xyz/abcd.git", 
22 | ), 23 | "template2": TemplateConfig( 24 | template_name="template2", 25 | template_path="https://example.com/git/example.git", 26 | ), 27 | "create_test": TemplateConfig( 28 | template_name="create_test", 29 | template_path="source_path", 30 | ), 31 | }, 32 | vars={"username": "testuser"}, 33 | ) 34 | example_config_path = pathlib.Path(__file__).parent.joinpath("goldens", "example_config.yml") 35 | 36 | def test_read_config(self): 37 | with patch( 38 | "data_pipelines_cli.cli_constants.ENV_CONFIGURATION_PATH", 39 | self.example_config_path, 40 | ): 41 | self.assertEqual(self.example_config_dict, read_env_config()) 42 | 43 | def test_read_config_no_file(self): 44 | with tempfile.TemporaryDirectory() as tmp_dir, patch( 45 | "data_pipelines_cli.cli_constants.ENV_CONFIGURATION_PATH", 46 | pathlib.Path(tmp_dir).joinpath("non_existing_file.yml"), 47 | ): 48 | with self.assertRaises(NoConfigFileError): 49 | read_env_config() 50 | 51 | 52 | class DockerArgsTest(unittest.TestCase): 53 | goldens_dir_path = pathlib.Path(__file__).parent.joinpath("goldens") 54 | 55 | def setUp(self) -> None: 56 | self.build_temp_dir = pathlib.Path(tempfile.mkdtemp()) 57 | dags_path = pathlib.Path(self.build_temp_dir).joinpath("dag") 58 | dags_path.mkdir(parents=True) 59 | shutil.copytree(self.goldens_dir_path.joinpath("config"), dags_path.joinpath("config")) 60 | 61 | def tearDown(self) -> None: 62 | shutil.rmtree(self.build_temp_dir) 63 | 64 | @patch("data_pipelines_cli.data_structures.git_revision_hash") 65 | def test_build_tag(self, mock_git_revision_hash): 66 | repository = "my_docker_repository_uri" 67 | commit_sha = "eee440bfbe0801ec3f533f897c1d55e6a5afd5cd" 68 | mock_git_revision_hash.return_value = commit_sha 69 | 70 | with patch("data_pipelines_cli.cli_constants.BUILD_DIR", self.build_temp_dir): 71 | docker_args = DockerArgs("base", None, {}) 72 | 73 | self.assertEqual(f"{repository}:{commit_sha}", docker_args.docker_build_tag()) 74 | self.assertEqual(repository, docker_args.repository) 75 | self.assertEqual(commit_sha, docker_args.image_tag) 76 | 77 | def test_given_tag(self): 78 | repository = "my_docker_repository_uri" 79 | image_tag = "my_awesome_tag_eee440b_latest" 80 | 81 | with patch("data_pipelines_cli.cli_constants.BUILD_DIR", self.build_temp_dir): 82 | docker_args = DockerArgs("base", image_tag, {}) 83 | 84 | self.assertEqual(f"{repository}:{image_tag}", docker_args.docker_build_tag()) 85 | self.assertEqual(repository, docker_args.repository) 86 | self.assertEqual(image_tag, docker_args.image_tag) 87 | 88 | def test_set_tag(self): 89 | repository = "my_docker_repository_uri" 90 | image_tag = "some_test_tag_a1s2d3f" 91 | 92 | with patch("data_pipelines_cli.cli_constants.BUILD_DIR", self.build_temp_dir): 93 | docker_args = DockerArgs("image_tag", None, {}) 94 | 95 | self.assertEqual(f"{repository}:{image_tag}", docker_args.docker_build_tag()) 96 | self.assertEqual(repository, docker_args.repository) 97 | self.assertEqual(image_tag, docker_args.image_tag) 98 | 99 | @patch("data_pipelines_cli.cli_constants.BUILD_DIR", goldens_dir_path) 100 | @patch("data_pipelines_cli.data_structures.git_revision_hash") 101 | def test_no_repository(self, mock_git_revision_hash): 102 | commit_sha = "eee440bfbe0801ec3f533f897c1d55e6a5afd5cd" 103 | mock_git_revision_hash.return_value = commit_sha 104 | 105 | with self.assertRaises(DataPipelinesError): 106 | _ = DockerArgs("base", None, {}) 107 | 108 | @patch("data_pipelines_cli.data_structures.git_revision_hash") 109 | def test_no_git_hash(self, 
mock_git_revision_hash): 110 | mock_git_revision_hash.return_value = None 111 | 112 | patch("data_pipelines_cli.cli_constants.BUILD_DIR", self.build_temp_dir) 113 | -------------------------------------------------------------------------------- /tests/test_dbt_utils.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import tempfile 3 | import unittest 4 | from typing import List 5 | from unittest.mock import patch 6 | 7 | import yaml 8 | 9 | from data_pipelines_cli.dbt_utils import read_dbt_vars_from_configs, run_dbt_command 10 | from data_pipelines_cli.errors import NoConfigFileError 11 | 12 | 13 | class DbtUtilsTest(unittest.TestCase): 14 | dp_config = { 15 | "vars": { 16 | "var1": 1, 17 | "var2": "var2_value", 18 | }, 19 | "templates": {}, 20 | } 21 | dbt_config = { 22 | "target": "env_execution", 23 | "target_type": "bigquery", 24 | "vars": { 25 | "var1": 2, 26 | }, 27 | } 28 | goldens_dir_path = pathlib.Path(__file__).parent.joinpath("goldens") 29 | 30 | def setUp(self) -> None: 31 | self.subprocess_run_args = [] 32 | 33 | def _mock_run(self, args: List[str], **_kwargs): 34 | self.subprocess_run_args = args 35 | 36 | def test_dbt_run(self): 37 | with tempfile.NamedTemporaryFile() as tmp_file, patch( 38 | "data_pipelines_cli.cli_constants.ENV_CONFIGURATION_PATH", 39 | pathlib.Path(tmp_file.name), 40 | ), patch( 41 | "data_pipelines_cli.dbt_utils.read_dictionary_from_config_directory", 42 | lambda _a, _b, _c: self.dbt_config, 43 | ), patch( 44 | "data_pipelines_cli.dbt_utils.subprocess_run", self._mock_run 45 | ): 46 | with open(tmp_file.name, "w") as f: 47 | yaml.dump(self.dp_config, f) 48 | run_dbt_command( 49 | ("really", "long", "command"), "test_env", pathlib.Path("profiles_path") 50 | ) 51 | self.assertListEqual( 52 | [ 53 | "dbt", 54 | "really", 55 | "long", 56 | "command", 57 | "--profile", 58 | "bigquery", 59 | "--profiles-dir", 60 | "profiles_path", 61 | "--target", 62 | "env_execution", 63 | "--vars", 64 | "{var1: 1, var2: var2_value}\n", 65 | ], 66 | self.subprocess_run_args, 67 | ) 68 | 69 | def test_read_vars_no_throw(self): 70 | with tempfile.TemporaryDirectory() as tmp_dir, patch( 71 | "data_pipelines_cli.cli_constants.ENV_CONFIGURATION_PATH", 72 | pathlib.Path(tmp_dir).joinpath("non_existing_config_file"), 73 | ): 74 | try: 75 | result = read_dbt_vars_from_configs("env") 76 | self.assertDictEqual({}, result) 77 | except NoConfigFileError: 78 | self.fail("_read_dbt_vars_from_configs() raised NoConfigFileError!") 79 | -------------------------------------------------------------------------------- /tests/test_docker_response_reader.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from data_pipelines_cli.docker_response_reader import DockerResponseReader 4 | from data_pipelines_cli.errors import DockerErrorResponseError 5 | 6 | 7 | class DockerResponseReaderTestCase(unittest.TestCase): 8 | def test_status(self): 9 | docker_response = [ 10 | '{"status":"The push refers to repository [docker.io/library/rep]"}', 11 | '{"status":"abcdef"}', 12 | ] 13 | reader = DockerResponseReader(docker_response) 14 | self.assertListEqual( 15 | ["The push refers to repository [docker.io/library/rep]", "abcdef"], 16 | list(map(str, reader.read_response())), 17 | ) 18 | 19 | def test_stream(self): 20 | docker_response = [ 21 | '{"stream":"Step 1/10 : FROM abc/blabla:tag123\\n\\n\\n' 22 | " ---> abcdef123456\\n\\n" 23 | 'Step 2/10 : ADD some_important_dir 
/var/importantes/\\n\\n"}', 24 | ] 25 | reader = DockerResponseReader(docker_response) 26 | self.assertListEqual( 27 | [ 28 | "Step 1/10 : FROM abc/blabla:tag123", 29 | " ---> abcdef123456", 30 | "Step 2/10 : ADD some_important_dir /var/importantes/", 31 | ], 32 | list(map(str, reader.read_response())), 33 | ) 34 | 35 | def test_aux(self): 36 | docker_response = [ 37 | '{"aux":' 38 | "{" 39 | '"Digest": "abcde", ' 40 | '"ID": "sha256:99bb86b06ec9a43d0be231c8794666c1dba8ac38a9d6f46656fd286137db092d"' # noqa: E501 41 | "}" 42 | "}", 43 | ] 44 | reader = DockerResponseReader(docker_response) 45 | self.assertListEqual( 46 | [ 47 | "Digest: abcde", 48 | "ID: sha256:99bb86b06ec9a43d0be231c8794666c1dba8ac38a9d6f46656fd286137db092d", # noqa: E501 49 | ], 50 | list(map(str, reader.read_response())), 51 | ) 52 | 53 | def test_error(self): 54 | docker_response = [ 55 | '{"error":"Something went wrong"}', 56 | ] 57 | reader = DockerResponseReader(docker_response) 58 | self.assertListEqual( 59 | ["ERROR: Something went wrong"], list(map(str, reader.read_response())) 60 | ) 61 | 62 | def test_error_detail(self): 63 | docker_response = [ 64 | '{"errorDetail":{"message": "Something went wrong"}}', 65 | '{"errorDetail":{"message": "Something went really wrong", "code": 500}}', 66 | ] 67 | reader = DockerResponseReader(docker_response) 68 | self.assertListEqual( 69 | [ 70 | "ERROR: Something went wrong", 71 | "ERROR: Something went really wrong\nError code: 500", 72 | ], 73 | list(map(str, reader.read_response())), 74 | ) 75 | 76 | def test_error_raised(self): 77 | docker_response = [ 78 | '{"status":"The push refers to repository [docker.io/library/rep]"}', 79 | '{"status":"abcdef"}', 80 | '{"errorDetail":{"message": "Something went wrong"}}', 81 | ] 82 | with self.assertRaises(DockerErrorResponseError): 83 | reader = DockerResponseReader(docker_response) 84 | reader.click_echo_ok_responses() 85 | -------------------------------------------------------------------------------- /tests/test_io_utils.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import subprocess 3 | import tempfile 4 | import unittest 5 | from io import StringIO 6 | from unittest.mock import MagicMock, patch 7 | 8 | from data_pipelines_cli.io_utils import git_revision_hash, replace 9 | 10 | 11 | class TestReplace(unittest.TestCase): 12 | pattern_to_replace = "<(TEST-PaTtErN___!@#$%^abcdef__" 13 | regex_to_replace = r"<\(TEST-PaTtErN___!@#\$%\^[a-z]+__" 14 | replacement = "0x13370x42 || http://example,com<>" 15 | text_to_replace = ( 16 | f'<(TEST-Pattern,1234{pattern_to_replace}>>>>=><><"\n\n{pattern_to_replace}xxx' 17 | ) 18 | expected_result = f'<(TEST-Pattern,1234{replacement}>>>>=><><"\n\n{replacement}xxx' 19 | 20 | def test_replace(self): 21 | with tempfile.TemporaryDirectory() as tmp_dir: 22 | filename = pathlib.Path(tmp_dir).joinpath("test") 23 | with open(filename, "w") as tmp_file: 24 | tmp_file.write(self.text_to_replace) 25 | replace(tmp_file.name, self.regex_to_replace, self.replacement) 26 | with open(filename, "r") as tmp_file: 27 | output = tmp_file.readlines() 28 | self.assertEqual(self.expected_result, "".join(output)) 29 | 30 | 31 | class TestGitRevisionHash(unittest.TestCase): 32 | @patch("data_pipelines_cli.io_utils.subprocess.run") 33 | def test_git_revision_hash(self, mock_run): 34 | git_sha = "abcdef1337" 35 | 36 | mock_stdout = MagicMock() 37 | mock_stdout.configure_mock(**{"stdout.decode.return_value": git_sha}) 38 | mock_run.return_value = mock_stdout 39 | 
40 | result = git_revision_hash() 41 | self.assertEqual(git_sha, result) 42 | 43 | @patch("data_pipelines_cli.io_utils.subprocess.run") 44 | def test_git_does_not_exist(self, mock_run): 45 | mock_run.side_effect = FileNotFoundError 46 | result = git_revision_hash() 47 | self.assertEqual(None, result) 48 | 49 | @patch("data_pipelines_cli.io_utils.subprocess.run") 50 | def test_git_error(self, mock_run): 51 | test_error = "Some test error" 52 | mock_run.side_effect = subprocess.CalledProcessError(128, cmd="", stderr=test_error) 53 | 54 | with patch("sys.stderr", new=StringIO()) as fake_out: 55 | result = git_revision_hash() 56 | self.assertEqual(None, result) 57 | self.assertIn(test_error, fake_out.getvalue()) 58 | -------------------------------------------------------------------------------- /tests/test_looker_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | import shutil 4 | import tempfile 5 | import unittest 6 | from os import PathLike 7 | from typing import Any 8 | from unittest.mock import MagicMock, patch 9 | 10 | from data_pipelines_cli.looker_utils import ( 11 | _clear_repo_before_writing_lookml, 12 | _deploy_looker_project_to_production, 13 | deploy_lookML_model, 14 | generate_lookML_model, 15 | ) 16 | 17 | goldens_dir_path = pathlib.Path(__file__).parent.joinpath("goldens") 18 | 19 | 20 | class LookerUtilsTestCase(unittest.TestCase): 21 | 22 | dbt_project = { 23 | "config-version": 2, 24 | "name": "my_test_project_1338_sources", 25 | "version": "1.2.3", 26 | "source-paths": ["models"], 27 | } 28 | 29 | def setUp(self) -> None: 30 | self.build_temp_dir = pathlib.Path(tempfile.mkdtemp()) 31 | dags_path = pathlib.Path(self.build_temp_dir).joinpath("dag") 32 | dags_path.mkdir(parents=True) 33 | shutil.copytree(goldens_dir_path.joinpath("config"), dags_path.joinpath("config")) 34 | shutil.copytree(goldens_dir_path.joinpath("lookml"), self.build_temp_dir.joinpath("lookml")) 35 | 36 | def tearDown(self) -> None: 37 | shutil.rmtree(self.build_temp_dir) 38 | 39 | def mock_origin(self, name: str): 40 | self.origin = MagicMock() 41 | self.origin.push = MagicMock() 42 | return self.origin 43 | 44 | def mock_clone_from(self, url: PathLike, to_path: PathLike, **kwargs: Any): 45 | self.assertEqual("https://gitlab.com/getindata/dataops/some_looker_repo.git", url) 46 | self.assertEqual("master", kwargs["branch"]) 47 | 48 | def noop(): 49 | pass 50 | 51 | repo_mock = MagicMock() 52 | self.git = MagicMock() 53 | self.index = MagicMock() 54 | self.index.commit = MagicMock() 55 | self.git.add = MagicMock() 56 | config_writer_mock = MagicMock() 57 | set_value_mock = MagicMock() 58 | set_value_mock.configure_mock(**{"release": noop}) 59 | config_writer_mock.configure_mock(**{"set_value": lambda x, y, z: set_value_mock}) 60 | repo_mock.configure_mock( 61 | **{ 62 | "config_writer": config_writer_mock, 63 | "git": self.git, 64 | "index": self.index, 65 | "remote": self.mock_origin, 66 | } 67 | ) 68 | return repo_mock 69 | 70 | def repo_class_mock(self): 71 | self.repo_mock_class = MagicMock() 72 | self.repo_mock_class.configure_mock(**{"clone_from": self.mock_clone_from}) 73 | return self.repo_mock_class 74 | 75 | @patch("pathlib.Path.cwd", lambda: goldens_dir_path) 76 | def test_bi_deploy_looker(self): 77 | os.mkdir(self.build_temp_dir.joinpath("looker_project_repo")) 78 | with patch("data_pipelines_cli.looker_utils.BUILD_DIR", self.build_temp_dir), patch( 79 | "data_pipelines_cli.looker_utils.Repo", self.repo_class_mock() 
80 | ), patch("data_pipelines_cli.looker_utils._deploy_looker_project_to_production"), patch( 81 | "data_pipelines_cli.looker_utils.LOOKML_DEST_PATH", 82 | self.build_temp_dir.joinpath("lookml"), 83 | ), patch( 84 | "data_pipelines_cli.looker_utils.generate_profiles_yml" 85 | ), patch( 86 | "data_pipelines_cli.looker_utils.run_dbt_command" 87 | ): 88 | deploy_lookML_model("/path/to/key", "env") 89 | 90 | self.assertTrue( 91 | os.path.exists( 92 | self.build_temp_dir.joinpath("looker_project_repo", "views", "view1.dp.view.lkml") 93 | ) 94 | ) 95 | self.assertTrue( 96 | os.path.exists( 97 | self.build_temp_dir.joinpath("looker_project_repo", "model1.dp.model.lkml") 98 | ) 99 | ) 100 | self.assertTrue( 101 | os.path.exists(self.build_temp_dir.joinpath("looker_project_repo", "readme.txt")) 102 | ) 103 | 104 | def test_bi_compile_looker(self): 105 | subprocess_run_mock = MagicMock() 106 | with patch("data_pipelines_cli.looker_utils.subprocess_run", subprocess_run_mock), patch( 107 | "data_pipelines_cli.looker_utils.LOOKML_DEST_PATH", "/path/for/lookml" 108 | ): 109 | generate_lookML_model() 110 | 111 | subprocess_run_mock.assert_called_once_with( 112 | ["dbt2looker", "--output-dir", "/path/for/lookml"] 113 | ) 114 | 115 | def test_bi_deploy_looker_project_to_production(self): 116 | looker_instance_url = "getindata.looker.com" 117 | project_id = "getindata_unittest" 118 | branch = "master" 119 | webhook_secret = "super_secret_value" 120 | 121 | headers = {"X-Looker-Deploy-Secret": webhook_secret} 122 | requests_post = MagicMock() 123 | with patch("data_pipelines_cli.looker_utils.requests.post", requests_post): 124 | _deploy_looker_project_to_production( 125 | looker_instance_url, project_id, branch, webhook_secret 126 | ) 127 | 128 | requests_post.assert_called_once_with( 129 | url="getindata.looker.com/webhooks/projects/getindata_unittest/deploy/branch/master", 130 | headers=headers, 131 | ) 132 | 133 | def test_bi_clear_repo_before_writing_lookml(self): 134 | local_repo_dir = self.build_temp_dir.joinpath("looker_project_repo") 135 | os.mkdir(local_repo_dir) 136 | os.mkdir(local_repo_dir.joinpath("views")) 137 | with open(f"{local_repo_dir}/test.dp.model.lkml", "w") as tst: 138 | tst.write("test") 139 | 140 | _clear_repo_before_writing_lookml(local_repo_dir) 141 | self.assertFalse(os.path.isfile(f"{local_repo_dir}/test.dp.model.lkml")) 142 | -------------------------------------------------------------------------------- /tests/test_vcs_utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | import string 3 | import unittest 4 | 5 | from data_pipelines_cli.vcs_utils import add_suffix_to_git_template_path 6 | 7 | 8 | class GitSuffixTestCase(unittest.TestCase): 9 | remote_prefixes = ["git://", "git@", "git+", "http://", "https://"] 10 | 11 | def test_local_uri_not_has_git_suffix(self): 12 | for i in range(50): 13 | random_path = "".join( 14 | random.choices(string.ascii_letters + string.digits + "/", k=30) 15 | ).strip("/") 16 | 17 | with self.subTest(i=i, path=random_path): 18 | generated_path = add_suffix_to_git_template_path(random_path) 19 | self.assertFalse(generated_path.endswith(".git")) 20 | 21 | def test_remote_uri_has_git_suffix(self): 22 | for i in range(50): 23 | prefix = random.choice(self.remote_prefixes) 24 | random_path = "".join( 25 | random.choices(string.ascii_letters + string.digits + "/", k=30) 26 | ).strip("/") 27 | git_suffix = ".git" if bool(random.getrandbits(1)) else "" 28 | full_path = prefix + random_path + 
git_suffix 29 | 30 | with self.subTest(i=i, uri=full_path): 31 | generated_path = add_suffix_to_git_template_path(full_path) 32 | self.assertTrue(generated_path.endswith(".git")) 33 | self.assertFalse(generated_path.endswith(".git.git")) 34 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | 2 | [tox] 3 | envlist = py39, py310 4 | 5 | [gh-actions] 6 | python = 7 | 3.9: py39 8 | 3.10: py310 9 | 10 | [testenv] 11 | extras = 12 | tests 13 | databricks 14 | commands= 15 | python -m pytest --cov data_pipelines_cli --cov-report xml --cov-report term-missing --ignore=venv 16 | 17 | # Lint 18 | [flake8] 19 | exclude = .git,__pycache__,build,dist,docs/source/conf.py 20 | max-line-length = 100 21 | extend-ignore = E203 22 | 23 | [mypy] 24 | no_strict_optional = True 25 | ignore_missing_imports = True 26 | 27 | [mypy-tests.*] 28 | ignore_errors = True 29 | 30 | # Autoformatter 31 | [testenv:black] 32 | basepython = python3 33 | skip_install = true 34 | deps = 35 | black 36 | commands = 37 | black 38 | 39 | # Release tooling 40 | [testenv:build] 41 | basepython = python3 42 | skip_install = true 43 | deps = 44 | wheel 45 | setuptools 46 | commands = 47 | python setup.py sdist 48 | 49 | [testenv:release] 50 | basepython = python3 51 | skip_install = true 52 | setenv = 53 | TWINE_USERNAME = {env:TWINE_USERNAME} 54 | TWINE_PASSWORD = {env:TWINE_PASSWORD} 55 | deps = 56 | {[testenv:build]deps} 57 | twine >= 1.5.0 58 | commands = 59 | {[testenv:build]commands} 60 | twine upload --skip-existing dist/* 61 | --------------------------------------------------------------------------------
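
A short sanity check for the regenerated manifest, referenced from `tests/manifest_generation_tutorial.md` above: before the golden `manifest.json` is overwritten, it can help to confirm that the freshly compiled manifest targets the schema version the test suite expects (v9 for dbt Core 1.5.x, per the tutorial). The sketch below is illustrative only; the script name, the manifest path, and the expected version string are assumptions to adjust, while the `metadata.dbt_schema_version` field it reads is written by dbt into the manifest artifact.

```python
# check_manifest_version.py - an illustrative sketch, not part of the repository.
# Assumptions: the freshly compiled manifest sits in my_new_project/target/ and the
# golden file should follow manifest schema v9 (dbt Core 1.5.x); adjust both as needed.
import json
import pathlib
import sys

GENERATED_MANIFEST = pathlib.Path("my_new_project/target/manifest.json")  # assumed path
EXPECTED_SCHEMA_VERSION = "v9"  # assumed schema version for dbt Core 1.5.x

with GENERATED_MANIFEST.open() as manifest_file:
    manifest = json.load(manifest_file)

# dbt records the schema version as a URL such as ".../dbt/manifest/v9.json".
schema_url = manifest["metadata"]["dbt_schema_version"]
schema_version = schema_url.rsplit("/", 1)[-1].removesuffix(".json")

if schema_version != EXPECTED_SCHEMA_VERSION:
    sys.exit(
        f"Generated manifest uses schema {schema_version}, expected {EXPECTED_SCHEMA_VERSION}; "
        "regenerate it with a matching dbt version before overwriting the golden file."
    )
print(f"Manifest schema {schema_version} matches the expected golden version.")
```

If the versions disagree, rebuilding the environment from step 1 of the tutorial with the matching dbt release is usually enough before replacing the golden file.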